Word lists - compression
Posted: Mon May 18, 2026 9:57 pm
There are quite a few 'games' that use five letter word lists - Wordle being best known, but there are also Quordle and Octordle.
The fun part is writing a compression for the text.
Wordle actually uses two word lists - a longer one containing all valid words (14855 words - 74275 bytes) that the user can play. There is also a much shorter list that contains acceptable words for the game 'solution'.
I used the longer list here. To compress it - I store a 3 bit bitfield that specifies how many letters are duplicated from the previous word :
bowat // a what???
bowed // 3 letters are the same
bowel // 4 letters the same
Then I store remaining letters as 5 bit codes (gives 32 possible values which is okay for A..Z) This compressed the word list to 22972bytes (~30% of the original
)
This seemed pretty good - so although it would be possible to shave a little more off by using variable bit lengths for the letters (more common = shorter code) and even the duplicate letters field - I left it as is. Left as an exercise (I've posted a Huffman encoder previously)
The word list passed to the compression program - can easily be found with a little googling (Wordle word list) - I mark the end of the data by having a 'duplicate' prefix length if 7 (0b111)
On an esp32 - memory isn't a problem - I put the data array into supplementary code (on smaller MCUs it would probably be necessary to save as an array in flash memory - and use a smaller word list)
The encoder outputs to a file as a 'C' file (-o) a binary file (-b) or to UART as C.
Here the decoder just outputs the word list to UART. A Wordle solver is fairly easy to do to - or a hardware 'player'?
The compression (in Python)
The decoder uses a 'bit field' reader - which allows groups of bits to be read as required. It is quite fast (remove the print of the words - leaving just the start and finish!)
The fun part is writing a compression for the text.
Wordle actually uses two word lists - a longer one containing all valid words (14855 words - 74275 bytes) that the user can play. There is also a much shorter list that contains acceptable words for the game 'solution'.
I used the longer list here. To compress it - I store a 3 bit bitfield that specifies how many letters are duplicated from the previous word :
bowat // a what???
bowed // 3 letters are the same
bowel // 4 letters the same
Then I store remaining letters as 5 bit codes (gives 32 possible values which is okay for A..Z) This compressed the word list to 22972bytes (~30% of the original
This seemed pretty good - so although it would be possible to shave a little more off by using variable bit lengths for the letters (more common = shorter code) and even the duplicate letters field - I left it as is. Left as an exercise (I've posted a Huffman encoder previously)
The word list passed to the compression program - can easily be found with a little googling (Wordle word list) - I mark the end of the data by having a 'duplicate' prefix length if 7 (0b111)
On an esp32 - memory isn't a problem - I put the data array into supplementary code (on smaller MCUs it would probably be necessary to save as an array in flash memory - and use a smaller word list)
The encoder outputs to a file as a 'C' file (-o) a binary file (-b) or to UART as C.
Here the decoder just outputs the word list to UART. A Wordle solver is fairly easy to do to - or a hardware 'player'?
The compression (in Python)
Code: Select all
import sys
import os
import argparse
class BitPacker:
def __init__(self):
self.byte_array = []
self.current_byte = 0
self.bits_in_current = 0
def add_bits(self, value, num_bits):
value = value & ((1 << num_bits) - 1)
while num_bits > 0:
bits_available = 8 - self.bits_in_current
if num_bits <= bits_available:
self.current_byte = (self.current_byte << num_bits) | value
self.bits_in_current += num_bits
num_bits = 0
if self.bits_in_current == 8:
self.byte_array.append(self.current_byte)
self.current_byte = 0
self.bits_in_current = 0
else:
shift = num_bits - bits_available
top_bits = value >> shift
self.current_byte = (self.current_byte << bits_available) | top_bits
self.byte_array.append(self.current_byte)
self.current_byte = 0
self.bits_in_current = 0
value = value & ((1 << shift) - 1)
num_bits = shift
def flush(self):
if self.bits_in_current > 0:
self.current_byte = self.current_byte << (8 - self.bits_in_current)
self.byte_array.append(self.current_byte)
self.current_byte = 0
self.bits_in_current = 0
return self.byte_array
def generate_c_header(words, compressed_bytes):
num_words = len(words)
num_bytes = len(compressed_bytes)
original_size = num_words * 5
lines = []
lines.append(f"// --- Compression Stats ---")
lines.append(f"// Original words : {num_words}")
lines.append(f"// Original size : {original_size} bytes")
lines.append(f"// Compressed size: {num_bytes} bytes")
lines.append(f"// Ratio : {(num_bytes / original_size * 100):.2f}% of original size\n")
lines.append(f"#include <stdint.h>\n")
lines.append(f"const uint32_t DICT_NUM_WORDS = {num_words};")
lines.append(f"const uint32_t DICT_SIZE_BYTES = {num_bytes};")
lines.append(f"const uint8_t compressed_dict[{num_bytes}] = {{")
for i in range(0, num_bytes, 12):
chunk = compressed_bytes[i:i+12]
hex_chunk = ", ".join([f"0x{b:02X}" for b in chunk])
lines.append(f" {hex_chunk},")
lines.append("};")
return "\n".join(lines)
def process_file(filepath, c_output, bin_output):
# 1. Read and clean the words
with open(filepath, 'r', encoding='utf-8') as f:
words = [line.strip().upper() for line in f if len(line.strip()) == 5]
if not words:
print("No valid 5-letter words found in the file.")
sys.exit(1)
# 2. Sort words alphabetically
words.sort()
# 3. Compress using the 3-bit / 5-bit scheme
packer = BitPacker()
prev_word = ""
for word in words:
prefix_len = 0
if prev_word:
for p, c in zip(prev_word, word):
if p == c:
prefix_len += 1
else:
break
packer.add_bits(prefix_len, 3)
for i in range(prefix_len, 5):
char_code = ord(word[i]) - ord('A')
if not (0 <= char_code <= 25):
raise ValueError(f"Invalid character '{word[i]}' in word '{word}'. Only A-Z allowed.")
packer.add_bits(char_code, 5)
prev_word = word
# Explicit 3-bit EOF marker (Value 7) to stop the C decoder safely
packer.add_bits(7, 3)
# Flush any remaining bits into the final byte
compressed_bytes = packer.flush()
# 4. Handle Outputs
if bin_output:
with open(bin_output, 'wb') as f:
f.write(bytearray(compressed_bytes))
print(f"[\u2713] Raw binary saved to: {bin_output}")
if c_output:
c_code = generate_c_header(words, compressed_bytes)
with open(c_output, 'w') as f:
f.write(c_code)
print(f"[\u2713] C header saved to : {c_output}")
# If no output files specified, fallback to printing to terminal
if not bin_output and not c_output:
print(generate_c_header(words, compressed_bytes))
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Compress a list of 5-letter words for an MCU.")
parser.add_argument("input_file", help="Input text file containing one word per line.")
parser.add_argument("-o", "--output", help="Output path for the C header file (e.g., dict.h)", default=None)
parser.add_argument("-b", "--binary", help="Output path for the raw binary file (e.g., dict.bin)", default=None)
args = parser.parse_args()
if not os.path.isfile(args.input_file):
print(f"Error: Input file '{args.input_file}' not found.")
sys.exit(1)
process_file(args.input_file, args.output, args.binary)The decoder uses a 'bit field' reader - which allows groups of bits to be read as required. It is quite fast (remove the print of the words - leaving just the start and finish!)