# huffman/huffman.py
# Huffman coding demo (last updated 2025-01-21 13:53:17 +01:00).
import math
import sys
from collections import Counter
class Node:
    """A node in the Huffman tree.

    Leaf nodes hold a single letter; internal nodes hold the
    concatenation of their children's codewords.
    """

    def __init__(self, codeword, frequency, children):
        self.codeword = codeword    # letter(s) covered by this node
        self.frequency = frequency  # combined occurrence count
        self.children = children    # [] for leaves, [left, right] otherwise

    def to_string(self):
        """Human-readable summary: codeword, frequency, child codewords."""
        parts = [f"{self.codeword} ({self.frequency})"]
        if self.children:
            parts.append(f"{[child.codeword for child in self.children]}")
        return " ".join(parts)
def get_frequency_table(word):
    """Return the letter frequencies of *word* as a list of
    (letter, count) tuples sorted by count, ascending.

    Counts are kept as integers (not probabilities) to avoid floating
    point problems; callers divide by len(word) when they need p_i.
    """
    # Counter preserves first-seen order and sorted() is stable, so
    # ties keep the same relative order as the hand-rolled original.
    return sorted(Counter(word).items(), key=lambda item: item[1])
def calculate_entropy(word):
    """Return the Shannon entropy of *word* in bits per symbol.

    H = -sum(p_i * log2(p_i)), where p_i is the relative frequency of
    each distinct letter. Returns 0.0 for the empty string.
    """
    length = len(word)
    result = 0.0
    # Count directly instead of going through get_frequency_table():
    # entropy does not care about the sorted order, only the counts.
    for count in Counter(word).values():
        p = count / length
        result -= p * math.log2(p)
    return result
def huffman(word):
    """Build a Huffman tree for *word* and return its root Node.

    Repeatedly merges the two least frequent codewords; every merge
    appends a parent Node whose children are the merged pair.

    Raises ValueError for an empty word (nothing to encode).
    """
    if not word:
        raise ValueError("cannot build a Huffman tree for an empty word")
    codewords = get_frequency_table(word)
    # tree accumulates every node created so far; leaves go in first.
    tree = [Node(e[0], e[1], []) for e in codewords]
    # codewords stays sorted by frequency, so the first two entries are
    # always the two least frequent.
    while len(codewords) > 1:
        # take the first two elements and drop them from the list
        least_frequent = codewords[:2]
        codewords = codewords[2:]
        combined_word = least_frequent[0][0] + least_frequent[1][0]
        combined_freq = least_frequent[0][1] + least_frequent[1][1]
        new_codeword = (combined_word, combined_freq)
        # find the original two nodes in the tree by their codeword
        orig_nodes = []
        for codeword in least_frequent:
            node = [n for n in tree if n.codeword == codeword[0]][0]
            orig_nodes.append(node)
        # create a new node with the original two nodes as children
        parent_node = Node(combined_word, combined_freq, orig_nodes)
        tree.append(parent_node)
        # return the merged codeword to the list, keeping it sorted
        codewords.append(new_codeword)
        codewords = sorted(codewords, key=lambda v: v[1])
    # the root covers every letter, so it has the maximum frequency
    return max(tree, key=lambda v: v.frequency)
def print_tree(root):
    """Print the tree breadth-first, one line per layer.

    Each line shows the layer number followed by the to_string() of
    every node in that layer.
    """
    current, depth = [root], 0
    while current:
        print(f"{depth}: " + ", ".join(node.to_string() for node in current))
        # collect the next layer before descending
        current = [child for node in current for child in node.children]
        depth += 1
def get_compressed_bits(root, letter):
    """Return the Huffman bit string for *letter* given the tree *root*.

    Walks from the root down to the leaf holding *letter*; descending
    into the first child appends "0", into the second appends "1".

    Raises ValueError if *letter* does not occur anywhere in the tree.
    """
    bit_string = ""
    node = root
    while len(node.children) > 0:
        if letter in node.children[0].codeword:
            bit_string += "0"
            node = node.children[0]
        elif letter in node.children[1].codeword:
            bit_string += "1"
            node = node.children[1]
        else:
            # previously this silently descended into the second child
            # and returned a meaningless bit string for unknown letters
            raise ValueError(f"letter {letter!r} is not in the tree")
    return bit_string
def get_uncompressed_bits(letter):
    """Return the binary form of *letter*'s code point (no leading zeroes)."""
    return f"{ord(letter):b}"
# Demo: Huffman-code a sample sentence and compare the total bit count
# against a fixed-width binary encoding of the same text.
word = "C'est plus efficace en français qu'en anglais !"
print(f"word: {word}")
print(f"entropy: {calculate_entropy(word)}\n")

root = huffman(word)
print_tree(root)
print()

uncompressed = [get_uncompressed_bits(letter) for letter in word]
# A fixed-width encoding gives every character the same number of bits,
# so the per-character cost is the longest code point's bit length
# (Python omits leading zeroes, but a real transmission would keep them).
max_uncompressed_word_length = max(len(bits) for bits in uncompressed)
total_uncompressed_length = max_uncompressed_word_length * len(uncompressed)
print(f"uncompressed: {uncompressed}")
print(f"max individual word length: {max_uncompressed_word_length}")
print(f"total length: {total_uncompressed_length}")
print()

# Encode every letter through the Huffman tree and tally the bits.
compressed = [get_compressed_bits(root, letter) for letter in word]
total_compressed_length = sum(len(bits) for bits in compressed)
print(f"compressed: {compressed}")
print(f"total length: {total_compressed_length}")
print()

print(f"compression ratio: {total_compressed_length/total_uncompressed_length:.2f}")