import math


class Node:
    def __init__(self, codeword, frequency, children):
        self.codeword = codeword
        self.frequency = frequency
        self.children = children

    def to_string(self):
        s = f"{self.codeword} ({self.frequency})"
        if self.children:
            s += f" {[n.codeword for n in self.children]}"
        return s


def get_frequency_table(word):
    frequencies = {}
    # get letter frequencies
    for letter in word:
        if letter in frequencies:
            # frequencies stored as integers to avoid floating point problems
            frequencies[letter] += 1
        else:
            frequencies[letter] = 1
    # convert to a list sorted by ascending frequency
    l = [(k, frequencies[k]) for k in frequencies]
    l = sorted(l, key=lambda v: v[1])
    return l


def calculate_entropy(word):
    # H = -sum(p_i * log_b(p_i))
    # where b is usually 2, for binary
    result = 0
    for k in get_frequency_table(word):
        # counts are stored as integers, so convert to a probability here
        p = k[1] / len(word)
        result -= p * math.log(p, 2)
    return result


def huffman(word):
    codewords = get_frequency_table(word)
    # generate the tree from the codewords. codewords acts as the queue of
    # entries still to be merged; tree collects every node created along the way
    tree = [Node(e[0], e[1], []) for e in codewords]
    # codewords are sorted, so take the two lowest values and combine them
    while len(codewords) > 1:
        # take the first two elements
        least_frequent = codewords[:2]
        # remove these codewords from the list
        codewords = codewords[2:]
        combined_word = least_frequent[0][0] + least_frequent[1][0]
        combined_freq = least_frequent[0][1] + least_frequent[1][1]
        new_codeword = (combined_word, combined_freq)
        # find the original two nodes in the tree
        orig_nodes = []
        for codeword in least_frequent:
            node = [n for n in tree if n.codeword == codeword[0]][0]
            orig_nodes.append(node)
        # create a new node for the tree with the original two nodes as children
        parent_node = Node(combined_word, combined_freq, orig_nodes)
        tree.append(parent_node)
        # return the new codeword to the list and keep the list sorted
        codewords.append(new_codeword)
        codewords = sorted(codewords, key=lambda v: v[1])
    # the root is the node that absorbed every symbol, i.e. the highest frequency
    return max(tree, key=lambda v: v.frequency)


def print_tree(root):
    # breadth-first print: one line per layer of the tree
    nodes = [root]
    layer = 0
    while len(nodes) > 0:
        children = []
        for n in nodes:
            for c in n.children:
                children.append(c)
        node_str = ", ".join([n.to_string() for n in nodes])
        print(f"{layer}: {node_str}")
        nodes = children
        layer += 1


def get_compressed_bits(root, letter):
    # walk from the root towards the leaf holding this letter, emitting a 0 for
    # the left child and a 1 for the right child
    bit_string = ""
    node = root
    while len(node.children) > 0:
        if letter in node.children[0].codeword:
            bit_string += "0"
            node = node.children[0]
        else:
            bit_string += "1"
            node = node.children[1]
    return bit_string


def get_uncompressed_bits(letter):
    # binary representation of the character's Unicode code point
    return format(ord(letter), 'b')


word = "C'est plus efficace en français qu'en anglais !"
print(f"word: {word}")
print(f"entropy: {calculate_entropy(word)}\n")

root = huffman(word)
print_tree(root)
print()
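# A possible sanity check (sketch only, built on the same Node structure above;
# print_code_table is a hypothetical helper, not part of the script proper):
# walk the finished tree once and print every letter's codeword, instead of
# calling get_compressed_bits one character at a time.
def print_code_table(root):
    # depth-first walk: extend the bit prefix with "0" for the left child and
    # "1" for the right child, printing when a single-letter leaf is reached
    stack = [(root, "")]
    while stack:
        node, prefix = stack.pop()
        if not node.children:
            print(f"{node.codeword!r}: {prefix}")
        else:
            stack.append((node.children[0], prefix + "0"))
            stack.append((node.children[1], prefix + "1"))

print_code_table(root)
print()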
uncompressed = [get_uncompressed_bits(letter) for letter in word]
# to get the uncompressed length we need the length of the longest character
# encoding, since every character would be encoded at that same fixed length.
# Python omits leading zeroes, but in a transmission context all characters
# take the same number of bits.
max_uncompressed_word_length = max([len(w) for w in uncompressed])
print(f"uncompressed: {uncompressed}")
print(f"max individual word length: {max_uncompressed_word_length}")
total_uncompressed_length = max_uncompressed_word_length * len(uncompressed)
print(f"total length: {total_uncompressed_length}")
print()

# now we have the tree, generate the string of bits for each letter of the word
compressed = [get_compressed_bits(root, letter) for letter in word]
print(f"compressed: {compressed}")
total_compressed_length = sum([len(w) for w in compressed])
print(f"total length: {total_compressed_length}")
print()

print(f"compression ratio: {total_compressed_length/total_uncompressed_length:.2f}")
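# A possible round-trip check (sketch only; decode is a hypothetical helper built
# on the same Node structure): walk the tree from the root, taking the left child
# on "0" and the right child on "1", and emit a letter at each leaf. Because the
# code is prefix-free, the concatenated bit stream decodes unambiguously.
def decode(root, bits):
    decoded = ""
    node = root
    for bit in bits:
        node = node.children[0] if bit == "0" else node.children[1]
        if not node.children:
            # reached a leaf: emit its letter and restart from the root
            decoded += node.codeword
            node = root
    return decoded

bit_stream = "".join(compressed)
print(f"round trip ok: {decode(root, bit_stream) == word}")
# Shannon's source coding theorem: no symbol code can beat entropy * number of
# characters, so the compressed total should sit at or just above this bound.
print(f"entropy lower bound: {calculate_entropy(word) * len(word):.1f} bits")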