# Provenance (recovered from the git patch this module was embedded in):
#   commit e111b369ca7ce4797773b152f4dcea67736e8295
#   Author: ktyl
#   Date:   Tue Jan 21 12:53:17 2025 +0000
#   Message: initial commit
#
# The same commit adds a Pipfile next to this module (huffman.py):
#   [[source]]
#   url = "https://pypi.org/simple"
#   verify_ssl = true
#   name = "pypi"
#
#   [packages]
#
#   [dev-packages]
#
#   [requires]
#   python_version = "3.13"
#   python_full_version = "3.13.1"
"""Huffman coding demo.

Builds a Huffman tree for a word, prints the tree layer by layer and compares
the length of the Huffman-encoded word with a fixed-width encoding of the
same word.
"""

import math
import sys
from collections import Counter


class Node:
    """A node of the Huffman tree.

    ``codeword`` is the concatenation of every letter found below this node,
    ``frequency`` is the summed occurrence count of those letters and
    ``children`` is an empty list for a leaf or the two merged nodes for an
    internal node.
    """

    def __init__(self, codeword, frequency, children):
        self.codeword = codeword
        self.frequency = frequency
        self.children = children

    def to_string(self):
        """Return ``"<codeword> (<frequency>)"`` plus the children's codewords, if any."""
        s = f"{self.codeword} ({self.frequency})"
        if self.children:
            s += f" {[n.codeword for n in self.children]}"

        return s

    def __repr__(self):
        return f"<Node {self.to_string()}>"


def get_frequency_table(word):
    """Return ``(letter, count)`` pairs for ``word``, sorted by ascending count.

    Counts are kept as integers to avoid floating point problems. The sort is
    stable, so letters with equal counts stay in order of first appearance.
    """
    # Counter preserves first-insertion order, exactly like a hand-filled dict
    return sorted(Counter(word).items(), key=lambda pair: pair[1])


def calculate_entropy(word):
    """Return the Shannon entropy of ``word`` in bits per letter.

    Computes ``-sum(p_i * log_b(p_i))`` with ``b = 2`` (binary), where ``p_i``
    is the relative frequency of the i-th distinct letter. An empty word
    yields ``0``.
    """
    result = 0
    length = len(word)

    for _letter, count in get_frequency_table(word):
        # counts are stored as integers; convert to a probability here
        p = count / length
        # log2 is more accurate than the two-argument log(p, 2)
        result -= p * math.log2(p)

    return result


def huffman(word):
    """Build the Huffman tree for ``word`` and return its root ``Node``.

    The two least frequent nodes are merged repeatedly until a single node is
    left. For a word with only one distinct letter the returned root is that
    letter's leaf.

    Raises:
        ValueError: if ``word`` contains no letters.
    """
    # one leaf per distinct letter, already ordered by ascending frequency
    pending = [Node(letter, count, []) for letter, count in get_frequency_table(word)]
    if not pending:
        raise ValueError("cannot build a Huffman tree for an empty word")

    while len(pending) > 1:
        # the list is sorted, so the two lowest frequencies are at the front
        first, second = pending[0], pending[1]
        parent = Node(
            first.codeword + second.codeword,
            first.frequency + second.frequency,
            [first, second],
        )

        # put the merged node back and restore the ordering; the stable sort
        # keeps nodes of equal frequency in their previous relative order
        pending = sorted(pending[2:] + [parent], key=lambda n: n.frequency)

    return pending[0]


def print_tree(root):
    """Print the tree below ``root`` one layer per line, starting at layer 0."""
    nodes = [root]
    layer = 0

    while nodes:
        node_str = ", ".join(n.to_string() for n in nodes)
        print(f"{layer}: {node_str}")

        # next layer: all children of the current layer, left to right
        nodes = [child for n in nodes for child in n.children]
        layer += 1


def get_compressed_bits(root, letter):
    """Return the Huffman code of ``letter`` as a string of ``"0"``/``"1"``.

    The tree is walked from ``root`` to the leaf holding ``letter``; taking
    the first child appends ``"0"``, taking the second appends ``"1"``.

    Raises:
        ValueError: if ``letter`` does not occur in the tree.
    """
    if letter not in root.codeword:
        # without this check the walk would always branch right and silently
        # return the code of a different letter
        raise ValueError(f"{letter!r} does not occur in the Huffman tree")

    if not root.children:
        # a tree with a single leaf has no branches, so the walk would yield
        # an empty string; emit one bit so the letter has a non-empty code
        return "0"

    bits = []
    node = root

    while node.children:
        left, right = node.children
        if letter in left.codeword:
            bits.append("0")
            node = left
        else:
            bits.append("1")
            node = right

    return "".join(bits)


def get_uncompressed_bits(letter):
    """Return the code point of ``letter`` in binary, without leading zeroes."""
    return format(ord(letter), 'b')


word = "C'est plus efficace en français qu'en anglais !"

print(f"word: {word}")
print(f"entropy: {calculate_entropy(word)}\n")

root = huffman(word)
print_tree(root)

print()

uncompressed = [get_uncompressed_bits(letter) for letter in word]

# to get the uncompressed length we need to use the length of the longest word, since all words
# would be encoded at the same length. leading zeroes are omitted by Python, but in a transmission
# context all characters are the same number of bits.
max_uncompressed_word_length = max(len(w) for w in uncompressed)
print(f"uncompressed: {uncompressed}")
print(f"max individual word length: {max_uncompressed_word_length}")
total_uncompressed_length = max_uncompressed_word_length * len(uncompressed)
print(f"total length: {total_uncompressed_length}")
print()

# now we have the table, generate the string of bits from the word
compressed = [get_compressed_bits(root, letter) for letter in word]
print(f"compressed: {compressed}")
total_compressed_length = sum(len(w) for w in compressed)
print(f"total length: {total_compressed_length}")

print()
print(f"compression ratio: {total_compressed_length/total_uncompressed_length:.2f}")