commit e111b369ca7ce4797773b152f4dcea67736e8295
Author: ktyl <me@ktyl.dev>
Date:   Tue Jan 21 12:53:17 2025 +0000

    initial commit

diff --git a/Pipfile b/Pipfile
new file mode 100644
index 0000000..a6607f1
--- /dev/null
+++ b/Pipfile
@@ -0,0 +1,12 @@
+[[source]]
+url = "https://pypi.org/simple"
+verify_ssl = true
+name = "pypi"
+
+[packages]
+
+[dev-packages]
+
+[requires]
+python_version = "3.13"
+python_full_version = "3.13.1"
diff --git a/huffman.py b/huffman.py
new file mode 100644
index 0000000..9ba18a3
--- /dev/null
+++ b/huffman.py
@@ -0,0 +1,154 @@
+import math
+import sys
+
+
+class Node:
+    def __init__(self, codeword, frequency, children):
+        self.codeword = codeword
+        self.frequency = frequency
+        self.children = children
+
+    def to_string(self):
+        s = f"{self.codeword} ({self.frequency})"
+        if self.children:
+            s += f" {[n.codeword for n in self.children]}"
+
+        return s
+
+
+def get_frequency_table(word):
+    frequencies = {}
+
+    # get letter frequencies 
+    for letter in word:
+        if letter in frequencies.keys():
+            # frequencies stored as integers to avoid floating point problems
+            frequencies[letter] += 1
+        else:
+            frequencies[letter] = 1
+
+    # convert to a sorted list
+    l = [(k, frequencies[k]) for k in frequencies]
+    l = sorted(l, key=lambda v : v[1])
+
+    return l
+
+
+def calculate_entropy(word):
+    # -sum(p_i log_b(p_i)
+    # where b is usually 2, for binary
+
+    result = 0
+
+    for k in get_frequency_table(word):
+        # account for frequencies stored as floats
+        p = k[1] / len(word)
+        result -= p * math.log(p, 2)
+
+    return result
+
+
+def huffman(word):
+    codewords = get_frequency_table(word)
+
+    # generate tree from codewords. priority is the set of unchecked nodes, tree
+    # is the final result
+    tree = [Node(e[0], e[1], []) for e in codewords]
+
+    # codewords are sorted, so take two lowest values and combine them
+    iteration = 0
+    while len(codewords) > 1:
+        iteration += 1
+
+        # take first two elements
+        least_frequent = codewords[:2]
+
+        # remove these codewords from the list
+        codewords = codewords[2:]
+
+        combined_word = least_frequent[0][0] + least_frequent[1][0]
+        combined_freq = least_frequent[0][1] + least_frequent[1][1]
+        new_codeword = (combined_word, combined_freq)
+        
+        # find the original two nodes in the tree
+        orig_nodes = []
+        for codeword in least_frequent:
+            node = [n for n in tree if n.codeword == codeword[0]][0]
+            orig_nodes.append(node)
+
+        # create a new node for the tree with the original two nodes as children
+        parent_node = Node(combined_word, combined_freq, orig_nodes)
+        tree.append(parent_node)
+
+        # return new codeword to list
+        codewords.append(new_codeword)
+        codewords = sorted(codewords, key=lambda v : v[1])
+
+    return max(tree, key=lambda v : v.frequency)
+
+
+def print_tree(root):
+    nodes = [root]
+
+    layer = 0
+
+    while len(nodes) > 0:
+        children = []
+        for n in nodes:
+            for c in n.children:
+                children.append(c)
+        node_str = ", ".join([n.to_string() for n in nodes])
+        print(f"{layer}: {node_str}")
+
+        nodes = children
+        layer += 1
+
+
+def get_compressed_bits(root, letter):
+    bit_string = ""
+
+    node = root
+
+    while len(node.children) > 0:
+        if letter in node.children[0].codeword:
+            bit_string += "0"
+            node = node.children[0]
+        else:
+            bit_string += "1"
+            node = node.children[1]
+
+    return bit_string
+
+def get_uncompressed_bits(letter):
+    return format(ord(letter), 'b')
+
+word = "C'est plus efficace en français qu'en anglais !"
+
+print(f"word: {word}")
+print(f"entropy: {calculate_entropy(word)}\n")
+
+root = huffman(word)
+print_tree(root)
+
+print()
+
+uncompressed = [get_uncompressed_bits(letter) for letter in word]
+
+# to get the uncompressed length we need to use the length of the longest word, since all words
+# would be encoded at the same length. leading zeroes are omitted by Python, but in a transmission
+# context all characters are the same number of bits.
+max_uncompressed_word_length = max([len(w) for w in uncompressed])
+print(f"uncompressed: {uncompressed}")
+print(f"max individual word length: {max_uncompressed_word_length}")
+total_uncompressed_length = max_uncompressed_word_length*len(uncompressed)
+print(f"total length: {total_uncompressed_length}")
+print()
+
+# now we have the table, generate the string of bits from the word
+compressed = [get_compressed_bits(root, letter) for letter in word]
+print(f"compressed: {compressed}")
+total_compressed_length = sum([len(w) for w in compressed])
+print(f"total length: {total_compressed_length}")
+
+print()
+print(f"compression ratio: {total_compressed_length/total_uncompressed_length:.2f}")