# huffman/huffman.py

import math
import sys


class Node:
    """A vertex of the Huffman tree.

    Attributes (all public, stored exactly as passed in):
    codeword  -- label of the node; huffman() in this file uses the symbol
                 itself for a leaf and the concatenation of both children's
                 codewords for a merged node.
    frequency -- count attached to the node.
    children  -- list of child nodes; empty for a leaf.
    """

    def __init__(self, codeword, frequency, children):
        self.codeword, self.frequency, self.children = codeword, frequency, children

    def to_string(self):
        """Return "codeword (frequency)", followed by the list of the
        children's codewords when the node has any children."""
        label = f"{self.codeword} ({self.frequency})"
        if not self.children:
            return label

        child_words = [child.codeword for child in self.children]
        return f"{label} {child_words}"


def get_frequency_table(word):
    """Count how often each symbol occurs in *word*.

    Returns a list of (symbol, count) tuples in ascending order of count.
    Symbols with the same count keep the order in which they first appear
    in *word*, since the dict preserves insertion order and the sort is
    stable.
    """
    counts = {}
    for symbol in word:
        # integer counts, so no floating point rounding is involved
        counts[symbol] = counts.get(symbol, 0) + 1

    table = list(counts.items())
    table.sort(key=lambda entry: entry[1])
    return table


def calculate_entropy(word):
    """Return the Shannon entropy of *word* in bits per symbol.

    Computes -sum(p_i * log2(p_i)) over the distinct symbols of *word*,
    where p_i is the symbol's count divided by len(word). An empty *word*
    has no symbols and yields 0.0.
    """
    total = len(word)
    result = 0.0

    for _, count in get_frequency_table(word):
        # counts are stored as integers; turn them into probabilities here
        p = count / total
        # math.log2 is usually more accurate than math.log(p, 2)
        result -= p * math.log2(p)

    return result


def huffman(word):
    """Build a Huffman tree for *word* and return its root Node.

    Every distinct symbol becomes a leaf whose frequency is its count in
    *word*. The two least frequent nodes are repeatedly merged under a new
    parent (codeword = both codewords concatenated, frequency = their sum)
    until a single node, the root, remains. When *word* holds only one
    distinct symbol the root is that leaf and has no children.

    Raises ValueError if *word* contains no symbols.
    """
    table = get_frequency_table(word)
    if not table:
        raise ValueError("cannot build a Huffman tree for an empty word")

    # Queue of nodes that still lack a parent, in ascending frequency. The
    # Node objects themselves are queued, so a merge never has to search for
    # a node by its codeword (which is ambiguous when a merged codeword
    # happens to equal another symbol).
    pending = [Node(symbol, count, []) for symbol, count in table]

    while len(pending) > 1:
        # the queue is sorted, so the two rarest nodes sit at the front
        first, second = pending[0], pending[1]
        parent = Node(
            first.codeword + second.codeword,
            first.frequency + second.frequency,
            [first, second],
        )

        # Re-queue the parent. Appending before a stable sort places it
        # behind any waiting node of equal frequency.
        pending = pending[2:]
        pending.append(parent)
        pending.sort(key=lambda node: node.frequency)

    return pending[0]


def print_tree(root):
    """Print the tree below *root*, one depth level per line.

    Each line reads "<depth>: <node>, <node>, ..." with the nodes rendered
    by their to_string() method; the root is printed at depth 0.
    """
    depth = 0
    current = [root]

    while current:
        # the next level is every child of this level, left to right
        below = [child for node in current for child in node.children]
        rendered = ", ".join(node.to_string() for node in current)
        print(f"{depth}: {rendered}")

        current = below
        depth += 1


def get_compressed_bits(root, letter):
    """Return the Huffman code of *letter* as a string of "0"/"1" characters.

    Walks from *root* down to a leaf: at each node with children the first
    child is taken (bit "0") when its codeword contains *letter*, otherwise
    the second child (bit "1"). A tree that consists of a single leaf gives
    the empty string.

    Raises ValueError if the leaf reached is not *letter*, i.e. the symbol
    is not part of the tree. Without this check an unknown symbol would
    silently receive the code of the right-most leaf.
    """
    bits = []
    node = root

    while node.children:
        # huffman() concatenates codewords when merging, so a containment
        # test on the first child tells which branch leads to the letter
        branch = 0 if letter in node.children[0].codeword else 1
        bits.append(str(branch))
        node = node.children[branch]

    if node.codeword != letter:
        raise ValueError(f"{letter!r} is not encoded by this tree")

    return "".join(bits)

def get_uncompressed_bits(letter):
    """Return the code point of *letter* in binary, without leading zeroes."""
    code_point = ord(letter)
    return f"{code_point:b}"

# Demo run: compress a sample sentence and compare with a fixed-width encoding.
word = "C'est plus efficace en français qu'en anglais !"

print(f"word: {word}")
print(f"entropy: {calculate_entropy(word)}\n")

root = huffman(word)
print_tree(root)

print()

uncompressed = list(map(get_uncompressed_bits, word))

# Python drops leading zeroes, so the binary strings differ in length. A
# fixed-width transmission would give every character as many bits as the
# widest one needs, hence the baseline is widest length times character count.
max_uncompressed_word_length = len(max(uncompressed, key=len))
print(f"uncompressed: {uncompressed}")
print(f"max individual word length: {max_uncompressed_word_length}")
total_uncompressed_length = len(uncompressed) * max_uncompressed_word_length
print(f"total length: {total_uncompressed_length}")
print()

# with the tree built, look up the variable-length code of every character
compressed = [get_compressed_bits(root, symbol) for symbol in word]
print(f"compressed: {compressed}")
total_compressed_length = sum(map(len, compressed))
print(f"total length: {total_compressed_length}")

print()
print(f"compression ratio: {total_compressed_length / total_uncompressed_length:.2f}")