155 lines
4.2 KiB
Python
155 lines
4.2 KiB
Python
import math
|
|
import sys
|
|
|
|
|
|
class Node:
|
|
def __init__(self, codeword, frequency, children):
|
|
self.codeword = codeword
|
|
self.frequency = frequency
|
|
self.children = children
|
|
|
|
def to_string(self):
|
|
s = f"{self.codeword} ({self.frequency})"
|
|
if self.children:
|
|
s += f" {[n.codeword for n in self.children]}"
|
|
|
|
return s
|
|
|
|
|
|
def get_frequency_table(word):
|
|
frequencies = {}
|
|
|
|
# get letter frequencies
|
|
for letter in word:
|
|
if letter in frequencies.keys():
|
|
# frequencies stored as integers to avoid floating point problems
|
|
frequencies[letter] += 1
|
|
else:
|
|
frequencies[letter] = 1
|
|
|
|
# convert to a sorted list
|
|
l = [(k, frequencies[k]) for k in frequencies]
|
|
l = sorted(l, key=lambda v : v[1])
|
|
|
|
return l
|
|
|
|
|
|
def calculate_entropy(word):
|
|
# -sum(p_i log_b(p_i)
|
|
# where b is usually 2, for binary
|
|
|
|
result = 0
|
|
|
|
for k in get_frequency_table(word):
|
|
# account for frequencies stored as floats
|
|
p = k[1] / len(word)
|
|
result -= p * math.log(p, 2)
|
|
|
|
return result
|
|
|
|
|
|
def huffman(word):
|
|
codewords = get_frequency_table(word)
|
|
|
|
# generate tree from codewords. priority is the set of unchecked nodes, tree
|
|
# is the final result
|
|
tree = [Node(e[0], e[1], []) for e in codewords]
|
|
|
|
# codewords are sorted, so take two lowest values and combine them
|
|
iteration = 0
|
|
while len(codewords) > 1:
|
|
iteration += 1
|
|
|
|
# take first two elements
|
|
least_frequent = codewords[:2]
|
|
|
|
# remove these codewords from the list
|
|
codewords = codewords[2:]
|
|
|
|
combined_word = least_frequent[0][0] + least_frequent[1][0]
|
|
combined_freq = least_frequent[0][1] + least_frequent[1][1]
|
|
new_codeword = (combined_word, combined_freq)
|
|
|
|
# find the original two nodes in the tree
|
|
orig_nodes = []
|
|
for codeword in least_frequent:
|
|
node = [n for n in tree if n.codeword == codeword[0]][0]
|
|
orig_nodes.append(node)
|
|
|
|
# create a new node for the tree with the original two nodes as children
|
|
parent_node = Node(combined_word, combined_freq, orig_nodes)
|
|
tree.append(parent_node)
|
|
|
|
# return new codeword to list
|
|
codewords.append(new_codeword)
|
|
codewords = sorted(codewords, key=lambda v : v[1])
|
|
|
|
return max(tree, key=lambda v : v.frequency)
|
|
|
|
|
|
def print_tree(root):
|
|
nodes = [root]
|
|
|
|
layer = 0
|
|
|
|
while len(nodes) > 0:
|
|
children = []
|
|
for n in nodes:
|
|
for c in n.children:
|
|
children.append(c)
|
|
node_str = ", ".join([n.to_string() for n in nodes])
|
|
print(f"{layer}: {node_str}")
|
|
|
|
nodes = children
|
|
layer += 1
|
|
|
|
|
|
def get_compressed_bits(root, letter):
|
|
bit_string = ""
|
|
|
|
node = root
|
|
|
|
while len(node.children) > 0:
|
|
if letter in node.children[0].codeword:
|
|
bit_string += "0"
|
|
node = node.children[0]
|
|
else:
|
|
bit_string += "1"
|
|
node = node.children[1]
|
|
|
|
return bit_string
|
|
|
|
def get_uncompressed_bits(letter):
|
|
return format(ord(letter), 'b')
|
|
|
|
word = "C'est plus efficace en français qu'en anglais !"
|
|
|
|
print(f"word: {word}")
|
|
print(f"entropy: {calculate_entropy(word)}\n")
|
|
|
|
root = huffman(word)
|
|
print_tree(root)
|
|
|
|
print()
|
|
|
|
uncompressed = [get_uncompressed_bits(letter) for letter in word]
|
|
|
|
# to get the uncompressed length we need to use the length of the longest word, since all words
|
|
# would be encoded at the same length. leading zeroes are omitted by Python, but in a transmission
|
|
# context all characters are the same number of bits.
|
|
max_uncompressed_word_length = max([len(w) for w in uncompressed])
|
|
print(f"uncompressed: {uncompressed}")
|
|
print(f"max individual word length: {max_uncompressed_word_length}")
|
|
total_uncompressed_length = max_uncompressed_word_length*len(uncompressed)
|
|
print(f"total length: {total_uncompressed_length}")
|
|
print()
|
|
|
|
# now we have the table, generate the string of bits from the word
|
|
compressed = [get_compressed_bits(root, letter) for letter in word]
|
|
print(f"compressed: {compressed}")
|
|
total_compressed_length = sum([len(w) for w in compressed])
|
|
print(f"total length: {total_compressed_length}")
|
|
|
|
print()
|
|
print(f"compression ratio: {total_compressed_length/total_uncompressed_length:.2f}")
|