initial commit
This commit is contained in:
commit
e111b369ca
|
@ -0,0 +1,12 @@
|
|||
[[source]]
|
||||
url = "https://pypi.org/simple"
|
||||
verify_ssl = true
|
||||
name = "pypi"
|
||||
|
||||
[packages]
|
||||
|
||||
[dev-packages]
|
||||
|
||||
[requires]
|
||||
python_version = "3.13"
|
||||
python_full_version = "3.13.1"
|
|
@ -0,0 +1,154 @@
|
|||
import math
|
||||
import sys
|
||||
|
||||
|
||||
class Node:
|
||||
def __init__(self, codeword, frequency, children):
|
||||
self.codeword = codeword
|
||||
self.frequency = frequency
|
||||
self.children = children
|
||||
|
||||
def to_string(self):
|
||||
s = f"{self.codeword} ({self.frequency})"
|
||||
if self.children:
|
||||
s += f" {[n.codeword for n in self.children]}"
|
||||
|
||||
return s
|
||||
|
||||
|
||||
def get_frequency_table(word):
|
||||
frequencies = {}
|
||||
|
||||
# get letter frequencies
|
||||
for letter in word:
|
||||
if letter in frequencies.keys():
|
||||
# frequencies stored as integers to avoid floating point problems
|
||||
frequencies[letter] += 1
|
||||
else:
|
||||
frequencies[letter] = 1
|
||||
|
||||
# convert to a sorted list
|
||||
l = [(k, frequencies[k]) for k in frequencies]
|
||||
l = sorted(l, key=lambda v : v[1])
|
||||
|
||||
return l
|
||||
|
||||
|
||||
def calculate_entropy(word):
|
||||
# -sum(p_i log_b(p_i)
|
||||
# where b is usually 2, for binary
|
||||
|
||||
result = 0
|
||||
|
||||
for k in get_frequency_table(word):
|
||||
# account for frequencies stored as floats
|
||||
p = k[1] / len(word)
|
||||
result -= p * math.log(p, 2)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def huffman(word):
|
||||
codewords = get_frequency_table(word)
|
||||
|
||||
# generate tree from codewords. priority is the set of unchecked nodes, tree
|
||||
# is the final result
|
||||
tree = [Node(e[0], e[1], []) for e in codewords]
|
||||
|
||||
# codewords are sorted, so take two lowest values and combine them
|
||||
iteration = 0
|
||||
while len(codewords) > 1:
|
||||
iteration += 1
|
||||
|
||||
# take first two elements
|
||||
least_frequent = codewords[:2]
|
||||
|
||||
# remove these codewords from the list
|
||||
codewords = codewords[2:]
|
||||
|
||||
combined_word = least_frequent[0][0] + least_frequent[1][0]
|
||||
combined_freq = least_frequent[0][1] + least_frequent[1][1]
|
||||
new_codeword = (combined_word, combined_freq)
|
||||
|
||||
# find the original two nodes in the tree
|
||||
orig_nodes = []
|
||||
for codeword in least_frequent:
|
||||
node = [n for n in tree if n.codeword == codeword[0]][0]
|
||||
orig_nodes.append(node)
|
||||
|
||||
# create a new node for the tree with the original two nodes as children
|
||||
parent_node = Node(combined_word, combined_freq, orig_nodes)
|
||||
tree.append(parent_node)
|
||||
|
||||
# return new codeword to list
|
||||
codewords.append(new_codeword)
|
||||
codewords = sorted(codewords, key=lambda v : v[1])
|
||||
|
||||
return max(tree, key=lambda v : v.frequency)
|
||||
|
||||
|
||||
def print_tree(root):
|
||||
nodes = [root]
|
||||
|
||||
layer = 0
|
||||
|
||||
while len(nodes) > 0:
|
||||
children = []
|
||||
for n in nodes:
|
||||
for c in n.children:
|
||||
children.append(c)
|
||||
node_str = ", ".join([n.to_string() for n in nodes])
|
||||
print(f"{layer}: {node_str}")
|
||||
|
||||
nodes = children
|
||||
layer += 1
|
||||
|
||||
|
||||
def get_compressed_bits(root, letter):
|
||||
bit_string = ""
|
||||
|
||||
node = root
|
||||
|
||||
while len(node.children) > 0:
|
||||
if letter in node.children[0].codeword:
|
||||
bit_string += "0"
|
||||
node = node.children[0]
|
||||
else:
|
||||
bit_string += "1"
|
||||
node = node.children[1]
|
||||
|
||||
return bit_string
|
||||
|
||||
def get_uncompressed_bits(letter):
|
||||
return format(ord(letter), 'b')
|
||||
|
||||
word = "C'est plus efficace en français qu'en anglais !"
|
||||
|
||||
print(f"word: {word}")
|
||||
print(f"entropy: {calculate_entropy(word)}\n")
|
||||
|
||||
root = huffman(word)
|
||||
print_tree(root)
|
||||
|
||||
print()
|
||||
|
||||
uncompressed = [get_uncompressed_bits(letter) for letter in word]
|
||||
|
||||
# to get the uncompressed length we need to use the length of the longest word, since all words
|
||||
# would be encoded at the same length. leading zeroes are omitted by Python, but in a transmission
|
||||
# context all characters are the same number of bits.
|
||||
max_uncompressed_word_length = max([len(w) for w in uncompressed])
|
||||
print(f"uncompressed: {uncompressed}")
|
||||
print(f"max individual word length: {max_uncompressed_word_length}")
|
||||
total_uncompressed_length = max_uncompressed_word_length*len(uncompressed)
|
||||
print(f"total length: {total_uncompressed_length}")
|
||||
print()
|
||||
|
||||
# now we have the table, generate the string of bits from the word
|
||||
compressed = [get_compressed_bits(root, letter) for letter in word]
|
||||
print(f"compressed: {compressed}")
|
||||
total_compressed_length = sum([len(w) for w in compressed])
|
||||
print(f"total length: {total_compressed_length}")
|
||||
|
||||
print()
|
||||
print(f"compression ratio: {total_compressed_length/total_uncompressed_length:.2f}")
|
Loading…
Reference in New Issue