initial commit
This commit is contained in:
		
						commit
						e111b369ca
					
				
							
								
								
									
										12
									
								
								Pipfile
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										12
									
								
								Pipfile
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,12 @@ | ||||
| [[source]] | ||||
| url = "https://pypi.org/simple" | ||||
| verify_ssl = true | ||||
| name = "pypi" | ||||
| 
 | ||||
| [packages] | ||||
| 
 | ||||
| [dev-packages] | ||||
| 
 | ||||
| [requires] | ||||
| python_version = "3.13" | ||||
| python_full_version = "3.13.1" | ||||
							
								
								
									
										154
									
								
								huffman.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										154
									
								
								huffman.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,154 @@ | ||||
| import math | ||||
| import sys | ||||
| 
 | ||||
| 
 | ||||
| class Node: | ||||
|     def __init__(self, codeword, frequency, children): | ||||
|         self.codeword = codeword | ||||
|         self.frequency = frequency | ||||
|         self.children = children | ||||
| 
 | ||||
|     def to_string(self): | ||||
|         s = f"{self.codeword} ({self.frequency})" | ||||
|         if self.children: | ||||
|             s += f" {[n.codeword for n in self.children]}" | ||||
| 
 | ||||
|         return s | ||||
| 
 | ||||
| 
 | ||||
| def get_frequency_table(word): | ||||
|     frequencies = {} | ||||
| 
 | ||||
|     # get letter frequencies  | ||||
|     for letter in word: | ||||
|         if letter in frequencies.keys(): | ||||
|             # frequencies stored as integers to avoid floating point problems | ||||
|             frequencies[letter] += 1 | ||||
|         else: | ||||
|             frequencies[letter] = 1 | ||||
| 
 | ||||
|     # convert to a sorted list | ||||
|     l = [(k, frequencies[k]) for k in frequencies] | ||||
|     l = sorted(l, key=lambda v : v[1]) | ||||
| 
 | ||||
|     return l | ||||
| 
 | ||||
| 
 | ||||
| def calculate_entropy(word): | ||||
|     # -sum(p_i log_b(p_i) | ||||
|     # where b is usually 2, for binary | ||||
| 
 | ||||
|     result = 0 | ||||
| 
 | ||||
|     for k in get_frequency_table(word): | ||||
|         # account for frequencies stored as floats | ||||
|         p = k[1] / len(word) | ||||
|         result -= p * math.log(p, 2) | ||||
| 
 | ||||
|     return result | ||||
| 
 | ||||
| 
 | ||||
| def huffman(word): | ||||
|     codewords = get_frequency_table(word) | ||||
| 
 | ||||
|     # generate tree from codewords. priority is the set of unchecked nodes, tree | ||||
|     # is the final result | ||||
|     tree = [Node(e[0], e[1], []) for e in codewords] | ||||
| 
 | ||||
|     # codewords are sorted, so take two lowest values and combine them | ||||
|     iteration = 0 | ||||
|     while len(codewords) > 1: | ||||
|         iteration += 1 | ||||
| 
 | ||||
|         # take first two elements | ||||
|         least_frequent = codewords[:2] | ||||
| 
 | ||||
|         # remove these codewords from the list | ||||
|         codewords = codewords[2:] | ||||
| 
 | ||||
|         combined_word = least_frequent[0][0] + least_frequent[1][0] | ||||
|         combined_freq = least_frequent[0][1] + least_frequent[1][1] | ||||
|         new_codeword = (combined_word, combined_freq) | ||||
|          | ||||
|         # find the original two nodes in the tree | ||||
|         orig_nodes = [] | ||||
|         for codeword in least_frequent: | ||||
|             node = [n for n in tree if n.codeword == codeword[0]][0] | ||||
|             orig_nodes.append(node) | ||||
| 
 | ||||
|         # create a new node for the tree with the original two nodes as children | ||||
|         parent_node = Node(combined_word, combined_freq, orig_nodes) | ||||
|         tree.append(parent_node) | ||||
| 
 | ||||
|         # return new codeword to list | ||||
|         codewords.append(new_codeword) | ||||
|         codewords = sorted(codewords, key=lambda v : v[1]) | ||||
| 
 | ||||
|     return max(tree, key=lambda v : v.frequency) | ||||
| 
 | ||||
| 
 | ||||
| def print_tree(root): | ||||
|     nodes = [root] | ||||
| 
 | ||||
|     layer = 0 | ||||
| 
 | ||||
|     while len(nodes) > 0: | ||||
|         children = [] | ||||
|         for n in nodes: | ||||
|             for c in n.children: | ||||
|                 children.append(c) | ||||
|         node_str = ", ".join([n.to_string() for n in nodes]) | ||||
|         print(f"{layer}: {node_str}") | ||||
| 
 | ||||
|         nodes = children | ||||
|         layer += 1 | ||||
| 
 | ||||
| 
 | ||||
| def get_compressed_bits(root, letter): | ||||
|     bit_string = "" | ||||
| 
 | ||||
|     node = root | ||||
| 
 | ||||
|     while len(node.children) > 0: | ||||
|         if letter in node.children[0].codeword: | ||||
|             bit_string += "0" | ||||
|             node = node.children[0] | ||||
|         else: | ||||
|             bit_string += "1" | ||||
|             node = node.children[1] | ||||
| 
 | ||||
|     return bit_string | ||||
| 
 | ||||
| def get_uncompressed_bits(letter): | ||||
|     return format(ord(letter), 'b') | ||||
| 
 | ||||
| word = "C'est plus efficace en français qu'en anglais !" | ||||
| 
 | ||||
| print(f"word: {word}") | ||||
| print(f"entropy: {calculate_entropy(word)}\n") | ||||
| 
 | ||||
| root = huffman(word) | ||||
| print_tree(root) | ||||
| 
 | ||||
| print() | ||||
| 
 | ||||
| uncompressed = [get_uncompressed_bits(letter) for letter in word] | ||||
| 
 | ||||
| # to get the uncompressed length we need to use the length of the longest word, since all words | ||||
| # would be encoded at the same length. leading zeroes are omitted by Python, but in a transmission | ||||
| # context all characters are the same number of bits. | ||||
| max_uncompressed_word_length = max([len(w) for w in uncompressed]) | ||||
| print(f"uncompressed: {uncompressed}") | ||||
| print(f"max individual word length: {max_uncompressed_word_length}") | ||||
| total_uncompressed_length = max_uncompressed_word_length*len(uncompressed) | ||||
| print(f"total length: {total_uncompressed_length}") | ||||
| print() | ||||
| 
 | ||||
| # now we have the table, generate the string of bits from the word | ||||
| compressed = [get_compressed_bits(root, letter) for letter in word] | ||||
| print(f"compressed: {compressed}") | ||||
| total_compressed_length = sum([len(w) for w in compressed]) | ||||
| print(f"total length: {total_compressed_length}") | ||||
| 
 | ||||
| print() | ||||
| print(f"compression ratio: {total_compressed_length/total_uncompressed_length:.2f}") | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user