|
1 |
| -from __future__ import annotations |
2 |
| - |
| 1 | +import heapq |
| 2 | +from collections import defaultdict |
3 | 3 | import sys
|
4 | 4 |
|
class HuffmanNode:
    """A node of a Huffman tree.

    Attributes are plain instance fields: ``char`` (the symbol stored in the
    node, ``None`` by default), ``freq`` (the node's weight), and the
    ``left`` / ``right`` child links, which start out as ``None``.
    """

    def __init__(self, char=None, freq=0):
        """Create a childless node holding ``char`` with weight ``freq``."""
        self.char = char
        self.freq = freq
        self.left = None
        self.right = None

    def __lt__(self, other):
        """Compare by frequency only, which is all ``heapq`` needs to order nodes."""
        return self.freq < other.freq

    def __repr__(self):
        """Return a short debugging representation with ``char`` and ``freq``."""
        return f"{type(self).__name__}(char={self.char!r}, freq={self.freq!r})"
14 | 14 |
|
15 | 15 |
|
16 |
| -class TreeNode: |
17 |
| - def __init__(self, freq: int, left: Letter | TreeNode, right: Letter | TreeNode): |
18 |
| - self.freq: int = freq |
19 |
| - self.left: Letter | TreeNode = left |
20 |
| - self.right: Letter | TreeNode = right |
def calculate_frequencies(file_path):
    """Count how often each character occurs in a text file.

    Args:
        file_path: Path of the file to read. The file is opened in text mode
            with the platform's default encoding.

    Returns:
        A ``collections.Counter`` (a ``dict`` subclass) mapping each character
        to its number of occurrences, keyed in order of first appearance.
        The mapping is empty for an empty file.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Local import keeps this function self-contained regardless of which
    # names the module header imports from ``collections``.
    from collections import Counter

    # Unlike a defaultdict, looking up an absent character in a Counter
    # returns 0 without inserting a zero-frequency entry into the mapping.
    freq = Counter()
    with open(file_path, 'r') as file:
        for line in file:
            # Counter.update tallies every character of the line in one call.
            freq.update(line)
    return freq
21 | 26 |
|
22 | 27 |
|
23 |
| -def parse_file(file_path: str) -> list[Letter]: |
def build_huffman_tree(freq_dict):
    """Build the Huffman tree for a character-frequency mapping.

    Every (character, frequency) pair becomes a leaf node in a min-heap. The
    two lowest-frequency nodes are then repeatedly merged under a new parent
    whose frequency is their sum, until a single root remains.

    Args:
        freq_dict: Mapping of character to occurrence count.

    Returns:
        The root ``HuffmanNode`` of the tree, or ``None`` when ``freq_dict``
        is empty. A mapping with one entry yields a tree that is just that
        single leaf.
    """
    if not freq_dict:
        # No symbols means no tree; indexing the empty heap would raise IndexError.
        return None

    priority_queue = [HuffmanNode(char, freq) for char, freq in freq_dict.items()]
    heapq.heapify(priority_queue)

    while len(priority_queue) > 1:
        # The first node popped (the smaller one) becomes the left child.
        left = heapq.heappop(priority_queue)
        right = heapq.heappop(priority_queue)

        merged = HuffmanNode(freq=left.freq + right.freq)
        merged.left = left
        merged.right = right

        heapq.heappush(priority_queue, merged)

    return priority_queue[0]
36 | 46 |
|
37 | 47 |
|
38 |
| -def build_tree(letters: list[Letter]) -> Letter | TreeNode: |
def generate_codes(node, current_code="", code_map=None):
    """Derive the Huffman code of every character stored in a tree.

    The tree is walked in pre-order (node, left subtree, right subtree).
    Going left appends ``"0"`` to the code prefix and going right appends
    ``"1"``; each node whose ``char`` is not ``None`` records the prefix
    accumulated on the way to it.

    Args:
        node: Root of the (sub)tree to walk, or ``None`` for an empty tree.
            Nodes must expose ``char``, ``left`` and ``right`` attributes.
        current_code: Prefix already accumulated for ``node``.
        code_map: Optional dict to fill in; a new dict is created when omitted.

    Returns:
        ``code_map`` (the dict passed in, or the newly created one) mapping
        each character to its bit string.
    """
    if code_map is None:
        code_map = {}

    # An explicit stack replaces recursion: a heavily skewed tree can be about
    # as deep as the number of distinct symbols, which could otherwise exceed
    # the interpreter's recursion limit.
    pending = [(node, current_code)]
    while pending:
        current, code = pending.pop()
        if current is None:
            continue

        if current.char is not None:
            is_leaf = current.left is None and current.right is None
            if is_leaf and not code:
                # A tree made of a single leaf is reached with an empty prefix;
                # give it a one-bit code so the encoded output is not empty.
                code = "0"
            code_map[current.char] = code

        # Push right before left so the left subtree is visited first,
        # preserving pre-order insertion into code_map.
        pending.append((current.right, code + "1"))
        pending.append((current.left, code + "0"))

    return code_map
| 63 | + |
| 64 | + |
def encode_file(file_path, code_map):
    """Encode a text file as a string of Huffman code bits.

    Args:
        file_path: Path of the file to encode. The file is opened in text
            mode with the platform's default encoding.
        code_map: Mapping of character to its bit string.

    Returns:
        The concatenation of ``code_map[char]`` for every character of the
        file, in file order. Empty string for an empty file.

    Raises:
        KeyError: If the file contains a character missing from ``code_map``.
        OSError: If the file cannot be opened or read.
    """
    with open(file_path, 'r') as file:
        # One join over all characters avoids maintaining a separate list.
        return ''.join(code_map[char] for line in file for char in line)
67 | 76 |
|
68 | 77 |
|
69 |
| -def huffman(file_path: str) -> None: |
def huffman(file_path):
    """Huffman-encode a file and print the code table and the encoded bits.

    The file is read twice: once to count character frequencies and once to
    encode its contents with the codes derived from those frequencies.

    Args:
        file_path: Path of the text file to encode.

    Returns:
        None. All results are written to standard output.
    """
    freq_dict = calculate_frequencies(file_path)
    if not freq_dict:
        # An empty file yields no symbols, so there is no tree to build and
        # nothing to encode.
        print(f"{file_path} is empty; nothing to encode.")
        return

    huffman_tree_root = build_huffman_tree(freq_dict)
    code_map = generate_codes(huffman_tree_root)

    print(f"Huffman Codes for characters in {file_path}:")
    for char, code in code_map.items():
        # repr() prints ordinary characters as 'x' and escapes newlines, tabs
        # and similar characters so each table entry stays on one line.
        print(f"{char!r}: {code}")

    encoded_data = encode_file(file_path, code_map)
    print("\nEncoded Data:")
    print(encoded_data)
88 | 93 |
|
89 | 94 |
|
if __name__ == "__main__":
    # The script needs at least one argument: the path of the file to encode.
    if len(sys.argv) < 2:
        # sys.exit with a string prints it to stderr and exits with status 1,
        # so callers and shell pipelines can detect the misuse.
        sys.exit("Usage: python huffman.py <file_path>")
    huffman(sys.argv[1])
0 commit comments