Skip to content

Instantly share code, notes, and snippets.

@peteyoung
Created September 16, 2015 18:51
Show Gist options
  • Select an option

  • Save peteyoung/bc9cb7f1e081015b8f94 to your computer and use it in GitHub Desktop.

Select an option

Save peteyoung/bc9cb7f1e081015b8f94 to your computer and use it in GitHub Desktop.
package patmat
import common._
/**
* Assignment 4: Huffman coding
*
*/
object Huffman {
/**
* A huffman code is represented by a binary tree.
*
* Every `Leaf` node of the tree represents one character of the alphabet that the tree can encode.
* The weight of a `Leaf` is the frequency of appearance of the character.
*
* The branches of the huffman tree, the `Fork` nodes, represent a set containing all the characters
* present in the leaves below it. The weight of a `Fork` node is the sum of the weights of these
* leaves.
*/
// Sealed so the compiler can check that every `match` on a CodeTree covers both Fork and Leaf.
sealed abstract class CodeTree

/**
 * Inner node of a Huffman tree.
 *
 * @param left   sub-tree on the left
 * @param right  sub-tree on the right
 * @param chars  characters found in the leaves below this node
 * @param weight weight recorded for this node
 */
case class Fork(left: CodeTree, right: CodeTree, chars: List[Char], weight: Int) extends CodeTree

/**
 * Terminal node of a Huffman tree.
 *
 * @param char   the character this leaf stands for
 * @param weight frequency of appearance of the character
 */
case class Leaf(char: Char, weight: Int) extends CodeTree
// Part 1: Basics
/**
 * Total weight of `tree`, obtained by adding up the weights of all leaves below it.
 * The weight stored inside a `Fork` is not consulted.
 */
def weight(tree: CodeTree): Int = tree match {
  case Leaf(_, leafWeight) => leafWeight
  case Fork(lhs, rhs, _, _) => weight(lhs) + weight(rhs)
}
/**
 * Characters of all leaves below `tree`, left sub-tree first.
 * The character list stored inside a `Fork` is not consulted.
 */
def chars(tree: CodeTree): List[Char] = tree match {
  case Leaf(symbol, _) => symbol :: Nil
  case Fork(lhs, rhs, _, _) => chars(lhs) ::: chars(rhs)
}
/**
 * Joins two trees under a new `Fork` whose character list and weight are
 * derived from the two children via `chars` and `weight`.
 */
def makeCodeTree(left: CodeTree, right: CodeTree) = {
  val allChars = chars(left) ::: chars(right)
  val totalWeight = weight(left) + weight(right)
  Fork(left, right, allChars, totalWeight)
}
// Part 2: Generating Huffman trees
/**
* In this assignment, we are working with lists of characters. This function allows
* you to easily create a character list from a given string.
*/
def string2Chars(str: String): List[Char] =
  // Go through the underlying character array; yields the characters in string order.
  str.toCharArray.toList
/**
* This function computes for each unique character in the list `chars` the number of
* times it occurs. For example, the invocation
*
* times(List('a', 'b', 'a'))
*
* should return the following (the order of the resulting list is not important):
*
* List(('a', 2), ('b', 1))
*
* The type `List[(Char, Int)]` denotes a list of pairs, where each pair consists of a
* character and an integer. Pairs can be constructed easily using parentheses:
*
* val pair: (Char, Int) = ('c', 1)
*
* In order to access the two elements of a pair, you can use the accessors `_1` and `_2`:
*
* val theChar = pair._1
* val theInt = pair._2
*
* Another way to deconstruct a pair is using pattern matching:
*
* pair match {
* case (theChar, theInt) =>
* println("character is: "+ theChar)
* println("integer is : "+ theInt)
* }
*/
def times(chars: List[Char]): List[(Char, Int)] =
  // Thread the frequency table through the text from left to right, bumping
  // (or creating) the entry for each character as it is encountered.
  chars.foldLeft(List.empty[(Char, Int)]) { (freqs, c) => incrementPair(c, freqs) }
/**
 * Bumps the count recorded for `c` in `freqs` by one.
 *
 * If an entry for `c` exists, the first such entry is replaced in place (its
 * position in the list is kept); otherwise `(c, 1)` is appended at the end.
 */
def incrementPair(c: Char, freqs: List[(Char, Int)]): List[(Char, Int)] =
  freqs.indexWhere(_._1 == c) match {
    case -1 => freqs :+ ((c, 1))
    case at => freqs.updated(at, (c, freqs(at)._2 + 1))
  }
/**
 * Returns the first pair in `counts` whose character equals `c`,
 * or a fresh `(c, 0)` pair when there is none.
 */
def findOrCreatePair(c: Char, counts: List[(Char, Int)]): (Char, Int) =
  counts.find(_._1 == c).getOrElse((c, 0))
/**
* Returns a list of `Leaf` nodes for a given frequency table `freqs`.
*
* The returned list should be ordered by ascending weights (i.e. the
* head of the list should have the smallest weight), where the weight
* of a leaf is the frequency of the character.
*/
// I got tired of implementing what was already in the List API
def makeOrderedLeafList(freqs: List[(Char, Int)]): List[Leaf] = {
  // Stable sort on the count: entries with equal counts keep their relative order.
  val ascending = freqs.sortBy { case (_, count) => count }
  ascending.map { case (symbol, count) => Leaf(symbol, count) }
}
/**
* Checks whether the list `trees` contains only one single code tree.
*/
def singleton(trees: List[CodeTree]): Boolean =
  // lengthCompare stops after at most two cells instead of walking the whole list.
  trees.lengthCompare(1) == 0
/**
* The parameter `trees` of this function is a list of code trees ordered
* by ascending weights.
*
* This function takes the first two elements of the list `trees` and combines
* them into a single `Fork` node. This node is then added back into the
* remaining elements of `trees` at a position such that the ordering by weights
* is preserved.
*
* If `trees` is a list of less than two elements, that list should be returned
* unchanged.
*/
// def combine(trees: List[CodeTree]): List[CodeTree] = {
// if (singleton(trees)) trees
// else combine(List[CodeTree](makeCodeTree(trees.head, trees.tail.head)) ::: trees.tail.tail)
// }
// def combine(trees: List[CodeTree]): List[CodeTree] = {
// if (trees == null) null
// else if (singleton(trees)) trees
// else List[CodeTree](makeCodeTree(trees.head, trees.tail.head)) ::: trees.tail.tail
// }
// def combine(trees: List[CodeTree]): List[CodeTree] = {
// if (trees == null) null
// else if (singleton(trees)) trees
// else {
// val newTree = makeCodeTree(trees.head, trees.tail.head) // car and cdar
// val splitList = trees.tail.tail.splitAt(trees.indexWhere(ct => newTree.weight < weight(ct)))
// splitList._1 ::: List[CodeTree](newTree) ::: splitList._2
// }
// }
def combine(trees: List[CodeTree]): List[CodeTree] =
  // Sort first so that inputs that are not already ordered still yield the two
  // lightest trees. The sort is stable, so an already ordered list is unchanged.
  trees.sortWith((a, b) => weight(a) < weight(b)) match {
    case lightest :: next :: rest =>
      val merged = makeCodeTree(lightest, next)
      val mergedWeight = weight(merged)
      // Insert the new fork in front of the first tree that is not strictly
      // lighter, which keeps the list ordered by ascending weight.
      val (lighter, notLighter) = rest.span(weight(_) < mergedWeight)
      lighter ::: merged :: notLighter
    case _ =>
      // Fewer than two trees: nothing to combine, return the list unchanged.
      trees
  }
/**
 * Splits `trees` into its minimum-weight tree (as chosen by `findMin`) and the
 * remaining trees in their original order.
 *
 * Only the first occurrence of the minimum is removed; further trees that are
 * equal to it stay in the remainder.
 *
 * Fails on an empty list, because `findMin` takes the head and tail of `trees`.
 */
def minMod(trees: List[CodeTree]): (CodeTree, List[CodeTree]) = {
  val min = findMin(trees)
  val (before, fromMin) = trees.span(_ != min)
  // fromMin starts with the minimum itself; drop just that one element.
  (min, before ::: fromMin.drop(1))
}
/**
 * Returns the tree with the smallest `weight` in `trees`; among trees of equal
 * weight the earliest one in the list wins.
 *
 * `trees` must be non-empty: the head and tail of the list are taken unconditionally.
 */
def findMin(trees: List[CodeTree]): CodeTree =
  trees.tail.foldLeft(trees.head) { (best, candidate) =>
    // Strict comparison, so an equally heavy later tree never replaces the current best.
    if (weight(candidate) < weight(best)) candidate else best
  }
/**
* This function will be called in the following way:
*
* until(singleton, combine)(trees)
*
* where `trees` is of type `List[CodeTree]`, `singleton` and `combine` refer to
* the two functions defined above.
*
* In such an invocation, `until` should call the two functions until the list of
* code trees contains only one single tree, and then return that singleton list.
*
* Hint: before writing the implementation,
* - start by defining the parameter types such that the above example invocation
* is valid. The parameter types of `until` should match the argument types of
* the example invocation. Also define the return type of the `until` function.
* - try to find sensible parameter names for `xxx`, `yyy` and `zzz`.
*/
def until(
    terminate: List[CodeTree] => Boolean,
    apply: List[CodeTree] => List[CodeTree])(list: List[CodeTree]): List[CodeTree] = {
  // Keep transforming the current list with `apply` until `terminate` accepts it.
  @scala.annotation.tailrec
  def loop(current: List[CodeTree]): List[CodeTree] =
    if (terminate(current)) current
    else loop(apply(current))

  loop(list)
}
// def until(
// terminate: List[CodeTree] => Boolean,
// apply: List[CodeTree] => List[CodeTree])(list: List[CodeTree]): List[CodeTree] = {
// def until0(list0: List[CodeTree]): List[CodeTree] = {
// if (terminate(list)) list
// else until0(apply(list))
// }
// until0(list)
// }
// def until[T](
// terminate: List[T] => Boolean,
// apply: List[T] => List[T])(list: List[T]): List[T] = {
// def until0(list0: List[T]): List[T] = {
// if (terminate(list)) list
// else until0(apply(list))
// }
// until0(list)
// }
/**
* This function creates a code tree which is optimal to encode the text `chars`.
*
* The parameter `chars` is an arbitrary text. This function extracts the character
* frequencies from that text and creates a code tree based on them.
*/
def createCodeTree(chars: List[Char]): CodeTree = {
  // An empty text yields no leaves; `combine` maps Nil to Nil and `singleton(Nil)`
  // is false, so `until` would never terminate. Reject the input up front instead.
  require(chars.nonEmpty, "cannot build a Huffman tree from an empty text")
  val leaves = makeOrderedLeafList(times(chars))
  // `until` stops once exactly one tree is left, so taking the head is safe here.
  until(singleton, combine)(leaves).head
}
// Part 3: Decoding
/** One bit of an encoded message, held as an `Int`. */
type Bit = Int
/**
* This function decodes the bit sequence `bits` using the code tree `tree` and returns
* the resulting list of characters.
*/
def decode(tree: CodeTree, bits: List[Bit]): List[Char] = {
  // Walks down from the root, consuming one bit per Fork (0 goes left, anything
  // else goes right). Decoded characters are collected in reverse and flipped
  // once at the end, which avoids a linear append for every character.
  @scala.annotation.tailrec
  def walk(node: CodeTree, remaining: List[Bit], decoded: List[Char]): List[Char] =
    node match {
      case Leaf(symbol, _) =>
        // A complete code word was read; restart at the root for the next one.
        walk(tree, remaining, symbol :: decoded)
      case Fork(zero, one, _, _) =>
        remaining match {
          case bit :: rest => walk(if (bit == 0) zero else one, rest, decoded)
          // Out of bits while standing at the root: the input ended cleanly.
          case Nil if node eq tree => decoded.reverse
          case Nil =>
            throw new NoSuchElementException("bit sequence ends in the middle of a code word")
        }
    }

  tree match {
    case Leaf(symbol, _) =>
      // A single-leaf tree consumes no bits, so a non-empty input could never be
      // used up (this case previously looped forever). The result for empty
      // input is kept as it was: the leaf's character, once.
      if (bits.isEmpty) List(symbol)
      else throw new IllegalArgumentException(
        "cannot decode a non-empty bit sequence with a single-leaf tree")
    case _ =>
      walk(tree, bits, Nil)
  }
}
/**
* A Huffman coding tree for the French language.
* Generated from the data given at
* http://fr.wikipedia.org/wiki/Fr%C3%A9quence_d%27apparition_des_lettres_en_fran%C3%A7ais
*/
// Written out as a literal: every Fork below spells out its own character list and weight.
val frenchCode: CodeTree = Fork(Fork(Fork(Leaf('s', 121895), Fork(Leaf('d', 56269), Fork(Fork(Fork(Leaf('x', 5928), Leaf('j', 8351), List('x', 'j'), 14279), Leaf('f', 16351), List('x', 'j', 'f'), 30630), Fork(Fork(Fork(Fork(Leaf('z', 2093), Fork(Leaf('k', 745), Leaf('w', 1747), List('k', 'w'), 2492), List('z', 'k', 'w'), 4585), Leaf('y', 4725), List('z', 'k', 'w', 'y'), 9310), Leaf('h', 11298), List('z', 'k', 'w', 'y', 'h'), 20608), Leaf('q', 20889), List('z', 'k', 'w', 'y', 'h', 'q'), 41497), List('x', 'j', 'f', 'z', 'k', 'w', 'y', 'h', 'q'), 72127), List('d', 'x', 'j', 'f', 'z', 'k', 'w', 'y', 'h', 'q'), 128396), List('s', 'd', 'x', 'j', 'f', 'z', 'k', 'w', 'y', 'h', 'q'), 250291), Fork(Fork(Leaf('o', 82762), Leaf('l', 83668), List('o', 'l'), 166430), Fork(Fork(Leaf('m', 45521), Leaf('p', 46335), List('m', 'p'), 91856), Leaf('u', 96785), List('m', 'p', 'u'), 188641), List('o', 'l', 'm', 'p', 'u'), 355071), List('s', 'd', 'x', 'j', 'f', 'z', 'k', 'w', 'y', 'h', 'q', 'o', 'l', 'm', 'p', 'u'), 605362), Fork(Fork(Fork(Leaf('r', 100500), Fork(Leaf('c', 50003), Fork(Leaf('v', 24975), Fork(Leaf('g', 13288), Leaf('b', 13822), List('g', 'b'), 27110), List('v', 'g', 'b'), 52085), List('c', 'v', 'g', 'b'), 102088), List('r', 'c', 'v', 'g', 'b'), 202588), Fork(Leaf('n', 108812), Leaf('t', 111103), List('n', 't'), 219915), List('r', 'c', 'v', 'g', 'b', 'n', 't'), 422503), Fork(Leaf('e', 225947), Fork(Leaf('i', 115465), Leaf('a', 117110), List('i', 'a'), 232575), List('e', 'i', 'a'), 458522), List('r', 'c', 'v', 'g', 'b', 'n', 't', 'e', 'i', 'a'), 881025), List('s', 'd', 'x', 'j', 'f', 'z', 'k', 'w', 'y', 'h', 'q', 'o', 'l', 'm', 'p', 'u', 'r', 'c', 'v', 'g', 'b', 'n', 't', 'e', 'i', 'a'), 1486387)
/**
* What does the secret message say? Can you decode it?
* For the decoding use the `frenchCode` Huffman tree defined above.
*/
// Encoded message as a flat list of 0/1 values; `decodedSecret` decodes it with `frenchCode`.
val secret: List[Bit] = List(0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1)
/**
* Write a function that returns the decoded secret
*/
// Decodes `secret` with the `frenchCode` tree; re-evaluated on every call because this is a `def`.
def decodedSecret: List[Char] = decode(frenchCode, secret)
// Part 4a: Encoding using Huffman tree
/**
* This function encodes `text` using the code tree `tree`
* into a sequence of bits.
*/
def encode(tree: CodeTree)(text: List[Char]): List[Bit] =
  // Encode each character on its own and concatenate the code words in text
  // order. flatMap builds the result in one pass instead of re-copying the
  // accumulated bits for every character.
  text.flatMap(ch => encodeChar(tree, ch))
/**
 * Computes the code word of `char` in `tree`: 0 for every step into a left
 * sub-tree, 1 for every step into a right sub-tree, listed from the root down.
 *
 * Throws an `Exception` when `char` is not found in the tree, including the
 * case where the tree is a single leaf holding a different character.
 */
private def encodeChar(tree: CodeTree, char: Char): List[Bit] = {
  // The path is collected in reverse (cheap prepend) and flipped once at the leaf.
  @scala.annotation.tailrec
  def descend(node: CodeTree, reversedPath: List[Bit]): List[Bit] =
    node match {
      case Fork(l, r, _, _) =>
        if (containsChar(l, char)) descend(l, 0 :: reversedPath)
        else if (containsChar(r, char)) descend(r, 1 :: reversedPath)
        else throw new Exception(s"$char not found in $node")
      case Leaf(c, _) =>
        // Below a Fork the parent has already checked this leaf, so a mismatch
        // can only happen when the whole tree is one leaf.
        if (c == char) reversedPath.reverse
        else throw new Exception(s"$char not found in $node")
    }

  descend(tree, Nil)
}
/**
 * Tells whether `char` belongs to `t`: a leaf is compared by its character, a
 * fork is answered from the character list stored in the fork itself.
 */
private def containsChar(t: CodeTree, char: Char): Boolean = t match {
  case Leaf(symbol, _) => symbol == char
  case Fork(_, _, members, _) => members.exists(_ == char)
}
// Part 4b: Encoding using code table
/** Association list pairing a character with the bit sequence that encodes it. */
type CodeTable = List[(Char, List[Bit])]
/**
* This function returns the bit sequence that represents the character `char` in
* the code table `table`.
*/
def codeBits(table: CodeTable)(char: Char): List[Bit] =
  // The first entry for `char` wins. A missing character still raises
  // NoSuchElementException, as before, but now the message names the character
  // instead of reporting "head of empty list".
  table
    .find(_._1 == char)
    .map(_._2)
    .getOrElse(throw new NoSuchElementException(s"$char not found in CodeTable $table"))
/**
* Given a code tree, create a code table which contains, for every character in the
* code tree, the sequence of bits representing that character.
*
* Hint: think of a recursive solution: every sub-tree of the code tree `tree` is itself
* a valid code tree that can be represented as a code table. Using the code tables of the
* sub-trees, think of how to build the code table for the entire tree.
*/
def convert(tree: CodeTree): CodeTable = {
  // `prefix` holds the bits chosen on the way down to `node`: 0 is appended when
  // stepping into the left sub-tree, 1 when stepping into the right one.
  def tableFor(node: CodeTree, prefix: List[Bit]): CodeTable = node match {
    case Leaf(symbol, _) =>
      List((symbol, prefix))
    case Fork(zero, one, _, _) =>
      val zeroSide = tableFor(zero, prefix :+ 0)
      val oneSide = tableFor(one, prefix :+ 1)
      mergeCodeTables(zeroSide, oneSide)
  }

  tableFor(tree, Nil)
}
/**
* This function takes two code tables and merges them into one. Depending on how you
* use it in the `convert` method above, this merge method might also do some transformations
* on the two parameter code tables.
*/
def mergeCodeTables(a: CodeTable, b: CodeTable): CodeTable =
  // Plain concatenation (entries of `a` first, duplicates kept). This is what
  // `a.union(b)` produced; `union` on sequences is deprecated since Scala 2.13.
  a ::: b
/**
* This function encodes `text` according to the code tree `tree`.
*
* To speed up the encoding process, it first converts the code tree to a code table
* and then uses it to perform the actual encoding.
*/
def quickEncode(tree: CodeTree)(text: List[Char]): List[Bit] = {
  // Build the code table once, then look every character up in it.
  val table = convert(tree)
  val lookup: Char => List[Bit] = codeBits(table)
  // flatMap concatenates the code words in text order in a single pass, instead
  // of re-copying the accumulated bits for every character.
  text.flatMap(ch => lookup(ch))
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment