Last active
November 2, 2016 11:26
-
-
Save pceccato/7f5bfd75b12276222448 to your computer and use it in GitHub Desktop.
Caesar Cypher Solver in Haskell using trigram probabilities
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| -- caesarsolver.hs | |
| -- this is part one of the optional programming assignment for the | |
| -- Stanford Univerity Introduction to Artificial Intelligence class | |
| -- it solves a rotating or caesar cypher by using the probabilities | |
| -- of three letter trigrams for the english language. | |
| -- All 26 rotations are generated, scored and the top 3 most likely | |
| -- candidates are displayed. Apologies for the Haskell, it's my first program | |
| -- | |
| -- source for trigram data is here: http://home.ccil.org/~cowan/trigrams | |
| -- the trigram data is read from input | |
| -- PAC 9th January, 2012 | |
| import Data.Char (ord, chr, isUpper, isLower, toLower, isLetter, toUpper) | |
| import Data.List (sortBy) | |
| import Data.Ord (comparing) | |
| import qualified Data.Map as M | |
| import System.IO | |
| import System.Environment | |
| import Control.Monad | |
| -- define a datatype for our map of trigram frequencies for clarity | |
| type TrigramFrequencies = M.Map String Int | |
| -- datatype for a scored string | |
| type ScoredString = ( Double, String ) | |
| -- the code to break | |
| defaultEncodedMsg = "Esp qtcde nzyqpcpynp zy esp ezatn zq Lcetqtntlw Tyepwwtrpynp hld spwo le Olcexzfes Nzwwprp ty estd jplc." | |
| -- Shifts a character to the right if positive, left if negative. Wraps around. | |
| shift :: Int -> Char -> Char -- Modulus handles the wraparound(shift 1 'z' = 'a') | |
| shift n c | isUpper c = chr $ ord 'A' + ((ord c + n - ord 'A') `mod` 26) | |
| | isLower c = chr $ ord 'a' + ((ord c + n - ord 'a') `mod` 26) | |
| | otherwise = c | |
| -- shifts input string by n | |
| shiftString:: Int -> String -> String | |
| shiftString n = map (shift n) | |
| --parse a line from the trigams data file into a tuple of trigram and frequency per 10000 words | |
| parseTrigram:: String -> [ (String,Int) ] | |
| parseTrigram contents = [ lineToTuple( words x ) | x <- lines(contents), head x /= ';' ] | |
| where lineToTuple [ trigram, frequency ] = ( trigram, read frequency :: Int) | |
| -- loads the trigram frequency data into a Map for fast lookup | |
| -- source for trigram data is here: http://home.ccil.org/~cowan/trigrams | |
| -- we read it in from stdin | |
| -- returns an IO action yielding a Map containing a trigram and it's frequency per 10000 words | |
| loadTrigrams:: IO (TrigramFrequencies) | |
| loadTrigrams = fmap (M.fromList . parseTrigram) getContents | |
| -- preprocess our string for trigram lookup. Converts all non letters to # and the rest to uppercase | |
| preprocessText:: String -> String | |
| preprocessText text = map processChar text | |
| where processChar c = if isLetter c then toUpper c else '#' | |
| -- split a string into character trigrams | |
| splitIntoTrigrams:: String -> [String] | |
| splitIntoTrigrams str = [ take 3 $ drop x str | x <- [ 0 .. (length str - 3) ] ] | |
| -- lookup the trigram frequencies per 10000 words. If not found it will return 0 | |
| lookupTrigramFrequency:: TrigramFrequencies -> String -> Int | |
| lookupTrigramFrequency trigramFrequencies trigram = M.findWithDefault 0 trigram trigramFrequencies | |
| -- converts a letter frequency into a probability using laplace smoothing so we don't end up with | |
| -- trigrams that were not found borking our calculations (due to multiplying by 0) | |
| laplaceSmoothedProbability:: Int -> Int -> Int -> Double | |
| laplaceSmoothedProbability frequency totalCategories k = (fromIntegral(frequency) + fromIntegral(k))/(10000.0 + fromIntegral(k)*fromIntegral(totalCategories)) | |
| -- returns the log of the probability for a trigram. We use logs so we don't endup dealing with | |
| -- really small numbers caused by multiplying small numbers. We can just sum the logs instead. | |
| calcLogOfSmoothedProbabilityForTrigram:: TrigramFrequencies -> String -> Double | |
| calcLogOfSmoothedProbabilityForTrigram trigramFrequencies trigram = log ( laplaceSmoothedProbability (lookupTrigramFrequency trigramFrequencies trigram) (M.size trigramFrequencies) 1 ) | |
| -- calculates the score for a string based on the probabilities of all the trigrams it contains | |
| scoreString:: TrigramFrequencies -> String -> ScoredString | |
| scoreString trigramFrequencies str = ( sum $ map ( calcLogOfSmoothedProbabilityForTrigram trigramFrequencies ) ( splitIntoTrigrams $ preprocessText str ) , str ) | |
| -- calculates the scores for all 26 rotations of the input string and returns them in descending order of score | |
| scoreAll:: TrigramFrequencies -> String -> [ ScoredString ] | |
| scoreAll trigramFrequencies str = reverse $ sortBy (comparing fst) $ map (scoreString trigramFrequencies) [ shiftString n str | n <- [1 .. 26] ] | |
| -- displays the string along with its score | |
| displayScoredString:: (Double, String ) -> IO () | |
| displayScoredString (score, str) = do | |
| putStrLn str | |
| putStr "score: " | |
| putStrLn $ show score | |
| putStrLn "" | |
| -- parse arguments, if no supplied use default | |
| parseArgs:: [String] -> String | |
| parseArgs args = if null args then defaultEncodedMsg else head args | |
| -- entry point | |
| main = do | |
| trigramFrequencies <- loadTrigrams | |
| args <- getArgs | |
| let msg = parseArgs args | |
| putStrLn "Caesar Cypher Solver (ai-class)\n" | |
| putStrLn "encoded message:" | |
| putStrLn msg | |
| putStrLn "\nThe top 3 candidates based on trigram letter probabilities for english language are:\n" | |
| let top3 = take 3 $ scoreAll trigramFrequencies msg | |
| mapM_ displayScoredString top3 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment