Last active
November 21, 2024 09:47
-
-
Save jannesaranpaa/2b251dddd7c7de3a2a9a4b7b5a8edfb3 to your computer and use it in GitHub Desktop.
Spam Ham exercise
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Per-word occurrence counts observed in spam messages.
# NOTE(review): 0.0001 looks like a stand-in for "never observed", so that
# the derived probability is not exactly zero -- confirm.
spam <- c("million" = 156, "dollars" = 29, "adclick" = 51, "conferences" = 0.0001)

# Per-word occurrence counts observed in ham (non-spam) messages.
ham <- c("million" = 98, "dollars" = 119, "adclick" = 0.0001, "conferences" = 12)

# Denominators that turn the counts above into per-word probabilities
# (presumably the total word count of each corpus -- confirm).
spam_tot <- 95791
ham_tot <- 306438

# Presumably the prior probability of each class; both classes are weighted
# equally.
p_spam <- 0.5
p_ham <- 0.5
# Estimate P(word | ham): the word's count in `ham` divided by `ham_tot`.
# `ham` is indexed by name, so the result keeps the word as its name and is
# NA for a word that has no entry in `ham`.
prob_ham <- function(word) {
  ham[word] / ham_tot
}
# Estimate P(word | spam): the word's count in `spam` divided by `spam_tot`.
# `spam` is indexed by name, so the result keeps the word as its name and is
# NA for a word that has no entry in `spam`.
prob_spam <- function(word) {
  spam[word] / spam_tot
}
# Naive Bayes posterior probability that a message is spam.
#
# Arguments:
#   words      character vector with the words of the message.
#   prior_spam prior probability of spam (defaults to the global `p_spam`).
#   prior_ham  prior probability of ham (defaults to the global `p_ham`).
#
# Returns an unnamed numeric scalar R / (1 + R), where
#   R = prior_spam * prod(P(word | spam)) / (prior_ham * prod(P(word | ham))).
# The result is NA if any word has no entry in the count tables, and equals
# prior_spam / (prior_spam + prior_ham) for an empty message.
p_list_spam <- function(words, prior_spam = p_spam, prior_ham = p_ham) {
  # Accumulate in log space: multiplying many small probabilities directly
  # underflows to 0 on long messages and turns the ratio into 0 / 0 = NaN.
  # sum() also drops the word names that plain arithmetic would carry along.
  log_spam <- log(prior_spam) + sum(log(prob_spam(words)))
  log_ham <- log(prior_ham) + sum(log(prob_ham(words)))

  # plogis(x) = 1 / (1 + exp(-x)), which is R / (1 + R) for x = log(R).
  stats::plogis(log_spam - log_ham)
}
# Spam probability for a message that contains each of the four tracked
# words once.
res <- p_list_spam(
  c("million", "dollars", "adclick", "conferences")
)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os | |
| import math | |
# Probability returned by SpamHam.word_is_spam / word_is_ham for a word that
# is missing from the corresponding occurrence table.
SMALL_NUMBER = 1e-05
def get_occurrences(filename):
    """Read a word-occurrence table from a text file.

    Each non-blank line must hold exactly two whitespace-separated fields,
    ``<count> <word>``. Blank lines are skipped, and any run of spaces or
    tabs is accepted as the separator. If a word appears more than once,
    the last count wins.

    :param filename: path joined onto the parent of this module's directory
        (an absolute path is used as-is, per ``os.path.join``)
    :return: dictionary {word: count} with integer counts
    :raises FileNotFoundError: if the file does not exist
    :raises ValueError: if a non-blank line does not have exactly two fields
        or its count is not an integer
    """
    dir_path = os.path.dirname(os.path.realpath(__file__))
    results = {}
    try:
        with open(os.path.join(dir_path, '..', filename)) as file:
            for line in file:
                fields = line.split()
                if not fields:
                    # A blank (or trailing empty) line is not a data error.
                    continue
                count, word = fields
                results[word] = int(count)
    except FileNotFoundError:
        print("File %s was not found." % filename)
        raise
    except Exception as e:
        print("Something terrible happened: %s" % str(e))
        raise
    return results
def get_words(filename):
    """Read a text file and return its whitespace-separated words.

    :param filename: path joined onto the parent of this module's directory
        (an absolute path is used as-is, per ``os.path.join``)
    :return: list of str, in file order
    :raises FileNotFoundError: if the file does not exist
    """
    dir_path = os.path.dirname(os.path.realpath(__file__))
    try:
        with open(os.path.join(dir_path, '..', filename)) as file:
            # Splitting the whole text on whitespace yields the same words
            # as splitting line by line.
            return file.read().split()
    except FileNotFoundError:
        print("File %s was not found." % filename)
        raise
    except Exception as e:
        # Format with %, not a second print() argument, so the error text
        # replaces the placeholder instead of a literal "%s" being printed.
        print("Something terrible happened: %s" % str(e))
        raise
class SpamHam:
    """ Naive Bayes spam filter

    :attr spam: dictionary of occurrences for spam messages {word: count}
    :attr ham: dictionary of occurrences for ham messages {word: count}
    :attr spam_total: divisor that turns a spam count into a probability
    :attr ham_total: divisor that turns a ham count into a probability
    """

    def __init__(self, spam_file, ham_file):
        """
        :param spam_file: file name of the spam occurrence table
        :param ham_file: file name of the ham occurrence table
        """
        self.spam = get_occurrences(spam_file)
        self.ham = get_occurrences(ham_file)
        # Hard-coded corpus statistics. NOTE(review): presumably these
        # describe the specific data files this class was written for --
        # confirm before using it with other files.
        self.spam_unique: int = 6245
        self.spam_total: int = 75268
        self.ham_unique: int = 16207
        self.ham_total: int = 290673

    def evaluate_from_file(self, filename):
        """
        :param filename: file whose words form the message
        :return: probability that the message is spam (float)
        """
        words = get_words(filename)
        return self.evaluate(words)

    def evaluate_from_input(self):
        """
        Read one line from standard input and evaluate its words.

        :return: probability that the message is spam (float)
        """
        words = input().split()
        return self.evaluate(words)

    def word_is_spam(self, word):
        """
        :param word: str
        :return: estimated P(word | spam); SMALL_NUMBER when the word is
            missing from the spam table or its count is not positive
        """
        count = self.spam.get(word, 0)
        if count > 0:
            return count / self.spam_total
        # Never return 0: evaluate() takes the logarithm of this value.
        return SMALL_NUMBER

    def word_is_ham(self, word):
        """
        :param word: str
        :return: estimated P(word | ham); SMALL_NUMBER when the word is
            missing from the ham table or its count is not positive
        """
        count = self.ham.get(word, 0)
        if count > 0:
            return count / self.ham_total
        # Never return 0: evaluate() takes the logarithm of this value.
        return SMALL_NUMBER

    def evaluate(self, words):
        """
        :param words: Array of str
        :return: probability that the message is spam (float), in [0, 1];
            0.5 for an empty message
        """
        # Both classes start from the same prior of 0.5. Sums of logs are
        # used instead of a product of probabilities to avoid underflow.
        log_spam: float = math.log(0.5)
        log_ham: float = math.log(0.5)
        for word in words:
            log_spam += math.log(self.word_is_spam(word))
            log_ham += math.log(self.word_is_ham(word))

        # log_ratio is log(R), with R the spam-to-ham odds. The posterior
        # R / (1 + R) has to be computed from R itself, not from log(R).
        log_ratio: float = log_spam - log_ham
        if log_ratio >= 0:
            # Rewritten as 1 / (1 + 1/R) so exp() cannot overflow.
            return 1.0 / (1.0 + math.exp(-log_ratio))
        odds = math.exp(log_ratio)
        return odds / (1.0 + odds)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment