Skip to content

Instantly share code, notes, and snippets.

@jannesaranpaa
Last active November 21, 2024 09:47
Show Gist options
  • Select an option

  • Save jannesaranpaa/2b251dddd7c7de3a2a9a4b7b5a8edfb3 to your computer and use it in GitHub Desktop.

Select an option

Save jannesaranpaa/2b251dddd7c7de3a2a9a4b7b5a8edfb3 to your computer and use it in GitHub Desktop.
Spam Ham exercise
# Per-word occurrence counts observed in spam messages, keyed by word.
# NOTE(review): 0.0001 looks like a stand-in for a zero count (presumably so
# that ratios stay finite) -- confirm.
spam <- c(million = 156, dollars = 29, adclick = 51, conferences = 0.0001)

# Per-word occurrence counts observed in ham (non-spam) messages.
ham <- c(million = 98, dollars = 119, adclick = 0.0001, conferences = 12)

# Denominators used to turn the counts above into per-word frequencies.
spam_tot <- 95791
ham_tot <- 306438

# Prior class probabilities.
p_spam <- 0.5
p_ham <- 0.5
# Relative frequency of `word` in ham: its count in `ham` divided by `ham_tot`.
# A word that is not a name in `ham` yields NA.
prob_ham <- function(word) {
  occurrences <- ham[word]
  occurrences / ham_tot
}
# Relative frequency of `word` in spam: its count in `spam` divided by
# `spam_tot`. A word that is not a name in `spam` yields NA.
prob_spam <- function(word) {
  occurrences <- spam[word]
  occurrences / spam_tot
}
# Posterior probability that a message made of `words` is spam (naive Bayes).
#
# Args:
#   words:      character vector of tokens. Every token must be a name in both
#               `spam` and `ham`; otherwise the lookup yields NA and so does
#               the result.
#   prior_spam: prior probability of spam. Defaults to the script-level
#               `p_spam` rather than a duplicated literal.
#   prior_ham:  prior probability of ham. Defaults to the script-level `p_ham`.
#
# Returns:
#   P(spam | words) as a length-one numeric in [0, 1]; 0.5 with equal priors
#   and no words.
p_list_spam <- function(words, prior_spam = p_spam, prior_ham = p_ham) {
  # Accumulate the log of the spam/ham odds instead of multiplying raw
  # probabilities: the product of many tiny per-word frequencies underflows
  # to 0 for long messages, which makes the plain ratio 0/0 (NaN).
  log_odds <- log(prior_spam) - log(prior_ham)
  for (word in words) {
    log_odds <- log_odds + log(prob_spam(word)) - log(prob_ham(word))
  }
  # plogis(x) = 1 / (1 + exp(-x)), i.e. R / (1 + R) with R = exp(log_odds).
  return(plogis(log_odds))
}
# Spam probability for a message consisting of exactly these four words.
res <- p_list_spam(c("million", "dollars", "adclick", "conferences"))
import os
import math
# Fallback per-word probability returned for words that are missing from an
# occurrence table, so that math.log() never receives zero.
SMALL_NUMBER: float = 1e-5
def get_occurrences(filename):
    """Load a ``{word: count}`` table from a text file.

    Every non-blank line must consist of exactly two whitespace-separated
    fields, ``<count> <word>``. Blank lines are skipped, and any run of
    spaces or tabs is accepted as the separator (the previous
    ``split(' ')`` raised ``ValueError`` on both).

    The path is resolved against the parent of the directory that contains
    this module; an absolute ``filename`` is used as given.

    :param filename: name (or absolute path) of the occurrence file
    :return: dictionary mapping each word to its integer count
    :raises FileNotFoundError: if the file does not exist
    :raises ValueError: if a non-blank line does not have exactly two fields
        or its count is not an integer
    """
    dir_path = os.path.dirname(os.path.realpath(__file__))
    results = {}
    try:
        with open(os.path.join(dir_path, '..', filename)) as file:
            for line in file:
                fields = line.split()
                if not fields:
                    # Tolerate empty / whitespace-only lines (e.g. a
                    # trailing newline at the end of the file).
                    continue
                count, word = fields
                results[word] = int(count)
        return results
    except FileNotFoundError:
        print("File %s was not found." % filename)
        raise
    except Exception as e:
        # Report and re-raise so callers still see the original error.
        print("Something terrible happened: %s" % str(e))
        raise
def get_words(filename):
    """Read a text file and return all of its whitespace-separated words.

    The path is resolved against the parent of the directory that contains
    this module; an absolute ``filename`` is used as given.

    :param filename: name (or absolute path) of the file to read
    :return: list of words in file order
    :raises FileNotFoundError: if the file does not exist
    """
    dir_path = os.path.dirname(os.path.realpath(__file__))
    try:
        with open(os.path.join(dir_path, '..', filename)) as file:
            return [word for line in file for word in line.split()]
    except FileNotFoundError:
        print("File %s was not found." % filename)
        raise
    except Exception as e:
        # Format with ``%`` (not a second print argument) so the error text
        # replaces the placeholder instead of printing a literal "%s".
        print("Something terrible happened: %s" % str(e))
        raise
class SpamHam:
    """Naive Bayes spam filter.

    :attr spam: dictionary of occurrences for spam messages {word: count}
    :attr ham: dictionary of occurrences for ham messages {word: count}
    :attr spam_total: denominator used for per-word spam frequencies
    :attr ham_total: denominator used for per-word ham frequencies
    :attr spam_unique: hard-coded corpus statistic, not used by this class
    :attr ham_unique: hard-coded corpus statistic, not used by this class
    """

    def __init__(self, spam_file, ham_file):
        """Load both occurrence tables.

        :param spam_file: occurrence file for spam, read by get_occurrences
        :param ham_file: occurrence file for ham, read by get_occurrences
        """
        self.spam = get_occurrences(spam_file)
        self.ham = get_occurrences(ham_file)
        # NOTE(review): these statistics are hard-coded and presumably
        # describe the author's original data files -- confirm they match
        # whatever files are actually loaded above.
        self.spam_unique: int = 6245
        self.spam_total: int = 75268
        self.ham_unique: int = 16207
        self.ham_total: int = 290673

    def evaluate_from_file(self, filename):
        """Classify the message stored in ``filename``.

        :param filename: file whose words are read with get_words
        :return: probability that the message is spam (float)
        """
        words = get_words(filename)
        return self.evaluate(words)

    def evaluate_from_input(self):
        """Classify one line read from standard input.

        :return: probability that the message is spam (float)
        """
        words = input().split()
        return self.evaluate(words)

    def word_is_spam(self, word):
        """Return the frequency of ``word`` in spam.

        :param word: token to look up
        :return: ``count / spam_total``, or SMALL_NUMBER when the word is
            unknown or its count is not positive (keeps ``math.log`` defined)
        """
        count = self.spam.get(word, 0)
        if count > 0:
            return count / self.spam_total
        return SMALL_NUMBER

    def word_is_ham(self, word):
        """Return the frequency of ``word`` in ham.

        :param word: token to look up
        :return: ``count / ham_total``, or SMALL_NUMBER when the word is
            unknown or its count is not positive (keeps ``math.log`` defined)
        """
        count = self.ham.get(word, 0)
        if count > 0:
            return count / self.ham_total
        return SMALL_NUMBER

    def evaluate(self, words):
        """Compute the posterior spam probability of a message.

        :param words: Array of str
        :return: probability that the message is spam (float in [0, 1]);
            0.5 for an empty message because both priors are 0.5
        """
        p_spam: float = math.log(0.5)
        p_ham: float = math.log(0.5)
        # Sum logs instead of multiplying probabilities to avoid underflow.
        # ``log_odds`` is log(P(spam, words) / P(ham, words)).
        log_odds: float = p_spam - p_ham
        for word in words:
            log_odds += (math.log(self.word_is_spam(word))
                         - math.log(self.word_is_ham(word)))
        # The accumulated value is a LOG ratio, so it must go through the
        # logistic function; ``ratio / (1 + ratio)`` is only valid for the
        # plain odds ratio. Both branches keep math.exp from overflowing.
        if log_odds >= 0:
            return 1.0 / (1.0 + math.exp(-log_odds))
        odds = math.exp(log_odds)
        return odds / (1.0 + odds)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment