jannesaranpaa · November 21, 2024 09:47
diff --git a/spam_ham.r b/spam_ham.r
 # Per-word occurrence counts taken from the spam corpus. The 0.0001 entry is
 # presumably a stand-in for a zero count so that ratios built from it stay
 # non-zero -- TODO confirm.
 spam <- c(million = 156, dollars = 29, adclick = 51, conferences = 0.0001)

 # Per-word occurrence counts taken from the ham corpus (same caveat for the
 # 0.0001 entry).
 ham <- c(million = 98, dollars = 119, adclick = 0.0001, conferences = 12)

 # Denominators that turn a count into a per-class word frequency.
 spam_tot <- 95791
 ham_tot <- 306438

 # Prior probability of each class.
 p_spam <- 0.5
 p_ham <- 0.5

 # Relative frequency of `word` in the ham corpus: its entry in the global
 # `ham` table divided by `ham_tot`. Indexing with `[` keeps the word as the
 # element name and yields NA for a word that is missing from `ham`.
 prob_ham <- function(word) {
   count <- ham[word]
   count / ham_tot
 }

 # Relative frequency of `word` in the spam corpus: its entry in the global
 # `spam` table divided by `spam_tot`. Indexing with `[` keeps the word as the
 # element name and yields NA for a word that is missing from `spam`.
 prob_spam <- function(word) {
   count <- spam[word]
   count / spam_tot
 }

 # Posterior probability that a message is spam under a naive Bayes model.
 #
 # Each class prior is combined with the per-word frequencies returned by
 # prob_spam() / prob_ham(); the spam:ham odds R are then mapped to the
 # probability R / (1 + R).
 #
 # The accumulation is done in log space: multiplying many small frequencies
 # directly underflows to 0 for long messages, which turns the odds into
 # 0 / 0 (NaN).
 #
 # Arguments:
 #   words       Character vector holding the words of the message.
 #   prior_spam  Prior probability of spam (defaults to 0.5).
 #   prior_ham   Prior probability of ham (defaults to 0.5).
 #
 # Returns a single unnamed number in [0, 1]. An empty `words` yields the
 # probability implied by the priors alone; a word for which the lookups give
 # NA makes the result NA.
 p_list_spam <- function(words, prior_spam = 0.5, prior_ham = 0.5) {
   log_odds <- log(prior_spam) - log(prior_ham)

   for (word in words) {
     log_odds <- log_odds + log(prob_spam(word)) - log(prob_ham(word))
   }

   # Logistic form of R / (1 + R) with R = exp(log_odds); it stays finite even
   # when exp() saturates to Inf or 0. unname() drops the element name that
   # would otherwise leak in from the first word's lookup.
   unname(1 / (1 + exp(-log_odds)))
 }

 # Score the four-word example message; `res` holds the value returned by
 # p_list_spam().
 res <- p_list_spam(
   c("million", "dollars", "adclick", "conferences")
 )
diff --git a/spamham.py b/spamham.py
 import os
 import math

 # Fallback per-word probability: the classifier's lookups return it for words
 # missing from a class's table, so math.log() never receives zero for them.
 SMALL_NUMBER = 1e-05


 def get_occurrences(filename):
     """Load a ``{word: count}`` table from a count file.

     Every non-blank line must have the form ``<count> <word>``; the two fields
     may be separated (and surrounded) by any amount of whitespace. The path is
     resolved against the parent of the directory containing this module; an
     absolute ``filename`` is used as-is (``os.path.join`` semantics).

     :param filename: path of the occurrence file
     :return: dict mapping each word to its integer count
     :raises FileNotFoundError: if the file does not exist (reported, then
         re-raised)
     :raises ValueError: if a non-blank line does not consist of exactly an
         integer count and a word (reported, then re-raised)
     """
     results = {}
     dir_path = os.path.dirname(os.path.realpath(__file__))

     try:
         with open(os.path.join(dir_path, '..', filename)) as file:
             for line in file:
                 # split() without a separator tolerates tabs and runs of
                 # spaces, and returns [] for a blank line.
                 fields = line.split()
                 if not fields:
                     # Blank lines carry no data; skipping them avoids a
                     # spurious unpacking error.
                     continue
                 count, word = fields
                 results[word] = int(count)

         return results

     except FileNotFoundError:
         print("File %s was not found." % filename)
         raise
     except Exception as e:
         print("Something terrible happened: %s" % str(e))
         raise


 def get_words(filename):
     """Read a text file and return its whitespace-separated words in order.

     The path is resolved against the parent of the directory containing this
     module; an absolute ``filename`` is used as-is (``os.path.join``
     semantics).

     :param filename: path of the text file
     :return: list of str, one entry per word
     :raises FileNotFoundError: if the file does not exist (reported, then
         re-raised)
     """
     dir_path = os.path.dirname(os.path.realpath(__file__))

     try:
         with open(os.path.join(dir_path, '..', filename)) as file:
             # split() with no separator breaks on any whitespace, newlines
             # included, so the whole file can be tokenised in one call.
             words = file.read().split()

         return words

     except FileNotFoundError:
         print("File %s was not found." % filename)
         raise
     except Exception as e:
         # Interpolate with %; passing str(e) as a second print() argument
         # would leave a literal "%s" in the output.
         print("Something terrible happened: %s" % str(e))
         raise


 class SpamHam:
     """ Naive Bayes spam filter
         :attr spam: dictionary of occurrences for spam messages {word: count}
         :attr ham: dictionary of occurrences for ham messages {word: count}
         :attr spam_total: denominator used for spam word frequencies
         :attr ham_total: denominator used for ham word frequencies
         :attr spam_unique, ham_unique: stored but never read by this class;
             presumably the vocabulary sizes of the two corpora (TODO confirm)
     """

     def __init__(self, spam_file, ham_file):
         """Load both occurrence tables with get_occurrences().

         :param spam_file: occurrence file for spam messages
         :param ham_file: occurrence file for ham messages
         """
         self.spam = get_occurrences(spam_file)
         self.ham = get_occurrences(ham_file)

         # Corpus statistics are fixed constants, not derived from the files.
         self.spam_unique: int = 6245
         self.spam_total: int = 75268
         self.ham_unique: int = 16207
         self.ham_total: int = 290673

     def evaluate_from_file(self, filename):
         """Classify the words that get_words() reads from ``filename``."""
         words = get_words(filename)
         return self.evaluate(words)

     def evaluate_from_input(self):
         """Classify the words of one line read from standard input."""
         words = input().split()
         return self.evaluate(words)

     def word_is_spam(self, word):
         """Return the frequency of ``word`` among spam messages.

         Words that are missing from the table, or listed with a non-positive
         count, get SMALL_NUMBER so the result is always a valid argument for
         math.log().
         """
         count = self.spam.get(word, 0)
         if count > 0:
             return count / self.spam_total

         return SMALL_NUMBER

     def word_is_ham(self, word):
         """Return the frequency of ``word`` among ham messages.

         Words that are missing from the table, or listed with a non-positive
         count, get SMALL_NUMBER so the result is always a valid argument for
         math.log().
         """
         count = self.ham.get(word, 0)
         if count > 0:
             return count / self.ham_total

         return SMALL_NUMBER

     def evaluate(self, words):
         """
         :param words: Array of str
         :return: probability that the message is spam (float)
         """
         # Both classes start from the same prior of 0.5. Sums of logarithms
         # replace the product of probabilities so long messages do not
         # underflow.
         log_spam: float = math.log(0.5)
         log_ham: float = math.log(0.5)

         for word in words:
             log_spam += math.log(self.word_is_spam(word))
             log_ham += math.log(self.word_is_ham(word))

         log_odds: float = log_spam - log_ham

         # P(spam) = R / (1 + R) where R = exp(log_odds) is the spam:ham odds.
         # The difference of logs must be exponentiated before that formula
         # applies. Only exp() of a non-positive number is ever taken, so it
         # cannot raise OverflowError.
         if log_odds >= 0:
             return 1.0 / (1.0 + math.exp(-log_odds))

         odds = math.exp(log_odds)
         return odds / (1.0 + odds)
	# Per-word occurrence counts taken from the spam corpus. The 0.0001 entry is
	# presumably a stand-in for a zero count so that ratios built from it stay
	# non-zero -- TODO confirm.
	spam <- c(million = 156, dollars = 29, adclick = 51, conferences = 0.0001)

	# Per-word occurrence counts taken from the ham corpus (same caveat for the
	# 0.0001 entry).
	ham <- c(million = 98, dollars = 119, adclick = 0.0001, conferences = 12)

	# Denominators that turn a count into a per-class word frequency.
	spam_tot <- 95791
	ham_tot <- 306438

	# Prior probability of each class.
	p_spam <- 0.5
	p_ham <- 0.5

	# Relative frequency of `word` in the ham corpus: its entry in the global
	# `ham` table divided by `ham_tot`. Indexing with `[` keeps the word as the
	# element name and yields NA for a word that is missing from `ham`.
	prob_ham <- function(word) {
	  count <- ham[word]
	  count / ham_tot
	}

	# Relative frequency of `word` in the spam corpus: its entry in the global
	# `spam` table divided by `spam_tot`. Indexing with `[` keeps the word as the
	# element name and yields NA for a word that is missing from `spam`.
	prob_spam <- function(word) {
	  count <- spam[word]
	  count / spam_tot
	}

	# Posterior probability that a message is spam under a naive Bayes model.
	#
	# Each class prior is combined with the per-word frequencies returned by
	# prob_spam() / prob_ham(); the spam:ham odds R are then mapped to the
	# probability R / (1 + R).
	#
	# The accumulation is done in log space: multiplying many small frequencies
	# directly underflows to 0 for long messages, which turns the odds into
	# 0 / 0 (NaN).
	#
	# Arguments:
	#   words       Character vector holding the words of the message.
	#   prior_spam  Prior probability of spam (defaults to 0.5).
	#   prior_ham   Prior probability of ham (defaults to 0.5).
	#
	# Returns a single unnamed number in [0, 1]. An empty `words` yields the
	# probability implied by the priors alone; a word for which the lookups give
	# NA makes the result NA.
	p_list_spam <- function(words, prior_spam = 0.5, prior_ham = 0.5) {
	  log_odds <- log(prior_spam) - log(prior_ham)

	  for (word in words) {
	    log_odds <- log_odds + log(prob_spam(word)) - log(prob_ham(word))
	  }

	  # Logistic form of R / (1 + R) with R = exp(log_odds); it stays finite even
	  # when exp() saturates to Inf or 0. unname() drops the element name that
	  # would otherwise leak in from the first word's lookup.
	  unname(1 / (1 + exp(-log_odds)))
	}

	# Score the four-word example message; `res` holds the value returned by
	# p_list_spam().
	res <- p_list_spam(
	  c("million", "dollars", "adclick", "conferences")
	)
	import os
	import math

	# Fallback per-word probability: the classifier's lookups return it for words
	# missing from a class's table, so math.log() never receives zero for them.
	SMALL_NUMBER = 1e-05


	def get_occurrences(filename):
	    """Load a ``{word: count}`` table from a count file.

	    Every non-blank line must have the form ``<count> <word>``; the two fields
	    may be separated (and surrounded) by any amount of whitespace. The path is
	    resolved against the parent of the directory containing this module; an
	    absolute ``filename`` is used as-is (``os.path.join`` semantics).

	    :param filename: path of the occurrence file
	    :return: dict mapping each word to its integer count
	    :raises FileNotFoundError: if the file does not exist (reported, then
	        re-raised)
	    :raises ValueError: if a non-blank line does not consist of exactly an
	        integer count and a word (reported, then re-raised)
	    """
	    results = {}
	    dir_path = os.path.dirname(os.path.realpath(__file__))

	    try:
	        with open(os.path.join(dir_path, '..', filename)) as file:
	            for line in file:
	                # split() without a separator tolerates tabs and runs of
	                # spaces, and returns [] for a blank line.
	                fields = line.split()
	                if not fields:
	                    # Blank lines carry no data; skipping them avoids a
	                    # spurious unpacking error.
	                    continue
	                count, word = fields
	                results[word] = int(count)

	        return results

	    except FileNotFoundError:
	        print("File %s was not found." % filename)
	        raise
	    except Exception as e:
	        print("Something terrible happened: %s" % str(e))
	        raise


	def get_words(filename):
	    """Read a text file and return its whitespace-separated words in order.

	    The path is resolved against the parent of the directory containing this
	    module; an absolute ``filename`` is used as-is (``os.path.join``
	    semantics).

	    :param filename: path of the text file
	    :return: list of str, one entry per word
	    :raises FileNotFoundError: if the file does not exist (reported, then
	        re-raised)
	    """
	    dir_path = os.path.dirname(os.path.realpath(__file__))

	    try:
	        with open(os.path.join(dir_path, '..', filename)) as file:
	            # split() with no separator breaks on any whitespace, newlines
	            # included, so the whole file can be tokenised in one call.
	            words = file.read().split()

	        return words

	    except FileNotFoundError:
	        print("File %s was not found." % filename)
	        raise
	    except Exception as e:
	        # Interpolate with %; passing str(e) as a second print() argument
	        # would leave a literal "%s" in the output.
	        print("Something terrible happened: %s" % str(e))
	        raise


	class SpamHam:
	    """ Naive Bayes spam filter
	        :attr spam: dictionary of occurrences for spam messages {word: count}
	        :attr ham: dictionary of occurrences for ham messages {word: count}
	        :attr spam_total: denominator used for spam word frequencies
	        :attr ham_total: denominator used for ham word frequencies
	        :attr spam_unique, ham_unique: stored but never read by this class;
	            presumably the vocabulary sizes of the two corpora (TODO confirm)
	    """

	    def __init__(self, spam_file, ham_file):
	        """Load both occurrence tables with get_occurrences().

	        :param spam_file: occurrence file for spam messages
	        :param ham_file: occurrence file for ham messages
	        """
	        self.spam = get_occurrences(spam_file)
	        self.ham = get_occurrences(ham_file)

	        # Corpus statistics are fixed constants, not derived from the files.
	        self.spam_unique: int = 6245
	        self.spam_total: int = 75268
	        self.ham_unique: int = 16207
	        self.ham_total: int = 290673

	    def evaluate_from_file(self, filename):
	        """Classify the words that get_words() reads from ``filename``."""
	        words = get_words(filename)
	        return self.evaluate(words)

	    def evaluate_from_input(self):
	        """Classify the words of one line read from standard input."""
	        words = input().split()
	        return self.evaluate(words)

	    def word_is_spam(self, word):
	        """Return the frequency of ``word`` among spam messages.

	        Words that are missing from the table, or listed with a non-positive
	        count, get SMALL_NUMBER so the result is always a valid argument for
	        math.log().
	        """
	        count = self.spam.get(word, 0)
	        if count > 0:
	            return count / self.spam_total

	        return SMALL_NUMBER

	    def word_is_ham(self, word):
	        """Return the frequency of ``word`` among ham messages.

	        Words that are missing from the table, or listed with a non-positive
	        count, get SMALL_NUMBER so the result is always a valid argument for
	        math.log().
	        """
	        count = self.ham.get(word, 0)
	        if count > 0:
	            return count / self.ham_total

	        return SMALL_NUMBER

	    def evaluate(self, words):
	        """
	        :param words: Array of str
	        :return: probability that the message is spam (float)
	        """
	        # Both classes start from the same prior of 0.5. Sums of logarithms
	        # replace the product of probabilities so long messages do not
	        # underflow.
	        log_spam: float = math.log(0.5)
	        log_ham: float = math.log(0.5)

	        for word in words:
	            log_spam += math.log(self.word_is_spam(word))
	            log_ham += math.log(self.word_is_ham(word))

	        log_odds: float = log_spam - log_ham

	        # P(spam) = R / (1 + R) where R = exp(log_odds) is the spam:ham odds.
	        # The difference of logs must be exponentiated before that formula
	        # applies. Only exp() of a non-positive number is ever taken, so it
	        # cannot raise OverflowError.
	        if log_odds >= 0:
	            return 1.0 / (1.0 + math.exp(-log_odds))

	        odds = math.exp(log_odds)
	        return odds / (1.0 + odds)