-
-
Save SouravJohar/bcbbad0d0b7e881cd0dca3481e32381f to your computer and use it in GitHub Desktop.
| import cPickle as c | |
| import os | |
| from sklearn import * | |
| from collections import Counter | |
def load(clf_file):
    """Load a pickled classifier from disk.

    Args:
        clf_file: Path of the file the trained model was pickled into.

    Returns:
        The unpickled classifier object.
    """
    # Pickle data is binary. In text mode Python 3 tries to decode the bytes
    # (UnicodeDecodeError with cp1252 on Windows) and newline translation can
    # corrupt the stream, so the file must be opened with "rb".
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(clf_file, "rb") as fp:
        return c.load(fp)
def make_dict(direc="emails/", vocab_size=3000):
    """Build the vocabulary of the most frequent alphabetic words.

    Every file in ``direc`` is read and split on single spaces; tokens that
    are not purely alphabetic are discarded.

    Args:
        direc: Directory holding the e-mail files.
        vocab_size: Maximum number of entries to return.

    Returns:
        A list of ``(word, count)`` tuples, most frequent first, as produced
        by ``Counter.most_common``.
    """
    # Sorted so the result (including the order of ties) does not depend on
    # the arbitrary order in which os.listdir returns the names.
    paths = [os.path.join(direc, name) for name in sorted(os.listdir(direc))]
    counts = Counter()
    remaining = len(paths)
    for path in paths:
        # Close each file promptly instead of keeping one handle per e-mail.
        with open(path) as f:
            blob = f.read()
        counts.update(word for word in blob.split(" ") if word.isalpha())
        print(remaining)  # progress countdown
        remaining -= 1
    return counts.most_common(vocab_size)
# Load the trained model and rebuild the vocabulary it was trained against.
clf = load("text-classifier.mdl")
d = make_dict()
# Only the words of the (word, count) pairs are needed; extract them once
# instead of on every prompt.
vocabulary = [entry[0] for entry in d]

# Interactive loop: classify each typed message until the user types "exit".
while True:
    try:
        inp = raw_input(">").split()
    except EOFError:
        # End of input (Ctrl-D / closed stdin) ends the session like "exit".
        break
    if not inp:
        # Blank line: nothing to classify (inp[0] would raise IndexError).
        continue
    if inp[0] == "exit":
        break
    # One pass over the message instead of one list.count() per vocabulary
    # word; a Counter returns 0 for words that do not occur.
    occurrences = Counter(inp)
    features = [occurrences[word] for word in vocabulary]
    res = clf.predict([features])
    print(["Not Spam", "Spam!"][res[0]])
| import os | |
| from collections import Counter | |
| from sklearn.naive_bayes import MultinomialNB | |
| from sklearn.model_selection import train_test_split as tts | |
| from sklearn.metrics import accuracy_score | |
| import cPickle as c | |
def save(clf, name):
    """Pickle ``clf`` into the file ``name`` and report completion.

    Args:
        clf: The object to serialize (the trained classifier).
        name: Destination path; it is opened for binary writing.
    """
    with open(name, 'wb') as out:
        c.dump(clf, out)
    print("saved")
def make_dict(direc="emails/", vocab_size=3000):
    """Build the vocabulary of the most frequent alphabetic words.

    Every file in ``direc`` is read and split on single spaces; tokens that
    are not purely alphabetic are discarded.

    Args:
        direc: Directory holding the e-mail files.
        vocab_size: Maximum number of entries to return.

    Returns:
        A list of ``(word, count)`` tuples, most frequent first, as produced
        by ``Counter.most_common``.
    """
    # Sorted so the result (including the order of ties) does not depend on
    # the arbitrary order in which os.listdir returns the names.
    paths = [os.path.join(direc, name) for name in sorted(os.listdir(direc))]
    counts = Counter()
    remaining = len(paths)
    for path in paths:
        # Close each file promptly instead of keeping one handle per e-mail.
        with open(path) as f:
            blob = f.read()
        counts.update(word for word in blob.split(" ") if word.isalpha())
        print(remaining)  # progress countdown
        remaining -= 1
    return counts.most_common(vocab_size)
def make_dataset(dictionary, direc="emails/"):
    """Turn every labelled e-mail into a word-count feature vector.

    Args:
        dictionary: Sequence of ``(word, count)`` pairs such as the one
            returned by ``make_dict``; only the word of each pair is used.
        direc: Directory holding the e-mail files.

    Returns:
        A ``(feature_set, labels)`` tuple of equally long lists. Each feature
        row holds, for every dictionary word in order, how often that word
        occurs in the e-mail (split on single spaces). The label is 1 when
        the file name contains "spam" and 0 when it contains "ham"; files
        matching neither are skipped.
    """
    vocabulary = [entry[0] for entry in dictionary]
    # Sorted so row order does not depend on os.listdir's arbitrary order.
    names = sorted(os.listdir(direc))
    feature_set = []
    labels = []
    remaining = len(names)
    for name in names:
        # Decide from the file name only (not the directory part) and pick
        # exactly one label, so features and labels can never get out of
        # step. "spam" is tested first because "ham" is a common substring
        # of unrelated names (e.g. "graham.spam.txt").
        if "spam" in name:
            label = 1
        elif "ham" in name:
            label = 0
        else:
            label = None
        if label is not None:
            with open(os.path.join(direc, name)) as f:
                occurrences = Counter(f.read().split(" "))
            # A Counter yields 0 for words that do not occur in the e-mail.
            feature_set.append([occurrences[word] for word in vocabulary])
            labels.append(label)
        print(remaining)  # progress countdown
        remaining -= 1
    return feature_set, labels
# Build the vocabulary, then vectorize every e-mail against it.
d = make_dict()
features, labels = make_dataset(d)

# Keep 20% of the samples aside for evaluation.
x_train, x_test, y_train, y_test = tts(features, labels, test_size=0.2)

# Train the multinomial naive Bayes classifier on the training split.
clf = MultinomialNB()
clf.fit(x_train, y_train)

# Report accuracy on the held-out split.
preds = clf.predict(x_test)
print(accuracy_score(y_test, preds))

# Persist the trained model under the name the detector script loads.
save(clf, "text-classifier.mdl")
I am getting this error:
File "G:\ML proj\Email spam classification\detector.py", line 42, in <module>
clf = load("text_classifier.mdl")
File "G:\ML proj\Email spam classification\detector.py", line 19, in load
clf = c.load(fp)
File "C:\Users\Kalaivani\anaconda3\lib\encodings\cp1252.py", line 23, in decode
return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 52: character maps to <undefined>
How do I fix this error?
check out this:---> https://stackoverflow.com/questions/9233027/unicodedecodeerror-charmap-codec-cant-decode-byte-x-in-position-y-character
I am getting this error:
File "G:\ML proj\Email spam classification\detector.py", line 42, in <module>
clf = load("text_classifier.mdl")
File "G:\ML proj\Email spam classification\detector.py", line 19, in load
clf = c.load(fp)
File "C:\Users\Kalaivani\anaconda3\lib\encodings\cp1252.py", line 23, in decode
return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 52: character maps to <undefined>
How do I fix this error?
check this:--> https://stackoverflow.com/questions/9233027/unicodedecodeerror-charmap-codec-cant-decode-byte-x-in-position-y-character
I am getting this error:
File "G:\ML proj\Email spam classification\detector.py", line 42, in <module>
clf = load("text_classifier.mdl")
File "G:\ML proj\Email spam classification\detector.py", line 19, in load
clf = c.load(fp)
File "C:\Users\Kalaivani\anaconda3\lib\encodings\cp1252.py", line 23, in decode
return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 52: character maps to <undefined>
How do I fix this error?