Final Tiny Stemmer
def stem(w):
    vowels = "aeiou"
    # Plural normalization
    if w.endswith("ies") and len(w) > 4:
        return w[:-3] + "y"
    if w.endswith("ves") and len(w) > 4:
        return w[:-3] + "f"
    # Long suffixes (Porter-inspired)
    for suf in (
        "ization", "ational", "fulness", "ousness", "iveness", "tional", "biliti",
        "lessli", "entli", "ation", "alism", "aliti", "ousli", "iviti", "fulli",
        "enci", "anci", "abli", "izer", "ator", "alli", "bli", "ogi", "li"
    ):
        if w.endswith(suf) and len(w) > len(suf) + 2:
            w = w[:-len(suf)]
            break
    # Common short suffixes
    for suf in ("ing", "ed", "ly", "es", "er", "al", "ic", "en"):
        if w.endswith(suf) and len(w) > len(suf) + 2:
            w = w[:-len(suf)]
            break
    # Trailing 's' (safe removal)
    if w.endswith("s") and len(w) > 3 and w[-2] not in vowels:
        w = w[:-1]
    # Double consonant (e.g., "hopping" → "hop")
    if len(w) >= 3 and w[-1] == w[-2] and w[-1] not in vowels:
        w = w[:-1]
    return w
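
For a quick sanity check (illustrative calls, not part of the gist), the stemmer maps:

    stem("flies")            # -> "fly"    (plural normalization)
    stem("wolves")           # -> "wolf"   (plural normalization)
    stem("hopping")          # -> "hop"    ("ing" stripped, then double consonant collapsed)
    stem("nationalization")  # -> "nation" ("ization" stripped, then "al")
    stem("cats")             # -> "cat"    (safe trailing "s" removal)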
"""
Load stop words
Read multi-line records (blank-line-separated, last token = class label)
Tokenize + normalize
Remove stop words
Apply lightweight stemming
Compute TF-IDF
Pick top 100 terms by TF-IDF
Rank those 100 by Information Gain
"""
import math, sys
from collections import defaultdict, Counter
# Load stopwords (one word per line, case-insensitive)
with open(sys.argv[1]) as f:
    stop = set(w.strip().lower() for w in f if w.strip())
# Read multi-line records (records separated by blank lines;
# the last whitespace-separated token of a record is its class label)
docs, classes = [], []
with open(sys.argv[2]) as f:
    rec = []
    for line in f:
        if line.strip() == "":
            if rec:
                docs.append("\n".join(rec))
                classes.append(rec[-1].split()[-1])
                rec = []
        else:
            rec.append(line.strip())
    if rec:
        docs.append("\n".join(rec))
        classes.append(rec[-1].split()[-1])
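# For illustration, a record file shaped like this (hypothetical data) yields
# docs == [record1_text, record2_text] and classes == ["spam", "ham"]:
#
#   cheap meds online
#   buy now, reply to claim spam
#
#   meeting moved to noon
#   see you in room B ham
#
# i.e. blank-line-separated records whose last whitespace-separated token
# is taken as the class label.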
# Light stemmer: plural handling, suffix stripping, double-consonant collapse
def stem(w):
    vowels = "aeiou"
    if w.endswith("ies") and len(w) > 4: return w[:-3] + "y"
    if w.endswith("ves") and len(w) > 4: return w[:-3] + "f"
    for suf in (
        "ization", "ational", "fulness", "ousness", "iveness", "tional", "biliti",
        "lessli", "entli", "ation", "alism", "aliti", "ousli", "iviti", "fulli",
        "enci", "anci", "abli", "izer", "ator", "alli", "bli", "ogi", "li"
    ):
        if w.endswith(suf) and len(w) > len(suf) + 2:
            w = w[:-len(suf)]
            break
    for suf in ("ing", "ed", "ly", "es", "er", "al", "ic", "en"):
        if w.endswith(suf) and len(w) > len(suf) + 2:
            w = w[:-len(suf)]
            break
    if w.endswith("s") and len(w) > 3 and w[-2] not in vowels:
        w = w[:-1]
    if len(w) >= 3 and w[-1] == w[-2] and w[-1] not in vowels:
        w = w[:-1]
    return w
# Tokenization and filtering: lowercase, alphabetic-only, length > 2, not a stopword
def tokenize(text):
    return [stem(w) for w in text.lower().split()
            if w.isalpha() and len(w) > 2 and w not in stop]
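# e.g. tokenize("The wolves were hopping!") drops "hopping!" (fails isalpha()
# because of the "!") and, assuming "the" and "were" are in the stopword file,
# returns ["wolf"].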
# Build term frequencies, document frequencies, and per-class counts
tf, df = [], Counter()
tf_class = defaultdict(Counter)  # tf_class[word][label] = #docs of that label containing word
class_counts = Counter(classes)
for text, label in zip(docs, classes):
    tokens = tokenize(text)
    counts = Counter(tokens)
    tf.append(counts)
    for word in counts:
        df[word] += 1
        tf_class[word][label] += 1
N = len(docs)
# Compute TF-IDF (raw counts, no per-document length normalization;
# see the discussion in the comment below)
tfidf = {w: sum(counts[w] * math.log(N / df[w]) for counts in tf if w in counts)
         for w in df}
# Top 100 terms by TF-IDF
top = sorted(tfidf, key=tfidf.get, reverse=True)[:100]
# Info gain over the binary split: does the doc contain w?
H = -sum((p := class_counts[k] / N) * math.log(p) for k in class_counts)
ig = {}
for w in top:
    yes = sum(tf_class[w][k] for k in class_counts)  # docs containing w
    no = N - yes + 1e-9                              # docs without w (avoid div/0)
    py, pn = yes / N, no / N
    Hy = -sum((tf_class[w][k] / yes) * math.log(tf_class[w][k] / yes)
              for k in class_counts if tf_class[w][k])
    Hn = -sum(((class_counts[k] - tf_class[w][k]) / no) * math.log((class_counts[k] - tf_class[w][k]) / no)
              for k in class_counts if class_counts[k] > tf_class[w][k])
    ig[w] = H - (py * Hy + pn * Hn)
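# For reference, the quantity computed above is the standard information gain
# over the presence/absence split:
#   H     = -sum_k p(k) * log p(k)
#   IG(w) =  H - [ p(w) * H(class | w present) + p(~w) * H(class | w absent) ]
# with p(w) = (#docs containing w) / N, all logs natural.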
# Output, sorted by information gain
for w in sorted(top, key=ig.get, reverse=True):
    print(f"{w:20s} tfidf={tfidf[w]:8.2f} ig={ig[w]:.3f}")
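
To run it (the file names here are hypothetical; the script just takes two positional arguments):

    python rank_terms.py stopwords.txt records.txt

where sys.argv[1] is a stopword file (one word per line) and sys.argv[2] is the blank-line-separated record file. It prints the top-100 TF-IDF terms re-ranked by information gain.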
timm commented May 5, 2025

I think the tf-idf is wrong.

Not:

    tfidf = {w: sum(counts[w] * math.log(N / df[w]) for counts in tf if w in counts)
             for w in df}

but should it be something like this, with each doc's counts divided by that doc's total length (the total has to be computed per document, i.e. inside the outer sum):

    tfidf = {w: sum(counts[w] / sum(counts.values()) * math.log(N / df[w])
                    for counts in tf if w in counts)
             for w in df}

But maybe the unnormalized version is OK if you want to rank words globally, not score per-document relevance.

Because:

  • You're summing contributions across all docs.
  • Within a single doc the length denominator is constant, so it can't change that doc's internal ranking.
  • But across many docs, the variation in doc length can matter.
  • So the denominator is not a constant across all terms: long docs contribute more to a term's global score. (A small numeric check is sketched below.)
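
A minimal sketch of the effect (hypothetical numbers, not from the gist): two docs each contain a term 10 times, but one doc is 50× longer.

    import math

    # Hypothetical: 100 docs total, the term appears in 5 of them.
    idf = math.log(100 / 5)

    # Two docs, 10 occurrences each; one is 20 words long, the other 1000.
    raw  = (10 + 10) * idf             # unnormalized sum: both docs weigh equally
    norm = (10/20 + 10/1000) * idf     # per-doc normalized: the dense short doc dominates
    print(f"{raw:.2f} vs {norm:.2f}")  # 59.91 vs 1.53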
