Final Tiny Stemmer
def stem(w):
    vowels = "aeiou"
    # Plural normalization
    if w.endswith("ies") and len(w) > 4:
        return w[:-3] + "y"
    if w.endswith("ves") and len(w) > 4:
        return w[:-3] + "f"
    # Long suffixes (Porter-inspired)
    for suf in (
        "ization", "ational", "fulness", "ousness", "iveness", "tional", "biliti",
        "lessli", "entli", "ation", "alism", "aliti", "ousli", "iviti", "fulli",
        "enci", "anci", "abli", "izer", "ator", "alli", "bli", "ogi", "li"
    ):
        if w.endswith(suf) and len(w) > len(suf) + 2:
            w = w[:-len(suf)]
            break
    # Common short suffixes
    for suf in ("ing", "ed", "ly", "es", "er", "al", "ic", "en"):
        if w.endswith(suf) and len(w) > len(suf) + 2:
            w = w[:-len(suf)]
            break
    # Trailing 's' (safe removal)
    if w.endswith("s") and len(w) > 3 and w[-2] not in vowels:
        w = w[:-1]
    # Double consonant (e.g., "hopping" → "hop")
    if len(w) >= 3 and w[-1] == w[-2] and w[-1] not in vowels:
        w = w[:-1]
    return w
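A quick sanity check of the rules above; the expected outputs are traced by hand from the code, not taken from a documented test suite:

for w in ["parties", "wolves", "nationalization", "running", "cats"]:
    print(w, "->", stem(w))
# parties -> party            ("ies" -> "y")
# wolves -> wolf              ("ves" -> "f")
# nationalization -> nation   ("ization" stripped, then "al")
# running -> run              ("ing" stripped, double consonant collapsed)
# cats -> cat                 (trailing "s" after a consonant)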
| """ | |
| Load stop words | |
| Read multi-line records (blank-line-separated, last token = class label) | |
| Tokenize + normalize | |
| Remove stop words | |
| Apply lightweight stemming | |
| Compute TF-IDF | |
| Pick top 100 terms by TF-IDF | |
| Rank those 100 by Information Gain | |
| """ | |
| import math, sys | |
| from collections import defaultdict, Counter | |
| # Load stopwords | |
| with open(sys.argv[1]) as f: | |
| stop = set(w.strip().lower() for w in f if w.strip()) | |
| # Read multi-line records (records separated by blank lines) | |
| docs, classes = [], [] | |
| with open(sys.argv[2]) as f: | |
| rec = [] | |
| for line in f: | |
| if line.strip() == "": | |
| if rec: | |
| docs.append("\n".join(rec)) | |
| classes.append(rec[-1].split()[-1]) | |
| rec = [] | |
| else: | |
| rec.append(line.strip()) | |
| if rec: | |
| docs.append("\n".join(rec)) | |
| classes.append(rec[-1].split()[-1]) | |
| # Light stemmer: plural handling, suffix stripping, double-consonants | |
| def stem(w): | |
| vowels = "aeiou" | |
| if w.endswith("ies") and len(w) > 4: return w[:-3] + "y" | |
| if w.endswith("ves") and len(w) > 4: return w[:-3] + "f" | |
| for suf in ( | |
| "ization", "ational", "fulness", "ousness", "iveness", "tional", "biliti", | |
| "lessli", "entli", "ation", "alism", "aliti", "ousli", "iviti", "fulli", | |
| "enci", "anci", "abli", "izer", "ator", "alli", "bli", "ogi", "li" | |
| ): | |
| if w.endswith(suf) and len(w) > len(suf) + 2: | |
| w = w[:-len(suf)] | |
| break | |
| for suf in ("ing", "ed", "ly", "es", "er", "al", "ic", "en"): | |
| if w.endswith(suf) and len(w) > len(suf) + 2: | |
| w = w[:-len(suf)] | |
| break | |
| if w.endswith("s") and len(w) > 3 and w[-2] not in vowels: | |
| w = w[:-1] | |
| if len(w) >= 3 and w[-1] == w[-2] and w[-1] not in vowels: | |
| w = w[:-1] | |
| return w | |
| # Tokenization and filtering | |
| def tokenize(text): | |
| return [stem(w) for w in text.lower().split() | |
| if w.isalpha() and len(w) > 2 and w not in stop] | |
| # Build TF, DF, class counts | |
| tf, df = [], Counter() | |
| tf_class = defaultdict(Counter) | |
| class_counts = Counter(classes) | |
| for text, label in zip(docs, classes): | |
| tokens = tokenize(text) | |
| counts = Counter(tokens) | |
| tf.append(counts) | |
| for word in counts: | |
| df[word] += 1 | |
| tf_class[word][label] += 1 | |
| N = len(docs) | |
# Compute TF-IDF: each term's score is its total count across the whole
# corpus times log(N / df). This is a corpus-level score, not per-document
# (see the discussion in the comments below).
tfidf = {w: sum(counts[w] * math.log(N / df[w]) for counts in tf if w in counts)
         for w in df}

# Top 100 terms by TF-IDF
top = sorted(tfidf, key=tfidf.get, reverse=True)[:100]
# Info gain: IG(w) = H(class) - [P(w) * H(class | w) + P(~w) * H(class | ~w)]
H = -sum((p := class_counts[k] / N) * math.log(p) for k in class_counts)
ig = {}
for w in top:
    yes = sum(tf_class[w][k] for k in class_counts)  # docs containing w (== df[w])
    no = N - yes + 1e-9  # avoid div/0
    py, pn = yes / N, no / N
    Hy = -sum((tf_class[w][k]/yes) * math.log(tf_class[w][k]/yes)
              for k in class_counts if tf_class[w][k])
    Hn = -sum(((class_counts[k]-tf_class[w][k])/no) * math.log((class_counts[k]-tf_class[w][k])/no)
              for k in class_counts if class_counts[k] > tf_class[w][k])
    ig[w] = H - (py * Hy + pn * Hn)
# Output sorted by IG
for w in sorted(top, key=ig.get, reverse=True):
    print(f"{w:20s} tfidf={tfidf[w]:8.2f} ig={ig[w]:.3f}")
Comment: I think the tf-idf is wrong. It is not per-document: the score for each word is its total count over the whole corpus times idf. Should it be a per-document tf × idf instead? But maybe that is ok if you want to rank words globally, not by doc relevance. Because the downstream step (ranking candidate features by information gain) only needs one score per term, not one per document.
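For comparison, a minimal sketch of the per-document variant the comment is asking about, reusing the `tf`, `df`, and `N` built by the script; reducing per-document scores to one score per word via max is my assumption, not something the gist specifies:

import math

# Hypothetical per-document tf-idf (sketch, not the gist's method).
# Each (doc, word) pair gets length-normalized tf times idf; a word's
# final score is its best per-document score (max is an assumed convention).
def tfidf_per_doc(tf, df, N):
    scores = {}
    for counts in tf:                     # counts: Counter of terms in one doc
        total = sum(counts.values())      # tokens in this doc
        for w, c in counts.items():
            s = (c / total) * math.log(N / df[w])
            scores[w] = max(scores.get(w, 0.0), s)
    return scores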