Final Tiny Stemmer
def stem(w):
    vowels = "aeiou"
    # Plural normalization
    if w.endswith("ies") and len(w) > 4:
        return w[:-3] + "y"
    if w.endswith("ves") and len(w) > 4:
        return w[:-3] + "f"
    # Long suffixes (Porter-inspired)
    for suf in (
        "ization", "ational", "fulness", "ousness", "iveness", "tional", "biliti",
        "lessli", "entli", "ation", "alism", "aliti", "ousli", "iviti", "fulli",
        "enci", "anci", "abli", "izer", "ator", "alli", "bli", "ogi", "li"
    ):
        if w.endswith(suf) and len(w) > len(suf) + 2:
            w = w[:-len(suf)]
            break
    # Common short suffixes
    for suf in ("ing", "ed", "ly", "es", "er", "al", "ic", "en"):
        if w.endswith(suf) and len(w) > len(suf) + 2:
            w = w[:-len(suf)]
            break
    # Trailing 's' (safe removal)
    if w.endswith("s") and len(w) > 3 and w[-2] not in vowels:
        w = w[:-1]
    # Double consonant (e.g., "hopping" → "hop")
    if len(w) >= 3 and w[-1] == w[-2] and w[-1] not in vowels:
        w = w[:-1]
    return w
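
For a quick sanity check (illustrative calls, not part of the gist), the stemmer maps:

    stem("flies")            # -> "fly"    (plural normalization)
    stem("wolves")           # -> "wolf"   (plural normalization)
    stem("hopping")          # -> "hop"    ("ing" stripped, then double consonant collapsed)
    stem("nationalization")  # -> "nation" ("ization" stripped, then "al")
    stem("cats")             # -> "cat"    (safe trailing "s" removal)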
"""
Load stop words
Read multi-line records (blank-line-separated, last token = class label)
Tokenize + normalize
Remove stop words
Apply lightweight stemming
Compute TF-IDF
Pick top 100 terms by TF-IDF
Rank those 100 by Information Gain
"""
import math, sys
from collections import defaultdict, Counter
# Load stopwords (one word per line, case-insensitive)
with open(sys.argv[1]) as f:
    stop = set(w.strip().lower() for w in f if w.strip())
# Read multi-line records (records separated by blank lines;
# the last whitespace-separated token of a record is its class label)
docs, classes = [], []
with open(sys.argv[2]) as f:
    rec = []
    for line in f:
        if line.strip() == "":
            if rec:
                docs.append("\n".join(rec))
                classes.append(rec[-1].split()[-1])
                rec = []
        else:
            rec.append(line.strip())
    if rec:
        docs.append("\n".join(rec))
        classes.append(rec[-1].split()[-1])
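# For illustration, a record file shaped like this (hypothetical data) yields
# docs == [record1_text, record2_text] and classes == ["spam", "ham"]:
#
#   cheap meds online
#   buy now, reply to claim spam
#
#   meeting moved to noon
#   see you in room B ham
#
# i.e. blank-line-separated records whose last whitespace-separated token
# is taken as the class label.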
# Light stemmer: plural handling, suffix stripping, double-consonant collapse
def stem(w):
    vowels = "aeiou"
    if w.endswith("ies") and len(w) > 4: return w[:-3] + "y"
    if w.endswith("ves") and len(w) > 4: return w[:-3] + "f"
    for suf in (
        "ization", "ational", "fulness", "ousness", "iveness", "tional", "biliti",
        "lessli", "entli", "ation", "alism", "aliti", "ousli", "iviti", "fulli",
        "enci", "anci", "abli", "izer", "ator", "alli", "bli", "ogi", "li"
    ):
        if w.endswith(suf) and len(w) > len(suf) + 2:
            w = w[:-len(suf)]
            break
    for suf in ("ing", "ed", "ly", "es", "er", "al", "ic", "en"):
        if w.endswith(suf) and len(w) > len(suf) + 2:
            w = w[:-len(suf)]
            break
    if w.endswith("s") and len(w) > 3 and w[-2] not in vowels:
        w = w[:-1]
    if len(w) >= 3 and w[-1] == w[-2] and w[-1] not in vowels:
        w = w[:-1]
    return w
# Tokenization and filtering: lowercase, alphabetic-only, length > 2, not a stopword
def tokenize(text):
    return [stem(w) for w in text.lower().split()
            if w.isalpha() and len(w) > 2 and w not in stop]
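# e.g. tokenize("The wolves were hopping!") drops "hopping!" (fails isalpha()
# because of the "!") and, assuming "the" and "were" are in the stopword file,
# returns ["wolf"].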
# Build term frequencies, document frequencies, and per-class counts
tf, df = [], Counter()
tf_class = defaultdict(Counter)  # tf_class[word][label] = #docs of that label containing word
class_counts = Counter(classes)
for text, label in zip(docs, classes):
    tokens = tokenize(text)
    counts = Counter(tokens)
    tf.append(counts)
    for word in counts:
        df[word] += 1
        tf_class[word][label] += 1
N = len(docs)
# Compute TF-IDF (raw counts, no per-document length normalization;
# see the discussion in the comment below)
tfidf = {w: sum(counts[w] * math.log(N / df[w]) for counts in tf if w in counts)
         for w in df}
# Top 100 terms by TF-IDF
top = sorted(tfidf, key=tfidf.get, reverse=True)[:100]
# Info gain over the binary split: does the doc contain w?
H = -sum((p := class_counts[k] / N) * math.log(p) for k in class_counts)
ig = {}
for w in top:
    yes = sum(tf_class[w][k] for k in class_counts)  # docs containing w
    no = N - yes + 1e-9                              # docs without w (avoid div/0)
    py, pn = yes / N, no / N
    Hy = -sum((tf_class[w][k] / yes) * math.log(tf_class[w][k] / yes)
              for k in class_counts if tf_class[w][k])
    Hn = -sum(((class_counts[k] - tf_class[w][k]) / no) * math.log((class_counts[k] - tf_class[w][k]) / no)
              for k in class_counts if class_counts[k] > tf_class[w][k])
    ig[w] = H - (py * Hy + pn * Hn)
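# For reference, the quantity computed above is the standard information gain
# over the presence/absence split:
#   H     = -sum_k p(k) * log p(k)
#   IG(w) =  H - [ p(w) * H(class | w present) + p(~w) * H(class | w absent) ]
# with p(w) = (#docs containing w) / N, all logs natural.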
# Output, sorted by information gain
for w in sorted(top, key=ig.get, reverse=True):
    print(f"{w:20s} tfidf={tfidf[w]:8.2f} ig={ig[w]:.3f}")
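
To run it (the file names here are hypothetical; the script just takes two positional arguments):

    python rank_terms.py stopwords.txt records.txt

where sys.argv[1] is a stopword file (one word per line) and sys.argv[2] is the blank-line-separated record file. It prints the top-100 TF-IDF terms re-ranked by information gain.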
timm commented May 5, 2025

I think the tf-idf is wrong.

Not:

    tfidf = {w: sum(counts[w] * math.log(N / df[w]) for counts in tf if w in counts)
             for w in df}

but should it be something like this, with each doc's counts divided by that doc's total length (the total has to be computed per document, i.e. inside the outer sum):

    tfidf = {w: sum(counts[w] / sum(counts.values()) * math.log(N / df[w])
                    for counts in tf if w in counts)
             for w in df}

But maybe the unnormalized version is OK if you want to rank words globally, not score per-document relevance.

Because:

  • You're summing contributions across all docs.
  • Within a single doc the length denominator is constant, so it can't change that doc's internal ranking.
  • But across many docs, the variation in doc length can matter.
  • So the denominator is not a constant across all terms: long docs contribute more to a term's global score. (A small numeric check is sketched below.)
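
A minimal sketch of the effect (hypothetical numbers, not from the gist): two docs each contain a term 10 times, but one doc is 50× longer.

    import math

    # Hypothetical: 100 docs total, the term appears in 5 of them.
    idf = math.log(100 / 5)

    # Two docs, 10 occurrences each; one is 20 words long, the other 1000.
    raw  = (10 + 10) * idf             # unnormalized sum: both docs weigh equally
    norm = (10/20 + 10/1000) * idf     # per-doc normalized: the dense short doc dominates
    print(f"{raw:.2f} vs {norm:.2f}")  # 59.91 vs 1.53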
