birkin · November 20, 2025 13:49
diff --git a/cds_spacy_anything.py b/cds_spacy_anything.py
 # /// script
 # requires-python = "==3.12.*"
 # dependencies = [
 #   "spacy~=3.8.0",
 #   "en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl"
 # ]
 # ///

 import argparse
 from collections import Counter
 from pathlib import Path
 from pprint import pprint

 import spacy

 ## command-line interface: a single required option, --filepath ----
 parser = argparse.ArgumentParser(
     description='Run spaCy NER on a text file and print sorted entity counts plus a glossary.',
 )
 parser.add_argument(
     '--filepath',
     type=Path,  # argparse converts the raw string into a pathlib.Path
     required=True,
     help='Path to a UTF-8 text file to analyze',
 )
 args = parser.parse_args()

 ## load the whole input file into memory, decoded as UTF-8 ----------
 source_path = args.filepath
 text = source_path.read_text(encoding='utf-8')

 ## run the spaCy pipeline and collect its named entities -----------
 nlp = spacy.load('en_core_web_sm')
 doc = nlp(text)
 # One (entity text, entity label) tuple per entry in doc.ents, in the
 # order spaCy reports them; duplicates are kept so they can be counted.
 spacy_named_entities = [(ent.text, ent.label_) for ent in doc.ents]

 ## clean up a bit (whitespace and newlines) -------------------------
 # str.strip() alone only trims the two ends, so an entity that wraps
 # across a line break ('New\nYork') would be tallied separately from
 # 'New York'. Splitting on any whitespace and re-joining with single
 # spaces trims the ends AND collapses interior newlines/tabs/space runs.
 spacy_named_entities_cleaned = []
 for value, label in spacy_named_entities:
      cleaned_value = ' '.join(value.split())
      if not cleaned_value:
           # whitespace-only entity: nothing meaningful is left to count
           continue
      spacy_named_entities_cleaned.append((cleaned_value, label))

 ## tally each (text, label) pair ------------------------------------
 # Counter only counts occurrences; the ordering is applied separately below.
 named_entity_counts = Counter(spacy_named_entities_cleaned)


 def _count_then_text(entry):
      """Sort key: highest count first, then entity text, case-insensitively."""
      (entity_text, _entity_label), occurrences = entry
      return (-occurrences, entity_text.casefold())


 ## order by descending count; ties fall back to the entity text -----
 sorted_counts = sorted(named_entity_counts.items(), key=_count_then_text)
 print('\n----------\n')
 print('sorted_counts...')
 pprint(sorted_counts)

 ## print glossary: one 'LABEL — description' line per entity type ---
 _LABEL_DESCRIPTIONS = (
      ('CARDINAL', 'numerals that don’t fall under another type.'),
      ('DATE', 'absolute or relative dates or periods.'),
      ('EVENT', 'named hurricanes, battles, wars, sports events, etc.'),
      ('FAC', 'buildings, airports, highways, bridges, etc.'),
      ('GPE', 'countries, cities, states.'),
      ('LANGUAGE', 'any named language.'),
      ('LAW', 'named documents made into laws.'),
      ('LOC', 'non-GPE locations: mountain ranges, bodies of water, etc.'),
      ('MONEY', 'monetary values, including unit.'),
      ('NORP', 'nationalities or religious or political groups.'),
      ('ORDINAL', '“first”, “second”, etc.'),
      ('ORG', 'companies, agencies, institutions, etc.'),
      ('PERCENT', 'percentage, including “%”.'),
      ('PERSON', 'people, including fictional.'),
      ('PRODUCT', 'objects, vehicles, foods, etc. (not services).'),
      ('QUANTITY', 'measurements (e.g., weight, distance).'),
      ('TIME', 'times smaller than a day.'),
      ('WORK_OF_ART', 'titles of books, songs, etc.'),
 )
 # Join each pair with a spaced em dash; pprint below expects a list.
 glossary = [f'{name} — {meaning}' for name, meaning in _LABEL_DESCRIPTIONS]
 print('\n----------\n')
 print('glossary...')
 pprint(glossary)
	# /// script
	# requires-python = "==3.12.*"
	# dependencies = [
	# "spacy~=3.8.0",
	# "en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl"
	# ]
	# ///

	import argparse
	from collections import Counter
	from pathlib import Path
	from pprint import pprint

	import spacy

	## command-line interface: a single required option, --filepath ----
	parser = argparse.ArgumentParser(
	    description='Run spaCy NER on a text file and print sorted entity counts plus a glossary.',
	)
	parser.add_argument(
	    '--filepath',
	    type=Path,  # argparse converts the raw string into a pathlib.Path
	    required=True,
	    help='Path to a UTF-8 text file to analyze',
	)
	args = parser.parse_args()

	## load the whole input file into memory, decoded as UTF-8 ----------
	source_path = args.filepath
	text = source_path.read_text(encoding='utf-8')

	## get named entities from spacy ------------------------------------
	# Load the en_core_web_sm pipeline and run it over the full input text.
	nlp = spacy.load('en_core_web_sm')
	doc = nlp(text)
	# The original loop body had lost its indentation (a syntax error as
	# written). A comprehension builds the same list: one
	# (entity text, entity label) tuple per entry in doc.ents, in order,
	# duplicates kept so they can be counted later.
	spacy_named_entities = [(ent.text, ent.label_) for ent in doc.ents]

	## clean up a bit (whitespace and newlines) -------------------------
	# str.strip() alone only trims the two ends, so an entity that wraps
	# across a line break ('New\nYork') would be tallied separately from
	# 'New York'. Splitting on any whitespace and re-joining with single
	# spaces trims the ends AND collapses interior newlines/tabs/space runs.
	spacy_named_entities_cleaned = []
	for value, label in spacy_named_entities:
	    cleaned_value = ' '.join(value.split())
	    if not cleaned_value:
	        # whitespace-only entity: nothing meaningful is left to count
	        continue
	    spacy_named_entities_cleaned.append((cleaned_value, label))

	## tally each (text, label) pair ------------------------------------
	# Counter only counts occurrences; the ordering is applied separately below.
	named_entity_counts = Counter(spacy_named_entities_cleaned)


	def _count_then_text(entry):
	    """Sort key: highest count first, then entity text, case-insensitively."""
	    (entity_text, _entity_label), occurrences = entry
	    return (-occurrences, entity_text.casefold())


	## order by descending count; ties fall back to the entity text -----
	sorted_counts = sorted(named_entity_counts.items(), key=_count_then_text)
	print('\n----------\n')
	print('sorted_counts...')
	pprint(sorted_counts)

	## print glossary: one 'LABEL — description' line per entity type ---
	_LABEL_DESCRIPTIONS = (
	    ('CARDINAL', 'numerals that don’t fall under another type.'),
	    ('DATE', 'absolute or relative dates or periods.'),
	    ('EVENT', 'named hurricanes, battles, wars, sports events, etc.'),
	    ('FAC', 'buildings, airports, highways, bridges, etc.'),
	    ('GPE', 'countries, cities, states.'),
	    ('LANGUAGE', 'any named language.'),
	    ('LAW', 'named documents made into laws.'),
	    ('LOC', 'non-GPE locations: mountain ranges, bodies of water, etc.'),
	    ('MONEY', 'monetary values, including unit.'),
	    ('NORP', 'nationalities or religious or political groups.'),
	    ('ORDINAL', '“first”, “second”, etc.'),
	    ('ORG', 'companies, agencies, institutions, etc.'),
	    ('PERCENT', 'percentage, including “%”.'),
	    ('PERSON', 'people, including fictional.'),
	    ('PRODUCT', 'objects, vehicles, foods, etc. (not services).'),
	    ('QUANTITY', 'measurements (e.g., weight, distance).'),
	    ('TIME', 'times smaller than a day.'),
	    ('WORK_OF_ART', 'titles of books, songs, etc.'),
	)
	# Join each pair with a spaced em dash; pprint below expects a list.
	glossary = [f'{name} — {meaning}' for name, meaning in _LABEL_DESCRIPTIONS]
	print('\n----------\n')
	print('glossary...')
	pprint(glossary)
No results found