Skip to content

Instantly share code, notes, and snippets.

@birkin
Created November 20, 2025 13:49
Show Gist options
  • Select an option

  • Save birkin/383430f90e67d014b7fcc073c12ac0b1 to your computer and use it in GitHub Desktop.

Select an option

Save birkin/383430f90e67d014b7fcc073c12ac0b1 to your computer and use it in GitHub Desktop.
cds_spacy_anything
# /// script
# requires-python = "==3.12.*"
# dependencies = [
# "spacy~=3.8.0",
# "en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl"
# ]
# ///
import argparse
from collections import Counter
from pathlib import Path
from pprint import pprint
import spacy
# Command-line interface: a single required option naming the file to analyze.
parser = argparse.ArgumentParser(
    description='Run spaCy NER on a text file and print sorted entity counts plus a glossary.',
)
parser.add_argument(
    '--filepath',
    required=True,
    type=Path,  # argparse converts the raw string into a pathlib.Path
    help='Path to a UTF-8 text file to analyze',
)
args = parser.parse_args()

# Read the whole input file into memory, decoding it as UTF-8.
text = args.filepath.read_text(encoding='utf-8')
## get named entities from spacy ------------------------------------
# Load the `en_core_web_sm` pipeline and run it over the full input text.
nlp = spacy.load('en_core_web_sm')
doc = nlp(text)

# One (entity text, entity label) tuple per entity, in the order `doc.ents`
# yields them. Duplicates are kept on purpose so they can be counted later.
spacy_named_entities = [(ent.text, ent.label_) for ent in doc.ents]
## clean up a bit (whitespace and newlines) -------------------------
# Normalize each entity's text before counting:
#   - every run of whitespace (including a newline inside an entity that spans
#     a line break) is collapsed to one space, and the ends are trimmed, so the
#     same entity is tallied together regardless of how the text was wrapped;
#   - entities that are nothing but whitespace are dropped rather than being
#     counted under the empty string.
spacy_named_entities_cleaned = []
for value, label in spacy_named_entities:
    cleaned_value = ' '.join(value.split())
    if not cleaned_value:
        continue
    spacy_named_entities_cleaned.append((cleaned_value, label))
## add counts -------------------------------------------------------
# Counter only tallies how often each (text, label) pair occurs; the ordering
# is applied by the explicit sort below.
named_entity_counts = Counter(spacy_named_entities_cleaned)


## sort: highest count first; on count-ties, by entity text (case-insensitive)
def _count_then_text(entry):
    """Return the sort key for one ``((text, label), count)`` entry."""
    (entity_text, _entity_label), occurrences = entry
    # Negating the count gives descending order under an ascending sort.
    return (-occurrences, entity_text.casefold())


sorted_counts = sorted(named_entity_counts.items(), key=_count_then_text)
print('\n----------\n')
print('sorted_counts...')
pprint(sorted_counts)
## print glossary ---------------------------------------------------
# Human-readable description of each entity label, one 'LABEL — description'
# string per entry. The punctuation (em dash, apostrophe, curly quotes) is
# stored as real Unicode characters so it prints as intended.
glossary = [
    'CARDINAL — numerals that don’t fall under another type.',
    'DATE — absolute or relative dates or periods.',
    'EVENT — named hurricanes, battles, wars, sports events, etc.',
    'FAC — buildings, airports, highways, bridges, etc.',
    'GPE — countries, cities, states.',
    'LANGUAGE — any named language.',
    'LAW — named documents made into laws.',
    'LOC — non-GPE locations: mountain ranges, bodies of water, etc.',
    'MONEY — monetary values, including unit.',
    'NORP — nationalities or religious or political groups.',
    'ORDINAL — “first”, “second”, etc.',
    'ORG — companies, agencies, institutions, etc.',
    'PERCENT — percentage, including “%”.',
    'PERSON — people, including fictional.',
    'PRODUCT — objects, vehicles, foods, etc. (not services).',
    'QUANTITY — measurements (e.g., weight, distance).',
    'TIME — times smaller than a day.',
    'WORK_OF_ART — titles of books, songs, etc.',
]
print('\n----------\n')
print('glossary...')
pprint(glossary)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment