Created
November 20, 2025 13:49
-
-
Save birkin/383430f90e67d014b7fcc073c12ac0b1 to your computer and use it in GitHub Desktop.
cds_spacy_anything
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # /// script | |
| # requires-python = "==3.12.*" | |
| # dependencies = [ | |
| # "spacy~=3.8.0", | |
| # "en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl" | |
| # ] | |
| # /// | |
| import argparse | |
| from collections import Counter | |
| from pathlib import Path | |
| from pprint import pprint | |
| import spacy | |
| parser = argparse.ArgumentParser(description='Run spaCy NER on a text file and print sorted entity counts plus a glossary.') | |
| parser.add_argument('--filepath', required=True, type=Path, help='Path to a UTF-8 text file to analyze') | |
| args = parser.parse_args() | |
| # Read text from the provided file path | |
| text = args.filepath.read_text(encoding='utf-8') | |
| ## get named entities from spacy ------------------------------------ | |
| nlp = spacy.load('en_core_web_sm') | |
| doc = nlp(text) | |
| spacy_named_entities = [] | |
| for ent in doc.ents: | |
| token = ent.text | |
| label = ent.label_ | |
| tuple_ = (token, label) | |
| spacy_named_entities.append(tuple_) | |
| ## clean up a bit (whitespace and newlines) ------------------------- | |
| spacy_named_entities_cleaned = [] | |
| for value, label in spacy_named_entities: | |
| cleaned_value = value.strip() | |
| spacy_named_entities_cleaned.append((cleaned_value, label)) | |
| ## add counts ------------------------------------------------------- | |
| named_entity_counts = Counter(spacy_named_entities_cleaned) # sorts by counts | |
| ## on count-ties, sort by text alphabetically ---------------------- | |
| sorted_counts = sorted(named_entity_counts.items(), key=lambda item: (-item[1], item[0][0].casefold())) # thanks, AI! π | |
| print('\n----------\n') | |
| print('sorted_counts...') | |
| pprint(sorted_counts) | |
| ## print glossary --------------------------------------------------- | |
| glossary = [ | |
| 'CARDINAL β numerals that donβt fall under another type.', | |
| 'DATE β absolute or relative dates or periods.', | |
| 'EVENT β named hurricanes, battles, wars, sports events, etc.', | |
| 'FAC β buildings, airports, highways, bridges, etc.', | |
| 'GPE β countries, cities, states.', | |
| 'LANGUAGE β any named language.', | |
| 'LAW β named documents made into laws.', | |
| 'LOC β non-GPE locations: mountain ranges, bodies of water, etc.', | |
| 'MONEY β monetary values, including unit.', | |
| 'NORP β nationalities or religious or political groups.', | |
| 'ORDINAL β βfirstβ, βsecondβ, etc.', | |
| 'ORG β companies, agencies, institutions, etc.', | |
| 'PERCENT β percentage, including β%β.', | |
| 'PERSON β people, including fictional.', | |
| 'PRODUCT β objects, vehicles, foods, etc. (not services).', | |
| 'QUANTITY β measurements (e.g., weight, distance).', | |
| 'TIME β times smaller than a day.', | |
| 'WORK_OF_ART β titles of books, songs, etc.', | |
| ] | |
| print('\n----------\n') | |
| print('glossary...') | |
| pprint(glossary) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment