Last active
March 10, 2026 16:35
-
-
Save edsu/2af9bc42982c793fd9faf18e14f82ac2 to your computer and use it in GitHub Desktop.
look for deleted wikidata QIDs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| # stream wikidata ntriples dumps for two snapshots and look for QIDs that have been deleted. | |
import gzip
import os
import urllib.request
def main(
    before_url: str = "https://dumps.wikimedia.org/wikidatawiki/entities/20260123/wikidata-20260123-lexemes-BETA.nt.gz",
    after_url: str = "https://dumps.wikimedia.org/wikidatawiki/entities/20260306/wikidata-20260306-lexemes-BETA.nt.gz",
) -> None:
    """Compare two Wikidata N-Triples dump snapshots and print the QIDs
    that appear in the earlier snapshot but not the later one, i.e. the
    entities that look deleted between the two dumps.

    The URLs are parameters (defaulting to the original hard-coded
    snapshots) so other snapshot pairs can be compared without editing
    the function body.
    """
    qids_before = get_qids(before_url)
    print(f"before: {len(qids_before)}")
    qids_after = get_qids(after_url)
    print(f"after: {len(qids_after)}")
    # Set difference: present before, gone after. Sorted so the output
    # order is deterministic across runs (set iteration order is not).
    for qid in sorted(qids_before - qids_after):
        print(qid)
def get_qids(url: str) -> set[str]:
    """Stream a gzipped N-Triples dump from *url* and return every
    Wikidata QID seen in the first three whitespace-separated positions
    (subject, predicate, object) of each triple line.

    The original implementation shelled out to ``curl … | gunzip`` via
    ``os.popen`` with the URL interpolated into a shell string — a shell
    injection risk, a dependency on external binaries, and a silent
    failure mode (a failed curl produced an empty set, which upstream
    would misreport as "everything was deleted"). This version streams
    and decompresses with the standard library instead and raises on
    fetch errors.
    """
    entity_prefix = "http://www.wikidata.org/entity/"
    qids: set[str] = set()
    with urllib.request.urlopen(url) as response:
        # GzipFile over the response decompresses incrementally, so the
        # (potentially huge) dump is never held in memory at once.
        with gzip.GzipFile(fileobj=response) as stream:
            for raw_line in stream:
                line = raw_line.decode("utf-8", errors="replace")
                # NOTE(review): lexeme dumps only seem to have lexeme
                # entities in the subject position — checking all three
                # triple positions to catch QIDs in objects as well.
                for token in line.split(" ")[0:3]:
                    uri = token.strip("<>")
                    if uri.startswith(entity_prefix + "Q"):
                        qids.add(uri.removeprefix(entity_prefix))
    return qids
| if __name__ == "__main__": | |
| main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment