Skip to content

Instantly share code, notes, and snippets.

@edsu
Last active March 10, 2026 16:35
Show Gist options
  • Select an option

  • Save edsu/2af9bc42982c793fd9faf18e14f82ac2 to your computer and use it in GitHub Desktop.

Select an option

Save edsu/2af9bc42982c793fd9faf18e14f82ac2 to your computer and use it in GitHub Desktop.
Look for deleted Wikidata QIDs by comparing two dump snapshots.
#!/usr/bin/env python3
# stream wikidata ntriples dumps for two snapshots and look for QIDs that have been deleted.
import os
import shlex
def main(
    before_url: str = "https://dumps.wikimedia.org/wikidatawiki/entities/20260123/wikidata-20260123-lexemes-BETA.nt.gz",
    after_url: str = "https://dumps.wikimedia.org/wikidatawiki/entities/20260306/wikidata-20260306-lexemes-BETA.nt.gz",
) -> None:
    """Compare two Wikidata lexeme dump snapshots and print each QID that
    appears in the earlier snapshot but not the later one (i.e. was deleted
    between the two dump dates).

    The URLs default to the two snapshots this script was written for, but
    any pair of gzipped N-Triples dumps can be supplied.
    """
    qids_before = get_qids(before_url)
    print(f"before: {len(qids_before)}")
    qids_after = get_qids(after_url)
    print(f"after: {len(qids_after)}")
    # Set difference: QIDs present before but gone after.
    for qid in qids_before - qids_after:
        print(qid)
def get_qids(url: str) -> set[str]:
    """Stream the gzipped N-Triples dump at *url* and return the set of
    Wikidata QIDs (e.g. "Q42") found in entity URIs on each line.

    Cheats by shelling out to ``curl | gunzip`` instead of decompressing
    in Python (which would likely be slower).
    """
    qids: set[str] = set()
    # shlex.quote prevents shell metacharacters in the URL from being
    # interpreted — os.popen runs the command through a shell.
    # The `with` block makes sure the pipe is closed when we're done.
    with os.popen(f"curl --silent {shlex.quote(url)} | gunzip -c") as stream:
        for line in stream:
            # NOTE(review): lexeme dumps only seem to have lexeme entities in
            # the subject position — maybe these aren't the right dump files?
            # Check subject, predicate, and object fields just in case.
            for token in line.split(" ")[:3]:
                uri = token.strip("<>")
                if uri.startswith('http://www.wikidata.org/entity/Q'):
                    # removeprefix strips only the leading namespace;
                    # str.replace would clobber any later occurrence too.
                    qids.add(uri.removeprefix('http://www.wikidata.org/entity/'))
    return qids
# Entry point: run the snapshot comparison only when executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment