Last active
March 10, 2026 16:35
-
-
Save edsu/2af9bc42982c793fd9faf18e14f82ac2 to your computer and use it in GitHub Desktop.
look for deleted wikidata QIDs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| # stream wikidata ntriples dumps for two snapshots and look for QIDs that have been deleted. | |
import gzip
import os
import urllib.request
def main(
    before_url: str = "https://dumps.wikimedia.org/wikidatawiki/entities/20260123/wikidata-20260123-lexemes-BETA.nt.gz",
    after_url: str = "https://dumps.wikimedia.org/wikidatawiki/entities/20260306/wikidata-20260306-lexemes-BETA.nt.gz",
) -> None:
    """Compare two Wikidata N-Triples dump snapshots and print the QIDs
    that appear in the earlier snapshot but not the later one, i.e. the
    entities that look deleted between the two dumps.

    The URLs are parameters (defaulting to the original hard-coded
    snapshots) so other snapshot pairs can be compared without editing
    the function body.
    """
    qids_before = get_qids(before_url)
    print(f"before: {len(qids_before)}")
    qids_after = get_qids(after_url)
    print(f"after: {len(qids_after)}")
    # Set difference: present before, gone after. Sorted so the output
    # order is deterministic across runs (set iteration order is not).
    for qid in sorted(qids_before - qids_after):
        print(qid)
def get_qids(url: str) -> set[str]:
    """Stream a gzipped N-Triples dump from *url* and return every
    Wikidata QID seen in the first three whitespace-separated positions
    (subject, predicate, object) of each triple line.

    The original implementation shelled out to ``curl … | gunzip`` via
    ``os.popen`` with the URL interpolated into a shell string — a shell
    injection risk, a dependency on external binaries, and a silent
    failure mode (a failed curl produced an empty set, which upstream
    would misreport as "everything was deleted"). This version streams
    and decompresses with the standard library instead and raises on
    fetch errors.
    """
    entity_prefix = "http://www.wikidata.org/entity/"
    qids: set[str] = set()
    with urllib.request.urlopen(url) as response:
        # GzipFile over the response decompresses incrementally, so the
        # (potentially huge) dump is never held in memory at once.
        with gzip.GzipFile(fileobj=response) as stream:
            for raw_line in stream:
                line = raw_line.decode("utf-8", errors="replace")
                # NOTE(review): lexeme dumps only seem to have lexeme
                # entities in the subject position — checking all three
                # triple positions to catch QIDs in objects as well.
                for token in line.split(" ")[0:3]:
                    uri = token.strip("<>")
                    if uri.startswith(entity_prefix + "Q"):
                        qids.add(uri.removeprefix(entity_prefix))
    return qids
| if __name__ == "__main__": | |
| main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment