Last active
February 14, 2020 12:36
-
-
Save ajtucker/68c2b9c7d9df23d652367450695d2cc8 to your computer and use it in GitHub Desktop.
While we're using the conventions in table2qb, we need to keep track of name clashes in codelists.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| import json | |
| import requests | |
| from urllib.parse import urljoin | |
| import re | |
| import pprint | |
| from collections import defaultdict | |
| import csv | |
| import codecs | |
# Pretty-printer used for the two duplicate reports printed by this script.
pp = pprint.PrettyPrinter(indent=2)

# A single HTTP session reused for every reference-data download.
s = requests.Session()

# (repository id, base URL) pairs. Each base URL must end with '/' so that
# urljoin() appends the file name instead of replacing the last path segment.
ref_repos = [
    ('ref_common', 'https://gss-cogs.github.io/ref_common/'),
    ('ref_trade', 'https://gss-cogs.github.io/ref_trade/'),
    ('ref_migration', 'https://gss-cogs.github.io/ref_migration/'),
    ('ref_alcohol', 'https://gss-cogs.github.io/ref_alcohol/'),
    ('disability', 'https://gss-cogs.github.io/family-disability/reference/'),
    ('affordable-housing', 'https://gss-cogs.github.io/family-affordable-housing/reference/'),
]

# Repository id -> parsed 'codelists-metadata.json' document.
ref_codelists = {}
# Repository id -> set of pathified 'Label' values from 'components.csv'.
ref_components = {}
def pathify(label):
    """Return the path-style slug for *label*.

    The label is lower-cased, every character that is neither a regex word
    character (letter, digit, underscore) nor '/' is replaced by '-', runs
    of '-' are collapsed to a single '-', and a trailing '-' is dropped.
    A leading '-' is kept.
    """
    slug = re.sub(r'[^\w/]', '-', label.lower())
    slug = re.sub(r'-+', '-', slug)
    return re.sub(r'-$', '', slug)
# Download each repository's codelist metadata and component labels.
for ref_id, base_url in ref_repos:
    codelists_metadata_url = urljoin(base_url, 'codelists-metadata.json')
    codelists_response = s.get(codelists_metadata_url)
    # Fail loudly on a missing or moved repository rather than trying to
    # parse an HTTP error page as JSON.
    codelists_response.raise_for_status()
    ref_codelists[ref_id] = codelists_response.json()

    components_url = urljoin(base_url, 'components.csv')
    # The body is streamed line by line; the context manager closes the
    # response once the CSV has been consumed.
    with s.get(components_url, stream=True) as comp_response:
        comp_response.raise_for_status()
        comp_reader = csv.DictReader(
            codecs.iterdecode(comp_response.iter_lines(), 'utf-8'))
        # Only the pathified 'Label' column is kept; duplicates within one
        # repository collapse because the result is a set.
        ref_components[ref_id] = {pathify(row['Label']) for row in comp_reader}
def _repos_by_id(idsets):
    """Invert ``{repository id: set of ids}`` into ``{id: set of repository ids}``."""
    repos = defaultdict(set)
    for ref_id, idset in idsets.items():
        for item_id in idset:
            repos[item_id].add(ref_id)
    return repos


def _clashes(repos_by_id):
    """Return only the entries whose id appears in more than one repository."""
    return {item_id: refs for item_id, refs in repos_by_id.items() if len(refs) > 1}


# Repository id -> set of pathified codelist labels, taken from the
# 'rdfs:label' of every entry under 'tables' in the codelist metadata.
codelist_idsets = {
    ref_id: {pathify(t['rdfs:label']) for t in codelists['tables']}
    for ref_id, codelists in ref_codelists.items()
}

# Codelist id -> repositories defining it; report the ids defined more than once.
codelist_repos = _repos_by_id(codelist_idsets)
print('Duplicate codelists')
pp.pprint(_clashes(codelist_repos))

# Component id -> repositories defining it; report the ids defined more than once.
components_repos = _repos_by_id(ref_components)
print('\nDuplicate components')
pp.pprint(_clashes(components_repos))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment