Skip to content

Instantly share code, notes, and snippets.

@ajtucker
Last active February 14, 2020 12:36
Show Gist options
  • Select an option

  • Save ajtucker/68c2b9c7d9df23d652367450695d2cc8 to your computer and use it in GitHub Desktop.

Select an option

Save ajtucker/68c2b9c7d9df23d652367450695d2cc8 to your computer and use it in GitHub Desktop.
While we're using the conventions in table2qb, we need to keep track of name clashes between codelists (and between components) defined in different reference repositories.
#!/usr/bin/env python3
import json
import requests
from urllib.parse import urljoin
import re
import pprint
from collections import defaultdict
import csv
import codecs
# Shared pretty-printer used for the duplicate reports at the end of the script.
pp = pprint.PrettyPrinter(indent=2)

# One HTTP session reused for every download below.
s = requests.Session()

# (short id, base URL) for each reference repository that is checked.
# 'codelists-metadata.json' and 'components.csv' are resolved against the base
# URL, so each URL keeps its trailing slash.
ref_repos = [
    ('ref_common', 'https://gss-cogs.github.io/ref_common/'),
    ('ref_trade', 'https://gss-cogs.github.io/ref_trade/'),
    ('ref_migration', 'https://gss-cogs.github.io/ref_migration/'),
    ('ref_alcohol', 'https://gss-cogs.github.io/ref_alcohol/'),
    ('disability', 'https://gss-cogs.github.io/family-disability/reference/'),
    ('affordable-housing', 'https://gss-cogs.github.io/family-affordable-housing/reference/'),
]

# repo id -> parsed contents of that repo's codelists-metadata.json
ref_codelists = {}

# repo id -> set of pathified component labels from that repo's components.csv
ref_components = {}
def pathify(label):
    """Convert a human-readable label into a string usable as a URI path segment.

    The label is lower-cased, every character that is neither a word
    character (``\\w``) nor ``/`` is replaced by ``-``, runs of ``-`` are
    collapsed to a single ``-`` and one trailing ``-`` is removed.  A leading
    ``-`` is kept.  Two labels that produce the same result are treated as a
    name clash by the rest of this script.

    Args:
        label: The label text (a ``str``).

    Returns:
        The normalised, path-friendly form of ``label``.
    """
    # Anything that is not a word character or '/' becomes a hyphen.
    slug = re.sub(r'[^\w/]', '-', label.lower())
    # Collapse consecutive hyphens produced by adjacent punctuation/spaces.
    slug = re.sub(r'-+', '-', slug)
    # Drop a single trailing hyphen (leading hyphens are deliberately left alone).
    return re.sub(r'-$', '', slug)
# Download each repository's codelist metadata (JSON) and component
# definitions (CSV), filling ref_codelists and ref_components.
for repo_id, base_url in ref_repos:
    codelists_metadata_url = urljoin(base_url, 'codelists-metadata.json')
    metadata_response = s.get(codelists_metadata_url, timeout=30)
    # Fail with a clear HTTP error instead of a confusing JSON decode error
    # when the file is missing or the server is unhappy.
    metadata_response.raise_for_status()
    ref_codelists[repo_id] = metadata_response.json()

    components_url = urljoin(base_url, 'components.csv')
    # The body is streamed line by line; the context manager makes sure the
    # connection is released once the CSV has been read.
    with s.get(components_url, stream=True, timeout=30) as comp_response:
        comp_response.raise_for_status()
        comp_reader = csv.DictReader(
            codecs.iterdecode(comp_response.iter_lines(), 'utf-8'))
        # Only the pathified 'Label' column matters for clash detection.
        ref_components[repo_id] = {pathify(row['Label']) for row in comp_reader}
# repo id -> set of pathified codelist labels, taken from the 'rdfs:label' of
# every entry under 'tables' in that repo's codelists metadata.
codelist_idsets = {
    ref_id: {pathify(t['rdfs:label']) for t in codelists['tables']}
    for ref_id, codelists in ref_codelists.items()
}
def _repos_by_id(idsets):
    """Invert ``{repo id: {item id, ...}}`` into ``{item id: {repo id, ...}}``."""
    repos = defaultdict(set)
    for ref_id, idset in idsets.items():
        for item_id in idset:
            repos[item_id].add(ref_id)
    return repos


def _duplicates(repos_by_id):
    """Return only the entries whose id appears in more than one repository."""
    return {item_id: refs for item_id, refs in repos_by_id.items() if len(refs) > 1}


# Codelist ids that are defined by more than one reference repository.
codelist_repos = _repos_by_id(codelist_idsets)
print('Duplicate codelists')
pp.pprint(_duplicates(codelist_repos))

# Component ids that are defined by more than one reference repository.
components_repos = _repos_by_id(ref_components)
print('\nDuplicate components')
pp.pprint(_duplicates(components_repos))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment