ajtucker · February 14, 2020 12:36
diff --git a/duplicates.py b/duplicates.py
 #!/usr/bin/env python3

 import json
 import requests
 from urllib.parse import urljoin
 import re
 import pprint
 from collections import defaultdict
 import csv
 import codecs

 # One HTTP session, shared by every request the script makes.
 s = requests.Session()

 # Printer used for the duplicate reports at the end of the script.
 pp = pprint.PrettyPrinter(indent=2)

 # (identifier, base URL) pairs, one per reference repository to compare.
 ref_repos = [
     ('ref_common', 'https://gss-cogs.github.io/ref_common/'),
     ('ref_trade', 'https://gss-cogs.github.io/ref_trade/'),
     ('ref_migration', 'https://gss-cogs.github.io/ref_migration/'),
     ('ref_alcohol', 'https://gss-cogs.github.io/ref_alcohol/'),
     ('disability', 'https://gss-cogs.github.io/family-disability/reference/'),
     ('affordable-housing', 'https://gss-cogs.github.io/family-affordable-housing/reference/'),
 ]

 # Per-repository results, keyed by the identifier from ref_repos.
 ref_components = {}
 ref_codelists = {}

 def pathify(label):
     """Turn a label into a lower-case, dash-separated path segment.

     Every character that is neither a word character nor '/' becomes '-',
     runs of '-' collapse to a single '-', and one trailing '-' is removed.
     A leading '-' is kept.
     """
     slug = label.lower()
     slug = re.sub(r'[^\w/]', '-', slug)
     slug = re.sub(r'-+', '-', slug)
     return re.sub(r'-$', '', slug)

 # Download each reference repository's codelist metadata (JSON) and its
 # component labels (CSV), storing both under the repository identifier.
 # The loop variable is 'repo_id' rather than 'id' so the builtin id() is
 # not shadowed for the rest of the module.
 for repo_id, base_url in ref_repos:
     codelists_metadata_url = urljoin(base_url, 'codelists-metadata.json')
     codelists_response = s.get(codelists_metadata_url)
     # Fail on an HTTP error instead of trying to parse an error page.
     codelists_response.raise_for_status()
     ref_codelists[repo_id] = codelists_response.json()

     components_url = urljoin(base_url, 'components.csv')
     # Streamed so the CSV is decoded line by line; the context manager
     # closes the response once the rows have been read.
     with s.get(components_url, stream=True) as comp_response:
         comp_response.raise_for_status()
         comp_reader = csv.DictReader(codecs.iterdecode(comp_response.iter_lines(), 'utf-8'))
         # Only the slugified 'Label' column of each row is kept.
         ref_components[repo_id] = {pathify(row['Label']) for row in comp_reader}

 # Slugified 'rdfs:label' of every entry in each repository's 'tables' list.
 codelist_idsets = {
     ref_id: {pathify(table['rdfs:label']) for table in codelists['tables']}
     for ref_id, codelists in ref_codelists.items()
 }

 # Inverted index: codelist slug -> set of repositories that declare it.
 codelist_repos = defaultdict(set)
 for ref_id in codelist_idsets:
     for codelist_id in codelist_idsets[ref_id]:
         codelist_repos[codelist_id].add(ref_id)

 print('Duplicate codelists')
 # A slug counts as a duplicate when more than one repository declares it.
 duplicate_codelists = {
     codelist_id: refs
     for codelist_id, refs in codelist_repos.items()
     if len(refs) > 1
 }
 pp.pprint(duplicate_codelists)

 # Inverted index: component slug -> set of repositories that declare it.
 components_repos = defaultdict(set)
 for ref_id in ref_components:
     for comp_id in ref_components[ref_id]:
         components_repos[comp_id].add(ref_id)

 print('\nDuplicate components')
 # A slug counts as a duplicate when more than one repository declares it.
 duplicate_components = {
     comp_id: refs
     for comp_id, refs in components_repos.items()
     if len(refs) > 1
 }
 pp.pprint(duplicate_components)
	#!/usr/bin/env python3

	import json
	import requests
	from urllib.parse import urljoin
	import re
	import pprint
	from collections import defaultdict
	import csv
	import codecs

	# One HTTP session, shared by every request the script makes.
	s = requests.Session()

	# Printer used for the duplicate reports at the end of the script.
	pp = pprint.PrettyPrinter(indent=2)

	# (identifier, base URL) pairs, one per reference repository to compare.
	ref_repos = [
	    ('ref_common', 'https://gss-cogs.github.io/ref_common/'),
	    ('ref_trade', 'https://gss-cogs.github.io/ref_trade/'),
	    ('ref_migration', 'https://gss-cogs.github.io/ref_migration/'),
	    ('ref_alcohol', 'https://gss-cogs.github.io/ref_alcohol/'),
	    ('disability', 'https://gss-cogs.github.io/family-disability/reference/'),
	    ('affordable-housing', 'https://gss-cogs.github.io/family-affordable-housing/reference/'),
	]

	# Per-repository results, keyed by the identifier from ref_repos.
	ref_components = {}
	ref_codelists = {}

	def pathify(label):
	    """Turn a label into a lower-case, dash-separated path segment.

	    Every character that is neither a word character nor '/' becomes '-',
	    runs of '-' collapse to a single '-', and one trailing '-' is removed.
	    A leading '-' is kept.
	    """
	    # The body must be indented under the def; at the same column as the
	    # def the function does not parse.
	    slug = re.sub(r'[^\w/]', '-', label.lower())
	    slug = re.sub(r'-+', '-', slug)
	    return re.sub(r'-$', '', slug)

	# Download each reference repository's codelist metadata (JSON) and its
	# component labels (CSV), storing both under the repository identifier.
	# The loop variable is 'repo_id' rather than 'id' so the builtin id() is
	# not shadowed for the rest of the module.
	for repo_id, base_url in ref_repos:
	    codelists_metadata_url = urljoin(base_url, 'codelists-metadata.json')
	    codelists_response = s.get(codelists_metadata_url)
	    # Fail on an HTTP error instead of trying to parse an error page.
	    codelists_response.raise_for_status()
	    ref_codelists[repo_id] = codelists_response.json()

	    components_url = urljoin(base_url, 'components.csv')
	    # Streamed so the CSV is decoded line by line; the context manager
	    # closes the response once the rows have been read.
	    with s.get(components_url, stream=True) as comp_response:
	        comp_response.raise_for_status()
	        comp_reader = csv.DictReader(codecs.iterdecode(comp_response.iter_lines(), 'utf-8'))
	        # Only the slugified 'Label' column of each row is kept.
	        ref_components[repo_id] = {pathify(row['Label']) for row in comp_reader}

	# Slugified 'rdfs:label' of every entry in each repository's 'tables' list.
	codelist_idsets = {}

	for ref_id, codelists in ref_codelists.items():
	    codelist_idsets[ref_id] = {pathify(t['rdfs:label']) for t in codelists['tables']}

	# Inverted index: codelist slug -> set of repositories that declare it.
	codelist_repos = defaultdict(set)

	for ref_id, idset in codelist_idsets.items():
	    for codelist_id in idset:
	        codelist_repos[codelist_id].add(ref_id)

	print('Duplicate codelists')
	# A slug counts as a duplicate when more than one repository declares it.
	pp.pprint({codelist_id: refs for codelist_id, refs in codelist_repos.items() if len(refs) > 1})

	# Inverted index: component slug -> set of repositories that declare it.
	components_repos = defaultdict(set)

	for ref_id, compset in ref_components.items():
	    for comp_id in compset:
	        components_repos[comp_id].add(ref_id)

	print('\nDuplicate components')
	# A slug counts as a duplicate when more than one repository declares it.
	pp.pprint({comp_id: refs for comp_id, refs in components_repos.items() if len(refs) > 1})
No results found