#!/usr/bin/env python3
# Extract TOP500 listings for a given country from a TOP500 XML export.
import argparse
import csv
import json
import sys
import xml.etree.ElementTree as ET
from typing import Dict, List, Optional, Tuple
NS = {"top500": "http://www.top500.org/xml/top500/1.0"}
# Fields to extract: (label, XPath relative to a top500:site element)
FIELDS: List[Tuple[str, str]] = [
    ("rank", "top500:rank"),
    ("system_id", "top500:system-id"),
    ("system_name", "top500:system-name"),
    ("manufacturer", "top500:manufacturer"),
    ("computer", "top500:computer"),
    ("system_address", "top500:system-address"),
    ("r_max", "top500:r-max"),
    ("r_peak", "top500:r-peak"),
    ("power", "top500:power"),
    ("n_max", "top500:n-max"),
    ("n_half", "top500:n-half"),
    ("install_site_name", "top500:installation-site/top500:installation-site-name"),
    ("install_site_address", "top500:installation-site/top500:installation-site-address"),
    ("site_id", "top500:installation-site/top500:site-id"),
    ("town", "top500:town"),
    ("state", "top500:state"),
    ("country", "top500:country"),
    ("year", "top500:year"),
    ("area_of_installation", "top500:area-of-installation"),
    ("number_of_processors", "top500:number-of-processors"),
]
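
# Illustrative shape of one <top500:site> element, inferred from the XPaths
# above (element names are an assumption based on FIELDS, not copied from the
# official schema):
#
#   <top500:site xmlns:top500="http://www.top500.org/xml/top500/1.0">
#     <top500:rank>1</top500:rank>
#     <top500:system-name>...</top500:system-name>
#     <top500:installation-site>
#       <top500:installation-site-name>...</top500:installation-site-name>
#     </top500:installation-site>
#     <top500:country>United States</top500:country>
#     ...
#   </top500:site>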

def gettext(elem: Optional[ET.Element]) -> str:
    """Return the stripped text content of an element, or "" if it is missing."""
    return (elem.text or "").strip() if elem is not None else ""

def extract_site(site: ET.Element) -> Dict[str, str]:
    """Pull every known field out of a single <top500:site> element."""
    out: Dict[str, str] = {}
    for label, path in FIELDS:
        node = site.find(path, NS)
        out[label] = gettext(node)
    return out

def filter_country(record: Dict[str, str], country_query: str, exact: bool) -> bool:
    country = record.get("country", "")
    if exact:
        return country == country_query
    # Otherwise: substring match, case-insensitive.
    return country_query.lower() in country.lower()
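
# For example (illustrative values):
#   >>> filter_country({"country": "United States"}, "united", exact=False)
#   True
#   >>> filter_country({"country": "United States"}, "united", exact=True)
#   False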

def load_sites(xml_path: str) -> List[ET.Element]:
    # TOP500 exports are small enough to parse as a full tree. (A naive
    # iterparse + elem.clear() loop would hand back elements whose children
    # have already been discarded, so the fields could no longer be read.)
    tree = ET.parse(xml_path)
    return tree.getroot().findall("top500:site", NS)
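
# Optional streaming variant, a minimal sketch for very large exports (not
# wired into main()): it extracts each record *before* clearing the element,
# which is the step a naive iterparse + clear() approach gets wrong.
def iter_records(xml_path: str):
    site_tag = f"{{{NS['top500']}}}site"
    for _, elem in ET.iterparse(xml_path, events=("end",)):
        if elem.tag == site_tag:
            yield extract_site(elem)  # read fields while children still exist
            elem.clear()  # then drop children to keep memory bounded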

def to_csv(records: List[Dict[str, str]], fields: List[str], file=sys.stdout) -> None:
    # lineterminator="\n" keeps line endings consistent across platforms.
    writer = csv.DictWriter(file, fieldnames=fields, lineterminator="\n")
    writer.writeheader()
    for r in records:
        writer.writerow({k: r.get(k, "") for k in fields})


def to_tsv(records: List[Dict[str, str]], fields: List[str], file=sys.stdout) -> None:
    writer = csv.DictWriter(file, fieldnames=fields, delimiter="\t", lineterminator="\n")
    writer.writeheader()
    for r in records:
        writer.writerow({k: r.get(k, "") for k in fields})


def to_json(records: List[Dict[str, str]], file=sys.stdout, indent: Optional[int] = 2) -> None:
    json.dump(records, file, indent=indent, ensure_ascii=False)
    file.write("\n")  # json.dump does not emit a trailing newline

def main() -> None:
    parser = argparse.ArgumentParser(
        description="Extract TOP500 listings for a given country from an XML export."
    )
    parser.add_argument(
        "-i", "--input", required=True,
        help="Path to TOP500 XML file (e.g., TOP500_202506_all.xml)",
    )
    parser.add_argument(
        "-c", "--country", required=True,
        help='Country to match (e.g., "United States"). Use --exact for an exact '
             "match; otherwise a case-insensitive substring match is used.",
    )
    parser.add_argument(
        "-f", "--format", choices=["csv", "tsv", "json"], default="csv",
        help="Output format (default: csv)",
    )
    parser.add_argument(
        "--fields",
        default=",".join(label for (label, _) in FIELDS),
        help="Comma-separated list of fields to include. Defaults to all known fields.",
    )
    parser.add_argument(
        "--exact", action="store_true",
        help="Use exact match for country (default: case-insensitive substring match).",
    )
    parser.add_argument("--pretty", action="store_true", help="Pretty-print JSON.")
    args = parser.parse_args()

    wanted_fields = [f.strip() for f in args.fields.split(",") if f.strip()]
    unknown = [f for f in wanted_fields if f not in {lbl for (lbl, _) in FIELDS}]
    if unknown:
        parser.error(f"Unknown field(s): {', '.join(unknown)}")

    try:
        sites = load_sites(args.input)
    except ET.ParseError as e:
        sys.stderr.write(f"XML parse error: {e}\n")
        sys.exit(2)
    except OSError as e:
        sys.stderr.write(f"I/O error: {e}\n")
        sys.exit(2)

    records = [extract_site(s) for s in sites]
    filtered = [r for r in records if filter_country(r, args.country, args.exact)]

    if args.format == "csv":
        to_csv(filtered, wanted_fields)
    elif args.format == "tsv":
        to_tsv(filtered, wanted_fields)
    else:  # json
        # Honour --fields for JSON output too, not just for CSV/TSV.
        projected = [{k: r.get(k, "") for k in wanted_fields} for r in filtered]
        to_json(projected, indent=2 if args.pretty else None)


if __name__ == "__main__":
    main()
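
# Example invocations (script name and output paths are illustrative; the XML
# file name follows the help text above):
#   python3 extract_top500.py -i TOP500_202506_all.xml -c "United States" > us.csv
#   python3 extract_top500.py -i TOP500_202506_all.xml -c Germany --exact -f json --pretty
#   python3 extract_top500.py -i TOP500_202506_all.xml -c Japan -f tsv --fields rank,system_name,r_max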