#!/usr/bin/env python3
# Extract TOP500 listings for a given country from the provided XML export.
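#
# Expected input shape, inferred from the namespace and the XPaths below.
# This is an illustrative sketch only: the root tag name and element order
# are assumptions, and a real export may differ.
#
#   <list xmlns="http://www.top500.org/xml/top500/1.0">
#     <site>
#       <rank>1</rank>
#       <system-name>...</system-name>
#       <country>United States</country>
#       <installation-site>
#         <installation-site-name>...</installation-site-name>
#         <site-id>...</site-id>
#       </installation-site>
#       ...
#     </site>
#     ...
#   </list>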
import argparse
import csv
import json
import sys
import xml.etree.ElementTree as ET
from typing import Dict, List, Optional

NS = {"top500": "http://www.top500.org/xml/top500/1.0"}

# Fields to extract: (label, XPath relative to a top500:site element)
FIELDS: List[tuple[str, str]] = [
    ("rank", "top500:rank"),
    ("system_id", "top500:system-id"),
    ("system_name", "top500:system-name"),
    ("manufacturer", "top500:manufacturer"),
    ("computer", "top500:computer"),
    ("system_address", "top500:system-address"),
    ("r_max", "top500:r-max"),
    ("r_peak", "top500:r-peak"),
    ("power", "top500:power"),
    ("n_max", "top500:n-max"),
    ("n_half", "top500:n-half"),
    ("install_site_name", "top500:installation-site/top500:installation-site-name"),
    ("install_site_address", "top500:installation-site/top500:installation-site-address"),
    ("site_id", "top500:installation-site/top500:site-id"),
    ("town", "top500:town"),
    ("state", "top500:state"),
    ("country", "top500:country"),
    ("year", "top500:year"),
    ("area_of_installation", "top500:area-of-installation"),
    ("number_of_processors", "top500:number-of-processors"),
]


def gettext(elem: Optional[ET.Element]) -> str:
    return (elem.text or "").strip() if elem is not None else ""


def extract_site(site: ET.Element) -> Dict[str, str]:
    out: Dict[str, str] = {}
    for label, path in FIELDS:
        node = site.find(path, NS)
        out[label] = gettext(node)
    return out


def filter_country(record: Dict[str, str], country_query: str, exact: bool) -> bool:
    country = record.get("country", "")
    if exact:
        return country == country_query
    # substring, case-insensitive
    return country_query.lower() in country.lower()
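
# Illustrative behaviour of the matcher (record values are hypothetical):
#   filter_country({"country": "United States"}, "united", exact=False)        -> True
#   filter_country({"country": "United States"}, "United States", exact=True)  -> True
#   filter_country({"country": "Germany"}, "United States", exact=False)       -> False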

def load_sites(xml_path: str) -> List[ET.Element]:
    """Parse the XML export and return every <site> element."""
    # The files are manageable in size, so parsing the whole tree is fine.
    # If memory is a concern, see the streaming variant below.
    tree = ET.parse(xml_path)
    return tree.getroot().findall("top500:site", NS)
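
# A memory-bounded alternative loader (a sketch, not wired into main()):
# stream with iterparse and extract each record *before* clearing the
# element, so clear() cannot discard data that is still needed.
def iter_site_records(xml_path: str):
    """Yield one extracted record per <site> element without building the whole tree."""
    site_tag = f"{{{NS['top500']}}}site"
    for _event, elem in ET.iterparse(xml_path, events=("end",)):
        if elem.tag == site_tag:
            yield extract_site(elem)  # read the subtree while it is still intact
            elem.clear()  # safe now: the record has already been copied out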

def to_csv(records: List[Dict[str, str]], fields: List[str], file=sys.stdout) -> None:
    writer = csv.DictWriter(file, fieldnames=fields)
    writer.writeheader()
    for r in records:
        writer.writerow({k: r.get(k, "") for k in fields})


def to_tsv(records: List[Dict[str, str]], fields: List[str], file=sys.stdout) -> None:
    writer = csv.DictWriter(file, fieldnames=fields, delimiter="\t", lineterminator="\n")
    writer.writeheader()
    for r in records:
        writer.writerow({k: r.get(k, "") for k in fields})


def to_json(records: List[Dict[str, str]], file=sys.stdout, indent: Optional[int] = 2) -> None:
    json.dump(records, file, indent=indent, ensure_ascii=False)


def main() -> None:
    parser = argparse.ArgumentParser(
        description="Extract TOP500 listings for a given country from an XML export."
    )
    parser.add_argument(
        "-i", "--input", required=True,
        help="Path to TOP500 XML file (e.g., TOP500_202506_all.xml)"
    )
    parser.add_argument(
        "-c", "--country", required=True,
        help='Country to match (e.g., "United States"). Use --exact for an exact match; '
             'otherwise a case-insensitive substring match is used.'
    )
    parser.add_argument(
        "-f", "--format", choices=["csv", "tsv", "json"], default="csv",
        help="Output format (default: csv)"
    )
    parser.add_argument(
        "--fields",
        default=",".join(label for (label, _) in FIELDS),
        help="Comma-separated list of fields to include. Defaults to all known fields."
    )
    parser.add_argument(
        "--exact", action="store_true",
        help="Use exact match for country (default: case-insensitive substring match)."
    )
    parser.add_argument(
        "--pretty", action="store_true", help="Pretty-print JSON."
    )
    args = parser.parse_args()

    wanted_fields = [f.strip() for f in args.fields.split(",") if f.strip()]
    unknown = [f for f in wanted_fields if f not in {lbl for (lbl, _) in FIELDS}]
    if unknown:
        parser.error(f"Unknown field(s): {', '.join(unknown)}")

    try:
        sites = load_sites(args.input)
    except ET.ParseError as e:
        sys.stderr.write(f"XML parse error: {e}\n")
        sys.exit(2)
    except OSError as e:
        sys.stderr.write(f"I/O error: {e}\n")
        sys.exit(2)

    records = [extract_site(s) for s in sites]
    filtered = [r for r in records if filter_country(r, args.country, args.exact)]

    if args.format == "csv":
        to_csv(filtered, wanted_fields)
    elif args.format == "tsv":
        to_tsv(filtered, wanted_fields)
    else:  # json
        to_json(filtered, indent=2 if args.pretty else None)


if __name__ == "__main__":
    main()
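
# Example invocations (the script name "top500_extract.py" is hypothetical;
# the input file name follows the --input help text above):
#   python3 top500_extract.py -i TOP500_202506_all.xml -c "United States"
#   python3 top500_extract.py -i TOP500_202506_all.xml -c germany -f json --pretty
#   python3 top500_extract.py -i TOP500_202506_all.xml --exact -c Japan --fields rank,system_name,r_max -f tsv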