#!/usr/bin/env python3
# Extract TOP500 listings for a given country from a TOP500 XML export.
import argparse
import csv
import json
import sys
import xml.etree.ElementTree as ET
from typing import Dict, List, Optional, Tuple
NS = {"top500": "http://www.top500.org/xml/top500/1.0"}
# Fields to extract: (label, XPath relative to a top500:site element)
FIELDS: List[Tuple[str, str]] = [
    ("rank", "top500:rank"),
    ("system_id", "top500:system-id"),
    ("system_name", "top500:system-name"),
    ("manufacturer", "top500:manufacturer"),
    ("computer", "top500:computer"),
    ("system_address", "top500:system-address"),
    ("r_max", "top500:r-max"),
    ("r_peak", "top500:r-peak"),
    ("power", "top500:power"),
    ("n_max", "top500:n-max"),
    ("n_half", "top500:n-half"),
    ("install_site_name", "top500:installation-site/top500:installation-site-name"),
    ("install_site_address", "top500:installation-site/top500:installation-site-address"),
    ("site_id", "top500:installation-site/top500:site-id"),
    ("town", "top500:town"),
    ("state", "top500:state"),
    ("country", "top500:country"),
    ("year", "top500:year"),
    ("area_of_installation", "top500:area-of-installation"),
    ("number_of_processors", "top500:number-of-processors"),
]
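
# Illustrative shape of one <top500:site> element, inferred from the XPaths
# above (element names are an assumption based on FIELDS, not copied from the
# official schema):
#
#   <top500:site xmlns:top500="http://www.top500.org/xml/top500/1.0">
#     <top500:rank>1</top500:rank>
#     <top500:system-name>...</top500:system-name>
#     <top500:installation-site>
#       <top500:installation-site-name>...</top500:installation-site-name>
#     </top500:installation-site>
#     <top500:country>United States</top500:country>
#     ...
#   </top500:site>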

def gettext(elem: Optional[ET.Element]) -> str:
    """Return the stripped text content of an element, or "" if it is missing."""
    return (elem.text or "").strip() if elem is not None else ""

def extract_site(site: ET.Element) -> Dict[str, str]:
    """Pull every known field out of a single <top500:site> element."""
    out: Dict[str, str] = {}
    for label, path in FIELDS:
        node = site.find(path, NS)
        out[label] = gettext(node)
    return out

def filter_country(record: Dict[str, str], country_query: str, exact: bool) -> bool:
    country = record.get("country", "")
    if exact:
        return country == country_query
    # Otherwise: substring match, case-insensitive.
    return country_query.lower() in country.lower()
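
# For example (illustrative values):
#   >>> filter_country({"country": "United States"}, "united", exact=False)
#   True
#   >>> filter_country({"country": "United States"}, "united", exact=True)
#   False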

def load_sites(xml_path: str) -> List[ET.Element]:
    # TOP500 exports are small enough to parse as a full tree. (A naive
    # iterparse + elem.clear() loop would hand back elements whose children
    # have already been discarded, so the fields could no longer be read.)
    tree = ET.parse(xml_path)
    return tree.getroot().findall("top500:site", NS)
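
# Optional streaming variant, a minimal sketch for very large exports (not
# wired into main()): it extracts each record *before* clearing the element,
# which is the step a naive iterparse + clear() approach gets wrong.
def iter_records(xml_path: str):
    site_tag = f"{{{NS['top500']}}}site"
    for _, elem in ET.iterparse(xml_path, events=("end",)):
        if elem.tag == site_tag:
            yield extract_site(elem)  # read fields while children still exist
            elem.clear()  # then drop children to keep memory bounded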

def to_csv(records: List[Dict[str, str]], fields: List[str], file=sys.stdout) -> None:
    # lineterminator="\n" keeps line endings consistent across platforms.
    writer = csv.DictWriter(file, fieldnames=fields, lineterminator="\n")
    writer.writeheader()
    for r in records:
        writer.writerow({k: r.get(k, "") for k in fields})


def to_tsv(records: List[Dict[str, str]], fields: List[str], file=sys.stdout) -> None:
    writer = csv.DictWriter(file, fieldnames=fields, delimiter="\t", lineterminator="\n")
    writer.writeheader()
    for r in records:
        writer.writerow({k: r.get(k, "") for k in fields})


def to_json(records: List[Dict[str, str]], file=sys.stdout, indent: Optional[int] = 2) -> None:
    json.dump(records, file, indent=indent, ensure_ascii=False)
    file.write("\n")  # json.dump does not emit a trailing newline

def main() -> None:
    parser = argparse.ArgumentParser(
        description="Extract TOP500 listings for a given country from an XML export."
    )
    parser.add_argument(
        "-i", "--input", required=True,
        help="Path to TOP500 XML file (e.g., TOP500_202506_all.xml)",
    )
    parser.add_argument(
        "-c", "--country", required=True,
        help='Country to match (e.g., "United States"). Use --exact for an exact '
             "match; otherwise a case-insensitive substring match is used.",
    )
    parser.add_argument(
        "-f", "--format", choices=["csv", "tsv", "json"], default="csv",
        help="Output format (default: csv)",
    )
    parser.add_argument(
        "--fields",
        default=",".join(label for (label, _) in FIELDS),
        help="Comma-separated list of fields to include. Defaults to all known fields.",
    )
    parser.add_argument(
        "--exact", action="store_true",
        help="Use exact match for country (default: case-insensitive substring match).",
    )
    parser.add_argument("--pretty", action="store_true", help="Pretty-print JSON.")
    args = parser.parse_args()

    wanted_fields = [f.strip() for f in args.fields.split(",") if f.strip()]
    unknown = [f for f in wanted_fields if f not in {lbl for (lbl, _) in FIELDS}]
    if unknown:
        parser.error(f"Unknown field(s): {', '.join(unknown)}")

    try:
        sites = load_sites(args.input)
    except ET.ParseError as e:
        sys.stderr.write(f"XML parse error: {e}\n")
        sys.exit(2)
    except OSError as e:
        sys.stderr.write(f"I/O error: {e}\n")
        sys.exit(2)

    records = [extract_site(s) for s in sites]
    filtered = [r for r in records if filter_country(r, args.country, args.exact)]

    if args.format == "csv":
        to_csv(filtered, wanted_fields)
    elif args.format == "tsv":
        to_tsv(filtered, wanted_fields)
    else:  # json
        # Honour --fields for JSON output too, not just for CSV/TSV.
        projected = [{k: r.get(k, "") for k in wanted_fields} for r in filtered]
        to_json(projected, indent=2 if args.pretty else None)


if __name__ == "__main__":
    main()
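
# Example invocations (script name and output paths are illustrative; the XML
# file name follows the help text above):
#   python3 extract_top500.py -i TOP500_202506_all.xml -c "United States" > us.csv
#   python3 extract_top500.py -i TOP500_202506_all.xml -c Germany --exact -f json --pretty
#   python3 extract_top500.py -i TOP500_202506_all.xml -c Japan -f tsv --fields rank,system_name,r_max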