Calculates BDR collection size for a given collection-pid. (#repository #public-repo-tools)
# /// script
# requires-python = "==3.12.*"
# dependencies = [
#     "httpx"
# ]
# ///

"""
Computes total byte-size for a BDR collection and returns the size in both bytes and human-readable form.

Usage:
    uv run ./calc_collection_size.py --collection-pid bdr:bwehb8b8

Tweak page-size if desired (API typically caps at <= 500):
    uv run ./calc_collection_size.py --collection-pid bdr:bwehb8b8 --rows 500
"""
import argparse
import logging
import math
import os
import sys
from collections.abc import Generator
from typing import Any

import httpx

log_level_name: str = os.getenv('LOG_LEVEL', 'INFO').upper()
log_level = getattr(
    logging, log_level_name, logging.INFO
)  # maps the string name to the corresponding logging level constant; defaults to INFO
logging.basicConfig(
    level=log_level,
    format='[%(asctime)s] %(levelname)s [%(module)s-%(funcName)s()::%(lineno)d] %(message)s',
    datefmt='%d/%b/%Y %H:%M:%S',
)
log = logging.getLogger(__name__)

## prevent httpx from logging
if log_level <= logging.INFO:
    for noisy in ('httpx', 'httpcore'):
        lg = logging.getLogger(noisy)
        lg.setLevel(logging.WARNING)  # or logging.ERROR if you prefer only errors
        lg.propagate = False  # don't bubble up to root

SEARCH_BASE = 'https://repository.library.brown.edu/api/search/'

# Hardcoded fields used for search requests
FIELDS: list[str] = ['pid', 'object_size_lsi', 'fed_object_size_lsi']


## -- secondary helper functions ------------------------------------


def human_bytes(n: int) -> str:
    """
    Formats a byte count into a human-readable string (e.g., KB, MB, GB).
    Called by `print_results()`.
    """
    if n < 1024:
        return f'{n} B'
    # Choose the next lower unit below the threshold:
    # < 1 MB -> show KB; < 1 GB -> show MB; < 1 TB -> show GB; etc.
    thresholds = [
        (1024**2, 'KB', 1),  # up to MB threshold, show KB
        (1024**3, 'MB', 2),  # up to GB threshold, show MB
        (1024**4, 'GB', 3),  # up to TB threshold, show GB
        (1024**5, 'TB', 4),  # up to PB threshold, show TB
        (1024**6, 'PB', 5),  # up to EB threshold, show PB
    ]
    for upper, unit, power in thresholds:
        if n < upper:
            val = n / (1024**power)
            return f'{val:.2f} {unit}'
    # For extremely large values (>= 1 EB), show EB
    val = n / (1024**6)
    return f'{val:.2f} EB'
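## Usage sketch for `human_bytes()` (values follow directly from the thresholds above):
##   human_bytes(512)          -> '512 B'
##   human_bytes(1536)         -> '1.50 KB'
##   human_bytes(5 * 1024**3)  -> '5.00 GB'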
def fetch_search_page(
    client: httpx.Client,
    collection_pid: str,
    start: int,
    rows: int,
) -> dict[str, Any]:
    """
    Fetches one page of BDR Search API results for a collection.
    Called by `iter_collection_docs()` and `calculate_size()`.
    """
    params = {
        'q': f'rel_is_member_of_collection_ssim:"{collection_pid}"',
        'rows': rows,
        'start': start,
        'fl': ','.join(FIELDS),
    }
    r = client.get(SEARCH_BASE, params=params, timeout=30)
    r.raise_for_status()
    return r.json()


def iter_collection_docs(
    client: httpx.Client,
    collection_pid: str,
    rows: int,
    first_page: dict[str, Any] | None = None,
) -> Generator[dict[str, Any], None, int]:
    """
    Iterates over all search docs for a collection, yielding docs across pages.
    If ``first_page`` is provided, it is used as the first page response to
    avoid a duplicate fetch (useful when the caller already requested it to get
    ``numFound``). The generator "returns" ``num_found`` via StopIteration.value,
    but typical callers just iterate docs.
    Called by `calculate_size()`.
    """
    ## processes all docs in first search response ------------------
    first = first_page or fetch_search_page(client, collection_pid, 0, rows)
    response = first.get('response', {})
    num_found = int(response.get('numFound', 0))
    total_pages = math.ceil(num_found / rows) if rows > 0 else 0
    log.debug(f'iter_collection_docs: num_found={num_found}, rows={rows}, expected_pages={total_pages}')
    docs = response.get('docs', [])
    log.info(f'processing: page=1 start=0 docs_returned={len(docs)}')
    log.debug('about to yield docs')
    yield from docs
    log.debug('yielded initial docs; about to start pagination')
    start = rows
    ## processes all docs in subsequent search responses ------------
    while start < num_found:
        page = fetch_search_page(client, collection_pid, start, rows)
        docs = page.get('response', {}).get('docs', [])
        current_page = (start // rows) + 1  # 0-based offset + 1 for human page index
        log.info(f'processing: page={current_page}, start={start} docs_returned={len(docs)}')
        if not docs:
            log.warning('iter_collection_docs: received empty docs list before reaching num_found; stopping pagination')
            break
        log.debug('about to yield docs #2')
        yield from docs
        log.debug('yielded docs #2; about to increment start')
        start += rows
    log.debug(f'iter_collection_docs: finished pagination at start={start} (num_found={num_found})')
    return num_found  # not used directly by caller (generator semantics)
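## Sketch of how a caller could capture the generator's return value
## (standard Python semantics: a generator's `return` value arrives as
## StopIteration.value). `calculate_size()` below avoids needing this by
## reading `numFound` from the first page directly:
##   gen = iter_collection_docs(client, collection_pid, rows)
##   try:
##       while True:
##           doc = next(gen)
##   except StopIteration as exc:
##       num_found = exc.value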
## -- primary helper functions --------------------------------------


def print_results(collection_pid: str, results: dict[str, int], collection_title: str | None = None) -> None:
    """
    Prints a human-friendly summary of the collection size results.
    Called by `main()`.
    """
    print(' ')
    print(f'Collection: {collection_pid}')
    if collection_title:
        print(f'Title: {collection_title}')
    print(f'Items found: {results["num_found"]}')
    print(f'Items with size counted: {results["counted"]}')
    if results['missing']:
        print(f'Items still missing size: {results["missing"]}')
    print(f'Total bytes: {results["total_bytes"]}')
    print(f'Human: {human_bytes(results["total_bytes"])}')


def fetch_collection_title_via_collection_api(client: httpx.Client, collection_pid: str) -> str | None:
    """
    Fetches the collection's title using the collection API.
    Called by `main()`.
    """
    url: str = f'https://repository.library.brown.edu/api/collections/{collection_pid}/'
    r: httpx.Response = client.get(url, timeout=30)
    if r.status_code == 403:  # restricted collection; treat as "no title available"
        return None
    r.raise_for_status()
    data: dict[str, Any] = r.json()
    title: str | None = data.get('name') or data.get('primary_title')
    log.debug(f'title: ``{title}``')
    return title
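## Assumed shape of the collections-API JSON; only the title keys are read:
##   {'name': 'Some Collection Title', 'primary_title': '...', ...}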
def calculate_size(
    collection_pid: str,
    rows: int,
) -> dict[str, int]:
    """
    Calculates total bytes for a collection and returns summary stats.
    Returns a dict with keys: num_found, counted, missing, total_bytes.
    """
    total_bytes = 0
    counted = 0
    missing = 0
    with httpx.Client(headers={'Accept': 'application/json'}) as client:
        ## get first page to learn numFound for reporting
        first: dict[str, Any] = fetch_search_page(client, collection_pid, 0, rows)
        resp: dict[str, Any] = first.get('response', {})
        num_found: int = int(resp.get('numFound', 0))
        log.info(f'num_found: {num_found}')
        ## process all docs via iterator (avoids duplicating pagination logic)
        for d in iter_collection_docs(client, collection_pid, rows, first_page=first):
            log.debug(f'processing doc-pid ``{d.get("pid")}``')
            ## note: `or` falls back to the second field when the first is absent _or_ zero
            size: int | None = d.get('object_size_lsi') or d.get('fed_object_size_lsi')
            if size is None:
                missing += 1
                log.debug(f'missing count now, ``{missing}``')
            else:
                total_bytes += int(size)
                counted += 1
    return {
        'num_found': num_found,
        'counted': counted,
        'missing': missing,
        'total_bytes': total_bytes,
    }
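## Illustrative (made-up numbers) return value from `calculate_size()`:
##   {'num_found': 250, 'counted': 248, 'missing': 2, 'total_bytes': 123456789}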
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """
    Parses command-line arguments for collection PID and rows.
    Called by `main()`.
    """
    parser = argparse.ArgumentParser(description='Sum total bytes for a BDR collection.')
    parser.add_argument('--collection-pid', type=str, required=True, help='e.g., bdr:bwehb8b8')
    parser.add_argument('--rows', type=int, default=500, help='page size (max usually 500)')
    return parser.parse_args(argv)


def main() -> int:
    """
    Main controller function.
    Called by the `if __name__ == '__main__'` block.
    """
    ## parse args ---------------------------------------------------
    args = parse_args()
    collection_pid: str = args.collection_pid
    rows: int = args.rows
    ## calculate size -----------------------------------------------
    results = calculate_size(collection_pid, rows)
    ## output results -----------------------------------------------
    # fetch title separately to preserve separation-of-concerns (collections API)
    with httpx.Client(headers={'Accept': 'application/json'}) as client:
        collection_title = fetch_collection_title_via_collection_api(client, collection_pid)
    print_results(collection_pid, results, collection_title)
    return 0


if __name__ == '__main__':
    sys.exit(main())