Created
December 8, 2025 23:59
-
-
Save sminot/63cd0f63b08387db4ad94e10a1156c4f to your computer and use it in GitHub Desktop.
Convert NCBI BioSample Metadata to CSV
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| from typing import List | |
| import click | |
| import pandas as pd | |
| import csv | |
| from pathlib import Path | |
def convert_metadata_lines(lines: List[str]) -> pd.DataFrame:
    """Build a DataFrame with one row per blank-line-delimited record.

    Each line is stripped of surrounding whitespace. A line that is empty
    after stripping closes the record being accumulated; any other line is
    handed to ``parse_line`` and the key/value pairs it returns are merged
    into the current record (later keys overwrite earlier ones). The first
    non-blank line of every record is parsed with ``is_first=True``.
    """
    records = []
    current = {}
    for raw in lines:
        text = raw.strip()
        if text:
            # An empty accumulator means this line opens a new record.
            current.update(parse_line(text, is_first=not current))
            continue
        # Blank line: flush the record in progress, if there is one.
        if current:
            records.append(current)
            current = {}
    # The input may not end with a blank line; keep the trailing record.
    if current:
        records.append(current)
    return pd.DataFrame(records)
def parse_line(line: str, is_first: bool) -> dict:
    """Parse a single line of a BioSample text record into key/value pairs.

    Args:
        line: One non-empty line of the record. Leading whitespace is
            tolerated, so both raw and already-stripped lines work.
        is_first: True when this is the first line of a record. Such a line
            is split once on ``": "`` into ``index`` and ``name``.

    Returns:
        A dict of the fields found on the line, or an empty dict when the
        line carries no recognizable field (for example a bare section
        heading such as ``Attributes:``).

    Recognized line shapes, checked in this order:
        * first line of a record -> ``{"index": ..., "name": ...}``
        * ``/key="value"`` attribute -> ``{key: value}`` (leading ``/`` and
          surrounding double quotes removed)
        * ``Identifiers: A: x; B: y`` -> ``{"A": "x", "B": "y"}``
        * ``Accession: x<TAB>ID: y`` -> one entry per tab-separated field
        * ``key: value`` -> ``{key: value}``
    """
    if is_first:
        # zip() truncates, so a header without ": " yields only 'index'.
        return dict(zip(['index', 'name'], line.split(": ", 1)))

    # Attribute lines are matched on the left-stripped text because the
    # caller strips whitespace before parsing; requiring a leading space
    # would make this branch unreachable. It is also checked before the
    # generic "key: value" rule so that a value containing ": " is not
    # mis-split into a bogus key.
    attribute = line.lstrip()
    if attribute.startswith("/"):
        kw, sep, val = attribute[1:].partition("=")
        if sep and kw:
            return {kw: val.strip('"')}
        # No "=": not an attribute after all; fall through to other rules.

    if line.startswith("Identifiers: "):
        _, rest = line.split(": ", 1)
        # Skip malformed pairs instead of raising on a failed unpack.
        return dict(
            pair.split(": ", 1)
            for pair in rest.split("; ")
            if ": " in pair
        )

    if line.startswith("Accession: "):
        fields = (
            field.split(": ", 1)
            for field in line.split("\t")
            if ": " in field
        )
        return {kw: val.strip('"') for kw, val in fields}

    if ": " in line:
        kw, val = line.split(": ", 1)
        return {kw: val}

    return {}
@click.command()
@click.argument('input_file', type=click.Path(exists=True))
def convert_metadata(input_file):
    """Convert a .txt file to .csv format."""
    # NOTE: the docstring above doubles as the command's --help text.
    source = Path(input_file)

    # Reject anything whose suffix is not exactly '.txt' (case-sensitive).
    if source.suffix != '.txt':
        raise click.BadParameter('Input file must have a .txt extension')

    # The CSV lands next to the input, with only the suffix swapped.
    destination = source.with_suffix('.csv')

    # Load every line up front, then parse once the file is closed.
    with open(source, 'r') as handle:
        raw_lines = list(handle)
    table = convert_metadata_lines(raw_lines)

    # Write without the DataFrame's positional index column.
    table.to_csv(destination, index=False)
    click.echo(f'Successfully converted {input_file} to {destination}')
# Run the click command only when executed as a script, not on import.
if __name__ == '__main__':
    convert_metadata()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment