Skip to content

Instantly share code, notes, and snippets.

@sminot
Created December 8, 2025 23:59
Show Gist options
  • Select an option

  • Save sminot/63cd0f63b08387db4ad94e10a1156c4f to your computer and use it in GitHub Desktop.

Select an option

Save sminot/63cd0f63b08387db4ad94e10a1156c4f to your computer and use it in GitHub Desktop.
Convert NCBI BioSample Metadata to CSV
#!/usr/bin/env python3
from typing import List
import click
import pandas as pd
import csv
from pathlib import Path
def convert_metadata_lines(lines: List[str]) -> pd.DataFrame:
    """Assemble blank-line-delimited metadata records into a DataFrame.

    Each input line is stripped of surrounding whitespace. A line that is
    empty after stripping closes the record being built; any other line is
    handed to ``parse_line`` and the key/value pairs it returns are merged
    into the current record (later keys overwrite earlier ones).

    Args:
        lines: Raw text lines, with or without trailing newlines.

    Returns:
        A DataFrame with one row per non-empty record.
    """
    records = []
    current = {}
    for raw in lines:
        text = raw.strip()
        if text:
            # An empty accumulator means this is the record's opening line.
            current.update(parse_line(text, is_first=not current))
            continue
        # Blank line: flush whatever has been collected and start afresh.
        if current:
            records.append(current)
        current = {}
    # The input may not end with a blank line, so flush the final record.
    if current:
        records.append(current)
    return pd.DataFrame(records)
def parse_line(line: str, is_first: bool) -> dict:
    """Parse one line of a BioSample text record into key/value pairs.

    Line shapes recognised, in order of precedence:

    * first line of a record (``is_first``): ``"<index>: <name>"`` becomes
      ``{"index": ..., "name": ...}`` (only ``index`` if there is no
      ``": "`` separator);
    * ``"Identifiers: k1: v1; k2: v2"``: one entry per ``"; "``-separated
      pair;
    * ``"Accession: X<TAB>ID: Y"``: one entry per tab-separated field,
      with surrounding double quotes removed from values;
    * ``/key="value"`` attribute lines (leading whitespace allowed):
      ``{key: value}`` with surrounding double quotes removed;
    * any other ``"key: value"`` line: ``{key: value}``.

    Args:
        line: A single line of text.
        is_first: True when this is the opening line of a record.

    Returns:
        The parsed pairs, or an empty dict for unrecognised lines.
    """
    if is_first:
        return dict(zip(['index', 'name'], line.split(": ", 1)))

    if line.startswith("Identifiers: "):
        _, rest = line.split(": ", 1)
        identifiers = {}
        last_kw = None
        for fragment in rest.split("; "):
            kw, sep, val = fragment.partition(": ")
            if sep:
                identifiers[kw] = val
                last_kw = kw
            elif last_kw is not None:
                # No "key: " prefix: the previous value itself contained
                # "; ", so re-attach the fragment instead of raising.
                identifiers[last_kw] += "; " + fragment
        return identifiers

    if line.startswith("Accession: "):
        return {
            kw: val.strip('"')
            for kw, sep, val in (
                field.partition(": ") for field in line.split("\t")
            )
            if sep
        }

    # Attribute lines must be checked before the generic "key: value" form,
    # otherwise a value containing ": " would be split in the wrong place.
    # The line is stripped here because it may arrive with or without its
    # leading indentation.
    attribute = line.strip()
    if attribute.startswith("/"):
        kw, _, val = attribute[1:].partition("=")
        if not kw:
            return {}
        return {kw: val.strip('"')}

    if ": " in line:
        kw, val = line.split(": ", 1)
        return {kw: val}

    return {}
# NOTE: click shows the function docstring as the command's --help text, so
# it is kept short; implementation notes live in comments instead.
# dir_okay=False makes click reject a directory with a usage error rather
# than letting open() fail later with a traceback.
@click.command()
@click.argument('input_file', type=click.Path(exists=True, dir_okay=False))
def convert_metadata(input_file):
    """Convert a .txt file to .csv format."""
    input_path = Path(input_file)

    # Validate file extension
    if input_path.suffix != '.txt':
        raise click.BadParameter('Input file must have a .txt extension')

    # The output is written next to the input, same stem, .csv suffix.
    output_path = input_path.with_suffix('.csv')

    # Read explicitly as UTF-8: the platform default encoding would
    # otherwise be used, while to_csv() below writes UTF-8.
    with open(input_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    df = convert_metadata_lines(lines)

    # Write to CSV
    df.to_csv(output_path, index=False)
    click.echo(f'Successfully converted {input_file} to {output_path}')
# Script entry point: invoke the click command only when this file is run
# directly, not when it is imported.
if __name__ == "__main__":
    convert_metadata()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment