roelven · January 24, 2026 13:17
diff --git a/README.md b/README.md
diff --git a/parse-blood-results.md b/parse-blood-results.md
diff --git a/parse_blood_pdf.py b/parse_blood_pdf.py
 #!/usr/bin/env python3
 """
 Parse a blood lab PDF into structured JSON using the Anthropic Claude API.

 Usage:
    python parse_blood_pdf.py <pdf_file>
    python parse_blood_pdf.py befund-21012026.pdf

 Requirements:
    - anthropic Python package (pip install anthropic)
    - ANTHROPIC_API_KEY environment variable set
    - PDF file accessible at the given path
 """

 import argparse
 import base64
 import json
 import os
 import sys
 from datetime import datetime
 from pathlib import Path

 try:
    import anthropic
 except ImportError:
    print("Error: anthropic package not installed. Run: pip install anthropic")
    sys.exit(1)


 def load_command_prompt() -> str:
     """Load the parse-blood-results command prompt.

     The prompt file is looked up first in a ``commands/`` directory next to
     this script, then in a ``commands/`` directory relative to the current
     working directory.

     Returns:
         The contents of the prompt file, decoded as UTF-8.

     Raises:
         FileNotFoundError: If the prompt file exists in neither location.
     """
     candidates = (
         Path(__file__).parent / "commands" / "parse-blood-results.md",
         # Relative to the current working directory.
         Path("commands/parse-blood-results.md"),
     )

     for candidate in candidates:
         if candidate.exists():
             # Decode explicitly: the prompt may contain non-ASCII text and
             # the platform default encoding is not UTF-8 everywhere.
             return candidate.read_text(encoding="utf-8")

     # Report every location that was tried, not just the last one.
     searched = ", ".join(str(candidate) for candidate in candidates)
     raise FileNotFoundError(
         f"Command file not found (searched: {searched}). "
         "Ensure parse-blood-results.md is in the commands/ directory."
     )


 def load_pdf_as_base64(pdf_path: str) -> tuple[str, str]:
     """Read a PDF from disk and return it base64-encoded.

     Args:
         pdf_path: Location of the PDF file.

     Returns:
         A ``(encoded_content, file_name)`` pair, where ``file_name`` is the
         final path component of ``pdf_path``.

     Raises:
         FileNotFoundError: If nothing exists at ``pdf_path``.
         ValueError: If the path does not end in ``.pdf`` (case-insensitive).
     """
     source = Path(pdf_path)

     # Existence is checked before the extension, so a missing file is always
     # reported as missing regardless of its name.
     if not source.exists():
         raise FileNotFoundError(f"PDF file not found: {pdf_path}")
     if source.suffix.lower() != ".pdf":
         raise ValueError(f"File must be a PDF: {pdf_path}")

     raw_bytes = source.read_bytes()
     encoded = base64.standard_b64encode(raw_bytes).decode("utf-8")
     return encoded, source.name


 def parse_blood_pdf(pdf_path: str, output_dir: str = None) -> dict:
     """
     Parse a blood lab PDF using Claude and save the result as a JSON file.

     Args:
         pdf_path: Path to the PDF file
         output_dir: Optional output directory for JSON (default: ./health-data/raw/blood-results/)

     Returns:
         Parsed results as a dictionary. If the response cannot be decoded
         into a JSON object, a dictionary with "error" and "raw_response"
         keys is returned instead and nothing is written to disk.

     Raises:
         FileNotFoundError: If the command prompt or the PDF cannot be found.
         ValueError: If the given file is not a PDF.
     """
     # Load the command prompt
     command_prompt = load_command_prompt()

     # Load the PDF
     pdf_base64, pdf_filename = load_pdf_as_base64(pdf_path)

     # Initialize Anthropic client
     client = anthropic.Anthropic()

     # Create the message with PDF attachment
     print(f"Sending {pdf_filename} to Claude for parsing...")

     prompt_text = f"""Parse this blood lab PDF according to the following instructions.

 {command_prompt}

 After parsing, output ONLY the JSON (no markdown code blocks, no explanation before or after).
 The JSON should be valid and complete.

 Source filename: {pdf_filename}
 Parsed at: {datetime.now().isoformat()}
 """

     message = client.messages.create(
         model="claude-sonnet-4-20250514",
         max_tokens=8192,
         messages=[
             {
                 "role": "user",
                 "content": [
                     {
                         "type": "document",
                         "source": {
                             "type": "base64",
                             "media_type": "application/pdf",
                             "data": pdf_base64,
                         },
                     },
                     {
                         "type": "text",
                         "text": prompt_text,
                     },
                 ],
             }
         ],
     )

     # Collect the text blocks of the reply instead of assuming that the
     # first content block exists and is a text block.
     response_text = "".join(
         block.text
         for block in message.content
         if getattr(block, "type", None) == "text"
     )

     # Handle case where Claude wraps the JSON in a code block
     response_text = _strip_code_fence(response_text)

     try:
         results = json.loads(response_text)
     except json.JSONDecodeError as e:
         print(f"Warning: Could not parse response as JSON: {e}")
         print("Raw response:")
         print(response_text[:2000])
         return {"error": "Failed to parse JSON", "raw_response": response_text}

     # Everything below treats the result as a mapping; valid JSON that is
     # not an object (a list, a string, ...) is reported like a parse failure.
     if not isinstance(results, dict):
         print("Warning: Parsed JSON is not an object")
         print("Raw response:")
         print(response_text[:2000])
         return {"error": "Parsed JSON is not an object", "raw_response": response_text}

     # Determine output path
     if output_dir is None:
         target_dir = Path("./health-data/raw/blood-results")
     else:
         target_dir = Path(output_dir)

     target_dir.mkdir(parents=True, exist_ok=True)

     # The date and lab id come from model output, so they are cleaned before
     # being used as file name components.
     test_date = _safe_filename_part(
         results.get("date"), datetime.now().strftime("%Y-%m-%d")
     )
     output_file = target_dir / f"{test_date}.json"

     # Handle existing file: never overwrite an earlier result.
     if output_file.exists():
         lab = results.get("lab")
         raw_lab_id = lab.get("id") if isinstance(lab, dict) else None
         lab_id = _safe_filename_part(raw_lab_id, "unknown")
         output_file = _first_free_path(target_dir, f"{test_date}_{lab_id}")

     # Save the results
     with open(output_file, "w", encoding="utf-8") as f:
         json.dump(results, f, indent=2, ensure_ascii=False)

     print(f"\n✓ Saved to: {output_file}")

     # Print summary
     print_summary(results)

     return results


 def _strip_code_fence(text: str) -> str:
     """Return the body of a leading Markdown code fence, or *text* unchanged if it is not fenced."""
     stripped = text.strip()
     if not stripped.startswith("```"):
         return text

     fenced_lines = []
     # The first line is the opening fence; stop at the closing one.
     for line in stripped.split("\n")[1:]:
         if line.lstrip().startswith("```"):
             break
         fenced_lines.append(line)
     return "\n".join(fenced_lines)


 def _safe_filename_part(value, fallback: str) -> str:
     """Reduce *value* to characters that are safe in a file name, or return *fallback* if nothing is left."""
     text = "" if value is None else str(value)
     cleaned = "".join(
         ch if ch.isalnum() or ch in "-_." else "_" for ch in text
     ).strip("._")
     return cleaned or fallback


 def _first_free_path(directory: Path, stem: str) -> Path:
     """Return ``<stem>.json`` inside *directory*, adding a numeric suffix if that name is taken."""
     candidate = directory / f"{stem}.json"
     counter = 2
     while candidate.exists():
         candidate = directory / f"{stem}_{counter}.json"
         counter += 1
     return candidate


 def print_summary(results: dict):
     """Print a human-readable summary of the parsed results.

     Prints the lab name and date, the number of extracted markers, the
     included panels, any attention items, and how many markers are below or
     above range.

     Args:
         results: Parsed result dictionary. Missing or null "lab", "summary",
             "results", "panels_included" and "flags" entries are tolerated.
     """
     lab = results.get("lab")
     if not isinstance(lab, dict):
         lab = {}
     summary = results.get("summary")
     if not isinstance(summary, dict):
         summary = {}

     print("\n" + "=" * 60)
     print(f"Parsed: {lab.get('name', 'Unknown Lab')} ({results.get('date', 'Unknown date')})")

     # Fall back to counting the result entries when no total is reported.
     marker_count = summary.get("total_markers", len(results.get("results") or []))
     print(f"Markers extracted: {marker_count}")

     panels = results.get("panels_included") or []
     if panels:
         print(f"Panels: {', '.join(str(panel) for panel in panels)}")

     # Show attention items. When only flags are present they are listed
     # instead, so the header is never printed without anything below it.
     # NOTE(review): the schema of "flags" is not visible here; entries are
     # printed via str() - confirm that this reads well.
     attention = summary.get("attention_items") or []
     flags = results.get("flags") or []
     notices = attention or flags

     if notices:
         print("\n⚠️  Attention:")
         for item in notices:
             print(f"  - {item}")

     # Count by status. Without any reported counts nothing is claimed:
     # "all normal" must not be printed just because the numbers are missing.
     low = summary.get("low")
     high = summary.get("high")

     if low is not None or high is not None:
         low = low or 0
         high = high or 0

         if low == 0 and high == 0:
             print("\n✓ All markers within normal range")
         else:
             if low > 0:
                 print(f"\n⬇️  {low} marker(s) below range")
             if high > 0:
                 print(f"⬆️  {high} marker(s) above range")

     print("=" * 60)


 def main():
     """Command-line entry point.

     Parses the arguments, checks that ANTHROPIC_API_KEY is set, and runs
     parse_blood_pdf. Error messages go to stderr. With --json-only, stdout
     carries nothing but the JSON document; progress and summary output is
     sent to stderr instead. The process exits with status 1 on any error,
     including a response that could not be parsed.
     """
     import contextlib

     parser = argparse.ArgumentParser(
         description="Parse blood lab PDF into structured JSON using Claude"
     )
     parser.add_argument(
         "pdf_file",
         help="Path to the blood lab PDF file"
     )
     parser.add_argument(
         "-o", "--output-dir",
         help="Output directory for JSON files (default: ./health-data/raw/blood-results/)",
         default=None
     )
     parser.add_argument(
         "--json-only",
         action="store_true",
         help="Output only the JSON to stdout (for piping)"
     )

     args = parser.parse_args()

     # Check for API key
     if not os.environ.get("ANTHROPIC_API_KEY"):
         print("Error: ANTHROPIC_API_KEY environment variable not set", file=sys.stderr)
         sys.exit(1)

     try:
         if args.json_only:
             # parse_blood_pdf prints progress and a summary; keep those off
             # stdout so that the piped output is valid JSON on its own.
             with contextlib.redirect_stdout(sys.stderr):
                 results = parse_blood_pdf(args.pdf_file, args.output_dir)
             print(json.dumps(results, indent=2, ensure_ascii=False))
         else:
             results = parse_blood_pdf(args.pdf_file, args.output_dir)
     except Exception as e:
         print(f"Error: {e}", file=sys.stderr)
         sys.exit(1)

     # parse_blood_pdf signals an unparseable response with this payload
     # shape; reflect that failure in the exit status.
     if isinstance(results, dict) and "error" in results and "raw_response" in results:
         sys.exit(1)


 if __name__ == "__main__":
    main()
Original (German/English variants)	Normalized `marker`
Hämoglobin, Hemoglobin, Hb	`hemoglobin`
Leukozyten, Leukocytes, WBC	`leukocytes`
Erythrozyten, Erythrocytes, RBC	`erythrocytes`
Thrombozyten, Platelets, PLT	`platelets`
Hämatokrit, Hematocrit, HCT	`hematocrit`
GPT/ALAT, ALT, SGPT	`alt`
GOT/ASAT, AST, SGOT	`ast`
Gamma-GT, GGT, γ-GT	`ggt`
Alkalische Phosphatase, ALP, AP	`alp`
Kreatinin, Creatinine, Crea	`creatinine`
Harnstoff, Urea, BUN	`urea`
Harnsäure, Uric Acid	`uric_acid`
Cholesterin, Cholesterol	`cholesterol_total`
Triglyceride, Triglycerides	`triglycerides`
HDL-Cholesterin, HDL Cholesterol, HDL	`hdl`
LDL-Cholesterin, LDL Cholesterol, LDL	`ldl`
Non-HDL-Cholesterin	`non_hdl`
Ferritin	`ferritin`
Eisen, Iron	`iron`
Transferrin	`transferrin`
Transferrinsättigung, Transferrin Saturation	`transferrin_saturation`
Vitamin D (25-OH), 25-Hydroxyvitamin D, Vitamin D3	`vitamin_d_25oh`
Vitamin B12, Cobalamin	`vitamin_b12`
Folsäure, Folate, Folic Acid	`folate`
TSH, TSH basal	`tsh`
fT3, Free T3, freies T3	`ft3`
fT4, Free T4, freies T4	`ft4`
HbA1c	`hba1c`
HbA1C (n. IFCC), HbA1c IFCC	`hba1c_ifcc`
Glucose, Glukose (nüchtern/fasting)	`glucose_fasting`
Glucose, Glukose (random/nicht nüchtern)	`glucose`
CRP, C-reaktives Protein, C-Reactive Protein	`crp`
hs-CRP, hochsensitives CRP	`crp_hs`
Bilirubin (gesamt), Total Bilirubin	`bilirubin_total`
Bilirubin (direkt), Direct Bilirubin	`bilirubin_direct`
Natrium, Sodium, Na	`sodium`
Kalium, Potassium, K	`potassium`
Calcium, Ca	`calcium`
Magnesium, Mg	`magnesium`
Phosphat, Phosphate, P	`phosphate`
Chlorid, Chloride, Cl	`chloride`
GFR (MDRD), GFR (CKD-EPI), eGFR	`gfr`
Gesamteiweiß, Total Protein	`total_protein`
Albumin	`albumin`
HBs-Antigen, HBsAg	`hbs_antigen`
Anti-HCV, HCV-Ak	`anti_hcv`
Anti-HBs, HBs-Ak	`anti_hbs`
Anti-HBc, HBc-Ak	`anti_hbc`
Marker	SI Unit	Conventional Unit	Conversion
Hemoglobin	mmol/l	g/dl	× 1.611
Hematocrit	l/l	%	× 100
Glucose	mmol/l	mg/dl	× 18.02
Cholesterol (total, HDL, LDL)	mmol/l	mg/dl	× 38.67
Triglycerides	mmol/l	mg/dl	× 88.57
Creatinine	µmol/l	mg/dl	÷ 88.4
Urea	mmol/l	mg/dl	× 6.006
Uric Acid	µmol/l	mg/dl	÷ 59.48
Bilirubin	µmol/l	mg/dl	÷ 17.1
Iron	µmol/l	µg/dl	× 5.587
Calcium	mmol/l	mg/dl	× 4.008
Magnesium	mmol/l	mg/dl	× 2.431
Phosphate	mmol/l	mg/dl	× 3.097
Total Protein	g/l	g/dl	÷ 10
Albumin	g/l	g/dl	÷ 10
MCH	fmol	pg	× 16.11
MCHC	mmol/l	g/dl	× 1.611
Panel	Markers
`complete_blood_count`	leukocytes, erythrocytes, hemoglobin, hematocrit, platelets, mcv, mch, mchc, rdw, mpv, neutrophils_*, lymphocytes_*, monocytes_*, eosinophils_*, basophils_*
`lipid_panel`	cholesterol_total, triglycerides, hdl, ldl, non_hdl, ldl_hdl_ratio, vldl
`liver_function`	alt, ast, ggt, alp, bilirubin_total, bilirubin_direct, albumin
`kidney_function`	creatinine, urea, gfr, cystatin_c, uric_acid
`thyroid`	tsh, ft3, ft4, t3, t4, anti_tpo, anti_tg
`iron_studies`	ferritin, iron, transferrin, transferrin_saturation, tibc
`glycemic`	glucose, glucose_fasting, hba1c, hba1c_ifcc, insulin, c_peptide
`vitamin_d`	vitamin_d_25oh, vitamin_d_1_25oh
`vitamin_b12`	vitamin_b12, holotranscobalamin
`folate`	folate
`inflammation_markers`	crp, crp_hs, esr, il6
`electrolytes`	sodium, potassium, chloride, calcium, magnesium, phosphate
`hepatitis_screening`	hbs_antigen, anti_hbs, anti_hbc, anti_hcv
`metabolic_basic`	glucose_fasting, total_protein, albumin, uric_acid
	#!/usr/bin/env python3
	"""
	Parse a blood lab PDF into structured JSON using the Anthropic Claude API.

	Usage:
	python parse_blood_pdf.py <pdf_file>
	python parse_blood_pdf.py befund-21012026.pdf

	Requirements:
	- anthropic Python package (pip install anthropic)
	- ANTHROPIC_API_KEY environment variable set
	- PDF file accessible at the given path
	"""

	import argparse
	import base64
	import json
	import os
	import sys
	from datetime import datetime
	from pathlib import Path

	try:
	import anthropic
	except ImportError:
	print("Error: anthropic package not installed. Run: pip install anthropic")
	sys.exit(1)


	def load_command_prompt() -> str:
	    """Load the parse-blood-results command prompt.

	    The prompt file is looked up first in a ``commands/`` directory next to
	    this script, then in a ``commands/`` directory relative to the current
	    working directory.

	    Returns:
	        The contents of the prompt file, decoded as UTF-8.

	    Raises:
	        FileNotFoundError: If the prompt file exists in neither location.
	    """
	    candidates = (
	        Path(__file__).parent / "commands" / "parse-blood-results.md",
	        # Relative to the current working directory.
	        Path("commands/parse-blood-results.md"),
	    )

	    for candidate in candidates:
	        if candidate.exists():
	            # Decode explicitly: the prompt may contain non-ASCII text and
	            # the platform default encoding is not UTF-8 everywhere.
	            return candidate.read_text(encoding="utf-8")

	    # Report every location that was tried, not just the last one.
	    searched = ", ".join(str(candidate) for candidate in candidates)
	    raise FileNotFoundError(
	        f"Command file not found (searched: {searched}). "
	        "Ensure parse-blood-results.md is in the commands/ directory."
	    )


	def load_pdf_as_base64(pdf_path: str) -> tuple[str, str]:
	    """Read a PDF from disk and return it base64-encoded.

	    Args:
	        pdf_path: Location of the PDF file.

	    Returns:
	        A ``(encoded_content, file_name)`` pair, where ``file_name`` is the
	        final path component of ``pdf_path``.

	    Raises:
	        FileNotFoundError: If nothing exists at ``pdf_path``.
	        ValueError: If the path does not end in ``.pdf`` (case-insensitive).
	    """
	    source = Path(pdf_path)

	    # Existence is checked before the extension, so a missing file is always
	    # reported as missing regardless of its name.
	    if not source.exists():
	        raise FileNotFoundError(f"PDF file not found: {pdf_path}")
	    if source.suffix.lower() != ".pdf":
	        raise ValueError(f"File must be a PDF: {pdf_path}")

	    raw_bytes = source.read_bytes()
	    encoded = base64.standard_b64encode(raw_bytes).decode("utf-8")
	    return encoded, source.name


	def parse_blood_pdf(pdf_path: str, output_dir: str = None) -> dict:
	    """
	    Parse a blood lab PDF using Claude and save the result as a JSON file.

	    Args:
	        pdf_path: Path to the PDF file
	        output_dir: Optional output directory for JSON (default: ./health-data/raw/blood-results/)

	    Returns:
	        Parsed results as a dictionary. If the response cannot be decoded
	        into a JSON object, a dictionary with "error" and "raw_response"
	        keys is returned instead and nothing is written to disk.

	    Raises:
	        FileNotFoundError: If the command prompt or the PDF cannot be found.
	        ValueError: If the given file is not a PDF.
	    """
	    # Load the command prompt
	    command_prompt = load_command_prompt()

	    # Load the PDF
	    pdf_base64, pdf_filename = load_pdf_as_base64(pdf_path)

	    # Initialize Anthropic client
	    client = anthropic.Anthropic()

	    # Create the message with PDF attachment
	    print(f"Sending {pdf_filename} to Claude for parsing...")

	    prompt_text = f"""Parse this blood lab PDF according to the following instructions.

	{command_prompt}

	After parsing, output ONLY the JSON (no markdown code blocks, no explanation before or after).
	The JSON should be valid and complete.

	Source filename: {pdf_filename}
	Parsed at: {datetime.now().isoformat()}
	"""

	    message = client.messages.create(
	        model="claude-sonnet-4-20250514",
	        max_tokens=8192,
	        messages=[
	            {
	                "role": "user",
	                "content": [
	                    {
	                        "type": "document",
	                        "source": {
	                            "type": "base64",
	                            "media_type": "application/pdf",
	                            "data": pdf_base64,
	                        },
	                    },
	                    {
	                        "type": "text",
	                        "text": prompt_text,
	                    },
	                ],
	            }
	        ],
	    )

	    # Collect the text blocks of the reply instead of assuming that the
	    # first content block exists and is a text block.
	    response_text = "".join(
	        block.text
	        for block in message.content
	        if getattr(block, "type", None) == "text"
	    )

	    # Handle case where Claude wraps the JSON in a code block
	    response_text = _strip_code_fence(response_text)

	    try:
	        results = json.loads(response_text)
	    except json.JSONDecodeError as e:
	        print(f"Warning: Could not parse response as JSON: {e}")
	        print("Raw response:")
	        print(response_text[:2000])
	        return {"error": "Failed to parse JSON", "raw_response": response_text}

	    # Everything below treats the result as a mapping; valid JSON that is
	    # not an object (a list, a string, ...) is reported like a parse failure.
	    if not isinstance(results, dict):
	        print("Warning: Parsed JSON is not an object")
	        print("Raw response:")
	        print(response_text[:2000])
	        return {"error": "Parsed JSON is not an object", "raw_response": response_text}

	    # Determine output path
	    if output_dir is None:
	        target_dir = Path("./health-data/raw/blood-results")
	    else:
	        target_dir = Path(output_dir)

	    target_dir.mkdir(parents=True, exist_ok=True)

	    # The date and lab id come from model output, so they are cleaned before
	    # being used as file name components.
	    test_date = _safe_filename_part(
	        results.get("date"), datetime.now().strftime("%Y-%m-%d")
	    )
	    output_file = target_dir / f"{test_date}.json"

	    # Handle existing file: never overwrite an earlier result.
	    if output_file.exists():
	        lab = results.get("lab")
	        raw_lab_id = lab.get("id") if isinstance(lab, dict) else None
	        lab_id = _safe_filename_part(raw_lab_id, "unknown")
	        output_file = _first_free_path(target_dir, f"{test_date}_{lab_id}")

	    # Save the results
	    with open(output_file, "w", encoding="utf-8") as f:
	        json.dump(results, f, indent=2, ensure_ascii=False)

	    print(f"\n✓ Saved to: {output_file}")

	    # Print summary
	    print_summary(results)

	    return results


	def _strip_code_fence(text: str) -> str:
	    """Return the body of a leading Markdown code fence, or *text* unchanged if it is not fenced."""
	    stripped = text.strip()
	    if not stripped.startswith("```"):
	        return text

	    fenced_lines = []
	    # The first line is the opening fence; stop at the closing one.
	    for line in stripped.split("\n")[1:]:
	        if line.lstrip().startswith("```"):
	            break
	        fenced_lines.append(line)
	    return "\n".join(fenced_lines)


	def _safe_filename_part(value, fallback: str) -> str:
	    """Reduce *value* to characters that are safe in a file name, or return *fallback* if nothing is left."""
	    text = "" if value is None else str(value)
	    cleaned = "".join(
	        ch if ch.isalnum() or ch in "-_." else "_" for ch in text
	    ).strip("._")
	    return cleaned or fallback


	def _first_free_path(directory: Path, stem: str) -> Path:
	    """Return ``<stem>.json`` inside *directory*, adding a numeric suffix if that name is taken."""
	    candidate = directory / f"{stem}.json"
	    counter = 2
	    while candidate.exists():
	        candidate = directory / f"{stem}_{counter}.json"
	        counter += 1
	    return candidate


	def print_summary(results: dict):
	    """Print a human-readable summary of the parsed results.

	    Prints the lab name and date, the number of extracted markers, the
	    included panels, any attention items, and how many markers are below or
	    above range.

	    Args:
	        results: Parsed result dictionary. Missing or null "lab", "summary",
	            "results", "panels_included" and "flags" entries are tolerated.
	    """
	    lab = results.get("lab")
	    if not isinstance(lab, dict):
	        lab = {}
	    summary = results.get("summary")
	    if not isinstance(summary, dict):
	        summary = {}

	    print("\n" + "=" * 60)
	    print(f"Parsed: {lab.get('name', 'Unknown Lab')} ({results.get('date', 'Unknown date')})")

	    # Fall back to counting the result entries when no total is reported.
	    marker_count = summary.get("total_markers", len(results.get("results") or []))
	    print(f"Markers extracted: {marker_count}")

	    panels = results.get("panels_included") or []
	    if panels:
	        print(f"Panels: {', '.join(str(panel) for panel in panels)}")

	    # Show attention items. When only flags are present they are listed
	    # instead, so the header is never printed without anything below it.
	    # NOTE(review): the schema of "flags" is not visible here; entries are
	    # printed via str() - confirm that this reads well.
	    attention = summary.get("attention_items") or []
	    flags = results.get("flags") or []
	    notices = attention or flags

	    if notices:
	        print("\n⚠️  Attention:")
	        for item in notices:
	            print(f"  - {item}")

	    # Count by status. Without any reported counts nothing is claimed:
	    # "all normal" must not be printed just because the numbers are missing.
	    low = summary.get("low")
	    high = summary.get("high")

	    if low is not None or high is not None:
	        low = low or 0
	        high = high or 0

	        if low == 0 and high == 0:
	            print("\n✓ All markers within normal range")
	        else:
	            if low > 0:
	                print(f"\n⬇️  {low} marker(s) below range")
	            if high > 0:
	                print(f"⬆️  {high} marker(s) above range")

	    print("=" * 60)


	def main():
	    """Command-line entry point.

	    Parses the arguments, checks that ANTHROPIC_API_KEY is set, and runs
	    parse_blood_pdf. Error messages go to stderr. With --json-only, stdout
	    carries nothing but the JSON document; progress and summary output is
	    sent to stderr instead. The process exits with status 1 on any error,
	    including a response that could not be parsed.
	    """
	    import contextlib

	    parser = argparse.ArgumentParser(
	        description="Parse blood lab PDF into structured JSON using Claude"
	    )
	    parser.add_argument(
	        "pdf_file",
	        help="Path to the blood lab PDF file"
	    )
	    parser.add_argument(
	        "-o", "--output-dir",
	        help="Output directory for JSON files (default: ./health-data/raw/blood-results/)",
	        default=None
	    )
	    parser.add_argument(
	        "--json-only",
	        action="store_true",
	        help="Output only the JSON to stdout (for piping)"
	    )

	    args = parser.parse_args()

	    # Check for API key
	    if not os.environ.get("ANTHROPIC_API_KEY"):
	        print("Error: ANTHROPIC_API_KEY environment variable not set", file=sys.stderr)
	        sys.exit(1)

	    try:
	        if args.json_only:
	            # parse_blood_pdf prints progress and a summary; keep those off
	            # stdout so that the piped output is valid JSON on its own.
	            with contextlib.redirect_stdout(sys.stderr):
	                results = parse_blood_pdf(args.pdf_file, args.output_dir)
	            print(json.dumps(results, indent=2, ensure_ascii=False))
	        else:
	            results = parse_blood_pdf(args.pdf_file, args.output_dir)
	    except Exception as e:
	        print(f"Error: {e}", file=sys.stderr)
	        sys.exit(1)

	    # parse_blood_pdf signals an unparseable response with this payload
	    # shape; reflect that failure in the exit status.
	    if isinstance(results, dict) and "error" in results and "raw_response" in results:
	        sys.exit(1)


	if __name__ == "__main__":
	main()