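The snippet below references prompt and examples, which are defined in an earlier step of the walkthrough. As a rough illustration only, assuming LangExtract's documented few-shot API (lx.data.ExampleData and lx.data.Extraction) and a made-up coverage-extraction task, they might look something like this:

import langextract as lx

# Hypothetical placeholder prompt; the actual prompt is defined earlier in the walkthrough
prompt = "Extract coverage items and their limits from the insurance policy text."

# Hypothetical few-shot example illustrating LangExtract's ExampleData/Extraction classes
examples = [
    lx.data.ExampleData(
        text="Collision coverage is subject to a deductible of $500 per incident.",
        extractions=[
            lx.data.Extraction(
                extraction_class="coverage",
                extraction_text="Collision coverage",
                attributes={"deductible": "$500"},
            )
        ],
    )
]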
import langextract as lx

# Run the PDF processor (custom helper defined earlier) on the insurance policy PDF
# and get the concatenated parsed text
processor = PDFProcessor("data/input/driveshield_specimen_policy_value_plan.pdf")
input_text = processor.get_all_text()

# Run extraction
result = lx.extract(
    text_or_documents=input_text,
    prompt_description=prompt,
    examples=examples,
    model_id="gemma3:4b",
    model_url="http://localhost:11434",  # Endpoint URL for the self-hosted model; default Ollama server URL used here
    fence_output=False,  # Whether to expect/generate fenced output (```json or ```yaml); when False, raw JSON/YAML is expected
    use_schema_constraints=False,  # LangExtract does not yet implement schema constraints for Ollama models
    max_char_buffer=2000,  # Max number of characters per inference chunk
    extraction_passes=2,  # Number of sequential extraction passes; values > 1 run independent extractions and merge non-overlapping results to improve recall (default is 1)
    temperature=0.0,
)
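After extraction, the result can be inspected and persisted. The sketch below assumes the annotated document returned by lx.extract exposes an extractions list (with extraction_class, extraction_text, and attributes fields) and that lx.io.save_annotated_documents is available, as in LangExtract's documented examples; exact names may vary by version, and the output path is illustrative.

# Inspect the extracted entities
for extraction in result.extractions:
    print(extraction.extraction_class, extraction.extraction_text, extraction.attributes)

# Persist the annotated document as JSONL for downstream review or visualization
lx.io.save_annotated_documents(
    [result],
    output_name="extraction_results.jsonl",
    output_dir="data/output",  # Illustrative path
)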