Created
December 2, 2025 13:38
-
-
Save BramVanroy/90b5b787f6546b79720e35cc3ad928e9 to your computer and use it in GitHub Desktop.
Generate NER data with vLLM Structured Output (regex)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from vllm import LLM, SamplingParams | |
| from vllm.sampling_params import StructuredOutputsParams | |
def generate_output(prompt: str, sampling_params: SamplingParams, llm: LLM) -> str:
    """Run a single generation request and return the text of the first completion.

    Args:
        prompt: The full prompt string sent to the model.
        sampling_params: Sampling configuration (here: structured-output regex + max_tokens).
        llm: An initialized vLLM engine.

    Returns:
        The generated text of the first output of the first request.
    """
    request_outputs = llm.generate(prompt, sampling_params=sampling_params)
    first_request = request_outputs[0]
    return first_request.outputs[0].text
def main():
    """Generate one synthetic NER example in IOB2 format, constrained by a regex."""
    # One '#' comment line describing the sentence, followed by one or more
    # word<TAB>tag lines restricted to the CoNLL-style LOC/ORG/PER tag set.
    ner_regex = r"^# [^#\n]+\n(?:\S+\t(?:O|B-LOC|I-LOC|B-ORG|I-ORG|B-PER|I-PER)\n)+"

    prompt_ner = """Generate an example of named entity recognition (NER) data in tab-seperated IOB2 format. Start with a comment line that begins with a '#' character describing the sentence. Each subsequent line should contain a word followed by its corresponding NER tag (O, B-LOC, I-LOC, B-ORG, I-ORG, B-PER, I-PER), separated by a tab character.
Ensure that the output strictly adheres to this format.
Example:
# Crude-oil prices rose Wednesday as strengthening Hurricane Rita, now a Category 5 storm, threatened to disrupt oil production in the Gulf of Mexico.
Crude	O
-	O
oil	O
prices	O
rose	O
Wednesday	O
as	O
strengthening	O
Hurricane	O
Rita	O
,	O
now	O
a	O
Category	O
5	O
storm	O
,	O
threatened	O
to	O
disrupt	O
oil	O
production	O
in	O
the	O
Gulf	B-LOC
of	I-LOC
Mexico	I-LOC
.	O
"""

    # Constrain decoding so the model can only emit text matching the regex.
    sampling_params_ner = SamplingParams(
        structured_outputs=StructuredOutputsParams(regex=ner_regex),
        max_tokens=200,
    )

    # Works better with a larger model, this is just an example.
    llm = LLM(model="Qwen/Qwen3-4B-Instruct-2507", max_model_len=480)

    output = generate_output(prompt_ner, sampling_params_ner, llm)
    print(output)
    # You should probably still verify that the input sentences (after #) has
    # the same tokens as the provided tokens.
    """
Output I got:
# The United States Department of Defense announced a new policy to increase funding for military research in Boston and San Diego.
The	O
United	B-PER
States	B-ORG
Department	B-ORG
of	O
Defense	B-ORG
announced	O
a	O
new	O
policy	O
to	O
increase	O
funding	O
for	O
military	O
research	O
in	O
Boston	B-LOC
and	O
San	O
Diego	B-LOC
.	O
"""


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment