Created
December 2, 2025 13:38
-
-
Save BramVanroy/90b5b787f6546b79720e35cc3ad928e9 to your computer and use it in GitHub Desktop.
Generate NER data with vLLM Structured Output (regex)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from vllm import LLM, SamplingParams | |
| from vllm.sampling_params import StructuredOutputsParams | |
def generate_output(prompt: str, sampling_params: SamplingParams, llm: LLM) -> str:
    """Run a single generation request and return the text of the first completion.

    Args:
        prompt: The full prompt string sent to the model.
        sampling_params: Sampling configuration (here: structured-output regex + max_tokens).
        llm: An initialized vLLM engine.

    Returns:
        The generated text of the first output of the first request.
    """
    request_outputs = llm.generate(prompt, sampling_params=sampling_params)
    first_request = request_outputs[0]
    return first_request.outputs[0].text
def main():
    """Generate one synthetic NER example in IOB2 format, constrained by a regex."""
    # One '#' comment line describing the sentence, followed by one or more
    # word<TAB>tag lines restricted to the CoNLL-style LOC/ORG/PER tag set.
    ner_regex = r"^# [^#\n]+\n(?:\S+\t(?:O|B-LOC|I-LOC|B-ORG|I-ORG|B-PER|I-PER)\n)+"

    prompt_ner = """Generate an example of named entity recognition (NER) data in tab-seperated IOB2 format. Start with a comment line that begins with a '#' character describing the sentence. Each subsequent line should contain a word followed by its corresponding NER tag (O, B-LOC, I-LOC, B-ORG, I-ORG, B-PER, I-PER), separated by a tab character.
Ensure that the output strictly adheres to this format.
Example:
# Crude-oil prices rose Wednesday as strengthening Hurricane Rita, now a Category 5 storm, threatened to disrupt oil production in the Gulf of Mexico.
Crude	O
-	O
oil	O
prices	O
rose	O
Wednesday	O
as	O
strengthening	O
Hurricane	O
Rita	O
,	O
now	O
a	O
Category	O
5	O
storm	O
,	O
threatened	O
to	O
disrupt	O
oil	O
production	O
in	O
the	O
Gulf	B-LOC
of	I-LOC
Mexico	I-LOC
.	O
"""

    # Constrain decoding so the model can only emit text matching the regex.
    sampling_params_ner = SamplingParams(
        structured_outputs=StructuredOutputsParams(regex=ner_regex),
        max_tokens=200,
    )

    # Works better with a larger model, this is just an example.
    llm = LLM(model="Qwen/Qwen3-4B-Instruct-2507", max_model_len=480)

    output = generate_output(prompt_ner, sampling_params_ner, llm)
    print(output)
    # You should probably still verify that the input sentences (after #) has
    # the same tokens as the provided tokens.
    """
Output I got:
# The United States Department of Defense announced a new policy to increase funding for military research in Boston and San Diego.
The	O
United	B-PER
States	B-ORG
Department	B-ORG
of	O
Defense	B-ORG
announced	O
a	O
new	O
policy	O
to	O
increase	O
funding	O
for	O
military	O
research	O
in	O
Boston	B-LOC
and	O
San	O
Diego	B-LOC
.	O
"""


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment