Skip to content

Instantly share code, notes, and snippets.

@CodeWithOz
Last active August 26, 2025 22:34
Show Gist options
  • Select an option

  • Save CodeWithOz/0b01d558adac703673aceca48471dca7 to your computer and use it in GitHub Desktop.

Select an option

Save CodeWithOz/0b01d558adac703673aceca48471dca7 to your computer and use it in GitHub Desktop.
Agent for cleaning up named entities in YouTube video transcripts.
...
class TextReplacement(BaseModel):
    # Schema for the entity-replacement result: a single string field holding
    # the text after the replacement has been applied.
    #
    # NOTE: documented with `#` comments on purpose. Pydantic copies a model's
    # docstring into the generated JSON schema as its description, so adding a
    # docstring here would change what a schema consumer sees.
    replaced_text: str = Field(
        description="The updated text after replacing entities.",
    )
...
class DemoEnrichmentAgent:
    """Agent for cleaning up named entities in YouTube video transcripts."""

    ...  # other class members are elided in this excerpt

    # Chat model wrapped with ``with_structured_output(TextReplacement)``.
    entity_replacer_llm: Runnable

    # System prompt for the replacement call. The example object uses double
    # quotes so that it is valid JSON, matching the "ONLY output a JSON
    # object" instruction that precedes it.
    _REPLACE_ENTITY_SYSTEM_PROMPT = (
        "You are a master at finding and replacing entities in text. "
        "You will receive the text of a YouTube video transcript. "
        "You will also receive the name of an entity to replace and the "
        "canonical name of the entity. Your task is to replace all "
        "occurrences of the entity name with the canonical name. "
        "You should handle edge cases such as partial matches and "
        "special characters. "
        "ONLY output a JSON object with a single key `replaced_text`. "
        'For example, `{"replaced_text": "new text containing updated entity name"}`.'
    )

    def __init__(self):
        ...  # remaining initialisation is elided in this excerpt
        self.entity_replacer_llm = init_chat_model(
            "gpt-5-nano-2025-08-07",
            model_provider="openai",
        ).with_structured_output(TextReplacement)

    ...  # other methods are elided in this excerpt

    async def replace_entity_node(self, state: AgentState):
        """Replace one extracted entity name in the transcript via the LLM.

        The entity is read from ``state["verified_entities"]`` at position
        ``state["replacement_loop_idx"]`` (0 when the key is absent). The text
        sent to the model is ``state["updated_transcript_text"]`` when that is
        set and non-empty, otherwise ``state["transcript_text"]``.

        Args:
            state: Agent state; read with ``.get`` for the keys named above
                and for ``"replacement_pass_count"``.

        Returns:
            A dict with ``"updated_transcript_text"`` (the model's
            ``replaced_text``) and ``"replacement_pass_count"`` (the previous
            count, defaulting to 0, plus one).

        Raises:
            IndexError: If there is no verified entity at the requested index.
            TypeError: If neither transcript key holds a string.
        """
        entity_idx = state.get("replacement_loop_idx", 0)
        verified_entities = state.get("verified_entities", [])
        try:
            entity_to_replace: VerifiedEntity = verified_entities[entity_idx]
        except IndexError as err:
            # Same exception type as a bare subscript would raise, but with
            # enough context to see which index/state caused it.
            raise IndexError(
                f"replacement_loop_idx {entity_idx} is out of range for "
                f"{len(verified_entities)} verified entities"
            ) from err

        pass_count = state.get("replacement_pass_count", 0) + 1

        # On the first pass for the first entity no updated text exists yet,
        # so fall back to the original transcript text.
        updated_text = state.get("updated_transcript_text") or state.get(
            "transcript_text"
        )
        if not isinstance(updated_text, str):
            # Without this guard the string concatenation below fails with an
            # unhelpful "can only concatenate str" TypeError.
            raise TypeError(
                "state must provide 'updated_transcript_text' or "
                f"'transcript_text' as a string, got {type(updated_text).__name__}"
            )

        extracted_name = entity_to_replace.extracted_name
        print(f"Replacing entity: {extracted_name} (pass {pass_count})")

        llm_res: TextReplacement = await self.entity_replacer_llm.ainvoke(
            [
                SystemMessage(content=self._REPLACE_ENTITY_SYSTEM_PROMPT),
                HumanMessage(content="TRANSCRIPT_TEXT:\n\n" + updated_text),
                HumanMessage(content="ENTITY_TO_REPLACE:\n\n" + extracted_name),
                HumanMessage(
                    content="ENTITY_CANONICAL_NAME:\n\n"
                    + entity_to_replace.canonical_name
                ),
            ]
        )
        print(f"Replaced entity: {extracted_name} (pass {pass_count})")

        updated_transcript_text = llm_res.replaced_text
        print(
            f"Updated transcript text after replacing entity {extracted_name} (pass {pass_count}): {updated_transcript_text}"
        )
        return {
            "updated_transcript_text": updated_transcript_text,
            "replacement_pass_count": pass_count,
        }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment