Last active
August 26, 2025 22:34
-
-
Save CodeWithOz/0b01d558adac703673aceca48471dca7 to your computer and use it in GitHub Desktop.
Agent for cleaning up named entities in YouTube video transcripts.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ... | |
# Structured-output schema for the entity replacer: a single field holding the
# transcript text after the replacement has been applied.
# NOTE(review): documented with comments rather than a class docstring on
# purpose -- presumably a docstring on this model would surface in the schema
# handed to the LLM; confirm before adding one.
class TextReplacement(BaseModel):
    replaced_text: str = Field(
        description="The updated text after replacing entities.",
    )
| ... | |
class DemoEnrichmentAgent:
    """Agent for cleaning up named entities in YouTube video transcripts.

    Only the entity-replacement pieces appear in this excerpt; the ``...``
    markers stand in for members that were left out of it.
    """

    ...
    # Chat model configured via ``with_structured_output(TextReplacement)``.
    entity_replacer_llm: Runnable

    def __init__(self):
        """Set up the chat model used to rewrite transcript text."""
        ...
        chat_model = init_chat_model(
            "gpt-5-nano-2025-08-07",
            model_provider="openai",
        )
        self.entity_replacer_llm = chat_model.with_structured_output(TextReplacement)

    ...

    async def replace_entity_node(self, state: AgentState):
        """Ask the LLM to swap one entity's extracted name for its canonical name.

        Keys read from ``state``:
            replacement_loop_idx: position of the entity to process inside
                ``verified_entities`` (defaults to 0).
            verified_entities: indexed directly, so a missing or empty list
                raises ``IndexError``.
            replacement_pass_count: passes made so far (defaults to 0); this
                call counts as one more.
            updated_transcript_text: text produced by an earlier pass; when it
                is absent or falsy, ``transcript_text`` is used instead.

        Returns:
            A dict with ``updated_transcript_text`` (the ``replaced_text``
            returned by the LLM) and ``replacement_pass_count`` (the
            incremented pass number).
        """
        entity_idx = state.get("replacement_loop_idx", 0)
        candidates = state.get("verified_entities", [])
        entity: VerifiedEntity = candidates[entity_idx]
        pass_number = state.get("replacement_pass_count", 0) + 1

        # On the very first pass for the first entity no rewritten text exists
        # yet, so start from the original transcript.
        source_text = state.get("updated_transcript_text")
        if not source_text:
            source_text = state.get("transcript_text")

        system_prompt = (
            "You are a master at finding and replacing entities in text. "
            "You will receive the text of a YouTube video transcript. "
            "You will also receive the name of an entity to replace and the canonical name of the entity. "
            "Your task is to replace all occurrences of the entity name with the canonical name. "
            "You should handle edge cases such as partial matches and special characters. "
            "ONLY output a JSON object with a single key `replaced_text`. "
            "For example, `{'replaced_text': 'new text containing updated entity name'}`."
        )

        def labelled(label, value):
            # Each input goes in its own human message, prefixed by a label line.
            return HumanMessage(content=label + ":\n\n" + value)

        extracted_name = entity.extracted_name
        print(f"Replacing entity: {extracted_name} (pass {pass_number})")

        response: TextReplacement = await self.entity_replacer_llm.ainvoke(
            [
                SystemMessage(content=system_prompt),
                labelled("TRANSCRIPT_TEXT", source_text),
                labelled("ENTITY_TO_REPLACE", entity.extracted_name),
                labelled("ENTITY_CANONICAL_NAME", entity.canonical_name),
            ]
        )
        print(f"Replaced entity: {extracted_name} (pass {pass_number})")

        rewritten = response.replaced_text
        print(
            f"Updated transcript text after replacing entity {extracted_name} (pass {pass_number}): {rewritten}"
        )
        return {
            "updated_transcript_text": rewritten,
            "replacement_pass_count": pass_number,
        }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment