| """proof of concept: image moderation with pydantic-ai. | |
| tests pydantic-ai's ability to analyze images and detect policy violations | |
| with different policy strictness levels. | |
| usage: | |
| uv run sandbox/test_image_moderation.py | |
| """ | |
| import asyncio | |
| import os | |
| from pathlib import Path | |
| from typing import Literal | |
| import dotenv | |
| import logfire | |
| from pydantic import BaseModel, Field | |
| from pydantic_ai import Agent | |
| from pydantic_ai.messages import BinaryImage | |
| dotenv.load_dotenv() | |
| logfire.configure(token=os.getenv("LOGFIRE_WRITE_TOKEN")) | |
| logfire.instrument_pydantic_ai() | |
| class ModerationResult(BaseModel): | |
| """structured moderation result.""" | |
| is_safe: bool = Field(description="whether the image passes moderation") | |
| violated_categories: list[str] = Field( | |
| default_factory=list, | |
| description="categories that were violated (empty if safe)", | |
| ) | |
| severity: Literal["safe", "low", "medium", "high"] = Field( | |
| description="severity of violations" | |
| ) | |
| explanation: str = Field(description="explanation of the decision") | |
| async def moderate_image(image_path: Path, policy: str) -> ModerationResult: | |
| """moderate an image against a policy using pydantic-ai. | |
| args: | |
| image_path: path to image file | |
| policy: moderation policy description | |
| returns: | |
| structured moderation result | |
| """ | |
| # read image bytes | |
| image_bytes = image_path.read_bytes() | |
| # create BinaryImage from raw bytes | |
| binary_image = BinaryImage(data=image_bytes, media_type="image/png") | |
    # create agent with structured output; temperature=0 is applied at run time
    # below for deterministic moderation
    moderator_agent = Agent[None, ModerationResult](
        "gateway/anthropic:claude-sonnet-4-5", output_type=ModerationResult
    )
    # use instructions kwarg for policy enforcement
    instructions = f"""
    Analyze the provided image against this content policy:

    {policy}

    Be accurate and conservative - only flag clear violations.
    """

    result = await moderator_agent.run(
        ["this is the image to moderate", binary_image],
        instructions=instructions,
        model_settings={"temperature": 0},
    )
    return result.output


async def main() -> None:
    """run image moderation proof of concept."""
    sandbox = Path(__file__).parent
    test_image = sandbox / "handguns.png"

    if not test_image.exists():
        raise FileNotFoundError(f"test image not found: {test_image}")

    # policy 1: strict - no weapons at all
    strict_policy = """
    Content Policy - STRICT:

    PROHIBITED CONTENT:
    - Firearms, guns, or any violent weapons
    - Depictions of weapons regardless of context

    Any image containing weapons should be flagged with HIGH severity.
    """

    # policy 2: nuanced - weapons ok if not depicting violence
    nuanced_policy = """
    Content Policy - NUANCED:

    PROHIBITED CONTENT:
    - Explicit acts of violence
    - Weapons being used to harm people or animals
    - Threatening or aggressive poses with weapons

    ALLOWED CONTENT:
    - Static displays of weapons (e.g., collections, product photos)
    - Historical or educational weapon imagery
    - Weapons not being actively used for violence

    Only flag images showing actual violence or threatening behavior.
    Neutral weapon displays without violent context are acceptable.
    """

    print("\n" + "=" * 70)
    print(f"testing image moderation with: {test_image.name}")
    print("=" * 70)

    # test 1: strict policy (should flag)
    print("\ntest 1: strict policy (no weapons allowed)")
    print("-" * 70)
    strict_result = await moderate_image(test_image, strict_policy)
    print(f"is_safe: {strict_result.is_safe}")
    print(f"violated_categories: {strict_result.violated_categories}")
    print(f"severity: {strict_result.severity}")
    print(f"explanation: {strict_result.explanation}")

    # test 2: nuanced policy (should pass - no violence depicted)
    print("\ntest 2: nuanced policy (weapons ok if no violence)")
    print("-" * 70)
    nuanced_result = await moderate_image(test_image, nuanced_policy)
    print(f"is_safe: {nuanced_result.is_safe}")
    print(f"violated_categories: {nuanced_result.violated_categories}")
    print(f"severity: {nuanced_result.severity}")
    print(f"explanation: {nuanced_result.explanation}")

    print("\n" + "=" * 70)
    print("proof of concept complete")
    print("=" * 70)

    # validate results match expectations
    print("\nexpectations:")
| print(f" strict policy should flag: {not strict_result.is_safe} ✓") | |
| print( | |
| f" nuanced policy should pass: {nuanced_result.is_safe} {'✓' if nuanced_result.is_safe else '✗'}" | |
| ) | |


if __name__ == "__main__":
    asyncio.run(main())
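
A note on reuse: moderate_image hardcodes media_type="image/png", which only happens to match the single PNG test image. Below is a minimal sketch, using only the standard-library mimetypes module, of how the media type could instead be inferred from the file extension. The helper name load_image is hypothetical and not part of the script above.

import mimetypes
from pathlib import Path

from pydantic_ai.messages import BinaryImage


def load_image(image_path: Path) -> BinaryImage:
    """read an image file and wrap it with its guessed media type (hypothetical helper)."""
    # guess_type returns (media_type, encoding); only the media type is needed here
    media_type, _ = mimetypes.guess_type(image_path.name)
    if media_type is None or not media_type.startswith("image/"):
        raise ValueError(f"unrecognized image type: {image_path}")
    return BinaryImage(data=image_path.read_bytes(), media_type=media_type)

With such a helper, moderate_image could accept JPEG or WebP test images without modification by building its BinaryImage via load_image(image_path) instead of constructing it inline.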