| """ | |
| Model Comparison Script: Fine-tuned Tinker Model vs Base Qwen3-8B | |
| Uses Kimi K2 Thinking as an unbiased judge to evaluate responses to financial questions. | |
| """ | |
| import os | |
| import json | |
| import time | |
| import requests | |
| from dotenv import load_dotenv | |
| import tinker | |
| from tinker import types | |
| # Load environment variables | |
| load_dotenv() # Requires Tinker API key | |
| # Configuration | |
| OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY") | |
| TINKER_CHECKPOINT = "tinker://8f13c8d2-d406-4533-810a-268360972ff6/sampler_weights/fincot-checkpoint-400" | |
| TINKER_BASE_MODEL = "Qwen/Qwen3-8B" # Base model used for fine-tuning | |
| BASE_MODEL = "qwen/qwen3-8b" # OpenRouter model name | |
| JUDGE_MODEL = "openai/gpt-4o" | |
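
# The script reads credentials from a local .env file via python-dotenv. A minimal
# sketch of its expected contents is shown below; the exact variable name the Tinker
# SDK reads for its key is an assumption here - check your Tinker account settings.
#
#   OPENROUTER_API_KEY=sk-or-...
#   TINKER_API_KEY=...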

# 10 curated financial questions covering diverse topics
FINANCIAL_QUESTIONS = [
    "What are the main risks associated with investing in stocks?",
    "How does diversification help reduce portfolio risk?",
    "What is the difference between a stock and a bond?",
    "Explain the concept of compound interest and its importance in investing.",
    "What factors should I consider when choosing between mutual funds and ETFs?",
    "How do interest rates affect the stock market?",
    "What is dollar-cost averaging and when should it be used?",
    "Explain the difference between value investing and growth investing.",
    "What role does inflation play in investment decisions?",
    "How can I assess if a stock is overvalued or undervalued?",
]


def get_tinker_response(
    question: str, checkpoint_path: str, base_model: str, max_retries: int = 3
) -> str:
    """
    Get response from fine-tuned Tinker model.

    Args:
        question: Financial question to ask
        checkpoint_path: Path to Tinker checkpoint
        base_model: Base model name for tokenizer
        max_retries: Number of retry attempts

    Returns:
        Model response as string
    """
    from transformers import AutoTokenizer

    # Get tokenizer for the base model
    tokenizer = AutoTokenizer.from_pretrained(base_model)

    for attempt in range(max_retries):
        try:
            # Initialize Tinker service client
            service_client = tinker.ServiceClient()

            # Create sampling client from checkpoint
            sampling_client = service_client.create_sampling_client(
                model_path=checkpoint_path
            )

            # Format question in Qwen3 chat format
            prompt = f"<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n"

            # Create model input
            model_input = types.ModelInput.from_ints(tokenizer.encode(prompt))

            # Configure sampling parameters
            sampling_params = types.SamplingParams(
                max_tokens=300,
                temperature=0.7,
                top_p=0.9,
                stop_sequences=["<|im_end|>", "</s>"],
            )

            # Generate response
            response = sampling_client.sample(
                prompt=model_input, num_samples=1, sampling_params=sampling_params
            ).result()

            # Decode and return response
            answer = tokenizer.decode(response.sequences[0].tokens)
            return answer.strip()

        except Exception as e:
            if attempt == max_retries - 1:
                print(
                    f"  ❌ Tinker API error after {max_retries} attempts: {str(e)[:100]}"
                )
                return f"[Error: Failed to get response - {str(e)[:50]}]"
            print(
                f"  ⚠️ Tinker API error (attempt {attempt + 1}/{max_retries}), retrying..."
            )
            time.sleep(2)

    return "[Error: Max retries exceeded]"


def get_openrouter_response(question: str, model: str, max_retries: int = 3) -> str:
    """
    Get response from OpenRouter API.

    Args:
        question: Financial question to ask
        model: Model identifier on OpenRouter
        max_retries: Number of retry attempts

    Returns:
        Model response as string
    """
    url = "https://openrouter.ai/api/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": question}],
        "max_tokens": 300,
        "temperature": 0.7,
        "top_p": 0.9,
    }

    for attempt in range(max_retries):
        try:
            response = requests.post(url, headers=headers, json=payload, timeout=30)
            response.raise_for_status()
            data = response.json()
            message = data["choices"][0]["message"]

            # Handle reasoning models that return reasoning separately
            answer = message.get("content", "")
            if not answer and "reasoning" in message:
                answer = message["reasoning"]

            return answer.strip()

        except requests.exceptions.RequestException as e:
            if attempt == max_retries - 1:
                print(
                    f"  ❌ OpenRouter API error after {max_retries} attempts: {str(e)[:100]}"
                )
                return f"[Error: Failed to get response - {str(e)[:50]}]"
            print(
                f"  ⚠️ OpenRouter API error (attempt {attempt + 1}/{max_retries}), retrying..."
            )
            time.sleep(2)

    return "[Error: Max retries exceeded]"


def judge_responses(
    question: str, response_a: str, response_b: str, max_retries: int = 3
) -> dict:
    """
    Use the configured judge model (JUDGE_MODEL) to decide which response is better.

    Args:
        question: The original financial question
        response_a: Response from fine-tuned model
        response_b: Response from base model
        max_retries: Number of retry attempts

    Returns:
        Dictionary with scores and verdict
    """
    url = "https://openrouter.ai/api/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
    }

    judge_prompt = f"""You are an expert financial educator evaluating two AI responses to the same question. Assess both responses objectively based on:

1. **Accuracy**: Correctness of financial information
2. **Clarity**: How well-explained and understandable the response is
3. **Completeness**: Coverage of key concepts relevant to the question
4. **Domain appropriateness**: Use of proper financial terminology

**Question**: {question}

**Response A** (Fine-tuned Model):
{response_a}

**Response B** (Base Model):
{response_b}

Provide your evaluation in the following JSON format (max 200 words for verdict):
{{
    "score_a": <1-10>,
    "score_b": <1-10>,
    "verdict": "<Concise, unbiased comparison explaining scores. Must not exceed 200 words.>"
}}

Be objective and unbiased. Only output valid JSON."""

    payload = {
        "model": JUDGE_MODEL,
        "messages": [{"role": "user", "content": judge_prompt}],
        "max_tokens": 500,
        "temperature": 0.3,  # Lower temperature for more consistent judging
    }

    for attempt in range(max_retries):
        try:
            response = requests.post(url, headers=headers, json=payload, timeout=60)
            response.raise_for_status()
            data = response.json()
            message = data["choices"][0]["message"]

            # Handle reasoning models that return reasoning separately
            judgment_text = message.get("content", "")
            if not judgment_text and "reasoning" in message:
                judgment_text = message["reasoning"]
            judgment_text = judgment_text.strip()

            # Extract JSON from response (handle markdown code blocks)
            if "```json" in judgment_text:
                judgment_text = (
                    judgment_text.split("```json")[1].split("```")[0].strip()
                )
            elif "```" in judgment_text:
                judgment_text = judgment_text.split("```")[1].split("```")[0].strip()

            judgment = json.loads(judgment_text)

            # Validate structure
            if (
                "score_a" in judgment
                and "score_b" in judgment
                and "verdict" in judgment
            ):
                return judgment
            else:
                raise ValueError("Invalid judgment structure")

        except (
            requests.exceptions.RequestException,
            json.JSONDecodeError,
            ValueError,
        ) as e:
            if attempt == max_retries - 1:
                print(
                    f"  ❌ Judge API error after {max_retries} attempts: {str(e)[:100]}"
                )
                return {
                    "score_a": 5,
                    "score_b": 5,
                    "verdict": f"[Error: Failed to get judgment - {str(e)[:50]}]",
                }
            print(
                f"  ⚠️ Judge API error (attempt {attempt + 1}/{max_retries}), retrying..."
            )
            time.sleep(3)

    return {"score_a": 5, "score_b": 5, "verdict": "[Error: Max retries exceeded]"}


def main():
    """Main comparison workflow."""
    print("=" * 80)
    print(
        "MODEL COMPARISON: Fine-tuned Qwen3-8B (Tinker) vs Base Qwen3-8B (OpenRouter)"
    )
    print("=" * 80)
    print(f"\nFine-tuned Model: {TINKER_CHECKPOINT}")
    print(f"Base Model: {BASE_MODEL}")
    print(f"Judge: {JUDGE_MODEL}")
    print(f"Questions: {len(FINANCIAL_QUESTIONS)}\n")

    results = []
    total_score_finetuned = 0
    total_score_base = 0

    for i, question in enumerate(FINANCIAL_QUESTIONS, 1):
        print(f"\n[Question {i}/{len(FINANCIAL_QUESTIONS)}]")
        print(f"Q: {question}")
        print("-" * 80)

        # Get responses from both models
        print("  🤖 Getting fine-tuned model response...")
        finetuned_response = get_tinker_response(
            question, TINKER_CHECKPOINT, TINKER_BASE_MODEL
        )

        print("  🤖 Getting base model response...")
        base_response = get_openrouter_response(question, BASE_MODEL)

        # Judge the responses
        print("  ⚖️ Judging responses...")
        judgment = judge_responses(question, finetuned_response, base_response)

        # Store results
        result = {
            "question": question,
            "finetuned_response": finetuned_response,
            "base_response": base_response,
            "score_finetuned": judgment["score_a"],
            "score_base": judgment["score_b"],
            "verdict": judgment["verdict"],
        }
        results.append(result)

        # Update totals
        total_score_finetuned += judgment["score_a"]
        total_score_base += judgment["score_b"]

        # Display scores
        print(
            f"  📊 Scores: Fine-tuned={judgment['score_a']}/10 | Base={judgment['score_b']}/10"
        )
        print(f"  📝 Verdict: {judgment['verdict'][:100]}...")

        # Delay to respect rate limits
        if i < len(FINANCIAL_QUESTIONS):
            time.sleep(2)

    # Calculate final statistics
    avg_score_finetuned = total_score_finetuned / len(FINANCIAL_QUESTIONS)
    avg_score_base = total_score_base / len(FINANCIAL_QUESTIONS)
    wins_finetuned = sum(1 for r in results if r["score_finetuned"] > r["score_base"])
    wins_base = sum(1 for r in results if r["score_base"] > r["score_finetuned"])
    ties = sum(1 for r in results if r["score_finetuned"] == r["score_base"])

    # Display final summary
    print("\n" + "=" * 80)
    print("FINAL RESULTS")
    print("=" * 80)
    print("\nAverage Scores:")
    print(f"  Fine-tuned Model: {avg_score_finetuned:.2f}/10")
    print(f"  Base Model: {avg_score_base:.2f}/10")
    print("\nWin/Loss/Tie:")
    print(f"  Fine-tuned Wins: {wins_finetuned}")
    print(f"  Base Wins: {wins_base}")
    print(f"  Ties: {ties}")

    # Determine overall winner
    if avg_score_finetuned > avg_score_base:
        print(
            f"\n🏆 WINNER: Fine-tuned Model (+{avg_score_finetuned - avg_score_base:.2f} points)"
        )
    elif avg_score_base > avg_score_finetuned:
        print(
            f"\n🏆 WINNER: Base Model (+{avg_score_base - avg_score_finetuned:.2f} points)"
        )
    else:
        print("\n🤝 RESULT: Tie")

    # Save results to JSON
    output_data = {
        "metadata": {
            "finetuned_checkpoint": TINKER_CHECKPOINT,
            "base_model": BASE_MODEL,
            "judge_model": JUDGE_MODEL,
            "total_questions": len(FINANCIAL_QUESTIONS),
        },
        "summary": {
            "avg_score_finetuned": avg_score_finetuned,
            "avg_score_base": avg_score_base,
            "wins_finetuned": wins_finetuned,
            "wins_base": wins_base,
            "ties": ties,
        },
        "detailed_results": results,
    }

    output_file = "comparison_results.json"
    with open(output_file, "w") as f:
        json.dump(output_data, f, indent=2)

    print(f"\n✅ Detailed results saved to: {output_file}")
    print("=" * 80 + "\n")


if __name__ == "__main__":
    main()
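
# To run (the filename is illustrative; use whatever you saved this script as):
#   python compare_model_responses.py
# Expects a populated .env file and the dependencies imported above to be installed.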