@BexTuychiev
Created November 20, 2025 11:50
"""
Model Comparison Script: Fine-tuned Tinker Model vs Base Qwen3-8B
Uses Kimi K2 Thinking as an unbiased judge to evaluate responses to financial questions.
"""
import os
import json
import time
import requests
from dotenv import load_dotenv
import tinker
from tinker import types

# Load environment variables
load_dotenv()  # Loads the Tinker and OpenRouter API keys from .env

# Configuration
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
TINKER_CHECKPOINT = "tinker://8f13c8d2-d406-4533-810a-268360972ff6/sampler_weights/fincot-checkpoint-400"
TINKER_BASE_MODEL = "Qwen/Qwen3-8B"  # Base model used for fine-tuning
BASE_MODEL = "qwen/qwen3-8b"  # OpenRouter model name
JUDGE_MODEL = "openai/gpt-4o"
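
# A minimal sanity check (sketch): fail fast if the OpenRouter key is missing.
# The Tinker SDK is assumed to read its own API key from the environment; the
# exact variable name is not shown in this script, so check the Tinker docs.
if not OPENROUTER_API_KEY:
    raise RuntimeError(
        "OPENROUTER_API_KEY is not set; add it to your .env file before running."
    )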

# 10 curated financial questions covering diverse topics
FINANCIAL_QUESTIONS = [
    "What are the main risks associated with investing in stocks?",
    "How does diversification help reduce portfolio risk?",
    "What is the difference between a stock and a bond?",
    "Explain the concept of compound interest and its importance in investing.",
    "What factors should I consider when choosing between mutual funds and ETFs?",
    "How do interest rates affect the stock market?",
    "What is dollar-cost averaging and when should it be used?",
    "Explain the difference between value investing and growth investing.",
    "What role does inflation play in investment decisions?",
    "How can I assess if a stock is overvalued or undervalued?",
]


def get_tinker_response(
    question: str, checkpoint_path: str, base_model: str, max_retries: int = 3
) -> str:
    """
    Get response from fine-tuned Tinker model.

    Args:
        question: Financial question to ask
        checkpoint_path: Path to Tinker checkpoint
        base_model: Base model name for tokenizer
        max_retries: Number of retry attempts

    Returns:
        Model response as string
    """
    from transformers import AutoTokenizer

    # Get tokenizer for the base model
    tokenizer = AutoTokenizer.from_pretrained(base_model)

    for attempt in range(max_retries):
        try:
            # Initialize Tinker service client
            service_client = tinker.ServiceClient()

            # Create sampling client from checkpoint
            sampling_client = service_client.create_sampling_client(
                model_path=checkpoint_path
            )

            # Format question in Qwen3 chat format
            prompt = f"<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n"

            # Create model input
            model_input = types.ModelInput.from_ints(tokenizer.encode(prompt))

            # Configure sampling parameters
            sampling_params = types.SamplingParams(
                max_tokens=300,
                temperature=0.7,
                top_p=0.9,
                stop_sequences=["<|im_end|>", "</s>"],
            )

            # Generate response
            response = sampling_client.sample(
                prompt=model_input, num_samples=1, sampling_params=sampling_params
            ).result()

            # Decode and return response
            answer = tokenizer.decode(response.sequences[0].tokens)
            return answer.strip()
        except Exception as e:
            if attempt == max_retries - 1:
                print(
                    f" ⚠ Tinker API error after {max_retries} attempts: {str(e)[:100]}"
                )
                return f"[Error: Failed to get response - {str(e)[:50]}]"
            print(
                f" ⚠ Tinker API error (attempt {attempt + 1}/{max_retries}), retrying..."
            )
            time.sleep(2)

    return "[Error: Max retries exceeded]"


def get_openrouter_response(question: str, model: str, max_retries: int = 3) -> str:
    """
    Get response from OpenRouter API.

    Args:
        question: Financial question to ask
        model: Model identifier on OpenRouter
        max_retries: Number of retry attempts

    Returns:
        Model response as string
    """
    url = "https://openrouter.ai/api/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": question}],
        "max_tokens": 300,
        "temperature": 0.7,
        "top_p": 0.9,
    }

    for attempt in range(max_retries):
        try:
            response = requests.post(url, headers=headers, json=payload, timeout=30)
            response.raise_for_status()
            data = response.json()
            message = data["choices"][0]["message"]

            # Handle reasoning models that return reasoning separately
            answer = message.get("content", "")
            if not answer and "reasoning" in message:
                answer = message["reasoning"]
            return answer.strip()
        except requests.exceptions.RequestException as e:
            if attempt == max_retries - 1:
                print(
                    f" ⚠ OpenRouter API error after {max_retries} attempts: {str(e)[:100]}"
                )
                return f"[Error: Failed to get response - {str(e)[:50]}]"
            print(
                f" ⚠ OpenRouter API error (attempt {attempt + 1}/{max_retries}), retrying..."
            )
            time.sleep(2)

    return "[Error: Max retries exceeded]"


def judge_responses(
    question: str, response_a: str, response_b: str, max_retries: int = 3
) -> dict:
    """
    Use the configured judge model (JUDGE_MODEL) to decide which response is better.

    Args:
        question: The original financial question
        response_a: Response from fine-tuned model
        response_b: Response from base model
        max_retries: Number of retry attempts

    Returns:
        Dictionary with scores and verdict
    """
    url = "https://openrouter.ai/api/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
    }

    judge_prompt = f"""You are an expert financial educator evaluating two AI responses to the same question. Assess both responses objectively based on:
1. **Accuracy**: Correctness of financial information
2. **Clarity**: How well-explained and understandable the response is
3. **Completeness**: Coverage of key concepts relevant to the question
4. **Domain appropriateness**: Use of proper financial terminology
**Question**: {question}
**Response A** (Fine-tuned Model):
{response_a}
**Response B** (Base Model):
{response_b}
Provide your evaluation in the following JSON format (max 200 words for verdict):
{{
"score_a": <1-10>,
"score_b": <1-10>,
"verdict": "<Concise, unbiased comparison explaining scores. Must not exceed 200 words.>"
}}
Be objective and unbiased. Only output valid JSON."""

    payload = {
        "model": JUDGE_MODEL,
        "messages": [{"role": "user", "content": judge_prompt}],
        "max_tokens": 500,
        "temperature": 0.3,  # Lower temperature for more consistent judging
    }

    for attempt in range(max_retries):
        try:
            response = requests.post(url, headers=headers, json=payload, timeout=60)
            response.raise_for_status()
            data = response.json()
            message = data["choices"][0]["message"]

            # Handle reasoning models that return reasoning separately
            judgment_text = message.get("content", "")
            if not judgment_text and "reasoning" in message:
                judgment_text = message["reasoning"]
            judgment_text = judgment_text.strip()

            # Extract JSON from response (handle markdown code blocks)
            if "```json" in judgment_text:
                judgment_text = (
                    judgment_text.split("```json")[1].split("```")[0].strip()
                )
            elif "```" in judgment_text:
                judgment_text = judgment_text.split("```")[1].split("```")[0].strip()

            judgment = json.loads(judgment_text)

            # Validate structure
            if (
                "score_a" in judgment
                and "score_b" in judgment
                and "verdict" in judgment
            ):
                return judgment
            else:
                raise ValueError("Invalid judgment structure")
        except (
            requests.exceptions.RequestException,
            json.JSONDecodeError,
            ValueError,
        ) as e:
            if attempt == max_retries - 1:
                print(
                    f" ⚠ Judge API error after {max_retries} attempts: {str(e)[:100]}"
                )
                return {
                    "score_a": 5,
                    "score_b": 5,
                    "verdict": f"[Error: Failed to get judgment - {str(e)[:50]}]",
                }
            print(
                f" ⚠ Judge API error (attempt {attempt + 1}/{max_retries}), retrying..."
            )
            time.sleep(3)

    return {"score_a": 5, "score_b": 5, "verdict": "[Error: Max retries exceeded]"}


def main():
    """Main comparison workflow."""
    print("=" * 80)
    print(
        "MODEL COMPARISON: Fine-tuned Qwen3-8B (Tinker) vs Base Qwen3-8B (OpenRouter)"
    )
    print("=" * 80)
    print(f"\nFine-tuned Model: {TINKER_CHECKPOINT}")
    print(f"Base Model: {BASE_MODEL}")
    print(f"Judge: {JUDGE_MODEL}")
    print(f"Questions: {len(FINANCIAL_QUESTIONS)}\n")

    results = []
    total_score_finetuned = 0
    total_score_base = 0

    for i, question in enumerate(FINANCIAL_QUESTIONS, 1):
        print(f"\n[Question {i}/{len(FINANCIAL_QUESTIONS)}]")
        print(f"Q: {question}")
        print("-" * 80)

        # Get responses from both models
        print(" πŸ”„ Getting fine-tuned model response...")
        finetuned_response = get_tinker_response(
            question, TINKER_CHECKPOINT, TINKER_BASE_MODEL
        )

        print(" πŸ”„ Getting base model response...")
        base_response = get_openrouter_response(question, BASE_MODEL)

        # Judge the responses
        print(" βš–οΈ Judging responses...")
        judgment = judge_responses(question, finetuned_response, base_response)

        # Store results
        result = {
            "question": question,
            "finetuned_response": finetuned_response,
            "base_response": base_response,
            "score_finetuned": judgment["score_a"],
            "score_base": judgment["score_b"],
            "verdict": judgment["verdict"],
        }
        results.append(result)

        # Update totals
        total_score_finetuned += judgment["score_a"]
        total_score_base += judgment["score_b"]

        # Display scores
        print(
            f" πŸ“Š Scores: Fine-tuned={judgment['score_a']}/10 | Base={judgment['score_b']}/10"
        )
        print(f" πŸ’­ Verdict: {judgment['verdict'][:100]}...")

        # Delay to respect rate limits
        if i < len(FINANCIAL_QUESTIONS):
            time.sleep(2)

    # Calculate final statistics
    avg_score_finetuned = total_score_finetuned / len(FINANCIAL_QUESTIONS)
    avg_score_base = total_score_base / len(FINANCIAL_QUESTIONS)
    wins_finetuned = sum(1 for r in results if r["score_finetuned"] > r["score_base"])
    wins_base = sum(1 for r in results if r["score_base"] > r["score_finetuned"])
    ties = sum(1 for r in results if r["score_finetuned"] == r["score_base"])

    # Display final summary
    print("\n" + "=" * 80)
    print("FINAL RESULTS")
    print("=" * 80)
    print("\nAverage Scores:")
    print(f" Fine-tuned Model: {avg_score_finetuned:.2f}/10")
    print(f" Base Model: {avg_score_base:.2f}/10")
    print("\nWin/Loss/Tie:")
    print(f" Fine-tuned Wins: {wins_finetuned}")
    print(f" Base Wins: {wins_base}")
    print(f" Ties: {ties}")

    # Determine overall winner
    if avg_score_finetuned > avg_score_base:
        print(
            f"\nπŸ† WINNER: Fine-tuned Model (+{avg_score_finetuned - avg_score_base:.2f} points)"
        )
    elif avg_score_base > avg_score_finetuned:
        print(
            f"\nπŸ† WINNER: Base Model (+{avg_score_base - avg_score_finetuned:.2f} points)"
        )
    else:
        print("\n🀝 RESULT: Tie")

    # Save results to JSON
    output_data = {
        "metadata": {
            "finetuned_checkpoint": TINKER_CHECKPOINT,
            "base_model": BASE_MODEL,
            "judge_model": JUDGE_MODEL,
            "total_questions": len(FINANCIAL_QUESTIONS),
        },
        "summary": {
            "avg_score_finetuned": avg_score_finetuned,
            "avg_score_base": avg_score_base,
            "wins_finetuned": wins_finetuned,
            "wins_base": wins_base,
            "ties": ties,
        },
        "detailed_results": results,
    }

    output_file = "comparison_results.json"
    with open(output_file, "w") as f:
        json.dump(output_data, f, indent=2)

    print(f"\nβœ“ Detailed results saved to: {output_file}")
    print("=" * 80 + "\n")


if __name__ == "__main__":
    main()
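
# Usage (sketch; script filename and package names assumed):
#   pip install tinker requests python-dotenv transformers
#   python compare_models.py
# Progress is printed to the console and detailed results are written to
# comparison_results.json in the working directory.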