| """ | |
| Model Comparison Script: Fine-tuned Tinker Model vs Base Qwen3-8B | |
| Uses Kimi K2 Thinking as an unbiased judge to evaluate responses to financial questions. | |
| """ | |
| import os | |
| import json | |
| import time | |
| import requests | |
| from dotenv import load_dotenv | |
| import tinker | |
| from tinker import types | |
| # Load environment variables | |
| load_dotenv() # Requires Tinker API key | |
| # Configuration | |
| OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY") | |
| TINKER_CHECKPOINT = "tinker://8f13c8d2-d406-4533-810a-268360972ff6/sampler_weights/fincot-checkpoint-400" | |
| TINKER_BASE_MODEL = "Qwen/Qwen3-8B" # Base model used for fine-tuning | |
| BASE_MODEL = "qwen/qwen3-8b" # OpenRouter model name | |
| JUDGE_MODEL = "openai/gpt-4o" | |
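
# The script reads credentials from a local .env file via python-dotenv. A minimal
# sketch of its expected contents is shown below; the exact variable name the Tinker
# SDK reads for its key is an assumption here - check your Tinker account settings.
#
#   OPENROUTER_API_KEY=sk-or-...
#   TINKER_API_KEY=...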

# 10 curated financial questions covering diverse topics
FINANCIAL_QUESTIONS = [
    "What are the main risks associated with investing in stocks?",
    "How does diversification help reduce portfolio risk?",
    "What is the difference between a stock and a bond?",
    "Explain the concept of compound interest and its importance in investing.",
    "What factors should I consider when choosing between mutual funds and ETFs?",
    "How do interest rates affect the stock market?",
    "What is dollar-cost averaging and when should it be used?",
    "Explain the difference between value investing and growth investing.",
    "What role does inflation play in investment decisions?",
    "How can I assess if a stock is overvalued or undervalued?",
]


def get_tinker_response(
    question: str, checkpoint_path: str, base_model: str, max_retries: int = 3
) -> str:
    """
    Get response from fine-tuned Tinker model.

    Args:
        question: Financial question to ask
        checkpoint_path: Path to Tinker checkpoint
        base_model: Base model name for tokenizer
        max_retries: Number of retry attempts

    Returns:
        Model response as string
    """
    from transformers import AutoTokenizer

    # Get tokenizer for the base model
    tokenizer = AutoTokenizer.from_pretrained(base_model)

    for attempt in range(max_retries):
        try:
            # Initialize Tinker service client
            service_client = tinker.ServiceClient()

            # Create sampling client from checkpoint
            sampling_client = service_client.create_sampling_client(
                model_path=checkpoint_path
            )

            # Format question in Qwen3 chat format
            prompt = f"<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n"

            # Create model input
            model_input = types.ModelInput.from_ints(tokenizer.encode(prompt))

            # Configure sampling parameters
            sampling_params = types.SamplingParams(
                max_tokens=300,
                temperature=0.7,
                top_p=0.9,
                stop_sequences=["<|im_end|>", "</s>"],
            )

            # Generate response
            response = sampling_client.sample(
                prompt=model_input, num_samples=1, sampling_params=sampling_params
            ).result()

            # Decode and return response
            answer = tokenizer.decode(response.sequences[0].tokens)
            return answer.strip()

        except Exception as e:
            if attempt == max_retries - 1:
                print(
                    f"  ❌ Tinker API error after {max_retries} attempts: {str(e)[:100]}"
                )
                return f"[Error: Failed to get response - {str(e)[:50]}]"
            print(
                f"  ⚠️ Tinker API error (attempt {attempt + 1}/{max_retries}), retrying..."
            )
            time.sleep(2)

    return "[Error: Max retries exceeded]"


def get_openrouter_response(question: str, model: str, max_retries: int = 3) -> str:
    """
    Get response from OpenRouter API.

    Args:
        question: Financial question to ask
        model: Model identifier on OpenRouter
        max_retries: Number of retry attempts

    Returns:
        Model response as string
    """
    url = "https://openrouter.ai/api/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": question}],
        "max_tokens": 300,
        "temperature": 0.7,
        "top_p": 0.9,
    }

    for attempt in range(max_retries):
        try:
            response = requests.post(url, headers=headers, json=payload, timeout=30)
            response.raise_for_status()
            data = response.json()
            message = data["choices"][0]["message"]

            # Handle reasoning models that return reasoning separately
            answer = message.get("content", "")
            if not answer and "reasoning" in message:
                answer = message["reasoning"]

            return answer.strip()

        except requests.exceptions.RequestException as e:
            if attempt == max_retries - 1:
                print(
                    f"  ❌ OpenRouter API error after {max_retries} attempts: {str(e)[:100]}"
                )
                return f"[Error: Failed to get response - {str(e)[:50]}]"
            print(
                f"  ⚠️ OpenRouter API error (attempt {attempt + 1}/{max_retries}), retrying..."
            )
            time.sleep(2)

    return "[Error: Max retries exceeded]"


def judge_responses(
    question: str, response_a: str, response_b: str, max_retries: int = 3
) -> dict:
    """
    Use the configured judge model (JUDGE_MODEL) to decide which response is better.

    Args:
        question: The original financial question
        response_a: Response from fine-tuned model
        response_b: Response from base model
        max_retries: Number of retry attempts

    Returns:
        Dictionary with scores and verdict
    """
    url = "https://openrouter.ai/api/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
    }

    judge_prompt = f"""You are an expert financial educator evaluating two AI responses to the same question. Assess both responses objectively based on:

1. **Accuracy**: Correctness of financial information
2. **Clarity**: How well-explained and understandable the response is
3. **Completeness**: Coverage of key concepts relevant to the question
4. **Domain appropriateness**: Use of proper financial terminology

**Question**: {question}

**Response A** (Fine-tuned Model):
{response_a}

**Response B** (Base Model):
{response_b}

Provide your evaluation in the following JSON format (max 200 words for verdict):
{{
    "score_a": <1-10>,
    "score_b": <1-10>,
    "verdict": "<Concise, unbiased comparison explaining scores. Must not exceed 200 words.>"
}}

Be objective and unbiased. Only output valid JSON."""

    payload = {
        "model": JUDGE_MODEL,
        "messages": [{"role": "user", "content": judge_prompt}],
        "max_tokens": 500,
        "temperature": 0.3,  # Lower temperature for more consistent judging
    }

    for attempt in range(max_retries):
        try:
            response = requests.post(url, headers=headers, json=payload, timeout=60)
            response.raise_for_status()
            data = response.json()
            message = data["choices"][0]["message"]

            # Handle reasoning models that return reasoning separately
            judgment_text = message.get("content", "")
            if not judgment_text and "reasoning" in message:
                judgment_text = message["reasoning"]
            judgment_text = judgment_text.strip()

            # Extract JSON from response (handle markdown code blocks)
            if "```json" in judgment_text:
                judgment_text = (
                    judgment_text.split("```json")[1].split("```")[0].strip()
                )
            elif "```" in judgment_text:
                judgment_text = judgment_text.split("```")[1].split("```")[0].strip()

            judgment = json.loads(judgment_text)

            # Validate structure
            if (
                "score_a" in judgment
                and "score_b" in judgment
                and "verdict" in judgment
            ):
                return judgment
            else:
                raise ValueError("Invalid judgment structure")

        except (
            requests.exceptions.RequestException,
            json.JSONDecodeError,
            ValueError,
        ) as e:
            if attempt == max_retries - 1:
                print(
                    f"  ❌ Judge API error after {max_retries} attempts: {str(e)[:100]}"
                )
                return {
                    "score_a": 5,
                    "score_b": 5,
                    "verdict": f"[Error: Failed to get judgment - {str(e)[:50]}]",
                }
            print(
                f"  ⚠️ Judge API error (attempt {attempt + 1}/{max_retries}), retrying..."
            )
            time.sleep(3)

    return {"score_a": 5, "score_b": 5, "verdict": "[Error: Max retries exceeded]"}


def main():
    """Main comparison workflow."""
    print("=" * 80)
    print(
        "MODEL COMPARISON: Fine-tuned Qwen3-8B (Tinker) vs Base Qwen3-8B (OpenRouter)"
    )
    print("=" * 80)
    print(f"\nFine-tuned Model: {TINKER_CHECKPOINT}")
    print(f"Base Model: {BASE_MODEL}")
    print(f"Judge: {JUDGE_MODEL}")
    print(f"Questions: {len(FINANCIAL_QUESTIONS)}\n")

    results = []
    total_score_finetuned = 0
    total_score_base = 0

    for i, question in enumerate(FINANCIAL_QUESTIONS, 1):
        print(f"\n[Question {i}/{len(FINANCIAL_QUESTIONS)}]")
        print(f"Q: {question}")
        print("-" * 80)

        # Get responses from both models
        print("  🤖 Getting fine-tuned model response...")
        finetuned_response = get_tinker_response(
            question, TINKER_CHECKPOINT, TINKER_BASE_MODEL
        )

        print("  🤖 Getting base model response...")
        base_response = get_openrouter_response(question, BASE_MODEL)

        # Judge the responses
        print("  ⚖️ Judging responses...")
        judgment = judge_responses(question, finetuned_response, base_response)

        # Store results
        result = {
            "question": question,
            "finetuned_response": finetuned_response,
            "base_response": base_response,
            "score_finetuned": judgment["score_a"],
            "score_base": judgment["score_b"],
            "verdict": judgment["verdict"],
        }
        results.append(result)

        # Update totals
        total_score_finetuned += judgment["score_a"]
        total_score_base += judgment["score_b"]

        # Display scores
        print(
            f"  📊 Scores: Fine-tuned={judgment['score_a']}/10 | Base={judgment['score_b']}/10"
        )
        print(f"  📝 Verdict: {judgment['verdict'][:100]}...")

        # Delay to respect rate limits
        if i < len(FINANCIAL_QUESTIONS):
            time.sleep(2)

    # Calculate final statistics
    avg_score_finetuned = total_score_finetuned / len(FINANCIAL_QUESTIONS)
    avg_score_base = total_score_base / len(FINANCIAL_QUESTIONS)
    wins_finetuned = sum(1 for r in results if r["score_finetuned"] > r["score_base"])
    wins_base = sum(1 for r in results if r["score_base"] > r["score_finetuned"])
    ties = sum(1 for r in results if r["score_finetuned"] == r["score_base"])

    # Display final summary
    print("\n" + "=" * 80)
    print("FINAL RESULTS")
    print("=" * 80)
    print("\nAverage Scores:")
    print(f"  Fine-tuned Model: {avg_score_finetuned:.2f}/10")
    print(f"  Base Model: {avg_score_base:.2f}/10")
    print("\nWin/Loss/Tie:")
    print(f"  Fine-tuned Wins: {wins_finetuned}")
    print(f"  Base Wins: {wins_base}")
    print(f"  Ties: {ties}")

    # Determine overall winner
    if avg_score_finetuned > avg_score_base:
        print(
            f"\n🏆 WINNER: Fine-tuned Model (+{avg_score_finetuned - avg_score_base:.2f} points)"
        )
    elif avg_score_base > avg_score_finetuned:
        print(
            f"\n🏆 WINNER: Base Model (+{avg_score_base - avg_score_finetuned:.2f} points)"
        )
    else:
        print("\n🤝 RESULT: Tie")

    # Save results to JSON
    output_data = {
        "metadata": {
            "finetuned_checkpoint": TINKER_CHECKPOINT,
            "base_model": BASE_MODEL,
            "judge_model": JUDGE_MODEL,
            "total_questions": len(FINANCIAL_QUESTIONS),
        },
        "summary": {
            "avg_score_finetuned": avg_score_finetuned,
            "avg_score_base": avg_score_base,
            "wins_finetuned": wins_finetuned,
            "wins_base": wins_base,
            "ties": ties,
        },
        "detailed_results": results,
    }

    output_file = "comparison_results.json"
    with open(output_file, "w") as f:
        json.dump(output_data, f, indent=2)

    print(f"\n✅ Detailed results saved to: {output_file}")
    print("=" * 80 + "\n")


if __name__ == "__main__":
    main()
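
# To run (the filename is illustrative; use whatever you saved this script as):
#   python compare_model_responses.py
# Expects a populated .env file and the dependencies imported above to be installed.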