Skip to content

Instantly share code, notes, and snippets.

@juvi21
Created June 3, 2024 20:54
Show Gist options
  • Select an option

  • Save juvi21/6b3a059686e189b293b2e047ec63e3b8 to your computer and use it in GitHub Desktop.

Select an option

Save juvi21/6b3a059686e189b293b2e047ec63e3b8 to your computer and use it in GitHub Desktop.
from transformers import AutoModelForCausalLM, AutoTokenizer
from deepeval.models.base_model import DeepEvalBaseLLM
from deepeval.benchmarks import MMLU, GSM8K
import pandas as pd
# Define custom model class
class Hermes2ThetaLlama3_8B(DeepEvalBaseLLM):
    """DeepEval adapter around a Hugging Face causal LM and its tokenizer.

    The wrapped model and tokenizer are supplied by the caller; this class
    only tokenizes prompts, calls ``model.generate`` and decodes the result.

    Args:
        model: Causal language model exposing ``generate``, ``to`` and
            ``device`` (e.g. the object returned by
            ``AutoModelForCausalLM.from_pretrained``).
        tokenizer: Matching tokenizer. NOTE: its ``pad_token`` (when unset)
            and ``padding_side`` are adjusted on first use so that prompts of
            different lengths can be batched.
        max_new_tokens: Upper bound on the tokens generated per prompt.
        do_sample: Whether to sample; ``False`` selects greedy decoding,
            which makes benchmark runs reproducible.
    """

    def __init__(self, model, tokenizer, max_new_tokens: int = 100, do_sample: bool = True):
        # The base-class __init__ is intentionally not called, matching the
        # original implementation; the model is already loaded by the caller.
        self.model = model
        self.tokenizer = tokenizer
        self.max_new_tokens = max_new_tokens
        self.do_sample = do_sample

    def load_model(self):
        """Return the already-loaded model passed to the constructor."""
        return self.model

    def _generate_texts(self, prompts: list) -> list:
        """Generate one completion per prompt and return only the new text."""
        if not prompts:
            return []

        # Imported here so the block does not depend on a file-level import.
        import torch

        model = self.load_model()
        # Use the GPU when one exists, but do not fail on CPU-only hosts and
        # do not try to relocate a model that is already off the CPU.
        if torch.cuda.is_available() and model.device.type == "cpu":
            model.to("cuda")
        device = model.device

        tokenizer = self.tokenizer
        # Padding is required to stack prompts of different lengths into one
        # tensor; fall back to EOS when the tokenizer defines no pad token.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        # Left padding keeps every prompt flush against the generated tokens,
        # so the prompt can be removed below with a single column slice.
        tokenizer.padding_side = "left"

        model_inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(device)
        prompt_length = model_inputs["input_ids"].shape[1]

        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=self.max_new_tokens,
            do_sample=self.do_sample,
            pad_token_id=tokenizer.pad_token_id,
        )

        # The generated sequences start with the (padded) prompt; keep only
        # the continuation and drop special tokens such as EOS/padding.
        completion_ids = generated_ids[:, prompt_length:]
        texts = tokenizer.batch_decode(completion_ids, skip_special_tokens=True)
        return [text.strip() for text in texts]

    def generate(self, prompt: str) -> str:
        """Return the model's completion for a single prompt."""
        return self._generate_texts([prompt])[0]

    async def a_generate(self, prompt: str) -> str:
        """Async entry point; delegates to the synchronous ``generate``."""
        return self.generate(prompt)

    def batch_generate(self, prompts: list) -> list:
        """Return one completion per prompt, in the same order as ``prompts``."""
        return self._generate_texts(prompts)

    def get_model_name(self):
        """Return the display name used to identify this model."""
        return "Hermes-2-Theta-Llama-3-8B"
# Load model and tokenizer
model_name = "NousResearch/Hermes-2-Theta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Create custom model instance
hermes_2_theta_llama = Hermes2ThetaLlama3_8B(model=model, tokenizer=tokenizer)

# Benchmark on MMLU
mmlu_benchmark = MMLU()  # 0-shot evaluation
mmlu_results = mmlu_benchmark.evaluate(model=hermes_2_theta_llama)
# NOTE(review): the score is read from the benchmark object rather than from
# the value returned by evaluate(); DeepEval documents `overall_score` on the
# benchmark, and the return type of evaluate() differs between releases.
print("MMLU Overall Score:", mmlu_benchmark.overall_score)

# Benchmark on GSM8K
gsm8k_benchmark = GSM8K(n_problems=1319, n_shots=0, enable_cot=False)  # 0-shot evaluation
gsm8k_results = gsm8k_benchmark.evaluate(model=hermes_2_theta_llama)
print("GSM8K Overall Score:", gsm8k_benchmark.overall_score)

# NOTE(review): GSM8K appears to have no sub-tasks and may not expose
# `task_scores`; read it defensively so a missing attribute does not discard
# results after a long evaluation run. Confirm against the installed version.
gsm8k_task_scores = getattr(gsm8k_benchmark, "task_scores", None)

# Save results to files
mmlu_task_scores_df = pd.DataFrame(mmlu_benchmark.task_scores)
mmlu_predictions_df = pd.DataFrame(mmlu_benchmark.predictions)
gsm8k_predictions_df = pd.DataFrame(gsm8k_benchmark.predictions)
mmlu_task_scores_df.to_csv("mmlu_task_scores.csv", index=False)
mmlu_predictions_df.to_csv("mmlu_predictions.csv", index=False)
gsm8k_predictions_df.to_csv("gsm8k_predictions.csv", index=False)
if gsm8k_task_scores is not None:
    gsm8k_task_scores_df = pd.DataFrame(gsm8k_task_scores)
    gsm8k_task_scores_df.to_csv("gsm8k_task_scores.csv", index=False)

# Print detailed scores
print("MMLU Task-specific Scores: ", mmlu_benchmark.task_scores)
print("GSM8K Task-specific Scores: ", gsm8k_task_scores)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment