@deshwalmahesh
Last active September 14, 2025 06:26
Test of "hallbayes" on bechmark for paper: https://github.com/leochlon/hallbayes/blob/main/arxivPreprint.pdf
# Need OpenaI key. You can patch others too in the hallucination_toolkit.py for your own hosted ones
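# (Assumption, not stated in the original gist: the key is expected in the environment or a
#  .env file read by load_dotenv() below, e.g. OPENAI_API_KEY=sk-...; the exact variable
#  name depends on how OpenAIBackend is implemented.)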
import os
import json
from pathlib import Path
from datasets import load_dataset
from dotenv import load_dotenv
from scripts.hallucination_toolkit import (
    OpenAIBackend, OpenAIItem, OpenAIPlanner,
    generate_answer_if_allowed,
)

load_dotenv(override=True)

def load_sample_questions(n_limit=1000):
    """Load FinanceBench questions labelled PASS from HaluBench."""
    ds = load_dataset("PatronusAI/HaluBench")
    df = ds["test"].to_pandas()
    df = df[(df["source_ds"] == "FinanceBench") & (df["label"] == "PASS")]
    if n_limit:
        df = df.sample(n=n_limit, random_state=13)
    questions = []
    for _, row in df.iterrows():
        q = str(row["question"] or "").strip()
        psg = str(row["passage"] or "").strip()
        ans = str(row["answer"] or "").strip()
        lab = str(row["label"] or "").strip()
        if q and psg:
            questions.append({
                "question": q,
                "evidence": psg,
                "ground_truth_answer": ans,
                "label": lab,
            })
    return questions

def run_pipeline(questions, output_file="test_results.json"):
    """Run the open-book (question + evidence) test for each question."""
    # Initialize the OpenAI backend and planner
    backend = OpenAIBackend(model="gpt-4o-mini")
    planner = OpenAIPlanner(backend, temperature=0.3)
    results = []
    for i, q_data in enumerate(questions):
        print(f"Processing {i+1}/{len(questions)}: {q_data['question'][:50]}...")
        # Open book - question with evidence
        open_book_prompt = f"""Task: Answer based on the evidence.
Question: {q_data['question']}
Evidence: {q_data['evidence']}"""
        open_book_item = OpenAIItem(
            prompt=open_book_prompt,
            n_samples=7,
            m=6,
            fields_to_erase=["Evidence"],
            skeleton_policy="evidence_erase",
        )
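        # Roughly, per the hallbayes paper: "skeleton" prompts are formed by erasing the
        # Evidence field, and the information gap between the full prompt and its
        # skeletons drives the answer/refuse decision computed below.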
        # Score the open-book item
        items = [open_book_item]
        metrics = planner.run(
            items,
            h_star=0.05,
            isr_threshold=1.0,
            margin_extra_bits=0.2,
            B_clip=12.0,
            clip_mode="one-sided",
        )
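        # The metrics expose decision_answer (answer vs. refuse), delta_bar, the ISR and
        # the RoH bound; roughly, per the paper, an item is answered only when the ISR
        # clears isr_threshold and the risk bound stays within the target rate h_star.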
        # print(metrics)
        # Extract results
        result = {
            "index": i,
            "question": q_data["question"],
            "evidence": q_data["evidence"],
            "ground_truth_answer": q_data["ground_truth_answer"],
            "label": q_data["label"],
            "open_book": {
                "decision_answer": metrics[0].decision_answer,
                "delta_bar": float(metrics[0].delta_bar),
                "isr": float(metrics[0].isr),
                "roh_bound": float(metrics[0].roh_bound),
                "answer": None,
            },
        }
        # Generate actual answers if allowed
        if metrics[0].decision_answer:
            result["open_book"]["answer"] = generate_answer_if_allowed(backend, open_book_item, metrics[0])
        results.append(result)
        # Save after each iteration
        with open(output_file, "w") as f:
            json.dump(results, f, indent=2)
        print(f"  Open book: {'ANSWER' if metrics[0].decision_answer else 'REFUSE'}")
    print(f"\nCompleted! Results saved to {output_file}")
    return results

if __name__ == "__main__":
    questions = load_sample_questions(n_limit=100)
    print(f"Loaded {len(questions)} questions")
    results = run_pipeline(questions, "halubench_test_results.json")
    open_answer_rate = sum(1 for r in results if r["open_book"]["decision_answer"]) / len(results)
    print("\nSummary:")
    print(f"Open book answer rate: {open_answer_rate:.1%}")