@deshwalmahesh
Last active September 14, 2025 06:26
Test of "hallbayes" on bechmark for paper: https://github.com/leochlon/hallbayes/blob/main/arxivPreprint.pdf
# Need OpenaI key. You can patch others too in the hallucination_toolkit.py for your own hosted ones
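# (Assumption, not stated in the original gist: the key is expected in the environment or a
#  .env file read by load_dotenv() below, e.g. OPENAI_API_KEY=sk-...; the exact variable
#  name depends on how OpenAIBackend is implemented.)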
import os
import json
from pathlib import Path
from datasets import load_dataset
from dotenv import load_dotenv
from scripts.hallucination_toolkit import (
    OpenAIBackend, OpenAIItem, OpenAIPlanner,
    generate_answer_if_allowed,
)

load_dotenv(override=True)

def load_sample_questions(n_limit=1000):
    """Load FinanceBench questions labelled PASS from HaluBench."""
    ds = load_dataset("PatronusAI/HaluBench")
    df = ds["test"].to_pandas()
    df = df[(df["source_ds"] == "FinanceBench") & (df["label"] == "PASS")]
    if n_limit:
        df = df.sample(n=n_limit, random_state=13)
    questions = []
    for _, row in df.iterrows():
        q = str(row["question"] or "").strip()
        psg = str(row["passage"] or "").strip()
        ans = str(row["answer"] or "").strip()
        lab = str(row["label"] or "").strip()
        if q and psg:
            questions.append({
                "question": q,
                "evidence": psg,
                "ground_truth_answer": ans,
                "label": lab,
            })
    return questions

def run_pipeline(questions, output_file="test_results.json"):
    """Run the open-book (question + evidence) test for each question."""
    # Initialize the OpenAI backend and planner
    backend = OpenAIBackend(model="gpt-4o-mini")
    planner = OpenAIPlanner(backend, temperature=0.3)
    results = []
    for i, q_data in enumerate(questions):
        print(f"Processing {i+1}/{len(questions)}: {q_data['question'][:50]}...")
        # Open book - question with evidence
        open_book_prompt = f"""Task: Answer based on the evidence.
Question: {q_data['question']}
Evidence: {q_data['evidence']}"""
        open_book_item = OpenAIItem(
            prompt=open_book_prompt,
            n_samples=7,
            m=6,
            fields_to_erase=["Evidence"],
            skeleton_policy="evidence_erase",
        )
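        # Roughly, per the hallbayes paper: "skeleton" prompts are formed by erasing the
        # Evidence field, and the information gap between the full prompt and its
        # skeletons drives the answer/refuse decision computed below.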
        # Score the open-book item
        items = [open_book_item]
        metrics = planner.run(
            items,
            h_star=0.05,
            isr_threshold=1.0,
            margin_extra_bits=0.2,
            B_clip=12.0,
            clip_mode="one-sided",
        )
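        # The metrics expose decision_answer (answer vs. refuse), delta_bar, the ISR and
        # the RoH bound; roughly, per the paper, an item is answered only when the ISR
        # clears isr_threshold and the risk bound stays within the target rate h_star.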
        # print(metrics)
        # Extract results
        result = {
            "index": i,
            "question": q_data["question"],
            "evidence": q_data["evidence"],
            "ground_truth_answer": q_data["ground_truth_answer"],
            "label": q_data["label"],
            "open_book": {
                "decision_answer": metrics[0].decision_answer,
                "delta_bar": float(metrics[0].delta_bar),
                "isr": float(metrics[0].isr),
                "roh_bound": float(metrics[0].roh_bound),
                "answer": None,
            },
        }
        # Generate actual answers if allowed
        if metrics[0].decision_answer:
            result["open_book"]["answer"] = generate_answer_if_allowed(backend, open_book_item, metrics[0])
        results.append(result)
        # Save after each iteration
        with open(output_file, "w") as f:
            json.dump(results, f, indent=2)
        print(f"  Open book: {'ANSWER' if metrics[0].decision_answer else 'REFUSE'}")
    print(f"\nCompleted! Results saved to {output_file}")
    return results

if __name__ == "__main__":
    questions = load_sample_questions(n_limit=100)
    print(f"Loaded {len(questions)} questions")
    results = run_pipeline(questions, "halubench_test_results.json")
    open_answer_rate = sum(1 for r in results if r["open_book"]["decision_answer"]) / len(results)
    print("\nSummary:")
    print(f"Open book answer rate: {open_answer_rate:.1%}")