Skip to content

Instantly share code, notes, and snippets.

@davidgilbertson
Last active November 25, 2025 03:26
Show Gist options
  • Select an option

  • Save davidgilbertson/f107102b69cd44209f8b146f7ea25248 to your computer and use it in GitHub Desktop.

Select an option

Save davidgilbertson/f107102b69cd44209f8b146f7ea25248 to your computer and use it in GitHub Desktop.
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
import numpy as np
from sklearn.model_selection import train_test_split
# Labelled prompts; the CSV is expected to have the columns 'Prompt' and 'Target'
df = pd.read_csv("data.csv")
# Load the model and prepare it for inference only: move it to the GPU, switch to eval mode
# and turn off gradient tracking on its parameters (each call modifies the model in place)
model = AutoModelForCausalLM.from_pretrained("google/gemma-3-1b-it")
model.cuda()
model.eval()
model.requires_grad_(False)
# Use the tokenizer that belongs to the same checkpoint as the model
tokenizer = AutoTokenizer.from_pretrained(model.name_or_path)
# Get the activations for the dataset
# Pad on the left so that index -1 along the sequence axis is every prompt's real last token
# rather than a pad token. Set explicitly instead of relying on the tokenizer's default.
tokenizer.padding_side = "left"
inputs = tokenizer(df.Prompt.tolist(), return_tensors="pt", padding=True).to("cuda")
# NOTE: the whole dataset is pushed through the model as one batch
hidden_states = model(**inputs, output_hidden_states=True).hidden_states
# Arrange hidden state as [sample, layers x hidden_dim], using only the last input token.
# Each entry of hidden_states contributes one [sample, hidden_dim] slice; they are stacked
# along a new axis 1 and then flattened into a single feature vector per sample.
activations = np.stack(
    # .float() upcasts half-precision outputs (e.g. bfloat16), which NumPy cannot represent
    [layer[:, -1, :].float().cpu().numpy() for layer in hidden_states],
    axis=1,
).reshape(len(df), -1)
# Save for later use
# np.save("activations.npy", activations)
# Hold out a test set (scikit-learn's default split sizes). Stratifying on Target keeps the
# class proportions the same in both parts; the fixed seed makes the split reproducible.
act_trn, act_tst, df_trn, df_tst = train_test_split(
    activations, df, stratify=df["Target"], random_state=42
)
# Standardize (z-score) the activations, based only on training set mean/std
trn_mean = act_trn.mean(axis=0)
trn_std = act_trn.std(axis=0)
# A feature that is constant across the training set has zero std; dividing by it would give
# NaN/inf and turn every distance computed from these vectors into NaN. Divide by 1 instead,
# so such features standardize to 0 on the training set.
trn_std[trn_std == 0] = 1
act_trn_z = (act_trn - trn_mean) / trn_std
act_tst_z = (act_tst - trn_mean) / trn_std
# Compute the two centroids from the training set
# Plain NumPy mask; assumes Target is a boolean column (it is negated with ~ below)
is_pos_trn = df_trn.Target.to_numpy()
pos_centroid = act_trn_z[is_pos_trn].mean(axis=0)
neg_centroid = act_trn_z[~is_pos_trn].mean(axis=0)
# Classify every test sample by whichever centroid is nearer (Euclidean distance)
dist_to_pos = np.linalg.norm(pos_centroid - act_tst_z, axis=1)
dist_to_neg = np.linalg.norm(neg_centroid - act_tst_z, axis=1)
# Work on an explicit copy: df_tst was sliced out of df by train_test_split, and adding
# columns to such a slice can trigger pandas' SettingWithCopyWarning
df_tst = df_tst.copy()
# Score is positive when the sample is closer to the positive centroid
df_tst["Score"] = dist_to_neg - dist_to_pos
df_tst["Pred"] = dist_to_pos < dist_to_neg
# The 4 lines below give exactly the same Pred as the distance comparison above, but much
# faster. Score keeps the same sign but not the same magnitude: it becomes half the difference
# of the *squared* distances, i.e. a projection onto the line joining the two centroids.
# midpoint = (pos_centroid + neg_centroid) / 2
# direction = pos_centroid - neg_centroid
# df_tst["Score"] = (act_tst_z - midpoint) @ direction
# df_tst["Pred"] = df_tst.Score > 0
# Fraction of test samples whose predicted label matches the true label
accuracy = df_tst.Target.eq(df_tst.Pred).mean()
print(f"Test set accuracy: {accuracy:.1%}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment