davidgilbertson · November 25, 2025 03:26
diff --git a/llm_esp.py b/llm_esp.py
 import pandas as pd
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import numpy as np
 from sklearn.model_selection import train_test_split

 def zscore_from_train(train, test):
     """Standardize *train* and *test* with the training set's statistics.

     Args:
         train: 2-D array of shape [samples, features] used to fit mean/std.
         test: 2-D array with the same number of features as *train*.

     Returns:
         A ``(train_z, test_z)`` tuple of z-scored arrays.
     """
     mean = train.mean(axis=0)
     std = train.std(axis=0)
     # A feature that is constant over the training set has std == 0. Dividing
     # by it yields NaN/inf, which then poisons both centroids and every
     # distance (all comparisons become False). Such a feature carries no
     # signal, so scale it by 1 and let it standardize to exactly 0.
     std = np.where(std == 0, 1.0, std)
     return (train - mean) / std, (test - mean) / std


 def nearest_centroid_scores(train_z, train_is_pos, test_z):
     """Score test rows by their distance to the two class centroids.

     Args:
         train_z: Standardized training activations, [samples, features].
         train_is_pos: Per-row labels for *train_z*; truthy means positive.
             Coerced to a boolean mask so 0/1 integer labels are not
             mistaken for row indices.
         test_z: Standardized test activations, [samples, features].

     Returns:
         A ``(score, pred)`` tuple. ``score`` is ``dist_to_neg - dist_to_pos``
         (positive means closer to the positive centroid) and ``pred`` is the
         boolean ``dist_to_pos < dist_to_neg``.
     """
     is_pos = np.asarray(train_is_pos, dtype=bool)
     pos_centroid = train_z[is_pos].mean(axis=0)
     neg_centroid = train_z[~is_pos].mean(axis=0)

     dist_to_pos = np.linalg.norm(pos_centroid - test_z, axis=1)
     dist_to_neg = np.linalg.norm(neg_centroid - test_z, axis=1)

     # A faster route to the same predictions is to project onto the line
     # between the centroids:
     #   midpoint = (pos_centroid + neg_centroid) / 2
     #   direction = pos_centroid - neg_centroid
     #   pred = (test_z - midpoint) @ direction > 0
     # That projection has the same sign as the score below but a different
     # scale (it is half the difference of the *squared* distances).
     return dist_to_neg - dist_to_pos, dist_to_pos < dist_to_neg


 def main():
     """Extract last-token activations, fit two centroids, report accuracy."""
     df = pd.read_csv("data.csv")  # Columns are 'Prompt' and 'Target'

     model = AutoModelForCausalLM.from_pretrained("google/gemma-3-1b-it")
     model.cuda().eval().requires_grad_(False)
     tokenizer = AutoTokenizer.from_pretrained(model.name_or_path)
     # The last-token slice below is only the final prompt token when padding
     # is on the left, so enforce it rather than rely on the tokenizer default.
     tokenizer.padding_side = "left"

     # Get the activations for the dataset
     inputs = tokenizer(df.Prompt.tolist(), return_tensors="pt", padding=True).to("cuda")
     hidden_states = model(**inputs, output_hidden_states=True).hidden_states

     # Arrange hidden state as [sample, layers x hidden_dim], using only the
     # last input token of every layer.
     activations = np.stack(
         [layer[:, -1, :].cpu() for layer in hidden_states],
         axis=1,
     ).reshape(len(df), -1)

     # Save for later use
     # np.save("activations.npy", activations)

     # Split into train and test sets
     act_trn, act_tst, df_trn, df_tst = train_test_split(
         activations,
         df,
         stratify=df.Target,
         random_state=42,
     )
     # Work on an independent frame so the column assignments below cannot
     # trigger pandas' chained-assignment warning.
     df_tst = df_tst.copy()

     # Standardize (z-score) the activations, based only on training set mean/std
     act_trn_z, act_tst_z = zscore_from_train(act_trn, act_tst)

     # Assumes Target is boolean or 0/1 -- TODO confirm against data.csv
     score, pred = nearest_centroid_scores(act_trn_z, df_trn.Target, act_tst_z)
     df_tst["Score"] = score
     df_tst["Pred"] = pred

     accuracy = df_tst.Target.astype(bool).eq(df_tst.Pred).mean()

     print(f"Test set accuracy: {accuracy:.1%}")


 if __name__ == "__main__":
     main()
	import pandas as pd
	from transformers import AutoTokenizer, AutoModelForCausalLM
	import numpy as np
	from sklearn.model_selection import train_test_split

	# Load the labelled prompts. Columns are 'Prompt' and 'Target'.
	df = pd.read_csv("data.csv")

	# Inference only: move the model to the GPU, switch to eval mode and
	# disable gradient tracking on its parameters.
	model = AutoModelForCausalLM.from_pretrained("google/gemma-3-1b-it")
	model.cuda().eval().requires_grad_(False)
	tokenizer = AutoTokenizer.from_pretrained(model.name_or_path)
	# The [:, -1, :] slice below is the final prompt token only when padding
	# is on the left, so enforce it instead of relying on the default.
	tokenizer.padding_side = "left"

	# Get the activations for the dataset
	inputs = tokenizer(df.Prompt.tolist(), return_tensors="pt", padding=True).to("cuda")
	hidden_states = model(**inputs, output_hidden_states=True).hidden_states

	# Arrange hidden state as [sample, layers x hidden_dim], using only the
	# last input token of every layer.
	activations = np.stack(
	    [layer[:, -1, :].cpu() for layer in hidden_states],
	    axis=1,
	).reshape(len(df), -1)

	# Save for later use
	# np.save("activations.npy", activations)

	# Split into train and test sets
	act_trn, act_tst, df_trn, df_tst = train_test_split(
	    activations,
	    df,
	    stratify=df.Target,
	    random_state=42,
	)
	# Work on an independent frame so adding columns below cannot trigger
	# pandas' chained-assignment warning.
	df_tst = df_tst.copy()

	# Standardize (z-score) the activations, based only on training set mean/std
	trn_mean = act_trn.mean(axis=0)
	trn_std = act_trn.std(axis=0)
	# Features that are constant over the training set have std == 0; dividing
	# by that gives NaN/inf, which would propagate into both centroids and all
	# distances. Scale such features by 1 so they standardize to exactly 0.
	trn_std[trn_std == 0] = 1.0
	act_trn_z = (act_trn - trn_mean) / trn_std
	act_tst_z = (act_tst - trn_mean) / trn_std

	# Compute the two centroids from the training set. The labels are coerced
	# to a boolean NumPy mask so 0/1 integer labels are not treated as row
	# indices. Assumes Target is boolean or 0/1 -- TODO confirm against data.csv
	trn_is_pos = df_trn.Target.astype(bool).to_numpy()
	pos_centroid = act_trn_z[trn_is_pos].mean(axis=0)
	neg_centroid = act_trn_z[~trn_is_pos].mean(axis=0)

	# Positive Score means the sample is closer to the positive centroid.
	dist_to_pos = np.linalg.norm(pos_centroid - act_tst_z, axis=1)
	dist_to_neg = np.linalg.norm(neg_centroid - act_tst_z, axis=1)
	df_tst["Score"] = dist_to_neg - dist_to_pos
	df_tst["Pred"] = dist_to_pos < dist_to_neg

	# The below 4 lines give the same Pred as the distance comparison above, but
	# much faster. Score keeps its sign but changes scale (it becomes half the
	# difference of the squared distances).
	# midpoint = (pos_centroid + neg_centroid) / 2
	# direction = pos_centroid - neg_centroid
	# df_tst["Score"] = (act_tst_z - midpoint) @ direction
	# df_tst["Pred"] = df_tst.Score > 0

	accuracy = df_tst.Target.astype(bool).eq(df_tst.Pred).mean()

	print(f"Test set accuracy: {accuracy:.1%}")
No results found