microgpt by karpathy but in pure C
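
A single-file C port of Karpathy's microgpt: a scalar autograd engine plus a one-block, character-level GPT, trained on the makemore names dataset and then sampled from. Two files follow: a Makefile and microgpt.c.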
Makefile:

CC = gcc
CFLAGS = -O3 -Wall -Wextra
LDFLAGS = -lm

all: microgpt

microgpt: microgpt.c
	$(CC) $(CFLAGS) microgpt.c -o microgpt $(LDFLAGS)

clean:
	rm -f microgpt input.txt
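
Build with "make" and run "./microgpt"; the program downloads input.txt itself on first run (see load_dataset below).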
microgpt.c:

/*
  Original in Python: https://gist.github.com/karpathy/8627fe009c40f57531cb18360106ce95
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif
// --- Configuration ---
#define N_EMBD 16
#define N_HEAD 4
#define N_LAYER 1
#define BLOCK_SIZE 16
#define HEAD_DIM (N_EMBD / N_HEAD)
#define MAX_NODES 2000000 // Autograd arena size
#define MAX_PTRS 2000000 // Pointer arena size
#define MAX_DOCS 50000 // Max lines in dataset
#define MAX_DOC_LEN 256
#define MAX_VOCAB 256
// --- Global Dataset & Tokenizer ---
char* docs[MAX_DOCS];
int num_docs = 0;
char uchars[MAX_VOCAB];
int vocab_size = 0;
// --- Autograd Engine ---
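// Reverse-mode autodiff on scalars: every arithmetic op allocates a new
// node that records its operand(s) and the precomputed local derivative
// d(out)/d(operand). backward() then applies the chain rule in a single
// reverse sweep, accumulating each node's contribution into grad.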
typedef struct Value {
    float data;
    float grad;
    struct Value* children[2];
    float local_grads[2];
    int num_children;
} Value;
// Memory Arenas
Value arena[MAX_NODES];
int arena_ptr = 0;
int num_params = 0;
Value* ptr_arena[MAX_PTRS];
int ptr_arena_ptr = 0;
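// Bump allocators instead of malloc/free. Parameters are allocated first
// and permanently occupy slots [0, num_params); everything built during a
// forward pass lives above that and is "freed" each training step simply
// by resetting arena_ptr / ptr_arena_ptr back to their post-init values.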
Value* new_value(float data) {
    if (arena_ptr >= MAX_NODES) { printf("Arena OOM!\n"); exit(1); }
    Value* v = &arena[arena_ptr++];
    v->data = data;
    v->grad = 0.0f;
    v->num_children = 0;
    return v;
}
Value** alloc_ptrs(int size) {
    if (ptr_arena_ptr + size > MAX_PTRS) { printf("Ptr Arena OOM!\n"); exit(1); }
    Value** p = &ptr_arena[ptr_arena_ptr];
    ptr_arena_ptr += size;
    return p;
}
Value* val_add(Value* a, Value* b) {
    Value* v = new_value(a->data + b->data);
    v->num_children = 2;
    v->children[0] = a; v->local_grads[0] = 1.0f;
    v->children[1] = b; v->local_grads[1] = 1.0f;
    return v;
}
Value* val_add_const(Value* a, float b) {
    Value* v = new_value(a->data + b);
    v->num_children = 1;
    v->children[0] = a; v->local_grads[0] = 1.0f;
    return v;
}
Value* val_mul(Value* a, Value* b) {
    Value* v = new_value(a->data * b->data);
    v->num_children = 2;
    v->children[0] = a; v->local_grads[0] = b->data;
    v->children[1] = b; v->local_grads[1] = a->data;
    return v;
}
Value* val_mul_const(Value* a, float b) {
    Value* v = new_value(a->data * b);
    v->num_children = 1;
    v->children[0] = a; v->local_grads[0] = b;
    return v;
}
Value* val_pow_const(Value* a, float b) {
    Value* v = new_value(powf(a->data, b));
    v->num_children = 1;
    v->children[0] = a; v->local_grads[0] = b * powf(a->data, b - 1.0f);
    return v;
}
Value* val_exp(Value* a) {
    Value* v = new_value(expf(a->data));
    v->num_children = 1;
    v->children[0] = a; v->local_grads[0] = expf(a->data);
    return v;
}
Value* val_log(Value* a) {
    Value* v = new_value(logf(a->data));
    v->num_children = 1;
    v->children[0] = a; v->local_grads[0] = 1.0f / a->data;
    return v;
}
Value* val_relu(Value* a) {
    Value* v = new_value(a->data > 0 ? a->data : 0.0f);
    v->num_children = 1;
    v->children[0] = a; v->local_grads[0] = a->data > 0 ? 1.0f : 0.0f;
    return v;
}
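// Because every op allocates its output after its operands, ascending
// arena order is a topological order of the graph. Walking it backwards
// therefore visits each node only after all of its consumers, which is
// exactly what the chain rule needs. The sweep stops at num_params since
// parameters are leaves with no children.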
void backward(Value* loss) {
    loss->grad = 1.0f;
    for (int i = arena_ptr - 1; i >= num_params; i--) {
        Value* v = &arena[i];
        for (int c = 0; c < v->num_children; c++) {
            v->children[c]->grad += v->local_grads[c] * v->grad;
        }
    }
}
// --- Parameter Initialization ---
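// Box-Muller transform: for u1, u2 ~ U(0,1),
//   z0 = sqrt(-2 ln u1) * cos(2 pi u2)
// is a standard normal sample; the 1e-8 guards against log(0).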
float rand_gauss(float mean, float std) {
    float u1 = (float)rand() / RAND_MAX;
    float u2 = (float)rand() / RAND_MAX;
    float z0 = sqrtf(-2.0f * logf(u1 + 1e-8f)) * cosf(2.0f * (float)M_PI * u2);
    return z0 * std + mean;
}
Value** init_matrix(int nout, int nin, float std) {
    Value** mat = alloc_ptrs(nout * nin);
    for (int i = 0; i < nout * nin; i++) {
        mat[i] = new_value(rand_gauss(0.0f, std));
        num_params++;
    }
    return mat;
}
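// The full parameter set: token/position embeddings, one attention block
// (Q/K/V/output projections), a 4x-expanded ReLU MLP, and the LM head.
// N_LAYER is 1, and the single block is inlined in gpt() rather than looped.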
struct {
    Value** wte; Value** wpe; Value** lm_head;
    Value** attn_wq; Value** attn_wk; Value** attn_wv; Value** attn_wo;
    Value** mlp_fc1; Value** mlp_fc2;
} model;
// --- Neural Network Operations ---
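// out = W x as an explicit graph: each output element is a dot product of
// one weight row with the input, so a single call adds O(nout * nin) nodes.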
Value** linear(Value** x, Value** w, int nout, int nin) {
    Value** out = alloc_ptrs(nout);
    for (int o = 0; o < nout; o++) {
        Value* sum = new_value(0.0f);
        for (int i = 0; i < nin; i++) {
            sum = val_add(sum, val_mul(w[o * nin + i], x[i]));
        }
        out[o] = sum;
    }
    return out;
}
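// Numerically stable softmax: subtracting the (constant) max before exp
// prevents overflow without changing the result, and the division is
// expressed as multiplication by total^-1 so the existing pow-rule
// backward covers the quotient's gradient.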
Value** softmax(Value** logits, int len) {
    float max_val = -1e9;
    for (int i = 0; i < len; i++) if (logits[i]->data > max_val) max_val = logits[i]->data;
    Value** exps = alloc_ptrs(len);
    Value* total = new_value(0.0f);
    for (int i = 0; i < len; i++) {
        exps[i] = val_exp(val_add_const(logits[i], -max_val));
        total = val_add(total, exps[i]);
    }
    Value** out = alloc_ptrs(len);
    Value* total_inv = val_pow_const(total, -1.0f);
    for (int i = 0; i < len; i++) {
        out[i] = val_mul(exps[i], total_inv);
    }
    return out;
}
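// RMSNorm: out_i = x_i / sqrt(mean(x^2) + eps). Unlike the usual
// formulation there is no learned gain here; the eps keeps the inverse
// square root finite for an all-zero input.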
Value** rmsnorm(Value** x, int len) {
    Value* ms = new_value(0.0f);
    for (int i = 0; i < len; i++) ms = val_add(ms, val_mul(x[i], x[i]));
    ms = val_mul_const(ms, 1.0f / len);
    Value* scale = val_pow_const(val_add_const(ms, 1e-5f), -0.5f);
    Value** out = alloc_ptrs(len);
    for (int i = 0; i < len; i++) out[i] = val_mul(x[i], scale);
    return out;
}
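// Forward pass for a single token. The caller-owned keys/values arrays act
// as a KV cache: position pos_id appends its K/V and attends over entries
// 0..pos_id, so causal masking falls out of the incremental decoding loop
// for free. Structure: embeddings -> norm -> [pre-norm attention + residual]
// -> [pre-norm MLP + residual] -> LM head.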
Value** gpt(int token_id, int pos_id, Value*** keys, Value*** values, int cache_len) {
    Value** tok_emb = &model.wte[token_id * N_EMBD];
    Value** pos_emb = &model.wpe[pos_id * N_EMBD];
    Value** x = alloc_ptrs(N_EMBD);
    for (int i = 0; i < N_EMBD; i++) x[i] = val_add(tok_emb[i], pos_emb[i]);
    x = rmsnorm(x, N_EMBD);
    Value** x_residual = x;
    x = rmsnorm(x, N_EMBD);
    Value** q = linear(x, model.attn_wq, N_EMBD, N_EMBD);
    Value** k = linear(x, model.attn_wk, N_EMBD, N_EMBD);
    Value** v = linear(x, model.attn_wv, N_EMBD, N_EMBD);
    keys[cache_len] = k;
    values[cache_len] = v;
    int curr_len = cache_len + 1;
    Value** x_attn = alloc_ptrs(N_EMBD);
    for (int h = 0; h < N_HEAD; h++) {
        int hs = h * HEAD_DIM;
        Value** q_h = &q[hs];
        Value** attn_logits = alloc_ptrs(curr_len);
        for (int t = 0; t < curr_len; t++) {
            Value* score = new_value(0.0f);
            for (int j = 0; j < HEAD_DIM; j++) {
                score = val_add(score, val_mul(q_h[j], keys[t][hs + j]));
            }
            attn_logits[t] = val_mul_const(score, 1.0f / sqrtf((float)HEAD_DIM));
        }
        Value** attn_weights = softmax(attn_logits, curr_len);
        for (int j = 0; j < HEAD_DIM; j++) {
            Value* head_out = new_value(0.0f);
            for (int t = 0; t < curr_len; t++) {
                head_out = val_add(head_out, val_mul(attn_weights[t], values[t][hs + j]));
            }
            x_attn[hs + j] = head_out;
        }
    }
    x = linear(x_attn, model.attn_wo, N_EMBD, N_EMBD);
    Value** attn_out = alloc_ptrs(N_EMBD);
    for (int i = 0; i < N_EMBD; i++) attn_out[i] = val_add(x[i], x_residual[i]);
    x = attn_out;
    x_residual = x;
    x = rmsnorm(x, N_EMBD);
    x = linear(x, model.mlp_fc1, 4 * N_EMBD, N_EMBD);
    Value** x_relu = alloc_ptrs(4 * N_EMBD);
    for (int i = 0; i < 4 * N_EMBD; i++) x_relu[i] = val_relu(x[i]);
    x = linear(x_relu, model.mlp_fc2, N_EMBD, 4 * N_EMBD);
    Value** mlp_out = alloc_ptrs(N_EMBD);
    for (int i = 0; i < N_EMBD; i++) mlp_out[i] = val_add(x[i], x_residual[i]);
    Value** logits = linear(mlp_out, model.lm_head, vocab_size, N_EMBD);
    return logits;
}
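// Fetches the makemore names dataset (one name per line), builds the list
// of documents, and derives a character vocabulary from the bytes seen.
// The final vocab_size++ reserves one extra id used as BOS/EOS.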
void load_dataset() {
    int ret = system("wget -qO input.txt https://raw.githubusercontent.com/karpathy/makemore/refs/heads/master/names.txt || curl -so input.txt https://raw.githubusercontent.com/karpathy/makemore/refs/heads/master/names.txt");
    (void)ret;
    FILE* f = fopen("input.txt", "r");
    if (!f) { printf("Failed to open input.txt\n"); exit(1); }
    char line[MAX_DOC_LEN];
    int char_counts[256] = {0};
    while (fgets(line, sizeof(line), f) && num_docs < MAX_DOCS) {
        int len = strlen(line);
        if (len > 0 && line[len-1] == '\n') line[--len] = '\0';
        if (len == 0) continue;
        docs[num_docs++] = strdup(line);
        for (int i = 0; i < len; i++) char_counts[(unsigned char)line[i]] = 1;
    }
    fclose(f);
    for (int i = 0; i < 256; i++) {
        if (char_counts[i]) uchars[vocab_size++] = i;
    }
    vocab_size++;
    printf("num docs: %d\nvocab size: %d\n", num_docs, vocab_size);
}
int char_to_id(char c) {
    for (int i = 0; i < vocab_size - 1; i++) if (uchars[i] == c) return i;
    return 0;
}
int main() {
    srand(42);
    load_dataset();
    int BOS = vocab_size - 1;
    model.wte = init_matrix(vocab_size, N_EMBD, 0.08f);
    model.wpe = init_matrix(BLOCK_SIZE, N_EMBD, 0.08f);
    model.lm_head = init_matrix(vocab_size, N_EMBD, 0.08f);
    model.attn_wq = init_matrix(N_EMBD, N_EMBD, 0.08f);
    model.attn_wk = init_matrix(N_EMBD, N_EMBD, 0.08f);
    model.attn_wv = init_matrix(N_EMBD, N_EMBD, 0.08f);
    model.attn_wo = init_matrix(N_EMBD, N_EMBD, 0.08f);
    model.mlp_fc1 = init_matrix(4 * N_EMBD, N_EMBD, 0.08f);
    model.mlp_fc2 = init_matrix(N_EMBD, 4 * N_EMBD, 0.08f);
    printf("num params: %d\n", num_params);
    float* m = calloc(num_params, sizeof(float));
    float* v = calloc(num_params, sizeof(float));
    float lr = 0.01f, beta1 = 0.85f, beta2 = 0.99f, eps_adam = 1e-8f;
    int num_steps = 1000;
    int base_arena_ptr = arena_ptr;
    int base_ptr_arena_ptr = ptr_arena_ptr;
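    // Training: one document per step, framed as BOS + chars + BOS and
    // truncated to BLOCK_SIZE. The loss is the mean cross-entropy
    // -log p(target) over all positions; the whole graph is rebuilt each
    // step and discarded by resetting the arenas at the bottom of the loop.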
    for (int step = 0; step < num_steps; step++) {
        char* doc = docs[step % num_docs];
        int len = strlen(doc);
        int tokens[MAX_DOC_LEN + 2];
        tokens[0] = BOS;
        for (int i = 0; i < len; i++) tokens[i+1] = char_to_id(doc[i]);
        tokens[len+1] = BOS;
        int n = (BLOCK_SIZE < len + 1) ? BLOCK_SIZE : len + 1;
        Value** keys[BLOCK_SIZE];
        Value** values[BLOCK_SIZE];
        Value** losses = alloc_ptrs(n);
        for (int pos_id = 0; pos_id < n; pos_id++) {
            int token_id = tokens[pos_id];
            int target_id = tokens[pos_id + 1];
            Value** logits = gpt(token_id, pos_id, keys, values, pos_id);
            Value** probs = softmax(logits, vocab_size);
            losses[pos_id] = val_mul_const(val_log(probs[target_id]), -1.0f);
        }
        Value* total_loss = new_value(0.0f);
        for (int i = 0; i < n; i++) total_loss = val_add(total_loss, losses[i]);
        Value* loss = val_mul_const(total_loss, 1.0f / n);
        backward(loss);
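        // Adam with bias correction and linear LR decay:
        //   m_t = b1*m + (1-b1)*g,  v_t = b2*v + (1-b2)*g^2
        //   m_hat = m_t / (1-b1^t), v_hat = v_t / (1-b2^t)
        //   p -= lr_t * m_hat / (sqrt(v_hat) + eps)
        // Only the first num_params arena slots are parameters, so the
        // optimizer walks exactly those and zeroes their grads for the
        // next step.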
        float lr_t = lr * (1.0f - (float)step / num_steps);
        for (int i = 0; i < num_params; i++) {
            Value* p = &arena[i];
            m[i] = beta1 * m[i] + (1.0f - beta1) * p->grad;
            v[i] = beta2 * v[i] + (1.0f - beta2) * (p->grad * p->grad);
            float m_hat = m[i] / (1.0f - powf(beta1, step + 1));
            float v_hat = v[i] / (1.0f - powf(beta2, step + 1));
            p->data -= lr_t * m_hat / (sqrtf(v_hat) + eps_adam);
            p->grad = 0.0f;
        }
        printf("step %4d / %4d | loss %.4f\n", step + 1, num_steps, loss->data);
        arena_ptr = base_arena_ptr;
        ptr_arena_ptr = base_ptr_arena_ptr;
    }
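    // Inference: sample autoregressively with temperature 0.5. Dividing
    // logits by the temperature before softmax sharpens the distribution
    // (T < 1 favors likely characters). Sampling walks the CDF of the
    // softmax output; generation stops at BOS or after BLOCK_SIZE steps.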
    float temperature = 0.5f;
    printf("\n--- inference (new, hallucinated names) ---\n");
    for (int sample_idx = 0; sample_idx < 20; sample_idx++) {
        Value** keys[BLOCK_SIZE];
        Value** values[BLOCK_SIZE];
        int token_id = BOS;
        char sample[BLOCK_SIZE + 1];
        int sample_len = 0;
        for (int pos_id = 0; pos_id < BLOCK_SIZE; pos_id++) {
            Value** logits = gpt(token_id, pos_id, keys, values, pos_id);
            Value** scaled_logits = alloc_ptrs(vocab_size);
            for (int i = 0; i < vocab_size; i++) scaled_logits[i] = val_mul_const(logits[i], 1.0f / temperature);
            Value** probs = softmax(scaled_logits, vocab_size);
            float r = (float)rand() / RAND_MAX;
            float cdf = 0.0f;
            for (int i = 0; i < vocab_size; i++) {
                cdf += probs[i]->data;
                if (r <= cdf || i == vocab_size - 1) { token_id = i; break; }
            }
            if (token_id == BOS) break;
            sample[sample_len++] = uchars[token_id];
        }
        sample[sample_len] = '\0';
        printf("sample %2d: %s\n", sample_idx + 1, sample);
        ptr_arena_ptr = base_ptr_arena_ptr;
        arena_ptr = base_arena_ptr;
    }
    return 0;
}
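
As a sanity check of the autograd engine, here is a minimal sketch of a test harness (mine, not part of the gist): it assumes you temporarily swap it in for main() in microgpt.c, builds f = a*b + relu(a), and compares the backpropagated gradients against the analytic values df/da = b + 1 and df/db = a (for a > 0).

/* Hypothetical sanity check, not part of the original gist: replace main()
   in microgpt.c with this to exercise the engine end to end. */
int main() {
    Value* a = new_value(2.0f);
    Value* b = new_value(3.0f);
    num_params = arena_ptr;  // mirror training: a and b occupy the parameter slots
    Value* f = val_add(val_mul(a, b), val_relu(a));  // f = a*b + relu(a)
    backward(f);
    printf("df/da = %.1f (expect b + 1 = 4.0)\n", a->grad);
    printf("df/db = %.1f (expect a = 2.0)\n", b->grad);
    return 0;
}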