microgpt by karpathy but in pure C
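
A single-file C port of Karpathy's microgpt: a scalar autograd engine plus a one-block, character-level GPT, trained on the makemore names dataset and then sampled from. Two files follow: a Makefile and microgpt.c.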
Makefile:

CC = gcc
CFLAGS = -O3 -Wall -Wextra
LDFLAGS = -lm

all: microgpt

microgpt: microgpt.c
	$(CC) $(CFLAGS) microgpt.c -o microgpt $(LDFLAGS)

clean:
	rm -f microgpt input.txt
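
Build with "make" and run "./microgpt"; the program downloads input.txt itself on first run (see load_dataset below).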
microgpt.c:

/*
  Original in Python: https://gist.github.com/karpathy/8627fe009c40f57531cb18360106ce95
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif
// --- Configuration ---
#define N_EMBD 16
#define N_HEAD 4
#define N_LAYER 1
#define BLOCK_SIZE 16
#define HEAD_DIM (N_EMBD / N_HEAD)
#define MAX_NODES 2000000 // Autograd arena size
#define MAX_PTRS 2000000 // Pointer arena size
#define MAX_DOCS 50000 // Max lines in dataset
#define MAX_DOC_LEN 256
#define MAX_VOCAB 256
// --- Global Dataset & Tokenizer ---
char* docs[MAX_DOCS];
int num_docs = 0;
char uchars[MAX_VOCAB];
int vocab_size = 0;
// --- Autograd Engine ---
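// Reverse-mode autodiff on scalars: every arithmetic op allocates a new
// node that records its operand(s) and the precomputed local derivative
// d(out)/d(operand). backward() then applies the chain rule in a single
// reverse sweep, accumulating each node's contribution into grad.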
typedef struct Value {
    float data;
    float grad;
    struct Value* children[2];
    float local_grads[2];
    int num_children;
} Value;
// Memory Arenas
Value arena[MAX_NODES];
int arena_ptr = 0;
int num_params = 0;
Value* ptr_arena[MAX_PTRS];
int ptr_arena_ptr = 0;
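// Bump allocators instead of malloc/free. Parameters are allocated first
// and permanently occupy slots [0, num_params); everything built during a
// forward pass lives above that and is "freed" each training step simply
// by resetting arena_ptr / ptr_arena_ptr back to their post-init values.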
Value* new_value(float data) {
    if (arena_ptr >= MAX_NODES) { printf("Arena OOM!\n"); exit(1); }
    Value* v = &arena[arena_ptr++];
    v->data = data;
    v->grad = 0.0f;
    v->num_children = 0;
    return v;
}
Value** alloc_ptrs(int size) {
    if (ptr_arena_ptr + size > MAX_PTRS) { printf("Ptr Arena OOM!\n"); exit(1); }
    Value** p = &ptr_arena[ptr_arena_ptr];
    ptr_arena_ptr += size;
    return p;
}
Value* val_add(Value* a, Value* b) {
    Value* v = new_value(a->data + b->data);
    v->num_children = 2;
    v->children[0] = a; v->local_grads[0] = 1.0f;
    v->children[1] = b; v->local_grads[1] = 1.0f;
    return v;
}
Value* val_add_const(Value* a, float b) {
    Value* v = new_value(a->data + b);
    v->num_children = 1;
    v->children[0] = a; v->local_grads[0] = 1.0f;
    return v;
}
Value* val_mul(Value* a, Value* b) {
    Value* v = new_value(a->data * b->data);
    v->num_children = 2;
    v->children[0] = a; v->local_grads[0] = b->data;
    v->children[1] = b; v->local_grads[1] = a->data;
    return v;
}
Value* val_mul_const(Value* a, float b) {
    Value* v = new_value(a->data * b);
    v->num_children = 1;
    v->children[0] = a; v->local_grads[0] = b;
    return v;
}
Value* val_pow_const(Value* a, float b) {
    Value* v = new_value(powf(a->data, b));
    v->num_children = 1;
    v->children[0] = a; v->local_grads[0] = b * powf(a->data, b - 1.0f);
    return v;
}
Value* val_exp(Value* a) {
    Value* v = new_value(expf(a->data));
    v->num_children = 1;
    v->children[0] = a; v->local_grads[0] = expf(a->data);
    return v;
}
Value* val_log(Value* a) {
    Value* v = new_value(logf(a->data));
    v->num_children = 1;
    v->children[0] = a; v->local_grads[0] = 1.0f / a->data;
    return v;
}
Value* val_relu(Value* a) {
    Value* v = new_value(a->data > 0 ? a->data : 0.0f);
    v->num_children = 1;
    v->children[0] = a; v->local_grads[0] = a->data > 0 ? 1.0f : 0.0f;
    return v;
}
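// Because every op allocates its output after its operands, ascending
// arena order is a topological order of the graph. Walking it backwards
// therefore visits each node only after all of its consumers, which is
// exactly what the chain rule needs. The sweep stops at num_params since
// parameters are leaves with no children.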
void backward(Value* loss) {
    loss->grad = 1.0f;
    for (int i = arena_ptr - 1; i >= num_params; i--) {
        Value* v = &arena[i];
        for (int c = 0; c < v->num_children; c++) {
            v->children[c]->grad += v->local_grads[c] * v->grad;
        }
    }
}
// --- Parameter Initialization ---
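// Box-Muller transform: for u1, u2 ~ U(0,1),
//   z0 = sqrt(-2 ln u1) * cos(2 pi u2)
// is a standard normal sample; the 1e-8 guards against log(0).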
float rand_gauss(float mean, float std) {
    float u1 = (float)rand() / RAND_MAX;
    float u2 = (float)rand() / RAND_MAX;
    float z0 = sqrtf(-2.0f * logf(u1 + 1e-8f)) * cosf(2.0f * (float)M_PI * u2);
    return z0 * std + mean;
}
Value** init_matrix(int nout, int nin, float std) {
    Value** mat = alloc_ptrs(nout * nin);
    for (int i = 0; i < nout * nin; i++) {
        mat[i] = new_value(rand_gauss(0.0f, std));
        num_params++;
    }
    return mat;
}
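// The full parameter set: token/position embeddings, one attention block
// (Q/K/V/output projections), a 4x-expanded ReLU MLP, and the LM head.
// N_LAYER is 1, and the single block is inlined in gpt() rather than looped.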
struct {
    Value** wte; Value** wpe; Value** lm_head;
    Value** attn_wq; Value** attn_wk; Value** attn_wv; Value** attn_wo;
    Value** mlp_fc1; Value** mlp_fc2;
} model;
// --- Neural Network Operations ---
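// out = W x as an explicit graph: each output element is a dot product of
// one weight row with the input, so a single call adds O(nout * nin) nodes.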
Value** linear(Value** x, Value** w, int nout, int nin) {
    Value** out = alloc_ptrs(nout);
    for (int o = 0; o < nout; o++) {
        Value* sum = new_value(0.0f);
        for (int i = 0; i < nin; i++) {
            sum = val_add(sum, val_mul(w[o * nin + i], x[i]));
        }
        out[o] = sum;
    }
    return out;
}
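// Numerically stable softmax: subtracting the (constant) max before exp
// prevents overflow without changing the result, and the division is
// expressed as multiplication by total^-1 so the existing pow-rule
// backward covers the quotient's gradient.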
Value** softmax(Value** logits, int len) {
    float max_val = -1e9;
    for (int i = 0; i < len; i++) if (logits[i]->data > max_val) max_val = logits[i]->data;
    Value** exps = alloc_ptrs(len);
    Value* total = new_value(0.0f);
    for (int i = 0; i < len; i++) {
        exps[i] = val_exp(val_add_const(logits[i], -max_val));
        total = val_add(total, exps[i]);
    }
    Value** out = alloc_ptrs(len);
    Value* total_inv = val_pow_const(total, -1.0f);
    for (int i = 0; i < len; i++) {
        out[i] = val_mul(exps[i], total_inv);
    }
    return out;
}
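// RMSNorm: out_i = x_i / sqrt(mean(x^2) + eps). Unlike the usual
// formulation there is no learned gain here; the eps keeps the inverse
// square root finite for an all-zero input.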
Value** rmsnorm(Value** x, int len) {
    Value* ms = new_value(0.0f);
    for (int i = 0; i < len; i++) ms = val_add(ms, val_mul(x[i], x[i]));
    ms = val_mul_const(ms, 1.0f / len);
    Value* scale = val_pow_const(val_add_const(ms, 1e-5f), -0.5f);
    Value** out = alloc_ptrs(len);
    for (int i = 0; i < len; i++) out[i] = val_mul(x[i], scale);
    return out;
}
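// Forward pass for a single token. The caller-owned keys/values arrays act
// as a KV cache: position pos_id appends its K/V and attends over entries
// 0..pos_id, so causal masking falls out of the incremental decoding loop
// for free. Structure: embeddings -> norm -> [pre-norm attention + residual]
// -> [pre-norm MLP + residual] -> LM head.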
Value** gpt(int token_id, int pos_id, Value*** keys, Value*** values, int cache_len) {
    Value** tok_emb = &model.wte[token_id * N_EMBD];
    Value** pos_emb = &model.wpe[pos_id * N_EMBD];
    Value** x = alloc_ptrs(N_EMBD);
    for (int i = 0; i < N_EMBD; i++) x[i] = val_add(tok_emb[i], pos_emb[i]);
    x = rmsnorm(x, N_EMBD);
    Value** x_residual = x;
    x = rmsnorm(x, N_EMBD);
    Value** q = linear(x, model.attn_wq, N_EMBD, N_EMBD);
    Value** k = linear(x, model.attn_wk, N_EMBD, N_EMBD);
    Value** v = linear(x, model.attn_wv, N_EMBD, N_EMBD);
    keys[cache_len] = k;
    values[cache_len] = v;
    int curr_len = cache_len + 1;
    Value** x_attn = alloc_ptrs(N_EMBD);
    for (int h = 0; h < N_HEAD; h++) {
        int hs = h * HEAD_DIM;
        Value** q_h = &q[hs];
        Value** attn_logits = alloc_ptrs(curr_len);
        for (int t = 0; t < curr_len; t++) {
            Value* score = new_value(0.0f);
            for (int j = 0; j < HEAD_DIM; j++) {
                score = val_add(score, val_mul(q_h[j], keys[t][hs + j]));
            }
            attn_logits[t] = val_mul_const(score, 1.0f / sqrtf((float)HEAD_DIM));
        }
        Value** attn_weights = softmax(attn_logits, curr_len);
        for (int j = 0; j < HEAD_DIM; j++) {
            Value* head_out = new_value(0.0f);
            for (int t = 0; t < curr_len; t++) {
                head_out = val_add(head_out, val_mul(attn_weights[t], values[t][hs + j]));
            }
            x_attn[hs + j] = head_out;
        }
    }
    x = linear(x_attn, model.attn_wo, N_EMBD, N_EMBD);
    Value** attn_out = alloc_ptrs(N_EMBD);
    for (int i = 0; i < N_EMBD; i++) attn_out[i] = val_add(x[i], x_residual[i]);
    x = attn_out;
    x_residual = x;
    x = rmsnorm(x, N_EMBD);
    x = linear(x, model.mlp_fc1, 4 * N_EMBD, N_EMBD);
    Value** x_relu = alloc_ptrs(4 * N_EMBD);
    for (int i = 0; i < 4 * N_EMBD; i++) x_relu[i] = val_relu(x[i]);
    x = linear(x_relu, model.mlp_fc2, N_EMBD, 4 * N_EMBD);
    Value** mlp_out = alloc_ptrs(N_EMBD);
    for (int i = 0; i < N_EMBD; i++) mlp_out[i] = val_add(x[i], x_residual[i]);
    Value** logits = linear(mlp_out, model.lm_head, vocab_size, N_EMBD);
    return logits;
}
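// Fetches the makemore names dataset (one name per line), builds the list
// of documents, and derives a character vocabulary from the bytes seen.
// The final vocab_size++ reserves one extra id used as BOS/EOS.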
void load_dataset() {
    int ret = system("wget -qO input.txt https://raw.githubusercontent.com/karpathy/makemore/refs/heads/master/names.txt || curl -so input.txt https://raw.githubusercontent.com/karpathy/makemore/refs/heads/master/names.txt");
    (void)ret;
    FILE* f = fopen("input.txt", "r");
    if (!f) { printf("Failed to open input.txt\n"); exit(1); }
    char line[MAX_DOC_LEN];
    int char_counts[256] = {0};
    while (fgets(line, sizeof(line), f) && num_docs < MAX_DOCS) {
        int len = strlen(line);
        if (len > 0 && line[len-1] == '\n') line[--len] = '\0';
        if (len == 0) continue;
        docs[num_docs++] = strdup(line);
        for (int i = 0; i < len; i++) char_counts[(unsigned char)line[i]] = 1;
    }
    fclose(f);
    for (int i = 0; i < 256; i++) {
        if (char_counts[i]) uchars[vocab_size++] = i;
    }
    vocab_size++;
    printf("num docs: %d\nvocab size: %d\n", num_docs, vocab_size);
}
int char_to_id(char c) {
    for (int i = 0; i < vocab_size - 1; i++) if (uchars[i] == c) return i;
    return 0;
}
int main() {
    srand(42);
    load_dataset();
    int BOS = vocab_size - 1;
    model.wte = init_matrix(vocab_size, N_EMBD, 0.08f);
    model.wpe = init_matrix(BLOCK_SIZE, N_EMBD, 0.08f);
    model.lm_head = init_matrix(vocab_size, N_EMBD, 0.08f);
    model.attn_wq = init_matrix(N_EMBD, N_EMBD, 0.08f);
    model.attn_wk = init_matrix(N_EMBD, N_EMBD, 0.08f);
    model.attn_wv = init_matrix(N_EMBD, N_EMBD, 0.08f);
    model.attn_wo = init_matrix(N_EMBD, N_EMBD, 0.08f);
    model.mlp_fc1 = init_matrix(4 * N_EMBD, N_EMBD, 0.08f);
    model.mlp_fc2 = init_matrix(N_EMBD, 4 * N_EMBD, 0.08f);
    printf("num params: %d\n", num_params);
    float* m = calloc(num_params, sizeof(float));
    float* v = calloc(num_params, sizeof(float));
    float lr = 0.01f, beta1 = 0.85f, beta2 = 0.99f, eps_adam = 1e-8f;
    int num_steps = 1000;
    int base_arena_ptr = arena_ptr;
    int base_ptr_arena_ptr = ptr_arena_ptr;
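    // Training: one document per step, framed as BOS + chars + BOS and
    // truncated to BLOCK_SIZE. The loss is the mean cross-entropy
    // -log p(target) over all positions; the whole graph is rebuilt each
    // step and discarded by resetting the arenas at the bottom of the loop.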
    for (int step = 0; step < num_steps; step++) {
        char* doc = docs[step % num_docs];
        int len = strlen(doc);
        int tokens[MAX_DOC_LEN + 2];
        tokens[0] = BOS;
        for (int i = 0; i < len; i++) tokens[i+1] = char_to_id(doc[i]);
        tokens[len+1] = BOS;
        int n = (BLOCK_SIZE < len + 1) ? BLOCK_SIZE : len + 1;
        Value** keys[BLOCK_SIZE];
        Value** values[BLOCK_SIZE];
        Value** losses = alloc_ptrs(n);
        for (int pos_id = 0; pos_id < n; pos_id++) {
            int token_id = tokens[pos_id];
            int target_id = tokens[pos_id + 1];
            Value** logits = gpt(token_id, pos_id, keys, values, pos_id);
            Value** probs = softmax(logits, vocab_size);
            losses[pos_id] = val_mul_const(val_log(probs[target_id]), -1.0f);
        }
        Value* total_loss = new_value(0.0f);
        for (int i = 0; i < n; i++) total_loss = val_add(total_loss, losses[i]);
        Value* loss = val_mul_const(total_loss, 1.0f / n);
        backward(loss);
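        // Adam with bias correction and linear LR decay:
        //   m_t = b1*m + (1-b1)*g,  v_t = b2*v + (1-b2)*g^2
        //   m_hat = m_t / (1-b1^t), v_hat = v_t / (1-b2^t)
        //   p -= lr_t * m_hat / (sqrt(v_hat) + eps)
        // Only the first num_params arena slots are parameters, so the
        // optimizer walks exactly those and zeroes their grads for the
        // next step.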
        float lr_t = lr * (1.0f - (float)step / num_steps);
        for (int i = 0; i < num_params; i++) {
            Value* p = &arena[i];
            m[i] = beta1 * m[i] + (1.0f - beta1) * p->grad;
            v[i] = beta2 * v[i] + (1.0f - beta2) * (p->grad * p->grad);
            float m_hat = m[i] / (1.0f - powf(beta1, step + 1));
            float v_hat = v[i] / (1.0f - powf(beta2, step + 1));
            p->data -= lr_t * m_hat / (sqrtf(v_hat) + eps_adam);
            p->grad = 0.0f;
        }
        printf("step %4d / %4d | loss %.4f\n", step + 1, num_steps, loss->data);
        arena_ptr = base_arena_ptr;
        ptr_arena_ptr = base_ptr_arena_ptr;
    }
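    // Inference: sample autoregressively with temperature 0.5. Dividing
    // logits by the temperature before softmax sharpens the distribution
    // (T < 1 favors likely characters). Sampling walks the CDF of the
    // softmax output; generation stops at BOS or after BLOCK_SIZE steps.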
    float temperature = 0.5f;
    printf("\n--- inference (new, hallucinated names) ---\n");
    for (int sample_idx = 0; sample_idx < 20; sample_idx++) {
        Value** keys[BLOCK_SIZE];
        Value** values[BLOCK_SIZE];
        int token_id = BOS;
        char sample[BLOCK_SIZE + 1];
        int sample_len = 0;
        for (int pos_id = 0; pos_id < BLOCK_SIZE; pos_id++) {
            Value** logits = gpt(token_id, pos_id, keys, values, pos_id);
            Value** scaled_logits = alloc_ptrs(vocab_size);
            for (int i = 0; i < vocab_size; i++) scaled_logits[i] = val_mul_const(logits[i], 1.0f / temperature);
            Value** probs = softmax(scaled_logits, vocab_size);
            float r = (float)rand() / RAND_MAX;
            float cdf = 0.0f;
            for (int i = 0; i < vocab_size; i++) {
                cdf += probs[i]->data;
                if (r <= cdf || i == vocab_size - 1) { token_id = i; break; }
            }
            if (token_id == BOS) break;
            sample[sample_len++] = uchars[token_id];
        }
        sample[sample_len] = '\0';
        printf("sample %2d: %s\n", sample_idx + 1, sample);
        ptr_arena_ptr = base_ptr_arena_ptr;
        arena_ptr = base_arena_ptr;
    }
    return 0;
}
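
As a sanity check of the autograd engine, here is a minimal sketch of a test harness (mine, not part of the gist): it assumes you temporarily swap it in for main() in microgpt.c, builds f = a*b + relu(a), and compares the backpropagated gradients against the analytic values df/da = b + 1 and df/db = a (for a > 0).

/* Hypothetical sanity check, not part of the original gist: replace main()
   in microgpt.c with this to exercise the engine end to end. */
int main() {
    Value* a = new_value(2.0f);
    Value* b = new_value(3.0f);
    num_params = arena_ptr;  // mirror training: a and b occupy the parameter slots
    Value* f = val_add(val_mul(a, b), val_relu(a));  // f = a*b + relu(a)
    backward(f);
    printf("df/da = %.1f (expect b + 1 = 4.0)\n", a->grad);
    printf("df/db = %.1f (expect a = 2.0)\n", b->grad);
    return 0;
}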