microgpt by karpathy but in pure C
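A complete character-level GPT in a single C file: a micrograd-style scalar autograd engine, one pre-norm transformer block with a KV cache, Adam training on the makemore names dataset, and temperature-based sampling. No dependencies beyond libc and libm.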
Makefile
CC = gcc
CFLAGS = -O3 -Wall -Wextra
LDFLAGS = -lm

all: microgpt

microgpt: microgpt.c
	$(CC) $(CFLAGS) microgpt.c -o microgpt $(LDFLAGS)

clean:
	rm -f microgpt input.txt
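To build and run, assuming gcc plus either wget or curl are installed (load_dataset() fetches names.txt on startup): make && ./microgpt. Note that "make clean" also removes the downloaded input.txt.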
microgpt.c
/*
 * microgpt by karpathy, ported to pure C.
 * Original in Python: https://gist.github.com/karpathy/8627fe009c40f57531cb18360106ce95
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>

#ifndef M_PI  /* not guaranteed by strict ISO C */
#define M_PI 3.14159265358979323846
#endif
// --- Configuration ---
#define N_EMBD 16                  // embedding dimension
#define N_HEAD 4                   // attention heads
#define N_LAYER 1                  // transformer blocks (unused: gpt() hardcodes a single block)
#define BLOCK_SIZE 16              // maximum context length
#define HEAD_DIM (N_EMBD / N_HEAD)
#define MAX_NODES 2000000          // autograd arena size
#define MAX_PTRS 2000000           // pointer arena size
#define MAX_DOCS 50000             // max lines in dataset
#define MAX_DOC_LEN 256
#define MAX_VOCAB 256

// --- Global Dataset & Tokenizer ---
char* docs[MAX_DOCS];
int num_docs = 0;
char uchars[MAX_VOCAB];  // the unique characters appearing in the dataset
int vocab_size = 0;      // number of unique characters, plus one BOS token
// --- Autograd Engine ---
typedef struct Value {
    float data;
    float grad;
    struct Value* children[2];
    float local_grads[2];
    int num_children;
} Value;
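/*
 * Each Value is a node in the computation graph. Every operation records
 * its inputs (at most two) and the local derivatives d(out)/d(input) at
 * the moment the node is created, so backward() reduces to a single
 * multiply-accumulate per edge. This mirrors micrograd, but with plain
 * floats and fixed-size child arrays instead of Python objects.
 */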
// Memory arenas: nodes and pointer arrays are bump-allocated, so an entire
// step's computation graph is freed by resetting the arena pointers.
Value arena[MAX_NODES];
int arena_ptr = 0;
int num_params = 0;  // parameters occupy arena[0 .. num_params)
Value* ptr_arena[MAX_PTRS];
int ptr_arena_ptr = 0;

Value* new_value(float data) {
    if (arena_ptr >= MAX_NODES) { printf("Arena OOM!\n"); exit(1); }
    Value* v = &arena[arena_ptr++];
    v->data = data;
    v->grad = 0.0f;
    v->num_children = 0;
    return v;
}

Value** alloc_ptrs(int size) {
    if (ptr_arena_ptr + size > MAX_PTRS) { printf("Ptr Arena OOM!\n"); exit(1); }
    Value** p = &ptr_arena[ptr_arena_ptr];
    ptr_arena_ptr += size;
    return p;
}
Value* val_add(Value* a, Value* b) {
    Value* v = new_value(a->data + b->data);
    v->num_children = 2;
    v->children[0] = a; v->local_grads[0] = 1.0f;
    v->children[1] = b; v->local_grads[1] = 1.0f;
    return v;
}

Value* val_add_const(Value* a, float b) {
    Value* v = new_value(a->data + b);
    v->num_children = 1;
    v->children[0] = a; v->local_grads[0] = 1.0f;
    return v;
}

Value* val_mul(Value* a, Value* b) {
    Value* v = new_value(a->data * b->data);
    v->num_children = 2;
    v->children[0] = a; v->local_grads[0] = b->data;  // d(ab)/da = b
    v->children[1] = b; v->local_grads[1] = a->data;  // d(ab)/db = a
    return v;
}

Value* val_mul_const(Value* a, float b) {
    Value* v = new_value(a->data * b);
    v->num_children = 1;
    v->children[0] = a; v->local_grads[0] = b;
    return v;
}

Value* val_pow_const(Value* a, float b) {
    Value* v = new_value(powf(a->data, b));
    v->num_children = 1;
    v->children[0] = a; v->local_grads[0] = b * powf(a->data, b - 1.0f);  // d(a^b)/da = b*a^(b-1)
    return v;
}

Value* val_exp(Value* a) {
    Value* v = new_value(expf(a->data));
    v->num_children = 1;
    v->children[0] = a; v->local_grads[0] = v->data;  // d(e^a)/da = e^a
    return v;
}

Value* val_log(Value* a) {
    Value* v = new_value(logf(a->data));
    v->num_children = 1;
    v->children[0] = a; v->local_grads[0] = 1.0f / a->data;  // d(ln a)/da = 1/a
    return v;
}

Value* val_relu(Value* a) {
    Value* v = new_value(a->data > 0 ? a->data : 0.0f);
    v->num_children = 1;
    v->children[0] = a; v->local_grads[0] = a->data > 0 ? 1.0f : 0.0f;
    return v;
}
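/*
 * Nodes are appended to the arena in creation order, so every node comes
 * after its children; scanning the arena in reverse is therefore a valid
 * reverse-topological sweep and no explicit sort is needed. The scan stops
 * at num_params because parameters are leaves with no children; their
 * gradients have already been accumulated by that point.
 */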
void backward(Value* loss) {
    loss->grad = 1.0f;
    for (int i = arena_ptr - 1; i >= num_params; i--) {
        Value* v = &arena[i];
        for (int c = 0; c < v->num_children; c++) {
            v->children[c]->grad += v->local_grads[c] * v->grad;
        }
    }
}
// --- Parameter Initialization ---
float rand_gauss(float mean, float std) {
    // Box-Muller transform: two uniform samples -> one Gaussian sample.
    float u1 = (float)rand() / RAND_MAX;
    float u2 = (float)rand() / RAND_MAX;
    float z0 = sqrtf(-2.0f * logf(u1 + 1e-8f)) * cosf(2.0f * (float)M_PI * u2);
    return z0 * std + mean;
}

Value** init_matrix(int nout, int nin, float std) {
    Value** mat = alloc_ptrs(nout * nin);
    for (int i = 0; i < nout * nin; i++) {
        mat[i] = new_value(rand_gauss(0.0f, std));
        num_params++;
    }
    return mat;
}

struct {
    Value** wte; Value** wpe; Value** lm_head;
    Value** attn_wq; Value** attn_wk; Value** attn_wv; Value** attn_wo;
    Value** mlp_fc1; Value** mlp_fc2;
} model;
// --- Neural Network Operations ---
// Matrix-vector product: out = W x, where W is nout x nin (row-major).
Value** linear(Value** x, Value** w, int nout, int nin) {
    Value** out = alloc_ptrs(nout);
    for (int o = 0; o < nout; o++) {
        Value* sum = new_value(0.0f);
        for (int i = 0; i < nin; i++) {
            sum = val_add(sum, val_mul(w[o * nin + i], x[i]));
        }
        out[o] = sum;
    }
    return out;
}
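/*
 * Numerically stable softmax: subtracting the max logit before
 * exponentiating keeps expf() from overflowing, and the shift cancels in
 * the ratio, so both the outputs and their gradients are unchanged. The
 * max is a plain float, so no graph edge is created for the shift.
 */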
Value** softmax(Value** logits, int len) {
    float max_val = -1e9;
    for (int i = 0; i < len; i++) if (logits[i]->data > max_val) max_val = logits[i]->data;
    Value** exps = alloc_ptrs(len);
    Value* total = new_value(0.0f);
    for (int i = 0; i < len; i++) {
        exps[i] = val_exp(val_add_const(logits[i], -max_val));
        total = val_add(total, exps[i]);
    }
    Value** out = alloc_ptrs(len);
    Value* total_inv = val_pow_const(total, -1.0f);
    for (int i = 0; i < len; i++) {
        out[i] = val_mul(exps[i], total_inv);
    }
    return out;
}
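/* RMSNorm without a learned gain: out_i = x_i / sqrt(mean(x_j^2) + eps). */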
Value** rmsnorm(Value** x, int len) {
    Value* ms = new_value(0.0f);
    for (int i = 0; i < len; i++) ms = val_add(ms, val_mul(x[i], x[i]));
    ms = val_mul_const(ms, 1.0f / len);
    Value* scale = val_pow_const(val_add_const(ms, 1e-5f), -0.5f);
    Value** out = alloc_ptrs(len);
    for (int i = 0; i < len; i++) out[i] = val_mul(x[i], scale);
    return out;
}
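/*
 * One forward pass of the model for a single token position. The attention
 * keys/values computed here are appended to the caller's KV cache, so each
 * query attends over all positions seen so far (causality comes for free).
 * The block uses pre-norm residual connections: x + attn(rmsnorm(x)), then
 * x + mlp(rmsnorm(x)). Returns the logits over the vocabulary.
 */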
Value** gpt(int token_id, int pos_id, Value*** keys, Value*** values, int cache_len) {
    Value** tok_emb = &model.wte[token_id * N_EMBD];
    Value** pos_emb = &model.wpe[pos_id * N_EMBD];
    Value** x = alloc_ptrs(N_EMBD);
    for (int i = 0; i < N_EMBD; i++) x[i] = val_add(tok_emb[i], pos_emb[i]);
    x = rmsnorm(x, N_EMBD);
    // Attention block
    Value** x_residual = x;
    x = rmsnorm(x, N_EMBD);
    Value** q = linear(x, model.attn_wq, N_EMBD, N_EMBD);
    Value** k = linear(x, model.attn_wk, N_EMBD, N_EMBD);
    Value** v = linear(x, model.attn_wv, N_EMBD, N_EMBD);
    keys[cache_len] = k;
    values[cache_len] = v;
    int curr_len = cache_len + 1;
    Value** x_attn = alloc_ptrs(N_EMBD);
    for (int h = 0; h < N_HEAD; h++) {
        int hs = h * HEAD_DIM;
        Value** q_h = &q[hs];
        // Scaled dot-product scores against every cached key for this head.
        Value** attn_logits = alloc_ptrs(curr_len);
        for (int t = 0; t < curr_len; t++) {
            Value* score = new_value(0.0f);
            for (int j = 0; j < HEAD_DIM; j++) {
                score = val_add(score, val_mul(q_h[j], keys[t][hs + j]));
            }
            attn_logits[t] = val_mul_const(score, 1.0f / sqrtf((float)HEAD_DIM));
        }
        Value** attn_weights = softmax(attn_logits, curr_len);
        // Weighted sum of the cached values.
        for (int j = 0; j < HEAD_DIM; j++) {
            Value* head_out = new_value(0.0f);
            for (int t = 0; t < curr_len; t++) {
                head_out = val_add(head_out, val_mul(attn_weights[t], values[t][hs + j]));
            }
            x_attn[hs + j] = head_out;
        }
    }
    x = linear(x_attn, model.attn_wo, N_EMBD, N_EMBD);
    Value** attn_out = alloc_ptrs(N_EMBD);
    for (int i = 0; i < N_EMBD; i++) attn_out[i] = val_add(x[i], x_residual[i]);
    x = attn_out;
    // MLP block
    x_residual = x;
    x = rmsnorm(x, N_EMBD);
    x = linear(x, model.mlp_fc1, 4 * N_EMBD, N_EMBD);
    Value** x_relu = alloc_ptrs(4 * N_EMBD);
    for (int i = 0; i < 4 * N_EMBD; i++) x_relu[i] = val_relu(x[i]);
    x = linear(x_relu, model.mlp_fc2, N_EMBD, 4 * N_EMBD);
    Value** mlp_out = alloc_ptrs(N_EMBD);
    for (int i = 0; i < N_EMBD; i++) mlp_out[i] = val_add(x[i], x_residual[i]);
    Value** logits = linear(mlp_out, model.lm_head, vocab_size, N_EMBD);
    return logits;
}
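/*
 * Downloads the makemore names dataset (one name per line) and builds a
 * character-level tokenizer: each unique byte in the data gets an id, and
 * one extra id at the end of the vocabulary serves as the BOS token.
 */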
void load_dataset() {
    int ret = system("wget -qO input.txt https://raw.githubusercontent.com/karpathy/makemore/refs/heads/master/names.txt || curl -so input.txt https://raw.githubusercontent.com/karpathy/makemore/refs/heads/master/names.txt");
    (void)ret;
    FILE* f = fopen("input.txt", "r");
    if (!f) { printf("Failed to open input.txt\n"); exit(1); }
    char line[MAX_DOC_LEN];
    int char_counts[256] = {0};
    while (fgets(line, sizeof(line), f) && num_docs < MAX_DOCS) {
        int len = (int)strlen(line);
        if (len > 0 && line[len-1] == '\n') line[--len] = '\0';
        if (len == 0) continue;
        docs[num_docs++] = strdup(line);
        for (int i = 0; i < len; i++) char_counts[(unsigned char)line[i]] = 1;
    }
    fclose(f);
    for (int i = 0; i < 256; i++) {
        if (char_counts[i]) uchars[vocab_size++] = (char)i;
    }
    vocab_size++;  // reserve the last token id for BOS
    printf("num docs: %d\nvocab size: %d\n", num_docs, vocab_size);
}

int char_to_id(char c) {
    for (int i = 0; i < vocab_size - 1; i++) if (uchars[i] == c) return i;
    return 0;
}
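/*
 * Training: one document per step. Each name is wrapped in BOS tokens and
 * fed through the model position by position; the loss is the mean negative
 * log-likelihood of the next character. After backward(), the parameters
 * are updated with Adam under a linearly decaying learning rate, and both
 * arenas are reset to their post-init watermarks so each step's graph is
 * discarded wholesale.
 */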
int main() {
    srand(42);
    load_dataset();
    int BOS = vocab_size - 1;
    model.wte = init_matrix(vocab_size, N_EMBD, 0.08f);
    model.wpe = init_matrix(BLOCK_SIZE, N_EMBD, 0.08f);
    model.lm_head = init_matrix(vocab_size, N_EMBD, 0.08f);
    model.attn_wq = init_matrix(N_EMBD, N_EMBD, 0.08f);
    model.attn_wk = init_matrix(N_EMBD, N_EMBD, 0.08f);
    model.attn_wv = init_matrix(N_EMBD, N_EMBD, 0.08f);
    model.attn_wo = init_matrix(N_EMBD, N_EMBD, 0.08f);
    model.mlp_fc1 = init_matrix(4 * N_EMBD, N_EMBD, 0.08f);
    model.mlp_fc2 = init_matrix(N_EMBD, 4 * N_EMBD, 0.08f);
    printf("num params: %d\n", num_params);
    // Adam state
    float* m = calloc(num_params, sizeof(float));
    float* v = calloc(num_params, sizeof(float));
    float lr = 0.01f, beta1 = 0.85f, beta2 = 0.99f, eps_adam = 1e-8f;
    int num_steps = 1000;
    // Watermarks: everything allocated past these indices is transient graph memory.
    int base_arena_ptr = arena_ptr;
    int base_ptr_arena_ptr = ptr_arena_ptr;
    for (int step = 0; step < num_steps; step++) {
        char* doc = docs[step % num_docs];
        int len = (int)strlen(doc);
        int tokens[MAX_DOC_LEN + 2];
        tokens[0] = BOS;
        for (int i = 0; i < len; i++) tokens[i+1] = char_to_id(doc[i]);
        tokens[len+1] = BOS;
        int n = (BLOCK_SIZE < len + 1) ? BLOCK_SIZE : len + 1;  // number of prediction positions
        Value** keys[BLOCK_SIZE];
        Value** values[BLOCK_SIZE];
        Value** losses = alloc_ptrs(n);
        for (int pos_id = 0; pos_id < n; pos_id++) {
            int token_id = tokens[pos_id];
            int target_id = tokens[pos_id + 1];
            Value** logits = gpt(token_id, pos_id, keys, values, pos_id);
            Value** probs = softmax(logits, vocab_size);
            losses[pos_id] = val_mul_const(val_log(probs[target_id]), -1.0f);  // -log p(target)
        }
        Value* total_loss = new_value(0.0f);
        for (int i = 0; i < n; i++) total_loss = val_add(total_loss, losses[i]);
        Value* loss = val_mul_const(total_loss, 1.0f / n);
        backward(loss);
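        /*
         * Adam: m and v are exponential moving averages of the gradient and
         * its square; the (1 - beta^t) terms correct their startup bias, and
         * lr_t decays linearly to zero over training.
         */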
        float lr_t = lr * (1.0f - (float)step / num_steps);
        for (int i = 0; i < num_params; i++) {
            Value* p = &arena[i];
            m[i] = beta1 * m[i] + (1.0f - beta1) * p->grad;
            v[i] = beta2 * v[i] + (1.0f - beta2) * (p->grad * p->grad);
            float m_hat = m[i] / (1.0f - powf(beta1, step + 1));
            float v_hat = v[i] / (1.0f - powf(beta2, step + 1));
            p->data -= lr_t * m_hat / (sqrtf(v_hat) + eps_adam);
            p->grad = 0.0f;
        }
        printf("step %4d / %4d | loss %.4f\n", step + 1, num_steps, loss->data);
        arena_ptr = base_arena_ptr;
        ptr_arena_ptr = base_ptr_arena_ptr;
    }
    // --- Sampling ---
    float temperature = 0.5f;
    printf("\n--- inference (new, hallucinated names) ---\n");
    for (int sample_idx = 0; sample_idx < 20; sample_idx++) {
        Value** keys[BLOCK_SIZE];
        Value** values[BLOCK_SIZE];
        int token_id = BOS;
        char sample[BLOCK_SIZE + 1];
        int sample_len = 0;
        for (int pos_id = 0; pos_id < BLOCK_SIZE; pos_id++) {
            Value** logits = gpt(token_id, pos_id, keys, values, pos_id);
            Value** scaled_logits = alloc_ptrs(vocab_size);
            for (int i = 0; i < vocab_size; i++) scaled_logits[i] = val_mul_const(logits[i], 1.0f / temperature);
            Value** probs = softmax(scaled_logits, vocab_size);
            // Sample the next token from the temperature-scaled distribution.
            float r = (float)rand() / RAND_MAX;
            float cdf = 0.0f;
            for (int i = 0; i < vocab_size; i++) {
                cdf += probs[i]->data;
                if (r <= cdf || i == vocab_size - 1) { token_id = i; break; }
            }
            if (token_id == BOS) break;  // BOS doubles as the end-of-name token
            sample[sample_len++] = uchars[token_id];
        }
        sample[sample_len] = '\0';
        printf("sample %2d: %s\n", sample_idx + 1, sample);
        ptr_arena_ptr = base_ptr_arena_ptr;
        arena_ptr = base_arena_ptr;
    }
    return 0;
}
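A quick way to sanity-check the autograd engine is a finite-difference comparison on a tiny expression. The sketch below is not part of the gist: grad_check and eval_f are hypothetical helpers, assumed to be compiled in the same translation unit as microgpt.c and called at the start of main(), before any parameters are created.

/* Finite-difference check for f(a, b) = relu(a*b + a): the analytic df/da
   from backward() should match a central difference. Hypothetical helper,
   not part of the original gist. */
float eval_f(float a, float b) {
    float t = a * b + a;
    return t > 0 ? t : 0.0f;
}

void grad_check(void) {
    Value* a = new_value(1.5f);
    Value* b = new_value(-0.7f);
    Value* f = val_relu(val_add(val_mul(a, b), a));
    backward(f);
    float h = 1e-3f;
    float fd = (eval_f(1.5f + h, -0.7f) - eval_f(1.5f - h, -0.7f)) / (2.0f * h);
    /* Both should print ~0.3 (= b + 1 while the relu is active). */
    printf("analytic df/da = %.4f, finite diff = %.4f\n", a->grad, fd);
    arena_ptr = 0;  /* discard the tiny graph */
}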