Skip to content

Instantly share code, notes, and snippets.

@jonas1ara
Last active March 9, 2026 21:45
Show Gist options
  • Select an option

  • Save jonas1ara/218e759c330aeb5fc191b8f2c631dc07 to your computer and use it in GitHub Desktop.

Select an option

Save jonas1ara/218e759c330aeb5fc191b8f2c631dc07 to your computer and use it in GitHub Desktop.
#!/usr/bin/dotnet fsi
open System
open System.Collections.Generic
open System.Globalization
open System.IO
open System.Net.Http
open System.Numerics
open System.Text
/// Scalar node of a reverse-mode automatic-differentiation graph.
///
/// Each node stores its forward value (`Data`), the accumulated gradient of the
/// final output with respect to it (`Grad`), the nodes it was computed from
/// (`Children`) and the partial derivative of this node with respect to each
/// child (`LocalGrads`, same order and length as `Children`).
type Value(data: float, children: Value[], localGrads: float[]) =
    let mutable _data = data
    let mutable _grad = 0.0

    /// Creates a leaf node (no children), e.g. a parameter or a constant.
    new(data: float) = Value(data, [||], [||])

    /// Forward value; settable so an optimizer can update leaves in place.
    member _.Data with get() = _data and set(v) = _data <- v
    /// Accumulated gradient; starts at 0.0 and is added to by `Backward`.
    member _.Grad with get() = _grad and set(v) = _grad <- v
    member _.Children = children
    member _.LocalGrads = localGrads

    // Arithmetic operators. The float overloads wrap the float in a leaf Value
    // and delegate to the Value/Value form.
    static member (+)(a: Value, b: Value) = Value(a.Data + b.Data, [|a; b|], [|1.0; 1.0|])
    static member (+)(a: Value, b: float) = a + Value b
    static member (+)(a: float, b: Value) = Value a + b
    static member (*)(a: Value, b: Value) = Value(a.Data * b.Data, [|a; b|], [|b.Data; a.Data|])
    static member (*)(a: Value, b: float) = a * Value b
    static member (*)(a: float, b: Value) = Value a * b
    // Negation, subtraction and division are expressed through (+), (*) and Pow.
    static member (~-)(a: Value) = a * -1.0
    static member (-)(a: Value, b: Value) = a + (-b)
    static member (-)(a: float, b: Value) = Value a + (-b)
    static member (-)(a: Value, b: float) = a + (-(Value b))
    static member (/)(a: Value, b: Value) = a * b.Pow(-1.0)
    static member (/)(a: float, b: Value) = Value a * b.Pow(-1.0)
    static member (/)(a: Value, b: float) = a * Math.Pow(b, -1.0)

    /// Raises this node to a constant power; d/dx x^p = p * x^(p-1).
    member this.Pow(other: float) =
        Value(Math.Pow(_data, other), [|this|], [|other * Math.Pow(_data, other - 1.0)|])

    /// Natural logarithm; d/dx ln x = 1/x.
    member this.Log() = Value(Math.Log(_data), [|this|], [|1.0 / _data|])

    /// Exponential; the local gradient equals the forward value.
    member this.Exp() =
        let e = Math.Exp _data
        Value(e, [|this|], [|e|])

    /// Rectified linear unit; the local gradient is 1 for positive input, else 0.
    member this.Relu() =
        Value(Math.Max(0.0, _data), [|this|], [|if _data > 0.0 then 1.0 else 0.0|])

    /// Dot product of two equally sized vectors as a single graph node with
    /// 2n children (all of `a`, then all of `b`).
    /// Raises ArgumentException when the two vectors differ in length.
    static member Dot(a: ResizeArray<Value>, b: ResizeArray<Value>) =
        // Without this check a longer `b` would be silently truncated and a
        // shorter one would fail later with an unrelated index error.
        if a.Count <> b.Count then
            invalidArg "b" (sprintf "Dot requires vectors of equal length (a has %d, b has %d)" a.Count b.Count)
        let n = a.Count
        let children = Array.zeroCreate<Value> (2 * n)
        let localGrads = Array.zeroCreate<float> (2 * n)
        for i in 0 .. n - 1 do
            let va = a.[i]
            let vb = b.[i]
            children.[i] <- va
            children.[n + i] <- vb
            localGrads.[i] <- vb.Data // grad of a_i = b_i
            localGrads.[n + i] <- va.Data // grad of b_i = a_i
        // localGrads now holds b's data in [0, n) and a's data in [n, 2n), so the
        // forward value can be computed from it directly in Vector<float> chunks.
        let mutable dotData = 0.0
        let vecCount = Vector<float>.Count
        let vecLoopEnd = n - vecCount
        let mutable j = 0
        if vecLoopEnd >= 0 then
            let bDataSpan = localGrads.AsSpan(0, n)
            let aDataSpan = localGrads.AsSpan(n, n)
            let mutable sumVec = Vector<float>.Zero
            while j <= vecLoopEnd do
                let va = Vector<float>(aDataSpan.Slice(j, vecCount))
                let vb = Vector<float>(bDataSpan.Slice(j, vecCount))
                sumVec <- sumVec + va * vb
                j <- j + vecCount
            dotData <- Vector.Sum sumVec
        // Scalar tail for the elements that did not fill a whole vector.
        while j < n do
            dotData <- dotData + localGrads.[n + j] * localGrads.[j]
            j <- j + 1
        Value(dotData, children, localGrads)

    /// Back-propagates from this node: sets its gradient to 1.0 and adds
    /// `localGrad * parentGrad` into every reachable descendant's `Grad`.
    ///
    /// `topo`, `visited` and `stack` are caller-supplied scratch collections.
    /// This method does not clear them, so pass them in empty.
    member this.Backward(topo: ResizeArray<Value>, visited: HashSet<Value>, stack: Stack<struct (Value * int)>) =
        // Iterative post-order traversal: each stack entry is (node, index of
        // the next child to visit). A node is appended to `topo` only after all
        // of its children have been handled.
        stack.Push(struct (this, 0))
        while stack.Count > 0 do
            let struct (current, childIndex) = stack.Pop()
            let ch = current.Children
            if ch.Length > 0 && childIndex < ch.Length then
                stack.Push(struct (current, childIndex + 1))
                let child = ch.[childIndex]
                if visited.Add child then
                    stack.Push(struct (child, 0))
            else
                topo.Add current
        this.Grad <- 1.0
        // Walk parents before children so each node's gradient is complete
        // before it is pushed further down.
        for topoIdx = topo.Count - 1 downto 0 do
            let v = topo.[topoIdx]
            let vGrad = v.Grad
            // A zero gradient contributes nothing to the children; skip it.
            if vGrad <> 0.0 then
                let ch = v.Children
                if ch.Length > 0 then
                    let lg = v.LocalGrads
                    let len = ch.Length
                    // Unary and binary nodes are handled without a loop.
                    match len with
                    | 1 ->
                        ch.[0].Grad <- ch.[0].Grad + lg.[0] * vGrad
                    | 2 ->
                        ch.[0].Grad <- ch.[0].Grad + lg.[0] * vGrad
                        ch.[1].Grad <- ch.[1].Grad + lg.[1] * vGrad
                    | _ ->
                        for i in 0 .. len - 1 do
                            ch.[i].Grad <- ch.[i].Grad + lg.[i] * vGrad

    override _.ToString() = sprintf "Value(data=%f)" _data
/// Reads the value that follows `--<name>` in `args` and converts it to the
/// type of `defaultVal` (bool, int, float or string). Returns `defaultVal`
/// when the flag is absent; when it appears several times the last one wins.
/// A flag in the final position (no value after it) is ignored.
let parseArg (args: string[]) (name: string) (defaultVal: 'T) : 'T =
    // Converts the raw token to 'T based on the runtime type of the default.
    let convert (raw: string) : 'T =
        let target = typeof<'T>
        if target = typeof<bool> then box (raw.Equals("true", StringComparison.OrdinalIgnoreCase)) :?> 'T
        elif target = typeof<int> then box (int raw) :?> 'T
        elif target = typeof<float> then box (Double.Parse(raw, CultureInfo.InvariantCulture)) :?> 'T
        elif target = typeof<string> then box raw :?> 'T
        else failwithf "Invalid type %A for argument %s" typeof<'T> name
    // Only positions that still have a following token can carry a value.
    let isFlagAt idx =
        let token = args.[idx]
        token.StartsWith "--" && token.Substring 2 = name
    seq { 0 .. args.Length - 2 }
    |> Seq.filter isFlagAt
    |> Seq.fold (fun _ idx -> convert args.[idx + 1]) defaultVal
/// Draws one normally distributed sample with the given mean and standard
/// deviation from two uniform draws (sine branch of the Box-Muller transform).
let gauss (rng: Random) mean std =
    // Flip NextDouble() so the value handed to Log is never exactly zero.
    let drawUnit () = 1.0 - rng.NextDouble()
    let u1 = drawUnit ()
    let u2 = drawUnit ()
    let radius = Math.Sqrt(-2.0 * Math.Log u1)
    let angle = 2.0 * Math.PI * u2
    mean + std * radius * Math.Sin angle
/// Builds an `nout` x `nin` matrix of leaf Values, each initialised with a
/// `gauss rng 0.0 std` sample. Rows are filled one after another, left to right.
let createMatrix (rng: Random) nout nin std =
    let sampleWeight _ = Value(gauss rng 0.0 std)
    let buildRow _ = ResizeArray(List.init nin sampleWeight)
    ResizeArray(List.init nout buildRow)
/// Matrix-vector product: one `Value.Dot(row, x)` per row of `w`, in row order.
let linear (x: ResizeArray<Value>) (w: ResizeArray<ResizeArray<Value>>) =
    let out = ResizeArray<Value>(w.Count)
    for row in w do
        out.Add(Value.Dot(row, x))
    out
/// Softmax over `logits`. The largest raw value is subtracted (as a plain
/// float, so it is not part of the graph) before exponentiating.
let softmax (logits: ResizeArray<Value>) =
    let peak = logits |> Seq.map (fun v -> v.Data) |> Seq.max
    let exps = ResizeArray<Value>(logits.Count)
    for logit in logits do
        exps.Add((logit - peak).Exp())
    // Normaliser: running sum of the exponentials, starting from a zero leaf.
    let mutable total = Value 0.0
    for e in exps do
        total <- total + e
    let probs = ResizeArray<Value>(exps.Count)
    for e in exps do
        probs.Add(e / total)
    probs
/// Scales every element of `x` by (mean(x_i^2) + 1e-5)^(-0.5).
let rmsNorm (x: ResizeArray<Value>) =
    let mutable sumSq = Value 0.0
    for xi in x do
        sumSq <- sumSq + xi * xi
    let meanSq = sumSq / float x.Count
    // The small constant keeps the inverse root finite for an all-zero input.
    let invRms = (meanSq + 1e-5).Pow(-0.5)
    let normed = ResizeArray<Value>(x.Count)
    for xi in x do
        normed.Add(xi * invRms)
    normed
/// Returns a new list holding the elements of `lst` in random order
/// (Fisher-Yates, walking from the last index down). `lst` is left untouched.
let shuffle (rng: Random) (lst: ResizeArray<'T>) =
    let items = Array.ofSeq lst
    let mutable hi = items.Length - 1
    while hi >= 1 do
        // Pick a partner anywhere in [0, hi] and swap it into slot `hi`.
        let pick = rng.Next(hi + 1)
        let held = items.[pick]
        items.[pick] <- items.[hi]
        items.[hi] <- held
        hi <- hi - 1
    ResizeArray items
// Parse CLI arguments (e.g. --n_embd 16); every flag is optional and falls
// back to the default given here.
let args = fsi.CommandLineArgs.[1..]
let nEmbd = parseArg args "n_embd" 16
let nLayer = parseArg args "n_layer" 1
let blockSize = parseArg args "block_size" 8
let numSteps = parseArg args "num_steps" 10000
let nHead = parseArg args "n_head" 4
let learningRate = parseArg args "learning_rate" 1e-2
let seed = parseArg args "seed" 42
let inputUrl =
    parseArg args "input_url"
        "https://raw.githubusercontent.com/karpathy/makemore/988aa59/names.txt"
// The embedding is split evenly across heads. Fail fast with a clear message
// instead of truncating headDim (or dividing by zero) and crashing later
// inside the first forward pass.
if nHead <= 0 || nEmbd % nHead <> 0 then
    failwithf "n_embd (%d) must be divisible by a positive n_head (%d)" nEmbd nHead
let headDim = nEmbd / nHead
// Single seeded generator shared by shuffling, initialisation and sampling.
let random = Random seed
// Input dataset: download the corpus on first run, afterwards reuse input.txt
if File.Exists "input.txt" |> not then
    printfn "Downloading input.txt..."
    use http = new HttpClient()
    let body = http.GetStringAsync(inputUrl).GetAwaiter().GetResult()
    File.WriteAllText("input.txt", body)

// One document per non-empty, trimmed line, in shuffled order.
let docs =
    let lines =
        File.ReadAllLines "input.txt"
        |> Array.choose (fun raw ->
            let line = raw.Trim()
            if String.IsNullOrEmpty line then None else Some line)
    shuffle random (ResizeArray lines)

printfn "num docs: %d" docs.Count
// Tokenizer: character level. Ids 0 .. allChars.Length-1 are the sorted distinct
// characters of the corpus; one extra id (`bos`) marks sequence boundaries.
let allChars =
    docs
    |> Seq.collect (fun doc -> doc :> seq<char>)
    |> Set.ofSeq
    |> Set.toArray
let bos = allChars.Length
let vocabSize = bos + 1
printfn "vocab size: %d" vocabSize

/// Maps a character to its token id; raises if the character is not in the vocabulary.
let encode (c: char) = allChars |> Array.findIndex (fun known -> known = c)

/// Maps a token id back to its character; the boundary token is rendered as '.'.
let decode (i: int) =
    match i with
    | boundary when boundary = bos -> '.'
    | idx -> allChars.[idx]
// Model parameters, keyed by name. Matrices are created in a fixed order
// (embeddings, output head, then each layer) because every one draws from the
// shared `random` generator.
let stateDict =
    let table = Dictionary<string, ResizeArray<ResizeArray<Value>>>()
    let register (key: string) nout nin = table.[key] <- createMatrix random nout nin 0.08
    register "wte" vocabSize nEmbd
    register "wpe" blockSize nEmbd
    register "lm_head" vocabSize nEmbd
    for i in 0 .. nLayer - 1 do
        register (sprintf "layer%d.attn_wq" i) nEmbd nEmbd
        register (sprintf "layer%d.attn_wk" i) nEmbd nEmbd
        register (sprintf "layer%d.attn_wv" i) nEmbd nEmbd
        register (sprintf "layer%d.attn_wo" i) nEmbd nEmbd
        register (sprintf "layer%d.mlp_fc1" i) (4 * nEmbd) nEmbd
        register (sprintf "layer%d.mlp_fc2" i) nEmbd (4 * nEmbd)
    table

// Every scalar parameter flattened into one list (matrix by matrix, row by row).
let paramsList =
    [ for matrix in stateDict.Values do
        for row in matrix do
            yield! row ]
printfn "num params: %d" paramsList.Length
// Scratch collections handed to Value.Backward; allocated once and reused
let topo : ResizeArray<Value> = ResizeArray()
let visited : HashSet<Value> = HashSet()
let stack : Stack<struct (Value * int)> = Stack()
// Adam optimizer: decay rates, epsilon, and one first/second-moment slot per parameter
let beta1, beta2, epsAdam = 0.85, 0.99, 1e-8
let mArr : float[] = Array.zeroCreate paramsList.Length
let vArr : float[] = Array.zeroCreate paramsList.Length
// GPT function
/// Forward pass for one token at position `posId`. Appends this position's
/// key and value vectors to `keys.[layer]` / `values.[layer]` (one list per
/// layer) and attends over everything stored there so far. Returns one logit
/// per vocabulary entry.
let gpt tokenId posId
        (keys: ResizeArray<ResizeArray<Value>>[])
        (values: ResizeArray<ResizeArray<Value>>[]) =
    // Token embedding + position embedding, then an initial normalisation.
    let tokEmb = stateDict.["wte"].[tokenId]
    let posEmb = stateDict.["wpe"].[posId]
    let embedded = ResizeArray(Array.init nEmbd (fun i -> tokEmb.[i] + posEmb.[i]))
    let mutable x = rmsNorm embedded
    let attnScale = Math.Sqrt(float headDim)
    for li in 0 .. nLayer - 1 do
        // Attention block
        let attnResidual = ResizeArray x
        x <- rmsNorm x
        let q = linear x stateDict.[sprintf "layer%d.attn_wq" li]
        let k = linear x stateDict.[sprintf "layer%d.attn_wk" li]
        let v = linear x stateDict.[sprintf "layer%d.attn_wv" li]
        let layerKeys = keys.[li]
        let layerValues = values.[li]
        layerKeys.Add k
        layerValues.Add v
        let seqLen = layerKeys.Count
        let xAttn = ResizeArray<Value>()
        for h in 0 .. nHead - 1 do
            // Each head works on its own contiguous slice [offset, offset + headDim).
            let offset = h * headDim
            let attnLogits = ResizeArray<Value>(seqLen)
            for t in 0 .. seqLen - 1 do
                let keyRow = layerKeys.[t]
                let mutable score = Value 0.0
                for j in offset .. offset + headDim - 1 do
                    score <- score + q.[j] * keyRow.[j]
                attnLogits.Add(score / attnScale)
            let attnWeights = softmax attnLogits
            // Weighted sum of the stored value slices for this head.
            let headOut = Array.init headDim (fun _ -> Value 0.0)
            for t in 0 .. seqLen - 1 do
                let valueRow = layerValues.[t]
                let w = attnWeights.[t]
                for j in 0 .. headDim - 1 do
                    headOut.[j] <- headOut.[j] + w * valueRow.[offset + j]
            xAttn.AddRange headOut
        x <- linear xAttn stateDict.[sprintf "layer%d.attn_wo" li]
        for i in 0 .. nEmbd - 1 do
            x.[i] <- x.[i] + attnResidual.[i]
        // MLP
        let mlpResidual = ResizeArray x
        let hidden = linear (rmsNorm x) stateDict.[sprintf "layer%d.mlp_fc1" li]
        let activated = ResizeArray<Value>(hidden.Count)
        for hv in hidden do
            activated.Add(hv.Relu())
        x <- linear activated stateDict.[sprintf "layer%d.mlp_fc2" li]
        for i in 0 .. nEmbd - 1 do
            x.[i] <- x.[i] + mlpResidual.[i]
    linear x stateDict.["lm_head"]
// Training loop: one document per step, next-character cross-entropy, Adam update.
for step in 0 .. numSteps - 1 do
    // Tokenize the document and wrap it in boundary tokens on both sides.
    let doc = docs.[step % docs.Count]
    let tokens = ResizeArray<int>()
    tokens.Add bos
    tokens.AddRange(doc |> Seq.map encode)
    tokens.Add bos
    // Number of (input, target) pairs, capped at the context length.
    let n = min blockSize (tokens.Count - 1)
    // Initialize KV cache
    let keys = Array.init nLayer (fun _ -> ResizeArray<ResizeArray<Value>>())
    let values = Array.init nLayer (fun _ -> ResizeArray<ResizeArray<Value>>())
    let losses = ResizeArray<Value>()
    for posId in 0 .. n - 1 do
        let tokenId = tokens.[posId]
        let targetId = tokens.[posId + 1]
        let logits = gpt tokenId posId keys values
        let probs = softmax logits
        losses.Add(-probs.[targetId].Log())
    // Mean negative log-likelihood over the positions of this document.
    let mutable loss = Value 0.0
    for l in losses do loss <- loss + l
    loss <- loss * (1.0 / float n)
    // Reset parameter gradients and the shared scratch collections, then backprop.
    for p in paramsList do p.Grad <- 0.0
    topo.Clear()
    visited.Clear()
    stack.Clear()
    loss.Backward(topo, visited, stack)
    // Adam update with a linearly decaying learning rate.
    let lrT = learningRate * (1.0 - float step / float numSteps)
    // Bias-correction denominators depend only on the step, so compute them once.
    let biasCorr1 = 1.0 - beta1 ** float (step + 1)
    let biasCorr2 = 1.0 - beta2 ** float (step + 1)
    // Walk the parameters with a running index rather than paramsList.[i]:
    // indexing into an F# list is O(i), which made this loop quadratic.
    let mutable i = 0
    for p in paramsList do
        let g = p.Grad
        mArr.[i] <- beta1 * mArr.[i] + (1.0 - beta1) * g
        vArr.[i] <- beta2 * vArr.[i] + (1.0 - beta2) * (g ** 2.0)
        let mHat = mArr.[i] / biasCorr1
        let vHat = vArr.[i] / biasCorr2
        p.Data <- p.Data - lrT * mHat / (Math.Sqrt vHat + epsAdam)
        i <- i + 1
    if (step + 1) % 100 = 0 then
        printfn "step %4d / %4d | loss %.4f" (step + 1) numSteps loss.Data
// Inference: sample 20 names, one character at a time, until the boundary
// token is drawn or the context length is reached.
printfn "\n--- inference (new, hallucinated names) ---"
let temperature = 0.5
for sampleIdx in 0 .. 19 do
    // Fresh KV cache per sample.
    let keys = Array.init nLayer (fun _ -> ResizeArray<ResizeArray<Value>>())
    let values = Array.init nLayer (fun _ -> ResizeArray<ResizeArray<Value>>())
    let sample = StringBuilder()
    let mutable tokenId = bos
    let mutable posId = 0
    let mutable stop = false
    while posId < blockSize && not stop do
        let logits = gpt tokenId posId keys values
        // Dividing the logits by a temperature below 1 sharpens the distribution.
        let scaledLogits = ResizeArray<Value>(logits.Count)
        for l in logits do
            scaledLogits.Add(l / temperature)
        let probs = softmax scaledLogits
        // Weighted random choice
        let probsData = [| for p in probs -> p.Data |]
        let r = random.NextDouble() * Array.sum probsData
        // First index whose cumulative probability reaches r; the last index
        // is the fallback if rounding keeps the running sum below r.
        let rec pick idx cumulative =
            if idx >= probsData.Length then
                probsData.Length - 1
            else
                let upTo = cumulative + probsData.[idx]
                if r <= upTo then idx else pick (idx + 1) upTo
        tokenId <- pick 0 0.0
        if tokenId = bos then stop <- true
        else sample.Append(decode tokenId) |> ignore
        posId <- posId + 1
    printfn "sample %2d: %s" (sampleIdx + 1) (string sample)
@jonas1ara
Copy link
Author

jonas1ara commented Mar 6, 2026

I ported microgpt – Andrej Karpathy's elegant, dependency-free, single-file GPT implementation – to F#.

Karpathy's original (~200 LOC Python) is a masterpiece for learning transformers, autograd, and training loops without frameworks.

Martin Škuta elevated it significantly in C# with serious .NET optimizations: SIMD vectorization (System.Numerics.Vector), iterative backward pass to avoid recursion limits, zero-allocation hot paths, and loop unrolling.

Building on that optimized foundation, I created a functional F# version that keeps the same performance while embracing F# idioms:

  • Immutability by default + expressive pipelines (|>) for readable data flow
  • Strong type inference, concise syntax, no boilerplate
  • Explicit mutable only where needed
  • Stack-allocated structs and idiomatic collections

Run it instantly with dotnet fsi MicroGPT.fsx

You can customize the model and training with these arguments:

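| Argument | Default | Description |
|---|---|---|
| `--n_embd` | 16 | Embedding dimension (must be divisible by `--n_head`) |
| `--n_layer` | 1 | Number of transformer layers |
| `--block_size` | 8 | Context length (max tokens per forward pass) |
| `--num_steps` | 10000 | Training steps |
| `--n_head` | 4 | Number of attention heads |
| `--learning_rate` | 0.01 | Initial learning rate (linearly decayed) |
| `--seed` | 42 | Random seed for reproducibility |
| `--input_url` | `https://raw.githubusercontent.com/karpathy/makemore/988aa59/names.txt` | Dataset URL, downloaded to `input.txt` if that file does not exist |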

Example — larger model, more steps:

dotnet fsi MicroGPT.fsx --n_embd 64 --n_layer 4 --n_head 4 --block_size 16 --num_steps 50000

Great exercise to understand LLMs from first principles in a functional-first .NET language.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment