Skip to content

Instantly share code, notes, and snippets.

@skypenguins
Created November 29, 2025 17:24
Show Gist options
  • Select an option

  • Save skypenguins/ddf06ae8c590e3b7792b95254cd97946 to your computer and use it in GitHub Desktop.

Select an option

Save skypenguins/ddf06ae8c590e3b7792b95254cd97946 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"id": "4ae13933-1bb6-4ac9-8606-cac172afdec4",
"metadata": {},
"source": [
"# Running Julia + Flux.jl on GB10"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Julia Version 1.11.7\n",
"Commit f2b3dbda30a (2025-09-08 12:10 UTC)\n",
"Build Info:\n",
" Official https://julialang.org/ release\n",
"Platform Info:\n",
" OS: Linux (aarch64-linux-gnu)\n",
" CPU: 20 × unknown\n",
" WORD_SIZE: 64\n",
" LLVM: libLLVM-16.0.6 (ORCJIT, generic)\n",
"Threads: 1 default, 0 interactive, 1 GC (on 20 virtual cores)\n"
]
}
],
"source": [
"# Record the Julia build and host platform this notebook ran on.\n",
"versioninfo()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "6a59061b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"XXX XXX XX XX:XX:XX XXXX \n",
"+-----------------------------------------------------------------------------------------+\n",
"| NVIDIA-SMI 580.95.05 Driver Version: 580.95.05 CUDA Version: 13.0 |\n",
"+-----------------------------------------+------------------------+----------------------+\n",
"| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
"| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n",
"| | | MIG M. |\n",
"|=========================================+========================+======================|\n",
"| 0 NVIDIA GB10 On | XXXXXXXX:XX:XX.X On | N/A |\n",
"| N/A 41C P0 13W / N/A | Not Supported | 2% Default |\n",
"| | | N/A |\n",
"+-----------------------------------------+------------------------+----------------------+\n",
"\n",
"+-----------------------------------------------------------------------------------------+\n",
"| Processes: |\n",
"| GPU GI CI PID Type Process name GPU Memory |\n",
"| ID ID Usage |\n",
"|=========================================================================================|\n",
"| 0 N/A N/A XXXXX G /usr/lib/xorg/Xorg 301MiB |\n",
"| 0 N/A N/A XXXXX G /usr/bin/gnome-shell 207MiB |\n",
"| 0 N/A N/A XXXXX G .../7421/usr/lib/firefox/firefox 554MiB |\n",
"| 0 N/A N/A XXXXX G /usr/bin/nautilus 37MiB |\n",
"+-----------------------------------------------------------------------------------------+\n"
]
},
{
"data": {
"text/plain": [
"Process(`\u001b[4mnvidia-smi\u001b[24m`, ProcessExited(0))"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Query the NVIDIA driver for GPU status by running the external nvidia-smi command.\n",
"run(`nvidia-smi`)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "2258be8a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" total used free shared buff/cache available\n",
"Mem: 119Gi 8.0Gi 102Gi 129Mi 9.9Gi 111Gi\n",
"Swap: 15Gi 0B 15Gi\n"
]
},
{
"data": {
"text/plain": [
"Process(`\u001b[4mfree\u001b[24m \u001b[4m-h\u001b[24m`, ProcessExited(0))"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show host memory and swap usage in human-readable units.\n",
"run(`free -h`)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "44cb78c3",
"metadata": {},
"outputs": [],
"source": [
"using CUDA\n",
"using Flux\n",
"using Flux: onehotbatch, onecold\n",
"using Flux.Losses: logitcrossentropy\n",
"using MLDatasets\n",
"using MLUtils: DataLoader\n",
"using Optimisers\n",
"using Statistics\n",
"using ProgressMeter"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "fcfdf7af",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CUDA toolchain: \n",
"- runtime 13.0, artifact installation\n",
"- driver 580.95.5 for 13.0\n",
"- compiler 13.0\n",
"\n",
"CUDA libraries: \n",
"- CUBLAS: 13.1.0\n",
"- CURAND: 10.4.0\n",
"- CUFFT: 12.0.0\n",
"- CUSOLVER: 12.0.4\n",
"- CUSPARSE: 12.6.3\n",
"- CUPTI: 2025.3.1 (API 13.0.1)\n",
"- NVML: 13.0.0+580.95.5\n",
"\n",
"Julia packages: \n",
"- CUDA: 5.9.5\n",
"- CUDA_Driver_jll: 13.0.2+0\n",
"- CUDA_Compiler_jll: 0.3.0+0\n",
"- CUDA_Runtime_jll: 0.19.2+0\n",
"\n",
"Toolchain:\n",
"- Julia: 1.11.7\n",
"- LLVM: 16.0.6\n",
"\n",
"1 device:\n",
" 0: NVIDIA GB10 (sm_121, 101.059 GiB / 119.697 GiB available)\n"
]
}
],
"source": [
"# Print the CUDA toolchain, library and package versions and list the visible devices.\n",
"CUDA.versioninfo()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "13306bcf",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"getdata (generic function with 1 method)"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Build the MNIST training and test DataLoaders.\n",
"#\n",
"# `args` must expose a `batchsize` field. `device` is a callable (for example Flux's\n",
"# `gpu` or `cpu`) that is applied to each whole (features, labels) tuple before the\n",
"# tuple is wrapped in a DataLoader. Returns `(train_loader, test_loader)`; only the\n",
"# training loader shuffles.\n",
"function getdata(args, device)\n",
"    # DataDeps.jl reads this variable to skip its interactive download confirmation.\n",
"    ENV[\"DATADEPS_ALWAYS_ACCEPT\"] = \"true\"\n",
"\n",
"    # Turn one dataset split into a (flattened Float32 features, one-hot labels) pair.\n",
"    function preprocess(split)\n",
"        features = Flux.flatten(Float32.(split.features))\n",
"        labels = onehotbatch(split.targets, 0:9)\n",
"        return (features, labels)\n",
"    end\n",
"\n",
"    # Load both splits up front, then preprocess each one.\n",
"    mnist_train = MLDatasets.MNIST(split=:train)\n",
"    mnist_test = MLDatasets.MNIST(split=:test)\n",
"\n",
"    xtrain, ytrain = preprocess(mnist_train)\n",
"    xtest, ytest = preprocess(mnist_test)\n",
"\n",
"    # The full arrays are moved with `device` once, here, rather than per batch.\n",
"    train_loader = DataLoader(device((xtrain, ytrain)); batchsize=args.batchsize, shuffle=true)\n",
"    test_loader = DataLoader(device((xtest, ytest)); batchsize=args.batchsize)\n",
"\n",
"    return train_loader, test_loader\n",
"end"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "58b5c513",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"build_model (generic function with 1 method)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Construct the classifier: a 32-unit ReLU hidden layer followed by a linear layer\n",
"# that emits `nclasses` raw scores. The input width is the product of `imgsize`.\n",
"function build_model(; imgsize=(28, 28, 1), nclasses=10)\n",
"    n_inputs = prod(imgsize)\n",
"    hidden = Dense(n_inputs => 32, relu)\n",
"    readout = Dense(32 => nclasses)\n",
"    return Chain(hidden, readout)\n",
"end"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "be6873e8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"loss_and_accuracy (generic function with 1 method)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Evaluate `model` over every batch produced by `data_loader`.\n",
"#\n",
"# Returns `(loss, accuracy)`: the summed logit cross-entropy divided by the number of\n",
"# samples, and the fraction of samples whose predicted class equals the label.\n",
"# Samples are counted along the last dimension of each `x` batch.\n",
"function loss_and_accuracy(data_loader, model, device)\n",
"    total_loss = 0.0f0\n",
"    n_correct = 0\n",
"    n_seen = 0\n",
"    for (x, y) in data_loader\n",
"        xb = device(x)\n",
"        yb = device(y)\n",
"        logits = model(xb)\n",
"        # agg=sum keeps a running total so the final division gives a per-sample mean.\n",
"        total_loss += logitcrossentropy(logits, yb; agg=sum)\n",
"        n_correct += sum(onecold(logits) .== onecold(yb))\n",
"        n_seen += last(size(xb))\n",
"    end\n",
"    return total_loss / n_seen, n_correct / n_seen\n",
"end\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "fdba656b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Args"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Hyperparameters for `train`. Every field has a default and can be overridden by keyword.\n",
"Base.@kwdef mutable struct Args\n",
"    # learning rate\n",
"    η::Float64 = 3e-4\n",
"    # batch size\n",
"    batchsize::Int = 256\n",
"    # number of epochs\n",
"    epochs::Int = 10\n",
"    # use gpu (if cuda available)\n",
"    use_cuda::Bool = true\n",
"end"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "73a68083",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"train (generic function with 1 method)"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Train the MNIST classifier and return the trained model.\n",
"#\n",
"# Keyword arguments are forwarded to `Args` (η, batchsize, epochs, use_cuda).\n",
"# After every epoch the loss and accuracy on both loaders are logged with `@info`.\n",
"function train(; kws...)\n",
"    args = Args(; kws...)\n",
"\n",
"    # Device setup. The opt-out flag is tested first so that CUDA is not queried at\n",
"    # all when the caller passes use_cuda=false.\n",
"    device = if args.use_cuda && CUDA.functional()\n",
"        @info \"Training on CUDA GPU\"\n",
"        # Disallow scalar indexing of GPU arrays so accidental element-wise access\n",
"        # raises an error instead of running slowly.\n",
"        CUDA.allowscalar(false)\n",
"        gpu\n",
"    else\n",
"        @info \"Training on CPU\"\n",
"        cpu\n",
"    end\n",
"\n",
"    # Data (getdata applies `device` to the full arrays before batching)\n",
"    train_loader, test_loader = getdata(args, device)\n",
"\n",
"    # Model & optimizer. `Adam` is qualified because Flux and Optimisers both export\n",
"    # that name; on Flux releases where the two are distinct bindings the bare name\n",
"    # is ambiguous and fails to resolve.\n",
"    model = build_model() |> device\n",
"    opt_state = Optimisers.setup(Optimisers.Adam(args.η), model)\n",
"\n",
"    # Training loop\n",
"    @showprogress for epoch in 1:args.epochs\n",
"        for (x, y) in train_loader\n",
"            # Kept as a safeguard; batches should already be on `device`.\n",
"            x, y = device(x), device(y)\n",
"\n",
"            grads = Flux.gradient(model) do m\n",
"                logitcrossentropy(m(x), y)\n",
"            end\n",
"\n",
"            # update! returns the new optimizer state and model; rebind both.\n",
"            opt_state, model = Optimisers.update!(opt_state, model, grads[1])\n",
"        end\n",
"\n",
"        # Evaluation on the full training and test sets after each epoch\n",
"        train_loss, train_acc = loss_and_accuracy(train_loader, model, device)\n",
"        test_loss, test_acc = loss_and_accuracy(test_loader, model, device)\n",
"\n",
"        @info \"Epoch $epoch\" train_loss train_acc test_loss test_acc\n",
"    end\n",
"\n",
"    return model\n",
"end"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "4c2d78da",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[36m\u001b[1m[ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mTraining on CUDA GPU\n",
"\u001b[36m\u001b[1m┌ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mEpoch 1\n",
"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m train_loss = 0.55677825f0\n",
"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m train_acc = 0.8614333333333334\n",
"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m test_loss = 0.53808856f0\n",
"\u001b[36m\u001b[1m└ \u001b[22m\u001b[39m test_acc = 0.8693\n",
"\u001b[36m\u001b[1m┌ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mEpoch 2\n",
"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m train_loss = 0.38993272f0\n",
"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m train_acc = 0.8958166666666667\n",
"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m test_loss = 0.37647548f0\n",
"\u001b[36m\u001b[1m└ \u001b[22m\u001b[39m test_acc = 0.8987\n",
"\u001b[32mProgress: 20%|████████▎ | ETA: 0:01:39\u001b[39m\u001b[36m\u001b[1m┌ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mEpoch 3\n",
"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m train_loss = 0.33460784f0\n",
"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m train_acc = 0.90815\n",
"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m test_loss = 0.32494542f0\n",
"\u001b[36m\u001b[1m└ \u001b[22m\u001b[39m test_acc = 0.9103\n",
"\u001b[32mProgress: 30%|████████████▎ | ETA: 0:00:59\u001b[39m\u001b[36m\u001b[1m┌ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mEpoch 4\n",
"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m train_loss = 0.30106047f0\n",
"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m train_acc = 0.9171\n",
"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m test_loss = 0.29319176f0\n",
"\u001b[36m\u001b[1m└ \u001b[22m\u001b[39m test_acc = 0.9185\n",
"\u001b[32mProgress: 40%|████████████████▍ | ETA: 0:00:39\u001b[39m\u001b[36m\u001b[1m┌ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mEpoch 5\n",
"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m train_loss = 0.279726f0\n",
"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m train_acc = 0.92155\n",
"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m test_loss = 0.27430114f0\n",
"\u001b[36m\u001b[1m└ \u001b[22m\u001b[39m test_acc = 0.9238\n",
"\u001b[32mProgress: 50%|████████████████████▌ | ETA: 0:00:26\u001b[39m\u001b[36m\u001b[1m┌ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mEpoch 6\n",
"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m train_loss = 0.26288173f0\n",
"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m train_acc = 0.92675\n",
"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m test_loss = 0.25974548f0\n",
"\u001b[36m\u001b[1m└ \u001b[22m\u001b[39m test_acc = 0.9269\n",
"\u001b[32mProgress: 60%|████████████████████████▋ | ETA: 0:00:18\u001b[39m\u001b[36m\u001b[1m┌ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mEpoch 7\n",
"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m train_loss = 0.2496861f0\n",
"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m train_acc = 0.9304166666666667\n",
"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m test_loss = 0.2493859f0\n",
"\u001b[36m\u001b[1m└ \u001b[22m\u001b[39m test_acc = 0.9296\n",
"\u001b[32mProgress: 70%|████████████████████████████▊ | ETA: 0:00:12\u001b[39m\u001b[36m\u001b[1m┌ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mEpoch 8\n",
"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m train_loss = 0.23734426f0\n",
"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m train_acc = 0.9341\n",
"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m test_loss = 0.23815677f0\n",
"\u001b[36m\u001b[1m└ \u001b[22m\u001b[39m test_acc = 0.9314\n",
"\u001b[32mProgress: 80%|████████████████████████████████▊ | ETA: 0:00:07\u001b[39m\u001b[36m\u001b[1m┌ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mEpoch 9\n",
"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m train_loss = 0.2268096f0\n",
"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m train_acc = 0.9369666666666666\n",
"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m test_loss = 0.22869739f0\n",
"\u001b[36m\u001b[1m└ \u001b[22m\u001b[39m test_acc = 0.9344\n",
"\u001b[32mProgress: 90%|████████████████████████████████████▉ | ETA: 0:00:03\u001b[39m\u001b[36m\u001b[1m┌ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mEpoch 10\n",
"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m train_loss = 0.21937917f0\n",
"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m train_acc = 0.93865\n",
"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m test_loss = 0.22186221f0\n",
"\u001b[36m\u001b[1m└ \u001b[22m\u001b[39m test_acc = 0.9354\n",
"\u001b[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:27\u001b[39m\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" 30.506029 seconds (90.89 M allocations: 4.707 GiB, 2.40% gc time, 22 lock conflicts, 79.89% compilation time: 1% of which was recompilation)\n"
]
},
{
"data": {
"text/plain": [
"Chain(\n",
" Dense(784 => 32, relu), \u001b[90m# 25_120 parameters\u001b[39m\n",
" Dense(32 => 10), \u001b[90m# 330 parameters\u001b[39m\n",
") \u001b[90m # Total: 4 arrays, \u001b[39m25_450 parameters, 664 bytes."
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Train with the default Args and time the whole call. This is the first call to\n",
"# train() in the session, so the reported time includes JIT compilation.\n",
"@time model = train()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Julia 1.11",
"language": "julia",
"name": "julia-1.11"
},
"language_info": {
"file_extension": ".jl",
"mimetype": "application/julia",
"name": "julia",
"version": "1.11.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment