skypenguins · November 29, 2025 17:24
diff --git a/julia_with_dgx_spark_masked.ipynb b/julia_with_dgx_spark_masked.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "id": "4ae13933-1bb6-4ac9-8606-cac172afdec4",
   "metadata": {},
   "source": [
    "# Running Julia + Flux.jl on GB10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Julia Version 1.11.7\n",
      "Commit f2b3dbda30a (2025-09-08 12:10 UTC)\n",
      "Build Info:\n",
      "  Official https://julialang.org/ release\n",
      "Platform Info:\n",
      "  OS: Linux (aarch64-linux-gnu)\n",
      "  CPU: 20 × unknown\n",
      "  WORD_SIZE: 64\n",
      "  LLVM: libLLVM-16.0.6 (ORCJIT, generic)\n",
      "Threads: 1 default, 0 interactive, 1 GC (on 20 virtual cores)\n"
     ]
    }
   ],
   "source": [
    "versioninfo()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "6a59061b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "XXX XXX XX XX:XX:XX XXXX       \n",
      "+-----------------------------------------------------------------------------------------+\n",
      "| NVIDIA-SMI 580.95.05              Driver Version: 580.95.05      CUDA Version: 13.0     |\n",
      "+-----------------------------------------+------------------------+----------------------+\n",
      "| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |\n",
      "| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |\n",
      "|                                         |                        |               MIG M. |\n",
      "|=========================================+========================+======================|\n",
      "|   0  NVIDIA GB10                    On  |   XXXXXXXX:XX:XX.X  On |                  N/A |\n",
      "| N/A   41C    P0             13W /  N/A  | Not Supported          |      2%      Default |\n",
      "|                                         |                        |                  N/A |\n",
      "+-----------------------------------------+------------------------+----------------------+\n",
      "\n",
      "+-----------------------------------------------------------------------------------------+\n",
      "| Processes:                                                                              |\n",
      "|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |\n",
      "|        ID   ID                                                               Usage      |\n",
      "|=========================================================================================|\n",
      "|    0   N/A  N/A           XXXXX      G   /usr/lib/xorg/Xorg                      301MiB |\n",
      "|    0   N/A  N/A           XXXXX      G   /usr/bin/gnome-shell                    207MiB |\n",
      "|    0   N/A  N/A           XXXXX      G   .../7421/usr/lib/firefox/firefox        554MiB |\n",
      "|    0   N/A  N/A           XXXXX      G   /usr/bin/nautilus                        37MiB |\n",
      "+-----------------------------------------------------------------------------------------+\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Process(`\u001b[4mnvidia-smi\u001b[24m`, ProcessExited(0))"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "run(`nvidia-smi`)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "2258be8a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "               total        used        free      shared  buff/cache   available\n",
      "Mem:           119Gi       8.0Gi       102Gi       129Mi       9.9Gi       111Gi\n",
      "Swap:           15Gi          0B        15Gi\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Process(`\u001b[4mfree\u001b[24m \u001b[4m-h\u001b[24m`, ProcessExited(0))"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "run(`free -h`)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "44cb78c3",
   "metadata": {},
   "outputs": [],
   "source": [
    "using CUDA\n",
    "using Flux\n",
    "using Flux: onehotbatch, onecold\n",
    "using Flux.Losses: logitcrossentropy\n",
    "using MLDatasets\n",
    "using MLUtils: DataLoader\n",
    "using Optimisers\n",
    "using Statistics\n",
    "using ProgressMeter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "fcfdf7af",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CUDA toolchain: \n",
      "- runtime 13.0, artifact installation\n",
      "- driver 580.95.5 for 13.0\n",
      "- compiler 13.0\n",
      "\n",
      "CUDA libraries: \n",
      "- CUBLAS: 13.1.0\n",
      "- CURAND: 10.4.0\n",
      "- CUFFT: 12.0.0\n",
      "- CUSOLVER: 12.0.4\n",
      "- CUSPARSE: 12.6.3\n",
      "- CUPTI: 2025.3.1 (API 13.0.1)\n",
      "- NVML: 13.0.0+580.95.5\n",
      "\n",
      "Julia packages: \n",
      "- CUDA: 5.9.5\n",
      "- CUDA_Driver_jll: 13.0.2+0\n",
      "- CUDA_Compiler_jll: 0.3.0+0\n",
      "- CUDA_Runtime_jll: 0.19.2+0\n",
      "\n",
      "Toolchain:\n",
      "- Julia: 1.11.7\n",
      "- LLVM: 16.0.6\n",
      "\n",
      "1 device:\n",
      "  0: NVIDIA GB10 (sm_121, 101.059 GiB / 119.697 GiB available)\n"
     ]
    }
   ],
   "source": [
    "CUDA.versioninfo()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "13306bcf",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "getdata (generic function with 1 method)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Build the MNIST training and test DataLoaders.\n",
    "#\n",
    "# Arguments:\n",
    "#   args   - settings object; only `args.batchsize` is read here.\n",
    "#   device - callable applied to each (features, labels) tuple before it is\n",
    "#            wrapped in a DataLoader.\n",
    "#\n",
    "# Returns `(train_loader, test_loader)`; only the training loader shuffles.\n",
    "function getdata(args, device)\n",
    "    # Accept the DataDeps download prompt automatically so the cell never blocks.\n",
    "    ENV[\"DATADEPS_ALWAYS_ACCEPT\"] = \"true\"\n",
    "\n",
    "    # Load both splits of the dataset\n",
    "    raw_train = MLDatasets.MNIST(split=:train)\n",
    "    raw_test = MLDatasets.MNIST(split=:test)\n",
    "\n",
    "    # Preprocessing shared by both splits: features converted to Float32 and\n",
    "    # flattened with `Flux.flatten`, targets one-hot encoded over the labels 0:9.\n",
    "    to_arrays(split) = (Flux.flatten(Float32.(split.features)),\n",
    "                        onehotbatch(split.targets, 0:9))\n",
    "\n",
    "    train_xy = to_arrays(raw_train)\n",
    "    test_xy = to_arrays(raw_test)\n",
    "\n",
    "    # The complete arrays are passed through `device` once, up front; the\n",
    "    # loaders then serve batches of `args.batchsize` samples from them.\n",
    "    train_loader = DataLoader(device(train_xy); batchsize=args.batchsize, shuffle=true)\n",
    "    test_loader = DataLoader(device(test_xy); batchsize=args.batchsize)\n",
    "\n",
    "    return train_loader, test_loader\n",
    "end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "58b5c513",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "build_model (generic function with 1 method)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Build the classifier: a Dense hidden layer with relu followed by a Dense\n",
    "# output layer. No softmax is applied, so the model returns raw logits.\n",
    "#\n",
    "# Keyword arguments:\n",
    "#   imgsize  - shape of one input sample; its product is the input width.\n",
    "#   nclasses - width of the output layer.\n",
    "#   nhidden  - width of the hidden layer (default 32); a single keyword keeps\n",
    "#              the two Dense layers consistent with each other.\n",
    "function build_model(; imgsize=(28, 28, 1), nclasses=10, nhidden=32)\n",
    "    input_size = prod(imgsize)\n",
    "\n",
    "    return Chain(\n",
    "        Dense(input_size => nhidden, relu),\n",
    "        Dense(nhidden => nclasses)\n",
    "    )\n",
    "end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "be6873e8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "loss_and_accuracy (generic function with 1 method)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Evaluate `model` over every batch of `data_loader`.\n",
    "#\n",
    "# Returns `(mean_loss, accuracy)`: the summed logit cross-entropy and the\n",
    "# number of correct predictions, each divided by the number of samples seen.\n",
    "function loss_and_accuracy(data_loader, model, device)\n",
    "    total_loss = 0.0f0\n",
    "    n_correct = 0\n",
    "    n_seen = 0\n",
    "    for (x, y) in data_loader\n",
    "        xb = device(x)\n",
    "        yb = device(y)\n",
    "        logits = model(xb)\n",
    "        # agg=sum accumulates the per-batch total so the final division by\n",
    "        # n_seen yields a per-sample mean even if the last batch is smaller.\n",
    "        total_loss += logitcrossentropy(logits, yb; agg=sum)\n",
    "        # A prediction is correct when the arg-max class matches the label.\n",
    "        n_correct += sum(onecold(logits) .== onecold(yb))\n",
    "        # The last dimension of the batch is the sample count.\n",
    "        n_seen += last(size(xb))\n",
    "    end\n",
    "    return total_loss / n_seen, n_correct / n_seen\n",
    "end\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "fdba656b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Args"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Training settings. `@kwdef` generates a keyword constructor, so any subset\n",
    "# of the fields can be overridden, e.g. `Args(epochs=5)`.\n",
    "@kwdef mutable struct Args\n",
    "    # learning rate\n",
    "    η::Float64 = 3e-4\n",
    "    # number of samples per mini-batch\n",
    "    batchsize::Int = 256\n",
    "    # number of passes over the training set\n",
    "    epochs::Int = 10\n",
    "    # use the GPU (only if CUDA is available)\n",
    "    use_cuda::Bool = true\n",
    "end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "73a68083",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "train (generic function with 1 method)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Train the MNIST classifier and return the trained model.\n",
    "#\n",
    "# Keyword arguments are forwarded unchanged to `Args(; kws...)`. After each\n",
    "# epoch the loss and accuracy on both loaders are logged with `@info`.\n",
    "function train(; kws...)\n",
    "    args = Args(; kws...)\n",
    "\n",
    "    # Device setup. `use_cuda` is checked first so CUDA is not probed at all\n",
    "    # when the caller asked for the CPU.\n",
    "    device = if args.use_cuda && CUDA.functional()\n",
    "        @info \"Training on CUDA GPU\"\n",
    "        CUDA.allowscalar(false)  # scalar indexing of GPU arrays becomes an error\n",
    "        gpu\n",
    "    else\n",
    "        @info \"Training on CPU\"\n",
    "        cpu\n",
    "    end\n",
    "\n",
    "    # Data\n",
    "    train_loader, test_loader = getdata(args, device)\n",
    "\n",
    "    # Model & optimizer state\n",
    "    model = device(build_model())\n",
    "    opt_state = Optimisers.setup(Adam(args.η), model)\n",
    "\n",
    "    # Training loop\n",
    "    @showprogress for epoch in 1:args.epochs\n",
    "        for (x, y) in train_loader\n",
    "            x, y = device(x), device(y)\n",
    "\n",
    "            # Gradient of the batch loss with respect to the model.\n",
    "            grads = Flux.gradient(model) do m\n",
    "                logitcrossentropy(m(x), y)\n",
    "            end\n",
    "\n",
    "            # `update!` returns the new optimizer state and model; rebind both.\n",
    "            opt_state, model = Optimisers.update!(opt_state, model, grads[1])\n",
    "        end\n",
    "\n",
    "        # Evaluation over the complete training and test loaders\n",
    "        train_loss, train_acc = loss_and_accuracy(train_loader, model, device)\n",
    "        test_loss, test_acc = loss_and_accuracy(test_loader, model, device)\n",
    "\n",
    "        @info \"Epoch $epoch\" train_loss train_acc test_loss test_acc\n",
    "    end\n",
    "\n",
    "    return model\n",
    "end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "4c2d78da",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[36m\u001b[1m[ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mTraining on CUDA GPU\n",
      "\u001b[36m\u001b[1m┌ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mEpoch 1\n",
      "\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m  train_loss = 0.55677825f0\n",
      "\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m  train_acc = 0.8614333333333334\n",
      "\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m  test_loss = 0.53808856f0\n",
      "\u001b[36m\u001b[1m└ \u001b[22m\u001b[39m  test_acc = 0.8693\n",
      "\u001b[36m\u001b[1m┌ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mEpoch 2\n",
      "\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m  train_loss = 0.38993272f0\n",
      "\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m  train_acc = 0.8958166666666667\n",
      "\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m  test_loss = 0.37647548f0\n",
      "\u001b[36m\u001b[1m└ \u001b[22m\u001b[39m  test_acc = 0.8987\n",
      "\u001b[32mProgress:  20%|████████▎                                |  ETA: 0:01:39\u001b[39m\u001b[36m\u001b[1m┌ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mEpoch 3\n",
      "\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m  train_loss = 0.33460784f0\n",
      "\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m  train_acc = 0.90815\n",
      "\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m  test_loss = 0.32494542f0\n",
      "\u001b[36m\u001b[1m└ \u001b[22m\u001b[39m  test_acc = 0.9103\n",
      "\u001b[32mProgress:  30%|████████████▎                            |  ETA: 0:00:59\u001b[39m\u001b[36m\u001b[1m┌ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mEpoch 4\n",
      "\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m  train_loss = 0.30106047f0\n",
      "\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m  train_acc = 0.9171\n",
      "\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m  test_loss = 0.29319176f0\n",
      "\u001b[36m\u001b[1m└ \u001b[22m\u001b[39m  test_acc = 0.9185\n",
      "\u001b[32mProgress:  40%|████████████████▍                        |  ETA: 0:00:39\u001b[39m\u001b[36m\u001b[1m┌ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mEpoch 5\n",
      "\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m  train_loss = 0.279726f0\n",
      "\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m  train_acc = 0.92155\n",
      "\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m  test_loss = 0.27430114f0\n",
      "\u001b[36m\u001b[1m└ \u001b[22m\u001b[39m  test_acc = 0.9238\n",
      "\u001b[32mProgress:  50%|████████████████████▌                    |  ETA: 0:00:26\u001b[39m\u001b[36m\u001b[1m┌ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mEpoch 6\n",
      "\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m  train_loss = 0.26288173f0\n",
      "\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m  train_acc = 0.92675\n",
      "\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m  test_loss = 0.25974548f0\n",
      "\u001b[36m\u001b[1m└ \u001b[22m\u001b[39m  test_acc = 0.9269\n",
      "\u001b[32mProgress:  60%|████████████████████████▋                |  ETA: 0:00:18\u001b[39m\u001b[36m\u001b[1m┌ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mEpoch 7\n",
      "\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m  train_loss = 0.2496861f0\n",
      "\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m  train_acc = 0.9304166666666667\n",
      "\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m  test_loss = 0.2493859f0\n",
      "\u001b[36m\u001b[1m└ \u001b[22m\u001b[39m  test_acc = 0.9296\n",
      "\u001b[32mProgress:  70%|████████████████████████████▊            |  ETA: 0:00:12\u001b[39m\u001b[36m\u001b[1m┌ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mEpoch 8\n",
      "\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m  train_loss = 0.23734426f0\n",
      "\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m  train_acc = 0.9341\n",
      "\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m  test_loss = 0.23815677f0\n",
      "\u001b[36m\u001b[1m└ \u001b[22m\u001b[39m  test_acc = 0.9314\n",
      "\u001b[32mProgress:  80%|████████████████████████████████▊        |  ETA: 0:00:07\u001b[39m\u001b[36m\u001b[1m┌ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mEpoch 9\n",
      "\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m  train_loss = 0.2268096f0\n",
      "\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m  train_acc = 0.9369666666666666\n",
      "\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m  test_loss = 0.22869739f0\n",
      "\u001b[36m\u001b[1m└ \u001b[22m\u001b[39m  test_acc = 0.9344\n",
      "\u001b[32mProgress:  90%|████████████████████████████████████▉    |  ETA: 0:00:03\u001b[39m\u001b[36m\u001b[1m┌ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mEpoch 10\n",
      "\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m  train_loss = 0.21937917f0\n",
      "\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m  train_acc = 0.93865\n",
      "\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m  test_loss = 0.22186221f0\n",
      "\u001b[36m\u001b[1m└ \u001b[22m\u001b[39m  test_acc = 0.9354\n",
      "\u001b[32mProgress: 100%|█████████████████████████████████████████| Time: 0:00:27\u001b[39m\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " 30.506029 seconds (90.89 M allocations: 4.707 GiB, 2.40% gc time, 22 lock conflicts, 79.89% compilation time: 1% of which was recompilation)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Chain(\n",
       "  Dense(784 => 32, relu),               \u001b[90m# 25_120 parameters\u001b[39m\n",
       "  Dense(32 => 10),                      \u001b[90m# 330 parameters\u001b[39m\n",
       ") \u001b[90m                  # Total: 4 arrays, \u001b[39m25_450 parameters, 664 bytes."
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Run training \n",
    "@time model = train()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Julia 1.11",
   "language": "julia",
   "name": "julia-1.11"
  },
  "language_info": {
   "file_extension": ".jl",
   "mimetype": "application/julia",
   "name": "julia",
   "version": "1.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"id": "4ae13933-1bb6-4ac9-8606-cac172afdec4",
	"metadata": {},
	"source": [
	"# Running Julia + Flux.jl on GB10"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"id": "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Julia Version 1.11.7\n",
	"Commit f2b3dbda30a (2025-09-08 12:10 UTC)\n",
	"Build Info:\n",
	" Official https://julialang.org/ release\n",
	"Platform Info:\n",
	" OS: Linux (aarch64-linux-gnu)\n",
	" CPU: 20 × unknown\n",
	" WORD_SIZE: 64\n",
	" LLVM: libLLVM-16.0.6 (ORCJIT, generic)\n",
	"Threads: 1 default, 0 interactive, 1 GC (on 20 virtual cores)\n"
	]
	}
	],
	"source": [
	"versioninfo()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"id": "6a59061b",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"XXX XXX XX XX:XX:XX XXXX \n",
	"+-----------------------------------------------------------------------------------------+\n",
	"\| NVIDIA-SMI 580.95.05 Driver Version: 580.95.05 CUDA Version: 13.0 \|\n",
	"+-----------------------------------------+------------------------+----------------------+\n",
	"\| GPU Name Persistence-M \| Bus-Id Disp.A \| Volatile Uncorr. ECC \|\n",
	"\| Fan Temp Perf Pwr:Usage/Cap \| Memory-Usage \| GPU-Util Compute M. \|\n",
	"\| \| \| MIG M. \|\n",
	"\|=========================================+========================+======================\|\n",
	"\| 0 NVIDIA GB10 On \| XXXXXXXX:XX:XX.X On \| N/A \|\n",
	"\| N/A 41C P0 13W / N/A \| Not Supported \| 2% Default \|\n",
	"\| \| \| N/A \|\n",
	"+-----------------------------------------+------------------------+----------------------+\n",
	"\n",
	"+-----------------------------------------------------------------------------------------+\n",
	"\| Processes: \|\n",
	"\| GPU GI CI PID Type Process name GPU Memory \|\n",
	"\| ID ID Usage \|\n",
	"\|=========================================================================================\|\n",
	"\| 0 N/A N/A XXXXX G /usr/lib/xorg/Xorg 301MiB \|\n",
	"\| 0 N/A N/A XXXXX G /usr/bin/gnome-shell 207MiB \|\n",
	"\| 0 N/A N/A XXXXX G .../7421/usr/lib/firefox/firefox 554MiB \|\n",
	"\| 0 N/A N/A XXXXX G /usr/bin/nautilus 37MiB \|\n",
	"+-----------------------------------------------------------------------------------------+\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"Process(`\u001b[4mnvidia-smi\u001b[24m`, ProcessExited(0))"
	]
	},
	"execution_count": 2,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"run(`nvidia-smi`)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"id": "2258be8a",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" total used free shared buff/cache available\n",
	"Mem: 119Gi 8.0Gi 102Gi 129Mi 9.9Gi 111Gi\n",
	"Swap: 15Gi 0B 15Gi\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"Process(`\u001b[4mfree\u001b[24m \u001b[4m-h\u001b[24m`, ProcessExited(0))"
	]
	},
	"execution_count": 3,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"run(`free -h`)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"id": "44cb78c3",
	"metadata": {},
	"outputs": [],
	"source": [
	"using CUDA\n",
	"using Flux\n",
	"using Flux: onehotbatch, onecold\n",
	"using Flux.Losses: logitcrossentropy\n",
	"using MLDatasets\n",
	"using MLUtils: DataLoader\n",
	"using Optimisers\n",
	"using Statistics\n",
	"using ProgressMeter"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"id": "fcfdf7af",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CUDA toolchain: \n",
	"- runtime 13.0, artifact installation\n",
	"- driver 580.95.5 for 13.0\n",
	"- compiler 13.0\n",
	"\n",
	"CUDA libraries: \n",
	"- CUBLAS: 13.1.0\n",
	"- CURAND: 10.4.0\n",
	"- CUFFT: 12.0.0\n",
	"- CUSOLVER: 12.0.4\n",
	"- CUSPARSE: 12.6.3\n",
	"- CUPTI: 2025.3.1 (API 13.0.1)\n",
	"- NVML: 13.0.0+580.95.5\n",
	"\n",
	"Julia packages: \n",
	"- CUDA: 5.9.5\n",
	"- CUDA_Driver_jll: 13.0.2+0\n",
	"- CUDA_Compiler_jll: 0.3.0+0\n",
	"- CUDA_Runtime_jll: 0.19.2+0\n",
	"\n",
	"Toolchain:\n",
	"- Julia: 1.11.7\n",
	"- LLVM: 16.0.6\n",
	"\n",
	"1 device:\n",
	" 0: NVIDIA GB10 (sm_121, 101.059 GiB / 119.697 GiB available)\n"
	]
	}
	],
	"source": [
	"CUDA.versioninfo()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"id": "13306bcf",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"getdata (generic function with 1 method)"
	]
	},
	"execution_count": 6,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Build the MNIST training and test DataLoaders.\n",
	"#\n",
	"# Arguments:\n",
	"#   args   - settings object; only `args.batchsize` is read here.\n",
	"#   device - callable applied to each (features, labels) tuple before it is\n",
	"#            wrapped in a DataLoader.\n",
	"#\n",
	"# Returns `(train_loader, test_loader)`; only the training loader shuffles.\n",
	"function getdata(args, device)\n",
	"    # Accept the DataDeps download prompt automatically so the cell never blocks.\n",
	"    ENV[\"DATADEPS_ALWAYS_ACCEPT\"] = \"true\"\n",
	"\n",
	"    # Load both splits of the dataset\n",
	"    raw_train = MLDatasets.MNIST(split=:train)\n",
	"    raw_test = MLDatasets.MNIST(split=:test)\n",
	"\n",
	"    # Preprocessing shared by both splits: features converted to Float32 and\n",
	"    # flattened with `Flux.flatten`, targets one-hot encoded over the labels 0:9.\n",
	"    to_arrays(split) = (Flux.flatten(Float32.(split.features)),\n",
	"                        onehotbatch(split.targets, 0:9))\n",
	"\n",
	"    train_xy = to_arrays(raw_train)\n",
	"    test_xy = to_arrays(raw_test)\n",
	"\n",
	"    # The complete arrays are passed through `device` once, up front; the\n",
	"    # loaders then serve batches of `args.batchsize` samples from them.\n",
	"    train_loader = DataLoader(device(train_xy); batchsize=args.batchsize, shuffle=true)\n",
	"    test_loader = DataLoader(device(test_xy); batchsize=args.batchsize)\n",
	"\n",
	"    return train_loader, test_loader\n",
	"end"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"id": "58b5c513",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"build_model (generic function with 1 method)"
	]
	},
	"execution_count": 7,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Build the classifier: a Dense hidden layer with relu followed by a Dense\n",
	"# output layer. No softmax is applied, so the model returns raw logits.\n",
	"#\n",
	"# Keyword arguments:\n",
	"#   imgsize  - shape of one input sample; its product is the input width.\n",
	"#   nclasses - width of the output layer.\n",
	"#   nhidden  - width of the hidden layer (default 32); a single keyword keeps\n",
	"#              the two Dense layers consistent with each other.\n",
	"function build_model(; imgsize=(28, 28, 1), nclasses=10, nhidden=32)\n",
	"    input_size = prod(imgsize)\n",
	"\n",
	"    return Chain(\n",
	"        Dense(input_size => nhidden, relu),\n",
	"        Dense(nhidden => nclasses)\n",
	"    )\n",
	"end"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"id": "be6873e8",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"loss_and_accuracy (generic function with 1 method)"
	]
	},
	"execution_count": 8,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Evaluate `model` over every batch of `data_loader`.\n",
	"#\n",
	"# Returns `(mean_loss, accuracy)`: the summed logit cross-entropy and the\n",
	"# number of correct predictions, each divided by the number of samples seen.\n",
	"function loss_and_accuracy(data_loader, model, device)\n",
	"    total_loss = 0.0f0\n",
	"    n_correct = 0\n",
	"    n_seen = 0\n",
	"    for (x, y) in data_loader\n",
	"        xb = device(x)\n",
	"        yb = device(y)\n",
	"        logits = model(xb)\n",
	"        # agg=sum accumulates the per-batch total so the final division by\n",
	"        # n_seen yields a per-sample mean even if the last batch is smaller.\n",
	"        total_loss += logitcrossentropy(logits, yb; agg=sum)\n",
	"        # A prediction is correct when the arg-max class matches the label.\n",
	"        n_correct += sum(onecold(logits) .== onecold(yb))\n",
	"        # The last dimension of the batch is the sample count.\n",
	"        n_seen += last(size(xb))\n",
	"    end\n",
	"    return total_loss / n_seen, n_correct / n_seen\n",
	"end\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"id": "fdba656b",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"Args"
	]
	},
	"execution_count": 9,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Training settings. `@kwdef` generates a keyword constructor, so any subset\n",
	"# of the fields can be overridden, e.g. `Args(epochs=5)`.\n",
	"@kwdef mutable struct Args\n",
	"    # learning rate\n",
	"    η::Float64 = 3e-4\n",
	"    # number of samples per mini-batch\n",
	"    batchsize::Int = 256\n",
	"    # number of passes over the training set\n",
	"    epochs::Int = 10\n",
	"    # use the GPU (only if CUDA is available)\n",
	"    use_cuda::Bool = true\n",
	"end"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"id": "73a68083",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"train (generic function with 1 method)"
	]
	},
	"execution_count": 10,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Train the MNIST classifier and return the trained model.\n",
	"#\n",
	"# Keyword arguments are forwarded unchanged to `Args(; kws...)`. After each\n",
	"# epoch the loss and accuracy on both loaders are logged with `@info`.\n",
	"function train(; kws...)\n",
	"    args = Args(; kws...)\n",
	"\n",
	"    # Device setup. `use_cuda` is checked first so CUDA is not probed at all\n",
	"    # when the caller asked for the CPU.\n",
	"    device = if args.use_cuda && CUDA.functional()\n",
	"        @info \"Training on CUDA GPU\"\n",
	"        CUDA.allowscalar(false)  # scalar indexing of GPU arrays becomes an error\n",
	"        gpu\n",
	"    else\n",
	"        @info \"Training on CPU\"\n",
	"        cpu\n",
	"    end\n",
	"\n",
	"    # Data\n",
	"    train_loader, test_loader = getdata(args, device)\n",
	"\n",
	"    # Model & optimizer state\n",
	"    model = device(build_model())\n",
	"    opt_state = Optimisers.setup(Adam(args.η), model)\n",
	"\n",
	"    # Training loop\n",
	"    @showprogress for epoch in 1:args.epochs\n",
	"        for (x, y) in train_loader\n",
	"            x, y = device(x), device(y)\n",
	"\n",
	"            # Gradient of the batch loss with respect to the model.\n",
	"            grads = Flux.gradient(model) do m\n",
	"                logitcrossentropy(m(x), y)\n",
	"            end\n",
	"\n",
	"            # `update!` returns the new optimizer state and model; rebind both.\n",
	"            opt_state, model = Optimisers.update!(opt_state, model, grads[1])\n",
	"        end\n",
	"\n",
	"        # Evaluation over the complete training and test loaders\n",
	"        train_loss, train_acc = loss_and_accuracy(train_loader, model, device)\n",
	"        test_loss, test_acc = loss_and_accuracy(test_loader, model, device)\n",
	"\n",
	"        @info \"Epoch $epoch\" train_loss train_acc test_loss test_acc\n",
	"    end\n",
	"\n",
	"    return model\n",
	"end"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"id": "4c2d78da",
	"metadata": {},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"\u001b[36m\u001b[1m[ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mTraining on CUDA GPU\n",
	"\u001b[36m\u001b[1m┌ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mEpoch 1\n",
	"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m train_loss = 0.55677825f0\n",
	"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m train_acc = 0.8614333333333334\n",
	"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m test_loss = 0.53808856f0\n",
	"\u001b[36m\u001b[1m└ \u001b[22m\u001b[39m test_acc = 0.8693\n",
	"\u001b[36m\u001b[1m┌ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mEpoch 2\n",
	"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m train_loss = 0.38993272f0\n",
	"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m train_acc = 0.8958166666666667\n",
	"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m test_loss = 0.37647548f0\n",
	"\u001b[36m\u001b[1m└ \u001b[22m\u001b[39m test_acc = 0.8987\n",
	"\u001b[32mProgress: 20%\|████████▎ \| ETA: 0:01:39\u001b[39m\u001b[36m\u001b[1m┌ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mEpoch 3\n",
	"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m train_loss = 0.33460784f0\n",
	"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m train_acc = 0.90815\n",
	"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m test_loss = 0.32494542f0\n",
	"\u001b[36m\u001b[1m└ \u001b[22m\u001b[39m test_acc = 0.9103\n",
	"\u001b[32mProgress: 30%\|████████████▎ \| ETA: 0:00:59\u001b[39m\u001b[36m\u001b[1m┌ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mEpoch 4\n",
	"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m train_loss = 0.30106047f0\n",
	"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m train_acc = 0.9171\n",
	"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m test_loss = 0.29319176f0\n",
	"\u001b[36m\u001b[1m└ \u001b[22m\u001b[39m test_acc = 0.9185\n",
	"\u001b[32mProgress: 40%\|████████████████▍ \| ETA: 0:00:39\u001b[39m\u001b[36m\u001b[1m┌ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mEpoch 5\n",
	"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m train_loss = 0.279726f0\n",
	"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m train_acc = 0.92155\n",
	"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m test_loss = 0.27430114f0\n",
	"\u001b[36m\u001b[1m└ \u001b[22m\u001b[39m test_acc = 0.9238\n",
	"\u001b[32mProgress: 50%\|████████████████████▌ \| ETA: 0:00:26\u001b[39m\u001b[36m\u001b[1m┌ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mEpoch 6\n",
	"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m train_loss = 0.26288173f0\n",
	"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m train_acc = 0.92675\n",
	"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m test_loss = 0.25974548f0\n",
	"\u001b[36m\u001b[1m└ \u001b[22m\u001b[39m test_acc = 0.9269\n",
	"\u001b[32mProgress: 60%\|████████████████████████▋ \| ETA: 0:00:18\u001b[39m\u001b[36m\u001b[1m┌ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mEpoch 7\n",
	"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m train_loss = 0.2496861f0\n",
	"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m train_acc = 0.9304166666666667\n",
	"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m test_loss = 0.2493859f0\n",
	"\u001b[36m\u001b[1m└ \u001b[22m\u001b[39m test_acc = 0.9296\n",
	"\u001b[32mProgress: 70%\|████████████████████████████▊ \| ETA: 0:00:12\u001b[39m\u001b[36m\u001b[1m┌ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mEpoch 8\n",
	"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m train_loss = 0.23734426f0\n",
	"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m train_acc = 0.9341\n",
	"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m test_loss = 0.23815677f0\n",
	"\u001b[36m\u001b[1m└ \u001b[22m\u001b[39m test_acc = 0.9314\n",
	"\u001b[32mProgress: 80%\|████████████████████████████████▊ \| ETA: 0:00:07\u001b[39m\u001b[36m\u001b[1m┌ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mEpoch 9\n",
	"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m train_loss = 0.2268096f0\n",
	"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m train_acc = 0.9369666666666666\n",
	"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m test_loss = 0.22869739f0\n",
	"\u001b[36m\u001b[1m└ \u001b[22m\u001b[39m test_acc = 0.9344\n",
	"\u001b[32mProgress: 90%\|████████████████████████████████████▉ \| ETA: 0:00:03\u001b[39m\u001b[36m\u001b[1m┌ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mEpoch 10\n",
	"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m train_loss = 0.21937917f0\n",
	"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m train_acc = 0.93865\n",
	"\u001b[36m\u001b[1m│ \u001b[22m\u001b[39m test_loss = 0.22186221f0\n",
	"\u001b[36m\u001b[1m└ \u001b[22m\u001b[39m test_acc = 0.9354\n",
	"\u001b[32mProgress: 100%\|█████████████████████████████████████████\| Time: 0:00:27\u001b[39m\n"
	]
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" 30.506029 seconds (90.89 M allocations: 4.707 GiB, 2.40% gc time, 22 lock conflicts, 79.89% compilation time: 1% of which was recompilation)\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"Chain(\n",
	" Dense(784 => 32, relu), \u001b[90m# 25_120 parameters\u001b[39m\n",
	" Dense(32 => 10), \u001b[90m# 330 parameters\u001b[39m\n",
	") \u001b[90m # Total: 4 arrays, \u001b[39m25_450 parameters, 664 bytes."
	]
	},
	"execution_count": 11,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Run training \n",
	"@time model = train()"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Julia 1.11",
	"language": "julia",
	"name": "julia-1.11"
	},
	"language_info": {
	"file_extension": ".jl",
	"mimetype": "application/julia",
	"name": "julia",
	"version": "1.11.7"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}
No results found