# Grab the name of the first node in the cluster and open a root shell on it
# via the kubectl node-shell plugin (https://github.com/kvaps/kubectl-node-shell).
NODE_NAME="$(kubectl get node -o jsonpath="{.items[0].metadata.name}")"
kubectl node-shell "${NODE_NAME}"
# Then copy rebuild_kernel.sh onto the node (e.g. to /opt/rebuild_kernel.sh) and run:
#   bash rebuild_kernel.sh
# It'll reboot into the new kernel if successful.
#!/usr/bin/env bash
# NVIDIA driver / fabricmanager / CUDA pairing notes:
# version must be available from https://developer.download.nvidia.com/compute/nvidia-driver/redist/fabricmanager/linux-x86_64/
# version must be paired with a cuda release from https://developer.nvidia.com/cuda-downloads
# not every driver version releases fabricmanager artifacts or pairs with a cuda version -_-
# we recommend using production or LTS branches when possible.
# e.g. 12.6.2 + 560.35.03 are here
# https://developer.nvidia.com/cuda-12-6-2-download-archive?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=22.04&target_type=runfile_local
# https://developer.download.nvidia.com/compute/nvidia-driver/redist/fabricmanager/linux-x86_64/fabricmanager-linux-x86_64-560.35.03-archive.tar.xz
NVIDIA_DRIVER_VERSION="560.35.03"

# install nvidia device plugin (without env var)
kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/main/deployments/static/nvidia-device-plugin-compat-with-cpumanager.yml

# ssh OR nsenter node using node-shell + privileged pod.
# tried both to eliminate any container mount issues — same behavior.
# https://github.com/kvaps/kubectl-node-shell
kubectl node-shell aks-nca100-36400834-vmss000000
# Kyverno ClusterPolicy intended to exclude all pods from HTTP-proxy mutation.
# NOTE(review): excerpt is truncated — the body under `match.any:` continues
# beyond this snippet; do not apply as-is.
apiVersion: kyverno.io/v1
kind: ClusterPolicy
metadata:
  name: exclude-all-pods-http-proxy
spec:
  # only mutate at admission time; don't retroactively patch existing pods
  # when the policy itself is updated
  mutateExistingOnPolicyUpdate: false
  rules:
    - name: pod-ns
      match:
        any:
# DaemonSet manifest for the NVIDIA device plugin in kube-system.
# NOTE(review): excerpt is truncated — `updateStrategy:` and the pod template
# continue beyond this snippet; do not apply as-is.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-device-plugin-daemonset
  namespace: kube-system
spec:
  selector:
    matchLabels:
      name: nvidia-device-plugin-ds
  updateStrategy:
| import http.client | |
| import xml.etree.ElementTree as ET | |
| from urllib.parse import urlparse | |
| import json | |
| from subprocess import Popen, PIPE, STDOUT | |
| import base64 | |
| try: | |
| # request goalstate from wireserver | |
| wireserver = "168.63.129.16" |
# Resource group / cluster naming for the repro environment.
export GROUP=ace-mig
export NAME=ace-mig
export LOCATION=eastus
az group create -g "${GROUP}" -l "${LOCATION}"
# create a cluster with a default pool with some typical parameters.
# not really relevant.
# only key piece is use k8s version >= 1.25.0 for Ubuntu 22.04 with cgroupv2.
# n.b.: nodes are in 172.18.0.0/16
# Derive a deterministic IPv6 unique-local prefix from the local IPv4 subnet:
# hash subnet range with sha256 -> first 10 hex digits for prefix
hash_prefix=$(ip r | grep -E "\/[0-9]+ dev eth0" | cut -d' ' -f1 | sha256sum | head -c 10)
# prepend with fd for unique local address prefix for 6rd routing
rd_prefix="fd${hash_prefix}"
# add colons between each 4 hex chars (fdXXXXXXXXXX -> fdXX:XXXX:XXXX)
rd_prefix_formatted=$(echo "${rd_prefix}" | fold -w4 | paste -sd:)
# local IPv4 address on eth0 (6th field of the 'inet' line, /prefix stripped)
local_addr=$(ip a show dev eth0 | grep -E 'inet ' | cut -d' ' -f6 | cut -d'/' -f1)
# get local IPv4 subnet as XX.XX.XX.XX/XX
# DaemonSet using a YAML anchor (&name) so the metadata name, labels, and
# selector stay in sync from a single definition.
# NOTE(review): excerpt is truncated — the pod template continues beyond this
# snippet; do not apply as-is.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: &name kubelet-killer-30sec
  labels:
    app: *name
spec:
  selector:
    matchLabels:
      app: *name
# for context, this is a 22.04 hetzner machine upgraded from 20.04 with maybe a kernel I rebuilt (?) I forgot.
root@Ubuntu-2004-focal-64-minimal ~ # cat /etc/os-release
PRETTY_NAME="Ubuntu 22.04.2 LTS"
NAME="Ubuntu"
VERSION_ID="22.04"
VERSION="22.04.2 LTS (Jammy Jellyfish)"
VERSION_CODENAME=jammy
ID=ubuntu
ID_LIKE=debian
HOME_URL="https://www.ubuntu.com/"