Created
June 18, 2024 11:41
-
-
Save zceemja/fe727e57ad57d7fddc5052eef3b9d10c to your computer and use it in GitHub Desktop.
Keep the same environment across different Linux hosts with different NVIDIA drivers
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Run a command (default: an interactive /bin/bash) inside an lxroot container
# on a server, including CUDA support.
# Check https://github.com/parke/lxroot
#
# Usage: <this script> [command [args...]]
#
# The main idea: I have a bunch of servers at university running slightly
# different versions of the nvidia driver and of python, and it is a real
# headache to keep one single environment for all of them.
#
# With this script each host gets its own ld.so.cache that links to the correct
# place for the nvidia driver. This is done by storing the cache in
# ./ld/$HOSTNAME/ and binding it to /usr/host/ld/ inside the container.
#
# Setup:
# 1) You need a working lxroot container (tested to work with an archlinux
#    container on a bunch of centos 8 / rocky 9.x hosts).
#    Also note that this assumes that the work directory is shared over NFS
#    across all servers.
# 2) In the container, add "include /usr/host/ld/*.conf" to "/etc/ld.so.conf"
# 3) In the container, install nvidia drivers to /usr/lib/nvidia/$version
#    e.g. you should find /usr/lib/nvidia/550.90.07/libnvidia-ml.so.550.90.07
#    You should keep as many versions as there are unique versions on the servers.
#    From experience you want to install all libs fresh from nvidia rather than
#    binding .so files from the host, as the host may have x86 but not x86_64
#    versions (or vice versa) of what the container expects.
# 4) Optionally write some script to point to the correct /usr/bin/nvidia/$version too

# Options are set here rather than in the shebang so that they also apply when
# the script is started as "bash <script>".
set -e

HERE=$(dirname -- "$(readlink -f -- "$0")")
ROOT="$HERE/root.x86_64"
LXROOT="$HERE/lxroot"

# Do not leak host-specific library / CUDA settings into the container.
unset LD_LIBRARY_PATH CUDA_HOME CNN_HOME TF_CUDA_PATHS XLA_FLAGS

export TZ='Europe/London' # helps with some issues

# dbus-launch prints VAR=value lines; they are word-split into separate
# arguments for export on purpose. The guard avoids running a bare "export"
# (which would dump the whole environment) when dbus-launch fails or prints
# nothing.
# NOTE(review): the daemon started here is never stopped by this script.
if dbus_env=$(dbus-launch) && [[ -n "$dbus_env" ]]; then
  # shellcheck disable=SC2086,SC2163
  export $dbus_env
fi

# Terminal escape sequences. tput fails when TERM is unset or unknown (e.g. a
# non-interactive run); in that case fall back to empty strings instead of
# letting "set -e" abort the script.
_color() { tput "$@" 2>/dev/null || true; }
C0=$(_color sgr0)
C1=$(_color setaf 1)
C2=$(_color setaf 2)
C3=$(_color setaf 3)
C5=$(_color setaf 5)
BOLD=$(_color bold)

# Arguments handed to lxroot as "bind <path in container> <path on host>" triples.
BIND=(
  bind "$HOME" "$HOME"
  bind /tmp /tmp
  bind /run /run
  bind /dev /dev
  bind /proc /proc
)

# Make sure user is registered, adds multi-user support!
# Compare the login-name field exactly; a plain substring search would also
# match other users' names or home directories that merely contain $USER.
if ! awk -F: -v u="$USER" '$1 == u { found = 1 } END { exit !found }' \
    "$ROOT/etc/passwd"; then
  if command -v showuser >/dev/null 2>&1; then
    # showuser is a site-specific tool; presumably it prints a line of the form
    # "<label>: Full Name <more>" — verify against the local installation.
    UNAME=$(showuser "$USER")
    # Only the first match is used so a multi-line result cannot corrupt passwd.
    NAME=$(sed -n 's/.*: \([A-Za-z ]*\) .*/\1/p' <<<"$UNAME" | head -n 1)
  else
    # Fallback for hosts without showuser: first sub-field of the GECOS field.
    NAME=$(getent passwd "$USER" | cut -d: -f5 | cut -d, -f1)
  fi
  echo "Adding user ${C2}$USER${C5} ($NAME)${C0}"
  printf '%s\n' "$USER:x:$UID:$(id -g "$USER"):$NAME:$HOME:/bin/bash" \
    >> "$ROOT/etc/passwd"
fi

# Each host gets its own ld cache
_LDDIR="$HERE/ld/$HOSTNAME"
mkdir -p "$_LDDIR"
BIND+=(bind /usr/host/ld "$_LDDIR")

if [[ -f /sys/module/nvidia/version ]]; then
  _NVVER=$(</sys/module/nvidia/version) # Host NV driver version
  # nvidia.conf holds the library directory last selected for this host; its
  # final path component is the version the container is currently set up for.
  if [[ -f "$_LDDIR/nvidia.conf" ]] && _CNVVER=$(<"$_LDDIR/nvidia.conf"); then
    _CNVVER=${_CNVVER##*/} # Container version
  else
    _CNVVER="${BOLD}*no version*${C0}"
  fi
  if [[ ! -d "$ROOT/usr/lib/nvidia/$_NVVER" ]]; then
    echo "${C1}CUDA will not work with server driver: container missing nvidia driver version ${C2}$_NVVER${C0}!"
  elif [[ "$_NVVER" != "$_CNVVER" ]]; then
    echo "/usr/lib/nvidia/$_NVVER" > "$_LDDIR/nvidia.conf"
    echo "${C3}Updated nvidia driver version from ${C2}$_CNVVER${C3} to ${C2}$_NVVER${C0}"
    rm -fv "$_LDDIR/ld.so.cache" # update cache next
  fi
fi

# Make sure the directories that get bound exist in the container. This runs
# before the first lxroot invocation so that /usr/host/ld is already present
# when the cache is rebuilt below.
for (( i = 0; i + 1 < ${#BIND[@]}; ++i )); do
  if [[ ${BIND[i]} == 'bind' ]]; then
    mkdir -vp -- "${ROOT}/${BIND[i + 1]}"
  fi
done

# Rebuild cache if needed
if [[ ! -f "$_LDDIR/ld.so.cache" ]]; then
  echo "${C3}Rebuilding ld cache..${C0}"
  # -f: on a fresh container the file may not exist yet; a plain rm would make
  # "set -e" abort the script here.
  rm -f -- "$ROOT/etc/ld.so.cache"
  ln -s "/usr/host/ld/ld.so.cache" "$ROOT/etc/ld.so.cache"
  "$LXROOT" rw "$ROOT" -rw bind /usr/host/ld "$_LDDIR" -- ldconfig -C /usr/host/ld/ld.so.cache
fi

# Without a command (or with an empty first argument) start an interactive shell.
if [[ -z "${1-}" ]]; then
  set -- /bin/bash
fi

# "$@" and "${BIND[@]}" are quoted so that arguments containing spaces reach
# lxroot and the command unchanged.
"$LXROOT" rw "$ROOT" -nwe "${BIND[@]}" cd "$(pwd)" -- "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment