@zceemja
Created June 18, 2024 11:41
Keep the same environment across different Linux hosts with different NVIDIA drivers
#!/bin/bash -e
# This is a script to run an lxroot container on a server, including CUDA support.
# See https://github.com/parke/lxroot
#
# The main idea: I have a bunch of servers at university running slightly different versions
# of the nvidia driver and of python, and it is a real headache to keep one single environment for all of them.
#
# With this script each container gets its own ld.so.cache that points to the correct place for the host's nvidia driver.
# This is done by storing the cache in ./ld/$hostname/ and binding that directory to /usr/host/ld/ (example layout below).
#
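#
# Illustrative layout after a couple of hosts have run the script (the hostnames and the
# second driver version are made-up examples):
#   ./ld/server-a/nvidia.conf   - contains "/usr/lib/nvidia/550.90.07"
#   ./ld/server-a/ld.so.cache   - built by ldconfig inside the container
#   ./ld/server-b/nvidia.conf   - contains "/usr/lib/nvidia/535.183.01"
#   ./ld/server-b/ld.so.cache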
# Setup:
# 1) You need a working lxroot container (tested with an archlinux container on a bunch of centos 8 / rocky 9.x hosts).
#    Also note that this assumes the work directory is shared over NFS across all servers.
# 2) In the container, add "include /usr/host/ld/*.conf" to "/etc/ld.so.conf".
# 3) In the container, install the nvidia driver libraries to /usr/lib/nvidia/$version,
#    e.g. you should find /usr/lib/nvidia/550.90.07/libnvidia-ml.so.550.90.07.
#    Keep as many versions as there are unique driver versions across the servers.
#    From experience you want to install all libs fresh from nvidia rather than binding the .so files from
#    the host, as the host may not have the x86 or x86_64 variants (or vice versa) the container expects.
#    (See the example commands below.)
# 4) Optionally add some script to point to the correct /usr/bin/nvidia/$version too.
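#
# A rough sketch of steps 2) and 3), run once from the host; the paths assume the container
# root is ./root.x86_64 (as below) and 550.90.07 is just an example version:
#   echo 'include /usr/host/ld/*.conf' >> root.x86_64/etc/ld.so.conf
#   mkdir -p root.x86_64/usr/lib/nvidia/550.90.07
#   # then copy/extract the matching driver's libraries into that directory (e.g. from the
#   # NVIDIA .run installer), so that libnvidia-ml.so.550.90.07 etc. end up there
#
# Usage: run this script with no arguments for an interactive /bin/bash inside the container,
# or pass a command to run that command inside the container instead.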
HERE=$(dirname -- "$( readlink -f -- "$0")")
ROOT="$HERE/root.x86_64"
LXROOT="$HERE/lxroot"
unset LD_LIBRARY_PATH
unset CUDA_HOME
unset CNN_HOME
unset TF_CUDA_PATHS
unset XLA_FLAGS
export TZ='Europe/London' # helps with some issues
export $(dbus-launch) # start a session D-Bus and export its address/PID
C0=$(tput sgr0)    # reset
C1=$(tput setaf 1) # red
C2=$(tput setaf 2) # green
C3=$(tput setaf 3) # yellow
BIND=(
bind $HOME $HOME
bind /tmp /tmp
bind /run /run
bind /dev /dev
bind /proc /proc
)
# Make sure the user is registered in the container's passwd file - adds multi-user support!
if ! grep -q "^$USER:" "$ROOT/etc/passwd"; then
    # showuser is presumably a site-specific helper that prints the user's full name
    # (getent passwd "$USER" would be a portable alternative)
    UNAME=$(showuser $USER)
    NAME=$(echo "$UNAME" | sed -n "s/.*: \([A-Za-z ]*\) .*/\1/p")
    echo "Adding user $(tput setaf 2)$USER$(tput setaf 5) ($NAME)$(tput sgr0)"
    echo "$USER:x:$UID:$(id -g $USER):$NAME:$HOME:/bin/bash" >> "$ROOT/etc/passwd"
fi
# Each host gets its own ld cache
_LDDIR="$HERE/ld/$HOSTNAME"
mkdir -p "$_LDDIR"
BIND+=(bind /usr/host/ld "$_LDDIR")
if [ -f "/sys/module/nvidia/version" ]; then
_NVVER="$(cat /sys/module/nvidia/version)" # Host NV driver version
[[ -f "$_LDDIR/nvidia.conf" ]] && _CNVVER=$(cat "$_LDDIR/nvidia.conf") || _CNVVER="/$(tput bold)*no version*${C0}"
_CNVVER=${_CNVVER##*/} # Container version
if [ ! -d "$ROOT/usr/lib/nvidia/$_NVVER" ]; then
echo "${C1}CUDA will not work with server driver: container missing nvidia driver version ${C2}$_NVVER${C0}!"
else
if [ "$_NVVER" != "$_CNVVER" ]; then
echo "/usr/lib/nvidia/$_NVVER" > "$_LDDIR/nvidia.conf"
echo "${C3}Updated nvidia driver version from ${C2}$_CNVVER${C3} to ${C2}$_NVVER${C0}"
rm -fv "$_LDDIR/ld.so.cache" # update cache next
fi
fi
fi
# Rebuild cache if needed
if [ ! -f "$_LDDIR/ld.so.cache" ]; then
echo ${C3}Rebuilding ld cache..${C0}
rm "$ROOT/etc/ld.so.cache"
ln -s "/usr/host/ld/ld.so.cache" "$ROOT/etc/ld.so.cache"
$LXROOT rw "$ROOT" -rw bind /usr/host/ld "$_LDDIR" -- ldconfig -C /usr/host/ld/ld.so.cache
fi
# Make sure bind target directories exist in the container
for (( i = 1; i < ${#BIND[*]}; ++i )); do
    [[ ${BIND[$i-1]} == 'bind' ]] && mkdir -vp "${ROOT}/${BIND[$i]}"
done
if [[ "$1" == "" ]]; then
$LXROOT rw $ROOT -nwe ${BIND[@]} cd "$(pwd)" -- /bin/bash
else
$LXROOT rw $ROOT -nwe ${BIND[@]} cd "$(pwd)" -- $@
fi