Wildcarde · May 4, 2021 05:13
diff --git a/01_notes.md b/01_notes.md
diff --git a/capstart.sh b/capstart.sh
 # Start capture_control.sh through isi_for_array (-n9-13 presumably selects
 # nodes 9 through 13 -- confirm against the OneFS docs). The remote command is
 # detached with nohup and backgrounded; stdout and stderr go to one logfile path.
 pcap_dir=/ifs/data/Isilon_Support/pcaps
 isi_for_array -n9-13 "nohup /bin/bash ${pcap_dir}/capture_control.sh > ${pcap_dir}/logfile 2>&1 &"
diff --git a/capstop.sh b/capstop.sh
 # Interrupt the capture wrapper first, then tcpdump itself, via isi_for_array
 # (-n9-13 presumably selects nodes 9 through 13 -- confirm).
 for proc in capture_control.sh tcpdump; do
   isi_for_array -n9-13 killall -SIGINT "$proc"
 done
 # Give the processes a moment to exit before listing what is left.
 sleep 2
 # Show any capture-related processes still present in the ps output.
 isi_for_array -s ps auxw | egrep "capture_control|tcpdump"
diff --git a/capture_control.sh b/capture_control.sh
 #!/bin/bash
 # Run a tcpdump capture on interface bxe1, writing into a per-host directory
 # under /ifs/data/Isilon_Support/pcaps. The file name carries the start
 # timestamp and the host name.
 #
 # tcpdump is deliberately run in the foreground (no exec, no &): capstop.sh
 # signals this script by name, so the script must stay alive while capturing.
 node=$(hostname)
 outdir="/ifs/data/Isilon_Support/pcaps/${node}"

 # tcpdump -w fails if the target directory is missing, so create it up front.
 mkdir -p "$outdir" || exit 1

 # -s 640: snapshot length; -C 250 / -W 100: file size and file count limits
 # for the rotating output files (see tcpdump(1)).
 tcpdump -w "${outdir}/$(date '+%Y-%m-%dT%H.%M.%S').${node}.bxe1.pcap" -i bxe1 -s 640 -C 250 -W 100
diff --git a/check-nfs-datacapture.sh b/check-nfs-datacapture.sh
 #! /bin/env bash

 # This script handles a few things:
 # 1) stop the tcpdump capture running on the node
 # 2) clone pcap output to the sink admin folder
 # 3) remote into bucket and terminate pcap and related processes
 # also:
 ## reaper program to remove old stat block captures from isiperf: check if
 # they are older than 5 hours and, if so, remove them.
 # Files will be located in: /ifs/data/Isilon_Support/corecollect. This program
 # should call the reaper if it's still searching for problems and skip it if
 # the run has finished.
 # NOTE(review): no reaper call appears in this script yet.

 # Once the issue has been captured this flag file exists, so the script does
 # not keep attempting to run.
 if [ -f /tmp/nfsissuecaptured ]; then
   #logger "CAPTUREPCAP: Issue Captured Already"
   exit 0
 fi

 # Read the value written by the NFS monitoring script. The file may be missing
 # or empty (the initial file has no value in it); both leave errorcode empty.
 errorcode=""
 if [ -r /tmp/nfsmonitor.semaphore ]; then
   read -r errorcode < /tmp/nfsmonitor.semaphore
 fi

 # Only the exact value "1" triggers a capture. Empty, "0" (capture reset via
 # `echo 0`), or anything else means do nothing. The expansion is quoted so a
 # value containing whitespace or glob characters cannot make the test error
 # out and fall through to the capture path.
 if [ "$errorcode" != "1" ]; then
   #logger "CAPTUREPCAP: No Issue Found"
   exit 0
 fi

 logger "CAPTUREPCAP: Issue discovered, capturing"

 # Kill all local tcpdump tasks.
 killall -SIGINT tcpdump
 sleep 2 # make sure the kill finishes

 # Turn off this script for future runs. Done before the remote steps so a
 # failure there does not cause repeated capture attempts.
 touch /tmp/nfsissuecaptured

 # ssh into the bucket node and stop tcpdump there.
 ssh 10.2.147.221 'bash /ifs/data/Isilon_Support/pcaps/capstop.sh' ||
   logger "CAPTUREPCAP: remote capstop.sh failed"

 # Store the pcaps in our working area so they can be sent to Dell.
 rsync /tmp/pcaps/* /mnt/bucket/PNI-facilities/sw/gmcgrath/work/pcaps ||
   logger "CAPTUREPCAP: rsync of /tmp/pcaps failed"

 exit 0

 ## notes and references.

 #killall -SIGINT tcpdump
 #sleep 2
 #ps auxw |egrep "capture_control|tcpdump"

 #tcpdump command to run on node
 #tcpdump -w /tmp/pcaps/$(date '+%Y-%m-%dT%H.%M.%S').$(hostname).em1.pcap -i em1 -s 640 -C 250 -W 100 -Z root
diff --git a/check-nfs.sh b/check-nfs.sh
 #!/bin/bash
 # original script this was built off of: https://gist.github.com/cinsk/840ed553905cb6e8f0ae
 # Restrict command lookup to a fixed set of system directories for every
 # external command run below.
 PATH=/bin:/usr/bin:/usr/local/bin

 #######################################
 # Probe each given directory with stat and try to recover any that hang.
 # A probe that has not answered within 35 seconds is treated as a hung
 # mount: the stuck probe processes are killed, the hang is logged, "1" is
 # written to /tmp/nfsmonitor.semaphore, and the directory is lazily
 # unmounted and mounted again.
 # Arguments: one or more directories (empty arguments are skipped)
 # Outputs:   usage text on stdout when called without arguments;
 #            syslog messages via logger on hang and on successful remount
 # Returns:   1 if called without arguments, if the temp file cannot be
 #            created, or if any directory hung; 0 otherwise
 #######################################
 check-nfs () {
   # pidfile holds the pid of the probe subshell so it can be cleaned up
   local pidfile ret=0 dir status orphan children

   # arg checks (printf rather than a heredoc, so indentation cannot break
   # the terminator)
   if [ "$#" -eq 0 ]; then
     printf '%s\n' \
       '    usage: check-nfs NFS-DIRECTORY...' \
       '    Check if accessing any of NFS-DIRECTORY failed'
     return 1
   fi

   # Unpredictable name: a fixed /tmp/checknfs.PID could be pre-created or
   # symlinked by another user.
   pidfile=$(mktemp /tmp/checknfs.XXXXXX) || return 1

   for dir in "$@"; do
     [ -n "$dir" ] || continue

     # The probe runs in a process substitution; read gives up after 35s and
     # then returns a status greater than 128.
     read -r -t 35 < <(echo "$BASHPID" >"$pidfile"; stat -t "$dir" 2>/dev/null)
     status=$?

     if [ "$status" -gt 128 ]; then
       orphan=$(<"$pidfile")
       if [ -n "$orphan" ]; then
         children=$(ps --ppid "$orphan" -o pid=)
         # Intentionally unquoted: ps may print several pids, one per line.
         # shellcheck disable=SC2086
         [ -n "$children" ] && kill -9 $children
         kill -9 "$orphan"
       fi
       ret=1

       # if mount is hung notify
       logger "CHECKNFS: $dir hung; attempting to fix"
       echo 1 > /tmp/nfsmonitor.semaphore
       umount -l "$dir"
       sleep 1

       # only log the success notification if the remount works
       if mount "$dir"; then
         logger "CHECKNFS: $dir fixed"
       fi
     fi
   done

   rm -f -- "$pidfile"
   return "$ret"
 }

 # Parse the output of `mount -t nfs` and run check-nfs on each entry. The
 # third whitespace-separated field of every line is taken as the mount point.
 # read -r keeps any backslashes in the output intact.
 # NOTE(review): only type "nfs" is listed; confirm whether nfs4 mounts need
 # checking too.
 while read -r _ _ mount _; do
   check-nfs "$mount"
 done < <(mount -t nfs)
	# Interrupt the capture wrapper first, then tcpdump itself, via isi_for_array
	# (-n9-13 presumably selects nodes 9 through 13 -- confirm).
	for proc in capture_control.sh tcpdump; do
	  isi_for_array -n9-13 killall -SIGINT "$proc"
	done
	# Give the processes a moment to exit before listing what is left.
	sleep 2
	# NOTE(review): the pipe and the alternation are backslash-escaped here, so
	# the local shell passes them to isi_for_array as literal arguments instead
	# of building a local pipeline. This looks like a markdown-escaping artifact
	# of this copy -- confirm which form is intended.
	isi_for_array -s ps auxw \|egrep "capture_control\|tcpdump"
	#!/bin/bash
	# Run a tcpdump capture on interface bxe1, writing into a per-host directory
	# under /ifs/data/Isilon_Support/pcaps. The file name carries the start
	# timestamp and the host name.
	#
	# tcpdump is deliberately run in the foreground (no exec, no &): the stop
	# commands signal capture_control.sh by name, so it must stay alive while
	# capturing.
	node=$(hostname)
	outdir="/ifs/data/Isilon_Support/pcaps/${node}"

	# tcpdump -w fails if the target directory is missing, so create it up front.
	mkdir -p "$outdir" || exit 1

	# -s 640: snapshot length; -C 250 / -W 100: file size and file count limits
	# for the rotating output files (see tcpdump(1)).
	tcpdump -w "${outdir}/$(date '+%Y-%m-%dT%H.%M.%S').${node}.bxe1.pcap" -i bxe1 -s 640 -C 250 -W 100
	#! /bin/env bash

	# This script handles a few things:
	# 1) stop the tcpdump capture running on the node
	# 2) clone pcap output to the sink admin folder
	# 3) remote into bucket and terminate pcap and related processes
	# also:
	## reaper program to remove old stat block captures from isiperf: check if
	# they are older than 5 hours and, if so, remove them.
	# Files will be located in: /ifs/data/Isilon_Support/corecollect. This program
	# should call the reaper if it's still searching for problems and skip it if
	# the run has finished.
	# NOTE(review): no reaper call appears in this script yet.

	# Once the issue has been captured this flag file exists, so the script does
	# not keep attempting to run.
	if [ -f /tmp/nfsissuecaptured ]; then
	  #logger "CAPTUREPCAP: Issue Captured Already"
	  exit 0
	fi

	# Read the value written by the NFS monitoring script. The file may be missing
	# or empty (the initial file has no value in it); both leave errorcode empty.
	errorcode=""
	if [ -r /tmp/nfsmonitor.semaphore ]; then
	  read -r errorcode < /tmp/nfsmonitor.semaphore
	fi

	# Only the exact value "1" triggers a capture. Empty, "0" (capture reset via
	# `echo 0`), or anything else means do nothing. The expansion is quoted so a
	# value containing whitespace or glob characters cannot make the test error
	# out and fall through to the capture path.
	if [ "$errorcode" != "1" ]; then
	  #logger "CAPTUREPCAP: No Issue Found"
	  exit 0
	fi

	logger "CAPTUREPCAP: Issue discovered, capturing"

	# Kill all local tcpdump tasks.
	killall -SIGINT tcpdump
	sleep 2 # make sure the kill finishes

	# Turn off this script for future runs. Done before the remote steps so a
	# failure there does not cause repeated capture attempts.
	touch /tmp/nfsissuecaptured

	# ssh into the bucket node and stop tcpdump there.
	ssh 10.2.147.221 'bash /ifs/data/Isilon_Support/pcaps/capstop.sh' ||
	  logger "CAPTUREPCAP: remote capstop.sh failed"

	# Store the pcaps in our working area so they can be sent to Dell.
	rsync /tmp/pcaps/* /mnt/bucket/PNI-facilities/sw/gmcgrath/work/pcaps ||
	  logger "CAPTUREPCAP: rsync of /tmp/pcaps failed"

	exit 0

	## notes and references.

	#killall -SIGINT tcpdump
	#sleep 2
	#ps auxw \|egrep "capture_control\|tcpdump"

	#tcpdump command to run on node
	#tcpdump -w /tmp/pcaps/$(date '+%Y-%m-%dT%H.%M.%S').$(hostname).em1.pcap -i em1 -s 640 -C 250 -W 100 -Z root
	#!/bin/bash
	# original script this was built off of: https://gist.github.com/cinsk/840ed553905cb6e8f0ae
	# Restrict command lookup to a fixed set of system directories for every
	# external command run below.
	PATH=/bin:/usr/bin:/usr/local/bin

	#######################################
	# Probe each given directory with stat and try to recover any that hang.
	# A probe that has not answered within 35 seconds is treated as a hung
	# mount: the stuck probe processes are killed, the hang is logged, "1" is
	# written to /tmp/nfsmonitor.semaphore, and the directory is lazily
	# unmounted and mounted again.
	# Arguments: one or more directories (empty arguments are skipped)
	# Outputs:   usage text on stdout when called without arguments;
	#            syslog messages via logger on hang and on successful remount
	# Returns:   1 if called without arguments, if the temp file cannot be
	#            created, or if any directory hung; 0 otherwise
	#######################################
	check-nfs () {
	  # pidfile holds the pid of the probe subshell so it can be cleaned up
	  local pidfile ret=0 dir status orphan children

	  # arg checks (printf rather than a heredoc, so indentation cannot break
	  # the terminator)
	  if [ "$#" -eq 0 ]; then
	    printf '%s\n' \
	      'usage: check-nfs NFS-DIRECTORY...' \
	      'Check if accessing any of NFS-DIRECTORY failed'
	    return 1
	  fi

	  # Unpredictable name: a fixed /tmp/checknfs.PID could be pre-created or
	  # symlinked by another user.
	  pidfile=$(mktemp /tmp/checknfs.XXXXXX) || return 1

	  for dir in "$@"; do
	    [ -n "$dir" ] || continue

	    # The probe runs in a process substitution; read gives up after 35s and
	    # then returns a status greater than 128.
	    read -r -t 35 < <(echo "$BASHPID" >"$pidfile"; stat -t "$dir" 2>/dev/null)
	    status=$?

	    if [ "$status" -gt 128 ]; then
	      orphan=$(<"$pidfile")
	      if [ -n "$orphan" ]; then
	        children=$(ps --ppid "$orphan" -o pid=)
	        # Intentionally unquoted: ps may print several pids, one per line.
	        # shellcheck disable=SC2086
	        [ -n "$children" ] && kill -9 $children
	        kill -9 "$orphan"
	      fi
	      ret=1

	      # if mount is hung notify
	      logger "CHECKNFS: $dir hung; attempting to fix"
	      echo 1 > /tmp/nfsmonitor.semaphore
	      umount -l "$dir"
	      sleep 1

	      # only log the success notification if the remount works
	      if mount "$dir"; then
	        logger "CHECKNFS: $dir fixed"
	      fi
	    fi
	  done

	  rm -f -- "$pidfile"
	  return "$ret"
	}

	# Parse the output of `mount -t nfs` and run check-nfs on each entry. The
	# third whitespace-separated field of every line is taken as the mount point.
	# read -r keeps any backslashes in the output intact.
	# NOTE(review): only type "nfs" is listed; confirm whether nfs4 mounts need
	# checking too.
	while read -r _ _ mount _; do
	  check-nfs "$mount"
	done < <(mount -t nfs)