#!/bin/bash

## Monitoring EDAC errors on Linux systems
##  This monitor requires the installation of 'edac-utils'
##  11/19/2012 - cortrigl

#DEBUG=1

## Xymon column name, starting color and monitor version used in the status line
COLUMN='dimm'
COLOR='green'
VERSION='1.1'
## Set to 1 by the main dispatch when neither EDAC nor mcelog is available
NO_MECH=0
EDAC_BASEDIR='/sys/devices/system/edac/mc'
## Locate the optional helper binaries with the 'command -v' builtin rather
##  than the external 'which'; both variables are empty when the tool is
##  not installed.
EDAC_UTIL=$(command -v edac-util 2>/dev/null)
MCELOG=$(command -v mcelog 2>/dev/null)
## Collected errors, one "controller:dimm:type:count" entry per element
declare -a MEM_ERRS
DATE=$(date)

## Status text, full Xymon message, and "uncorrected error seen" flag
DESC=''
LINE=''
UEFLAG=0

## NOTE: Old "Mike" nodes (Tyan motherboards) are lacking the E75xx RASUM controller
##  which is where EDAC pulls DIMM data from.  Or, more succinctly, the controller
##  exists but isn't being reported back to the kernel for whatever reason.
##
## A possible workaround exists, namely the use of mcelog.  Currently (11/26/12),
##  it is difficult to write a routine for parsing mcelog data since mcelog only has
##  something to say when something breaks.  Will likely find some known bad DIMMs
##  and begin purposely failing a machine to collect this data.  This is TODO.
##
## Another NOTE: One big caveat with mcelog is that it only handles 32-bit equipment
##  as of kernel 2.6.30.  The parameter "CONFIG_X86_MCE" was introduced in 2.6.30.
##  Thus, these CentOS 5.5 rigs are SoL.  Time to upgrade?

## Report memory status from mcelog (used when EDAC is not available).
##  Runs the mcelog binary and looks only at whether it printed anything:
##    - no output  -> submit with the current COLOR (normal case)
##    - any output -> submit 'yellow' with the raw output attached, so that
##                    logged events are not silently dropped
##  Globals read:    MCELOG, DEBUG
##  Globals written: MCE_OUTPUT, DESC, COLOR, NO_MECH
##  In DEBUG mode nothing is submitted; findings are echoed instead.
function mcelog_report ()
{
	## TODO: parse mcelog --client
	if [[ ${DEBUG} ]]; then
		echo "No EDAC available: using mcelog."
	fi

	MCE_OUTPUT=$("${MCELOG}")
	if [[ -z "${MCE_OUTPUT}" ]]; then
		if [[ ${DEBUG} ]]; then
			echo "mcelog output is blank: all things normal?"
		else
			## Send it to Xymon
			DESC="MCELOG reports "
			xymon_submit
		fi
	else
		if [[ ${DEBUG} ]]; then
			echo "mcelog reported machine check events:"
			printf '%s\n' "${MCE_OUTPUT}"
		else
			## The output is not parsed yet (see TODO), so raise a warning
			##  and pass the raw text along for a human to read.
			COLOR='yellow'
			## MEM_ERRS stays empty on this path; NO_MECH=1 keeps
			##  xymon_submit from appending "no memory problems found".
			NO_MECH=1
			DESC="MCELOG reports machine check events:<br><pre>${MCE_OUTPUT}</pre>"
			xymon_submit
		fi
	fi
}

## Print the non-negative integer stored in file $1, or 0 when the file is
##  missing, unreadable or does not hold a plain number.
function edac_read_count ()
{
	local value=''
	if [[ -f "$1" && -r "$1" ]]; then
		read -r value < "$1"
	fi
	case "${value}" in
		''|*[!0-9]*) value=0 ;;
	esac
	## 10# forces base 10 so a leading zero is not taken as octal
	echo $(( 10#${value} ))
}

## Collect EDAC error counts into MEM_ERRS and hand them to xymon_submit.
##  If the edac-util binary is present its output is parsed line by line,
##  otherwise the counters under EDAC_BASEDIR are read directly.
##  Every MEM_ERRS element has the form "controller:dimm:type:count".
##  Globals read:    DEBUG, COLOR, EDAC_BASEDIR, EDAC_UTIL, UEFLAG
##  Globals written: MEM_ERRS, EDAC_UTIL_OUT, COLOR, DESC
function edac_report ()
{
	local line head tail rest cntrl dimm type count
	local mc_dir row_dir mc_ce mc_ue row_ce row_ue

	MEM_ERRS=()

	if [[ ${DEBUG} ]]; then
		echo "Color: $COLOR; edac_base: $EDAC_BASEDIR; edac-util: $EDAC_UTIL; ueflag: $UEFLAG"
	fi

	if [[ -f "$EDAC_UTIL" ]]; then
		EDAC_UTIL_OUT=$("${EDAC_UTIL}" 2>&1)
		if [[ "${EDAC_UTIL_OUT}" == "edac-util: No errors to report." ]]; then
			COLOR='green'
			if [[ ${DEBUG} ]]; then
				echo "should be catching a 'no error' output\n"
				echo "edac util output ::: ${EDAC_UTIL_OUT}\n"
				echo "sizeof MEM_ERRS: ${#MEM_ERRS[@]}"
			fi
		else
			## Read real lines (not whitespace-separated words) and leave
			##  the global IFS alone.  A report line is a list of ": "
			##  separated fields whose last field is "<count> <type> Errors...";
			##  the first field is the controller and the second, when
			##  present, names the csrow/DIMM.
			while IFS= read -r line; do
				[[ "${line}" == *": "* ]] || continue
				tail=${line##*: }
				head=${line%: *}
				count=${tail%% *}
				## Anything without a numeric count is not an error line
				case "${count}" in
					''|*[!0-9]*) continue ;;
				esac
				## Zero counts are not errors
				(( 10#${count} > 0 )) || continue
				rest=${tail#* }
				type=${rest%% *}
				cntrl=${head%%: *}
				if [[ "${head}" == *": "* ]]; then
					dimm=${head#*: }
					dimm=${dimm%%: *}
				else
					## Controller-level line: no csrow/DIMM field present
					dimm='unknown'
				fi
				MEM_ERRS+=("${cntrl}:${dimm}:${type}:${count}")
				if [[ ${DEBUG} ]]; then
					echo "$cntrl ::: $dimm ::: $type :: $count"
					echo "sizeof MEM_ERRS: ${#MEM_ERRS[@]}"
				fi
			done <<< "${EDAC_UTIL_OUT}"
		fi
	else
		## No EDAC utils?  No problem...
		##  We'll use the /sys structure to harvest EDAC error info.
		##  Paths are read directly, so the working directory is untouched.
		for mc_dir in "${EDAC_BASEDIR}"/*/; do
			mc_dir=${mc_dir%/}
			## Real directories only; also skips an unmatched glob pattern
			[[ -d "${mc_dir}" && ! -L "${mc_dir}" ]] || continue

			mc_ce=$(edac_read_count "${mc_dir}/ce_count")
			mc_ue=$(edac_read_count "${mc_dir}/ue_count")
			## Only descend when the controller-level counters are non-zero
			(( mc_ue > 0 || mc_ce > 0 )) || continue

			if [[ ${DEBUG} ]]; then
				echo "EDAC_BASEDIR: ${EDAC_BASEDIR}"
				echo "d: ${mc_dir}"
				echo "chan ue: ${mc_ue} ::: chan ce: ${mc_ce}"
				echo
			fi

			for row_dir in "${mc_dir}"/*/; do
				row_dir=${row_dir%/}
				[[ -d "${row_dir}" && ! -L "${row_dir}" ]] || continue

				## Subdirectories without counters simply read as 0
				row_ce=$(edac_read_count "${row_dir}/ce_count")
				row_ue=$(edac_read_count "${row_dir}/ue_count")

				if (( row_ce > 0 )); then
					MEM_ERRS+=("${mc_dir##*/}:${row_dir##*/}:Corrected:${row_ce}")
				fi
				if (( row_ue > 0 )); then
					MEM_ERRS+=("${mc_dir##*/}:${row_dir##*/}:Uncorrected:${row_ue}")
				fi

				if [[ ${DEBUG} ]]; then
					echo "c: ${row_dir}"
					echo "dimm ue: ${row_ue} ::: dimm ce ${row_ce}"
					echo
				fi
			done
		done
	fi

	## Send it to xymon
	DESC="EDAC reports "
	xymon_submit
}

## Build the Xymon status message from MEM_ERRS and send it.
##  Color rules:
##    - any uncorrected error present -> 'red'
##    - corrected errors only         -> 'yellow'
##    - MEM_ERRS empty                -> COLOR is used as the caller left it
##  Globals read:    MEM_ERRS, DEBUG, NO_MECH, MACHINE, COLUMN, DATE, VERSION,
##                   BB, BBDISP
##  Globals written: DESC, COLOR, UEFLAG, LINE
##  NOTE: outside DEBUG mode this ends in 'exec', so it does not return.
function xymon_submit
{
	local entry cntrl dimm type count slot_digits dimm_slot

	if [[ ${DEBUG} ]]; then
		echo "In xymon_submit: sizeof MEM_ERRS: ${#MEM_ERRS[@]}"
	fi

	## Fish through the array to find our errors
	if (( ${#MEM_ERRS[@]} > 0 )); then
		DESC="${DESC} memory errors found: <dashboard-break> <br>"
		for entry in "${MEM_ERRS[@]}"; do
			## Split "controller:dimm:type:count" without touching the
			##  global IFS
			IFS=':' read -r cntrl dimm type count <<< "${entry}"

			## Producers store the type as "Uncorrected"; also accept
			##  "uncorrectable" in either case
			case "${type}" in
				[Uu]ncorrect*) UEFLAG=1 ;;
			esac

			## Slot number is the trailing index of the name plus one
			##  (csrow0 -> 1, csrow10 -> 11); names without a trailing
			##  number are shown as-is
			slot_digits=${dimm##*[!0-9]}
			if [[ -n "${slot_digits}" ]]; then
				dimm_slot=$(( 10#${slot_digits} + 1 ))
			else
				dimm_slot=${dimm}
			fi

			DESC="${DESC}Controller: ${cntrl}<br>DIMM Slot: ${dimm_slot}<br>Type: ${type}<br>Number of errors: ${count}<br><br>"
		done

		## Check to see if we have uncorrectable (read: fatal) ECC errors
		##  Set Xymon to 'red' if yes, 'yellow' if no.
		if (( UEFLAG == 1 )); then
			COLOR='red'
		else
			COLOR='yellow'
		fi
	else
		## Otherwise, we're green (by default); skip the suffix when the
		##  caller flagged that its own DESC is already complete
		if (( NO_MECH == 0 )); then
			DESC="${DESC} no memory problems found"
		fi
	fi

	## Send the status to Xymon (or not)
	if [[ ${DEBUG} ]]; then
		echo "${DESC}"
		echo "Running in debug mode, no data sent to Xymon."
	else
		LINE="status ${MACHINE}.${COLUMN} ${COLOR} ${DATE} ${DESC}<br><br>Version: ${VERSION}"
		## BB and BBDISP are deliberately unquoted, as in the original
		##  invocation, so a command with arguments still splits.
		exec $BB $BBDISP "$LINE"
	fi
}

## Choose the reporting mechanism, in order of preference:
##   1. EDAC, when the kernel exposes at least controller mc0
##   2. mcelog, when the binary was found
##   3. neither: say so on stdout in DEBUG mode, otherwise send a 'clear'
##      status to Xymon explaining that nothing can be monitored
if [[ -d "${EDAC_BASEDIR}/mc0" ]]; then
	edac_report
elif [[ -f "${MCELOG}" ]]; then
	mcelog_report
elif [[ ${DEBUG} ]]; then
	echo "No mechanisms available for reporting on memory errors"
else
	COLOR='clear'
	DESC="This machine has no available mechanisms for monitoring DIMM errors.<br>"
	DESC+="If ECC memory is installed, it is likely that the BIOS is presenting a problem for EDAC.<br>"
	DESC+="If the machine has a 64-bit OS or a 32-bit OS with kernel >= 2.6.30, "
	DESC+="consider installing MCELOG.<br>"
	LINE="status ${MACHINE}.${COLUMN} ${COLOR} ${DATE} ${DESC}<br><br>Version: ${VERSION}"
	## Flag that no mechanism exists, then send it to xymon
	NO_MECH=1
	xymon_submit
fi