#!/bin/bash
## Monitoring EDAC errors on Linux systems
## This monitor prefers 'edac-utils' (edac-util); when that is not installed it
## reads the EDAC counters directly from sysfs, and when EDAC itself is not
## available it falls back to mcelog.
## 11/19/2012 - cortrigl
##
## Environment read by this script (not set here):
##   MACHINE, BB, BBDISP - used to build and send the Xymon status message
##   DEBUG               - any non-empty value prints the report instead of
##                         sending it

#DEBUG=1
COLUMN='dimm'
COLOR='green'
VERSION='1.1'
NO_MECH=0
EDAC_BASEDIR='/sys/devices/system/edac/mc'
EDAC_UTIL=$(command -v edac-util 2>/dev/null)
MCELOG=$(command -v mcelog 2>/dev/null)
declare -a MEM_ERRS=()   # entries are "controller:dimm:type:count"
DATE=$(date)
DESC=''
LINE=''
UEFLAG=0
NL=$'\n'

## NOTE: Old "Mike" nodes (Tyan motherboards) are lacking the E75xx RASUM controller
## which is where EDAC pulls DIMM data from. Or, more succinctly, the controller
## exists but isn't being reported back to the kernel for whatever reason.
##
## A possible workaround exists, namely the use of the mcelog. Currently (11/26/12),
## it is difficult to write a routine for parsing mcelog data since mcelog only has
## something to say when something breaks. Will likely find some known bad DIMMs
## and begin purposely failing a machine to collect this data. This is TODO.
##
## Another NOTE: One big caveat with mcelog is that it handles 32-bit equipment
## only as of kernel 2.6.30 (the original note named the kernel parameter
## "CONFIG_X86_RCE"; presumably CONFIG_X86_MCE is meant -- TODO confirm).
## Thus, these CentOS 5.5 rigs are SoL. Time to upgrade?

## Deliver the current COLOR/DESC to Xymon.
## Globals:  DEBUG, MACHINE, COLUMN, COLOR, DATE, DESC, VERSION, BB, BBDISP (read)
##           LINE (written)
## Outputs:  in debug mode prints DESC and a notice to stdout, sends nothing
## Returns:  0 in debug mode; 1 if MACHINE/BB/BBDISP are missing; otherwise the
##           exit status of the $BB command
function send_status {
  if [[ ${DEBUG} ]]; then
    echo "${DESC}"
    echo "Running in debug mode, no data sent to Xymon."
    return 0
  fi

  LINE="status ${MACHINE}.${COLUMN} ${COLOR} ${DATE} ${DESC}${NL}${NL}Version: ${VERSION}"

  ## Without these the send would fail with an unhelpful "command not found"
  ## (or submit a status for host ""), so say what is actually wrong.
  if [[ -z ${BB} || -z ${BBDISP} || -z ${MACHINE} ]]; then
    echo "${0##*/}: BB, BBDISP and MACHINE must be set in the environment; status not sent" >&2
    return 1
  fi

  ## BB and BBDISP are left unquoted on purpose, exactly as the original
  ## invocation did, so a value carrying extra words still splits the same way.
  # shellcheck disable=SC2086
  ${BB} ${BBDISP} "${LINE}"
}

## Turn MEM_ERRS into a status colour and description, then send it.
## Any "Uncorrected" entry makes the status red, corrected-only entries make it
## yellow, and with no entries COLOR is left as the caller set it.
## Globals:  MEM_ERRS, NO_MECH, DEBUG (read); DESC, COLOR, UEFLAG (written)
## Returns:  the status of send_status
function xymon_submit {
  local entry ctrl dimm type count slot

  if [[ ${DEBUG} ]]; then
    echo "In xymon_submit: sizeof MEM_ERRS: ${#MEM_ERRS[@]}"
  fi

  if (( ${#MEM_ERRS[@]} > 0 )); then
    DESC="${DESC} memory errors found:${NL}"
    for entry in "${MEM_ERRS[@]}"; do
      ## IFS is changed for this one read only; the global IFS stays untouched.
      IFS=':' read -r ctrl dimm type count <<< "${entry}"

      ## Both producers store the type as "Corrected" or "Uncorrected".
      if [[ ${type} == [Uu]ncorrect* ]]; then
        UEFLAG=1
      fi

      ## Slot number is the trailing index of the csrow name plus one. Use the
      ## whole trailing number (csrow10 -> 11), forced to base 10.
      if [[ ${dimm} =~ ([0-9]+)$ ]]; then
        slot=$(( 10#${BASH_REMATCH[1]} + 1 ))
      else
        slot=${dimm:-unknown}
      fi

      DESC="${DESC}Controller: ${ctrl}${NL}DIMM Slot: ${slot}${NL}Type: ${type}${NL}Number of errors: ${count}${NL}${NL}"
    done

    ## Uncorrectable (read: fatal) ECC errors are 'red', anything else 'yellow'.
    if (( UEFLAG == 1 )); then
      COLOR='red'
    else
      COLOR='yellow'
    fi
  elif (( NO_MECH == 0 )); then
    ## Otherwise, we're green (by default)
    DESC="${DESC} no memory problems found"
  fi

  send_status
}

## Report via mcelog when EDAC is not available.
## TODO: parse mcelog --client
## Globals:  MCELOG, DEBUG (read); MCE_OUTPUT, DESC, COLOR (written)
## Returns:  the status of the submit call (0 in debug mode)
function mcelog_report () {
  if [[ ${DEBUG} ]]; then
    echo "No EDAC available: using mcelog."
  fi

  MCE_OUTPUT=$("${MCELOG}")

  if [[ -z "${MCE_OUTPUT}" ]]; then
    if [[ ${DEBUG} ]]; then
      echo "mcelog output is blank: all things normal?"
    else
      ## Send it to Xymon
      DESC="MCELOG reports "
      xymon_submit
    fi
    return
  fi

  ## mcelog printed something. The records are not parsed yet (see TODO), so
  ## pass them through verbatim as a warning rather than sending nothing.
  COLOR='yellow'
  DESC="MCELOG reports machine check events:${NL}${MCE_OUTPUT}${NL}"
  send_status
}

## Print the non-negative integer stored in file $1, or 0 when the file is
## missing, unreadable, or does not hold a plain number.
function read_count {
  local value=''
  if [[ -r $1 ]]; then
    read -r value < "$1"
  fi
  if [[ ${value} =~ ^[0-9]+$ ]]; then
    echo "$(( 10#${value} ))"
  else
    echo 0
  fi
}

## Parse edac-util report text ($1) and append one MEM_ERRS entry per line
## that carries a non-zero error count. Recognised lines end in
## "<count> Corrected Errors" or "<count> Uncorrected Errors" (optionally
## followed by more text) and are prefixed by one to three ": "-separated
## names: controller, csrow, channel. Lines without a csrow name are stored
## with "unknown" as the DIMM. Anything else is ignored.
## Globals:  DEBUG (read); MEM_ERRS (appended)
function parse_edac_util_output {
  local line prefix ctrl dimm rest type count
  local pattern='^(.*): ([0-9]+) (Corrected|Uncorrected) Errors'

  while IFS= read -r line; do
    [[ ${line} =~ ${pattern} ]] || continue
    prefix=${BASH_REMATCH[1]}
    count=$(( 10#${BASH_REMATCH[2]} ))
    type=${BASH_REMATCH[3]}
    (( count > 0 )) || continue

    ctrl='' dimm='' rest=''
    IFS=':' read -r ctrl dimm rest <<< "${prefix//: /:}"
    MEM_ERRS+=("${ctrl}:${dimm:-unknown}:${type}:${count}")

    if [[ ${DEBUG} ]]; then
      echo "${ctrl} ::: ${dimm:-unknown} ::: ${type} :: ${count}"
      echo "sizeof MEM_ERRS: ${#MEM_ERRS[@]}"
    fi
  done <<< "$1"
}

## Harvest error counts straight from the EDAC sysfs tree.
## For every controller directory under EDAC_BASEDIR whose ce_count or
## ue_count is non-zero, each sub-directory that has its own ce_count/ue_count
## contributes "Corrected"/"Uncorrected" entries for its non-zero counters.
## Globals:  EDAC_BASEDIR, DEBUG (read); MEM_ERRS (appended)
function harvest_sysfs_errors {
  local mc_dir dimm_dir mc_ce mc_ue dimm_ce dimm_ue

  for mc_dir in "${EDAC_BASEDIR}"/*; do
    ## Real directories only, like the 'find -type d' this replaces.
    [[ -d ${mc_dir} && ! -L ${mc_dir} ]] || continue
    mc_ce=$(read_count "${mc_dir}/ce_count")
    mc_ue=$(read_count "${mc_dir}/ue_count")
    (( mc_ce > 0 || mc_ue > 0 )) || continue

    if [[ ${DEBUG} ]]; then
      echo "EDAC_BASEDIR: ${EDAC_BASEDIR}"
      echo "d: ${mc_dir}"
      echo "chan ue: ${mc_ue} ::: chan ce: ${mc_ce}"
      echo
    fi

    for dimm_dir in "${mc_dir}"/*; do
      [[ -d ${dimm_dir} && ! -L ${dimm_dir} ]] || continue
      ## Skip sub-directories that carry no counters at all.
      [[ -r ${dimm_dir}/ce_count || -r ${dimm_dir}/ue_count ]] || continue
      dimm_ce=$(read_count "${dimm_dir}/ce_count")
      dimm_ue=$(read_count "${dimm_dir}/ue_count")

      if (( dimm_ce > 0 )); then
        MEM_ERRS+=("${mc_dir##*/}:${dimm_dir##*/}:Corrected:${dimm_ce}")
      fi
      if (( dimm_ue > 0 )); then
        MEM_ERRS+=("${mc_dir##*/}:${dimm_dir##*/}:Uncorrected:${dimm_ue}")
      fi

      if [[ ${DEBUG} ]]; then
        echo "c: ${dimm_dir}"
        echo "dimm ue: ${dimm_ue} ::: dimm ce ${dimm_ce}"
        echo
      fi
    done
  done
}

## Collect EDAC errors (edac-util first, sysfs otherwise) and submit them.
## Globals:  EDAC_UTIL, EDAC_BASEDIR, DEBUG (read)
##           EDAC_UTIL_OUT, MEM_ERRS, COLOR, DESC (written)
## Returns:  the status of xymon_submit
function edac_report () {
  local have_answer=0

  if [[ ${DEBUG} ]]; then
    echo "Color: $COLOR; edac_base: $EDAC_BASEDIR; edac-util: $EDAC_UTIL; ueflag: $UEFLAG"
  fi

  if [[ -f "${EDAC_UTIL}" ]]; then
    EDAC_UTIL_OUT=$("${EDAC_UTIL}" 2>&1)
    if [[ "${EDAC_UTIL_OUT}" == "edac-util: No errors to report." ]]; then
      COLOR='green'
      have_answer=1
      if [[ ${DEBUG} ]]; then
        echo "should be catching a 'no error' output"
        echo "edac util output ::: ${EDAC_UTIL_OUT}"
        echo "sizeof MEM_ERRS: ${#MEM_ERRS[@]}"
      fi
    else
      parse_edac_util_output "${EDAC_UTIL_OUT}"
      if (( ${#MEM_ERRS[@]} > 0 )); then
        have_answer=1
      fi
    fi
  fi

  ## No EDAC utils, or nothing usable came out of them? No problem...
  ## We'll use the /sys structure to harvest EDAC error info
  if (( have_answer == 0 )); then
    harvest_sysfs_errors
  fi

  ## Send it to xymon
  DESC="EDAC reports "
  xymon_submit
}

## So if EDAC knows about the memory controller, let's go ahead and attempt to use it
## Otherwise go right to mcelog (if available)
## By design we first try EDAC if mc0 exists, then MCELOG
## If neither one is available, set a flag and report that to Xymon
function main {
  if [[ -d "${EDAC_BASEDIR}/mc0" ]]; then
    edac_report
  elif [[ -f "${MCELOG}" ]]; then
    mcelog_report
  elif [[ ${DEBUG} ]]; then
    echo "No mechanisms available for reporting on memory errors"
  else
    COLOR='clear'
    DESC="This machine has no available mechanisms for monitoring DIMM errors.${NL}"
    DESC="${DESC}If ECC memory is installed, it is likely that the BIOS is presenting a problem for EDAC.${NL}"
    DESC="${DESC}If the machine has a 64-bit OS or a 32-bit OS with kernel >= 2.6.30, "
    DESC="${DESC}consider installing MCELOG.${NL}"
    ## Send it to xymon; NO_MECH keeps xymon_submit from appending
    ## " no memory problems found" to this explanation.
    NO_MECH=1
    xymon_submit
  fi
}

main "$@"