#!/bin/sh

## Program Name: scancluster
##
## Runs arbitrary commands on cluster nodes and
## collects interesting lines from system log
## files across the nodes of a beowulf cluster
## and writes them to a timestamped output file.
##
## All commands are added to an internal database
## of commands, and are accessible via an index,
## and may be combined for complex reports.
##
## In most cases, this script must be run on the
## gateway node of the beowulf cluster.
##
## Note that nodes are assumed to be named
## using a simple basename and index schema.
##
## RUN THIS SCRIPT WITH THE -help OPTION FOR A
## USEFUL SUMMARY OF IT'S BEHAVIOR!
##
## In addition to this script, it is a good idea
## to run:
##
##    recon -v bootschema.lam
##
##    and
##
##    lamboot -d bootschema.lam  && lamclean
##
## to diagnose lam related non-system issues.
##
## By: Phil Ehrens for The LIGO Lab at Caltech 2004
##

## The next line tells sh to execute the script using tclshexe \
exec tclsh "$0" ${1+"$@"}

# declare all options required for this script and for embedded
# commands here. Comment the options in the 'usage' procedure
# below!
set ::option_defaults [ list \
                        -help                    0 \
                        -basename             node \
                        -nodes               1-400 \
                        -machines         [ list ] \
                        -taillines              10 \
                        -asynchronous            0 \
                        -user           $env(USER) \
                        -syslog  /var/log/messages \
                        -logpat 'oops|error|panic' \
                        -runlog    scancluster.log \
                        -commands     [ list 1 2 ] \
                        -documentation "no comment given for next cmd" \
                        -useansicolors           1 \
                      ]

## ******************************************************** 
##
## Name: usage 
##
## Description:
##
## Parameters:
##
## Usage:
##
## Comments:
##

proc usage { args } {
     set commands [ print_commands ]
     puts stderr "
        USAGE:
               scancluster -option1 value -option2 value ...

           Where available options are:
 
               -basename       node name base (default: node)
               
               -nodes          node indices, cslists and ranges
                               i.e. \"1,3,5,10-20\"
               
               -machines       a list of machine names which, if this
                               option is used, overrides the -basename
                               and -nodes options, causing the script
                               to iterate over the explicitly declared
                               list of machine names.
               
               -taillines      number of lines of log tail to report
                               (default: 10)
              
               -asynchronous   run tasks concurrently in background
                               logging final state in order that the
                               commands return.
                               \$node.tmp files will be created as the
                               tasks progress, and final log entries
                               will be made when completed.
                               this makes the progress of the job more
                               difficult to follow, but for things like
                               badblocks, which can run for a very long
                               time, it is useful.
                               (default: 0 - set to 1 for concurrency)

               -user           user to run as (defaults to USER)
               
               -syslog         name of system log file to query
                               (default: /var/log/messages)
               
               -logpat         interesting log line filter egrep regexp
                               (default: 'oops|error|panic'
               
               -runlog         base name of log file this program will
                               write to. A timestamp will be appended
                               as an extension.
                               (default: scancluster.log)
                               setting '-runlog stdout' will cause output
                               to be written to stdout.
               
               -commands       list of commands to run
                               (default: \"1 2\")

                   * Running the script like this, for example:

                     ./scancluster -commands \"1 2 {ps -al}\" -documentation \"ps -al\"

                   Will add a new command and a description of the command
                   AND will execute scancluster running commands 1, 2 and
                   the new command.
               
               -documentation  documentation line for new command, if
                               any was given.
               
               -help           displays this message, but does so using
                               the values of any other arguments to
                               update internal information first, so that
                               values of options are displayed as they
                               would be used in the command summary.
                               This is useful for debugging.
               
               -useansicolors  adds some color to the output for smart
                               terminals.
                               (default: 0 - set to 1 to colorize!)

     CURRENTLY AVAILABLE COMMANDS: (command '0' is prepended to all
                                    other commands internally.)
               $commands\n\n"
     exit
}
## ******************************************************** 

## ******************************************************** 
##
## Name: numRange
##
## Description:
## Expand embedded numerical ranges, i.e. 12-24 in
## { 1 2 3 4 12-24 } in place to create lists of
## integers, i.e. { 12 13 14 15 16... }
## Tries to handle all sorts of badness.
##
## Parameters:
## numbers - a Tcl list of numbers and ranges.
##
## Usage:
##        numRange "1 2 3 4 5-12 13..21, 23, 24"
## Comments:
##

proc numRange { numbers } {
     set nums {}
     ;## try to handle comma delimited lists, yeccchh.
     regsub -all {\,\s*} $numbers " " numbers
     ;## and accomodate that other way of doing things.
     regsub -all {\.\.} $numbers "-" numbers
     foreach elem $numbers {
        set elem [ string trim $elem ]
        if { [ regexp {^(\d+)-(\d+)$} $elem -> begin end ] } {
           if { $end < $begin } {
              return -code error "Bad range: '$begin-$end'"
              }
           for { set i $begin } { $i <= $end } { incr i } {
               lappend nums $i
               }
           } else {
           lappend nums $elem
           }
        }
     return $nums   
}
## ******************************************************** 

## ******************************************************** 
##
## Name: command_manager 
##
## Description:
##
## Parameters:
##
## Usage:
##       Running the script like this, for example:
##
##         ./scancluster -commands "1 2 {ps -al}" -documentation "ps -al"
##
##       Will add a new command and a description of the command
##       AND will execute scancluster running commands 1, 2 and
##       the new command.
##
## Comments:
# #

proc command_manager { newcmd { comment none }  } {
     
     if { [ catch {   
        set available [ list ]
        set cmd_rx {set\s+::Command\((\d+)\)\s+}
        set script $::argv0
        set fid [ open $script r ]
        set old [ read $fid [ file size $script ] ]
        close $fid
        set fid [ open $script w ]
        set done  0
        foreach line [ split $old "\n" ] {
           if { [ regexp $cmd_rx $line -> n ] } {
              set done $n
              append available "$line\n"
           } elseif { $done > 0 && ! [ string match #* $line ] } {   
              set newcmd "set ::Command([ incr done ]) \"$newcmd\""
              eval $newcmd
              append available "$newcmd\n"
              puts $fid "# $comment"
              puts $fid "$newcmd"
              lappend ::_commands $done
              set done 0
           }
           puts $fid $line
        }
        close $fid
     } err ] } {
        return -code error "command_manager: $err"
     }
}
## ******************************************************** 

## ******************************************************** 
##
## Name: parseopts
##
## Description:
##
## Usage:
##
## Comments:
## An ambiguous option (one that matches more than one
## default) will throw an exception.

proc parseopts { inputs } {
     set matches [ list ]
     ;## get inputs and massage them into a well-formed list
     regsub -all -- {\s+} $inputs { } inputs
     regexp {^\{(.+)\}$}  $inputs  -> inputs

     ;## and GO! (trimming loose spaces from values)
     foreach [ list option default ] $::option_defaults {
        
        set option [ string trim $option - ]
        
        set matched 0

        foreach { name value } $inputs {
           set name [ string trim $name ]
           regsub -- {^[-]+} $name {} name
           set value [ string trim $value ]
           
           if { [ string length $value ] == 0 && \
              ! [ string equal h $name ] } {
              set err "parseopts: option \"$name\" is missing value!"
              return -code error $err
           }

           
           if { [ unrecognised_option $name ] } {
              set err    "unrecognised option: '-$name'\n"
              append err "run '$::argv0 -h' for help\n"
              return -code error $err
           }
           
           if { [ regexp -nocase -- ^$name $option ] } {
              ;## next test fails if ambiguous item is last
              ;## option on command line...
              if { [ lsearch -exact $matches $name ] > -1 } {
                 set err "parseopts: ambiguous option: \"$name\""
                 return -code error $err
              }
              
              lappend matches $name
              set ::$option $value
              set matched 1
              break
           }
        }
        if { $matched == 0 } {
           set ::$option $default
        }
     }   
}
## ********************************************************                       
## ******************************************************** 
##
## Name: unrecognised_option
##
## Description:
##
## Parameters:
##
## Usage:
##
## Comments:
##

proc unrecognised_option { useropt } {
     
     if { [ catch {
        set unmatched 1
        foreach [ list option default ] $::option_defaults {
           if { [ regexp -nocase -- ^-$useropt $option ] } {
              set unmatched 0
              break
           }
        }
     } err ] } {
        return -code error "unrecognised_option: $err"
     }
     return $unmatched
}
## ******************************************************** 

## ******************************************************** 
##
## Name: print_commands
##
## Description:
## Helper function for usage procedure.
##
## Parameters:
##
## Usage:
##
## Comments:
##

proc print_commands { args } {
     
     if { [ catch {
        foreach index [ lsort [ array names ::Command ] ] {
           set command $::Command($index)
           append text "$index:  $command\n               "
        }
     } err ] } {
        return -code error "print_commands: $err"
     }
     return $text
}
## ******************************************************** 

## ******************************************************** 
##
## ANSI Coloring stuff
##

set ansimap [ list \
            normal  0 bold    1 \
            light   2 blink   5 \
            invert  7 \
            black  30 red    31 \
            green  32 yellow 33 \
            blue   34 purple 35 \
            cyan   36 white  37 \
            Black  40 Red    41 \
            Green  42 Yellow 43 \
            Blue   44 Purple 45 \
            Cyan   46 White  47 ]

proc + { args } {
     if { [ string equal 1 $::useansicolors ] } {
        set t 0
        foreach i $args {
           set ix [ lsearch -exact $::ansimap $i ]
           if { $ix > -1 } {
              lappend t [ lindex $::ansimap [ incr ix ] ]
           }
        }
        return "\033\[[join $t {;}]m"
     } else {
        return [ list ]
     }
}
## ******************************************************** 

## ******************************************************** 
##
## Name: shellpipe
##
## Description:
## Shell wrapper that redirects stderr to stdout.
##
## Returns as soon as data are available OR after a
## timeout, whichever comes first.
##
## Parameters:
##
## Usage:
##
##  cmd must be listified
##
##  subcommands passed to ssh MUST be enclosed in
##  single quotes!!
##
## Comments:
## Can only return the FIRST error from the command
##

proc shellpipe { cmd node i } {
     
     if { [ catch {
        set cmd [ subst -nocommands $::Command(0) ]
        set fid [ open "|$cmd 2>@stdout" w+ ]
        fconfigure $fid -blocking off
        fconfigure $fid -buffering line
        
        fileevent $fid readable [ list handle $fid $node $i ]
        handle $fid $node $i
        
     } err ] } {
        catch { close $fid }
        set ::data($node,$i) "shellpipe: $err"
        handle ERROR $node
     }
     return $fid
}
## ******************************************************** 

## ******************************************************** 
##
## Name: handle 
##
## Description:
##
## Parameters:
##
## Usage:
##
## Comments:
##

proc handle { fid node i } {
     
     if { [ catch {
        
        if { [ string equal ERROR $fid ] } {
           set data ERROR
        } else {   
           set data [ read $fid ]
           if { [ string length $data ] } {
              append ::data($node,$i) $data
           }
        }
        
        if { [ string equal ERROR $fid ] || [ eof $fid ] } {
           catch { close $fid }
           puts stderr \
              "$node task $i: [ + bold purple ]done![ + normal ]"
           report $node $data 1
        } elseif { [ string length $data ] } {
           report $node $data 0
        }
        
     } err ] } {
        catch { close $fid }
        report $node  "handle: $err" 1
     }
}
## ******************************************************** 

## ******************************************************** 
##
## Name: report
##
## Description:
##
## Parameters:
##
## Usage:
##
## Comments:
##

proc report { node data done } {
     
     if { [ catch {
        if { $done == 1 } {
           if { [ info exists ::tempfilefid($node) ] } {
              set fid $::tempfilefid($node)
              close $fid
              unset ::tempfilefid($node)
           }
           foreach name [ array names ::data $node,* ] {
              set data $::data($name)
              puts $::logfid $data
              unset ::data($name)
           }
           file delete -force ${node}.tmp
           if { [ llength [ array names ::data ] ] == 0 } {
              # there was a race condition when hosts
              # were unreachable.
              after 1000 finish
           }
        } else {
           if { ! [ info exists ::tempfilefid($node) ] } {
              set fid [ open ${node}.tmp a+ ]
              fconfigure $fid -buffering line
              set ::tempfilefid($node) $fid
           }
           set fid $::tempfilefid($node)
           puts $fid $data
           flush $fid
        }
     } err ] } {
        catch { close $fid }
        return -code error "report: $err"
     }
}
## ******************************************************** 

## ******************************************************** 
##
## Name: finish 
##
## Description:
##
## Parameters:
##
## Usage:
##
## Comments:
##

proc finish { args } {
     if { [ llength [ array names ::data ] ] == 0 } {
        set ::DONE 1
      }
}
## ******************************************************** 

## ******************************************************** 
##
## Name: concurrent
##
## Description:
##
## Parameters:
##
## Usage:
##
## Comments:
##

proc concurrent { node } {
     set i -1
     foreach [ list running msg ] \
        [ sockdiagnostic $node ] { break }
     if { $running } {
        foreach cmd [ lsort -unique $::_commands ] {
           incr i
           set cmd [ set ::Command($cmd) ]
           set ::data($node,$i) \
              "[ + bold red ]$node *************** $node[ + normal ]\n"
           append ::data($node,$i) \
              "[ + bold blue ]COMMAND: $cmd[ + normal ]\n"
           shellpipe $cmd $node $i
        }
     } else {
        puts stderr \
              "\n$node: [ + bold red ]ERROR![ + normal ]"
        set ::data($node,$i) \
           "[ + bold red ]$node *************** $node[ + normal ]\n"
        append ::data($node,$i) \
           "[ + bold blue ]SOCKET DIAGNOSTIC FAILURE: "
        append ::data($node,$i) "[ + bold red ]$msg[ + normal ]\n"
        report $node $msg 1
     }
}
## ******************************************************** 

## ******************************************************** 
##
## Name: nonconcurrent
##
## Description:
##
## Parameters:
##
## Usage:
##
## Comments:
##

proc nonconcurrent { node } {
     set fid $::logfid
     puts $fid \
        "[ + bold red ]$node *************** $node[ + normal ]"
     foreach [ list running msg ] \
        [ sockdiagnostic $node ] { break }
     if { $running } {
        foreach cmd [ lsort -unique $::_commands ] {
           set cmd [ set ::Command($cmd) ]
           puts $fid "[ + bold blue ]COMMAND: $cmd[ + normal ]\n"
           catch { set data [ eval exec $::Command(0) ] } data
           puts $fid "$data\n"
        }
     } else {
        puts $fid \
           "[ + bold blue ]SOCKET DIAGNOSTIC FAILURE: [ + bold red ]$msg[ + normal ]\n"
 
     }
}
## ******************************************************** 

## ******************************************************** 
##
## Name: create_nodelist
##
## Description:
##
## Parameters:
##
## Usage:
##
## Comments:
##

proc create_nodelist { args } {
     
     if { [ catch {
        set nodelist [ list ]
        if { [ llength $::machines ] > 0 } {
           set nodelist $::machines
        } else {
           set nodes [ numRange $::nodes ]
           foreach node $nodes {
              lappend nodelist $::basename$node
           }
        }
     } err ] } {
        return -code error "[ myName ]: $err"
     }
     return $nodelist
}
## ******************************************************** 

## ******************************************************** 
##
## Name: sockdiagnostic
##
## Description:
## Try to diagnose machines ability to function within the
## LDAS system. The ping times for this procedure are
## comparable to those of the UNIX ping command.
##
## Parameters:
##
## Usage:
##
## Comments:
## Uses the default ssh port to determine whether a machine
## is alive and able to talk.

proc sockdiagnostic { host { port 22 } } {
     set running 1
     set msg "ok"
     set timeout 2000
     
     ;## parse hostname from foo@hostname:NNN
     ;## cannot recall why we need to do this!!
     if { [ regexp {\@([^\:]+)} $host -> tmp ] } {
        set host $tmp
     }
     
     ;## calculate time periods equal to 1%, 10% and 100%
     ;## of the ping timeout panic limit.
     set short  [ expr { round($timeout / 100.0) } ]
     set medium [ expr { $short * 9 } ]
     set long   [ expr { $short * 90 } ]
     
     if { [ catch {
        
        set sid [ socket -async $host $port ]
        
        ;## three stage ping response eval. once after 1% of
        ;## the panic limit, again after 10%, then at the limit.
        after $short
        if { [ catch { fconfigure $sid -peername } ] } {
           after $medium
           if { [ catch { fconfigure $sid -peername } ] } {
              after $long
              if { [ catch { fconfigure $sid -peername } ] } {
                 set running 0
                 set msg "$host is down or has no service"
                 append msg " on port $port"            
              } else {
                 set msg "$host ping more than ${short}ms and "
                 append msg "less than ${timeout}ms"
                 puts stderr $msg
              }
           }   
        }
        
        if { $running == 1 } {
           set err [ fconfigure $sid -error ]
           if { [ string length $err ] } {
              set running 0
              set msg "error reported on ${host}:${port}: $err"
           }
        }
     } err ] } {
        set running 0
        set msg "host $host unreachable"
     }
     catch { ::close $sid }
     return [ list $running $msg ]
}
## ********************************************************

## ******************************************************** 
##
## Name: openlog
##
## Description:
##
## Parameters:
##
## Usage:
##
## Comments:
##

proc openlog { args } {
     # the output will get a unique filename with a timestamp
     if { [ string equal stdout $::runlog ] } {
        if { $::asynchronous == 1 } {
           set err "I can't write to STDOUT in ASYNCHRONOUS MODE!!"
           puts stderr "\n\n$err\n\n"
           exit
        }
        set ::logfid stdout
     } else {
        set ::mylog $::runlog.[ clock seconds ]
        puts stderr "output will be written to: '$::mylog'"
        set ::logfid [ open $::mylog w ]
     }
}
## ******************************************************** 

## ******************************************************** 
proc bgerror { msg } {
     set trace {}
     catch { set trace $::errorInfo }
     if { [ string length $trace ] } {
        set msg $trace
     }
     
     set time [ clock format [ clock seconds ] -format "%m/%d/%y-%r %Z" ]
     puts stderr "$time $msg"
}
## ******************************************************** 

parseopts $argv

;## ******************************************************** 
;## ************** BEGIN COMMAND DATABASE ******************
;## ******************************************************** 
# These are the remotely executable commands available.
# Note that Command '0' is the rsh base command.
# An ssh base command might be:
# ssh -x -n -obatchmode=yes \$node \$cmd 2>@stdout
# -n can be -odontreadstdin=yes for clarity.
# or not...
set ::Command(0) "rsh -l $user \$node \$cmd"
# find critical system log file entries
set ::Command(1) "egrep -i $logpat $syslog"
# report the last N log file entries unfiltered
set ::Command(2) "tail -n $taillines $syslog"
# find all lamd processes and get useful data about them
set ::Command(3) "ps -Ao fname,pid,user,etime,state | grep lamd"
# find all search users processes and get useful data about them
set ::Command(4) "ps -Ao fname,pid,user,etime,state | grep search"
# date
set ::Command(5) "date +'%A %B %d %Y %r %Z'"
;## ********************************************************
;## **************** END COMMAND DATABASE ******************
;## ******************************************************** 

## ******************************************************** 
##
## Name: main loop
##
## Description:
## Bare code. Not a procedure!
##
## Parameters:
##
## Usage:
##
## Comments:
##

foreach command $commands {
   if { [ regexp {^[0-9]+$} $command ] } {
      lappend ::_commands $command
   } else {
      command_manager $command $documentation
   }   
}

# usage if -h
if { [ regexp -nocase -- {\s+-+h} $::argv ] || \
     [ regexp -nocase -- {^-+h}   $::argv ] } {
   usage
}

# the output will get a unique filename with a timestamp
openlog

set ::nodelist [ create_nodelist ]

puts -nonewline stderr "scanning: "

set i 0

foreach node $::nodelist {
   if { $::asynchronous == 1 } {
      concurrent    $node
   } elseif { $::asynchronous == 0 } {
      nonconcurrent $node
   } else {
      puts stderr "\n\n-asychronous option can ONLY be 0 or 1.\n\n"
      exit
   }
   if { ! [ string equal stdout $runlog ] } {
      incr i
      if { [ expr {$i % 10} ] } {
         puts -nonewline stderr [ + bold blue ]*[ + normal ]
      } else {
         puts -nonewline stderr [ + bold red ]$i[ + normal ]
      }
   }   
}

if { $::asynchronous == 0 } {
   puts stderr " [ +  bold red ]DONE![ + normal ]"
   close $::logfid
   exit
} else {
   puts stderr " [ + bold red ]WAITING...[ + normal ]"
   vwait ::DONE
   puts stderr " [ +  bold red ]DONE![ + normal ]"
   close $::logfid
   exit
}
## ******************************************************** 

