#!/usr/bin/env ruby
#
# checkubc.rb: A script to monitor the health of OpenVZ containers by reading
# user_beancounters
#
# This script reads /proc/user_beancounters and will generate warnings if
# certain thresholds are crossed. It also stores maxheld and failcnt in a
# gdbm file and warns if those numbers grow. You may tweak the thresholds
# and the location of the gdbm file by editing the constants defined below.
#
# Depends on: gdbm, Time, Net::SMTP
# All of these should be part of your basic Ruby distribution
#
# Usage: ./checkubc.rb
# Best run regularly from cron(8)
#
# Copyright (C) 2006-2008 Alkaloid Networks.
# http://projects.alkaloid.net | ben@alkaloid.net
# Released under the terms of the GPL. See: http://www.gnu.org/copyleft/gpl.html
# $Id$

# BEGIN EDITS: Edit these constants to your liking:

# Since we are running as root, use a safe PATH
ENV["PATH"] = "/sbin:/bin:/usr/sbin:/usr/bin"

# ENV["HOSTNAME"] = 'my.local.hostname.override'
unless ENV.has_key?('HOSTNAME')
  # Note that hostname -f is not portable. This should not be a problem
  # though because OpenVZ is Linux-specific. On Solaris, for example, this
  # would set the hostname to "-f"
  # In this case we want to try to get the fully-qualified hostname
  ENV['HOSTNAME'] = `hostname -f`.strip
end

# Warning and Critical thresholds. Remember to keep the
# leading 0 on these numbers so Ruby sees them as Floats
WARN_THRESHOLD = 0.80 # 80%
CRIT_THRESHOLD = 0.95 # 95%

# Whether or not to print messages to the console
PRINT_CONSOLE = false

# Address which should receive notifications
TO = "root@localhost"

# Customize the "from" information for generated email
FROM = "root@#{ENV['HOSTNAME']}"
FROM_NAME = "#{ENV['HOSTNAME']} CHECKUBC"

# Location of the statistics DB. This persistently stores maxheld and failcnt numbers.
UBC_STATDB = "/var/run/checkubc/ubcstats.db"

# File from which to read beancounters data. Used mostly for debugging.
UBC_FILE = "/proc/user_beancounters"

# Hostname of the SMTP server which will relay notifications
SMTPHOST = "localhost"

# END EDITS: No edits necessary beyond this point

require 'gdbm'
require 'net/smtp'
require 'time' # provides Time#rfc2822, used for the Date: header

KNOWN_VERSIONS = ["2.5"].freeze

# Two-way mapping: symbol -> numeric level, numeric level -> display name.
ERRLEVEL = {
  :crit => 3,
  3 => "CRITICAL",
  :warn => 2,
  2 => "Warning",
  :notice => 1,
  1 => "Notice",
  :debug => 0,
  0 => "debug",
}.freeze

# Minimum accumulated error level that triggers a notification mail.
NOTIFY_LEVEL = ERRLEVEL[:crit]

# Highest error level seen so far and the accumulated message body.
$err_level = 0
$err_msg = ""

# Print a debug message to stderr; only when PRINT_CONSOLE is set and Ruby
# runs with $DEBUG enabled.
def debug(debugmsg)
  $stderr.print("#{debugmsg}\n") if PRINT_CONSOLE && $DEBUG
end

# Record a warning: raise the run's error level to at least :warn and append
# the message to the notification body.
#
# NOTE: being defined at top level, this method shadows Kernel#warn for the
# whole process.
def warn(warnmsg)
  $err_level = ERRLEVEL[:warn] if $err_level < ERRLEVEL[:warn]
  $stderr.print "WARN: #{warnmsg}\n" if PRINT_CONSOLE
  $err_msg += "#{warnmsg}\n"
end

# Record a critical condition: raise the run's error level to :crit and
# append the message to the notification body.
def crit(critmsg)
  $err_level = ERRLEVEL[:crit] if $err_level < ERRLEVEL[:crit]
  $stderr.print "CRITICAL: #{critmsg}\n" if PRINT_CONSOLE
  $err_msg += "#{critmsg}\n"
end

# Report an internal error (like a parsing error), try to mail everything
# accumulated so far, and terminate with exit status 1.
def fatal(fatalmsg)
  $stderr.print "FATAL: #{fatalmsg}\n"
  $err_msg += "#{fatalmsg}\n"
  # Disregard any exceptions when trying to send mail: we are exiting anyway
  # and a broken mail relay must not turn the exit into a backtrace.
  begin
    sendmail("[checkubc] FATAL", $err_msg)
  rescue StandardError => e
    $stderr.print "FATAL: could not send notification: #{e.message}\n"
  end
  exit(1)
end

# Send +message+ with the given +subject+ to TO through SMTPHOST.
# Exceptions raised by Net::SMTP propagate to the caller.
def sendmail(subject, message)
  datestr = Time.now.rfc2822
  # NOTE(review): the From: header is reconstructed from FROM_NAME/FROM, which
  # are configured above for exactly this purpose.
  mail = <<END_OF_MESSAGE
From: #{FROM_NAME} <#{FROM}>
To: <#{TO}>
Subject: #{subject}
Date: #{datestr}

#{message}
END_OF_MESSAGE
  Net::SMTP.start(SMTPHOST) do |smtp|
    smtp.send_message mail, FROM, TO
  end
end

# Build the VEID -> display name mapping from `vzlist -an`.
# The hostname is used when vzlist reports one, otherwise the IP address.
# Calls fatal (and therefore exits) when vzlist fails or prints a line with
# an unexpected number of fields.
def load_container_names
  names = { "0" => "OpenVZ Host" }
  output = `vzlist -an`
  fatal("vzlist returned non-zero") if $?.exitstatus != 0

  output.split("\n").each do |line|
    fields = line.chomp.split
    unless fields.length == 4 || fields.length == 5
      fatal("Invalid data received from vzlist.")
    end
    veid, _nproc, _status, ipaddr, hostname = fields
    next if veid == "VEID" # skip header line

    # Use the IP address if the hostname has not been specified
    names[veid] = hostname.to_s.empty? ? ipaddr : hostname
  end
  names
end

# Compare +used+ against the barrier (or against the limit once the barrier
# has been reached) and record a warning or critical message when the ratio
# exceeds the configured thresholds.
#
# who      - container label used as the message prefix
# resource - beancounter resource name
# used     - value being checked (held or maxheld)
# barrier  - barrier value as a Float
# limit    - limit value as a Float
# phrase   - wording placed between the resource name and the percentage
def check_usage(who, resource, used, barrier, limit, phrase)
  if used < barrier
    bound = barrier
    bound_name = "barrier"
  else
    # We're above barrier, check for limit
    bound = limit
    bound_name = "limit"
  end

  # Float division: a zero bound yields NaN/Infinity rather than raising.
  ratio = used / bound
  if ratio > CRIT_THRESHOLD
    crit("#{who}: #{resource} #{phrase} #{CRIT_THRESHOLD * 100}% of #{bound_name}")
  elsif ratio > WARN_THRESHOLD
    warn("#{who}: #{resource} #{phrase} #{WARN_THRESHOLD * 100}% of #{bound_name}")
  end
end

# Compare +current+ with the value stored under +key+ in the statistics DB
# and return the difference (0 when the key has not been seen before).
#
# The stored value is refreshed whenever it differs from +current+, including
# when the counter went DOWN (e.g. the container was restarted and its
# counters were reset). Otherwise a stale high-water mark would hide every
# new failure until the old value was exceeded again.
def counter_delta(key, current)
  unless $dbh.has_key?(key)
    $dbh[key] = current.to_s
    return 0
  end

  delta = current - $dbh[key].to_i
  $dbh[key] = current.to_s if delta != 0
  delta
end

$dbh = GDBM.open(UBC_STATDB, 0600)

begin
  # List of VE IDs and their names or IP addresses
  vzlist = load_container_names

  curuid = ""
  version = ""
  warned_no_version = false

  #Process::UID.change_privilege(99)
  File.foreach(UBC_FILE) do |line|
    line.chomp!

    # Ensure we are attempting to parse a known version
    if line =~ /^Version: ([\d\.]+)/
      version = Regexp.last_match(1).chomp
      unless KNOWN_VERSIONS.include?(version)
        warn("Unknown version #{version}, results may be unpredictable.")
      end
      next
    end

    # Only complain once per run instead of once per input line.
    if version.length < 1 && !warned_no_version
      warn("No version found. Results may be unpredictable.")
      warned_no_version = true
    end

    data = line.split
    if data.length == 7
      # Seven fields: the first one is either the header marker or "<uid>:"
      field = data.shift
      # Header line, ignore it
      # uid resource held maxheld barrier limit failcnt
      next if field == "uid"

      curuid = Regexp.last_match(1) if field =~ /([\d\.]+):/
    end

    # At this point we should be processing a data line.
    # Fail if no UID has been found
    fatal("Data line found before UID.") if curuid.length < 1
    fatal("Invalid data line supplied.") if data.length != 6

    resource, held, maxheld, barrier, limit, failcnt = data
    debug("#{curuid} Values: #{data.join(' ')}")

    # Don't bother measuring dummy lines
    next if resource == "dummy"

    # Ignore OOMGUARPAGES for now. These don't really need a warning as it only
    # represents the *guaranteed* memory pages in the event of an OOM.
    next if resource == "oomguarpages"

    who = "#{vzlist[curuid]} (#{curuid})"
    held = held.to_f
    barrier = barrier.to_f
    limit = limit.to_f
    maxheld = maxheld.to_i
    failcnt = failcnt.to_i

    # Current usage
    check_usage(who, resource, held, barrier, limit, "above")

    # Peak usage: only report when a new peak was reached since the last run
    if counter_delta("#{curuid}/#{resource}/maxheld", maxheld) > 0
      check_usage(who, resource, maxheld, barrier, limit, "reached new peak above")
    end

    # Failure counter: any growth since the last run is critical
    diff = counter_delta("#{curuid}/#{resource}/failcnt", failcnt)
    if diff > 0
      crit("#{who}: #{resource} failcount increased by #{diff} to #{failcnt}")
    end
  end
ensure
  # Also runs when fatal() exits, so the statistics DB is always closed.
  $dbh.close
end

if $err_level >= NOTIFY_LEVEL
  debug("Sending notification to #{TO}")
  sendmail("[checkubc]: Check result #{ERRLEVEL[$err_level]}", $err_msg)
else
  debug("Error level #{$err_level} below notification level #{NOTIFY_LEVEL}, skipping mail")
end