#!/usr/local/bin/perl
# nagios: +epn

=pod

=head1 NAME

check_net_snmp_cpu.pl - Check CPU utilization on a Net-SNMP enabled device.

=head1 SYNOPSIS

Check CPU idle, system, user, and nice % utilization on a Net-SNMP
enabled device.

e.g.

    check_net_snmp_cpu.pl [ .. options .. ] -w 'idle,lt,5' -c 'system,gt,98'

The plugin will output a list of all thresholds that have been breached
and all that are ok; the most critical status becomes the return status
of the plugin.

For perfdata the plugin will output all metrics checked by the script.
Output is in % for all perfdata metrics.

e.g.

    'system'=44%;0;0 'idle'=48%;0;0 'nice'=0%;0;0 'user'=8%;0;0

=cut

# check_net_snmp_cpu()
#
# Plugin entry point.  Samples the agent's raw CPU tick counters twice,
# --sleep-time seconds apart, turns the per-metric tick deltas into
# percentages, checks them against the --warning / --critical
# multi-thresholds and returns whatever
# Nenm::Utils::output_multi_results() returns (used as the exit status
# by the caller at the bottom of this file).
#
# Takes no arguments; all input comes from the command line via
# Nagios::Plugin::SNMP.  Exits through $plugin->nagios_die() when the
# thresholds cannot be parsed or when no CPU ticks elapsed between the
# two samples.
#
# The helper below is an anonymous sub on purpose: this file runs under
# embedded Perl (see the "+epn" directive), where named nested subs are
# best avoided.
sub check_net_snmp_cpu {

    use strict;

    use FindBin;
    use lib "$FindBin::Bin/lib";

    use Nagios::Plugin::Threshold;
    use Nagios::Plugin::SNMP;
    use Nenm::Utils;

    # NOTE(review): the copy of this file under review had the start of
    # this heredoc stripped (the "<op>" placeholders were eaten as markup).
    # The "<op>" placeholders and everything before the first
    # 'metric,<op>,...' spec are reconstructed -- confirm against the
    # upstream plugin.
    my $USAGE = <<EOF;
USAGE: %s [--warning 'metric,<op>,number:metric,<op>,number'] \\
       --critical 'metric,<op>,number:metric,<op>,number'
EOF

    my $LABEL = 'NET-SNMP-CPU';

    my $plugin = Nagios::Plugin::SNMP->new(
        'shortname' => $LABEL,
        'usage'     => $USAGE
    );

    $plugin->add_arg(
        'spec'     => 'sleep-time|S=i',
        'required' => 0,
        'help'     => "-S, --sleep-time\n" .
                      " Seconds to sleep between CPU samples (default 15s)",
        'default'  => '15'
    );

    $plugin->getopts;

    $Nenm::Utils::DEBUG = $plugin->opts->get('snmp-debug');

    # One entry per CPU metric (raw tick counters; these OIDs are the
    # UCD-SNMP-MIB ssCpuRaw* scalars):
    #   oid   - scalar OID to query
    #   s0/s1 - first / second sample
    #   raw   - tick delta (s1 - s0)
    #   value - percentage handed to the threshold checker
    my %cpu = (
        'user'      => {qw(oid .1.3.6.1.4.1.2021.11.50.0 raw 0 s0 0 s1 0 value 0)},
        'nice'      => {qw(oid .1.3.6.1.4.1.2021.11.51.0 raw 0 s0 0 s1 0 value 0)},
        'system'    => {qw(oid .1.3.6.1.4.1.2021.11.52.0 raw 0 s0 0 s1 0 value 0)},
        'idle'      => {qw(oid .1.3.6.1.4.1.2021.11.53.0 raw 0 s0 0 s1 0 value 0)},
        'wait'      => {qw(oid .1.3.6.1.4.1.2021.11.54.0 raw 0 s0 0 s1 0 value 0)},
        'kernel'    => {qw(oid .1.3.6.1.4.1.2021.11.55.0 raw 0 s0 0 s1 0 value 0)},
        'interrupt' => {qw(oid .1.3.6.1.4.1.2021.11.56.0 raw 0 s0 0 s1 0 value 0)}
    );

    # Warning thresholds are optional; critical thresholds are always
    # parsed.
    my ($wthr, $werrs) = ([], []);

    if (defined $plugin->opts->warning) {
        ($wthr, $werrs) = Nenm::Utils::parse_multi_threshold(
            $plugin->opts->warning, \%cpu);
    }

    if (scalar(@$werrs) > 0) {
        $plugin->nagios_die("Errors found in warning thresholds specified:" .
                            "\n " . join("\n ", @$werrs));
    }

    my ($cthr, $cerrs) = Nenm::Utils::parse_multi_threshold(
        $plugin->opts->critical, \%cpu);

    if (scalar(@$cerrs) > 0) {
        $plugin->nagios_die("Errors found in critical thresholds specified:" .
                            "\n " . join("\n ", @$cerrs));
    }

    my @oids = map { $cpu{$_}->{'oid'} } keys %cpu;

    # Sample once
    my $snmp_results = $plugin->get(@oids);
    Nenm::Utils::debug("First sample of CPU metrics taken");

    for my $metric (keys %cpu) {
        $cpu{$metric}->{'s0'} = $snmp_results->{$cpu{$metric}->{'oid'}};
    }

    # Sleep long enough that the agent will return fresh values
    sleep $plugin->opts->get('sleep-time');

    # Sample again to get values to use for % change
    $snmp_results = $plugin->get(@oids);
    Nenm::Utils::debug("Second sample of CPU metrics taken");

    $plugin->close();

    # wait, kernel, and interrupt are not present on all OSes, so check
    # each metric's first sample; keep the second sample for metrics the
    # agent answered and drop the rest.  An undefined value or either
    # SNMP "no such" exception marks the metric as unsupported.
    # (sort keys builds its list up front, so deleting inside the loop
    # is safe.)
    for my $might_have (sort keys %cpu) {

        my $first = $cpu{$might_have}->{'s0'};

        if (defined($first)
            && ($first ne 'noSuchObject')
            && ($first ne 'noSuchInstance')) {
            Nenm::Utils::debug("Agent has $might_have");
            $cpu{$might_have}->{'s1'} =
                $snmp_results->{$cpu{$might_have}->{'oid'}};
        } else {
            Nenm::Utils::debug("Agent does not support $might_have");
            delete $cpu{$might_have};
        }

    }

    my ($ostype, $sysdescr) = $plugin->get_sys_info();

    Nenm::Utils::debug("OS type is $ostype, sysDescr is $sysdescr");

    for my $metric (keys %cpu) {
        my $s0 = $cpu{$metric}->{'s0'};
        my $s1 = $cpu{$metric}->{'s1'};
        my $diff = $s1 - $s0;
        Nenm::Utils::debug("CPU: $metric; $s1 - $s0 = $diff ticks");
        $cpu{$metric}->{'raw'} = $diff;
    }

    # Net-SNMP platform differences
    #
    # There may be differences for other platforms, but for
    # now just covering Linux, BSD, and Solaris.

    # Decide the platform once so every test below agrees on case
    # handling.
    my $os         = defined($ostype) ? $ostype : '';
    my $is_bsd     = ($os =~ /bsd/i) ? 1 : 0;
    my $is_solaris = ($os eq 'solaris') ? 1 : 0;

    if ($is_bsd || $is_solaris) {

        # Guarded reads: a metric the agent does not support was deleted
        # above and must count as 0 ticks without being re-created.
        my $system    = (exists $cpu{'system'})    ? $cpu{'system'}->{'raw'}    : 0;
        my $wait      = (exists $cpu{'wait'})      ? $cpu{'wait'}->{'raw'}      : 0;
        my $kernel    = (exists $cpu{'kernel'})    ? $cpu{'kernel'}->{'raw'}    : 0;
        my $interrupt = (exists $cpu{'interrupt'}) ? $cpu{'interrupt'}->{'raw'} : 0;

        Nenm::Utils::debug('Performing platform-specific % calculations');

        # On Solaris, system == wait + kernel
        # On BSD, system == system + interrupts
        #
        # We skip the metrics that are already folded into system when
        # calculating the total, and calculate their % after the rest so
        # we don't throw off the metrics and the end user can use this
        # script without having to worry about platform differences.
        # Thanks to David Shield for pointing me to where the CPU
        # specific Net-SNMP code lives in the Net-SNMP source.
        my %folded_into_system = $is_bsd
                               ? ('interrupt' => 1)
                               : ('wait' => 1, 'kernel' => 1);

        # Percentage helper; a non-positive denominator yields 0.00
        # instead of dying with a division by zero.
        my $percent_of = sub {
            my ($ticks, $all_ticks) = @_;
            return sprintf("%.2f", 0) unless ($all_ticks > 0);
            return sprintf("%.2f", ($ticks / $all_ticks) * 100);
        };

        # Sum all but OS-specific metrics
        my $total = 0;

        for my $metric (sort keys %cpu) {
            next if $folded_into_system{$metric};
            $total += $cpu{$metric}->{'raw'};
        }

        # No elapsed ticks means there is nothing meaningful to report
        # (sleep time too short, stale agent data, or a counter reset).
        if ($total <= 0) {
            $plugin->nagios_die("No CPU ticks elapsed between samples; " .
                                "try a longer --sleep-time");
        }

        # Calculate everything except the folded-in metrics
        for my $metric (sort keys %cpu) {
            next if $folded_into_system{$metric};
            $cpu{$metric}->{'value'} =
                $percent_of->($cpu{$metric}->{'raw'}, $total);
        }

        # Now subtract system ticks diff so we can get % utilization
        # for the platform-specific metrics that comprise it
        $total -= $system;

        if ($is_bsd) {

            # For BSD we have to add back interrupt ticks, as
            # system on BSD is CPU_SYS + CPU_INTR.
            $total += $interrupt;

            if (exists $cpu{'interrupt'}) {
                $cpu{'interrupt'}->{'value'} =
                    $percent_of->($interrupt, $total);
            }

        } else {

            # For Solaris, system is the sum of wait and kernel, so those
            # two replace system in the total.  (interrupt is not part of
            # system here and is already included in the total.)
            $total += ($wait + $kernel);

            for my $part (qw(wait kernel)) {
                next unless exists $cpu{$part};
                $cpu{$part}->{'value'} =
                    $percent_of->($cpu{$part}->{'raw'}, $total);
            }

        }

    } else {
        Nenm::Utils::convert_to('%', \%cpu);
    }

    my $results = Nenm::Utils::check_multi_thresholds(\%cpu, $wthr, $cthr, '%');

    return Nenm::Utils::output_multi_results($LABEL, $results);

}

exit check_net_snmp_cpu();