#!/usr/bin/perl -w
##
# SMART statistics - extra statistics not returned by the normal
# smart plugin. Specifically we focus on the SATA Phy error counters,
# although the actual details might be extended in the future.
#
# Unlike many of the plugins for munin which manipulate discs, we do not
# associate the statistics by the device name in the filesystem. Instead
# we use the model and serial information as the discriminating feature
# of the devices. This means that each invocation of the plugin needs to
# do a little more work, but guarantees that if devices appear in a
# different order during startup, or due to the physical disc being connected
# differently (eg moving bays) the data is still correctly associated.
#
#
# Configuration - you will need to add entries to the munin configuration
# to ensure that the plugin runs as root:
#
#   [smartstats_*]
#   user root
#
# Additional environment variables that may be set:
#   env.SMARTCTL /usr/local/bin/smartctl
#     - Overrides the default location of smartctl (/usr/sbin/smartctl).
#   env.CACHE_<model>__<serial>
#     - Set how long the results will be cached for, by device info
#   env.CACHE_<device>
#     - Set how long the results will be cached for, by device name (eg sdc)
#   env.CACHE
#     - Set how long the results will be cached for, for all devices not
#       set by the above two cache options. Time is in minutes.
#       Omitting this option (or using 0) will disable caching and will always
#       query the device.
#   env.ACTIVEONLY_<model>__<serial>
#   env.ACTIVEONLY_<device>
#   env.ACTIVEONLY
#     - '1' to only query the device if it is not in standby or sleep
#       (otherwise, always query the device regardless of its state).
#
# Magic configuration markers:
#
#%# family=auto
#%# capabilities=autoconf suggest
#
# History:
#
#   2014-03-05: Original version, supporting sataphy statistics.
#   2014-05-13: Added devstat support, just for SSD 'percentage used'.
#   2014-06-22: Updated to handle udevadm as well as udevinfo.
#
use strict;
use warnings;
use File::Basename;
use Data::Dumper;
use lib $ENV{'MUNIN_LIBDIR'} // '.';

# Get any prerequisite components
# (a missing Munin::Plugin is recorded rather than fatal, so that
# 'autoconf' can report it as the reason for answering "no").
my @problems;
eval {
    require Munin::Plugin;
    Munin::Plugin->import;
};
if ($@) {
    push @problems, "Munin::Plugin not found";
}

# Extract the device from the program name
# The plugin is expected to be invoked as smartstats_<mode>_<device id>;
# if the name does not match, both $mode and $devicename are undefined.
my $prog = $0;
my ($mode, $devicename) = ($prog =~ /smartstats_([a-z]+)_(.*)$/);
# First command line argument selects the operation; default is 'fetch'.
my $arg = shift || 'fetch';
my $tool = $ENV{'SMARTCTL'} || '/usr/sbin/smartctl';
# Set to a true value to get progress messages on STDERR from the query subs.
my $debug = 0;

# The descriptions given from smartctl are short, but they're still a bit
# too long to use as field descriptions in munin, so we shorten them where
# we can.
my %sataphy_shortname = (
    0x0001 => 'ICRC error',
    0x0002 => 'Data error',
    0x0003 => 'Data error (to host)',
    0x0004 => 'Data error (to device)',
    0x0005 => 'Non-data error',
    0x0006 => 'Non-data error (to host)',
    0x0007 => 'Non-data error (to device)',
    0x0008 => 'Non-data error retries (to host)',
    0x0009 => 'Not-ready to ready transitions',
    0x000a => 'COMRESET response sent',
    0x000b => 'CRC error (to device)',
    # Not sure what C is
    0x000d => 'Non-CRC error (to device)',
    # Not sure what E is
    0x000f => 'Data CRC error (to device)',
    0x0010 => 'Data non-CRC error (to device)',
    # Not sure what 11 is
    0x0012 => 'Non-Data CRC error (to device)',
    0x0013 => 'Non-Data non-CRC error (to device)',
);

# Short labels for device statistics, keyed by the numeric identifier
# built in query_devstat (page digit followed by the three-digit offset).
my %devstat_shortname = (
    0x7008 => 'Percentage used',
);

if ($arg eq 'autoconf') {
    # Can this tool be automatically configured?
    if (@problems) {
        print "no (", join(", ", @problems), ")\n";
        exit 0;
    }

    if (-x "$tool") {
        # Ask smartctl which '-l' log types it knows about.
        my $output = `$tool -l help 2>&1`;
        # NOTE(review): $list is undefined if the output does not contain
        # the expected banner, and the split below will then warn - confirm
        # whether that case needs an explicit "no (...)" answer.
        my ($list) = ($output =~ /=======> VALID ARGUMENTS ARE: (.*?) <=======/);
        my %arg = map { /^([a-z]+)(.*)$/; $1=>$2 } split /,\s/, $list;
        if (!defined $arg{'sataphy'}) {
            print "no (no 'sataphy' log discovery in smartctl)\n";
            exit 0;
        }
        print "yes\n";
        exit 0;
    }

    print "no (cannot find smartctl)\n";
    exit 0;
}

# From here on-in we need the lists of discs.
my $discs = discs_listdiscs();
# Index the discs by '<model>__<serial>', which is the identifier used in
# the plugin name instead of the (unstable) device name.
my %ids;
for my $disc (values %$discs) {
    $disc->{'id'} = $disc->{'model'} . '__' . $disc->{'serial'};
    $ids{$disc->{'id'}} = $disc;
}

if ($arg eq 'suggest') {
    # Suggest a list of devices we can use.
    for my $id (sort keys %ids) {
        # Let us check the behaviour of the device first - some devices
        # might not be queryable.
        # This request is always performed live - we never use the cache,
        # and we override the 'ACTIVEONLY' setting.

        # Can we handle SATA Phy statistics?
        my %stats = query_sataphy("/dev/$ids{$id}->{'device'}", $id, 'nocache', 'always');
        if (scalar(keys %stats) > 0) {
            print "sataphy_$id\n";
        } else {
            print STDERR "$Munin::Plugin::me: Cannot probe device for SATA Phy '$ids{$id}->{'device'}' ($id)\n";
        }

        # Can we handle device statistics?
        %stats = query_devstat("/dev/$ids{$id}->{'device'}", $id, 'nocache', 'always');
        if (scalar(keys %stats) > 0) {
            # We have some statistics; let's see which ones we will be able to
            # produce graphs for:
            # (keys beginning with '7' are the page 7 entries that the
            # 'ssdstat' mode graphs).
            if (scalar(grep { /^7...$/ } keys %stats) > 0) {
                print "ssdstat_$id\n";
            } else {
                print STDERR "$Munin::Plugin::me: No SSD statistics for '$ids{$id}->{'device'}' ($id)\n";
            }
        } else {
            print STDERR "$Munin::Plugin::me: Cannot probe device for DevStat '$ids{$id}->{'device'}' ($id)\n";
        }
    }
    exit;
}

# From here on we are using the device, so validate that we have useful
# information.
# NOTE(review): $devicename and $mode are undefined when the program name
# did not match the pattern above, in which case these checks emit
# 'uninitialized' warnings before dying.
my $device = $ids{$devicename};
if (!defined $ids{$devicename}) {
    die "$Munin::Plugin::me: No device '$devicename' present\n";
}
if ($mode ne 'sataphy' && $mode ne 'ssdstat') {
    die "$Munin::Plugin::me: Unsupported mode '$mode'\n";
}

# Device name on disc.
my $dev = "/dev/$device->{'device'}";

# Both 'config' and 'fetch' need the statistics, so query them up front
# (allowing the cache to be used).
my %stats;
if ($mode eq 'sataphy') {
    %stats = query_sataphy($dev, $devicename, 'cache');
} elsif ($mode eq 'ssdstat') {
    %stats = query_devstat($dev, $devicename, 'cache');
}

if ($arg eq 'config') {
    # Configuration information requested, so we need to describe what
    # this graph is.

    # Look for any devices that happen to have labelled discs on this
    # device
    my $pretty_device = $device->{'device'};
    my $labels = discs_listlabels();
    # Partitions of this disc are the device name followed by digits.
    my @applicable_labels = map { $labels->{$_} } grep { /^$device->{'device'}\d+$/ } keys %$labels;
    if (scalar(@applicable_labels) > 0) {
        $pretty_device .= " (" . join(", ", @applicable_labels) . ")";
    }

    if ($mode eq 'sataphy') {
        # NOTE(review): the text between 'print <' and "{'id'}" on the next
        # line appears to have been lost (presumably a heredoc with the
        # graph settings, followed by the head of a
        # 'for my $info (sort { $a->...' loop). The line is kept verbatim
        # and must be restored from the upstream plugin before this runs.
        print <{'id'} <=> $b->{'id'} } values %stats) {
            if ($info->{'id'} & (1<<15)) {
                # Vendor value, so we cannot know what it means.
                # Skip it.
                next;
            }
            # Fall back to the raw identifier when we have no short name.
            my $label = $sataphy_shortname{$info->{'id'}} // "ID $info->{'hex'}";
            print "id$info->{'hex'}.label $label\n";
            print "id$info->{'hex'}.type GAUGE\n";
            print "id$info->{'hex'}.info ID $info->{'hex'}: $info->{'desc'}\n";
            print "id$info->{'hex'}.min 0\n";
        }
    } elsif ($mode eq 'ssdstat') {
        # NOTE(review): same loss of text as in the 'sataphy' branch above;
        # the line is kept verbatim and needs restoring from upstream.
        print <{'id'} <=> $b->{'id'} } values %stats) {
            # Only the page 7 identifiers (0x7000-0x7fff) are graphed.
            if ($info->{'id'} >= 0x7000 && $info->{'id'} < 0x8000) {
                my $label = $devstat_shortname{$info->{'id'}} // "ID $info->{'hex'}";
                print "id$info->{'hex'}.label $label\n";
                print "id$info->{'hex'}.type GAUGE\n";
                print "id$info->{'hex'}.info ID $info->{'hex'}: $info->{'desc'}\n";
                print "id$info->{'hex'}.min 0\n";
                # NOTE(review): 'eq' compares as strings; both sides
                # stringify to the same decimal number so this matches, but
                # '==' would state the intent.
                if ($info->{'id'} eq 0x7008) {
                    # Percentage endurance is clearly limited to 100%.
                    print "id$info->{'hex'}.max 100\n";
                }
            }
        }
    } else {
        die "Config not implemented for mode $mode";
    }
    exit;
}

if ($arg eq 'fetch' || $arg eq '') {
    # Emit one '<field>.value' line per statistic, using the same field
    # names and filters as the 'config' output.
    if ($mode eq 'sataphy') {
        for my $info (sort { $a->{'id'} <=> $b->{'id'} } values %stats) {
            if ($info->{'id'} & (1<<15)) {
                # Vendor value, so we cannot know what it means.
                # Skip it.
                next;
            }
            print "id$info->{'hex'}.value $info->{'value'}\n";
        }
    } else {
        for my $info (sort { $a->{'id'} <=> $b->{'id'} } values %stats) {
            if ($info->{'id'} >= 0x7000 && $info->{'id'} < 0x8000) {
                print "id$info->{'hex'}.value $info->{'value'}\n";
            }
        }
    }
    exit;
}

# Unrecognised.
# Dump what we parsed to STDERR to help diagnose the bad invocation.
print STDERR Dumper(\%stats);
exit 1;


##
# Query the SATA Phy table and return its statistics in a parsed format.
#
# Runs 'smartctl <dev> -l sataphy' (or reuses a cached copy of its output)
# and parses every line of the form '0x???? <size> <value> <description>'.
#
# @param[in] $dev     Device filename
# @param[in] $name    Name of the device, if we need to look anything up
# @param[in] $cache   'cache' if we should cache the result, or 'nocache' to
#                     ignore any caching. Caching time is defined by the
#                     environment variables, as a number of minutes to cache
#                     for:
#                       'CACHE_<$name>'
#                       'CACHE_<basename of $dev>'
#                       'CACHE'
#                     and defaults to 0 (disabled).
# @param[in] $always  'always' if we should always perform the operation,
#                     even if it would be disabled by "ACTIVEONLY" settings.
#                     This allows us to get information for 'suggest' where
#                     we would otherwise ignore discs that weren't running.
#
# @return hash, keyed by the identifier, with values as hashref:
#           'id'    The identifier converted to decimal.
#           'hex'   The hex identifier (same as the key) in the form 0x????.
#           'size'  The width of the counter (1,2,4,8 bytes)
#           'value' The value of that counter
#           'desc'  Smartctl's description of that field
#         or an empty hash if nothing was parsed.
sub query_sataphy {
    my ($dev, $name, $cache, $always) = @_;
    my $cmd = "$tool $dev -l sataphy";
    my $output;
    my $cachedoutput;
    my $cacheperiod = 0;
    my $devname = basename($dev);

    # Most specific setting wins: device id, then device name, then global.
    # The keys must be double-quoted: in single quotes '$name' is taken
    # literally and the per-device-id setting can never be found.
    my $activeonly = $ENV{"ACTIVEONLY_$name"}
                  // $ENV{"ACTIVEONLY_$devname"}
                  // $ENV{'ACTIVEONLY'}
                  // 0;
    $always //= 'no';
    if ($activeonly && $always eq 'no') {
        # Disable the query if the device is in standby or sleep
        # (will fall back to last cache values)
        $cmd .= " -n standby";
    }

    # Read from the correct file (we do multiple state stores, so we need
    # to write to different files)
    Munin::Plugin::set_state_name("smartstats_sataphy_$name");

    if (defined $cache && $cache eq 'cache') {
        # Look up the cache period (same precedence as ACTIVEONLY above).
        $cacheperiod = $ENV{"CACHE_$name"}
                    // $ENV{"CACHE_$devname"}
                    // $ENV{'CACHE'}
                    // 0;
        if ($cacheperiod != 0) {
            my $cachetime;
            print STDERR "Retrieving cached state for $name\n" if ($debug);
            ($cachedoutput, $cachetime) = restore_state();
            if (defined $cachedoutput) {
                # A state without a timestamp is treated as outdated.
                if (defined $cachetime
                    && time() - $cachetime < 60*$cacheperiod) {
                    # Cache is valid, so we can use it.
                    $output = $cachedoutput;
                }
                else {
                    print STDERR "Cached state is outdated\n" if ($debug);
                }
            }
            else {
                print STDERR "No cache present\n" if ($debug);
            }
        }
    }

    if (!defined $output) {
        # No cached result usable
        print STDERR "Retrieving live state for $name ($dev)\n" if ($debug);
        $output = `$cmd`;
        if (!defined $output) {
            # The command could not be run at all. Parse nothing, and do
            # not overwrite a previously stored state with the failure.
            print STDERR "Cannot run '$cmd'\n" if ($debug);
            $output = '';
        }
        elsif ($output =~ /Device is in (STANDBY|SLEEP) mode/) {
            # The drive isn't available right now, so don't perform the
            # query. Use the old data if we have any (even if it is outdated,
            # so that we don't wake up the drive).
            print STDERR "Device is asleep\n" if ($debug);
            if (defined $cachedoutput) {
                print STDERR "Using old cached content\n" if ($debug);
                $output = $cachedoutput;
            }
            else {
                print STDERR "No cached data to use\n" if ($debug);
            }
        }
        else {
            # We can cache this result (but only do so if caching is enabled)
            if (defined $cache && $cache eq 'cache' && $cacheperiod != 0) {
                print STDERR "Storing cached state\n" if ($debug);
                save_state($output, time());
            }
        }
    }

    # Lines that do not look like a counter row are dropped.
    my %stats = map {
        /^(0x[0-9a-fA-F]{4})\s+(\d+)\s+(\d+)\s+(.*)$/
            ? ($1 => {
                    'id'    => hex($1),
                    'hex'   => $1,
                    'size'  => $2,
                    'value' => $3,
                    'desc'  => $4,
                })
            : ();
    } split /\n/, $output;

    return %stats;
}


##
# Query the device statistics and return its statistics in a parsed format.
#
# @param[in] $dev     Device filename
# @param[in] $name    Name of the device, if we need to look anything up
# @param[in] $cache   'cache' if we should cache the result, or 'nocache' to
#                     ignore any caching. Caching time is defined by the
#                     environment variables, as a number of minutes to cache
#                     for:
#                       'CACHE_<$name>'
#                       'CACHE_<basename of $dev>'
#                       'CACHE'
#                     and defaults to 0 (disabled).
# @param[in] $always  'always' if we should always perform the operation,
#                     even if it would be disabled by "ACTIVEONLY" settings.
#                     This allows us to get information for 'suggest' where
#                     we would otherwise ignore discs that weren't running.
#
# @return hash, keyed by the identifier, with values as hashref:
#           'id'    The identifier converted to decimal.
#           'hex'   The hex identifier in the form 0x???? (the key with a
#                   '0x' prefix).
#           'size'  The width of the counter (1,2,4,8 bytes)
#           'value' The value of that counter
#           'desc'  Smartctl's description of that field
#         or an empty hash if nothing was parsed.
sub query_devstat {
    my ($dev, $name, $cache, $always) = @_;
    my $cmd = "$tool $dev -l devstat";
    my $output;
    my $cachedoutput;
    my $cacheperiod = 0;
    my $devname = basename($dev);

    # Most specific setting wins: device id, then device name, then global.
    # The keys must be double-quoted: in single quotes '$name' is taken
    # literally and the per-device-id setting can never be found.
    my $activeonly = $ENV{"ACTIVEONLY_$name"}
                  // $ENV{"ACTIVEONLY_$devname"}
                  // $ENV{'ACTIVEONLY'}
                  // 0;
    $always //= 'no';
    if ($activeonly && $always eq 'no') {
        # Disable the query if the device is in standby or sleep
        # (will fall back to last cache values)
        $cmd .= " -n standby";
    }

    # FIXME Make this cache generic.

    # Read from the correct file (we do multiple state stores, so we need
    # to write to different files)
    Munin::Plugin::set_state_name("smartstats_devstat_$name");

    if (defined $cache && $cache eq 'cache') {
        # Look up the cache period (same precedence as ACTIVEONLY above).
        $cacheperiod = $ENV{"CACHE_$name"}
                    // $ENV{"CACHE_$devname"}
                    // $ENV{'CACHE'}
                    // 0;
        if ($cacheperiod != 0) {
            my $cachetime;
            print STDERR "Retrieving cached state for $name\n" if ($debug);
            ($cachedoutput, $cachetime) = restore_state();
            if (defined $cachedoutput) {
                # A state without a timestamp is treated as outdated.
                if (defined $cachetime
                    && time() - $cachetime < 60*$cacheperiod) {
                    # Cache is valid, so we can use it.
                    $output = $cachedoutput;
                }
                else {
                    print STDERR "Cached state is outdated\n" if ($debug);
                }
            }
            else {
                print STDERR "No cache present\n" if ($debug);
            }
        }
    }

    if (!defined $output) {
        # No cached result usable
        print STDERR "Retrieving live state for $name ($dev)\n" if ($debug);
        $output = `$cmd`;
        if (!defined $output) {
            # The command could not be run at all. Parse nothing, and do
            # not overwrite a previously stored state with the failure.
            print STDERR "Cannot run '$cmd'\n" if ($debug);
            $output = '';
        }
        elsif ($output =~ /Device is in (STANDBY|SLEEP) mode/) {
            # The drive isn't available right now, so don't perform the
            # query. Use the old data if we have any (even if it is outdated,
            # so that we don't wake up the drive).
            print STDERR "Device is asleep\n" if ($debug);
            if (defined $cachedoutput) {
                print STDERR "Using old cached content\n" if ($debug);
                $output = $cachedoutput;
            }
            else {
                print STDERR "No cached data to use\n" if ($debug);
            }
        }
        else {
            # We can cache this result (but only do so if caching is enabled)
            if (defined $cache && $cache eq 'cache' && $cacheperiod != 0) {
                print STDERR "Storing cached state\n" if ($debug);
                save_state($output, time());
            }
        }
    }

    # Format of the content is like:
    # Page Offset Size         Value  Description
    #   1  =====  =                =  == General Statistics (rev 2) ==
    #   1  0x008  4               32  Lifetime Power-On Resets
    #   1  0x010  4              433  Power-on Hours
    #   7  0x008  1                5~ Percentage Used Endurance Indicator
    #                              |_ ~ normalized value
    #
    # The key is the page number followed by the three hex digits of the
    # offset (eg '7008'); header and legend rows do not match and are dropped.
    # NOTE(review): newer smartctl releases are believed to print the page
    # as 0xNN and to add a 'Flags' column - confirm against the installed
    # version and extend the pattern if so.
    my %stats = map {
        /^\s+(\d+)\s+0x([0-9a-fA-F]{3})\s+(\d+)\s+(\d+)[~\s]+(.*)$/
            ? ("$1$2" => {
                    'id'    => hex("$1$2"),
                    'hex'   => "0x$1$2",
                    'size'  => $3,
                    'value' => $4,
                    'desc'  => $5,
                })
            : ();
    } split /\n/, $output;

    return %stats;
}


##
# List all the discs that we've got.
# We use the discs available as /dev/sd?.
#
# Dies if /dev cannot be read.
#
# @return hashref containing the disc information for all discs, keyed
#         by the device name. Each value is a hashref of parameters:
#           'device' => device name
#           'serial' => serial number (undefined if udev did not report one)
#           'model'  => model number (undefined if udev did not report one)
sub discs_listdiscs {
    # Get a list of all potential devices
    opendir(my $dh, "/dev")
        or die "$Munin::Plugin::me: Cannot read /dev: $!\n";
    # Whole discs only: 'sda', 'sdab', ... but not partitions like 'sda1'.
    my %devices = map  { ($_ => { 'device' => $_ }) }
                  grep { /^sd[a-z]+$/ }
                  readdir($dh);
    # Directory handles must be released with closedir(); close() does not
    # apply to them.
    closedir($dh);

    for my $dev (values %devices) {
        my $udev = discs_udevinfo($dev->{'device'});
        $dev->{'serial'} = $udev->{'ID_SERIAL_SHORT'};
        $dev->{'model'}  = $udev->{'ID_MODEL'};
    }

    return \%devices;
}


##
# Read the parameters for a disc.
#
# Runs 'udevadm info --query all --name /dev/<dev>' and, if that exits
# with a non-zero status, retries with the older 'udevinfo' command name.
#
# @param[in] $dev   Device name (without the '/dev/' prefix). It is
#                   interpolated into a shell command, so callers must only
#                   pass trusted names (discs_listdiscs passes names matching
#                   /^sd[a-z]+$/).
#
# @return Parameters from udev as a hashref (empty if nothing was reported)
#           'ID_VENDOR' => ATA
#           'ID_MODEL' => WDC_WD20EARS-00J
#           'ID_REVISION' => 80.0
#           'ID_SERIAL' => SATA_WDC_WD20EARS-00_WD-WCAYY0221180
#           'ID_SERIAL_SHORT' => WD-WCAYY0221180
#           'ID_TYPE' => disk
#           'ID_BUS' => scsi
#           'ID_ATA_COMPAT' => WDC_WD20EARS-00J2GB0_WD-WCAYY0221180
#           'ID_PATH' => pci-0000:03:00.0-scsi-0:1:0:0
sub discs_udevinfo {
    my ($dev) = @_;
    my $cmd = "udevadm info --query all --name /dev/$dev";
    my $output = `$cmd 2>/dev/null`;
    if ($? != 0) {
        # Command not found, so switch to udevinfo (the old name).
        $cmd =~ s/udevadm info/udevinfo/;
        $output = `$cmd 2>/dev/null`;
    }

    # Backticks yield undef if the command could not be started at all;
    # treat that the same as "no properties reported".
    $output //= '';

    # Property lines look like 'E: KEY=value'.
    my %udev = ($output =~ /^E: (.*?)=(.*)$/gm);
    return \%udev;
}


##
# Read the labels on all the devices.
#
# Each entry in /dev/disk/by-label is expected to be a symlink to the
# labelled device; entries that cannot be resolved are skipped.
#
# @return hashref, keyed by the device name, of the labelled discs,
#         the value being the label. Empty if the label directory cannot
#         be read (for example when no device carries a label).
sub discs_listlabels {
    my $devdir = "/dev/disk/by-label";
    my %label_for;

    # A missing directory is not an error: it just means there are no labels.
    opendir(my $dh, $devdir) or return \%label_for;
    my @labels = grep { !/^\./ } readdir($dh);
    closedir($dh);

    for my $label (@labels) {
        my $target = readlink("$devdir/$label");
        # Not a symlink (or it vanished): basename() would croak on undef.
        next if !defined $target;
        $label_for{ basename($target) } = $label;
    }

    return \%label_for;
}