getopsstats.pl
#!/usr/bin/perl
# the main functionality of this program creates an array of arrays,
# @rows = [ col1, col2, col3 ... ];
# which will be iterated through, assuming that a proper insert statement has
# been crafted which will match the expected number of columns found
# during parsing (+/- any additional data points needed, i.e. hostname,
# timestamp).
# currently expects HTML data to be presented horizontally; i.e.
# col1 col2 col3 col4 ...
# val1 val2 val3 val4 ...
# make sure to set the 'orientation' to vetical if the HTML data is presented
# vertically; i.e.
# col1 val1
# col2 val2
# col3 val3
# ...
# any good production perl script should work under use strict & use warnings.
use strict;
use warnings;
# modules required for proper functionality
use WWW::Mechanize;
use DBI;
# this is for long names of 'magic' variables.
use English qw(-no_match_vars);
# variables required for setting up database connection
my $database = 'test';
my $host = '127.0.0.1';
my $port = 3306;
my $user = 'root';
my $pass = q{};
# database connection string
my $dsn = "DBI:mysql:database=$database;host=$host;port=$port";
# make sure we don't try to go through the proxy, also a timeout value for
# retreiving the web page.
$ENV{ 'http_proxy' } = q{};
$ENV{ 'HTTP_PROXY' } = q{};
my $timeout = 180;
my $mech = WWW::Mechanize->new( timeout => $timeout );
my $DEBUG = 1;
# the %checks hash is a hash of hashes which stores:
# 1. the URL to scrape, along with
# a. the corresponding SQL insert statement
# b. the regex used to find data to scrape.
# c. when parsing the HTML, how to 'split' the HTML to find unique rows
# d. what characters to strip out of the string found by the regex
# e. whether the layout of the web page to scrape is horizontal or vertical.
my %checks = (
'http://aroxnyme01a.tradearca.com:5000/queues.htm' => {
'insert' =>
'insert into queues values( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? );',
'regex' => q{<TD[^>]*>(.*?)</TD>},
'split' => qq{\n},
'strip' => q{,%},
'orientation' => 'horizontal',
},
'http://aroxnyme01b.tradearca.com:5000/queues.htm' => {
'insert' =>
'insert into queues values( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? );',
'regex' => q{<TD[^>]*>(.*?)</TD>},
'split' => qq{\n},
'strip' => q{,%},
'orientation' => 'horizontal',
},
'http://aroxnyfe01c.tradearca.com:10001/sub.htm' => {
'insert' => 'insert into ab_subscribers values'
. '( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? );',
'regex' => q{<TD[^>]*>([ a-zA-Z0-9:.,%\n]*?)</TD>},
'split' => q{TR},
'strip' => qq{,\%\n},
'orientation' => 'horizontal',
},
'http://aroxnyme01.tradearca.com:5000/headers.htm' => {
'insert' => 'insert into me_header values'
. '( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,'
. ' ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, '
. '?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);',
'regex' => q{<TD[^>]*>([ a-zA-Z0-9:.,%\n]*?)</TD>},
'regex' => q{<TD>.*?</TD><TD>(.*?)</TD>},
'split' => q{TR},
'strip' => qq{,\%\n},
'orientation' => 'vertical',
},
);
# the assumption is that whomever is populating items in the %checks hash
# knows A) the URL, B) the table layout (num of columns) for the mysql table
# to populate, and C) the regex with which the HTML of the monitor page can be
# parsed for data.
my $dbh;
eval { $dbh = DBI->connect( $dsn, $user, $pass, { 'RaiseError' => 1 } ) };
handle_error( "Failed to connect to $dsn : $EVAL_ERROR\n" ) if $EVAL_ERROR;
do_checks( $dbh, $mech, \%checks );
eval { $dbh->disconnect() };
handle_error( "Failed to disconnect from $dsn : $EVAL_ERROR\n" )
if $EVAL_ERROR;
# this performs the meat of the work.
# this cycles through the check URLs, grabbing their content, and hands off
# parsing the content to build_rows. then hands off to prep_and_exec to
# insert the data into the database.
sub do_checks {
my( $db_handle, $mech, $checklist ) = @_;
for my $url ( sort keys %{ $checklist } ) {
my( $hostname, $location, $node );
( $hostname, $location, $node ) = ( $1, $2, $3 )
if $url =~ m{^http://(\w{4}(\w{2})\w+(\w))\.tradearca.*};
print "hostname: $hostname\tlocation: $location\tnode: $node\n" if $DEBUG;
$mech->get( $url );
my $date = get_date();
my @rows = build_rows( $mech,
$checklist->{ $url }{ regex },
$checklist->{ $url }{ strip },
$hostname,
$date,
$checklist->{ $url }{ split },
$checklist->{ $url }{ orientation }
);
#use Data::Dumper;
#print Dumper @rows;
#exit;
prep_and_exec( $db_handle, $checklist->{ $url }{ insert }, @rows );
}
return;
}
# this handles building the structure to insert into the database.
# hands off the actual parsing of HTML data to parse_table.
sub build_rows {
my( $mech, $regex, $strip, $host, $date, $split, $orientation ) = @_;
my @rows;
for my $line ( split $split, $mech->content() ) {
my $found = 0;
my $foundlist = q{};
my @foundrow;
if( $orientation eq 'vertical' ){
push @foundrow, $host, $date if @rows < 1;
push @foundrow,
parse_table( $line, $regex, $strip, \$found, \$foundlist );
push @rows, @foundrow if $found;
}
else {
push @foundrow, $host, $date;
push @foundrow,
parse_table( $line, $regex, $strip, \$found, \$foundlist );
push @rows, [ @foundrow ] if $found;
}
print "next row: $foundlist\n" if $DEBUG;
}
if( $orientation eq 'vertical' ){
return [ @rows ];
}
else {
return @rows;
}
}
# this handles searching through the HTML for a given regex, collecting data
# as it encounters it.
sub parse_table {
my( $line, $regex, $strip, $inc, $text ) = @_;
my @row;
while( $line =~ m{$regex}g ) {
my $foundval = $1;
$foundval =
$foundval =~ m/[0-9$strip]+/
? strip_chars( $foundval, $strip )
: $foundval;
#$foundval ||= 'NULL';
$foundval = $foundval eq '0'
? $foundval
: ( $foundval ||= 'NULL' );
push @row, $foundval;
${ $text } .= " $foundval," if $DEBUG;
${ $inc }++;
}
return @row;
}
sub strip_chars {
my( $text, $chars ) = @_;
$text =~ s/[$chars]//g;
return $text;
}
# this just handles the drudgery of taking the data sructure created,
# applying it to a prepared statement, and telling the database to execute
# every last stinking one of 'em.
sub prep_and_exec {
my( $db_handle, $insert_query, @bindvalues ) = @_;
my $st_handle;
eval { $st_handle = $db_handle->prepare( $insert_query ) };
handle_error( "Prepare \"$insert_query\" failed: $EVAL_ERROR\n" )
if $EVAL_ERROR;
for my $row ( @bindvalues ) {
eval { $st_handle->execute( @{ $row } ) };
handle_error(
'Execute failed ' . join( q{ }, @{ $row } ) . ": $EVAL_ERROR\n" )
if $EVAL_ERROR;
}
$st_handle->finish();
handle_error( "Finish failed: $EVAL_ERROR\n" ) if $EVAL_ERROR;
return;
}
# just formats the current time nicely. generates a new date every time it's
# called in case the list of checks grows to an unmanagable number. er, in
# case the exact time it's run is important.
sub get_date {
my( $sec, $min, $hr, $day, $mo, $year ) = ( localtime time )[ 0 .. 5 ];
$min = sprintf '%02d', $min;
$sec = sprintf '%02d', $sec;
$mo++;
$year += 1900;
my $date = "$year-$mo-$day $hr:$min:$sec";
return $date;
}
# a bit of a misnomer, as it doesn't handle much of anything.
sub handle_error {
my $message = shift;
if( $message =~ m/ conne/ ) {
die $message
}
else {
warn $message
}
return;
}
1;