getopsstats.pl


#!/usr/bin/perl

# the main functionality of this program creates an array of arrays,
# @rows = [ col1, col2, col3 ... ];
# which will be iterated through, assuming that a proper insert statement has
# been crafted  which will match the expected number of columns found
# during parsing (+/- any additional data points needed, i.e. hostname,
# timestamp).

# currently expects HTML data to be presented horizontally; i.e.
# col1 col2 col3 col4 ...
# val1 val2 val3 val4 ...

# make sure to set the 'orientation' to vetical if the HTML data is presented
# vertically; i.e.
# col1 val1
# col2 val2
# col3 val3
# ...

# any good production perl script should work under use strict & use warnings.
use strict;
use warnings;

# modules required for proper functionality
use WWW::Mechanize;
use DBI;

# this is for long names of 'magic' variables.
use English qw(-no_match_vars);

# variables required for setting up database connection
my $database = 'test';
my $host     = '127.0.0.1';
my $port     = 3306;
my $user     = 'root';
my $pass     = q{};
# database connection string
my $dsn      = "DBI:mysql:database=$database;host=$host;port=$port";

# make sure we don't try to go through the proxy, also a timeout value for
# retreiving the web page.
$ENV{ 'http_proxy' } = q{};
$ENV{ 'HTTP_PROXY' } = q{};

my $timeout = 180;
my $mech    = WWW::Mechanize->new( timeout => $timeout );

my $DEBUG = 1;

# the %checks hash is a hash of hashes which stores:
# 1. the URL to scrape, along with
#   a. the corresponding SQL insert statement
#   b. the regex used to find data to scrape.
#   c. when parsing the HTML, how to 'split' the HTML to find unique rows
#   d. what characters to strip out of the string found by the regex
#   e. whether the layout of the web page to scrape is horizontal or vertical.
my %checks = (
          'http://aroxnyme01a.tradearca.com:5000/queues.htm' => {
            'insert' =>
              'insert into queues values( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? );',
            'regex' => q{<TD[^>]*>(.*?)</TD>},
            'split' => qq{\n},
            'strip' => q{,%},
            'orientation' => 'horizontal',
          },
          'http://aroxnyme01b.tradearca.com:5000/queues.htm' => {
            'insert' =>
              'insert into queues values( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? );',
            'regex' => q{<TD[^>]*>(.*?)</TD>},
            'split' => qq{\n},
            'strip' => q{,%},
            'orientation' => 'horizontal',
          },
          'http://aroxnyfe01c.tradearca.com:10001/sub.htm' => {
            'insert' => 'insert into ab_subscribers values'
              . '( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? );',
            'regex' => q{<TD[^>]*>([ a-zA-Z0-9:.,%\n]*?)</TD>},
            'split' => q{TR},
            'strip' => qq{,\%\n},
            'orientation' => 'horizontal',
          },
          'http://aroxnyme01.tradearca.com:5000/headers.htm' => {
            'insert' => 'insert into me_header values'
              . '( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,'
              . ' ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, '
              . '?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);',
            'regex' => q{<TD[^>]*>([ a-zA-Z0-9:.,%\n]*?)</TD>},
            'regex' => q{<TD>.*?</TD><TD>(.*?)</TD>},
            'split' => q{TR},
            'strip' => qq{,\%\n},
            'orientation' => 'vertical',
          },
);

# the assumption is that whomever is populating items in the %checks hash
# knows A) the URL, B) the table layout (num of columns) for the mysql table
# to populate, and C) the regex with which the HTML of the monitor page can be
# parsed for data.

my $dbh;

eval { $dbh = DBI->connect( $dsn, $user, $pass, { 'RaiseError' => 1 } ) };

handle_error( "Failed to connect to $dsn : $EVAL_ERROR\n" ) if $EVAL_ERROR;

do_checks( $dbh, $mech, \%checks );

eval { $dbh->disconnect() };

handle_error( "Failed to disconnect from $dsn : $EVAL_ERROR\n" )
  if $EVAL_ERROR;

# this performs the meat of the work.
# this cycles through the check URLs, grabbing their content, and hands off
# parsing the content to build_rows.  then hands off to prep_and_exec to
# insert the data into the database.
sub do_checks {
  my( $db_handle, $mech, $checklist ) = @_;

  for my $url ( sort keys %{ $checklist } ) {
    my( $hostname, $location, $node );
    ( $hostname, $location, $node ) = ( $1, $2, $3 )
      if $url =~ m{^http://(\w{4}(\w{2})\w+(\w))\.tradearca.*};
    print "hostname: $hostname\tlocation: $location\tnode: $node\n" if $DEBUG;
    $mech->get( $url );

    my $date = get_date();
    my @rows = build_rows( $mech,
                           $checklist->{ $url }{ regex },
                           $checklist->{ $url }{ strip },
                           $hostname,
                           $date,
                           $checklist->{ $url }{ split },
                           $checklist->{ $url }{ orientation }
                         );

    #use Data::Dumper;
    #print Dumper @rows;
    #exit;
    prep_and_exec( $db_handle, $checklist->{ $url }{ insert }, @rows );

  }
  return;
}

# this handles building the structure to insert into the database.
# hands off the actual parsing of HTML data to parse_table.
sub build_rows {
  my( $mech, $regex, $strip, $host, $date, $split, $orientation ) = @_;
  my @rows;
  for my $line ( split $split, $mech->content() ) {
    my $found     = 0;
    my $foundlist = q{};
    my @foundrow;
    if( $orientation eq 'vertical' ){
      push @foundrow, $host, $date if @rows < 1;
      push @foundrow,
        parse_table( $line, $regex, $strip, \$found, \$foundlist );
      push @rows, @foundrow if $found;
    }
    else {
      push @foundrow, $host, $date;
      push @foundrow,
        parse_table( $line, $regex, $strip, \$found, \$foundlist );
      push @rows, [ @foundrow ] if $found;
    }
    print "next row: $foundlist\n" if $DEBUG;
  }
  if( $orientation eq 'vertical' ){
    return [ @rows ];
  }
  else {
    return @rows;
  }
}

# this handles searching through the HTML for a given regex, collecting data
# as it encounters it.
sub parse_table {
  my( $line, $regex, $strip, $inc, $text ) = @_;
  my @row;
  while( $line =~ m{$regex}g ) {
    my $foundval = $1;
    $foundval =
      $foundval =~ m/[0-9$strip]+/
      ? strip_chars( $foundval, $strip )
      : $foundval;
    #$foundval ||= 'NULL';
    $foundval = $foundval eq '0'
              ? $foundval
              : ( $foundval ||= 'NULL' );
    push @row, $foundval;
    ${ $text } .= " $foundval," if $DEBUG;
    ${ $inc }++;
  }
  return @row;
}

sub strip_chars {
  my( $text, $chars ) = @_;
  $text =~ s/[$chars]//g;
  return $text;
}

# this just handles the drudgery of taking the data sructure created,
# applying it to a prepared statement, and telling the database to execute
# every last stinking one of 'em.
sub prep_and_exec {
  my( $db_handle, $insert_query, @bindvalues ) = @_;
  my $st_handle;
  eval { $st_handle = $db_handle->prepare( $insert_query ) };
  handle_error( "Prepare \"$insert_query\" failed: $EVAL_ERROR\n" )
    if $EVAL_ERROR;
  for my $row ( @bindvalues ) {
    eval { $st_handle->execute( @{ $row } ) };
    handle_error(
             'Execute failed ' . join( q{ }, @{ $row } ) . ": $EVAL_ERROR\n" )
      if $EVAL_ERROR;
  }
  $st_handle->finish();
  handle_error( "Finish failed: $EVAL_ERROR\n" ) if $EVAL_ERROR;
  return;
}

# just formats the current time nicely.  generates a new date every time it's
# called in case the list of checks grows to an unmanagable number.  er, in
# case the exact time it's run is important.
sub get_date {
  my( $sec, $min, $hr, $day, $mo, $year ) = ( localtime time )[ 0 .. 5 ];
  $min = sprintf '%02d', $min;
  $sec = sprintf '%02d', $sec;
  $mo++;
  $year += 1900;
  my $date = "$year-$mo-$day $hr:$min:$sec";
  return $date;
}

# a bit of a misnomer, as it doesn't handle much of anything.
sub handle_error {
  my $message = shift;
  if( $message =~ m/ conne/ ) {
    die $message
  }
  else {
    warn $message
  }
  return;
}

1;