#!/usr/bin/perl
########################################
# This script logs into a google mini search appliance and retrieves the raw log
# files for the previous (or specified) day.
#
# Questions about this script can be directed to:
# Blake Crosby (bcrosby@nm.cbc.ca)
#
# This script was tested on FreeBSD and RHEL and requires the following perl modules:
# libwwwperl (LWP::UserAgent)
# HTML::TokeParser
# HTTP::Cookies
# POSIX (core)
#
# This script was tested on an M2 model Google Mini Search Appliance (Version 4.4.102.M.24)
#
#########################################

use strict;
use warnings;

#################################
# Config section
#
# Turn on or off the debugging output
my $print_debugging_output = 0;
#
# hostname of the google appliance you want to retrieve logs from
# For example: google.hostname.ca
my $googlehost = 'google.hostname.ca';
#
# username and password for the perl script to log in as. I suggest you create a new account specifically
# for this script.
# For example: user = googlescript password = google
#
# Both must be set; the script refuses to run while either is undefined.
my $username = undef;    # e.g. 'googlescript'
my $password = undef;    # e.g. 'google'
#
# the collection name of the logs you want to retrieve. CASE IS IMPORTANT
# for example: webpages
my $collection = 'webpages';
#
# the front end name of the logs you want to retrieve. CASE IS IMPORTANT
# for example: webpages.
# Note: The mini doesn't use the frontend in reporting.
#my $frontend = 'webpages';
#
# File name to store the log file.
# for example googlesearch.log
#
# By default the log date will be used (20051128.log) however you can
# override that by replacing undef on the following line
my $logfilename = undef;    # e.g. 'googlesearch.log'
#
# Prefix for the log file name. Using the default 'gsa_log_' will make the
# filename actually be 'gsa_log_20051128.log', for example.
my $logfilename_prefix = 'gsa_log_';
#
# Format for the log file, "log" or "gz" (text or compressed). Note: the gz
# option likely won't work on windows boxes, unless gnu utils are installed.
my $log_format = 'gz';
#
# Path to where you want to save log files
# for example /var/logs
#
# NOTE: You must **NOT** have a trailing slash!
my $log_output_path = ".";
#
# Path/name for the file to use for the LWP library's cookies. Default should
# usually be fine.
my $cookie_file = '/tmp/lwp_gsa_cookies.txt';

#### END OF CONFIG
#
#
# Don't edit below this line, unless you know what you are doing.
#

# Load libwwwperl
use HTML::TokeParser;
use LWP::UserAgent;
use HTTP::Cookies;
use POSIX 'mktime';

# Percent-encode a value so it can be embedded safely in a query string or
# an application/x-www-form-urlencoded body. Returns the encoded copy.
sub url_encode {
    my ($text) = @_;
    $text =~ s/([^A-Za-z0-9\-_.~])/sprintf('%%%02X', ord($1))/ge;
    return $text;
}

# Build one request and send it through the given user agent.
#   $ua           - LWP::UserAgent to send with
#   $method       - HTTP method ('GET' or 'POST')
#   $url          - absolute URL to request
#   $referrer     - value for the 'Referrer' header (may be '')
#   $form_content - optional urlencoded body; when defined it is sent with
#                   the form content type
# Returns the response object from $ua->request.
sub send_request {
    my ($ua, $method, $url, $referrer, $form_content) = @_;
    my $req = HTTP::Request->new($method => $url);
    # NOTE(review): the standard HTTP request header is spelled 'Referer';
    # the script has always sent 'Referrer', so that is kept here. Confirm
    # against the appliance before changing it.
    $req->header('Referrer' => $referrer);
    if (defined $form_content) {
        $req->content_type('application/x-www-form-urlencoded');
        $req->content($form_content);
    }
    return $ua->request($req);
}

# Print the HTTP status line and abort unless the response was successful.
sub die_unless_success {
    my ($res) = @_;
    return if $res->is_success;
    print "Error: " . $res->status_line . "\n";
    die "Aborting: request failed\n";
}

# Default to grabbing the log for "yesterday." We have to do this using a
# two-step process to get us the right day, despite issues with daylight savings
# and such. (One day a year, 11:30pm minus 24 hours = 12:30am the same day.)
my ($day, $month, $year) = (localtime())[3,4,5];
my $log_date_timestamp = mktime(0, 0, 0, $day - 1, $month, $year);

# Allow two command line args -- the output path, and the date.
my $args_valid = 1;
if (@ARGV > 2) {
    # More than two args? Invalid usage.
    $args_valid = 0;
}
if (@ARGV > 1) {
    # Second arg is date in YYYY-MM-DD format.
    if ($ARGV[1] =~ /^(\d{4})[-\/\.]?(\d{2})[-\/\.]?(\d{2})$/) {
        my ($arg_year, $arg_month, $arg_day) = ($1, $2, $3);
        my $requested;
        # mktime would silently normalise month 13 or day 45 into some other
        # date, so reject out-of-range values up front.
        if ($arg_month >= 1 && $arg_month <= 12 && $arg_day >= 1 && $arg_day <= 31) {
            $requested = mktime(0, 0, 0, $arg_day, $arg_month - 1, $arg_year - 1900);
        }
        if (defined $requested) {
            $log_date_timestamp = $requested;
        }
        else {
            $args_valid = 0;
        }
    }
    else {
        $args_valid = 0;
    }
}
if (@ARGV > 0) {
    # First arg is an output path. Must exist.
    my $temp_path = $ARGV[0];
    # Strip one trailing slash, but leave a lone "/" intact.
    $temp_path =~ s/\/$// if length($temp_path) > 1;
    if (-d $temp_path) {
        $log_output_path = $temp_path;
    }
    else {
        print "Invalid output path: $temp_path\n";
        exit(1);    # Immediate die (don't print usage).
    }
}
unless ($args_valid) {
    my $script_name = $0;
    $script_name =~ s/^.*\///;
    print "Usage: $script_name [/output/path] [YYYY-MM-DD]\n";
    exit(1);
}

# Without credentials the login POST would be sent with empty fields, so stop
# here with a clear message instead.
unless (defined $username && defined $password) {
    die "No username/password set; edit the config section of $0\n";
}

# Get the full month/day/year, including zero-prefixed versions for use in
# the log file name.
($day, $month, $year) = (localtime($log_date_timestamp))[3,4,5];
my $theyear   = $year + 1900;
my $themonth  = $month + 1;
my $theday    = $day;
my $filemonth = sprintf("%02d", $themonth);
my $fileday   = sprintf("%02d", $theday);

# Enforce that the specified log format is valid.
if ($log_format ne 'gz') {
    $log_format = 'log';
}

# If a logfile name was specified, use that, otherwise use date.
if (defined $logfilename && length $logfilename) {
    $logfilename = "$log_output_path/$logfilename";
}
else {
    $logfilename = "$log_output_path/$logfilename_prefix$theyear$filemonth$fileday.$log_format";
}
print "Log will be saved to $logfilename\n" if $print_debugging_output;

# URLs and parameters shared by the requests below.
my $base_url      = "http://$googlehost:8000";
my $controller    = "$base_url/EnterpriseController";
my $collection_qs = url_encode($collection);
my $date_token    = "date_${themonth}_${theday}_${theyear}";

# Set up a LWP robot to interact with the search appliance.
my $ua = LWP::UserAgent->new;
$ua->agent('Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)');
$ua->cookie_jar(HTTP::Cookies->new(file => $cookie_file, autosave => 1));

print "Logging in...\n" if $print_debugging_output;

# get some magic cookies
my $res = send_request($ua, 'GET', "$base_url/", '');

# then load the login page (and get more cookies)
$res = send_request($ua, 'GET', $controller, '');

# let's log into the box :)
$res = send_request(
    $ua, 'POST', $controller, $controller,
    'actionType=authenticateUser'
        . '&userName=' . url_encode($username)
        . '&password=' . url_encode($password)
        . '&login=Login'
);
# Only 4xx/5xx responses are treated as a failed login here; a redirect after
# the POST is not a success status but is not an error either.
if ($res->is_error) {
    print "Error: " . $res->status_line . "\n";
    die "Aborting: login request failed\n";
}

print "Generating report...\n" if $print_debugging_output;

# If we ask for a report, the appliance will automatically generate it, but it'll
# just tell us it's "currently generating" it, until the report is actually ready.
# So we'll just periodically ask for it until it's no longer being generated.
while (1) {
    $res = send_request(
        $ua, 'GET',
        "$controller?actionType=collViewLogs&collection=${collection_qs}&refreshDate=${date_token}",
        ''
    );

    # check the outcome
    die_unless_success($res);

    # Stop polling as soon as the "generating" notice is gone; only wait
    # when the report really is still being built.
    last unless $res->content =~ /Currently generating log for.*Please wait/;

    print "Report not done yet.\n" if $print_debugging_output;
    sleep 10;
}

print "Report done.\n" if $print_debugging_output;
print "Downloading report file...\n" if $print_debugging_output;

# Now that the report is generated, we can download the file.
$res = send_request(
    $ua, 'GET',
    "$controller?actionType=fileExport&fileExport=Export+to+File&fileBrowse=WEB_LOG&fileArgs=${date_token}&collection=${collection_qs}",
    $controller
);

# check the outcome
die_unless_success($res);

# Since the Google appliance gives us the search logs in reverse chronological
# order, we can't just write 'em out to a log. We need to reverse the lines
# first so the log ends up in chronological order.
my @log_lines = reverse(split(/[\r\n]+/, $res->content));

print "Logging out...\n" if $print_debugging_output;

# log out of the box
$res = send_request($ua, 'GET', "$controller?actionType=logout", '');

# Get rid of the cookie file, as we no longer need it. The jar was created
# with autosave => 1, so release the user agent (and with it the jar) first;
# otherwise the jar could write the file again after we unlink it.
undef $ua;
unlink($cookie_file);

# Finally we can write the log lines out to the log file. Depending on the
# log type, we might run it through gzip first, though.
my $out;
if ($log_format eq 'gz') {
    # The file name can come from the command line and is handed to a shell,
    # so wrap it in single quotes and escape any embedded single quote.
    (my $quoted_name = $logfilename) =~ s/'/'\\''/g;
    open($out, '|-', "gzip > '$quoted_name'")
        or die "Cannot start gzip for $logfilename: $!\n";
}
else {
    open($out, '>', $logfilename)
        or die "Cannot open $logfilename for writing: $!\n";
}
print {$out} join("\n", @log_lines) . "\n"
    or die "Cannot write to $logfilename: $!\n";
# Buffered write errors (and a failing gzip) only surface at close time.
close($out)
    or die "Cannot finish writing $logfilename: " . ($! || "exit status $?") . "\n";