#!/usr/bin/perl
########################################
# This script logs into a google mini search appliance and retrieves the raw log
# files for the previous (or specified) day.
#
# Questions about this script can be directed to:
# Blake Crosby (bcrosby@nm.cbc.ca)
#
# This script was tested on FreeBSD and RHEL and requires the following perl modules:
# libwwwperl (LWP::UserAgent)
# HTML::TokeParser
# HTTP::Cookies
# POSIX (core)
#
# This script was tested on an M2 model Google Mini Search Appliance (Version 4.4.102.M.24)
#
#########################################


#################################
# Config section
#
# Everything in this section is a plain package variable that the rest of the
# script reads.  Edit the values here; nothing below the "END OF CONFIG"
# marker should need to change.
#

# Turn on or off the debugging output (progress messages printed to STDOUT).
# Any true value enables it.
$print_debugging_output = 0;


#
# Hostname of the Google appliance you want to retrieve logs from.
# The script talks to the appliance over http on port 8000.
# For example: google.hostname.ca

$googlehost = 'google.hostname.ca';

#
# Username and password for the perl script to log in as. I suggest you create
# a new account specifically for this script.
# For example: user = googlescript password = google
#
# NOTE: both lines are commented out by default.  Until you uncomment and set
# them, the login request is sent with an empty userName and password.
#

#$username = 'googlescript';
#$password = 'google';

#
# The collection name of the logs you want to retrieve. CASE IS IMPORTANT.
# The value is inserted as-is into the request URLs.
# For example: webpages

$collection = 'webpages';

#
# The front end name of the logs you want to retrieve. CASE IS IMPORTANT.
# For example: webpages
# Note: The mini doesn't use the frontend in reporting, and nothing in this
# script reads $frontend; it is kept here for reference only.

#$frontend = 'webpages';

#
# File name to store the log file.
# For example: googlesearch.log
#
# By default the name is built from the prefix below, the log date and the
# log format (gsa_log_20051128.gz); you can override that by uncommenting the
# following line.  The name is used exactly as given: if $log_format is 'gz'
# the contents are still gzip-compressed, whatever extension you choose here.

#$logfilename = 'googlesearch.log';

#
# Prefix for the log file name.  Using the default 'gsa_log_' will make the
# filename actually be 'gsa_log_20051128.log', for example.  Only used when
# $logfilename is not set.

$logfilename_prefix = 'gsa_log_';

#
# Format for the log file, "log" or "gz" (text or compressed).  Any value
# other than "gz" is treated as "log".  Note: the gz option pipes the output
# through an external "gzip" command, so it likely won't work on windows
# boxes unless gnu utils are installed.

$log_format = 'gz';


#
# Path to where you want to save log files.
# For example: /var/logs
# Can be overridden by the first command-line argument.
#
# NOTE: You must **NOT** have a trailing slash!

$log_output_path = ".";

#
# Path/name for the file to use for the LWP library's cookies.  Default should
# usually be fine.  The file is deleted again once the script has logged out.

$cookie_file = '/tmp/lwp_gsa_cookies.txt';


#### END OF CONFIG

#
#
# Don't edit below this line, unless you know what you are doing.
#


# Load libwwwperl
use HTML::TokeParser;
use LWP::UserAgent;
use HTTP::Cookies;
use POSIX 'mktime';


# The default target is yesterday's log.  Subtracting 24 hours from "now" can
# land on the wrong calendar day around a daylight-saving change, so instead
# we take today's calendar date and ask mktime() for midnight on "day of
# month minus one", letting it normalise the result (e.g. day 0 of a month).
my @now_fields = localtime();
($day, $month, $year) = @now_fields[3 .. 5];
$log_date_timestamp = mktime(
	0, 0, 0,       # 00:00:00 local time
	$day - 1,      # yesterday's day of month
	$month,        # zero-based month, as returned by localtime()
	$year          # years since 1900, as returned by localtime()
);


# Allow two command line args -- the output path, and the date.
#
#   $ARGV[0]  output directory; must already exist.  A single trailing slash
#             is removed, so "/" becomes "" and files end up directly under
#             the root directory.
#   $ARGV[1]  log date as YYYY-MM-DD; the separators may be '-', '/', '.' or
#             omitted.  Month must be 01-12 and day 01-31.
#
# On success $log_output_path and/or $log_date_timestamp are overridden.
# Problems are reported on STDERR and the script exits with status 1, so that
# callers such as cron can tell a usage error from a successful run.
$args_valid = 1;

if (@ARGV > 2) {           # More than two args?  Invalid usage.
	$args_valid = 0;
}
if (@ARGV > 1) {           # Second arg is date in YYYY-MM-DD format.
	if ($ARGV[1] =~ /^(\d{4})[-\/\.]?(\d{2})[-\/\.]?(\d{2})$/) {
		my ($arg_year, $arg_month, $arg_day) = ($1, $2, $3);
		# mktime() silently normalises out-of-range fields (month 13 becomes
		# January of the next year), which would fetch a different day than
		# the one typed.  Reject such input instead.
		if ($arg_month >= 1 && $arg_month <= 12 && $arg_day >= 1 && $arg_day <= 31) {
			$log_date_timestamp = mktime(0,0,0,$arg_day,$arg_month-1,$arg_year-1900);
		} else {
			$args_valid = 0;
		}
	} else {
		$args_valid = 0;
	}
}
if (@ARGV > 0) {           # First arg is an output path.  Must exist.
	$temp_path = $ARGV[0];
	# Test the path as given, and only then strip the trailing slash;
	# stripping first would turn "/" into "" and reject the root directory.
	if (-d $temp_path) {
		$temp_path =~ s/\/$//;
		$log_output_path = $temp_path;
	} else {
		print STDERR "Invalid output path: $temp_path\n";
		exit(1); # Immediate exit (don't print usage).
	}
}

unless ($args_valid) {
	$script_name = $0;
	$script_name =~ s/^.*\///;     # basename of the script for the usage line
	print STDERR "Usage: $script_name [/output/path] [YYYY-MM-DD]\n";
	exit(1);
}



# Break the selected timestamp back into calendar fields.  localtime() gives
# a zero-based month and a year counted from 1900, so both are adjusted to
# their human-readable values; the month and day are additionally kept in
# zero-padded, two-digit form for use in the log file name.

my @log_date_fields = localtime($log_date_timestamp);
$day   = $log_date_fields[3];
$month = $log_date_fields[4];
$year  = $log_date_fields[5];

$theday   = $day;
$themonth = $month + 1;
$theyear  = $year + 1900;

($filemonth, $fileday) = map { sprintf("%02d", $_) } ($themonth, $theday);


# Only 'gz' selects compressed output; every other value means plain text.
$log_format = 'log' unless $log_format eq 'gz';

# An explicitly configured file name wins; otherwise the name is assembled
# from the prefix, the log date (YYYYMMDD) and the format as the extension.
# Either way the file goes into the output directory.
$logfilename = $logfilename
	? $log_output_path . '/' . $logfilename
	: $log_output_path . '/' . $logfilename_prefix . $theyear . $filemonth . $fileday . '.' . $log_format;

if ($print_debugging_output) {
	print "Log will be saved to $logfilename\n";
}

# Set up a LWP robot to interact with the search appliance.  It identifies
# itself as a desktop browser and keeps its session cookies in $cookie_file
# (autosave writes the jar to disk when it is destroyed).
$ua = LWP::UserAgent->new;
$ua->agent('Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)');

$ua->cookie_jar(HTTP::Cookies->new(file => $cookie_file, autosave => 1));

print "Logging in...\n" if $print_debugging_output;

# NOTE(review): the header name used throughout is 'Referrer', which is sent
# verbatim and is not the standard HTTP 'Referer' header.  It is left as-is
# because that is what the script has been run against; confirm before
# changing it.

#get some magic cookies
$req = HTTP::Request->new(GET => "http://$googlehost:8000/");
$req->header('Referrer' => '');
$res = $ua->request($req);

#then load the login page (and get more cookies)
$req = HTTP::Request->new(GET => "http://$googlehost:8000/EnterpriseController");
$req->header('Referrer' => '');
$res = $ua->request($req);

# The credentials are commented out in the config section by default; point
# that out rather than silently posting an empty user name and password.
unless (defined $username && defined $password) {
	warn "Warning: \$username and/or \$password is not set; logging in with empty credentials.\n";
}

#let's log into the box :)
# post() form-encodes every value, so user names or passwords containing
# characters such as '&', '=', '%', '+' or spaces reach the appliance intact
# instead of corrupting the request body.  It also sets the
# application/x-www-form-urlencoded content type.  An array ref (rather than
# a hash ref) keeps the fields in this order.
$res = $ua->post(
	"http://$googlehost:8000/EnterpriseController",
	[
		actionType => 'authenticateUser',
		userName   => (defined $username ? $username : ''),
		password   => (defined $password ? $password : ''),
		login      => 'Login',
	],
	'Referrer' => "http://$googlehost:8000/EnterpriseController",
);

print "Generating report...\n" if $print_debugging_output;

# If we ask for a report, the appliance will automatically generate it, but it'll
# just tell us it's "currently generating" it, until the report is actually ready.
# So we'll just periodically ask for it until it's no longer being generated.
#
# The loop leaves as soon as a response no longer carries the "currently
# generating" notice, so the 10 second pause and the "not done yet" message
# only happen when another poll is really needed.  Any unsuccessful HTTP
# response aborts the script.
my $report_url = "http://$googlehost:8000/EnterpriseController?actionType=collViewLogs&collection=${collection}&refreshDate=date_${themonth}_${theday}_${theyear}";

while (1) {
	$req = HTTP::Request->new(GET => $report_url);
	$req->header('Referrer' => '');
	$res = $ua->request($req);

	#check the outcome
	if (!($res->is_success)) {
		print "Error: " . $res->status_line . "\n";
		die;
	}

	$output = $res->content;

	# Ready as soon as the "please wait" notice is gone.
	last unless $output =~ /Currently generating log for.*Please wait/;

	print "Report not done yet.\n" if $print_debugging_output;
	sleep 10;
}

print "Report done.\n" if $print_debugging_output;


if ($print_debugging_output) {
	print "Downloading report file...\n";
}

# With the report generated, request the export of the raw web log for the
# chosen date and collection.
my $export_url = "http://$googlehost:8000/EnterpriseController"
	. "?actionType=fileExport&fileExport=Export+to+File&fileBrowse=WEB_LOG"
	. "&fileArgs=date_${themonth}_${theday}_${theyear}&collection=${collection}";

$req = HTTP::Request->new(GET => $export_url);
$req->header('Referrer' => "http://$googlehost:8000/EnterpriseController");
$res = $ua->request($req);

# Give up if the appliance did not answer with a success status.
unless ($res->is_success) {
	print "Error: " . $res->status_line . "\n";
	die;
}

# The appliance hands the search log back newest entry first.  Split the body
# into lines (any run of CR/LF characters counts as one separator) and flip
# the order so the saved log reads chronologically.
my $export_body = $res->content;
@log_lines = reverse split(/[\r\n]+/, $export_body);

if ($print_debugging_output) {
	print "Logging out...\n";
}

# End the session on the appliance.  The response is stored but not examined.
my $logout_url = "http://$googlehost:8000/EnterpriseController?actionType=logout";
$req = HTTP::Request->new(GET => $logout_url);
$req->header('Referrer' => '');
$res = $ua->request($req);

# The session is over, so the cookie file on disk is no longer needed.
unlink $cookie_file;


# Finally we can write the log lines out to the log file.  Depending on the
# log type, we might run it through gzip first, though.
#
# Every step is checked and the script dies with a message on failure, so a
# missing gzip, an unwritable directory or a full disk no longer passes
# silently with exit status 0.
my $out_fh;
if ($log_format eq 'gz') {
	# The command line is run by the shell, so wrap the file name in single
	# quotes (escaping any embedded ones).  Otherwise a path containing
	# spaces or shell metacharacters would be split or interpreted.
	(my $shell_safe_name = $logfilename) =~ s/'/'\\''/g;
	open($out_fh, '|-', "gzip > '$shell_safe_name'")
		or die "Cannot start gzip to write $logfilename: $!\n";
} else {
	open($out_fh, '>', $logfilename)
		or die "Cannot open $logfilename for writing: $!\n";
}

print {$out_fh} join("\n", @log_lines)."\n"
	or die "Cannot write to $logfilename: $!\n";

# For a pipe, close() also waits for gzip and returns false if it exited with
# a non-zero status; in that case $! is 0 and the status is in $?.
close($out_fh)
	or die "Error finishing $logfilename" . ($! ? ": $!" : " (gzip exit status $?)") . "\n";



