#!/usr/bin/perl

##
## timesScraper - Make local HTML copies of the top stories on the 
##                New York Times RSS feed.
## 
## Copyright (c) 2008 by Doug Letterman
##
## This program is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
## 
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.
## 
## You should have received a copy of the GNU General Public License
## along with this program.  If not, see <http://www.gnu.org/licenses/>.
## 
##

use strict;
use LWP::UserAgent;
use HTTP::Cookies::Netscape;
use XML::Simple;
use Getopt::Long;
use Encode;
use HTML::Entities;
use CGI qw/:standard/;
use Pod::Usage;

=head1 NAME

timesScraper - Make local HTML copies of the top stories on the 
New York Times RSS feed.

=head1 SYNOPSIS

Usage: timesScraper [-cookies file] [-wait n] [-help]

=head1 OPTIONS

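-wait n - the maximum number of seconds to wait between fetching pages. The
default maximum is 10. After each page is saved the script waits a random
number of seconds between 0 and n.

-cookies file - the path of a Netscape-format cookies file to send with each
request. When omitted, the script looks for cookies.txt in the default Firefox
profile.

-help - print a usage summary and exit.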

=cut

## Firefox keeps its profiles (and therefore its cookies file) below this
## directory; the Times needs those cookies before it serves full articles.
my $browser_data = "$ENV{HOME}/Library/Application Support/Firefox";

## The RSS feeds to mirror, as [ section name => feed URL ] pairs, in the
## order they should appear on the index page.
my $channels = [
	[ "National"        => "http://www.nytimes.com/services/xml/rss/nyt/National.xml" ],
	[ "International"   => "http://www.nytimes.com/services/xml/rss/nyt/International.xml" ],
	[ "New York Region" => "http://www.nytimes.com/services/xml/rss/nyt/NYRegion.xml" ],
	[ "Washington"      => "http://www.nytimes.com/services/xml/rss/nyt/Washington.xml" ],
	[ "Editorials"      => "http://www.nytimes.com/services/xml/rss/nyt/Opinion.xml" ],
	[ "Business"        => "http://www.nytimes.com/services/xml/rss/nyt/Business.xml" ],
	[ "Science"         => "http://www.nytimes.com/services/xml/rss/nyt/Science.xml" ],
	[ "Magazine"        => "http://www.nytimes.com/services/xml/rss/nyt/Magazine.xml" ],
	[ "Week in Review"  => "http://www.nytimes.com/services/xml/rss/nyt/WeekinReview.xml" ],
];

## Directory that receives index.html and the saved articles.
my $destination = "$ENV{HOME}/Documents/nytimes";

## Upper bound, in seconds, for the random pause between page fetches; the
## pause keeps our requests from looking too unusual. Overridden by -wait.
my $opt_wait = 10;

## Set from the command line by -help and -cookies.
my $opt_help;
my $opt_cookies;

## Parse the command line. An option GetOptions does not recognise prints the
## usage message and exits.
GetOptions(
			'cookies=s' => \$opt_cookies,
			'wait=i' => \$opt_wait,
			'help' => \$opt_help
			) || pod2usage();

pod2usage(
	-msg => "timesScraper - Fetch local copies of top stories on The New York Times",
	-verbose => 1
	) if ($opt_help);

## 'wait=i' happily accepts a negative integer, which would later be handed
## to rand() and sleep(); refuse it here instead.
pod2usage(
	-msg => "timesScraper: -wait must not be negative (got $opt_wait)",
	-verbose => 1,
	-exitval => 2
	) if ($opt_wait < 0);

## Use the cookies file named with -cookies, or else look for one in the
## browser's profile directory. $cookies stays undefined when neither exists.
my $cookies = ($opt_cookies) ? $opt_cookies : getCookies();

## Initialize our user-agent
my $ua = LWP::UserAgent->new;

# pretend we are Firefox
$ua->agent("Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; rv:1.7.3) Gecko/20040913 Firefox/0.10");
## Requests go out without cookies when no cookies file was found.
## autosave => 0 keeps the browser's cookies file from being rewritten.
$ua->cookie_jar(HTTP::Cookies::Netscape->new(file => $cookies, autosave => 0)) if ($cookies);

## Make sure the destination directory exists. Only the last path component
## is created; mkdir() fails when a parent directory is missing.
if (!-e $destination) {
	if (!mkdir($destination)) {
		die("Error making destination directory '$destination': $!")
	}
}

## Start writing the index page. INDEX stays a package filehandle because
## readChannels() and grabChannel() print to it by name.
open(INDEX, '>', "$destination/index.html")
	or die("Error opening '$destination/index.html' for writing: $!");

## Names of the files that belong to this run; cleanup() deletes every other
## entry of the destination directory.
my %saved_files = ("index.html" => 1);

## Build the heading once so the page title and the <h1> always agree.
my $page_title = 'New York Times '.localtime(time);
print INDEX start_html($page_title), h1($page_title);
readChannels();
print INDEX end_html();
## Buffered write errors only show up when the handle is closed.
close(INDEX)
	or warn("Error closing '$destination/index.html': $!");
cleanup();
exit 0;
                                       
## Fetch every RSS feed listed in $channels and pass each parsed feed to
## grabChannel().
##
## Each feed is requested up to three times. Progress and errors are printed
## to STDOUT. An error block is written to the INDEX page only when every
## attempt for a feed has failed, so a retry that succeeds leaves no stale
## error behind. A feed that downloads but cannot be parsed is reported on
## STDOUT and skipped without a retry.
##
## Takes no arguments; reads $channels and $ua and writes to INDEX.
sub readChannels {
	my $max_attempts = 3;

	CHANNEL:
	foreach my $cat ( @{$channels}) {
		my ($name, $url) = @{$cat};
		print "Requesting channel '$name'...\n";
		my $req = HTTP::Request->new(GET => $url);
		$req->header('Accept' => 'text/html');

		## Status line of the most recent failed attempt
		my $last_status;

		ATTEMPT:
		for my $attempt (1..$max_attempts) {
			# send request
			my $res = $ua->request($req);

			# check the outcome
			if (!$res->is_success) {
				$last_status = $res->status_line;
				print "Error: " . $last_status . "\n";
				next ATTEMPT;
			}

			print "Received RSS for '$name'.\n";
			my $channel = eval { XMLin($res->content) };
			if ($@) {
				print "XML Error: ".$@."\n";
			} elsif (!$channel) {
				## XMLin() returned nothing without raising an error
				print "XML Error: empty feed document\n";
			} else {
				grabChannel( $channel );
			}
			## The download worked, so there is nothing left to retry
			next CHANNEL;
		}

		## Only reached when all attempts failed
		print INDEX "<pre>Error requesting '".encode_entities($name)."' (".encode_entities($url)."):\n " .
					encode_entities($last_status) . "\n</pre>\n";
	}
}

## Add one feed to the index page and save a local copy of each article.
##
## $channel is the structure XMLin() built from an RSS document. The code
## reads $channel->{channel}{title} and $channel->{channel}{item}, and from
## each item its 'link', 'title' and 'author'.
##
## For every item the printer-friendly page ("pagewanted=print") is
## requested and stored in $destination under the last path component of the
## item's link. Files that already exist are not fetched again. Every file
## name is recorded in %saved_files so that cleanup() keeps it.
##
## Problems with a single article (no usable link, failed request, repeated
## advertisement page, unwritable file) are reported and the article is
## skipped; the remaining articles are still processed.
sub grabChannel {
	my ($channel) = @_;
	## How often to re-request a page that keeps coming back as an
	## advertisement before giving up on the article
	my $max_fetch_attempts = 5;

	print INDEX h3(encode_entities($channel->{'channel'}->{'title'}));
	print INDEX "<ul>\n";

	## XML::Simple hands back a lone <item> as a hash reference rather than a
	## one-element array, and nothing at all for a feed without items.
	my $items = $channel->{'channel'}->{'item'};
	my @articles = (ref($items) eq 'ARRAY') ? @{$items}
				 : (ref($items) eq 'HASH')  ? ($items)
				 :                            ();

	ARTICLE:
	foreach my $article (@articles) {
		## Change the URL to point to the printer-friendly copy and take the
		## local file name from its last path component
		my ($file, $link);
		if (defined($article->{'link'}) && !ref($article->{'link'})) {
			$link = $article->{'link'};
			if ($link =~ /\/([^\/]+)\?/) {
				$file = $1;
				$link .= "&pagewanted=print";
			} elsif ($link =~ /\/([^\/]+)$/) {
				$file = $1;
				$link .= "?pagewanted=print";
			}
		}
		if (!defined($file)) {
			print "\tskipping an item without a usable link\n";
			next ARTICLE;
		}

		## 'author' may be plain text, a hash or a list, depending on how
		## the element appeared in the feed
		my $author = $article->{'author'};
		my $authorStr = '';
		if (ref($author) eq 'HASH') {
			$authorStr = join('', map { "$_, $author->{$_}" } keys %{$author});
		} elsif (ref($author) eq 'ARRAY') {
			$authorStr = join(', ', @{$author});
		} elsif (defined($author)) {
			$authorStr = $author;
		}
		print INDEX "\t<li><a href=\"".encode_entities($file)."\">".encode_entities($article->{'title'})."</a> ".encode_entities($authorStr)."</li>\n";

		## Remember the name of each file we create
		$saved_files{$file} = 1;
		## If we've already got the file then go on
		if (-e "$destination/$file") {
			print "\talready have $file\n";
			next ARTICLE;
		}

		my $req = HTTP::Request->new(GET => $link);
		$req->header('Accept' => 'text/html');
		my $content;
		## Ask again while we are handed an advertisement page, which is
		## recognised by its meta refresh redirect
		FETCH:
		for my $attempt (1..$max_fetch_attempts) {
			my $res = $ua->request( $req );
			if (!$res->is_success) {
				print "Error: " . $res->status_line . "\n";
				print INDEX "<pre>Error saving ".encode_entities($file).": ".encode_entities($res->status_line)."\n</pre>";
				next ARTICLE;
			}
			my $page = $res->content;
			if ($page !~ /<meta http-equiv=refresh content="\d+;url=/m) {
				$content = $page;
				last FETCH;
			}
		}
		if (!defined($content)) {
			print "Error: only advertisement pages received for $file after $max_fetch_attempts attempts\n";
			print INDEX "<pre>Error saving ".encode_entities($file).": advertisement page received $max_fetch_attempts times\n</pre>";
			next ARTICLE;
		}

		print "\tsaving '".printUtf8($article->{'title'})."'\n";
		## Replace the drop cap image with the letter from its alt text
		$content =~ s/<IMG[^<>]*alt="(\w)"[^<>]*>/$1/m;

		my $path = "$destination/$file";
		my $out;
		if (!open($out, '>', $path)) {
			my $err = "$!";
			print "Error: cannot write $path: $err\n";
			print INDEX "<pre>Error saving ".encode_entities($file).": ".encode_entities($err)."\n</pre>";
			next ARTICLE;
		}
		print {$out} printUtf8($content);
		close($out) or print "Error: closing $path failed: $!\n";

		## Pause for a random time so the requests don't look too unusual
		sleep(rand($opt_wait)) if ($opt_wait > 0);
	}
	print INDEX "</ul>\n";
}


## Delete everything in $destination that this run did not produce.
##
## An entry is kept when its name is a key of %saved_files. Sub-directories
## are left alone, since unlink() cannot remove them anyway. Files that cannot
## be deleted are reported on STDERR.
##
## Takes no arguments. Exits with status 1 when $destination cannot be
## entered; returns without deleting anything when it cannot be listed.
sub cleanup {
	if (!chdir($destination)) {
		print STDERR "Could not change to dir $destination: $!\n";
		exit 1;
	}
	print "Cleaning up...\n";

	## We are inside the destination now, so list '.'; reopening
	## $destination would fail for a relative path.
	my $dir;
	if (!opendir($dir, '.')) {
		print STDERR "Could not read dir $destination: $!\n";
		return;
	}
	my @entries = grep(!/^\.\.?$/, readdir($dir));
	closedir($dir);

	## Go through the directory and delete any files we didn't just create
	for my $entry (@entries) {
		next if (exists($saved_files{$entry}));
		## A real directory (not a symlink to one) is skipped
		next if (-d $entry && !-l $entry);
		print "\tdeleting $entry\n";
		unlink($entry) or print STDERR "\tcould not delete $entry: $!\n";
	}
}

## Return the given string encoded as ISO-8859-1.
## Note that, despite the sub's name, the target encoding is not UTF-8.
sub printUtf8 {
	my ($text) = @_;
	my $latin1 = Encode::encode( 'iso-8859-1', $text );
	return $latin1;
}

## Locate the cookies.txt file of the default Firefox profile.
##
## Reads "$browser_data/profiles.ini" and looks at each section that has a
## line starting with "Name=default". The "Path=" line that follows inside
## the same section names the profile directory. A relative Path is taken
## relative to $browser_data; a Path that is already absolute is used as is.
##
## Returns the path of the first cookies.txt found that way, or undef when
## profiles.ini is missing or unreadable, or no such cookies.txt exists.
sub getCookies {
	my $profiles = "$browser_data/profiles.ini";
	my $found;

	if (-f $profiles) {
		my $ini;
		if (open($ini, '<', $profiles)) {
			## True while we are inside a section named "default..."
			my $in_default = 0;

			LINE:
			while (my $line = <$ini>) {
				## Drop the line ending, including the CR of a CRLF file,
				## so it cannot end up in the captured path
				$line =~ s/[\r\n]+$//;

				if ($line =~ /^\[/) {
					## A new section starts; the previous profile is over
					$in_default = 0;
				} elsif ($line =~ /^Name=default/) {
					$in_default = 1;
				} elsif ($in_default && $line =~ /^Path=(.*)$/) {
					my $path = $1;
					my $cookies = ($path =~ m{^/})
						? $path."/cookies.txt"
						: $browser_data."/".$path."/cookies.txt";
					if (-e $cookies) {
						$found = $cookies;
						last LINE;
					}
				}
			}
			close($ini);
		} else {
			print STDERR "Could not read $profiles: $!\n";
		}
	}
	return $found;
}

