#!/usr/bin/perl
##
## timesScraper - Make local HTML copies of the top stories on the
## New York Times RSS feed.
##
## Copyright (c) 2008 by Doug Letterman
##
## This program is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program. If not, see <http://www.gnu.org/licenses/>.
##
##
use strict;
use LWP::UserAgent;
use HTTP::Cookies::Netscape;
use XML::Simple;
use Getopt::Long;
use Encode;
use HTML::Entities;
use CGI qw/:standard/;
use Pod::Usage;
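
=head1 NAME

timesScraper - Make local HTML copies of the top stories on the
New York Times RSS feed.

=head1 SYNOPSIS

timesScraper [-wait n] [-cookies file] [-help]

=head1 OPTIONS

-wait n - the maximum number of seconds to wait in between fetching
pages. The default maximum is 10. After each page is saved the script
sleeps a random number of seconds from 0 to n; a value of 0 disables
the wait.

-cookies file - a Netscape-format cookies file to send with each request.
If omitted, the script looks for cookies.txt in the default Firefox profile.

-help - print this usage message and exit.

=cut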
## We need to point to a web browser's cookies file to get access to the Times
my $browser_data = "$ENV{HOME}/Library/Application Support/Firefox";
## Which RSS feeds do we want to grab?
## Each entry is [ section name, feed URL ].
my $channels = [
    [ "National", "http://www.nytimes.com/services/xml/rss/nyt/National.xml" ],
    [ "International", "http://www.nytimes.com/services/xml/rss/nyt/International.xml" ],
    [ "New York Region", "http://www.nytimes.com/services/xml/rss/nyt/NYRegion.xml" ],
    [ "Washington", "http://www.nytimes.com/services/xml/rss/nyt/Washington.xml" ],
    [ "Editorials", "http://www.nytimes.com/services/xml/rss/nyt/Opinion.xml" ],
    [ "Business", "http://www.nytimes.com/services/xml/rss/nyt/Business.xml" ],
    [ "Science", "http://www.nytimes.com/services/xml/rss/nyt/Science.xml" ],
    [ "Magazine", "http://www.nytimes.com/services/xml/rss/nyt/Magazine.xml" ],
    [ "Week in Review", "http://www.nytimes.com/services/xml/rss/nyt/WeekinReview.xml" ]
];
## Where should I save the resulting pages?
my $destination = "$ENV{HOME}/Documents/nytimes";
## The longest wait time between grabbing pages
## So our requests don't look too unusual
my $opt_wait = 10;
my ($opt_help, $opt_cookies);
GetOptions(
    'cookies=s' => \$opt_cookies,
    'wait=i'    => \$opt_wait,
    'help'      => \$opt_help
) || pod2usage();
pod2usage(
    -msg     => "timesScraper - Fetch local copies of top stories on The New York Times",
    -verbose => 1
) if ($opt_help);
## Use the cookies file specified by the user or find one
my $cookies = ($opt_cookies) ? $opt_cookies : getCookies();
## Initialize our user-agent
my $ua = LWP::UserAgent->new;
# pretend we are Firefox
$ua->agent("Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; rv:1.7.3) Gecko/20040913 Firefox/0.10");
## Only attach a cookie jar when we actually have a cookies file
$ua->cookie_jar(HTTP::Cookies::Netscape->new(file => $cookies, autosave => 0)) if ($cookies);
if (!-e $destination) {
    if (!mkdir($destination)) {
        die("Error making destination directory '$destination': $!")
    }
}
## Start writing the index page
## INDEX stays a bareword handle because readChannels() and grabChannel()
## print to it by that name.  Stop now if it cannot be opened: otherwise
## every later print to INDEX would be silently lost.
my $index_path = "$destination/index.html";
open(INDEX, '>', $index_path)
    or die("Error opening index page '$index_path': $!");
## Names of the files written during this run; cleanup() deletes the rest
my %saved_files = ("index.html" => 1);
## Build the title once so the page title and the heading always agree
my $page_title = 'New York Times '.localtime(time);
print INDEX start_html($page_title), h1($page_title);
readChannels();
print INDEX end_html();
## Buffered write errors only surface at close, so report them
close(INDEX) or warn("Error closing index page '$index_path': $!\n");
cleanup();
exit 0;
## Read the channels
##
## For every [ name, url ] pair in $channels: request the RSS feed (up to
## three attempts), parse it with XML::Simple and pass the parsed structure
## to grabChannel().  Progress is printed to STDOUT; each failed request is
## also recorded on the INDEX page.  Takes no arguments.
sub readChannels {
    CHANNEL:
    foreach my $cat ( @{$channels} ) {
        print "Requesting channel '$cat->[0]'...\n";
        my $req = HTTP::Request->new(GET => $cat->[1]);
        $req->header('Accept' => 'text/html');

        ATTEMPT:
        for (1..3) {
            # send request
            my $res = $ua->request($req);

            # check the outcome; a failed request is logged and retried
            if (!$res->is_success) {
                print "Error: " . $res->status_line . "\n";
                print INDEX "
Error requesting '$cat->[0]' ($cat->[1]):\n " .
                    $res->status_line . "\n
\n";
                next ATTEMPT;
            }

            print "Received RSS for '$cat->[0]'.\n";
            my $channel;
            ## ForceArray keeps 'item' an array reference even when the feed
            ## holds a single story; without it XML::Simple returns a hash
            ## reference and grabChannel() dies dereferencing it as an array.
            ## The trailing 1 makes the eval's own result a reliable success
            ## flag.
            my $parsed_ok = eval {
                $channel = XMLin($res->content, ForceArray => ['item']);
                1;
            };
            if ( $parsed_ok && $channel ) {
                grabChannel( $channel );
            } else {
                print "XML Error: ".$@."\n";
            }
            ## A parse failure is not retried: the same bytes would fail again
            next CHANNEL;
        }
    }
}
## Write one feed's section to the index page and save a local copy of each
## article the feed lists.
##
## Argument: $channel - the parsed feed; this sub reads
## $channel->{'channel'}->{'title'} and the list in
## $channel->{'channel'}->{'item'}.
##
## Uses the file-level INDEX handle, $ua, $destination, $opt_wait and
## %saved_files.
sub grabChannel {
my ($channel) = @_;
print INDEX h3($channel->{'channel'}->{'title'});
print INDEX "\n";
ARTICLE:
foreach my $article (@{$channel->{'channel'}->{'item'}}) {
## Change the URL to point to the printer-friendly copy
## The last path segment of the link is captured as the local file name.
my $file;
if ($article->{'link'} =~ /\/([^\/]+)\?/) {
$file = $1;
$article->{'link'} .= "&pagewanted=print";
} elsif ( $article->{'link'} =~ /\/([^\/]+)$/) {
$file = $1;
$article->{'link'} .= "?pagewanted=print";
}
#print "Link: $article->{'link'}\n";
## The author may be a plain string or a hash reference; a hash is
## flattened into "key, value" text.
my $authorStr;
if (ref($article->{'author'})) {
while(my($key,$value) = each %{$article->{'author'}}) {
$authorStr .= "$key, $value";
}
} else { $authorStr = $article->{'author'}; }
print INDEX "\t- ".encode_entities($article->{'title'})." ".$authorStr."
\n";
## Remember the name of each file we create
## NOTE(review): if the link matched neither pattern above, $file is still
## undef at this point - confirm whether such links can occur.
$saved_files{$file} = 1;
## If we've already got the file then go on
if (-e "$destination/$file") {
print "\talready have $file\n";
next;
}
my $req = HTTP::Request->new(GET => $article->{'link'});
$req->header('Accept' => 'text/html');
my $content;
## Start looping in case of advertisement page...
## NOTE(review): the loop condition that closes this do-block and the
## substitution pattern right after it look truncated in this copy of the
## file (text seems to be missing from inside both patterns).  Restore them
## from a pristine copy before relying on this section.
do {
my $res = $ua->request( $req );
if (!$res->is_success) {
print "Error: " . $res->status_line . "\n";
print INDEX "
Error saving $file: ".$res->status_line."\n
";
## A do-block is not a loop for loop-control purposes, so this next
## advances the enclosing ARTICLE loop, i.e. the article is skipped.
next;
}
$content = $res->content
## Test to see if we grabbed an advertisement page
} while ( $content =~ /{'title'})."'\n";
## Replace the drop cap image
$content =~ s/
]*alt="(\w)"[^<>]*>/$1/m;
open(OF, ">$destination/$file");
print OF printUtf8($content);
close(OF);
## Pause a random amount of time, up to $opt_wait seconds, between articles
sleep(rand($opt_wait)) if ($opt_wait);
}
print INDEX "\n";
}
## Delete every file in $destination that was not written during this run.
##
## A file counts as written if its name is a key of %saved_files.  The
## working directory is changed to $destination; if that fails the script
## exits with status 1.  Deletions are reported on STDOUT, failures on
## STDERR.  Takes no arguments.
sub cleanup {
    if (!chdir($destination)) {
        print STDERR "Could not change to dir $destination: $!\n";
        exit 1;
    }
    print "Cleaning up...\n";

    ## We are already inside the destination directory, so read '.' rather
    ## than re-resolving $destination (which would fail for a relative path).
    my $dest_dir;
    if (!opendir($dest_dir, '.')) {
        print STDERR "Could not read dir $destination: $!\n";
        return;
    }
    my @entries = grep { !/^\.\.?$/ } readdir($dest_dir);
    closedir($dest_dir);

    ## Go through the directory and delete any files we didn't just create
    ENTRY:
    for my $entry (@entries) {
        next ENTRY if exists($saved_files{$entry});
        ## unlink cannot remove directories, so leave real ones alone
        next ENTRY if -d $entry && !-l $entry;
        print "\tdeleting $entry\n";
        unlink($entry)
            or print STDERR "\tcould not delete $entry: $!\n";
    }
    return;
}
## Return the given text encoded as ISO-8859-1 bytes.
##
## Despite the sub's name the target encoding is ISO-8859-1, not UTF-8.
## Argument: the text to encode.  Returns: the encoded string.
sub printUtf8 {
    my ($text) = @_;
    my $latin1_bytes = Encode::encode( 'iso-8859-1', $text );
    return $latin1_bytes;
}
## Locate the cookies.txt file in the browser's profiles area.
##
## Reads "$browser_data/profiles.ini", finds the profile whose Name starts
## with "default", and checks the directory named by that profile's Path
## entry for a cookies.txt file.
##
## Returns the full path to cookies.txt, or undef when profiles.ini is
## missing or unreadable, no default profile is listed, or the profile has
## no cookies.txt.  Takes no arguments.
sub getCookies {
    my $profiles = "$browser_data/profiles.ini";
    return undef unless -f $profiles;

    my $profiles_fh;
    if (!open($profiles_fh, '<', $profiles)) {
        print STDERR "Could not open $profiles: $!\n";
        return undef;
    }

    my $found;
    my $in_default = 0;
    LINE:
    while (my $line = <$profiles_fh>) {
        ## Drop the line ending (LF or CRLF) so it never ends up in the path
        $line =~ s/\r?\n\z//;

        if ($line =~ /^Name=default/) {
            $in_default = 1;
            next LINE;
        }
        next LINE unless $in_default;

        ## A new [section] header ends the default profile's settings, so a
        ## Path belonging to some other profile is never picked up.
        last LINE if $line =~ /^\[/;

        if ($line =~ /^Path=(.*)$/) {
            my $candidate = $browser_data."/".$1."/cookies.txt";
            if (-e $candidate) {
                $found = $candidate;
                last LINE;
            }
        }
    }
    ## Close exactly once, after reading is finished
    close($profiles_fh);

    return $found;
}