#!/usr/bin/perl
##
## timesScraper - Make local HTML copies of the top stories on the
##                New York Times RSS feed.
##
## Copyright (c) 2008 by Doug Letterman
##
## This program is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program.  If not, see <http://www.gnu.org/licenses/>.
##
##

use strict;
use warnings;

use LWP::UserAgent;
use HTTP::Cookies::Netscape;
use XML::Simple;
use Getopt::Long;
use Encode;
use HTML::Entities;
use CGI qw/:standard/;
use Pod::Usage;

=head1 NAME

timesScraper - Make local HTML copies of the top stories on the New York
Times RSS feed.

=head1 SYNOPSIS

Usage: timesScraper [-wait n] [-cookies file] [-help]

=head1 OPTIONS

=over 4

=item B<-wait> n

The maximum number of seconds to wait in between fetching pages. The
default maximum is 10. Fetching waits a random number of seconds from 0
to wait seconds.

=item B<-cookies> file

Path to a Netscape-format cookies file to send with each request. When
omitted, the script looks for cookies.txt in the default Firefox profile.

=item B<-help>

Print this usage message and exit.

=back

=cut

## We need to point to a web browser's cookies file to get access to the Times
my $browser_data = "$ENV{HOME}/Library/Application Support/Firefox";

## Which RSS feeds do we want to grab?  Each entry is [ label, feed URL ].
my $channels = [
    [ "National",        "http://www.nytimes.com/services/xml/rss/nyt/National.xml" ],
    [ "International",   "http://www.nytimes.com/services/xml/rss/nyt/International.xml" ],
    [ "New York Region", "http://www.nytimes.com/services/xml/rss/nyt/NYRegion.xml" ],
    [ "Washington",      "http://www.nytimes.com/services/xml/rss/nyt/Washington.xml" ],
    [ "Editorials",      "http://www.nytimes.com/services/xml/rss/nyt/Opinion.xml" ],
    [ "Business",        "http://www.nytimes.com/services/xml/rss/nyt/Business.xml" ],
    [ "Science",         "http://www.nytimes.com/services/xml/rss/nyt/Science.xml" ],
    [ "Magazine",        "http://www.nytimes.com/services/xml/rss/nyt/Magazine.xml" ],
    [ "Week in Review",  "http://www.nytimes.com/services/xml/rss/nyt/WeekinReview.xml" ],
];

## Where should I save the resulting pages?
my $destination = "$ENV{HOME}/Documents/nytimes";

## The longest wait time between grabbing pages
## So our requests don't look too unusual
## NOTE(review): the option is parsed but nothing in this version of the
## script reads it afterwards.
my $opt_wait = 10;

my ($opt_help, $opt_cookies);

GetOptions(
    'cookies=s' => \$opt_cookies,
    'wait=i'    => \$opt_wait,
    'help'      => \$opt_help,
) || pod2usage();

pod2usage(
    -msg     => "timesScraper - Fetch local copies of top stories on The New York Times",
    -verbose => 1,
) if ($opt_help);

## Use the cookies file specified by the user or find one
my $cookies = ($opt_cookies) ? $opt_cookies : getCookies();

## Initialize our user-agent
my $ua = LWP::UserAgent->new;

# pretend we are Firefox
$ua->agent("Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; rv:1.7.3) Gecko/20040913 Firefox/0.10");

$ua->cookie_jar(HTTP::Cookies::Netscape->new(file => $cookies, autosave => 0))
    if ($cookies);

if (!-e $destination) {
    if (!mkdir($destination)) {
        die("Error making destination directory '$destination': $!");
    }
}

## Start writing the index page.  The handle is a file-scoped lexical so the
## subs below can print to it.
my $index_path = "$destination/index.html";
open(my $index_fh, '>', $index_path)
    or die("Error opening index page '$index_path' for writing: $!");

## Names (relative to $destination) of the files written during this run;
## cleanup() deletes everything else in the directory.
my %saved_files = ('index.html' => 1);

## Compute the title once so the <title> and the heading always agree.
my $page_title = 'New York Times ' . localtime(time);
print {$index_fh} start_html($page_title), h1($page_title);

readChannels();

print {$index_fh} end_html();
close($index_fh)
    or die("Error closing index page '$index_path': $!");

cleanup();

exit 0;

## Read the channels
##
## Requests every feed listed in $channels, trying each one up to three
## times.  A successful response is parsed with XMLin and handed to
## grabChannel(); a failed attempt is reported on STDOUT and in the index
## page.  A response that fails to parse is reported and not retried.
sub readChannels {
    CHANNEL:
    foreach my $cat (@{$channels}) {
        my ($name, $url) = @{$cat};

        print "Requesting channel '$name'...\n";
        my $req = HTTP::Request->new(GET => $url);
        $req->header('Accept' => 'text/html');

        ATTEMPT:
        for (1 .. 3) {
            # send request
            my $res = $ua->request($req);

            # check the outcome
            if (!$res->is_success) {
                print "Error: " . $res->status_line . "\n";

                # The message is going into an HTML page, so escape it and
                # wrap it in a paragraph.
                print {$index_fh} p(
                    encode_entities("Error requesting '$name' ($url):")
                        . br() . "\n"
                        . encode_entities($res->status_line)
                ), "\n";
                next ATTEMPT;
            }

            print "Received RSS for '$name'.\n";

            my $channel = eval { XMLin($res->content) };
            if (!$@ && $channel) {
                grabChannel($channel);
            }
            else {
                print "XML Error: " . $@ . "\n";
            }
            last ATTEMPT;
        }
    }
}

## Write the heading for one parsed feed to the index page.
##
##   $channel - the structure returned by XMLin for the feed; the title is
##              read from $channel->{channel}{title}.
##
## A missing or non-scalar title produces an empty heading rather than a
## fatal dereference error.
sub grabChannel {
    my ($channel) = @_;

    my $title;
    if (ref($channel) eq 'HASH' && ref($channel->{'channel'}) eq 'HASH') {
        $title = $channel->{'channel'}->{'title'};
    }
    $title = '' if (!defined($title) || ref($title));

    # Feed text is untrusted as far as the HTML page is concerned.
    print {$index_fh} h3(encode_entities($title));
    print {$index_fh} "\n";
}

## Delete every entry in $destination that is not recorded in %saved_files.
## Exits with status 1 if the directory cannot be entered or read.
sub cleanup {
    if (!chdir($destination)) {
        print STDERR "Could not change to dir $destination: $!\n";
        exit 1;
    }

    print "Cleaning up...\n";

    my $dest_dh;
    if (!opendir($dest_dh, $destination)) {
        print STDERR "Could not read dir $destination: $!\n";
        exit 1;
    }
    my @entries = grep { !/^\.\.?$/ } readdir($dest_dh);
    closedir($dest_dh);

    ## Go through the directory and delete any files we didn't just create
    for my $entry (@entries) {
        next if exists($saved_files{$entry});

        print "\tdeleting $entry\n";
        # Names are relative; the chdir above makes them resolve correctly.
        unlink($entry)
            or print STDERR "Could not delete $entry: $!\n";
    }
}

## Encode a string for output.
## NOTE(review): despite the name this encodes to ISO-8859-1, not UTF-8,
## and nothing in this version of the script calls it - confirm the intent
## before relying on it.
sub printUtf8 {
    return Encode::encode('iso-8859-1', $_[0]);
}

## Locate the cookies.txt file in the browser's profiles area
##
## Scans profiles.ini under $browser_data for the profile whose name starts
## with "default" and uses the first Path= entry that follows it.  Returns
## the full path to that profile's cookies.txt if the file exists, and
## undef otherwise.
sub getCookies {
    my $profiles = "$browser_data/profiles.ini";
    return undef if (!-f $profiles);

    my $ini_fh;
    if (!open($ini_fh, '<', $profiles)) {
        print STDERR "Could not read $profiles: $!\n";
        return undef;
    }

    my $in_default = 0;
    my $found;
    while (my $line = <$ini_fh>) {
        # Strip the line ending (including a CR from DOS-style files) so it
        # cannot end up in the captured path.
        $line =~ s/\r?\n\z//;

        if (!$in_default) {
            $in_default = 1 if ($line =~ /^Name=default/);
            next;
        }

        if ($line =~ /^Path=(.*)$/) {
            my $candidate = $browser_data . "/" . $1 . "/cookies.txt";
            $found = $candidate if (-e $candidate);
            last;
        }
    }
    close($ini_fh);

    return $found;
}