#!/usr/bin/perl
# newsnow-rss.pl - by James Powell / 2004
# convert http://www.newsnow.co.uk/ pages to RSS feeds
#
# Usage:   newsnow-rss.pl [URL]
#
# Fetches a NewsNow newsfeed page, extracts every headline link
# (href starting with /cgi/NGoto/) together with the timestamp found in
# the following <span class="src">, and prints an RSS 1.0 document on
# STDOUT.
#
# Environment:
#   DEBUG=1  print progress messages and save the fetched page to
#            $HOME/.nn.html
#   DEBUG=2  as above, but read the page from $HOME/.nn.html instead of
#            fetching it (offline development)

use strict;
use warnings;

use Data::Dumper;
use LWP::Simple;
use HTML::Parser;
use XML::RSS;
use POSIX qw(strftime);

my $url = $ARGV[0]
    || 'http://www.newsnow.co.uk/newsfeed/?name=Tottenham+Hotspur';

# {{{ initialize vars

# Debug switches derived from $ENV{DEBUG}.  The numeric comparison is kept
# (so "2", "2.0" ... all select offline mode) but the "isn't numeric"
# warning is silenced for values such as DEBUG=yes.
my $debug     = $ENV{DEBUG} ? 1 : 0;
my $use_cache = do { no warnings 'numeric'; $debug && $ENV{DEBUG} == 2 };

# Local copy of the page used by the debug modes.
my $cache_file = defined $ENV{HOME} ? "$ENV{HOME}/.nn.html" : '.nn.html';

# Parser state, shared between the start/end/text handlers.
my $in_link;           # absolute URL of the <a> currently being parsed
my $recent_in_link;    # URL of the last headline seen, awaiting its date
my $in_title;          # true while inside <title>
my $in_span;           # true while inside <span class="src">
my $title_txt = '';    # page title, used as the channel title

# Today's date in the page's own format (e.g. " 9-Feb-04"); used for
# entries that only show a time.  %y is the calendar year; the ISO
# week-based %g would be wrong for some days around New Year.
my $nowdate = strftime '%e-%b-%y', localtime;

my %links;             # URL => { title => ..., date => "HH:MM D-Mon-YY" }
my @link_order;        # URLs in the order they first appear on the page
my $content;

# for date calcs
my @mons = qw{FILLER Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec};
my $idx = 0;
my %mon2num = map { $_ => $idx++ } @mons;

# }}}

if ($debug) { print "starting\n"; }

if ($use_cache) {
    # offline mode: slurp the previously saved page
    open my $in, '<', $cache_file
        or die "cannot read $cache_file: $!\n";
    local $/ = undef;
    $content = <$in>;
    close $in;
}
else {
    $content = get($url);
}

unless ($content) {
    die "$url - no content\n";
}

if ($debug) { print "parsing\n"; }

# save locally for dev
if ($debug && !$use_cache) {
    # Best effort only: failing to write the cache must not stop the run.
    if (open my $out, '>', $cache_file) {
        print {$out} $content;
        close $out or warn "cannot close $cache_file: $!\n";
    }
    else {
        warn "cannot write $cache_file: $!\n";
    }
}

# {{{ initialize parser and parse
my $p = HTML::Parser->new(
    api_version     => 3,
    start_h         => [\&start, "tagname, attr"],
    end_h           => [\&end,   "tagname"],
    text_h          => [\&text,  "text"],
    marked_sections => 1,
);
$p->parse($content);
$p->eof;
# }}}

print Dumper(\%links) if $debug;

my $rss = XML::RSS->new(version => '1.0');
$rss->channel(
    title       => $title_txt,
    link        => 'http://www.newsnow.co.uk/',
    description => '',
);

# Emit the items in page order rather than in random hash order.
for my $link (@link_order) {
    my $info = $links{$link};
    my %item = (title => $info->{title}, link => $link);

    # Only attach dc:date when the timestamp could be parsed.
    my $dc_date = conv_date($info->{date});
    $item{dc} = { date => $dc_date } if defined $dc_date;

    $rss->add_item(%item);
}

print $rss->as_string;

if ($debug) { print "ending\n"; }

exit;

###### SUBROUTINES ######

# {{{ start - handler for start of tags
# Called by HTML::Parser with the tag name and a hashref of attributes.
# Switches on the state flag matching the element being entered.
sub start {
    my ($tagname, $attr) = @_;

    if (   $tagname eq 'a'
        && defined $attr->{href}
        && $attr->{href} =~ m#^/cgi/NGoto/#)
    {
        # grab link
        $in_link = 'http://www.newsnow.co.uk' . $attr->{href};
    }
    elsif ($tagname eq 'title') {
        $in_title = 1;
    }
    elsif ($tagname eq 'span'
        && defined $attr->{class}
        && $attr->{class} eq 'src')
    {
        $in_span = 1;
    }
    return;
}
# }}}

# {{{ end - handler for end of tags
# Called by HTML::Parser with the tag name; switches the matching state
# flag back off.
sub end {
    my ($tagname) = @_;

    if ($tagname eq 'a') {
        $in_link = undef;
    }
    elsif ($tagname eq 'title') {
        $in_title = undef;
    }
    elsif ($tagname eq 'span') {
        $in_span = undef;
    }
    return;
}
# }}}

# {{{ text - handler for text section
# Called by HTML::Parser with each piece of text.  Depending on the
# current state the text is a headline, the page title, or the
# "source time [date]" line belonging to the most recent headline.
sub text {
    my ($origtext) = @_;

    if ($in_link) {
        # store title in hash, remembering the first-seen order
        push @link_order, $in_link unless exists $links{$in_link};
        $links{$in_link}->{title} = $origtext;
        $recent_in_link = $in_link;
    }
    elsif ($in_title) {
        $title_txt = $origtext;
    }
    elsif ($in_span) {
        # store date in hash
        # format: 00:40 9-Feb-04   (the date part is optional)
        my ($source, $time1, $time2, $date) = $origtext
            =~ m/^(.*) (\d\d):(\d\d) ?(\d\d?-\w\w\w-\d\d)?$/;

        # Ignore text that is not a timestamp, and timestamps that do not
        # follow a headline; otherwise a junk item would be created.
        return unless defined $time1 && defined $recent_in_link;

        my $time = "$time1:$time2";
        $date = $nowdate unless $date;    # time only => today

        $links{$recent_in_link}->{date} = "$time $date";
        print "$source - TIME: $time $date\n" if $debug;
        $recent_in_link = undef;
    }
    return;
}
# }}}

# {{{ conv_date - put date in dc format
# Takes a string such as "14:56 1-Mar-04" and returns it in the form
# "2004-03-01T14:56+00:00".  Returns undef when the input is undefined
# or does not have the expected shape.
# NOTE(review): the offset is always written as +00:00 and two-digit
# years are taken to be 20xx - confirm this suits the source page.
sub conv_date {
    my $date = shift;
    return unless defined $date;

    my ($time, $day, $month, $year)
        = $date =~ /^(\d\d:\d\d) *(\d\d?)-(\w\w\w)-(\d\d)$/;
    return unless defined $time && $mon2num{$month};

    $year += 2000;
    my $outstr = sprintf "%s-%02d-%02dT%s+00:00",
        $year, $mon2num{$month}, $day, $time;

    if ($debug) { print "$date - $month - OUT: $outstr\n"; }
    return $outstr;
}
# }}}