#!/usr/bin/perl -w use HTML::Parse; use HTML::FormatText; use HTML::TableContentParser; use POSIX qw(strftime); use Time::ParseDate; use strict; # Ineffieicnt, but hey, it's a hack anyway. my @html = <>; my $html = "@html"; my $ft = HTML::FormatText->new(); my $tables = HTML::TableContentParser->new()->parse($html); # Clear out the tables used for layout (bastards). shift @$tables; shift @$tables; pop @$tables; # "31 (WEEKDAY) : HALETHORPE / UMBC / PENN STATION" my $title = shift @$tables; my (%times); my (@days, @directions, @timepoints); my (%seendays, %seendirs, %seentps); # Parse. my ($t); for $t (@$tables) { my ($day, $dir, $timepoint, $tprow, $cell); my ($r, $c); my (@tmppoints, @tpdir); # "Weekday" $day = shift @{$t->{rows}}; # "TO PENN STATION" $dir = shift @{$t->{rows}}; # time point names $tprow = shift @{$t->{rows}}; # Indication of whether time is leaving or arriving. Useless for this app shift @{$t->{rows}}; # Clean up day and direction for use. $day = $ft->format(parse_html(${$day->{cells}}[0]->{data})); $dir = $ft->format(parse_html(${$dir->{cells}}[0]->{data})); chomp $day; chomp $dir; $day =~ s/^\s+|\s+$//g; $dir =~ s/^\s+|\s+$//g; if (not $seendays{$day}) { push @days, $day; $seendays{$day}++; } if (not $seendirs{$dir}) { push @directions, $dir; $seendirs{$dir}++; } # Last entry is "attributes". pop @{$tprow->{cells}}; for $c (@{$tprow->{cells}}) { $cell = HTML::FormatText->new->format(parse_html($c->{data})); chomp($cell); $cell =~ s/\band\b/&/gi; $cell =~ s/(\w+)/\u\L$1/g; $cell =~ s/^\s+|\s+$//g; $cell =~ s/\.+$//g; if (not $seentps{$cell}) { push @timepoints, $cell; $seentps{$cell}++; } push @tpdir, $cell; } # Each row is one line from the schedule. for $r (@{$t->{rows}}) { # Last cell of the row is attributes, so not a valid stop. pop @{$r->{cells}}; @tmppoints = @tpdir; for $c (@{$r->{cells}}) { $timepoint = shift @tmppoints; $cell = HTML::FormatText->new->format(parse_html($c->{data})); chomp($cell); $cell =~ s/^\s+|\s+$//g; if ($cell eq "") { $cell = "-----"; } else { if (parsedate($cell) == 0) { print STDERR "Don't like '$cell'.\ndir: $dir\nday: $day\ntimepoint: $timepoint\n"; } $cell = POSIX::strftime('%H:%M', localtime(parsedate($cell))); } push @{$times{$dir}{$day}{$timepoint}}, $cell; } } } # Sort the timepoints. The order is dictated by the first direction; order of # the second direction is swapped so both arrays point the same way. @timepoints = sort { my $result = cmptimes($a, $b, $directions[0]); if ($result == 0) { $result = cmptimes($b, $a, $directions[1]); } $result; } @timepoints; # Start output. # Time points are easy now. print join("\t", @timepoints), "\n"; # As are days. print join("\t", @days), "\n"; my $trips = ""; my @startlines; my $line = 4; my $dir = $directions[0]; for my $day (@days) { # Remember where this block of data started. push @startlines, $line; while (more_data($dir, $day)) { for my $tp (@timepoints) { if (exists $times{$dir}{$day}{$tp}) { $trips .= shift(@{$times{$dir}{$day}{$tp}}) . "\t"; } else { $trips .= "-----\t"; } } chop($trips); $trips .= "\n"; $line++; } } # Kinda bad, but it'd be more work to actually write a subroutine for it. $dir = $directions[1]; @timepoints = reverse @timepoints; for my $day (@days) { # Remember where this block of data started. push @startlines, $line; while (more_data($dir, $day)) { for my $tp (@timepoints) { if (exists $times{$dir}{$day}{$tp}) { $trips .= shift(@{$times{$dir}{$day}{$tp}}) . "\t"; } else { $trips .= "-----\t"; } } chop($trips); $trips .= "\n"; $line++; } } print join("\t", @startlines), "\n"; print $trips; sub cmptimes { my ($a, $b, $dir) = @_; my $i; my $day = $days[0]; if (exists $times{$dir}{$day}{$a} and exists $times{$dir}{$day}{$b}) { for ($i = 0; $i < $#{$times{$dir}{$day}{$a}}; $i++) { if (${$times{$dir}{$day}{$a}}[$i] ne "-----" and ${$times{$dir}{$day}{$b}}[$i] ne "-----") { last; } } if ($i < $#{$times{$dir}{$day}{$a}}) { # Because of the format of the data, string compare will work. return ${$times{$dir}{$day}{$a}}[$i] cmp ${$times{$dir}{$day}{$b}}[$i]; } } return 0; } sub more_data { my ($dir, $day) = @_; my @keys = keys %{$times{$dir}{$day}}; my $count = @{$times{$dir}{$day}{$keys[0]}}; return $count > 0; }