#!/usr/bin/perl
# parse.pl:
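#   input: a Web page (HTML) with links to images, read from the file
#          named on the command line or from standard input
#   output: a listing on standard output, one "URL<TAB>title" line for
#           each link that points at a JPEG image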
#
# Each input file is read in one gulp and scanned for <a href=...> links.
# Only links whose URL names a JPEG are listed; the link text is reduced
# to a single line of plain text so it is safe to put after the tab.

use strict;
use warnings;

# Matches one complete <a ... href=URL ...>text</a> element.
#   capture 1: URL given in double quotes
#   capture 2: URL given in single quotes
#   capture 3: URL given without quotes
#   capture 4: the link text (may still contain nested tags)
# The look-ahead after "<a" requires whitespace so that other tags whose
# names merely start with "a" (abbr, area, address, ...) are not treated
# as anchors, and the whitespace before "href" keeps attributes such as
# data-href from matching.
my $LINK_RE = qr{
    <a (?= \s ) [^>]*? \s href \s* = \s*
    (?: " ( [^"]* ) "
      | ' ( [^']* ) '
      |   ( [^\s"'>]+ )
    )
    [^>]* >
    ( .*? )
    </a \s* >
}xsi;

# is_jpeg_url($url)
#   Returns 1 when $url names a JPEG file, 0 otherwise.  The ".jpg" or
#   ".jpeg" extension (any case) must end the URL or be followed by a
#   query string, a fragment or a further query parameter, so that
#   "photo.jpg.html" or "x.jpgs" are not mistaken for images.
sub is_jpeg_url {
    my ($url) = @_;
    return $url =~ m{ \. jpe?g (?= \z | [?\#&] ) }xi ? 1 : 0;
}

# clean_title($text)
#   Returns $text as a one-line plain-text title: tags are removed, every
#   run of whitespace (including tabs and newlines) becomes one space, and
#   leading/trailing whitespace is dropped.
sub clean_title {
    my ($text) = @_;

    # Strip tags first: removing a tag can bring two blanks together, and
    # [^>] (unlike ".") also covers tags that span several lines.
    $text =~ s/<[^>]*>//g;
    $text =~ s/\s+/ /g;
    $text =~ s/\A | \z//g;

    return $text;
}

# extract_image_links($html)
#   Scans $html for anchor elements and returns a list of [ $href, $title ]
#   array references, in document order, for those that point at a JPEG.
sub extract_image_links {
    my ($html) = @_;
    my @links;

    # m//g walks through the page without rewriting the buffer after every
    # link, which the former s/// loop did.
    while ($html =~ /$LINK_RE/g) {
        # Copy the captures at once; the helper subs run their own regexes.
        my ($dq, $sq, $bare, $text) = ($1, $2, $3, $4);

        my $href = defined $dq ? $dq
                 : defined $sq ? $sq
                 :               $bare;

        # Whitespace around a quoted URL is not part of it and would
        # corrupt the line-oriented output.
        $href =~ s/\A\s+|\s+\z//g;

        # Accept only jpegs.
        next if !is_jpeg_url($href);

        push @links, [ $href, clean_title($text) ];
    }

    return @links;
}

# main()
#   Reads every input file (or standard input) and prints one
#   "href<TAB>title" line per JPEG link found.
sub main {
    # Slurp mode, limited to this sub instead of changing $/ globally.
    local $/;

    # With $/ undefined each <> returns one whole file, so looping handles
    # every file on the command line rather than only the first, and links
    # are never matched across a file boundary.
    while (defined(my $html = <>)) {
        for my $link (extract_image_links($html)) {
            print "$link->[0]\t$link->[1]\n";
        }
    }

    return;
}

# Run only when executed as a script, so the subs above can be loaded and
# exercised on their own.
main() unless caller;

1;