#!/usr/bin/perl
# parse.pl:
#   input:  a Web page with links to images (a file named on the command
#           line, or standard input)
#   output: a listing file on standard output, one line per JPEG link,
#           formatted as "href<TAB>title"

use strict;
use warnings;

# Matches one <a ... href=...>text</a> element.
# Capture 1 is the link target, capture 2 is the raw link text.
my $ANCHOR_RE = qr{
    <a \s+ (?: [^>]*? \s )?     # opening tag; the mandatory whitespace keeps
                                # <area ...> and data-href=... from matching
    href \s* = \s* ["']?        # href attribute, optionally quoted
    ( [^\s"'>]* )               # capture 1: link target
    [^>]* >                     # remainder of the opening tag
    ( .*? )                     # capture 2: link text, may span lines
    </a \s* >
}xsi;

# Extract the JPEG links from a page of HTML.
#
# Takes the page source as one string (undef is treated as an empty page).
# Returns a list of [href, title] array refs in document order.  Each title
# has its markup removed and its whitespace normalised to single spaces, so
# it can never contain the tab or newline used as output delimiters.
sub extract_jpeg_links {
    my ($html) = @_;
    my @links;

    return @links unless defined $html;

    # Scan with m//g rather than repeatedly deleting from the buffer: the
    # matches found are the same, without re-copying the page per link.
    while ($html =~ /$ANCHOR_RE/g) {
        my ($href, $title) = ($1, $2);

        # Accept only jpegs; \b rejects look-alike extensions such as ".jpgs".
        next unless $href =~ /\.jpe?g\b/i;

        # Remove any tags first, so that the whitespace they leave behind
        # is collapsed together with everything else.
        $title =~ s/<[^>]*>//g;

        # Remove unneeded whitespace: every run (including a lone tab or
        # newline) becomes one space, and the ends are trimmed.
        $title =~ s/\s+/ /g;
        $title =~ s/\A\s+//;
        $title =~ s/\s+\z//;

        push @links, [$href, $title];
    }

    return @links;
}

# Read the page in one gulp and print the listing.
sub main {
    # Slurp mode is confined to this block instead of being set globally.
    # With several file arguments a single slurp-mode read returns only the
    # first file, which matches what this script has always done.
    my $html = do { local $/; <> };

    for my $link (extract_jpeg_links($html)) {
        my ($href, $title) = @{$link};
        print "$href\t$title\n";
    }

    return;
}

# Run only when executed as a script, so the file can also be loaded
# (require/do) to call extract_jpeg_links() directly.
main() unless caller;