#!/usr/local/bin/perl use strict; use CGI; use File::Basename; use LWP::UserAgent; use HTTP::Cookies; use HTTP::Request::Common; use HTML::LinkExtor; my $me = basename($ENV{SCRIPT_FILENAME}); my $default_ua = 'DoCoMo/1.0 N504iS'; my $q = CGI->new; print_form($q); if ($q->param('url')) { do_calc($q); } sub print_form { my $q = shift; $q->param(ua => $default_ua) unless $q->param('ua'); # default print $q->header, $q->h2("HTML + Image size calcurator"), $q->start_form(-method => 'GET', -action => $me), "URL: ", $q->textfield(-name => 'url', -size => 64), $q->br, "User-Agent: ", $q->textfield(-name => 'ua', -size => 32), $q->submit(-value => " OK "), $q->end_form; } my %LINKS = # subset of %HTML::Tagset::linkElements ( 'applet' => ['archive', 'codebase', 'code'], 'bgsound' => ['src'], 'body' => ['background'], 'embed' => ['src'], 'frame' => ['src'], 'iframe' => ['src'], 'ilayer' => ['background'], 'img' => ['src', 'lowsrc'], 'input' => ['src'], 'layer' => ['background', 'src'], ## 'link' => ['href'], ## durn, some of these are stylesheets 'script' => ['src'], 'table' => ['background'], 'td' => ['background'], 'th' => ['background'], 'tr' => ['background'], ); sub do_calc { my $q = shift; my $ua = LWP::UserAgent->new; $ua->env_proxy; $ua->agent($q->param('ua')); $ua->cookie_jar(HTTP::Cookies->new); # capture cookies if needed report($ua, $q->param('url')); } sub report { my($ua, $start) = @_; my @todo = ["", $start]; my %done; my(@links, %links); local $URI::URL::STRICT = 0; # for javascript: while (@todo) { my ($refer, $url) = @{shift @todo}; next if exists $done{$url}; my $request = GET $url, [referer => $refer]; my $response = $ua->simple_request($request); if ($response->is_success) { $done{$url} = length (my $content = $response->content); next if $response->content_type ne "text/html"; my $base = $response->base; # relative URLs measured relative to here my $p = HTML::LinkExtor->new(undef, $base) or die; $p->parse($content); $p->eof; for my $link ($p->links) { my ($tag, %attr) = @$link; # a href= if ($tag eq 'a' && (my $h = $attr{href})) { push @links, $h unless $links{$h}++; next; } if ($LINKS{$tag}) { for (@{$LINKS{$tag}}) { next unless exists $attr{$_}; next unless length (my $a = $attr{$_}); warn "$base $tag $_ => $a\n"; ## debug push @todo, [$base, $a]; } } } } elsif ($response->is_redirect) { $done{$url} = length $response->content; # this counts my $location = $response->header('location') or next; push @todo, [$url, $location]; # but get this too } elsif ($response->is_error) { print "$url ERROR: ", $response->status_line, "\n"; } } # end of outer loop print $q->hr; print qq(<table width="100%"><tr><td valign="top" width="80%">); { my $total = 0; print $q->h3("HTML + Image size"); print qq(<TABLE cellspacing="1"><TR><TH>URL</TH><TH>bytes</TH></TR>); my @bg = qw(#dddddd #ffffff); my $i; for my $url (sort { $done{$b} <=> $done{$a} } keys %done) { $total += $done{$url}; printf qq(<tr bgcolor="%s"><td>%s</td><td>%10d</td></tr>\n), $bg[$i++ % 2], $q->a({-href => $url}, $url), $done{$url}; } print "</TABLE>\n"; printf "TOTAL: <b>%10d</b> bytes<br>", $total; print $q->hr; print $q->h3("Links"); print qq(<TABLE cellspacing="1"><TR><TH>Check</TH><TH>URL</TH></TR>); my $j; for my $link (@links) { $q->param(url => $link); printf qq(<tr bgcolor="%s"><td>%s</td><td>%s</td></tr>\n), $bg[$j++ % 2], $q->a({ -href => $q->self_url }, "Check"), $q->a({-href => $link}, $link), } print "</TABLE>\n"; } print qq(</td><td valign="top" width="20%"><iframe src="$start" height=500 width=150 frameborder=0 hspace=0 vspace=0 marginheight=0 marginwidth=0 align=center scrolling=yes>Web Page is shown here.</iframe></td></table>\n); }
HDML::LinkExtorもあるよ!
Posted by: milano on November 21, 2003 12:42 AMキター
なつかしいでつな。
CPAN に!
既に。
http://search.cpan.org/~milano/HDML-LinkExtor-0.01/lib/HDML/LinkExtor.pm
おぉ!そかもうあったのねー。