#!/usr/local/bin/perl
use strict;
use CGI;
use File::Basename;
use LWP::UserAgent;
use HTTP::Cookies;
use HTTP::Request::Common;
use HTML::LinkExtor;
my $me = basename($ENV{SCRIPT_FILENAME});
my $default_ua = 'DoCoMo/1.0 N504iS';
my $q = CGI->new;
print_form($q);
if ($q->param('url')) {
do_calc($q);
}
sub print_form {
my $q = shift;
$q->param(ua => $default_ua) unless $q->param('ua'); # default
print $q->header, $q->h2("HTML + Image size calcurator"),
$q->start_form(-method => 'GET', -action => $me),
"URL: ", $q->textfield(-name => 'url', -size => 64), $q->br,
"User-Agent: ", $q->textfield(-name => 'ua', -size => 32),
$q->submit(-value => " OK "), $q->end_form;
}
my %LINKS = # subset of %HTML::Tagset::linkElements
(
'applet' => ['archive', 'codebase', 'code'],
'bgsound' => ['src'],
'body' => ['background'],
'embed' => ['src'],
'frame' => ['src'],
'iframe' => ['src'],
'ilayer' => ['background'],
'img' => ['src', 'lowsrc'],
'input' => ['src'],
'layer' => ['background', 'src'],
## 'link' => ['href'], ## durn, some of these are stylesheets
'script' => ['src'],
'table' => ['background'],
'td' => ['background'],
'th' => ['background'],
'tr' => ['background'],
);
sub do_calc {
my $q = shift;
my $ua = LWP::UserAgent->new;
$ua->env_proxy;
$ua->agent($q->param('ua'));
$ua->cookie_jar(HTTP::Cookies->new); # capture cookies if needed
report($ua, $q->param('url'));
}
sub report {
my($ua, $start) = @_;
my @todo = ["", $start];
my %done;
my(@links, %links);
local $URI::URL::STRICT = 0; # for javascript:
while (@todo) {
my ($refer, $url) = @{shift @todo};
next if exists $done{$url};
my $request = GET $url, [referer => $refer];
my $response = $ua->simple_request($request);
if ($response->is_success) {
$done{$url} = length (my $content = $response->content);
next if $response->content_type ne "text/html";
my $base = $response->base; # relative URLs measured relative to here
my $p = HTML::LinkExtor->new(undef, $base) or die;
$p->parse($content);
$p->eof;
for my $link ($p->links) {
my ($tag, %attr) = @$link;
# a href=
if ($tag eq 'a' && (my $h = $attr{href})) {
push @links, $h unless $links{$h}++;
next;
}
if ($LINKS{$tag}) {
for (@{$LINKS{$tag}}) {
next unless exists $attr{$_};
next unless length (my $a = $attr{$_});
warn "$base $tag $_ => $a\n"; ## debug
push @todo, [$base, $a];
}
}
}
} elsif ($response->is_redirect) {
$done{$url} = length $response->content; # this counts
my $location = $response->header('location') or next;
push @todo, [$url, $location]; # but get this too
} elsif ($response->is_error) {
print "$url ERROR: ", $response->status_line, "\n";
}
} # end of outer loop
print $q->hr;
print qq(<table width="100%"><tr><td valign="top" width="80%">);
{
my $total = 0;
print $q->h3("HTML + Image size");
print qq(<TABLE cellspacing="1"><TR><TH>URL</TH><TH>bytes</TH></TR>);
my @bg = qw(#dddddd #ffffff);
my $i;
for my $url (sort { $done{$b} <=> $done{$a} } keys %done) {
$total += $done{$url};
printf qq(<tr bgcolor="%s"><td>%s</td><td>%10d</td></tr>\n),
$bg[$i++ % 2], $q->a({-href => $url}, $url), $done{$url};
}
print "</TABLE>\n";
printf "TOTAL: <b>%10d</b> bytes<br>", $total;
print $q->hr;
print $q->h3("Links");
print qq(<TABLE cellspacing="1"><TR><TH>Check</TH><TH>URL</TH></TR>);
my $j;
for my $link (@links) {
$q->param(url => $link);
printf qq(<tr bgcolor="%s"><td>%s</td><td>%s</td></tr>\n),
$bg[$j++ % 2],
$q->a({ -href => $q->self_url }, "Check"),
$q->a({-href => $link}, $link),
}
print "</TABLE>\n";
}
print qq(</td><td valign="top" width="20%"><iframe src="$start" height=500 width=150 frameborder=0 hspace=0 vspace=0 marginheight=0 marginwidth=0 align=center scrolling=yes>Web Page is shown here.</iframe></td></table>\n);
}
HDML::LinkExtorもあるよ!
Posted by: milano on November 21, 2003 12:42 AMキター
なつかしいでつな。
CPAN に!
既に。
http://search.cpan.org/~milano/HDML-LinkExtor-0.01/lib/HDML/LinkExtor.pm
おぉ!そかもうあったのねー。