November 20, 2003

Monday Module

Monday Module 予行演習のテーマは HTML::LinkExtor。

URL を指定すると、そこに含まれる画像と、HTML自体のサイズを合計する CGI スクリプトです。ユーザビリティテストや、携帯端末向けのページ作成でのサイズ容量制限チェックに便利でしょう。

#!/usr/local/bin/perl

use strict;
use CGI;
use File::Basename;
use LWP::UserAgent;
use HTTP::Cookies;
use HTTP::Request::Common;
use HTML::LinkExtor;

my $me = basename($ENV{SCRIPT_FILENAME});
my $default_ua = 'DoCoMo/1.0 N504iS';

my $q = CGI->new;
print_form($q);
if ($q->param('url')) {
    do_calc($q);
}

sub print_form {
    my $q = shift;
    $q->param(ua => $default_ua) unless $q->param('ua'); # default
    print $q->header, $q->h2("HTML + Image size calcurator"),
	$q->start_form(-method => 'GET', -action => $me),
	"URL: ", $q->textfield(-name => 'url', -size => 64), $q->br,
	"User-Agent: ", $q->textfield(-name => 'ua', -size => 32),
	$q->submit(-value => " OK "), $q->end_form;
}

my %LINKS =                     # subset of %HTML::Tagset::linkElements
    (
	'applet'  => ['archive', 'codebase', 'code'],
	'bgsound' => ['src'],
	'body'    => ['background'],
	'embed'   => ['src'],
	'frame'   => ['src'],
	'iframe'  => ['src'],
	'ilayer'  => ['background'],
	'img'     => ['src', 'lowsrc'],
	'input'   => ['src'],
	'layer'   => ['background', 'src'],
	## 'link'    => ['href'], ## durn, some of these are stylesheets
	'script'  => ['src'],
	'table'   => ['background'],
	'td'      => ['background'],
	'th'      => ['background'],
	'tr'      => ['background'],
    );


sub do_calc {
    my $q = shift;

    my $ua = LWP::UserAgent->new;
    $ua->env_proxy;
    $ua->agent($q->param('ua'));
    $ua->cookie_jar(HTTP::Cookies->new); # capture cookies if needed

    report($ua, $q->param('url'));
}

sub report {
    my($ua, $start) = @_;

    my @todo = ["", $start];
    my %done;
    my(@links, %links);

    local $URI::URL::STRICT = 0; # for javascript:
    while (@todo) {
	my ($refer, $url) = @{shift @todo};
	next if exists $done{$url};

	my $request = GET $url, [referer => $refer];
	my $response = $ua->simple_request($request);

	if ($response->is_success) {
	    $done{$url} = length (my $content = $response->content);

	    next if $response->content_type ne "text/html";

	    my $base = $response->base; # relative URLs measured relative to here
	    my $p = HTML::LinkExtor->new(undef, $base) or die;
	    $p->parse($content);
	    $p->eof;
	    for my $link ($p->links) {
		my ($tag, %attr) = @$link;

		# a href=
		if ($tag eq 'a' && (my $h = $attr{href})) {
		    push @links, $h unless $links{$h}++;
		    next;
		}
		if ($LINKS{$tag}) {
		    for (@{$LINKS{$tag}}) {
			next unless exists $attr{$_};
			next unless length (my $a = $attr{$_});
			warn "$base $tag $_ => $a\n"; ## debug
			push @todo, [$base, $a];
		    }
		}
	    }
	} elsif ($response->is_redirect) {
	    $done{$url} = length $response->content; # this counts
	    my $location = $response->header('location') or next;
	    push @todo, [$url, $location]; # but get this too
	} elsif ($response->is_error) {
	    print "$url ERROR: ", $response->status_line, "\n";
	}
    }                             # end of outer loop

    print $q->hr;
    print qq(<table width="100%"><tr><td valign="top" width="80%">);
    {
	my $total = 0;
	print $q->h3("HTML + Image size");
	print qq(<TABLE cellspacing="1"><TR><TH>URL</TH><TH>bytes</TH></TR>);
	my @bg = qw(#dddddd #ffffff);
	my $i;
	for my $url (sort { $done{$b} <=> $done{$a} } keys %done) {
	    $total += $done{$url};
	    printf qq(<tr bgcolor="%s"><td>%s</td><td>%10d</td></tr>\n),
		$bg[$i++ % 2], $q->a({-href => $url}, $url), $done{$url};
	}
	print "</TABLE>\n";
	printf "TOTAL: <b>%10d</b> bytes<br>", $total;

	print $q->hr;
	print $q->h3("Links");
	print qq(<TABLE cellspacing="1"><TR><TH>Check</TH><TH>URL</TH></TR>);
	my $j;
	for my $link (@links) {
	    $q->param(url => $link);
	    printf qq(<tr bgcolor="%s"><td>%s</td><td>%s</td></tr>\n),
		$bg[$j++ % 2],
		    $q->a({ -href => $q->self_url }, "Check"),
			$q->a({-href => $link}, $link),

	}
	print "</TABLE>\n";
    }

    print qq(</td><td valign="top" width="20%"><iframe src="$start" height=500 width=150 frameborder=0 hspace=0 vspace=0 marginheight=0 marginwidth=0 align=center scrolling=yes>Web Page is shown here.</iframe></td></table>\n);
}
Posted by miyagawa at November 20, 2003 11:12 PM | Permalink | Comments (4) | TrackBack(0)
Comments

HDML::LinkExtorもあるよ!

Posted by: milano on November 21, 2003 12:42 AM

キター
なつかしいでつな。
CPAN に!

Posted by: miyagawa on November 21, 2003 01:09 AM

既に。
http://search.cpan.org/~milano/HDML-LinkExtor-0.01/lib/HDML/LinkExtor.pm

Posted by: milano on November 21, 2003 01:19 AM

おぉ!そかもうあったのねー。

Posted by: miyagawa on November 21, 2003 01:24 AM
Trackbacks
TrackBack URL for this entry: http://blog.bulknews.net/mt3/mt-tb.cgi/557
Post a comment