#!/usr/bin/perl use strict; use warnings; use Encode; use Getopt::Long; use URI; use LWP::UserAgent; use HTTP::Cookies::Guess; use constant CHARSET => 'utf-8'; my $VERSION = '20080913_1'; my $url = pop @ARGV; my ( $xpath, $referer, $cookie, $agent, $nextlink, $depth, $as_xml, $verbose, $procedure, $weight, ); my $result = GetOptions( "x|xpath=s" => \$xpath, "e|referer=s" => \$referer, "c|cookie-jar=s" => \$cookie, "a|agent=s" => \$agent, "n|nextlink=s" => \$nextlink, "d|depth=i" => \$depth, "s|as-source" => \$as_xml, "f" => \$verbose, "p|procedure=s" => \$procedure, "w=i" => \$weight, ); $depth += 1; unless ($url && $xpath) { die "version $VERSION\nusage: ./exthtml.pl [ -a [AGENT] -e [REFERER] -c [COOKIE_JAR] -n [NEXTPAGE_XPATH] -d [NEXTPAGE_DEPTH] -w [NEXTPAGE_SLEEP(sec)] -p [PROCEDURE: \$v(scalar value), \$n(HTML::Element object), \$u(URI object)] -s -f ] -x [XPATH] [URL]|-"; } my $ua = LWP::UserAgent->new; $ua->cookie_jar(HTTP::Cookies::Guess->create(file => $cookie)) if ($cookie); $ua->agent($agent) if ($agent); my %options = ( url => $url, xpath => decode(CHARSET, $xpath), referer => $referer, ua => $ua, as_xml => $as_xml, nextlink => decode(CHARSET, $nextlink), depth => $depth, verbose => $verbose, procedure => $procedure, weight => $weight || 0, ); if ($url eq '-') { while (my $url_line = <>) { chomp $url_line; $options{url} = $url_line; extract(%options); } } else { extract(%options); } sub proc { my ($proc, $n, $v, $u) = @_; return eval($proc); } sub extract { my %opt = @_; my $xpath = $opt{xpath}; my $depth = $opt{depth}; my $url = $opt{url}; my $referer = $opt{referer}; my $procedure = $opt{procedure}; my $weight = $opt{weight} || 0; my %hist; while ($url && ($depth--)) { last if ($hist{$url}); $hist{$url} = 1; print "$url\n" if ($opt{verbose}); my $tree = HTML::TreeBuilder::XPath::Remote->new_from_uri( $url, $opt{ua}, $referer ); for my $node ($tree->findnodes($opt{xpath})) { print "\t" if ($opt{verbose}); my $value = ($opt{as_xml} && $node->isa('HTML::Element')) ? $node->as_XML('<>&"') : $node->getValue."\n"; $value = proc($procedure, $node, $value, URI->new($url)) if ($procedure); print encode(CHARSET, $value); } $referer = $url; $url = ''; if ($opt{nextlink}) { my @urls = grep /^http/, map {$_->getValue} $tree->findnodes($opt{nextlink}); $url = $urls[0] if (@urls); } $tree->delete; sleep $weight; } } package HTML::TreeBuilder::XPath::Remote; use strict; use warnings; use List::Util qw( first ); use Encode; use HTML::TreeBuilder::XPath; use HTML::ResolveLink; use LWP::UserAgent; use HTTP::Request; use HTTP::Response::Encoding; sub new_from_uri { my ($pkg, $uri, $ua, $referer) = @_; my $resolver = HTML::ResolveLink->new( base => $uri, ); my $html = $resolver->resolve( $pkg->get($uri, $ua, $referer) ); return HTML::TreeBuilder::XPath->new_from_content($html); } sub get { my ($self, $uri, $ua, $referer) = @_; my $html; $ua ||= LWP::UserAgent->new(); my $req = HTTP::Request->new('GET', $uri); $req->header(referer => $referer) if ($referer); my $res = $ua->request($req); # this detection is based on Web::Scraper. if ($res->is_success) { my @encoding = ( $res->encoding, ($res->header('Content-Type') =~ /charset=([\w\-]+)/g), ); if (eval {require Encode::Detect;}) { push @encoding, "Detect"; } push @encoding, "shift-jis"; my $encoding = first { defined $_ && Encode::find_encoding($_) } @encoding; $html = Encode::decode($encoding, $res->content); } return $html; } 1;