Hi there,
I try to fetch links from a URL using HTML::LinkExtor, but it always return 0 links even if the status code is 200 OK. I am running the following code in Ubuntu 9.04, just curious if the module is too old and its ways of HTTP request is disabled by some platforms.
Any idea is well appreciated.
Thanks!
Code:
#!/usr/bin/perl
use HTML::LinkExtor;
use LWP::UserAgent;
my %urls;
*INPUT=STDIN;
*OUTPUT=STDOUT;
*LOGPUT=STDERR;
sub mychomp
{
$_[0]=~s/\r|\n//g;
}
sub get_links
{
my @links=();
my $url = shift;
my $browser = LWP::UserAgent->new();
$browser->timeout(10);
my $request = HTTP::Request->new(GET => $url);
my $response=$browser->request($request);
if($response->is_success)
{
my $contents = $response->content;
my $page_parser = HTML::LinkExtor->new();
$page_parser->parse($contens);
@links=$page_parser->links();
$urls{$url}=$response->code;
print OUTPUT $url." ".$response->code." links: ".@links."\n";
print OUTPUT shift @links while @links;
}
else
{
print LOGPUT "$url: %s\n",$response->status_line;
}
return \@links;
}
sub init
{
if(@ARGV>0)
{
*INPUT= shift @ARGV;
}
if(@ARGV>1)
{
*OUTPUT=shift @ARGV;
}
if(@ARGV > 2)
{
*LOGPUT=shift @ARGV;
}
}
sub run
{
while(<INPUT>)
{
mychomp($_);
if(exists($urls{$_}))
{
# do nothing
}
else
{
my @urls=@{get_links($_)};
print OUTPUT (shift @urls)."\n" while @urls;
}
}
}
sub done
{
close(INPUT);
close(OUTPUT);
close(LOGPUT);
}
#init;
run;
done;