HTML::LinkExtor doesn't work...

android2009 · 09-25-2010, 12:45 AM

Hi there,

I try to fetch links from a URL using HTML::LinkExtor, but it always return 0 links even if the status code is 200 OK. I am running the following code in Ubuntu 9.04, just curious if the module is too old and its ways of HTTP request is disabled by some platforms.

Any idea is well appreciated.
Thanks!

Code:

#!/usr/bin/perl

use HTML::LinkExtor;
use LWP::UserAgent;

my %urls;

*INPUT=STDIN;
*OUTPUT=STDOUT;
*LOGPUT=STDERR;

sub mychomp
{
   $_[0]=~s/\r|\n//g;
}


sub get_links
{
    my @links=();
    my $url = shift;
    my $browser = LWP::UserAgent->new();
    $browser->timeout(10);
    my $request = HTTP::Request->new(GET => $url);
    my $response=$browser->request($request);

    if($response->is_success)
    {  
        my $contents = $response->content;
        my $page_parser = HTML::LinkExtor->new();

        $page_parser->parse($contens);
        @links=$page_parser->links();
        
        $urls{$url}=$response->code;
        print OUTPUT $url." ".$response->code." links: ".@links."\n";
        print OUTPUT shift @links while @links;
    }
    else
    {
        print LOGPUT "$url: %s\n",$response->status_line;
    }
    return \@links;    
}

sub init
{
    if(@ARGV>0)
    {
       
       *INPUT= shift @ARGV;
    }
    if(@ARGV>1)
    {
       *OUTPUT=shift @ARGV;
    }

    if(@ARGV > 2)
    {
       *LOGPUT=shift @ARGV;
    }
}


sub run
{
    while(<INPUT>)
    {
        mychomp($_);
        if(exists($urls{$_}))
        {
           # do nothing
        }
        else
        {
            my @urls=@{get_links($_)};
          
            print OUTPUT (shift @urls)."\n" while @urls;
        }
        

    }
}

sub done
{
   close(INPUT);
   close(OUTPUT);
   close(LOGPUT);
}

#init;
run;
done;

Sergei Steshenko · 09-25-2010, 06:00 PM

Quote:

Originally Posted by android2009

Hi there,
...
Any idea is well appreciated.
...

First put

Code:

use strict;
use warning;

just after '#!/usr/bin/perl' and make sure there are no compilation errors and runtime warnings.

Sergei Steshenko · 09-25-2010, 06:11 PM

And check the return value of your 'mychomp' subroutine.