Subject: possible problem with regex when url has no trailing /
I found that if you pass a URL to your crawler without a trailing slash,
it fails to download the robots.txt file.
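Here is a quick illustration of what goes wrong (example.com is just a
placeholder URL):

  my $url = 'http://example.com';   # note: no trailing slash
  my ($proto, $domain) = $url =~ m{^(https?)://(.*?)/};
  # Old pattern: no match, so $proto and $domain are both undef and the
  # robots.txt URL gets built from undefined values, so the fetch fails.
  ($proto, $domain) = $url =~ m{^(https?)://(.*?)(?:/|$)};
  # Patched pattern: $proto eq 'http', $domain eq 'example.com'.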
I also added a new hook, follow_ok, to give me more control over which
URLs to follow; you might be interested in adding it.
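For what it's worth, here is roughly how I'm using it (a rough sketch:
the option names match the patch below, but the host, callbacks and the
crawl() call are just examples):

  use WWW::Crawler::Lite;

  my $crawler = WWW::Crawler::Lite->new(
    delay_seconds => 1,
    follow_ok     => sub {
      my ($url) = @_;
      # Only queue links that stay on the host I'm crawling:
      return $url =~ m{^https?://example\.com(?:/|$)} ? 1 : 0;
    },
    on_response   => sub {
      my ($url, $res) = @_;
      print "Fetched: $url (" . $res->code . ")\n";
    },
  );
  $crawler->crawl( url => 'http://example.com/' );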
Subject: fix-regex.patch
Index: lib/WWW/Crawler/Lite.pm
===================================================================
--- lib/WWW/Crawler/Lite.pm (revision 2)
+++ lib/WWW/Crawler/Lite.pm (revision 7)
@@ -10,7 +10,7 @@
use Time::HiRes 'usleep';
use Carp 'confess';
-our $VERSION = '0.003';
+our $VERSION = '0.004';
sub new
@@ -24,7 +24,8 @@
on_new_urls => sub { my @urls = @_; },
on_bad_url => sub { my ($bad_url) = @_; },
on_response => sub { my ($url, $http_response) = @_; },
- on_link => sub { my ($from, $to, $text) = @_ },
+ on_link => sub { my ($from, $to, $text) = @_; },
+ follow_ok => sub { my ($url) = @_; return 1; },
delay_seconds => 1,
disallowed => [ ],
%args,
@@ -80,7 +81,13 @@
return @_ ? $s->{on_link} = shift : $s->{on_link};
}# end on_link()
+sub follow_ok
+{
+ my $s = shift;
+ return @_ ? $s->{follow_ok} = shift : $s->{follow_ok};
+}# end follow_ok()
+
sub url_count
{
my ($s) = @_;
@@ -105,7 +112,7 @@
});
# Try to find robots.txt:
- my ($proto, $domain) = $args{url} =~ m{^(https?)://(.*?)/};
+ my ($proto, $domain) = $args{url} =~ m{^(https?)://(.*?)(?:/|$)};
eval {
local $SIG{__DIE__} = \&confess;
my $robots_url = "$proto://$domain/robots.txt";
@@ -182,7 +189,8 @@
(my $new = url($href, $base)->abs->as_string) =~ s/\#.*$//;
$anchortext =~ s/^\s+//s;
$anchortext =~ s/\s+$//s;
- push @new_urls, { href => $new, text => $anchortext };
+ push @new_urls, { href => $new, text => $anchortext }
+ if $s->follow_ok->($new);
}# end if()
"";
}isgxe;