Subject: patch for "WWW::Mechanize::Polite"
See http://perlmonks.org/index.pl?node_id=330872 for explanation. (jeffa is lazy today)
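For background, everything below leans on WWW::RobotRules, which already knows how to parse a robots.txt file and answer "is this URL allowed?". A rough standalone sketch of that API (the agent name and URLs are just placeholders):

#!/usr/bin/perl
use strict;
use warnings;

use WWW::RobotRules;
use LWP::Simple qw(get);

# new() takes the user-agent string the rules apply to
my $rules = WWW::RobotRules->new('PoliteBot/0.1');

# parse() wants the robots.txt URL plus its content ...
my $robots_url = 'http://www.example.com/robots.txt';
my $robots_txt = get($robots_url);
$rules->parse($robots_url, $robots_txt) if defined $robots_txt;

# ... and allowed() checks a URL against the parsed rules
print "ok to fetch\n"
    if $rules->allowed('http://www.example.com/some/page.html');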
*** Mechanize.pm 2005-05-27 11:29:53.000000000 -0500
--- /usr/lib/perl5/site_perl/5.8.3/WWW/Mechanize.pm 2005-05-27 11:41:10.000000000 -0500
***************
*** 100,105 ****
--- 100,106 ----
use HTML::Form 1.00;
use HTML::TokeParser;
use URI::URL;
+ use WWW::RobotRules;
use base 'LWP::UserAgent';
***************
*** 206,211 ****
--- 207,214 ----
    my $self = $class->SUPER::new( %parent_parms );
    bless $self, $class;
+     $self->{robo_rules} = WWW::RobotRules->new($self->{agent});
+
    # Use the mech parms now that we have a mech object.
    for my $parm ( keys %mech_parms ) {
        $self->{$parm} = $mech_parms{$parm};
***************
*** 287,292 ****
--- 290,309 ----
    return sort keys %known_agents;
}
+ =head2 parse_robots()
+
+ Fetches a site's robots.txt file from the given URL and parses its rules
+
+ =cut
+
+ sub parse_robots {
+     my $self = shift;
+     my $url  = shift;
+
+     $self->get($url);
+     $self->{robo_rules}->parse($url, $self->content);
+ }
+
=head1 PAGE-FETCHING METHODS
=head2 $mech->get( $url )
***************
*** 322,327 ****
--- 339,363 ----
    return $self->SUPER::get( $uri->as_string, @_ );
}
+ =head2 polite_get()
+
+ Calls Mech's get() method, but first consults the site's robots.txt
+ rules (via WWW::RobotRules) before fetching the URI; if the URI is
+ disallowed, nothing is fetched and the content is cleared
+
+ =cut
+
+ sub polite_get {
+     my $self = shift;
+     my $uri  = shift;
+
+     if ($self->{robo_rules}->allowed($uri)) {
+         $self->get($uri);
+     }
+     else {
+         undef $self->{content};
+     }
+ }
+
=head2 $mech->reload()
Acts like the reload button in a browser: repeats the current
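And a quick sketch of how the patched Mech would be driven (URLs and agent name are placeholders; assumes the patch above is applied):

#!/usr/bin/perl
use strict;
use warnings;

use WWW::Mechanize;

my $mech = WWW::Mechanize->new( agent => 'PoliteBot/0.1' );

# fetch and parse the site's robots.txt first ...
$mech->parse_robots('http://www.example.com/robots.txt');

# ... then only fetch pages that robots.txt allows
$mech->polite_get('http://www.example.com/index.html');

if ( defined $mech->content ) {
    print $mech->content;
}
else {
    warn "robots.txt says no\n";
}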