Subject: | Bug in HTML::PullParser / HTML::TokeParser |
Hello, I am Arun Persad, using Activestate Perl 5.8 on Win NT.
I think I've found a bug in HTML::PullParser / HTML::TokeParser - text tokens are not tokenized consistently. Some text tokens (which should count as a single token) are being split.
You can see this inconsistency by looking at $VAR7 to $VAR12 in the output of the demo script. 'British Newspaper Index' is treated as a single chunk by HTML::Parser, but I don't see why HTML::TokeParser should split it into two tokens - it is a single token in the html source.
Thanks,
Arun
# demo script
use strict;
use HTML::Parser;
use HTML::PullParser;
use HTML::TokeParser;
use LWP::Simple;
use Data::Dumper;
# Text tokens of interest: any chunk containing either word.
my $Pat = qr/British|Newspaper/;
my $url = 'http://www.bl.uk/collections/wider/eresources/title/eresourcesb.html';
# LWP::Simple's get() signals failure with undef, so test definedness: a
# successfully fetched but empty (or "0") body is not a fetch failure.
my $content = get($url);
defined $content or die "Couldn't get $url\n";
# Run the same document through each parser in turn so the resulting
# text tokens can be compared.
use_html_parser();
use_pullparser();
use_tokeparser();
# Test subs - in each case, collect the text tokens that match $Pat ('British' or 'Newspaper')
# Parse $content in a single pass with a plain HTML::Parser and dump every
# decoded text chunk that matches $Pat.
#
# Takes no arguments; reads the file-level $content and $Pat and prints the
# matching chunks to STDOUT via Data::Dumper.
sub use_html_parser
{
    print "Parsing with HTML::Parser ...\n";
    my @text;
    my $p = HTML::Parser->new(
        api_version => 3,
        # The handler receives the decoded text ("dtext") as its only argument.
        text_h      => [ sub { push @text, $_[0] if $_[0] =~ /$Pat/ }, "dtext" ],
    );
    # parse() returns the parser object rather than a status and does not
    # set $!, so the former "|| die $!" could never report a real error.
    $p->parse($content);
    # Signal end of document so that any text the parser is still buffering
    # is delivered to the text handler before the results are dumped.
    $p->eof;
    print Dumper(@text), "\n";
}
# Pull text tokens out of $content with HTML::PullParser and dump every one
# that matches $Pat.
#
# Takes no arguments; reads the file-level $content and $Pat and prints the
# matching tokens to STDOUT via Data::Dumper.
sub use_pullparser
{
    print "Parsing with HTML::PullParser ...\n";
    my @text;
    my $p = HTML::PullParser->new(
        doc  => \$content,
        text => '@{dtext}',
    ) || die $!;
    # Each token is used below as a plain string rather than an array ref,
    # so a text chunk consisting of "0" is false. Testing definedness keeps
    # such a chunk from ending the loop before the document is exhausted;
    # get_token returns undef only at the end of the document.
    while (defined(my $token = $p->get_token)) {
        push @text, $token if $token =~ /$Pat/;
    }
    print Dumper(@text), "\n";
}
# Walk $content token by token with HTML::TokeParser and dump the text of
# every 'T' (text) token that matches $Pat.
#
# Takes no arguments; reads the file-level $content and $Pat and prints the
# matching text to STDOUT via Data::Dumper.
#
# NOTE(review): HTML::Parser's unbroken_text attribute looks relevant to
# text runs arriving as more than one token here - confirm against the docs.
sub use_tokeparser
{
    print "Parsing with HTML::TokeParser ...\n";
    my $parser = HTML::TokeParser->new(\$content) or die "$!";
    my @matches;
    while (my $token = $parser->get_token) {
        # Element 0 is the token type, element 1 the payload being matched.
        my ($type, $payload) = @$token;
        next unless $type eq 'T';
        next unless $payload =~ /$Pat/;
        push @matches, $payload;
    }
    print Dumper(@matches), "\n";
}