Subject: | Text extraction and nested tags within <a ...> ... </a> |
perl -v: This is perl, v5.6.1 built for i386-linux
LinkExtractor: $VERSION = '0.10';
The text within <a ...> ... </a> in the following test program is not
captured correctly. Capturing stops after the nested '</i>'.
# Minimal reproduction: extract the text of an <a> element whose content
# contains a nested <i> element, then print what was captured.
use strict;
use warnings;
use HTML::LinkExtractor;

# Anchor whose text has a nested <i>...</i> followed by more text.
my $html = '<a href="http://www.ncbi.nlm.nih.gov:80/entrez/query.fcgi?cmd=Retrieve&db=PubMed&list_uids=9804551&dopt=Abstract">Gardner <i>et al.</i> (1998). Science 282:1126-1132.</a>';

# Direct class-method call instead of indirect-object "new Class(...)" syntax.
# NOTE(review): the third argument (1) presumably enables stripping of markup
# from _TEXT -- confirm against the module documentation.
my $lx = HTML::LinkExtractor->new(undef, undef, 1);
$lx->parse(\$html);

# For every extracted link, print the captured text, then "tag: href".
for my $link ( @{ $lx->links } ) {
    print "$link->{_TEXT}\n";
    print "$link->{tag}: $link->{href}\n";
}
The following simple patch solves that particular problem, but
may of course break other things :-(
121c121
<
---
> my $lookingFor = '';
153a154
> $lookingFor = $Tag;
194,197c195,201
< my $pop = pop @TEXT;
< $TEXT[-1]->{_TEXT} .= $pop->{_TEXT} if @TEXT;
< $pop->{_TEXT} = _stripHTML( \$pop->{_TEXT} ) if $self->strip;
< $self->{_cb}->($self, $pop) if exists $self->{_cb};
---
> if ( $Tag eq $lookingFor ) {
> my $pop = pop @TEXT;
> $TEXT[-1]->{_TEXT} .= $pop->{_TEXT} if @TEXT;
> $pop->{_TEXT} = _stripHTML( \$pop->{_TEXT} ) if $self->strip;
> $self->{_cb}->($self, $pop) if exists $self->{_cb};
> $lookingFor='';
> }