Subject: | HTML parser error : htmlParseEntityRef: expecting ';' |
Hello.
I use XML::LibXML to parse HTML documents. With version 1.69 everything
worked fine, but since I started using 1.70 I get a lot of warnings. The data
I want to extract is still correct, though.
Example test.pl:
use strict;
use warnings;
use XXX::Sender;
use XXX::Parser;

# Page used to reproduce the parser warnings.
my $url = 'http://video.mail.ru/mail/kadaj.ff/16/18.html';

# Fetch the page; get() yields a false value when the request fails.
my $sender       = XXX::Sender->new();
my $page_content = $sender->get($url);

# `return` is only legal inside a subroutine; at the top level of a script
# it raises "Can't return outside a subroutine", so stop with die instead.
die "Could not fetch $url\n" unless $page_content;

# Parse the HTML and look up the favicon link.
my $parser = XXX::Parser->new();
$parser->parse_html($page_content);

my $favicon_uri = $parser->get_favicon_src();
if ($favicon_uri) {
    print "Favicon_uri:" . $favicon_uri . "\n";
}
Required modules:
package XXX::Sender;
use strict;
use LWP::UserAgent;
use HTTP::Request;
use base qw( Class::Accessor::Fast );
# NOTE(review): the methods of this package only ever store $self->{ua};
# the 'parser' and 'root' accessors look copied from XXX::Parser — confirm
# whether they are needed here.
__PACKAGE__->mk_ro_accessors( qw( parser root ) );
# Constructor. Accepts either a class name or an existing object as the
# invocant, blesses an empty hash into that class and hands it to init().
# Returns whatever init() returns.
sub new {
    my ($invocant) = @_;
    my $package = ref($invocant) || $invocant;
    my $object  = bless( {}, $package )->init;
    return $object;
}
# Creates the LWP::UserAgent used for requests, sets its timeout to 40 and
# stores it in $self->{ua}. Returns $self.
sub init {
    my ($self) = @_;
    my $agent = LWP::UserAgent->new();
    $agent->timeout(40);
    $self->{ua} = $agent;
    return $self;
}
# Issues a GET request for $uri through the stored user agent.
# Returns the response content on success, undef otherwise.
sub get {
    my ( $self, $uri ) = @_;
    my $request  = HTTP::Request->new( GET => $uri );
    my $response = $self->{ua}->request($request);
    return undef unless $response->is_success;
    return $response->content;
}
1;
package XXX::Parser;
use strict;
use XML::LibXML;
# NOTE(review): `local` at file scope is undone as soon as this file's scope
# is left, so these two settings are presumably reverted before any method
# below is called — confirm whether a plain assignment was intended.
local $XML::LibXML::skipXMLDeclaration = 1;
local $XML::LibXML::skipDTD = 1;
use base qw( Class::Accessor::Fast );
# Accessors for the {parser} slot (set in _init) and the {root} slot
# (set in parse_html).
__PACKAGE__->mk_ro_accessors( qw( parser root ) );
# Constructor. Accepts either a class name or an existing object as the
# invocant, blesses an empty hash into that class and hands it to _init().
# Returns whatever _init() returns.
sub new {
    my ($invocant) = @_;
    my $package = ref($invocant) || $invocant;
    my $object  = bless( {}, $package )->_init;
    return $object;
}
# Creates the XML::LibXML parser, applies the option setters listed below in
# order, and stores the parser in $self->{parser}. Returns $self.
sub _init {
    my ($self) = @_;
    my $libxml = XML::LibXML->new();

    # Setter name => value, applied in this order.
    my @settings = (
        [ expand_entities  => 0 ],
        [ validation       => 0 ],
        [ no_network       => 1 ],
        [ recover_silently => 1 ],
    );
    for my $setting (@settings) {
        my ( $option, $value ) = @{$setting};
        $libxml->$option($value);
    }

    $self->{parser} = $libxml;
    return $self;
}
# Parses the given HTML string with the stored parser, remembers the
# document element in $self->{root} and returns it.
sub parse_html {
    my ( $self, $html ) = @_;
    my $document = $self->parser->parse_html_string($html);
    my $top      = $document->documentElement();
    $self->{root} = $top;
    return $self->{root};
}
# Returns the href attribute of the first <link rel="shortcut icon"> element
# under the parsed root, or undef when there is no such element or when no
# document has been parsed yet.
sub get_favicon_src {
    my ($self) = @_;

    # parse_html() has not stored a root yet: nothing to search.
    my $root = $self->root;
    return undef unless $root;

    # The XPath literal must stay on one line; a line break inside the quotes
    # becomes part of the attribute value being compared.
    my ($link) = $root->findnodes('//link[@rel="shortcut icon"]');
    return undef unless $link;

    return $link->getAttribute("href");
}
1;
And this is the output I got when I ran it:
HTML parser error : htmlParseEntityRef: expecting ';'
rel="video_src"
href="http://img.mail.ru/r/video2/player_v2.swf?orig=2&movieSrc
^
HTML parser error : htmlParseEntityRef: expecting ';'
tp://img.mail.ru/r/video2/player_v2.swf?orig=2&movieSrc=mail/kadaj.ff/16/18&host
^
HTML parser error : htmlParseEntityRef: expecting ';'
player_v2.swf?orig=2&movieSrc=mail/kadaj.ff/16/18&host=video.mail.ru&contentHost
^
validity error : ID cln6259 already defined
<a class="lw lw-mail" href="http://mail.ru"
name="cln6259"><i></i></a>
^
validity error : ID cln6259 already defined
<a class="lw lw-video" href="http://video.mail.ru"
name="cln6259"><i></i></a>
^
validity error : ID cln4880 already defined
A"><a href="https://money.mail.ru/" name="cln4880" class="shAaa"
target="_blank"
^
HTML parser error : htmlParseEntityRef: expecting ';'
<a
href="http://www.mail.ru/agent?message&to=kadaj.ff@mail.ru" title="Щелк
^
HTML parser error : Element script embeds close tag
'<i class="mf_spIco" onclick="return
Captcha.hide();"></i>' +
^
HTML parser error : Element script embeds close tag
'</form>' +
^
HTML parser error : Element script embeds close tag
'</div>' +
^
HTML parser error : Element script embeds close tag
'</div>';
^
HTML parser error : htmlParseEntityRef: expecting ';'
<param name="flashvars"
value="orig=2&movieSrc=mail/kadaj.ff/16/18" /
^
HTML parser error : Element script embeds close tag
de").value = fotoNothing; gebi("lj-code").value = "<lj-embed>" +
fotoNothing + '
^
validity error : ID goleft_listId already defined
<div
id="goleft_listId"><a href="#" class="u"><img height="22" width="100%
^
validity error : ID goright_listId already defined
<div
id="goright_listId"><a href="#" class="d"><img height="22" width="100
^
HTML parser error : htmlParseEntityRef: expecting ';'
<img src="http://rs.mail.ru/d275994.gif?rnd=203403146&ts=1287384542"
width="1" h
^
HTML parser error : htmlParseEntityRef: expecting ';'
<img src="http://rs.mail.ru/d288730.gif?rnd=157728719&ts=1287384542"
width="1" h
^
HTML parser error : Element script embeds close tag
ww.macromedia.com/shockwave/download/index.cgiP1_Prod_Version=ShockwaveFlash"
/>
^
HTML parser error : Element script embeds close tag
edia.com/shockwave/download/index.cgiP1_Prod_Version=ShockwaveFlash"
/></object>
^
HTML parser error : Element script embeds close tag
rs.mail.ru/b11675457.jpg" width="200" height="300" border="0" alt=""
title="" />
^
HTML parser error : Element script embeds close tag
ail.ru/b11675457.jpg" width="200" height="300" border="0" alt=""
title="" /></a>
^
HTML parser error : htmlParseEntityRef: expecting ';'
"rb_banner"><a name="clb288730"
href="http://1link.mail.ru/c.php?site_id=49118&p
^
HTML parser error : htmlParseEntityRef: expecting ';'
><a name="clb288730"
href="http://1link.mail.ru/c.php?site_id=49118&p=231&sub_id
^
HTML parser error : htmlParseEntityRef: expecting ';'
сайте</a> <a type="mrim-status-9" href="http://www.mail.ru/agent?message&to
^
HTML parser error : htmlParseEntityRef: expecting ';'
сайте</a> <a type="mrim-status-9" href="http://www.mail.ru/agent?message&to
^
HTML parser error : Tag wbr invalid
y.mail.ru/mail/kolpakova-d/" class="booster-sc">Дарья Колпаков<wbr
^
HTML parser error : htmlParseEntityRef: expecting ';'
сайте</a> <a type="mrim-status-9" href="http://www.mail.ru/agent?message&to
^
HTML parser error : Tag wbr invalid
.mail.ru/mail/green_apple-/" class="booster-sc">Мария Сергеевн<wbr
^
HTML parser error : htmlParseEntityRef: expecting ';'
сайте</a> <a type="mrim-status-9" href="http://www.mail.ru/agent?message&to
^
HTML parser error : Tag wbr invalid
href="http://my.mail.ru/mail/roni-74/" class="booster-sc">Вероника<wbr
^
HTML parser error : Tag wbr invalid
/mail/roni-74/" class="booster-sc">Вероника<wbr /> Свиридов<wbr
^
HTML parser error : htmlParseEntityRef: expecting ';'
<!-- start slot 1879 --><img
src="http://rs.mail.ru/d225277.gif?rnd=174129342&ts
^
HTML parser error : htmlParseEntityRef: expecting ';'
<!-- Start slot 3 --><img
src="http://rs.mail.ru/d292152.gif?rnd=991533156&ts=12
^
HTML parser error : Element script embeds close tag
('<sc'+'ript type="text/javascript"
src="http://an.yandex.ru/system/context.js">
^
HTML parser error : Element script embeds close tag
+'ript type="text/javascript"
src="http://autocontext.begun.ru/autocontext2.js">
^
After all these warnings I still got the correct answer:
Favicon_uri:http://video.mail.ru/favicon.ico
Is this a bug in the library, or am I doing something wrong?