Skip Menu |

This queue is for tickets about the HTML-Parser CPAN distribution.

Report information
The Basics
Id: 18936
Status: resolved
Priority: 0/
Queue: HTML-Parser

People
Owner: Nobody in particular
Requestors: cpan [...] yaakovnet.net
Cc:
AdminCc:

Bug Information
Severity: Normal
Broken in: 3.51
Fixed in: (no value)



Subject: ignore_element treats </script> like <script>
* thank you very much for the excellent module. *** Problem *** I use $p->ignore_elements("script") in order to skip JavaScript. An (incorrectly formatted) html page contains a closing </script> tag without the starting <script> tag. HTML::Parser treats this tag like an opening tag and ignores the rest of the document *** Solution *** The attached file hparser.c contains the fix; I have added a test for this problem to the file filter-methods.t and attached it as well. *** Explanation *** Here is the part of hparser.c that deals with opening ignored elements. As you see, I have added a test 'if (event == E_START)': if ( p_state->ignore_elements && hv_fetch_ent(p_state->ignore_elements, tagname, 0, 0) ) { if (event == E_START) { /* This line is added */ p_state->ignoring_element = newSVsv(tagname); p_state->ignore_depth = 1; } /* This line is added */ goto IGNORE_EVENT; } If this is a start tag, the test succeeds and the program works as before. But if this is a closing tag (without a previous opening tag), the event is simply ignored.
Subject: hparser.c

Message body is not shown because it is too large.

Subject: filter-methods.t
#!/usr/bin/perl -w use Test::More tests => 12; use strict; use HTML::Parser; my $p = HTML::Parser->new(api_version => 3, ignore_tags => [qw(b i em tt)]); $p->ignore_elements("script"); $p->unbroken_text(1); $p->handler(default => [], "event, text"); $p->parse(<<"EOT")->eof; <html><head><title>foo</title><Script language="Perl"> while (<B>) { # ... } </Script><body> This is an <i>italic</i> and <b>bold</b> text. </body> </html> EOT my $t = join("||", map join("|", @$_), @{$p->handler("default")}); #diag $t; is($t, "start_document|||start|<html>||start|<head>||start|<title>||text|foo||end|</title>||start|<body>||text| This is an italic and bold text. ||end|</body>||text| ||end|</html>||text| ||end_document|", 'ignore_elements'); #------------------------------------------------------ $p = HTML::Parser->new(api_version => 3); $p->report_tags("a"); $p->handler(start => sub { my($tagname, %attr) = @_; ok($tagname eq "a" && $attr{href} eq "#a", 'report_tags start'); }, 'tagname, @attr'); $p->handler(end => sub { my $tagname = shift; is($tagname, "a", 'report_tags end'); }, 'tagname'); $p->parse(<<EOT)->eof; <h1>Next example</h1> This is <a href="#a">very nice</a> example. EOT #------------------------------------------------------ my @tags; $p = HTML::Parser->new(api_version => 3); $p->report_tags(qw(a em)); $p->ignore_tags(qw(em)); $p->handler(end => sub {push @tags, @_;}, 'tagname'); $p->parse(<<EOT)->eof; <h1>Next example</h1> This is <em>yet another</em> <a href="#a">very nice</a> example. EOT is(join('|', @tags), 'a', 'report_tags followed by ignore_tags'); #------------------------------------------------------ @tags = (); $p = HTML::Parser->new(api_version => 3); $p->report_tags(qw(h1)); $p->report_tags(); $p->handler(end => sub {push @tags, @_;}, 'tagname'); $p->parse(<<EOT)->eof; <h1>Next example</h1> <h2>Next example</h2> EOT is(join('|', @tags), 'h1|h2', 'reset report_tags filter'); #------------------------------------------------------ @tags = (); $p = HTML::Parser->new(api_version => 3); $p->report_tags(qw(h1 h2)); $p->ignore_tags(qw(h2)); $p->report_tags(qw(h1 h2)); $p->handler(end => sub {push @tags, @_;}, 'tagname'); $p->parse(<<EOT)->eof; <h1>Next example</h1> <h2>Next example</h2> EOT is(join('|', @tags), 'h1', 'report_tags does not reset ignore_tags'); #------------------------------------------------------ @tags = (); $p = HTML::Parser->new(api_version => 3); $p->report_tags(qw(h1 h2)); $p->ignore_tags(qw(h2)); $p->report_tags(); $p->handler(end => sub {push @tags, @_;}, 'tagname'); $p->parse(<<EOT)->eof; <h1>Next example</h1> <h2>Next example</h2> EOT is(join('|', @tags), 'h1', 'reset report_tags does no reset ignore_tags'); #------------------------------------------------------ @tags = (); $p = HTML::Parser->new(api_version => 3); $p->report_tags(qw(h1 h2)); $p->report_tags(qw(h3)); $p->handler(end => sub {push @tags, @_;}, 'tagname'); $p->parse(<<EOT)->eof; <h1>Next example</h1> <h2>Next example</h2> <h3>Next example</h3> EOT is(join('|', @tags), 'h3', 'report_tags replaces filter'); #------------------------------------------------------ @tags = (); $p = HTML::Parser->new(api_version => 3); $p->ignore_tags(qw(h1 h2)); $p->ignore_tags(qw(h3)); $p->handler(end => sub {push @tags, @_;}, 'tagname'); $p->parse(<<EOT)->eof; <h1>Next example</h1> <h2>Next example</h2> <h3>Next example</h3> EOT is(join('|', @tags), 'h1|h2', 'ignore_tags replaces filter'); #------------------------------------------------------ @tags = (); $p = HTML::Parser->new(api_version => 3); $p->ignore_tags(qw(h2)); $p->ignore_tags(); $p->handler(end => sub {push @tags, @_;}, 'tagname'); $p->parse(<<EOT)->eof; <h1>Next example</h1> <h2>Next example</h2> EOT is(join('|', @tags), 'h1|h2', 'reset ignore_tags filter'); #------------------------------------------------------ @tags = (); $p = HTML::Parser->new(api_version => 3); $p->ignore_tags(qw(h2)); $p->report_tags(qw(h1 h2)); $p->handler(end => sub {push @tags, @_;}, 'tagname'); $p->parse(<<EOT)->eof; <h1>Next example</h1> <h2>Next example</h2> EOT is(join('|', @tags), 'h1', 'ignore_tags before report_tags'); #------------------------------------------------------ $p = HTML::Parser->new(api_version => 3); $p->ignore_elements("script"); my $res=""; $p->handler(default=> sub {$res.=$_[0];}, 'text'); $p->parse(<<'EOT')->eof; A <script> B </script> C </script> D <script> E </script> F EOT is($res,"A C D F\n","ignore </script> without <script> correctly");