Bug #8362 for HTML-Parser: Closing <plaintext> patch

Subject:

Closing <plaintext> patch

The patch adds a boolean option, closing_plaintext, that makes the parser resume normal parsing after a "</plaintext>" end tag. This emulates the behaviour of Gecko-based browsers. POD and test patches are included.

diff -ruN HTML-Parser-3.37/Parser.pm HTML-Parser-3.37-pl/Parser.pm
--- HTML-Parser-3.37/Parser.pm Wed Nov 10 21:50:31 2004
+++ HTML-Parser-3.37-pl/Parser.pm Thu Nov 11 12:36:51 2004
@@ -443,6 +443,16 @@
 By default, tagnames and attribute names are down-cased. Enabling this
 attribute leaves them as found in the HTML source document.
 
+=item $p->closing_plaintext
+
+=item $p->closing_plaintext( $bool )
+
+By default, "plaintext" element can never be closed. Everything up to
+the end of the document is parsed in CDATA mode. This historical
+behaviour is what at least MSIE does. Enabling this attribute makes
+closing "</plaintext>" tag effective and the parsing process will resume
+after seeing this tag. This emulates gecko-based browsers.
+
 =back
 
 As markup and text is recognized, handlers are invoked. The following
diff -ruN HTML-Parser-3.37/Parser.xs HTML-Parser-3.37-pl/Parser.xs
--- HTML-Parser-3.37/Parser.xs Tue Sep 14 17:47:16 2004
+++ HTML-Parser-3.37-pl/Parser.xs Thu Nov 11 12:36:51 2004
@@ -301,6 +301,7 @@
     HTML::Parser::attr_encoded = 6
     HTML::Parser::case_sensitive = 7
     HTML::Parser::strict_end = 8
+    HTML::Parser::closing_plaintext = 9
   PREINIT:
     bool *attr;
   CODE:
@@ -318,6 +319,7 @@
         case 6: attr = &pstate->attr_encoded; break;
         case 7: attr = &pstate->case_sensitive; break;
         case 8: attr = &pstate->strict_end; break;
+        case 9: attr = &pstate->closing_plaintext; break;
         default:
             croak("Unknown boolean attribute (%d)", ix);
         }
diff -ruN HTML-Parser-3.37/hparser.c HTML-Parser-3.37-pl/hparser.c
--- HTML-Parser-3.37/hparser.c Mon Nov 8 17:33:01 2004
+++ HTML-Parser-3.37-pl/hparser.c Thu Nov 11 12:36:51 2004
@@ -1438,7 +1438,7 @@
                 l++;
             }
 
-            if (!*l && strNE(p_state->literal_mode, "plaintext")) {
+            if (!*l && (strNE(p_state->literal_mode, "plaintext") || p_state->closing_plaintext)) {
                 /* matched it all */
                 token_pos_t end_token;
                 end_token.beg = end_text + 2;
diff -ruN HTML-Parser-3.37/hparser.h HTML-Parser-3.37-pl/hparser.h
--- HTML-Parser-3.37/hparser.h Fri Aug 15 09:31:49 2003
+++ HTML-Parser-3.37-pl/hparser.h Thu Nov 11 12:36:51 2004
@@ -103,6 +103,7 @@
     bool unbroken_text;
     bool attr_encoded;
     bool case_sensitive;
+    bool closing_plaintext;
 
     /* other configuration stuff */
     SV* bool_attr_val;
diff -ruN HTML-Parser-3.37/t/plaintext.t HTML-Parser-3.37-pl/t/plaintext.t
--- HTML-Parser-3.37/t/plaintext.t Wed May 9 11:50:20 2001
+++ HTML-Parser-3.37-pl/t/plaintext.t Thu Nov 11 12:36:51 2004
@@ -1,4 +1,4 @@
-print "1..1\n";
+print "1..2\n";
 
 use strict;
 use HTML::Parser;
@@ -26,3 +26,22 @@
 :1:end_document::";
 print "ok 1\n";
 
+@a = ();
+$p->closing_plaintext('yep, emulate gecko');
+$p->parse(<<EOT)->eof;
+<plaintext><foo>
+</plaintext>foo<b></b>
+EOT
+
+for (@a) {
+    $_ = "" unless defined;
+}
+
+$doc = join(":", @a);
+
+#warn "$doc\n";
+
+print "not " unless $doc eq "start_document:::start:<plaintext>::text:<foo>
+:1:end:</plaintext>::text:foo::start:<b>::end:</b>::text:
+::end_document::";
+print "ok 2\n";