Subject: | Closing <plaintext> patch |
The patch adds a boolean option that resumes parsing after a "</plaintext>" tag. This emulates the behaviour of Gecko-based browsers. POD and test patches are included.
diff -ruN HTML-Parser-3.37/Parser.pm HTML-Parser-3.37-pl/Parser.pm
--- HTML-Parser-3.37/Parser.pm Wed Nov 10 21:50:31 2004
+++ HTML-Parser-3.37-pl/Parser.pm Thu Nov 11 12:36:51 2004
@@ -443,6 +443,16 @@
By default, tagnames and attribute names are down-cased. Enabling this
attribute leaves them as found in the HTML source document.
+=item $p->closing_plaintext
+
+=item $p->closing_plaintext( $bool )
+
+By default, a "plaintext" element can never be closed. Everything up to
+the end of the document is parsed in CDATA mode. This historical
+behaviour is what at least MSIE does. Enabling this attribute makes a
+closing "</plaintext>" tag effective, and parsing resumes normally
+after this tag is seen. This emulates Gecko-based browsers.
+
=back
As markup and text is recognized, handlers are invoked. The following
diff -ruN HTML-Parser-3.37/Parser.xs HTML-Parser-3.37-pl/Parser.xs
--- HTML-Parser-3.37/Parser.xs Tue Sep 14 17:47:16 2004
+++ HTML-Parser-3.37-pl/Parser.xs Thu Nov 11 12:36:51 2004
@@ -301,6 +301,7 @@
HTML::Parser::attr_encoded = 6
HTML::Parser::case_sensitive = 7
HTML::Parser::strict_end = 8
+ HTML::Parser::closing_plaintext = 9
PREINIT:
bool *attr;
CODE:
@@ -318,6 +319,7 @@
case 6: attr = &pstate->attr_encoded; break;
case 7: attr = &pstate->case_sensitive; break;
case 8: attr = &pstate->strict_end; break;
+ case 9: attr = &pstate->closing_plaintext; break;
default:
croak("Unknown boolean attribute (%d)", ix);
}
diff -ruN HTML-Parser-3.37/hparser.c HTML-Parser-3.37-pl/hparser.c
--- HTML-Parser-3.37/hparser.c Mon Nov 8 17:33:01 2004
+++ HTML-Parser-3.37-pl/hparser.c Thu Nov 11 12:36:51 2004
@@ -1438,7 +1438,7 @@
l++;
}
- if (!*l && strNE(p_state->literal_mode, "plaintext")) {
+ if (!*l && (strNE(p_state->literal_mode, "plaintext") || p_state->closing_plaintext)) {
/* matched it all */
token_pos_t end_token;
end_token.beg = end_text + 2;
diff -ruN HTML-Parser-3.37/hparser.h HTML-Parser-3.37-pl/hparser.h
--- HTML-Parser-3.37/hparser.h Fri Aug 15 09:31:49 2003
+++ HTML-Parser-3.37-pl/hparser.h Thu Nov 11 12:36:51 2004
@@ -103,6 +103,7 @@
bool unbroken_text;
bool attr_encoded;
bool case_sensitive;
+ bool closing_plaintext;
/* other configuration stuff */
SV* bool_attr_val;
diff -ruN HTML-Parser-3.37/t/plaintext.t HTML-Parser-3.37-pl/t/plaintext.t
--- HTML-Parser-3.37/t/plaintext.t Wed May 9 11:50:20 2001
+++ HTML-Parser-3.37-pl/t/plaintext.t Thu Nov 11 12:36:51 2004
@@ -1,4 +1,4 @@
-print "1..1\n";
+print "1..2\n";
use strict;
use HTML::Parser;
@@ -26,3 +26,22 @@
:1:end_document::";
print "ok 1\n";
+@a = ();
+$p->closing_plaintext('yep, emulate gecko');
+$p->parse(<<EOT)->eof;
+<plaintext><foo>
+</plaintext>foo<b></b>
+EOT
+
+for (@a) {
+ $_ = "" unless defined;
+}
+
+$doc = join(":", @a);
+
+#warn "$doc\n";
+
+print "not " unless $doc eq "start_document:::start:<plaintext>::text:<foo>
+:1:end:</plaintext>::text:foo::start:<b>::end:</b>::text:
+::end_document::";
+print "ok 2\n";