Subject: | API extension request |
Perl version: v5.8.3 built for i386-linux-thread-multi
HTML::Parser version: 3.36
on Linux 2.4.25, Debian testing dist
I am working with emulation of web browsers and found I need to have some level of preprocessing in the HTML parser. A primitive I could use for this is the ability to inject input immediately after the current parse token.
As best I can tell, when a browser hits a chunk of content such as:
<script>
document.write('<a href="http://www.perl.org/">the stuff</a>');
</script>
it essentially injects that text immediately after the </script> element in the input parse buffer.
The attached patch adds an ->inject(chunk) method to an HTML::Parser object, and is far from a clean patch, but shows my intent.
Here is a sample use of the inject method to do simple preprocessing:
#!/usr/bin/perl
use strict;
use warnings;
use lib 'blib/lib';
use lib 'blib/arch';
use HTML::Parser qw();
use URI::Escape qw();
use IO::String qw();
use IO::Handle qw();
# Sample input: two macro definitions (<deftag name="...">) followed by a
# document that uses one of them (<navbar>), which itself uses the other
# (<foo>).
my $html = <<'EOF';
<deftag name="foo">bar</deftag>
<deftag name="navbar">
<foo>
<table>
<tr><td><a href="http://www.perl.org/">perl</a>
<tr><td><a href="http://www.apache.org/">apache</a>
<tr><td><a href="http://www.mozilla.org/">mozilla</a>
</table>
</deftag>
<html><head><title>foo</title></head><body>
<navbar>
Testing 1... 2... 3...
</body></html>
EOF

# Captured macro bodies, keyed by lower-cased macro name.  Names are
# lower-cased because the parser reports tag names in lower case by default,
# so a mixed-case name="..." would otherwise never match.
my %special;

# Names of the <deftag> elements currently being captured, innermost first.
# An entry is undef when the <deftag> had no name attribute.  Kept as a stack
# (parallel to the capture handles in @out) so nested definitions do not
# clobber each other.
my @open_defs;

# Output sinks, innermost first.  The last element is always STDOUT; each open
# <deftag> pushes an in-memory IO::String handle in front of it.
my @out = (\*STDOUT);

# Declared before construction because the start handler closes over $p in
# order to call ->inject on it.
my $p;
$p = HTML::Parser->new(
    start_h => [
        sub {
            my ($tag, $attr, $txt) = @_;
            if (exists $special{$tag}) {
                # Macro use: drop the tag itself and feed the stored body
                # back into the parser's input instead.
                $p->inject($special{$tag});
            }
            elsif ($tag eq 'deftag') {
                # Macro definition: start capturing output into a string.
                my $name = $attr->{name};
                unshift @open_defs, defined $name ? lc $name : undef;
                unshift @out, IO::String->new();
            }
            else {
                $out[0]->print($txt);
            }
        },
        'tag,attr,text',
    ],
    text_h => [ sub { $out[0]->print(shift) }, 'text' ],
    end_h  => [
        sub {
            # For end events the 'tag' argspec yields the name with a
            # leading '/', hence the comparison against '/deftag' below.
            my ($tag, $txt) = @_;
            if ($tag ne '/deftag') {
                $out[0]->print($txt);
                return;
            }
            if (!@open_defs) {
                # Without this guard a stray end tag would remove STDOUT
                # from the sink stack.
                warn "Ignoring </deftag> with no matching <deftag>\n";
                return;
            }
            my $name = shift @open_defs;
            my $sink = shift @out;
            if (defined $name) {
                $special{$name} = ${ $sink->string_ref() };
            }
            else {
                warn "Discarding <deftag> body: no name attribute\n";
            }
        },
        'tag,text',
    ],
) or die "No parser: $!";

$p->parse($html);
# Signal end of input so that any text the parser is still buffering is
# delivered to the text handler.
$p->eof;
diff -Naur HTML-Parser-3.36.orig/Parser.xs HTML-Parser-3.36/Parser.xs
--- HTML-Parser-3.36.orig/Parser.xs 2003-10-27 13:32:48.000000000 -0800
+++ HTML-Parser-3.36/Parser.xs 2004-04-05 10:40:21.000000000 -0700
@@ -225,6 +225,17 @@
hv_store(hv, "_hparser_xs_state", 17, newRV_noinc(sv), 0);
void
+inject(self, chunk)
+ SV* self;
+ SV* chunk;
+ PREINIT:
+ PSTATE* p_state = get_pstate_hv(aTHX_ self);
+ CODE:
+ if (p_state->parsing == 0)
+ croak("inject invalid outside of parse");
+ inject(aTHX_ p_state, chunk, self);
+
+void
parse(self, chunk)
SV* self;
SV* chunk
diff -Naur HTML-Parser-3.36.orig/hparser.c HTML-Parser-3.36/hparser.c
--- HTML-Parser-3.36.orig/hparser.c 2004-04-01 03:56:37.000000000 -0800
+++ HTML-Parser-3.36/hparser.c 2004-04-05 15:22:01.000000000 -0700
@@ -1406,7 +1406,7 @@
char *t = beg;
char *new_pos;
- while (!p_state->eof) {
+ while (!p_state->eof && !p_state->inject) {
/*
* At the start of this loop we will always be ready for eating text
* or a new tag. We will never be inside some tag. The 't' points
@@ -1580,6 +1580,27 @@
}
+/* The goal here is to be able to insert content at the "current" parse point.
+ * That allows the parser to also act as a macro preprocessor.
+ * Right now the "current" parse point is immediately following the
+ * current or next CDATA section (though it would be better to have insertion
+ * occur immediately after the next ending tag).
+ */
+EXTERN void
+inject(pTHX_
+ PSTATE* p_state,
+ SV* chunk,
+ SV* self)
+{
+ STRLEN plen;
+ char *pstr = SvPV(chunk, plen);
+ if(p_state->inject) {
+ sv_catsv(p_state->inject, chunk);
+ } else {
+ p_state->inject = newSVsv(chunk);
+ }
+}
+
EXTERN void
parse(pTHX_
PSTATE* p_state,
@@ -1646,13 +1667,16 @@
if (p_state->buf && SvOK(p_state->buf)) {
sv_catsv(p_state->buf, chunk);
- beg = SvPV(p_state->buf, len);
}
else {
- beg = SvPV(chunk, len);
- if (p_state->offset == 0)
+ p_state->buf = newSVsv(chunk);
+ if (p_state->offset == 0) {
+ beg = SvPV(chunk, len);
report_event(p_state, E_START_DOCUMENT, beg, beg, 0, 0, self);
+ }
}
+RETRY:
+ beg = SvPV(p_state->buf, len);
if (!len)
return; /* nothing to do */
@@ -1678,5 +1702,13 @@
p_state->buf = newSVpv(s, end - s);
}
}
+ if (p_state->inject) {
+ /* insert any ->inject(chunk) at the beginning of the buffer */
+ STRLEN plen;
+ char *pstr = SvPV(p_state->inject, plen);
+ sv_insert(p_state->buf, 0, 0, pstr, plen);
+ p_state->inject = NULL;
+ goto RETRY;
+ }
return;
}
diff -Naur HTML-Parser-3.36.orig/hparser.h HTML-Parser-3.36/hparser.h
--- HTML-Parser-3.36.orig/hparser.h 2003-08-14 22:31:49.000000000 -0700
+++ HTML-Parser-3.36/hparser.h 2004-04-01 15:11:02.000000000 -0800
@@ -72,6 +72,7 @@
STRLEN column;
bool parsing;
bool eof;
+ SV* inject;
/* special parsing modes */
char* literal_mode;