Subject: | Feature implementation: joining congruent character data together in SAX driver |
I've modified the SAX layer of XML::LibXML to add a feature that joins consecutive
character data events together in C and then delivers them as larger single events. This is a
feature that XML::SAX::ExpatXS offers, and in my performance testing against Wikipedia dumps
in various languages it can yield between 30% and 100% more processing speed. There is a
negligible penalty for having this feature available but leaving it off (measured to be 1% or
less against my test files). There may be a penalty for turning the feature on when the markup
ratio is very high.
I've modified XML::LibXML::SAX to support turning the feature on and off via the SAX features
API and documented how to enable the feature. The feature currently defaults to off to
preserve existing behavior. There is also a slight issue: with the feature enabled, 2 tests
from 02parse.t fail and I cannot figure out why. The 14sax.t passes all tests. To see the test
failures you must enable the feature by default; with the feature off, all tests pass as normal.
The feature works like this:
* A new character buffering library was created and placed inline in perl-libxml-sax.c
* The character event handler was split into the handler itself and a separate dispatcher for
character events.
* The parser context vector gained a field for the character buffer and a flag to
enable or disable the feature.
* When the feature is enabled, the character event handler appends event data to the buffer
instead of delivering the event immediately.
* Each time another event happens (start element, end element, processing instruction, etc.),
any built-up character data is delivered as a single event, the buffer is emptied, and the new
event is passed on as normal.
I hope you see this patch as fit for inclusion in the distribution because in my tests it brings
the processing speed of XML::LibXML::SAX close to or exceeding the speed of XML::Parser.
Tyler Riddle
Subject: | XML-LibXML-1.70_joinchars.patch |
Only in XML-LibXML: .project
diff -r -u /Users/tyler/Downloads/XML-LibXML-1.70/LibXML.xs XML-LibXML/LibXML.xs
--- /Users/tyler/Downloads/XML-LibXML-1.70/LibXML.xs 2009-10-06 15:02:23.000000000 -0700
+++ XML-LibXML/LibXML.xs 2009-12-02 08:16:32.000000000 -0800
@@ -534,7 +534,7 @@
}
read_length = SvIV(read_results);
-
+
chars = SvPV(tbuff, read_length);
strncpy(buffer, chars, read_length);
@@ -1846,6 +1846,7 @@
int ret;
while ((read_length = LibXML_read_perl(fh, buffer, 1024))) {
ret = xmlParseChunk(ctxt, buffer, read_length, 0);
+
if ( ret != 0 ) {
break;
}
diff -r -u /Users/tyler/Downloads/XML-LibXML-1.70/lib/XML/LibXML/SAX.pm XML-LibXML/lib/XML/LibXML/SAX.pm
--- /Users/tyler/Downloads/XML-LibXML-1.70/lib/XML/LibXML/SAX.pm 2009-09-23 01:11:58.000000000 -0700
+++ XML-LibXML/lib/XML/LibXML/SAX.pm 2009-12-01 21:56:43.000000000 -0800
@@ -26,6 +26,18 @@
return $XML::LibXML::__threads_shared ? 0 : 1;
}
+# Set a SAX feature on this parser.
+#
+# The join-character-data feature is handled here: its value is stored in
+# $self->{JOIN_CHARACTERS} and 1 is returned. Every other feature is passed
+# on to the parent class, whose return value is handed back unchanged.
+sub set_feature {
+    my $self = shift;
+    my ($feat, $val) = @_;
+
+    # Anything that is not our own feature belongs to the parent class.
+    if ($feat ne 'http://xmlns.perl.org/sax/join-character-data') {
+        return $self->SUPER::set_feature(@_);
+    }
+
+    $self->{JOIN_CHARACTERS} = $val;
+    return 1;
+}
+
sub _parse_characterstream {
my ( $self, $fh ) = @_;
# this my catch the xml decl, so the parser won't get confused about
@@ -35,6 +47,7 @@
sub _parse_bytestream {
my ( $self, $fh ) = @_;
+
$self->{ParserOptions}{LibParser} = XML::LibXML->new;
$self->{ParserOptions}{ParseFunc} = \&XML::LibXML::parse_fh;
$self->{ParserOptions}{ParseFuncParam} = $fh;
@@ -74,6 +87,12 @@
sub _parse {
my $self = shift;
my $args = bless $self->{ParserOptions}, ref($self);
+
+ if (defined($self->{JOIN_CHARACTERS})) {
+ $args->{LibParser}->{JOIN_CHARACTERS} = $self->{JOIN_CHARACTERS};
+ } else {
+ $args->{LibParser}->{JOIN_CHARACTERS} = 0;
+ }
$args->{LibParser}->set_handler( $self );
eval {
@@ -92,6 +111,5 @@
return;
}
-
1;
diff -r -u /Users/tyler/Downloads/XML-LibXML-1.70/lib/XML/LibXML/SAX.pod XML-LibXML/lib/XML/LibXML/SAX.pod
--- /Users/tyler/Downloads/XML-LibXML-1.70/lib/XML/LibXML/SAX.pod 2009-10-07 05:20:45.000000000 -0700
+++ XML-LibXML/lib/XML/LibXML/SAX.pod 2009-12-02 09:13:45.000000000 -0800
@@ -27,6 +27,21 @@
wrong behaviour. If you run into specific problems using this part of
XML::LibXML, let me know.
+=head1 FEATURES
+
+I<<<<<< NOTE: >>>>>> This feature is experimental.
+
+Character data joining delivers consecutive character data events as a
+single larger event, which may yield a significant speed boost when the
+ratio of markup to text is low. Enable it through the
+http://xmlns.perl.org/sax/join-character-data feature of this parser,
+using the set_feature method like this:
+
+ $p->set_feature('http://xmlns.perl.org/sax/join-character-data', 1);
+
+Pass 0 instead of 1 to disable the feature. It is disabled by default.
+
=head1 AUTHORS
Matt Sergeant,
diff -r -u /Users/tyler/Downloads/XML-LibXML-1.70/perl-libxml-sax.c XML-LibXML/perl-libxml-sax.c
--- /Users/tyler/Downloads/XML-LibXML-1.70/perl-libxml-sax.c 2009-09-24 01:38:57.000000000 -0700
+++ XML-LibXML/perl-libxml-sax.c 2009-12-02 09:16:23.000000000 -0800
@@ -56,10 +56,23 @@
xmlDocPtr ns_stack_root;
SV * handler;
SV * saved_error;
+ struct CBuffer *charbuf;
+ int joinchars;
} PmmSAXVector;
typedef PmmSAXVector* PmmSAXVectorPtr;
+struct CBufferChunk { /* one node in the singly linked list of buffered character data */
+ struct CBufferChunk *next; /* following chunk, or NULL for the last chunk in the list */
+ xmlChar *data; /* xmlMalloc'ed copy of one appended string (no terminating NUL); NULL while the chunk is unused */
+ int len; /* number of bytes stored in data */
+};
+
+struct CBuffer { /* pending character data, kept as a linked list of chunks */
+ struct CBufferChunk *head; /* first chunk; its data is NULL while the buffer is empty */
+ struct CBufferChunk *tail; /* last chunk; CBufferAppend fills it and links a fresh empty chunk after it */
+};
+
static U32 PrefixHash; /* pre-computed */
static U32 NsURIHash;
static U32 NameHash;
@@ -115,7 +128,6 @@
return retval;
}
-
void
PmmSAXInitialize(pTHX)
{
@@ -134,6 +146,137 @@
}
xmlSAXHandlerPtr PSaxGetHandler();
+int PSaxCharactersFlush(void *, struct CBuffer *);
+
+
+/* Character buffering functions */
+
+/*
+ * Allocate a chunk with every field zeroed (no data, length 0, no
+ * successor).
+ *
+ * Returns the new chunk, or NULL if xmlMalloc() fails. The caller owns the
+ * chunk and releases it with xmlFree().
+ */
+struct CBufferChunk * CBufferChunkNew(void) {
+    struct CBufferChunk *newchunk = xmlMalloc(sizeof(struct CBufferChunk));
+
+    /* xmlMalloc() can fail; never hand a NULL pointer to memset(). */
+    if (newchunk == NULL) {
+        return NULL;
+    }
+
+    memset(newchunk, 0, sizeof(struct CBufferChunk));
+    return newchunk;
+}
+
+/*
+ * Create an empty character buffer.
+ *
+ * An empty buffer consists of a single unused chunk that is both head and
+ * tail of the list.
+ *
+ * Returns the new buffer, or NULL if either allocation fails. Release the
+ * buffer with CBufferFree().
+ */
+struct CBuffer * CBufferNew(void) {
+    struct CBuffer *buffer = xmlMalloc(sizeof(struct CBuffer));
+    struct CBufferChunk *newchunk;
+
+    if (buffer == NULL) {
+        return NULL;
+    }
+
+    newchunk = CBufferChunkNew();
+
+    if (newchunk == NULL) {
+        /* Do not leak the half-built buffer when the chunk is missing. */
+        xmlFree(buffer);
+        return NULL;
+    }
+
+    memset(buffer, 0, sizeof(struct CBuffer));
+
+    buffer->head = newchunk;
+    buffer->tail = newchunk;
+
+    return buffer;
+}
+
+/*
+ * Discard everything stored in the buffer and leave it empty and reusable.
+ *
+ * All chunks and their data are released, then a fresh unused chunk is
+ * installed as both head and tail. A NULL buffer, a buffer without a head
+ * chunk, and an already empty buffer are left untouched.
+ */
+void CBufferPurge(struct CBuffer *buffer) {
+    struct CBufferChunk *cur;
+    struct CBufferChunk *next;
+
+    /* Test head for NULL before it is dereferenced anywhere below. */
+    if (buffer == NULL || buffer->head == NULL) {
+        return;
+    }
+
+    /*
+     * The tail is always the unused chunk that the next append will fill,
+     * so the buffer is empty exactly when head and tail are the same chunk.
+     * Looking at head->data instead would mistake a first chunk whose data
+     * pointer happens to be NULL for an empty buffer and skip the purge.
+     */
+    if (buffer->head == buffer->tail) {
+        return;
+    }
+
+    for (cur = buffer->head; cur != NULL; cur = next) {
+        /* Remember the successor before the chunk itself is freed. */
+        next = cur->next;
+
+        if (cur->data != NULL) {
+            xmlFree(cur->data);
+        }
+
+        xmlFree(cur);
+    }
+
+    buffer->head = CBufferChunkNew();
+    buffer->tail = buffer->head;
+}
+
+/*
+ * Release a buffer together with all of its chunks and their data.
+ * Passing NULL is allowed and does nothing.
+ */
+void CBufferFree(struct CBuffer *buffer) {
+    struct CBufferChunk *chunk;
+    struct CBufferChunk *following;
+
+    if (buffer != NULL) {
+        chunk = buffer->head;
+
+        while (chunk != NULL) {
+            /* Save the link first: the chunk is gone after xmlFree(). */
+            following = chunk->next;
+
+            if (chunk->data != NULL) {
+                xmlFree(chunk->data);
+            }
+
+            xmlFree(chunk);
+            chunk = following;
+        }
+
+        xmlFree(buffer);
+    }
+}
+
+/*
+ * Return the total number of bytes held in the buffer, i.e. the sum of the
+ * len fields of every chunk in the list.
+ */
+int CBufferLength(struct CBuffer *buffer) {
+    struct CBufferChunk *chunk = buffer->head;
+    int total = 0;
+
+    while (chunk != NULL) {
+        total += chunk->len;
+        chunk = chunk->next;
+    }
+
+    return total;
+}
+
+/*
+ * Append a copy of len bytes of newstring to the end of the buffer.
+ *
+ * The bytes are copied into the current tail chunk and a fresh unused chunk
+ * is linked behind it as the new tail. Nothing is appended when newstring
+ * is NULL or len is not positive.
+ *
+ * NOTE(review): the function returns void, so an allocation failure cannot
+ * be reported; in that case the buffer is left unchanged and the data is
+ * dropped.
+ */
+void CBufferAppend(struct CBuffer *buffer, const xmlChar *newstring, int len) {
+    xmlChar *copy;
+    struct CBufferChunk *newtail;
+
+    /*
+     * An empty append would store a chunk whose data pointer may be NULL,
+     * and head->data == NULL is what the other buffer functions use to
+     * recognise an empty buffer.
+     */
+    if (newstring == NULL || len <= 0) {
+        return;
+    }
+
+    copy = xmlMalloc(len);
+
+    if (copy == NULL) {
+        return;
+    }
+
+    /* Get the new tail before touching the list so a failure changes nothing. */
+    newtail = CBufferChunkNew();
+
+    if (newtail == NULL) {
+        xmlFree(copy);
+        return;
+    }
+
+    memcpy(copy, newstring, len);
+
+    buffer->tail->data = copy;
+    buffer->tail->len = len;
+    buffer->tail->next = newtail;
+    buffer->tail = newtail;
+}
+
+/*
+ * Join the data of every chunk into one newly allocated, NUL-terminated
+ * string.
+ *
+ * Returns NULL when the buffer is empty or the allocation fails. The caller
+ * owns the returned string and must release it with xmlFree(). The buffer
+ * itself is not modified.
+ */
+xmlChar * CBufferCharacters(struct CBuffer *buffer) {
+    int length;
+    int copied = 0;
+    xmlChar *joined;
+    xmlChar *p;
+    struct CBufferChunk *cur;
+
+    /*
+     * Check for the empty buffer before allocating anything, so the early
+     * return cannot leak the result string.
+     */
+    if (buffer->head->data == NULL) {
+        return NULL;
+    }
+
+    length = CBufferLength(buffer);
+    joined = xmlMalloc(length + 1);
+
+    if (joined == NULL) {
+        return NULL;
+    }
+
+    p = joined;
+
+    for (cur = buffer->head; cur; cur = cur->next) {
+        if (! cur->data) {
+            continue;
+        }
+
+        /* Guard against writing past the length computed above. */
+        if ((copied = copied + cur->len) > length) {
+            fprintf(stderr, "string overflow\n");
+            abort();
+        }
+
+        memcpy(p, cur->data, cur->len);
+        p += cur->len;
+    }
+
+    joined[length] = '\0';
+
+    return joined;
+}
+
+/* end character buffering functions */
void
@@ -141,6 +284,8 @@
{
PmmSAXVectorPtr vec = NULL;
SV ** th;
+ SV ** joinchars;
+
dTHX;
CLEAR_SERROR_HANDLER
@@ -164,7 +309,21 @@
vec->handler = SvREFCNT_inc(*th) ;
}
else {
- vec->handler = NULL ;
+ vec->handler = NULL;
+ }
+
+ joinchars = hv_fetch((HV*)SvRV(parser), "JOIN_CHARACTERS", 15, 0);
+
+ if (joinchars != NULL) {
+ vec->joinchars = (SvIV(*joinchars));
+ } else {
+ vec->joinchars = 0;
+ }
+
+ if (vec->joinchars) {
+ vec->charbuf = CBufferNew();
+ } else {
+ vec->charbuf = NULL;
}
if ( ctxt->sax ) {
@@ -173,6 +332,7 @@
ctxt->sax = PSaxGetHandler();
ctxt->_private = (void*)vec;
+
}
void
@@ -186,6 +346,9 @@
vec->handler = NULL;
}
+ CBufferFree(vec->charbuf);
+ vec->charbuf = NULL;
+
xmlFree( ctxt->sax );
ctxt->sax = NULL;
@@ -716,6 +879,8 @@
xmlParserCtxtPtr ctxt = (xmlParserCtxtPtr)ctx;
PmmSAXVectorPtr sax = (PmmSAXVectorPtr)ctxt->_private;
+ sax->joinchars && PSaxCharactersFlush(ctxt, sax->charbuf);
+
dTHX;
dSP;
@@ -749,6 +914,8 @@
SV * rv;
SV * arv;
+ sax->joinchars && PSaxCharactersFlush(ctxt, sax->charbuf);
+
dSP;
ENTER;
@@ -795,6 +962,8 @@
SV * rv;
HV * element;
+ sax->joinchars && PSaxCharactersFlush(ctxt, sax->charbuf);
+
dSP;
ENTER;
@@ -825,9 +994,10 @@
}
int
-PSaxCharacters(void *ctx, const xmlChar * ch, int len) {
+PSaxCharactersDispatch(void *ctx, const xmlChar * ch, int len) {
xmlParserCtxtPtr ctxt = (xmlParserCtxtPtr)ctx;
PmmSAXVectorPtr sax = (PmmSAXVectorPtr)ctxt->_private;
+
dTHX;
HV* element;
SV * handler;
@@ -863,15 +1033,47 @@
if (SvTRUE(ERRSV)) {
croak_obj;
}
-
+
FREETMPS ;
LEAVE ;
}
CLEAR_SERROR_HANDLER;
+
return 1;
}
+/*
+ * Deliver all buffered character data as one character event and empty the
+ * buffer.
+ *
+ * ctx    - the parser context, passed through to PSaxCharactersDispatch()
+ * buffer - the character buffer to flush
+ *
+ * Returns 1 when there is nothing to deliver, otherwise the return value of
+ * PSaxCharactersDispatch().
+ */
+int PSaxCharactersFlush (void *ctx, struct CBuffer *buffer) {
+    xmlChar *ch;
+    int len;
+    int ret;
+
+    if (buffer == NULL || buffer->head->data == NULL) {
+        return 1;
+    }
+
+    ch = CBufferCharacters(buffer);
+    len = CBufferLength(buffer);
+
+    /* Empty the buffer before dispatching so it is clean for later events. */
+    CBufferPurge(buffer);
+
+    if (ch == NULL) {
+        return 1;
+    }
+
+    ret = PSaxCharactersDispatch(ctx, ch, len);
+
+    /*
+     * CBufferCharacters() returns a freshly allocated string that belongs
+     * to us; without this it leaks on every flush.
+     * NOTE(review): the dispatcher can croak; presumably that leaves this
+     * frame without reaching the xmlFree(), so that path may still leak.
+     */
+    xmlFree(ch);
+
+    return ret;
+}
+
+/*
+ * Character data callback.
+ *
+ * With joining switched off the data is dispatched straight away and the
+ * dispatcher's return value is passed back. With joining switched on the
+ * data is only appended to the context's character buffer and 1 is
+ * returned; it is delivered later by PSaxCharactersFlush().
+ */
+int PSaxCharacters (void *ctx, const xmlChar * ch, int len) {
+    xmlParserCtxtPtr parser = (xmlParserCtxtPtr)ctx;
+    PmmSAXVectorPtr vec = (PmmSAXVectorPtr)parser->_private;
+
+    if (!vec->joinchars) {
+        return PSaxCharactersDispatch(ctx, ch, len);
+    }
+
+    CBufferAppend(vec->charbuf, ch, len);
+    return 1;
+}
+
int
PSaxComment(void *ctx, const xmlChar * ch) {
xmlParserCtxtPtr ctxt = (xmlParserCtxtPtr)ctx;
@@ -885,6 +1087,8 @@
if ( ch != NULL && handler != NULL ) {
int len = xmlStrlen( ch );
+ sax->joinchars && PSaxCharactersFlush(ctxt, sax->charbuf);
+
dSP;
ENTER;
@@ -925,6 +1129,8 @@
if ( ch != NULL && handler != NULL ) {
+ sax->joinchars && PSaxCharactersFlush(ctxt, sax->charbuf);
+
dSP;
ENTER;
@@ -987,7 +1193,9 @@
SV * rv = NULL;
if ( handler != NULL ) {
- dSP;
+ sax->joinchars && PSaxCharactersFlush(ctxt, sax->charbuf);
+
+ dSP;
ENTER;
SAVETMPS;