Subject: | Serious performance issue parsing large strings in v0.16 (+ patch to fix) |
Date: | Wed, 12 Sep 2007 11:50:48 +0100 |
To: | bug-XML-SAX [...] rt.cpan.org |
From: | "Gordon Lack" <gml4410 [...] gsk.com> |
I noticed some *very* slow parsing (from the PurePerl code) when it was
given a large (2MB) variable.
The same data from a file was much quicker (30s rather than 3hrs).
I discovered (by strace) that the code was doing lots of memory
allocate/frees, so assumed that the variable was being copied as each
token was cut from the front. By changing the String.pm code so that it
fed in characters from the string 4kB at a time (which is how Stream.pm
works) the problem was resolved.
I also changed the DISCARDED value from 7 to 8 (since
XML::SAX::PurePerl::Reader already defines XML_VERSION as 7) and only
preserved the length of what has been discarded (which is all that is
actually used) rather than its actual contents (which would also
involve large memory allocate/frees for the large-string case).
Patch attached.
--- SAX/PurePerl/Reader/String.pm.orig 2003-07-30 14:39:23.000000000 +0100
+++ SAX/PurePerl/Reader/String.pm 2007-09-11 13:56:13.000000000 +0100
@@ -15,23 +15,57 @@
@ISA = ('XML::SAX::PurePerl::Reader');
-use constant DISCARDED => 7;
+#FIX
+# XML::SAX::PurePerl::Reader already defines XML_VERSION as 7
+# The code change below also needs further slots (STRING and USED)
+#
+# use constant DISCARDED => 7;
+#
+use constant DISCARDED => 8;
+use constant STRING => 9;
+use constant USED => 10;
sub new {
my $class = shift;
my $string = shift;
my @parts;
- @parts[BUFFER, EOF, LINE, COLUMN, DISCARDED] =
- ($string, 0, 1, 0, '');
+#FIX - add some more bits
+# @parts[BUFFER, EOF, LINE, COLUMN, DISCARDED] =
+# ($string, 0, 1, 0, '');
+#
+ @parts[BUFFER, EOF, LINE, COLUMN, DISCARDED, STRING, USED] =
+ ('', 0, 1, 0, 0, $string, 0);
return bless \@parts, $class;
}
-sub read_more () { }
+#FIX
+# We actually drip-feed in the variable
+#
+# sub read_more () { }
+#
+sub read_more () {
+ my $self = shift;
+ if ($self->[USED] >= length($self->[STRING])) {
+ $self->[EOF]++;
+ return 0;
+ }
+ my $amount = 4096;
+ if ($amount > (length($self->[STRING]) - $self->[USED])) {
+ $amount = (length($self->[STRING]) - $self->[USED]);
+ }
+ $self->[BUFFER] .= substr($self->[STRING], $self->[USED], $amount);
+ $self->[USED] += $amount;
+ return 1;
+ }
+
sub move_along {
my $self = shift;
my $discarded = substr($self->[BUFFER], 0, $_[0], '');
- $self->[DISCARDED] .= $discarded;
+#FIX - only interested in the total length of what we have discarded
+# $self->[DISCARDED] .= $discarded;
+#
+ $self->[DISCARDED] += length($discarded);
# Wish I could skip this lot - tells us where we are in the file
my $lines = $discarded =~ tr/\n//;
@@ -55,7 +89,10 @@
sub bytepos {
my $self = shift;
- length($self->[DISCARDED]);
+#FIX - we already have just the length
+# length($self->[DISCARDED]);
+#
+ $self->[DISCARDED];
}
1;