Subject: | Bug in character-entity parsing in PurePerl.pm, w/patch |
Character entity decoding (&#<digits>; and &#x<hexdigits>;) is broken if
the entity crosses a reader block boundary. The attached patch fixes the
problem and is reasonably clean. I don't really like the moved_already
part of the diff, but it was a simple and obvious way to get correct
parsing, and I needed to fix this in production.
To review the diff more easily, I suggest applying it and then running
diff -ub between the .orig and the patched file, since most of the changed
lines are merely re-indented.
I have only checked version 0.14, so I don't know if the bug exists in
other versions.
Eivind.
Subject: | PurePerl.pm.patch |
--- PurePerl.pm.orig Tue May 23 14:52:45 2006
+++ PurePerl.pm Tue May 23 17:44:18 2006
@@ -375,23 +375,38 @@
my $data = $reader->data;
- if ($data =~ /^#x([0-9a-fA-F]+);/) {
- my $ref = $1;
- $reader->move_along(length($ref) + 3);
- my $char = chr_ref(hex($ref));
- $self->parser_error("Character reference &#$ref; refers to an illegal XML character ($char)", $reader)
- unless $char =~ /$SingleChar/o;
- $self->characters({ Data => $char });
- return 1;
- }
- elsif ($data =~ /^#([0-9]+);/) {
- my $ref = $1;
- $reader->move_along(length($ref) + 2);
- my $char = chr_ref($ref);
- $self->parser_error("Character reference &#$ref; refers to an illegal XML character ($char)", $reader)
- unless $char =~ /$SingleChar/o;
- $self->characters({ Data => $char });
- return 1;
+ if ($data =~ /^#/) {
+ my $moved_already = 0;
+ for (;;) {
+ if ($data =~ /^#x([0-9a-fA-F]+);/) {
+ my $ref = $1;
+ $reader->move_along(length($ref) + 3 - $moved_already);
+ my $char = chr_ref(hex($ref));
+ $self->parser_error("Character reference &#$ref; refers to an illegal XML character ($char)", $reader)
+ unless $char =~ /$SingleChar/o;
+ $self->characters({ Data => $char });
+ return 1;
+ }
+ elsif ($data =~ /^#([0-9]+);/) {
+ my $ref = $1;
+ $reader->move_along(length($ref) + 2 - $moved_already);
+ my $char = chr_ref($ref);
+ $self->parser_error("Character reference &#$ref; refers to an illegal XML character ($char)", $reader)
+ unless $char =~ /$SingleChar/o;
+ $self->characters({ Data => $char });
+ return 1;
+ }
+ elsif ($data =~ /^#([0-9]*|x[0-9a-fA-F]*)$/) {
+ $reader->move_along(length($data) - $moved_already);
+ $moved_already += length($data);
+ my $more_data = $reader->data;
+ $self->parser_error("Invalid name in entity: Out of data after \"$data\"", $reader) unless length($more_data);
+ $data .= $more_data;
+ }
+ else {
+ $self->parser_error("Invalid data in #-entity: \"" . substr($data, 0, 20) . "\"", $reader);
+ }
+ }
}
else {
# EntityRef