Subject: | several escaping issues (with patch against 0.602) |
Date: | Fri, 16 Mar 2007 11:53:32 -0400 |
To: | bug-XML-Writer [...] rt.cpan.org |
From: | Chapman Flack <jflack [...] math.purdue.edu> |
This patch addresses several escaping issues:
Attribute value literals:
Whitespace:
Rev. 0.530 added a line to escape newlines in attribute values.
This is necessary but not sufficient, because Attribute-Value
Normalization (sec. 3.3.3) specifies that all three of the
characters #xD, #xA, and #x9 will be replaced with #x20 when
the output is read by a conforming parser, so to avoid data
corruption all three must be escaped.
Interoperability:
Attribute-value escaping did not replace ]]> with ]]&gt;
as MUST be done for interoperability (sec 2.4, Character Data
and Markup). This was also an issue for general escaping
done by $characters below.
Quoting:
While making these changes, it was easy enough to also
choose "" or '' quoting per attribute based on which will
require less escaping.
General $characters:
Interoperability:
Character-data escaping did not replace ]]> with ]]&gt;
as MUST be done for interoperability (sec 2.4, Character Data
and Markup). This was also an issue for attribute-value escaping
above.
General efficiency:
Some logic was duplicated between $characters, $showAttributes,
and _escapeLiteral. $escapeEncoding was done as a separate step
unnecessarily.
The single routine $xmlEscape is now common to all uses. Since the
rules for how to escape a character never change, but which
characters need escaping depends on the circumstances, $xmlEscape is
simply passed a compiled regexp (different per call site) that tells
it which characters to escape. In the case of ASCII encoding, the
nonASCII characters are simply compiled into this regexp at the time
of object construction. There is no setEncoding() mutator to change
the encoding after construction, so this is safe to do. All
necessary escaping is now done in a single pass.
Chapman Flack
Mathematics, Purdue University
--- Writer.pm.orig Thu Feb 22 10:00:02 2007
+++ Writer.pm Fri Mar 2 15:30:19 2007
@@ -17,7 +17,6 @@
use IO::Handle;
$VERSION = "0.602";
-
########################################################################
# Constructor.
@@ -62,11 +61,10 @@
my ($checkUnencodedRepertoire, $escapeEncoding);
if (lc($outputEncoding) eq 'us-ascii') {
$checkUnencodedRepertoire = \&_croakUnlessASCII;
- $escapeEncoding = \&_escapeASCII;
+ $escapeEncoding = qr/[^\x00-\x7f]/;
} else {
- my $doNothing = sub {};
- $checkUnencodedRepertoire = $doNothing;
- $escapeEncoding = $doNothing;
+ $checkUnencodedRepertoire = sub {};
+ $escapeEncoding = qr/(?=a)b/; # just an re that can't ever match
}
# Parse variables
@@ -80,6 +78,23 @@
my @hasElementStack = ();
my $hasHeading = 0; # Does this document have anything before the first element?
+ my $xmlEscape = sub {
+ my ( $str, $patToEscape ) = @_;
+ $str =~ s/$patToEscape/_escapeFor($&)/eg;
+ return $str;
+ };
+
+ my $attPatDQ = qr{[\t\n\r&<"]|]]>|$escapeEncoding};
+ my $attPatSQ = qr{[\t\n\r&<']|]]>|$escapeEncoding};
+ my $charPat = qr{[&<]|]]>|$escapeEncoding};
+
+ my $escapeAttribute = sub {
+ my ( $str ) = @_;
+ return ( ( $str =~ tr/"// ) > ( $str =~ tr/'// ) )
+ ? "'" . &{$xmlEscape}($str,$attPatSQ) . "'"
+ : '"' . &{$xmlEscape}($str,$attPatDQ) . '"';
+ };
+
#
# Private method to show attributes.
#
@@ -88,10 +103,8 @@
my $i = 1;
while ($atts->[$i]) {
my $aname = $atts->[$i++];
- my $value = _escapeLiteral($atts->[$i++]);
- $value =~ s/\x0a/\&#10\;/g;
- &{$escapeEncoding}($value);
- $output->print(" $aname=\"$value\"");
+ my $value = &{$escapeAttribute}($atts->[$i++]);
+ $output->print(" $aname=$value"); # $value includes the needed quotes
}
};
@@ -340,14 +353,7 @@
};
my $characters = sub {
- my $data = $_[0];
- if ($data =~ /[\&\<\>]/) {
- $data =~ s/\&/\&amp\;/g;
- $data =~ s/\</\&lt\;/g;
- $data =~ s/\>/\&gt\;/g;
- }
- &{$escapeEncoding}($data);
- $output->print($data);
+ $output->print(&{$xmlEscape}($_[0],$charPat));
$hasData = 1;
};
@@ -738,24 +744,6 @@
}
}
-#
-# Private: escape an attribute value literal.
-#
-sub _escapeLiteral {
- my $data = $_[0];
- if ($data =~ /[\&\<\>\"]/) {
- $data =~ s/\&/\&amp\;/g;
- $data =~ s/\</\&lt\;/g;
- $data =~ s/\>/\&gt\;/g;
- $data =~ s/\"/\&quot\;/g;
- }
- return $data;
-}
-
-sub _escapeASCII($) {
- $_[0] =~ s/([^\x00-\x7F])/sprintf('&#x%X;', ord($1))/ge;
-}
-
sub _croakUnlessASCII($) {
if ($_[0] =~ /[^\x00-\x7F]/) {
croak('Non-ASCII characters are not permitted in this part of a US-ASCII document');
@@ -770,6 +758,21 @@
}
}
+my %specialEscapes = ( # an re determines which of these to replace in any case
+ '<', '&lt;',
+ '>', '&gt;', # MAY be represented using the string &gt; ...
+ '&', '&amp;',
+ "'", '&apos;',
+ '"', '&quot;',
+ ']]>', ']]&gt;' # MUST, for compatibility, be escaped in the string ]]>
+); # any other char will be &#x'd if escaping needed
+
+sub _escapeFor($) {
+ return exists($specialEscapes{$_[0]})
+ ? $specialEscapes{$_[0]}
+ : sprintf '&#x%x;', ord($_[0]);
+}
+
########################################################################
# XML::Writer::Namespaces - subclass for Namespace processing.