Subject: | output_string() doesn't set the UTF-8 flag |
output_string() doesn't set the UTF-8 flag when it returns UTF-8 output. This means the result gets treated as bytes, not characters. So, if you're not careful, you end up with double-encoded Unicode in your output stream.
The workaround is simple - run Encode::decode_utf8() over the result. But this shouldn't be necessary.
The attached patch adds a test for this case and also provides a fix. When the output encoding is marked as UTF-8 we set the UTF-8 flag so that the results are treated as characters.
diff -ruN XML-LibXSLT-1.57.orig/LibXSLT.xs XML-LibXSLT-1.57/LibXSLT.xs
--- XML-LibXSLT-1.57.orig/LibXSLT.xs 2004-03-01 18:42:43.000000000 +0000
+++ XML-LibXSLT-1.57/LibXSLT.xs 2004-12-14 13:36:42.000000000 +0000
@@ -923,6 +923,8 @@
croak("output to scalar failed");
}
xmlOutputBufferClose(output);
+ if (xmlStrEqual(encoding, (const xmlChar *) "UTF-8"))
+ SvUTF8_on( results );
RETVAL = results;
OUTPUT:
RETVAL
diff -ruN XML-LibXSLT-1.57.orig/t/11utf8.t XML-LibXSLT-1.57/t/11utf8.t
--- XML-LibXSLT-1.57.orig/t/11utf8.t 1970-01-01 01:00:00.000000000 +0100
+++ XML-LibXSLT-1.57/t/11utf8.t 2004-12-14 13:37:11.000000000 +0000
@@ -0,0 +1,41 @@
+use strict; # -*- perl -*-
+use Test;
+BEGIN { plan tests => 7; }   # one planned test per ok() call below
+
+use XML::LibXSLT;
+use XML::LibXML;
+
+my $parser = XML::LibXML->new();
+ok( $parser );
+
+# U+0100 == LATIN CAPITAL LETTER A WITH MACRON; the unquoted heredoc below interpolates \x{0100} into that character.
+my $doc = $parser->parse_string(<<XML);
+<unicode>\x{0100}dam</unicode>
+XML
+ok( $doc );
+
+my $xslt = XML::LibXSLT->new();   # the stylesheet parsed below declares text output with encoding="UTF-8"
+my $style_doc = $parser->parse_string(<<XSLT);
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:output method="text" encoding="UTF-8"/>
+ <xsl:template match="/unicode">
+ <xsl:value-of select="."/>
+ </xsl:template>
+</xsl:stylesheet>
+XSLT
+ok( $style_doc );
+
+my $stylesheet = $xslt->parse_stylesheet($style_doc);
+ok( $stylesheet );
+
+my $results = $stylesheet->transform($doc);   # transform result; serialised by output_string() below
+ok( $results );
+
+my $output = $stylesheet->output_string( $results );
+ok( $output );
+
+# The stylesheet declares UTF-8 output, so output_string() should hand back characters, not raw bytes:
+# the comparison with the character literal \x{0100} only succeeds if the result carries the UTF-8 flag.
+ok( $output eq "\x{0100}dam" )
+ or warn "# output is [[$output]]\n";