I believe this is the same problem as Debian bug #324882
(https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=324882).
I am attaching the patch from the Debian package that should fix the issue, as well as a test case.
Description: testcase for encoding issues in parsing and printing to a file
Author: Martín Ferrari <tincho@debian.org>
--- /dev/null
+++ b/t/encodings.t
@@ -0,0 +1,48 @@
+#!/usr/bin/perl
+use strict;
+use utf8; # for embedded strings
+use XML::DOM;
+use Test::More tests => 16;
+use Test::NoWarnings;
+use constant TMPFILE => "test_encoding.xml";
+
+my $str =
+q(<?xml version="1.0" encoding="UTF-8"?>
+<blah>
+ <foo baz="î¶î·">ã</foo>
+ <bar>ï¾ï¿îµî¶î·î¸î¹îºî»î¼î½î¾</bar>
+</blah>);
+
+# test 1 -- check for correct parsing of input string
+my $parser = new XML::DOM::Parser;
+my $doc = eval { $parser->parse($str); };  # eval traps a parse exception so it shows up as a failed test
+ok(((not $@) && defined $doc), 'loads ok, parses str');
+
+try($doc);  # tests 2-7 -- content checks on the document parsed from the string
+$doc->printToFile(TMPFILE);  # write the document to TMPFILE; the file is validated and re-read below
+$doc->dispose;
+
+ok(system("xmllint", "--noout", TMPFILE) == 0, 'xmllint runs ok');  # test 8 -- needs an external xmllint command
+
+my $doc2 = eval { $parser->parsefile(TMPFILE) };  # parse the file that printToFile just wrote
+ok(((not $@) && defined $doc2), 'parses TMPFILE ok');  # test 9
+
+try($doc2);  # tests 10-15 -- same content checks after the round trip through the file
+$doc2->dispose;
+unlink TMPFILE;  # remove the temporary output file
+
+sub try {  # runs six checks (UTF-8 flag + expected value, for three strings) on the document passed as the only argument
+ my $doc = shift;
+ my $foo = ${$doc->getDocumentElement->getElementsByTagName("foo")}[0];  # first "foo" match; assumes the result can be dereferenced as an array
+ my $bar = ${$doc->getDocumentElement->getElementsByTagName("bar")}[0];  # first "bar" match, same assumption
+ my $baz = $foo->getAttribute("baz");
+ my $footext = $foo->getFirstChild->getData;  # data of the first child node of <foo>
+ my $bartext = $bar->getFirstChild->getData;  # data of the first child node of <bar>
+
+ ok(utf8::is_utf8($baz), 'baz is_utf8...');  # string must carry Perl's internal UTF-8 flag
+ is($baz, "\x{E4B6}\x{E4B7}", '...and correct');  # and must equal the expected code points
+ ok(utf8::is_utf8($footext), 'footext is_utf8...');
+ is($footext, "\xE3\x{F8E8}", '...and correct');
+ ok(utf8::is_utf8($bartext), 'bartext is_utf8');
+ is($bartext, "\x{FB7E}\x{FB7F}\x{E4B5}\x{E4B6}\x{E4B7}\x{E4B8}\x{E4B9}\x{E4BA}\x{E4BB}\x{E4BC}\x{E4BD}\x{E4BE}", 'and correct');
+}
Description: properly encode output for printToFile
closes: #324882
Author: Gregor Herrmann <gregoa@debian.org>
--- a/lib/XML/DOM.pm
+++ b/lib/XML/DOM.pm
@@ -1218,7 +1218,8 @@ sub to_sax
sub printToFile
{
my ($self, $fileName) = @_;
- my $fh = new FileHandle ($fileName, "w") ||
+ my $encoding = $self->getXMLDecl()->getEncoding();
+ my $fh = new FileHandle ($fileName, ">:encoding($encoding)") ||
croak "printToFile - can't open output file $fileName";
$self->print ($fh);