Hello,
HTTP::Message's decoded_content properly removes the content encoding
from XML responses but it now removes the character encoding too. That's
making XML downloads unparseable.
In order to parse an XML document returned by decoded_content, one must
first A) loosely parse the XML to extract the encoding from the <?xml?>
tag, and B) re-encode the XML according to that type.
There's a reason the base type for the XML MIME type is "application",
but HTTP::Message violates it.
$ perl -e'
use strict;
use warnings;
use Test::More tests => 4;
use Encode qw( encode );
use LWP qw( );
use XML::LibXML qw( );
use PerlIO::encoding qw( );
sub check {
my ($file, $test) = @_;
if (!eval {
my $name =
XML::LibXML->new()
->parse_string($file)
->documentElement()
->textContent();
is($name, "\x{C9}ric", $test);
1;
}) {
fail($test)
or diag("died with $@");
}
}
{
my $builder = Test::More->builder;
local $PerlIO::encoding::fallback = Encode::PERLQQ();
binmode $builder->output, ":encoding(US-ASCII)";
binmode $builder->failure_output, ":encoding(US-ASCII)";
binmode $builder->todo_output, ":encoding(US-ASCII)";
}
for my $enc (qw( cp850 UTF-16le )) {
my $file = encode($enc,
($enc =~ /^UTF-/ ? "\x{FEFF}" : "") .
qq{<?xml version="1.0" encoding="$enc"?>\n} .
qq{<root>\x{C9}ric</root>\n}
);
check($file, "$enc direct");
my $headers = HTTP::Headers->new(Content_Type =>
"application/xml");
my $response = HTTP::Response->new(200, "OK", $headers, $file);
check($response->decoded_content(), "$enc from response");
}
'
1..4
ok 1 - cp850 direct
not ok 2 - cp850 from response
# Failed test 'cp850 from response'
# at -e line 20.
# got: '\x{251c}\x{00eb}ric'
# expected: '\x{00c9}ric'
ok 3 - UTF-16le direct
not ok 4 - UTF-16le from response
# Failed test 'UTF-16le from response'
# at -e line 14.
# died with :1: parser error : Blank needed here
#
\x{3e3f}\x{3c0a}\x{6f72}\x{746f}\x{c33e}\x{7289}\x{6369}\x{2f3c}\x{6f72}\x{746f}\x{0a3e}
# ^
# :1: parser error : parsing XML declaration: '?>' expected
#
\x{3e3f}\x{3c0a}\x{6f72}\x{746f}\x{c33e}\x{7289}\x{6369}\x{2f3c}\x{6f72}\x{746f}\x{0a3e}
# ^
# :1: parser error : Start tag expected, '<' not found
#
\x{3e3f}\x{3c0a}\x{6f72}\x{746f}\x{c33e}\x{7289}\x{6369}\x{2f3c}\x{6f72}\x{746f}\x{0a3e}
# ^
# Looks like you failed 2 tests of 4.
The bug was introduced some version after 5.814 but before 5.828.
Probably 5.827 according to CHANGES.
Workaround if you're specifically expecting XML:
my $xml = $reponse->decoded_content($response, charset => 'none');
Generic workaround:
sub _decoded_content {
my ($mess, %opts) = @_;
if (!$opts{charset} || $opts{charset} ne 'none') {
if ($HTTP::Message::VERSION >= 5.527) {
$opts{charset} = 'none' if $mess->content_is_xml();
return $mess->decoded_content(%opts);
}
}
return $mess->decoded_content(%opts);
}
#$response->decoded_content(...); # Buggy
my $file = _decoded_content($response, ...);
Eric "ikegami" Brine