Subject: | Encoding problems in POST requests |
While interacting with a page whose form contained UTF-8 encoded characters outside ISO-8859-1,
I ran into an error in which HTTP::Message complained about not receiving bytes in
content(). In this particular case, the problem stems from the fact that POST allows us to pass
values that are character strings, not octet strings; this is the correct behaviour IMHO, but
POST should Do The Right Thing with the stuff it receives.
The patch I'm proposing addresses this problem, taking the charset from the content-type
and defaulting to ISO-8859-1 like the standard says. In this way, any string for which is_utf8
returns true is transformed to a sequence of bytes according to the charset, which makes
HTTP::Message happy.
The patch does not break the test suite. I'm also adding another test for this added feature.
Hope this helps,
Flavio.
Subject: | libwww-perl-5.826.patch |
diff -ru libwww-perl-5.826/lib/HTTP/Request/Common.pm patched/libwww-perl-5.826/lib/HTTP/Request/Common.pm
--- libwww-perl-5.826/lib/HTTP/Request/Common.pm 2009-02-13 14:57:52.000000000 +0100
+++ patched/libwww-perl-5.826/lib/HTTP/Request/Common.pm 2009-06-01 18:36:22.000000000 +0200
@@ -118,6 +118,18 @@
$req;
}
+sub _get_charset {
+ my ($req) = @_;
+ # Returns the "charset" parameter of $req's Content-Type header, if one is present.
+ # Taken from HTTP::Message
+ require HTTP::Headers::Util;
+ if (my @ct = HTTP::Headers::Util::split_header_words($req->header("Content-Type"))) {
+ my ($ct, undef, %ct_param) = @{$ct[-1]}; # last parsed entry: first two items skipped, rest read as parameters
+ return $ct_param{charset};
+ }
+ # Nothing was parsed from the header, so there is no charset to report.
+ return;
+}
sub form_data # RFC1867
{
@@ -125,10 +137,17 @@
my @data = ref($data) eq "HASH" ? %$data : @$data; # copy
my $fhparts;
my @parts;
+
+ my $charset = lc(_get_charset($req) || 'ISO-8859-1');
+
my($k,$v);
while (($k,$v) = splice(@data, 0, 2)) {
if (!ref($v)) {
$k =~ s/([\\\"])/\\$1/g; # escape quotes and backslashes
+ if (utf8::is_utf8($v)) {
+ require Encode;
+ $v = Encode::encode($charset, $v, Encode::FB_CROAK());
+ }
push(@parts,
qq(Content-Disposition: form-data; name="$k"$CRLF$CRLF$v));
}
Subject: | form-charset.t |
#!perl -w
# Checks that submitting an HTML::Form whose values are character strings
# honours the charset parameter given in the form's enctype:
#   * with charset=utf-8 every character can be encoded, so click() succeeds;
#   * with charset=ISO-8859-1 the value contains a character that cannot be
#     represented, so click() is expected to die.
use strict;
use utf8;    # the HTML fixtures below contain literal non-ASCII characters
use Test qw(plan ok);
plan tests => 4;
use HTML::Form;
use Encode qw( is_utf8 );

# The trade mark sign (U+2122) in the value lies outside ISO-8859-1, which is
# what makes the two forms below behave differently.
my $form = HTML::Form->parse(<<"EOT", base => "http://example.com", strict => 1);
<form method="POST" enctype="multipart/form-data; charset=utf-8">
<input type="hidden" name="encoded" value="ä™®">
</form>
EOT
# We expect the form's parameter to be a string of characters
ok(is_utf8($form->value('encoded')));
# This should survive in this case, because UTF-8 encoding applies
my $request = eval { $form->click() };
ok($request);

# This form has wrong settings: the declared charset cannot represent the value
$form = HTML::Form->parse(<<"EOT", base => "http://example.com", strict => 1);
<form method="POST" enctype="multipart/form-data; charset=ISO-8859-1">
<input type="hidden" name="encoded" value="ä™®">
</form>
EOT
# We expect the form's parameter to be a string of characters
ok(is_utf8($form->value('encoded')));
# This should die in this case, because ISO-8859-1 encoding DOES NOT APPLY
$request = eval { $form->click() };
ok(!$request);