2009-04-06 Stepan Kasal <skasal@redhat.com>
* t/util-58.t: Add tests reflecting common usage.
* CGI/Util.pm (escape): State what conversions are needed, in
accordance with the common usage mentioned above; and code it.
diff -ur CGI.pm-3.42/CGI/Util.pm CGI.pm-3.42/CGI/Util.pm
--- CGI.pm-3.42/CGI/Util.pm 2008-09-08 15:58:52.000000000 +0200
+++ CGI.pm-3.42/CGI/Util.pm 2009-04-04 16:30:29.000000000 +0200
@@ -210,7 +210,6 @@
my $todecode = shift;
return undef unless defined($todecode);
$todecode =~ tr/+/ /; # pluses become spaces
- $EBCDIC = "\t" ne "\011";
if ($EBCDIC) {
$todecode =~ s/%([0-9a-fA-F]{2})/chr $A2E[hex($1)]/ge;
} else {
@@ -232,16 +231,24 @@
}
# URL-encode data
+#
+# We cannot use the %u escapes, they were rejected by W3C, so the official
+# way is %XX-escaped utf-8 encoding.
+# Naturally, Unicode strings have to be converted to their utf-8 byte
+# representation. (No action is required on 5.6.)
+# Byte strings were traditionally used directly as a sequence of octets.
+# This worked if they actually represented binary data (e.g. in CGI::Compress).
+# This also worked if these byte strings were actually utf-8 encoded; e.g.,
+# when the source file used utf-8 without the appropriate "use utf8;".
+# This fails if the byte string is actually a Latin 1 encoded string, but it
+# was always so and cannot be fixed without breaking the binary data case.
+# -- Stepan Kasal <skasal@redhat.com>
+#
sub escape {
shift() if @_ > 1 and ( ref($_[0]) || (defined $_[1] && $_[0] eq $CGI::DefaultClass));
my $toencode = shift;
return undef unless defined($toencode);
- $toencode = eval { pack("C*", unpack("U0C*", $toencode))} || pack("C*", unpack("C*", $toencode));
-
- # force bytes while preserving backward compatibility -- dankogai
- # but commented out because it was breaking CGI::Compress -- lstein
- # $toencode = eval { pack("U*", unpack("U0C*", $toencode))} || pack("C*", unpack("C*", $toencode));
-
+ utf8::encode($toencode) if ($] > 5.007 && utf8::is_utf8($toencode));
if ($EBCDIC) {
$toencode=~s/([^a-zA-Z0-9_.~-])/uc sprintf("%%%02x",$E2A[ord($1)])/eg;
} else {
diff -ur CGI.pm-3.42/t/util-58.t CGI.pm-3.42/t/util-58.t
--- CGI.pm-3.42/t/util-58.t 2003-04-14 20:32:22.000000000 +0200
+++ CGI.pm-3.42/t/util-58.t 2009-04-06 16:49:42.000000000 +0200
@@ -1,16 +1,29 @@
+# test CGI::Util::escape
+use Test::More tests => 4;
+use_ok("CGI::Util");
+
+# Byte strings should be escaped byte by byte:
+# 1) not a valid utf-8 sequence:
+my $uri = "pe\x{f8}\x{ed}\x{e8}ko.ogg";
+is(CGI::Util::escape($uri), "pe%F8%ED%E8ko.ogg", "Escape a Latin-2 string");
+
+# 2) is a valid utf-8 sequence, but not a UTF-8-flagged string
+# This happens often: people write utf-8 strings to source, but forget
+# to tell perl about it by "use utf8;"--this is obviously wrong, but we
+# have to handle it gracefully, for compatibility with CGI.pm under
+# perl-5.8.x
#
-# This tests CGI::Util::escape() when fed with UTF-8-flagged string
-# -- dankogai
-BEGIN {
- if ($] < 5.008) {
- print "1..0 # \$] == $] < 5.008\n";
- exit(0);
- }
-}
+$uri = "pe\x{c5}\x{99}\x{c3}\x{ad}\x{c4}\x{8d}ko.ogg";
+is(CGI::Util::escape($uri), "pe%C5%99%C3%AD%C4%8Dko.ogg",
+ "Escape an utf-8 byte string");
-use Test::More tests => 2;
-use_ok("CGI::Util");
-my $uri = "\x{5c0f}\x{98fc} \x{5f3e}.txt"; # KOGAI, Dan, in Kanji
-is(CGI::Util::escape($uri), "%E5%B0%8F%E9%A3%BC%20%E5%BC%BE.txt",
- "# Escape string with UTF-8 flag");
+SKIP:
+{
+ # This tests CGI::Util::escape() when fed with UTF-8-flagged string
+ # -- dankogai
+ skip("Unicode strings not available in $]", 1) if ($] < 5.008);
+ $uri = "\x{5c0f}\x{98fc} \x{5f3e}.txt"; # KOGAI, Dan, in Kanji
+ is(CGI::Util::escape($uri), "%E5%B0%8F%E9%A3%BC%20%E5%BC%BE.txt",
+ "Escape string with UTF-8 flag");
+}
__END__