Am Mi 01. Aug 2007, 05:55:10, MARTINI schrieb:
> Data::Dumper::Dumper is dumping utf8 strings as latin1/8bit instead
> of utf8.
>
> How to reproduce:
> <script1>
> #!/usr/bin/perl -w
> use strict;
> use utf8;
> use Encode;
> use Data::Dumper;
>
> my $I = {A => 'ü'};
> if (Encode::is_utf8($I->{A})) {
> print "It is utf8.\n";
> }
> print Data::Dumper::Dumper($I);
> </script1>
>
> The output is:
>
> me@madrid:~> ./ut.pl
> It is utf8.
> $VAR1 = {
> 'A' => "\x{fc}"
> };
> me@madrid:~>
>
> \x{fc} is the 8bit/latin1 sign. :-( This should be \x{00C3}\x{00BC}.
>
Hi Martin,
I think that Data::Dumper does the right thing. Data::Dumper creates
Perl code, and quotes wide characters in strings with the \x notation.
The code point for 'ü' is U+00FC; encoded in UTF-8 it is the two bytes
0xC3 and 0xBC. The \x notation is encoding-agnostic and uses the
code point. Therefore "\x{fc}" is the same thing as an 'ü'.
Here is some sample code:
#!/usr/bin/perl
# Demonstration: the \x{...} escape names a code point, not a byte sequence.
# The script compares a one-character string 'ü' with the two-character
# string built from the values 0xC3 and 0xBC, and dumps both.
use strict;
use warnings;
use utf8;    # this source file contains the literal character 'ü'

use Encode;
use Data::Dumper;
use DBI;

# Let STDOUT accept wide characters.
binmode STDOUT, ':utf8';

# --- Part 1: a literal 'ü' stored in a hash ---------------------------------
my $sample = { A => 'ü' };
print "It is utf8.\n" if Encode::is_utf8( $sample->{A} );
print Dumper($sample);

# --- Part 2: different spellings of the same single character ---------------
print "an ü is an ü\n"       if "\x{00FC}" eq "\x{FC}";
print "an ü is still an ü\n" if 'ü' eq "\x{FC}";
print "an ü is still an ü\n" if 'ü' eq "\x{00FC}";

# --- Part 3: two characters versus one --------------------------------------
# String of two characters with the ordinals 0xC3 and 0xBC.
my $raw_pair = pack 'W2', 0xc3, 0xbc;
print "'", $raw_pair, "':", DBI::data_string_desc($raw_pair), "\n";

# The same two values interpreted as UTF-8 and decoded.
my $decoded = Encode::decode_utf8($raw_pair);
print "'", $decoded, "':", DBI::data_string_desc($decoded), "\n";

print Dumper( $raw_pair, $decoded ), "\n";

# A two-character string must never compare equal to a one-character string,
# so this message is not expected to appear.
print "now I'm confused\n" if $raw_pair eq $decoded;