Subject: | decode_utf8 doesn't do the same as decode("utf8") |
decode_utf8 does not behave the same as decode("utf8", ...) for all inputs, despite the documentation explicitly stating:
$string = decode_utf8($octets [, CHECK]);
Equivalent to "$string = decode("utf8", $octets [, CHECK])".
The two act differently when $octets has the UTF-8 flag turned on: decode("utf8", ...) treats each character in the string as a byte, whereas decode_utf8 simply returns the string unaltered.
Failing test suite attached.
Subject: | decode_utf_bug.t |
#!/usr/bin/env perl
use strict;
use warnings;
use Encode;
use Test::More tests => 4;
# decode_utf8(...) and decode('utf8',...) are MEANT TO BE THE SAME
# from the perldoc for Encode:
#
# $string = decode_utf8($octets [, CHECK]);
# Equivalent to "$string = decode("utf8", $octets [, CHECK])".
#######
# decode_utf8($bytes)
#######
{
    # Baseline case: a plain string (no UTF-8 flag) whose last three
    # octets are the UTF-8 encoding of U+E000, decoded via decode_utf8().
    my $octets   = "test:\x{ee}\x{80}\x{80}";
    my $expected = "test:\x{e000}";
    my $decoded  = Encode::decode_utf8($octets);
    is($decoded, $expected, "decode_utf8 without utf-8 flag");
}
{
    my $octets = "test:\x{ee}\x{80}\x{80}";

    # Append a wide character and chop it straight off again: the
    # contents end up unchanged, but the string now has the UTF-8 flag on.
    $octets .= "\x{2603}";
    chop $octets;

    my $expected = "test:\x{e000}";
    my $decoded  = Encode::decode_utf8($octets);
    is($decoded, $expected, "decode_utf8 with utf-8 flag");
}
#######
# decode("utf8",$bytes)
#######
{
    # Reference case for the comparison: decode() called with the encoding
    # name "utf8", which is the exact call the Encode documentation gives
    # as the equivalent of decode_utf8().  The name "utf-8" is deliberately
    # not used here: Encode resolves it to the separate strict encoding
    # ("utf-8-strict"), which is not the one this test is described as
    # exercising.
    my $bytes = "test:\x{ee}\x{80}\x{80}";
    my $chars = Encode::decode("utf8", $bytes);
    is($chars, "test:\x{e000}", "decode('utf8',...) without utf-8 flag");
}
{
    my $bytes = "test:\x{ee}\x{80}\x{80}";
    # do something that makes the utf-8 flag turn on without
    # altering the contents of the string
    $bytes .= "\x{2603}";
    chop $bytes;
    # Use the encoding name "utf8", matching both the test description and
    # the call the Encode documentation equates with decode_utf8().  The
    # name "utf-8" would select the separate strict encoding
    # ("utf-8-strict") instead.
    my $chars = Encode::decode("utf8", $bytes);
    is($chars, "test:\x{e000}", "decode('utf8',...) with utf-8 flag");
}