Bug #18556 for Encode: UTF-16, UTF-32, UCS, UTF-7 decoders mishandle illegal characters

Subject:

UTF-16, UTF-32, UCS, UTF-7 decoders mishandle illegal characters

The UTF-16/32 decoder has the following problems with respect to illegal input:

* Similar to bug 18105, it fails to detect/replace noncharacters
* Unpaired surrogates incorrectly croak when the decoder is in non-check mode
* The UTF-32 decoder fails to detect/replace surrogates

Attached is a proposed fix. A sample test program follows:

    use Encode;
    use strict;
    use warnings;

    sub trydecode {
        my ($utf16) = (@_);
        print $utf16."\n";
        my $text;
        eval {
            $text = Encode::decode('UTF-16BE', $utf16, 0);
            printf "%x\n", ord(substr($text, 3, 1));
        };
        if ($@) {
            warn $@;
        }
        $text =~ /\b(?:https?|ftp)/o;
    }

    trydecode("\000a\000a\000a\xff\xfe\000b\000b\000b"); #fffe
    trydecode("\000a\000a\000a\xff\xff\000b\000b\000b"); #ffff
    trydecode("\000a\000a\000a\xfd\xd0\000b\000b\000b"); #fdd0
    trydecode("\000a\000a\000a\xd8\x3f\xdf\xfe\000b\000b\000b"); #1fffe
    trydecode("\000a\000a\000a\xd8\x3f\xdf\xff\000b\000b\000b"); #1ffff
    trydecode("\000a\000a\000a\xd8\x7f\xdf\xfe\000b\000b\000b"); #2fffe
    trydecode("\000a\000a\000a\xd8\x7f\xdf\xff\000b\000b\000b"); #2ffff
    trydecode("\000a\000a\000a\xdb\xff\xdf\xfe\000b\000b\000b"); #10fffe
    trydecode("\000a\000a\000a\xdb\xff\xdf\xff\000b\000b\000b"); #10ffff
    trydecode("\000a\000a\000a\xd8\x00\000b\000b\000b"); #d800
    trydecode("\000a\000a\000a\xdc\x00\000b\000b\000b"); #dc00

Subject:

Encode-2.12-Utf16nonchar.diff

diff -ur Encode-2.12-1utf8nonchar/Unicode/Unicode.xs Encode-2.12-2utf7nonchar/Unicode/Unicode.xs --- Encode-2.12-1utf8nonchar/Unicode/Unicode.xs 2006-03-13 11:18:58.000000000 -0800 +++ Encode-2.12-2utf7nonchar/Unicode/Unicode.xs 2006-04-05 13:55:35.000000000 -0700 @@ -132,8 +132,8 @@ while (s < e && s+size <= e) { UV ord = enc_unpack(aTHX_ &s,e,size,endian); U8 *d; - if (size != 4 && invalid_ucs2(ord)) { - if (ucs2) { + if (issurrogate(ord)) { + if (ucs2 || size == 4) { if (check) { croak("%"SVf":no surrogates allowed %"UVxf, *hv_fetch((HV *)SvRV(obj),"Name",4,0), @@ -148,24 +148,49 @@ else { UV lo; if (!isHiSurrogate(ord)) { - croak("%"SVf":Malformed HI surrogate %"UVxf, - *hv_fetch((HV *)SvRV(obj),"Name",4,0), - ord); - } - if (s+size > e) { - /* Partial character */ - s -= size; /* back up to 1st half */ - break; /* And exit loop */ + if (check) { + croak("%"SVf":Malformed HI surrogate %"UVxf, + *hv_fetch((HV *)SvRV(obj),"Name",4,0), + ord); + } + else { + ord = FBCHAR; + } } - lo = enc_unpack(aTHX_ &s,e,size,endian); - if (!isLoSurrogate(lo)){ - croak("%"SVf":Malformed LO surrogate %"UVxf, - *hv_fetch((HV *)SvRV(obj),"Name",4,0), - ord); + else { + if (s+size > e) { + /* Partial character */ + s -= size; /* back up to 1st half */ + break; /* And exit loop */ + } + lo = enc_unpack(aTHX_ &s,e,size,endian); + if (!isLoSurrogate(lo)){ + if (check) { + croak("%"SVf":Malformed LO surrogate %"UVxf, + *hv_fetch((HV *)SvRV(obj),"Name",4,0), + ord); + } + else { + ord = FBCHAR; + } + } + else { + ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00); + } } - ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00); } } + + if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0 && ord <= 0xFDEF)) { + if (check) { + croak("%"SVf":Unicode character %"UVxf" is illegal", + *hv_fetch((HV *)SvRV(obj),"Name",4,0), + ord); + } else { + ord = FBCHAR; + } + } + d = (U8 *) SvGROW(result,SvCUR(result)+UTF8_MAXLEN+1); d = uvuni_to_utf8_flags(d+SvCUR(result), ord, 0); SvCUR_set(result,d - (U8 
*)SvPVX(result));