Subject: | UTF-16, UTF-32, UCS, UTF-7 decoders mishandle illegal characters |
The UTF-16/32 decoder has the following problems with respect to illegal
input:
* Similar to bug 18105, it fails to detect/replace noncharacters
* The decoder incorrectly croaks on unpaired surrogates even when it is in
non-check mode, instead of substituting a replacement character
* The UTF-32 decoder fails to detect/replace surrogates
Attached is a proposed fix. A sample test program follows:
use Encode;
use strict;
use warnings;
# trydecode($utf16)
#
# Decodes $utf16 (a byte string expected to hold UTF-16BE data) with
# Encode::decode in non-check mode (CHECK = 0) and reports what happened:
#
#   * prints the raw input bytes followed by a newline,
#   * prints, in hex, the code point found at character index 3 of the
#     decoded text (the position of the problematic character in the sample
#     inputs used by this script),
#   * runs a regex match against the decoded text so that any trouble the
#     regex engine has with the decoded string shows up.
#
# If decoding (or inspecting the decoded text) dies, the error is emitted with
# warn and the sub returns immediately; the regex probe is skipped because
# there is no decoded text to match against.
#
# The return value is not meaningful to callers.
sub trydecode {
    my ($utf16) = @_;

    print $utf16 . "\n";

    my $text;
    eval {
        $text = Encode::decode('UTF-16BE', $utf16, 0);
        printf "%x\n", ord(substr($text, 3, 1));
        1;    # make success distinguishable from a die without relying on $@
    } or do {
        warn $@;

        # $text may still be undef here; matching against it would only add
        # an unrelated "uninitialized value" warning on top of the real error.
        return;
    };

    # Deliberately outside the eval: a fatal error raised by the regex engine
    # on the decoded string should not be swallowed.
    $text =~ /\b(?:https?|ftp)/;
}
# Every sample is UTF-16BE: "aaa", then one problematic code point, then
# "bbb".  The trailing comment names the code point (or lone surrogate code
# unit) that the middle bytes encode.
for my $sample (
    "\000a\000a\000a\xff\xfe\000b\000b\000b",            # fffe   (noncharacter)
    "\000a\000a\000a\xff\xff\000b\000b\000b",            # ffff   (noncharacter)
    "\000a\000a\000a\xfd\xd0\000b\000b\000b",            # fdd0   (noncharacter)
    "\000a\000a\000a\xd8\x3f\xdf\xfe\000b\000b\000b",    # 1fffe  (surrogate pair d83f dffe)
    "\000a\000a\000a\xd8\x3f\xdf\xff\000b\000b\000b",    # 1ffff  (surrogate pair d83f dfff)
    "\000a\000a\000a\xd8\x7f\xdf\xfe\000b\000b\000b",    # 2fffe  (surrogate pair d87f dffe)
    "\000a\000a\000a\xd8\x7f\xdf\xff\000b\000b\000b",    # 2ffff  (surrogate pair d87f dfff)
    "\000a\000a\000a\xdb\xff\xdf\xfe\000b\000b\000b",    # 10fffe (surrogate pair dbff dffe)
    "\000a\000a\000a\xdb\xff\xdf\xff\000b\000b\000b",    # 10ffff (surrogate pair dbff dfff)
    "\000a\000a\000a\xd8\x00\000b\000b\000b",            # d800   (unpaired high surrogate)
    "\000a\000a\000a\xdc\x00\000b\000b\000b",            # dc00   (unpaired low surrogate)
) {
    trydecode($sample);
}
Subject: | Encode-2.12-Utf16nonchar.diff |
diff -ur Encode-2.12-1utf8nonchar/Unicode/Unicode.xs Encode-2.12-2utf7nonchar/Unicode/Unicode.xs
--- Encode-2.12-1utf8nonchar/Unicode/Unicode.xs 2006-03-13 11:18:58.000000000 -0800
+++ Encode-2.12-2utf7nonchar/Unicode/Unicode.xs 2006-04-05 13:55:35.000000000 -0700
@@ -132,8 +132,8 @@
while (s < e && s+size <= e) {
UV ord = enc_unpack(aTHX_ &s,e,size,endian);
U8 *d;
- if (size != 4 && invalid_ucs2(ord)) {
- if (ucs2) {
+ if (issurrogate(ord)) {
+ if (ucs2 || size == 4) {
if (check) {
croak("%"SVf":no surrogates allowed %"UVxf,
*hv_fetch((HV *)SvRV(obj),"Name",4,0),
@@ -148,24 +148,49 @@
else {
UV lo;
if (!isHiSurrogate(ord)) {
- croak("%"SVf":Malformed HI surrogate %"UVxf,
- *hv_fetch((HV *)SvRV(obj),"Name",4,0),
- ord);
- }
- if (s+size > e) {
- /* Partial character */
- s -= size; /* back up to 1st half */
- break; /* And exit loop */
+ if (check) {
+ croak("%"SVf":Malformed HI surrogate %"UVxf,
+ *hv_fetch((HV *)SvRV(obj),"Name",4,0),
+ ord);
+ }
+ else {
+ ord = FBCHAR;
+ }
}
- lo = enc_unpack(aTHX_ &s,e,size,endian);
- if (!isLoSurrogate(lo)){
- croak("%"SVf":Malformed LO surrogate %"UVxf,
- *hv_fetch((HV *)SvRV(obj),"Name",4,0),
- ord);
+ else {
+ if (s+size > e) {
+ /* Partial character */
+ s -= size; /* back up to 1st half */
+ break; /* And exit loop */
+ }
+ lo = enc_unpack(aTHX_ &s,e,size,endian);
+ if (!isLoSurrogate(lo)){
+ if (check) {
+ croak("%"SVf":Malformed LO surrogate %"UVxf,
+ *hv_fetch((HV *)SvRV(obj),"Name",4,0),
+ ord);
+ }
+ else {
+ ord = FBCHAR;
+ }
+ }
+ else {
+ ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00);
+ }
}
- ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00);
}
}
+
+ if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0 && ord <= 0xFDEF)) {
+ if (check) {
+ croak("%"SVf":Unicode character %"UVxf" is illegal",
+ *hv_fetch((HV *)SvRV(obj),"Name",4,0),
+ ord);
+ } else {
+ ord = FBCHAR;
+ }
+ }
+
d = (U8 *) SvGROW(result,SvCUR(result)+UTF8_MAXLEN+1);
d = uvuni_to_utf8_flags(d+SvCUR(result), ord, 0);
SvCUR_set(result,d - (U8 *)SvPVX(result));