For now I've applied this modification of your patch.
Index: util.c
===================================================================
RCS file: /cvsroot/libwww-perl/html-parser/util.c,v
retrieving revision 2.18
retrieving revision 2.19
diff -u -p -u -r2.18 -r2.19
--- util.c 14 Sep 2004 13:47:16 -0000 2.18
+++ util.c 8 Nov 2004 12:54:57 -0000 2.19
@@ -1,4 +1,4 @@
-/* $Id: util.c,v 2.18 2004/09/14 13:47:16 gisle Exp $
+/* $Id: util.c,v 2.19 2004/11/08 12:54:57 gisle Exp $
*
* Copyright 1999-2001, Gisle Aas.
*
@@ -76,6 +76,7 @@ decode_entities(pTHX_ SV* sv, HV* entity
#ifdef UNICODE_ENTITIES
char buf[UTF8_MAXLEN];
int repl_utf8;
+ int high_surrogate = 0;
#else
char buf[1];
#endif
@@ -138,7 +139,30 @@ decode_entities(pTHX_ SV* sv, HV* entity
repl_utf8 = 0;
}
else {
- char *tmp = uvuni_to_utf8(buf, num);
+ char *tmp;
+ if ((num & 0xFFFFFC00) == 0xDC00) { /* low-surrogate */
+ if (high_surrogate != 0) {
+ t -= 3; /* Back up past 0xFFFD */
+ num = ((high_surrogate - 0xD800) << 10) +
+ (num - 0xDC00) + 0x10000;
+ high_surrogate = 0;
+ } else {
+ num = 0xFFFD;
+ }
+ }
+ else if ((num & 0xFFFFFC00) == 0xD800) { /* high-surrogate */
+ high_surrogate = num;
+ num = 0xFFFD;
+ }
+ else {
+ high_surrogate = 0;
+ /* otherwise invalid? */
+ if (num == 0xFFFE || num == 0xFFFF || num > 0x1F0000) {
+ num = 0xFFFD;
+ }
+ }
+
+ tmp = uvuni_to_utf8(buf, num);
repl = buf;
repl_len = tmp - buf;
repl_utf8 = 1;
@@ -165,6 +189,9 @@ decode_entities(pTHX_ SV* sv, HV* entity
#endif
}
}
+#ifdef UNICODE_ENTITIES
+ high_surrogate = 0;
+#endif
}
if (repl) {
@@ -174,6 +201,10 @@ decode_entities(pTHX_ SV* sv, HV* entity
t--; /* '&' already copied, undo it */
#ifdef UNICODE_ENTITIES
+ if (*s != '&') {
+ high_surrogate = 0;
+ }
+
if (!SvUTF8(sv) && repl_utf8) {
STRLEN len = t - SvPVX(sv);
if (len) {
Index: t/uentities.t
===================================================================
RCS file: /cvsroot/libwww-perl/html-parser/t/uentities.t,v
retrieving revision 1.6
retrieving revision 1.7
diff -u -p -u -r1.6 -r1.7
--- t/uentities.t 3 Oct 2003 14:50:08 -0000 1.6
+++ t/uentities.t 8 Nov 2004 12:55:06 -0000 1.7
@@ -14,7 +14,7 @@ unless (&HTML::Entities::UNICODE_SUPPORT
exit;
}
-print "1..10\n";
+print "1..13\n";
print "not " unless decode_entities("&euro") eq "\x{20AC}";
print "ok 1\n";
@@ -25,18 +25,18 @@ print "ok 2\n";
print "not " unless decode_entities("񺄠") eq chr(500000);
print "ok 3\n";
-{
- no warnings 'utf8'; # These are illegal unicode chars
- print "not " unless decode_entities("") eq "\x{FFFF}";
- print "ok 4\n";
-
- print "not " unless decode_entities("") eq chr(0x10FFFF);
- print "ok 5\n";
+print "not " unless decode_entities("") eq "\x{FFFD}";
+print "ok 4\n";
- print "not " unless decode_entities("�") eq chr(0xFFFF_FFFF);
- print "ok 6\n";
+{
+ no warnings 'utf8'; # workaround for perl bug
+ print "not " unless decode_entities("") eq chr(0x10FFFF);
+ print "ok 5\n";
}
+print "not " unless decode_entities("�") eq chr(0xFFFD);
+print "ok 6\n";
+
print "not " unless decode_entities("�") eq "\0" &&
decode_entities("�") eq "\0" &&
decode_entities("�") eq "\0" &&
@@ -77,3 +77,11 @@ print "not " if $err;
print "ok 10\n";
+print "not " unless decode_entities("��") eq chr(0x100085);
+print "ok 11\n";
+
+print "not " unless decode_entities("��") eq chr(0x100085);
+print "ok 12\n";
+
+print "not " unless decode_entities("�") eq chr(0xFFFD);
+print "ok 13\n";