CC: | khampton [...] totalcinema.com |
Subject: | Can't diff strings with "wide" characters because of Digest::MD5 |
Someone on the london.pm list reported that they could not use Test::XML
on UTF-8 strings.
This is because XML::SemanticDiff uses Digest::MD5, and that module
croaks when given a string containing "wide" characters (code points
above 255). The Digest::MD5 documentation describes a work-around
(encode the string as UTF-8 bytes before hashing it), which I have
applied to XML::SemanticDiff.
See attached patch.
Robin
Subject: | xml-semanticdiff-wide.patch |
diff -ur XML-SemanticDiff-0.95/SemanticDiff.pm XML-SemanticDiff-copy/SemanticDiff.pm
--- XML-SemanticDiff-0.95/SemanticDiff.pm Tue Apr 9 09:57:59 2002
+++ XML-SemanticDiff-copy/SemanticDiff.pm Thu Feb 1 13:50:17 2007
@@ -136,6 +136,7 @@
package PathFinder;
use strict;
use Digest::MD5 qw(md5_base64);
+use Encode qw(encode_utf8);
my $descendents = {};
my $position_index = {};
my $char_accumulator = {};
@@ -190,7 +191,7 @@
# $ctx->add("$text");
# $doc->{"$test_context"}->{TextChecksum} = $ctx->b64digest;
- $doc->{"$test_context"}->{TextChecksum} = md5_base64("$text");
+ $doc->{"$test_context"}->{TextChecksum} = md5_base64(encode_utf8("$text"));
if ($opts->{keepdata}) {
$doc->{"$test_context"}->{CData} = $text;
}
diff -ur XML-SemanticDiff-0.95/t/03simple_compare.t XML-SemanticDiff-copy/t/03simple_compare.t
--- XML-SemanticDiff-0.95/t/03simple_compare.t Fri May 25 05:54:12 2001
+++ XML-SemanticDiff-copy/t/03simple_compare.t Thu Feb 1 13:53:01 2007
@@ -1,5 +1,5 @@
use Test;
-BEGIN { plan tests => 2 }
+BEGIN { plan tests => 4 }
use XML::SemanticDiff;
@@ -30,3 +30,13 @@
@results = $diff->compare($xml1, $xml1);
ok(@results == 0);
+
+$xml3 = <<"EOX";
+<?xml version="1.0" encoding="UTF-8"?>
+<root>\x{263A}</root>
+EOX
+
+@results = eval { $diff->compare($xml3, $xml3) };
+ok($@, ''); # compare did not die
+ok(scalar @results, 0); # wide characters ok!
+