Subject: | UTF8 mangling of parsed data |
UTF8 strings within XML come out as strings with the UTF8 flag set, but
set as individual bytes rather than utf8 codepoints.
Attached patch attempts to fix this and includes 2 test sets to
indicate the issue.
Tested on OS X 10.5.7 with perl 5.8.8 (system perl).
However this appears to be portable :-)
Subject: | xml-base-utf8.patch |
Bare.xs | 9 +++++++++
t/utf8-attributes.t | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
t/utf8-values.t | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 110 insertions(+), 0 deletions(-)
diff --git a/Bare.xs b/Bare.xs
index 7189f1e..29d498a 100644
--- a/Bare.xs
+++ b/Bare.xs
@@ -39,6 +39,7 @@ SV *cxml2obj() {
if( !length ) {
if( curnode->vallen ) {
SV * sv = newSVpvn( curnode->value, curnode->vallen );
+ SvUTF8_on (sv);
hv_store( output, "value", 5, sv, vhash );
if( curnode->type ) {
SV *svi = newSViv( 1 );
@@ -47,12 +48,14 @@ SV *cxml2obj() {
}
if( curnode->comlen ) {
SV * sv = newSVpvn( curnode->comment, curnode->comlen );
+ SvUTF8_on (sv);
hv_store( output, "comment", 7, sv, chash );
}
}
else {
if( curnode->vallen ) {
SV *sv = newSVpvn( curnode->value, curnode->vallen );
+ SvUTF8_on (sv);
hv_store( output, "value", 5, sv, vhash );
if( curnode->type ) {
SV *svi = newSViv( 1 );
@@ -61,6 +64,7 @@ SV *cxml2obj() {
}
if( curnode->comlen ) {
SV *sv = newSVpvn( curnode->comment, curnode->comlen );
+ SvUTF8_on (sv);
hv_store( output, "comment", 7, sv, chash );
}
@@ -126,6 +130,7 @@ SV *cxml2obj() {
hv_store( output, curatt->name, curatt->namelen, atthref, 0 );
attval = newSVpvn( curatt->value, curatt->vallen );
+ SvUTF8_on (attval);
hv_store( atth, "value", 5, attval, vhash );
attatt = newSViv( 1 );
hv_store( atth, "_att", 4, attatt, ahash );
@@ -146,6 +151,7 @@ SV *cxml2obj_simple() {
if( ( length + numatts ) == 0 ) {
if( curnode->vallen ) {
SV * sv = newSVpvn( curnode->value, curnode->vallen );
+ SvUTF8_on (sv);
return sv;
}
return newSViv( 1 ); //&PL_sv_undef;
@@ -158,6 +164,7 @@ SV *cxml2obj_simple() {
curnode = curnode->firstchild;
for( i = 0; i < length; i++ ) {
SV *namesv = newSVpvn( curnode->name, curnode->namelen );
+ SvUTF8_on (namesv);
SV **cur = hv_fetch( output, curnode->name, curnode->namelen, 0 );
@@ -209,6 +216,7 @@ SV *cxml2obj_simple() {
STRLEN len;
char *ptr = SvPV(*cur, len);
SV *newsv = newSVpvn( ptr, len );
+ SvUTF8_on (newsv);
av_push( newarray, newsv );
hv_delete( output, curnode->name, curnode->namelen, 0 );
@@ -225,6 +233,7 @@ SV *cxml2obj_simple() {
curatt = curnode->firstatt;
for( i = 0; i < numatts; i++ ) {
attval = newSVpvn( curatt->value, curatt->vallen );
+ SvUTF8_on (attval);
hv_store( output, curatt->name, curatt->namelen, attval, 0 );
if( i != ( numatts - 1 ) ) curatt = curatt->next;
}
diff --git a/t/utf8-attributes.t b/t/utf8-attributes.t
new file mode 100644
index 0000000..9dc09b2
--- /dev/null
+++ b/t/utf8-attributes.t
@@ -0,0 +1,50 @@
+#!/usr/bin/perl -w
+
+use strict;
+use warnings;
+
+# NB we have use utf8 here, but the source should be 7bit clean
+# however I need the utf8::is_utf8 and utf8::valid names which
+# are no longer exposed without the use line.
+use utf8;
+
+use Test::More qw(no_plan);
+
+use_ok('XML::Bare');
+
+my $data = {
+ hash => "#",
+ oo => "\x{f6}",
+ iso_a => "\x{c4}",
+ iso_oo => "\x{d6}",
+ aa => "\x{e4}",
+ euro => "\x{20ac}",
+};
+
+# build XML string with UTF8 values
+my $xmldata = "<data>\n";
+foreach ( keys %{$data} ) {
+ $xmldata .= " <$_ char=\"" . $data->{$_} . "\" />\n";
+}
+$xmldata .= "</data>\n";
+
+# parse the provided XML
+my $obj = new XML::Bare( text => $xmldata );
+my $root = $obj->parse;
+
+# convert back to XML from parse
+my $roundtrip = $obj->xml($root);
+
+## this isn't valid as order/spacing not preserved
+#is( $roundtrip, $xmldata, 'Round trip XML identical' );
+
+while ( my ( $name, $char ) = each %{$data} ) {
+ my $str = $root->{data}{$name}{char}{value};
+ ok( $root->{data}{$name}{char}{_att}, "$name has char attribute" );
+ ok( utf8::is_utf8($str), "Character $name is correct encoding" )
+ if ( utf8::is_utf8($char) );
+ ok( utf8::valid($str), "Character $name is Valid" );
+ ok( ( length($str) == 1 ), "String returned for $name is 1 char long" );
+
+ is( $str, $char, "Character $name OK" );
+}
diff --git a/t/utf8-values.t b/t/utf8-values.t
new file mode 100644
index 0000000..94fad45
--- /dev/null
+++ b/t/utf8-values.t
@@ -0,0 +1,51 @@
+#!/usr/bin/perl -w
+
+use strict;
+use warnings;
+
+# NB we have use utf8 here, but the source should be 7bit clean
+# however I need the utf8::is_utf8 and utf8::valid names which
+# are no longer exposed without the use line.
+use utf8;
+
+use Test::More qw(no_plan);
+
+use_ok('XML::Bare');
+
+my $data = {
+ hash => "#",
+ oo => "\x{f6}",
+ iso_a => "\x{c4}",
+ iso_oo => "\x{d6}",
+ aa => "\x{e4}",
+ euro => "\x{20ac}",
+};
+
+# build XML string with UTF8 values
+my $xmldata = "<data>\n";
+foreach ( keys %{$data} ) {
+ $xmldata .= " <$_>";
+ $xmldata .= $data->{$_};
+ $xmldata .= "</$_>\n";
+}
+$xmldata .= "</data>\n";
+
+# parse the provided XML
+my $obj = new XML::Bare( text => $xmldata );
+my $root = $obj->parse;
+
+# convert back to XML from parse
+my $roundtrip = $obj->xml($root);
+
+## this isn't valid as order/spacing not preserved
+#is( $roundtrip, $xmldata, 'Round trip XML identical' );
+
+while ( my ( $name, $char ) = each %{$data} ) {
+ my $str = $root->{data}{$name}{value};
+ ok( utf8::is_utf8($str), "Character $name is correct encoding" )
+ if ( utf8::is_utf8($char) );
+ ok( utf8::valid($str), "Character $name is Valid" );
+ ok( ( length($str) == 1 ), "String returned for $name is 1 char long" );
+
+ is( $str, $char, "Character $name OK" );
+}