Subject: | HTML::TreeBuilder sometimes removes [PATCH] |
I've discovered that HTML::TreeBuilder sometimes removes
characters from the tree. I investigated, and discovered the problem
was that \s both did and did not match \xA0 (the non-breaking space),
depending on whether Perl thought the string contained UTF-8 or not.
This is really a bug in Perl (see
http://guest:guest@rt.perl.org/rt3/Ticket/Display.html?id=36839), but it
is apparently not going to get fixed any time soon.
The attached patch avoids the problem by never using \s (or \S) when we
do not want \xA0 to be considered whitespace. Instead, it uses a
character class that explicitly lists the whitespace characters
[\n\r\f\t ]. It also adds a new test (unicode.t) to demonstrate the
problem and ensure the fix works. The patch is against
HTML-Tree-3.19_04, but can be applied as far back as 3.17 (although you
may get some conflicts due to extraneous whitespace in the code, they're
not hard to resolve manually).
Thanks for resuming development on this very useful module.
--
Chris Madsen cjm@pobox.com
------------------ http://www.pobox.com/~cjm ------------------
Subject: | unicode.patch |
diff -ur HTML-Tree-3.19_04/lib/HTML/Element.pm HTML-Tree-3.19_04cjm/lib/HTML/Element.pm
--- HTML-Tree-3.19_04/lib/HTML/Element.pm 2006-01-31 18:51:14.000000000 -0600
+++ HTML-Tree-3.19_04cjm/lib/HTML/Element.pm 2006-02-04 11:48:41.158809374 -0600
@@ -1282,7 +1282,7 @@
# thru this sibling list. I doubt it actually matters, tho.
next;
}
- next unless $sibs->[$i] =~ m<^\s+$>s; # it's /all/ whitespace
+ next if $sibs->[$i] =~ m<[^\n\r\f\t ]>s; # it's /all/ whitespace
print "Under $ptag whose canTighten ",
"value is ", 0 + $HTML::Element::canTighten{$ptag}, ".\n"
@@ -1559,7 +1559,7 @@
push @html, $node; # say no go
} else {
if($last_tag_tightenable) {
- $node =~ s<\s+>< >s;
+ $node =~ s<[\n\r\f\t ]+>< >s;
#$node =~ s< $><>s;
$node =~ s<^ ><>s;
push
@@ -1661,9 +1661,9 @@
sub as_trimmed_text {
my $text = shift->as_text(@_);
- $text =~ s/\s+$//s;
- $text =~ s/^\s+//s;
- $text =~ s/\s+/ /g;
+ $text =~ s/[\n\r\f\t ]+$//s;
+ $text =~ s/^[\n\r\f\t ]+//s;
+ $text =~ s/[\n\r\f\t ]+/ /g;
return $text;
}
diff -ur HTML-Tree-3.19_04/lib/HTML/TreeBuilder.pm HTML-Tree-3.19_04cjm/lib/HTML/TreeBuilder.pm
--- HTML-Tree-3.19_04/lib/HTML/TreeBuilder.pm 2005-12-18 22:57:07.000000000 -0600
+++ HTML-Tree-3.19_04cjm/lib/HTML/TreeBuilder.pm 2006-02-04 11:45:45.670887934 -0600
@@ -690,7 +690,7 @@
($sibs = ( $par = $self->{'_pos'} || $self )->{'_content'})
and @$sibs # parent already has content
and !ref($sibs->[-1]) # and the last one there is a text node
- and $sibs->[-1] !~ m<\S>s # and it's all whitespace
+ and $sibs->[-1] !~ m<[^\n\r\f\t ]>s # and it's all whitespace
and ( # one of these has to be eligible...
$HTML::TreeBuilder::canTighten{$tag}
@@ -1024,8 +1024,8 @@
$pos->push_content($text);
} else {
# return unless $text =~ /\S/; # This is sometimes wrong
-
- if (!$self->{'_implicit_tags'} || $text !~ /\S/) {
+
+ if (!$self->{'_implicit_tags'} || $text !~ /[^\n\r\f\t ]/) {
# don't change anything
} elsif ($ptag eq 'head' or $ptag eq 'noframes') {
if($self->{'_implicit_body_p_tag'}) {
@@ -1103,8 +1103,9 @@
#print "POS is now $pos, ", $pos->{'_tag'}, "\n";
return if $ignore_text;
- $text =~ s/\s+/ /g unless $no_space_compacting ; # canonical space
-
+ $text =~ s/[\n\r\f\t ]+/ /g # canonical space
+ unless $no_space_compacting ;
+
print
$indent, " (Attaching text node ($nugget) under ",
# was: $self->{'_pos'} ? $self->{'_pos'}{'_tag'} : $self->{'_tag'},
diff -urN HTML-Tree-3.19_04/t/unicode.t HTML-Tree-3.19_04cjm/t/unicode.t
--- HTML-Tree-3.19_04/t/unicode.t 1969-12-31 18:00:00.000000000 -0600
+++ HTML-Tree-3.19_04cjm/t/unicode.t 2006-02-04 11:35:24.178656756 -0600
@@ -0,0 +1,114 @@
+#!perl -w
+# -*-Perl-*-
+# Time-stamp: "2003-09-15 01:45:14 ADT"
+
+use strict;
+use Test::More;
+my $DEBUG = 2;
+
+BEGIN {
+ # Make sure we've got Unicode support:
+ eval "use v5.8.0; utf8::is_utf8('x');";
+ if ($@) {
+ plan skip_all => "Perl 5.8.0 or newer required for Unicode tests";
+ exit;
+ }
+
+ plan tests => 11;
+} # end BEGIN
+
+use Encode;
+use HTML::TreeBuilder;
+
+print "#Using Encode version v", $Encode::VERSION || "?", "\n";
+print "#Using HTML::TreeBuilder version v$HTML::TreeBuilder::VERSION\n";
+print "#Using HTML::Element version v$HTML::Element::VERSION\n";
+print "#Using HTML::Parser version v", $HTML::Parser::VERSION || "?", "\n";
+print "#Using HTML::Entities version v", $HTML::Entities::VERSION || "?", "\n";
+print "#Using HTML::Tagset version v", $HTML::Tagset::VERSION || "?", "\n";
+print "# Running under perl version $] for $^O",
+ (chr(65) eq 'A') ? "\n" : " in a non-ASCII world\n";
+print "# Win32::BuildNumber ", &Win32::BuildNumber(), "\n"
+ if defined(&Win32::BuildNumber) and defined &Win32::BuildNumber();
+print "# MacPerl verison $MacPerl::Version\n"
+ if defined $MacPerl::Version;
+printf
+ "# Current time local: %s\n# Current time GMT: %s\n",
+ scalar(localtime($^T)), scalar(gmtime($^T));
+
+ok 1;
+
+ok same('<p> </p>', decode('latin1', "<p>\xA0</p>"));
+
+ok !same('<p></p>', decode('latin1', "<p>\xA0</p>"), 1);
+ok !same('<p> </p>', decode('latin1', "<p>\xA0</p>"), 1);
+
+ok same('<p> </p>', decode('latin1', "<p>\xA0\xA0\xA0</p>"));
+ok same("<p>\xA0\xA0\xA0</p>", decode('latin1', "<p>\xA0\xA0\xA0</p>"));
+
+ok !same('<p></p>', decode('latin1', "<p>\xA0\xA0\xA0</p>"), 1);
+ok !same('<p> </p>', decode('latin1', "<p>\xA0\xA0\xA0</p>"), 1);
+
+ok same('<p> — </p>',
+ "<p>\xA0\xA0\x{2014}\xA0\xA0</p>");
+
+ok same('<p> XXmdashXX </p>',
+ "<p>\xA0\xA0\x{2014}\xA0\xA0</p>",
+ 0, sub { $_[0] =~ s/XXmdashXX/—/ });
+
+ok same('<p> <b>bold</b> </p>',
+ decode('latin1', "<p>\xA0<b>bold</b>\xA0\xA0</p>"));
+
+sub same {
+ my($code1, $code2, $flip, $fixup) = @_;
+ my $t1 = HTML::TreeBuilder->new;
+ my $t2 = HTML::TreeBuilder->new;
+
+ if(ref $code1) { $t1->implicit_tags(0); $code1 = $$code1 }
+ if(ref $code2) { $t2->implicit_tags(0); $code2 = $$code2 }
+
+ $t1->parse($code1); $t1->eof;
+ $t2->parse($code2); $t2->eof;
+
+ my $out1 = $t1->as_XML;
+ my $out2 = $t2->as_XML;
+
+ $fixup->($out1, $out2) if $fixup;
+
+ my $rv = ($out1 eq $out2);
+
+ #print $rv? "RV TRUE\n" : "RV FALSE\n";
+ #print $flip? "FLIP TRUE\n" : "FLIP FALSE\n";
+
+ if($flip ? (!$rv) : $rv) {
+ if($DEBUG > 2) {
+ print
+ "In1 $code1\n",
+ "In2 $code2\n",
+ "Out1 $out1\n",
+ "Out2 $out2\n",
+ "\n\n";
+ }
+ } else {
+ local $_;
+ foreach my $line (
+ '',
+ "The following failure is at " . join(' : ' ,caller),
+ "Explanation of failure: " . ($flip ? 'same' : 'different')
+ . " parse trees!",
+ sprintf("Input code 1 (utf8=%d):", utf8::is_utf8($code1)), $code1,
+ sprintf("Input code 2 (utf8=%d):", utf8::is_utf8($code2)), $code2,
+ "Output tree (as XML) 1:", $out1,
+ "Output tree (as XML) 2:", $out2,
+ ) {
+ $_ = $line;
+ s/\n/\n# /g;
+ print "# ", $_, "\n";
+ }
+ }
+
+ $t1->delete;
+ $t2->delete;
+
+ return $rv;
+} # end same