Subject: | xml_strip_whitespace not stripping everything |
Hello Petr,
the current implementation of xml_strip_whitespace does not strip all
the whitespace in XML documents. I attached a script where the problem
is visible. The script already contains a "monkeypatched" solution to
the problem, so try the script with and without the monkeypatch.
I think that the problem is caused by calling unbindNode too early. Once
unbindNode is called, the node is no longer part of the document.
So when XML::LibXML::Iterator calls ->next on this node, it returns
nothing, effectively stopping iteration at that level of the DOM tree. A
solution is to first collect all nodes to be deleted, and then unbind them
after the iteration has finished (as in the monkeypatch).
Regards,
Slaven
Subject: | stripwhitespace.pl |
#!/usr/bin/perl
use strict;
use warnings;
use XML::LibXML;
use XML::Normalize::LibXML;
{
    # Monkeypatch: reopen the module's package so that the subs defined here
    # replace the ones loaded by "use XML::Normalize::LibXML" above.
    package XML::Normalize::LibXML;

    # The replacement is deliberate, so silence the "Subroutine ... redefined"
    # warnings that "use warnings" would otherwise emit for this block.
    no warnings 'redefine';

    # xml_strip_whitespace($dom, $strip_attributes)
    #
    # Walks $dom with XML::LibXML::Iterator and strips whitespace from its
    # text nodes; text nodes that become empty are removed from the tree.
    #
    #   $dom               - node to process (the script passes a document
    #                        element)
    #   $strip_attributes  - when true, xml_strip_whitespace_attributes() is
    #                        additionally called on every visited node
    #
    # Empty text nodes are only collected during the walk and unbound after
    # it has finished: unbinding a node while the iterator is positioned on
    # it detaches it from the document, and the iteration then stops early
    # at that level of the tree.
    #
    # xml_normalize() and xml_strip_whitespace_attributes() are not defined
    # here; presumably they are provided by the original module.
    sub xml_strip_whitespace {
        my ($dom, $strip_attributes) = @_;

        xml_normalize($dom);

        my @nodes_to_delete;
        my $iter = XML::LibXML::Iterator->new($dom);

        # iterate() passes the current node as the second callback argument.
        $iter->iterate(sub {
            my $node = $_[1];
            xml_strip_whitespace_attributes($node) if $strip_attributes;
            xml_strip_whitespace_text_node($node, \@nodes_to_delete);
        });

        # The walk is over, so detaching nodes can no longer disturb it.
        for my $node (@nodes_to_delete) {
            $node->unbindNode;
        }
    }

    # xml_strip_whitespace_text_node($node, $nodes_to_delete)
    #
    # Does nothing unless $node is a text node. Otherwise the node's data is
    # passed through trim() (defined elsewhere; presumably it removes leading
    # and trailing whitespace):
    #   - a non-empty result is written back to the node;
    #   - an empty result marks the node for removal.
    #
    #   $nodes_to_delete  - optional array reference collecting the nodes to
    #                       remove; the caller is responsible for unbinding
    #                       them later. When omitted, the node is unbound
    #                       immediately, so one-argument callers still get
    #                       the node removed instead of having the deletion
    #                       silently dropped.
    sub xml_strip_whitespace_text_node {
        my ($node, $nodes_to_delete) = @_;

        if ($node->nodeType() == XML::LibXML::XML_TEXT_NODE) {
            my $data = trim($node->getData());
            if ($data ne "") {
                $node->setData($data);
            }
            elsif (defined $nodes_to_delete) {
                push @$nodes_to_delete, $node;
            }
            else {
                $node->unbindNode;
            }
        }
    }
}
# Sample input: whitespace-only text nodes sit between <a/>, <b/> and <c/>.
my $sample_xml = <<EOF;
<foo><a/> <b/> <c/><d/></foo>
EOF

# Parse the sample and keep a handle on its document element.
my $parser   = XML::LibXML->new;
my $document = $parser->parse_string($sample_xml);
my $root     = $document->documentElement;

# Strip the whitespace in place, then print the resulting element.
XML::Normalize::LibXML::xml_strip_whitespace($root);
print $root->serialize(0), "\n";