Subject: | XML::LibXML whitespace settings can leak between parsers |
Parsing with one parser that ignores whitespace can cause other parsers to ignore whitespace even when they have not requested it.
It appears to be a result of the code reintroduced in 1.97 for https://rt.cpan.org/Public/Bug/Display.html?id=76696
See the attached test, which we developed while trying to reproduce the issue reliably. The "no_blanks" setting appears to get latched during the parse_string calls in a weird way that we don't entirely understand.
We have a potential patch that I'll be posting shortly. It involves resetting the xmlKeepBlanksDefaultValue after the parsing has completed. We're still working on testing it as we don't fully understand the underlying issue.
Subject: | 30blanks.t |
#!/usr/bin/perl
# Whitespace handling (no_blanks) bleeds from parser to parser.
# Bottom line up front: no_blanks is latched not in the parser instance
# but somewhere else (likely the C code),
# in a weird way
# that I have not been able to figure out how to fix or work around.
use warnings;
use strict;
use Test::More tests => 23;
use XML::LibXML;
# Test fixtures. The only text nodes in this markup are the whitespace runs
# around <bar/>, so textContent shows directly whether blanks were kept.
# $chunk is the bare element markup; it is also fed to parse_balanced_chunk.
my $chunk
= "<foo>\n"
. " <bar/>\n"
. "</foo>\n";
# The same markup preceded by an XML declaration; used with parse_string.
my $sampleXML
= qq{<?xml version="1.0" encoding="utf-8"?>\n$chunk};
# Expected textContent when whitespace is preserved: the "\n " in front of
# <bar/> followed by the "\n" after it.
my $exp_wsp = "\n \n";
# Sequence 1 (reporter: "weird sequence I don't understand").
# Each call builds a brand-new parser with the given options, parses
# $sampleXML and asserts the result that is correct for those options.
# The numbered trailing notes record what the reporter actually observed.
# The order of the calls is the test itself -- do not reorder them.
assert_text_content([], $sampleXML, $exp_wsp); # 1. observed: whitespace kept, as expected
assert_text_content([no_blanks=>1], $sampleXML, ''); # 2. observed: no whitespace, as expected
assert_text_content([], $sampleXML, $exp_wsp); # 3. the known bug: should show whitespace but doesn't
assert_text_content([], $sampleXML, $exp_wsp); # 4. surprise: whitespace is back
assert_text_content([no_blanks=>1], $sampleXML, ''); # 5. observed: no whitespace, as expected
assert_text_content([no_blanks=>0], $sampleXML, $exp_wsp); # 6. frustrating: explicit no_blanks=>0 still shows no whitespace
# (reporter's aside on step 6: the sequence further below hints that it sometimes might)
assert_text_content([], $sampleXML, $exp_wsp); # 7. now (!!?) it shows whitespace
# Sequence 2: long-lived parser objects that are re-used across several parses.
# Reporter's note: the following sequence had me believing that explicitly
# new'ing up with no_blanks=>0 would work around the fault.
# Shorthand used in the notes below: D1/S1, D2 and D3 appear to refer to the
# documents/text produced by $no_blanks_parser, $no_args_parser and
# $blanks_parser respectively -- inferred from the variable names.
# Expected serializations once blank text nodes have been dropped.
my $exp_no_wsp_chunk = qq{<foo><bar/></foo>\n};
my $exp_no_wsp = qq{<?xml version="1.0" encoding="utf-8"?>\n$exp_no_wsp_chunk};
# Parser 1: explicitly asks for blanks to be dropped.
my $no_blanks_parser = XML::LibXML->new(no_blanks => 1);
my $d1 = $no_blanks_parser->parse_string($sampleXML);
# NOTE(review): is() is given a document object and a string here, so these
# checks presumably rely on XML::LibXML stringifying nodes to XML -- confirm.
is($d1, $exp_no_wsp, 'XML::LibXML->new(no_blanks => 1) parse_string');
is($no_blanks_parser->parse_balanced_chunk($chunk), $exp_no_wsp_chunk, 'XML::LibXML->new(no_blanks => 1) parse_balanced_chunk');
# Parser 2: constructed with no options at all; it must keep whitespace even
# though parser 1 has just parsed with no_blanks=>1.
my $no_args_parser = XML::LibXML->new;
my $d2 = $no_args_parser->parse_string($sampleXML);
is($d2, $sampleXML, 'XML::LibXML->new parse_string');
my $s2 = $d2->textContent;
is($s2, $exp_wsp, 'XML::LibXML->new textContent');
# Okay, here's some REALLY weird stuff (same instance, new parse):
my $d2b = $no_args_parser->parse_string($sampleXML);
my $s2b = $d2b->textContent;
is($s2b, $exp_wsp, 'XML::LibXML->new textContent');
# Note: new'ing up another parser with no_blanks=>0
# (without running parse_string on it)
# is NOT enough to break D1 or fix D2:
my $blanks_parser = XML::LibXML->new(no_blanks => 0);
my $d1b = $no_blanks_parser->parse_string($sampleXML);
is($d1b, $exp_no_wsp, 'XML::LibXML->new(no_blanks => 1) parse_string 2');
my $d2c = $no_args_parser->parse_string($sampleXML);
is($d2c, $sampleXML, 'XML::LibXML->new parse_string 2');
# But WAIT: if we re-parse (same instance) we get whitespace now.
my $d2d = $no_args_parser->parse_string($sampleXML);
is($d2d, $sampleXML, 'XML::LibXML->new parse_string 3');
is($d2d->textContent, $exp_wsp, 'XML::LibXML->new textContent 2');
# Note: new'ing up with no_blanks=>0 AND running parse_string
# turned off "no_blanks" for this parser (it has textContent),
# BUT did not break D1 or fix D2.
# D3 (FIXED: has whitespace): $d3
my $d3 = $blanks_parser->parse_string($sampleXML);
is($d3, $sampleXML, 'XML::LibXML->new(no_blanks => 0) parse_string');
my $s3 = $d3->textContent;
is($s3, $exp_wsp, 'XML::LibXML->new(no_blanks => 0) textContent');
# Redo D1 (still no_blanks = GOOD).
is(
$no_blanks_parser->parse_string($sampleXML), $exp_no_wsp,
'XML::LibXML->new(no_blanks => 1) parse_string 3'
);
# Redo D2 (did the previous parse re-break no_blanks for it?).
is(
$no_args_parser->parse_string($sampleXML), $sampleXML,
'XML::LibXML->new parse_string 4'
);
# Earlier experiments had suggested a workaround might be to edit
# XML::LibXML to always set no_blanks to something, even zero.
# Now we believe the latching happens in parse_string.
# We wanted to confirm that setting the 3rd instance to no_blanks=>0
# does not mess up the original's =>1.
# We now believe already-parsed documents (the D's) are not changed (??).
# Redo S1 (still no_blanks): $d1 was parsed at the start of this sequence.
is($d1->textContent, '', 'XML::LibXML->new(no_blanks => 1) textContent');
my $d4 = $no_blanks_parser->parse_string($sampleXML); # reporter's note: this latches the no_blanks from P1 (presumably $no_blanks_parser)
my $s4 = $d4->textContent;
# There should be NO newline in S4.
is($s4, '', 'XML::LibXML->new(no_blanks => 1) textContent 2');
# Redo S1 (still no_blanks): the earlier document must be unaffected by the parse above.
is($d1->textContent, '', 'XML::LibXML->new(no_blanks => 1) textContent 3');
# End of the script proper; only the helper sub definition follows.
exit;
# Parse $xml with a brand-new parser and check the document's text content.
#   $args - array ref of options passed to XML::LibXML->new (false => none)
#   $xml  - XML source string handed to parse_string
#   $exp  - expected textContent of the parsed document
# Emits exactly one Test::More is() result and returns nothing.
sub assert_text_content {
    my $args = shift(@_) || [];
    my $xml  = shift @_;
    my $exp  = shift @_;

    # Keep the parser scoped to this block so it is released before the
    # comparison runs, as with the temporaries of a single chained call.
    my $text = do {
        my $parser = XML::LibXML->new( @{$args} );
        $parser->parse_string($xml)->textContent;
    };

    # The options are interpolated into the test name to identify the case.
    my $label = "XML::LibXML->new(@$args)->parse_string->textContent";
    is( $text, $exp, $label );
    return;
}