Subject: | Bug fix for terms that are the single character 0 |
The single term 0 (the digit zero) causes problems when indexing.
To reproduce, try indexing either of the following texts with a WhitespaceAnalyzer:
"a 0 is higher than . in ascii"
and/or
"a 0 causes problems with 0.0.0.0"
The attached file has one patch each for
lib/Plucene/Index/TermInfosWriter.pm
lib/Plucene/Index/SegmentTermEnum.pm
A unit test (t/regress-05.t) is also included that tests for this problem.
For more details see the thread titled "out-of-order term"
http://www.kasei.com/pipermail/plucene/2005-April/thread.html#345
===============================================================================
lib/Plucene/Index/TermInfosWriter.pm
131c131,132
< my $text = $term->text || "";
---
> my $text = $term->text;
> if (not defined($text)) { $text = ''; }
===============================================================================
lib/Plucene/Index/SegmentTermEnum.pm
136c136
< $self->{buffer} ||= " " x $length;
---
> if (not defined($self->{buffer})) { $self->{buffer} = " " x $length; }
===============================================================================
t/regress-05.t
#!/usr/bin/perl -w
=head1 NAME
regress-05.t
Check an index is created with the terms you expect.
Introduced for testing bugs in Plucene v 1.21 which had
problems dealing with a term that was the single character
zero (0).
We create an index using various chunks of text, then test
that each term in the index matches what we are expecting.
=cut
use strict;
use warnings;
use Plucene::Document;
use Plucene::Document::Field;
use Plucene::Index::Writer;
use Plucene::Analysis::WhitespaceAnalyzer;
use Plucene::Search::IndexSearcher;
use File::Temp qw(tempdir);
# Test::More is loaded at run time (require + import) rather than with "use",
# so that the test plan can be computed from the number of entries in @strings.
require Test::More;
$| = 0;    # NOTE(review): 0 is already Perl's default for $| (no forced flush)

# Scratch directory that holds the index; CLEANUP => 1 asks File::Temp to
# remove it when the program exits.
my $dir = tempdir(CLEANUP => 1);

# Each string is indexed on its own and yields exactly one test result.
my @strings = (
    'a simple test that should pass',
    'something lower than 0 in ascii is . (aka a period)',
    'a test with a 0 and 0.0.0.0 terms',
);
Test::More->import(tests => scalar(@strings));

# Use a named lexical instead of the global $_ so nothing called from
# test_build can clobber the loop variable, and call the sub without the
# legacy "&" sigil.
for my $string (@strings) {
    test_build($string);
}
# test_build($string)
#
# Builds a fresh index in $dir containing a single document whose "content"
# field is $string, then walks the index's term enumeration and checks that
# it yields exactly the sorted, de-duplicated whitespace-separated words of
# $string.  Emits exactly one Test::More result (pass or fail) per call and
# returns whatever ok() returns.
sub test_build {
    my $string = shift;

    # Set up our index (third constructor argument is passed as 1, as before).
    my $analyzer = Plucene::Analysis::WhitespaceAnalyzer->new();
    my $writer   = Plucene::Index::Writer->new($dir, $analyzer, 1);
    my $doc      = Plucene::Document->new;

    # Index the string and close the writer/index.
    $doc->add(Plucene::Document::Field->Text("content", $string));
    $writer->add_document($doc);
    $writer->optimize();    # This invalidates $writer
    undef $writer;          # Forces $writer->DESTROY() to be called, merging segments

    # Read the index back in and compare each term.
    my $searcher = Plucene::Search::IndexSearcher->new($dir);
    my $enum     = $searcher->reader->terms();

    # Expected terms: unique words in string-sorted order.  split ' ' skips
    # leading whitespace, so a string that starts with a space does not
    # produce a bogus empty expected term.
    my @words = split(' ', $string);
    my %seen;
    my @keys = grep { !$seen{$_}++ } sort @words;

    my $pos = 0;
    while ($enum->next) {
        my $got = $enum->term->text;

        # More terms in the index than expected: report it explicitly
        # instead of reading past the end of @keys and comparing to undef.
        if ($pos >= scalar(@keys)) {
            return ok(0, "Too many terms in the index\n" .
                "Expecting " . scalar(@keys) . " but found extra term '$got'\n" .
                "while testing the string '$string'");
        }

        my $expected = $keys[$pos++];
        if ($got ne $expected) {
            return ok(0, "Term not matching expected result\n" .
                "Expecting term '" . $expected . "' but got '" .
                $got . "'\nwhile testing the string '$string'");
        }
    }

    # Numeric comparison of the two counts (the original wrote
    # scalar(@keys ne $pos), a string comparison that only worked by accident).
    if (scalar(@keys) != $pos) {
        return ok(0, "Not enough terms in the index\n" .
            "Expecting " . scalar(@keys) . " but only found $pos\n" .
            "while testing the string '$string'");
    }

    return ok(1, "Index terms match for the string '$string'");
}
===============================================================================