Subject: | Bug fix for terms that are the single character 0 |
The single term 0 (the digit zero) causes problems when indexing.
To reproduce, try indexing either of the following texts with a WhitespaceAnalyzer:
"a 0 is higher than . in ascii"
"a 0 causes problems with"
The attached file has one patch each for
A unit test (t/regress-05.t) is also included that tests for this problem.
For more details see the thread titled "out-of-order term"
< my $text = $term->text || "";
> my $text = $term->text;
> if (not defined($text)) { $text = ''; }
< $self->{buffer} ||= " " x $length;
> if (not defined($self->{buffer})) { $self->{buffer} = " " x $length; }
#!/usr/bin/perl -w
=head1 NAME

t/regress-05.t - Check an index is created with the terms you expect.

=head1 DESCRIPTION

Introduced for testing bugs in Plucene v 1.21 which had
problems dealing with a term that was the single character
zero (0).

We create an index using various chunks of text, then test
that each term in the index matches what we are expecting.

=cut

use strict;
use warnings;
use Plucene::Document;
use Plucene::Document::Field;
use Plucene::Index::Writer;
use Plucene::Analysis::WhitespaceAnalyzer;
use Plucene::Search::IndexSearcher;
use File::Temp qw(tempdir);
require Test::More;
# Leave STDOUT autoflush off (0 is also Perl's default); kept explicit.
$| = 0;

# Scratch directory that holds the index for every test string.
# CLEANUP => 1 asks File::Temp to remove it when the program exits.
my $dir = tempdir(CLEANUP => 1);

# Each string is indexed on its own and produces exactly one test result.
my @strings = (
    'a simple test that should pass',
    'something lower than 0 in ascii is . (aka a period)',
    'a test with a 0 and terms',
);

# Test::More is pulled in with require (not use) so that the plan can be
# computed from @strings at run time.
Test::More->import(tests => scalar(@strings));

foreach my $string (@strings) {
    test_build($string);
}

# test_build($string)
#
# Builds an index in $dir containing a single document whose "content"
# field is $string, then walks the index's term enumeration and checks it
# against the sorted, de-duplicated whitespace-separated words of $string.
#
# Emits exactly one Test::More result (ok) per call:
#   - failure if a term differs from the expected one at that position,
#   - failure if the index holds more or fewer terms than expected,
#   - success otherwise.
sub test_build {
    my $string = shift;

    # Set up our index.
    # NOTE(review): the third argument (1) is presumably the "create" flag,
    # so each call starts from a fresh index in $dir -- confirm against
    # Plucene::Index::Writer.
    my $analyzer = Plucene::Analysis::WhitespaceAnalyzer->new();
    my $writer   = Plucene::Index::Writer->new($dir, $analyzer, 1);
    my $doc      = Plucene::Document->new;

    # Index the string and close the writer/index.
    $doc->add(Plucene::Document::Field->Text("content", $string));
    $writer->add_document($doc);    # without this the document never reaches the index
    $writer->optimize();            # This invalidates $writer
    undef $writer;    # Forces $writer->DESTROY() to be called, merging segments

    # Read the index back in and compare each term
    my $searcher = Plucene::Search::IndexSearcher->new($dir);
    my $enum     = $searcher->reader->terms();

    # Expected terms: the words of $string, string-sorted, with adjacent
    # duplicates removed.  split ' ' (rather than /\s+/) avoids an empty
    # leading element should a string ever start with whitespace.
    my @all = sort split(' ', $string);
    my @keys;
    for my $i (0 .. $#all) {
        next if $i > 0 and $all[$i - 1] eq $all[$i];
        push(@keys, $all[$i]);
    }

    my $pos = 0;    # number of expected terms consumed so far
    my $failure;    # diagnostic text; stays undef while everything matches

    TERM:
    while ($enum->next) {
        my $got = $enum->term->text;

        # Guard against running off the end of @keys when the index holds
        # more terms than we expect.
        if ($pos >= scalar(@keys)) {
            $failure = "Too many terms in the index\n"
                . "Expecting " . scalar(@keys) . " but found extra term '$got'\n"
                . "while testing the string '$string'";
            last TERM;
        }

        my $expected = $keys[$pos++];
        if ($got ne $expected) {
            $failure = "Term not matching expected result\n"
                . "Expecting term '" . $expected . "' but got '"
                . $got . "'\nwhile testing the string '$string'";
            last TERM;
        }
    }

    # Counts are compared numerically.
    if (not defined $failure and scalar(@keys) != $pos) {
        $failure = "Not enough terms in the index\n"
            . "Expecting " . scalar(@keys) . " but only found $pos\n"
            . "while testing the string '$string'";
    }

    if (defined $failure) {
        ok(0, $failure);
    }
    else {
        ok(1, "index terms match for the string '$string'");
    }

    return;
}