Subject: | re-indexing a document several times corrupts the index |
After re-indexing a document twice with add_doc(), the score returned by search() becomes unrealistically high.
After updating the same document one more time via add_doc() and searching for the same common word, search() displays an error and a rebuild of the index is necessary.
I am using:
* gentoo linux 2.6.9-gentoo-r1 on i686 Pentium III
* Perl 5.8.6
* DBI 1.46
* DBD::SQLite 1.07
* DBIx::TextIndex 0.25
I have used the attached script to track down the described behaviour. It seems to happen when a document gets re-indexed and the document contains a very common word several times. If this is expected behaviour, then maybe it should be mentioned in the documentation?
(By the way: sorry for my bad English.)
#!perl
use strict;
use warnings;
use DBI;
use TestData;
use DBIx::TextIndex;
use Data::Dumper;
# Open the SQLite database stored in ./test.db (relative to the current
# directory). RaiseError is switched on and PrintError off, AutoCommit is
# enabled.
#
my %connect_attr = (
    RaiseError => 1,
    PrintError => 0,
    AutoCommit => 1,
);
my $dbh = DBI->connect( 'dbi:SQLite:dbname=./test.db', '', '', \%connect_attr );

# Create the table holding the documents that will be indexed:
# an integer primary key (doc_id) and the document text (doc).
#
$dbh->do(<<'SQL');
CREATE TABLE textindex_doc(
doc_id INT NOT NULL PRIMARY KEY,
doc TEXT)
SQL

# Prepared INSERT used below to load the sample documents; the two
# placeholders are doc_id and doc, in that order.
my $sth = $dbh->prepare('INSERT INTO textindex_doc (doc_id, doc) values (?, ?)')
    or die $dbh->errstr;
# Sample documents. They are inserted in list order with doc_id 1, 2, 3, ...
# Documents 1, 2 and 5 contain the word 'und' that the searches below use.
# NOTE(review): the fourth string looks mis-encoded (presumably it was meant
# to read "läßt heißt"); it is kept exactly as found because it is stored
# and indexed as-is -- confirm before changing it.
my $testdata = [
    'foo ist bar und das ist wunderbar',
    'und jetzt wollte ich noch einmal sage, das ich foobar finde!',
    'questions that besieged us in live are testaments of our helplessness',
    'Der Sinn, der sich aussprechen läÃt heiÃt FOO!',
    'und und und und und',
];

# Store every sample document, numbering them consecutively from 1.
my $doc_id = 1;
for my $doc ( @{$testdata} ) {
    $sth->execute( $doc_id++, $doc ) or die $dbh->errstr;
}
# Set up the text index. Documents are read from the 'doc' column of
# textindex_doc (keyed by doc_id), and the same database handle is used for
# both the documents and the index. The remaining options are handed to
# DBIx::TextIndex->new unchanged.
#
my %index_args = (
    doc_dbh                => $dbh,
    doc_table              => 'textindex_doc',
    doc_fields             => ['doc'],
    doc_id_field           => 'doc_id',
    index_dbh              => $dbh,
    collection             => 'foo',
    update_commit_interval => 15,
    proximity_index        => 1,
);
my $index = DBIx::TextIndex->new( \%index_args );

# Initialize the index, then index all five sample documents in one call.
$index->initialize();
$index->add_doc( [ 1 .. 5 ] );
# Baseline: search the 'doc' field for the common word 'und' right after the
# initial indexing run and dump whatever search() returns.
#
my %query = ( doc => 'und' );
my $results = $index->search( \%query );
print Dumper($results);

# Output reported for this step:
# $VAR1 = {
# '1' => '-0.343761567321247',
# '2' => '-0.326713819654248',
# '5' => '-0.609387539363521'
# };
# First re-index: replace the text of document 1 (the statement also writes
# doc_id back with the same value 1) and hand the document to add_doc() again.
#
$dbh->do(
    <<'SQL',
UPDATE textindex_doc set doc_id = ?, doc = ? WHERE doc_id = ?
SQL
    undef,
    1, 'wie ist und das und lahaha luhuhu foo bar', 1,
);
$index->add_doc( [1] );

# Repeat the 'und' search and dump the result for comparison with the
# baseline.
#
my $results_2 = $index->search( { 'doc' => 'und' } );
print Dumper($results_2);

# Output reported for this step:
# $VAR1 = {
# '1' => '-0.764411161363',
# '2' => '-0.529289614627142',
# '5' => '-1.04020116126465'
# };
# Second re-index: give document 1 yet another text (again rewriting doc_id
# with the unchanged value 1) and pass it to add_doc() once more.
#
$dbh->do(
    <<'SQL',
UPDATE textindex_doc set doc_id = ?, doc = ? WHERE doc_id = ?
SQL
    undef,
    1, 'unverfaenglicher Text mit dem Wort und', 1,
);
$index->add_doc( [1] );

# Search for 'und' a third time and dump the result.
#
my $results_3 = $index->search( { 'doc' => 'und' } );
print Dumper($results_3);

# Output reported for this step (note the implausible score for doc 1):
# $VAR1 = {
# '1' => '4294967291.80121',
# '2' => '-1.04702789827953',
# '5' => '-2.21734396294902'
# };
# Third re-index: update document 1 one more time (this text contains a
# non-ASCII umlaut; doc_id is again rewritten with the same value 1) and
# pass it to add_doc() again.
#
$dbh->do(<<END, undef, 1, qq(unverfänglicher Text mit dem Wort und), 1);
UPDATE textindex_doc set doc_id = ?, doc = ? WHERE doc_id = ?
END
$index->add_doc( [ 1 ] );

# Search for 'und' a fourth time.
#
my $results_4 = $index->search( { doc => 'und' } );
# Dump the outcome exactly like the three earlier searches do; previously
# this result was computed but never shown, so the failure described below
# was not visible in the script's output.
# NOTE(review): whether search() returns the message below as a plain value
# or raises it is not visible from this script -- confirm against the
# DBIx::TextIndex version in use.
print Dumper( $results_4 );
# Reported outcome:
# -> 'Your search did not produce any matching documents.'