*** perlindex.PL~ Sun Oct 19 10:25:51 2008
--- perlindex.PL Sun Oct 19 11:32:51 2008
***************
*** 14,21 ****
# under the same terms as Perl itself.
#
#
! # %SEEN is used to store the absolute pathes to files which have been
! # indexed. Probably this could be replaced by %FN.
#
# %FN $FN{'last'} greatest documentid
# $FN{$did} a pair of $mtf and $filename where $mtf is the
--- 14,21 ----
# under the same terms as Perl itself.
#
#
! # %SEEN is used to store the mtime and absolute pathes to
! # files which have been indexed.
#
# %FN $FN{'last'} greatest documentid
# $FN{$did} a pair of $mtf and $filename where $mtf is the
***************
*** 124,139 ****
&find($dir);
}
}
for $name (@ARGV) {
my $fns = $name;
$fns =~ s:\Q$prefix/::;
! next if $SEEN{$fns};
next unless -f $name;
if ($name !~ /(~|,v)$/) {
$did = $FN{'last'}++;
! $SEEN{$fns} = &index($name, $fns, $did);
}
}
untie %IF;
untie %IDF;
untie %FN;
--- 124,171 ----
&find($dir);
}
}
+ my $gc_required = 0;
for $name (@ARGV) {
my $fns = $name;
$fns =~ s:\Q$prefix/::;
! if ($SEEN{$fns}) {
! my ($mtime, $did) = unpack "$p$p", $SEEN{$fns};
! if ((stat $name)[9] > $mtime) {
! # mark document as deleted
! delete $FN{$did};
! warn "Marking $did ($name) as deleted\n";
! $gc_required++;
! } else {
! # index up to date
! next;
! }
! }
next unless -f $name;
if ($name !~ /(~|,v)$/) {
$did = $FN{'last'}++;
! if (&index($name, $fns, $did)) {
! my ($mtime) = (stat $name)[9];
! $SEEN{$fns} = pack "$p$p", (stat $name)[9], $did;
! }
}
}
+ if ($gc_required) {
+ # garbadge collection, this is awfully slow
+ while (my ($word,$list) = each %IF) {
+ my %post = unpack($p.'*',$list);
+
+ delete $IF{$word};
+ while (my ($did,$tf) = each %post) {
+ if (exists $FN{$did}) {
+ $IF{$word} = '' unless defined $IF{$word}; # perl -w
+ $IF{$word} .= pack($p.$p, $did, $tf{$word});
+ } else {
+ #warn "Delete $did from list for $word\n";
+ $IDF{$word}--;
+ }
+ }
+ }
+ }
untie %IF;
untie %IDF;
untie %FN;
***************
*** 294,299 ****
--- 326,333 ----
my %post = unpack($p.'*',$IF{$word});
my $idf = log($FN{'last'}/$IDF{$word});
for $did (keys %post) {
+ # skip deleted documents
+ next unless exists $FN{$did};
my ($maxtf) = unpack($p, $FN{$did});
$score{$did} = 0 unless defined $score{$did}; # perl -w
$score{$did} += $post{$did} / $maxtf * $idf;