Subject: | significant optimization |
Date: | Mon, 14 Nov 2011 22:09:34 -0800 |
To: | bug-Text-Similarity [...] rt.cpan.org |
From: | Tani Hosokawa <tani [...] tanihosokawa.org> |
Not a bug, but an optimization. Original version does inefficient
repeated linear search over text that can't possibly match. Instead,
precaches locations of keywords. Comparing 100 semi-randomly generated
fairly similar documents of about 500 words each results in approx 90%
speed increase, the efficiency increases as the documents get larger.
--- /usr/local/share/perl/5.12.4/Text/OverlapFinder.pm 2010-06-09
14:12:49.000000000 -0700
+++ Text/OverlapFinder.pm 2011-11-14 21:47:29.283463615 -0800
@@ -8,8 +8,8 @@
use constant MARKER => '###';
-sub contains(\@@);
-sub containsReplace(\@@);
+sub contains(\@$@);
+sub containsReplace(\@$@);
## stemmer support not available as yet
@@ -98,6 +98,10 @@
my @words0 = split /\s+/, $string0;
my @words1 = split /\s+/, $string1;
+ my %first;
+ foreach my $offset (0 .. $#words1) {
+ push @{$first{$words1[$offset]}}, $offset;
+ }
my $wc0 = scalar @words0;
my $wc1 = scalar @words1;
@@ -113,7 +117,7 @@
$currIndex++;
# if this works, carry on!
- if (contains (@words1, @words0[$matchStartIndex..$currIndex])) {
+ if (contains (@words1,
$first{$words0[$matchStartIndex]},@words0[$matchStartIndex..$currIndex])) {
next
}
else {
@@ -143,7 +147,8 @@
# check if still there in $string1. replace in string1
with a mark
if (1 #!doStop($temp)
- && containsReplace (@words1, @words0[$i..$stringEnd]))
+ && exists $first{$words0[$i]}
+ && containsReplace (@words1, $first{$words0[$i]},
@words0[$i..$stringEnd]))
{
# so its still there. we have an overlap!
my $temp = join (" ", @words0[$i..$stringEnd]);
@@ -171,7 +176,7 @@
{
# form the string
my $stringEnd = $i + $k - 1;
- last if contains (@words1, @words0[$i..$stringEnd]);
+ last if contains (@words1, $first{$words0[$i]},
@words0[$i..$stringEnd]);
$k--;
}
@@ -187,14 +192,17 @@
# returns true of the first array contains the list, otherwise returns
false
# See also containsReplace()
# e.g., contains (@Array, LIST);
-sub contains (\@@)
+sub contains (\@$@)
{
my $array2_ref = shift;
+ my $positions = shift;
+ return 0 if (not defined $positions);
my @array1 = @_;
return 0 if $#{$array2_ref} < $#array1;
- for my $j (0..($#{$array2_ref} - $#array1)) {
+ for my $j (@$positions) {
+ next if ($j > $#{$array2_ref} - $#array1);
next if $array2_ref->[$j] eq MARKER;
if ($array1[0] eq $array2_ref->[$j]) {
@@ -217,16 +225,20 @@
# same functionality as contains(), but replaces each word in the match
# with the constant MARKER
-sub containsReplace (\@@)
+sub containsReplace (\@$@)
{
my $array2_ref = shift;
+ my $positions = shift;
+ return 0 if (not defined $positions);
my @array1 = @_;
return 0 if $#{$array2_ref} < $#array1;
- for my $j (0..($#{$array2_ref} - $#array1)) {
+ for my $j (@$positions) {
+ next if ($j > $#{$array2_ref} - $#array1);
next if $array2_ref->[$j] eq MARKER;
+
if ($array1[0] eq $array2_ref->[$j]) {
my $match = 1;
for my $i (1..$#array1) {