Subject: | [PATCH] make abbreviation processing faster |
Currently the code loops over the abbreviations one at a time, building a separate regex for each and running it over the whole text. I used about 250 abbreviations for my data (I just went to Wikipedia and grabbed all of the abbreviations commonly used in this language), and processing became very slow.
The attached patch creates and caches a regex that processes all of the registered abbreviations at the same time. It improved performance significantly for me.
Subject: | FastAbbrv.patch |
From aa202b55f5d58fbe8442b64ccad19953d5a26a00 Mon Sep 17 00:00:00 2001
From: Nathan Glenn <nathan.g.glenn@atr-trek.co.jp>
Date: Thu, 14 May 2015 09:55:39 +0900
Subject: [PATCH] modified
---
Sentence.pm | 18 +++++++++++++++---
1 file changed, 15 insertions(+), 3 deletions(-)
diff --git a/Sentence.pm b/Sentence.pm
index eacd582..d8df532 100644
--- a/Sentence.pm
+++ b/Sentence.pm
@@ -219,7 +219,8 @@ my @MISC = qw(no esp);
my @LATIN = qw(vs etc al ibid sic);
our @ABBREVIATIONS = (@PEOPLE, @TITLE_SUFFIXES, @ARMY, @INSTITUTES, @COMPANIES, @PLACES, @MONTHS, @MISC, @LATIN );
-
+my $abbreviation_regex;
+_set_abbreviations_regex();
#==============================================================================
#
@@ -250,6 +251,7 @@ sub get_sentences {
#------------------------------------------------------------------------------
sub add_acronyms {
push @ABBREVIATIONS, @_;
+ _set_abbreviations_regex();
}
#------------------------------------------------------------------------------
@@ -264,6 +266,7 @@ sub get_acronyms {
#------------------------------------------------------------------------------
sub set_acronyms {
@ABBREVIATIONS=@_;
+ _set_abbreviations_regex();
}
#------------------------------------------------------------------------------
@@ -282,7 +285,9 @@ sub set_EOS {
cluck "Won't set \$EOS to undefined value!\n";
return $EOS;
}
- return $EOS = $new_EOS;
+ $EOS = $new_EOS;
+ _set_abbreviations_regex();
+ return $EOS;
}
#------------------------------------------------------------------------------
@@ -334,6 +339,13 @@ sub set_locale {
#
#==============================================================================
+# Pre-compile one regex covering all of @ABBREVIATIONS; rebuilt by add_acronyms, set_acronyms and set_EOS.
+sub _set_abbreviations_regex {
+ my $abbreviations = @ABBREVIATIONS ? join('|', @ABBREVIATIONS) : '(?!)'; # empty list must match nothing, not every word
+ $abbreviation_regex = qr[(\b(?:$abbreviations)$PAP\s)$EOS]is;
+ return;
+}
+
## Please email me any suggestions for optimizing these RegExps.
sub remove_false_end_of_sentence {
my ($marked_segment) = @_;
@@ -351,7 +363,7 @@ sub remove_false_end_of_sentence {
# fix "." "?" "!"
$marked_segment=~s/(['"]$P['"]\s+)$EOS/$1/sg;
## fix where abbreviations exist
- foreach (@ABBREVIATIONS) { $marked_segment=~s/(\b$_$PAP\s)$EOS/$1/isg; }
+ $marked_segment=~s/$abbreviation_regex/$1/g;
# don't break after quote unless its a capital letter.
$marked_segment=~s/(["']\s*)$EOS(\s*[[:lower:]])/$1$2/sg;
--
1.9.5.msysgit.1