Bug #22887 for perlindex: Text::English patch

Subject:

Text::English patch

* URLs for Porter's algorithm
* Removed usage of $`, $&.
* Added warning (that implementation is broken).
* If called in scalar context, returns the first element of the array of results
* Wrote tests (uses Test::More)
* use strict

Also a patch for 'Makefile.PL' to include the license in META.yml and require Test::More (for a hypothetical perl distribution where it wouldn't be in the core).

-------
Alexandr Ciornii, http://chorny.net

Subject:

English.pm.patch

--- English.pm.dist Sun Apr 10 11:31:22 2005 +++ English.pm Tue Nov 7 23:51:14 2006 @@ -6,13 +6,15 @@ package Text::English; -$VERSION = $VERSION = '0.01'; +use strict; +#use warnings; +use vars qw/$VERSION/; +$VERSION = $VERSION = '0.02'; sub stem { my @parms = @_; foreach( @parms ) { $_ = lc $_; - # Step 0 - remove punctuation s/'s$//; s/^[^a-z]+//; s/[^a-z]+$//; next unless /^[a-z]+$/; @@ -40,42 +42,35 @@ # step2_rules - if ( s/ational$/ate/ || s/tional$/tion/ || s/enci$/ence/ || - s/anci$/ance/ || s/izer$/ize/ || s/iser$/ise/ || - s/abli$/able/ || s/alli$/al/ || s/entli$/ent/ || - s/eli$/e/ || s/ousli$/ous/ || s/ization$/ize/ || - s/isation$/ise/ || s/ation$/ate/ || s/ator$/ate/ || - s/alism$/al/ || s/iveness$/ive/ || s/fulnes$/ful/ || - s/ousness$/ous/ || s/aliti$/al/ || s/iviti$/ive/ || - s/biliti$/ble/ - ) { - my ($l,$m) = ($`,$&); -#DEBUG warn "step 2: l=$l m=$m\n"; - $_ = $l.$m unless $l =~ /[^aeiou][aeiouy]/; - } + s/(.*[^aeiou][aeiouy].*)ational$/$1ate/ || s/(.*[^aeiou][aeiouy].*)tional$/$1tion/ || s/(.*[^aeiou][aeiouy].*)enci$/$1ence/ || + s/(.*[^aeiou][aeiouy].*)anci$/$1ance/ || s/(.*[^aeiou][aeiouy].*)izer$/$1ize/ || s/(.*[^aeiou][aeiouy].*)iser$/$1ise/ || + s/(.*[^aeiou][aeiouy].*)abli$/$1able/ || s/(.*[^aeiou][aeiouy].*)alli$/$1al/ || s/(.*[^aeiou][aeiouy].*)entli$/$1ent/ || + s/(.*[^aeiou][aeiouy].*)eli$/$1e/ || s/(.*[^aeiou][aeiouy].*)ousli$/$1ous/ || s/(.*[^aeiou][aeiouy].*)ization$/$1ize/ || + s/(.*[^aeiou][aeiouy].*)isation$/$1ise/ || s/(.*[^aeiou][aeiouy].*)ation$/$1ate/ || s/(.*[^aeiou][aeiouy].*)ator$/$1ate/ || + s/(.*[^aeiou][aeiouy].*)alism$/$1al/ || s/(.*[^aeiou][aeiouy].*)iveness$/$1ive/ || s/(.*[^aeiou][aeiouy].*)fulnes$/$1ful/ || + s/(.*[^aeiou][aeiouy].*)ousness$/$1ous/ || s/(.*[^aeiou][aeiouy].*)aliti$/$1al/ || s/(.*[^aeiou][aeiouy].*)iviti$/$1ive/ || + s/(.*[^aeiou][aeiouy].*)biliti$/$1ble/; + # step3_rules - if ( s/icate$/ic/ || s/ative$// || s/alize$/al/ || - s/iciti$/ic/ || s/ical$/ic/ || s/ful$// || - s/ness$// - ) { - my 
($l,$m) = ($`,$&); -#DEBUG warn "step 3: l=$l m=$m\n"; - $_ = $l.$m unless $l =~ /[^aeiou][aeiouy]/; - } + s/(.*[^aeiou][aeiouy].*)icate$/$1ic/ || s/(.*[^aeiou][aeiouy].*)ative$/$1/ || s/(.*[^aeiou][aeiouy].*)alize$/$1al/ || + s/(.*[^aeiou][aeiouy].*)iciti$/$1ic/ || s/(.*[^aeiou][aeiouy].*)ical$/$1ic/ || s/(.*[^aeiou][aeiouy].*)ful$/$1/ || + s/(.*[^aeiou][aeiouy].*)ness$/$1/; + + # step4_rules - if ( s/al$// || s/ance$// || s/ence$// || s/er$// || - s/ic$// || s/able$// || s/ible$// || s/ant$// || - s/ement$// || s/ment$// || s/ent$// || s/sion$/s/ || - s/tion$/t/ || s/ou$// || s/ism$// || s/ate$// || - s/iti$// || s/ous$// || s/ive$// || s/ize$// || - s/ise$// + if ( s/(.*)(al$)/$1/ || s/(.*)(ance$)/$1/ || s/(.*)(ence$)/$1/ || s/(.*)(er$)/$1/ || + s/(.*)(ic$)/$1/ || s/(.*)(able$)/$1/ || s/(.*)(ible$)/$1/ || s/(.*)(ant$)/$1/ || + s/(.*)(ement$)/$1/ || s/(.*)(ment$)/$1/ || s/(.*)(ent$)/$1/ || s/(.*)(sion$)/$1s/ || + s/(.*)(tion$)/$1t/ || s/(.*)(ou$)/$1/ || s/(.*)(ism$)/$1/ || s/(.*)(ate$)/$1/ || + s/(.*)(iti$)/$1/ || s/(.*)(ous$)/$1/ || s/(.*)(ive$)/$1/ || s/(.*)(ize$)/$1/ || + s/(.*)(ise$)/$1/ ) { - my ($l,$m) = ($`,$&); + my ($l,$m) = ($1,$2); # Look for two consonant/vowel transitions # NB simplified... #DEBUG warn "step 4: l=$l m=$m\n"; - $_ = $l.$m unless $l =~ /[^aeiou][aeiouy].*[^aeiou][aeiouy]/; + $_ = $l.$m unless $l =~ /[^aeiou][aeiouy].*?[^aeiou][aeiouy]/; } # step5a_rules @@ -85,12 +80,12 @@ # step5b_rules #DEBUG warn("step 5b: $_\n") && - s/ll$/l/ if /[^aeiou][aeiouy].*[^aeiou][aeiouy].*ll$/; + s/ll$/l/ if /[^aeiou][aeiouy].*?[^aeiou][aeiouy].*ll$/; # Cosmetic step s/(.)i$/$1y/; } - @parms; + return wantarray ? @parms : $parms[0]; } 1; @@ -101,6 +96,11 @@ Text::English - Porter's stemming algorithm +=head1 Warning + +This implementation is broken. On vocabulary of 23531 words it has 3569 results +different from original implementation. 
+ =head1 SYNOPSIS use Text::English; @@ -117,7 +117,14 @@ in: Porter, M.F., "An Algorithm For Suffix Stripping," Program 14 (3), July 1980, pp. 130-137. Provenance: Written by B. Frakes and C. Cox, 1986. +http://maya.cs.depaul.edu/~classes/ds575/papers/porter-algorithm.html +(Link from http://www.tartarus.org/martin/PorterStemmer/) +C version: http://filebox.vt.edu/users/yucui/IR.code/ir-code/stemmer/stem.c + +Description of different stemmers: +http://www.pimpumpam.com/motoridiricerca/ir/chap08.htm + I have re-interpreted areas that use Frakes and Cox's "WordSize" function. My version may misbehave on short words starting with "y", but I can't think of any examples. @@ -130,9 +137,23 @@ =head1 NOTES -This is version 0.1. I would welcome feedback, especially improvements +This is version 0.01. I would welcome feedback, especially improvements to the punctuation-stripping step. +=head1 Changes + +0.02 by Alexandr Ciornii (alexchorny/@/gmail.com) + +* URLs for Porter's algorithm + +* Removed usage of $`, $&. + +* Added warning. + +* If called in scalar context returns first element of array + +* Wrote tests + =head1 AUTHOR Ian Phillipps <ian@unipalm.pipex.com>

Subject:

English.pm

#!/usr/bin/perl
#                              -*- Mode: Perl -*-
# Author          : Ian Phillipps
# Last Modified On: Sun May  2 15:35:33 2004
# Language        : CPerl

package Text::English;

use strict;
#use warnings;
use vars qw/$VERSION/;
$VERSION = '0.02';

# stem(@words)
#
# Applies this module's approximation of Porter's suffix-stripping
# algorithm to every argument.
#
# Parameters:
#   @words - the words to stem.  The arguments are copied first, so the
#            caller's variables are never modified.
#
# Returns:
#   In list context, one result per argument, in the same order.
#   In scalar context, only the result for the first argument.
#
# Every word is lower-cased and has a trailing "'s" plus any leading or
# trailing non-letters removed.  A word that still contains anything other
# than a-z after that clean-up is returned in that cleaned form without
# being stemmed.
sub stem {
    my @words = @_;

    foreach (@words) {
        $_ = lc $_;

        # Step 0 - remove punctuation
        s/'s$//;
        s/^[^a-z]+//;
        s/[^a-z]+$//;
        next unless /^[a-z]+$/;

        # step1a_rules
        if ( /[^s]s$/ ) {
            s/sses$/ss/ || s/ies$/i/ || s/s$//;
        }

        # step1b_rules.  The business with rule==106 is embedded in the
        # branch structure here: the step1b1 clean-up runs only when an
        # -ed or -ing suffix was actually removed.
        if ( /[aeiouy][^aeiouy].*eed$/ ) {
            s/eed$/ee/;
        }
        elsif ( s/([aeiou].*)ed$/$1/ || s/([aeiouy].*)ing$/$1/ ) {

            # step1b1_rules
            s/at$/ate/ || s/bl$/ble/ || s/iz$/ize/ || s/bb$/b/ ||
            s/dd$/d/   || s/ff$/f/   || s/gg$/g/   || s/mm$/m/ ||
            s/nn$/n/   || s/pp$/p/   || s/rr$/r/   || s/tt$/t/ ||
            s/ww$/w/   || s/xx$/x/   ||

            # This is wordsize==1 && CVC...addanE...
            # A capture group is used instead of $&: mentioning $& anywhere
            # slows down every regex match in the program on perls before
            # 5.20.
            s/^([^aeiouy]+[aeiouy][^aeiouy])$/$1e/;
        }

        # step1c_rules
        s/([aeiouy].*)y$/$1i/;

        # step2_rules.  A rule only fires when the part of the word in front
        # of the suffix contains a consonant/vowel transition; otherwise the
        # next rule in the chain is tried.
        s/(.*[^aeiou][aeiouy].*)ational$/$1ate/ ||
        s/(.*[^aeiou][aeiouy].*)tional$/$1tion/ ||
        s/(.*[^aeiou][aeiouy].*)enci$/$1ence/   ||
        s/(.*[^aeiou][aeiouy].*)anci$/$1ance/   ||
        s/(.*[^aeiou][aeiouy].*)izer$/$1ize/    ||
        s/(.*[^aeiou][aeiouy].*)iser$/$1ise/    ||
        s/(.*[^aeiou][aeiouy].*)abli$/$1able/   ||
        s/(.*[^aeiou][aeiouy].*)alli$/$1al/     ||
        s/(.*[^aeiou][aeiouy].*)entli$/$1ent/   ||
        s/(.*[^aeiou][aeiouy].*)eli$/$1e/       ||
        s/(.*[^aeiou][aeiouy].*)ousli$/$1ous/   ||
        s/(.*[^aeiou][aeiouy].*)ization$/$1ize/ ||
        s/(.*[^aeiou][aeiouy].*)isation$/$1ise/ ||
        s/(.*[^aeiou][aeiouy].*)ation$/$1ate/   ||
        s/(.*[^aeiou][aeiouy].*)ator$/$1ate/    ||
        s/(.*[^aeiou][aeiouy].*)alism$/$1al/    ||
        s/(.*[^aeiou][aeiouy].*)iveness$/$1ive/ ||

        # Porter's rule is FULNESS -> FUL.  The suffix used to be spelled
        # "fulnes", which can never match: after step 1a a word that ends
        # in "s" always ends in "ss".
        s/(.*[^aeiou][aeiouy].*)fulness$/$1ful/ ||
        s/(.*[^aeiou][aeiouy].*)ousness$/$1ous/ ||
        s/(.*[^aeiou][aeiouy].*)aliti$/$1al/    ||
        s/(.*[^aeiou][aeiouy].*)iviti$/$1ive/   ||
        s/(.*[^aeiou][aeiouy].*)biliti$/$1ble/;

        # step3_rules (same firing condition as step 2)
        s/(.*[^aeiou][aeiouy].*)icate$/$1ic/ ||
        s/(.*[^aeiou][aeiouy].*)ative$/$1/   ||
        s/(.*[^aeiou][aeiouy].*)alize$/$1al/ ||
        s/(.*[^aeiou][aeiouy].*)iciti$/$1ic/ ||
        s/(.*[^aeiou][aeiouy].*)ical$/$1ic/  ||
        s/(.*[^aeiou][aeiouy].*)ful$/$1/     ||
        s/(.*[^aeiou][aeiouy].*)ness$/$1/;

        # step4_rules.  Unlike steps 2 and 3, the first suffix that matches
        # ends the chain; the removal is then undone if the remaining stem
        # is too short.
        if (   s/(.*)(al$)/$1/    || s/(.*)(ance$)/$1/ || s/(.*)(ence$)/$1/
            || s/(.*)(er$)/$1/    || s/(.*)(ic$)/$1/   || s/(.*)(able$)/$1/
            || s/(.*)(ible$)/$1/  || s/(.*)(ant$)/$1/  || s/(.*)(ement$)/$1/
            || s/(.*)(ment$)/$1/  || s/(.*)(ent$)/$1/  || s/(.*)(sion$)/$1s/
            || s/(.*)(tion$)/$1t/ || s/(.*)(ou$)/$1/   || s/(.*)(ism$)/$1/
            || s/(.*)(ate$)/$1/   || s/(.*)(iti$)/$1/  || s/(.*)(ous$)/$1/
            || s/(.*)(ive$)/$1/   || s/(.*)(ize$)/$1/  || s/(.*)(ise$)/$1/ )
        {
            my ( $stem, $suffix ) = ( $1, $2 );

            # Look for two consonant/vowel transitions
            # NB simplified...
            $_ = $stem . $suffix
                unless $stem =~ /[^aeiou][aeiouy].*?[^aeiou][aeiouy]/;
        }

        # step5a_rules
        s/e$//
            if ( /[^aeiou][aeiouy].*[^aeiou][aeiouy].*e$/
              || ( /[aeiou][^aeiouy].*e/
                && !/[^aeiou][aeiouy][^aeiouwxy]e$/ ) );

        # step5b_rules
        s/ll$/l/ if /[^aeiou][aeiouy].*?[^aeiou][aeiouy].*ll$/;

        # Cosmetic step
        s/(.)i$/$1y/;
    }

    return wantarray ? @words : $words[0];
}

1;

__END__

=head1 NAME

Text::English - Porter's stemming algorithm

=head1 Warning

This implementation is broken. On vocabulary of 23531 words it has 3569 results
different from original implementation.

=head1 SYNOPSIS

    use Text::English;
    @stems = Text::English::stem( @words );

=head1 DESCRIPTION

This routine applies the Porter Stemming Algorithm to its parameters,
returning the stemmed words.
It is derived from the C program "stemmer.c"
as found in freewais and elsewhere, which contains these notes:

   Purpose:    Implementation of the Porter stemming algorithm documented
               in: Porter, M.F., "An Algorithm For Suffix Stripping,"
               Program 14 (3), July 1980, pp. 130-137.
   Provenance: Written by B. Frakes and C. Cox, 1986.

http://maya.cs.depaul.edu/~classes/ds575/papers/porter-algorithm.html
(Link from http://www.tartarus.org/martin/PorterStemmer/)

C version: http://filebox.vt.edu/users/yucui/IR.code/ir-code/stemmer/stem.c

Description of different stemmers:
http://www.pimpumpam.com/motoridiricerca/ir/chap08.htm

I have re-interpreted areas that use Frakes and Cox's "WordSize"
function. My version may misbehave on short words starting with "y",
but I can't think of any examples.

The step numbers correspond to Frakes and Cox, and are probably in
Porter's article (which I've not seen).
Porter's algorithm still has rough spots (e.g current/currency, -ings words),
which I've not attempted to cure, although I have added
support for the British -ise suffix.

=head1 NOTES

This is version 0.02. I would welcome feedback, especially improvements
to the punctuation-stripping step.

=head1 Changes

0.02 by Alexandr Ciornii (alexchorny/@/gmail.com)

* URLs for Porter's algorithm

* Removed usage of $`, $&.

* Added warning.

* If called in scalar context returns first element of array

* Wrote tests

=head1 AUTHOR

Ian Phillipps <ian@unipalm.pipex.com>

=head1 COPYRIGHT

Copyright Public IP Exchange Ltd (PIPEX).
Available for use under the same terms as perl.

=cut

Subject:

Makefile.PL.patch

--- Makefile.PL.dist Sun Jul 2 11:32:23 2006 +++ Makefile.PL Wed Nov 8 00:14:50 2006 @@ -23,9 +23,14 @@ 'DEFINE' => '', # e.g., '-DHAVE_SOMETHING' 'INC' => '', # e.g., '-I/usr/include/other' 'dist' => { SUFFIX => "gz", COMPRESS => "gzip -f"}, + 'PREREQ_PM' => { # we do bundle that module in the distribution -# 'PREREQ_PM' => { 'Text::English' => 0 }, +# 'Text::English' => 0, + 'Test::More' => 0, + }, 'EXE_FILES' => [ 'perlindex' ], 'clean' => { 'FILES' => 'perlindex' }, + ($ExtUtils::MakeMaker::VERSION ge '6.30_00'? + ('LICENSE' => 'perl', ) : ()), );

Subject:

stem.t

# Regression tests for Text::English::stem.
#
# The expected values pin the behaviour of this implementation.  Where a
# comment says so, that behaviour is known to differ from the original
# algorithm.
use strict;
#use warnings;

# Declare the plan at import time.  The former
# "plan tests => 4, todo => []" passed a Test.pm-style "todo" argument,
# which Test::More's plan does not support.
use Test::More tests => 4;
use Text::English;

my @stems = Text::English::stem(qw/abandoned surprise constancies constancy blesses/);
is_deeply( \@stems, [qw/abandon surpris constanc constanc bless/],
    'list context: one stem per word, in order' );

@stems = Text::English::stem(qw/rational traditional/);
is_deeply( \@stems, [qw/ration tradit/],
    'rational / traditional' );    #different from original algorithm

@stems = Text::English::stem(qw/excellence excellencies/);
is_deeply( \@stems, [qw/excellenc excellenc/],
    'excellence / excellencies share a stem' );    #different from original algorithm

# Scalar context returns the stem of the first word.
my $stem = Text::English::stem(qw/loyalty/);
is( $stem, 'loyalty', 'scalar context returns the first stem' );