Subject: | Text::English patch |
* URLs for Porter's algorithm
* Removed usage of $`, $&.
* Added warning (that implementation is broken).
* If called in scalar context returns first element of array of results
* Wrote tests (uses Test::More)
* use strict
Also a patch for 'Makefile.PL' to include the license in META.yml and require
Test::More (for a hypothetical perl distribution where it wouldn't be in
the core).
-------
Alexandr Ciornii, http://chorny.net
Subject: | English.pm.patch |
--- English.pm.dist Sun Apr 10 11:31:22 2005
+++ English.pm Tue Nov 7 23:51:14 2006
@@ -6,13 +6,15 @@
package Text::English;
-$VERSION = $VERSION = '0.01';
+use strict;
+#use warnings;
+use vars qw/$VERSION/;
+$VERSION = $VERSION = '0.02';
sub stem {
my @parms = @_;
foreach( @parms ) {
$_ = lc $_;
-
# Step 0 - remove punctuation
s/'s$//; s/^[^a-z]+//; s/[^a-z]+$//;
next unless /^[a-z]+$/;
@@ -40,42 +42,35 @@
# step2_rules
- if ( s/ational$/ate/ || s/tional$/tion/ || s/enci$/ence/ ||
- s/anci$/ance/ || s/izer$/ize/ || s/iser$/ise/ ||
- s/abli$/able/ || s/alli$/al/ || s/entli$/ent/ ||
- s/eli$/e/ || s/ousli$/ous/ || s/ization$/ize/ ||
- s/isation$/ise/ || s/ation$/ate/ || s/ator$/ate/ ||
- s/alism$/al/ || s/iveness$/ive/ || s/fulnes$/ful/ ||
- s/ousness$/ous/ || s/aliti$/al/ || s/iviti$/ive/ ||
- s/biliti$/ble/
- ) {
- my ($l,$m) = ($`,$&);
-#DEBUG warn "step 2: l=$l m=$m\n";
- $_ = $l.$m unless $l =~ /[^aeiou][aeiouy]/;
- }
+ s/(.*[^aeiou][aeiouy].*)ational$/$1ate/ || s/(.*[^aeiou][aeiouy].*)tional$/$1tion/ || s/(.*[^aeiou][aeiouy].*)enci$/$1ence/ ||
+ s/(.*[^aeiou][aeiouy].*)anci$/$1ance/ || s/(.*[^aeiou][aeiouy].*)izer$/$1ize/ || s/(.*[^aeiou][aeiouy].*)iser$/$1ise/ ||
+ s/(.*[^aeiou][aeiouy].*)abli$/$1able/ || s/(.*[^aeiou][aeiouy].*)alli$/$1al/ || s/(.*[^aeiou][aeiouy].*)entli$/$1ent/ ||
+ s/(.*[^aeiou][aeiouy].*)eli$/$1e/ || s/(.*[^aeiou][aeiouy].*)ousli$/$1ous/ || s/(.*[^aeiou][aeiouy].*)ization$/$1ize/ ||
+ s/(.*[^aeiou][aeiouy].*)isation$/$1ise/ || s/(.*[^aeiou][aeiouy].*)ation$/$1ate/ || s/(.*[^aeiou][aeiouy].*)ator$/$1ate/ ||
+ s/(.*[^aeiou][aeiouy].*)alism$/$1al/ || s/(.*[^aeiou][aeiouy].*)iveness$/$1ive/ || s/(.*[^aeiou][aeiouy].*)fulnes$/$1ful/ ||
+ s/(.*[^aeiou][aeiouy].*)ousness$/$1ous/ || s/(.*[^aeiou][aeiouy].*)aliti$/$1al/ || s/(.*[^aeiou][aeiouy].*)iviti$/$1ive/ ||
+ s/(.*[^aeiou][aeiouy].*)biliti$/$1ble/;
+
# step3_rules
- if ( s/icate$/ic/ || s/ative$// || s/alize$/al/ ||
- s/iciti$/ic/ || s/ical$/ic/ || s/ful$// ||
- s/ness$//
- ) {
- my ($l,$m) = ($`,$&);
-#DEBUG warn "step 3: l=$l m=$m\n";
- $_ = $l.$m unless $l =~ /[^aeiou][aeiouy]/;
- }
+ s/(.*[^aeiou][aeiouy].*)icate$/$1ic/ || s/(.*[^aeiou][aeiouy].*)ative$/$1/ || s/(.*[^aeiou][aeiouy].*)alize$/$1al/ ||
+ s/(.*[^aeiou][aeiouy].*)iciti$/$1ic/ || s/(.*[^aeiou][aeiouy].*)ical$/$1ic/ || s/(.*[^aeiou][aeiouy].*)ful$/$1/ ||
+ s/(.*[^aeiou][aeiouy].*)ness$/$1/;
+
+
# step4_rules
- if ( s/al$// || s/ance$// || s/ence$// || s/er$// ||
- s/ic$// || s/able$// || s/ible$// || s/ant$// ||
- s/ement$// || s/ment$// || s/ent$// || s/sion$/s/ ||
- s/tion$/t/ || s/ou$// || s/ism$// || s/ate$// ||
- s/iti$// || s/ous$// || s/ive$// || s/ize$// ||
- s/ise$//
+ if ( s/(.*)(al$)/$1/ || s/(.*)(ance$)/$1/ || s/(.*)(ence$)/$1/ || s/(.*)(er$)/$1/ ||
+ s/(.*)(ic$)/$1/ || s/(.*)(able$)/$1/ || s/(.*)(ible$)/$1/ || s/(.*)(ant$)/$1/ ||
+ s/(.*)(ement$)/$1/ || s/(.*)(ment$)/$1/ || s/(.*)(ent$)/$1/ || s/(.*)(sion$)/$1s/ ||
+ s/(.*)(tion$)/$1t/ || s/(.*)(ou$)/$1/ || s/(.*)(ism$)/$1/ || s/(.*)(ate$)/$1/ ||
+ s/(.*)(iti$)/$1/ || s/(.*)(ous$)/$1/ || s/(.*)(ive$)/$1/ || s/(.*)(ize$)/$1/ ||
+ s/(.*)(ise$)/$1/
) {
- my ($l,$m) = ($`,$&);
+ my ($l,$m) = ($1,$2);
# Look for two consonant/vowel transitions
# NB simplified...
#DEBUG warn "step 4: l=$l m=$m\n";
- $_ = $l.$m unless $l =~ /[^aeiou][aeiouy].*[^aeiou][aeiouy]/;
+ $_ = $l.$m unless $l =~ /[^aeiou][aeiouy].*?[^aeiou][aeiouy]/;
}
# step5a_rules
@@ -85,12 +80,12 @@
# step5b_rules
#DEBUG warn("step 5b: $_\n") &&
- s/ll$/l/ if /[^aeiou][aeiouy].*[^aeiou][aeiouy].*ll$/;
+ s/ll$/l/ if /[^aeiou][aeiouy].*?[^aeiou][aeiouy].*ll$/;
# Cosmetic step
s/(.)i$/$1y/;
}
- @parms;
+ return wantarray ? @parms : $parms[0];
}
1;
@@ -101,6 +96,11 @@
Text::English - Porter's stemming algorithm
+=head1 Warning
+
+This implementation is broken. On vocabulary of 23531 words it has 3569 results
+different from original implementation.
+
=head1 SYNOPSIS
use Text::English;
@@ -117,7 +117,14 @@
in: Porter, M.F., "An Algorithm For Suffix Stripping,"
Program 14 (3), July 1980, pp. 130-137.
Provenance: Written by B. Frakes and C. Cox, 1986.
+http://maya.cs.depaul.edu/~classes/ds575/papers/porter-algorithm.html
+(Link from http://www.tartarus.org/martin/PorterStemmer/)
+C version: http://filebox.vt.edu/users/yucui/IR.code/ir-code/stemmer/stem.c
+
+Description of different stemmers:
+http://www.pimpumpam.com/motoridiricerca/ir/chap08.htm
+
I have re-interpreted areas that use Frakes and Cox's "WordSize"
function. My version may misbehave on short words starting with "y",
but I can't think of any examples.
@@ -130,9 +137,23 @@
=head1 NOTES
-This is version 0.1. I would welcome feedback, especially improvements
+This is version 0.01. I would welcome feedback, especially improvements
to the punctuation-stripping step.
+=head1 Changes
+
+0.02 by Alexandr Ciornii (alexchorny/@/gmail.com)
+
+* URLs for Porter's algorithm
+
+* Removed usage of $`, $&.
+
+* Added warning.
+
+* If called in scalar context returns first element of array
+
+* Wrote tests
+
=head1 AUTHOR
Ian Phillipps <ian@unipalm.pipex.com>
Subject: | English.pm |
#!/usr/bin/perl
# -*- Mode: Perl -*-
# Author : Ian Phillipps
# Last Modified On: Sun May 2 15:35:33 2004
# Language : CPerl
package Text::English;
use strict;
# NOTE(review): warnings are commented out - confirm whether that is
# intentional before enabling them.
#use warnings;
# Declare $VERSION as a package global so it may be used unqualified
# under 'use strict'.
use vars qw/$VERSION/;
# NOTE(review): the doubled assignment looks like the old idiom for
# silencing "used only once" warnings; presumably redundant now that
# 'use vars' declares the variable - verify before simplifying.
$VERSION = $VERSION = '0.02';
# stem(@words)
#
# Applies this module's simplified rendition of Porter's suffix-stripping
# algorithm to every argument.
#
# Parameters:
#   @words - strings to stem.  They are copied first, so the caller's
#            variables are never modified.
#
# Returns:
#   list context   - one result per argument, in argument order
#   scalar context - the result for the first argument only
#
# Each word is lower-cased, loses a trailing "'s" and any leading or
# trailing non-letters, and is then run through steps 1a-5b below.  A word
# that is not purely a-z after that clean-up is returned as cleaned, without
# any stemming.
#
# Throughout, /[^aeiou][aeiouy]/ (a character other than a/e/i/o/u followed
# by one of a/e/i/o/u/y) is the test that decides whether enough of a stem
# is left for a suffix to be removed; step 4 and step 5b need two such pairs.
sub stem {
    my @parms = @_;    # copy: foreach aliases $_, and @_ aliases the caller

    foreach (@parms) {
        $_ = lc $_;

        # Step 0 - remove punctuation
        s/'s$//;
        s/^[^a-z]+//;
        s/[^a-z]+$//;
        next unless /^[a-z]+$/;

        # step1a_rules: plurals.  Only words ending in a single "s" qualify;
        # the first substitution that matches wins.
        if ( /[^s]s$/ ) { s/sses$/ss/ || s/ies$/i/ || s/s$// }

        # step1b_rules.  The business with rule==106 is embedded in the
        # boolean expressions here: the step1b1 clean-up runs only when
        # "ed" or "ing" was actually removed (&& binds tighter than ||).
        ( /[aeiouy][^aeiouy].*eed$/ && s/eed$/ee/ ) ||
        ( s/([aeiou].*)ed$/$1/ || s/([aeiouy].*)ing$/$1/ ) &&
        (   # step1b1_rules: restore a final "e" or undouble a consonant
            s/at$/ate/ || s/bl$/ble/ || s/iz$/ize/ || s/bb$/b/ ||
            s/dd$/d/   || s/ff$/f/   || s/gg$/g/   || s/mm$/m/ ||
            s/nn$/n/   || s/pp$/p/   || s/rr$/r/   || s/tt$/t/ ||
            s/ww$/w/   || s/xx$/x/   ||
            # This is wordsize==1 && CVC...addanE...
            # A capture group is used instead of $&, which slows down every
            # regex in the program on perls older than 5.20.
            s/^([^aeiouy]+[aeiouy][^aeiouy])$/$1e/
        )
#DEBUG      && warn "step1b1: $_\n"
        ;

        # step1c_rules: final "y" becomes "i" when a vowel precedes it
        # somewhere in the word.
#DEBUG  warn "step1c: $_\n" if
        s/([aeiouy].*)y$/$1i/;

        # step2_rules: map a long suffix to a shorter one.  The captured
        # stem must itself contain a consonant-vowel pair, otherwise the
        # pattern does not match and the next rule is tried.
        s/(.*[^aeiou][aeiouy].*)ational$/$1ate/ ||
        s/(.*[^aeiou][aeiouy].*)tional$/$1tion/ ||
        s/(.*[^aeiou][aeiouy].*)enci$/$1ence/   ||
        s/(.*[^aeiou][aeiouy].*)anci$/$1ance/   ||
        s/(.*[^aeiou][aeiouy].*)izer$/$1ize/    ||
        s/(.*[^aeiou][aeiouy].*)iser$/$1ise/    ||
        s/(.*[^aeiou][aeiouy].*)abli$/$1able/   ||
        s/(.*[^aeiou][aeiouy].*)alli$/$1al/     ||
        s/(.*[^aeiou][aeiouy].*)entli$/$1ent/   ||
        s/(.*[^aeiou][aeiouy].*)eli$/$1e/       ||
        s/(.*[^aeiou][aeiouy].*)ousli$/$1ous/   ||
        s/(.*[^aeiou][aeiouy].*)ization$/$1ize/ ||
        s/(.*[^aeiou][aeiouy].*)isation$/$1ise/ ||
        s/(.*[^aeiou][aeiouy].*)ation$/$1ate/   ||
        s/(.*[^aeiou][aeiouy].*)ator$/$1ate/    ||
        s/(.*[^aeiou][aeiouy].*)alism$/$1al/    ||
        s/(.*[^aeiou][aeiouy].*)iveness$/$1ive/ ||
        s/(.*[^aeiou][aeiouy].*)fulnes$/$1ful/  ||
        s/(.*[^aeiou][aeiouy].*)ousness$/$1ous/ ||
        s/(.*[^aeiou][aeiouy].*)aliti$/$1al/    ||
        s/(.*[^aeiou][aeiouy].*)iviti$/$1ive/   ||
        s/(.*[^aeiou][aeiouy].*)biliti$/$1ble/;

        # step3_rules: same stem condition as step 2, different suffixes.
        s/(.*[^aeiou][aeiouy].*)icate$/$1ic/ ||
        s/(.*[^aeiou][aeiouy].*)ative$/$1/   ||
        s/(.*[^aeiou][aeiouy].*)alize$/$1al/ ||
        s/(.*[^aeiou][aeiouy].*)iciti$/$1ic/ ||
        s/(.*[^aeiou][aeiouy].*)ical$/$1ic/  ||
        s/(.*[^aeiou][aeiouy].*)ful$/$1/     ||
        s/(.*[^aeiou][aeiouy].*)ness$/$1/;

        # step4_rules: the suffix is stripped tentatively while the stem
        # ($1) and the suffix ($2) are remembered, so the word can be put
        # back if the stem turns out to be too short.
        if ( s/(.*)(al$)/$1/    || s/(.*)(ance$)/$1/ || s/(.*)(ence$)/$1/ ||
             s/(.*)(er$)/$1/    || s/(.*)(ic$)/$1/   || s/(.*)(able$)/$1/ ||
             s/(.*)(ible$)/$1/  || s/(.*)(ant$)/$1/  || s/(.*)(ement$)/$1/ ||
             s/(.*)(ment$)/$1/  || s/(.*)(ent$)/$1/  || s/(.*)(sion$)/$1s/ ||
             s/(.*)(tion$)/$1t/ || s/(.*)(ou$)/$1/   || s/(.*)(ism$)/$1/ ||
             s/(.*)(ate$)/$1/   || s/(.*)(iti$)/$1/  || s/(.*)(ous$)/$1/ ||
             s/(.*)(ive$)/$1/   || s/(.*)(ize$)/$1/  || s/(.*)(ise$)/$1/
           ) {
            my ($l,$m) = ($1,$2);
            # Look for two consonant/vowel transitions
            # NB simplified...
#DEBUG      warn "step 4: l=$l m=$m\n";
            # Undo the removal unless the stem has two such transitions.
            $_ = $l.$m unless $l =~ /[^aeiou][aeiouy].*?[^aeiou][aeiouy]/;
        }

        # step5a_rules: drop a final "e" when either pattern below allows it.
#DEBUG  warn("step 5a: $_\n") &&
        s/e$// if ( /[^aeiou][aeiouy].*[^aeiou][aeiouy].*e$/ ||
            ( /[aeiou][^aeiouy].*e/ && ! /[^aeiou][aeiouy][^aeiouwxy]e$/) );

        # step5b_rules: final "ll" becomes "l" after two transitions.
#DEBUG  warn("step 5b: $_\n") &&
        s/ll$/l/ if /[^aeiou][aeiouy].*?[^aeiou][aeiouy].*ll$/;

        # Cosmetic step: turn a final "i" back into "y".
        s/(.)i$/$1y/;
    }

    return wantarray ? @parms : $parms[0];
}
1;
__END__
=head1 NAME
Text::English - Porter's stemming algorithm
=head1 Warning
This implementation is broken. On a vocabulary of 23531 words, 3569 of its
results differ from those of the original implementation.
=head1 SYNOPSIS
use Text::English;
@stems = Text::English::stem( @words );
=head1 DESCRIPTION
This routine applies the Porter Stemming Algorithm to its parameters,
returning the stemmed words.
It is derived from the C program "stemmer.c"
as found in freewais and elsewhere, which contains these notes:
Purpose: Implementation of the Porter stemming algorithm documented
in: Porter, M.F., "An Algorithm For Suffix Stripping,"
Program 14 (3), July 1980, pp. 130-137.
Provenance: Written by B. Frakes and C. Cox, 1986.
http://maya.cs.depaul.edu/~classes/ds575/papers/porter-algorithm.html
(Link from http://www.tartarus.org/martin/PorterStemmer/)
C version: http://filebox.vt.edu/users/yucui/IR.code/ir-code/stemmer/stem.c
Description of different stemmers:
http://www.pimpumpam.com/motoridiricerca/ir/chap08.htm
I have re-interpreted areas that use Frakes and Cox's "WordSize"
function. My version may misbehave on short words starting with "y",
but I can't think of any examples.
The step numbers correspond to Frakes and Cox, and are probably in
Porter's article (which I've not seen).
Porter's algorithm still has rough spots (e.g. current/currency, -ings words),
which I've not attempted to cure, although I have added
support for the British -ise suffix.
=head1 NOTES
This is version 0.02. I would welcome feedback, especially improvements
to the punctuation-stripping step.
=head1 Changes
0.02 by Alexandr Ciornii (alexchorny/@/gmail.com)
* URLs for Porter's algorithm
* Removed usage of $`, $&.
* Added warning.
* If called in scalar context returns first element of array
* Wrote tests
=head1 AUTHOR
Ian Phillipps <ian@unipalm.pipex.com>
=head1 COPYRIGHT
Copyright Public IP Exchange Ltd (PIPEX).
Available for use under the same terms as perl.
=cut
Subject: | Makefile.PL.patch |
--- Makefile.PL.dist Sun Jul 2 11:32:23 2006
+++ Makefile.PL Wed Nov 8 00:14:50 2006
@@ -23,9 +23,14 @@
'DEFINE' => '', # e.g., '-DHAVE_SOMETHING'
'INC' => '', # e.g., '-I/usr/include/other'
'dist' => { SUFFIX => "gz", COMPRESS => "gzip -f"},
+ 'PREREQ_PM' => {
# we do bundle that module in the distribution
-# 'PREREQ_PM' => { 'Text::English' => 0 },
+# 'Text::English' => 0,
+ 'Test::More' => 0,
+ },
'EXE_FILES' => [ 'perlindex' ],
'clean' => { 'FILES' => 'perlindex' },
+ ($ExtUtils::MakeMaker::VERSION ge '6.30_00'?
+ ('LICENSE' => 'perl', ) : ()),
);
Subject: | stem.t |
# Regression tests for Text::English::stem().
# The expected values record what this implementation returns; the tests
# marked below are noted as differing from the original algorithm.
use strict;
#use warnings;
use Test::More;
use Text::English;

# One assertion per stem() call below.
plan tests => 4;

my @stems = Text::English::stem(qw/abandoned surprise constancies constancy blesses/);
is_deeply(\@stems, [qw/abandon surpris constanc constanc bless/],
    'list context: plural, -ed and -ance suffixes');

@stems = Text::English::stem(qw/rational traditional/);
is_deeply(\@stems, [qw/ration tradit/],
    '-tional words are reduced according to stem length');

#different from original algorithm
@stems = Text::English::stem(qw/excellence excellencies/);
is_deeply(\@stems, [qw/excellenc excellenc/],
    'excellence and excellencies share one stem');

#different from original algorithm
my $stem = Text::English::stem(qw/loyalty/);
is($stem, 'loyalty', 'scalar context returns the first stem');