diff -rupN Text-LevenshteinXS-0.03.orig/LevenshteinXS.pm Text-LevenshteinXS-0.03/LevenshteinXS.pm
--- Text-LevenshteinXS-0.03.orig/LevenshteinXS.pm 2004-06-29 23:41:56.000000000 +0200
+++ Text-LevenshteinXS-0.03/LevenshteinXS.pm 2010-03-17 21:48:04.000000000 +0100
@@ -19,7 +19,7 @@ our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'
our @EXPORT = qw(
distance
);
-our $VERSION = '0.03';
+our $VERSION = '0.04';
bootstrap Text::LevenshteinXS $VERSION;
diff -rupN Text-LevenshteinXS-0.03.orig/LevenshteinXS.xs Text-LevenshteinXS-0.03/LevenshteinXS.xs
--- Text-LevenshteinXS-0.03.orig/LevenshteinXS.xs 2004-06-29 23:40:47.000000000 +0200
+++ Text-LevenshteinXS-0.03/LevenshteinXS.xs 2010-03-17 21:48:04.000000000 +0100
@@ -12,22 +12,30 @@
#include <stdlib.h>
#include <string.h>
-int levenshtein_distance(char *s,char*t);
+int levenshtein_distance(SV *s,SV *t);
int minimum(int a,int b,int c);
-int levenshtein_distance(char *s,char*t)
+int levenshtein_distance(SV* s, SV* t)
/*Compute levenshtein distance between s and t*/
{
//Step 1
int k,i,j,n,m,cost,*d,distance;
- if (strcmp(s,t) == 0) {return 0;}
- n=strlen(s);
- m=strlen(t);
+
+ n=sv_len_utf8(s);
+ m=sv_len_utf8(t);
+
+ U8* ps = SvPVX(s);
+ U8* pt = SvPVX(t);
+
+ // optimisation for equal strings
+ if(m == n && memEQ(ps, pt, n)) { return 0; }
+
if(n==0) {return m;}
if(m==0) {return n;}
d=malloc((sizeof(int))*(m+1)*(n+1));
+
m++;
n++;
//Step 2
@@ -35,31 +43,53 @@ int levenshtein_distance(char *s,char*t)
d[k]=k;
for(k=0;k<m;k++)
d[k*n]=k;
+
+
+ U8* si = ps;
+ U8* ti = pt;
+
+ int size;
+
//Step 3 and 4
for(i=1;i<n;i++)
+ {
+ size = UTF8SKIP(si);
+
+ ti = pt;
+
for(j=1;j<m;j++)
{
- //Step 5
- if(s[i-1]==t[j-1])
+ if(memEQ(si, ti, size))
cost=0;
else
cost=1;
- //Step 6
+
d[j*n+i]=minimum(d[(j-1)*n+i]+1,d[j*n+i-1]+1,d[(j-1)*n+i-1]+cost);
+
+ ti = utf8_hop(ti, 1);
}
+
+ si = utf8_hop(si, 1);
+ }
+
distance=d[n*m-1];
+
free(d);
+
return distance;
}
-int minimum(int a,int b,int c)
/*Gets the minimum of three values*/
+int minimum(int a, int b, int c)
{
int min=a;
+
if(b<min)
min=b;
+
if(c<min)
min=c;
+
return min;
}
@@ -67,8 +97,8 @@ MODULE = Text::LevenshteinXS PACKAGE =
int
distance(s,t)
- char * s
- char * t
+ SV * s
+ SV * t
CODE:
RETVAL = levenshtein_distance(s,t);
OUTPUT:
diff -rupN Text-LevenshteinXS-0.03.orig/Makefile.old Text-LevenshteinXS-0.03/Makefile.old
--- Text-LevenshteinXS-0.03.orig/Makefile.old 2010-03-17 21:41:20.000000000 +0100
+++ Text-LevenshteinXS-0.03/Makefile.old 2010-03-17 21:48:04.000000000 +0100
@@ -52,11 +52,11 @@ DIRFILESEP = /
DFSEP = $(DIRFILESEP)
NAME = Text::LevenshteinXS
NAME_SYM = Text_LevenshteinXS
-VERSION = 0.03
+VERSION = 0.04
VERSION_MACRO = VERSION
-VERSION_SYM = 0_03
+VERSION_SYM = 0_04
DEFINE_VERSION = -D$(VERSION_MACRO)=\"$(VERSION)\"
-XS_VERSION = 0.03
+XS_VERSION = 0.04
XS_VERSION_MACRO = XS_VERSION
XS_DEFINE_VERSION = -D$(XS_VERSION_MACRO)=\"$(XS_VERSION)\"
INST_ARCHLIB = blib/arch
@@ -177,13 +177,10 @@ PERL_ARCHIVE =
PERL_ARCHIVE_AFTER =
-TO_INST_PM = LevenshteinXS.pm \
- performance.pl
+TO_INST_PM = LevenshteinXS.pm
PM_TO_BLIB = LevenshteinXS.pm \
- $(INST_LIB)/Text/LevenshteinXS.pm \
- performance.pl \
- $(INST_LIB)/Text/performance.pl
+ $(INST_LIB)/Text/LevenshteinXS.pm
# --- MakeMaker platform_constants section:
@@ -258,7 +255,7 @@ RCS_LABEL = rcs -Nv$(VERSION_SYM): -q
DIST_CP = best
DIST_DEFAULT = tardist
DISTNAME = Text-LevenshteinXS
-DISTVNAME = Text-LevenshteinXS-0.03
+DISTVNAME = Text-LevenshteinXS-0.04
# --- MakeMaker macro section:
@@ -559,7 +556,7 @@ metafile : create_distdir
$(NOECHO) $(ECHO) Generating META.yml
$(NOECHO) $(ECHO) '--- #YAML:1.0' > META_new.yml
$(NOECHO) $(ECHO) 'name: Text-LevenshteinXS' >> META_new.yml
- $(NOECHO) $(ECHO) 'version: 0.03' >> META_new.yml
+ $(NOECHO) $(ECHO) 'version: 0.04' >> META_new.yml
$(NOECHO) $(ECHO) 'abstract: ~' >> META_new.yml
$(NOECHO) $(ECHO) 'license: ~' >> META_new.yml
$(NOECHO) $(ECHO) 'author: ~' >> META_new.yml
@@ -891,7 +888,7 @@ testdb_static :: pure_all $(MAP_TARGET)
# --- MakeMaker ppd section:
# Creates a PPD (Perl Package Description) for a binary distribution.
ppd :
- $(NOECHO) $(ECHO) '<SOFTPKG NAME="$(DISTNAME)" VERSION="0,03,0,0">' > $(DISTNAME).ppd
+ $(NOECHO) $(ECHO) '<SOFTPKG NAME="$(DISTNAME)" VERSION="0,04,0,0">' > $(DISTNAME).ppd
$(NOECHO) $(ECHO) ' <TITLE>$(DISTNAME)</TITLE>' >> $(DISTNAME).ppd
$(NOECHO) $(ECHO) ' <ABSTRACT></ABSTRACT>' >> $(DISTNAME).ppd
$(NOECHO) $(ECHO) ' <AUTHOR></AUTHOR>' >> $(DISTNAME).ppd
@@ -908,8 +905,7 @@ ppd :
pm_to_blib : $(TO_INST_PM)
$(NOECHO) $(ABSPERLRUN) -MExtUtils::Install -e 'pm_to_blib({@ARGV}, '\''$(INST_LIB)/auto'\'', '\''$(PM_FILTER)'\'')' -- \
- LevenshteinXS.pm $(INST_LIB)/Text/LevenshteinXS.pm \
- performance.pl $(INST_LIB)/Text/performance.pl
+ LevenshteinXS.pm $(INST_LIB)/Text/LevenshteinXS.pm
$(NOECHO) $(TOUCH) pm_to_blib
diff -rupN Text-LevenshteinXS-0.03.orig/META.yml Text-LevenshteinXS-0.03/META.yml
--- Text-LevenshteinXS-0.03.orig/META.yml 2004-06-29 23:43:35.000000000 +0200
+++ Text-LevenshteinXS-0.03/META.yml 2010-03-17 21:48:04.000000000 +0100
@@ -1,7 +1,7 @@
#
http://module-build.sourceforge.net/META-spec.html
#XXXXXXX This is a prototype!!! It will change in the future!!! XXXXX#
name: Text-LevenshteinXS
-version: 0.03
+version: 0.04
version_from: LevenshteinXS.pm
installdirs: site
requires:
diff -rupN Text-LevenshteinXS-0.03.orig/test.pl Text-LevenshteinXS-0.03/test.pl
--- Text-LevenshteinXS-0.03.orig/test.pl 2004-03-05 03:11:59.000000000 +0100
+++ Text-LevenshteinXS-0.03/test.pl 2010-03-17 21:48:04.000000000 +0100
@@ -1,16 +1,41 @@
-use Test;
-BEGIN { plan tests => 6 };
-
-use Text::LevenshteinXS qw(distance);
-
-ok(1);
-if (distance("foo","four") == 2) {ok(1)} else {ok(0)}
-if (distance("foo","foo") == 0) {ok(1)} else {ok(0)}
-if (distance("foo","") == 3) {ok(1)} else {ok(0)}
-if (distance("four","foo") == 2) {ok(1)} else {ok(0)}
-if (distance("foo","bar") == 3) {ok(1)} else {ok(0)}
-
-
-
-
-
+use Test::More qw/ no_plan /;
+
+use Text::LevenshteinXS qw(distance);
+
+use Encode;
+use Unicode::Normalize;
+
+ok(1);
+
+# basic tests
+is(distance("foo","four"), 2);
+is(distance("foo","foo"), 0);
+is(distance("foo",""), 3);
+is(distance("four","foo"), 2);
+is(distance("foo","bar"), 3);
+
+# more tests from the bugtracking system
+is(distance('headache','heavy load'), 7);
+is(distance('Distinction courses', 'Distinction Courses'), 1, 'upper/lower');
+is(distance("fool's handiwork bar", 'food handling bar'), 8);
+
+
+# utf-8 tests
+is(distance("äöü","äöü"), 0);
+is(distance("strase","straÃe"), 1);
+is(distance("strasse","straÃe"), 2);
+is(distance("äöü","123"), 3);
+
+is(distance("","str"), 3, 'length check');
+is(distance("","straÃe"), 6, 'length check ß');
+is(distance("", decode("utf8", "stra\x{00DF}e")), 6, 'length check \x');
+
+is(distance("äöü",decode("latin1", "äöü")), 0);
+is(distance("123","abcde"), 5, 'foo');
+
+my $umls = "äöü";
+
+{
+ use encoding "latin1";
+ is(distance($umls, "äöü"), 0, 'encoding latin1');
+}