Subject: | CallNumber normalizing/sorting |
No bug; no severity; no problem.
This code was requested by Andy Lester for inclusion into MARC record.
Code description:
Type: subroutine
Input: callnumber string
Output: array of strings containing callnumber chunks
Process: subroutine intelligently parses/normalizes a callnumber into
chunks suitable for sorting
Overview: after calling the subroutine, create a sort vector. the sort
vector should lead off with the array of callnumber chunks,
followed by most likely the actual callnumber, and whatever
other data should go along for the "ride".
you can then use the sort method of your choice to create output
in callnumber order.
Disclaimer: this code is in production at our installation. results indicate
about 99% correctness (my guess), which is way better than
anything else we've had.
Author: Roy Zimmer, Western Michigan University
package Sort::CallNumber;
use strict;
use vars qw($VERSION @ISA @EXPORT);
require Exporter;
require 5.004;
@ISA = qw(Exporter);
@EXPORT = qw(cnparse);
$VERSION = '1.10';
use Carp;
sub cnparse
### input: callnum - contains callnumber
### output: cnpart[0..36] - call number parse elements suitable for smart sorting
### usage: @callnumber_parts_array = cnparse($callnumber)
### you can then incorporate the callnumber parts at the beginning of
### the "record" for sorting purposes. sort results will not be perfect,
### but will be very close (in my experience), and better than any other
### callnumber sortings that I have seen.
###
### various character check routines are called as subroutines,
### rather than inlined, for clarity of code
### the following are considered to be separators within the callnumber:
### space " ", semicolon ";", colon ":", comma ",", period ".", forward slash "/"
### all resulting call number parts, no matter how "full", are of equal length.
###
### written by Roy Zimmer, Western Michigan University (for Waldo Library)
### versions
### 1.0 1990s initial version, ported from SAS on mainframe when used with NOTIS
### 1.1 2002 removed old garbage and improved code
{
my ($callnum) = @_;
my $curralpha = 0;
my $currnum = 0;
my $charidx = 0;
my $period = ".";
my $space = " ";
my $semicolon = ";";
my ($initidx, $callnumlength, $isalpha, $isnum, $isalphanum);
my ($isseparator, $callnumpartlength, $currchar, $nextchar);
my $parselength = 22;
my $callnumpartidx = 0;
my $callnumpartcharidx = -1;
my $hadperiod = 0;
my ($stringcheck, $idx, $callnumpart);
my @cnpart;
for $initidx (0..36) {$cnpart[$initidx] = "";}
# remove any leading (meaningless) separators
while (sepcheck(substr($callnum, 0, 1))) {$callnum = substr($callnum, 1);}
$callnumlength = length($callnum) - 1;
while ($charidx <= $callnumlength)
{
$currchar = uc(substr($callnum, $charidx, 1));
if ($callnumpartidx == 1)
{
if (alphacheck($cnpart[0]) && numcheck($cnpart[1]) &&
$currchar eq $space && substr($callnum, $charidx+1, 1) ne $period)
{
($callnumpartidx, $callnumpartcharidx) =
upcallnumpartidx($callnumpartidx, $callnumpartcharidx);
$cnpart[$callnumpartidx] = ".0";
}
}
$isseparator = sepcheck($currchar);
if ($currchar eq $period) {$hadperiod = 1;}
if (!$isseparator)
{
$isalpha = alphacheck($currchar);
$isnum = numcheck($currchar);
if (($curralpha && $isalpha) || ($currnum && $isnum) || (!$isalpha && !$isnum))
{
$callnumpartcharidx =
storechar($currchar, $callnumpartidx, $callnumpartcharidx, $hadperiod, \@cnpart);
}
if (!$curralpha && !$currnum)
{
$curralpha = $isalpha;
$currnum = $isnum;
$callnumpartcharidx =
storechar($currchar, $callnumpartidx, $callnumpartcharidx, $hadperiod, \@cnpart);
}
if (($curralpha && $isnum) || ($currnum && $isalpha))
{
($callnumpartidx, $callnumpartcharidx) =
upcallnumpartidx($callnumpartidx, $callnumpartcharidx);
if ($curralpha && $isnum)
{
$callnumpartcharidx =
storechar($period, $callnumpartidx, $callnumpartcharidx, $hadperiod, \@cnpart);
}
$callnumpartcharidx =
storechar($currchar, $callnumpartidx, $callnumpartcharidx, $hadperiod, \@cnpart);
$curralpha = $isalpha;
$currnum = $isnum;
}
}
else # isseparator
{
if ($charidx < $callnumlength)
{
$nextchar = substr($callnum, $charidx+1, 1);
if (($currchar eq $period) && ($nextchar eq $semicolon))
{
$charidx++;
$nextchar = substr($callnum, $charidx+1, 1);
}
$isseparator = sepcheck($nextchar);
if ($isseparator)
{
# nextchar = . and currchar = semicolon, comma, or space
if (($currchar =~ /[;, ]/) and ($nextchar eq $period))
{
$hadperiod = 1;
($callnumpartidx, $callnumpartcharidx) =
upcallnumpartidx($callnumpartidx, $callnumpartcharidx);
$callnumpartcharidx =
storechar($nextchar, $callnumpartidx, $callnumpartcharidx, $hadperiod, \@cnpart);
$curralpha = $currnum = 0;
$charidx++;
}
if (($currchar eq $period) && ($nextchar ne $space))
{
$callnumpartcharidx =
storechar($currchar, $callnumpartidx, $callnumpartcharidx, $hadperiod, \@cnpart);
($callnumpartidx, $callnumpartcharidx) =
upcallnumpartidx($callnumpartidx, $callnumpartcharidx);
$charidx++;
}
} # if isseparator
else # have a separator and nextchar is not a separator
{
if ($currchar eq $period)
{
if (!$curralpha)
{
($callnumpartidx, $callnumpartcharidx) =
upcallnumpartidx($callnumpartidx, $callnumpartcharidx);
}
$callnumpartcharidx =
storechar($currchar, $callnumpartidx, $callnumpartcharidx, $hadperiod, \@cnpart);
if ($curralpha)
{
($callnumpartidx, $callnumpartcharidx) =
upcallnumpartidx($callnumpartidx, $callnumpartcharidx);
}
}
else
{
($callnumpartidx, $callnumpartcharidx) =
upcallnumpartidx($callnumpartidx, $callnumpartcharidx);
}
$curralpha = $currnum = 0;
}
} # charidx < callnumlength
} # isseparator
$charidx++;
} # do while charidx <= callnumlength
# pad parse groups as necessary for sorting
for ($idx=0; $idx<=$callnumpartidx; $idx++)
{
$callnumpart = $cnpart[$idx];
$callnumpartlength = length($callnumpart) - 1;
$isnum = numcheck($callnumpart);
# all digits
if ($isnum)
{
# right/left justify numeric cnpart, as required
$callnumpart =
adjcpnum($idx, $callnumpart, $parselength, $callnumpartlength, \@cnpart);
}
else
{
$stringcheck = substr($callnumpart, 0, 1);
$isalphanum = alphanumcheck($stringcheck);
if ($isalphanum && ($callnumpartlength > 0))
{
$stringcheck = substr($callnumpart, 0, $callnumpartlength-1);
$isnum = numcheck($stringcheck);
$stringcheck = substr($callnumpart, $callnumpartlength, 1);
$isalphanum = alphanumcheck($stringcheck);
# leading digits, trailing single non-alphanumeric
if (!$isalphanum && $isnum)
{
$callnumpart =
adjcpnum($idx, $callnumpart, $parselength, $callnumpartlength, \@cnpart);
}
}
}
$cnpart[$idx] = $callnumpart;
}
return @cnpart;
}
sub storechar
### the special handling of the period character allows
### numeric chunks parsed out well after the initial
### accompanying period to be correctly treated as part
### of a decimal number.
{
my ($achar, $callnumpartidx, $callnumpartcharidx, $hadperiod, $cnpart) = @_;
my $period = ".";
$callnumpartcharidx++;
if (length($cnpart->[$callnumpartidx]) == 0) {$callnumpartcharidx = 0;}
if ($achar ne $period)
{
substr($cnpart->[$callnumpartidx], $callnumpartcharidx, 1) = $achar;
}
else
{
if (($callnumpartcharidx == 0) && $hadperiod)
{
substr($cnpart->[$callnumpartidx], $callnumpartcharidx, 1) = $achar;
}
}
return $callnumpartcharidx;
}
sub adjcpnum
### code macro
### assumed that current cnpart is numeric
### we don't do anything until we're at least on the 4th cnpart
### if previous cnpart is not empty and starts with a letter,
### look at the cnpart before that (current-2) and
### if that one is all numeric,
### prepend a period to the current cnpart
### if none of the above applies,
### pad the current all-numeric cnpart with leading zeros
{
my ($idx, $callnumpart, $parselength, $callnumpartlength, $cnpart) = @_;
my $doneit = 0;
my ($isalphaprevchunkbeg, $stringcheck);
my $period = ".";
if ($idx > 2)
{
$stringcheck = $cnpart->[$idx-1];
# in case preceded by period
$isalphaprevchunkbeg = 0;
if (length($stringcheck) > 0)
{
$stringcheck = substr($stringcheck, 1);
$isalphaprevchunkbeg = alphacheck($stringcheck);
}
if ($isalphaprevchunkbeg)
{
$stringcheck = $cnpart->[$idx-2];
if (numcheck($stringcheck))
{
$callnumpart = $period . $callnumpart;
$doneit = 1;
}
}
}
if (!$doneit)
{
$callnumpart = (sprintf ("0" x ($parselength-$callnumpartlength-1))) . $callnumpart;
}
}
sub upcallnumpartidx
### code macro
{
my ($callnumpartidx, $callnumpartcharidx) = @_;
$callnumpartidx++;
$callnumpartcharidx = -1;
return ($callnumpartidx, $callnumpartcharidx);
}
sub numcheck
### check input string
### returns: 1 if contains only digits
### 0 if contains at least one other character or if empty string
{
my ($stringin) = @_;
if ($stringin =~ /^[0-9]+$/) {return 1;}
else {return 0;}
}
sub alphacheck
### check input string
### returns: 1 if contains only letters
### 0 if contains at least one other character or if empty string
{
my ($stringin) = @_;
if ($stringin =~ /^[A-Za-z]+$/) {return 1;}
else {return 0;}
}
sub alphanumcheck
### check input string
### returns: 1 if contains only letters and digits
### 0 if contains at least one other character or if empty string
{
my ($stringin) = @_;
if ($stringin =~ /^[a-zA-Z0-9]+$/) {return 1;}
else {return 0;}
}
sub sepcheck
### check call number character
### returns: 1 if is a section separator
### 0 if not
{
my ($cnchar) = @_;
# separator chars: space, semicolon, colon, comma, period, forward slash
if ($cnchar =~ /[ ;:,.\/]/) {return 1;}
else {return 0;}
}
__END__
any documentation you wish to include can go here...