Subject: | Call number handling |
From: ZIMMER@wmich.edu
Date: Fri Jan 31, 2003 8:22:58 AM America/Chicago
To: Andy Lester <andy@petdance.com>
Cc: ZIMMER@wmich.edu
Subject: Re: call number sorting/normalizing
Send it to me again, please, and let's see what I can do with it...
Here 'tis... Roy
======================================================================
#package Normalize::LC_Callnumber;
package cnmod;
use strict;
use vars qw($VERSION @ISA @EXPORT);
require Exporter;
require 5.004;
@ISA = qw(Exporter);
@EXPORT = qw(cnparse);
$VERSION = '1.10';
use Carp;
sub lcnum_parse
### input: callnum - contains callnumber
### output: cnpart[0..36] - call number parse elements suitable for
smart sorting
### usage: @callnumber_parts_array = cnparse($callnumber)
### you can then incorporate the callnumber parts at the beginning of
### the "record" for sorting purposes. sort results will not be
perfect,
### but will be very close (in my experience), and better than
any other
### callnumber sortings that I have seen.
###
### various character check routines are called as subroutines,
### rather than inlined, for clarity of code
### the following are considered to be separators within the callnumber:
### space " ", semicolon ";", colon ":", comma ",", period ".",
forward slash "/"
### all resulting call number parts, no matter how "full", are of equal
length.
###
### written by Roy Zimmer, Western Michigan University (for Waldo Library)
### versions
### 1.0 1990s initial version, ported from SAS on mainframe when used
with NOTIS
### 1.1 2002 removed old garbage and improved code
{
my ($callnum) = @_;
my $curralpha = 0;
my $currnum = 0;
my $charidx = 0;
my $period = ".";
my $space = " ";
my $semicolon = ";";
my ($initidx, $callnumlength, $isalpha, $isnum, $isalphanum);
my ($isseparator, $callnumpartlength, $currchar, $nextchar);
my $parselength = 22;
my $callnumpartidx = 0;
my $callnumpartcharidx = -1;
my $hadperiod = 0;
my ($stringcheck, $idx, $callnumpart);
my @cnpart;
for $initidx (0..36) {$cnpart[$initidx] = "";}
# remove any leading (meaningless) separators
while (sepcheck(substr($callnum, 0, 1))) {$callnum = substr($callnum, 1);}
$callnumlength = length($callnum) - 1;
while ($charidx <= $callnumlength)
{
$currchar = uc(substr($callnum, $charidx, 1));
if ($callnumpartidx == 1)
{
if (alphacheck($cnpart[0]) && numcheck($cnpart[1]) &&
$currchar eq $space && substr($callnum, $charidx+1, 1) ne $period)
{
($callnumpartidx, $callnumpartcharidx) =
upcallnumpartidx($callnumpartidx, $callnumpartcharidx);
$cnpart[$callnumpartidx] = ".0";
}
}
$isseparator = sepcheck($currchar);
if ($currchar eq $period) {$hadperiod = 1;}
if (!$isseparator)
{
$isalpha = alphacheck($currchar);
$isnum = numcheck($currchar);
if (($curralpha && $isalpha) || ($currnum && $isnum) || (!$isalpha
&& !$isnum))
{
$callnumpartcharidx =
storechar($currchar, $callnumpartidx, $callnumpartcharidx,
$hadperiod, \@cnpart);
}
if (!$curralpha && !$currnum)
{
$curralpha = $isalpha;
$currnum = $isnum;
$callnumpartcharidx =
storechar($currchar, $callnumpartidx, $callnumpartcharidx,
$hadperiod, \@cnpart);
}
if (($curralpha && $isnum) || ($currnum && $isalpha))
{
($callnumpartidx, $callnumpartcharidx) =
upcallnumpartidx($callnumpartidx, $callnumpartcharidx);
if ($curralpha && $isnum)
{
$callnumpartcharidx =
storechar($period, $callnumpartidx, $callnumpartcharidx,
$hadperiod, \@cnpart);
}
$callnumpartcharidx =
storechar($currchar, $callnumpartidx, $callnumpartcharidx,
$hadperiod, \@cnpart);
$curralpha = $isalpha;
$currnum = $isnum;
}
}
else # isseparator
{
if ($charidx < $callnumlength)
{
$nextchar = substr($callnum, $charidx+1, 1);
if (($currchar eq $period) && ($nextchar eq $semicolon))
{
$charidx++;
$nextchar = substr($callnum, $charidx+1, 1);
}
$isseparator = sepcheck($nextchar);
if ($isseparator)
{
# nextchar = . and currchar = semicolon, comma, or space
if (($currchar =~ /[;, ]/) and ($nextchar eq $period))
{
$hadperiod = 1;
($callnumpartidx, $callnumpartcharidx) =
upcallnumpartidx($callnumpartidx, $callnumpartcharidx);
$callnumpartcharidx =
storechar($nextchar, $callnumpartidx, $callnumpartcharidx,
$hadperiod, \@cnpart);
$curralpha = $currnum = 0;
$charidx++;
}
if (($currchar eq $period) && ($nextchar ne $space))
{
$callnumpartcharidx =
storechar($currchar, $callnumpartidx, $callnumpartcharidx,
$hadperiod, \@cnpart);
($callnumpartidx, $callnumpartcharidx) =
upcallnumpartidx($callnumpartidx, $callnumpartcharidx);
$charidx++;
}
} # if isseparator
else # have a separator and nextchar is not a separator
{
if ($currchar eq $period)
{
if (!$curralpha)
{
($callnumpartidx, $callnumpartcharidx) =
upcallnumpartidx($callnumpartidx, $callnumpartcharidx);
}
$callnumpartcharidx =
storechar($currchar, $callnumpartidx, $callnumpartcharidx,
$hadperiod, \@cnpart);
if ($curralpha)
{
($callnumpartidx, $callnumpartcharidx) =
upcallnumpartidx($callnumpartidx, $callnumpartcharidx);
}
}
else
{
($callnumpartidx, $callnumpartcharidx) =
upcallnumpartidx($callnumpartidx, $callnumpartcharidx);
}
$curralpha = $currnum = 0;
}
} # charidx < callnumlength
} # isseparator
$charidx++;
} # do while charidx <= callnumlength
# pad parse groups as necessary for sorting
for ($idx=0; $idx<=$callnumpartidx; $idx++)
{
$callnumpart = $cnpart[$idx];
$callnumpartlength = length($callnumpart) - 1;
$isnum = numcheck($callnumpart);
# all digits
if ($isnum)
{
# right/left justify numeric cnpart, as required
$callnumpart =
adjcpnum($idx, $isalpha, $callnumpart, $parselength,
$callnumpartlength, \@cnpart);
}
else
{
$stringcheck = substr($callnumpart, 0, 1);
$isalphanum = alphanumcheck($stringcheck);
if ($isalphanum && ($callnumpartlength > 0))
{
$stringcheck = substr($callnumpart, 0, $callnumpartlength-1);
$isnum = numcheck($stringcheck);
$stringcheck = substr($callnumpart, $callnumpartlength, 1);
$isalphanum = alphanumcheck($stringcheck);
# leading digits, trailing single non-alphanumeric
if (!$isalphanum && $isnum)
{
$callnumpart =
adjcpnum($idx, $isalpha, $callnumpart, $parselength,
$callnumpartlength, \@cnpart);
}
}
}
$cnpart[$idx] = $callnumpart;
}
return @cnpart;
}
sub storechar
### the special handling of the period character allows
### numeric chunks parsed out well after the initial
### accompanying period to be correctly treated as part
### of a decimal number.
{
my ($achar, $callnumpartidx, $callnumpartcharidx, $hadperiod, $cnpart)
= @_;
my $period = ".";
$callnumpartcharidx++;
if (length($cnpart->[$callnumpartidx]) == 0) {$callnumpartcharidx = 0;}
if ($achar ne $period)
{
substr($cnpart->[$callnumpartidx], $callnumpartcharidx, 1) = $achar;
}
else
{
if (($callnumpartcharidx == 0) && $hadperiod)
{
substr($cnpart->[$callnumpartidx], $callnumpartcharidx, 1) = $achar;
}
}
return $callnumpartcharidx;
}
sub adjcpnum
### code macro
### Heuristic:
### assumed that current cnpart is numeric
### IF previous cnpart starts with a period (.), OR
### (IF previous cnpart is alpha AND the cnpart before
### that is numeric AND the cnpart before -that- one
### begins with a period (.)) THEN
### convert current or (current-2) cnpart to -.nnn- format
### (and if successful check back 3 levels, concatenate
### current cnpart to current-2 cnpart),
### ELSE right justify current cnpart with leading zeros
### this ensures correct callnumber sorting.
{
my ($idx, $isalpha, $callnumpart, $parselength, $callnumpartlength,
$cnpart) = @_;
my $doneit = 0;
my ($isalpha1, $isalpha2, $stringcheck);
my $period = ".";
if ($idx > 2)
{
$stringcheck = $cnpart->[$idx-1];
$isalpha1 = alphacheck($stringcheck);
# in case preceded by period
$isalpha2 = 0;
if (length($stringcheck) > 0)
{
$stringcheck = substr($stringcheck, 1);
$isalpha2 = alphacheck($stringcheck);
}
if ($isalpha || $isalpha2)
{
$stringcheck = $cnpart->[$idx-2];
if (numcheck($stringcheck))
{
$callnumpart = $period . $callnumpart;
$doneit = 1;
}
}
}
if (!$doneit)
{
$callnumpart = (sprintf ("0" x ($parselength-$callnumpartlength-1)))
. $callnumpart;
}
}
sub upcallnumpartidx
### code macro
{
my ($callnumpartidx, $callnumpartcharidx) = @_;
$callnumpartidx++;
$callnumpartcharidx = -1;
return ($callnumpartidx, $callnumpartcharidx);
}
sub numcheck
### check input string
### returns: 1 if contains only digits
### 0 if contains at least one other character or if empty string
{
my ($stringin) = @_;
if ($stringin =~ /^[0-9]+$/) {return 1;}
else {return 0;}
}
sub alphacheck
### check input string
### returns: 1 if contains only letters
### 0 if contains at least one other character or if empty string
{
my ($stringin) = @_;
if ($stringin =~ /^[A-Za-z]+$/) {return 1;}
else {return 0;}
}
sub alphanumcheck
### check input string
### returns: 1 if contains only letters and digits
### 0 if contains at least one other character or if empty string
{
my ($stringin) = @_;
if ($stringin =~ /^[a-zA-Z0-9]+$/) {return 1;}
else {return 0;}
}
sub sepcheck
### check call number character
### returns: 1 if is a section separator
### 0 if not
{
my ($cnchar) = @_;
# separator chars: space, semicolon, colon, comma, period, forward slash
if ($cnchar =~ /[ ;:,.\/]/) {return 1;}
else {return 0;}
}
__END__
=head1 NAME
Normalize::LC_Callnumber - Normalizes a Library of Congress callnumber
for sorting
=head1 SYNOPSIS
use Normalize;
@callnumber_parts_array = lcnum_parse($lc_callnumber);
# prepend callnumber_parts_array elements to callnumber, etc. for sorting
# after sort, drop the leading call number elements and
# output or process remaining data for each record
=head1 DESCRIPTION
Prepares a Library of Congress type of callnumber for improved sorting,
as compared to full field sorting. Typically gets close to 100% correct
sort order. Splits callnumber into up to 37 chunks, parsing according to
the separators encountered. Normalizes the chunks so that the callnumber
can be intelligently sorted.
=head 1 COPYRIGHT
Copyright (c) 2002 Roy Zimmer. All rights reserved.
Developed at Western Michigan University.
This is free software. If you make modifications or improvements to this,
please send me a copy of that at zimmer@wmich.edu.
=cut