Skip Menu |

This queue is for tickets about the MARC-Record CPAN distribution.

Report information
The Basics
Id: 1601
Status: resolved
Priority: 0/
Queue: MARC-Record

People
Owner: GMCHARLT [...] cpan.org
Requestors: zimmer [...] wmich.edu
Cc:
AdminCc:

Bug Information
Severity: (no value)
Broken in: (no value)
Fixed in: (no value)



Subject: CallNumber normalizing/sorting
No bug; no severity; no problem. This code was requested by Andy Lester for inclusion into MARC record. Code description: Type: subroutine Input: callnumber string Output: array of strings containing callnumber chunks Process: subroutine intelligently parses/normalizes a callnumber into chunks suitable for sorting Overview: after calling the subroutine, create a sort vector. the sort vector should lead off with the array of callnumber chunks, followed by most likely the actual callnumber, and whatever other data should go along for the "ride". you can then use the sort method of your choice to create output in callnumber order. Disclaimer: this code is in production at our installation. results indicate about 99% correctness (my guess), which is way better than anything else we've had. Author: Roy Zimmer, Western Michigan University
package Sort::CallNumber; use strict; use vars qw($VERSION @ISA @EXPORT); require Exporter; require 5.004; @ISA = qw(Exporter); @EXPORT = qw(cnparse); $VERSION = '1.10'; use Carp; sub cnparse ### input: callnum - contains callnumber ### output: cnpart[0..36] - call number parse elements suitable for smart sorting ### usage: @callnumber_parts_array = cnparse($callnumber) ### you can then incorporate the callnumber parts at the beginning of ### the "record" for sorting purposes. sort results will not be perfect, ### but will be very close (in my experience), and better than any other ### callnumber sortings that I have seen. ### ### various character check routines are called as subroutines, ### rather than inlined, for clarity of code ### the following are considered to be separators within the callnumber: ### space " ", semicolon ";", colon ":", comma ",", period ".", forward slash "/" ### all resulting call number parts, no matter how "full", are of equal length. ### ### written by Roy Zimmer, Western Michigan University (for Waldo Library) ### versions ### 1.0 1990s initial version, ported from SAS on mainframe when used with NOTIS ### 1.1 2002 removed old garbage and improved code { my ($callnum) = @_; my $curralpha = 0; my $currnum = 0; my $charidx = 0; my $period = "."; my $space = " "; my $semicolon = ";"; my ($initidx, $callnumlength, $isalpha, $isnum, $isalphanum); my ($isseparator, $callnumpartlength, $currchar, $nextchar); my $parselength = 22; my $callnumpartidx = 0; my $callnumpartcharidx = -1; my $hadperiod = 0; my ($stringcheck, $idx, $callnumpart); my @cnpart; for $initidx (0..36) {$cnpart[$initidx] = "";} # remove any leading (meaningless) separators while (sepcheck(substr($callnum, 0, 1))) {$callnum = substr($callnum, 1);} $callnumlength = length($callnum) - 1; while ($charidx <= $callnumlength) { $currchar = uc(substr($callnum, $charidx, 1)); if ($callnumpartidx == 1) { if (alphacheck($cnpart[0]) && numcheck($cnpart[1]) && $currchar eq $space && substr($callnum, $charidx+1, 1) ne $period) { ($callnumpartidx, $callnumpartcharidx) = upcallnumpartidx($callnumpartidx, $callnumpartcharidx); $cnpart[$callnumpartidx] = ".0"; } } $isseparator = sepcheck($currchar); if ($currchar eq $period) {$hadperiod = 1;} if (!$isseparator) { $isalpha = alphacheck($currchar); $isnum = numcheck($currchar); if (($curralpha && $isalpha) || ($currnum && $isnum) || (!$isalpha && !$isnum)) { $callnumpartcharidx = storechar($currchar, $callnumpartidx, $callnumpartcharidx, $hadperiod, \@cnpart); } if (!$curralpha && !$currnum) { $curralpha = $isalpha; $currnum = $isnum; $callnumpartcharidx = storechar($currchar, $callnumpartidx, $callnumpartcharidx, $hadperiod, \@cnpart); } if (($curralpha && $isnum) || ($currnum && $isalpha)) { ($callnumpartidx, $callnumpartcharidx) = upcallnumpartidx($callnumpartidx, $callnumpartcharidx); if ($curralpha && $isnum) { $callnumpartcharidx = storechar($period, $callnumpartidx, $callnumpartcharidx, $hadperiod, \@cnpart); } $callnumpartcharidx = storechar($currchar, $callnumpartidx, $callnumpartcharidx, $hadperiod, \@cnpart); $curralpha = $isalpha; $currnum = $isnum; } } else # isseparator { if ($charidx < $callnumlength) { $nextchar = substr($callnum, $charidx+1, 1); if (($currchar eq $period) && ($nextchar eq $semicolon)) { $charidx++; $nextchar = substr($callnum, $charidx+1, 1); } $isseparator = sepcheck($nextchar); if ($isseparator) { # nextchar = . and currchar = semicolon, comma, or space if (($currchar =~ /[;, ]/) and ($nextchar eq $period)) { $hadperiod = 1; ($callnumpartidx, $callnumpartcharidx) = upcallnumpartidx($callnumpartidx, $callnumpartcharidx); $callnumpartcharidx = storechar($nextchar, $callnumpartidx, $callnumpartcharidx, $hadperiod, \@cnpart); $curralpha = $currnum = 0; $charidx++; } if (($currchar eq $period) && ($nextchar ne $space)) { $callnumpartcharidx = storechar($currchar, $callnumpartidx, $callnumpartcharidx, $hadperiod, \@cnpart); ($callnumpartidx, $callnumpartcharidx) = upcallnumpartidx($callnumpartidx, $callnumpartcharidx); $charidx++; } } # if isseparator else # have a separator and nextchar is not a separator { if ($currchar eq $period) { if (!$curralpha) { ($callnumpartidx, $callnumpartcharidx) = upcallnumpartidx($callnumpartidx, $callnumpartcharidx); } $callnumpartcharidx = storechar($currchar, $callnumpartidx, $callnumpartcharidx, $hadperiod, \@cnpart); if ($curralpha) { ($callnumpartidx, $callnumpartcharidx) = upcallnumpartidx($callnumpartidx, $callnumpartcharidx); } } else { ($callnumpartidx, $callnumpartcharidx) = upcallnumpartidx($callnumpartidx, $callnumpartcharidx); } $curralpha = $currnum = 0; } } # charidx < callnumlength } # isseparator $charidx++; } # do while charidx <= callnumlength # pad parse groups as necessary for sorting for ($idx=0; $idx<=$callnumpartidx; $idx++) { $callnumpart = $cnpart[$idx]; $callnumpartlength = length($callnumpart) - 1; $isnum = numcheck($callnumpart); # all digits if ($isnum) { # right/left justify numeric cnpart, as required $callnumpart = adjcpnum($idx, $callnumpart, $parselength, $callnumpartlength, \@cnpart); } else { $stringcheck = substr($callnumpart, 0, 1); $isalphanum = alphanumcheck($stringcheck); if ($isalphanum && ($callnumpartlength > 0)) { $stringcheck = substr($callnumpart, 0, $callnumpartlength-1); $isnum = numcheck($stringcheck); $stringcheck = substr($callnumpart, $callnumpartlength, 1); $isalphanum = alphanumcheck($stringcheck); # leading digits, trailing single non-alphanumeric if (!$isalphanum && $isnum) { $callnumpart = adjcpnum($idx, $callnumpart, $parselength, $callnumpartlength, \@cnpart); } } } $cnpart[$idx] = $callnumpart; } return @cnpart; } sub storechar ### the special handling of the period character allows ### numeric chunks parsed out well after the initial ### accompanying period to be correctly treated as part ### of a decimal number. { my ($achar, $callnumpartidx, $callnumpartcharidx, $hadperiod, $cnpart) = @_; my $period = "."; $callnumpartcharidx++; if (length($cnpart->[$callnumpartidx]) == 0) {$callnumpartcharidx = 0;} if ($achar ne $period) { substr($cnpart->[$callnumpartidx], $callnumpartcharidx, 1) = $achar; } else { if (($callnumpartcharidx == 0) && $hadperiod) { substr($cnpart->[$callnumpartidx], $callnumpartcharidx, 1) = $achar; } } return $callnumpartcharidx; } sub adjcpnum ### code macro ### assumed that current cnpart is numeric ### we don't do anything until we're at least on the 4th cnpart ### if previous cnpart is not empty and starts with a letter, ### look at the cnpart before that (current-2) and ### if that one is all numeric, ### prepend a period to the current cnpart ### if none of the above applies, ### pad the current all-numeric cnpart with leading zeros { my ($idx, $callnumpart, $parselength, $callnumpartlength, $cnpart) = @_; my $doneit = 0; my ($isalphaprevchunkbeg, $stringcheck); my $period = "."; if ($idx > 2) { $stringcheck = $cnpart->[$idx-1]; # in case preceded by period $isalphaprevchunkbeg = 0; if (length($stringcheck) > 0) { $stringcheck = substr($stringcheck, 1); $isalphaprevchunkbeg = alphacheck($stringcheck); } if ($isalphaprevchunkbeg) { $stringcheck = $cnpart->[$idx-2]; if (numcheck($stringcheck)) { $callnumpart = $period . $callnumpart; $doneit = 1; } } } if (!$doneit) { $callnumpart = (sprintf ("0" x ($parselength-$callnumpartlength-1))) . $callnumpart; } } sub upcallnumpartidx ### code macro { my ($callnumpartidx, $callnumpartcharidx) = @_; $callnumpartidx++; $callnumpartcharidx = -1; return ($callnumpartidx, $callnumpartcharidx); } sub numcheck ### check input string ### returns: 1 if contains only digits ### 0 if contains at least one other character or if empty string { my ($stringin) = @_; if ($stringin =~ /^[0-9]+$/) {return 1;} else {return 0;} } sub alphacheck ### check input string ### returns: 1 if contains only letters ### 0 if contains at least one other character or if empty string { my ($stringin) = @_; if ($stringin =~ /^[A-Za-z]+$/) {return 1;} else {return 0;} } sub alphanumcheck ### check input string ### returns: 1 if contains only letters and digits ### 0 if contains at least one other character or if empty string { my ($stringin) = @_; if ($stringin =~ /^[a-zA-Z0-9]+$/) {return 1;} else {return 0;} } sub sepcheck ### check call number character ### returns: 1 if is a section separator ### 0 if not { my ($cnchar) = @_; # separator chars: space, semicolon, colon, comma, period, forward slash if ($cnchar =~ /[ ;:,.\/]/) {return 1;} else {return 0;} } __END__ any documentation you wish to include can go here...
Marking as resolved -- nowadays, the module I recommend for normalizing LC call numbers is Library::CallNumber::LC (http://code.google.com/p/library- callnumber-lc/).