Skip Menu |

This queue is for tickets about the MARC-Record CPAN distribution.

Report information
The Basics
Id: 2135
Status: resolved
Priority: 0/
Queue: MARC-Record

People
Owner: GMCHARLT [...] cpan.org
Requestors: andy [...] petdance.com
Cc:
AdminCc:

Bug Information
Severity: (no value)
Broken in: (no value)
Fixed in: (no value)



Subject: Call number handling
From: ZIMMER@wmich.edu Date: Fri Jan 31, 2003 8:22:58 AM America/Chicago To: Andy Lester <andy@petdance.com> Cc: ZIMMER@wmich.edu Subject: Re: call number sorting/normalizing Send it to me again, please, and let's see what I can do with it... Here 'tis... Roy ====================================================================== #package Normalize::LC_Callnumber; package cnmod; use strict; use vars qw($VERSION @ISA @EXPORT); require Exporter; require 5.004; @ISA = qw(Exporter); @EXPORT = qw(cnparse); $VERSION = '1.10'; use Carp; sub lcnum_parse ### input: callnum - contains callnumber ### output: cnpart[0..36] - call number parse elements suitable for smart sorting ### usage: @callnumber_parts_array = cnparse($callnumber) ### you can then incorporate the callnumber parts at the beginning of ### the "record" for sorting purposes. sort results will not be perfect, ### but will be very close (in my experience), and better than any other ### callnumber sortings that I have seen. ### ### various character check routines are called as subroutines, ### rather than inlined, for clarity of code ### the following are considered to be separators within the callnumber: ### space " ", semicolon ";", colon ":", comma ",", period ".", forward slash "/" ### all resulting call number parts, no matter how "full", are of equal length. ### ### written by Roy Zimmer, Western Michigan University (for Waldo Library) ### versions ### 1.0 1990s initial version, ported from SAS on mainframe when used with NOTIS ### 1.1 2002 removed old garbage and improved code { my ($callnum) = @_; my $curralpha = 0; my $currnum = 0; my $charidx = 0; my $period = "."; my $space = " "; my $semicolon = ";"; my ($initidx, $callnumlength, $isalpha, $isnum, $isalphanum); my ($isseparator, $callnumpartlength, $currchar, $nextchar); my $parselength = 22; my $callnumpartidx = 0; my $callnumpartcharidx = -1; my $hadperiod = 0; my ($stringcheck, $idx, $callnumpart); my @cnpart; for $initidx (0..36) {$cnpart[$initidx] = "";} # remove any leading (meaningless) separators while (sepcheck(substr($callnum, 0, 1))) {$callnum = substr($callnum, 1);} $callnumlength = length($callnum) - 1; while ($charidx <= $callnumlength) { $currchar = uc(substr($callnum, $charidx, 1)); if ($callnumpartidx == 1) { if (alphacheck($cnpart[0]) && numcheck($cnpart[1]) && $currchar eq $space && substr($callnum, $charidx+1, 1) ne $period) { ($callnumpartidx, $callnumpartcharidx) = upcallnumpartidx($callnumpartidx, $callnumpartcharidx); $cnpart[$callnumpartidx] = ".0"; } } $isseparator = sepcheck($currchar); if ($currchar eq $period) {$hadperiod = 1;} if (!$isseparator) { $isalpha = alphacheck($currchar); $isnum = numcheck($currchar); if (($curralpha && $isalpha) || ($currnum && $isnum) || (!$isalpha && !$isnum)) { $callnumpartcharidx = storechar($currchar, $callnumpartidx, $callnumpartcharidx, $hadperiod, \@cnpart); } if (!$curralpha && !$currnum) { $curralpha = $isalpha; $currnum = $isnum; $callnumpartcharidx = storechar($currchar, $callnumpartidx, $callnumpartcharidx, $hadperiod, \@cnpart); } if (($curralpha && $isnum) || ($currnum && $isalpha)) { ($callnumpartidx, $callnumpartcharidx) = upcallnumpartidx($callnumpartidx, $callnumpartcharidx); if ($curralpha && $isnum) { $callnumpartcharidx = storechar($period, $callnumpartidx, $callnumpartcharidx, $hadperiod, \@cnpart); } $callnumpartcharidx = storechar($currchar, $callnumpartidx, $callnumpartcharidx, $hadperiod, \@cnpart); $curralpha = $isalpha; $currnum = $isnum; } } else # isseparator { if ($charidx < $callnumlength) { $nextchar = substr($callnum, $charidx+1, 1); if (($currchar eq $period) && ($nextchar eq $semicolon)) { $charidx++; $nextchar = substr($callnum, $charidx+1, 1); } $isseparator = sepcheck($nextchar); if ($isseparator) { # nextchar = . and currchar = semicolon, comma, or space if (($currchar =~ /[;, ]/) and ($nextchar eq $period)) { $hadperiod = 1; ($callnumpartidx, $callnumpartcharidx) = upcallnumpartidx($callnumpartidx, $callnumpartcharidx); $callnumpartcharidx = storechar($nextchar, $callnumpartidx, $callnumpartcharidx, $hadperiod, \@cnpart); $curralpha = $currnum = 0; $charidx++; } if (($currchar eq $period) && ($nextchar ne $space)) { $callnumpartcharidx = storechar($currchar, $callnumpartidx, $callnumpartcharidx, $hadperiod, \@cnpart); ($callnumpartidx, $callnumpartcharidx) = upcallnumpartidx($callnumpartidx, $callnumpartcharidx); $charidx++; } } # if isseparator else # have a separator and nextchar is not a separator { if ($currchar eq $period) { if (!$curralpha) { ($callnumpartidx, $callnumpartcharidx) = upcallnumpartidx($callnumpartidx, $callnumpartcharidx); } $callnumpartcharidx = storechar($currchar, $callnumpartidx, $callnumpartcharidx, $hadperiod, \@cnpart); if ($curralpha) { ($callnumpartidx, $callnumpartcharidx) = upcallnumpartidx($callnumpartidx, $callnumpartcharidx); } } else { ($callnumpartidx, $callnumpartcharidx) = upcallnumpartidx($callnumpartidx, $callnumpartcharidx); } $curralpha = $currnum = 0; } } # charidx < callnumlength } # isseparator $charidx++; } # do while charidx <= callnumlength # pad parse groups as necessary for sorting for ($idx=0; $idx<=$callnumpartidx; $idx++) { $callnumpart = $cnpart[$idx]; $callnumpartlength = length($callnumpart) - 1; $isnum = numcheck($callnumpart); # all digits if ($isnum) { # right/left justify numeric cnpart, as required $callnumpart = adjcpnum($idx, $isalpha, $callnumpart, $parselength, $callnumpartlength, \@cnpart); } else { $stringcheck = substr($callnumpart, 0, 1); $isalphanum = alphanumcheck($stringcheck); if ($isalphanum && ($callnumpartlength > 0)) { $stringcheck = substr($callnumpart, 0, $callnumpartlength-1); $isnum = numcheck($stringcheck); $stringcheck = substr($callnumpart, $callnumpartlength, 1); $isalphanum = alphanumcheck($stringcheck); # leading digits, trailing single non-alphanumeric if (!$isalphanum && $isnum) { $callnumpart = adjcpnum($idx, $isalpha, $callnumpart, $parselength, $callnumpartlength, \@cnpart); } } } $cnpart[$idx] = $callnumpart; } return @cnpart; } sub storechar ### the special handling of the period character allows ### numeric chunks parsed out well after the initial ### accompanying period to be correctly treated as part ### of a decimal number. { my ($achar, $callnumpartidx, $callnumpartcharidx, $hadperiod, $cnpart) = @_; my $period = "."; $callnumpartcharidx++; if (length($cnpart->[$callnumpartidx]) == 0) {$callnumpartcharidx = 0;} if ($achar ne $period) { substr($cnpart->[$callnumpartidx], $callnumpartcharidx, 1) = $achar; } else { if (($callnumpartcharidx == 0) && $hadperiod) { substr($cnpart->[$callnumpartidx], $callnumpartcharidx, 1) = $achar; } } return $callnumpartcharidx; } sub adjcpnum ### code macro ### Heuristic: ### assumed that current cnpart is numeric ### IF previous cnpart starts with a period (.), OR ### (IF previous cnpart is alpha AND the cnpart before ### that is numeric AND the cnpart before -that- one ### begins with a period (.)) THEN ### convert current or (current-2) cnpart to -.nnn- format ### (and if successful check back 3 levels, concatenate ### current cnpart to current-2 cnpart), ### ELSE right justify current cnpart with leading zeros ### this ensures correct callnumber sorting. { my ($idx, $isalpha, $callnumpart, $parselength, $callnumpartlength, $cnpart) = @_; my $doneit = 0; my ($isalpha1, $isalpha2, $stringcheck); my $period = "."; if ($idx > 2) { $stringcheck = $cnpart->[$idx-1]; $isalpha1 = alphacheck($stringcheck); # in case preceded by period $isalpha2 = 0; if (length($stringcheck) > 0) { $stringcheck = substr($stringcheck, 1); $isalpha2 = alphacheck($stringcheck); } if ($isalpha || $isalpha2) { $stringcheck = $cnpart->[$idx-2]; if (numcheck($stringcheck)) { $callnumpart = $period . $callnumpart; $doneit = 1; } } } if (!$doneit) { $callnumpart = (sprintf ("0" x ($parselength-$callnumpartlength-1))) . $callnumpart; } } sub upcallnumpartidx ### code macro { my ($callnumpartidx, $callnumpartcharidx) = @_; $callnumpartidx++; $callnumpartcharidx = -1; return ($callnumpartidx, $callnumpartcharidx); } sub numcheck ### check input string ### returns: 1 if contains only digits ### 0 if contains at least one other character or if empty string { my ($stringin) = @_; if ($stringin =~ /^[0-9]+$/) {return 1;} else {return 0;} } sub alphacheck ### check input string ### returns: 1 if contains only letters ### 0 if contains at least one other character or if empty string { my ($stringin) = @_; if ($stringin =~ /^[A-Za-z]+$/) {return 1;} else {return 0;} } sub alphanumcheck ### check input string ### returns: 1 if contains only letters and digits ### 0 if contains at least one other character or if empty string { my ($stringin) = @_; if ($stringin =~ /^[a-zA-Z0-9]+$/) {return 1;} else {return 0;} } sub sepcheck ### check call number character ### returns: 1 if is a section separator ### 0 if not { my ($cnchar) = @_; # separator chars: space, semicolon, colon, comma, period, forward slash if ($cnchar =~ /[ ;:,.\/]/) {return 1;} else {return 0;} } __END__ =head1 NAME Normalize::LC_Callnumber - Normalizes a Library of Congress callnumber for sorting =head1 SYNOPSIS use Normalize; @callnumber_parts_array = lcnum_parse($lc_callnumber); # prepend callnumber_parts_array elements to callnumber, etc. for sorting # after sort, drop the leading call number elements and # output or process remaining data for each record =head1 DESCRIPTION Prepares a Library of Congress type of callnumber for improved sorting, as compared to full field sorting. Typically gets close to 100% correct sort order. Splits callnumber into up to 37 chunks, parsing according to the separators encountered. Normalizes the chunks so that the callnumber can be intelligently sorted. =head 1 COPYRIGHT Copyright (c) 2002 Roy Zimmer. All rights reserved. Developed at Western Michigan University. This is free software. If you make modifications or improvements to this, please send me a copy of that at zimmer@wmich.edu. =cut
Marking as resolved -- nowadays, the module I recommend for normalizing LC call numbers is Library::CallNumber::LC (http://code.google.com/p/library- callnumber-lc/).