Subject: | UTF8 diacritics management problems |
Date: | Thu, 23 Jul 2009 18:34:50 +0200 |
To: | bug-MARC-Record [...] rt.cpan.org |
From: | LAURENT Henri-Damien <henridamien.laurent [...] biblibre.com> |
*Here is a test script showing that Perl's handling of UTF-8 can
sometimes break the encoding of diacritics in MARC::Record data.
The solution is to set the UTF8 flag on all the subfields.
Running
perl testMARCRecord.pl fichierTestUTF8.2709
shows the data before and after adding a simple field.
The solution is a function like this:
# SetUTF8Flag($record)
#
# Rebuilds every data field (tag >= 010) of $record so that each subfield
# value is a Perl character string carrying the UTF8 flag.  Values that are
# still raw UTF-8 octets are decoded; values that are already flagged are
# left alone so they are never decoded twice.  Octet strings that are not
# well-formed UTF-8 are kept unchanged.
#
# Parameters:
#   $record - object providing fields(); each field must provide tag(),
#             indicator(), subfields() and replace_with().
#
# Returns nothing.  Does nothing when $record is false or has no fields.
sub SetUTF8Flag {
    my ($record) = @_;
    return unless ( $record && $record->fields() );

    foreach my $field ( $record->fields() ) {

        # Fields with a tag below 010 are skipped, exactly as before.
        next unless $field->tag() >= 10;

        my @subfields;
        foreach my $subfield ( $field->subfields() ) {

            # Each entry is a [ code, value ] pair; work on a copy of the
            # value so the original field is only changed by replace_with().
            my ( $code, $value ) = @{$subfield};

            # utf8::decode() operates in place and only returns a success
            # flag, so it must never be used as the value being pushed.
            # Already-flagged strings are skipped to avoid double decoding.
            if ( defined $value && !utf8::is_utf8($value) ) {
                utf8::decode($value);
            }
            push @subfields, $code, $value;
        }

        my $newfield = MARC::Field->new(
            $field->tag(),
            $field->indicator(1),
            $field->indicator(2),
            @subfields
        );
        $field->replace_with($newfield);
    }
    return;
}
*
Message body not shown because it is not plain text.
#!/usr/bin/perl
# testMARCRecord.pl - demonstrates how adding a field that contains a
# diacritic to a MARC record can alter the rendering of the record's
# existing data.
#
# Usage: perl testMARCRecord.pl <marc-file>
#
# For every record read from <marc-file>, the formatted record is written
# to STDERR before ("Original :") and after ("Modified :") a 700 field
# holding a UTF-8 literal is inserted.
use strict;
use warnings;
# Koha modules used
use IO::File;
use MARC::File::USMARC;
use MARC::File::XML;
use MARC::Record;
use MARC::Batch;
use MARC::Charset;
use C4::Charset;
use utf8;
use open qw( :std :utf8);
use Encode;

$| = 1;

die "Usage: $0 <marc-file>\n" unless @ARGV;
my $input_file = $ARGV[0];

# don't let MARC::Batch open the file, as it applies the ':utf8' IO layer
my $fh = IO::File->new($input_file)
    or die "Cannot open '$input_file': $!\n";

my $batch = MARC::Batch->new( 'USMARC', $fh );
$batch->warnings_off();
$batch->strict_off();

my $i = 0;    # number of records processed so far
RECORD: while (1) {

    # get records
    my $record = eval { $batch->next() };
    if ($@) {
        print "Bad MARC record: skipped\n";
        next RECORD;
    }

    # an undefined record means there is nothing more to read
    last RECORD unless ($record);

    warn "Original :", $record->as_formatted;

    # "Billé" is a character string because of "use utf8" above
    $record->insert_fields_ordered(
        MARC::Field->new( '700', '', '', a => "Billé", b => 'Louis' ) );
    warn "Modified :", $record->as_formatted;
    $i++;
}