Subject: | Filename Encoding by Archive::Zip |
Date: | Wed, 23 Apr 2008 14:09:56 +0200 |
To: | bug-archive-zip [...] rt.cpan.org |
From: | Torsten.Werner [...] assyst-intl.com |
Hi all,
There is a small problems in Archive::Zip running on Windows:
When a file name contains non ASCII-characters the file name is encoded in
the codepage for non-unicode applications. There is no unzip tool which can
extract this files properly.
When I change the encoding of the file names into the codepage used by cmd,
it is working fine.
But the basic problem is, that the archive does not include any information
about file name encoding at all.
With the current version of ZIP-specification is it possible to insert file
names utf8 encoded. The specification is available here:
http://www.pkware.com/documents/casestudies/APPNOTE.TXT
I did a few tests, it is working fine with SecureZip (made by PKWare,
maintainer of the ZIP specification). With this option is it possible to
exchange the archives between systems with different encoding on the
terminal. Perhaps we can't extract it by Archive::Zip caused by Perl
limitations, but any other specification compliant unpacker can do that.
Here I have a small function used by me for tests. It would be great when
you implement such a functionality in Archive::Zip.
I would support you for that if you like.
#################### cut here ##################
use strict;
use Archive::Zip qw( :ERROR_CODES :CONSTANTS );
use Encode;
use constant GPBF_is_utf8 => pack('S',0x0800); # Bit 11 -> utf8
use constant cdExtra_is_unicode_path => pack('S',0x7075);
use constant cdExtra_is_unicode_comment => pack('S',0x6375);
sub append_utf8_name($$$)
# append the utf8 name to central directory extra field
{
local $_;
my ($id,$name,$crc32)=@_;
my $new_part=cdExtra_is_unicode_path;
my $utf8_name=encode('utf8',$name);
use bytes;
my $length=length($utf8_name)+5; # version + crc32
$new_part.=pack('S',$length);
$new_part.=chr(1);
$new_part.=pack("N",$crc32);
$new_part.=$utf8_name;
return $id.$new_part;
}
sub fix_zip_archive($$$)
# change encoding of file names into Terminal Codepage by default, utf8 if
second parameter is set
# append the file name as utf8 to extra field if 3rd parameter is set
{
my ($zip,$utf8_names,$utf8_central_directory_entries)=@_;
foreach my $member ($zip->members()) {
my
$filename=CodingInfo::OSDecode($member->fileName());
my $new_filename=$utf8_names ?
encode('utf8',$filename) : encode(CodingInfo::TermEncoding(),$filename);
$member->{fileName} = $new_filename;
$member->{bitFlag} = $member->{bitFlag} | 0x0800
if $utf8_names;
$member->cdExtraField(append_utf8_name($member->cdExtraField(),$filename,Archive::Zip::computeCRC32($new_filename)))
if $utf8_central_directory_entries;
$zip->replaceMember(CodingInfo::OSEncode($filename),$member);
}
return $zip;
}
### now the archive creation.
# make a directory with non-ascii file names for tests like this:
# ZIP-Test/
# ZIP-Test/Ärger.txt
# ZIP-Test/Häßliche Zeichen/
# ZIP-Test/Häßliche Zeichen/Öde Ümläute.txt
my $hash={
full => ["PerlZipTest_full.zip",1,1], # name in utf8, extra
field entry. Readable by SecureZip
cde => ["PerlZipTest_cde.zip",0,1], # name in terminal
codepage, extra field entry. Readable by SecureZip everywhere and by any
other windows extraction tool as long as we have the same codepage on
terminal
header => ["PerlZipTest_header.zip",1,0], # name in utf8, no
extra field entry. Readable by SecureZip everywhere
compatible => ["PerlZipTest_compatible.zip",0,0], # name in
terminal codepage. Readable by any windows extraction tool as long as we
have the same codepage on terminal
};
my $dir="ZIP-Test";
foreach my $key (keys %$hash) {
my $zip=new Archive::Zip;
die "Error adding tree for directory '$dir'" unless
($zip->addTree( $dir, '') == AZ_OK);
my $name=$hash->{$key}->[0];
fix_zip_archive($zip,$hash->{$key}->[1],$hash->{$key}->[2]);
die "Error writing zip '$name'" unless
$zip->writeToFileNamed($name) ==AZ_OK;
}
###########################################
# here the functions for encoding/decoding:
# (it is a part of a other module, sorry for
# the sepparate package
###########################################
package CodingInfo;
require 5.008_007;
use strict;
require Exporter;
our @ISA=("Exporter");
our @EXPORT = qw(
OSEncoding
TermEncoding
OSDecode
OSEncode
);
use Carp qw(confess);
use Encode;
use if ($^O eq 'MSWin32'), "Win32::TieRegistry";
sub WinCodepage();
sub CmdCodepage();
sub OSEncoding();
sub TermEncoding();
sub OSDecode($);
sub OSEncode($);
my $Registry={};
if ($^O eq 'MSWin32') {
$Registry=$Win32::TieRegistry::Registry->Open(
"",
{
Access=>Win32::TieRegistry::KEY_READ(),
Delimiter=>"\\"
}
);
confess "Unable to open registry in read-only mode" unless (defined
$Registry)
}
sub OSDecode($)
{
return decode(OSEncoding,shift);
}
sub OSEncode($)
{
return encode(OSEncoding,shift);
}
sub WinCodepage ()
{
my
$cp=$Registry->{"LMachine\\SYSTEM\\CurrentControlSet\\Control\\Nls\\CodePage\\\\ACP"};
$cp="1252" unless (defined $cp);
return sprintf('cp%s',$cp);
}
sub CmdCodepage ()
{
my
$cp=$Registry->{"LMachine\\SYSTEM\\CurrentControlSet\\Control\\Nls\\CodePage\\\\OEMCP"};
$cp="850" unless (defined $cp);
return sprintf('cp%s',$cp);
}
sub OSEncoding()
{
return WinCodepage() if $^O eq 'MSWin32';
return TermEncoding;
}
sub TermEncoding()
{
local $_;
return CmdCodepage() if ($^O eq 'MSWin32');
my @lang_settings=`locale`;
chomp @lang_settings;
my $lang_settings={};
foreach (@lang_settings) {
my ($name,$value)=split /=/,$_,2;
$lang_settings->{$name}=$value;
}
ENV: foreach ('LC_CTYPE', 'LC_ALL', 'LANG') {
if ($lang_settings->{$_}) {
my $lang=$lang_settings->{$_};
$lang=~s/^"//;
$lang=~s/"$//;
$lang=~s/^.+\.//;
$lang=~s/\@euro//;
$lang='iso88591' if $lang eq 'C';
$lang='hp-roman8' if $lang eq 'roman8';
$lang=~s/8859/-8859-/;
return $lang;
}
}
return 'iso-8859-1';
}
##################### cut here #################
Bye
Torsten Werner