Subject: | Archive::Zip::MemberRead + XML::Parser |
I have the following problem: I have ZIP archives containing huge XML files that I need to parse chunk by chunk. I am trying to use Archive::Zip::MemberRead together with XML::Parser. Unfortunately, the stream is always unparseable:
=== code ===
use XML::Twig; # Runs on the top of XML::Parser
use Archive::Zip;
use Archive::Zip::MemberRead;
my $catalog_parser = XML::Twig->new(...);
my $zip = Archive::Zip->new(...);
die "Failed to open ZIP file: $!" unless $zip;
foreach ($zip->members())
{
$catalog_parser->parse($_->readFileHandle());
}
=== end of code ===
=== output ===
not well-formed (invalid token) at line 1, column 24, byte 24 at /usr/lib/perl5/vendor_perl/i386-linux/XML/Parser.pm line 187
=== end of output ===
The solution was suggested by Sreeji K Das: before calling XML::Parser->parse() do:
@Archive::Zip::MemberRead::ISA = qw( IO::Handle );
After that, the Perl script started to segfault. I have attached the strace log.
open("./mirrors/its/hoteldetails/its_hoteldetails.zip", O_RDONLY|O_LARGEFILE) = 4
ioctl(4, SNDCTL_TMR_TIMEBASE or TCGETS, 0xbfffeb38) = -1 ENOTTY (Inappropriate ioctl for device)
_llseek(4, 0, [0], SEEK_CUR) = 0
fstat64(4, {st_mode=S_IFREG|0644, st_size=459208, ...}) = 0
fcntl64(4, F_SETFD, FD_CLOEXEC) = 0
_llseek(4, 0, [0], SEEK_SET) = 0
_llseek(4, 0, [0], SEEK_CUR) = 0
read(4, "PK\3\4\24\0\2\0\10\0U}\3562V!\36\210F\1\7\0\205\370\350"..., 4096) = 4096
_llseek(4, 30, [30], SEEK_SET) = 0
_llseek(4, 0, [30], SEEK_CUR) = 0
_llseek(4, 16, [46], SEEK_CUR) = 0
_llseek(4, 0, [46], SEEK_CUR) = 0
_llseek(4, 46, [46], SEEK_SET) = 0
_llseek(4, 0, [46], SEEK_CUR) = 0
brk(0) = 0x8d82000
brk(0x8da4000) = 0x8da4000
read(4, "\354\235Yo\343F\266\307\337\363)j\362\320\230\301\264%"..., 4096) = 4096
read(4, "\20HE\267\244\342\375\242\330\t\230\v\5.NG\5\243\330I\230"..., 4096) = 4096
read(4, "\373\260\322n|\21P\312\230@\372\255\205\252\244;\7\32x"..., 4096) = 4096
read(4, ":\372\342\320oN8#\234Q\17g\234[\266\250/\26\277\233\372"..., 4096) = 4096
read(4, "\352u-\267O\265\257}\25\326 \251M\254\37Mm4\345?\313\332"..., 4096) = 4096
read(4, "\226\350\250H , , , , , , , , \254\27\10\213_\36\2521\300"..., 4096) = 4096
read(4, "\'\351\2\2728\305\342\276M\375\30V;\205J{\232\322 \314"..., 4096) = 4096
read(4, "\202P\27\345\251\213\27M\324h\220\34\274\216\304\367,\351"..., 4096) = 4096
brk(0) = 0x8da4000
brk(0x8dda000) = 0x8dda000
brk(0) = 0x8dda000
brk(0) = 0x8dda000
brk(0x8dcb000) = 0x8dcb000
brk(0) = 0x8dcb000
mmap2(NULL, 524288, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x407e6000
brk(0) = 0x8dcb000
brk(0) = 0x8dcb000
brk(0x8dac000) = 0x8dac000
brk(0) = 0x8dac000
mremap(0x407e6000, 524288, 1048576, MREMAP_MAYMOVE) = 0x407e6000
mremap(0x407e6000, 1048576, 2097152, MREMAP_MAYMOVE) = 0x407e6000
mmap2(NULL, 1257472, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x409e6000
munmap(0x407e6000, 2097152) = 0
mmap2(NULL, 1257472, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x407e6000
munmap(0x409e6000, 1257472) = 0
--- SIGSEGV (Segmentation fault) @ 0 (0) ---
+++ killed by SIGSEGV +++