Subject: | patch to skip files via a callback and limit memory use when skipping files |
The following patch changes the file-skip functionality so that
you can supply a filter callback function that decides
which files should be skipped.
When skipping files, only a limited amount of memory is used
rather than allocating enough RAM to hold an entire file (a
very bad idea when your tar has large files in it).
The patch is against version 1.58. Let me know if you need
it updated for a more recent version or anything else.
Subject: | archive-tar.diff |
Only in Archive-Tar-1.58: Makefile.old
diff -ur Archive-Tar-1.58.orig/lib/Archive/Tar.pm Archive-Tar-1.58/lib/Archive/Tar.pm
--- Archive-Tar-1.58.orig/lib/Archive/Tar.pm 2010-02-17 13:39:08.000000000 -0800
+++ Archive-Tar-1.58/lib/Archive/Tar.pm 2010-03-10 14:52:26.000000000 -0800
@@ -301,6 +301,7 @@
my $count = $opts->{limit} || 0;
my $filter = $opts->{filter};
+ my $filter_cb = $opts->{filter_cb};
my $extract = $opts->{extract} || 0;
### set a cap on the amount of files to extract ###
@@ -392,19 +393,56 @@
$data = $entry->get_content_by_ref;
- ### just read everything into memory
- ### can't do lazy loading since IO::Zlib doesn't support 'seek'
- ### this is because Compress::Zlib doesn't support it =/
- ### this reads in the whole data in one read() call.
- if( $handle->read( $$data, $block ) < $block ) {
- $self->_error( qq[Read error on tarfile (missing data) '].
+ my $skip = 0;
+ ### skip this entry if we're filtering
+ if ($filter && $entry->name !~ $filter) {
+ $skip = 1;
+
+ ### skip this entry if it's a pax header. This is a special file added
+ ### by, among others, git-generated tarballs. It holds comments and is
+ ### not meant for extracting. See #38932: pax_global_header extracted
+ } elsif ( $entry->name eq PAX_HEADER ) {
+ $skip = 2;
+ } elsif ($filter_cb && ! $filter_cb->($entry)) {
+ $skip = 3;
+ }
+
+ if ($skip) {
+ #
+ # Since we're skipping, do not allocate memory for the
+ # whole file. Read it 64 BLOCKS at a time. Do not
+ # complete the skip yet because maybe what we read is a
+ # longlink and it won't get skipped after all
+ #
+ my $amt = $block;
+ while ($amt > 0) {
+ $$data = '';
+ my $this = 64 * BLOCK;
+ $this = $amt if $this > $amt;
+ if( $handle->read( $$data, $this ) < $this ) {
+ $self->_error( qq[Read error on tarfile (missing data) '].
+ $entry->full_path ."' at offset $offset" );
+ next LOOP;
+ }
+ $amt -= $this;
+ }
+ ### throw away trailing garbage ###
+ substr ($$data, $entry->size) = "" if defined $$data && $block < 64 * BLOCK;
+ } else {
+
+ ### just read everything into memory
+ ### can't do lazy loading since IO::Zlib doesn't support 'seek'
+ ### this is because Compress::Zlib doesn't support it =/
+ ### this reads in the whole data in one read() call.
+ if ( $handle->read( $$data, $block ) < $block ) {
+ $self->_error( qq[Read error on tarfile (missing data) '].
$entry->full_path ."' at offset $offset" );
- next LOOP;
+ next LOOP;
+ }
+ ### throw away trailing garbage ###
+ substr ($$data, $entry->size) = "" if defined $$data;
}
- ### throw away trailing garbage ###
- substr ($$data, $entry->size) = "" if defined $$data;
-
### part II of the @LongLink munging -- need to do /after/
### the checksum check.
if( $entry->is_longlink ) {
@@ -444,16 +482,17 @@
undef $real_name;
}
- ### skip this entry if we're filtering
- if ($filter && $entry->name !~ $filter) {
- next LOOP;
+ if ($filter && $entry->name !~ $filter) {
+ next LOOP;
- ### skip this entry if it's a pax header. This is a special file added
- ### by, among others, git-generated tarballs. It holds comments and is
- ### not meant for extracting. See #38932: pax_global_header extracted
- } elsif ( $entry->name eq PAX_HEADER ) {
- next LOOP;
- }
+ ### skip this entry if it's a pax header. This is a special file added
+ ### by, among others, git-generated tarballs. It holds comments and is
+ ### not meant for extracting. See #38932: pax_global_header extracted
+ } elsif ( $entry->name eq PAX_HEADER ) {
+ next LOOP;
+ } elsif ($filter_cb && ! $filter_cb->($entry)) {
+ next LOOP;
+ }
$self->_extract_file( $entry ) if $extract
&& !$entry->is_longlink
diff -ur Archive-Tar-1.58.orig/t/05_iter.t Archive-Tar-1.58/t/05_iter.t
--- Archive-Tar-1.58.orig/t/05_iter.t 2009-09-10 04:36:48.000000000 -0700
+++ Archive-Tar-1.58/t/05_iter.t 2010-03-10 15:07:19.000000000 -0800
@@ -30,36 +30,43 @@
my %opts = ();
my @expect = ();
+ my $dotest = sub {
+ my $desc = shift;
+ my $next = $Class->iter( $File, 0, \%opts );
+
+ my $pp_opts = join " => ", %opts;
+ ok( $next, "Iterator created from $File ($pp_opts $desc)" );
+ isa_ok( $next, "CODE", " Iterator $desc" );
+
+ my @names;
+ while( my $f = $next->() ) {
+ ok( $f, " File object retrieved $desc" );
+ isa_ok( $f, $FClass, " Object $desc" );
+
+ push @names, $f->name;
+ }
+
+ is( scalar(@names), scalar(@expect),
+ " Found correct number of files $desc" );
+
+ my $i = 0;
+ for my $name ( @names ) {
+ ok( 1, " Inspecting '$name' $desc" );
+ like($name, $expect[$i]," Matches $Expect[$i] $desc" );
+ $i++;
+ }
+ };
+
### do a full test vs individual filters
if( not ref $index ) {
my $regex = $Expect[$index];
- $opts{'filter'} = $regex;
@expect = ($regex);
+ %opts = ( filter => $regex );
+ $dotest->("filter $regex");
+ %opts = ( filter_cb => sub { my ($entry) = @_; $entry->name() =~ /$regex/ } );
+ $dotest->("filter_cb $regex");
} else {
@expect = @Expect;
- }
-
- my $next = $Class->iter( $File, 0, \%opts );
-
- my $pp_opts = join " => ", %opts;
- ok( $next, "Iterator created from $File ($pp_opts)" );
- isa_ok( $next, "CODE", " Iterator" );
-
- my @names;
- while( my $f = $next->() ) {
- ok( $f, " File object retrieved" );
- isa_ok( $f, $FClass, " Object" );
-
- push @names, $f->name;
- }
-
- is( scalar(@names), scalar(@expect),
- " Found correct number of files" );
-
- my $i = 0;
- for my $name ( @names ) {
- ok( 1, " Inspecting '$name' " );
- like($name, $expect[$i]," Matches $Expect[$i]" );
- $i++;
+ $dotest->("all");
}
}