On Tue Feb 21 11:39:04 2017, SPROUT wrote:
> I’m trying to extract one page at a time from a collection of PDFs,
> some of which are 200MB. I was shocked to see that CAM::PDF reads the
> entire PDF into memory, which unfortunately makes it unusable for my
> particular use case. It would be really nice if it could read just
> the parts of the file that it needs. Maybe using File::Map is a
> possibility.
Here is a patch that allows File::Map to be chosen by the caller. It does not include documentation. It works, as far as I can tell, but I don’t know whether you want to include such an option.
--- a/lib/CAM/PDF.pm 2017-02-09 18:01:04.000000000 -0800
+++ b/lib/CAM/PDF.pm 2017-02-21 18:18:09.000000000 -0800
@@ -296,6 +296,26 @@
};
}
+ my $self = {
+ options => $options,
+
+ pdfversion => undef,
+ maxstr => $CAM::PDF::MAX_STRING, # length of output string
+ content => '',
+ contentlength => length $content,
+ xref => {},
+ maxobj => 0,
+ changes => {},
+ versions => {},
+
+ # Caches:
+ objcache => {},
+ pagecache => {},
+ formcache => {},
+ Names => {},
+ NameObjects => {},
+ fontmetrics => {},
+ };
my $pdfversion = '1.2';
if ($content =~ m/ \A%PDF-([\d.]+) /xms)
@@ -305,6 +325,7 @@
{
$pdfversion = $ver;
}
+ $self->{content} = $content;
}
else
{
@@ -313,22 +334,30 @@
my $file = $content;
if ($file eq q{-})
{
- $content = q{};
my $offset = 0;
my $step = 4096;
binmode STDIN; ##no critic (Syscalls)
- while ($step == read STDIN, $content, $step, $offset)
+ while ($step == read STDIN, $self->{content}, $step, $offset)
{
$offset += $step;
}
}
else
{
- if (open my $fh, '<', $file)
+ if ($options->{use_file_map}) {
+ require File::Map;
+ unless(
+ eval { File::Map::map_file($self->{content}, $file); 1 }
+ ) {
+ $CAM::PDF::errstr = "Failed to open $file: $ERRNO\n";
+ return;
+ }
+ }
+ elsif (open my $fh, '<', $file)
{
binmode $fh; ##no critic (Syscalls)
my $size = -s $file;
- if ($size != read $fh, $content, $size) {
+ if ($size != read $fh, $self->{content}, $size) {
$CAM::PDF::errstr = "Failed to read $file bytes\n";
return;
}
@@ -344,7 +373,7 @@
}
}
}
- if ($content =~ m/ \A%PDF-([\d.]+) /xms)
+ if ($self->{content} =~ m/ \A%PDF-([\d.]+) /xms)
{
my $ver = $1;
if ($ver && $ver > $pdfversion)
@@ -358,28 +387,9 @@
return;
}
}
+ $self->{pdfversion} = $pdfversion;
#warn "got pdfversion $pdfversion\n";
- my $self = {
- options => $options,
-
- pdfversion => $pdfversion,
- maxstr => $CAM::PDF::MAX_STRING, # length of output string
- content => $content,
- contentlength => length $content,
- xref => {},
- maxobj => 0,
- changes => {},
- versions => {},
-
- # Caches:
- objcache => {},
- pagecache => {},
- formcache => {},
- Names => {},
- NameObjects => {},
- fontmetrics => {},
- };
bless $self, $pkg;
if (!$self->_startdoc())
{