Subject: | Faster implementation for parsing SQL files |
From Bjoern Hoehrmann <derhoermi@gmx.net>
package MyTie;
use strict;
use warnings;
# The map comes from the unescape function, my code would warn
# rather than croak if some unrecognized escape is encountered
our %ESCAPE_MAP = (
'\\' => '\\',
"'" => "'",
'"' => '"',
'n' => "\n",
't' => "\t",
);
BEGIN {
no strict 'refs';
# Install forwarding methods
for my $meth (qw/NEXTKEY FIRSTKEY STORE
DELETE CLEAR EXISTS SCALAR/) {
*{$meth} = sub { shift()->{tiedplus}->$meth(@_) }
}
}
sub TIEHASH {
my ($class, $plus, $unescape) = @_;
my $self = bless {
tiedplus => $plus,
unescape => $unescape
}, $class;
$self;
}
sub FETCH {
my ($self, $key) = @_;
my $value = $self->{tiedplus}->FETCH($key);
return $value unless $self->{unescape}{$key};
$value = "$value";
$value =~ s/\\(.)/$ESCAPE_MAP{$1}/g;
$value;
# Use MediaWiki::DumpFile::SQL to get the schema
my $sql = MediaWiki::DumpFile::SQL->new($file);
my @schema = $sql->schema;
my $name = $sql->table_name;
int => q/(?<%s>\d+)/,
bigint => q/(?<%s>\d+)/,
tinyint => q/(?<%s>\d+)/,
timestamp => q/(?<%s>\d+)/,
double => q/(?<%s>[^,]+)/, # TODO: better expr for e-notation
varbinary => q/'(?<%s>.*?)'/,
tinyblob => q/'(?<%s>.*?)'/,
blob => q/'(?<%s>.*?)'/,
char => q/'(?<%s>.*?)'/,
varchar => q/'(?<%s>.*?)'/,
enum => q/'(?<%s>.*?)'/,
);
my $re = "";
foreach my $col (@schema) {
my ($name, $type) = @$col;
# The names of named captures cannot be escaped and allow for only
# a rather limited set of characters, so we do not escpae them here.
my $inner = sprintf $type2re{$type}, $name;
$re .= ',' if $re;
$re .= "(?:NULL|$inner)";
grep { $unescape{ $_->[1] } }
@schema;
tie my %row, "MyTie", tied %+, \%unkeys;
my $seen_insert = 0;
# process $row here
#
}
# Keep only the remainder
package MyTie;
use strict;
use warnings;
# The map comes from the unescape function, my code would warn
# rather than croak if some unrecognized escape is encountered
our %ESCAPE_MAP = (
'\\' => '\\',
"'" => "'",
'"' => '"',
'n' => "\n",
't' => "\t",
);
BEGIN {
no strict 'refs';
# Install forwarding methods
for my $meth (qw/NEXTKEY FIRSTKEY STORE
DELETE CLEAR EXISTS SCALAR/) {
*{$meth} = sub { shift()->{tiedplus}->$meth(@_) }
}
}
sub TIEHASH {
my ($class, $plus, $unescape) = @_;
my $self = bless {
tiedplus => $plus,
unescape => $unescape
}, $class;
$self;
}
sub FETCH {
my ($self, $key) = @_;
my $value = $self->{tiedplus}->FETCH($key);
return $value unless $self->{unescape}{$key};
$value = "$value";
$value =~ s/\\(.)/$ESCAPE_MAP{$1}/g;
$value;
}
package main;
use strict;
use warnings;
use MediaWiki::DumpFile::SQL;
use IO::File;
my $file = $ARGV[0];package main;
use strict;
use warnings;
use MediaWiki::DumpFile::SQL;
use IO::File;
# Use MediaWiki::DumpFile::SQL to get the schema
my $sql = MediaWiki::DumpFile::SQL->new($file);
my @schema = $sql->schema;
my $name = $sql->table_name;
my %unescape =
map { $_ => 1 } qw/varbinary tinyblob blob char varchar enum/;
my %type2re = (map { $_ => 1 } qw/varbinary tinyblob blob char varchar enum/;
int => q/(?<%s>\d+)/,
bigint => q/(?<%s>\d+)/,
tinyint => q/(?<%s>\d+)/,
timestamp => q/(?<%s>\d+)/,
double => q/(?<%s>[^,]+)/, # TODO: better expr for e-notation
varbinary => q/'(?<%s>.*?)'/,
tinyblob => q/'(?<%s>.*?)'/,
blob => q/'(?<%s>.*?)'/,
char => q/'(?<%s>.*?)'/,
varchar => q/'(?<%s>.*?)'/,
enum => q/'(?<%s>.*?)'/,
);
my $re = "";
foreach my $col (@schema) {
my ($name, $type) = @$col;
# The names of named captures cannot be escaped and allow for only
# a rather limited set of characters, so we do not escpae them here.
my $inner = sprintf $type2re{$type}, $name;
$re .= ',' if $re;
$re .= "(?:NULL|$inner)";
}
# Keys where the value needs unescaping
my %unkeys = map { $_->[0] => 1 }# Keys where the value needs unescaping
grep { $unescape{ $_->[1] } }
@schema;
tie my %row, "MyTie", tied %+, \%unkeys;
my $seen_insert = 0;
my $f = IO::File->new('<' . $file);
my $b = "";
my $b = "";
# Read and parse until we have all parsable entries
while (!$f->eof) {
my $tmp;
# Read in 1MB blocks
$f->read($tmp, 1024 * 1024);
die unless defined $tmp;
$b .= $tmp;
# Skip to the INSERT statemement
if (!$seen_insert and $b =~ s/.*?INSERT INTO .*? VALUES //s) {
$seen_insert = 1;
}
next unless $seen_insert;
# Use /c to memorize the position after match failure
while ($b =~ /\($re\)[,;]/gc) {
#while (!$f->eof) {
my $tmp;
# Read in 1MB blocks
$f->read($tmp, 1024 * 1024);
die unless defined $tmp;
$b .= $tmp;
# Skip to the INSERT statemement
if (!$seen_insert and $b =~ s/.*?INSERT INTO .*? VALUES //s) {
$seen_insert = 1;
}
next unless $seen_insert;
# Use /c to memorize the position after match failure
while ($b =~ /\($re\)[,;]/gc) {
# process $row here
#
}
# Keep only the remainder
substr $b, 0, pos($b), '';
}
}