Bug #53370 for MediaWiki-DumpFile: Faster implementation for parsing SQL files

Subject:

Faster implementation for parsing SQL files

From Bjoern Hoehrmann <derhoermi@gmx.net>

package MyTie;
use strict;
use warnings;

# The map comes from the unescape function, my code would warn
# rather than croak if some unrecognized escape is encountered
our %ESCAPE_MAP = (
'\\' => '\\',
"'" => "'",
'"' => '"',
'n' => "\n",
't' => "\t",
);

BEGIN {
no strict 'refs';

# Install forwarding methods
for my $meth (qw/NEXTKEY FIRSTKEY STORE
DELETE CLEAR EXISTS SCALAR/) {
*{$meth} = sub { shift()->{tiedplus}->$meth(@_) }
}
}

sub TIEHASH {
my ($class, $plus, $unescape) = @_;

my $self = bless {
tiedplus => $plus,
unescape => $unescape
}, $class;

$self;
}

sub FETCH {
my ($self, $key) = @_;

my $value = $self->{tiedplus}->FETCH($key);

return $value unless $self->{unescape}{$key};

$value = "$value";
$value =~ s/\\(.)/$ESCAPE_MAP{$1}/g;
$value;

}

package main;
use strict;
use warnings;
use MediaWiki::DumpFile::SQL;
use IO::File;

my $file = $ARGV[0];

# Use MediaWiki::DumpFile::SQL to get the schema
my $sql = MediaWiki::DumpFile::SQL->new($file);

my @schema = $sql->schema;
my $name = $sql->table_name;

my %unescape =
map { $_ => 1 } qw/varbinary tinyblob blob char varchar enum/;

my %type2re = (
int => q/(?<%s>\d+)/,
bigint => q/(?<%s>\d+)/,
tinyint => q/(?<%s>\d+)/,
timestamp => q/(?<%s>\d+)/,
double => q/(?<%s>[^,]+)/, # TODO: better expr for e-notation
varbinary => q/'(?<%s>.*?)'/,
tinyblob => q/'(?<%s>.*?)'/,
blob => q/'(?<%s>.*?)'/,
char => q/'(?<%s>.*?)'/,
varchar => q/'(?<%s>.*?)'/,
enum => q/'(?<%s>.*?)'/,
);

my $re = "";
foreach my $col (@schema) {
my ($name, $type) = @$col;

# The names of named captures cannot be escaped and allow for only
# a rather limited set of characters, so we do not escpae them here.
my $inner = sprintf $type2re{$type}, $name;

$re .= ',' if $re;
$re .= "(?:NULL|$inner)";

}

# Keys where the value needs unescaping

my %unkeys = map { $_->[0] => 1 }
grep { $unescape{ $_->[1] } }
@schema;

tie my %row, "MyTie", tied %+, \%unkeys;

my $seen_insert = 0;

my $f = IO::File->new('<' . $file);
my $b = "";

# Read and parse until we have all parsable entries
while (!$f->eof) {
my $tmp;

# Read in 1MB blocks
$f->read($tmp, 1024 * 1024);

die unless defined $tmp;

$b .= $tmp;

# Skip to the INSERT statemement
if (!$seen_insert and $b =~ s/.*?INSERT INTO .*? VALUES //s) {
$seen_insert = 1;
}

next unless $seen_insert;

# Use /c to memorize the position after match failure
while ($b =~ /$$re$[,;]/gc) {

#
# process $row here
#
}

# Keep only the remainder

substr $b, 0, pos($b), '';
}