Skip Menu |

This queue is for tickets about the MediaWiki-DumpFile CPAN distribution.

Report information
The Basics
Id: 53370
Status: new
Priority: 0/
Queue: MediaWiki-DumpFile

People
Owner: Nobody in particular
Requestors: triddle [...] cpan.org
Cc:
AdminCc:

Bug Information
Severity: Wishlist
Broken in: 0.0.1
Fixed in: (no value)



Subject: Faster implementation for parsing SQL files
From Bjoern Hoehrmann <derhoermi@gmx.net>

 package MyTie;
 use strict;
 use warnings;

 # The map comes from the unescape function, my code would warn
 # rather than croak if some unrecognized escape is encountered
 our %ESCAPE_MAP = (
   '\\' => '\\',
   "'"  => "'",
   '"'  => '"',
   'n'  => "\n",
   't'  => "\t",
 );

 BEGIN {
   no strict 'refs';

   # Install forwarding methods
   for my $meth (qw/NEXTKEY FIRSTKEY STORE
     DELETE CLEAR EXISTS SCALAR/) {
     *{$meth} = sub { shift()->{tiedplus}->$meth(@_) }
   }
 }

 sub TIEHASH {
   my ($class, $plus, $unescape) = @_;

   my $self = bless {
     tiedplus => $plus,
     unescape => $unescape
   }, $class;

   $self;
 }

 sub FETCH {
   my ($self, $key) = @_;

   my $value = $self->{tiedplus}->FETCH($key);

   return $value unless $self->{unescape}{$key};

   $value = "$value";
   $value =~ s/\\(.)/$ESCAPE_MAP{$1}/g;
   $value;
 }

 package main;
 use strict;
 use warnings;
 use MediaWiki::DumpFile::SQL;
 use IO::File;
 
 my $file = $ARGV[0];

 # Use MediaWiki::DumpFile::SQL to get the schema
 my $sql = MediaWiki::DumpFile::SQL->new($file);

 my @schema = $sql->schema;
 my $name   = $sql->table_name;
 
 my %unescape =
   map { $_ => 1 } qw/varbinary tinyblob blob char varchar enum/;
 
 my %type2re = (
   int       => q/(?<%s>\d+)/,
   bigint    => q/(?<%s>\d+)/,
   tinyint   => q/(?<%s>\d+)/,
   timestamp => q/(?<%s>\d+)/,
   double    => q/(?<%s>[^,]+)/, # TODO: better expr for e-notation
   varbinary => q/'(?<%s>.*?)'/,
   tinyblob  => q/'(?<%s>.*?)'/,
   blob      => q/'(?<%s>.*?)'/,
   char      => q/'(?<%s>.*?)'/,
   varchar   => q/'(?<%s>.*?)'/,
   enum      => q/'(?<%s>.*?)'/,
 );

 my $re = "";
 foreach my $col (@schema) {
   my ($name, $type) = @$col;

   # The names of named captures cannot be escaped and allow for only
   # a rather limited set of characters, so we do not escpae them here.
   my $inner = sprintf $type2re{$type}, $name;

   $re .= ',' if $re;
   $re .= "(?:NULL|$inner)";
 }

 # Keys where the value needs unescaping
 my %unkeys = map { $_->[0] => 1 }
              grep { $unescape{ $_->[1] } }
              @schema;

 tie my %row, "MyTie", tied %+, \%unkeys;

 my $seen_insert = 0;
 my $f           = IO::File->new('<' . $file);
 my $b           = "";
 
 # Read and parse until we have all parsable entries
 while (!$f->eof) {
   my $tmp;

   # Read in 1MB blocks
   $f->read($tmp, 1024 * 1024);

   die unless defined $tmp;

   $b .= $tmp;

   # Skip to the INSERT statemement
   if (!$seen_insert and $b =~ s/.*?INSERT INTO .*? VALUES //s) {
     $seen_insert = 1;
   }

   next unless $seen_insert;

   # Use /c to memorize the position after match failure
   while ($b =~ /\($re\)[,;]/gc) {
     #
     # process $row here
     #
   }

   # Keep only the remainder
   substr $b, 0, pos($b), '';
 }