Bug #24256 for RDF-Core: RDF::Core::Storage::Memory is slow for large files

Sun Jan 07 03:03:21 2007 nwetters [...] cpan.org - Ticket created

Subject:

RDF::Core::Storage::Memory is slow for large files

When parsing RDF files larger than a few megabytes, Memory.pm uses too much RAM and is too slow to be usable. I wrote a replacement, which is attached.

Subject:

Memory.pm

package RDF::Core::Storage::Memory; use warnings; use strict; use Carp; use vars qw ( $VERSION ); $RDF::Core::Storage::Memory::VERSION = '1.00'; use RDF::Core::Enumerator::Memory; use constant _SUBJECT => 0; use constant _SUBJECTS => 0; use constant _OBJECT => 1; use constant _OBJECTS => 1; use constant _PREDICATE => 2; use constant _DATA => 2; use constant _DELETED_DATA => 3; use constant _COUNTER => 4; sub new { my ($pkg,%options) = @_; my $class = ref $pkg || $pkg; my $struct = [ {}, # index of subjects {}, # index of objects [], # data [], # deleted data indices 0 # next index ID ]; my $self = bless $struct, $class; return $self; } sub addStmt { my ($self,$stmt) = @_; my $subject = $stmt->getSubject(); my $object = $stmt->getObject(); my $predicate = $stmt->getPredicate(); return 0 if $self->existsStmt($subject,$predicate,$object); my $subjectURI = $subject->getURI(); if(exists $self->[_SUBJECTS]->{$subjectURI}){ $self->[_SUBJECTS]->{$subjectURI} .= pack('L',$self->[_COUNTER]); } else { $self->[_SUBJECTS]->{$subjectURI} = pack('L',$self->[_COUNTER]); } my $objectLabel = $object->getLabel(); if(exists $self->[_OBJECTS]->{$objectLabel}){ $self->[_OBJECTS]->{$objectLabel} .= pack('L',$self->[_COUNTER]); } else { $self->[_OBJECTS]->{$objectLabel} = pack('L',$self->[_COUNTER]); } my $data = []; $data->[_SUBJECT] = $subject; $data->[_OBJECT] = $object; $data->[_PREDICATE] = $predicate; if($#{$self->[_DELETED_DATA]} > -1){ $self->[_DATA]->[pop @{$self->[_DELETED_DATA]}] = $data; } else { $self->[_DATA]->[$self->[_COUNTER]] = $data; $self->[_COUNTER]++; } return 1; } sub removeStmt { my ($self,$stmt) = @_; my $subjectURI = $stmt->getSubject->getURI() || return; my $predicateURI = $stmt->getPredicate->getURI() || return; my $objectLabel = $stmt->getObject->getLabel() || return; my $subjectIndices = $self->_getSubjectIndicesByLabel($subjectURI) || return; my $objectIndices = $self->_getObjectIndicesByLabel(_OBJECTS, $objectLabel) || return; my ($index,%indexArrayCount); foreach(@{$objectIndices}){ $indexArrayCount{$_} = 1; } foreach(@{$subjectIndices}){ if(exists $indexArrayCount{$_}){ if($self->[_DATA]->[$_]->[_PREDICATE]->getURI() eq $predicateURI){ $index = $_; last; } } } return unless defined $index; $self->_removeIndexByTypeAndLabel(_SUBJECTS, $subjectURI, $index); $self->_removeIndexByTypeAndLabel(_OBJECTS, $objectLabel, $index); $self->[_DATA]->[$index] = undef; push @{$self->[_DELETED_DATA]}, $index; } sub getStmts { my ($self, $subject, $predicate, $object) = @_; my $data = []; my $indices = $self->_getStmts($subject, $predicate, $object); foreach my $index(@{$indices}){ my $d = $self->[_DATA]->[$index]; push @{$data}, RDF::Core::Statement->new($d->[_SUBJECT]->clone(), $d->[_PREDICATE]->clone(), $d->[_OBJECT]->clone()); } return RDF::Core::Enumerator::Memory->new($data) ; } sub countStmts { my ($self, $subject, $predicate, $object) = @_; my $data = $self->_getStmts($subject, $predicate, $object); return $#{$data} + 1; } *existsStmt = \&countStmts; # PRIVATE METHODS BELOW # DO NOT USE OUTSIDE THIS FILE! sub _getSubjectIndicesByLabel { my($self,$label) = @_; if(exists $self->[_SUBJECTS]->{$label}){ return [unpack('L*',$self->[_SUBJECTS]->{$label})]; } } sub _getObjectIndicesByLabel { my($self,$label) = @_; if(exists $self->[_OBJECTS]->{$label}){ return [unpack('L*',$self->[_OBJECTS]->{$label})]; } } ## FIXME! NEEDS TO BE REMOVED ### sub _getPredicateIndicesByLabel { my($self,$label) = @_; my @matches; my $matched = 0; for(my $i=0;$i<=$#{$self->[_DATA]};$i++){ if(defined $self->[_DATA]->[$i]){ if($self->[_DATA]->[$i]->[_PREDICATE]->getURI() eq $label){ push @matches, $i; $matched = 1; } } } if($matched){ return \@matches; } else { return undef; } } sub _removeIndexByTypeAndLabel { my ($self,$type,$label,$index) = @_; if(exists $self->[$type]->{$label}){ my $p = pack('L',$index); if ($self->[$type]->{$label} eq $p){ delete $self->[$type]->{$label}; } else { $self->[$type]->{$label} =~ s/^(....)*?$p(....)*?$/$1$2/s; } } } sub _getStmts { my ($self, $subject, $predicate, $object) = @_; # this is a grim conditional, but it's the fastest way - I'm sorry if (defined $subject){ my $subjects = $self->_getSubjectIndicesByLabel($subject->getURI()) || return []; if (defined $object){ my $objects = $self->_getObjectIndicesByLabel($object->getLabel()) || return []; my %matches; my $continue = 0; foreach my $index(@{$subjects}){ $matches{$index} = 1; } if (defined $predicate){ # $subject, $object and $predicate defined my @matches; my $predicateURI = $predicate->getURI(); foreach my $index(@{$objects}){ if(exists $matches{$index}){ if($self->[_DATA]->[$index]->[_PREDICATE]->getURI() eq $predicateURI){ push @matches, $index; } } } return \@matches; } else { # $subject and $object defined my @matches; foreach my $index(@{$objects}){ if(exists $matches{$index}){ push @matches, $index; } } return \@matches; } } elsif (defined $predicate){ # $subject and $predicate defined # TODO: the following line should replicate the predicate search in the block above my $predicates = $self->_getPredicateIndicesByLabel($predicate->getURI()) || return []; my %matches; my @matches; foreach my $index(@{$subjects}){ $matches{$index} = 1; } foreach my $index(@{$predicates}){ if(exists $matches{$index}){ push @matches, $index; } } return \@matches; } else { # $subject defined return $subjects; } } elsif (defined $object){ my $objects = $self->_getObjectIndicesByLabel($object->getLabel()) || return []; if (defined $predicate){ # $object and $predicate defined my $predicates = $self->_getPredicateIndicesByLabel($predicate->getURI()) || return []; my %matches; my @matches; foreach my $index(@{$objects}){ $matches{$index} = 1; } foreach my $index(@{$predicates}){ if(exists $matches{$index}){ push @matches, $index; } } return \@matches; } else { # $object defined return $objects; } } elsif (defined $predicate){ # $predicate defined my $predicates = $self->_getPredicateIndicesByLabel($predicate->getURI()) || return []; return $predicates; } else { # nothing defined my @defined; for (my $i=0;$i<=$#{$self->[_DATA]};$i++){ if (defined $self->[_DATA]->[$i]){ push @defined, $i; } } return \@defined; } carp("oops - this shouldn't happen"); } 1; __END__ =head1 NAME RDF::Core::Storage::Memory - a fast, light storage solution for RDF::Core =head2 Interface =over 4 =item * new() =item * addStmt($statement) =item * removeStmt($statement) =item * existsStmt($subject,$predicate,$object) =item * countStmts($subject,$predicate,$object) =item * getStmts($subject,$predicate,$object) =back =head1 LICENSE This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =head1 AUTHOR Copyright (c) 2006, Nigel Wetters Gourlay, based on interface from RDF::Core::Storage. =head1 SEE ALSO RDF::Core::Storage =cut

Fri Feb 16 08:21:35 2007 dan [...] gingerall.cz - Taken

Fri Feb 16 08:59:13 2007 dan [...] gingerall.cz - Correspondence added

On Sun Jan 07 03:03:21 2007, NWETTERS wrote: Show quoted text

> When parsing RDF files larger than a few megabytes, Memory.pm uses too > much RAM and is too slow to be usable. I wrote a replacement, which is > attached.

Hi Nigel, again, thanks for your help. Your replacement seems to incorrectly handle literals with datatypes and/or language tags. I'm enclosing a test file. (Shame on me these tests aren't part of distribution yet.) Regards, Dan

use lib qw(../blib/lib ../blib/lib/auto ); use strict; use warnings; use Data::Dumper; use Test::More qw(no_plan); use File::Temp qw(tempfile); use RDF::Core::Model; use RDF::Core::Literal; use RDF::Core::Storage::Memory; use RDF::Core::Storage::DB_File; use RDF::Core::Storage::Postgres; my @storage = ( ['Memory', sub { return (RDF::Core::Storage::Memory->new(), sub{}) }], ['DB_File', sub { my ($fh, $filename) = tempfile(); return (RDF::Core::Storage::DB_File->new( Name => $filename ), sub { unlink $filename }); }], ['Postgres', sub{ my $s = new RDF::Core::Storage::Postgres ( ConnectStr=>'dbi:Pg:dbname=rdf', DBUser=>'postgres', Model=>'rdf-test-01'); sub cleanup { my $enum = $s->getStmts(); my $stmt = $enum->getNext; while (defined $stmt) { $s->removeStmt($stmt); $stmt = $enum->getNext; } } cleanup(); return ($s,sub{},\&cleanup); }], ); my $subj = RDF::Core::Resource->new( 'http://kr.newco.com/test.owl#Restaurants_fctsont_restaurantsF' ); my $pred = RDF::Core::Resource->new( 'http://www.w3.org/2000/01/rdf-schema#label' ); my @obj = (RDF::Core::Literal->new("Restaurant"), RDF::Core::Literal->new("Restaurant","en"), RDF::Core::Literal->new("Restaurant","fr"), RDF::Core::Literal->new("Restaurant",undef,"http://www.w3.org/2001/XMLSchema#string"), RDF::Core::Literal->new("Restaurant","en","http://www.w3.org/2001/XMLSchema#string"), RDF::Core::Literal->new("Restaurant","fr","http://www.w3.org/2001/XMLSchema#string"), ); foreach my $storage_data (@storage) { my ($storage_name, $storage_factory) = @{ $storage_data }; warn "# $storage_name\n"; my ($storage, $storage_cleanup) = &$storage_factory; my $model = new RDF::Core::Model (Storage => $storage); # Empty model initiated #################################################### is ($model->existsStmt, 0, "empty model"); my $cnt = 1; foreach (@obj) { $model->addStmt(new RDF::Core::Statement($subj, $pred, $_)); is ($model->countStmts, $cnt, "countStmts raised to $cnt"); is (get_and_count($model), $cnt, "getStmts count raised to $cnt"); $cnt++; } # statements added ######################################################### is ($model->existsStmt, 1, "not empty model (statements added)"); $storage_cleanup->(); } # TOOLS ######################################################################## sub get_and_count { my ($model, $sub, $pred, $obj) = @_; my $enum = $model->getStmts($sub, $pred, $obj); my $cnt = 0; while (defined $enum->getNext) { $cnt++; } $enum->close; return $cnt; }

Fri Feb 16 08:59:17 2007 The RT System itself - Status changed from 'new' to 'open'