Subject: | RDF::Core::Storage::Memory is slow for large files |
When parsing RDF files larger than a few megabytes, Memory.pm uses too
much RAM and is too slow to be usable. I wrote a replacement, which is
attached.
Subject: | Memory.pm |
package RDF::Core::Storage::Memory;
use warnings;
use strict;
use Carp;
use vars qw ( $VERSION );
$RDF::Core::Storage::Memory::VERSION = '1.00';
use RDF::Core::Enumerator::Memory;
use constant _SUBJECT => 0;
use constant _SUBJECTS => 0;
use constant _OBJECT => 1;
use constant _OBJECTS => 1;
use constant _PREDICATE => 2;
use constant _DATA => 2;
use constant _DELETED_DATA => 3;
use constant _COUNTER => 4;
sub new
{
my ($pkg,%options) = @_;
my $class = ref $pkg || $pkg;
my $struct = [
{}, # index of subjects
{}, # index of objects
[], # data
[], # deleted data indices
0 # next index ID
];
my $self = bless $struct, $class;
return $self;
}
sub addStmt
{
my ($self,$stmt) = @_;
my $subject = $stmt->getSubject();
my $object = $stmt->getObject();
my $predicate = $stmt->getPredicate();
return 0 if $self->existsStmt($subject,$predicate,$object);
my $subjectURI = $subject->getURI();
if(exists $self->[_SUBJECTS]->{$subjectURI}){
$self->[_SUBJECTS]->{$subjectURI} .= pack('L',$self->[_COUNTER]);
} else {
$self->[_SUBJECTS]->{$subjectURI} = pack('L',$self->[_COUNTER]);
}
my $objectLabel = $object->getLabel();
if(exists $self->[_OBJECTS]->{$objectLabel}){
$self->[_OBJECTS]->{$objectLabel} .= pack('L',$self->[_COUNTER]);
} else {
$self->[_OBJECTS]->{$objectLabel} = pack('L',$self->[_COUNTER]);
}
my $data = [];
$data->[_SUBJECT] = $subject;
$data->[_OBJECT] = $object;
$data->[_PREDICATE] = $predicate;
if($#{$self->[_DELETED_DATA]} > -1){
$self->[_DATA]->[pop @{$self->[_DELETED_DATA]}] = $data;
} else {
$self->[_DATA]->[$self->[_COUNTER]] = $data;
$self->[_COUNTER]++;
}
return 1;
}
sub removeStmt
{
my ($self,$stmt) = @_;
my $subjectURI = $stmt->getSubject->getURI() || return;
my $predicateURI = $stmt->getPredicate->getURI() || return;
my $objectLabel = $stmt->getObject->getLabel() || return;
my $subjectIndices = $self->_getSubjectIndicesByLabel($subjectURI) || return;
my $objectIndices = $self->_getObjectIndicesByLabel(_OBJECTS, $objectLabel) || return;
my ($index,%indexArrayCount);
foreach(@{$objectIndices}){
$indexArrayCount{$_} = 1;
}
foreach(@{$subjectIndices}){
if(exists $indexArrayCount{$_}){
if($self->[_DATA]->[$_]->[_PREDICATE]->getURI() eq $predicateURI){
$index = $_;
last;
}
}
}
return unless defined $index;
$self->_removeIndexByTypeAndLabel(_SUBJECTS, $subjectURI, $index);
$self->_removeIndexByTypeAndLabel(_OBJECTS, $objectLabel, $index);
$self->[_DATA]->[$index] = undef;
push @{$self->[_DELETED_DATA]}, $index;
}
sub getStmts {
my ($self, $subject, $predicate, $object) = @_;
my $data = [];
my $indices = $self->_getStmts($subject, $predicate, $object);
foreach my $index(@{$indices}){
my $d = $self->[_DATA]->[$index];
push @{$data}, RDF::Core::Statement->new($d->[_SUBJECT]->clone(),
$d->[_PREDICATE]->clone(),
$d->[_OBJECT]->clone());
}
return RDF::Core::Enumerator::Memory->new($data) ;
}
sub countStmts
{
my ($self, $subject, $predicate, $object) = @_;
my $data = $self->_getStmts($subject, $predicate, $object);
return $#{$data} + 1;
}
*existsStmt = \&countStmts;
# PRIVATE METHODS BELOW
# DO NOT USE OUTSIDE THIS FILE!
sub _getSubjectIndicesByLabel
{
my($self,$label) = @_;
if(exists $self->[_SUBJECTS]->{$label}){
return [unpack('L*',$self->[_SUBJECTS]->{$label})];
}
}
sub _getObjectIndicesByLabel
{
my($self,$label) = @_;
if(exists $self->[_OBJECTS]->{$label}){
return [unpack('L*',$self->[_OBJECTS]->{$label})];
}
}
## FIXME! NEEDS TO BE REMOVED ###
sub _getPredicateIndicesByLabel
{
my($self,$label) = @_;
my @matches;
my $matched = 0;
for(my $i=0;$i<=$#{$self->[_DATA]};$i++){
if(defined $self->[_DATA]->[$i]){
if($self->[_DATA]->[$i]->[_PREDICATE]->getURI() eq $label){
push @matches, $i;
$matched = 1;
}
}
}
if($matched){
return \@matches;
} else {
return undef;
}
}
sub _removeIndexByTypeAndLabel
{
my ($self,$type,$label,$index) = @_;
if(exists $self->[$type]->{$label}){
my $p = pack('L',$index);
if ($self->[$type]->{$label} eq $p){
delete $self->[$type]->{$label};
} else {
$self->[$type]->{$label} =~ s/^(....)*?$p(....)*?$/$1$2/s;
}
}
}
sub _getStmts
{
my ($self, $subject, $predicate, $object) = @_;
# this is a grim conditional, but it's the fastest way - I'm sorry
if (defined $subject){
my $subjects = $self->_getSubjectIndicesByLabel($subject->getURI()) || return [];
if (defined $object){
my $objects = $self->_getObjectIndicesByLabel($object->getLabel()) || return [];
my %matches;
my $continue = 0;
foreach my $index(@{$subjects}){
$matches{$index} = 1;
}
if (defined $predicate){ # $subject, $object and $predicate defined
my @matches;
my $predicateURI = $predicate->getURI();
foreach my $index(@{$objects}){
if(exists $matches{$index}){
if($self->[_DATA]->[$index]->[_PREDICATE]->getURI() eq $predicateURI){
push @matches, $index;
}
}
}
return \@matches;
} else { # $subject and $object defined
my @matches;
foreach my $index(@{$objects}){
if(exists $matches{$index}){
push @matches, $index;
}
}
return \@matches;
}
} elsif (defined $predicate){ # $subject and $predicate defined
# TODO: the following line should replicate the predicate search in the block above
my $predicates = $self->_getPredicateIndicesByLabel($predicate->getURI()) || return [];
my %matches;
my @matches;
foreach my $index(@{$subjects}){
$matches{$index} = 1;
}
foreach my $index(@{$predicates}){
if(exists $matches{$index}){
push @matches, $index;
}
}
return \@matches;
} else { # $subject defined
return $subjects;
}
} elsif (defined $object){
my $objects = $self->_getObjectIndicesByLabel($object->getLabel()) || return [];
if (defined $predicate){ # $object and $predicate defined
my $predicates = $self->_getPredicateIndicesByLabel($predicate->getURI()) || return [];
my %matches;
my @matches;
foreach my $index(@{$objects}){
$matches{$index} = 1;
}
foreach my $index(@{$predicates}){
if(exists $matches{$index}){
push @matches, $index;
}
}
return \@matches;
} else { # $object defined
return $objects;
}
} elsif (defined $predicate){ # $predicate defined
my $predicates = $self->_getPredicateIndicesByLabel($predicate->getURI()) || return [];
return $predicates;
} else { # nothing defined
my @defined;
for (my $i=0;$i<=$#{$self->[_DATA]};$i++){
if (defined $self->[_DATA]->[$i]){
push @defined, $i;
}
}
return \@defined;
}
carp("oops - this shouldn't happen");
}
1;
__END__
=head1 NAME
RDF::Core::Storage::Memory - a fast, light storage solution for RDF::Core
=head2 Interface
=over 4
=item * new()
=item * addStmt($statement)
=item * removeStmt($statement)
=item * existsStmt($subject,$predicate,$object)
=item * countStmts($subject,$predicate,$object)
=item * getStmts($subject,$predicate,$object)
=back
=head1 LICENSE
This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
=head1 AUTHOR
Copyright (c) 2006, Nigel Wetters Gourlay, based on interface from RDF::Core::Storage.
=head1 SEE ALSO
RDF::Core::Storage
=cut