Subject: | Optimize Hits::fetch_hit_hashref |
Hits::fetch_hit_hashref creates a full document object with full field
objects, then promptly extracts the field values. It would be more
efficient to retrieve only the values.
The attached patch implements fetch_field_values() in FieldsReader and
all the necessary classes leading up to Searcher.
Unfortunately, it causes problems with Highlighter, which needs to get
the term vector string from a full Field object, which is not available
when using fetch_field_values().
Subject: | fetch_field_values.patch |
Index: lib/KinoSearch/Search/Hits.pm
===================================================================
--- lib/KinoSearch/Search/Hits.pm (revision 921)
+++ lib/KinoSearch/Search/Hits.pm (working copy)
@@ -80,9 +80,8 @@
# lazily fetch stored fields
my $hit_doc = $self->{hit_docs}[ $self->{pointer}++ ];
- $hit_doc->set_doc( $self->{searcher}->fetch_doc( $hit_doc->get_doc_num ) )
- unless defined $hit_doc->get_doc;
- my $hashref = $hit_doc->get_doc()->to_hashref;
+ my $hashref = $self->{searcher}->fetch_field_values(
+ $hit_doc->get_doc_num );
if ( !exists $hashref->{score} ) {
$hashref->{score} = $hit_doc->get_score;
Index: lib/KinoSearch/Search/Searchable.pm
===================================================================
--- lib/KinoSearch/Search/Searchable.pm (revision 921)
+++ lib/KinoSearch/Search/Searchable.pm (working copy)
@@ -65,6 +65,18 @@
=begin comment
+ my $hashref = $searchable->fetch_field_values( $doc_num, \@field_names );
+
+Given a document number and an arrayref of field names, return the values of
+all named stored fields as a hashref.
+
+=end comment
+=cut
+
+sub fetch_field_values { $_[0]->abstract_death }    # no default implementation
+
+=begin comment
+
my $doc_freq = $searchable->doc_freq($term);
Return the number of documents which contain this Term. Used for calculating
Index: lib/KinoSearch/Index/MultiReader.pm
===================================================================
--- lib/KinoSearch/Index/MultiReader.pm (revision 921)
+++ lib/KinoSearch/Index/MultiReader.pm (working copy)
@@ -76,6 +76,13 @@
return $self->{sub_readers}[$reader_index]->fetch_doc($doc_num);
}
+sub fetch_field_values {    # delegate to the sub-reader holding $doc_num
+    my ( $self, $doc_num, @args ) = @_;    # @args: optional \@field_names
+    my $i = $self->_reader_index($doc_num);
+    $doc_num -= $self->{starts}[$i];       # offset within that sub-reader
+    return $self->{sub_readers}[$i]->fetch_field_values( $doc_num, @args );
+}
+
sub delete_docs_by_term {
my ( $self, $term ) = @_;
$_->delete_docs_by_term($term) for @{ $self->{sub_readers} };
Index: lib/KinoSearch/Index/FieldsReader.pm
===================================================================
--- lib/KinoSearch/Index/FieldsReader.pm (revision 921)
+++ lib/KinoSearch/Index/FieldsReader.pm (working copy)
@@ -95,6 +95,47 @@
return $doc;
}
+# Return a doc's stored field values as a hashref keyed by field name,
+# avoiding the OO overhead of fetch_doc.  $fields_wanted is an optional
+# arrayref of field names; when undef or empty, all fields are returned.
+sub fetch_field_values {
+    my ( $self, $doc_num, $fields_wanted ) = @_;
+    my ( $finfos, $findex_stream, $fdata_stream )
+        = @{$self}{qw( finfos findex_stream fdata_stream )};
+    my %field_values;
+    my %desired_fields;    # field number => field name
+    if ( !defined $fields_wanted or !@$fields_wanted ) {
+        for my $finfo ( $finfos->get_infos ) {
+            $desired_fields{ $finfo->get_field_num } = $finfo->get_name;
+        }
+    }
+    else {
+        for my $field_name (@$fields_wanted) {
+            my $finfo = $finfos->info_by_name($field_name);
+            $desired_fields{ $finfo->get_field_num } = $field_name;
+        }
+    }
+
+    # get data file pointer from index and seek the data file instream
+    $findex_stream->seek( $doc_num * 8 );
+    my $start = $findex_stream->lu_read('Q');
+    $fdata_stream->seek($start);
+
+    # retrieve one doc's worth of field data, keeping only desired fields
+    my $num_fields = $fdata_stream->lu_read('V');
+    for ( 1 .. $num_fields ) {
+        my ( $field_num, $bits, $content, $tv_string )
+            = $fdata_stream->lu_read('VaTT');
+        next unless exists $desired_fields{$field_num};
+        $field_values{ $desired_fields{$field_num} }
+            = ( $bits & COMPRESSED ) eq COMPRESSED
+            ? uncompress($content)
+            : $content;
+    }
+
+    return \%field_values;
+}
+
sub decode_fdt_bits {
my ( undef, $field, $bits ) = @_;
$field->set_analyzed( ( $bits & ANALYZED ) eq ANALYZED );
Index: lib/KinoSearch/Index/IndexReader.pm
===================================================================
--- lib/KinoSearch/Index/IndexReader.pm (revision 921)
+++ lib/KinoSearch/Index/IndexReader.pm (working copy)
@@ -167,6 +167,18 @@
=begin comment
+ my $hashref = $reader->fetch_field_values( $doc_num, \@field_names );
+
+Given a document number and an arrayref of field names, return the values of
+all named stored fields as a hashref.
+
+=end comment
+=cut
+
+sub fetch_field_values { $_[0]->abstract_death }    # no default implementation
+
+=begin comment
+
my $infos = $reader->generate_field_infos;
Return a new FieldInfos object, describing all the fields held by the reader.
Index: lib/KinoSearch/Index/SegReader.pm
===================================================================
--- lib/KinoSearch/Index/SegReader.pm (revision 921)
+++ lib/KinoSearch/Index/SegReader.pm (working copy)
@@ -179,6 +179,11 @@
$_[0]->{fields_reader}->fetch_doc( $_[1] );
}
+sub fetch_field_values {    # pass-through to this reader's fields_reader
+    my ( $self, @args ) = @_;
+    return $self->{fields_reader}->fetch_field_values(@args);
+}
+
sub segreaders_to_merge {
my ( $self, $all ) = @_;
return $self if $all;
Index: lib/KinoSearch/Searcher.pm
===================================================================
--- lib/KinoSearch/Searcher.pm (revision 921)
+++ lib/KinoSearch/Searcher.pm (working copy)
@@ -133,6 +133,12 @@
}
sub fetch_doc { $_[0]->{reader}->fetch_doc( $_[1] ) }
+
+sub fetch_field_values {    # pass-through to the underlying reader
+    my ( $self, @args ) = @_;
+    return $self->{reader}->fetch_field_values(@args);
+}
+
sub max_doc { shift->{reader}->max_doc }
sub doc_freq {