Bug #19233 for KinoSearch: Optimize Hits::fetch_hit_hashref

Fri May 12 22:14:52 2006 CREAMYG [...] cpan.org - Ticket created

Subject:

Optimize Hits::fetch_hit_hashref

Hits::fetch_hit_hashref creates a full document object with full field objects, then promptly extracts the field values. It would be more efficient to retrieve only the values. The attached patch implements fetch_field_values() in FieldsReader and all the necessary classes leading up to Searcher. Unfortunately, it causes problems with Highlighter, which needs to get the term vector string from a full Field object, which is not available when using fetch_field_values().

Subject:

fetch_field_values.patch

Index: lib/KinoSearch/Search/Hits.pm =================================================================== --- lib/KinoSearch/Search/Hits.pm (revision 921) +++ lib/KinoSearch/Search/Hits.pm (working copy) @@ -80,9 +80,8 @@ # lazily fetch stored fields my $hit_doc = $self->{hit_docs}[ $self->{pointer}++ ]; - $hit_doc->set_doc( $self->{searcher}->fetch_doc( $hit_doc->get_doc_num ) ) - unless defined $hit_doc->get_doc; - my $hashref = $hit_doc->get_doc()->to_hashref; + my $hashref = $self->{searcher}->fetch_field_values( + $hit_doc->get_doc_num ); if ( !exists $hashref->{score} ) { $hashref->{score} = $hit_doc->get_score; Index: lib/KinoSearch/Search/Searchable.pm =================================================================== --- lib/KinoSearch/Search/Searchable.pm (revision 921) +++ lib/KinoSearch/Search/Searchable.pm (working copy) @@ -65,6 +65,18 @@ =begin comment + my $hashref = $searchable->fetch_field_values( $doc_num, \@field_names ); + +Given a document number and an arrayref of field names, return the values of +all named stored fields as a hashref. + +=end comment +=cut + +sub fetch_field_values { shift->abstract_death } + +=begin comment + my $doc_freq = $searchable->doc_freq($term); Return the number of documents which contain this Term. 
Used for calculating Index: lib/KinoSearch/Index/MultiReader.pm =================================================================== --- lib/KinoSearch/Index/MultiReader.pm (revision 921) +++ lib/KinoSearch/Index/MultiReader.pm (working copy) @@ -76,6 +76,13 @@ return $self->{sub_readers}[$reader_index]->fetch_doc($doc_num); } +sub fetch_field_values { + my ( $self, $doc_num, $fields_wanted ) = @_; + my $reader_index = $self->_reader_index($doc_num); + $doc_num -= $self->{starts}[$reader_index]; + return $self->{sub_readers}[$reader_index]->fetch_field_values( $doc_num, $fields_wanted ); +} + sub delete_docs_by_term { my ( $self, $term ) = @_; $_->delete_docs_by_term($term) for @{ $self->{sub_readers} }; Index: lib/KinoSearch/Index/FieldsReader.pm =================================================================== --- lib/KinoSearch/Index/FieldsReader.pm (revision 921) +++ lib/KinoSearch/Index/FieldsReader.pm (working copy) @@ -95,6 +95,47 @@ return $doc; } +# Return a doc's field values as a hashref, avoiding the OO overhead of +# fetch_doc. +sub fetch_field_values { + my ( $self, $doc_num, $fields_wanted ) = @_; + my ( $finfos, $findex_stream, $fdata_stream ) + = @{$self}{qw( finfos findex_stream fdata_stream )}; + my %field_values; + my %desired_fields; + if ( !defined $fields_wanted or !@$fields_wanted ) { + for my $finfo ( $finfos->get_infos ) { + $desired_fields{ $finfo->get_field_num } = $finfo->get_name; + } + } + else { + for my $field_name (@$fields_wanted) { + my $finfo = $finfos->info_by_name($field_name); + $desired_fields{ $finfo->get_field_num } = $field_name; + } + } + + # get data file pointer from index and seek the data file instream + $findex_stream->seek( $doc_num * 8 ); + my $start = $findex_stream->lu_read('Q'); + $fdata_stream->seek($start); + + # retrieve one doc's worth of field data + my $num_fields = $fdata_stream->lu_read('V'); + for ( 1 .. 
$num_fields ) { + my ( $field_num, $bits, $content, $tv_string ) + = $fdata_stream->lu_read('VaTT'); + next unless exists $desired_fields{$field_num}; + my $field_name = $desired_fields{$field_num}; + $field_values{$field_name} = + ( $bits & COMPRESSED ) eq COMPRESSED + ? uncompress($content) + : $content; + } + + return \%field_values; +} + sub decode_fdt_bits { my ( undef, $field, $bits ) = @_; $field->set_analyzed( ( $bits & ANALYZED ) eq ANALYZED ); Index: lib/KinoSearch/Index/IndexReader.pm =================================================================== --- lib/KinoSearch/Index/IndexReader.pm (revision 921) +++ lib/KinoSearch/Index/IndexReader.pm (working copy) @@ -167,6 +167,18 @@ =begin comment + my $hashref = $reader->fetch_field_values( $doc_num, \@field_names ); + +Given a document number and an arrayref of field names, return the values of +all named stored fields as a hashref. + +=end comment +=cut + +sub fetch_field_values { shift->abstract_death } + +=begin comment + my $infos = $reader->generate_field_infos; Return a new FieldInfos object, describing all the fields held by the reader. 
Index: lib/KinoSearch/Index/SegReader.pm =================================================================== --- lib/KinoSearch/Index/SegReader.pm (revision 921) +++ lib/KinoSearch/Index/SegReader.pm (working copy) @@ -179,6 +179,11 @@ $_[0]->{fields_reader}->fetch_doc( $_[1] ); } +sub fetch_field_values { + my $self = shift; + return $self->{fields_reader}->fetch_field_values(@_); +} + sub segreaders_to_merge { my ( $self, $all ) = @_; return $self if $all; Index: lib/KinoSearch/Searcher.pm =================================================================== --- lib/KinoSearch/Searcher.pm (revision 921) +++ lib/KinoSearch/Searcher.pm (working copy) @@ -133,6 +133,12 @@ } sub fetch_doc { $_[0]->{reader}->fetch_doc( $_[1] ) } + +sub fetch_field_values { + my $self = shift; + return $self->{reader}->fetch_field_values(@_); +} + sub max_doc { shift->{reader}->max_doc } sub doc_freq {

Fri May 12 22:18:03 2006 CREAMYG [...] cpan.org - Taken

Tue Mar 20 16:13:17 2007 CREAMYG [...] cpan.org - Fixed in 0.20_02 added

Tue Mar 20 19:58:11 2007 CREAMYG [...] cpan.org - Correspondence added

The 0.20 branch of KS eliminates both the Doc and Field classes, and FieldsReader's replacement DocReader reads retrieved field values directly into the hash.

Tue Mar 20 19:58:18 2007 CREAMYG [...] cpan.org - Status changed from 'open' to 'resolved'

Bug #19233 for KinoSearch: Optimize Hits::fetch_hit_hashref