Bug #60688 for Lucene: Incorrectly stores non-BMP characters

Tue Aug 24 08:34:30 2010 tomaz.solc [...] tablix.org - Ticket created

Subject:

Incorrectly stores non-BMP characters

Hi. Documents containing Unicode characters outside the BMP (Basic Multilingual Plane, i.e. code points > U+FFFF) get mangled after being stored in and retrieved from the index. See the attached test case. For instance, the character U+1D5C4 gets truncated to U+D5C4. Wide-character strings used by Perl should probably be UTF-16 encoded before being passed on to CLucene functions. Regards, Tomaž

Subject:

clucene_non_bmp_bug.pl

# Reproduction script: store a single non-BMP character (U+1D5C4) in a Lucene
# index, read it back, and print the value before and after the round trip.
# The reported symptom is that the second line shows U+D5C4 instead.
use strict;
use warnings;

use Lucene;

# The original used "use encoding 'utf-8'", which is deprecated since
# Perl 5.18 and no longer supported in its default mode as of Perl 5.26.
# The only effect this script needs from it is UTF-8 output of the wide
# character, so set the layer on STDOUT explicitly instead.
binmode(STDOUT, ':encoding(UTF-8)')
    or die "Cannot set UTF-8 layer on STDOUT: $!";

my $analyzer = Lucene::Analysis::Standard::StandardAnalyzer->new();

# Index lives in the directory "test" relative to the current directory.
# NOTE(review): the trailing 1 passed to getDirectory() and to the
# IndexWriter constructor is presumably a "create" flag -- confirm against
# the Lucene binding's documentation.
my $store = Lucene::Store::FSDirectory->getDirectory("test", 1);

my $tmp_writer = Lucene::Index::IndexWriter->new($store, $analyzer, 1);

my $doc = Lucene::Document->new;
$doc->add(Lucene::Document::Field->Text("foo", "\x{01d5c4}"));

# Value as held by the in-memory document, before it is written to the index.
print "Correct: ", $doc->get("foo"), "\n";

$tmp_writer->addDocument($doc);
$tmp_writer->close;
undef $tmp_writer;

# Value after reading the same document back from the index.
my $reader   = Lucene::Index::IndexReader->open($store);
my $document = $reader->document(0);
print "Wrong: ", $document->get("foo"), "\n";

Fri Jul 27 15:18:31 2012 tomaz.solc [...] tablix.org - Correspondence added

Just in case it's useful for anyone else with the same problem, here's a patch I've been using that fixes the issue with non-BMP Unicode characters. Best regards, Tomaž

Subject:

lucene_non_bmp_unicode_support.patch

# Patch against Lucene-0.18: makes non-BMP code points survive the
# Perl <-> wchar_t conversion in cpp/utils.cpp by using surrogate pairs, and
# adds a regression test, t/non-bmp-characters.t.
#
# cpp/utils.cpp, hunk at -63: the wchar_t buffer allocation grows from
#   arg_len + 1 to arg_len*2 + 1 elements, leaving room for code points that
#   now take two units.
# cpp/utils.cpp, hunk at -72: a code point returned by utf8_to_uvuni() that is
#   > 0xffff has 0x10000 subtracted and is written as two units,
#   0xd800 + (upper 10 bits) followed by 0xdc00 + (lower 10 bits); anything
#   else is written as a single unit, as before.
# cpp/utils.cpp, hunk at -97: a unit with (u & 0xfc00) == 0xd800 is merged
#   with the following unit into one code point before uvuni_to_utf8(); the
#   loop breaks if the string ends right after such a unit.
# t/non-bmp-characters.t: stores "\x{01d5c4}" in a field, and checks the value
#   both on the in-memory document and after reading it back from the index.
#
# NOTE(review): in the -72 hunk `uni` is declared wchar_t; on a platform where
#   wchar_t is 16 bits wide the `uni > 0xffff` test could never be true --
#   confirm the supported platforms.
# NOTE(review): in the -97 hunk the expression `0x10000 + (...) | (...)` groups
#   as `(0x10000 + (...)) | (...)`; the result is still the intended value
#   because the low 10 bits of the sum are zero, but explicit parentheses
#   would make that obvious.
# NOTE(review): the -97 hunk does not check that the unit following a high
#   surrogate is in the 0xdc00-0xdfff range before combining -- confirm whether
#   malformed input can reach this code.
Index: Lucene-0.18/cpp/utils.cpp =================================================================== --- Lucene-0.18.orig/cpp/utils.cpp 2012-07-12 12:54:06.000000000 +0200 +++ Lucene-0.18/cpp/utils.cpp 2012-07-12 13:26:55.000000000 +0200 @@ -63,7 +63,7 @@ // Alloc memory for wide char string. This could be a bit more // then necessary. - Newz(0, ret, arg_len + 1, wchar_t); + Newz(0, ret, arg_len*2 + 1, wchar_t); U8* src = (U8*) SvPV_nolen(arg); wchar_t* dst = ret; @@ -72,7 +72,17 @@ // UTF8 to wide char mapping STRLEN len; while (*src) { - *dst++ = utf8_to_uvuni(src, &len); + wchar_t uni = utf8_to_uvuni(src, &len); + // Lucene only stores lower 16 bits of + // wchar_t, so we have to encode non-BMP + // characters as surrogate pairs. + if(uni > 0xffff) { + uni -= 0x10000; + *dst++ = 0xd800 + (uni >> 10); + *dst++ = 0xdc00 + (uni & 0x03ff); + } else { + *dst++ = uni; + } src += len; } } else { @@ -97,7 +107,13 @@ d = dst; while (*src) { - d = uvuni_to_utf8(d, *src++); + wchar_t uni = *src++; + // decode surrogate pairs + if ((uni & 0xfc00) == 0xd800) { + if (!(*src)) break; + uni = 0x10000 + ((uni & 0x03ff) << 10) | ((*src++) & 0x03ff); + } + d = uvuni_to_utf8(d, uni); } *d = 0; Index: Lucene-0.18/t/non-bmp-characters.t =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ Lucene-0.18/t/non-bmp-characters.t 2012-07-12 13:29:18.000000000 +0200 @@ -0,0 +1,26 @@ +#!perl -T + +use Test::More tests => 2; +use File::Temp qw/tempdir/; + +use Lucene; +use encoding 'utf-8'; +use strict; + +my $dir = tempdir(CLEANUP => 1); + +my $analyzer = new Lucene::Analysis::Standard::StandardAnalyzer(); +my $store = Lucene::Store::FSDirectory->getDirectory($dir, 1); + +my $tmp_writer = new Lucene::Index::IndexWriter($store, $analyzer, 1); +my $doc = new Lucene::Document; +$doc->add(Lucene::Document::Field->Text("foo", "\x{01d5c4}")); + +is($doc->get("foo"), "\x{01d5c4}"); +$tmp_writer->addDocument($doc); 
+$tmp_writer->close; +undef $tmp_writer; + +my $reader = Lucene::Index::IndexReader->open($store); +my $document = $reader->document(0); +is($document->get("foo"), "\x{01d5c4}");

Fri Jul 27 15:18:32 2012 tomaz.solc [...] tablix.org - Status changed from 'new' to 'open'