Just in case it's useful for anyone else with the same problem, here's a
patch I've been using that fixes the issue with non-BMP Unicode characters.
Best regards
Tomaž
Index: Lucene-0.18/cpp/utils.cpp
===================================================================
--- Lucene-0.18.orig/cpp/utils.cpp 2012-07-12 12:54:06.000000000 +0200
+++ Lucene-0.18/cpp/utils.cpp 2012-07-12 13:26:55.000000000 +0200
@@ -63,7 +63,7 @@
// Alloc memory for wide char string. This could be a bit more
// then necessary.
- Newz(0, ret, arg_len + 1, wchar_t);
+ Newz(0, ret, arg_len*2 + 1, wchar_t);
U8* src = (U8*) SvPV_nolen(arg);
wchar_t* dst = ret;
@@ -72,7 +72,17 @@
// UTF8 to wide char mapping
STRLEN len;
while (*src) {
- *dst++ = utf8_to_uvuni(src, &len);
+ wchar_t uni = utf8_to_uvuni(src, &len);
+ // Lucene only stores lower 16 bits of
+ // wchar_t, so we have to encode non-BMP
+ // characters as surrogate pairs.
+ if(uni > 0xffff) {
+ uni -= 0x10000;
+ *dst++ = 0xd800 + (uni >> 10);
+ *dst++ = 0xdc00 + (uni & 0x03ff);
+ } else {
+ *dst++ = uni;
+ }
src += len;
}
} else {
@@ -97,7 +107,13 @@
d = dst;
while (*src) {
- d = uvuni_to_utf8(d, *src++);
+ wchar_t uni = *src++;
+ // decode surrogate pairs
+ if ((uni & 0xfc00) == 0xd800) {
+ if (!(*src)) break;
+ uni = 0x10000 + ((uni & 0x03ff) << 10) | ((*src++) & 0x03ff);
+ }
+ d = uvuni_to_utf8(d, uni);
}
*d = 0;
Index: Lucene-0.18/t/non-bmp-characters.t
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ Lucene-0.18/t/non-bmp-characters.t 2012-07-12 13:29:18.000000000 +0200
@@ -0,0 +1,36 @@
+#!perl -T
+
+# Regression test: a field value containing a non-BMP character (a code
+# point above U+FFFF) must survive both the in-memory Document round trip
+# and a write/read cycle through an on-disk index.
+
+use strict;
+use warnings;
+
+use Test::More tests => 2;
+use File::Temp qw/tempdir/;
+
+use Lucene;
+
+# "\x{...}" escapes yield character strings on their own, so no source
+# encoding pragma is needed. The deprecated "use encoding" pragma was
+# dropped because it is a fatal error on perl 5.26 and later.
+my $non_bmp = "\x{01d5c4}";
+
+my $dir = tempdir(CLEANUP => 1);
+
+my $analyzer = Lucene::Analysis::Standard::StandardAnalyzer->new();
+my $store = Lucene::Store::FSDirectory->getDirectory($dir, 1);
+
+my $tmp_writer = Lucene::Index::IndexWriter->new($store, $analyzer, 1);
+my $doc = Lucene::Document->new;
+$doc->add(Lucene::Document::Field->Text("foo", $non_bmp));
+
+is($doc->get("foo"), $non_bmp, 'non-BMP value round-trips through Document');
+$tmp_writer->addDocument($doc);
+$tmp_writer->close;
+undef $tmp_writer;
+
+my $reader = Lucene::Index::IndexReader->open($store);
+my $document = $reader->document(0);
+is($document->get("foo"), $non_bmp, 'non-BMP value round-trips through the index');