Hi,
Thanks for releasing this module.
I don't think you want to "_sanitize" field values. BibTeX can contain
any (La)TeX sequence to enable complex characters to be constructed e.g.
for u-umlaut you use "M{\"u}ller", so just stripping braces and
de-escaping slashed values won't give the correct result. This also
impacts on name parsing this correctly: "{Barnes and Noble, Inc.}".
Essentially you need to do token-parsing on names as well ...
I think Text::Balanced is a bit of a memory hog (at least for the
functionality gained ...). This patch will remove the requirement for
that module:
--- Parser.pm.1 2009-07-19 16:44:40.000000000 +0100
+++ Parser.pm 2009-07-21 16:22:08.000000000 +0100
@@ -5,8 +5,6 @@
our $VERSION = '0.3';
-use Text::Balanced qw(extract_bracketed extract_delimited);
-
use BibTeX::Parser::Entry;
=for stopwords jr von
@@ -239,7 +237,7 @@
{ # quoted string with embeded escapes
$value .= $1;
} else {
- my $part = ( extract_bracketed( $_, "{}" ) )[0];
+ my $part = extract_bracketed( $_ );
$value .= substr $part, 1, length($part) - 2; # strip quotes
}
@@ -251,4 +249,23 @@
return $value;
}
-1; # End of BibTeX::Parser
\ No newline at end of file
+sub _extract_bracketed
+{
+ for($_[0]) # alias to $_
+ {
+ /\G\s+/cg;
+ my $start = pos($_);
+ my $depth = 0;
+ while(1)
+ {
+ /\G\\./cg && next;
+ /\G\{/cg && (++$depth, next);
+ /\G\}/cg && (--$depth > 0 ? next : last);
+ /\G([^\{\}]+)/cg && next;
+ last; # end of string
+ }
+ return substr($_, $start, pos($_)-$start);
+ }
+}
+
+1; # End of BibTeX::Parser