Subject: | Add the option to keep the source document's whitespace. |
This adds a boolean "whitespace" option (off by default) that keeps the
source document's whitespace in the tokenizer output, so that the structure
of the source can be reproduced accurately in the destination.
This makes debugging problems much easier.
Subject: | RTF-Tokenizer-1.13-whitespace.patch |
--- Tokenizer.pm.bak 2012-08-29 12:21:04.000000000 -0400
+++ Tokenizer.pm 2012-08-29 12:25:27.000000000 -0400
@@ -86,6 +86,9 @@
but this may change in future releases.
+C<whitespace> - boolean - whether to keep line breaks in text tokens and
+the whitespace that follows control words in the output, defaults to false
+
=cut
sub new {
@@ -117,6 +120,7 @@
# Set up final config stuff
$self->{_NOTE_ESCAPES} = $config{'note_escapes'};
$self->{_SLOPPY} = $config{'sloppy'};
+ $self->{_WHITESPACE} = $config{'whitespace'};
return $self;
@@ -308,8 +312,11 @@
my $start_character = substr( $self->{_BUFFER}, 0, 1, '' );
+ my $non_text = '\\\\{}';
+ $non_text .= '\\r\\n' unless $self->{_WHITESPACE};
+
# Most likely to be text, so we check for that first
- if ( $start_character =~ /[^\\{}\r\n]/ ) {
+ if ( $start_character =~ /[^$non_text]/ ) {
local ($^W); # Turn off warnings here ('uninitialized')
# We want to return text fields that have newlines in as one
@@ -334,9 +341,11 @@
goto READTEXT if $self->{_BUFFER};
}
- # Make sure we're not including newlines in our output,
- # as RTF spec says they're to be ignored...
- $temp_text =~ s/(\cM\cJ|\cM|\cJ)//g;
+ unless ( $self->{_WHITESPACE} ) {
+ # Make sure we're not including newlines in our output,
+ # as RTF spec says they're to be ignored...
+ $temp_text =~ s/(\cM\cJ|\cM|\cJ)//g;
+ }
# Give the user a shiny token back
return ( 'text', $start_character . $temp_text, '' );
@@ -473,6 +482,14 @@
sub _grab_control {
my $self = shift;
+ my $whitespace = '\\s';
+
+ if ( $self->{_WHITESPACE} ) {
+ $whitespace .= '*';
+ } else {
+ $whitespace = '?:' . $whitespace;
+ }
+
# Check for a star here, as it simplifies our regex below,
# and it occurs pretty often
if ( $self->{_BUFFER} =~ s/^\*// ) {
@@ -485,7 +502,7 @@
^([a-z]{1,32}) # Lowercase word
(-?\d+)? # Optional signed number
(
- ?:\s # Either whitespace, which we gobble
+ $whitespace # Either whitespace, which we *may* include
|
(?=[^a-z0-9])) # or a non alpha-numeric, which we leave
@@ -494,6 +511,7 @@
# Return the control word, unless it's a \bin
my $param = '';
$param = $2 if defined($2);
+ $param .= $3 if defined($3);
return ( $1, $param ) unless $1 eq 'bin';
# Pre-grab the binary data, and return the control word
@@ -596,4 +614,4 @@
=cut
-1;
\ No newline at end of file
+1;