Subject: | Feature suggestion: filter_null |
I've added a --filter_null option (short form: -0) to help with 16-bit
encodings, where every second byte is usually a \0 character.
Subject: | fixlatin_nonnull.patch |
--- Encoding-FixLatin-1.02/lib/Encoding/FixLatin.pm.orig 2010-05-01 04:04:05.000000000 +0200
+++ Encoding-FixLatin-1.02/lib/Encoding/FixLatin.pm 2010-11-30 23:04:57.000000000 +0100
@@ -5,7 +5,7 @@ use strict;
require 5.008;
-our $VERSION = '1.02';
+our $VERSION = '1.021';
use Carp qw(croak);
use Exporter qw(import);
@@ -17,6 +17,8 @@ our @EXPORT_OK = qw(fix_latin);
my $byte_map;
my $ascii_str = qr{\A([\x00-\x7F]+)(.*)\z}s;
+my $ascii_str_nonnull = qr{\A([\x01-\x7F]+)(.*)\z}s;
+my $ascii_str_print = qr{\A([\t\r\n\x20-\x7E]+)(.*)\z}s;
my $cont_byte = '[\x80-\xBF]';
my $utf8_2 = qr{\A([\xC0-\xDF])($cont_byte)(.*)\z}s;
@@ -24,7 +26,7 @@ my $utf8_3 = qr{\A([\xE0-\xEF])($cont
my $utf8_4 = qr{\A([\xF0-\xF7])($cont_byte)($cont_byte)($cont_byte)(.*)\z}s;
my $utf8_5 = qr{\A([\xF8-\xFB])($cont_byte)($cont_byte)($cont_byte)($cont_byte)(.*)\z}s;
-my %known_opt = map { $_ => 1 } qw(bytes_only ascii_hex overlong_fatal);
+my %known_opt = map { $_ => 1 } qw(bytes_only ascii_hex overlong_fatal filter_nonprint filter_null);
my %non_1252 = (
"\x81" => '%81',
@@ -40,6 +42,8 @@ sub fix_latin {
ascii_hex => 1,
bytes_only => 0,
overlong_fatal => 0,
+ filter_nonprint => 0,
+ filter_null => 0,
@_
);
@@ -49,6 +53,9 @@ sub fix_latin {
return unless defined($input);
_init_byte_map(\%opt) unless $byte_map;
+ $byte_map->{pack('C', 0)} = '' if $opt{filter_null};
+ $ascii_str = $ascii_str_nonnull if $opt{filter_null};
+ $ascii_str = $ascii_str_print if $opt{filter_nonprint};
if(is_utf8($input)) { # input string already has utf8 flag set
if($opt{bytes_only}) {
@@ -305,6 +312,15 @@ use eval to trap and handle this scenari
There is a strong argument that overlong sequences are only ever encountered
in malicious input and therefore they should always be rejected.
+=item filter_null => 1/0
+
+This option removes any occurrence of the binary NUL character ("\0") from the input.
+This helps make files readable that use a fixed-width 16-bit encoding like UCS-2.
+Default is 0, which retains NUL characters.
+
+Please note that data corruption is to be expected if a UCS-2 encoded input
+actually uses code points above decimal 255, regardless of how this option is set.
+
=back
=head1 LIMITATIONS OF THIS MODULE
--- Encoding-FixLatin-1.02/script/fix_latin.orig 2010-04-30 12:22:53.000000000 +0200
+++ Encoding-FixLatin-1.02/script/fix_latin 2010-11-30 22:55:53.000000000 +0100
@@ -24,14 +24,15 @@ use Encoding::FixLatin qw(fix_latin);
my %opt;
-if(!GetOptions(\%opt, 'help|?')) {
+if(!GetOptions(\%opt, 'help|?', 'filter_null|0')) {
pod2usage(-exitval => 1, -verbose => 0);
}
+my $always = defined($opt{filter_null});
pod2usage(-exitstatus => 0, -verbose => 2) if($opt{'help'});
while(<>) {
- $_ = fix_latin($_, bytes_only => 1) if /[^\x00-\x7f]/;
+ $_ = fix_latin($_, bytes_only => 1, %opt) if $always or /[^\x00-\x7f]/;
print;
}
@@ -52,6 +53,8 @@ latin (ie: non-ASCII 8 bit) characters
-? detailed help message
+ -0 strip NUL bytes
+
=head1 DESCRIPTION
The script acts as a filter, taking source data which may contain a mix of
@@ -62,7 +65,7 @@ Multi-byte UTF8 characters will be passe
UTF8 byte sequences will be converted to the shortest normal form). Single
byte characters will be converted as follows:
- 0x00 - 0x7F ASCII - passed through unchanged
+ 0x00 - 0x7F ASCII - passed through unchanged (except 0x00, which is stripped under option -0)
0x80 - 0x9F Converted to UTF8 using CP1252 mappings
0xA0 - 0xFF Converted to UTF8 using Latin-1 mappings
@@ -74,6 +77,14 @@ byte characters will be converted as fol
Display this documentation.
+=item B<-0>
+
+=item B<--filter_null>
+
+Strip binary NUL characters ('\0') from the input. This helps make UCS-2 files readable.
+Note that with or without -0, data corruption is to be expected if your UCS-2
+file uses code points above 255.
+
=back
=head1 EXAMPLES