Subject: | [PATCH] Fix patterns like [^.], [\x00-\xFF], [\da-z] and much more specials inside [] |
I've found a problems with negative patterns and complex character notaions.
When I pass for ex [^.] I got [.^] on output, which have almost opposite
meaning ;)
I've tried to cover such a cases.
For now I've added a compression only for simple ranges, and tried to
leave others as is, to keep behavior. I think it's better not compress,
than break.
Some of complex variants could be compressed more (for ex like
[\x00-\x90\x91-\xff]), but I left this as TODO
In the patch attached there are fixes to version 0.01 and I've added a
lot of tests.
Subject: | regexp-assemble-compressed.patch |
diff -r -U5 Regexp-Assemble-Compressed-0.01/lib/Regexp/Assemble/Compressed.pm Regexp-Assemble-Compressed-patched/lib/Regexp/Assemble/Compressed.pm
--- Regexp-Assemble-Compressed-0.01/lib/Regexp/Assemble/Compressed.pm 2009-04-10 18:22:41.000000000 +0400
+++ Regexp-Assemble-Compressed-patched/lib/Regexp/Assemble/Compressed.pm 2010-06-27 04:01:40.000000000 +0400
@@ -1,38 +1,94 @@
package Regexp::Assemble::Compressed;
use strict;
use warnings;
-our $VERSION = '0.01';
+our $VERSION = '0.02';
use base qw(Regexp::Assemble);
+# Note: maybe handle \U,\L more smartly
+our $char = qr/
+ (?:\\u|\\l|) # \u, \l acts on one char or char group
+ (?:
+ \\Q.+?\\E # capture \Q..\E completely
+ | \[:[^:]+:\] # posix char class
+ | \\[UL].+?(?:\\E|\Z) # capture \U..\E and \L..\E completely
+ | \\x(?:\{[\dA-Fa-f]+\}|[\dA-Fa-f]{1,2}) # \x.. or \x{...}
+ | \\\d{1,3} # \000 - octal
+ | \\N\{[^{]+\} # unicode char
+ | \\[pP]\{[^{]+\} # unicode character class
+ | \\c. # control char \cX
+ | \\. # \t \n \s ...
+ | . # any char
+ )
+/xo;
+
sub as_string {
my $self = shift;
my $string = $self->SUPER::as_string;
- $string =~ s{(?<!\\)\[(.+?)(?<!\\)\]}{ "[" . _compress($1) . "]" }eg;
+ $string =~ s{(?<!\\)\[(\^|)((?:\[:[^:]+:\]|.)+?)(?<!\\)\]}{ "[" . $1 . _compress($2) . "]" }eg;
return $string;
}
sub _compress {
my $string = shift;
- my @characters = sort split //, $string;
+ my @characters = sort $string =~ m{ ( $char\-$char | $char ) }sgx;
+ #warn "[ ".join('|', @characters)." ]";
my @stack = ();
my @skipped = ();
my $last;
for my $char (@characters) {
- my $num = ord $char;
- if ($last && $num - $last == 1) {
- push @skipped, $char;
+ if ( length($char) == 1 ) {
+ my $num = ord $char;
+ if (defined $last and $num - $last == 0) { next }
+ if (defined $last and @skipped and $num >= ord $skipped[0] and $num <= ord $skipped[-1]) { next }
+ if (defined $last and $num - $last == 1) {
+ push @skipped, $char;
+ $last = $num;
+ next;
+ }
+ elsif (@skipped) {
+ push @stack, @skipped < 2 ? @skipped : ('-', $skipped[-1]);
+ @skipped = ();
+ }
+ push @stack, $char;
$last = $num;
- next;
}
- elsif (@skipped) {
- push @stack, @skipped < 2 ? @skipped : ('-', $skipped[-1]);
- @skipped = ();
+ elsif (length $char == 3 and $char =~ /^([^\\])-([^\\])$/) {
+ my ($beg,$end) = ($1,$2);
+ my $num = ord $beg;
+ my $enn = ord $end;
+ if (defined $last and @skipped and $num + 1 >= ord $skipped[0] and $num <= ord $skipped[-1]) {
+ if ($enn <= ord $skipped[-1]) { next }
+ else {
+ my $next = $skipped[-1];
+ ++$next;
+ push @skipped, $next..$end;
+ $last = $enn;
+ next;
+ }
+ }
+ if (defined $last and $num - $last == 1) {
+ push @skipped, $beg..$end;
+ $last = $enn;
+ next;
+ }
+ elsif (@skipped) {
+ push @stack, @skipped < 2 ? @skipped : ('-', $skipped[-1]);
+ @skipped = ();
+ }
+ push @stack, $beg;
+ push @skipped, ++$beg..$end;
+ $last = $enn;
+ }
+ else {
+ if (@skipped) {
+ push @stack, @skipped < 2 ? @skipped : ('-', $skipped[-1]);
+ @skipped = ();
+ }
+ push @stack, $char;
}
- push @stack, $char;
- $last = $num;
}
if (@skipped) {
push @stack, @skipped < 2 ? @skipped : ('-', $skipped[-1]);
}
return join '', @stack;
diff -r -U5 Regexp-Assemble-Compressed-0.01/t/01_assemble.t Regexp-Assemble-Compressed-patched/t/01_assemble.t
--- Regexp-Assemble-Compressed-0.01/t/01_assemble.t 2009-04-10 18:22:22.000000000 +0400
+++ Regexp-Assemble-Compressed-patched/t/01_assemble.t 2010-06-27 04:01:47.000000000 +0400
@@ -1,22 +1,100 @@
use strict;
-use Test::More tests => 4;
+use Test::More tests => 22;
use_ok 'Regexp::Assemble::Compressed';
my $ra = Regexp::Assemble::Compressed->new;
-for my $i (0 .. 9) {
- $ra->add($i)
-}
-ok($ra->as_string, '[0-9]');
-$ra->reset;
+# This is done by Regexp::Assemble.
+#for my $i (0 .. 9) {
+# $ra->add($i)
+#}
+#is($ra->as_string, '\d', '[0-9]');
+#$ra->reset;
+
for my $i ('a' .. 'z') {
$ra->add($i);
}
-ok($ra->as_string, '[a-z]');
+is($ra->as_string, '[a-z]', '[a-z]');
$ra->reset;
-for my $i ('a' .. 'z', '0' .. '9', 'A' .. 'Z') {
+for my $i ('a' .. 'z', 'A' .. 'Z') {
$ra->add($i);
}
-ok($ra->as_string, '[0-9A-Za-z]');
+is($ra->as_string, '[A-Za-z]', '[A-Za-z]');
+
+$ra->reset;
+$ra->add('[^.]+');
+is($ra->as_string, '[^.]+', '[^.]+');
+
+$ra->reset;
+$ra->add('[\dabcdef]+');
+is($ra->as_string, '[\da-f]+', '[\da-f]+');
+
+$ra->reset;
+$ra->add('[a-fg-z]+');
+is($ra->as_string, '[a-z]+', '[a-z]+');
+
+$ra->reset;
+$ra->add('[a-fgh-z]+');
+is($ra->as_string, '[a-z]+', '[a-z]+');
+
+$ra->reset;
+$ra->add('[a-fh-z]+');
+is($ra->as_string, '[a-fh-z]+', '[a-fh-z]+');
+
+$ra->reset;
+$ra->add('[a-bd-z]+');
+is($ra->as_string, '[abd-z]+', '[a-bd-z]+');
+
+$ra->reset;
+$ra->add('[\x00-\xff]+');
+is($ra->as_string, '[\x00-\xff]+', '[\x00-\xff]+');
+
+$ra->reset;
+$ra->add('[\Ua-z]+');
+is($ra->as_string, '[\Ua-z]+', '[\Ua-z]+');
+
+$ra->reset;
+$ra->add('[\Ud-ha-c]+');
+is($ra->as_string, '[\Ud-ha-c]+', '[\Ud-ha-c]+');
+
+$ra->reset;
+$ra->add('[\ua-cd-h]+');
+is($ra->as_string, '[\ua-cd-h]+', '[\ua-cd-h]+');
+
+$ra->reset;
+$ra->add('[\-a]+');
+is($ra->as_string, '[\-a]+', '[\-a]+');
+
+$ra->reset;
+$ra->add('[\Ua-z\E!\U0-9\E]+');
+is($ra->as_string, '[!\U0-9\E\Ua-z\E]+', '[\Ua-z\E!\U0-9\E]+');
+
+$ra->reset;
+$ra->add('[a-z\p{QuotationMark}]+');
+is($ra->as_string, '[\p{QuotationMark}a-z]+', '[a-z\p{...}]+');
+
+$ra->reset;
+$ra->add('[a-za-z]+');
+is($ra->as_string, '[a-z]+', '[a-z]+');
+
+$ra->reset;
+$ra->add('[a-za-a]+');
+is($ra->as_string, '[a-z]+', '[a-za-a]+');
+
+$ra->reset;
+$ra->add('[a-zb-c]+');
+is($ra->as_string, '[a-z]+', '[a-za-a]+');
+
+$ra->reset;
+$ra->add('[a-db-h]+');
+is($ra->as_string, '[a-h]+', '[a-db-h]+');
+
+$ra->reset;
+$ra->add('[aaa]+');
+is($ra->as_string, '[a]+', '[aaa]+');
+
+$ra->reset;
+$ra->add('[1[:alpha:]234]+');
+is($ra->as_string, '[1-4[:alpha:]]+', '[1[:alpha:]234]+');