Subject: | Faster `sdiff` based on `compact_diff` for Algorithm::Diff::XS |
Date: | Fri, 25 Oct 2013 21:56:00 +0200 |
To: | bug-Algorithm-Diff-XS [...] rt.cpan.org |
From: | Bjoern Hoehrmann <derhoermi [...] gmx.net> |
Hi,
As noted in http://search.cpan.org/dist/Algorithm-Diff-XS v0.04, "only
the LCSidx function is optimized in XS at the moment, which means only
compact_diff will get significantly faster for large data sets, while
diff and sdiff will run in identical speed as Algorithm::Diff." Now the
`sdiff` function can be implemented on top of `compact_diff`. The code
below does that, including verification against random inputs by using
the `Algorithm::Diff::sdiff` function and comparing the result. Please
feel free to use the code under the same terms as Algorithm::Diff::XS. I
have seen speedups of an order of magnitude and greater.
#!perl -w
use Modern::Perl;
use Algorithm::Diff::XS;
use Test::More;
use Data::Random qw/rand_chars/;
use POSIX qw//;
sub fast_sdiff {
my $x = shift;
my $y = shift;
my $keyfunc = shift;
my @keyargs = @_;
my @cdiff;
if ($keyfunc) {
my @dx = map { $keyfunc->($_, @keyargs) } @$x;
my @dy = map { $keyfunc->($_, @keyargs) } @$y;
@cdiff = Algorithm::Diff::XS::compact_diff(\@dx, \@dy);
} else {
@cdiff = Algorithm::Diff::XS::compact_diff($x, $y);
}
_compact_diff_to_sdiff($x, $y, @cdiff);
}
sub _compact_diff_to_sdiff {
my ($a, $b, @cdiff) = @_;
my $MIN = -(POSIX::DBL_MAX);
my @temp;
my $add = sub {
my ($op, $ax, $bx, $count) = @_;
push @temp, [$op, $ax + $_, $bx + $_] for 0 .. $count - 1;
};
for (my $ix = 0; $ix < @cdiff - 2; $ix += 2) {
my ($a_from, $b_from, $a2_from, $b2_from) =
@cdiff[ $ix .. $ix + 3 ];
my $a_len = ($a2_from - 1) - $a_from + 1;
my $b_len = ($b2_from - 1) - $b_from + 1;
if ($ix & 2) {
# modified
if ($a_from == $a2_from) {
# addition
$add->('+', $MIN, $b_from, $b_len);
} elsif ($b_from == $b2_from) {
# removal
$add->('-', $a_from, $MIN, $a_len);
} else {
# change
if ($a_len < $b_len) {
$add->('c', $a_from, $b_from, $a_len);
$add->('+', $MIN, $b_from + $a_len, $b_len - $a_len);
} elsif ($a_len > $b_len) {
$add->('c', $a_from, $b_from, $b_len);
$add->('-', $a_from + $b_len, $MIN, $a_len - $b_len);
} else {
$add->('c', $a_from, $b_from, $a_len);
}
}
} else {
# unchanged
$add->('u', $a_from, $b_from, $a_len);
}
}
$_->[1] = $_->[1] >= 0 ? $a->[$_->[1]] : '' for @temp;
$_->[2] = $_->[2] >= 0 ? $b->[$_->[2]] : '' for @temp;
@temp;
}
for (0 .. 1000) {
my @x = map { rand_chars(set => 'all', max => 26) } 0 .. 2;
my @y = map { rand_chars(set => 'all', max => 26) } 0 .. 2;
my @sdiff = fast_sdiff(\@x, \@y);
my @sdiff_orig = Algorithm::Diff::XS::sdiff(\@x, \@y);
is_deeply(\@sdiff, \@sdiff_orig,
'fast_sdiff(...) == sdiff(...)');
}
regards,
--
Björn Höhrmann · mailto:bjoern@hoehrmann.de · http://bjoern.hoehrmann.de
Am Badedeich 7 · Telefon: +49(0)160/4415681 · http://www.bjoernsworld.de
25899 Dagebüll · PGP Pub. KeyID: 0xA4357E78 · http://www.websitedev.de/