Subject: | Ignore undef values in @data |
If @data contains undef values, then the statistics returned are incorrect.
This patch changes min, max, range, mean, mode, median, sum and count to ignore undef
values. The patch also contains test cases to verify that the changes are correct.
If this patch is accepted, please credit Opsera Limited for the fixes.
Thanks for a nice module!
Subject: | stats.diff |
diff -ur Statistics-Lite-3.2.original/Lite.pm Statistics-Lite-3.2/Lite.pm
--- Statistics-Lite-3.2.original/Lite.pm 2007-06-24 18:13:48.000000000 +0000
+++ Statistics-Lite-3.2/Lite.pm 2009-09-23 01:50:40.000000000 +0000
@@ -15,23 +15,25 @@
);
sub count
-{ return scalar @_; }
+{ return scalar stripped(@_) }
+
+sub stripped { return grep { defined $_ } @_ };
sub min
{
- return unless @_;
- return $_[0] unless @_ > 1;
- my $min= shift;
- foreach(@_) { $min= $_ if $_ < $min; }
+ my @vals = stripped(@_);
+ return unless @vals;
+ my $min= shift @vals;
+ foreach(@vals) { $min= $_ if ($_ < $min); }
return $min;
}
sub max
{
- return unless @_;
- return $_[0] unless @_ > 1;
- my $max= shift;
- foreach(@_) { $max= $_ if $_ > $max; }
+ my @vals = stripped(@_);
+ return unless @vals;
+ my $max= shift @vals;
+ foreach(@vals) { $max= $_ if ($_ > $max); }
return $max;
}
@@ -39,9 +41,9 @@
{
return unless @_;
return 0 unless @_ > 1;
- return abs($_[1]-$_[0]) unless @_ > 2;
- my $min= shift; my $max= $min;
- foreach(@_) { $min= $_ if $_ < $min; $max= $_ if $_ > $max; }
+ my $min=min(@_);
+ my $max=max(@_);
+ return undef if (! defined $min && ! defined $max);
return $max - $min;
}
@@ -50,7 +52,7 @@
return unless @_;
return $_[0] unless @_ > 1;
my $sum;
- foreach(@_) { $sum+= $_; }
+ foreach(@_) { $sum+= $_ if defined $_ }
return $sum;
}
@@ -58,17 +60,19 @@
{
return unless @_;
return $_[0] unless @_ > 1;
- return sum(@_)/scalar(@_);
+ my $count = count(@_);
+ return undef if $count == 0;
+ return sum(@_)/$count;
}
sub median
{
return unless @_;
return $_[0] unless @_ > 1;
- @_= sort{$a<=>$b}@_;
- return $_[$#_/2] if @_&1;
- my $mid= @_/2;
- return ($_[$mid-1]+$_[$mid])/2;
+ my @values = sort{$a<=>$b} (stripped(@_));
+ return $values[$#values/2] if @values&1;
+ my $mid= @values/2;
+ return ($values[$mid-1]+$values[$mid])/2;
}
sub mode
@@ -76,7 +80,7 @@
return unless @_;
return $_[0] unless @_ > 1;
my %count;
- foreach(@_) { $count{$_}++; }
+ foreach(stripped(@_)) { $count{$_}++; }
my $maxhits= max(values %count);
foreach(keys %count) { delete $count{$_} unless $count{$_} == $maxhits; }
return mean(keys %count);
@@ -232,12 +236,11 @@
=item C<min(@data)>, C<max(@data)>, C<range(@data)>, C<sum(@data)>, C<count(@data)>
Return the minimum value, maximum value, range (max - min),
-sum, or count of values in C<@data>.
-(Count simply returns C<scalar(@data)>.)
+sum, or count of values in C<@data>. Any undef values are ignored.
=item C<mean(@data)>, C<median(@data)>, C<mode(@data)>
-Calculates the mean, median, or mode average of the values in C<@data>.
+Calculates the mean, median, or mode average of the values in C<@data>. Any undef values are ignored.
(In the event of ties in the mode average, their mean is returned.)
=item C<variance(@data)>, C<stddev(@data)>
diff -ur Statistics-Lite-3.2.original/test.pl Statistics-Lite-3.2/test.pl
--- Statistics-Lite-3.2.original/test.pl 2007-06-17 16:52:53.000000000 +0000
+++ Statistics-Lite-3.2/test.pl 2009-09-23 01:33:23.000000000 +0000
@@ -1,7 +1,7 @@
#!perl
use strict;
use warnings;
-use Test::More tests => 28;
+use Test::More tests => 54;
BEGIN { use_ok( 'Statistics::Lite', ':all' ); }
@@ -18,8 +18,40 @@
is(count(1,2,3), 3, "call count - functional interface");
is(mean(1,2,3), 2, "call mean - functional interface");
is(median(1,2,3), 2, "call median - functional interface");
+is(median(1,2,3,4), 2.5, "median with even numbers");
is(mode(1,2,3), 2, "call mode - functional interface");
+is(min(1,-5,8), -5, "min with negative numbers" );
+is(range(-6,-9), 3, "range with negative values" );
+is(range(6,-9), 15, "range crossing 0" );
+
+# undef checking
+is(min(undef), undef );
+is(max(undef), undef );
+is(min(), undef );
+is(max(), undef );
+is(min(6,undef,10), 6, "ignore undefs" );
+is(max(-6,-10,undef), -6, "ignore undefs" );
+is(min(undef, 7, -5), -5, "ignore undefs when 1st" );
+is(max(undef, 7, -5), 7, "ignore undefs when 1st" );
+is(min(undef,undef,undef), undef, "Get undef if all undef" );
+is(max(undef,undef,undef), undef, "Get undef if all undef" );
+is(count(undef, 7, -5), 2, "Ignore undefs" );
+is(sum(undef, 7, -5), 2, "Ignore undefs" );
+is(mean(undef, 7, -5), 1, "Ignore undefs" );
+is(count(undef,undef,undef), 0, "Count = 0" );
+is(mean(undef,undef,undef), undef, "Get undef instead of divide by 0" );
+is(range(6,9,undef), 3, "Ignore undefs in range" );
+is(range(undef,6,9), 3, "Ignore undefs in range" );
+is(range(undef,undef,undef,7), 0, "Single value" );
+is(range(undef,undef,undef), undef, "undef when no range set" );
+
+# Excel test
+my @values = (3, -10, 8, undef, 7, undef, 8, 3, 6, 3);
+is(mean(@values), 3.5, "excel mean" );
+is(median(@values), 4.5, "excel median" );
+is(mode(@values), 3, "excel mode" );
+
is(variance(1,2,3), 1, "call variance - functional interface");
is(stddev(1,2,3), 1, "call stddev - functional interface");