#!/usr/bin/perl -w ########################################################################### # # Simple group-by implementation -- assumes already in sorted order. # # Copyright 2010,2011 Philip Johnson. # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published # by the Free Software Foundation, either version 3 of the License, # or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License at for # more details. # ########################################################################### use strict; use Getopt::Long; my (@by, @actions, $precision, $delim, $noWarn); $noWarn = 0; $delim = '(?: |\t)+'; $precision = 4; #default to 4 if (!GetOptions("delimiter=s" => \$delim, "precision=i" => \$precision, "noWarn" => \$noWarn, "by=s" => sub { Parse(\@by, undef, $_[1]) }, "full" => sub { @by = (0) }, "identity=s" => sub { Parse(\@actions, \&identity, $_[1]) }, "debugdump" => sub {push @actions, [\&debugdump, 0] }, "num" => sub { push @actions, [sub {return scalar(@{$_[2]})}, 0] }, "avg|mean=s" => sub { Parse(\@actions, \&avg, $_[1]) }, "var=s" => sub { Parse(\@actions, \&var, $_[1]) }, "sd=s" => sub { Parse(\@actions, sub {return sqrt(var(@_))}, $_[1])}, "sem=s" => sub { Parse(\@actions,sub { return(sqrt(var(@_)/scalar(@{$_[2]})))}, $_[1])}, "q1=s" => sub { Parse(\@actions, \&percentile, $_[1], 25) }, "q2|median=s" => sub { Parse(\@actions, \&percentile, $_[1], 50) }, "q3=s" => sub { Parse(\@actions, \&percentile, $_[1], 75) }, "percent|pct=s" => sub { die "Option -percent needs ':'\n" if $_[1] !~ /\d+:\d+/; Parse(\@actions, \&percentile, split(/:/,$_[1]))}, "min=i" => sub { push @actions, [\&percentile, $_[1], 0] }, "max=i" => sub { push @actions, [\&percentile, $_[1], 100] }, "wheremax=i" => sub { push @actions, [\&wheremax, $_[1]] }, "sum=i" => sub { push @actions, [\&sum, $_[1]] }, "mse=s" => sub { die "Option -mse needs ':'\n" if $_[1] !~ /\d+:.+/; Parse(\@actions, \&mse, split(/:/,$_[1])) }, "rmse=s" => sub { die "Option -mse needs ':'\n" if $_[1] !~ /\d+:.+/; Parse(\@actions, sub {return sqrt(mse(@_))}, split(/:/,$_[1])) }, "bias=s" => sub{ die "Option -bias needs ':'\n" if $_[1] !~/\d+:.+/; push @actions, [\&bias, split(/:/,$_[1]) ] }, "cdf=s" => sub { die "Option -cdf needs ':'\n" if $_[1] !~/\d+:.+/; push @actions, [\&cdf, split(/:/,$_[1]) ] } ) || @actions == 0) { die "Syntax: $0 -by [-delimiter 'regex'] [-precision ] * Where is any of the following: \t-identity --> contents of first row of column \t-num --> number of rows \t-avg|mean --> average \t-var --> variance (divided by n, not n-1) \t-sd --> standard deviation \t-sem --> standard error of the mean (sd\/sqrt(n)) \t-q1 --> first quartile \t-q2|median --> median \t-q3 --> third quartile \t-percent : --> percentile \t-min --> minimum value \t-max --> maximum value \t-wheremax --> line where col has maximum value \t-sum --> sum of values \t-mse : --> mean squared error relative to value of formula \t-bias : --> mean - (value of formula) and := := | [,]+ | - := | := \$ \n"; } @by = map {--$_} @by; #switch from 1-based to 0-based if (@by == 1 && $by[0] == -1) { @by = (); } { # main program block $_ = <> || exit 0; chomp; s/^\s+//; #kill leading whitespace my $F = [split /$delim/]; my @data = ($F); my $currSet = join(chr(0), @{$F}[@by]); #assumes NUL isn't used in fields! my $numFields = @$F; while (<>) { chomp; s/^\s+//; #kill leading whitespace $F = [split /$delim/]; if (@$F != $numFields && !$noWarn) { print STDERR "Warning: line $. has ".scalar(@$F). " fields instead of the expected $numFields.\n"; } my $key = join(chr(0), @{$F}[@by]); #assumes NUL isn't used in fields! if ($key ne $currSet) { Report(\@actions, \@data); @data = (); $currSet = $key; $numFields = @$F; } push @data, $F; } Report(\@actions, \@data); } #----------------------------------------------------------------------------- # PRE : array of actions, function, command line argument w/ column # numbers for parsing. If undefined function, then just push column # number. # POST: columns parsed out with any of the following syntax: # X [just column X] # X,Y,Z [cols X, Y and Z] # X-Z [cols X through Z] # X..Z [cols X through Z] sub Parse { my ($array, $func, $clArg, $fArg) = @_; foreach my $val (split(/,/, $clArg)) { if ($val =~ /^(\d+)(?:-|\.{2})(\d+)$/) { my ($start, $stop) = split /-|(?:\.\.)/, $val; for (my $i = $start; $i <= $stop; ++$i) { push @$array, (defined $func ? [$func, $i, $fArg] : $i); } } elsif ($val =~ /^\d+$/) { push @$array, (defined $func ? [$func, $val, $fArg] : $val); } else { die "Invalid column specification: '$clArg'\n\n"; } } } #----------------------------------------------------------------------------- sub Report { my ($actions, $data) = @_; my $i = 0; foreach my $act (@$actions) { print "\t" if ++$i > 1; #-1 below is to switch from 1-based (from user) to 0-based # if ($act->[0] == \&identity || # $act->[0] == \&wheremax) { # print &{$act->[0]}($act->[1]-1, $act->[2], $data); # } else { # print sprintf('%.'.$precision.'g', &{$act->[0]}($act->[1]-1, $act->[2], $data)); # } my $res = &{$act->[0]}($act->[1]-1, $act->[2], $data); if ($res =~ /^\d*\.\d*$/) { print sprintf('%.'.$precision.'g', $res); } else { print $res; } } print "\n"; } #----------------------------------------------------------------------------- sub identity ($$$) { my ($col, $extra, $data) = @_; return $data->[0][$col]; } #----------------------------------------------------------------------------- sub debugdump ($$$) { my ($col, $extra, $data) = @_; print "-----------\n"; for (my $i = 0; $i < @$data; ++$i) { print join("\t", @{$data->[$i]}), "\n"; } print "-----------\n"; return '-'; } #----------------------------------------------------------------------------- sub sum ($$$) { my ($col, $extra, $data) = @_; my $sum = 0; for (my $i = 0; $i < @$data; ++$i) { $sum += $data->[$i][$col]; } return $sum; } #----------------------------------------------------------------------------- sub avg ($$$) { my ($col, $extra, $data) = @_; my $sum = 0; for (my $i = 0; $i < @$data; ++$i) { $sum += $data->[$i][$col]; } return $sum / @$data; } #----------------------------------------------------------------------------- sub var ($$$) { my ($col, $extra, $data) = @_; my $sum = 0; my $sumSqrs = 0; for (my $i = 0; $i < @$data; ++$i) { $sum += $data->[$i][$col]; $sumSqrs += $data->[$i][$col] * $data->[$i][$col]; } my $v = $sumSqrs/@$data - ($sum/@$data)**2; return ($v > 0 ? $v : 0);#correct for numerical errors near 0 } #----------------------------------------------------------------------------- sub percentile ($$$) { my ($col, $percentile, $data) = @_; my @sortDat = sort {$a->[$col] <=> $b->[$col]} @$data; if ($percentile == 100) { #special case maximum return $sortDat[@$data-1][$col]; } else { return $sortDat[@$data * $percentile/100][$col]; } } #----------------------------------------------------------------------------- sub wheremax ($$) { my ($col, $extra, $data) = @_; my @sortDat = sort {$a->[$col] <=> $b->[$col]} @$data; return join("\t", @{$sortDat[@$data-1]}); } #----------------------------------------------------------------------------- sub x_TokenizeFormula($) { my ($formula) = @_; my @f = split /(\$)/, $formula; shift @f if ($f[0] eq ''); pop @f if ($f[$#f] eq ''); return @f; } sub x_TranslateFormula($$) { my ($f, $row) = @_; if ($f->[0] eq '$') { return $row->[$f->[1]-1]; } else { return $f->[0]; } } #----------------------------------------------------------------------------- sub mse ($$$) { my ($col, $formula, $data) = @_; my @f = x_TokenizeFormula($formula); my $sum = 0; for (my $i = 0; $i < @$data; ++$i) { $sum += ($data->[$i][$col] - x_TranslateFormula(\@f, $data->[$i]))**2; } return $sum/@$data; } #----------------------------------------------------------------------------- sub bias ($$$) { my ($col, $formula, $data) = @_; my @f = x_TokenizeFormula($formula); return &avg(@_) - x_TranslateFormula(\@f, $data->[0]); } #----------------------------------------------------------------------------- sub cdf ($$$) { my ($col, $formula, $data) = @_; my @f = x_TokenizeFormula($formula); my $x = x_TranslateFormula(\@f, $data->[0]); my $cnt = 0; for (my $i = 0; $i < @$data; ++$i) { # some versions of perl have a lousy concept of infinity if ($data->[$i][$col] <= $x && $data->[$i][$col] ne 'inf') { ++$cnt; } } return $cnt / @$data; }