diff options
Diffstat (limited to 'cros_utils/stats.py')
-rw-r--r-- | cros_utils/stats.py | 4519 |
1 files changed, 4519 insertions, 0 deletions
diff --git a/cros_utils/stats.py b/cros_utils/stats.py new file mode 100644 index 00000000..0387a076 --- /dev/null +++ b/cros_utils/stats.py @@ -0,0 +1,4519 @@ +# We did not author this file nor maintain it. Skip linting it. +#pylint: skip-file +# Copyright (c) 1999-2008 Gary Strangman; All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# +# Comments and/or additions are welcome (send e-mail to: +# strang@nmr.mgh.harvard.edu). +# +"""stats.py module + +(Requires pstat.py module.) + +################################################# +####### Written by: Gary Strangman ########### +####### Last modified: Oct 31, 2008 ########### +################################################# + +A collection of basic statistical functions for python. The function +names appear below. + +IMPORTANT: There are really *3* sets of functions. The first set has an 'l' +prefix, which can be used with list or tuple arguments. 
The second set has +an 'a' prefix, which can accept NumPy array arguments. These latter +functions are defined only when NumPy is available on the system. The third +type has NO prefix (i.e., has the name that appears below). Functions of +this set are members of a "Dispatch" class, c/o David Ascher. This class +allows different functions to be called depending on the type of the passed +arguments. Thus, stats.mean is a member of the Dispatch class and +stats.mean(range(20)) will call stats.lmean(range(20)) while +stats.mean(Numeric.arange(20)) will call stats.amean(Numeric.arange(20)). +This is a handy way to keep consistent function names when different +argument types require different functions to be called. Having +implementated the Dispatch class, however, means that to get info on +a given function, you must use the REAL function name ... that is +"print stats.lmean.__doc__" or "print stats.amean.__doc__" work fine, +while "print stats.mean.__doc__" will print the doc for the Dispatch +class. NUMPY FUNCTIONS ('a' prefix) generally have more argument options +but should otherwise be consistent with the corresponding list functions. + +Disclaimers: The function list is obviously incomplete and, worse, the +functions are not optimized. All functions have been tested (some more +so than others), but they are far from bulletproof. Thus, as with any +free software, no warranty or guarantee is expressed or implied. :-) A +few extra functions that don't appear in the list below can be found by +interested treasure-hunters. 
These functions don't necessarily have +both list and array versions but were deemed useful + +CENTRAL TENDENCY: geometricmean + harmonicmean + mean + median + medianscore + mode + +MOMENTS: moment + variation + skew + kurtosis + skewtest (for Numpy arrays only) + kurtosistest (for Numpy arrays only) + normaltest (for Numpy arrays only) + +ALTERED VERSIONS: tmean (for Numpy arrays only) + tvar (for Numpy arrays only) + tmin (for Numpy arrays only) + tmax (for Numpy arrays only) + tstdev (for Numpy arrays only) + tsem (for Numpy arrays only) + describe + +FREQUENCY STATS: itemfreq + scoreatpercentile + percentileofscore + histogram + cumfreq + relfreq + +VARIABILITY: obrientransform + samplevar + samplestdev + signaltonoise (for Numpy arrays only) + var + stdev + sterr + sem + z + zs + zmap (for Numpy arrays only) + +TRIMMING FCNS: threshold (for Numpy arrays only) + trimboth + trim1 + round (round all vals to 'n' decimals; Numpy only) + +CORRELATION FCNS: covariance (for Numpy arrays only) + correlation (for Numpy arrays only) + paired + pearsonr + spearmanr + pointbiserialr + kendalltau + linregress + +INFERENTIAL STATS: ttest_1samp + ttest_ind + ttest_rel + chisquare + ks_2samp + mannwhitneyu + ranksums + wilcoxont + kruskalwallish + friedmanchisquare + +PROBABILITY CALCS: chisqprob + erfcc + zprob + ksprob + fprob + betacf + gammln + betai + +ANOVA FUNCTIONS: F_oneway + F_value + +SUPPORT FUNCTIONS: writecc + incr + sign (for Numpy arrays only) + sum + cumsum + ss + summult + sumdiffsquared + square_of_sums + shellsort + rankdata + outputpairedstats + findwithin +""" +## CHANGE LOG: +## =========== +## 09-07-21 ... added capability for getting the 'proportion' out of l/amannwhitneyu (but comment-disabled) +## 08-10-31 ... fixed import LinearAlgebra bug before glm fcns +## 07-11-26 ... conversion for numpy started +## 07-05-16 ... added Lin's Concordance Correlation Coefficient (alincc) and acov +## 05-08-21 ... added "Dice's coefficient" +## 04-10-26 ... 
added ap2t(), an ugly fcn for converting p-vals to T-vals +## 04-04-03 ... added amasslinregress() function to do regression on N-D arrays +## 03-01-03 ... CHANGED VERSION TO 0.6 +## fixed atsem() to properly handle limits=None case +## improved histogram and median functions (estbinwidth) and +## fixed atvar() function (wrong answers for neg numbers?!?) +## 02-11-19 ... fixed attest_ind and attest_rel for div-by-zero Overflows +## 02-05-10 ... fixed lchisqprob indentation (failed when df=even) +## 00-12-28 ... removed aanova() to separate module, fixed licensing to +## match Python License, fixed doc string & imports +## 00-04-13 ... pulled all "global" statements, except from aanova() +## added/fixed lots of documentation, removed io.py dependency +## changed to version 0.5 +## 99-11-13 ... added asign() function +## 99-11-01 ... changed version to 0.4 ... enough incremental changes now +## 99-10-25 ... added acovariance and acorrelation functions +## 99-10-10 ... fixed askew/akurtosis to avoid divide-by-zero errors +## added aglm function (crude, but will be improved) +## 99-10-04 ... upgraded acumsum, ass, asummult, asamplevar, avar, etc. to +## all handle lists of 'dimension's and keepdims +## REMOVED ar0, ar2, ar3, ar4 and replaced them with around +## reinserted fixes for abetai to avoid math overflows +## 99-09-05 ... rewrote achisqprob/aerfcc/aksprob/afprob/abetacf/abetai to +## handle multi-dimensional arrays (whew!) +## 99-08-30 ... fixed l/amoment, l/askew, l/akurtosis per D'Agostino (1990) +## added anormaltest per same reference +## re-wrote azprob to calc arrays of probs all at once +## 99-08-22 ... edited attest_ind printing section so arrays could be rounded +## 99-08-19 ... fixed amean and aharmonicmean for non-error(!) overflow on +## short/byte arrays (mean of #s btw 100-300 = -150??) +## 99-08-09 ... fixed asum so that the None case works for Byte arrays +## 99-08-08 ... fixed 7/3 'improvement' to handle t-calcs on N-D arrays +## 99-07-03 ... 
improved attest_ind, attest_rel (zero-division errortrap) +## 99-06-24 ... fixed bug(?) in attest_ind (n1=a.shape[0]) +## 04/11/99 ... added asignaltonoise, athreshold functions, changed all +## max/min in array section to N.maximum/N.minimum, +## fixed square_of_sums to prevent integer overflow +## 04/10/99 ... !!! Changed function name ... sumsquared ==> square_of_sums +## 03/18/99 ... Added ar0, ar2, ar3 and ar4 rounding functions +## 02/28/99 ... Fixed aobrientransform to return an array rather than a list +## 01/15/99 ... Essentially ceased updating list-versions of functions (!!!) +## 01/13/99 ... CHANGED TO VERSION 0.3 +## fixed bug in a/lmannwhitneyu p-value calculation +## 12/31/98 ... fixed variable-name bug in ldescribe +## 12/19/98 ... fixed bug in findwithin (fcns needed pstat. prefix) +## 12/16/98 ... changed amedianscore to return float (not array) for 1 score +## 12/14/98 ... added atmin and atmax functions +## removed umath from import line (not needed) +## l/ageometricmean modified to reduce chance of overflows (take +## nth root first, then multiply) +## 12/07/98 ... added __version__variable (now 0.2) +## removed all 'stats.' from anova() fcn +## 12/06/98 ... changed those functions (except shellsort) that altered +## arguments in-place ... cumsum, ranksort, ... +## updated (and fixed some) doc-strings +## 12/01/98 ... added anova() function (requires NumPy) +## incorporated Dispatch class +## 11/12/98 ... 
added functionality to amean, aharmonicmean, ageometricmean +## added 'asum' function (added functionality to N.add.reduce) +## fixed both moment and amoment (two errors) +## changed name of skewness and askewness to skew and askew +## fixed (a)histogram (which sometimes counted points <lowerlimit) + +import pstat # required 3rd party module +import math, string, copy # required python modules +from types import * + +__version__ = 0.6 + +############# DISPATCH CODE ############## + + +class Dispatch: + """ +The Dispatch class, care of David Ascher, allows different functions to +be called depending on the argument types. This way, there can be one +function name regardless of the argument type. To access function doc +in stats.py module, prefix the function with an 'l' or 'a' for list or +array arguments, respectively. That is, print stats.lmean.__doc__ or +print stats.amean.__doc__ or whatever. +""" + + def __init__(self, *tuples): + self._dispatch = {} + for func, types in tuples: + for t in types: + if t in self._dispatch.keys(): + raise ValueError, "can't have two dispatches on " + str(t) + self._dispatch[t] = func + self._types = self._dispatch.keys() + + def __call__(self, arg1, *args, **kw): + if type(arg1) not in self._types: + raise TypeError, "don't know how to dispatch %s arguments" % type(arg1) + return apply(self._dispatch[type(arg1)], (arg1,) + args, kw) + +########################################################################## +######################## LIST-BASED FUNCTIONS ######################## +########################################################################## + +### Define these regardless + +#################################### +####### CENTRAL TENDENCY ######### +#################################### + + +def lgeometricmean(inlist): + """ +Calculates the geometric mean of the values in the passed list. +That is: n-th root of (x1 * x2 * ... * xn). Assumes a '1D' list. 
+ +Usage: lgeometricmean(inlist) +""" + mult = 1.0 + one_over_n = 1.0 / len(inlist) + for item in inlist: + mult = mult * pow(item, one_over_n) + return mult + + +def lharmonicmean(inlist): + """ +Calculates the harmonic mean of the values in the passed list. +That is: n / (1/x1 + 1/x2 + ... + 1/xn). Assumes a '1D' list. + +Usage: lharmonicmean(inlist) +""" + sum = 0 + for item in inlist: + sum = sum + 1.0 / item + return len(inlist) / sum + + +def lmean(inlist): + """ +Returns the arithematic mean of the values in the passed list. +Assumes a '1D' list, but will function on the 1st dim of an array(!). + +Usage: lmean(inlist) +""" + sum = 0 + for item in inlist: + sum = sum + item + return sum / float(len(inlist)) + + +def lmedian(inlist, numbins=1000): + """ +Returns the computed median value of a list of numbers, given the +number of bins to use for the histogram (more bins brings the computed value +closer to the median score, default number of bins = 1000). See G.W. +Heiman's Basic Stats (1st Edition), or CRC Probability & Statistics. + +Usage: lmedian (inlist, numbins=1000) +""" + (hist, smallest, binsize, extras) = histogram( + inlist, numbins, [min(inlist), max(inlist)]) # make histog + cumhist = cumsum(hist) # make cumulative histogram + for i in range(len(cumhist)): # get 1st(!) index holding 50%ile score + if cumhist[i] >= len(inlist) / 2.0: + cfbin = i + break + LRL = smallest + binsize * cfbin # get lower read limit of that bin + cfbelow = cumhist[cfbin - 1] + freq = float(hist[cfbin]) # frequency IN the 50%ile bin + median = LRL + ( + (len(inlist) / 2.0 - cfbelow) / float(freq)) * binsize # median formula + return median + + +def lmedianscore(inlist): + """ +Returns the 'middle' score of the passed list. If there is an even +number of scores, the mean of the 2 middle scores is returned. 
+ +Usage: lmedianscore(inlist) +""" + + newlist = copy.deepcopy(inlist) + newlist.sort() + if len(newlist) % 2 == 0: # if even number of scores, average middle 2 + index = len(newlist) / 2 # integer division correct + median = float(newlist[index] + newlist[index - 1]) / 2 + else: + index = len(newlist) / 2 # int divsion gives mid value when count from 0 + median = newlist[index] + return median + + +def lmode(inlist): + """ +Returns a list of the modal (most common) score(s) in the passed +list. If there is more than one such score, all are returned. The +bin-count for the mode(s) is also returned. + +Usage: lmode(inlist) +Returns: bin-count for mode(s), a list of modal value(s) +""" + + scores = pstat.unique(inlist) + scores.sort() + freq = [] + for item in scores: + freq.append(inlist.count(item)) + maxfreq = max(freq) + mode = [] + stillmore = 1 + while stillmore: + try: + indx = freq.index(maxfreq) + mode.append(scores[indx]) + del freq[indx] + del scores[indx] + except ValueError: + stillmore = 0 + return maxfreq, mode + +#################################### +############ MOMENTS ############# +#################################### + + +def lmoment(inlist, moment=1): + """ +Calculates the nth moment about the mean for a sample (defaults to +the 1st moment). Used to calculate coefficients of skewness and kurtosis. + +Usage: lmoment(inlist,moment=1) +Returns: appropriate moment (r) from ... 1/n * SUM((inlist(i)-mean)**r) +""" + if moment == 1: + return 0.0 + else: + mn = mean(inlist) + n = len(inlist) + s = 0 + for x in inlist: + s = s + (x - mn)**moment + return s / float(n) + + +def lvariation(inlist): + """ +Returns the coefficient of variation, as defined in CRC Standard +Probability and Statistics, p.6. 
+ +Usage: lvariation(inlist) +""" + return 100.0 * samplestdev(inlist) / float(mean(inlist)) + + +def lskew(inlist): + """ +Returns the skewness of a distribution, as defined in Numerical +Recipies (alternate defn in CRC Standard Probability and Statistics, p.6.) + +Usage: lskew(inlist) +""" + return moment(inlist, 3) / pow(moment(inlist, 2), 1.5) + + +def lkurtosis(inlist): + """ +Returns the kurtosis of a distribution, as defined in Numerical +Recipies (alternate defn in CRC Standard Probability and Statistics, p.6.) + +Usage: lkurtosis(inlist) +""" + return moment(inlist, 4) / pow(moment(inlist, 2), 2.0) + + +def ldescribe(inlist): + """ +Returns some descriptive statistics of the passed list (assumed to be 1D). + +Usage: ldescribe(inlist) +Returns: n, mean, standard deviation, skew, kurtosis +""" + n = len(inlist) + mm = (min(inlist), max(inlist)) + m = mean(inlist) + sd = stdev(inlist) + sk = skew(inlist) + kurt = kurtosis(inlist) + return n, mm, m, sd, sk, kurt + +#################################### +####### FREQUENCY STATS ########## +#################################### + + +def litemfreq(inlist): + """ +Returns a list of pairs. Each pair consists of one of the scores in inlist +and it's frequency count. Assumes a 1D list is passed. + +Usage: litemfreq(inlist) +Returns: a 2D frequency table (col [0:n-1]=scores, col n=frequencies) +""" + scores = pstat.unique(inlist) + scores.sort() + freq = [] + for item in scores: + freq.append(inlist.count(item)) + return pstat.abut(scores, freq) + + +def lscoreatpercentile(inlist, percent): + """ +Returns the score at a given percentile relative to the distribution +given by inlist. 
+ +Usage: lscoreatpercentile(inlist,percent) +""" + if percent > 1: + print '\nDividing percent>1 by 100 in lscoreatpercentile().\n' + percent = percent / 100.0 + targetcf = percent * len(inlist) + h, lrl, binsize, extras = histogram(inlist) + cumhist = cumsum(copy.deepcopy(h)) + for i in range(len(cumhist)): + if cumhist[i] >= targetcf: + break + score = binsize * ( + (targetcf - cumhist[i - 1]) / float(h[i])) + (lrl + binsize * i) + return score + + +def lpercentileofscore(inlist, score, histbins=10, defaultlimits=None): + """ +Returns the percentile value of a score relative to the distribution +given by inlist. Formula depends on the values used to histogram the data(!). + +Usage: lpercentileofscore(inlist,score,histbins=10,defaultlimits=None) +""" + + h, lrl, binsize, extras = histogram(inlist, histbins, defaultlimits) + cumhist = cumsum(copy.deepcopy(h)) + i = int((score - lrl) / float(binsize)) + pct = (cumhist[i - 1] + ( + (score - + (lrl + binsize * i)) / float(binsize)) * h[i]) / float(len(inlist)) * 100 + return pct + + +def lhistogram(inlist, numbins=10, defaultreallimits=None, printextras=0): + """ +Returns (i) a list of histogram bin counts, (ii) the smallest value +of the histogram binning, and (iii) the bin width (the last 2 are not +necessarily integers). Default number of bins is 10. If no sequence object +is given for defaultreallimits, the routine picks (usually non-pretty) bins +spanning all the numbers in the inlist. 
+ +Usage: lhistogram (inlist, numbins=10, +defaultreallimits=None,suppressoutput=0) +Returns: list of bin values, lowerreallimit, binsize, extrapoints +""" + if (defaultreallimits <> None): + if type(defaultreallimits) not in [ListType, TupleType] or len( + defaultreallimits) == 1: # only one limit given, assumed to be lower one & upper is calc'd + lowerreallimit = defaultreallimits + upperreallimit = 1.000001 * max(inlist) + else: # assume both limits given + lowerreallimit = defaultreallimits[0] + upperreallimit = defaultreallimits[1] + binsize = (upperreallimit - lowerreallimit) / float(numbins) + else: # no limits given for histogram, both must be calc'd + estbinwidth = (max(inlist) - + min(inlist)) / float(numbins) + 1e-6 #1=>cover all + binsize = ((max(inlist) - min(inlist) + estbinwidth)) / float(numbins) + lowerreallimit = min(inlist) - binsize / 2 #lower real limit,1st bin + bins = [0] * (numbins) + extrapoints = 0 + for num in inlist: + try: + if (num - lowerreallimit) < 0: + extrapoints = extrapoints + 1 + else: + bintoincrement = int((num - lowerreallimit) / float(binsize)) + bins[bintoincrement] = bins[bintoincrement] + 1 + except: + extrapoints = extrapoints + 1 + if (extrapoints > 0 and printextras == 1): + print '\nPoints outside given histogram range =', extrapoints + return (bins, lowerreallimit, binsize, extrapoints) + + +def lcumfreq(inlist, numbins=10, defaultreallimits=None): + """ +Returns a cumulative frequency histogram, using the histogram function. + +Usage: lcumfreq(inlist,numbins=10,defaultreallimits=None) +Returns: list of cumfreq bin values, lowerreallimit, binsize, extrapoints +""" + h, l, b, e = histogram(inlist, numbins, defaultreallimits) + cumhist = cumsum(copy.deepcopy(h)) + return cumhist, l, b, e + + +def lrelfreq(inlist, numbins=10, defaultreallimits=None): + """ +Returns a relative frequency histogram, using the histogram function. 
+ +Usage: lrelfreq(inlist,numbins=10,defaultreallimits=None) +Returns: list of cumfreq bin values, lowerreallimit, binsize, extrapoints +""" + h, l, b, e = histogram(inlist, numbins, defaultreallimits) + for i in range(len(h)): + h[i] = h[i] / float(len(inlist)) + return h, l, b, e + +#################################### +##### VARIABILITY FUNCTIONS ###### +#################################### + + +def lobrientransform(*args): + """ +Computes a transform on input data (any number of columns). Used to +test for homogeneity of variance prior to running one-way stats. From +Maxwell and Delaney, p.112. + +Usage: lobrientransform(*args) +Returns: transformed data for use in an ANOVA +""" + TINY = 1e-10 + k = len(args) + n = [0.0] * k + v = [0.0] * k + m = [0.0] * k + nargs = [] + for i in range(k): + nargs.append(copy.deepcopy(args[i])) + n[i] = float(len(nargs[i])) + v[i] = var(nargs[i]) + m[i] = mean(nargs[i]) + for j in range(k): + for i in range(n[j]): + t1 = (n[j] - 1.5) * n[j] * (nargs[j][i] - m[j])**2 + t2 = 0.5 * v[j] * (n[j] - 1.0) + t3 = (n[j] - 1.0) * (n[j] - 2.0) + nargs[j][i] = (t1 - t2) / float(t3) + check = 1 + for j in range(k): + if v[j] - mean(nargs[j]) > TINY: + check = 0 + if check <> 1: + raise ValueError, 'Problem in obrientransform.' + else: + return nargs + + +def lsamplevar(inlist): + """ +Returns the variance of the values in the passed list using +N for the denominator (i.e., DESCRIBES the sample variance only). + +Usage: lsamplevar(inlist) +""" + n = len(inlist) + mn = mean(inlist) + deviations = [] + for item in inlist: + deviations.append(item - mn) + return ss(deviations) / float(n) + + +def lsamplestdev(inlist): + """ +Returns the standard deviation of the values in the passed list using +N for the denominator (i.e., DESCRIBES the sample stdev only). 
+ +Usage: lsamplestdev(inlist) +""" + return math.sqrt(samplevar(inlist)) + + +def lcov(x, y, keepdims=0): + """ +Returns the estimated covariance of the values in the passed +array (i.e., N-1). Dimension can equal None (ravel array first), an +integer (the dimension over which to operate), or a sequence (operate +over multiple dimensions). Set keepdims=1 to return an array with the +same number of dimensions as inarray. + +Usage: lcov(x,y,keepdims=0) +""" + + n = len(x) + xmn = mean(x) + ymn = mean(y) + xdeviations = [0] * len(x) + ydeviations = [0] * len(y) + for i in range(len(x)): + xdeviations[i] = x[i] - xmn + ydeviations[i] = y[i] - ymn + ss = 0.0 + for i in range(len(xdeviations)): + ss = ss + xdeviations[i] * ydeviations[i] + return ss / float(n - 1) + + +def lvar(inlist): + """ +Returns the variance of the values in the passed list using N-1 +for the denominator (i.e., for estimating population variance). + +Usage: lvar(inlist) +""" + n = len(inlist) + mn = mean(inlist) + deviations = [0] * len(inlist) + for i in range(len(inlist)): + deviations[i] = inlist[i] - mn + return ss(deviations) / float(n - 1) + + +def lstdev(inlist): + """ +Returns the standard deviation of the values in the passed list +using N-1 in the denominator (i.e., to estimate population stdev). + +Usage: lstdev(inlist) +""" + return math.sqrt(var(inlist)) + + +def lsterr(inlist): + """ +Returns the standard error of the values in the passed list using N-1 +in the denominator (i.e., to estimate population standard error). + +Usage: lsterr(inlist) +""" + return stdev(inlist) / float(math.sqrt(len(inlist))) + + +def lsem(inlist): + """ +Returns the estimated standard error of the mean (sx-bar) of the +values in the passed list. sem = stdev / sqrt(n) + +Usage: lsem(inlist) +""" + sd = stdev(inlist) + n = len(inlist) + return sd / math.sqrt(n) + + +def lz(inlist, score): + """ +Returns the z-score for a given input score, given that score and the +list from which that score came. 
Not appropriate for population calculations. + +Usage: lz(inlist, score) +""" + z = (score - mean(inlist)) / samplestdev(inlist) + return z + + +def lzs(inlist): + """ +Returns a list of z-scores, one for each score in the passed list. + +Usage: lzs(inlist) +""" + zscores = [] + for item in inlist: + zscores.append(z(inlist, item)) + return zscores + +#################################### +####### TRIMMING FUNCTIONS ####### +#################################### + + +def ltrimboth(l, proportiontocut): + """ +Slices off the passed proportion of items from BOTH ends of the passed +list (i.e., with proportiontocut=0.1, slices 'leftmost' 10% AND 'rightmost' +10% of scores. Assumes list is sorted by magnitude. Slices off LESS if +proportion results in a non-integer slice index (i.e., conservatively +slices off proportiontocut). + +Usage: ltrimboth (l,proportiontocut) +Returns: trimmed version of list l +""" + lowercut = int(proportiontocut * len(l)) + uppercut = len(l) - lowercut + return l[lowercut:uppercut] + + +def ltrim1(l, proportiontocut, tail='right'): + """ +Slices off the passed proportion of items from ONE end of the passed +list (i.e., if proportiontocut=0.1, slices off 'leftmost' or 'rightmost' +10% of scores). Slices off LESS if proportion results in a non-integer +slice index (i.e., conservatively slices off proportiontocut). + +Usage: ltrim1 (l,proportiontocut,tail='right') or set tail='left' +Returns: trimmed version of list l +""" + if tail == 'right': + lowercut = 0 + uppercut = len(l) - int(proportiontocut * len(l)) + elif tail == 'left': + lowercut = int(proportiontocut * len(l)) + uppercut = len(l) + return l[lowercut:uppercut] + +#################################### +##### CORRELATION FUNCTIONS ###### +#################################### + + +def lpaired(x, y): + """ +Interactively determines the type of data and then runs the +appropriated statistic for paired group data. 
+ +Usage: lpaired(x,y) +Returns: appropriate statistic name, value, and probability +""" + samples = '' + while samples not in ['i', 'r', 'I', 'R', 'c', 'C']: + print '\nIndependent or related samples, or correlation (i,r,c): ', + samples = raw_input() + + if samples in ['i', 'I', 'r', 'R']: + print '\nComparing variances ...', + # USE O'BRIEN'S TEST FOR HOMOGENEITY OF VARIANCE, Maxwell & delaney, p.112 + r = obrientransform(x, y) + f, p = F_oneway(pstat.colex(r, 0), pstat.colex(r, 1)) + if p < 0.05: + vartype = 'unequal, p=' + str(round(p, 4)) + else: + vartype = 'equal' + print vartype + if samples in ['i', 'I']: + if vartype[0] == 'e': + t, p = ttest_ind(x, y, 0) + print '\nIndependent samples t-test: ', round(t, 4), round(p, 4) + else: + if len(x) > 20 or len(y) > 20: + z, p = ranksums(x, y) + print '\nRank Sums test (NONparametric, n>20): ', round(z, 4), round( + p, 4) + else: + u, p = mannwhitneyu(x, y) + print '\nMann-Whitney U-test (NONparametric, ns<20): ', round( + u, 4), round(p, 4) + + else: # RELATED SAMPLES + if vartype[0] == 'e': + t, p = ttest_rel(x, y, 0) + print '\nRelated samples t-test: ', round(t, 4), round(p, 4) + else: + t, p = ranksums(x, y) + print '\nWilcoxon T-test (NONparametric): ', round(t, 4), round(p, 4) + else: # CORRELATION ANALYSIS + corrtype = '' + while corrtype not in ['c', 'C', 'r', 'R', 'd', 'D']: + print '\nIs the data Continuous, Ranked, or Dichotomous (c,r,d): ', + corrtype = raw_input() + if corrtype in ['c', 'C']: + m, b, r, p, see = linregress(x, y) + print '\nLinear regression for continuous variables ...' + lol = [['Slope', 'Intercept', 'r', 'Prob', 'SEestimate'], + [round(m, 4), round(b, 4), round(r, 4), round(p, 4), round(see, 4)] + ] + pstat.printcc(lol) + elif corrtype in ['r', 'R']: + r, p = spearmanr(x, y) + print '\nCorrelation for ranked variables ...' 
+ print "Spearman's r: ", round(r, 4), round(p, 4) + else: # DICHOTOMOUS + r, p = pointbiserialr(x, y) + print '\nAssuming x contains a dichotomous variable ...' + print 'Point Biserial r: ', round(r, 4), round(p, 4) + print '\n\n' + return None + + +def lpearsonr(x, y): + """ +Calculates a Pearson correlation coefficient and the associated +probability value. Taken from Heiman's Basic Statistics for the Behav. +Sci (2nd), p.195. + +Usage: lpearsonr(x,y) where x and y are equal-length lists +Returns: Pearson's r value, two-tailed p-value +""" + TINY = 1.0e-30 + if len(x) <> len(y): + raise ValueError, 'Input values not paired in pearsonr. Aborting.' + n = len(x) + x = map(float, x) + y = map(float, y) + xmean = mean(x) + ymean = mean(y) + r_num = n * (summult(x, y)) - sum(x) * sum(y) + r_den = math.sqrt((n * ss(x) - square_of_sums(x)) * + (n * ss(y) - square_of_sums(y))) + r = (r_num / r_den) # denominator already a float + df = n - 2 + t = r * math.sqrt(df / ((1.0 - r + TINY) * (1.0 + r + TINY))) + prob = betai(0.5 * df, 0.5, df / float(df + t * t)) + return r, prob + + +def llincc(x, y): + """ +Calculates Lin's concordance correlation coefficient. + +Usage: alincc(x,y) where x, y are equal-length arrays +Returns: Lin's CC +""" + covar = lcov(x, y) * (len(x) - 1) / float(len(x)) # correct denom to n + xvar = lvar(x) * (len(x) - 1) / float(len(x)) # correct denom to n + yvar = lvar(y) * (len(y) - 1) / float(len(y)) # correct denom to n + lincc = (2 * covar) / ((xvar + yvar) + ((amean(x) - amean(y))**2)) + return lincc + + +def lspearmanr(x, y): + """ +Calculates a Spearman rank-order correlation coefficient. Taken +from Heiman's Basic Statistics for the Behav. Sci (1st), p.192. + +Usage: lspearmanr(x,y) where x and y are equal-length lists +Returns: Spearman's r, two-tailed p-value +""" + TINY = 1e-30 + if len(x) <> len(y): + raise ValueError, 'Input values not paired in spearmanr. Aborting.' 
+ n = len(x) + rankx = rankdata(x) + ranky = rankdata(y) + dsq = sumdiffsquared(rankx, ranky) + rs = 1 - 6 * dsq / float(n * (n**2 - 1)) + t = rs * math.sqrt((n - 2) / ((rs + 1.0) * (1.0 - rs))) + df = n - 2 + probrs = betai(0.5 * df, 0.5, df / (df + t * t)) # t already a float + # probability values for rs are from part 2 of the spearman function in + # Numerical Recipies, p.510. They are close to tables, but not exact. (?) + return rs, probrs + + +def lpointbiserialr(x, y): + """ +Calculates a point-biserial correlation coefficient and the associated +probability value. Taken from Heiman's Basic Statistics for the Behav. +Sci (1st), p.194. + +Usage: lpointbiserialr(x,y) where x,y are equal-length lists +Returns: Point-biserial r, two-tailed p-value +""" + TINY = 1e-30 + if len(x) <> len(y): + raise ValueError, 'INPUT VALUES NOT PAIRED IN pointbiserialr. ABORTING.' + data = pstat.abut(x, y) + categories = pstat.unique(x) + if len(categories) <> 2: + raise ValueError, 'Exactly 2 categories required for pointbiserialr().' + else: # there are 2 categories, continue + codemap = pstat.abut(categories, range(2)) + recoded = pstat.recode(data, codemap, 0) + x = pstat.linexand(data, 0, categories[0]) + y = pstat.linexand(data, 0, categories[1]) + xmean = mean(pstat.colex(x, 1)) + ymean = mean(pstat.colex(y, 1)) + n = len(data) + adjust = math.sqrt((len(x) / float(n)) * (len(y) / float(n))) + rpb = (ymean - xmean) / samplestdev(pstat.colex(data, 1)) * adjust + df = n - 2 + t = rpb * math.sqrt(df / ((1.0 - rpb + TINY) * (1.0 + rpb + TINY))) + prob = betai(0.5 * df, 0.5, df / (df + t * t)) # t already a float + return rpb, prob + + +def lkendalltau(x, y): + """ +Calculates Kendall's tau ... correlation of ordinal data. Adapted +from function kendl1 in Numerical Recipies. 
Needs good test-routine.@@@ + +Usage: lkendalltau(x,y) +Returns: Kendall's tau, two-tailed p-value +""" + n1 = 0 + n2 = 0 + iss = 0 + for j in range(len(x) - 1): + for k in range(j, len(y)): + a1 = x[j] - x[k] + a2 = y[j] - y[k] + aa = a1 * a2 + if (aa): # neither list has a tie + n1 = n1 + 1 + n2 = n2 + 1 + if aa > 0: + iss = iss + 1 + else: + iss = iss - 1 + else: + if (a1): + n1 = n1 + 1 + else: + n2 = n2 + 1 + tau = iss / math.sqrt(n1 * n2) + svar = (4.0 * len(x) + 10.0) / (9.0 * len(x) * (len(x) - 1)) + z = tau / math.sqrt(svar) + prob = erfcc(abs(z) / 1.4142136) + return tau, prob + + +def llinregress(x, y): + """ +Calculates a regression line on x,y pairs. + +Usage: llinregress(x,y) x,y are equal-length lists of x-y coordinates +Returns: slope, intercept, r, two-tailed prob, sterr-of-estimate +""" + TINY = 1.0e-20 + if len(x) <> len(y): + raise ValueError, 'Input values not paired in linregress. Aborting.' + n = len(x) + x = map(float, x) + y = map(float, y) + xmean = mean(x) + ymean = mean(y) + r_num = float(n * (summult(x, y)) - sum(x) * sum(y)) + r_den = math.sqrt((n * ss(x) - square_of_sums(x)) * + (n * ss(y) - square_of_sums(y))) + r = r_num / r_den + z = 0.5 * math.log((1.0 + r + TINY) / (1.0 - r + TINY)) + df = n - 2 + t = r * math.sqrt(df / ((1.0 - r + TINY) * (1.0 + r + TINY))) + prob = betai(0.5 * df, 0.5, df / (df + t * t)) + slope = r_num / float(n * ss(x) - square_of_sums(x)) + intercept = ymean - slope * xmean + sterrest = math.sqrt(1 - r * r) * samplestdev(y) + return slope, intercept, r, prob, sterrest + +#################################### +##### INFERENTIAL STATISTICS ##### +#################################### + + +def lttest_1samp(a, popmean, printit=0, name='Sample', writemode='a'): + """ +Calculates the t-obtained for the independent samples T-test on ONE group +of scores a, given a population mean. If printit=1, results are printed +to the screen. 
If printit='filename', the results are output to 'filename' +using the given writemode (default=append). Returns t-value, and prob. + +Usage: lttest_1samp(a,popmean,Name='Sample',printit=0,writemode='a') +Returns: t-value, two-tailed prob +""" + x = mean(a) + v = var(a) + n = len(a) + df = n - 1 + svar = ((n - 1) * v) / float(df) + t = (x - popmean) / math.sqrt(svar * (1.0 / n)) + prob = betai(0.5 * df, 0.5, float(df) / (df + t * t)) + + if printit <> 0: + statname = 'Single-sample T-test.' + outputpairedstats(printit, writemode, 'Population', '--', popmean, 0, 0, 0, + name, n, x, v, min(a), max(a), statname, t, prob) + return t, prob + + +def lttest_ind(a, b, printit=0, name1='Samp1', name2='Samp2', writemode='a'): + """ +Calculates the t-obtained T-test on TWO INDEPENDENT samples of +scores a, and b. From Numerical Recipies, p.483. If printit=1, results +are printed to the screen. If printit='filename', the results are output +to 'filename' using the given writemode (default=append). Returns t-value, +and prob. + +Usage: lttest_ind(a,b,printit=0,name1='Samp1',name2='Samp2',writemode='a') +Returns: t-value, two-tailed prob +""" + x1 = mean(a) + x2 = mean(b) + v1 = stdev(a)**2 + v2 = stdev(b)**2 + n1 = len(a) + n2 = len(b) + df = n1 + n2 - 2 + svar = ((n1 - 1) * v1 + (n2 - 1) * v2) / float(df) + if not svar: + svar = 1.0e-26 + t = (x1 - x2) / math.sqrt(svar * (1.0 / n1 + 1.0 / n2)) + prob = betai(0.5 * df, 0.5, df / (df + t * t)) + + if printit <> 0: + statname = 'Independent samples T-test.' + outputpairedstats(printit, writemode, name1, n1, x1, v1, min(a), max(a), + name2, n2, x2, v2, min(b), max(b), statname, t, prob) + return t, prob + + +def lttest_rel(a, + b, + printit=0, + name1='Sample1', + name2='Sample2', + writemode='a'): + """ +Calculates the t-obtained T-test on TWO RELATED samples of scores, +a and b. From Numerical Recipies, p.483. If printit=1, results are +printed to the screen. 
If printit='filename', the results are output to +'filename' using the given writemode (default=append). Returns t-value, +and prob. + +Usage: lttest_rel(a,b,printit=0,name1='Sample1',name2='Sample2',writemode='a') +Returns: t-value, two-tailed prob +""" + if len(a) <> len(b): + raise ValueError, 'Unequal length lists in ttest_rel.' + x1 = mean(a) + x2 = mean(b) + v1 = var(a) + v2 = var(b) + n = len(a) + cov = 0 + for i in range(len(a)): + cov = cov + (a[i] - x1) * (b[i] - x2) + df = n - 1 + cov = cov / float(df) + sd = math.sqrt((v1 + v2 - 2.0 * cov) / float(n)) + t = (x1 - x2) / sd + prob = betai(0.5 * df, 0.5, df / (df + t * t)) + + if printit <> 0: + statname = 'Related samples T-test.' + outputpairedstats(printit, writemode, name1, n, x1, v1, min(a), max(a), + name2, n, x2, v2, min(b), max(b), statname, t, prob) + return t, prob + + +def lchisquare(f_obs, f_exp=None): + """ +Calculates a one-way chi square for list of observed frequencies and returns +the result. If no expected frequencies are given, the total N is assumed to +be equally distributed across all groups. + +Usage: lchisquare(f_obs, f_exp=None) f_obs = list of observed cell freq. +Returns: chisquare-statistic, associated p-value +""" + k = len(f_obs) # number of groups + if f_exp == None: + f_exp = [sum(f_obs) / float(k)] * len(f_obs) # create k bins with = freq. + chisq = 0 + for i in range(len(f_obs)): + chisq = chisq + (f_obs[i] - f_exp[i])**2 / float(f_exp[i]) + return chisq, chisqprob(chisq, k - 1) + + +def lks_2samp(data1, data2): + """ +Computes the Kolmogorov-Smirnof statistic on 2 samples. From +Numerical Recipies in C, page 493. 
+ +Usage: lks_2samp(data1,data2) data1&2 are lists of values for 2 conditions +Returns: KS D-value, associated p-value +""" + j1 = 0 + j2 = 0 + fn1 = 0.0 + fn2 = 0.0 + n1 = len(data1) + n2 = len(data2) + en1 = n1 + en2 = n2 + d = 0.0 + data1.sort() + data2.sort() + while j1 < n1 and j2 < n2: + d1 = data1[j1] + d2 = data2[j2] + if d1 <= d2: + fn1 = (j1) / float(en1) + j1 = j1 + 1 + if d2 <= d1: + fn2 = (j2) / float(en2) + j2 = j2 + 1 + dt = (fn2 - fn1) + if math.fabs(dt) > math.fabs(d): + d = dt + try: + en = math.sqrt(en1 * en2 / float(en1 + en2)) + prob = ksprob((en + 0.12 + 0.11 / en) * abs(d)) + except: + prob = 1.0 + return d, prob + + +def lmannwhitneyu(x, y): + """ +Calculates a Mann-Whitney U statistic on the provided scores and +returns the result. Use only when the n in each condition is < 20 and +you have 2 independent samples of ranks. NOTE: Mann-Whitney U is +significant if the u-obtained is LESS THAN or equal to the critical +value of U found in the tables. Equivalent to Kruskal-Wallis H with +just 2 groups. + +Usage: lmannwhitneyu(data) +Returns: u-statistic, one-tailed p-value (i.e., p(z(U))) +""" + n1 = len(x) + n2 = len(y) + ranked = rankdata(x + y) + rankx = ranked[0:n1] # get the x-ranks + ranky = ranked[n1:] # the rest are y-ranks + u1 = n1 * n2 + (n1 * (n1 + 1)) / 2.0 - sum(rankx) # calc U for x + u2 = n1 * n2 - u1 # remainder is U for y + bigu = max(u1, u2) + smallu = min(u1, u2) + proportion = bigu / float(n1 * n2) + T = math.sqrt(tiecorrect(ranked)) # correction factor for tied scores + if T == 0: + raise ValueError, 'All numbers are identical in lmannwhitneyu' + sd = math.sqrt(T * n1 * n2 * (n1 + n2 + 1) / 12.0) + z = abs((bigu - n1 * n2 / 2.0) / sd) # normal approximation for prob calc + return smallu, 1.0 - zprob(z) #, proportion + + +def ltiecorrect(rankvals): + """ +Corrects for ties in Mann Whitney U and Kruskal Wallis H tests. See +Siegel, S. (1956) Nonparametric Statistics for the Behavioral Sciences. +New York: McGraw-Hill. 
Code adapted from |Stat rankind.c code. + +Usage: ltiecorrect(rankvals) +Returns: T correction factor for U or H +""" + sorted, posn = shellsort(rankvals) + n = len(sorted) + T = 0.0 + i = 0 + while (i < n - 1): + if sorted[i] == sorted[i + 1]: + nties = 1 + while (i < n - 1) and (sorted[i] == sorted[i + 1]): + nties = nties + 1 + i = i + 1 + T = T + nties**3 - nties + i = i + 1 + T = T / float(n**3 - n) + return 1.0 - T + + +def lranksums(x, y): + """ +Calculates the rank sums statistic on the provided scores and +returns the result. Use only when the n in each condition is > 20 and you +have 2 independent samples of ranks. + +Usage: lranksums(x,y) +Returns: a z-statistic, two-tailed p-value +""" + n1 = len(x) + n2 = len(y) + alldata = x + y + ranked = rankdata(alldata) + x = ranked[:n1] + y = ranked[n1:] + s = sum(x) + expected = n1 * (n1 + n2 + 1) / 2.0 + z = (s - expected) / math.sqrt(n1 * n2 * (n1 + n2 + 1) / 12.0) + prob = 2 * (1.0 - zprob(abs(z))) + return z, prob + + +def lwilcoxont(x, y): + """ +Calculates the Wilcoxon T-test for related samples and returns the +result. A non-parametric T-test. + +Usage: lwilcoxont(x,y) +Returns: a t-statistic, two-tail probability estimate +""" + if len(x) <> len(y): + raise ValueError, 'Unequal N in wilcoxont. Aborting.' + d = [] + for i in range(len(x)): + diff = x[i] - y[i] + if diff <> 0: + d.append(diff) + count = len(d) + absd = map(abs, d) + absranked = rankdata(absd) + r_plus = 0.0 + r_minus = 0.0 + for i in range(len(absd)): + if d[i] < 0: + r_minus = r_minus + absranked[i] + else: + r_plus = r_plus + absranked[i] + wt = min(r_plus, r_minus) + mn = count * (count + 1) * 0.25 + se = math.sqrt(count * (count + 1) * (2.0 * count + 1.0) / 24.0) + z = math.fabs(wt - mn) / se + prob = 2 * (1.0 - zprob(abs(z))) + return wt, prob + + +def lkruskalwallish(*args): + """ +The Kruskal-Wallis H-test is a non-parametric ANOVA for 3 or more +groups, requiring at least 5 subjects in each group. 
This function +calculates the Kruskal-Wallis H-test for 3 or more independent samples +and returns the result. + +Usage: lkruskalwallish(*args) +Returns: H-statistic (corrected for ties), associated p-value +""" + args = list(args) + n = [0] * len(args) + all = [] + n = map(len, args) + for i in range(len(args)): + all = all + args[i] + ranked = rankdata(all) + T = tiecorrect(ranked) + for i in range(len(args)): + args[i] = ranked[0:n[i]] + del ranked[0:n[i]] + rsums = [] + for i in range(len(args)): + rsums.append(sum(args[i])**2) + rsums[i] = rsums[i] / float(n[i]) + ssbn = sum(rsums) + totaln = sum(n) + h = 12.0 / (totaln * (totaln + 1)) * ssbn - 3 * (totaln + 1) + df = len(args) - 1 + if T == 0: + raise ValueError, 'All numbers are identical in lkruskalwallish' + h = h / float(T) + return h, chisqprob(h, df) + + +def lfriedmanchisquare(*args): + """ +Friedman Chi-Square is a non-parametric, one-way within-subjects +ANOVA. This function calculates the Friedman Chi-square test for repeated +measures and returns the result, along with the associated probability +value. It assumes 3 or more repeated measures. Only 3 levels requires a +minimum of 10 subjects in the study. Four levels requires 5 subjects per +level(??). + +Usage: lfriedmanchisquare(*args) +Returns: chi-square statistic, associated p-value +""" + k = len(args) + if k < 3: + raise ValueError, 'Less than 3 levels. Friedman test not appropriate.' + n = len(args[0]) + data = apply(pstat.abut, tuple(args)) + for i in range(len(data)): + data[i] = rankdata(data[i]) + ssbn = 0 + for i in range(k): + ssbn = ssbn + sum(args[i])**2 + chisq = 12.0 / (k * n * (k + 1)) * ssbn - 3 * n * (k + 1) + return chisq, chisqprob(chisq, k - 1) + +#################################### +#### PROBABILITY CALCULATIONS #### +#################################### + + +def lchisqprob(chisq, df): + """ +Returns the (1-tailed) probability value associated with the provided +chi-square value and df. 
Adapted from chisq.c in Gary Perlman's |Stat. + +Usage: lchisqprob(chisq,df) +""" + BIG = 20.0 + + def ex(x): + BIG = 20.0 + if x < -BIG: + return 0.0 + else: + return math.exp(x) + + if chisq <= 0 or df < 1: + return 1.0 + a = 0.5 * chisq + if df % 2 == 0: + even = 1 + else: + even = 0 + if df > 1: + y = ex(-a) + if even: + s = y + else: + s = 2.0 * zprob(-math.sqrt(chisq)) + if (df > 2): + chisq = 0.5 * (df - 1.0) + if even: + z = 1.0 + else: + z = 0.5 + if a > BIG: + if even: + e = 0.0 + else: + e = math.log(math.sqrt(math.pi)) + c = math.log(a) + while (z <= chisq): + e = math.log(z) + e + s = s + ex(c * z - a - e) + z = z + 1.0 + return s + else: + if even: + e = 1.0 + else: + e = 1.0 / math.sqrt(math.pi) / math.sqrt(a) + c = 0.0 + while (z <= chisq): + e = e * (a / float(z)) + c = c + e + z = z + 1.0 + return (c * y + s) + else: + return s + + +def lerfcc(x): + """ +Returns the complementary error function erfc(x) with fractional +error everywhere less than 1.2e-7. Adapted from Numerical Recipies. + +Usage: lerfcc(x) +""" + z = abs(x) + t = 1.0 / (1.0 + 0.5 * z) + ans = t * math.exp(-z * z - 1.26551223 + t * (1.00002368 + t * ( + 0.37409196 + t * (0.09678418 + t * (-0.18628806 + t * (0.27886807 + t * ( + -1.13520398 + t * (1.48851587 + t * (-0.82215223 + t * 0.17087277))))) + )))) + if x >= 0: + return ans + else: + return 2.0 - ans + + +def lzprob(z): + """ +Returns the area under the normal curve 'to the left of' the given z value. +Thus, + for z<0, zprob(z) = 1-tail probability + for z>0, 1.0-zprob(z) = 1-tail probability + for any z, 2.0*(1.0-zprob(abs(z))) = 2-tail probability +Adapted from z.c in Gary Perlman's |Stat. 
+ +Usage: lzprob(z) +""" + Z_MAX = 6.0 # maximum meaningful z-value + if z == 0.0: + x = 0.0 + else: + y = 0.5 * math.fabs(z) + if y >= (Z_MAX * 0.5): + x = 1.0 + elif (y < 1.0): + w = y * y + x = (( + ((((((0.000124818987 * w - 0.001075204047) * w + 0.005198775019) * w - + 0.019198292004) * w + 0.059054035642) * w - 0.151968751364) * w + + 0.319152932694) * w - 0.531923007300) * w + 0.797884560593) * y * 2.0 + else: + y = y - 2.0 + x = ((((((( + ((((((-0.000045255659 * y + 0.000152529290) * y - 0.000019538132) * y + - 0.000676904986) * y + 0.001390604284) * y - 0.000794620820) * y + - 0.002034254874) * y + 0.006549791214) * y - 0.010557625006) * y + + 0.011630447319) * y - 0.009279453341) * y + 0.005353579108) * y - + 0.002141268741) * y + 0.000535310849) * y + 0.999936657524 + if z > 0.0: + prob = ((x + 1.0) * 0.5) + else: + prob = ((1.0 - x) * 0.5) + return prob + + +def lksprob(alam): + """ +Computes a Kolmolgorov-Smirnov t-test significance level. Adapted from +Numerical Recipies. + +Usage: lksprob(alam) +""" + fac = 2.0 + sum = 0.0 + termbf = 0.0 + a2 = -2.0 * alam * alam + for j in range(1, 201): + term = fac * math.exp(a2 * j * j) + sum = sum + term + if math.fabs(term) <= (0.001 * termbf) or math.fabs(term) < (1.0e-8 * sum): + return sum + fac = -fac + termbf = math.fabs(term) + return 1.0 # Get here only if fails to converge; was 0.0!! + + +def lfprob(dfnum, dfden, F): + """ +Returns the (1-tailed) significance level (p-value) of an F +statistic given the degrees of freedom for the numerator (dfR-dfF) and +the degrees of freedom for the denominator (dfF). + +Usage: lfprob(dfnum, dfden, F) where usually dfnum=dfbn, dfden=dfwn +""" + p = betai(0.5 * dfden, 0.5 * dfnum, dfden / float(dfden + dfnum * F)) + return p + + +def lbetacf(a, b, x): + """ +This function evaluates the continued fraction form of the incomplete +Beta function, betai. (Adapted from: Numerical Recipies in C.) 
+ +Usage: lbetacf(a,b,x) +""" + ITMAX = 200 + EPS = 3.0e-7 + + bm = az = am = 1.0 + qab = a + b + qap = a + 1.0 + qam = a - 1.0 + bz = 1.0 - qab * x / qap + for i in range(ITMAX + 1): + em = float(i + 1) + tem = em + em + d = em * (b - em) * x / ((qam + tem) * (a + tem)) + ap = az + d * am + bp = bz + d * bm + d = -(a + em) * (qab + em) * x / ((qap + tem) * (a + tem)) + app = ap + d * az + bpp = bp + d * bz + aold = az + am = ap / bpp + bm = bp / bpp + az = app / bpp + bz = 1.0 + if (abs(az - aold) < (EPS * abs(az))): + return az + print 'a or b too big, or ITMAX too small in Betacf.' + + +def lgammln(xx): + """ +Returns the gamma function of xx. + Gamma(z) = Integral(0,infinity) of t^(z-1)exp(-t) dt. +(Adapted from: Numerical Recipies in C.) + +Usage: lgammln(xx) +""" + + coeff = [76.18009173, -86.50532033, 24.01409822, -1.231739516, 0.120858003e-2, + -0.536382e-5] + x = xx - 1.0 + tmp = x + 5.5 + tmp = tmp - (x + 0.5) * math.log(tmp) + ser = 1.0 + for j in range(len(coeff)): + x = x + 1 + ser = ser + coeff[j] / x + return -tmp + math.log(2.50662827465 * ser) + + +def lbetai(a, b, x): + """ +Returns the incomplete beta function: + + I-sub-x(a,b) = 1/B(a,b)*(Integral(0,x) of t^(a-1)(1-t)^(b-1) dt) + +where a,b>0 and B(a,b) = G(a)*G(b)/(G(a+b)) where G(a) is the gamma +function of a. The continued fraction formulation is implemented here, +using the betacf function. (Adapted from: Numerical Recipies in C.) 
+ +Usage: lbetai(a,b,x) +""" + if (x < 0.0 or x > 1.0): + raise ValueError, 'Bad x in lbetai' + if (x == 0.0 or x == 1.0): + bt = 0.0 + else: + bt = math.exp(gammln(a + b) - gammln(a) - gammln(b) + a * math.log(x) + b * + math.log(1.0 - x)) + if (x < (a + 1.0) / (a + b + 2.0)): + return bt * betacf(a, b, x) / float(a) + else: + return 1.0 - bt * betacf(b, a, 1.0 - x) / float(b) + +#################################### +####### ANOVA CALCULATIONS ####### +#################################### + + +def lF_oneway(*lists): + """ +Performs a 1-way ANOVA, returning an F-value and probability given +any number of groups. From Heiman, pp.394-7. + +Usage: F_oneway(*lists) where *lists is any number of lists, one per + treatment group +Returns: F value, one-tailed p-value +""" + a = len(lists) # ANOVA on 'a' groups, each in it's own list + means = [0] * a + vars = [0] * a + ns = [0] * a + alldata = [] + tmp = map(N.array, lists) + means = map(amean, tmp) + vars = map(avar, tmp) + ns = map(len, lists) + for i in range(len(lists)): + alldata = alldata + lists[i] + alldata = N.array(alldata) + bign = len(alldata) + sstot = ass(alldata) - (asquare_of_sums(alldata) / float(bign)) + ssbn = 0 + for list in lists: + ssbn = ssbn + asquare_of_sums(N.array(list)) / float(len(list)) + ssbn = ssbn - (asquare_of_sums(alldata) / float(bign)) + sswn = sstot - ssbn + dfbn = a - 1 + dfwn = bign - a + msb = ssbn / float(dfbn) + msw = sswn / float(dfwn) + f = msb / msw + prob = fprob(dfbn, dfwn, f) + return f, prob + + +def lF_value(ER, EF, dfnum, dfden): + """ +Returns an F-statistic given the following: + ER = error associated with the null hypothesis (the Restricted model) + EF = error associated with the alternate hypothesis (the Full model) + dfR-dfF = degrees of freedom of the numerator + dfF = degrees of freedom associated with the denominator/Full model + +Usage: lF_value(ER,EF,dfnum,dfden) +""" + return ((ER - EF) / float(dfnum) / (EF / float(dfden))) + 
+#################################### +######## SUPPORT FUNCTIONS ####### +#################################### + + +def writecc(listoflists, file, writetype='w', extra=2): + """ +Writes a list of lists to a file in columns, customized by the max +size of items within the columns (max size of items in col, +2 characters) +to specified file. File-overwrite is the default. + +Usage: writecc (listoflists,file,writetype='w',extra=2) +Returns: None +""" + if type(listoflists[0]) not in [ListType, TupleType]: + listoflists = [listoflists] + outfile = open(file, writetype) + rowstokill = [] + list2print = copy.deepcopy(listoflists) + for i in range(len(listoflists)): + if listoflists[i] == [ + '\n' + ] or listoflists[i] == '\n' or listoflists[i] == 'dashes': + rowstokill = rowstokill + [i] + rowstokill.reverse() + for row in rowstokill: + del list2print[row] + maxsize = [0] * len(list2print[0]) + for col in range(len(list2print[0])): + items = pstat.colex(list2print, col) + items = map(pstat.makestr, items) + maxsize[col] = max(map(len, items)) + extra + for row in listoflists: + if row == ['\n'] or row == '\n': + outfile.write('\n') + elif row == ['dashes'] or row == 'dashes': + dashes = [0] * len(maxsize) + for j in range(len(maxsize)): + dashes[j] = '-' * (maxsize[j] - 2) + outfile.write(pstat.lineincustcols(dashes, maxsize)) + else: + outfile.write(pstat.lineincustcols(row, maxsize)) + outfile.write('\n') + outfile.close() + return None + + +def lincr(l, cap): # to increment a list up to a max-list of 'cap' + """ +Simulate a counting system from an n-dimensional list. 

Usage:   lincr(l,cap)   l=list to increment, cap=max values for each list pos'n
Returns: next set of values for list l, OR -1 (if overflow)
"""
  l[0] = l[0] + 1  # e.g., [0,0,0] --> [2,4,3] (=cap)
  for i in range(len(l)):
    if l[i] > cap[i] and i < len(l) - 1:  # if carryover AND not done
      l[i] = 0
      l[i + 1] = l[i + 1] + 1
    elif l[i] > cap[i] and i == len(l) - 1:  # overflow past last column, must be finished
      l = -1
  return l


def lsum(inlist):
  """
Returns the sum of the items in the passed list.

Usage:   lsum(inlist)
"""
  s = 0
  for item in inlist:
    s = s + item
  return s


def lcumsum(inlist):
  """
Returns a list consisting of the cumulative sum of the items in the
passed list.

Usage:   lcumsum(inlist)
"""
  # Work on a deep copy so the caller's list is not mutated.
  newlist = copy.deepcopy(inlist)
  for i in range(1, len(newlist)):
    newlist[i] = newlist[i] + newlist[i - 1]
  return newlist


def lss(inlist):
  """
Squares each value in the passed list, adds up these squares and
returns the result.

Usage:   lss(inlist)
"""
  ss = 0
  for item in inlist:
    ss = ss + item * item
  return ss


def lsummult(list1, list2):
  """
Multiplies elements in list1 and list2, element by element, and
returns the sum of all resulting multiplications.  Must provide equal
length lists.

Usage:   lsummult(list1,list2)
"""
  if len(list1) <> len(list2):
    raise ValueError, 'Lists not equal length in summult.'
  s = 0
  # pstat.abut pairs the two lists element-by-element.
  for item1, item2 in pstat.abut(list1, list2):
    s = s + item1 * item2
  return s


def lsumdiffsquared(x, y):
  """
Takes pairwise differences of the values in lists x and y, squares
these differences, and returns the sum of these squares.

Usage:   lsumdiffsquared(x,y)
Returns: sum[(x[i]-y[i])**2]
"""
  sds = 0
  for i in range(len(x)):
    sds = sds + (x[i] - y[i])**2
  return sds


def lsquare_of_sums(inlist):
  """
Adds the values in the passed list, squares the sum, and returns
the result.
+ +Usage: lsquare_of_sums(inlist) +Returns: sum(inlist[i])**2 +""" + s = sum(inlist) + return float(s) * s + + +def lshellsort(inlist): + """ +Shellsort algorithm. Sorts a 1D-list. + +Usage: lshellsort(inlist) +Returns: sorted-inlist, sorting-index-vector (for original list) +""" + n = len(inlist) + svec = copy.deepcopy(inlist) + ivec = range(n) + gap = n / 2 # integer division needed + while gap > 0: + for i in range(gap, n): + for j in range(i - gap, -1, -gap): + while j >= 0 and svec[j] > svec[j + gap]: + temp = svec[j] + svec[j] = svec[j + gap] + svec[j + gap] = temp + itemp = ivec[j] + ivec[j] = ivec[j + gap] + ivec[j + gap] = itemp + gap = gap / 2 # integer division needed +# svec is now sorted inlist, and ivec has the order svec[i] = vec[ivec[i]] + return svec, ivec + + +def lrankdata(inlist): + """ +Ranks the data in inlist, dealing with ties appropritely. Assumes +a 1D inlist. Adapted from Gary Perlman's |Stat ranksort. + +Usage: lrankdata(inlist) +Returns: a list of length equal to inlist, containing rank scores +""" + n = len(inlist) + svec, ivec = shellsort(inlist) + sumranks = 0 + dupcount = 0 + newlist = [0] * n + for i in range(n): + sumranks = sumranks + i + dupcount = dupcount + 1 + if i == n - 1 or svec[i] <> svec[i + 1]: + averank = sumranks / float(dupcount) + 1 + for j in range(i - dupcount + 1, i + 1): + newlist[ivec[j]] = averank + sumranks = 0 + dupcount = 0 + return newlist + + +def outputpairedstats(fname, writemode, name1, n1, m1, se1, min1, max1, name2, + n2, m2, se2, min2, max2, statname, stat, prob): + """ +Prints or write to a file stats for two groups, using the name, n, +mean, sterr, min and max for each group, as well as the statistic name, +its value, and the associated p-value. 
+ +Usage: outputpairedstats(fname,writemode, + name1,n1,mean1,stderr1,min1,max1, + name2,n2,mean2,stderr2,min2,max2, + statname,stat,prob) +Returns: None +""" + suffix = '' # for *s after the p-value + try: + x = prob.shape + prob = prob[0] + except: + pass + if prob < 0.001: + suffix = ' ***' + elif prob < 0.01: + suffix = ' **' + elif prob < 0.05: + suffix = ' *' + title = [['Name', 'N', 'Mean', 'SD', 'Min', 'Max']] + lofl = title + [[name1, n1, round(m1, 3), round( + math.sqrt(se1), 3), min1, max1], [name2, n2, round(m2, 3), round( + math.sqrt(se2), 3), min2, max2]] + if type(fname) <> StringType or len(fname) == 0: + print + print statname + print + pstat.printcc(lofl) + print + try: + if stat.shape == (): + stat = stat[0] + if prob.shape == (): + prob = prob[0] + except: + pass + print 'Test statistic = ', round(stat, 3), ' p = ', round(prob, 3), suffix + print + else: + file = open(fname, writemode) + file.write('\n' + statname + '\n\n') + file.close() + writecc(lofl, fname, 'a') + file = open(fname, 'a') + try: + if stat.shape == (): + stat = stat[0] + if prob.shape == (): + prob = prob[0] + except: + pass + file.write(pstat.list2string(['\nTest statistic = ', round(stat, 4), + ' p = ', round(prob, 4), suffix, '\n\n'])) + file.close() + return None + + +def lfindwithin(data): + """ +Returns an integer representing a binary vector, where 1=within- +subject factor, 0=between. Input equals the entire data 2D list (i.e., +column 0=random factor, column -1=measured values (those two are skipped). +Note: input data is in |Stat format ... a list of lists ("2D list") with +one row per measured value, first column=subject identifier, last column= +score, one in-between column per factor (these columns contain level +designations on each factor). See also stats.anova.__doc__. 
+ +Usage: lfindwithin(data) data in |Stat format +""" + + numfact = len(data[0]) - 1 + withinvec = 0 + for col in range(1, numfact): + examplelevel = pstat.unique(pstat.colex(data, col))[0] + rows = pstat.linexand(data, col, examplelevel) # get 1 level of this factor + factsubjs = pstat.unique(pstat.colex(rows, 0)) + allsubjs = pstat.unique(pstat.colex(data, 0)) + if len(factsubjs) == len(allsubjs): # fewer Ss than scores on this factor? + withinvec = withinvec + (1 << col) + return withinvec + +######################################################### +######################################################### +####### DISPATCH LISTS AND TUPLES TO ABOVE FCNS ######### +######################################################### +######################################################### + +## CENTRAL TENDENCY: +geometricmean = Dispatch((lgeometricmean, (ListType, TupleType)),) +harmonicmean = Dispatch((lharmonicmean, (ListType, TupleType)),) +mean = Dispatch((lmean, (ListType, TupleType)),) +median = Dispatch((lmedian, (ListType, TupleType)),) +medianscore = Dispatch((lmedianscore, (ListType, TupleType)),) +mode = Dispatch((lmode, (ListType, TupleType)),) + +## MOMENTS: +moment = Dispatch((lmoment, (ListType, TupleType)),) +variation = Dispatch((lvariation, (ListType, TupleType)),) +skew = Dispatch((lskew, (ListType, TupleType)),) +kurtosis = Dispatch((lkurtosis, (ListType, TupleType)),) +describe = Dispatch((ldescribe, (ListType, TupleType)),) + +## FREQUENCY STATISTICS: +itemfreq = Dispatch((litemfreq, (ListType, TupleType)),) +scoreatpercentile = Dispatch((lscoreatpercentile, (ListType, TupleType)),) +percentileofscore = Dispatch((lpercentileofscore, (ListType, TupleType)),) +histogram = Dispatch((lhistogram, (ListType, TupleType)),) +cumfreq = Dispatch((lcumfreq, (ListType, TupleType)),) +relfreq = Dispatch((lrelfreq, (ListType, TupleType)),) + +## VARIABILITY: +obrientransform = Dispatch((lobrientransform, (ListType, TupleType)),) +samplevar = 
Dispatch((lsamplevar, (ListType, TupleType)),) +samplestdev = Dispatch((lsamplestdev, (ListType, TupleType)),) +var = Dispatch((lvar, (ListType, TupleType)),) +stdev = Dispatch((lstdev, (ListType, TupleType)),) +sterr = Dispatch((lsterr, (ListType, TupleType)),) +sem = Dispatch((lsem, (ListType, TupleType)),) +z = Dispatch((lz, (ListType, TupleType)),) +zs = Dispatch((lzs, (ListType, TupleType)),) + +## TRIMMING FCNS: +trimboth = Dispatch((ltrimboth, (ListType, TupleType)),) +trim1 = Dispatch((ltrim1, (ListType, TupleType)),) + +## CORRELATION FCNS: +paired = Dispatch((lpaired, (ListType, TupleType)),) +pearsonr = Dispatch((lpearsonr, (ListType, TupleType)),) +spearmanr = Dispatch((lspearmanr, (ListType, TupleType)),) +pointbiserialr = Dispatch((lpointbiserialr, (ListType, TupleType)),) +kendalltau = Dispatch((lkendalltau, (ListType, TupleType)),) +linregress = Dispatch((llinregress, (ListType, TupleType)),) + +## INFERENTIAL STATS: +ttest_1samp = Dispatch((lttest_1samp, (ListType, TupleType)),) +ttest_ind = Dispatch((lttest_ind, (ListType, TupleType)),) +ttest_rel = Dispatch((lttest_rel, (ListType, TupleType)),) +chisquare = Dispatch((lchisquare, (ListType, TupleType)),) +ks_2samp = Dispatch((lks_2samp, (ListType, TupleType)),) +mannwhitneyu = Dispatch((lmannwhitneyu, (ListType, TupleType)),) +ranksums = Dispatch((lranksums, (ListType, TupleType)),) +tiecorrect = Dispatch((ltiecorrect, (ListType, TupleType)),) +wilcoxont = Dispatch((lwilcoxont, (ListType, TupleType)),) +kruskalwallish = Dispatch((lkruskalwallish, (ListType, TupleType)),) +friedmanchisquare = Dispatch((lfriedmanchisquare, (ListType, TupleType)),) + +## PROBABILITY CALCS: +chisqprob = Dispatch((lchisqprob, (IntType, FloatType)),) +zprob = Dispatch((lzprob, (IntType, FloatType)),) +ksprob = Dispatch((lksprob, (IntType, FloatType)),) +fprob = Dispatch((lfprob, (IntType, FloatType)),) +betacf = Dispatch((lbetacf, (IntType, FloatType)),) +betai = Dispatch((lbetai, (IntType, FloatType)),) +erfcc = 
Dispatch((lerfcc, (IntType, FloatType)),) +gammln = Dispatch((lgammln, (IntType, FloatType)),) + +## ANOVA FUNCTIONS: +F_oneway = Dispatch((lF_oneway, (ListType, TupleType)),) +F_value = Dispatch((lF_value, (ListType, TupleType)),) + +## SUPPORT FUNCTIONS: +incr = Dispatch((lincr, (ListType, TupleType)),) +sum = Dispatch((lsum, (ListType, TupleType)),) +cumsum = Dispatch((lcumsum, (ListType, TupleType)),) +ss = Dispatch((lss, (ListType, TupleType)),) +summult = Dispatch((lsummult, (ListType, TupleType)),) +square_of_sums = Dispatch((lsquare_of_sums, (ListType, TupleType)),) +sumdiffsquared = Dispatch((lsumdiffsquared, (ListType, TupleType)),) +shellsort = Dispatch((lshellsort, (ListType, TupleType)),) +rankdata = Dispatch((lrankdata, (ListType, TupleType)),) +findwithin = Dispatch((lfindwithin, (ListType, TupleType)),) + +#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== +#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== +#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== +#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== +#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== +#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== +#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== +#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== +#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== +#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== +#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== +#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== +#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== +#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== +#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== +#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS 
=============== +#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== +#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== +#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== + +try: # DEFINE THESE *ONLY* IF NUMERIC IS AVAILABLE + import numpy as N + import numpy.linalg as LA + + ##################################### + ######## ACENTRAL TENDENCY ######## + ##################################### + + + def ageometricmean(inarray, dimension=None, keepdims=0): + """ +Calculates the geometric mean of the values in the passed array. +That is: n-th root of (x1 * x2 * ... * xn). Defaults to ALL values in +the passed array. Use dimension=None to flatten array first. REMEMBER: if +dimension=0, it collapses over dimension 0 ('rows' in a 2D array) only, and +if dimension is a sequence, it collapses over all specified dimensions. If +keepdims is set to 1, the resulting array will have as many dimensions as +inarray, with only 1 'level' per dim that was collapsed over. 
+ +Usage: ageometricmean(inarray,dimension=None,keepdims=0) +Returns: geometric mean computed over dim(s) listed in dimension +""" + inarray = N.array(inarray, N.float_) + if dimension == None: + inarray = N.ravel(inarray) + size = len(inarray) + mult = N.power(inarray, 1.0 / size) + mult = N.multiply.reduce(mult) + elif type(dimension) in [IntType, FloatType]: + size = inarray.shape[dimension] + mult = N.power(inarray, 1.0 / size) + mult = N.multiply.reduce(mult, dimension) + if keepdims == 1: + shp = list(inarray.shape) + shp[dimension] = 1 + sum = N.reshape(sum, shp) + else: # must be a SEQUENCE of dims to average over + dims = list(dimension) + dims.sort() + dims.reverse() + size = N.array(N.multiply.reduce(N.take(inarray.shape, dims)), N.float_) + mult = N.power(inarray, 1.0 / size) + for dim in dims: + mult = N.multiply.reduce(mult, dim) + if keepdims == 1: + shp = list(inarray.shape) + for dim in dims: + shp[dim] = 1 + mult = N.reshape(mult, shp) + return mult + + def aharmonicmean(inarray, dimension=None, keepdims=0): + """ +Calculates the harmonic mean of the values in the passed array. +That is: n / (1/x1 + 1/x2 + ... + 1/xn). Defaults to ALL values in +the passed array. Use dimension=None to flatten array first. REMEMBER: if +dimension=0, it collapses over dimension 0 ('rows' in a 2D array) only, and +if dimension is a sequence, it collapses over all specified dimensions. If +keepdims is set to 1, the resulting array will have as many dimensions as +inarray, with only 1 'level' per dim that was collapsed over. 
+ +Usage: aharmonicmean(inarray,dimension=None,keepdims=0) +Returns: harmonic mean computed over dim(s) in dimension +""" + inarray = inarray.astype(N.float_) + if dimension == None: + inarray = N.ravel(inarray) + size = len(inarray) + s = N.add.reduce(1.0 / inarray) + elif type(dimension) in [IntType, FloatType]: + size = float(inarray.shape[dimension]) + s = N.add.reduce(1.0 / inarray, dimension) + if keepdims == 1: + shp = list(inarray.shape) + shp[dimension] = 1 + s = N.reshape(s, shp) + else: # must be a SEQUENCE of dims to average over + dims = list(dimension) + dims.sort() + nondims = [] + for i in range(len(inarray.shape)): + if i not in dims: + nondims.append(i) + tinarray = N.transpose(inarray, nondims + dims) # put keep-dims first + idx = [0] * len(nondims) + if idx == []: + size = len(N.ravel(inarray)) + s = asum(1.0 / inarray) + if keepdims == 1: + s = N.reshape([s], N.ones(len(inarray.shape))) + else: + idx[0] = -1 + loopcap = N.array(tinarray.shape[0:len(nondims)]) - 1 + s = N.zeros(loopcap + 1, N.float_) + while incr(idx, loopcap) <> -1: + s[idx] = asum(1.0 / tinarray[idx]) + size = N.multiply.reduce(N.take(inarray.shape, dims)) + if keepdims == 1: + shp = list(inarray.shape) + for dim in dims: + shp[dim] = 1 + s = N.reshape(s, shp) + return size / s + + def amean(inarray, dimension=None, keepdims=0): + """ +Calculates the arithmatic mean of the values in the passed array. +That is: 1/n * (x1 + x2 + ... + xn). Defaults to ALL values in the +passed array. Use dimension=None to flatten array first. REMEMBER: if +dimension=0, it collapses over dimension 0 ('rows' in a 2D array) only, and +if dimension is a sequence, it collapses over all specified dimensions. If +keepdims is set to 1, the resulting array will have as many dimensions as +inarray, with only 1 'level' per dim that was collapsed over. 
+ +Usage: amean(inarray,dimension=None,keepdims=0) +Returns: arithematic mean calculated over dim(s) in dimension +""" + if inarray.dtype in [N.int_, N.short, N.ubyte]: + inarray = inarray.astype(N.float_) + if dimension == None: + inarray = N.ravel(inarray) + sum = N.add.reduce(inarray) + denom = float(len(inarray)) + elif type(dimension) in [IntType, FloatType]: + sum = asum(inarray, dimension) + denom = float(inarray.shape[dimension]) + if keepdims == 1: + shp = list(inarray.shape) + shp[dimension] = 1 + sum = N.reshape(sum, shp) + else: # must be a TUPLE of dims to average over + dims = list(dimension) + dims.sort() + dims.reverse() + sum = inarray * 1.0 + for dim in dims: + sum = N.add.reduce(sum, dim) + denom = N.array(N.multiply.reduce(N.take(inarray.shape, dims)), N.float_) + if keepdims == 1: + shp = list(inarray.shape) + for dim in dims: + shp[dim] = 1 + sum = N.reshape(sum, shp) + return sum / denom + + def amedian(inarray, numbins=1000): + """ +Calculates the COMPUTED median value of an array of numbers, given the +number of bins to use for the histogram (more bins approaches finding the +precise median value of the array; default number of bins = 1000). From +G.W. Heiman's Basic Stats, or CRC Probability & Statistics. +NOTE: THIS ROUTINE ALWAYS uses the entire passed array (flattens it first). + +Usage: amedian(inarray,numbins=1000) +Returns: median calculated over ALL values in inarray +""" + inarray = N.ravel(inarray) + (hist, smallest, binsize, extras) = ahistogram(inarray, numbins, + [min(inarray), max(inarray)]) + cumhist = N.cumsum(hist) # make cumulative histogram + otherbins = N.greater_equal(cumhist, len(inarray) / 2.0) + otherbins = list(otherbins) # list of 0/1s, 1s start at median bin + cfbin = otherbins.index(1) # get 1st(!) index holding 50%ile score + LRL = smallest + binsize * cfbin # get lower read limit of that bin + cfbelow = N.add.reduce(hist[0:cfbin]) # cum. freq. 
below bin + freq = hist[cfbin] # frequency IN the 50%ile bin + median = LRL + ( + (len(inarray) / 2.0 - cfbelow) / float(freq)) * binsize # MEDIAN + return median + + def amedianscore(inarray, dimension=None): + """ +Returns the 'middle' score of the passed array. If there is an even +number of scores, the mean of the 2 middle scores is returned. Can function +with 1D arrays, or on the FIRST dimension of 2D arrays (i.e., dimension can +be None, to pre-flatten the array, or else dimension must equal 0). + +Usage: amedianscore(inarray,dimension=None) +Returns: 'middle' score of the array, or the mean of the 2 middle scores +""" + if dimension == None: + inarray = N.ravel(inarray) + dimension = 0 + inarray = N.sort(inarray, dimension) + if inarray.shape[dimension] % 2 == 0: # if even number of elements + indx = inarray.shape[dimension] / 2 # integer division correct + median = N.asarray(inarray[indx] + inarray[indx - 1]) / 2.0 + else: + indx = inarray.shape[dimension] / 2 # integer division correct + median = N.take(inarray, [indx], dimension) + if median.shape == (1,): + median = median[0] + return median + + def amode(a, dimension=None): + """ +Returns an array of the modal (most common) score in the passed array. +If there is more than one such score, ONLY THE FIRST is returned. +The bin-count for the modal values is also returned. Operates on whole +array (dimension=None), or on a given dimension. 
+ +Usage: amode(a, dimension=None) +Returns: array of bin-counts for mode(s), array of corresponding modal values +""" + + if dimension == None: + a = N.ravel(a) + dimension = 0 + scores = pstat.aunique(N.ravel(a)) # get ALL unique values + testshape = list(a.shape) + testshape[dimension] = 1 + oldmostfreq = N.zeros(testshape) + oldcounts = N.zeros(testshape) + for score in scores: + template = N.equal(a, score) + counts = asum(template, dimension, 1) + mostfrequent = N.where(counts > oldcounts, score, oldmostfreq) + oldcounts = N.where(counts > oldcounts, counts, oldcounts) + oldmostfreq = mostfrequent + return oldcounts, mostfrequent + + def atmean(a, limits=None, inclusive=(1, 1)): + """ +Returns the arithmetic mean of all values in an array, ignoring values +strictly outside the sequence passed to 'limits'. Note: either limit +in the sequence, or the value of limits itself, can be set to None. The +inclusive list/tuple determines whether the lower and upper limiting bounds +(respectively) are open/exclusive (0) or closed/inclusive (1). + +Usage: atmean(a,limits=None,inclusive=(1,1)) +""" + if a.dtype in [N.int_, N.short, N.ubyte]: + a = a.astype(N.float_) + if limits == None: + return mean(a) + assert type(limits) in [ListType, TupleType, N.ndarray + ], 'Wrong type for limits in atmean' + if inclusive[0]: + lowerfcn = N.greater_equal + else: + lowerfcn = N.greater + if inclusive[1]: + upperfcn = N.less_equal + else: + upperfcn = N.less + if limits[0] > N.maximum.reduce(N.ravel(a)) or limits[1] < N.minimum.reduce( + N.ravel(a)): + raise ValueError, 'No array values within given limits (atmean).' 
+ elif limits[0] == None and limits[1] <> None: + mask = upperfcn(a, limits[1]) + elif limits[0] <> None and limits[1] == None: + mask = lowerfcn(a, limits[0]) + elif limits[0] <> None and limits[1] <> None: + mask = lowerfcn(a, limits[0]) * upperfcn(a, limits[1]) + s = float(N.add.reduce(N.ravel(a * mask))) + n = float(N.add.reduce(N.ravel(mask))) + return s / n + + def atvar(a, limits=None, inclusive=(1, 1)): + """ +Returns the sample variance of values in an array, (i.e., using N-1), +ignoring values strictly outside the sequence passed to 'limits'. +Note: either limit in the sequence, or the value of limits itself, +can be set to None. The inclusive list/tuple determines whether the lower +and upper limiting bounds (respectively) are open/exclusive (0) or +closed/inclusive (1). ASSUMES A FLAT ARRAY (OR ELSE PREFLATTENS). + +Usage: atvar(a,limits=None,inclusive=(1,1)) +""" + a = a.astype(N.float_) + if limits == None or limits == [None, None]: + return avar(a) + assert type(limits) in [ListType, TupleType, N.ndarray + ], 'Wrong type for limits in atvar' + if inclusive[0]: + lowerfcn = N.greater_equal + else: + lowerfcn = N.greater + if inclusive[1]: + upperfcn = N.less_equal + else: + upperfcn = N.less + if limits[0] > N.maximum.reduce(N.ravel(a)) or limits[1] < N.minimum.reduce( + N.ravel(a)): + raise ValueError, 'No array values within given limits (atvar).' + elif limits[0] == None and limits[1] <> None: + mask = upperfcn(a, limits[1]) + elif limits[0] <> None and limits[1] == None: + mask = lowerfcn(a, limits[0]) + elif limits[0] <> None and limits[1] <> None: + mask = lowerfcn(a, limits[0]) * upperfcn(a, limits[1]) + + a = N.compress(mask, a) # squish out excluded values + return avar(a) + + def atmin(a, lowerlimit=None, dimension=None, inclusive=1): + """ +Returns the minimum value of a, along dimension, including only values less +than (or equal to, if inclusive=1) lowerlimit. If the limit is set to None, +all values in the array are used. 
+ +Usage: atmin(a,lowerlimit=None,dimension=None,inclusive=1) +""" + if inclusive: + lowerfcn = N.greater + else: + lowerfcn = N.greater_equal + if dimension == None: + a = N.ravel(a) + dimension = 0 + if lowerlimit == None: + lowerlimit = N.minimum.reduce(N.ravel(a)) - 11 + biggest = N.maximum.reduce(N.ravel(a)) + ta = N.where(lowerfcn(a, lowerlimit), a, biggest) + return N.minimum.reduce(ta, dimension) + + def atmax(a, upperlimit, dimension=None, inclusive=1): + """ +Returns the maximum value of a, along dimension, including only values greater +than (or equal to, if inclusive=1) upperlimit. If the limit is set to None, +a limit larger than the max value in the array is used. + +Usage: atmax(a,upperlimit,dimension=None,inclusive=1) +""" + if inclusive: + upperfcn = N.less + else: + upperfcn = N.less_equal + if dimension == None: + a = N.ravel(a) + dimension = 0 + if upperlimit == None: + upperlimit = N.maximum.reduce(N.ravel(a)) + 1 + smallest = N.minimum.reduce(N.ravel(a)) + ta = N.where(upperfcn(a, upperlimit), a, smallest) + return N.maximum.reduce(ta, dimension) + + def atstdev(a, limits=None, inclusive=(1, 1)): + """ +Returns the standard deviation of all values in an array, ignoring values +strictly outside the sequence passed to 'limits'. Note: either limit +in the sequence, or the value of limits itself, can be set to None. The +inclusive list/tuple determines whether the lower and upper limiting bounds +(respectively) are open/exclusive (0) or closed/inclusive (1). + +Usage: atstdev(a,limits=None,inclusive=(1,1)) +""" + return N.sqrt(tvar(a, limits, inclusive)) + + def atsem(a, limits=None, inclusive=(1, 1)): + """ +Returns the standard error of the mean for the values in an array, +(i.e., using N for the denominator), ignoring values strictly outside +the sequence passed to 'limits'. Note: either limit in the sequence, +or the value of limits itself, can be set to None. 
The inclusive list/tuple +determines whether the lower and upper limiting bounds (respectively) are +open/exclusive (0) or closed/inclusive (1). + +Usage: atsem(a,limits=None,inclusive=(1,1)) +""" + sd = tstdev(a, limits, inclusive) + if limits == None or limits == [None, None]: + n = float(len(N.ravel(a))) + limits = [min(a) - 1, max(a) + 1] + assert type(limits) in [ListType, TupleType, N.ndarray + ], 'Wrong type for limits in atsem' + if inclusive[0]: + lowerfcn = N.greater_equal + else: + lowerfcn = N.greater + if inclusive[1]: + upperfcn = N.less_equal + else: + upperfcn = N.less + if limits[0] > N.maximum.reduce(N.ravel(a)) or limits[1] < N.minimum.reduce( + N.ravel(a)): + raise ValueError, 'No array values within given limits (atsem).' + elif limits[0] == None and limits[1] <> None: + mask = upperfcn(a, limits[1]) + elif limits[0] <> None and limits[1] == None: + mask = lowerfcn(a, limits[0]) + elif limits[0] <> None and limits[1] <> None: + mask = lowerfcn(a, limits[0]) * upperfcn(a, limits[1]) + term1 = N.add.reduce(N.ravel(a * a * mask)) + n = float(N.add.reduce(N.ravel(mask))) + return sd / math.sqrt(n) + +##################################### +############ AMOMENTS ############# +##################################### + + def amoment(a, moment=1, dimension=None): + """ +Calculates the nth moment about the mean for a sample (defaults to the +1st moment). Generally used to calculate coefficients of skewness and +kurtosis. Dimension can equal None (ravel array first), an integer +(the dimension over which to operate), or a sequence (operate over +multiple dimensions). 
+
+Usage: amoment(a,moment=1,dimension=None)
+Returns: appropriate moment along given dimension
+"""
+    if dimension == None:
+      a = N.ravel(a)
+      dimension = 0
+    if moment == 1:
+      # First central moment is zero by definition; skip the computation.
+      return 0.0
+    else:
+      mn = amean(a, dimension, 1)  # 1=keepdims
+      s = N.power((a - mn), moment)
+      return amean(s, dimension)
+
+  def avariation(a, dimension=None):
+    """
+Returns the coefficient of variation, as defined in CRC Standard
+Probability and Statistics, p.6. Dimension can equal None (ravel array
+first), an integer (the dimension over which to operate), or a
+sequence (operate over multiple dimensions).
+
+Usage: avariation(a,dimension=None)
+"""
+    return 100.0 * asamplestdev(a, dimension) / amean(a, dimension)
+
+  def askew(a, dimension=None):
+    """
+Returns the skewness of a distribution (normal ==> 0.0; >0 means extra
+weight in the right tail, since this computes the third central moment
+over the 1.5 power of the second).  Use askewtest() to see if it's
+close enough.  Dimension can equal None (ravel array first), an integer
+(the dimension over which to operate), or a sequence (operate over
+multiple dimensions).
+
+Usage: askew(a, dimension=None)
+Returns: skew of vals in a along dimension, returning ZERO where all vals equal
+"""
+    denom = N.power(amoment(a, 2, dimension), 1.5)
+    zero = N.equal(denom, 0)
+    if type(denom) == N.ndarray and asum(zero) <> 0:
+      print 'Number of zeros in askew: ', asum(zero)
+    denom = denom + zero  # prevent divide-by-zero
+    return N.where(zero, 0, amoment(a, 3, dimension) / denom)
+
+  def akurtosis(a, dimension=None):
+    """
+Returns the kurtosis of a distribution (normal ==> 3.0; >3 means
+heavier in the tails, and usually more peaked).  Use akurtosistest()
+to see if it's close enough.  Dimension can equal None (ravel array
+first), an integer (the dimension over which to operate), or a
+sequence (operate over multiple dimensions).
+ +Usage: akurtosis(a,dimension=None) +Returns: kurtosis of values in a along dimension, and ZERO where all vals equal +""" + denom = N.power(amoment(a, 2, dimension), 2) + zero = N.equal(denom, 0) + if type(denom) == N.ndarray and asum(zero) <> 0: + print 'Number of zeros in akurtosis: ', asum(zero) + denom = denom + zero # prevent divide-by-zero + return N.where(zero, 0, amoment(a, 4, dimension) / denom) + + def adescribe(inarray, dimension=None): + """ +Returns several descriptive statistics of the passed array. Dimension +can equal None (ravel array first), an integer (the dimension over +which to operate), or a sequence (operate over multiple dimensions). + +Usage: adescribe(inarray,dimension=None) +Returns: n, (min,max), mean, standard deviation, skew, kurtosis +""" + if dimension == None: + inarray = N.ravel(inarray) + dimension = 0 + n = inarray.shape[dimension] + mm = (N.minimum.reduce(inarray), N.maximum.reduce(inarray)) + m = amean(inarray, dimension) + sd = astdev(inarray, dimension) + skew = askew(inarray, dimension) + kurt = akurtosis(inarray, dimension) + return n, mm, m, sd, skew, kurt + +##################################### +######## NORMALITY TESTS ########## +##################################### + + def askewtest(a, dimension=None): + """ +Tests whether the skew is significantly different from a normal +distribution. Dimension can equal None (ravel array first), an +integer (the dimension over which to operate), or a sequence (operate +over multiple dimensions). 
+ +Usage: askewtest(a,dimension=None) +Returns: z-score and 2-tail z-probability +""" + if dimension == None: + a = N.ravel(a) + dimension = 0 + b2 = askew(a, dimension) + n = float(a.shape[dimension]) + y = b2 * N.sqrt(((n + 1) * (n + 3)) / (6.0 * (n - 2))) + beta2 = (3.0 * (n * n + 27 * n - 70) * (n + 1) * + (n + 3)) / ((n - 2.0) * (n + 5) * (n + 7) * (n + 9)) + W2 = -1 + N.sqrt(2 * (beta2 - 1)) + delta = 1 / N.sqrt(N.log(N.sqrt(W2))) + alpha = N.sqrt(2 / (W2 - 1)) + y = N.where(y == 0, 1, y) + Z = delta * N.log(y / alpha + N.sqrt((y / alpha)**2 + 1)) + return Z, (1.0 - zprob(Z)) * 2 + + def akurtosistest(a, dimension=None): + """ +Tests whether a dataset has normal kurtosis (i.e., +kurtosis=3(n-1)/(n+1)) Valid only for n>20. Dimension can equal None +(ravel array first), an integer (the dimension over which to operate), +or a sequence (operate over multiple dimensions). + +Usage: akurtosistest(a,dimension=None) +Returns: z-score and 2-tail z-probability, returns 0 for bad pixels +""" + if dimension == None: + a = N.ravel(a) + dimension = 0 + n = float(a.shape[dimension]) + if n < 20: + print 'akurtosistest only valid for n>=20 ... 
continuing anyway, n=', n + b2 = akurtosis(a, dimension) + E = 3.0 * (n - 1) / (n + 1) + varb2 = 24.0 * n * (n - 2) * (n - 3) / ((n + 1) * (n + 1) * (n + 3) * + (n + 5)) + x = (b2 - E) / N.sqrt(varb2) + sqrtbeta1 = 6.0 * (n * n - 5 * n + 2) / ((n + 7) * (n + 9)) * N.sqrt( + (6.0 * (n + 3) * (n + 5)) / (n * (n - 2) * (n - 3))) + A = 6.0 + 8.0 / sqrtbeta1 * (2.0 / sqrtbeta1 + + N.sqrt(1 + 4.0 / (sqrtbeta1**2))) + term1 = 1 - 2 / (9.0 * A) + denom = 1 + x * N.sqrt(2 / (A - 4.0)) + denom = N.where(N.less(denom, 0), 99, denom) + term2 = N.where( + N.equal(denom, 0), term1, N.power( + (1 - 2.0 / A) / denom, 1 / 3.0)) + Z = (term1 - term2) / N.sqrt(2 / (9.0 * A)) + Z = N.where(N.equal(denom, 99), 0, Z) + return Z, (1.0 - zprob(Z)) * 2 + + def anormaltest(a, dimension=None): + """ +Tests whether skew and/OR kurtosis of dataset differs from normal +curve. Can operate over multiple dimensions. Dimension can equal +None (ravel array first), an integer (the dimension over which to +operate), or a sequence (operate over multiple dimensions). + +Usage: anormaltest(a,dimension=None) +Returns: z-score and 2-tail probability +""" + if dimension == None: + a = N.ravel(a) + dimension = 0 + s, p = askewtest(a, dimension) + k, p = akurtosistest(a, dimension) + k2 = N.power(s, 2) + N.power(k, 2) + return k2, achisqprob(k2, 2) + +##################################### +###### AFREQUENCY FUNCTIONS ####### +##################################### + + def aitemfreq(a): + """ +Returns a 2D array of item frequencies. Column 1 contains item values, +column 2 contains their respective counts. Assumes a 1D array is passed. +@@@sorting OK? 
+ +Usage: aitemfreq(a) +Returns: a 2D frequency table (col [0:n-1]=scores, col n=frequencies) +""" + scores = pstat.aunique(a) + scores = N.sort(scores) + freq = N.zeros(len(scores)) + for i in range(len(scores)): + freq[i] = N.add.reduce(N.equal(a, scores[i])) + return N.array(pstat.aabut(scores, freq)) + + def ascoreatpercentile(inarray, percent): + """ +Usage: ascoreatpercentile(inarray,percent) 0<percent<100 +Returns: score at given percentile, relative to inarray distribution +""" + percent = percent / 100.0 + targetcf = percent * len(inarray) + h, lrl, binsize, extras = histogram(inarray) + cumhist = cumsum(h * 1) + for i in range(len(cumhist)): + if cumhist[i] >= targetcf: + break + score = binsize * ( + (targetcf - cumhist[i - 1]) / float(h[i])) + (lrl + binsize * i) + return score + + def apercentileofscore(inarray, score, histbins=10, defaultlimits=None): + """ +Note: result of this function depends on the values used to histogram +the data(!). + +Usage: apercentileofscore(inarray,score,histbins=10,defaultlimits=None) +Returns: percentile-position of score (0-100) relative to inarray +""" + h, lrl, binsize, extras = histogram(inarray, histbins, defaultlimits) + cumhist = cumsum(h * 1) + i = int((score - lrl) / float(binsize)) + pct = (cumhist[i - 1] + ((score - (lrl + binsize * i)) / float(binsize)) * + h[i]) / float(len(inarray)) * 100 + return pct + + def ahistogram(inarray, numbins=10, defaultlimits=None, printextras=1): + """ +Returns (i) an array of histogram bin counts, (ii) the smallest value +of the histogram binning, and (iii) the bin width (the last 2 are not +necessarily integers). Default number of bins is 10. Defaultlimits +can be None (the routine picks bins spanning all the numbers in the +inarray) or a 2-sequence (lowerlimit, upperlimit). Returns all of the +following: array of bin values, lowerreallimit, binsize, extrapoints. 
+ +Usage: ahistogram(inarray,numbins=10,defaultlimits=None,printextras=1) +Returns: (array of bin counts, bin-minimum, min-width, #-points-outside-range) +""" + inarray = N.ravel(inarray) # flatten any >1D arrays + if (defaultlimits <> None): + lowerreallimit = defaultlimits[0] + upperreallimit = defaultlimits[1] + binsize = (upperreallimit - lowerreallimit) / float(numbins) + else: + Min = N.minimum.reduce(inarray) + Max = N.maximum.reduce(inarray) + estbinwidth = float(Max - Min) / float(numbins) + 1e-6 + binsize = (Max - Min + estbinwidth) / float(numbins) + lowerreallimit = Min - binsize / 2.0 #lower real limit,1st bin + bins = N.zeros(numbins) + extrapoints = 0 + for num in inarray: + try: + if (num - lowerreallimit) < 0: + extrapoints = extrapoints + 1 + else: + bintoincrement = int((num - lowerreallimit) / float(binsize)) + bins[bintoincrement] = bins[bintoincrement] + 1 + except: # point outside lower/upper limits + extrapoints = extrapoints + 1 + if (extrapoints > 0 and printextras == 1): + print '\nPoints outside given histogram range =', extrapoints + return (bins, lowerreallimit, binsize, extrapoints) + + def acumfreq(a, numbins=10, defaultreallimits=None): + """ +Returns a cumulative frequency histogram, using the histogram function. +Defaultreallimits can be None (use all data), or a 2-sequence containing +lower and upper limits on values to include. + +Usage: acumfreq(a,numbins=10,defaultreallimits=None) +Returns: array of cumfreq bin values, lowerreallimit, binsize, extrapoints +""" + h, l, b, e = histogram(a, numbins, defaultreallimits) + cumhist = cumsum(h * 1) + return cumhist, l, b, e + + def arelfreq(a, numbins=10, defaultreallimits=None): + """ +Returns a relative frequency histogram, using the histogram function. +Defaultreallimits can be None (use all data), or a 2-sequence containing +lower and upper limits on values to include. 
+ +Usage: arelfreq(a,numbins=10,defaultreallimits=None) +Returns: array of cumfreq bin values, lowerreallimit, binsize, extrapoints +""" + h, l, b, e = histogram(a, numbins, defaultreallimits) + h = N.array(h / float(a.shape[0])) + return h, l, b, e + +##################################### +###### AVARIABILITY FUNCTIONS ##### +##################################### + + def aobrientransform(*args): + """ +Computes a transform on input data (any number of columns). Used to +test for homogeneity of variance prior to running one-way stats. Each +array in *args is one level of a factor. If an F_oneway() run on the +transformed data and found significant, variances are unequal. From +Maxwell and Delaney, p.112. + +Usage: aobrientransform(*args) *args = 1D arrays, one per level of factor +Returns: transformed data for use in an ANOVA +""" + TINY = 1e-10 + k = len(args) + n = N.zeros(k, N.float_) + v = N.zeros(k, N.float_) + m = N.zeros(k, N.float_) + nargs = [] + for i in range(k): + nargs.append(args[i].astype(N.float_)) + n[i] = float(len(nargs[i])) + v[i] = var(nargs[i]) + m[i] = mean(nargs[i]) + for j in range(k): + for i in range(n[j]): + t1 = (n[j] - 1.5) * n[j] * (nargs[j][i] - m[j])**2 + t2 = 0.5 * v[j] * (n[j] - 1.0) + t3 = (n[j] - 1.0) * (n[j] - 2.0) + nargs[j][i] = (t1 - t2) / float(t3) + check = 1 + for j in range(k): + if v[j] - mean(nargs[j]) > TINY: + check = 0 + if check <> 1: + raise ValueError, 'Lack of convergence in obrientransform.' + else: + return N.array(nargs) + + def asamplevar(inarray, dimension=None, keepdims=0): + """ +Returns the sample standard deviation of the values in the passed +array (i.e., using N). Dimension can equal None (ravel array first), +an integer (the dimension over which to operate), or a sequence +(operate over multiple dimensions). Set keepdims=1 to return an array +with the same number of dimensions as inarray. 
+ +Usage: asamplevar(inarray,dimension=None,keepdims=0) +""" + if dimension == None: + inarray = N.ravel(inarray) + dimension = 0 + if dimension == 1: + mn = amean(inarray, dimension)[:, N.NewAxis] + else: + mn = amean(inarray, dimension, keepdims=1) + deviations = inarray - mn + if type(dimension) == ListType: + n = 1 + for d in dimension: + n = n * inarray.shape[d] + else: + n = inarray.shape[dimension] + svar = ass(deviations, dimension, keepdims) / float(n) + return svar + + def asamplestdev(inarray, dimension=None, keepdims=0): + """ +Returns the sample standard deviation of the values in the passed +array (i.e., using N). Dimension can equal None (ravel array first), +an integer (the dimension over which to operate), or a sequence +(operate over multiple dimensions). Set keepdims=1 to return an array +with the same number of dimensions as inarray. + +Usage: asamplestdev(inarray,dimension=None,keepdims=0) +""" + return N.sqrt(asamplevar(inarray, dimension, keepdims)) + + def asignaltonoise(instack, dimension=0): + """ +Calculates signal-to-noise. Dimension can equal None (ravel array +first), an integer (the dimension over which to operate), or a +sequence (operate over multiple dimensions). + +Usage: asignaltonoise(instack,dimension=0): +Returns: array containing the value of (mean/stdev) along dimension, + or 0 when stdev=0 +""" + m = mean(instack, dimension) + sd = stdev(instack, dimension) + return N.where(sd == 0, 0, m / sd) + + def acov(x, y, dimension=None, keepdims=0): + """ +Returns the estimated covariance of the values in the passed +array (i.e., N-1). Dimension can equal None (ravel array first), an +integer (the dimension over which to operate), or a sequence (operate +over multiple dimensions). Set keepdims=1 to return an array with the +same number of dimensions as inarray. 
+ +Usage: acov(x,y,dimension=None,keepdims=0) +""" + if dimension == None: + x = N.ravel(x) + y = N.ravel(y) + dimension = 0 + xmn = amean(x, dimension, 1) # keepdims + xdeviations = x - xmn + ymn = amean(y, dimension, 1) # keepdims + ydeviations = y - ymn + if type(dimension) == ListType: + n = 1 + for d in dimension: + n = n * x.shape[d] + else: + n = x.shape[dimension] + covar = N.sum(xdeviations * ydeviations) / float(n - 1) + return covar + + def avar(inarray, dimension=None, keepdims=0): + """ +Returns the estimated population variance of the values in the passed +array (i.e., N-1). Dimension can equal None (ravel array first), an +integer (the dimension over which to operate), or a sequence (operate +over multiple dimensions). Set keepdims=1 to return an array with the +same number of dimensions as inarray. + +Usage: avar(inarray,dimension=None,keepdims=0) +""" + if dimension == None: + inarray = N.ravel(inarray) + dimension = 0 + mn = amean(inarray, dimension, 1) + deviations = inarray - mn + if type(dimension) == ListType: + n = 1 + for d in dimension: + n = n * inarray.shape[d] + else: + n = inarray.shape[dimension] + var = ass(deviations, dimension, keepdims) / float(n - 1) + return var + + def astdev(inarray, dimension=None, keepdims=0): + """ +Returns the estimated population standard deviation of the values in +the passed array (i.e., N-1). Dimension can equal None (ravel array +first), an integer (the dimension over which to operate), or a +sequence (operate over multiple dimensions). Set keepdims=1 to return +an array with the same number of dimensions as inarray. + +Usage: astdev(inarray,dimension=None,keepdims=0) +""" + return N.sqrt(avar(inarray, dimension, keepdims)) + + def asterr(inarray, dimension=None, keepdims=0): + """ +Returns the estimated population standard error of the values in the +passed array (i.e., N-1). 
Dimension can equal None (ravel array +first), an integer (the dimension over which to operate), or a +sequence (operate over multiple dimensions). Set keepdims=1 to return +an array with the same number of dimensions as inarray. + +Usage: asterr(inarray,dimension=None,keepdims=0) +""" + if dimension == None: + inarray = N.ravel(inarray) + dimension = 0 + return astdev(inarray, dimension, + keepdims) / float(N.sqrt(inarray.shape[dimension])) + + def asem(inarray, dimension=None, keepdims=0): + """ +Returns the standard error of the mean (i.e., using N) of the values +in the passed array. Dimension can equal None (ravel array first), an +integer (the dimension over which to operate), or a sequence (operate +over multiple dimensions). Set keepdims=1 to return an array with the +same number of dimensions as inarray. + +Usage: asem(inarray,dimension=None, keepdims=0) +""" + if dimension == None: + inarray = N.ravel(inarray) + dimension = 0 + if type(dimension) == ListType: + n = 1 + for d in dimension: + n = n * inarray.shape[d] + else: + n = inarray.shape[dimension] + s = asamplestdev(inarray, dimension, keepdims) / N.sqrt(n - 1) + return s + + def az(a, score): + """ +Returns the z-score of a given input score, given thearray from which +that score came. Not appropriate for population calculations, nor for +arrays > 1D. + +Usage: az(a, score) +""" + z = (score - amean(a)) / asamplestdev(a) + return z + + def azs(a): + """ +Returns a 1D array of z-scores, one for each score in the passed array, +computed relative to the passed array. + +Usage: azs(a) +""" + zscores = [] + for item in a: + zscores.append(z(a, item)) + return N.array(zscores) + + def azmap(scores, compare, dimension=0): + """ +Returns an array of z-scores the shape of scores (e.g., [x,y]), compared to +array passed to compare (e.g., [time,x,y]). Assumes collapsing over dim 0 +of the compare array. 
+ +Usage: azs(scores, compare, dimension=0) +""" + mns = amean(compare, dimension) + sstd = asamplestdev(compare, 0) + return (scores - mns) / sstd + +##################################### +####### ATRIMMING FUNCTIONS ####### +##################################### + +## deleted around() as it's in numpy now + + def athreshold(a, threshmin=None, threshmax=None, newval=0): + """ +Like Numeric.clip() except that values <threshmid or >threshmax are replaced +by newval instead of by threshmin/threshmax (respectively). + +Usage: athreshold(a,threshmin=None,threshmax=None,newval=0) +Returns: a, with values <threshmin or >threshmax replaced with newval +""" + mask = N.zeros(a.shape) + if threshmin <> None: + mask = mask + N.where(a < threshmin, 1, 0) + if threshmax <> None: + mask = mask + N.where(a > threshmax, 1, 0) + mask = N.clip(mask, 0, 1) + return N.where(mask, newval, a) + + def atrimboth(a, proportiontocut): + """ +Slices off the passed proportion of items from BOTH ends of the passed +array (i.e., with proportiontocut=0.1, slices 'leftmost' 10% AND +'rightmost' 10% of scores. You must pre-sort the array if you want +"proper" trimming. Slices off LESS if proportion results in a +non-integer slice index (i.e., conservatively slices off +proportiontocut). + +Usage: atrimboth (a,proportiontocut) +Returns: trimmed version of array a +""" + lowercut = int(proportiontocut * len(a)) + uppercut = len(a) - lowercut + return a[lowercut:uppercut] + + def atrim1(a, proportiontocut, tail='right'): + """ +Slices off the passed proportion of items from ONE end of the passed +array (i.e., if proportiontocut=0.1, slices off 'leftmost' or 'rightmost' +10% of scores). Slices off LESS if proportion results in a non-integer +slice index (i.e., conservatively slices off proportiontocut). 
+ +Usage: atrim1(a,proportiontocut,tail='right') or set tail='left' +Returns: trimmed version of array a +""" + if string.lower(tail) == 'right': + lowercut = 0 + uppercut = len(a) - int(proportiontocut * len(a)) + elif string.lower(tail) == 'left': + lowercut = int(proportiontocut * len(a)) + uppercut = len(a) + return a[lowercut:uppercut] + +##################################### +##### ACORRELATION FUNCTIONS ###### +##################################### + + def acovariance(X): + """ +Computes the covariance matrix of a matrix X. Requires a 2D matrix input. + +Usage: acovariance(X) +Returns: covariance matrix of X +""" + if len(X.shape) <> 2: + raise TypeError, 'acovariance requires 2D matrices' + n = X.shape[0] + mX = amean(X, 0) + return N.dot(N.transpose(X), X) / float(n) - N.multiply.outer(mX, mX) + + def acorrelation(X): + """ +Computes the correlation matrix of a matrix X. Requires a 2D matrix input. + +Usage: acorrelation(X) +Returns: correlation matrix of X +""" + C = acovariance(X) + V = N.diagonal(C) + return C / N.sqrt(N.multiply.outer(V, V)) + + def apaired(x, y): + """ +Interactively determines the type of data in x and y, and then runs the +appropriated statistic for paired group data. 
+ +Usage: apaired(x,y) x,y = the two arrays of values to be compared +Returns: appropriate statistic name, value, and probability +""" + samples = '' + while samples not in ['i', 'r', 'I', 'R', 'c', 'C']: + print '\nIndependent or related samples, or correlation (i,r,c): ', + samples = raw_input() + + if samples in ['i', 'I', 'r', 'R']: + print '\nComparing variances ...', + # USE O'BRIEN'S TEST FOR HOMOGENEITY OF VARIANCE, Maxwell & delaney, p.112 + r = obrientransform(x, y) + f, p = F_oneway(pstat.colex(r, 0), pstat.colex(r, 1)) + if p < 0.05: + vartype = 'unequal, p=' + str(round(p, 4)) + else: + vartype = 'equal' + print vartype + if samples in ['i', 'I']: + if vartype[0] == 'e': + t, p = ttest_ind(x, y, None, 0) + print '\nIndependent samples t-test: ', round(t, 4), round(p, 4) + else: + if len(x) > 20 or len(y) > 20: + z, p = ranksums(x, y) + print '\nRank Sums test (NONparametric, n>20): ', round( + z, 4), round(p, 4) + else: + u, p = mannwhitneyu(x, y) + print '\nMann-Whitney U-test (NONparametric, ns<20): ', round( + u, 4), round(p, 4) + + else: # RELATED SAMPLES + if vartype[0] == 'e': + t, p = ttest_rel(x, y, 0) + print '\nRelated samples t-test: ', round(t, 4), round(p, 4) + else: + t, p = ranksums(x, y) + print '\nWilcoxon T-test (NONparametric): ', round(t, 4), round(p, 4) + else: # CORRELATION ANALYSIS + corrtype = '' + while corrtype not in ['c', 'C', 'r', 'R', 'd', 'D']: + print '\nIs the data Continuous, Ranked, or Dichotomous (c,r,d): ', + corrtype = raw_input() + if corrtype in ['c', 'C']: + m, b, r, p, see = linregress(x, y) + print '\nLinear regression for continuous variables ...' + lol = [ + ['Slope', 'Intercept', 'r', 'Prob', 'SEestimate'], + [round(m, 4), round(b, 4), round(r, 4), round(p, 4), round(see, 4)] + ] + pstat.printcc(lol) + elif corrtype in ['r', 'R']: + r, p = spearmanr(x, y) + print '\nCorrelation for ranked variables ...' 
def dices(x, y):
  """
Calculates Dice's coefficient ... (2*number of common terms)/(number of
terms in x + number of terms in y).  Returns a value between 0
(orthogonal) and 1.

Usage:   dices(x,y)
"""
  # The `sets` module is deprecated (and removed in Python 3); the builtin
  # `set` type (available since 2.4) is a drop-in replacement.
  xs = set(x)
  ys = set(y)
  common = len(xs.intersection(ys))
  total = float(len(xs) + len(ys))
  return 2 * common / total
def apearsonr(x, y, verbose=1):
  """
Calculates a Pearson correlation coefficient and its two-tailed p-value.
Formula taken from Heiman's Basic Statistics for the Behav. Sci (2nd),
p.195.

Usage:   apearsonr(x,y,verbose=1)  where x,y are equal length arrays
Returns: Pearson's r, two-tailed p-value
"""
  TINY = 1.0e-20
  n = len(x)
  xmean = amean(x)  # retained from the original formulation (unused below)
  ymean = amean(y)
  sum_x = N.add.reduce(x)
  sum_y = N.add.reduce(y)
  sum_xy = N.add.reduce(x * y)
  numerator = n * sum_xy - sum_x * sum_y
  denominator = math.sqrt((n * ass(x) - asquare_of_sums(x)) *
                          (n * ass(y) - asquare_of_sums(y)))
  r = numerator / denominator
  # Convert r to a t statistic and get the two-tailed p via the
  # incomplete beta function (TINY guards r == +/-1).
  df = n - 2
  t = r * math.sqrt(df / ((1.0 - r + TINY) * (1.0 + r + TINY)))
  prob = abetai(0.5 * df, 0.5, df / (df + t * t), verbose)
  return r, prob
Taken from Heiman's Basic Statistics for the Behav. +Sci (1st), p.194. + +Usage: apointbiserialr(x,y) where x,y are equal length arrays +Returns: Point-biserial r, two-tailed p-value +""" + TINY = 1e-30 + categories = pstat.aunique(x) + data = pstat.aabut(x, y) + if len(categories) <> 2: + raise ValueError, ('Exactly 2 categories required (in x) for ' + 'pointbiserialr().') + else: # there are 2 categories, continue + codemap = pstat.aabut(categories, N.arange(2)) + recoded = pstat.arecode(data, codemap, 0) + x = pstat.alinexand(data, 0, categories[0]) + y = pstat.alinexand(data, 0, categories[1]) + xmean = amean(pstat.acolex(x, 1)) + ymean = amean(pstat.acolex(y, 1)) + n = len(data) + adjust = math.sqrt((len(x) / float(n)) * (len(y) / float(n))) + rpb = (ymean - xmean) / asamplestdev(pstat.acolex(data, 1)) * adjust + df = n - 2 + t = rpb * math.sqrt(df / ((1.0 - rpb + TINY) * (1.0 + rpb + TINY))) + prob = abetai(0.5 * df, 0.5, df / (df + t * t)) + return rpb, prob + + def akendalltau(x, y): + """ +Calculates Kendall's tau ... correlation of ordinal data. Adapted +from function kendl1 in Numerical Recipies. Needs good test-cases.@@@ + +Usage: akendalltau(x,y) +Returns: Kendall's tau, two-tailed p-value +""" + n1 = 0 + n2 = 0 + iss = 0 + for j in range(len(x) - 1): + for k in range(j, len(y)): + a1 = x[j] - x[k] + a2 = y[j] - y[k] + aa = a1 * a2 + if (aa): # neither array has a tie + n1 = n1 + 1 + n2 = n2 + 1 + if aa > 0: + iss = iss + 1 + else: + iss = iss - 1 + else: + if (a1): + n1 = n1 + 1 + else: + n2 = n2 + 1 + tau = iss / math.sqrt(n1 * n2) + svar = (4.0 * len(x) + 10.0) / (9.0 * len(x) * (len(x) - 1)) + z = tau / math.sqrt(svar) + prob = erfcc(abs(z) / 1.4142136) + return tau, prob + + def alinregress(*args): + """ +Calculates a regression line on two arrays, x and y, corresponding to x,y +pairs. If a single 2D array is passed, alinregress finds dim with 2 levels +and splits data into x,y pairs along that dim. 
def amasslinregress(*args):
  """
Calculates a regression line on one 1D array (x) and one N-D array (y),
regressing every column/slice of y against x at once.

Usage:   amasslinregress(*args)  args = (x, y), or one 2D array of x,y pairs
Returns: slope, intercept, r, two-tailed prob, sterr-of-the-estimate, n
"""
  TINY = 1.0e-20
  # Unpack: either a single sequence holding (x, y), or x and y separately.
  if len(args) == 1:  # more than 1D array?
    args = args[0]
    if len(args) == 2:
      x = N.ravel(args[0])
      y = args[1]
    else:
      x = N.ravel(args[:, 0])
      y = args[:, 1]
  else:
    x = args[0]
    y = args[1]
  x = x.astype(N.float_)
  y = y.astype(N.float_)
  n = len(x)
  xmean = amean(x)
  ymean = amean(y, 0)
  # Reshape x to (len(x), 1, 1, ...) so it broadcasts against y along dim 0.
  shp = N.ones(len(y.shape))
  shp[0] = len(x)
  x.shape = shp
  print x.shape, y.shape  # NOTE(review): leftover debug print -- consider removing
  # Per-column Pearson r numerator/denominator.
  r_num = n * (N.add.reduce(x * y, 0)) - N.add.reduce(x) * N.add.reduce(y, 0)
  r_den = N.sqrt((n * ass(x) - asquare_of_sums(x)) *
                 (n * ass(y, 0) - asquare_of_sums(y, 0)))
  zerodivproblem = N.equal(r_den, 0)
  r_den = N.where(zerodivproblem, 1, r_den
                 )  # avoid zero-division in 1st place
  r = r_num / r_den  # need to do this nicely for matrix division
  r = N.where(zerodivproblem, 0.0, r)  # r = 0 wherever the denominator was 0
  z = 0.5 * N.log((1.0 + r + TINY) / (1.0 - r + TINY))  # Fisher z (unused)
  df = n - 2
  t = r * N.sqrt(df / ((1.0 - r + TINY) * (1.0 + r + TINY)))
  prob = abetai(0.5 * df, 0.5, df / (df + t * t))

  # Slope/intercept from the normal equations; guard zero x-variance.
  ss = float(n) * ass(x) - asquare_of_sums(x)
  s_den = N.where(ss == 0, 1, ss)  # avoid zero-division in 1st place
  slope = r_num / s_den
  intercept = ymean - slope * xmean
  sterrest = N.sqrt(1 - r * r) * asamplestdev(y, 0)
  return slope, intercept, r, prob, sterrest, n
def attest_ind(a,
               b,
               dimension=None,
               printit=0,
               name1='Samp1',
               name2='Samp2',
               writemode='a'):
  """
Calculates the t-obtained T-test on TWO INDEPENDENT samples of scores
a, and b.  From Numerical Recipies, p.483.  Works elementwise over N-D
arrays.  If printit=1, results are printed to the screen.  If
printit='filename', the results are output to 'filename' using the
given writemode (default=append) and the function returns None.
Dimension can equal None (ravel array first), or an integer (the
dimension over which to operate on a and b).

Usage:   attest_ind (a,b,dimension=None,printit=0,
                     Name1='Samp1',Name2='Samp2',writemode='a')
Returns: t-value, two-tailed p-value (None when printit is used)
"""
  if dimension == None:  # ravel both inputs, then compare along dim 0
    a = N.ravel(a)
    b = N.ravel(b)
    dimension = 0
  x1 = amean(a, dimension)  # group means
  x2 = amean(b, dimension)
  v1 = avar(a, dimension)  # group variances
  v2 = avar(b, dimension)
  n1 = a.shape[dimension]  # group sizes
  n2 = b.shape[dimension]
  df = n1 + n2 - 2
  # Pooled variance estimate.
  svar = ((n1 - 1) * v1 + (n2 - 1) * v2) / float(df)
  zerodivproblem = N.equal(svar, 0)
  svar = N.where(zerodivproblem, 1, svar)  # avoid zero-division in 1st place
  t = (x1 - x2) / N.sqrt(svar *
                         (1.0 / n1 + 1.0 / n2))  # N-D COMPUTATION HERE!!!!!!
  t = N.where(zerodivproblem, 1.0, t)  # replace NaN/wrong t-values with 1.0
  probs = abetai(0.5 * df, 0.5, float(df) / (df + t * t))

  if type(t) == N.ndarray:
    probs = N.reshape(probs, t.shape)
  if probs.shape == (1,):
    probs = probs[0]  # unwrap single-element result to a scalar

  if printit <> 0:
    if type(t) == N.ndarray:
      t = t[0]
    if type(probs) == N.ndarray:
      probs = probs[0]
    statname = 'Independent samples T-test.'
    outputpairedstats(printit, writemode, name1, n1, x1, v1,
                      N.minimum.reduce(N.ravel(a)),
                      N.maximum.reduce(N.ravel(a)), name2, n2, x2, v2,
                      N.minimum.reduce(N.ravel(b)),
                      N.maximum.reduce(N.ravel(b)), statname, t, probs)
    return  # printing path intentionally returns None
  return t, probs
If printit='filename', the results are output +to 'filename' using the given writemode (default=append). Dimension +can equal None (ravel array first), or an integer (the dimension over +which to operate on a and b). + +Usage: attest_rel(a,b,dimension=None,printit=0, + name1='Samp1',name2='Samp2',writemode='a') +Returns: t-value, two-tailed p-value +""" + if dimension == None: + a = N.ravel(a) + b = N.ravel(b) + dimension = 0 + if len(a) <> len(b): + raise ValueError, 'Unequal length arrays.' + x1 = amean(a, dimension) + x2 = amean(b, dimension) + v1 = avar(a, dimension) + v2 = avar(b, dimension) + n = a.shape[dimension] + df = float(n - 1) + d = (a - b).astype('d') + + denom = N.sqrt( + (n * N.add.reduce(d * d, dimension) - N.add.reduce(d, dimension)**2) / + df) + zerodivproblem = N.equal(denom, 0) + denom = N.where(zerodivproblem, 1, denom + ) # avoid zero-division in 1st place + t = N.add.reduce(d, dimension) / denom # N-D COMPUTATION HERE!!!!!! + t = N.where(zerodivproblem, 1.0, t) # replace NaN/wrong t-values with 1.0 + probs = abetai(0.5 * df, 0.5, float(df) / (df + t * t)) + if type(t) == N.ndarray: + probs = N.reshape(probs, t.shape) + if probs.shape == (1,): + probs = probs[0] + + if printit <> 0: + statname = 'Related samples T-test.' + outputpairedstats(printit, writemode, name1, n, x1, v1, + N.minimum.reduce(N.ravel(a)), + N.maximum.reduce(N.ravel(a)), name2, n, x2, v2, + N.minimum.reduce(N.ravel(b)), + N.maximum.reduce(N.ravel(b)), statname, t, probs) + return + return t, probs + + def achisquare(f_obs, f_exp=None): + """ +Calculates a one-way chi square for array of observed frequencies and returns +the result. If no expected frequencies are given, the total N is assumed to +be equally distributed across all groups. +@@@NOT RIGHT?? + +Usage: achisquare(f_obs, f_exp=None) f_obs = array of observed cell freq. 
+Returns: chisquare-statistic, associated p-value +""" + + k = len(f_obs) + if f_exp == None: + f_exp = N.array([sum(f_obs) / float(k)] * len(f_obs), N.float_) + f_exp = f_exp.astype(N.float_) + chisq = N.add.reduce((f_obs - f_exp)**2 / f_exp) + return chisq, achisqprob(chisq, k - 1) + + def aks_2samp(data1, data2): + """ +Computes the Kolmogorov-Smirnof statistic on 2 samples. Modified from +Numerical Recipies in C, page 493. Returns KS D-value, prob. Not ufunc- +like. + +Usage: aks_2samp(data1,data2) where data1 and data2 are 1D arrays +Returns: KS D-value, p-value +""" + j1 = 0 # N.zeros(data1.shape[1:]) TRIED TO MAKE THIS UFUNC-LIKE + j2 = 0 # N.zeros(data2.shape[1:]) + fn1 = 0.0 # N.zeros(data1.shape[1:],N.float_) + fn2 = 0.0 # N.zeros(data2.shape[1:],N.float_) + n1 = data1.shape[0] + n2 = data2.shape[0] + en1 = n1 * 1 + en2 = n2 * 1 + d = N.zeros(data1.shape[1:], N.float_) + data1 = N.sort(data1, 0) + data2 = N.sort(data2, 0) + while j1 < n1 and j2 < n2: + d1 = data1[j1] + d2 = data2[j2] + if d1 <= d2: + fn1 = (j1) / float(en1) + j1 = j1 + 1 + if d2 <= d1: + fn2 = (j2) / float(en2) + j2 = j2 + 1 + dt = (fn2 - fn1) + if abs(dt) > abs(d): + d = dt +# try: + en = math.sqrt(en1 * en2 / float(en1 + en2)) + prob = aksprob((en + 0.12 + 0.11 / en) * N.fabs(d)) + # except: + # prob = 1.0 + return d, prob + + def amannwhitneyu(x, y): + """ +Calculates a Mann-Whitney U statistic on the provided scores and +returns the result. Use only when the n in each condition is < 20 and +you have 2 independent samples of ranks. REMEMBER: Mann-Whitney U is +significant if the u-obtained is LESS THAN or equal to the critical +value of U. 
def atiecorrect(rankvals):
  """
Tie-corrector for ties in Mann Whitney U and Kruskal Wallis H tests.
See Siegel, S. (1956) Nonparametric Statistics for the Behavioral
Sciences.  New York: McGraw-Hill.  Code adapted from |Stat rankind.c
code.

Usage:   atiecorrect(rankvals)
Returns: T correction factor for U or H
"""
  ranks, posn = ashellsort(N.array(rankvals))
  n = len(ranks)
  correction = 0.0
  idx = 0
  while idx < n - 1:
    if ranks[idx] != ranks[idx + 1]:
      idx = idx + 1
      continue
    # Found a run of tied values: measure its length.
    nties = 1
    while idx < n - 1 and ranks[idx] == ranks[idx + 1]:
      nties = nties + 1
      idx = idx + 1
    correction = correction + nties**3 - nties
    idx = idx + 1
  correction = correction / float(n**3 - n)
  return 1.0 - correction
def awilcoxont(x, y):
  """
Calculates the Wilcoxon T-test for related samples and returns the
result.  A non-parametric T-test.

Usage:   awilcoxont(x,y)  where x,y are equal-length arrays for 2 conditions
Returns: t-statistic, two-tailed p-value
Raises:  ValueError if x and y differ in length
"""
  # `!=` / raise-call syntax replace py2-only `<>` / `raise E, msg`.
  if len(x) != len(y):
    raise ValueError('Unequal N in awilcoxont.  Aborting.')
  d = x - y
  d = N.compress(N.not_equal(d, 0), d)  # Keep all non-zero differences
  count = len(d)
  absd = abs(d)
  absranked = arankdata(absd)
  r_plus = 0.0  # rank sums of positive/negative differences
  r_minus = 0.0
  for i in range(len(absd)):
    if d[i] < 0:
      r_minus = r_minus + absranked[i]
    else:
      r_plus = r_plus + absranked[i]
  wt = min(r_plus, r_minus)
  # Normal approximation for the distribution of T.
  mn = count * (count + 1) * 0.25
  se = math.sqrt(count * (count + 1) * (2.0 * count + 1.0) / 24.0)
  # The original computed this line twice in a row; once is enough.
  z = math.fabs(wt - mn) / se
  prob = 2 * (1.0 - zprob(abs(z)))
  return wt, prob
def afriedmanchisquare(*args):
  """
Friedman Chi-Square is a non-parametric, one-way within-subjects
ANOVA.  This function calculates the Friedman Chi-square test for
repeated measures and returns the result, along with the associated
probability value.  It assumes 3 or more repeated measures.  Only 3
levels requires a minimum of 10 subjects in the study.  Four levels
requires 5 subjects per level(??).

Usage:   afriedmanchisquare(*args)   args are separate arrays for 2+ conditions
Returns: chi-square statistic, associated p-value
"""
  k = len(args)  # number of repeated-measure conditions
  if k < 3:
    raise ValueError, ('\nLess than 3 levels.  Friedman test not '
                       'appropriate.\n')
  n = len(args[0])  # number of subjects
  data = apply(pstat.aabut, args)  # subjects x conditions matrix
  data = data.astype(N.float_)
  # Rank each subject's scores across the k conditions.
  for i in range(len(data)):
    data[i] = arankdata(data[i])
  # NOTE(review): ssbn is computed from the raw `args`, not the ranked
  # `data` built just above -- looks suspicious; verify against the
  # list-based lfriedmanchisquare, which sums ranks.
  ssbn = asum(asum(args, 1)**2)
  chisq = 12.0 / (k * n * (k + 1)) * ssbn - 3 * n * (k + 1)
  return chisq, achisqprob(chisq, k - 1)
even: + z = N.ones(probs.shape, N.float_) + e = N.ones(probs.shape, N.float_) + else: + z = 0.5 * N.ones(probs.shape, N.float_) + e = 1.0 / N.sqrt(N.pi) / N.sqrt(a) * N.ones(probs.shape, N.float_) + c = 0.0 + mask = N.zeros(probs.shape) + a_notbig_frozen = -1 * N.ones(probs.shape, N.float_) + while asum(mask) <> totalelements: + e = e * (a / z.astype(N.float_)) + c = c + e + z = z + 1.0 + # print '#2', z, e, c, s, c*y+s2 + newmask = N.greater(z, chisq) + a_notbig_frozen = N.where(newmask * N.equal(mask, 0) * (1 - a_big), + c * y + s2, a_notbig_frozen) + mask = N.clip(newmask + mask, 0, 1) + probs = N.where( + N.equal(probs, 1), 1, N.where( + N.greater(a, BIG), a_big_frozen, a_notbig_frozen)) + return probs + else: + return s + + def aerfcc(x): + """ +Returns the complementary error function erfc(x) with fractional error +everywhere less than 1.2e-7. Adapted from Numerical Recipies. Can +handle multiple dimensions. + +Usage: aerfcc(x) +""" + z = abs(x) + t = 1.0 / (1.0 + 0.5 * z) + ans = t * N.exp(-z * z - 1.26551223 + t * (1.00002368 + t * ( + 0.37409196 + t * (0.09678418 + t * (-0.18628806 + t * ( + 0.27886807 + t * (-1.13520398 + t * (1.48851587 + t * ( + -0.82215223 + t * 0.17087277))))))))) + return N.where(N.greater_equal(x, 0), ans, 2.0 - ans) + + def azprob(z): + """ +Returns the area under the normal curve 'to the left of' the given z value. +Thus, + for z<0, zprob(z) = 1-tail probability + for z>0, 1.0-zprob(z) = 1-tail probability + for any z, 2.0*(1.0-zprob(abs(z))) = 2-tail probability +Adapted from z.c in Gary Perlman's |Stat. Can handle multiple dimensions. 
+ +Usage: azprob(z) where z is a z-value +""" + + def yfunc(y): + x = ((((((( + ((((((-0.000045255659 * y + 0.000152529290) * y - 0.000019538132) * y + - 0.000676904986) * y + 0.001390604284) * y - 0.000794620820) * y + - 0.002034254874) * y + 0.006549791214) * y - 0.010557625006) * y + + 0.011630447319) * y - 0.009279453341) * y + 0.005353579108) * y - + 0.002141268741) * y + 0.000535310849) * y + 0.999936657524 + return x + + def wfunc(w): + x = ((((((((0.000124818987 * w - 0.001075204047) * w + 0.005198775019) * w + - 0.019198292004) * w + 0.059054035642) * w - 0.151968751364) * + w + 0.319152932694) * w - 0.531923007300) * w + + 0.797884560593) * N.sqrt(w) * 2.0 + return x + + Z_MAX = 6.0 # maximum meaningful z-value + x = N.zeros(z.shape, N.float_) # initialize + y = 0.5 * N.fabs(z) + x = N.where(N.less(y, 1.0), wfunc(y * y), yfunc(y - 2.0)) # get x's + x = N.where(N.greater(y, Z_MAX * 0.5), 1.0, x) # kill those with big Z + prob = N.where(N.greater(z, 0), (x + 1) * 0.5, (1 - x) * 0.5) + return prob + + def aksprob(alam): + """ +Returns the probability value for a K-S statistic computed via ks_2samp. +Adapted from Numerical Recipies. Can handle multiple dimensions. + +Usage: aksprob(alam) +""" + if type(alam) == N.ndarray: + frozen = -1 * N.ones(alam.shape, N.float64) + alam = alam.astype(N.float64) + arrayflag = 1 + else: + frozen = N.array(-1.) 
def afprob(dfnum, dfden, F):
  """
Returns the 1-tailed significance level (p-value) of an F statistic
given the degrees of freedom for the numerator (dfR-dfF) and the degrees
of freedom for the denominator (dfF).  Can handle multiple dims for F.

Usage:   afprob(dfnum, dfden, F)   where usually dfnum=dfbn, dfden=dfwn
"""
  # Transform F into the x argument of the incomplete beta function,
  # keeping the array/scalar distinction of the original.
  if type(F) == N.ndarray:
    x = dfden / (1.0 * dfden + dfnum * F)
  else:
    x = dfden / float(dfden + dfnum * F)
  return abetai(0.5 * dfden, 0.5 * dfnum, x)
def agammln(xx):
  """
Returns the natural log of the gamma function of xx.
    Gamma(z) = Integral(0,infinity) of t^(z-1)exp(-t) dt.
Adapted from: Numerical Recipies in C (Lanczos series approximation).
Can handle multiple dims ... but probably doesn't normally have to.

Usage:   agammln(xx)
"""
  lanczos = [76.18009173, -86.50532033, 24.01409822, -1.231739516,
             0.120858003e-2, -0.536382e-5]
  x = xx - 1.0
  tmp = x + 5.5
  tmp = tmp - (x + 0.5) * N.log(tmp)
  # Accumulate the series in the same order as the original loop.
  ser = 1.0
  for coeff in lanczos:
    x = x + 1
    ser = ser + coeff / x
  return N.log(2.50662827465 * ser) - tmp
The continued fraction formulation is implemented +here, using the betacf function. (Adapted from: Numerical Recipies in +C.) Can handle multiple dimensions. + +Usage: abetai(a,b,x,verbose=1) +""" + TINY = 1e-15 + if type(a) == N.ndarray: + if asum(N.less(x, 0) + N.greater(x, 1)) <> 0: + raise ValueError, 'Bad x in abetai' + x = N.where(N.equal(x, 0), TINY, x) + x = N.where(N.equal(x, 1.0), 1 - TINY, x) + + bt = N.where(N.equal(x, 0) + N.equal(x, 1), 0, -1) + exponents = (gammln(a + b) - gammln(a) - gammln(b) + a * N.log(x) + b * + N.log(1.0 - x)) + # 746 (below) is the MAX POSSIBLE BEFORE OVERFLOW + exponents = N.where(N.less(exponents, -740), -740, exponents) + bt = N.exp(exponents) + if type(x) == N.ndarray: + ans = N.where( + N.less(x, (a + 1) / (a + b + 2.0)), bt * abetacf(a, b, x, verbose) / + float(a), 1.0 - bt * abetacf(b, a, 1.0 - x, verbose) / float(b)) + else: + if x < (a + 1) / (a + b + 2.0): + ans = bt * abetacf(a, b, x, verbose) / float(a) + else: + ans = 1.0 - bt * abetacf(b, a, 1.0 - x, verbose) / float(b) + return ans + +##################################### +####### AANOVA CALCULATIONS ####### +##################################### + + import numpy.linalg, operator + LA = numpy.linalg + + def aglm(data, para): + """ +Calculates a linear model fit ... anova/ancova/lin-regress/t-test/etc. Taken +from: + Peterson et al. Statistical limitations in functional neuroimaging + I. Non-inferential methods and statistical models. Phil Trans Royal Soc + Lond B 354: 1239-1260. + +Usage: aglm(data,para) +Returns: statistic, p-value ??? +""" + if len(para) <> len(data): + print 'data and para must be same length in aglm' + return + n = len(para) + p = pstat.aunique(para) + x = N.zeros((n, len(p))) # design matrix + for l in range(len(p)): + x[:, l] = N.equal(para, p[l]) + b = N.dot( + N.dot( + LA.inv(N.dot( + N.transpose(x), x)), # i.e., b=inv(X'X)X'Y + N.transpose(x)), + data) + diffs = (data - N.dot(x, b)) + s_sq = 1. 
/ (n - len(p)) * N.dot(N.transpose(diffs), diffs) + + if len(p) == 2: # ttest_ind + c = N.array([1, -1]) + df = n - 2 + fact = asum(1.0 / asum(x, 0)) # i.e., 1/n1 + 1/n2 + 1/n3 ... + t = N.dot(c, b) / N.sqrt(s_sq * fact) + probs = abetai(0.5 * df, 0.5, float(df) / (df + t * t)) + return t, probs + + def aF_oneway(*args): + """ +Performs a 1-way ANOVA, returning an F-value and probability given +any number of groups. From Heiman, pp.394-7. + +Usage: aF_oneway (*args) where *args is 2 or more arrays, one per + treatment group +Returns: f-value, probability +""" + na = len(args) # ANOVA on 'na' groups, each in it's own array + means = [0] * na + vars = [0] * na + ns = [0] * na + alldata = [] + tmp = map(N.array, args) + means = map(amean, tmp) + vars = map(avar, tmp) + ns = map(len, args) + alldata = N.concatenate(args) + bign = len(alldata) + sstot = ass(alldata) - (asquare_of_sums(alldata) / float(bign)) + ssbn = 0 + for a in args: + ssbn = ssbn + asquare_of_sums(N.array(a)) / float(len(a)) + ssbn = ssbn - (asquare_of_sums(alldata) / float(bign)) + sswn = sstot - ssbn + dfbn = na - 1 + dfwn = bign - na + msb = ssbn / float(dfbn) + msw = sswn / float(dfwn) + f = msb / msw + prob = fprob(dfbn, dfwn, f) + return f, prob + + def aF_value(ER, EF, dfR, dfF): + """ +Returns an F-statistic given the following: + ER = error associated with the null hypothesis (the Restricted model) + EF = error associated with the alternate hypothesis (the Full model) + dfR = degrees of freedom the Restricted model + dfF = degrees of freedom associated with the Restricted model +""" + return ((ER - EF) / float(dfR - dfF) / (EF / float(dfF))) + + def outputfstats(Enum, Eden, dfnum, dfden, f, prob): + Enum = round(Enum, 3) + Eden = round(Eden, 3) + dfnum = round(Enum, 3) + dfden = round(dfden, 3) + f = round(f, 3) + prob = round(prob, 3) + suffix = '' # for *s after the p-value + if prob < 0.001: + suffix = ' ***' + elif prob < 0.01: + suffix = ' **' + elif prob < 0.05: + suffix = ' *' + title 
= [['EF/ER', 'DF', 'Mean Square', 'F-value', 'prob', '']] + lofl = title + [[Enum, dfnum, round(Enum / float(dfnum), 3), f, prob, suffix + ], [Eden, dfden, round(Eden / float(dfden), 3), '', '', '']] + pstat.printcc(lofl) + return + + def F_value_multivariate(ER, EF, dfnum, dfden): + """ +Returns an F-statistic given the following: + ER = error associated with the null hypothesis (the Restricted model) + EF = error associated with the alternate hypothesis (the Full model) + dfR = degrees of freedom the Restricted model + dfF = degrees of freedom associated with the Restricted model +where ER and EF are matrices from a multivariate F calculation. +""" + if type(ER) in [IntType, FloatType]: + ER = N.array([[ER]]) + if type(EF) in [IntType, FloatType]: + EF = N.array([[EF]]) + n_um = (LA.det(ER) - LA.det(EF)) / float(dfnum) + d_en = LA.det(EF) / float(dfden) + return n_um / d_en + +##################################### +####### ASUPPORT FUNCTIONS ######## +##################################### + + def asign(a): + """ +Usage: asign(a) +Returns: array shape of a, with -1 where a<0 and +1 where a>=0 +""" + a = N.asarray(a) + if ((type(a) == type(1.4)) or (type(a) == type(1))): + return a - a - N.less(a, 0) + N.greater(a, 0) + else: + return N.zeros(N.shape(a)) - N.less(a, 0) + N.greater(a, 0) + + def asum(a, dimension=None, keepdims=0): + """ +An alternative to the Numeric.add.reduce function, which allows one to +(1) collapse over multiple dimensions at once, and/or (2) to retain +all dimensions in the original array (squashing one down to size. +Dimension can equal None (ravel array first), an integer (the +dimension over which to operate), or a sequence (operate over multiple +dimensions). If keepdims=1, the resulting array will have as many +dimensions as the input array. 
+ +Usage: asum(a, dimension=None, keepdims=0) +Returns: array summed along 'dimension'(s), same _number_ of dims if keepdims=1 +""" + if type(a) == N.ndarray and a.dtype in [N.int_, N.short, N.ubyte]: + a = a.astype(N.float_) + if dimension == None: + s = N.sum(N.ravel(a)) + elif type(dimension) in [IntType, FloatType]: + s = N.add.reduce(a, dimension) + if keepdims == 1: + shp = list(a.shape) + shp[dimension] = 1 + s = N.reshape(s, shp) + else: # must be a SEQUENCE of dims to sum over + dims = list(dimension) + dims.sort() + dims.reverse() + s = a * 1.0 + for dim in dims: + s = N.add.reduce(s, dim) + if keepdims == 1: + shp = list(a.shape) + for dim in dims: + shp[dim] = 1 + s = N.reshape(s, shp) + return s + + def acumsum(a, dimension=None): + """ +Returns an array consisting of the cumulative sum of the items in the +passed array. Dimension can equal None (ravel array first), an +integer (the dimension over which to operate), or a sequence (operate +over multiple dimensions, but this last one just barely makes sense). + +Usage: acumsum(a,dimension=None) +""" + if dimension == None: + a = N.ravel(a) + dimension = 0 + if type(dimension) in [ListType, TupleType, N.ndarray]: + dimension = list(dimension) + dimension.sort() + dimension.reverse() + for d in dimension: + a = N.add.accumulate(a, d) + return a + else: + return N.add.accumulate(a, dimension) + + def ass(inarray, dimension=None, keepdims=0): + """ +Squares each value in the passed array, adds these squares & returns +the result. Unfortunate function name. :-) Defaults to ALL values in +the array. Dimension can equal None (ravel array first), an integer +(the dimension over which to operate), or a sequence (operate over +multiple dimensions). Set keepdims=1 to maintain the original number +of dimensions. 
+ +Usage: ass(inarray, dimension=None, keepdims=0) +Returns: sum-along-'dimension' for (inarray*inarray) +""" + if dimension == None: + inarray = N.ravel(inarray) + dimension = 0 + return asum(inarray * inarray, dimension, keepdims) + + def asummult(array1, array2, dimension=None, keepdims=0): + """ +Multiplies elements in array1 and array2, element by element, and +returns the sum (along 'dimension') of all resulting multiplications. +Dimension can equal None (ravel array first), an integer (the +dimension over which to operate), or a sequence (operate over multiple +dimensions). A trivial function, but included for completeness. + +Usage: asummult(array1,array2,dimension=None,keepdims=0) +""" + if dimension == None: + array1 = N.ravel(array1) + array2 = N.ravel(array2) + dimension = 0 + return asum(array1 * array2, dimension, keepdims) + + def asquare_of_sums(inarray, dimension=None, keepdims=0): + """ +Adds the values in the passed array, squares that sum, and returns the +result. Dimension can equal None (ravel array first), an integer (the +dimension over which to operate), or a sequence (operate over multiple +dimensions). If keepdims=1, the returned array will have the same +NUMBER of dimensions as the original. + +Usage: asquare_of_sums(inarray, dimension=None, keepdims=0) +Returns: the square of the sum over dim(s) in dimension +""" + if dimension == None: + inarray = N.ravel(inarray) + dimension = 0 + s = asum(inarray, dimension, keepdims) + if type(s) == N.ndarray: + return s.astype(N.float_) * s + else: + return float(s) * s + + def asumdiffsquared(a, b, dimension=None, keepdims=0): + """ +Takes pairwise differences of the values in arrays a and b, squares +these differences, and returns the sum of these squares. Dimension +can equal None (ravel array first), an integer (the dimension over +which to operate), or a sequence (operate over multiple dimensions). 
+keepdims=1 means the return shape = len(a.shape) = len(b.shape) + +Usage: asumdiffsquared(a,b) +Returns: sum[ravel(a-b)**2] +""" + if dimension == None: + inarray = N.ravel(a) + dimension = 0 + return asum((a - b)**2, dimension, keepdims) + + def ashellsort(inarray): + """ +Shellsort algorithm. Sorts a 1D-array. + +Usage: ashellsort(inarray) +Returns: sorted-inarray, sorting-index-vector (for original array) +""" + n = len(inarray) + svec = inarray * 1.0 + ivec = range(n) + gap = n / 2 # integer division needed + while gap > 0: + for i in range(gap, n): + for j in range(i - gap, -1, -gap): + while j >= 0 and svec[j] > svec[j + gap]: + temp = svec[j] + svec[j] = svec[j + gap] + svec[j + gap] = temp + itemp = ivec[j] + ivec[j] = ivec[j + gap] + ivec[j + gap] = itemp + gap = gap / 2 # integer division needed +# svec is now sorted input vector, ivec has the order svec[i] = vec[ivec[i]] + return svec, ivec + + def arankdata(inarray): + """ +Ranks the data in inarray, dealing with ties appropritely. Assumes +a 1D inarray. Adapted from Gary Perlman's |Stat ranksort. + +Usage: arankdata(inarray) +Returns: array of length equal to inarray, containing rank scores +""" + n = len(inarray) + svec, ivec = ashellsort(inarray) + sumranks = 0 + dupcount = 0 + newarray = N.zeros(n, N.float_) + for i in range(n): + sumranks = sumranks + i + dupcount = dupcount + 1 + if i == n - 1 or svec[i] <> svec[i + 1]: + averank = sumranks / float(dupcount) + 1 + for j in range(i - dupcount + 1, i + 1): + newarray[ivec[j]] = averank + sumranks = 0 + dupcount = 0 + return newarray + + def afindwithin(data): + """ +Returns a binary vector, 1=within-subject factor, 0=between. Input +equals the entire data array (i.e., column 1=random factor, last +column = measured values. 
+ +Usage: afindwithin(data) data in |Stat format +""" + numfact = len(data[0]) - 2 + withinvec = [0] * numfact + for col in range(1, numfact + 1): + rows = pstat.linexand(data, col, pstat.unique(pstat.colex(data, 1))[0] + ) # get 1 level of this factor + if len(pstat.unique(pstat.colex(rows, 0))) < len( + rows): # if fewer subjects than scores on this factor + withinvec[col - 1] = 1 + return withinvec + + ######################################################### + ######################################################### + ###### RE-DEFINE DISPATCHES TO INCLUDE ARRAYS ######### + ######################################################### + ######################################################### + + ## CENTRAL TENDENCY: + geometricmean = Dispatch( + (lgeometricmean, (ListType, TupleType)), (ageometricmean, (N.ndarray,))) + harmonicmean = Dispatch( + (lharmonicmean, (ListType, TupleType)), (aharmonicmean, (N.ndarray,))) + mean = Dispatch((lmean, (ListType, TupleType)), (amean, (N.ndarray,))) + median = Dispatch((lmedian, (ListType, TupleType)), (amedian, (N.ndarray,))) + medianscore = Dispatch( + (lmedianscore, (ListType, TupleType)), (amedianscore, (N.ndarray,))) + mode = Dispatch((lmode, (ListType, TupleType)), (amode, (N.ndarray,))) + tmean = Dispatch((atmean, (N.ndarray,))) + tvar = Dispatch((atvar, (N.ndarray,))) + tstdev = Dispatch((atstdev, (N.ndarray,))) + tsem = Dispatch((atsem, (N.ndarray,))) + + ## VARIATION: + moment = Dispatch((lmoment, (ListType, TupleType)), (amoment, (N.ndarray,))) + variation = Dispatch( + (lvariation, (ListType, TupleType)), (avariation, (N.ndarray,))) + skew = Dispatch((lskew, (ListType, TupleType)), (askew, (N.ndarray,))) + kurtosis = Dispatch( + (lkurtosis, (ListType, TupleType)), (akurtosis, (N.ndarray,))) + describe = Dispatch( + (ldescribe, (ListType, TupleType)), (adescribe, (N.ndarray,))) + + ## DISTRIBUTION TESTS + + skewtest = Dispatch( + (askewtest, (ListType, TupleType)), (askewtest, (N.ndarray,))) + kurtosistest = 
Dispatch( + (akurtosistest, (ListType, TupleType)), (akurtosistest, (N.ndarray,))) + normaltest = Dispatch( + (anormaltest, (ListType, TupleType)), (anormaltest, (N.ndarray,))) + + ## FREQUENCY STATS: + itemfreq = Dispatch( + (litemfreq, (ListType, TupleType)), (aitemfreq, (N.ndarray,))) + scoreatpercentile = Dispatch( + (lscoreatpercentile, (ListType, TupleType)), (ascoreatpercentile, + (N.ndarray,))) + percentileofscore = Dispatch( + (lpercentileofscore, (ListType, TupleType)), (apercentileofscore, + (N.ndarray,))) + histogram = Dispatch( + (lhistogram, (ListType, TupleType)), (ahistogram, (N.ndarray,))) + cumfreq = Dispatch( + (lcumfreq, (ListType, TupleType)), (acumfreq, (N.ndarray,))) + relfreq = Dispatch( + (lrelfreq, (ListType, TupleType)), (arelfreq, (N.ndarray,))) + + ## VARIABILITY: + obrientransform = Dispatch( + (lobrientransform, (ListType, TupleType)), (aobrientransform, + (N.ndarray,))) + samplevar = Dispatch( + (lsamplevar, (ListType, TupleType)), (asamplevar, (N.ndarray,))) + samplestdev = Dispatch( + (lsamplestdev, (ListType, TupleType)), (asamplestdev, (N.ndarray,))) + signaltonoise = Dispatch((asignaltonoise, (N.ndarray,)),) + var = Dispatch((lvar, (ListType, TupleType)), (avar, (N.ndarray,))) + stdev = Dispatch((lstdev, (ListType, TupleType)), (astdev, (N.ndarray,))) + sterr = Dispatch((lsterr, (ListType, TupleType)), (asterr, (N.ndarray,))) + sem = Dispatch((lsem, (ListType, TupleType)), (asem, (N.ndarray,))) + z = Dispatch((lz, (ListType, TupleType)), (az, (N.ndarray,))) + zs = Dispatch((lzs, (ListType, TupleType)), (azs, (N.ndarray,))) + + ## TRIMMING FCNS: + threshold = Dispatch((athreshold, (N.ndarray,)),) + trimboth = Dispatch( + (ltrimboth, (ListType, TupleType)), (atrimboth, (N.ndarray,))) + trim1 = Dispatch((ltrim1, (ListType, TupleType)), (atrim1, (N.ndarray,))) + + ## CORRELATION FCNS: + paired = Dispatch((lpaired, (ListType, TupleType)), (apaired, (N.ndarray,))) + lincc = Dispatch((llincc, (ListType, TupleType)), (alincc, 
(N.ndarray,))) + pearsonr = Dispatch( + (lpearsonr, (ListType, TupleType)), (apearsonr, (N.ndarray,))) + spearmanr = Dispatch( + (lspearmanr, (ListType, TupleType)), (aspearmanr, (N.ndarray,))) + pointbiserialr = Dispatch( + (lpointbiserialr, (ListType, TupleType)), (apointbiserialr, (N.ndarray,))) + kendalltau = Dispatch( + (lkendalltau, (ListType, TupleType)), (akendalltau, (N.ndarray,))) + linregress = Dispatch( + (llinregress, (ListType, TupleType)), (alinregress, (N.ndarray,))) + + ## INFERENTIAL STATS: + ttest_1samp = Dispatch( + (lttest_1samp, (ListType, TupleType)), (attest_1samp, (N.ndarray,))) + ttest_ind = Dispatch( + (lttest_ind, (ListType, TupleType)), (attest_ind, (N.ndarray,))) + ttest_rel = Dispatch( + (lttest_rel, (ListType, TupleType)), (attest_rel, (N.ndarray,))) + chisquare = Dispatch( + (lchisquare, (ListType, TupleType)), (achisquare, (N.ndarray,))) + ks_2samp = Dispatch( + (lks_2samp, (ListType, TupleType)), (aks_2samp, (N.ndarray,))) + mannwhitneyu = Dispatch( + (lmannwhitneyu, (ListType, TupleType)), (amannwhitneyu, (N.ndarray,))) + tiecorrect = Dispatch( + (ltiecorrect, (ListType, TupleType)), (atiecorrect, (N.ndarray,))) + ranksums = Dispatch( + (lranksums, (ListType, TupleType)), (aranksums, (N.ndarray,))) + wilcoxont = Dispatch( + (lwilcoxont, (ListType, TupleType)), (awilcoxont, (N.ndarray,))) + kruskalwallish = Dispatch( + (lkruskalwallish, (ListType, TupleType)), (akruskalwallish, (N.ndarray,))) + friedmanchisquare = Dispatch( + (lfriedmanchisquare, (ListType, TupleType)), (afriedmanchisquare, + (N.ndarray,))) + + ## PROBABILITY CALCS: + chisqprob = Dispatch( + (lchisqprob, (IntType, FloatType)), (achisqprob, (N.ndarray,))) + zprob = Dispatch((lzprob, (IntType, FloatType)), (azprob, (N.ndarray,))) + ksprob = Dispatch((lksprob, (IntType, FloatType)), (aksprob, (N.ndarray,))) + fprob = Dispatch((lfprob, (IntType, FloatType)), (afprob, (N.ndarray,))) + betacf = Dispatch((lbetacf, (IntType, FloatType)), (abetacf, (N.ndarray,))) + betai = 
Dispatch((lbetai, (IntType, FloatType)), (abetai, (N.ndarray,))) + erfcc = Dispatch((lerfcc, (IntType, FloatType)), (aerfcc, (N.ndarray,))) + gammln = Dispatch((lgammln, (IntType, FloatType)), (agammln, (N.ndarray,))) + + ## ANOVA FUNCTIONS: + F_oneway = Dispatch( + (lF_oneway, (ListType, TupleType)), (aF_oneway, (N.ndarray,))) + F_value = Dispatch( + (lF_value, (ListType, TupleType)), (aF_value, (N.ndarray,))) + + ## SUPPORT FUNCTIONS: + incr = Dispatch((lincr, (ListType, TupleType, N.ndarray)),) + sum = Dispatch((lsum, (ListType, TupleType)), (asum, (N.ndarray,))) + cumsum = Dispatch((lcumsum, (ListType, TupleType)), (acumsum, (N.ndarray,))) + ss = Dispatch((lss, (ListType, TupleType)), (ass, (N.ndarray,))) + summult = Dispatch( + (lsummult, (ListType, TupleType)), (asummult, (N.ndarray,))) + square_of_sums = Dispatch( + (lsquare_of_sums, (ListType, TupleType)), (asquare_of_sums, (N.ndarray,))) + sumdiffsquared = Dispatch( + (lsumdiffsquared, (ListType, TupleType)), (asumdiffsquared, (N.ndarray,))) + shellsort = Dispatch( + (lshellsort, (ListType, TupleType)), (ashellsort, (N.ndarray,))) + rankdata = Dispatch( + (lrankdata, (ListType, TupleType)), (arankdata, (N.ndarray,))) + findwithin = Dispatch( + (lfindwithin, (ListType, TupleType)), (afindwithin, (N.ndarray,))) + +###################### END OF NUMERIC FUNCTION BLOCK ##################### + +###################### END OF STATISTICAL FUNCTIONS ###################### + +except ImportError: + pass |