diff options
author | Zhizhou Yang <zhizhouy@google.com> | 2019-12-23 16:56:26 -0800 |
---|---|---|
committer | Commit Bot <commit-bot@chromium.org> | 2020-01-09 07:34:17 +0000 |
commit | 45e4c1f8be303b43030e3dfac6cab5cd67d10a99 (patch) | |
tree | f00931a3bf505fded2f375f7589b41e6dfc34f18 /cros_utils | |
parent | e41baa8073f6f5d160ef8cf75ccb7ca9beb1ad72 (diff) | |
download | toolchain-utils-45e4c1f8be303b43030e3dfac6cab5cd67d10a99.tar.gz |
toolchain-utils: Migrate cros_utils to python3
This patch fixes all presubmit checks in cros_utils and migrated it from
python2 to python3.
TEST=Passed all unittests and presubmit checks.
BUG=chromium:1011676
Change-Id: I3a7097d6570fb2cb4e5dcdd5ae22f30c5c5762e9
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/toolchain-utils/+/1981087
Commit-Queue: Zhizhou Yang <zhizhouy@google.com>
Tested-by: Zhizhou Yang <zhizhouy@google.com>
Auto-Submit: Zhizhou Yang <zhizhouy@google.com>
Reviewed-by: George Burgess <gbiv@chromium.org>
Diffstat (limited to 'cros_utils')
27 files changed, 311 insertions, 6152 deletions
diff --git a/cros_utils/__init__.py b/cros_utils/__init__.py index 8b137891..4c4e5554 100644 --- a/cros_utils/__init__.py +++ b/cros_utils/__init__.py @@ -1 +1,4 @@ - +# -*- coding: utf-8 -*- +# Copyright 2019 The Chromium OS Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. diff --git a/cros_utils/buildbot_json.py b/cros_utils/buildbot_json.py index 42a27744..08a8ae05 100755 --- a/cros_utils/buildbot_json.py +++ b/cros_utils/buildbot_json.py @@ -1,5 +1,8 @@ -#!/usr/bin/env python2 -# Copyright (c) 2012 The Chromium Authors. All rights reserved. +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright 2019 The Chromium OS Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are @@ -28,6 +31,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # NOTE: This file is NOT under GPL. See above. + """Queries buildbot through the json interface. """ @@ -50,9 +54,10 @@ import logging import optparse import time -import urllib -import urllib2 import sys +import urllib.error +import urllib.parse +import urllib.request try: from natsort import natsorted @@ -63,7 +68,7 @@ except ImportError: # These values are buildbot constants used for Build and BuildStep. # This line was copied from master/buildbot/status/builder.py. -SUCCESS, WARNINGS, FAILURE, SKIPPED, EXCEPTION, RETRY = range(6) +SUCCESS, WARNINGS, FAILURE, SKIPPED, EXCEPTION, RETRY = list(range(6)) ## Generic node caching code. @@ -182,7 +187,8 @@ class AddressableDataNode(AddressableBaseDataNode): # pylint: disable=W0223 """Automatically encodes the url.""" def __init__(self, parent, url, data): - super(AddressableDataNode, self).__init__(parent, urllib.quote(url), data) + super(AddressableDataNode, self).__init__(parent, urllib.parse.quote(url), + data) class NonAddressableDataNode(Node): # pylint: disable=W0223 @@ -329,7 +335,7 @@ class NonAddressableNodeList(VirtualNodeList): # pylint: disable=W0223 def cached_keys(self): if self.parent.cached_data is None: return None - return range(len(self.parent.data.get(self.subkey, []))) + return list(range(len(self.parent.data.get(self.subkey, [])))) @property def data(self): @@ -382,13 +388,13 @@ class AddressableNodeList(NodeList): @property def cached_children(self): - for item in self._cache.itervalues(): + for item in self._cache.values(): if item.cached_data is not None: yield item @property def cached_keys(self): - return self._cache.keys() + return list(self._cache.keys()) def __getitem__(self, key): """Enables 'obj[i]'.""" @@ -419,14 +425,13 @@ class AddressableNodeList(NodeList): # pylint: disable=W0212 if not self._is_cached: to_fetch = [ - child - for child in children + child for child in children if not (child in self._cache and self._cache[child].cached_data) ] if to_fetch: # Similar to cache(). The only reason to sort is to simplify testing. - params = '&'.join('select=%s' % urllib.quote(str(v)) - for v in sorted(to_fetch)) + params = '&'.join( + 'select=%s' % urllib.parse.quote(str(v)) for v in sorted(to_fetch)) data = self.read('?' + params) for key in sorted(data): self._create_obj(key, data[key]) @@ -440,7 +445,7 @@ class AddressableNodeList(NodeList): def discard(self): """Discards temporary children.""" super(AddressableNodeList, self).discard() - for v in self._cache.itervalues(): + for v in self._cache.values(): v.discard() def read(self, suburl): @@ -526,8 +531,8 @@ class SubViewNodeList(VirtualNodeList): # pylint: disable=W0223 self.cache() return super(SubViewNodeList, self).__iter__() -############################################################################### -## Buildbot-specific code + +# Buildbot-specific code class Slave(AddressableDataNode): @@ -681,7 +686,7 @@ class BuildSteps(NonAddressableNodeList): def __getitem__(self, key): """Accept step name in addition to index number.""" - if isinstance(key, basestring): + if isinstance(key, str): # It's a string, try to find the corresponding index. for i, step in enumerate(self.data): if step['name'] == key: @@ -840,7 +845,7 @@ class Builds(AddressableNodeList): # highest key value and calculate from it. key = max(self._keys) + key + 1 - if not key in self._cache: + if key not in self._cache: # Create an empty object. self._create_obj(key, None) return self._cache[key] @@ -856,7 +861,7 @@ class Builds(AddressableNodeList): To access the older builds, use self.iterall() instead. """ self.cache() - return reversed(self._cache.values()) + return reversed(list(self._cache.values())) def iterall(self): """Returns Build objects in decreasing order unbounded up to build 0. @@ -970,22 +975,23 @@ class Buildbot(AddressableBaseDataNode): else: url += '?filter=1' logging.info('read(%s)', suburl) - channel = urllib.urlopen(url) + channel = urllib.request.urlopen(url) data = channel.read() try: return json.loads(data) except ValueError: if channel.getcode() >= 400: # Convert it into an HTTPError for easier processing. - raise urllib2.HTTPError(url, channel.getcode(), '%s:\n%s' % (url, data), - channel.headers, None) + raise urllib.error.HTTPError(url, channel.getcode(), + '%s:\n%s' % (url, data), channel.headers, + None) raise def _readall(self): return self.read('project') -############################################################################### -## Controller code + +# Controller code def usage(more): @@ -1026,12 +1032,13 @@ def need_buildbot(fn): @need_buildbot def CMDpending(parser, args): """Lists pending jobs.""" - parser.add_option('-b', - '--builder', - dest='builders', - action='append', - default=[], - help='Builders to filter on') + parser.add_option( + '-b', + '--builder', + dest='builders', + action='append', + default=[], + help='Builders to filter on') options, args, buildbot = parser.parse_args(args) if args: parser.error('Unrecognized parameters: %s' % ' '.join(args)) @@ -1049,7 +1056,7 @@ def CMDpending(parser, args): print(' revision: %s' % pending['source']['revision']) for change in pending['source']['changes']: print(' change:') - print(' comment: %r' % unicode(change['comments'][:50])) + print(' comment: %r' % change['comments'][:50]) print(' who: %s' % change['who']) return 0 @@ -1063,13 +1070,11 @@ def CMDrun(parser, args): was on its own line. """ parser.add_option('-f', '--file', help='Read script from file') - parser.add_option('-i', - dest='use_stdin', - action='store_true', - help='Read script on stdin') + parser.add_option( + '-i', dest='use_stdin', action='store_true', help='Read script on stdin') # Variable 'buildbot' is not used directly. # pylint: disable=W0612 - options, args, buildbot = parser.parse_args(args) + options, args, _ = parser.parse_args(args) if (bool(args) + bool(options.use_stdin) + bool(options.file)) != 1: parser.error('Need to pass only one of: <commands>, -f <file> or -i') if options.use_stdin: @@ -1090,10 +1095,9 @@ def CMDinteractive(parser, args): _, args, buildbot = parser.parse_args(args) if args: parser.error('Unrecognized parameters: %s' % ' '.join(args)) - prompt = ( - 'Buildbot interactive console for "%s".\n' - 'Hint: Start with typing: \'buildbot.printable_attributes\' or ' - '\'print str(buildbot)\' to explore.') % buildbot.url[:-len('/json')] + prompt = ('Buildbot interactive console for "%s".\n' + "Hint: Start with typing: 'buildbot.printable_attributes' or " + "'print str(buildbot)' to explore.") % buildbot.url[:-len('/json')] local_vars = {'buildbot': buildbot, 'b': buildbot} code.interact(prompt, None, local_vars) @@ -1123,18 +1127,20 @@ def CMDdisconnected(parser, args): def find_idle_busy_slaves(parser, args, show_idle): - parser.add_option('-b', - '--builder', - dest='builders', - action='append', - default=[], - help='Builders to filter on') - parser.add_option('-s', - '--slave', - dest='slaves', - action='append', - default=[], - help='Slaves to filter on') + parser.add_option( + '-b', + '--builder', + dest='builders', + action='append', + default=[], + help='Builders to filter on') + parser.add_option( + '-s', + '--slave', + dest='slaves', + action='append', + default=[], + help='Slaves to filter on') options, args, buildbot = parser.parse_args(args) if args: parser.error('Unrecognized parameters: %s' % ' '.join(args)) @@ -1212,8 +1218,9 @@ def last_failure(buildbot, def CMDlast_failure(parser, args): """Lists all slaves that failed on that step on their last build. - Example: to find all slaves where their last build was a compile failure, - run with --step compile + Examples: + To find all slaves where their last build was a compile failure, + run with --step compile """ parser.add_option( '-S', @@ -1222,32 +1229,36 @@ def CMDlast_failure(parser, args): action='append', default=[], help='List all slaves that failed on that step on their last build') - parser.add_option('-b', - '--builder', - dest='builders', - action='append', - default=[], - help='Builders to filter on') - parser.add_option('-s', - '--slave', - dest='slaves', - action='append', - default=[], - help='Slaves to filter on') - parser.add_option('-n', - '--no_cache', - action='store_true', - help='Don\'t load all builds at once') + parser.add_option( + '-b', + '--builder', + dest='builders', + action='append', + default=[], + help='Builders to filter on') + parser.add_option( + '-s', + '--slave', + dest='slaves', + action='append', + default=[], + help='Slaves to filter on') + parser.add_option( + '-n', + '--no_cache', + action='store_true', + help="Don't load all builds at once") options, args, buildbot = parser.parse_args(args) if args: parser.error('Unrecognized parameters: %s' % ' '.join(args)) print_builders = not options.quiet and len(options.builders) != 1 last_builder = None - for build in last_failure(buildbot, - builders=options.builders, - slaves=options.slaves, - steps=options.steps, - no_cache=options.no_cache): + for build in last_failure( + buildbot, + builders=options.builders, + slaves=options.slaves, + steps=options.steps, + no_cache=options.no_cache): if print_builders and last_builder != build.builder: print(build.builder.name) @@ -1259,8 +1270,8 @@ def CMDlast_failure(parser, args): else: print(build.slave.name) else: - out = '%d on %s: blame:%s' % (build.number, build.slave.name, - ', '.join(build.blame)) + out = '%d on %s: blame:%s' % (build.number, build.slave.name, ', '.join( + build.blame)) if print_builders: out = ' ' + out print(out) @@ -1280,15 +1291,15 @@ def CMDlast_failure(parser, args): @need_buildbot def CMDcurrent(parser, args): """Lists current jobs.""" - parser.add_option('-b', - '--builder', - dest='builders', - action='append', - default=[], - help='Builders to filter on') - parser.add_option('--blame', - action='store_true', - help='Only print the blame list') + parser.add_option( + '-b', + '--builder', + dest='builders', + action='append', + default=[], + help='Builders to filter on') + parser.add_option( + '--blame', action='store_true', help='Only print the blame list') options, args, buildbot = parser.parse_args(args) if args: parser.error('Unrecognized parameters: %s' % ' '.join(args)) @@ -1330,28 +1341,30 @@ def CMDcurrent(parser, args): def CMDbuilds(parser, args): """Lists all builds. - Example: to find all builds on a single slave, run with -b bar -s foo + Examples: + To find all builds on a single slave, run with -b bar -s foo. """ - parser.add_option('-r', - '--result', - type='int', - help='Build result to filter on') - parser.add_option('-b', - '--builder', - dest='builders', - action='append', - default=[], - help='Builders to filter on') - parser.add_option('-s', - '--slave', - dest='slaves', - action='append', - default=[], - help='Slaves to filter on') - parser.add_option('-n', - '--no_cache', - action='store_true', - help='Don\'t load all builds at once') + parser.add_option( + '-r', '--result', type='int', help='Build result to filter on') + parser.add_option( + '-b', + '--builder', + dest='builders', + action='append', + default=[], + help='Builders to filter on') + parser.add_option( + '-s', + '--slave', + dest='slaves', + action='append', + default=[], + help='Slaves to filter on') + parser.add_option( + '-n', + '--no_cache', + action='store_true', + help="Don't load all builds at once") options, args, buildbot = parser.parse_args(args) if args: parser.error('Unrecognized parameters: %s' % ' '.join(args)) @@ -1377,16 +1390,15 @@ def CMDbuilds(parser, args): @need_buildbot def CMDcount(parser, args): """Count the number of builds that occured during a specific period.""" - parser.add_option('-o', - '--over', - type='int', - help='Number of seconds to look for') - parser.add_option('-b', - '--builder', - dest='builders', - action='append', - default=[], - help='Builders to filter on') + parser.add_option( + '-o', '--over', type='int', help='Number of seconds to look for') + parser.add_option( + '-b', + '--builder', + dest='builders', + action='append', + default=[], + help='Builders to filter on') options, args, buildbot = parser.parse_args(args) if args: parser.error('Unrecognized parameters: %s' % ' '.join(args)) @@ -1405,10 +1417,11 @@ def CMDcount(parser, args): for build in builder.builds.iterall(): try: start_time = build.start_time - except urllib2.HTTPError: + except urllib.error.HTTPError: # The build was probably trimmed. - print('Failed to fetch build %s/%d' % (builder.name, build.number), - file=sys.stderr) + print( + 'Failed to fetch build %s/%d' % (builder.name, build.number), + file=sys.stderr) continue if start_time >= since: counts[builder.name] += 1 @@ -1418,10 +1431,10 @@ def CMDcount(parser, args): print('.. %d' % counts[builder.name]) align_name = max(len(b) for b in counts) - align_number = max(len(str(c)) for c in counts.itervalues()) + align_number = max(len(str(c)) for c in counts.values()) for builder in sorted(counts): print('%*s: %*d' % (align_name, builder, align_number, counts[builder])) - print('Total: %d' % sum(counts.itervalues())) + print('Total: %d' % sum(counts.values())) return 0 @@ -1448,22 +1461,24 @@ def gen_parser(): parser.parse_args = Parse - parser.add_option('-v', - '--verbose', - action='count', - help='Use multiple times to increase logging leve') + parser.add_option( + '-v', + '--verbose', + action='count', + help='Use multiple times to increase logging leve') parser.add_option( '-q', '--quiet', action='store_true', help='Reduces the output to be parsed by scripts, independent of -v') - parser.add_option('--throttle', - type='float', - help='Minimum delay to sleep between requests') + parser.add_option( + '--throttle', + type='float', + help='Minimum delay to sleep between requests') return parser -############################################################################### -## Generic subcommand handling code + +# Generic subcommand handling code def Command(name): @@ -1497,7 +1512,8 @@ def main(args=None): # pylint: disable=E1101 CMDhelp.__doc__ += '\n\nCommands are:\n' + '\n'.join( ' %-12s %s' % (fn[3:], Command(fn[3:]).__doc__.split('\n', 1)[0]) - for fn in dir(sys.modules[__name__]) if fn.startswith('CMD')) + for fn in dir(sys.modules[__name__]) + if fn.startswith('CMD')) parser = gen_parser() if args is None: diff --git a/cros_utils/buildbot_utils.py b/cros_utils/buildbot_utils.py index 35dc3ac6..a06abd26 100644 --- a/cros_utils/buildbot_utils.py +++ b/cros_utils/buildbot_utils.py @@ -5,6 +5,7 @@ """Utilities for launching and accessing ChromeOS buildbots.""" +from __future__ import division from __future__ import print_function import ast @@ -133,7 +134,7 @@ def GetTrybotImage(chromeos_root, patch_list, tryjob_flags=None, build_toolchain=False, - async=False): + asynchronous=False): """Launch buildbot and get resulting trybot artifact name. This function launches a buildbot with the appropriate flags to @@ -151,7 +152,7 @@ def GetTrybotImage(chromeos_root, tryjob_flags: See cros tryjob --help for available options. build_toolchain: builds and uses the latest toolchain, rather than the prebuilt one in SDK. - async: don't wait for artifacts; just return the buildbucket id + asynchronous: don't wait for artifacts; just return the buildbucket id Returns: (buildbucket id, partial image url) e.g. @@ -159,7 +160,7 @@ def GetTrybotImage(chromeos_root, """ buildbucket_id = SubmitTryjob(chromeos_root, buildbot_name, patch_list, tryjob_flags, build_toolchain) - if async: + if asynchronous: return buildbucket_id, ' ' # The trybot generally takes more than 2 hours to finish. @@ -246,7 +247,7 @@ def GetLatestImage(chromeos_root, path): _, out, _ = ce.ChrootRunCommandWOutput( chromeos_root, command, print_to_console=False) candidates = [l.split('/')[-2] for l in out.split()] - candidates = map(fmt.match, candidates) + candidates = [fmt.match(c) for c in candidates] candidates = [[int(r) for r in m.group(1, 2, 3, 4)] for m in candidates if m] candidates.sort(reverse=True) for c in candidates: diff --git a/cros_utils/buildbot_utils_unittest.py b/cros_utils/buildbot_utils_unittest.py index bfba8d78..4fc3d170 100755 --- a/cros_utils/buildbot_utils_unittest.py +++ b/cros_utils/buildbot_utils_unittest.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python2 +#!/usr/bin/env python3 # -*- coding: utf-8 -*- # # Copyright 2018 The Chromium OS Authors. All rights reserved. @@ -9,10 +9,10 @@ from __future__ import print_function -from mock import patch - import time + import unittest +from unittest.mock import patch from cros_utils import buildbot_utils from cros_utils import command_executer @@ -82,7 +82,7 @@ class TrybotTest(unittest.TestCase): # async buildbucket_id, image = buildbot_utils.GetTrybotImage( - '/tmp', 'falco-release-tryjob', [], async=True) + '/tmp', 'falco-release-tryjob', [], asynchronous=True) self.assertEqual(buildbucket_id, self.buildbucket_id) self.assertEqual(' ', image) diff --git a/cros_utils/command_executer.py b/cros_utils/command_executer.py index 08e4e6ae..c4d3fded 100644 --- a/cros_utils/command_executer.py +++ b/cros_utils/command_executer.py @@ -124,7 +124,7 @@ class CommandExecuter(object): terminated_time = None started_time = time.time() - while len(pipes): + while pipes: if command_terminator and command_terminator.IsTerminated(): os.killpg(os.getpgid(p.pid), signal.SIGTERM) if self.logger: @@ -136,7 +136,7 @@ class CommandExecuter(object): l = my_poll.poll(100) for (fd, _) in l: if fd == p.stdout.fileno(): - out = os.read(p.stdout.fileno(), 16384) + out = os.read(p.stdout.fileno(), 16384).decode('utf8') if return_output: full_stdout += out if self.logger: @@ -145,7 +145,7 @@ class CommandExecuter(object): pipes.remove(p.stdout) my_poll.unregister(p.stdout) if fd == p.stderr.fileno(): - err = os.read(p.stderr.fileno(), 16384) + err = os.read(p.stderr.fileno(), 16384).decode('utf8') if return_output: full_stderr += err if self.logger: @@ -182,8 +182,8 @@ class CommandExecuter(object): if return_output: return (p.returncode, full_stdout, full_stderr) return (p.returncode, '', '') - except BaseException as e: - except_handler(p, e) + except BaseException as err: + except_handler(p, err) raise def RunCommand(self, *args, **kwargs): @@ -233,7 +233,7 @@ class CommandExecuter(object): command += '\n. ' + chromeos_root + '/src/scripts/common.sh' command += '\n. ' + chromeos_root + '/src/scripts/remote_access.sh' command += '\nTMP=$(mktemp -d)' - command += "\nFLAGS \"$@\" || exit 1" + command += '\nFLAGS "$@" || exit 1' command += '\nremote_access_init' return command @@ -300,7 +300,7 @@ class CommandExecuter(object): command = self.RemoteAccessInitCommand(chromeos_root, machine) command += '\nremote_sh bash %s' % command_file - command += "\nl_retval=$?; echo \"$REMOTE_OUT\"; exit $l_retval" + command += '\nl_retval=$?; echo "$REMOTE_OUT"; exit $l_retval' retval = self.RunCommandGeneric( command, return_output, @@ -371,7 +371,7 @@ class CommandExecuter(object): os.write(handle, '\n') os.close(handle) - os.chmod(command_file, 0777) + os.chmod(command_file, 0o777) # if return_output is set, run a dummy command first to make sure that # the chroot already exists. We want the final returned output to skip @@ -474,7 +474,7 @@ class CommandExecuter(object): ssh_command = ( 'ssh -p ${FLAGS_ssh_port}' + ' -o StrictHostKeyChecking=no' + ' -o UserKnownHostsFile=$(mktemp)' + ' -i $TMP_PRIVATE_KEY') - rsync_prefix = "\nrsync -r -e \"%s\" " % ssh_command + rsync_prefix = '\nrsync -r -e "%s" ' % ssh_command if dest_cros: command += rsync_prefix + '%s root@%s:%s' % (src, dest_machine, dest) return self.RunCommand( @@ -627,7 +627,7 @@ class CommandExecuter(object): poll.register(errfd, select.POLLIN | select.POLLPRI) handlermap[errfd] = StreamHandler(pobject, errfd, 'stderr', line_consumer) - while len(handlermap): + while handlermap: readables = poll.poll(300) for (fd, evt) in readables: handler = handlermap[fd] @@ -642,17 +642,14 @@ class CommandExecuter(object): os.killpg(os.getpgid(pobject.pid), signal.SIGTERM) return pobject.wait() - except BaseException as e: - except_handler(pobject, e) + except BaseException as err: + except_handler(pobject, err) raise class MockCommandExecuter(CommandExecuter): """Mock class for class CommandExecuter.""" - def __init__(self, log_level, logger_to_set=None): - super(MockCommandExecuter, self).__init__(log_level, logger_to_set) - def RunCommandGeneric(self, cmd, return_output=False, diff --git a/cros_utils/command_executer_unittest.py b/cros_utils/command_executer_unittest.py index 4da4a4ac..959000fd 100755 --- a/cros_utils/command_executer_unittest.py +++ b/cros_utils/command_executer_unittest.py @@ -1,5 +1,8 @@ -#!/usr/bin/env python2 +#!/usr/bin/env python3 # -*- coding: utf-8 -*- +# Copyright 2019 The Chromium OS Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. """Unittest for command_executer.py.""" diff --git a/cros_utils/constants.py b/cros_utils/constants.py index 827e9233..b12175bb 100644 --- a/cros_utils/constants.py +++ b/cros_utils/constants.py @@ -1,4 +1,8 @@ -# Copyright 2010 Google Inc. All Rights Reserved. +# -*- coding: utf-8 -*- +# Copyright 2019 The Chromium OS Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + """Generic constants used accross modules. """ diff --git a/cros_utils/contextlib3.py b/cros_utils/contextlib3.py deleted file mode 100644 index 9fabbf6e..00000000 --- a/cros_utils/contextlib3.py +++ /dev/null @@ -1,116 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2019 The Chromium OS Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. - -"""Random utilties from Python3's contextlib.""" - -from __future__ import division -from __future__ import print_function - -import sys - - -class ExitStack(object): - """https://docs.python.org/3/library/contextlib.html#contextlib.ExitStack""" - - def __init__(self): - self._stack = [] - self._is_entered = False - - def _assert_is_entered(self): - # Strictly, entering has no effect on the operations that call this. - # However, if you're trying to e.g. push things to an ExitStack that hasn't - # yet been entered, that's likely a bug. - assert self._is_entered, 'ExitStack op performed before entering' - - def __enter__(self): - self._is_entered = True - return self - - def _perform_exit(self, exc_type, exc, exc_traceback): - # I suppose a better name for this is - # `take_exception_handling_into_our_own_hands`, but that's harder to type. - exception_handled = False - while self._stack: - fn = self._stack.pop() - # The except clause below is meant to run as-if it's a `finally` block, - # but `finally` blocks don't have easy access to exceptions currently in - # flight. Hence, we do need to catch things like KeyboardInterrupt, - # SystemExit, ... - # pylint: disable=bare-except - try: - # If an __exit__ handler returns a truthy value, we should assume that - # it handled the exception appropriately. Otherwise, we need to keep it - # with us. (PEP 343) - if fn(exc_type, exc, exc_traceback): - exc_type, exc, exc_traceback = None, None, None - exception_handled = True - except: - # Python2 doesn't appear to have the notion of 'exception causes', - # which is super unfortunate. In the case: - # - # @contextlib.contextmanager - # def foo() - # try: - # yield - # finally: - # raise ValueError - # - # with foo(): - # assert False - # - # ...Python will only note the ValueError; nothing about the failing - # assertion is printed. - # - # I guess on the bright side, that means we don't have to fiddle with - # __cause__s/etc. - exc_type, exc, exc_traceback = sys.exc_info() - exception_handled = True - - if not exception_handled: - return False - - # Something changed. We either need to raise for ourselves, or note that - # the exception has been suppressed. - if exc_type is not None: - raise exc_type, exc, exc_traceback - - # Otherwise, the exception was suppressed. Go us! - return True - - def __exit__(self, exc_type, exc, exc_traceback): - return self._perform_exit(exc_type, exc, exc_traceback) - - def close(self): - """Unwinds the exit stack, unregistering all events""" - self._perform_exit(None, None, None) - - def enter_context(self, cm): - """Enters the given context manager, and registers it to be exited.""" - self._assert_is_entered() - - # The spec specifically notes that we should take __exit__ prior to calling - # __enter__. - exit_cleanup = cm.__exit__ - result = cm.__enter__() - self._stack.append(exit_cleanup) - return result - - # pylint complains about `exit` being redefined. `exit` is the documented - # name of this param, and renaming it would break portability if someone - # decided to `push(exit=foo)`, so just ignore the lint. - # pylint: disable=redefined-builtin - def push(self, exit): - """Like `enter_context`, but won't enter the value given.""" - self._assert_is_entered() - self._stack.append(exit.__exit__) - - def callback(self, callback, *args, **kwargs): - """Performs the given callback on exit""" - self._assert_is_entered() - - def fn(_exc_type, _exc, _exc_traceback): - callback(*args, **kwargs) - - self._stack.append(fn) diff --git a/cros_utils/contextlib3_test.py b/cros_utils/contextlib3_test.py deleted file mode 100755 index 76c010f2..00000000 --- a/cros_utils/contextlib3_test.py +++ /dev/null @@ -1,195 +0,0 @@ -#!/usr/bin/env python2 -# -*- coding: utf-8 -*- -# Copyright 2019 The Chromium OS Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. - -"""Tests for contextlib3""" - -from __future__ import division -from __future__ import print_function - -import contextlib -import unittest - -import contextlib3 - - -class SomeException(Exception): - """Just an alternative to ValueError in the Exception class hierarchy.""" - pass - - -class TestExitStack(unittest.TestCase): - """Tests contextlib3.ExitStack""" - - def test_exceptions_in_exit_override_exceptions_in_with(self): - - @contextlib.contextmanager - def raise_exit(): - raised = False - try: - yield - except Exception: - raised = True - raise ValueError - finally: - self.assertTrue(raised) - - # (As noted in comments in contextlib3, this behavior is consistent with - # how python2 works. Namely, if __exit__ raises, the exception from - # __exit__ overrides the inner exception) - with self.assertRaises(ValueError): - with contextlib3.ExitStack() as stack: - stack.enter_context(raise_exit()) - raise SomeException() - - def test_raising_in_exit_doesnt_block_later_exits(self): - exited = [] - - @contextlib.contextmanager - def raise_exit(): - try: - yield - finally: - exited.append('raise') - raise ValueError - - @contextlib.contextmanager - def push_exit(): - try: - yield - finally: - exited.append('push') - - with self.assertRaises(ValueError): - with contextlib3.ExitStack() as stack: - stack.enter_context(push_exit()) - stack.enter_context(raise_exit()) - self.assertEqual(exited, ['raise', 'push']) - - exited = [] - with self.assertRaises(ValueError): - with contextlib3.ExitStack() as stack: - stack.enter_context(push_exit()) - stack.enter_context(raise_exit()) - raise SomeException() - self.assertEqual(exited, ['raise', 'push']) - - def test_push_doesnt_enter_the_context(self): - exited = [] - - test_self = self - - class Manager(object): - """A simple ContextManager for testing purposes""" - - def __enter__(self): - test_self.fail('context manager was entered :(') - - def __exit__(self, *args, **kwargs): - exited.append(1) - - with contextlib3.ExitStack() as stack: - stack.push(Manager()) - self.assertEqual(exited, []) - self.assertEqual(exited, [1]) - - def test_callbacks_are_run_properly(self): - callback_was_run = [] - - def callback(arg, some_kwarg=None): - self.assertEqual(arg, 41) - self.assertEqual(some_kwarg, 42) - callback_was_run.append(1) - - with contextlib3.ExitStack() as stack: - stack.callback(callback, 41, some_kwarg=42) - self.assertEqual(callback_was_run, []) - self.assertEqual(callback_was_run, [1]) - - callback_was_run = [] - with self.assertRaises(ValueError): - with contextlib3.ExitStack() as stack: - stack.callback(callback, 41, some_kwarg=42) - raise ValueError() - self.assertEqual(callback_was_run, [1]) - - def test_finallys_are_run(self): - finally_run = [] - - @contextlib.contextmanager - def append_on_exit(): - try: - yield - finally: - finally_run.append(0) - - with self.assertRaises(ValueError): - with contextlib3.ExitStack() as stack: - stack.enter_context(append_on_exit()) - raise ValueError() - self.assertEqual(finally_run, [0]) - - def test_unwinding_happens_in_reverse_order(self): - exit_runs = [] - - @contextlib.contextmanager - def append_things(start_push, end_push): - exit_runs.append(start_push) - try: - yield - finally: - exit_runs.append(end_push) - - with contextlib3.ExitStack() as stack: - stack.enter_context(append_things(1, 4)) - stack.enter_context(append_things(2, 3)) - self.assertEqual(exit_runs, [1, 2, 3, 4]) - - exit_runs = [] - with self.assertRaises(ValueError): - with contextlib3.ExitStack() as stack: - stack.enter_context(append_things(1, 4)) - stack.enter_context(append_things(2, 3)) - raise ValueError - self.assertEqual(exit_runs, [1, 2, 3, 4]) - - def test_exceptions_are_propagated(self): - - @contextlib.contextmanager - def die_on_regular_exit(): - yield - self.fail('Unreachable in theory') - - with self.assertRaises(ValueError): - with contextlib3.ExitStack() as stack: - stack.enter_context(die_on_regular_exit()) - raise ValueError() - - def test_exceptions_can_be_blocked(self): - - @contextlib.contextmanager - def block(): - try: - yield - except Exception: - pass - - with contextlib3.ExitStack() as stack: - stack.enter_context(block()) - raise ValueError() - - def test_objects_are_returned_from_enter_context(self): - - @contextlib.contextmanager - def yield_arg(arg): - yield arg - - with contextlib3.ExitStack() as stack: - val = stack.enter_context(yield_arg(1)) - self.assertEqual(val, 1) - - -if __name__ == '__main__': - unittest.main() diff --git a/cros_utils/device_setup_utils.py b/cros_utils/device_setup_utils.py index ea705263..61dbba27 100755..100644 --- a/cros_utils/device_setup_utils.py +++ b/cros_utils/device_setup_utils.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python2 # -*- coding: utf-8 -*- # # Copyright 2019 The Chromium OS Authors. All rights reserved. diff --git a/cros_utils/device_setup_utils_unittest.py b/cros_utils/device_setup_utils_unittest.py index 24fe8b3f..63f9bf66 100755 --- a/cros_utils/device_setup_utils_unittest.py +++ b/cros_utils/device_setup_utils_unittest.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python2 +#!/usr/bin/env python3 # -*- coding: utf-8 -*- # # Copyright 2019 The Chromium OS Authors. All rights reserved. @@ -12,7 +12,7 @@ from __future__ import print_function import time import unittest -import mock +from unittest import mock from device_setup_utils import DutWrapper @@ -450,8 +450,8 @@ class DutWrapperTest(unittest.TestCase): self.dw.dut_config['cpu_freq_pct'] = 50 self.dw.SetupCpuFreq(online) self.dw.RunCommandOnDut.assert_called_once() - self.assertNotRegexpMatches(self.dw.RunCommandOnDut.call_args_list[0][0][0], - '^echo.*scaling_max_freq$') + self.assertNotRegex(self.dw.RunCommandOnDut.call_args_list[0][0][0], + '^echo.*scaling_max_freq$') def test_setup_cpu_freq_multiple_no_access(self): online = [0, 1] diff --git a/cros_utils/email_sender.py b/cros_utils/email_sender.py index 0d2bd651..3f724f64 100755 --- a/cros_utils/email_sender.py +++ b/cros_utils/email_sender.py @@ -1,6 +1,10 @@ -#!/usr/bin/env python2 +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Copyright 2019 The Chromium OS Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. -# Copyright 2011 Google Inc. All Rights Reserved. """Utilities to send email either through SMTP or SendGMR.""" from __future__ import print_function @@ -72,7 +76,7 @@ class EmailSender(object): part.set_payload(attachment.content) Encoders.encode_base64(part) part.add_header('Content-Disposition', - "attachment; filename=\"%s\"" % attachment.name) + 'attachment; filename="%s"' % attachment.name) msg.attach(part) # Send the message via our own SMTP server, but don't include the diff --git a/cros_utils/file_utils.py b/cros_utils/file_utils.py index 777e6251..fe6fc16f 100644 --- a/cros_utils/file_utils.py +++ b/cros_utils/file_utils.py @@ -1,4 +1,8 @@ -# Copyright 2011 Google Inc. All Rights Reserved. +# -*- coding: utf-8 -*- +# Copyright 2019 The Chromium OS Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + """Utilities for operations on files.""" from __future__ import print_function diff --git a/cros_utils/html_tools.py b/cros_utils/html_tools.py index 8ca795bf..688955ff 100644 --- a/cros_utils/html_tools.py +++ b/cros_utils/html_tools.py @@ -1,4 +1,8 @@ -# Copyright 2010 Google Inc. All Rights Reserved. +# -*- coding: utf-8 -*- +# Copyright 2019 The Chromium OS Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + """Utilities for generating html.""" diff --git a/cros_utils/logger.py b/cros_utils/logger.py index 364d9c9d..4cc4618e 100644 --- a/cros_utils/logger.py +++ b/cros_utils/logger.py @@ -1,4 +1,8 @@ -# Copyright 2010 Google Inc. All Rights Reserved. +# -*- coding: utf-8 -*- +# Copyright 2019 The Chromium OS Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + """Logging helper module.""" from __future__ import print_function @@ -9,7 +13,7 @@ import sys import traceback -#TODO(yunlian@google.com): Use GetRoot from misc +# TODO(yunlian@google.com): Use GetRoot from misc def GetRoot(scr_name): """Break up pathname into (dir+name).""" abs_path = os.path.abspath(scr_name) @@ -182,16 +186,12 @@ class Logger(object): self.LogWarning(msg) def LogCommandOutput(self, msg, print_to_console=True): - self.LogMsg(self.stdout, - self._GetStdout(print_to_console), - msg, - flush=False) + self.LogMsg( + self.stdout, self._GetStdout(print_to_console), msg, flush=False) def LogCommandError(self, msg, print_to_console=True): - self.LogMsg(self.stderr, - self._GetStderr(print_to_console), - msg, - flush=False) + self.LogMsg( + self.stderr, self._GetStderr(print_to_console), msg, flush=False) def Flush(self): self.cmdfd.flush() @@ -319,16 +319,12 @@ class MockLogger(object): self.LogWarning(msg) def LogCommandOutput(self, msg, print_to_console=True): - self.LogMsg(self.stdout, - self._GetStdout(print_to_console), - msg, - flush=False) + self.LogMsg( + self.stdout, self._GetStdout(print_to_console), msg, flush=False) def LogCommandError(self, msg, print_to_console=True): - self.LogMsg(self.stderr, - self._GetStderr(print_to_console), - msg, - flush=False) + self.LogMsg( + self.stderr, self._GetStderr(print_to_console), msg, flush=False) def Flush(self): print('MockLogger: Flushing cmdfd, stdout, stderr') @@ -363,7 +359,7 @@ def HandleUncaughtExceptions(fun): def _Interceptor(*args, **kwargs): try: return fun(*args, **kwargs) - except StandardError: + except Exception: GetLogger().LogFatal('Uncaught exception:\n%s' % traceback.format_exc()) return _Interceptor diff --git a/cros_utils/machines.py b/cros_utils/machines.py index 722df3b8..89b51b01 100644 --- a/cros_utils/machines.py +++ b/cros_utils/machines.py @@ -1,6 +1,8 @@ +# -*- coding: utf-8 -*- # Copyright 2015 The Chromium OS Authors. All rights reserved. # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. + """Utilities relating to machine-specific functions.""" from __future__ import print_function diff --git a/cros_utils/manifest_versions.py b/cros_utils/manifest_versions.py index 47e2fb20..83e88908 100644 --- a/cros_utils/manifest_versions.py +++ b/cros_utils/manifest_versions.py @@ -1,6 +1,8 @@ +# -*- coding: utf-8 -*- # Copyright (c) 2013 The Chromium OS Authors. All rights reserved. # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. + """Tools for searching/manipulating the manifests repository.""" from __future__ import print_function @@ -76,8 +78,8 @@ class ManifestVersions(object): if ret: logger.GetLogger().LogFatal('Failed to checkout manifest at ' 'specified time') - path = os.path.realpath( - '{0}/manifest-versions/LKGM/lkgm.xml'.format(self.clone_location)) + path = os.path.realpath('{0}/manifest-versions/LKGM/lkgm.xml'.format( + self.clone_location)) pp = path.split('/') new_list = copy.deepcopy(pp) for i, e in enumerate(pp): @@ -124,8 +126,8 @@ class ManifestVersions(object): if ret: logger.GetLogger().LogFatal('Failed to checkout manifest at ' 'specified time') - path = os.path.realpath( - '{0}/manifest-versions/LKGM/lkgm.xml'.format(self.clone_location)) + path = os.path.realpath('{0}/manifest-versions/LKGM/lkgm.xml'.format( + self.clone_location)) pp = path.split('/') small = os.path.basename(path).split('.xml')[0] version = pp[-2] + '.' + small @@ -142,7 +144,9 @@ class ManifestVersions(object): version = version.split('.', 1)[1] os.chdir(self.clone_location) files = [ - os.path.join(r, f) for r, _, fs in os.walk('.') for f in fs + os.path.join(r, f) + for r, _, fs in os.walk('.') + for f in fs if version in f ] if files: diff --git a/cros_utils/misc.py b/cros_utils/misc.py index 58076f40..48e3c724 100644 --- a/cros_utils/misc.py +++ b/cros_utils/misc.py @@ -1,8 +1,11 @@ +# -*- coding: utf-8 -*- # Copyright 2013 The Chromium OS Authors. All rights reserved. # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. + """Utilities for toolchain build.""" +from __future__ import division from __future__ import print_function __author__ = 'asharif@google.com (Ahmad Sharif)' @@ -71,7 +74,8 @@ def GetFilenameFromString(string): string, (r'/', '__'), (r'\s', '_'), - (r'[\\$="?^]', ''),) + (r'[\\$="?^]', ''), + ) def GetRoot(scr_name): @@ -160,9 +164,7 @@ def GetBuildImageCommand(board, dev=False): dev_args)) -def GetSetupBoardCommand(board, - usepkg=None, - force=None): +def GetSetupBoardCommand(board, usepkg=None, force=None): """Get setup_board command.""" options = [] @@ -202,8 +204,8 @@ def GetCtargetFromBoard(board, chromeos_root): def GetArchFromBoard(board, chromeos_root): """Get Arch from board.""" base_board = board.split('_')[0] - command = ('source %s; get_board_arch %s' % (TOOLCHAIN_UTILS_PATH, - base_board)) + command = ( + 'source %s; get_board_arch %s' % (TOOLCHAIN_UTILS_PATH, base_board)) ce = command_executer.GetCommandExecuter() ret, out, _ = ce.ChrootRunCommandWOutput(chromeos_root, command) if ret != 0: @@ -237,7 +239,7 @@ def GetChromeSrcDir(): def GetEnvStringFromDict(env_dict): - return ' '.join(["%s=\"%s\"" % var for var in env_dict.items()]) + return ' '.join(['%s="%s"' % var for var in env_dict.items()]) def MergeEnvStringWithDict(env_string, env_dict, prepend=True): @@ -247,11 +249,11 @@ def MergeEnvStringWithDict(env_string, env_dict, prepend=True): override_env_list = [] ce = command_executer.GetCommandExecuter() for k, v in env_dict.items(): - v = v.strip("\"'") + v = v.strip('"\'') if prepend: - new_env = "%s=\"%s $%s\"" % (k, v, k) + new_env = '%s="%s $%s"' % (k, v, k) else: - new_env = "%s=\"$%s %s\"" % (k, k, v) + new_env = '%s="$%s %s"' % (k, k, v) command = '; '.join([env_string, new_env, 'echo $%s' % k]) ret, out, _ = ce.RunCommandWOutput(command) override_env_list.append('%s=%r' % (k, out.strip())) @@ -479,8 +481,8 @@ def ApplyGerritPatches(chromeos_root, gerrit_patch_string, pi_str = '{project}:{ref}'.format(project=pi.project, ref=pi.ref) try: project_git_path = project_checkout.GetPath(absolute=True) - logger.GetLogger().LogOutput( - 'Applying patch "{0}" in "{1}" ...'.format(pi_str, project_git_path)) + logger.GetLogger().LogOutput('Applying patch "{0}" in "{1}" ...'.format( + pi_str, project_git_path)) pi.Apply(project_git_path, branch, trivial=False) except Exception: traceback.print_exc(file=sys.stdout) @@ -524,7 +526,8 @@ def BooleanPrompt(prompt='Do you want to continue?', while True: try: - response = raw_input(prompt).lower() + # pylint: disable=input-builtin, bad-builtin + response = input(prompt).lower() except EOFError: # If the user hits CTRL+D, or stdin is disabled, use the default. print() @@ -553,7 +556,7 @@ def rgb2short(r, g, b): greencolor = [255, 118, 82, 46, 10] if g == 0: - return redcolor[r / 52] + return redcolor[r // 52] if r == 0: - return greencolor[g / 52] + return greencolor[g // 52] return 4 diff --git a/cros_utils/misc_test.py b/cros_utils/misc_test.py index 80082207..cb7b8c89 100644..100755 --- a/cros_utils/misc_test.py +++ b/cros_utils/misc_test.py @@ -1,4 +1,9 @@ -# Copyright 2012 Google Inc. All Rights Reserved. +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright 2019 The Chromium OS Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + """Tests for misc.""" from __future__ import print_function diff --git a/cros_utils/no_pseudo_terminal_test.py b/cros_utils/no_pseudo_terminal_test.py index 41d71539..10fd9608 100755 --- a/cros_utils/no_pseudo_terminal_test.py +++ b/cros_utils/no_pseudo_terminal_test.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python2 +#!/usr/bin/env python3 # -*- coding: utf-8 -*- # # Copyright 2019 The Chromium OS Authors. All rights reserved. @@ -27,6 +27,7 @@ class NoPsuedoTerminalTest(unittest.TestCase): """Attaches strace to the current process.""" args = ['sudo', 'strace', '-o', output_file, '-p', str(os.getpid())] print(args) + # pylint: disable=bad-option-value, subprocess-popen-preexec-fn self._strace_process = subprocess.Popen(args, preexec_fn=os.setpgrp) # Wait until we see some activity. start_time = time.time() diff --git a/cros_utils/perf_diff.py b/cros_utils/perf_diff.py index 31cde994..fed6a81d 100755 --- a/cros_utils/perf_diff.py +++ b/cros_utils/perf_diff.py @@ -1,5 +1,9 @@ -#!/usr/bin/env python2 -# Copyright 2012 Google Inc. All Rights Reserved. +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright 2019 The Chromium OS Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + """One-line documentation for perf_diff module. A detailed description of perf_diff. @@ -10,6 +14,7 @@ from __future__ import print_function __author__ = 'asharif@google.com (Ahmad Sharif)' import argparse +import functools import re import sys @@ -38,7 +43,7 @@ def GetPerfDictFromReport(report_file): def _SortDictionaryByValue(d): - l = [(k, v) for (k, v) in d.iteritems()] + l = d.items() def GetFloat(x): if misc.IsFloat(x): @@ -271,11 +276,11 @@ class PerfDiffer(object): for report in self._reports: if section_name in report.sections: section = report.sections[section_name] - function_names = [f.name for f in section.functions] + function_names = {f.name for f in section.functions} function_names_list.append(function_names) self._common_function_names[section_name] = ( - reduce(set.intersection, map(set, function_names_list))) + functools.reduce(set.intersection, function_names_list)) def _GetTopFunctions(self, section_name, num_functions): all_functions = {} @@ -301,17 +306,19 @@ class PerfDiffer(object): def Main(argv): """The entry of the main.""" parser = argparse.ArgumentParser() - parser.add_argument('-n', - '--num_symbols', - dest='num_symbols', - default='5', - help='The number of symbols to show.') - parser.add_argument('-c', - '--common_only', - dest='common_only', - action='store_true', - default=False, - help='Diff common symbols only.') + parser.add_argument( + '-n', + '--num_symbols', + dest='num_symbols', + default='5', + help='The number of symbols to show.') + parser.add_argument( + '-c', + '--common_only', + dest='common_only', + action='store_true', + default=False, + help='Diff common symbols only.') options, args = parser.parse_known_args(argv) diff --git a/cros_utils/pstat.py b/cros_utils/pstat.py deleted file mode 100644 index 602fc0c7..00000000 --- a/cros_utils/pstat.py +++ /dev/null @@ -1,1077 +0,0 @@ -# We did not author this file nor mantain it. Skip linting it. -#pylint: skip-file -# Copyright (c) 1999-2007 Gary Strangman; All Rights Reserved. -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. -# -# Comments and/or additions are welcome (send e-mail to: -# strang@nmr.mgh.harvard.edu). -# -"""pstat.py module - -################################################# -####### Written by: Gary Strangman ########### -####### Last modified: Dec 18, 2007 ########### -################################################# - -This module provides some useful list and array manipulation routines -modeled after those found in the |Stat package by Gary Perlman, plus a -number of other useful list/file manipulation functions. The list-based -functions include: - - abut (source,*args) - simpleabut (source, addon) - colex (listoflists,cnums) - collapse (listoflists,keepcols,collapsecols,fcn1=None,fcn2=None,cfcn=None) - dm (listoflists,criterion) - flat (l) - linexand (listoflists,columnlist,valuelist) - linexor (listoflists,columnlist,valuelist) - linedelimited (inlist,delimiter) - lineincols (inlist,colsize) - lineincustcols (inlist,colsizes) - list2string (inlist) - makelol(inlist) - makestr(x) - printcc (lst,extra=2) - printincols (listoflists,colsize) - pl (listoflists) - printl(listoflists) - replace (lst,oldval,newval) - recode (inlist,listmap,cols='all') - remap (listoflists,criterion) - roundlist (inlist,num_digits_to_round_floats_to) - sortby(listoflists,sortcols) - unique (inlist) - duplicates(inlist) - writedelimited (listoflists, delimiter, file, writetype='w') - -Some of these functions have alternate versions which are defined only if -Numeric (NumPy) can be imported. These functions are generally named as -above, with an 'a' prefix. - - aabut (source, *args) - acolex (a,indices,axis=1) - acollapse (a,keepcols,collapsecols,sterr=0,ns=0) - adm (a,criterion) - alinexand (a,columnlist,valuelist) - alinexor (a,columnlist,valuelist) - areplace (a,oldval,newval) - arecode (a,listmap,col='all') - arowcompare (row1, row2) - arowsame (row1, row2) - asortrows(a,axis=0) - aunique(inarray) - aduplicates(inarray) - -Currently, the code is all but completely un-optimized. In many cases, the -array versions of functions amount simply to aliases to built-in array -functions/methods. Their inclusion here is for function name consistency. -""" - -## CHANGE LOG: -## ========== -## 07-11-26 ... edited to work with numpy -## 01-11-15 ... changed list2string() to accept a delimiter -## 01-06-29 ... converted exec()'s to eval()'s to make compatible with Py2.1 -## 01-05-31 ... added duplicates() and aduplicates() functions -## 00-12-28 ... license made GPL, docstring and import requirements -## 99-11-01 ... changed version to 0.3 -## 99-08-30 ... removed get, getstrings, put, aget, aput (into io.py) -## 03/27/99 ... added areplace function, made replace fcn recursive -## 12/31/98 ... added writefc function for ouput to fixed column sizes -## 12/07/98 ... fixed import problem (failed on collapse() fcn) -## added __version__ variable (now 0.2) -## 12/05/98 ... updated doc-strings -## added features to collapse() function -## added flat() function for lists -## fixed a broken asortrows() -## 11/16/98 ... fixed minor bug in aput for 1D arrays -## -## 11/08/98 ... fixed aput to output large arrays correctly - -import stats # required 3rd party module -import string, copy -from types import * - -__version__ = 0.4 - -###=========================== LIST FUNCTIONS ========================== -### -### Here are the list functions, DEFINED FOR ALL SYSTEMS. -### Array functions (for NumPy-enabled computers) appear below. -### - - -def abut(source, *args): - """ -Like the |Stat abut command. It concatenates two lists side-by-side -and returns the result. '2D' lists are also accomodated for either argument -(source or addon). CAUTION: If one list is shorter, it will be repeated -until it is as long as the longest list. If this behavior is not desired, -use pstat.simpleabut(). - -Usage: abut(source, args) where args=any # of lists -Returns: a list of lists as long as the LONGEST list past, source on the - 'left', lists in <args> attached consecutively on the 'right' -""" - - if type(source) not in [ListType, TupleType]: - source = [source] - for addon in args: - if type(addon) not in [ListType, TupleType]: - addon = [addon] - if len(addon) < len(source): # is source list longer? - if len(source) % len(addon) == 0: # are they integer multiples? - repeats = len(source) / len(addon) # repeat addon n times - origadd = copy.deepcopy(addon) - for i in range(repeats - 1): - addon = addon + origadd - else: - repeats = len(source) / len(addon) + 1 # repeat addon x times, - origadd = copy.deepcopy(addon) # x is NOT an integer - for i in range(repeats - 1): - addon = addon + origadd - addon = addon[0:len(source)] - elif len(source) < len(addon): # is addon list longer? - if len(addon) % len(source) == 0: # are they integer multiples? - repeats = len(addon) / len(source) # repeat source n times - origsour = copy.deepcopy(source) - for i in range(repeats - 1): - source = source + origsour - else: - repeats = len(addon) / len(source) + 1 # repeat source x times, - origsour = copy.deepcopy(source) # x is NOT an integer - for i in range(repeats - 1): - source = source + origsour - source = source[0:len(addon)] - - source = simpleabut(source, addon) - return source - - -def simpleabut(source, addon): - """ -Concatenates two lists as columns and returns the result. '2D' lists -are also accomodated for either argument (source or addon). This DOES NOT -repeat either list to make the 2 lists of equal length. Beware of list pairs -with different lengths ... the resulting list will be the length of the -FIRST list passed. - -Usage: simpleabut(source,addon) where source, addon=list (or list-of-lists) -Returns: a list of lists as long as source, with source on the 'left' and - addon on the 'right' -""" - if type(source) not in [ListType, TupleType]: - source = [source] - if type(addon) not in [ListType, TupleType]: - addon = [addon] - minlen = min(len(source), len(addon)) - list = copy.deepcopy(source) # start abut process - if type(source[0]) not in [ListType, TupleType]: - if type(addon[0]) not in [ListType, TupleType]: - for i in range(minlen): - list[i] = [source[i]] + [addon[i]] # source/addon = column - else: - for i in range(minlen): - list[i] = [source[i]] + addon[i] # addon=list-of-lists - else: - if type(addon[0]) not in [ListType, TupleType]: - for i in range(minlen): - list[i] = source[i] + [addon[i]] # source=list-of-lists - else: - for i in range(minlen): - list[i] = source[i] + addon[i] # source/addon = list-of-lists - source = list - return source - - -def colex(listoflists, cnums): - """ -Extracts from listoflists the columns specified in the list 'cnums' -(cnums can be an integer, a sequence of integers, or a string-expression that -corresponds to a slice operation on the variable x ... e.g., 'x[3:]' will colex -columns 3 onward from the listoflists). - -Usage: colex (listoflists,cnums) -Returns: a list-of-lists corresponding to the columns from listoflists - specified by cnums, in the order the column numbers appear in cnums -""" - global index - column = 0 - if type(cnums) in [ListType, TupleType]: # if multiple columns to get - index = cnums[0] - column = map(lambda x: x[index], listoflists) - for col in cnums[1:]: - index = col - column = abut(column, map(lambda x: x[index], listoflists)) - elif type(cnums) == StringType: # if an 'x[3:]' type expr. - evalstring = 'map(lambda x: x' + cnums + ', listoflists)' - column = eval(evalstring) - else: # else it's just 1 col to get - index = cnums - column = map(lambda x: x[index], listoflists) - return column - - -def collapse(listoflists, - keepcols, - collapsecols, - fcn1=None, - fcn2=None, - cfcn=None): - """ -Averages data in collapsecol, keeping all unique items in keepcols -(using unique, which keeps unique LISTS of column numbers), retaining the -unique sets of values in keepcols, the mean for each. Setting fcn1 -and/or fcn2 to point to a function rather than None (e.g., stats.sterr, len) -will append those results (e.g., the sterr, N) after each calculated mean. -cfcn is the collapse function to apply (defaults to mean, defined here in the -pstat module to avoid circular imports with stats.py, but harmonicmean or -others could be passed). - -Usage: collapse -(listoflists,keepcols,collapsecols,fcn1=None,fcn2=None,cfcn=None) -Returns: a list of lists with all unique permutations of entries appearing in - columns ("conditions") specified by keepcols, abutted with the result of - cfcn (if cfcn=None, defaults to the mean) of each column specified by - collapsecols. -""" - - def collmean(inlist): - s = 0 - for item in inlist: - s = s + item - return s / float(len(inlist)) - - if type(keepcols) not in [ListType, TupleType]: - keepcols = [keepcols] - if type(collapsecols) not in [ListType, TupleType]: - collapsecols = [collapsecols] - if cfcn == None: - cfcn = collmean - if keepcols == []: - means = [0] * len(collapsecols) - for i in range(len(collapsecols)): - avgcol = colex(listoflists, collapsecols[i]) - means[i] = cfcn(avgcol) - if fcn1: - try: - test = fcn1(avgcol) - except: - test = 'N/A' - means[i] = [means[i], test] - if fcn2: - try: - test = fcn2(avgcol) - except: - test = 'N/A' - try: - means[i] = means[i] + [len(avgcol)] - except TypeError: - means[i] = [means[i], len(avgcol)] - return means - else: - values = colex(listoflists, keepcols) - uniques = unique(values) - uniques.sort() - newlist = [] - if type(keepcols) not in [ListType, TupleType]: - keepcols = [keepcols] - for item in uniques: - if type(item) not in [ListType, TupleType]: - item = [item] - tmprows = linexand(listoflists, keepcols, item) - for col in collapsecols: - avgcol = colex(tmprows, col) - item.append(cfcn(avgcol)) - if fcn1 <> None: - try: - test = fcn1(avgcol) - except: - test = 'N/A' - item.append(test) - if fcn2 <> None: - try: - test = fcn2(avgcol) - except: - test = 'N/A' - item.append(test) - newlist.append(item) - return newlist - - -def dm(listoflists, criterion): - """ -Returns rows from the passed list of lists that meet the criteria in -the passed criterion expression (a string as a function of x; e.g., 'x[3]>=9' -will return all rows where the 4th column>=9 and "x[2]=='N'" will return rows -with column 2 equal to the string 'N'). - -Usage: dm (listoflists, criterion) -Returns: rows from listoflists that meet the specified criterion. -""" - function = 'filter(lambda x: ' + criterion + ',listoflists)' - lines = eval(function) - return lines - - -def flat(l): - """ -Returns the flattened version of a '2D' list. List-correlate to the a.ravel()() -method of NumPy arrays. - -Usage: flat(l) -""" - newl = [] - for i in range(len(l)): - for j in range(len(l[i])): - newl.append(l[i][j]) - return newl - - -def linexand(listoflists, columnlist, valuelist): - """ -Returns the rows of a list of lists where col (from columnlist) = val -(from valuelist) for EVERY pair of values (columnlist[i],valuelists[i]). -len(columnlist) must equal len(valuelist). - -Usage: linexand (listoflists,columnlist,valuelist) -Returns: the rows of listoflists where columnlist[i]=valuelist[i] for ALL i -""" - if type(columnlist) not in [ListType, TupleType]: - columnlist = [columnlist] - if type(valuelist) not in [ListType, TupleType]: - valuelist = [valuelist] - criterion = '' - for i in range(len(columnlist)): - if type(valuelist[i]) == StringType: - critval = '\'' + valuelist[i] + '\'' - else: - critval = str(valuelist[i]) - criterion = criterion + ' x[' + str(columnlist[ - i]) + ']==' + critval + ' and' - criterion = criterion[0:-3] # remove the "and" after the last crit - function = 'filter(lambda x: ' + criterion + ',listoflists)' - lines = eval(function) - return lines - - -def linexor(listoflists, columnlist, valuelist): - """ -Returns the rows of a list of lists where col (from columnlist) = val -(from valuelist) for ANY pair of values (colunmlist[i],valuelist[i[). -One value is required for each column in columnlist. If only one value -exists for columnlist but multiple values appear in valuelist, the -valuelist values are all assumed to pertain to the same column. - -Usage: linexor (listoflists,columnlist,valuelist) -Returns: the rows of listoflists where columnlist[i]=valuelist[i] for ANY i -""" - if type(columnlist) not in [ListType, TupleType]: - columnlist = [columnlist] - if type(valuelist) not in [ListType, TupleType]: - valuelist = [valuelist] - criterion = '' - if len(columnlist) == 1 and len(valuelist) > 1: - columnlist = columnlist * len(valuelist) - for i in range(len(columnlist)): # build an exec string - if type(valuelist[i]) == StringType: - critval = '\'' + valuelist[i] + '\'' - else: - critval = str(valuelist[i]) - criterion = criterion + ' x[' + str(columnlist[i]) + ']==' + critval + ' or' - criterion = criterion[0:-2] # remove the "or" after the last crit - function = 'filter(lambda x: ' + criterion + ',listoflists)' - lines = eval(function) - return lines - - -def linedelimited(inlist, delimiter): - """ -Returns a string composed of elements in inlist, with each element -separated by 'delimiter.' Used by function writedelimited. Use '\t' -for tab-delimiting. - -Usage: linedelimited (inlist,delimiter) -""" - outstr = '' - for item in inlist: - if type(item) <> StringType: - item = str(item) - outstr = outstr + item + delimiter - outstr = outstr[0:-1] - return outstr - - -def lineincols(inlist, colsize): - """ -Returns a string composed of elements in inlist, with each element -right-aligned in columns of (fixed) colsize. - -Usage: lineincols (inlist,colsize) where colsize is an integer -""" - outstr = '' - for item in inlist: - if type(item) <> StringType: - item = str(item) - size = len(item) - if size <= colsize: - for i in range(colsize - size): - outstr = outstr + ' ' - outstr = outstr + item - else: - outstr = outstr + item[0:colsize + 1] - return outstr - - -def lineincustcols(inlist, colsizes): - """ -Returns a string composed of elements in inlist, with each element -right-aligned in a column of width specified by a sequence colsizes. The -length of colsizes must be greater than or equal to the number of columns -in inlist. - -Usage: lineincustcols (inlist,colsizes) -Returns: formatted string created from inlist -""" - outstr = '' - for i in range(len(inlist)): - if type(inlist[i]) <> StringType: - item = str(inlist[i]) - else: - item = inlist[i] - size = len(item) - if size <= colsizes[i]: - for j in range(colsizes[i] - size): - outstr = outstr + ' ' - outstr = outstr + item - else: - outstr = outstr + item[0:colsizes[i] + 1] - return outstr - - -def list2string(inlist, delimit=' '): - """ -Converts a 1D list to a single long string for file output, using -the string.join function. - -Usage: list2string (inlist,delimit=' ') -Returns: the string created from inlist -""" - stringlist = map(makestr, inlist) - return string.join(stringlist, delimit) - - -def makelol(inlist): - """ -Converts a 1D list to a 2D list (i.e., a list-of-lists). Useful when you -want to use put() to write a 1D list one item per line in the file. - -Usage: makelol(inlist) -Returns: if l = [1,2,'hi'] then returns [[1],[2],['hi']] etc. -""" - x = [] - for item in inlist: - x.append([item]) - return x - - -def makestr(x): - if type(x) <> StringType: - x = str(x) - return x - - -def printcc(lst, extra=2): - """ -Prints a list of lists in columns, customized by the max size of items -within the columns (max size of items in col, plus 'extra' number of spaces). -Use 'dashes' or '\\n' in the list-of-lists to print dashes or blank lines, -respectively. - -Usage: printcc (lst,extra=2) -Returns: None -""" - if type(lst[0]) not in [ListType, TupleType]: - lst = [lst] - rowstokill = [] - list2print = copy.deepcopy(lst) - for i in range(len(lst)): - if lst[i] == [ - '\n' - ] or lst[i] == '\n' or lst[i] == 'dashes' or lst[i] == '' or lst[i] == ['']: - rowstokill = rowstokill + [i] - rowstokill.reverse() # delete blank rows from the end - for row in rowstokill: - del list2print[row] - maxsize = [0] * len(list2print[0]) - for col in range(len(list2print[0])): - items = colex(list2print, col) - items = map(makestr, items) - maxsize[col] = max(map(len, items)) + extra - for row in lst: - if row == ['\n'] or row == '\n' or row == '' or row == ['']: - print - elif row == ['dashes'] or row == 'dashes': - dashes = [0] * len(maxsize) - for j in range(len(maxsize)): - dashes[j] = '-' * (maxsize[j] - 2) - print lineincustcols(dashes, maxsize) - else: - print lineincustcols(row, maxsize) - return None - - -def printincols(listoflists, colsize): - """ -Prints a list of lists in columns of (fixed) colsize width, where -colsize is an integer. - -Usage: printincols (listoflists,colsize) -Returns: None -""" - for row in listoflists: - print lineincols(row, colsize) - return None - - -def pl(listoflists): - """ -Prints a list of lists, 1 list (row) at a time. - -Usage: pl(listoflists) -Returns: None -""" - for row in listoflists: - if row[-1] == '\n': - print row, - else: - print row - return None - - -def printl(listoflists): - """Alias for pl.""" - pl(listoflists) - return - - -def replace(inlst, oldval, newval): - """ -Replaces all occurrences of 'oldval' with 'newval', recursively. - -Usage: replace (inlst,oldval,newval) -""" - lst = inlst * 1 - for i in range(len(lst)): - if type(lst[i]) not in [ListType, TupleType]: - if lst[i] == oldval: - lst[i] = newval - else: - lst[i] = replace(lst[i], oldval, newval) - return lst - - -def recode(inlist, listmap, cols=None): - """ -Changes the values in a list to a new set of values (useful when -you need to recode data from (e.g.) strings to numbers. cols defaults -to None (meaning all columns are recoded). - -Usage: recode (inlist,listmap,cols=None) cols=recode cols, listmap=2D list -Returns: inlist with the appropriate values replaced with new ones -""" - lst = copy.deepcopy(inlist) - if cols != None: - if type(cols) not in [ListType, TupleType]: - cols = [cols] - for col in cols: - for row in range(len(lst)): - try: - idx = colex(listmap, 0).index(lst[row][col]) - lst[row][col] = listmap[idx][1] - except ValueError: - pass - else: - for row in range(len(lst)): - for col in range(len(lst)): - try: - idx = colex(listmap, 0).index(lst[row][col]) - lst[row][col] = listmap[idx][1] - except ValueError: - pass - return lst - - -def remap(listoflists, criterion): - """ -Remaps values in a given column of a 2D list (listoflists). This requires -a criterion as a function of 'x' so that the result of the following is -returned ... map(lambda x: 'criterion',listoflists). - -Usage: remap(listoflists,criterion) criterion=string -Returns: remapped version of listoflists -""" - function = 'map(lambda x: ' + criterion + ',listoflists)' - lines = eval(function) - return lines - - -def roundlist(inlist, digits): - """ -Goes through each element in a 1D or 2D inlist, and applies the following -function to all elements of FloatType ... round(element,digits). - -Usage: roundlist(inlist,digits) -Returns: list with rounded floats -""" - if type(inlist[0]) in [IntType, FloatType]: - inlist = [inlist] - l = inlist * 1 - for i in range(len(l)): - for j in range(len(l[i])): - if type(l[i][j]) == FloatType: - l[i][j] = round(l[i][j], digits) - return l - - -def sortby(listoflists, sortcols): - """ -Sorts a list of lists on the column(s) specified in the sequence -sortcols. - -Usage: sortby(listoflists,sortcols) -Returns: sorted list, unchanged column ordering -""" - newlist = abut(colex(listoflists, sortcols), listoflists) - newlist.sort() - try: - numcols = len(sortcols) - except TypeError: - numcols = 1 - crit = '[' + str(numcols) + ':]' - newlist = colex(newlist, crit) - return newlist - - -def unique(inlist): - """ -Returns all unique items in the passed list. If the a list-of-lists -is passed, unique LISTS are found (i.e., items in the first dimension are -compared). - -Usage: unique (inlist) -Returns: the unique elements (or rows) in inlist -""" - uniques = [] - for item in inlist: - if item not in uniques: - uniques.append(item) - return uniques - - -def duplicates(inlist): - """ -Returns duplicate items in the FIRST dimension of the passed list. - -Usage: duplicates (inlist) -""" - dups = [] - for i in range(len(inlist)): - if inlist[i] in inlist[i + 1:]: - dups.append(inlist[i]) - return dups - - -def nonrepeats(inlist): - """ -Returns items that are NOT duplicated in the first dim of the passed list. - -Usage: nonrepeats (inlist) -""" - nonrepeats = [] - for i in range(len(inlist)): - if inlist.count(inlist[i]) == 1: - nonrepeats.append(inlist[i]) - return nonrepeats - -#=================== PSTAT ARRAY FUNCTIONS ===================== -#=================== PSTAT ARRAY FUNCTIONS ===================== -#=================== PSTAT ARRAY FUNCTIONS ===================== -#=================== PSTAT ARRAY FUNCTIONS ===================== -#=================== PSTAT ARRAY FUNCTIONS ===================== -#=================== PSTAT ARRAY FUNCTIONS ===================== -#=================== PSTAT ARRAY FUNCTIONS ===================== -#=================== PSTAT ARRAY FUNCTIONS ===================== -#=================== PSTAT ARRAY FUNCTIONS ===================== -#=================== PSTAT ARRAY FUNCTIONS ===================== -#=================== PSTAT ARRAY FUNCTIONS ===================== -#=================== PSTAT ARRAY FUNCTIONS ===================== -#=================== PSTAT ARRAY FUNCTIONS ===================== -#=================== PSTAT ARRAY FUNCTIONS ===================== -#=================== PSTAT ARRAY FUNCTIONS ===================== -#=================== PSTAT ARRAY FUNCTIONS ===================== - -try: # DEFINE THESE *ONLY* IF numpy IS AVAILABLE - import numpy as N - - def aabut(source, *args): - """ -Like the |Stat abut command. It concatenates two arrays column-wise -and returns the result. CAUTION: If one array is shorter, it will be -repeated until it is as long as the other. - -Usage: aabut (source, args) where args=any # of arrays -Returns: an array as long as the LONGEST array past, source appearing on the - 'left', arrays in <args> attached on the 'right'. -""" - if len(source.shape) == 1: - width = 1 - source = N.resize(source, [source.shape[0], width]) - else: - width = source.shape[1] - for addon in args: - if len(addon.shape) == 1: - width = 1 - addon = N.resize(addon, [source.shape[0], width]) - else: - width = source.shape[1] - if len(addon) < len(source): - addon = N.resize(addon, [source.shape[0], addon.shape[1]]) - elif len(source) < len(addon): - source = N.resize(source, [addon.shape[0], source.shape[1]]) - source = N.concatenate((source, addon), 1) - return source - - def acolex(a, indices, axis=1): - """ -Extracts specified indices (a list) from passed array, along passed -axis (column extraction is default). BEWARE: A 1D array is presumed to be a -column-array (and that the whole array will be returned as a column). - -Usage: acolex (a,indices,axis=1) -Returns: the columns of a specified by indices -""" - if type(indices) not in [ListType, TupleType, N.ndarray]: - indices = [indices] - if len(N.shape(a)) == 1: - cols = N.resize(a, [a.shape[0], 1]) - else: - # print a[:3] - cols = N.take(a, indices, axis) -# print cols[:3] - return cols - - def acollapse(a, keepcols, collapsecols, fcn1=None, fcn2=None, cfcn=None): - """ -Averages data in collapsecol, keeping all unique items in keepcols -(using unique, which keeps unique LISTS of column numbers), retaining -the unique sets of values in keepcols, the mean for each. If stderror or -N of the mean are desired, set either or both parameters to 1. - -Usage: acollapse (a,keepcols,collapsecols,fcn1=None,fcn2=None,cfcn=None) -Returns: unique 'conditions' specified by the contents of columns specified - by keepcols, abutted with the mean(s) of column(s) specified by - collapsecols -""" - - def acollmean(inarray): - return N.sum(N.ravel(inarray)) - - if type(keepcols) not in [ListType, TupleType, N.ndarray]: - keepcols = [keepcols] - if type(collapsecols) not in [ListType, TupleType, N.ndarray]: - collapsecols = [collapsecols] - - if cfcn == None: - cfcn = acollmean - if keepcols == []: - avgcol = acolex(a, collapsecols) - means = N.sum(avgcol) / float(len(avgcol)) - if fcn1 <> None: - try: - test = fcn1(avgcol) - except: - test = N.array(['N/A'] * len(means)) - means = aabut(means, test) - if fcn2 <> None: - try: - test = fcn2(avgcol) - except: - test = N.array(['N/A'] * len(means)) - means = aabut(means, test) - return means - else: - if type(keepcols) not in [ListType, TupleType, N.ndarray]: - keepcols = [keepcols] - values = colex(a, keepcols) # so that "item" can be appended (below) - uniques = unique(values) # get a LIST, so .sort keeps rows intact - uniques.sort() - newlist = [] - for item in uniques: - if type(item) not in [ListType, TupleType, N.ndarray]: - item = [item] - tmprows = alinexand(a, keepcols, item) - for col in collapsecols: - avgcol = acolex(tmprows, col) - item.append(acollmean(avgcol)) - if fcn1 <> None: - try: - test = fcn1(avgcol) - except: - test = 'N/A' - item.append(test) - if fcn2 <> None: - try: - test = fcn2(avgcol) - except: - test = 'N/A' - item.append(test) - newlist.append(item) - try: - new_a = N.array(newlist) - except TypeError: - new_a = N.array(newlist, 'O') - return new_a - - def adm(a, criterion): - """ -Returns rows from the passed list of lists that meet the criteria in -the passed criterion expression (a string as a function of x). - -Usage: adm (a,criterion) where criterion is like 'x[2]==37' -""" - function = 'filter(lambda x: ' + criterion + ',a)' - lines = eval(function) - try: - lines = N.array(lines) - except: - lines = N.array(lines, dtype='O') - return lines - - def isstring(x): - if type(x) == StringType: - return 1 - else: - return 0 - - def alinexand(a, columnlist, valuelist): - """ -Returns the rows of an array where col (from columnlist) = val -(from valuelist). One value is required for each column in columnlist. - -Usage: alinexand (a,columnlist,valuelist) -Returns: the rows of a where columnlist[i]=valuelist[i] for ALL i -""" - if type(columnlist) not in [ListType, TupleType, N.ndarray]: - columnlist = [columnlist] - if type(valuelist) not in [ListType, TupleType, N.ndarray]: - valuelist = [valuelist] - criterion = '' - for i in range(len(columnlist)): - if type(valuelist[i]) == StringType: - critval = '\'' + valuelist[i] + '\'' - else: - critval = str(valuelist[i]) - criterion = criterion + ' x[' + str(columnlist[ - i]) + ']==' + critval + ' and' - criterion = criterion[0:-3] # remove the "and" after the last crit - return adm(a, criterion) - - def alinexor(a, columnlist, valuelist): - """ -Returns the rows of an array where col (from columnlist) = val (from -valuelist). One value is required for each column in columnlist. -The exception is if either columnlist or valuelist has only 1 value, -in which case that item will be expanded to match the length of the -other list. - -Usage: alinexor (a,columnlist,valuelist) -Returns: the rows of a where columnlist[i]=valuelist[i] for ANY i -""" - if type(columnlist) not in [ListType, TupleType, N.ndarray]: - columnlist = [columnlist] - if type(valuelist) not in [ListType, TupleType, N.ndarray]: - valuelist = [valuelist] - criterion = '' - if len(columnlist) == 1 and len(valuelist) > 1: - columnlist = columnlist * len(valuelist) - elif len(valuelist) == 1 and len(columnlist) > 1: - valuelist = valuelist * len(columnlist) - for i in range(len(columnlist)): - if type(valuelist[i]) == StringType: - critval = '\'' + valuelist[i] + '\'' - else: - critval = str(valuelist[i]) - criterion = criterion + ' x[' + str(columnlist[ - i]) + ']==' + critval + ' or' - criterion = criterion[0:-2] # remove the "or" after the last crit - return adm(a, criterion) - - def areplace(a, oldval, newval): - """ -Replaces all occurrences of oldval with newval in array a. - -Usage: areplace(a,oldval,newval) -""" - return N.where(a == oldval, newval, a) - - def arecode(a, listmap, col='all'): - """ -Remaps the values in an array to a new set of values (useful when -you need to recode data from (e.g.) strings to numbers as most stats -packages require. Can work on SINGLE columns, or 'all' columns at once. -@@@BROKEN 2007-11-26 - -Usage: arecode (a,listmap,col='all') -Returns: a version of array a where listmap[i][0] = (instead) listmap[i][1] -""" - ashape = a.shape - if col == 'all': - work = a.ravel() - else: - work = acolex(a, col) - work = work.ravel() - for pair in listmap: - if type(pair[ - 1]) == StringType or work.dtype.char == 'O' or a.dtype.char == 'O': - work = N.array(work, dtype='O') - a = N.array(a, dtype='O') - for i in range(len(work)): - if work[i] == pair[0]: - work[i] = pair[1] - if col == 'all': - return N.reshape(work, ashape) - else: - return N.concatenate( - [a[:, 0:col], work[:, N.newaxis], a[:, col + 1:]], 1) - else: # must be a non-Object type array and replacement - work = N.where(work == pair[0], pair[1], work) - return N.concatenate( - [a[:, 0:col], work[:, N.newaxis], a[:, col + 1:]], 1) - - def arowcompare(row1, row2): - """ -Compares two rows from an array, regardless of whether it is an -array of numbers or of python objects (which requires the cmp function). -@@@PURPOSE? 2007-11-26 - -Usage: arowcompare(row1,row2) -Returns: an array of equal length containing 1s where the two rows had - identical elements and 0 otherwise -""" - return - if row1.dtype.char == 'O' or row2.dtype == 'O': - cmpvect = N.logical_not( - abs(N.array(map(cmp, row1, row2)))) # cmp fcn gives -1,0,1 - else: - cmpvect = N.equal(row1, row2) - return cmpvect - - def arowsame(row1, row2): - """ -Compares two rows from an array, regardless of whether it is an -array of numbers or of python objects (which requires the cmp function). - -Usage: arowsame(row1,row2) -Returns: 1 if the two rows are identical, 0 otherwise. -""" - cmpval = N.alltrue(arowcompare(row1, row2)) - return cmpval - - def asortrows(a, axis=0): - """ -Sorts an array "by rows". This differs from the Numeric.sort() function, -which sorts elements WITHIN the given axis. Instead, this function keeps -the elements along the given axis intact, but shifts them 'up or down' -relative to one another. - -Usage: asortrows(a,axis=0) -Returns: sorted version of a -""" - return N.sort(a, axis=axis, kind='mergesort') - - def aunique(inarray): - """ -Returns unique items in the FIRST dimension of the passed array. Only -works on arrays NOT including string items. - -Usage: aunique (inarray) -""" - uniques = N.array([inarray[0]]) - if len(uniques.shape) == 1: # IF IT'S A 1D ARRAY - for item in inarray[1:]: - if N.add.reduce(N.equal(uniques, item).ravel()) == 0: - try: - uniques = N.concatenate([uniques, N.array[N.newaxis, :]]) - except TypeError: - uniques = N.concatenate([uniques, N.array([item])]) - else: # IT MUST BE A 2+D ARRAY - if inarray.dtype.char != 'O': # not an Object array - for item in inarray[1:]: - if not N.sum(N.alltrue(N.equal(uniques, item), 1)): - try: - uniques = N.concatenate([uniques, item[N.newaxis, :]]) - except TypeError: # the item to add isn't a list - uniques = N.concatenate([uniques, N.array([item])]) - else: - pass # this item is already in the uniques array - else: # must be an Object array, alltrue/equal functions don't work - for item in inarray[1:]: - newflag = 1 - for unq in uniques: # NOTE: cmp --> 0=same, -1=<, 1=> - test = N.sum(abs(N.array(map(cmp, item, unq)))) - if test == 0: # if item identical to any 1 row in uniques - newflag = 0 # then not a novel item to add - break - if newflag == 1: - try: - uniques = N.concatenate([uniques, item[N.newaxis, :]]) - except TypeError: # the item to add isn't a list - uniques = N.concatenate([uniques, N.array([item])]) - return uniques - - def aduplicates(inarray): - """ -Returns duplicate items in the FIRST dimension of the passed array. Only -works on arrays NOT including string items. - -Usage: aunique (inarray) -""" - inarray = N.array(inarray) - if len(inarray.shape) == 1: # IF IT'S A 1D ARRAY - dups = [] - inarray = inarray.tolist() - for i in range(len(inarray)): - if inarray[i] in inarray[i + 1:]: - dups.append(inarray[i]) - dups = aunique(dups) - else: # IT MUST BE A 2+D ARRAY - dups = [] - aslist = inarray.tolist() - for i in range(len(aslist)): - if aslist[i] in aslist[i + 1:]: - dups.append(aslist[i]) - dups = unique(dups) - dups = N.array(dups) - return dups - -except ImportError: # IF NUMERIC ISN'T AVAILABLE, SKIP ALL arrayfuncs - pass diff --git a/cros_utils/stats.py b/cros_utils/stats.py deleted file mode 100644 index 0387a076..00000000 --- a/cros_utils/stats.py +++ /dev/null @@ -1,4519 +0,0 @@ -# We did not author this file nor mantain it. Skip linting it. -#pylint: skip-file -# Copyright (c) 1999-2008 Gary Strangman; All Rights Reserved. -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. -# -# Comments and/or additions are welcome (send e-mail to: -# strang@nmr.mgh.harvard.edu). -# -"""stats.py module - -(Requires pstat.py module.) - -################################################# -####### Written by: Gary Strangman ########### -####### Last modified: Oct 31, 2008 ########### -################################################# - -A collection of basic statistical functions for python. The function -names appear below. - -IMPORTANT: There are really *3* sets of functions. The first set has an 'l' -prefix, which can be used with list or tuple arguments. The second set has -an 'a' prefix, which can accept NumPy array arguments. These latter -functions are defined only when NumPy is available on the system. The third -type has NO prefix (i.e., has the name that appears below). Functions of -this set are members of a "Dispatch" class, c/o David Ascher. This class -allows different functions to be called depending on the type of the passed -arguments. Thus, stats.mean is a member of the Dispatch class and -stats.mean(range(20)) will call stats.lmean(range(20)) while -stats.mean(Numeric.arange(20)) will call stats.amean(Numeric.arange(20)). -This is a handy way to keep consistent function names when different -argument types require different functions to be called. Having -implementated the Dispatch class, however, means that to get info on -a given function, you must use the REAL function name ... that is -"print stats.lmean.__doc__" or "print stats.amean.__doc__" work fine, -while "print stats.mean.__doc__" will print the doc for the Dispatch -class. NUMPY FUNCTIONS ('a' prefix) generally have more argument options -but should otherwise be consistent with the corresponding list functions. - -Disclaimers: The function list is obviously incomplete and, worse, the -functions are not optimized. All functions have been tested (some more -so than others), but they are far from bulletproof. Thus, as with any -free software, no warranty or guarantee is expressed or implied. :-) A -few extra functions that don't appear in the list below can be found by -interested treasure-hunters. These functions don't necessarily have -both list and array versions but were deemed useful - -CENTRAL TENDENCY: geometricmean - harmonicmean - mean - median - medianscore - mode - -MOMENTS: moment - variation - skew - kurtosis - skewtest (for Numpy arrays only) - kurtosistest (for Numpy arrays only) - normaltest (for Numpy arrays only) - -ALTERED VERSIONS: tmean (for Numpy arrays only) - tvar (for Numpy arrays only) - tmin (for Numpy arrays only) - tmax (for Numpy arrays only) - tstdev (for Numpy arrays only) - tsem (for Numpy arrays only) - describe - -FREQUENCY STATS: itemfreq - scoreatpercentile - percentileofscore - histogram - cumfreq - relfreq - -VARIABILITY: obrientransform - samplevar - samplestdev - signaltonoise (for Numpy arrays only) - var - stdev - sterr - sem - z - zs - zmap (for Numpy arrays only) - -TRIMMING FCNS: threshold (for Numpy arrays only) - trimboth - trim1 - round (round all vals to 'n' decimals; Numpy only) - -CORRELATION FCNS: covariance (for Numpy arrays only) - correlation (for Numpy arrays only) - paired - pearsonr - spearmanr - pointbiserialr - kendalltau - linregress - -INFERENTIAL STATS: ttest_1samp - ttest_ind - ttest_rel - chisquare - ks_2samp - mannwhitneyu - ranksums - wilcoxont - kruskalwallish - friedmanchisquare - -PROBABILITY CALCS: chisqprob - erfcc - zprob - ksprob - fprob - betacf - gammln - betai - -ANOVA FUNCTIONS: F_oneway - F_value - -SUPPORT FUNCTIONS: writecc - incr - sign (for Numpy arrays only) - sum - cumsum - ss - summult - sumdiffsquared - square_of_sums - shellsort - rankdata - outputpairedstats - findwithin -""" -## CHANGE LOG: -## =========== -## 09-07-21 ... added capability for getting the 'proportion' out of l/amannwhitneyu (but comment-disabled) -## 08-10-31 ... fixed import LinearAlgebra bug before glm fcns -## 07-11-26 ... conversion for numpy started -## 07-05-16 ... added Lin's Concordance Correlation Coefficient (alincc) and acov -## 05-08-21 ... added "Dice's coefficient" -## 04-10-26 ... added ap2t(), an ugly fcn for converting p-vals to T-vals -## 04-04-03 ... added amasslinregress() function to do regression on N-D arrays -## 03-01-03 ... CHANGED VERSION TO 0.6 -## fixed atsem() to properly handle limits=None case -## improved histogram and median functions (estbinwidth) and -## fixed atvar() function (wrong answers for neg numbers?!?) -## 02-11-19 ... fixed attest_ind and attest_rel for div-by-zero Overflows -## 02-05-10 ... fixed lchisqprob indentation (failed when df=even) -## 00-12-28 ... removed aanova() to separate module, fixed licensing to -## match Python License, fixed doc string & imports -## 00-04-13 ... pulled all "global" statements, except from aanova() -## added/fixed lots of documentation, removed io.py dependency -## changed to version 0.5 -## 99-11-13 ... added asign() function -## 99-11-01 ... changed version to 0.4 ... enough incremental changes now -## 99-10-25 ... added acovariance and acorrelation functions -## 99-10-10 ... fixed askew/akurtosis to avoid divide-by-zero errors -## added aglm function (crude, but will be improved) -## 99-10-04 ... upgraded acumsum, ass, asummult, asamplevar, avar, etc. to -## all handle lists of 'dimension's and keepdims -## REMOVED ar0, ar2, ar3, ar4 and replaced them with around -## reinserted fixes for abetai to avoid math overflows -## 99-09-05 ... rewrote achisqprob/aerfcc/aksprob/afprob/abetacf/abetai to -## handle multi-dimensional arrays (whew!) -## 99-08-30 ... fixed l/amoment, l/askew, l/akurtosis per D'Agostino (1990) -## added anormaltest per same reference -## re-wrote azprob to calc arrays of probs all at once -## 99-08-22 ... edited attest_ind printing section so arrays could be rounded -## 99-08-19 ... fixed amean and aharmonicmean for non-error(!) overflow on -## short/byte arrays (mean of #s btw 100-300 = -150??) -## 99-08-09 ... fixed asum so that the None case works for Byte arrays -## 99-08-08 ... fixed 7/3 'improvement' to handle t-calcs on N-D arrays -## 99-07-03 ... improved attest_ind, attest_rel (zero-division errortrap) -## 99-06-24 ... fixed bug(?) in attest_ind (n1=a.shape[0]) -## 04/11/99 ... added asignaltonoise, athreshold functions, changed all -## max/min in array section to N.maximum/N.minimum, -## fixed square_of_sums to prevent integer overflow -## 04/10/99 ... !!! Changed function name ... sumsquared ==> square_of_sums -## 03/18/99 ... Added ar0, ar2, ar3 and ar4 rounding functions -## 02/28/99 ... Fixed aobrientransform to return an array rather than a list -## 01/15/99 ... Essentially ceased updating list-versions of functions (!!!) -## 01/13/99 ... CHANGED TO VERSION 0.3 -## fixed bug in a/lmannwhitneyu p-value calculation -## 12/31/98 ... fixed variable-name bug in ldescribe -## 12/19/98 ... fixed bug in findwithin (fcns needed pstat. prefix) -## 12/16/98 ... changed amedianscore to return float (not array) for 1 score -## 12/14/98 ... added atmin and atmax functions -## removed umath from import line (not needed) -## l/ageometricmean modified to reduce chance of overflows (take -## nth root first, then multiply) -## 12/07/98 ... added __version__variable (now 0.2) -## removed all 'stats.' from anova() fcn -## 12/06/98 ... changed those functions (except shellsort) that altered -## arguments in-place ... cumsum, ranksort, ... -## updated (and fixed some) doc-strings -## 12/01/98 ... added anova() function (requires NumPy) -## incorporated Dispatch class -## 11/12/98 ... added functionality to amean, aharmonicmean, ageometricmean -## added 'asum' function (added functionality to N.add.reduce) -## fixed both moment and amoment (two errors) -## changed name of skewness and askewness to skew and askew -## fixed (a)histogram (which sometimes counted points <lowerlimit) - -import pstat # required 3rd party module -import math, string, copy # required python modules -from types import * - -__version__ = 0.6 - -############# DISPATCH CODE ############## - - -class Dispatch: - """ -The Dispatch class, care of David Ascher, allows different functions to -be called depending on the argument types. This way, there can be one -function name regardless of the argument type. To access function doc -in stats.py module, prefix the function with an 'l' or 'a' for list or -array arguments, respectively. That is, print stats.lmean.__doc__ or -print stats.amean.__doc__ or whatever. -""" - - def __init__(self, *tuples): - self._dispatch = {} - for func, types in tuples: - for t in types: - if t in self._dispatch.keys(): - raise ValueError, "can't have two dispatches on " + str(t) - self._dispatch[t] = func - self._types = self._dispatch.keys() - - def __call__(self, arg1, *args, **kw): - if type(arg1) not in self._types: - raise TypeError, "don't know how to dispatch %s arguments" % type(arg1) - return apply(self._dispatch[type(arg1)], (arg1,) + args, kw) - -########################################################################## -######################## LIST-BASED FUNCTIONS ######################## -########################################################################## - -### Define these regardless - -#################################### -####### CENTRAL TENDENCY ######### -#################################### - - -def lgeometricmean(inlist): - """ -Calculates the geometric mean of the values in the passed list. -That is: n-th root of (x1 * x2 * ... * xn). Assumes a '1D' list. - -Usage: lgeometricmean(inlist) -""" - mult = 1.0 - one_over_n = 1.0 / len(inlist) - for item in inlist: - mult = mult * pow(item, one_over_n) - return mult - - -def lharmonicmean(inlist): - """ -Calculates the harmonic mean of the values in the passed list. -That is: n / (1/x1 + 1/x2 + ... + 1/xn). Assumes a '1D' list. - -Usage: lharmonicmean(inlist) -""" - sum = 0 - for item in inlist: - sum = sum + 1.0 / item - return len(inlist) / sum - - -def lmean(inlist): - """ -Returns the arithematic mean of the values in the passed list. -Assumes a '1D' list, but will function on the 1st dim of an array(!). - -Usage: lmean(inlist) -""" - sum = 0 - for item in inlist: - sum = sum + item - return sum / float(len(inlist)) - - -def lmedian(inlist, numbins=1000): - """ -Returns the computed median value of a list of numbers, given the -number of bins to use for the histogram (more bins brings the computed value -closer to the median score, default number of bins = 1000). See G.W. -Heiman's Basic Stats (1st Edition), or CRC Probability & Statistics. - -Usage: lmedian (inlist, numbins=1000) -""" - (hist, smallest, binsize, extras) = histogram( - inlist, numbins, [min(inlist), max(inlist)]) # make histog - cumhist = cumsum(hist) # make cumulative histogram - for i in range(len(cumhist)): # get 1st(!) index holding 50%ile score - if cumhist[i] >= len(inlist) / 2.0: - cfbin = i - break - LRL = smallest + binsize * cfbin # get lower read limit of that bin - cfbelow = cumhist[cfbin - 1] - freq = float(hist[cfbin]) # frequency IN the 50%ile bin - median = LRL + ( - (len(inlist) / 2.0 - cfbelow) / float(freq)) * binsize # median formula - return median - - -def lmedianscore(inlist): - """ -Returns the 'middle' score of the passed list. If there is an even -number of scores, the mean of the 2 middle scores is returned. - -Usage: lmedianscore(inlist) -""" - - newlist = copy.deepcopy(inlist) - newlist.sort() - if len(newlist) % 2 == 0: # if even number of scores, average middle 2 - index = len(newlist) / 2 # integer division correct - median = float(newlist[index] + newlist[index - 1]) / 2 - else: - index = len(newlist) / 2 # int divsion gives mid value when count from 0 - median = newlist[index] - return median - - -def lmode(inlist): - """ -Returns a list of the modal (most common) score(s) in the passed -list. If there is more than one such score, all are returned. The -bin-count for the mode(s) is also returned. - -Usage: lmode(inlist) -Returns: bin-count for mode(s), a list of modal value(s) -""" - - scores = pstat.unique(inlist) - scores.sort() - freq = [] - for item in scores: - freq.append(inlist.count(item)) - maxfreq = max(freq) - mode = [] - stillmore = 1 - while stillmore: - try: - indx = freq.index(maxfreq) - mode.append(scores[indx]) - del freq[indx] - del scores[indx] - except ValueError: - stillmore = 0 - return maxfreq, mode - -#################################### -############ MOMENTS ############# -#################################### - - -def lmoment(inlist, moment=1): - """ -Calculates the nth moment about the mean for a sample (defaults to -the 1st moment). Used to calculate coefficients of skewness and kurtosis. - -Usage: lmoment(inlist,moment=1) -Returns: appropriate moment (r) from ... 1/n * SUM((inlist(i)-mean)**r) -""" - if moment == 1: - return 0.0 - else: - mn = mean(inlist) - n = len(inlist) - s = 0 - for x in inlist: - s = s + (x - mn)**moment - return s / float(n) - - -def lvariation(inlist): - """ -Returns the coefficient of variation, as defined in CRC Standard -Probability and Statistics, p.6. - -Usage: lvariation(inlist) -""" - return 100.0 * samplestdev(inlist) / float(mean(inlist)) - - -def lskew(inlist): - """ -Returns the skewness of a distribution, as defined in Numerical -Recipies (alternate defn in CRC Standard Probability and Statistics, p.6.) - -Usage: lskew(inlist) -""" - return moment(inlist, 3) / pow(moment(inlist, 2), 1.5) - - -def lkurtosis(inlist): - """ -Returns the kurtosis of a distribution, as defined in Numerical -Recipies (alternate defn in CRC Standard Probability and Statistics, p.6.) - -Usage: lkurtosis(inlist) -""" - return moment(inlist, 4) / pow(moment(inlist, 2), 2.0) - - -def ldescribe(inlist): - """ -Returns some descriptive statistics of the passed list (assumed to be 1D). - -Usage: ldescribe(inlist) -Returns: n, mean, standard deviation, skew, kurtosis -""" - n = len(inlist) - mm = (min(inlist), max(inlist)) - m = mean(inlist) - sd = stdev(inlist) - sk = skew(inlist) - kurt = kurtosis(inlist) - return n, mm, m, sd, sk, kurt - -#################################### -####### FREQUENCY STATS ########## -#################################### - - -def litemfreq(inlist): - """ -Returns a list of pairs. Each pair consists of one of the scores in inlist -and it's frequency count. Assumes a 1D list is passed. - -Usage: litemfreq(inlist) -Returns: a 2D frequency table (col [0:n-1]=scores, col n=frequencies) -""" - scores = pstat.unique(inlist) - scores.sort() - freq = [] - for item in scores: - freq.append(inlist.count(item)) - return pstat.abut(scores, freq) - - -def lscoreatpercentile(inlist, percent): - """ -Returns the score at a given percentile relative to the distribution -given by inlist. - -Usage: lscoreatpercentile(inlist,percent) -""" - if percent > 1: - print '\nDividing percent>1 by 100 in lscoreatpercentile().\n' - percent = percent / 100.0 - targetcf = percent * len(inlist) - h, lrl, binsize, extras = histogram(inlist) - cumhist = cumsum(copy.deepcopy(h)) - for i in range(len(cumhist)): - if cumhist[i] >= targetcf: - break - score = binsize * ( - (targetcf - cumhist[i - 1]) / float(h[i])) + (lrl + binsize * i) - return score - - -def lpercentileofscore(inlist, score, histbins=10, defaultlimits=None): - """ -Returns the percentile value of a score relative to the distribution -given by inlist. Formula depends on the values used to histogram the data(!). - -Usage: lpercentileofscore(inlist,score,histbins=10,defaultlimits=None) -""" - - h, lrl, binsize, extras = histogram(inlist, histbins, defaultlimits) - cumhist = cumsum(copy.deepcopy(h)) - i = int((score - lrl) / float(binsize)) - pct = (cumhist[i - 1] + ( - (score - - (lrl + binsize * i)) / float(binsize)) * h[i]) / float(len(inlist)) * 100 - return pct - - -def lhistogram(inlist, numbins=10, defaultreallimits=None, printextras=0): - """ -Returns (i) a list of histogram bin counts, (ii) the smallest value -of the histogram binning, and (iii) the bin width (the last 2 are not -necessarily integers). Default number of bins is 10. If no sequence object -is given for defaultreallimits, the routine picks (usually non-pretty) bins -spanning all the numbers in the inlist. - -Usage: lhistogram (inlist, numbins=10, -defaultreallimits=None,suppressoutput=0) -Returns: list of bin values, lowerreallimit, binsize, extrapoints -""" - if (defaultreallimits <> None): - if type(defaultreallimits) not in [ListType, TupleType] or len( - defaultreallimits) == 1: # only one limit given, assumed to be lower one & upper is calc'd - lowerreallimit = defaultreallimits - upperreallimit = 1.000001 * max(inlist) - else: # assume both limits given - lowerreallimit = defaultreallimits[0] - upperreallimit = defaultreallimits[1] - binsize = (upperreallimit - lowerreallimit) / float(numbins) - else: # no limits given for histogram, both must be calc'd - estbinwidth = (max(inlist) - - min(inlist)) / float(numbins) + 1e-6 #1=>cover all - binsize = ((max(inlist) - min(inlist) + estbinwidth)) / float(numbins) - lowerreallimit = min(inlist) - binsize / 2 #lower real limit,1st bin - bins = [0] * (numbins) - extrapoints = 0 - for num in inlist: - try: - if (num - lowerreallimit) < 0: - extrapoints = extrapoints + 1 - else: - bintoincrement = int((num - lowerreallimit) / float(binsize)) - bins[bintoincrement] = bins[bintoincrement] + 1 - except: - extrapoints = extrapoints + 1 - if (extrapoints > 0 and printextras == 1): - print '\nPoints outside given histogram range =', extrapoints - return (bins, lowerreallimit, binsize, extrapoints) - - -def lcumfreq(inlist, numbins=10, defaultreallimits=None): - """ -Returns a cumulative frequency histogram, using the histogram function. - -Usage: lcumfreq(inlist,numbins=10,defaultreallimits=None) -Returns: list of cumfreq bin values, lowerreallimit, binsize, extrapoints -""" - h, l, b, e = histogram(inlist, numbins, defaultreallimits) - cumhist = cumsum(copy.deepcopy(h)) - return cumhist, l, b, e - - -def lrelfreq(inlist, numbins=10, defaultreallimits=None): - """ -Returns a relative frequency histogram, using the histogram function. - -Usage: lrelfreq(inlist,numbins=10,defaultreallimits=None) -Returns: list of cumfreq bin values, lowerreallimit, binsize, extrapoints -""" - h, l, b, e = histogram(inlist, numbins, defaultreallimits) - for i in range(len(h)): - h[i] = h[i] / float(len(inlist)) - return h, l, b, e - -#################################### -##### VARIABILITY FUNCTIONS ###### -#################################### - - -def lobrientransform(*args): - """ -Computes a transform on input data (any number of columns). Used to -test for homogeneity of variance prior to running one-way stats. From -Maxwell and Delaney, p.112. - -Usage: lobrientransform(*args) -Returns: transformed data for use in an ANOVA -""" - TINY = 1e-10 - k = len(args) - n = [0.0] * k - v = [0.0] * k - m = [0.0] * k - nargs = [] - for i in range(k): - nargs.append(copy.deepcopy(args[i])) - n[i] = float(len(nargs[i])) - v[i] = var(nargs[i]) - m[i] = mean(nargs[i]) - for j in range(k): - for i in range(n[j]): - t1 = (n[j] - 1.5) * n[j] * (nargs[j][i] - m[j])**2 - t2 = 0.5 * v[j] * (n[j] - 1.0) - t3 = (n[j] - 1.0) * (n[j] - 2.0) - nargs[j][i] = (t1 - t2) / float(t3) - check = 1 - for j in range(k): - if v[j] - mean(nargs[j]) > TINY: - check = 0 - if check <> 1: - raise ValueError, 'Problem in obrientransform.' - else: - return nargs - - -def lsamplevar(inlist): - """ -Returns the variance of the values in the passed list using -N for the denominator (i.e., DESCRIBES the sample variance only). - -Usage: lsamplevar(inlist) -""" - n = len(inlist) - mn = mean(inlist) - deviations = [] - for item in inlist: - deviations.append(item - mn) - return ss(deviations) / float(n) - - -def lsamplestdev(inlist): - """ -Returns the standard deviation of the values in the passed list using -N for the denominator (i.e., DESCRIBES the sample stdev only). - -Usage: lsamplestdev(inlist) -""" - return math.sqrt(samplevar(inlist)) - - -def lcov(x, y, keepdims=0): - """ -Returns the estimated covariance of the values in the passed -array (i.e., N-1). Dimension can equal None (ravel array first), an -integer (the dimension over which to operate), or a sequence (operate -over multiple dimensions). Set keepdims=1 to return an array with the -same number of dimensions as inarray. - -Usage: lcov(x,y,keepdims=0) -""" - - n = len(x) - xmn = mean(x) - ymn = mean(y) - xdeviations = [0] * len(x) - ydeviations = [0] * len(y) - for i in range(len(x)): - xdeviations[i] = x[i] - xmn - ydeviations[i] = y[i] - ymn - ss = 0.0 - for i in range(len(xdeviations)): - ss = ss + xdeviations[i] * ydeviations[i] - return ss / float(n - 1) - - -def lvar(inlist): - """ -Returns the variance of the values in the passed list using N-1 -for the denominator (i.e., for estimating population variance). - -Usage: lvar(inlist) -""" - n = len(inlist) - mn = mean(inlist) - deviations = [0] * len(inlist) - for i in range(len(inlist)): - deviations[i] = inlist[i] - mn - return ss(deviations) / float(n - 1) - - -def lstdev(inlist): - """ -Returns the standard deviation of the values in the passed list -using N-1 in the denominator (i.e., to estimate population stdev). - -Usage: lstdev(inlist) -""" - return math.sqrt(var(inlist)) - - -def lsterr(inlist): - """ -Returns the standard error of the values in the passed list using N-1 -in the denominator (i.e., to estimate population standard error). - -Usage: lsterr(inlist) -""" - return stdev(inlist) / float(math.sqrt(len(inlist))) - - -def lsem(inlist): - """ -Returns the estimated standard error of the mean (sx-bar) of the -values in the passed list. sem = stdev / sqrt(n) - -Usage: lsem(inlist) -""" - sd = stdev(inlist) - n = len(inlist) - return sd / math.sqrt(n) - - -def lz(inlist, score): - """ -Returns the z-score for a given input score, given that score and the -list from which that score came. Not appropriate for population calculations. - -Usage: lz(inlist, score) -""" - z = (score - mean(inlist)) / samplestdev(inlist) - return z - - -def lzs(inlist): - """ -Returns a list of z-scores, one for each score in the passed list. - -Usage: lzs(inlist) -""" - zscores = [] - for item in inlist: - zscores.append(z(inlist, item)) - return zscores - -#################################### -####### TRIMMING FUNCTIONS ####### -#################################### - - -def ltrimboth(l, proportiontocut): - """ -Slices off the passed proportion of items from BOTH ends of the passed -list (i.e., with proportiontocut=0.1, slices 'leftmost' 10% AND 'rightmost' -10% of scores. Assumes list is sorted by magnitude. Slices off LESS if -proportion results in a non-integer slice index (i.e., conservatively -slices off proportiontocut). - -Usage: ltrimboth (l,proportiontocut) -Returns: trimmed version of list l -""" - lowercut = int(proportiontocut * len(l)) - uppercut = len(l) - lowercut - return l[lowercut:uppercut] - - -def ltrim1(l, proportiontocut, tail='right'): - """ -Slices off the passed proportion of items from ONE end of the passed -list (i.e., if proportiontocut=0.1, slices off 'leftmost' or 'rightmost' -10% of scores). Slices off LESS if proportion results in a non-integer -slice index (i.e., conservatively slices off proportiontocut). - -Usage: ltrim1 (l,proportiontocut,tail='right') or set tail='left' -Returns: trimmed version of list l -""" - if tail == 'right': - lowercut = 0 - uppercut = len(l) - int(proportiontocut * len(l)) - elif tail == 'left': - lowercut = int(proportiontocut * len(l)) - uppercut = len(l) - return l[lowercut:uppercut] - -#################################### -##### CORRELATION FUNCTIONS ###### -#################################### - - -def lpaired(x, y): - """ -Interactively determines the type of data and then runs the -appropriated statistic for paired group data. - -Usage: lpaired(x,y) -Returns: appropriate statistic name, value, and probability -""" - samples = '' - while samples not in ['i', 'r', 'I', 'R', 'c', 'C']: - print '\nIndependent or related samples, or correlation (i,r,c): ', - samples = raw_input() - - if samples in ['i', 'I', 'r', 'R']: - print '\nComparing variances ...', - # USE O'BRIEN'S TEST FOR HOMOGENEITY OF VARIANCE, Maxwell & delaney, p.112 - r = obrientransform(x, y) - f, p = F_oneway(pstat.colex(r, 0), pstat.colex(r, 1)) - if p < 0.05: - vartype = 'unequal, p=' + str(round(p, 4)) - else: - vartype = 'equal' - print vartype - if samples in ['i', 'I']: - if vartype[0] == 'e': - t, p = ttest_ind(x, y, 0) - print '\nIndependent samples t-test: ', round(t, 4), round(p, 4) - else: - if len(x) > 20 or len(y) > 20: - z, p = ranksums(x, y) - print '\nRank Sums test (NONparametric, n>20): ', round(z, 4), round( - p, 4) - else: - u, p = mannwhitneyu(x, y) - print '\nMann-Whitney U-test (NONparametric, ns<20): ', round( - u, 4), round(p, 4) - - else: # RELATED SAMPLES - if vartype[0] == 'e': - t, p = ttest_rel(x, y, 0) - print '\nRelated samples t-test: ', round(t, 4), round(p, 4) - else: - t, p = ranksums(x, y) - print '\nWilcoxon T-test (NONparametric): ', round(t, 4), round(p, 4) - else: # CORRELATION ANALYSIS - corrtype = '' - while corrtype not in ['c', 'C', 'r', 'R', 'd', 'D']: - print '\nIs the data Continuous, Ranked, or Dichotomous (c,r,d): ', - corrtype = raw_input() - if corrtype in ['c', 'C']: - m, b, r, p, see = linregress(x, y) - print '\nLinear regression for continuous variables ...' - lol = [['Slope', 'Intercept', 'r', 'Prob', 'SEestimate'], - [round(m, 4), round(b, 4), round(r, 4), round(p, 4), round(see, 4)] - ] - pstat.printcc(lol) - elif corrtype in ['r', 'R']: - r, p = spearmanr(x, y) - print '\nCorrelation for ranked variables ...' - print "Spearman's r: ", round(r, 4), round(p, 4) - else: # DICHOTOMOUS - r, p = pointbiserialr(x, y) - print '\nAssuming x contains a dichotomous variable ...' - print 'Point Biserial r: ', round(r, 4), round(p, 4) - print '\n\n' - return None - - -def lpearsonr(x, y): - """ -Calculates a Pearson correlation coefficient and the associated -probability value. Taken from Heiman's Basic Statistics for the Behav. -Sci (2nd), p.195. - -Usage: lpearsonr(x,y) where x and y are equal-length lists -Returns: Pearson's r value, two-tailed p-value -""" - TINY = 1.0e-30 - if len(x) <> len(y): - raise ValueError, 'Input values not paired in pearsonr. Aborting.' - n = len(x) - x = map(float, x) - y = map(float, y) - xmean = mean(x) - ymean = mean(y) - r_num = n * (summult(x, y)) - sum(x) * sum(y) - r_den = math.sqrt((n * ss(x) - square_of_sums(x)) * - (n * ss(y) - square_of_sums(y))) - r = (r_num / r_den) # denominator already a float - df = n - 2 - t = r * math.sqrt(df / ((1.0 - r + TINY) * (1.0 + r + TINY))) - prob = betai(0.5 * df, 0.5, df / float(df + t * t)) - return r, prob - - -def llincc(x, y): - """ -Calculates Lin's concordance correlation coefficient. - -Usage: alincc(x,y) where x, y are equal-length arrays -Returns: Lin's CC -""" - covar = lcov(x, y) * (len(x) - 1) / float(len(x)) # correct denom to n - xvar = lvar(x) * (len(x) - 1) / float(len(x)) # correct denom to n - yvar = lvar(y) * (len(y) - 1) / float(len(y)) # correct denom to n - lincc = (2 * covar) / ((xvar + yvar) + ((amean(x) - amean(y))**2)) - return lincc - - -def lspearmanr(x, y): - """ -Calculates a Spearman rank-order correlation coefficient. Taken -from Heiman's Basic Statistics for the Behav. Sci (1st), p.192. - -Usage: lspearmanr(x,y) where x and y are equal-length lists -Returns: Spearman's r, two-tailed p-value -""" - TINY = 1e-30 - if len(x) <> len(y): - raise ValueError, 'Input values not paired in spearmanr. Aborting.' - n = len(x) - rankx = rankdata(x) - ranky = rankdata(y) - dsq = sumdiffsquared(rankx, ranky) - rs = 1 - 6 * dsq / float(n * (n**2 - 1)) - t = rs * math.sqrt((n - 2) / ((rs + 1.0) * (1.0 - rs))) - df = n - 2 - probrs = betai(0.5 * df, 0.5, df / (df + t * t)) # t already a float - # probability values for rs are from part 2 of the spearman function in - # Numerical Recipies, p.510. They are close to tables, but not exact. (?) - return rs, probrs - - -def lpointbiserialr(x, y): - """ -Calculates a point-biserial correlation coefficient and the associated -probability value. Taken from Heiman's Basic Statistics for the Behav. -Sci (1st), p.194. - -Usage: lpointbiserialr(x,y) where x,y are equal-length lists -Returns: Point-biserial r, two-tailed p-value -""" - TINY = 1e-30 - if len(x) <> len(y): - raise ValueError, 'INPUT VALUES NOT PAIRED IN pointbiserialr. ABORTING.' - data = pstat.abut(x, y) - categories = pstat.unique(x) - if len(categories) <> 2: - raise ValueError, 'Exactly 2 categories required for pointbiserialr().' - else: # there are 2 categories, continue - codemap = pstat.abut(categories, range(2)) - recoded = pstat.recode(data, codemap, 0) - x = pstat.linexand(data, 0, categories[0]) - y = pstat.linexand(data, 0, categories[1]) - xmean = mean(pstat.colex(x, 1)) - ymean = mean(pstat.colex(y, 1)) - n = len(data) - adjust = math.sqrt((len(x) / float(n)) * (len(y) / float(n))) - rpb = (ymean - xmean) / samplestdev(pstat.colex(data, 1)) * adjust - df = n - 2 - t = rpb * math.sqrt(df / ((1.0 - rpb + TINY) * (1.0 + rpb + TINY))) - prob = betai(0.5 * df, 0.5, df / (df + t * t)) # t already a float - return rpb, prob - - -def lkendalltau(x, y): - """ -Calculates Kendall's tau ... correlation of ordinal data. Adapted -from function kendl1 in Numerical Recipies. Needs good test-routine.@@@ - -Usage: lkendalltau(x,y) -Returns: Kendall's tau, two-tailed p-value -""" - n1 = 0 - n2 = 0 - iss = 0 - for j in range(len(x) - 1): - for k in range(j, len(y)): - a1 = x[j] - x[k] - a2 = y[j] - y[k] - aa = a1 * a2 - if (aa): # neither list has a tie - n1 = n1 + 1 - n2 = n2 + 1 - if aa > 0: - iss = iss + 1 - else: - iss = iss - 1 - else: - if (a1): - n1 = n1 + 1 - else: - n2 = n2 + 1 - tau = iss / math.sqrt(n1 * n2) - svar = (4.0 * len(x) + 10.0) / (9.0 * len(x) * (len(x) - 1)) - z = tau / math.sqrt(svar) - prob = erfcc(abs(z) / 1.4142136) - return tau, prob - - -def llinregress(x, y): - """ -Calculates a regression line on x,y pairs. - -Usage: llinregress(x,y) x,y are equal-length lists of x-y coordinates -Returns: slope, intercept, r, two-tailed prob, sterr-of-estimate -""" - TINY = 1.0e-20 - if len(x) <> len(y): - raise ValueError, 'Input values not paired in linregress. Aborting.' - n = len(x) - x = map(float, x) - y = map(float, y) - xmean = mean(x) - ymean = mean(y) - r_num = float(n * (summult(x, y)) - sum(x) * sum(y)) - r_den = math.sqrt((n * ss(x) - square_of_sums(x)) * - (n * ss(y) - square_of_sums(y))) - r = r_num / r_den - z = 0.5 * math.log((1.0 + r + TINY) / (1.0 - r + TINY)) - df = n - 2 - t = r * math.sqrt(df / ((1.0 - r + TINY) * (1.0 + r + TINY))) - prob = betai(0.5 * df, 0.5, df / (df + t * t)) - slope = r_num / float(n * ss(x) - square_of_sums(x)) - intercept = ymean - slope * xmean - sterrest = math.sqrt(1 - r * r) * samplestdev(y) - return slope, intercept, r, prob, sterrest - -#################################### -##### INFERENTIAL STATISTICS ##### -#################################### - - -def lttest_1samp(a, popmean, printit=0, name='Sample', writemode='a'): - """ -Calculates the t-obtained for the independent samples T-test on ONE group -of scores a, given a population mean. If printit=1, results are printed -to the screen. If printit='filename', the results are output to 'filename' -using the given writemode (default=append). Returns t-value, and prob. - -Usage: lttest_1samp(a,popmean,Name='Sample',printit=0,writemode='a') -Returns: t-value, two-tailed prob -""" - x = mean(a) - v = var(a) - n = len(a) - df = n - 1 - svar = ((n - 1) * v) / float(df) - t = (x - popmean) / math.sqrt(svar * (1.0 / n)) - prob = betai(0.5 * df, 0.5, float(df) / (df + t * t)) - - if printit <> 0: - statname = 'Single-sample T-test.' - outputpairedstats(printit, writemode, 'Population', '--', popmean, 0, 0, 0, - name, n, x, v, min(a), max(a), statname, t, prob) - return t, prob - - -def lttest_ind(a, b, printit=0, name1='Samp1', name2='Samp2', writemode='a'): - """ -Calculates the t-obtained T-test on TWO INDEPENDENT samples of -scores a, and b. From Numerical Recipies, p.483. If printit=1, results -are printed to the screen. If printit='filename', the results are output -to 'filename' using the given writemode (default=append). Returns t-value, -and prob. - -Usage: lttest_ind(a,b,printit=0,name1='Samp1',name2='Samp2',writemode='a') -Returns: t-value, two-tailed prob -""" - x1 = mean(a) - x2 = mean(b) - v1 = stdev(a)**2 - v2 = stdev(b)**2 - n1 = len(a) - n2 = len(b) - df = n1 + n2 - 2 - svar = ((n1 - 1) * v1 + (n2 - 1) * v2) / float(df) - if not svar: - svar = 1.0e-26 - t = (x1 - x2) / math.sqrt(svar * (1.0 / n1 + 1.0 / n2)) - prob = betai(0.5 * df, 0.5, df / (df + t * t)) - - if printit <> 0: - statname = 'Independent samples T-test.' - outputpairedstats(printit, writemode, name1, n1, x1, v1, min(a), max(a), - name2, n2, x2, v2, min(b), max(b), statname, t, prob) - return t, prob - - -def lttest_rel(a, - b, - printit=0, - name1='Sample1', - name2='Sample2', - writemode='a'): - """ -Calculates the t-obtained T-test on TWO RELATED samples of scores, -a and b. From Numerical Recipies, p.483. If printit=1, results are -printed to the screen. If printit='filename', the results are output to -'filename' using the given writemode (default=append). Returns t-value, -and prob. - -Usage: lttest_rel(a,b,printit=0,name1='Sample1',name2='Sample2',writemode='a') -Returns: t-value, two-tailed prob -""" - if len(a) <> len(b): - raise ValueError, 'Unequal length lists in ttest_rel.' - x1 = mean(a) - x2 = mean(b) - v1 = var(a) - v2 = var(b) - n = len(a) - cov = 0 - for i in range(len(a)): - cov = cov + (a[i] - x1) * (b[i] - x2) - df = n - 1 - cov = cov / float(df) - sd = math.sqrt((v1 + v2 - 2.0 * cov) / float(n)) - t = (x1 - x2) / sd - prob = betai(0.5 * df, 0.5, df / (df + t * t)) - - if printit <> 0: - statname = 'Related samples T-test.' - outputpairedstats(printit, writemode, name1, n, x1, v1, min(a), max(a), - name2, n, x2, v2, min(b), max(b), statname, t, prob) - return t, prob - - -def lchisquare(f_obs, f_exp=None): - """ -Calculates a one-way chi square for list of observed frequencies and returns -the result. If no expected frequencies are given, the total N is assumed to -be equally distributed across all groups. - -Usage: lchisquare(f_obs, f_exp=None) f_obs = list of observed cell freq. -Returns: chisquare-statistic, associated p-value -""" - k = len(f_obs) # number of groups - if f_exp == None: - f_exp = [sum(f_obs) / float(k)] * len(f_obs) # create k bins with = freq. - chisq = 0 - for i in range(len(f_obs)): - chisq = chisq + (f_obs[i] - f_exp[i])**2 / float(f_exp[i]) - return chisq, chisqprob(chisq, k - 1) - - -def lks_2samp(data1, data2): - """ -Computes the Kolmogorov-Smirnof statistic on 2 samples. From -Numerical Recipies in C, page 493. - -Usage: lks_2samp(data1,data2) data1&2 are lists of values for 2 conditions -Returns: KS D-value, associated p-value -""" - j1 = 0 - j2 = 0 - fn1 = 0.0 - fn2 = 0.0 - n1 = len(data1) - n2 = len(data2) - en1 = n1 - en2 = n2 - d = 0.0 - data1.sort() - data2.sort() - while j1 < n1 and j2 < n2: - d1 = data1[j1] - d2 = data2[j2] - if d1 <= d2: - fn1 = (j1) / float(en1) - j1 = j1 + 1 - if d2 <= d1: - fn2 = (j2) / float(en2) - j2 = j2 + 1 - dt = (fn2 - fn1) - if math.fabs(dt) > math.fabs(d): - d = dt - try: - en = math.sqrt(en1 * en2 / float(en1 + en2)) - prob = ksprob((en + 0.12 + 0.11 / en) * abs(d)) - except: - prob = 1.0 - return d, prob - - -def lmannwhitneyu(x, y): - """ -Calculates a Mann-Whitney U statistic on the provided scores and -returns the result. Use only when the n in each condition is < 20 and -you have 2 independent samples of ranks. NOTE: Mann-Whitney U is -significant if the u-obtained is LESS THAN or equal to the critical -value of U found in the tables. Equivalent to Kruskal-Wallis H with -just 2 groups. - -Usage: lmannwhitneyu(data) -Returns: u-statistic, one-tailed p-value (i.e., p(z(U))) -""" - n1 = len(x) - n2 = len(y) - ranked = rankdata(x + y) - rankx = ranked[0:n1] # get the x-ranks - ranky = ranked[n1:] # the rest are y-ranks - u1 = n1 * n2 + (n1 * (n1 + 1)) / 2.0 - sum(rankx) # calc U for x - u2 = n1 * n2 - u1 # remainder is U for y - bigu = max(u1, u2) - smallu = min(u1, u2) - proportion = bigu / float(n1 * n2) - T = math.sqrt(tiecorrect(ranked)) # correction factor for tied scores - if T == 0: - raise ValueError, 'All numbers are identical in lmannwhitneyu' - sd = math.sqrt(T * n1 * n2 * (n1 + n2 + 1) / 12.0) - z = abs((bigu - n1 * n2 / 2.0) / sd) # normal approximation for prob calc - return smallu, 1.0 - zprob(z) #, proportion - - -def ltiecorrect(rankvals): - """ -Corrects for ties in Mann Whitney U and Kruskal Wallis H tests. See -Siegel, S. (1956) Nonparametric Statistics for the Behavioral Sciences. -New York: McGraw-Hill. Code adapted from |Stat rankind.c code. - -Usage: ltiecorrect(rankvals) -Returns: T correction factor for U or H -""" - sorted, posn = shellsort(rankvals) - n = len(sorted) - T = 0.0 - i = 0 - while (i < n - 1): - if sorted[i] == sorted[i + 1]: - nties = 1 - while (i < n - 1) and (sorted[i] == sorted[i + 1]): - nties = nties + 1 - i = i + 1 - T = T + nties**3 - nties - i = i + 1 - T = T / float(n**3 - n) - return 1.0 - T - - -def lranksums(x, y): - """ -Calculates the rank sums statistic on the provided scores and -returns the result. Use only when the n in each condition is > 20 and you -have 2 independent samples of ranks. - -Usage: lranksums(x,y) -Returns: a z-statistic, two-tailed p-value -""" - n1 = len(x) - n2 = len(y) - alldata = x + y - ranked = rankdata(alldata) - x = ranked[:n1] - y = ranked[n1:] - s = sum(x) - expected = n1 * (n1 + n2 + 1) / 2.0 - z = (s - expected) / math.sqrt(n1 * n2 * (n1 + n2 + 1) / 12.0) - prob = 2 * (1.0 - zprob(abs(z))) - return z, prob - - -def lwilcoxont(x, y): - """ -Calculates the Wilcoxon T-test for related samples and returns the -result. A non-parametric T-test. - -Usage: lwilcoxont(x,y) -Returns: a t-statistic, two-tail probability estimate -""" - if len(x) <> len(y): - raise ValueError, 'Unequal N in wilcoxont. Aborting.' - d = [] - for i in range(len(x)): - diff = x[i] - y[i] - if diff <> 0: - d.append(diff) - count = len(d) - absd = map(abs, d) - absranked = rankdata(absd) - r_plus = 0.0 - r_minus = 0.0 - for i in range(len(absd)): - if d[i] < 0: - r_minus = r_minus + absranked[i] - else: - r_plus = r_plus + absranked[i] - wt = min(r_plus, r_minus) - mn = count * (count + 1) * 0.25 - se = math.sqrt(count * (count + 1) * (2.0 * count + 1.0) / 24.0) - z = math.fabs(wt - mn) / se - prob = 2 * (1.0 - zprob(abs(z))) - return wt, prob - - -def lkruskalwallish(*args): - """ -The Kruskal-Wallis H-test is a non-parametric ANOVA for 3 or more -groups, requiring at least 5 subjects in each group. This function -calculates the Kruskal-Wallis H-test for 3 or more independent samples -and returns the result. - -Usage: lkruskalwallish(*args) -Returns: H-statistic (corrected for ties), associated p-value -""" - args = list(args) - n = [0] * len(args) - all = [] - n = map(len, args) - for i in range(len(args)): - all = all + args[i] - ranked = rankdata(all) - T = tiecorrect(ranked) - for i in range(len(args)): - args[i] = ranked[0:n[i]] - del ranked[0:n[i]] - rsums = [] - for i in range(len(args)): - rsums.append(sum(args[i])**2) - rsums[i] = rsums[i] / float(n[i]) - ssbn = sum(rsums) - totaln = sum(n) - h = 12.0 / (totaln * (totaln + 1)) * ssbn - 3 * (totaln + 1) - df = len(args) - 1 - if T == 0: - raise ValueError, 'All numbers are identical in lkruskalwallish' - h = h / float(T) - return h, chisqprob(h, df) - - -def lfriedmanchisquare(*args): - """ -Friedman Chi-Square is a non-parametric, one-way within-subjects -ANOVA. This function calculates the Friedman Chi-square test for repeated -measures and returns the result, along with the associated probability -value. It assumes 3 or more repeated measures. Only 3 levels requires a -minimum of 10 subjects in the study. Four levels requires 5 subjects per -level(??). - -Usage: lfriedmanchisquare(*args) -Returns: chi-square statistic, associated p-value -""" - k = len(args) - if k < 3: - raise ValueError, 'Less than 3 levels. Friedman test not appropriate.' - n = len(args[0]) - data = apply(pstat.abut, tuple(args)) - for i in range(len(data)): - data[i] = rankdata(data[i]) - ssbn = 0 - for i in range(k): - ssbn = ssbn + sum(args[i])**2 - chisq = 12.0 / (k * n * (k + 1)) * ssbn - 3 * n * (k + 1) - return chisq, chisqprob(chisq, k - 1) - -#################################### -#### PROBABILITY CALCULATIONS #### -#################################### - - -def lchisqprob(chisq, df): - """ -Returns the (1-tailed) probability value associated with the provided -chi-square value and df. Adapted from chisq.c in Gary Perlman's |Stat. - -Usage: lchisqprob(chisq,df) -""" - BIG = 20.0 - - def ex(x): - BIG = 20.0 - if x < -BIG: - return 0.0 - else: - return math.exp(x) - - if chisq <= 0 or df < 1: - return 1.0 - a = 0.5 * chisq - if df % 2 == 0: - even = 1 - else: - even = 0 - if df > 1: - y = ex(-a) - if even: - s = y - else: - s = 2.0 * zprob(-math.sqrt(chisq)) - if (df > 2): - chisq = 0.5 * (df - 1.0) - if even: - z = 1.0 - else: - z = 0.5 - if a > BIG: - if even: - e = 0.0 - else: - e = math.log(math.sqrt(math.pi)) - c = math.log(a) - while (z <= chisq): - e = math.log(z) + e - s = s + ex(c * z - a - e) - z = z + 1.0 - return s - else: - if even: - e = 1.0 - else: - e = 1.0 / math.sqrt(math.pi) / math.sqrt(a) - c = 0.0 - while (z <= chisq): - e = e * (a / float(z)) - c = c + e - z = z + 1.0 - return (c * y + s) - else: - return s - - -def lerfcc(x): - """ -Returns the complementary error function erfc(x) with fractional -error everywhere less than 1.2e-7. Adapted from Numerical Recipies. - -Usage: lerfcc(x) -""" - z = abs(x) - t = 1.0 / (1.0 + 0.5 * z) - ans = t * math.exp(-z * z - 1.26551223 + t * (1.00002368 + t * ( - 0.37409196 + t * (0.09678418 + t * (-0.18628806 + t * (0.27886807 + t * ( - -1.13520398 + t * (1.48851587 + t * (-0.82215223 + t * 0.17087277))))) - )))) - if x >= 0: - return ans - else: - return 2.0 - ans - - -def lzprob(z): - """ -Returns the area under the normal curve 'to the left of' the given z value. -Thus, - for z<0, zprob(z) = 1-tail probability - for z>0, 1.0-zprob(z) = 1-tail probability - for any z, 2.0*(1.0-zprob(abs(z))) = 2-tail probability -Adapted from z.c in Gary Perlman's |Stat. - -Usage: lzprob(z) -""" - Z_MAX = 6.0 # maximum meaningful z-value - if z == 0.0: - x = 0.0 - else: - y = 0.5 * math.fabs(z) - if y >= (Z_MAX * 0.5): - x = 1.0 - elif (y < 1.0): - w = y * y - x = (( - ((((((0.000124818987 * w - 0.001075204047) * w + 0.005198775019) * w - - 0.019198292004) * w + 0.059054035642) * w - 0.151968751364) * w + - 0.319152932694) * w - 0.531923007300) * w + 0.797884560593) * y * 2.0 - else: - y = y - 2.0 - x = ((((((( - ((((((-0.000045255659 * y + 0.000152529290) * y - 0.000019538132) * y - - 0.000676904986) * y + 0.001390604284) * y - 0.000794620820) * y - - 0.002034254874) * y + 0.006549791214) * y - 0.010557625006) * y + - 0.011630447319) * y - 0.009279453341) * y + 0.005353579108) * y - - 0.002141268741) * y + 0.000535310849) * y + 0.999936657524 - if z > 0.0: - prob = ((x + 1.0) * 0.5) - else: - prob = ((1.0 - x) * 0.5) - return prob - - -def lksprob(alam): - """ -Computes a Kolmolgorov-Smirnov t-test significance level. Adapted from -Numerical Recipies. - -Usage: lksprob(alam) -""" - fac = 2.0 - sum = 0.0 - termbf = 0.0 - a2 = -2.0 * alam * alam - for j in range(1, 201): - term = fac * math.exp(a2 * j * j) - sum = sum + term - if math.fabs(term) <= (0.001 * termbf) or math.fabs(term) < (1.0e-8 * sum): - return sum - fac = -fac - termbf = math.fabs(term) - return 1.0 # Get here only if fails to converge; was 0.0!! - - -def lfprob(dfnum, dfden, F): - """ -Returns the (1-tailed) significance level (p-value) of an F -statistic given the degrees of freedom for the numerator (dfR-dfF) and -the degrees of freedom for the denominator (dfF). - -Usage: lfprob(dfnum, dfden, F) where usually dfnum=dfbn, dfden=dfwn -""" - p = betai(0.5 * dfden, 0.5 * dfnum, dfden / float(dfden + dfnum * F)) - return p - - -def lbetacf(a, b, x): - """ -This function evaluates the continued fraction form of the incomplete -Beta function, betai. (Adapted from: Numerical Recipies in C.) - -Usage: lbetacf(a,b,x) -""" - ITMAX = 200 - EPS = 3.0e-7 - - bm = az = am = 1.0 - qab = a + b - qap = a + 1.0 - qam = a - 1.0 - bz = 1.0 - qab * x / qap - for i in range(ITMAX + 1): - em = float(i + 1) - tem = em + em - d = em * (b - em) * x / ((qam + tem) * (a + tem)) - ap = az + d * am - bp = bz + d * bm - d = -(a + em) * (qab + em) * x / ((qap + tem) * (a + tem)) - app = ap + d * az - bpp = bp + d * bz - aold = az - am = ap / bpp - bm = bp / bpp - az = app / bpp - bz = 1.0 - if (abs(az - aold) < (EPS * abs(az))): - return az - print 'a or b too big, or ITMAX too small in Betacf.' - - -def lgammln(xx): - """ -Returns the gamma function of xx. - Gamma(z) = Integral(0,infinity) of t^(z-1)exp(-t) dt. -(Adapted from: Numerical Recipies in C.) - -Usage: lgammln(xx) -""" - - coeff = [76.18009173, -86.50532033, 24.01409822, -1.231739516, 0.120858003e-2, - -0.536382e-5] - x = xx - 1.0 - tmp = x + 5.5 - tmp = tmp - (x + 0.5) * math.log(tmp) - ser = 1.0 - for j in range(len(coeff)): - x = x + 1 - ser = ser + coeff[j] / x - return -tmp + math.log(2.50662827465 * ser) - - -def lbetai(a, b, x): - """ -Returns the incomplete beta function: - - I-sub-x(a,b) = 1/B(a,b)*(Integral(0,x) of t^(a-1)(1-t)^(b-1) dt) - -where a,b>0 and B(a,b) = G(a)*G(b)/(G(a+b)) where G(a) is the gamma -function of a. The continued fraction formulation is implemented here, -using the betacf function. (Adapted from: Numerical Recipies in C.) - -Usage: lbetai(a,b,x) -""" - if (x < 0.0 or x > 1.0): - raise ValueError, 'Bad x in lbetai' - if (x == 0.0 or x == 1.0): - bt = 0.0 - else: - bt = math.exp(gammln(a + b) - gammln(a) - gammln(b) + a * math.log(x) + b * - math.log(1.0 - x)) - if (x < (a + 1.0) / (a + b + 2.0)): - return bt * betacf(a, b, x) / float(a) - else: - return 1.0 - bt * betacf(b, a, 1.0 - x) / float(b) - -#################################### -####### ANOVA CALCULATIONS ####### -#################################### - - -def lF_oneway(*lists): - """ -Performs a 1-way ANOVA, returning an F-value and probability given -any number of groups. From Heiman, pp.394-7. - -Usage: F_oneway(*lists) where *lists is any number of lists, one per - treatment group -Returns: F value, one-tailed p-value -""" - a = len(lists) # ANOVA on 'a' groups, each in it's own list - means = [0] * a - vars = [0] * a - ns = [0] * a - alldata = [] - tmp = map(N.array, lists) - means = map(amean, tmp) - vars = map(avar, tmp) - ns = map(len, lists) - for i in range(len(lists)): - alldata = alldata + lists[i] - alldata = N.array(alldata) - bign = len(alldata) - sstot = ass(alldata) - (asquare_of_sums(alldata) / float(bign)) - ssbn = 0 - for list in lists: - ssbn = ssbn + asquare_of_sums(N.array(list)) / float(len(list)) - ssbn = ssbn - (asquare_of_sums(alldata) / float(bign)) - sswn = sstot - ssbn - dfbn = a - 1 - dfwn = bign - a - msb = ssbn / float(dfbn) - msw = sswn / float(dfwn) - f = msb / msw - prob = fprob(dfbn, dfwn, f) - return f, prob - - -def lF_value(ER, EF, dfnum, dfden): - """ -Returns an F-statistic given the following: - ER = error associated with the null hypothesis (the Restricted model) - EF = error associated with the alternate hypothesis (the Full model) - dfR-dfF = degrees of freedom of the numerator - dfF = degrees of freedom associated with the denominator/Full model - -Usage: lF_value(ER,EF,dfnum,dfden) -""" - return ((ER - EF) / float(dfnum) / (EF / float(dfden))) - -#################################### -######## SUPPORT FUNCTIONS ####### -#################################### - - -def writecc(listoflists, file, writetype='w', extra=2): - """ -Writes a list of lists to a file in columns, customized by the max -size of items within the columns (max size of items in col, +2 characters) -to specified file. File-overwrite is the default. - -Usage: writecc (listoflists,file,writetype='w',extra=2) -Returns: None -""" - if type(listoflists[0]) not in [ListType, TupleType]: - listoflists = [listoflists] - outfile = open(file, writetype) - rowstokill = [] - list2print = copy.deepcopy(listoflists) - for i in range(len(listoflists)): - if listoflists[i] == [ - '\n' - ] or listoflists[i] == '\n' or listoflists[i] == 'dashes': - rowstokill = rowstokill + [i] - rowstokill.reverse() - for row in rowstokill: - del list2print[row] - maxsize = [0] * len(list2print[0]) - for col in range(len(list2print[0])): - items = pstat.colex(list2print, col) - items = map(pstat.makestr, items) - maxsize[col] = max(map(len, items)) + extra - for row in listoflists: - if row == ['\n'] or row == '\n': - outfile.write('\n') - elif row == ['dashes'] or row == 'dashes': - dashes = [0] * len(maxsize) - for j in range(len(maxsize)): - dashes[j] = '-' * (maxsize[j] - 2) - outfile.write(pstat.lineincustcols(dashes, maxsize)) - else: - outfile.write(pstat.lineincustcols(row, maxsize)) - outfile.write('\n') - outfile.close() - return None - - -def lincr(l, cap): # to increment a list up to a max-list of 'cap' - """ -Simulate a counting system from an n-dimensional list. - -Usage: lincr(l,cap) l=list to increment, cap=max values for each list pos'n -Returns: next set of values for list l, OR -1 (if overflow) -""" - l[0] = l[0] + 1 # e.g., [0,0,0] --> [2,4,3] (=cap) - for i in range(len(l)): - if l[i] > cap[i] and i < len(l) - 1: # if carryover AND not done - l[i] = 0 - l[i + 1] = l[i + 1] + 1 - elif l[i] > cap[i] and i == len( - l) - 1: # overflow past last column, must be finished - l = -1 - return l - - -def lsum(inlist): - """ -Returns the sum of the items in the passed list. - -Usage: lsum(inlist) -""" - s = 0 - for item in inlist: - s = s + item - return s - - -def lcumsum(inlist): - """ -Returns a list consisting of the cumulative sum of the items in the -passed list. - -Usage: lcumsum(inlist) -""" - newlist = copy.deepcopy(inlist) - for i in range(1, len(newlist)): - newlist[i] = newlist[i] + newlist[i - 1] - return newlist - - -def lss(inlist): - """ -Squares each value in the passed list, adds up these squares and -returns the result. - -Usage: lss(inlist) -""" - ss = 0 - for item in inlist: - ss = ss + item * item - return ss - - -def lsummult(list1, list2): - """ -Multiplies elements in list1 and list2, element by element, and -returns the sum of all resulting multiplications. Must provide equal -length lists. - -Usage: lsummult(list1,list2) -""" - if len(list1) <> len(list2): - raise ValueError, 'Lists not equal length in summult.' - s = 0 - for item1, item2 in pstat.abut(list1, list2): - s = s + item1 * item2 - return s - - -def lsumdiffsquared(x, y): - """ -Takes pairwise differences of the values in lists x and y, squares -these differences, and returns the sum of these squares. - -Usage: lsumdiffsquared(x,y) -Returns: sum[(x[i]-y[i])**2] -""" - sds = 0 - for i in range(len(x)): - sds = sds + (x[i] - y[i])**2 - return sds - - -def lsquare_of_sums(inlist): - """ -Adds the values in the passed list, squares the sum, and returns -the result. - -Usage: lsquare_of_sums(inlist) -Returns: sum(inlist[i])**2 -""" - s = sum(inlist) - return float(s) * s - - -def lshellsort(inlist): - """ -Shellsort algorithm. Sorts a 1D-list. - -Usage: lshellsort(inlist) -Returns: sorted-inlist, sorting-index-vector (for original list) -""" - n = len(inlist) - svec = copy.deepcopy(inlist) - ivec = range(n) - gap = n / 2 # integer division needed - while gap > 0: - for i in range(gap, n): - for j in range(i - gap, -1, -gap): - while j >= 0 and svec[j] > svec[j + gap]: - temp = svec[j] - svec[j] = svec[j + gap] - svec[j + gap] = temp - itemp = ivec[j] - ivec[j] = ivec[j + gap] - ivec[j + gap] = itemp - gap = gap / 2 # integer division needed -# svec is now sorted inlist, and ivec has the order svec[i] = vec[ivec[i]] - return svec, ivec - - -def lrankdata(inlist): - """ -Ranks the data in inlist, dealing with ties appropritely. Assumes -a 1D inlist. Adapted from Gary Perlman's |Stat ranksort. - -Usage: lrankdata(inlist) -Returns: a list of length equal to inlist, containing rank scores -""" - n = len(inlist) - svec, ivec = shellsort(inlist) - sumranks = 0 - dupcount = 0 - newlist = [0] * n - for i in range(n): - sumranks = sumranks + i - dupcount = dupcount + 1 - if i == n - 1 or svec[i] <> svec[i + 1]: - averank = sumranks / float(dupcount) + 1 - for j in range(i - dupcount + 1, i + 1): - newlist[ivec[j]] = averank - sumranks = 0 - dupcount = 0 - return newlist - - -def outputpairedstats(fname, writemode, name1, n1, m1, se1, min1, max1, name2, - n2, m2, se2, min2, max2, statname, stat, prob): - """ -Prints or write to a file stats for two groups, using the name, n, -mean, sterr, min and max for each group, as well as the statistic name, -its value, and the associated p-value. - -Usage: outputpairedstats(fname,writemode, - name1,n1,mean1,stderr1,min1,max1, - name2,n2,mean2,stderr2,min2,max2, - statname,stat,prob) -Returns: None -""" - suffix = '' # for *s after the p-value - try: - x = prob.shape - prob = prob[0] - except: - pass - if prob < 0.001: - suffix = ' ***' - elif prob < 0.01: - suffix = ' **' - elif prob < 0.05: - suffix = ' *' - title = [['Name', 'N', 'Mean', 'SD', 'Min', 'Max']] - lofl = title + [[name1, n1, round(m1, 3), round( - math.sqrt(se1), 3), min1, max1], [name2, n2, round(m2, 3), round( - math.sqrt(se2), 3), min2, max2]] - if type(fname) <> StringType or len(fname) == 0: - print - print statname - print - pstat.printcc(lofl) - print - try: - if stat.shape == (): - stat = stat[0] - if prob.shape == (): - prob = prob[0] - except: - pass - print 'Test statistic = ', round(stat, 3), ' p = ', round(prob, 3), suffix - print - else: - file = open(fname, writemode) - file.write('\n' + statname + '\n\n') - file.close() - writecc(lofl, fname, 'a') - file = open(fname, 'a') - try: - if stat.shape == (): - stat = stat[0] - if prob.shape == (): - prob = prob[0] - except: - pass - file.write(pstat.list2string(['\nTest statistic = ', round(stat, 4), - ' p = ', round(prob, 4), suffix, '\n\n'])) - file.close() - return None - - -def lfindwithin(data): - """ -Returns an integer representing a binary vector, where 1=within- -subject factor, 0=between. Input equals the entire data 2D list (i.e., -column 0=random factor, column -1=measured values (those two are skipped). -Note: input data is in |Stat format ... a list of lists ("2D list") with -one row per measured value, first column=subject identifier, last column= -score, one in-between column per factor (these columns contain level -designations on each factor). See also stats.anova.__doc__. - -Usage: lfindwithin(data) data in |Stat format -""" - - numfact = len(data[0]) - 1 - withinvec = 0 - for col in range(1, numfact): - examplelevel = pstat.unique(pstat.colex(data, col))[0] - rows = pstat.linexand(data, col, examplelevel) # get 1 level of this factor - factsubjs = pstat.unique(pstat.colex(rows, 0)) - allsubjs = pstat.unique(pstat.colex(data, 0)) - if len(factsubjs) == len(allsubjs): # fewer Ss than scores on this factor? - withinvec = withinvec + (1 << col) - return withinvec - -######################################################### -######################################################### -####### DISPATCH LISTS AND TUPLES TO ABOVE FCNS ######### -######################################################### -######################################################### - -## CENTRAL TENDENCY: -geometricmean = Dispatch((lgeometricmean, (ListType, TupleType)),) -harmonicmean = Dispatch((lharmonicmean, (ListType, TupleType)),) -mean = Dispatch((lmean, (ListType, TupleType)),) -median = Dispatch((lmedian, (ListType, TupleType)),) -medianscore = Dispatch((lmedianscore, (ListType, TupleType)),) -mode = Dispatch((lmode, (ListType, TupleType)),) - -## MOMENTS: -moment = Dispatch((lmoment, (ListType, TupleType)),) -variation = Dispatch((lvariation, (ListType, TupleType)),) -skew = Dispatch((lskew, (ListType, TupleType)),) -kurtosis = Dispatch((lkurtosis, (ListType, TupleType)),) -describe = Dispatch((ldescribe, (ListType, TupleType)),) - -## FREQUENCY STATISTICS: -itemfreq = Dispatch((litemfreq, (ListType, TupleType)),) -scoreatpercentile = Dispatch((lscoreatpercentile, (ListType, TupleType)),) -percentileofscore = Dispatch((lpercentileofscore, (ListType, TupleType)),) -histogram = Dispatch((lhistogram, (ListType, TupleType)),) -cumfreq = Dispatch((lcumfreq, (ListType, TupleType)),) -relfreq = Dispatch((lrelfreq, (ListType, TupleType)),) - -## VARIABILITY: -obrientransform = Dispatch((lobrientransform, (ListType, TupleType)),) -samplevar = Dispatch((lsamplevar, (ListType, TupleType)),) -samplestdev = Dispatch((lsamplestdev, (ListType, TupleType)),) -var = Dispatch((lvar, (ListType, TupleType)),) -stdev = Dispatch((lstdev, (ListType, TupleType)),) -sterr = Dispatch((lsterr, (ListType, TupleType)),) -sem = Dispatch((lsem, (ListType, TupleType)),) -z = Dispatch((lz, (ListType, TupleType)),) -zs = Dispatch((lzs, (ListType, TupleType)),) - -## TRIMMING FCNS: -trimboth = Dispatch((ltrimboth, (ListType, TupleType)),) -trim1 = Dispatch((ltrim1, (ListType, TupleType)),) - -## CORRELATION FCNS: -paired = Dispatch((lpaired, (ListType, TupleType)),) -pearsonr = Dispatch((lpearsonr, (ListType, TupleType)),) -spearmanr = Dispatch((lspearmanr, (ListType, TupleType)),) -pointbiserialr = Dispatch((lpointbiserialr, (ListType, TupleType)),) -kendalltau = Dispatch((lkendalltau, (ListType, TupleType)),) -linregress = Dispatch((llinregress, (ListType, TupleType)),) - -## INFERENTIAL STATS: -ttest_1samp = Dispatch((lttest_1samp, (ListType, TupleType)),) -ttest_ind = Dispatch((lttest_ind, (ListType, TupleType)),) -ttest_rel = Dispatch((lttest_rel, (ListType, TupleType)),) -chisquare = Dispatch((lchisquare, (ListType, TupleType)),) -ks_2samp = Dispatch((lks_2samp, (ListType, TupleType)),) -mannwhitneyu = Dispatch((lmannwhitneyu, (ListType, TupleType)),) -ranksums = Dispatch((lranksums, (ListType, TupleType)),) -tiecorrect = Dispatch((ltiecorrect, (ListType, TupleType)),) -wilcoxont = Dispatch((lwilcoxont, (ListType, TupleType)),) -kruskalwallish = Dispatch((lkruskalwallish, (ListType, TupleType)),) -friedmanchisquare = Dispatch((lfriedmanchisquare, (ListType, TupleType)),) - -## PROBABILITY CALCS: -chisqprob = Dispatch((lchisqprob, (IntType, FloatType)),) -zprob = Dispatch((lzprob, (IntType, FloatType)),) -ksprob = Dispatch((lksprob, (IntType, FloatType)),) -fprob = Dispatch((lfprob, (IntType, FloatType)),) -betacf = Dispatch((lbetacf, (IntType, FloatType)),) -betai = Dispatch((lbetai, (IntType, FloatType)),) -erfcc = Dispatch((lerfcc, (IntType, FloatType)),) -gammln = Dispatch((lgammln, (IntType, FloatType)),) - -## ANOVA FUNCTIONS: -F_oneway = Dispatch((lF_oneway, (ListType, TupleType)),) -F_value = Dispatch((lF_value, (ListType, TupleType)),) - -## SUPPORT FUNCTIONS: -incr = Dispatch((lincr, (ListType, TupleType)),) -sum = Dispatch((lsum, (ListType, TupleType)),) -cumsum = Dispatch((lcumsum, (ListType, TupleType)),) -ss = Dispatch((lss, (ListType, TupleType)),) -summult = Dispatch((lsummult, (ListType, TupleType)),) -square_of_sums = Dispatch((lsquare_of_sums, (ListType, TupleType)),) -sumdiffsquared = Dispatch((lsumdiffsquared, (ListType, TupleType)),) -shellsort = Dispatch((lshellsort, (ListType, TupleType)),) -rankdata = Dispatch((lrankdata, (ListType, TupleType)),) -findwithin = Dispatch((lfindwithin, (ListType, TupleType)),) - -#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== -#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== -#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== -#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== -#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== -#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== -#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== -#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== -#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== -#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== -#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== -#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== -#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== -#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== -#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== -#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== -#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== -#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== -#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== - -try: # DEFINE THESE *ONLY* IF NUMERIC IS AVAILABLE - import numpy as N - import numpy.linalg as LA - - ##################################### - ######## ACENTRAL TENDENCY ######## - ##################################### - - - def ageometricmean(inarray, dimension=None, keepdims=0): - """ -Calculates the geometric mean of the values in the passed array. -That is: n-th root of (x1 * x2 * ... * xn). Defaults to ALL values in -the passed array. Use dimension=None to flatten array first. REMEMBER: if -dimension=0, it collapses over dimension 0 ('rows' in a 2D array) only, and -if dimension is a sequence, it collapses over all specified dimensions. If -keepdims is set to 1, the resulting array will have as many dimensions as -inarray, with only 1 'level' per dim that was collapsed over. - -Usage: ageometricmean(inarray,dimension=None,keepdims=0) -Returns: geometric mean computed over dim(s) listed in dimension -""" - inarray = N.array(inarray, N.float_) - if dimension == None: - inarray = N.ravel(inarray) - size = len(inarray) - mult = N.power(inarray, 1.0 / size) - mult = N.multiply.reduce(mult) - elif type(dimension) in [IntType, FloatType]: - size = inarray.shape[dimension] - mult = N.power(inarray, 1.0 / size) - mult = N.multiply.reduce(mult, dimension) - if keepdims == 1: - shp = list(inarray.shape) - shp[dimension] = 1 - sum = N.reshape(sum, shp) - else: # must be a SEQUENCE of dims to average over - dims = list(dimension) - dims.sort() - dims.reverse() - size = N.array(N.multiply.reduce(N.take(inarray.shape, dims)), N.float_) - mult = N.power(inarray, 1.0 / size) - for dim in dims: - mult = N.multiply.reduce(mult, dim) - if keepdims == 1: - shp = list(inarray.shape) - for dim in dims: - shp[dim] = 1 - mult = N.reshape(mult, shp) - return mult - - def aharmonicmean(inarray, dimension=None, keepdims=0): - """ -Calculates the harmonic mean of the values in the passed array. -That is: n / (1/x1 + 1/x2 + ... + 1/xn). Defaults to ALL values in -the passed array. Use dimension=None to flatten array first. REMEMBER: if -dimension=0, it collapses over dimension 0 ('rows' in a 2D array) only, and -if dimension is a sequence, it collapses over all specified dimensions. If -keepdims is set to 1, the resulting array will have as many dimensions as -inarray, with only 1 'level' per dim that was collapsed over. - -Usage: aharmonicmean(inarray,dimension=None,keepdims=0) -Returns: harmonic mean computed over dim(s) in dimension -""" - inarray = inarray.astype(N.float_) - if dimension == None: - inarray = N.ravel(inarray) - size = len(inarray) - s = N.add.reduce(1.0 / inarray) - elif type(dimension) in [IntType, FloatType]: - size = float(inarray.shape[dimension]) - s = N.add.reduce(1.0 / inarray, dimension) - if keepdims == 1: - shp = list(inarray.shape) - shp[dimension] = 1 - s = N.reshape(s, shp) - else: # must be a SEQUENCE of dims to average over - dims = list(dimension) - dims.sort() - nondims = [] - for i in range(len(inarray.shape)): - if i not in dims: - nondims.append(i) - tinarray = N.transpose(inarray, nondims + dims) # put keep-dims first - idx = [0] * len(nondims) - if idx == []: - size = len(N.ravel(inarray)) - s = asum(1.0 / inarray) - if keepdims == 1: - s = N.reshape([s], N.ones(len(inarray.shape))) - else: - idx[0] = -1 - loopcap = N.array(tinarray.shape[0:len(nondims)]) - 1 - s = N.zeros(loopcap + 1, N.float_) - while incr(idx, loopcap) <> -1: - s[idx] = asum(1.0 / tinarray[idx]) - size = N.multiply.reduce(N.take(inarray.shape, dims)) - if keepdims == 1: - shp = list(inarray.shape) - for dim in dims: - shp[dim] = 1 - s = N.reshape(s, shp) - return size / s - - def amean(inarray, dimension=None, keepdims=0): - """ -Calculates the arithmatic mean of the values in the passed array. -That is: 1/n * (x1 + x2 + ... + xn). Defaults to ALL values in the -passed array. Use dimension=None to flatten array first. REMEMBER: if -dimension=0, it collapses over dimension 0 ('rows' in a 2D array) only, and -if dimension is a sequence, it collapses over all specified dimensions. If -keepdims is set to 1, the resulting array will have as many dimensions as -inarray, with only 1 'level' per dim that was collapsed over. - -Usage: amean(inarray,dimension=None,keepdims=0) -Returns: arithematic mean calculated over dim(s) in dimension -""" - if inarray.dtype in [N.int_, N.short, N.ubyte]: - inarray = inarray.astype(N.float_) - if dimension == None: - inarray = N.ravel(inarray) - sum = N.add.reduce(inarray) - denom = float(len(inarray)) - elif type(dimension) in [IntType, FloatType]: - sum = asum(inarray, dimension) - denom = float(inarray.shape[dimension]) - if keepdims == 1: - shp = list(inarray.shape) - shp[dimension] = 1 - sum = N.reshape(sum, shp) - else: # must be a TUPLE of dims to average over - dims = list(dimension) - dims.sort() - dims.reverse() - sum = inarray * 1.0 - for dim in dims: - sum = N.add.reduce(sum, dim) - denom = N.array(N.multiply.reduce(N.take(inarray.shape, dims)), N.float_) - if keepdims == 1: - shp = list(inarray.shape) - for dim in dims: - shp[dim] = 1 - sum = N.reshape(sum, shp) - return sum / denom - - def amedian(inarray, numbins=1000): - """ -Calculates the COMPUTED median value of an array of numbers, given the -number of bins to use for the histogram (more bins approaches finding the -precise median value of the array; default number of bins = 1000). From -G.W. Heiman's Basic Stats, or CRC Probability & Statistics. -NOTE: THIS ROUTINE ALWAYS uses the entire passed array (flattens it first). - -Usage: amedian(inarray,numbins=1000) -Returns: median calculated over ALL values in inarray -""" - inarray = N.ravel(inarray) - (hist, smallest, binsize, extras) = ahistogram(inarray, numbins, - [min(inarray), max(inarray)]) - cumhist = N.cumsum(hist) # make cumulative histogram - otherbins = N.greater_equal(cumhist, len(inarray) / 2.0) - otherbins = list(otherbins) # list of 0/1s, 1s start at median bin - cfbin = otherbins.index(1) # get 1st(!) index holding 50%ile score - LRL = smallest + binsize * cfbin # get lower read limit of that bin - cfbelow = N.add.reduce(hist[0:cfbin]) # cum. freq. below bin - freq = hist[cfbin] # frequency IN the 50%ile bin - median = LRL + ( - (len(inarray) / 2.0 - cfbelow) / float(freq)) * binsize # MEDIAN - return median - - def amedianscore(inarray, dimension=None): - """ -Returns the 'middle' score of the passed array. If there is an even -number of scores, the mean of the 2 middle scores is returned. Can function -with 1D arrays, or on the FIRST dimension of 2D arrays (i.e., dimension can -be None, to pre-flatten the array, or else dimension must equal 0). - -Usage: amedianscore(inarray,dimension=None) -Returns: 'middle' score of the array, or the mean of the 2 middle scores -""" - if dimension == None: - inarray = N.ravel(inarray) - dimension = 0 - inarray = N.sort(inarray, dimension) - if inarray.shape[dimension] % 2 == 0: # if even number of elements - indx = inarray.shape[dimension] / 2 # integer division correct - median = N.asarray(inarray[indx] + inarray[indx - 1]) / 2.0 - else: - indx = inarray.shape[dimension] / 2 # integer division correct - median = N.take(inarray, [indx], dimension) - if median.shape == (1,): - median = median[0] - return median - - def amode(a, dimension=None): - """ -Returns an array of the modal (most common) score in the passed array. -If there is more than one such score, ONLY THE FIRST is returned. -The bin-count for the modal values is also returned. Operates on whole -array (dimension=None), or on a given dimension. - -Usage: amode(a, dimension=None) -Returns: array of bin-counts for mode(s), array of corresponding modal values -""" - - if dimension == None: - a = N.ravel(a) - dimension = 0 - scores = pstat.aunique(N.ravel(a)) # get ALL unique values - testshape = list(a.shape) - testshape[dimension] = 1 - oldmostfreq = N.zeros(testshape) - oldcounts = N.zeros(testshape) - for score in scores: - template = N.equal(a, score) - counts = asum(template, dimension, 1) - mostfrequent = N.where(counts > oldcounts, score, oldmostfreq) - oldcounts = N.where(counts > oldcounts, counts, oldcounts) - oldmostfreq = mostfrequent - return oldcounts, mostfrequent - - def atmean(a, limits=None, inclusive=(1, 1)): - """ -Returns the arithmetic mean of all values in an array, ignoring values -strictly outside the sequence passed to 'limits'. Note: either limit -in the sequence, or the value of limits itself, can be set to None. The -inclusive list/tuple determines whether the lower and upper limiting bounds -(respectively) are open/exclusive (0) or closed/inclusive (1). - -Usage: atmean(a,limits=None,inclusive=(1,1)) -""" - if a.dtype in [N.int_, N.short, N.ubyte]: - a = a.astype(N.float_) - if limits == None: - return mean(a) - assert type(limits) in [ListType, TupleType, N.ndarray - ], 'Wrong type for limits in atmean' - if inclusive[0]: - lowerfcn = N.greater_equal - else: - lowerfcn = N.greater - if inclusive[1]: - upperfcn = N.less_equal - else: - upperfcn = N.less - if limits[0] > N.maximum.reduce(N.ravel(a)) or limits[1] < N.minimum.reduce( - N.ravel(a)): - raise ValueError, 'No array values within given limits (atmean).' - elif limits[0] == None and limits[1] <> None: - mask = upperfcn(a, limits[1]) - elif limits[0] <> None and limits[1] == None: - mask = lowerfcn(a, limits[0]) - elif limits[0] <> None and limits[1] <> None: - mask = lowerfcn(a, limits[0]) * upperfcn(a, limits[1]) - s = float(N.add.reduce(N.ravel(a * mask))) - n = float(N.add.reduce(N.ravel(mask))) - return s / n - - def atvar(a, limits=None, inclusive=(1, 1)): - """ -Returns the sample variance of values in an array, (i.e., using N-1), -ignoring values strictly outside the sequence passed to 'limits'. -Note: either limit in the sequence, or the value of limits itself, -can be set to None. The inclusive list/tuple determines whether the lower -and upper limiting bounds (respectively) are open/exclusive (0) or -closed/inclusive (1). ASSUMES A FLAT ARRAY (OR ELSE PREFLATTENS). - -Usage: atvar(a,limits=None,inclusive=(1,1)) -""" - a = a.astype(N.float_) - if limits == None or limits == [None, None]: - return avar(a) - assert type(limits) in [ListType, TupleType, N.ndarray - ], 'Wrong type for limits in atvar' - if inclusive[0]: - lowerfcn = N.greater_equal - else: - lowerfcn = N.greater - if inclusive[1]: - upperfcn = N.less_equal - else: - upperfcn = N.less - if limits[0] > N.maximum.reduce(N.ravel(a)) or limits[1] < N.minimum.reduce( - N.ravel(a)): - raise ValueError, 'No array values within given limits (atvar).' - elif limits[0] == None and limits[1] <> None: - mask = upperfcn(a, limits[1]) - elif limits[0] <> None and limits[1] == None: - mask = lowerfcn(a, limits[0]) - elif limits[0] <> None and limits[1] <> None: - mask = lowerfcn(a, limits[0]) * upperfcn(a, limits[1]) - - a = N.compress(mask, a) # squish out excluded values - return avar(a) - - def atmin(a, lowerlimit=None, dimension=None, inclusive=1): - """ -Returns the minimum value of a, along dimension, including only values less -than (or equal to, if inclusive=1) lowerlimit. If the limit is set to None, -all values in the array are used. - -Usage: atmin(a,lowerlimit=None,dimension=None,inclusive=1) -""" - if inclusive: - lowerfcn = N.greater - else: - lowerfcn = N.greater_equal - if dimension == None: - a = N.ravel(a) - dimension = 0 - if lowerlimit == None: - lowerlimit = N.minimum.reduce(N.ravel(a)) - 11 - biggest = N.maximum.reduce(N.ravel(a)) - ta = N.where(lowerfcn(a, lowerlimit), a, biggest) - return N.minimum.reduce(ta, dimension) - - def atmax(a, upperlimit, dimension=None, inclusive=1): - """ -Returns the maximum value of a, along dimension, including only values greater -than (or equal to, if inclusive=1) upperlimit. If the limit is set to None, -a limit larger than the max value in the array is used. - -Usage: atmax(a,upperlimit,dimension=None,inclusive=1) -""" - if inclusive: - upperfcn = N.less - else: - upperfcn = N.less_equal - if dimension == None: - a = N.ravel(a) - dimension = 0 - if upperlimit == None: - upperlimit = N.maximum.reduce(N.ravel(a)) + 1 - smallest = N.minimum.reduce(N.ravel(a)) - ta = N.where(upperfcn(a, upperlimit), a, smallest) - return N.maximum.reduce(ta, dimension) - - def atstdev(a, limits=None, inclusive=(1, 1)): - """ -Returns the standard deviation of all values in an array, ignoring values -strictly outside the sequence passed to 'limits'. Note: either limit -in the sequence, or the value of limits itself, can be set to None. The -inclusive list/tuple determines whether the lower and upper limiting bounds -(respectively) are open/exclusive (0) or closed/inclusive (1). - -Usage: atstdev(a,limits=None,inclusive=(1,1)) -""" - return N.sqrt(tvar(a, limits, inclusive)) - - def atsem(a, limits=None, inclusive=(1, 1)): - """ -Returns the standard error of the mean for the values in an array, -(i.e., using N for the denominator), ignoring values strictly outside -the sequence passed to 'limits'. Note: either limit in the sequence, -or the value of limits itself, can be set to None. The inclusive list/tuple -determines whether the lower and upper limiting bounds (respectively) are -open/exclusive (0) or closed/inclusive (1). - -Usage: atsem(a,limits=None,inclusive=(1,1)) -""" - sd = tstdev(a, limits, inclusive) - if limits == None or limits == [None, None]: - n = float(len(N.ravel(a))) - limits = [min(a) - 1, max(a) + 1] - assert type(limits) in [ListType, TupleType, N.ndarray - ], 'Wrong type for limits in atsem' - if inclusive[0]: - lowerfcn = N.greater_equal - else: - lowerfcn = N.greater - if inclusive[1]: - upperfcn = N.less_equal - else: - upperfcn = N.less - if limits[0] > N.maximum.reduce(N.ravel(a)) or limits[1] < N.minimum.reduce( - N.ravel(a)): - raise ValueError, 'No array values within given limits (atsem).' - elif limits[0] == None and limits[1] <> None: - mask = upperfcn(a, limits[1]) - elif limits[0] <> None and limits[1] == None: - mask = lowerfcn(a, limits[0]) - elif limits[0] <> None and limits[1] <> None: - mask = lowerfcn(a, limits[0]) * upperfcn(a, limits[1]) - term1 = N.add.reduce(N.ravel(a * a * mask)) - n = float(N.add.reduce(N.ravel(mask))) - return sd / math.sqrt(n) - -##################################### -############ AMOMENTS ############# -##################################### - - def amoment(a, moment=1, dimension=None): - """ -Calculates the nth moment about the mean for a sample (defaults to the -1st moment). Generally used to calculate coefficients of skewness and -kurtosis. Dimension can equal None (ravel array first), an integer -(the dimension over which to operate), or a sequence (operate over -multiple dimensions). - -Usage: amoment(a,moment=1,dimension=None) -Returns: appropriate moment along given dimension -""" - if dimension == None: - a = N.ravel(a) - dimension = 0 - if moment == 1: - return 0.0 - else: - mn = amean(a, dimension, 1) # 1=keepdims - s = N.power((a - mn), moment) - return amean(s, dimension) - - def avariation(a, dimension=None): - """ -Returns the coefficient of variation, as defined in CRC Standard -Probability and Statistics, p.6. Dimension can equal None (ravel array -first), an integer (the dimension over which to operate), or a -sequence (operate over multiple dimensions). - -Usage: avariation(a,dimension=None) -""" - return 100.0 * asamplestdev(a, dimension) / amean(a, dimension) - - def askew(a, dimension=None): - """ -Returns the skewness of a distribution (normal ==> 0.0; >0 means extra -weight in left tail). Use askewtest() to see if it's close enough. -Dimension can equal None (ravel array first), an integer (the -dimension over which to operate), or a sequence (operate over multiple -dimensions). - -Usage: askew(a, dimension=None) -Returns: skew of vals in a along dimension, returning ZERO where all vals equal -""" - denom = N.power(amoment(a, 2, dimension), 1.5) - zero = N.equal(denom, 0) - if type(denom) == N.ndarray and asum(zero) <> 0: - print 'Number of zeros in askew: ', asum(zero) - denom = denom + zero # prevent divide-by-zero - return N.where(zero, 0, amoment(a, 3, dimension) / denom) - - def akurtosis(a, dimension=None): - """ -Returns the kurtosis of a distribution (normal ==> 3.0; >3 means -heavier in the tails, and usually more peaked). Use akurtosistest() -to see if it's close enough. Dimension can equal None (ravel array -first), an integer (the dimension over which to operate), or a -sequence (operate over multiple dimensions). - -Usage: akurtosis(a,dimension=None) -Returns: kurtosis of values in a along dimension, and ZERO where all vals equal -""" - denom = N.power(amoment(a, 2, dimension), 2) - zero = N.equal(denom, 0) - if type(denom) == N.ndarray and asum(zero) <> 0: - print 'Number of zeros in akurtosis: ', asum(zero) - denom = denom + zero # prevent divide-by-zero - return N.where(zero, 0, amoment(a, 4, dimension) / denom) - - def adescribe(inarray, dimension=None): - """ -Returns several descriptive statistics of the passed array. Dimension -can equal None (ravel array first), an integer (the dimension over -which to operate), or a sequence (operate over multiple dimensions). - -Usage: adescribe(inarray,dimension=None) -Returns: n, (min,max), mean, standard deviation, skew, kurtosis -""" - if dimension == None: - inarray = N.ravel(inarray) - dimension = 0 - n = inarray.shape[dimension] - mm = (N.minimum.reduce(inarray), N.maximum.reduce(inarray)) - m = amean(inarray, dimension) - sd = astdev(inarray, dimension) - skew = askew(inarray, dimension) - kurt = akurtosis(inarray, dimension) - return n, mm, m, sd, skew, kurt - -##################################### -######## NORMALITY TESTS ########## -##################################### - - def askewtest(a, dimension=None): - """ -Tests whether the skew is significantly different from a normal -distribution. Dimension can equal None (ravel array first), an -integer (the dimension over which to operate), or a sequence (operate -over multiple dimensions). - -Usage: askewtest(a,dimension=None) -Returns: z-score and 2-tail z-probability -""" - if dimension == None: - a = N.ravel(a) - dimension = 0 - b2 = askew(a, dimension) - n = float(a.shape[dimension]) - y = b2 * N.sqrt(((n + 1) * (n + 3)) / (6.0 * (n - 2))) - beta2 = (3.0 * (n * n + 27 * n - 70) * (n + 1) * - (n + 3)) / ((n - 2.0) * (n + 5) * (n + 7) * (n + 9)) - W2 = -1 + N.sqrt(2 * (beta2 - 1)) - delta = 1 / N.sqrt(N.log(N.sqrt(W2))) - alpha = N.sqrt(2 / (W2 - 1)) - y = N.where(y == 0, 1, y) - Z = delta * N.log(y / alpha + N.sqrt((y / alpha)**2 + 1)) - return Z, (1.0 - zprob(Z)) * 2 - - def akurtosistest(a, dimension=None): - """ -Tests whether a dataset has normal kurtosis (i.e., -kurtosis=3(n-1)/(n+1)) Valid only for n>20. Dimension can equal None -(ravel array first), an integer (the dimension over which to operate), -or a sequence (operate over multiple dimensions). - -Usage: akurtosistest(a,dimension=None) -Returns: z-score and 2-tail z-probability, returns 0 for bad pixels -""" - if dimension == None: - a = N.ravel(a) - dimension = 0 - n = float(a.shape[dimension]) - if n < 20: - print 'akurtosistest only valid for n>=20 ... continuing anyway, n=', n - b2 = akurtosis(a, dimension) - E = 3.0 * (n - 1) / (n + 1) - varb2 = 24.0 * n * (n - 2) * (n - 3) / ((n + 1) * (n + 1) * (n + 3) * - (n + 5)) - x = (b2 - E) / N.sqrt(varb2) - sqrtbeta1 = 6.0 * (n * n - 5 * n + 2) / ((n + 7) * (n + 9)) * N.sqrt( - (6.0 * (n + 3) * (n + 5)) / (n * (n - 2) * (n - 3))) - A = 6.0 + 8.0 / sqrtbeta1 * (2.0 / sqrtbeta1 + - N.sqrt(1 + 4.0 / (sqrtbeta1**2))) - term1 = 1 - 2 / (9.0 * A) - denom = 1 + x * N.sqrt(2 / (A - 4.0)) - denom = N.where(N.less(denom, 0), 99, denom) - term2 = N.where( - N.equal(denom, 0), term1, N.power( - (1 - 2.0 / A) / denom, 1 / 3.0)) - Z = (term1 - term2) / N.sqrt(2 / (9.0 * A)) - Z = N.where(N.equal(denom, 99), 0, Z) - return Z, (1.0 - zprob(Z)) * 2 - - def anormaltest(a, dimension=None): - """ -Tests whether skew and/OR kurtosis of dataset differs from normal -curve. Can operate over multiple dimensions. Dimension can equal -None (ravel array first), an integer (the dimension over which to -operate), or a sequence (operate over multiple dimensions). - -Usage: anormaltest(a,dimension=None) -Returns: z-score and 2-tail probability -""" - if dimension == None: - a = N.ravel(a) - dimension = 0 - s, p = askewtest(a, dimension) - k, p = akurtosistest(a, dimension) - k2 = N.power(s, 2) + N.power(k, 2) - return k2, achisqprob(k2, 2) - -##################################### -###### AFREQUENCY FUNCTIONS ####### -##################################### - - def aitemfreq(a): - """ -Returns a 2D array of item frequencies. Column 1 contains item values, -column 2 contains their respective counts. Assumes a 1D array is passed. -@@@sorting OK? - -Usage: aitemfreq(a) -Returns: a 2D frequency table (col [0:n-1]=scores, col n=frequencies) -""" - scores = pstat.aunique(a) - scores = N.sort(scores) - freq = N.zeros(len(scores)) - for i in range(len(scores)): - freq[i] = N.add.reduce(N.equal(a, scores[i])) - return N.array(pstat.aabut(scores, freq)) - - def ascoreatpercentile(inarray, percent): - """ -Usage: ascoreatpercentile(inarray,percent) 0<percent<100 -Returns: score at given percentile, relative to inarray distribution -""" - percent = percent / 100.0 - targetcf = percent * len(inarray) - h, lrl, binsize, extras = histogram(inarray) - cumhist = cumsum(h * 1) - for i in range(len(cumhist)): - if cumhist[i] >= targetcf: - break - score = binsize * ( - (targetcf - cumhist[i - 1]) / float(h[i])) + (lrl + binsize * i) - return score - - def apercentileofscore(inarray, score, histbins=10, defaultlimits=None): - """ -Note: result of this function depends on the values used to histogram -the data(!). - -Usage: apercentileofscore(inarray,score,histbins=10,defaultlimits=None) -Returns: percentile-position of score (0-100) relative to inarray -""" - h, lrl, binsize, extras = histogram(inarray, histbins, defaultlimits) - cumhist = cumsum(h * 1) - i = int((score - lrl) / float(binsize)) - pct = (cumhist[i - 1] + ((score - (lrl + binsize * i)) / float(binsize)) * - h[i]) / float(len(inarray)) * 100 - return pct - - def ahistogram(inarray, numbins=10, defaultlimits=None, printextras=1): - """ -Returns (i) an array of histogram bin counts, (ii) the smallest value -of the histogram binning, and (iii) the bin width (the last 2 are not -necessarily integers). Default number of bins is 10. Defaultlimits -can be None (the routine picks bins spanning all the numbers in the -inarray) or a 2-sequence (lowerlimit, upperlimit). Returns all of the -following: array of bin values, lowerreallimit, binsize, extrapoints. - -Usage: ahistogram(inarray,numbins=10,defaultlimits=None,printextras=1) -Returns: (array of bin counts, bin-minimum, min-width, #-points-outside-range) -""" - inarray = N.ravel(inarray) # flatten any >1D arrays - if (defaultlimits <> None): - lowerreallimit = defaultlimits[0] - upperreallimit = defaultlimits[1] - binsize = (upperreallimit - lowerreallimit) / float(numbins) - else: - Min = N.minimum.reduce(inarray) - Max = N.maximum.reduce(inarray) - estbinwidth = float(Max - Min) / float(numbins) + 1e-6 - binsize = (Max - Min + estbinwidth) / float(numbins) - lowerreallimit = Min - binsize / 2.0 #lower real limit,1st bin - bins = N.zeros(numbins) - extrapoints = 0 - for num in inarray: - try: - if (num - lowerreallimit) < 0: - extrapoints = extrapoints + 1 - else: - bintoincrement = int((num - lowerreallimit) / float(binsize)) - bins[bintoincrement] = bins[bintoincrement] + 1 - except: # point outside lower/upper limits - extrapoints = extrapoints + 1 - if (extrapoints > 0 and printextras == 1): - print '\nPoints outside given histogram range =', extrapoints - return (bins, lowerreallimit, binsize, extrapoints) - - def acumfreq(a, numbins=10, defaultreallimits=None): - """ -Returns a cumulative frequency histogram, using the histogram function. -Defaultreallimits can be None (use all data), or a 2-sequence containing -lower and upper limits on values to include. - -Usage: acumfreq(a,numbins=10,defaultreallimits=None) -Returns: array of cumfreq bin values, lowerreallimit, binsize, extrapoints -""" - h, l, b, e = histogram(a, numbins, defaultreallimits) - cumhist = cumsum(h * 1) - return cumhist, l, b, e - - def arelfreq(a, numbins=10, defaultreallimits=None): - """ -Returns a relative frequency histogram, using the histogram function. -Defaultreallimits can be None (use all data), or a 2-sequence containing -lower and upper limits on values to include. - -Usage: arelfreq(a,numbins=10,defaultreallimits=None) -Returns: array of cumfreq bin values, lowerreallimit, binsize, extrapoints -""" - h, l, b, e = histogram(a, numbins, defaultreallimits) - h = N.array(h / float(a.shape[0])) - return h, l, b, e - -##################################### -###### AVARIABILITY FUNCTIONS ##### -##################################### - - def aobrientransform(*args): - """ -Computes a transform on input data (any number of columns). Used to -test for homogeneity of variance prior to running one-way stats. Each -array in *args is one level of a factor. If an F_oneway() run on the -transformed data and found significant, variances are unequal. From -Maxwell and Delaney, p.112. - -Usage: aobrientransform(*args) *args = 1D arrays, one per level of factor -Returns: transformed data for use in an ANOVA -""" - TINY = 1e-10 - k = len(args) - n = N.zeros(k, N.float_) - v = N.zeros(k, N.float_) - m = N.zeros(k, N.float_) - nargs = [] - for i in range(k): - nargs.append(args[i].astype(N.float_)) - n[i] = float(len(nargs[i])) - v[i] = var(nargs[i]) - m[i] = mean(nargs[i]) - for j in range(k): - for i in range(n[j]): - t1 = (n[j] - 1.5) * n[j] * (nargs[j][i] - m[j])**2 - t2 = 0.5 * v[j] * (n[j] - 1.0) - t3 = (n[j] - 1.0) * (n[j] - 2.0) - nargs[j][i] = (t1 - t2) / float(t3) - check = 1 - for j in range(k): - if v[j] - mean(nargs[j]) > TINY: - check = 0 - if check <> 1: - raise ValueError, 'Lack of convergence in obrientransform.' - else: - return N.array(nargs) - - def asamplevar(inarray, dimension=None, keepdims=0): - """ -Returns the sample standard deviation of the values in the passed -array (i.e., using N). Dimension can equal None (ravel array first), -an integer (the dimension over which to operate), or a sequence -(operate over multiple dimensions). Set keepdims=1 to return an array -with the same number of dimensions as inarray. - -Usage: asamplevar(inarray,dimension=None,keepdims=0) -""" - if dimension == None: - inarray = N.ravel(inarray) - dimension = 0 - if dimension == 1: - mn = amean(inarray, dimension)[:, N.NewAxis] - else: - mn = amean(inarray, dimension, keepdims=1) - deviations = inarray - mn - if type(dimension) == ListType: - n = 1 - for d in dimension: - n = n * inarray.shape[d] - else: - n = inarray.shape[dimension] - svar = ass(deviations, dimension, keepdims) / float(n) - return svar - - def asamplestdev(inarray, dimension=None, keepdims=0): - """ -Returns the sample standard deviation of the values in the passed -array (i.e., using N). Dimension can equal None (ravel array first), -an integer (the dimension over which to operate), or a sequence -(operate over multiple dimensions). Set keepdims=1 to return an array -with the same number of dimensions as inarray. - -Usage: asamplestdev(inarray,dimension=None,keepdims=0) -""" - return N.sqrt(asamplevar(inarray, dimension, keepdims)) - - def asignaltonoise(instack, dimension=0): - """ -Calculates signal-to-noise. Dimension can equal None (ravel array -first), an integer (the dimension over which to operate), or a -sequence (operate over multiple dimensions). - -Usage: asignaltonoise(instack,dimension=0): -Returns: array containing the value of (mean/stdev) along dimension, - or 0 when stdev=0 -""" - m = mean(instack, dimension) - sd = stdev(instack, dimension) - return N.where(sd == 0, 0, m / sd) - - def acov(x, y, dimension=None, keepdims=0): - """ -Returns the estimated covariance of the values in the passed -array (i.e., N-1). Dimension can equal None (ravel array first), an -integer (the dimension over which to operate), or a sequence (operate -over multiple dimensions). Set keepdims=1 to return an array with the -same number of dimensions as inarray. - -Usage: acov(x,y,dimension=None,keepdims=0) -""" - if dimension == None: - x = N.ravel(x) - y = N.ravel(y) - dimension = 0 - xmn = amean(x, dimension, 1) # keepdims - xdeviations = x - xmn - ymn = amean(y, dimension, 1) # keepdims - ydeviations = y - ymn - if type(dimension) == ListType: - n = 1 - for d in dimension: - n = n * x.shape[d] - else: - n = x.shape[dimension] - covar = N.sum(xdeviations * ydeviations) / float(n - 1) - return covar - - def avar(inarray, dimension=None, keepdims=0): - """ -Returns the estimated population variance of the values in the passed -array (i.e., N-1). Dimension can equal None (ravel array first), an -integer (the dimension over which to operate), or a sequence (operate -over multiple dimensions). Set keepdims=1 to return an array with the -same number of dimensions as inarray. - -Usage: avar(inarray,dimension=None,keepdims=0) -""" - if dimension == None: - inarray = N.ravel(inarray) - dimension = 0 - mn = amean(inarray, dimension, 1) - deviations = inarray - mn - if type(dimension) == ListType: - n = 1 - for d in dimension: - n = n * inarray.shape[d] - else: - n = inarray.shape[dimension] - var = ass(deviations, dimension, keepdims) / float(n - 1) - return var - - def astdev(inarray, dimension=None, keepdims=0): - """ -Returns the estimated population standard deviation of the values in -the passed array (i.e., N-1). Dimension can equal None (ravel array -first), an integer (the dimension over which to operate), or a -sequence (operate over multiple dimensions). Set keepdims=1 to return -an array with the same number of dimensions as inarray. - -Usage: astdev(inarray,dimension=None,keepdims=0) -""" - return N.sqrt(avar(inarray, dimension, keepdims)) - - def asterr(inarray, dimension=None, keepdims=0): - """ -Returns the estimated population standard error of the values in the -passed array (i.e., N-1). Dimension can equal None (ravel array -first), an integer (the dimension over which to operate), or a -sequence (operate over multiple dimensions). Set keepdims=1 to return -an array with the same number of dimensions as inarray. - -Usage: asterr(inarray,dimension=None,keepdims=0) -""" - if dimension == None: - inarray = N.ravel(inarray) - dimension = 0 - return astdev(inarray, dimension, - keepdims) / float(N.sqrt(inarray.shape[dimension])) - - def asem(inarray, dimension=None, keepdims=0): - """ -Returns the standard error of the mean (i.e., using N) of the values -in the passed array. Dimension can equal None (ravel array first), an -integer (the dimension over which to operate), or a sequence (operate -over multiple dimensions). Set keepdims=1 to return an array with the -same number of dimensions as inarray. - -Usage: asem(inarray,dimension=None, keepdims=0) -""" - if dimension == None: - inarray = N.ravel(inarray) - dimension = 0 - if type(dimension) == ListType: - n = 1 - for d in dimension: - n = n * inarray.shape[d] - else: - n = inarray.shape[dimension] - s = asamplestdev(inarray, dimension, keepdims) / N.sqrt(n - 1) - return s - - def az(a, score): - """ -Returns the z-score of a given input score, given thearray from which -that score came. Not appropriate for population calculations, nor for -arrays > 1D. - -Usage: az(a, score) -""" - z = (score - amean(a)) / asamplestdev(a) - return z - - def azs(a): - """ -Returns a 1D array of z-scores, one for each score in the passed array, -computed relative to the passed array. - -Usage: azs(a) -""" - zscores = [] - for item in a: - zscores.append(z(a, item)) - return N.array(zscores) - - def azmap(scores, compare, dimension=0): - """ -Returns an array of z-scores the shape of scores (e.g., [x,y]), compared to -array passed to compare (e.g., [time,x,y]). Assumes collapsing over dim 0 -of the compare array. - -Usage: azs(scores, compare, dimension=0) -""" - mns = amean(compare, dimension) - sstd = asamplestdev(compare, 0) - return (scores - mns) / sstd - -##################################### -####### ATRIMMING FUNCTIONS ####### -##################################### - -## deleted around() as it's in numpy now - - def athreshold(a, threshmin=None, threshmax=None, newval=0): - """ -Like Numeric.clip() except that values <threshmid or >threshmax are replaced -by newval instead of by threshmin/threshmax (respectively). - -Usage: athreshold(a,threshmin=None,threshmax=None,newval=0) -Returns: a, with values <threshmin or >threshmax replaced with newval -""" - mask = N.zeros(a.shape) - if threshmin <> None: - mask = mask + N.where(a < threshmin, 1, 0) - if threshmax <> None: - mask = mask + N.where(a > threshmax, 1, 0) - mask = N.clip(mask, 0, 1) - return N.where(mask, newval, a) - - def atrimboth(a, proportiontocut): - """ -Slices off the passed proportion of items from BOTH ends of the passed -array (i.e., with proportiontocut=0.1, slices 'leftmost' 10% AND -'rightmost' 10% of scores. You must pre-sort the array if you want -"proper" trimming. Slices off LESS if proportion results in a -non-integer slice index (i.e., conservatively slices off -proportiontocut). - -Usage: atrimboth (a,proportiontocut) -Returns: trimmed version of array a -""" - lowercut = int(proportiontocut * len(a)) - uppercut = len(a) - lowercut - return a[lowercut:uppercut] - - def atrim1(a, proportiontocut, tail='right'): - """ -Slices off the passed proportion of items from ONE end of the passed -array (i.e., if proportiontocut=0.1, slices off 'leftmost' or 'rightmost' -10% of scores). Slices off LESS if proportion results in a non-integer -slice index (i.e., conservatively slices off proportiontocut). - -Usage: atrim1(a,proportiontocut,tail='right') or set tail='left' -Returns: trimmed version of array a -""" - if string.lower(tail) == 'right': - lowercut = 0 - uppercut = len(a) - int(proportiontocut * len(a)) - elif string.lower(tail) == 'left': - lowercut = int(proportiontocut * len(a)) - uppercut = len(a) - return a[lowercut:uppercut] - -##################################### -##### ACORRELATION FUNCTIONS ###### -##################################### - - def acovariance(X): - """ -Computes the covariance matrix of a matrix X. Requires a 2D matrix input. - -Usage: acovariance(X) -Returns: covariance matrix of X -""" - if len(X.shape) <> 2: - raise TypeError, 'acovariance requires 2D matrices' - n = X.shape[0] - mX = amean(X, 0) - return N.dot(N.transpose(X), X) / float(n) - N.multiply.outer(mX, mX) - - def acorrelation(X): - """ -Computes the correlation matrix of a matrix X. Requires a 2D matrix input. - -Usage: acorrelation(X) -Returns: correlation matrix of X -""" - C = acovariance(X) - V = N.diagonal(C) - return C / N.sqrt(N.multiply.outer(V, V)) - - def apaired(x, y): - """ -Interactively determines the type of data in x and y, and then runs the -appropriated statistic for paired group data. - -Usage: apaired(x,y) x,y = the two arrays of values to be compared -Returns: appropriate statistic name, value, and probability -""" - samples = '' - while samples not in ['i', 'r', 'I', 'R', 'c', 'C']: - print '\nIndependent or related samples, or correlation (i,r,c): ', - samples = raw_input() - - if samples in ['i', 'I', 'r', 'R']: - print '\nComparing variances ...', - # USE O'BRIEN'S TEST FOR HOMOGENEITY OF VARIANCE, Maxwell & delaney, p.112 - r = obrientransform(x, y) - f, p = F_oneway(pstat.colex(r, 0), pstat.colex(r, 1)) - if p < 0.05: - vartype = 'unequal, p=' + str(round(p, 4)) - else: - vartype = 'equal' - print vartype - if samples in ['i', 'I']: - if vartype[0] == 'e': - t, p = ttest_ind(x, y, None, 0) - print '\nIndependent samples t-test: ', round(t, 4), round(p, 4) - else: - if len(x) > 20 or len(y) > 20: - z, p = ranksums(x, y) - print '\nRank Sums test (NONparametric, n>20): ', round( - z, 4), round(p, 4) - else: - u, p = mannwhitneyu(x, y) - print '\nMann-Whitney U-test (NONparametric, ns<20): ', round( - u, 4), round(p, 4) - - else: # RELATED SAMPLES - if vartype[0] == 'e': - t, p = ttest_rel(x, y, 0) - print '\nRelated samples t-test: ', round(t, 4), round(p, 4) - else: - t, p = ranksums(x, y) - print '\nWilcoxon T-test (NONparametric): ', round(t, 4), round(p, 4) - else: # CORRELATION ANALYSIS - corrtype = '' - while corrtype not in ['c', 'C', 'r', 'R', 'd', 'D']: - print '\nIs the data Continuous, Ranked, or Dichotomous (c,r,d): ', - corrtype = raw_input() - if corrtype in ['c', 'C']: - m, b, r, p, see = linregress(x, y) - print '\nLinear regression for continuous variables ...' - lol = [ - ['Slope', 'Intercept', 'r', 'Prob', 'SEestimate'], - [round(m, 4), round(b, 4), round(r, 4), round(p, 4), round(see, 4)] - ] - pstat.printcc(lol) - elif corrtype in ['r', 'R']: - r, p = spearmanr(x, y) - print '\nCorrelation for ranked variables ...' - print "Spearman's r: ", round(r, 4), round(p, 4) - else: # DICHOTOMOUS - r, p = pointbiserialr(x, y) - print '\nAssuming x contains a dichotomous variable ...' - print 'Point Biserial r: ', round(r, 4), round(p, 4) - print '\n\n' - return None - - def dices(x, y): - """ -Calculates Dice's coefficient ... (2*number of common terms)/(number of terms in -x + -number of terms in y). Returns a value between 0 (orthogonal) and 1. - -Usage: dices(x,y) -""" - import sets - x = sets.Set(x) - y = sets.Set(y) - common = len(x.intersection(y)) - total = float(len(x) + len(y)) - return 2 * common / total - - def icc(x, y=None, verbose=0): - """ -Calculates intraclass correlation coefficients using simple, Type I sums of -squares. -If only one variable is passed, assumed it's an Nx2 matrix - -Usage: icc(x,y=None,verbose=0) -Returns: icc rho, prob ####PROB IS A GUESS BASED ON PEARSON -""" - TINY = 1.0e-20 - if y: - all = N.concatenate([x, y], 0) - else: - all = x + 0 - x = all[:, 0] - y = all[:, 1] - totalss = ass(all - mean(all)) - pairmeans = (x + y) / 2. - withinss = ass(x - pairmeans) + ass(y - pairmeans) - withindf = float(len(x)) - betwdf = float(len(x) - 1) - withinms = withinss / withindf - betweenms = (totalss - withinss) / betwdf - rho = (betweenms - withinms) / (withinms + betweenms) - t = rho * math.sqrt(betwdf / ((1.0 - rho + TINY) * (1.0 + rho + TINY))) - prob = abetai(0.5 * betwdf, 0.5, betwdf / (betwdf + t * t), verbose) - return rho, prob - - def alincc(x, y): - """ -Calculates Lin's concordance correlation coefficient. - -Usage: alincc(x,y) where x, y are equal-length arrays -Returns: Lin's CC -""" - x = N.ravel(x) - y = N.ravel(y) - covar = acov(x, y) * (len(x) - 1) / float(len(x)) # correct denom to n - xvar = avar(x) * (len(x) - 1) / float(len(x)) # correct denom to n - yvar = avar(y) * (len(y) - 1) / float(len(y)) # correct denom to n - lincc = (2 * covar) / ((xvar + yvar) + ((amean(x) - amean(y))**2)) - return lincc - - def apearsonr(x, y, verbose=1): - """ -Calculates a Pearson correlation coefficient and returns p. Taken -from Heiman's Basic Statistics for the Behav. Sci (2nd), p.195. - -Usage: apearsonr(x,y,verbose=1) where x,y are equal length arrays -Returns: Pearson's r, two-tailed p-value -""" - TINY = 1.0e-20 - n = len(x) - xmean = amean(x) - ymean = amean(y) - r_num = n * (N.add.reduce(x * y)) - N.add.reduce(x) * N.add.reduce(y) - r_den = math.sqrt((n * ass(x) - asquare_of_sums(x)) * - (n * ass(y) - asquare_of_sums(y))) - r = (r_num / r_den) - df = n - 2 - t = r * math.sqrt(df / ((1.0 - r + TINY) * (1.0 + r + TINY))) - prob = abetai(0.5 * df, 0.5, df / (df + t * t), verbose) - return r, prob - - def aspearmanr(x, y): - """ -Calculates a Spearman rank-order correlation coefficient. Taken -from Heiman's Basic Statistics for the Behav. Sci (1st), p.192. - -Usage: aspearmanr(x,y) where x,y are equal-length arrays -Returns: Spearman's r, two-tailed p-value -""" - TINY = 1e-30 - n = len(x) - rankx = rankdata(x) - ranky = rankdata(y) - dsq = N.add.reduce((rankx - ranky)**2) - rs = 1 - 6 * dsq / float(n * (n**2 - 1)) - t = rs * math.sqrt((n - 2) / ((rs + 1.0) * (1.0 - rs))) - df = n - 2 - probrs = abetai(0.5 * df, 0.5, df / (df + t * t)) - # probability values for rs are from part 2 of the spearman function in - # Numerical Recipies, p.510. They close to tables, but not exact.(?) - return rs, probrs - - def apointbiserialr(x, y): - """ -Calculates a point-biserial correlation coefficient and the associated -probability value. Taken from Heiman's Basic Statistics for the Behav. -Sci (1st), p.194. - -Usage: apointbiserialr(x,y) where x,y are equal length arrays -Returns: Point-biserial r, two-tailed p-value -""" - TINY = 1e-30 - categories = pstat.aunique(x) - data = pstat.aabut(x, y) - if len(categories) <> 2: - raise ValueError, ('Exactly 2 categories required (in x) for ' - 'pointbiserialr().') - else: # there are 2 categories, continue - codemap = pstat.aabut(categories, N.arange(2)) - recoded = pstat.arecode(data, codemap, 0) - x = pstat.alinexand(data, 0, categories[0]) - y = pstat.alinexand(data, 0, categories[1]) - xmean = amean(pstat.acolex(x, 1)) - ymean = amean(pstat.acolex(y, 1)) - n = len(data) - adjust = math.sqrt((len(x) / float(n)) * (len(y) / float(n))) - rpb = (ymean - xmean) / asamplestdev(pstat.acolex(data, 1)) * adjust - df = n - 2 - t = rpb * math.sqrt(df / ((1.0 - rpb + TINY) * (1.0 + rpb + TINY))) - prob = abetai(0.5 * df, 0.5, df / (df + t * t)) - return rpb, prob - - def akendalltau(x, y): - """ -Calculates Kendall's tau ... correlation of ordinal data. Adapted -from function kendl1 in Numerical Recipies. Needs good test-cases.@@@ - -Usage: akendalltau(x,y) -Returns: Kendall's tau, two-tailed p-value -""" - n1 = 0 - n2 = 0 - iss = 0 - for j in range(len(x) - 1): - for k in range(j, len(y)): - a1 = x[j] - x[k] - a2 = y[j] - y[k] - aa = a1 * a2 - if (aa): # neither array has a tie - n1 = n1 + 1 - n2 = n2 + 1 - if aa > 0: - iss = iss + 1 - else: - iss = iss - 1 - else: - if (a1): - n1 = n1 + 1 - else: - n2 = n2 + 1 - tau = iss / math.sqrt(n1 * n2) - svar = (4.0 * len(x) + 10.0) / (9.0 * len(x) * (len(x) - 1)) - z = tau / math.sqrt(svar) - prob = erfcc(abs(z) / 1.4142136) - return tau, prob - - def alinregress(*args): - """ -Calculates a regression line on two arrays, x and y, corresponding to x,y -pairs. If a single 2D array is passed, alinregress finds dim with 2 levels -and splits data into x,y pairs along that dim. - -Usage: alinregress(*args) args=2 equal-length arrays, or one 2D array -Returns: slope, intercept, r, two-tailed prob, sterr-of-the-estimate, n -""" - TINY = 1.0e-20 - if len(args) == 1: # more than 1D array? - args = args[0] - if len(args) == 2: - x = args[0] - y = args[1] - else: - x = args[:, 0] - y = args[:, 1] - else: - x = args[0] - y = args[1] - n = len(x) - xmean = amean(x) - ymean = amean(y) - r_num = n * (N.add.reduce(x * y)) - N.add.reduce(x) * N.add.reduce(y) - r_den = math.sqrt((n * ass(x) - asquare_of_sums(x)) * - (n * ass(y) - asquare_of_sums(y))) - r = r_num / r_den - z = 0.5 * math.log((1.0 + r + TINY) / (1.0 - r + TINY)) - df = n - 2 - t = r * math.sqrt(df / ((1.0 - r + TINY) * (1.0 + r + TINY))) - prob = abetai(0.5 * df, 0.5, df / (df + t * t)) - slope = r_num / (float(n) * ass(x) - asquare_of_sums(x)) - intercept = ymean - slope * xmean - sterrest = math.sqrt(1 - r * r) * asamplestdev(y) - return slope, intercept, r, prob, sterrest, n - - def amasslinregress(*args): - """ -Calculates a regression line on one 1D array (x) and one N-D array (y). - -Returns: slope, intercept, r, two-tailed prob, sterr-of-the-estimate, n -""" - TINY = 1.0e-20 - if len(args) == 1: # more than 1D array? - args = args[0] - if len(args) == 2: - x = N.ravel(args[0]) - y = args[1] - else: - x = N.ravel(args[:, 0]) - y = args[:, 1] - else: - x = args[0] - y = args[1] - x = x.astype(N.float_) - y = y.astype(N.float_) - n = len(x) - xmean = amean(x) - ymean = amean(y, 0) - shp = N.ones(len(y.shape)) - shp[0] = len(x) - x.shape = shp - print x.shape, y.shape - r_num = n * (N.add.reduce(x * y, 0)) - N.add.reduce(x) * N.add.reduce(y, 0) - r_den = N.sqrt((n * ass(x) - asquare_of_sums(x)) * - (n * ass(y, 0) - asquare_of_sums(y, 0))) - zerodivproblem = N.equal(r_den, 0) - r_den = N.where(zerodivproblem, 1, r_den - ) # avoid zero-division in 1st place - r = r_num / r_den # need to do this nicely for matrix division - r = N.where(zerodivproblem, 0.0, r) - z = 0.5 * N.log((1.0 + r + TINY) / (1.0 - r + TINY)) - df = n - 2 - t = r * N.sqrt(df / ((1.0 - r + TINY) * (1.0 + r + TINY))) - prob = abetai(0.5 * df, 0.5, df / (df + t * t)) - - ss = float(n) * ass(x) - asquare_of_sums(x) - s_den = N.where(ss == 0, 1, ss) # avoid zero-division in 1st place - slope = r_num / s_den - intercept = ymean - slope * xmean - sterrest = N.sqrt(1 - r * r) * asamplestdev(y, 0) - return slope, intercept, r, prob, sterrest, n - -##################################### -##### AINFERENTIAL STATISTICS ##### -##################################### - - def attest_1samp(a, popmean, printit=0, name='Sample', writemode='a'): - """ -Calculates the t-obtained for the independent samples T-test on ONE group -of scores a, given a population mean. If printit=1, results are printed -to the screen. If printit='filename', the results are output to 'filename' -using the given writemode (default=append). Returns t-value, and prob. - -Usage: attest_1samp(a,popmean,Name='Sample',printit=0,writemode='a') -Returns: t-value, two-tailed prob -""" - if type(a) != N.ndarray: - a = N.array(a) - x = amean(a) - v = avar(a) - n = len(a) - df = n - 1 - svar = ((n - 1) * v) / float(df) - t = (x - popmean) / math.sqrt(svar * (1.0 / n)) - prob = abetai(0.5 * df, 0.5, df / (df + t * t)) - - if printit <> 0: - statname = 'Single-sample T-test.' - outputpairedstats(printit, writemode, 'Population', '--', popmean, 0, 0, - 0, name, n, x, v, N.minimum.reduce(N.ravel(a)), - N.maximum.reduce(N.ravel(a)), statname, t, prob) - return t, prob - - def attest_ind(a, - b, - dimension=None, - printit=0, - name1='Samp1', - name2='Samp2', - writemode='a'): - """ -Calculates the t-obtained T-test on TWO INDEPENDENT samples of scores -a, and b. From Numerical Recipies, p.483. If printit=1, results are -printed to the screen. If printit='filename', the results are output -to 'filename' using the given writemode (default=append). Dimension -can equal None (ravel array first), or an integer (the dimension over -which to operate on a and b). - -Usage: attest_ind (a,b,dimension=None,printit=0, - Name1='Samp1',Name2='Samp2',writemode='a') -Returns: t-value, two-tailed p-value -""" - if dimension == None: - a = N.ravel(a) - b = N.ravel(b) - dimension = 0 - x1 = amean(a, dimension) - x2 = amean(b, dimension) - v1 = avar(a, dimension) - v2 = avar(b, dimension) - n1 = a.shape[dimension] - n2 = b.shape[dimension] - df = n1 + n2 - 2 - svar = ((n1 - 1) * v1 + (n2 - 1) * v2) / float(df) - zerodivproblem = N.equal(svar, 0) - svar = N.where(zerodivproblem, 1, svar) # avoid zero-division in 1st place - t = (x1 - x2) / N.sqrt(svar * - (1.0 / n1 + 1.0 / n2)) # N-D COMPUTATION HERE!!!!!! - t = N.where(zerodivproblem, 1.0, t) # replace NaN/wrong t-values with 1.0 - probs = abetai(0.5 * df, 0.5, float(df) / (df + t * t)) - - if type(t) == N.ndarray: - probs = N.reshape(probs, t.shape) - if probs.shape == (1,): - probs = probs[0] - - if printit <> 0: - if type(t) == N.ndarray: - t = t[0] - if type(probs) == N.ndarray: - probs = probs[0] - statname = 'Independent samples T-test.' - outputpairedstats(printit, writemode, name1, n1, x1, v1, - N.minimum.reduce(N.ravel(a)), - N.maximum.reduce(N.ravel(a)), name2, n2, x2, v2, - N.minimum.reduce(N.ravel(b)), - N.maximum.reduce(N.ravel(b)), statname, t, probs) - return - return t, probs - - def ap2t(pval, df): - """ -Tries to compute a t-value from a p-value (or pval array) and associated df. -SLOW for large numbers of elements(!) as it re-computes p-values 20 times -(smaller step-sizes) at which point it decides it's done. Keeps the signs -of the input array. Returns 1000 (or -1000) if t>100. - -Usage: ap2t(pval,df) -Returns: an array of t-values with the shape of pval - """ - pval = N.array(pval) - signs = N.sign(pval) - pval = abs(pval) - t = N.ones(pval.shape, N.float_) * 50 - step = N.ones(pval.shape, N.float_) * 25 - print 'Initial ap2t() prob calc' - prob = abetai(0.5 * df, 0.5, float(df) / (df + t * t)) - print 'ap2t() iter: ', - for i in range(10): - print i, ' ', - t = N.where(pval < prob, t + step, t - step) - prob = abetai(0.5 * df, 0.5, float(df) / (df + t * t)) - step = step / 2 - print - # since this is an ugly hack, we get ugly boundaries - t = N.where(t > 99.9, 1000, t) # hit upper-boundary - t = t + signs - return t #, prob, pval - - def attest_rel(a, - b, - dimension=None, - printit=0, - name1='Samp1', - name2='Samp2', - writemode='a'): - """ -Calculates the t-obtained T-test on TWO RELATED samples of scores, a -and b. From Numerical Recipies, p.483. If printit=1, results are -printed to the screen. If printit='filename', the results are output -to 'filename' using the given writemode (default=append). Dimension -can equal None (ravel array first), or an integer (the dimension over -which to operate on a and b). - -Usage: attest_rel(a,b,dimension=None,printit=0, - name1='Samp1',name2='Samp2',writemode='a') -Returns: t-value, two-tailed p-value -""" - if dimension == None: - a = N.ravel(a) - b = N.ravel(b) - dimension = 0 - if len(a) <> len(b): - raise ValueError, 'Unequal length arrays.' - x1 = amean(a, dimension) - x2 = amean(b, dimension) - v1 = avar(a, dimension) - v2 = avar(b, dimension) - n = a.shape[dimension] - df = float(n - 1) - d = (a - b).astype('d') - - denom = N.sqrt( - (n * N.add.reduce(d * d, dimension) - N.add.reduce(d, dimension)**2) / - df) - zerodivproblem = N.equal(denom, 0) - denom = N.where(zerodivproblem, 1, denom - ) # avoid zero-division in 1st place - t = N.add.reduce(d, dimension) / denom # N-D COMPUTATION HERE!!!!!! - t = N.where(zerodivproblem, 1.0, t) # replace NaN/wrong t-values with 1.0 - probs = abetai(0.5 * df, 0.5, float(df) / (df + t * t)) - if type(t) == N.ndarray: - probs = N.reshape(probs, t.shape) - if probs.shape == (1,): - probs = probs[0] - - if printit <> 0: - statname = 'Related samples T-test.' - outputpairedstats(printit, writemode, name1, n, x1, v1, - N.minimum.reduce(N.ravel(a)), - N.maximum.reduce(N.ravel(a)), name2, n, x2, v2, - N.minimum.reduce(N.ravel(b)), - N.maximum.reduce(N.ravel(b)), statname, t, probs) - return - return t, probs - - def achisquare(f_obs, f_exp=None): - """ -Calculates a one-way chi square for array of observed frequencies and returns -the result. If no expected frequencies are given, the total N is assumed to -be equally distributed across all groups. -@@@NOT RIGHT?? - -Usage: achisquare(f_obs, f_exp=None) f_obs = array of observed cell freq. -Returns: chisquare-statistic, associated p-value -""" - - k = len(f_obs) - if f_exp == None: - f_exp = N.array([sum(f_obs) / float(k)] * len(f_obs), N.float_) - f_exp = f_exp.astype(N.float_) - chisq = N.add.reduce((f_obs - f_exp)**2 / f_exp) - return chisq, achisqprob(chisq, k - 1) - - def aks_2samp(data1, data2): - """ -Computes the Kolmogorov-Smirnof statistic on 2 samples. Modified from -Numerical Recipies in C, page 493. Returns KS D-value, prob. Not ufunc- -like. - -Usage: aks_2samp(data1,data2) where data1 and data2 are 1D arrays -Returns: KS D-value, p-value -""" - j1 = 0 # N.zeros(data1.shape[1:]) TRIED TO MAKE THIS UFUNC-LIKE - j2 = 0 # N.zeros(data2.shape[1:]) - fn1 = 0.0 # N.zeros(data1.shape[1:],N.float_) - fn2 = 0.0 # N.zeros(data2.shape[1:],N.float_) - n1 = data1.shape[0] - n2 = data2.shape[0] - en1 = n1 * 1 - en2 = n2 * 1 - d = N.zeros(data1.shape[1:], N.float_) - data1 = N.sort(data1, 0) - data2 = N.sort(data2, 0) - while j1 < n1 and j2 < n2: - d1 = data1[j1] - d2 = data2[j2] - if d1 <= d2: - fn1 = (j1) / float(en1) - j1 = j1 + 1 - if d2 <= d1: - fn2 = (j2) / float(en2) - j2 = j2 + 1 - dt = (fn2 - fn1) - if abs(dt) > abs(d): - d = dt -# try: - en = math.sqrt(en1 * en2 / float(en1 + en2)) - prob = aksprob((en + 0.12 + 0.11 / en) * N.fabs(d)) - # except: - # prob = 1.0 - return d, prob - - def amannwhitneyu(x, y): - """ -Calculates a Mann-Whitney U statistic on the provided scores and -returns the result. Use only when the n in each condition is < 20 and -you have 2 independent samples of ranks. REMEMBER: Mann-Whitney U is -significant if the u-obtained is LESS THAN or equal to the critical -value of U. - -Usage: amannwhitneyu(x,y) where x,y are arrays of values for 2 conditions -Returns: u-statistic, one-tailed p-value (i.e., p(z(U))) -""" - n1 = len(x) - n2 = len(y) - ranked = rankdata(N.concatenate((x, y))) - rankx = ranked[0:n1] # get the x-ranks - ranky = ranked[n1:] # the rest are y-ranks - u1 = n1 * n2 + (n1 * (n1 + 1)) / 2.0 - sum(rankx) # calc U for x - u2 = n1 * n2 - u1 # remainder is U for y - bigu = max(u1, u2) - smallu = min(u1, u2) - proportion = bigu / float(n1 * n2) - T = math.sqrt(tiecorrect(ranked)) # correction factor for tied scores - if T == 0: - raise ValueError, 'All numbers are identical in amannwhitneyu' - sd = math.sqrt(T * n1 * n2 * (n1 + n2 + 1) / 12.0) - z = abs((bigu - n1 * n2 / 2.0) / sd) # normal approximation for prob calc - return smallu, 1.0 - azprob(z), proportion - - def atiecorrect(rankvals): - """ -Tie-corrector for ties in Mann Whitney U and Kruskal Wallis H tests. -See Siegel, S. (1956) Nonparametric Statistics for the Behavioral -Sciences. New York: McGraw-Hill. Code adapted from |Stat rankind.c -code. - -Usage: atiecorrect(rankvals) -Returns: T correction factor for U or H -""" - sorted, posn = ashellsort(N.array(rankvals)) - n = len(sorted) - T = 0.0 - i = 0 - while (i < n - 1): - if sorted[i] == sorted[i + 1]: - nties = 1 - while (i < n - 1) and (sorted[i] == sorted[i + 1]): - nties = nties + 1 - i = i + 1 - T = T + nties**3 - nties - i = i + 1 - T = T / float(n**3 - n) - return 1.0 - T - - def aranksums(x, y): - """ -Calculates the rank sums statistic on the provided scores and returns -the result. - -Usage: aranksums(x,y) where x,y are arrays of values for 2 conditions -Returns: z-statistic, two-tailed p-value -""" - n1 = len(x) - n2 = len(y) - alldata = N.concatenate((x, y)) - ranked = arankdata(alldata) - x = ranked[:n1] - y = ranked[n1:] - s = sum(x) - expected = n1 * (n1 + n2 + 1) / 2.0 - z = (s - expected) / math.sqrt(n1 * n2 * (n1 + n2 + 1) / 12.0) - prob = 2 * (1.0 - azprob(abs(z))) - return z, prob - - def awilcoxont(x, y): - """ -Calculates the Wilcoxon T-test for related samples and returns the -result. A non-parametric T-test. - -Usage: awilcoxont(x,y) where x,y are equal-length arrays for 2 conditions -Returns: t-statistic, two-tailed p-value -""" - if len(x) <> len(y): - raise ValueError, 'Unequal N in awilcoxont. Aborting.' - d = x - y - d = N.compress(N.not_equal(d, 0), d) # Keep all non-zero differences - count = len(d) - absd = abs(d) - absranked = arankdata(absd) - r_plus = 0.0 - r_minus = 0.0 - for i in range(len(absd)): - if d[i] < 0: - r_minus = r_minus + absranked[i] - else: - r_plus = r_plus + absranked[i] - wt = min(r_plus, r_minus) - mn = count * (count + 1) * 0.25 - se = math.sqrt(count * (count + 1) * (2.0 * count + 1.0) / 24.0) - z = math.fabs(wt - mn) / se - z = math.fabs(wt - mn) / se - prob = 2 * (1.0 - zprob(abs(z))) - return wt, prob - - def akruskalwallish(*args): - """ -The Kruskal-Wallis H-test is a non-parametric ANOVA for 3 or more -groups, requiring at least 5 subjects in each group. This function -calculates the Kruskal-Wallis H and associated p-value for 3 or more -independent samples. - -Usage: akruskalwallish(*args) args are separate arrays for 3+ conditions -Returns: H-statistic (corrected for ties), associated p-value -""" - assert len(args) == 3, 'Need at least 3 groups in stats.akruskalwallish()' - args = list(args) - n = [0] * len(args) - n = map(len, args) - all = [] - for i in range(len(args)): - all = all + args[i].tolist() - ranked = rankdata(all) - T = tiecorrect(ranked) - for i in range(len(args)): - args[i] = ranked[0:n[i]] - del ranked[0:n[i]] - rsums = [] - for i in range(len(args)): - rsums.append(sum(args[i])**2) - rsums[i] = rsums[i] / float(n[i]) - ssbn = sum(rsums) - totaln = sum(n) - h = 12.0 / (totaln * (totaln + 1)) * ssbn - 3 * (totaln + 1) - df = len(args) - 1 - if T == 0: - raise ValueError, 'All numbers are identical in akruskalwallish' - h = h / float(T) - return h, chisqprob(h, df) - - def afriedmanchisquare(*args): - """ -Friedman Chi-Square is a non-parametric, one-way within-subjects -ANOVA. This function calculates the Friedman Chi-square test for -repeated measures and returns the result, along with the associated -probability value. It assumes 3 or more repeated measures. Only 3 -levels requires a minimum of 10 subjects in the study. Four levels -requires 5 subjects per level(??). - -Usage: afriedmanchisquare(*args) args are separate arrays for 2+ conditions -Returns: chi-square statistic, associated p-value -""" - k = len(args) - if k < 3: - raise ValueError, ('\nLess than 3 levels. Friedman test not ' - 'appropriate.\n') - n = len(args[0]) - data = apply(pstat.aabut, args) - data = data.astype(N.float_) - for i in range(len(data)): - data[i] = arankdata(data[i]) - ssbn = asum(asum(args, 1)**2) - chisq = 12.0 / (k * n * (k + 1)) * ssbn - 3 * n * (k + 1) - return chisq, achisqprob(chisq, k - 1) - -##################################### -#### APROBABILITY CALCULATIONS #### -##################################### - - def achisqprob(chisq, df): - """ -Returns the (1-tail) probability value associated with the provided chi-square -value and df. Heavily modified from chisq.c in Gary Perlman's |Stat. Can -handle multiple dimensions. - -Usage: achisqprob(chisq,df) chisq=chisquare stat., df=degrees of freedom -""" - BIG = 200.0 - - def ex(x): - BIG = 200.0 - exponents = N.where(N.less(x, -BIG), -BIG, x) - return N.exp(exponents) - - if type(chisq) == N.ndarray: - arrayflag = 1 - else: - arrayflag = 0 - chisq = N.array([chisq]) - if df < 1: - return N.ones(chisq.shape, N.float) - probs = N.zeros(chisq.shape, N.float_) - probs = N.where( - N.less_equal(chisq, 0), 1.0, probs) # set prob=1 for chisq<0 - a = 0.5 * chisq - if df > 1: - y = ex(-a) - if df % 2 == 0: - even = 1 - s = y * 1 - s2 = s * 1 - else: - even = 0 - s = 2.0 * azprob(-N.sqrt(chisq)) - s2 = s * 1 - if (df > 2): - chisq = 0.5 * (df - 1.0) - if even: - z = N.ones(probs.shape, N.float_) - else: - z = 0.5 * N.ones(probs.shape, N.float_) - if even: - e = N.zeros(probs.shape, N.float_) - else: - e = N.log(N.sqrt(N.pi)) * N.ones(probs.shape, N.float_) - c = N.log(a) - mask = N.zeros(probs.shape) - a_big = N.greater(a, BIG) - a_big_frozen = -1 * N.ones(probs.shape, N.float_) - totalelements = N.multiply.reduce(N.array(probs.shape)) - while asum(mask) <> totalelements: - e = N.log(z) + e - s = s + ex(c * z - a - e) - z = z + 1.0 - # print z, e, s - newmask = N.greater(z, chisq) - a_big_frozen = N.where(newmask * N.equal(mask, 0) * a_big, s, - a_big_frozen) - mask = N.clip(newmask + mask, 0, 1) - if even: - z = N.ones(probs.shape, N.float_) - e = N.ones(probs.shape, N.float_) - else: - z = 0.5 * N.ones(probs.shape, N.float_) - e = 1.0 / N.sqrt(N.pi) / N.sqrt(a) * N.ones(probs.shape, N.float_) - c = 0.0 - mask = N.zeros(probs.shape) - a_notbig_frozen = -1 * N.ones(probs.shape, N.float_) - while asum(mask) <> totalelements: - e = e * (a / z.astype(N.float_)) - c = c + e - z = z + 1.0 - # print '#2', z, e, c, s, c*y+s2 - newmask = N.greater(z, chisq) - a_notbig_frozen = N.where(newmask * N.equal(mask, 0) * (1 - a_big), - c * y + s2, a_notbig_frozen) - mask = N.clip(newmask + mask, 0, 1) - probs = N.where( - N.equal(probs, 1), 1, N.where( - N.greater(a, BIG), a_big_frozen, a_notbig_frozen)) - return probs - else: - return s - - def aerfcc(x): - """ -Returns the complementary error function erfc(x) with fractional error -everywhere less than 1.2e-7. Adapted from Numerical Recipies. Can -handle multiple dimensions. - -Usage: aerfcc(x) -""" - z = abs(x) - t = 1.0 / (1.0 + 0.5 * z) - ans = t * N.exp(-z * z - 1.26551223 + t * (1.00002368 + t * ( - 0.37409196 + t * (0.09678418 + t * (-0.18628806 + t * ( - 0.27886807 + t * (-1.13520398 + t * (1.48851587 + t * ( - -0.82215223 + t * 0.17087277))))))))) - return N.where(N.greater_equal(x, 0), ans, 2.0 - ans) - - def azprob(z): - """ -Returns the area under the normal curve 'to the left of' the given z value. -Thus, - for z<0, zprob(z) = 1-tail probability - for z>0, 1.0-zprob(z) = 1-tail probability - for any z, 2.0*(1.0-zprob(abs(z))) = 2-tail probability -Adapted from z.c in Gary Perlman's |Stat. Can handle multiple dimensions. - -Usage: azprob(z) where z is a z-value -""" - - def yfunc(y): - x = ((((((( - ((((((-0.000045255659 * y + 0.000152529290) * y - 0.000019538132) * y - - 0.000676904986) * y + 0.001390604284) * y - 0.000794620820) * y - - 0.002034254874) * y + 0.006549791214) * y - 0.010557625006) * y + - 0.011630447319) * y - 0.009279453341) * y + 0.005353579108) * y - - 0.002141268741) * y + 0.000535310849) * y + 0.999936657524 - return x - - def wfunc(w): - x = ((((((((0.000124818987 * w - 0.001075204047) * w + 0.005198775019) * w - - 0.019198292004) * w + 0.059054035642) * w - 0.151968751364) * - w + 0.319152932694) * w - 0.531923007300) * w + - 0.797884560593) * N.sqrt(w) * 2.0 - return x - - Z_MAX = 6.0 # maximum meaningful z-value - x = N.zeros(z.shape, N.float_) # initialize - y = 0.5 * N.fabs(z) - x = N.where(N.less(y, 1.0), wfunc(y * y), yfunc(y - 2.0)) # get x's - x = N.where(N.greater(y, Z_MAX * 0.5), 1.0, x) # kill those with big Z - prob = N.where(N.greater(z, 0), (x + 1) * 0.5, (1 - x) * 0.5) - return prob - - def aksprob(alam): - """ -Returns the probability value for a K-S statistic computed via ks_2samp. -Adapted from Numerical Recipies. Can handle multiple dimensions. - -Usage: aksprob(alam) -""" - if type(alam) == N.ndarray: - frozen = -1 * N.ones(alam.shape, N.float64) - alam = alam.astype(N.float64) - arrayflag = 1 - else: - frozen = N.array(-1.) - alam = N.array(alam, N.float64) - arrayflag = 1 - mask = N.zeros(alam.shape) - fac = 2.0 * N.ones(alam.shape, N.float_) - sum = N.zeros(alam.shape, N.float_) - termbf = N.zeros(alam.shape, N.float_) - a2 = N.array(-2.0 * alam * alam, N.float64) - totalelements = N.multiply.reduce(N.array(mask.shape)) - for j in range(1, 201): - if asum(mask) == totalelements: - break - exponents = (a2 * j * j) - overflowmask = N.less(exponents, -746) - frozen = N.where(overflowmask, 0, frozen) - mask = mask + overflowmask - term = fac * N.exp(exponents) - sum = sum + term - newmask = N.where( - N.less_equal( - abs(term), (0.001 * termbf)) + N.less( - abs(term), 1.0e-8 * sum), 1, 0) - frozen = N.where(newmask * N.equal(mask, 0), sum, frozen) - mask = N.clip(mask + newmask, 0, 1) - fac = -fac - termbf = abs(term) - if arrayflag: - return N.where( - N.equal(frozen, -1), 1.0, frozen) # 1.0 if doesn't converge - else: - return N.where( - N.equal(frozen, -1), 1.0, frozen)[0] # 1.0 if doesn't converge - - def afprob(dfnum, dfden, F): - """ -Returns the 1-tailed significance level (p-value) of an F statistic -given the degrees of freedom for the numerator (dfR-dfF) and the degrees -of freedom for the denominator (dfF). Can handle multiple dims for F. - -Usage: afprob(dfnum, dfden, F) where usually dfnum=dfbn, dfden=dfwn -""" - if type(F) == N.ndarray: - return abetai(0.5 * dfden, 0.5 * dfnum, dfden / (1.0 * dfden + dfnum * F)) - else: - return abetai(0.5 * dfden, 0.5 * dfnum, dfden / float(dfden + dfnum * F)) - - def abetacf(a, b, x, verbose=1): - """ -Evaluates the continued fraction form of the incomplete Beta function, -betai. (Adapted from: Numerical Recipies in C.) Can handle multiple -dimensions for x. - -Usage: abetacf(a,b,x,verbose=1) -""" - ITMAX = 200 - EPS = 3.0e-7 - - arrayflag = 1 - if type(x) == N.ndarray: - frozen = N.ones(x.shape, - N.float_) * -1 #start out w/ -1s, should replace all - else: - arrayflag = 0 - frozen = N.array([-1]) - x = N.array([x]) - mask = N.zeros(x.shape) - bm = az = am = 1.0 - qab = a + b - qap = a + 1.0 - qam = a - 1.0 - bz = 1.0 - qab * x / qap - for i in range(ITMAX + 1): - if N.sum(N.ravel(N.equal(frozen, -1))) == 0: - break - em = float(i + 1) - tem = em + em - d = em * (b - em) * x / ((qam + tem) * (a + tem)) - ap = az + d * am - bp = bz + d * bm - d = -(a + em) * (qab + em) * x / ((qap + tem) * (a + tem)) - app = ap + d * az - bpp = bp + d * bz - aold = az * 1 - am = ap / bpp - bm = bp / bpp - az = app / bpp - bz = 1.0 - newmask = N.less(abs(az - aold), EPS * abs(az)) - frozen = N.where(newmask * N.equal(mask, 0), az, frozen) - mask = N.clip(mask + newmask, 0, 1) - noconverge = asum(N.equal(frozen, -1)) - if noconverge <> 0 and verbose: - print 'a or b too big, or ITMAX too small in Betacf for ', noconverge, ' elements' - if arrayflag: - return frozen - else: - return frozen[0] - - def agammln(xx): - """ -Returns the gamma function of xx. - Gamma(z) = Integral(0,infinity) of t^(z-1)exp(-t) dt. -Adapted from: Numerical Recipies in C. Can handle multiple dims ... but -probably doesn't normally have to. - -Usage: agammln(xx) -""" - coeff = [76.18009173, -86.50532033, 24.01409822, -1.231739516, - 0.120858003e-2, -0.536382e-5] - x = xx - 1.0 - tmp = x + 5.5 - tmp = tmp - (x + 0.5) * N.log(tmp) - ser = 1.0 - for j in range(len(coeff)): - x = x + 1 - ser = ser + coeff[j] / x - return -tmp + N.log(2.50662827465 * ser) - - def abetai(a, b, x, verbose=1): - """ -Returns the incomplete beta function: - - I-sub-x(a,b) = 1/B(a,b)*(Integral(0,x) of t^(a-1)(1-t)^(b-1) dt) - -where a,b>0 and B(a,b) = G(a)*G(b)/(G(a+b)) where G(a) is the gamma -function of a. The continued fraction formulation is implemented -here, using the betacf function. (Adapted from: Numerical Recipies in -C.) Can handle multiple dimensions. - -Usage: abetai(a,b,x,verbose=1) -""" - TINY = 1e-15 - if type(a) == N.ndarray: - if asum(N.less(x, 0) + N.greater(x, 1)) <> 0: - raise ValueError, 'Bad x in abetai' - x = N.where(N.equal(x, 0), TINY, x) - x = N.where(N.equal(x, 1.0), 1 - TINY, x) - - bt = N.where(N.equal(x, 0) + N.equal(x, 1), 0, -1) - exponents = (gammln(a + b) - gammln(a) - gammln(b) + a * N.log(x) + b * - N.log(1.0 - x)) - # 746 (below) is the MAX POSSIBLE BEFORE OVERFLOW - exponents = N.where(N.less(exponents, -740), -740, exponents) - bt = N.exp(exponents) - if type(x) == N.ndarray: - ans = N.where( - N.less(x, (a + 1) / (a + b + 2.0)), bt * abetacf(a, b, x, verbose) / - float(a), 1.0 - bt * abetacf(b, a, 1.0 - x, verbose) / float(b)) - else: - if x < (a + 1) / (a + b + 2.0): - ans = bt * abetacf(a, b, x, verbose) / float(a) - else: - ans = 1.0 - bt * abetacf(b, a, 1.0 - x, verbose) / float(b) - return ans - -##################################### -####### AANOVA CALCULATIONS ####### -##################################### - - import numpy.linalg, operator - LA = numpy.linalg - - def aglm(data, para): - """ -Calculates a linear model fit ... anova/ancova/lin-regress/t-test/etc. Taken -from: - Peterson et al. Statistical limitations in functional neuroimaging - I. Non-inferential methods and statistical models. Phil Trans Royal Soc - Lond B 354: 1239-1260. - -Usage: aglm(data,para) -Returns: statistic, p-value ??? -""" - if len(para) <> len(data): - print 'data and para must be same length in aglm' - return - n = len(para) - p = pstat.aunique(para) - x = N.zeros((n, len(p))) # design matrix - for l in range(len(p)): - x[:, l] = N.equal(para, p[l]) - b = N.dot( - N.dot( - LA.inv(N.dot( - N.transpose(x), x)), # i.e., b=inv(X'X)X'Y - N.transpose(x)), - data) - diffs = (data - N.dot(x, b)) - s_sq = 1. / (n - len(p)) * N.dot(N.transpose(diffs), diffs) - - if len(p) == 2: # ttest_ind - c = N.array([1, -1]) - df = n - 2 - fact = asum(1.0 / asum(x, 0)) # i.e., 1/n1 + 1/n2 + 1/n3 ... - t = N.dot(c, b) / N.sqrt(s_sq * fact) - probs = abetai(0.5 * df, 0.5, float(df) / (df + t * t)) - return t, probs - - def aF_oneway(*args): - """ -Performs a 1-way ANOVA, returning an F-value and probability given -any number of groups. From Heiman, pp.394-7. - -Usage: aF_oneway (*args) where *args is 2 or more arrays, one per - treatment group -Returns: f-value, probability -""" - na = len(args) # ANOVA on 'na' groups, each in it's own array - means = [0] * na - vars = [0] * na - ns = [0] * na - alldata = [] - tmp = map(N.array, args) - means = map(amean, tmp) - vars = map(avar, tmp) - ns = map(len, args) - alldata = N.concatenate(args) - bign = len(alldata) - sstot = ass(alldata) - (asquare_of_sums(alldata) / float(bign)) - ssbn = 0 - for a in args: - ssbn = ssbn + asquare_of_sums(N.array(a)) / float(len(a)) - ssbn = ssbn - (asquare_of_sums(alldata) / float(bign)) - sswn = sstot - ssbn - dfbn = na - 1 - dfwn = bign - na - msb = ssbn / float(dfbn) - msw = sswn / float(dfwn) - f = msb / msw - prob = fprob(dfbn, dfwn, f) - return f, prob - - def aF_value(ER, EF, dfR, dfF): - """ -Returns an F-statistic given the following: - ER = error associated with the null hypothesis (the Restricted model) - EF = error associated with the alternate hypothesis (the Full model) - dfR = degrees of freedom the Restricted model - dfF = degrees of freedom associated with the Restricted model -""" - return ((ER - EF) / float(dfR - dfF) / (EF / float(dfF))) - - def outputfstats(Enum, Eden, dfnum, dfden, f, prob): - Enum = round(Enum, 3) - Eden = round(Eden, 3) - dfnum = round(Enum, 3) - dfden = round(dfden, 3) - f = round(f, 3) - prob = round(prob, 3) - suffix = '' # for *s after the p-value - if prob < 0.001: - suffix = ' ***' - elif prob < 0.01: - suffix = ' **' - elif prob < 0.05: - suffix = ' *' - title = [['EF/ER', 'DF', 'Mean Square', 'F-value', 'prob', '']] - lofl = title + [[Enum, dfnum, round(Enum / float(dfnum), 3), f, prob, suffix - ], [Eden, dfden, round(Eden / float(dfden), 3), '', '', '']] - pstat.printcc(lofl) - return - - def F_value_multivariate(ER, EF, dfnum, dfden): - """ -Returns an F-statistic given the following: - ER = error associated with the null hypothesis (the Restricted model) - EF = error associated with the alternate hypothesis (the Full model) - dfR = degrees of freedom the Restricted model - dfF = degrees of freedom associated with the Restricted model -where ER and EF are matrices from a multivariate F calculation. -""" - if type(ER) in [IntType, FloatType]: - ER = N.array([[ER]]) - if type(EF) in [IntType, FloatType]: - EF = N.array([[EF]]) - n_um = (LA.det(ER) - LA.det(EF)) / float(dfnum) - d_en = LA.det(EF) / float(dfden) - return n_um / d_en - -##################################### -####### ASUPPORT FUNCTIONS ######## -##################################### - - def asign(a): - """ -Usage: asign(a) -Returns: array shape of a, with -1 where a<0 and +1 where a>=0 -""" - a = N.asarray(a) - if ((type(a) == type(1.4)) or (type(a) == type(1))): - return a - a - N.less(a, 0) + N.greater(a, 0) - else: - return N.zeros(N.shape(a)) - N.less(a, 0) + N.greater(a, 0) - - def asum(a, dimension=None, keepdims=0): - """ -An alternative to the Numeric.add.reduce function, which allows one to -(1) collapse over multiple dimensions at once, and/or (2) to retain -all dimensions in the original array (squashing one down to size. -Dimension can equal None (ravel array first), an integer (the -dimension over which to operate), or a sequence (operate over multiple -dimensions). If keepdims=1, the resulting array will have as many -dimensions as the input array. - -Usage: asum(a, dimension=None, keepdims=0) -Returns: array summed along 'dimension'(s), same _number_ of dims if keepdims=1 -""" - if type(a) == N.ndarray and a.dtype in [N.int_, N.short, N.ubyte]: - a = a.astype(N.float_) - if dimension == None: - s = N.sum(N.ravel(a)) - elif type(dimension) in [IntType, FloatType]: - s = N.add.reduce(a, dimension) - if keepdims == 1: - shp = list(a.shape) - shp[dimension] = 1 - s = N.reshape(s, shp) - else: # must be a SEQUENCE of dims to sum over - dims = list(dimension) - dims.sort() - dims.reverse() - s = a * 1.0 - for dim in dims: - s = N.add.reduce(s, dim) - if keepdims == 1: - shp = list(a.shape) - for dim in dims: - shp[dim] = 1 - s = N.reshape(s, shp) - return s - - def acumsum(a, dimension=None): - """ -Returns an array consisting of the cumulative sum of the items in the -passed array. Dimension can equal None (ravel array first), an -integer (the dimension over which to operate), or a sequence (operate -over multiple dimensions, but this last one just barely makes sense). - -Usage: acumsum(a,dimension=None) -""" - if dimension == None: - a = N.ravel(a) - dimension = 0 - if type(dimension) in [ListType, TupleType, N.ndarray]: - dimension = list(dimension) - dimension.sort() - dimension.reverse() - for d in dimension: - a = N.add.accumulate(a, d) - return a - else: - return N.add.accumulate(a, dimension) - - def ass(inarray, dimension=None, keepdims=0): - """ -Squares each value in the passed array, adds these squares & returns -the result. Unfortunate function name. :-) Defaults to ALL values in -the array. Dimension can equal None (ravel array first), an integer -(the dimension over which to operate), or a sequence (operate over -multiple dimensions). Set keepdims=1 to maintain the original number -of dimensions. - -Usage: ass(inarray, dimension=None, keepdims=0) -Returns: sum-along-'dimension' for (inarray*inarray) -""" - if dimension == None: - inarray = N.ravel(inarray) - dimension = 0 - return asum(inarray * inarray, dimension, keepdims) - - def asummult(array1, array2, dimension=None, keepdims=0): - """ -Multiplies elements in array1 and array2, element by element, and -returns the sum (along 'dimension') of all resulting multiplications. -Dimension can equal None (ravel array first), an integer (the -dimension over which to operate), or a sequence (operate over multiple -dimensions). A trivial function, but included for completeness. - -Usage: asummult(array1,array2,dimension=None,keepdims=0) -""" - if dimension == None: - array1 = N.ravel(array1) - array2 = N.ravel(array2) - dimension = 0 - return asum(array1 * array2, dimension, keepdims) - - def asquare_of_sums(inarray, dimension=None, keepdims=0): - """ -Adds the values in the passed array, squares that sum, and returns the -result. Dimension can equal None (ravel array first), an integer (the -dimension over which to operate), or a sequence (operate over multiple -dimensions). If keepdims=1, the returned array will have the same -NUMBER of dimensions as the original. - -Usage: asquare_of_sums(inarray, dimension=None, keepdims=0) -Returns: the square of the sum over dim(s) in dimension -""" - if dimension == None: - inarray = N.ravel(inarray) - dimension = 0 - s = asum(inarray, dimension, keepdims) - if type(s) == N.ndarray: - return s.astype(N.float_) * s - else: - return float(s) * s - - def asumdiffsquared(a, b, dimension=None, keepdims=0): - """ -Takes pairwise differences of the values in arrays a and b, squares -these differences, and returns the sum of these squares. Dimension -can equal None (ravel array first), an integer (the dimension over -which to operate), or a sequence (operate over multiple dimensions). -keepdims=1 means the return shape = len(a.shape) = len(b.shape) - -Usage: asumdiffsquared(a,b) -Returns: sum[ravel(a-b)**2] -""" - if dimension == None: - inarray = N.ravel(a) - dimension = 0 - return asum((a - b)**2, dimension, keepdims) - - def ashellsort(inarray): - """ -Shellsort algorithm. Sorts a 1D-array. - -Usage: ashellsort(inarray) -Returns: sorted-inarray, sorting-index-vector (for original array) -""" - n = len(inarray) - svec = inarray * 1.0 - ivec = range(n) - gap = n / 2 # integer division needed - while gap > 0: - for i in range(gap, n): - for j in range(i - gap, -1, -gap): - while j >= 0 and svec[j] > svec[j + gap]: - temp = svec[j] - svec[j] = svec[j + gap] - svec[j + gap] = temp - itemp = ivec[j] - ivec[j] = ivec[j + gap] - ivec[j + gap] = itemp - gap = gap / 2 # integer division needed -# svec is now sorted input vector, ivec has the order svec[i] = vec[ivec[i]] - return svec, ivec - - def arankdata(inarray): - """ -Ranks the data in inarray, dealing with ties appropritely. Assumes -a 1D inarray. Adapted from Gary Perlman's |Stat ranksort. - -Usage: arankdata(inarray) -Returns: array of length equal to inarray, containing rank scores -""" - n = len(inarray) - svec, ivec = ashellsort(inarray) - sumranks = 0 - dupcount = 0 - newarray = N.zeros(n, N.float_) - for i in range(n): - sumranks = sumranks + i - dupcount = dupcount + 1 - if i == n - 1 or svec[i] <> svec[i + 1]: - averank = sumranks / float(dupcount) + 1 - for j in range(i - dupcount + 1, i + 1): - newarray[ivec[j]] = averank - sumranks = 0 - dupcount = 0 - return newarray - - def afindwithin(data): - """ -Returns a binary vector, 1=within-subject factor, 0=between. Input -equals the entire data array (i.e., column 1=random factor, last -column = measured values. - -Usage: afindwithin(data) data in |Stat format -""" - numfact = len(data[0]) - 2 - withinvec = [0] * numfact - for col in range(1, numfact + 1): - rows = pstat.linexand(data, col, pstat.unique(pstat.colex(data, 1))[0] - ) # get 1 level of this factor - if len(pstat.unique(pstat.colex(rows, 0))) < len( - rows): # if fewer subjects than scores on this factor - withinvec[col - 1] = 1 - return withinvec - - ######################################################### - ######################################################### - ###### RE-DEFINE DISPATCHES TO INCLUDE ARRAYS ######### - ######################################################### - ######################################################### - - ## CENTRAL TENDENCY: - geometricmean = Dispatch( - (lgeometricmean, (ListType, TupleType)), (ageometricmean, (N.ndarray,))) - harmonicmean = Dispatch( - (lharmonicmean, (ListType, TupleType)), (aharmonicmean, (N.ndarray,))) - mean = Dispatch((lmean, (ListType, TupleType)), (amean, (N.ndarray,))) - median = Dispatch((lmedian, (ListType, TupleType)), (amedian, (N.ndarray,))) - medianscore = Dispatch( - (lmedianscore, (ListType, TupleType)), (amedianscore, (N.ndarray,))) - mode = Dispatch((lmode, (ListType, TupleType)), (amode, (N.ndarray,))) - tmean = Dispatch((atmean, (N.ndarray,))) - tvar = Dispatch((atvar, (N.ndarray,))) - tstdev = Dispatch((atstdev, (N.ndarray,))) - tsem = Dispatch((atsem, (N.ndarray,))) - - ## VARIATION: - moment = Dispatch((lmoment, (ListType, TupleType)), (amoment, (N.ndarray,))) - variation = Dispatch( - (lvariation, (ListType, TupleType)), (avariation, (N.ndarray,))) - skew = Dispatch((lskew, (ListType, TupleType)), (askew, (N.ndarray,))) - kurtosis = Dispatch( - (lkurtosis, (ListType, TupleType)), (akurtosis, (N.ndarray,))) - describe = Dispatch( - (ldescribe, (ListType, TupleType)), (adescribe, (N.ndarray,))) - - ## DISTRIBUTION TESTS - - skewtest = Dispatch( - (askewtest, (ListType, TupleType)), (askewtest, (N.ndarray,))) - kurtosistest = Dispatch( - (akurtosistest, (ListType, TupleType)), (akurtosistest, (N.ndarray,))) - normaltest = Dispatch( - (anormaltest, (ListType, TupleType)), (anormaltest, (N.ndarray,))) - - ## FREQUENCY STATS: - itemfreq = Dispatch( - (litemfreq, (ListType, TupleType)), (aitemfreq, (N.ndarray,))) - scoreatpercentile = Dispatch( - (lscoreatpercentile, (ListType, TupleType)), (ascoreatpercentile, - (N.ndarray,))) - percentileofscore = Dispatch( - (lpercentileofscore, (ListType, TupleType)), (apercentileofscore, - (N.ndarray,))) - histogram = Dispatch( - (lhistogram, (ListType, TupleType)), (ahistogram, (N.ndarray,))) - cumfreq = Dispatch( - (lcumfreq, (ListType, TupleType)), (acumfreq, (N.ndarray,))) - relfreq = Dispatch( - (lrelfreq, (ListType, TupleType)), (arelfreq, (N.ndarray,))) - - ## VARIABILITY: - obrientransform = Dispatch( - (lobrientransform, (ListType, TupleType)), (aobrientransform, - (N.ndarray,))) - samplevar = Dispatch( - (lsamplevar, (ListType, TupleType)), (asamplevar, (N.ndarray,))) - samplestdev = Dispatch( - (lsamplestdev, (ListType, TupleType)), (asamplestdev, (N.ndarray,))) - signaltonoise = Dispatch((asignaltonoise, (N.ndarray,)),) - var = Dispatch((lvar, (ListType, TupleType)), (avar, (N.ndarray,))) - stdev = Dispatch((lstdev, (ListType, TupleType)), (astdev, (N.ndarray,))) - sterr = Dispatch((lsterr, (ListType, TupleType)), (asterr, (N.ndarray,))) - sem = Dispatch((lsem, (ListType, TupleType)), (asem, (N.ndarray,))) - z = Dispatch((lz, (ListType, TupleType)), (az, (N.ndarray,))) - zs = Dispatch((lzs, (ListType, TupleType)), (azs, (N.ndarray,))) - - ## TRIMMING FCNS: - threshold = Dispatch((athreshold, (N.ndarray,)),) - trimboth = Dispatch( - (ltrimboth, (ListType, TupleType)), (atrimboth, (N.ndarray,))) - trim1 = Dispatch((ltrim1, (ListType, TupleType)), (atrim1, (N.ndarray,))) - - ## CORRELATION FCNS: - paired = Dispatch((lpaired, (ListType, TupleType)), (apaired, (N.ndarray,))) - lincc = Dispatch((llincc, (ListType, TupleType)), (alincc, (N.ndarray,))) - pearsonr = Dispatch( - (lpearsonr, (ListType, TupleType)), (apearsonr, (N.ndarray,))) - spearmanr = Dispatch( - (lspearmanr, (ListType, TupleType)), (aspearmanr, (N.ndarray,))) - pointbiserialr = Dispatch( - (lpointbiserialr, (ListType, TupleType)), (apointbiserialr, (N.ndarray,))) - kendalltau = Dispatch( - (lkendalltau, (ListType, TupleType)), (akendalltau, (N.ndarray,))) - linregress = Dispatch( - (llinregress, (ListType, TupleType)), (alinregress, (N.ndarray,))) - - ## INFERENTIAL STATS: - ttest_1samp = Dispatch( - (lttest_1samp, (ListType, TupleType)), (attest_1samp, (N.ndarray,))) - ttest_ind = Dispatch( - (lttest_ind, (ListType, TupleType)), (attest_ind, (N.ndarray,))) - ttest_rel = Dispatch( - (lttest_rel, (ListType, TupleType)), (attest_rel, (N.ndarray,))) - chisquare = Dispatch( - (lchisquare, (ListType, TupleType)), (achisquare, (N.ndarray,))) - ks_2samp = Dispatch( - (lks_2samp, (ListType, TupleType)), (aks_2samp, (N.ndarray,))) - mannwhitneyu = Dispatch( - (lmannwhitneyu, (ListType, TupleType)), (amannwhitneyu, (N.ndarray,))) - tiecorrect = Dispatch( - (ltiecorrect, (ListType, TupleType)), (atiecorrect, (N.ndarray,))) - ranksums = Dispatch( - (lranksums, (ListType, TupleType)), (aranksums, (N.ndarray,))) - wilcoxont = Dispatch( - (lwilcoxont, (ListType, TupleType)), (awilcoxont, (N.ndarray,))) - kruskalwallish = Dispatch( - (lkruskalwallish, (ListType, TupleType)), (akruskalwallish, (N.ndarray,))) - friedmanchisquare = Dispatch( - (lfriedmanchisquare, (ListType, TupleType)), (afriedmanchisquare, - (N.ndarray,))) - - ## PROBABILITY CALCS: - chisqprob = Dispatch( - (lchisqprob, (IntType, FloatType)), (achisqprob, (N.ndarray,))) - zprob = Dispatch((lzprob, (IntType, FloatType)), (azprob, (N.ndarray,))) - ksprob = Dispatch((lksprob, (IntType, FloatType)), (aksprob, (N.ndarray,))) - fprob = Dispatch((lfprob, (IntType, FloatType)), (afprob, (N.ndarray,))) - betacf = Dispatch((lbetacf, (IntType, FloatType)), (abetacf, (N.ndarray,))) - betai = Dispatch((lbetai, (IntType, FloatType)), (abetai, (N.ndarray,))) - erfcc = Dispatch((lerfcc, (IntType, FloatType)), (aerfcc, (N.ndarray,))) - gammln = Dispatch((lgammln, (IntType, FloatType)), (agammln, (N.ndarray,))) - - ## ANOVA FUNCTIONS: - F_oneway = Dispatch( - (lF_oneway, (ListType, TupleType)), (aF_oneway, (N.ndarray,))) - F_value = Dispatch( - (lF_value, (ListType, TupleType)), (aF_value, (N.ndarray,))) - - ## SUPPORT FUNCTIONS: - incr = Dispatch((lincr, (ListType, TupleType, N.ndarray)),) - sum = Dispatch((lsum, (ListType, TupleType)), (asum, (N.ndarray,))) - cumsum = Dispatch((lcumsum, (ListType, TupleType)), (acumsum, (N.ndarray,))) - ss = Dispatch((lss, (ListType, TupleType)), (ass, (N.ndarray,))) - summult = Dispatch( - (lsummult, (ListType, TupleType)), (asummult, (N.ndarray,))) - square_of_sums = Dispatch( - (lsquare_of_sums, (ListType, TupleType)), (asquare_of_sums, (N.ndarray,))) - sumdiffsquared = Dispatch( - (lsumdiffsquared, (ListType, TupleType)), (asumdiffsquared, (N.ndarray,))) - shellsort = Dispatch( - (lshellsort, (ListType, TupleType)), (ashellsort, (N.ndarray,))) - rankdata = Dispatch( - (lrankdata, (ListType, TupleType)), (arankdata, (N.ndarray,))) - findwithin = Dispatch( - (lfindwithin, (ListType, TupleType)), (afindwithin, (N.ndarray,))) - -###################### END OF NUMERIC FUNCTION BLOCK ##################### - -###################### END OF STATISTICAL FUNCTIONS ###################### - -except ImportError: - pass diff --git a/cros_utils/tabulator.py b/cros_utils/tabulator.py index 59e4d426..9527fde0 100644 --- a/cros_utils/tabulator.py +++ b/cros_utils/tabulator.py @@ -61,12 +61,16 @@ table: print tp.Print() """ +from __future__ import division from __future__ import print_function import getpass import math import sys +# TODO(zhizhouy): Drop numpy in the future +# pylint: disable=import-error import numpy +import scipy from email_sender import EmailSender import misc @@ -162,7 +166,7 @@ class TableGenerator(object): keys = self._AggregateKeys() return self._SortKeys(keys) - def GetTable(self, number_of_rows=sys.maxint): + def GetTable(self, number_of_rows=sys.maxsize): """Returns a table from a list of list of dicts. Examples: @@ -239,7 +243,7 @@ class SamplesTableGenerator(TableGenerator): keys = self._runs.keys() return self._SortKeys(keys) - def GetTable(self, number_of_rows=sys.maxint): + def GetTable(self, number_of_rows=sys.maxsize): """Returns a tuple, which contains three args: 1) a table from a list of list of dicts. @@ -660,8 +664,7 @@ class PValueResult(ComparisonResult): if len(values) < 2 or len(baseline_values) < 2: cell.value = float('nan') return - import stats - _, cell.value = stats.lttest_ind(values, baseline_values) + _, cell.value = scipy.stats.ttest_ind(values, baseline_values) def _ComputeString(self, cell, values, baseline_values): return float('nan') @@ -877,7 +880,7 @@ class WeightFormat(Format): class StorageFormat(Format): """Format the cell as a storage number. - Example: + Examples: If the cell contains a value of 1024, the string_value will be 1.0K. """ @@ -899,7 +902,7 @@ class StorageFormat(Format): class CoeffVarFormat(Format): """Format the cell as a percent. - Example: + Examples: If the cell contains a value of 1.5, the string_value will be +150%. """ @@ -917,7 +920,7 @@ class CoeffVarFormat(Format): class PercentFormat(Format): """Format the cell as a percent. - Example: + Examples: If the cell contains a value of 1.5, the string_value will be +50%. """ @@ -930,7 +933,7 @@ class PercentFormat(Format): class RatioFormat(Format): """Format the cell as a ratio. - Example: + Examples: If the cell contains a value of 1.5642, the string_value will be 1.56. """ @@ -943,7 +946,7 @@ class RatioFormat(Format): class ColorBoxFormat(Format): """Format the cell as a color box. - Example: + Examples: If the cell contains a value of 1.5, it will get a green color. If the cell contains a value of 0.5, it will get a red color. The intensity of the green/red will be determined by how much above or below @@ -989,7 +992,7 @@ class Cell(object): # Entire row inherits this color. self.color_row = False self.bgcolor_row = False - self.width = None + self.width = 0 self.colspan = 1 self.name = None self.header = False @@ -1297,7 +1300,7 @@ class TablePrinter(object): suffix = '\033[0m' elif self._output_type in [self.EMAIL, self.HTML]: rgb = color.GetRGB() - prefix = ("<FONT style=\"BACKGROUND-COLOR:#{0}\">".format(rgb)) + prefix = ('<FONT style="BACKGROUND-COLOR:#{0}">'.format(rgb)) suffix = '</FONT>' elif self._output_type in [self.PLAIN, self.TSV]: prefix = '' @@ -1365,7 +1368,7 @@ class TablePrinter(object): tag = 'th' else: tag = 'td' - out = "<{0} colspan = \"{2}\"> {1} </{0}>".format(tag, out, cell.colspan) + out = '<{0} colspan = "{2}"> {1} </{0}>'.format(tag, out, cell.colspan) return out @@ -1387,7 +1390,7 @@ class TablePrinter(object): if self._output_type in [self.PLAIN, self.CONSOLE, self.TSV, self.EMAIL]: return '' if self._output_type == self.HTML: - return "<p></p><table id=\"box-table-a\">\n<tr>" + return '<p></p><table id="box-table-a">\n<tr>' def _GetSuffix(self): if self._output_type in [self.PLAIN, self.CONSOLE, self.TSV, self.EMAIL]: diff --git a/cros_utils/tabulator_test.py b/cros_utils/tabulator_test.py index 33c8da25..4c0f8677 100644..100755 --- a/cros_utils/tabulator_test.py +++ b/cros_utils/tabulator_test.py @@ -1,7 +1,9 @@ +#!/usr/bin/env python3 # -*- coding: utf-8 -*- # Copyright (c) 2012 The Chromium OS Authors. All rights reserved. # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. + """Tests for the tabulator module.""" from __future__ import print_function @@ -71,7 +73,7 @@ class TabulatorTest(unittest.TestCase): a = [1.0e+308] * 3 # pylint: disable=protected-access b = tabulator.Result()._GetGmean(a) - self.assertTrue(b >= 0.99e+308 and b <= 1.01e+308) + self.assertTrue(0.99e+308 <= b <= 1.01e+308) def testIgnoreMinMax(self): amr = tabulator.AmeanResult(ignore_min_max=True) diff --git a/cros_utils/timeline.py b/cros_utils/timeline.py index 873aaa30..cce0b05c 100644 --- a/cros_utils/timeline.py +++ b/cros_utils/timeline.py @@ -1,5 +1,8 @@ -# Copyright 2012 Google Inc. All Rights Reserved. -# +# -*- coding: utf-8 -*- +# Copyright 2019 The Chromium OS Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + """Tools for recording and reporting timeline of benchmark_run.""" from __future__ import print_function @@ -25,8 +28,8 @@ class Timeline(object): def Record(self, event): for e in self.events: - assert e.name != event, ('The event {0} is already recorded.' - .format(event)) + assert e.name != event, ( + 'The event {0} is already recorded.'.format(event)) cur_event = Event(name=event, cur_time=time.time()) self.events.append(cur_event) @@ -43,7 +46,7 @@ class Timeline(object): for e in self.events: if e.name == event: return e.timestamp - raise IndexError, 'The event {0} is not recorded'.format(event) + raise IndexError('The event {0} is not recorded'.format(event)) def GetLastEventTime(self): return self.events[-1].timestamp diff --git a/cros_utils/timeline_test.py b/cros_utils/timeline_test.py index c93a1274..314cf3c4 100644..100755 --- a/cros_utils/timeline_test.py +++ b/cros_utils/timeline_test.py @@ -1,4 +1,9 @@ -# Copyright 2012 Google Inc. All Rights Reserved. +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright 2019 The Chromium OS Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + """Tests for time_line.py.""" from __future__ import print_function |