diff options
31 files changed, 6565 insertions, 521 deletions
diff --git a/crosperf/autotest_runner.py b/crosperf/autotest_runner.py index 80fb2a24..5611b655 100644 --- a/crosperf/autotest_runner.py +++ b/crosperf/autotest_runner.py @@ -15,11 +15,15 @@ class AutotestRunner(object): def Run(self, machine_name, chromeos_root, board, autotest_name, autotest_args): + """Run the run_remote_test.""" options = "" if board: options += " --board=%s" % board if autotest_args: options += " %s" % autotest_args + command = "rm -rf /usr/local/autotest/results/*" + self._ce.CrosRunCommand(command, machine=machine_name, username="root", + chromeos_root=chromeos_root) command = ("./run_remote_tests.sh --remote=%s %s %s" % (machine_name, options, autotest_name)) return self._ce.ChrootRunCommand(chromeos_root, command, True, self._ct) diff --git a/crosperf/benchmark.py b/crosperf/benchmark.py index fa12d934..a75bd8e3 100644 --- a/crosperf/benchmark.py +++ b/crosperf/benchmark.py @@ -14,11 +14,10 @@ class Benchmark(object): """ def __init__(self, name, autotest_name, autotest_args, iterations, - outlier_range, profile_counters, profile_type): + outlier_range, perf_args): self.name = name self.autotest_name = autotest_name self.autotest_args = autotest_args self.iterations = iterations self.outlier_range = outlier_range - self.profile_counters = profile_counters - self.profile_type = profile_type + self.perf_args = perf_args diff --git a/crosperf/benchmark_run.py b/crosperf/benchmark_run.py index c20b24e0..7579b6c2 100644 --- a/crosperf/benchmark_run.py +++ b/crosperf/benchmark_run.py @@ -8,12 +8,12 @@ import re import threading import time import traceback -from results_cache import Result -from utils import logger -from utils import command_executer + from autotest_runner import AutotestRunner +from results_cache import Result from results_cache import ResultsCache - +from utils import command_executer +from utils import logger STATUS_FAILED = "FAILED" STATUS_SUCCEEDED = "SUCCEEDED" @@ -26,7 +26,7 @@ STATUS_PENDING = "PENDING" class BenchmarkRun(threading.Thread): def __init__(self, name, benchmark_name, autotest_name, autotest_args, label_name, chromeos_root, chromeos_image, board, iteration, - cache_conditions, outlier_range, profile_counters, profile_type, + cache_conditions, outlier_range, perf_args, machine_manager, logger_to_use): threading.Thread.__init__(self) @@ -45,8 +45,7 @@ class BenchmarkRun(threading.Thread): self.status = STATUS_PENDING self.run_completed = False self.outlier_range = outlier_range - self.profile_counters = profile_counters - self.profile_type = profile_type + self.perf_args = perf_args self.machine_manager = machine_manager self.cache = ResultsCache() self.autotest_runner = AutotestRunner(self._logger) @@ -68,10 +67,11 @@ class BenchmarkRun(threading.Thread): self.autotest_name, self.iteration, self.autotest_args, - self.machine_manager.GetMachines()[0].name, + self.machine_manager, self.board, self.cache_conditions, - self._logger) + self._logger, + ) self.result = self.cache.ReadResult() self.cache_hit = (self.result is not None) @@ -134,13 +134,12 @@ class BenchmarkRun(threading.Thread): return machine def _GetExtraAutotestArgs(self): - if self.profile_type: - if self.profile_type == "record": - perf_args = "record -a -e %s" % ",".join(self.profile_counters) - elif self.profile_type == "stat": - perf_args = "stat -a" - else: - raise Exception("profile_type must be either record or stat") + if self.perf_args: + perf_args_list = self.perf_args.split(" ") + perf_args_list = [perf_args_list[0]] + ["-a"] + perf_args_list[1:] + perf_args = " ".join(perf_args_list) + if not perf_args_list[0] in ["record", "stat"]: + raise Exception("perf_args must start with either record or stat") extra_autotest_args = ["--profiler=custom_perf", ("--profiler_args='perf_options=\"%s\"'" % perf_args)] diff --git a/crosperf/experiment.py b/crosperf/experiment.py index 3f68f8be..7b48344c 100644 --- a/crosperf/experiment.py +++ b/crosperf/experiment.py @@ -18,7 +18,7 @@ class Experiment(object): def __init__(self, name, remote, rerun_if_failed, working_directory, chromeos_root, cache_conditions, labels, benchmarks, - experiment_file): + experiment_file, email_to): self.name = name self.rerun_if_failed = rerun_if_failed self.working_directory = working_directory @@ -26,6 +26,7 @@ class Experiment(object): self.chromeos_root = chromeos_root self.cache_conditions = cache_conditions self.experiment_file = experiment_file + self.email_to = email_to self.results_directory = os.path.join(self.working_directory, self.name + "_results") @@ -48,6 +49,8 @@ class Experiment(object): for machine in remote: self.machine_manager.AddMachine(machine) + self.machine_manager.ComputeCommonCheckSum() + self.machine_manager.ComputeCommonCheckSumString() self.start_time = None self.benchmark_runs = self._GenerateBenchmarkRuns() @@ -76,8 +79,7 @@ class Experiment(object): iteration, self.cache_conditions, benchmark.outlier_range, - benchmark.profile_counters, - benchmark.profile_type, + benchmark.perf_args, self.machine_manager, logger_to_use) diff --git a/crosperf/experiment_factory.py b/crosperf/experiment_factory.py index d3c717ae..5c21179e 100644 --- a/crosperf/experiment_factory.py +++ b/crosperf/experiment_factory.py @@ -33,7 +33,7 @@ class ExperimentFactory(object): if global_settings.GetField("rerun"): cache_conditions.append(CacheConditions.FALSE) if global_settings.GetField("exact_remote"): - cache_conditions.append(CacheConditions.REMOTES_MATCH) + cache_conditions.append(CacheConditions.MACHINES_MATCH) # Construct benchmarks. benchmarks = [] @@ -46,11 +46,9 @@ class ExperimentFactory(object): autotest_args = benchmark_settings.GetField("autotest_args") iterations = benchmark_settings.GetField("iterations") outlier_range = benchmark_settings.GetField("outlier_range") - profile_counters = benchmark_settings.GetField("profile_counters") - profile_type = benchmark_settings.GetField("profile_type") + perf_args = benchmark_settings.GetField("perf_args") benchmark = Benchmark(benchmark_name, autotest_name, autotest_args, - iterations, outlier_range, profile_counters, - profile_type) + iterations, outlier_range, perf_args) benchmarks.append(benchmark) # Construct labels. @@ -64,9 +62,12 @@ class ExperimentFactory(object): label = Label(label_name, image, chromeos_root, board) labels.append(label) + email = global_settings.GetField("email") + experiment = Experiment(experiment_name, remote, rerun_if_failed, working_directory, chromeos_root, cache_conditions, labels, benchmarks, - experiment_file.Canonicalize()) + experiment_file.Canonicalize(), + email) return experiment diff --git a/crosperf/experiment_file.py b/crosperf/experiment_file.py index bde2a4d7..fc0f16ab 100644 --- a/crosperf/experiment_file.py +++ b/crosperf/experiment_file.py @@ -4,6 +4,7 @@ # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. +import os.path import re from settings import Settings from settings_factory import SettingsFactory @@ -143,6 +144,11 @@ class ExperimentFile(object): field = settings.fields[field_name] if field.assigned: res += "\t%s: %s\n" % (field.name, field.GetString()) + if field.name == "chromeos_image": + real_file = (os.path.realpath + (os.path.expanduser(field.GetString()))) + if real_file != field.GetString(): + res += "\t#actual_image: %s\n" % real_file res += "}\n\n" return res diff --git a/crosperf/experiment_files/aes_perf b/crosperf/experiment_files/aes_perf index b298e91c..0c54ccbd 100644 --- a/crosperf/experiment_files/aes_perf +++ b/crosperf/experiment_files/aes_perf @@ -1,8 +1,7 @@ # This experiment just runs a short autotest which measures the performance of # aes encryption. In addition, it profiles -profile_type: record -profile_counters: instructions cycles +profile_args: record -e cycles -e instructions benchmark: platform_AesThroughput { } diff --git a/crosperf/experiment_files/bloat_perf b/crosperf/experiment_files/bloat_perf index a95d2cbf..f8258ee1 100644 --- a/crosperf/experiment_files/bloat_perf +++ b/crosperf/experiment_files/bloat_perf @@ -1,5 +1,4 @@ -profile_type: record -profile_counters: cycles instructions +perf_args: record -e cycles benchmark: bloat { autotest_name: desktopui_PyAutoPerfTests diff --git a/crosperf/experiment_files/morejs_perf b/crosperf/experiment_files/morejs_perf index d7ab45bb..a02f15f5 100644 --- a/crosperf/experiment_files/morejs_perf +++ b/crosperf/experiment_files/morejs_perf @@ -1,5 +1,4 @@ -profile_type: record -profile_counters: cycles instructions +perf_args: record -e cycles benchmark: morejs { autotest_name: desktopui_PyAutoPerfTests diff --git a/crosperf/experiment_files/page_cycler_perf b/crosperf/experiment_files/page_cycler_perf index 866fb751..7f5e7118 100644 --- a/crosperf/experiment_files/page_cycler_perf +++ b/crosperf/experiment_files/page_cycler_perf @@ -1,7 +1,6 @@ # This experiment profiles all page cyclers. -profile_type: record -profile_counters: cycles +perf_args: record -e cycles benchmark: morejs { autotest_name: desktopui_PyAutoPerfTests diff --git a/crosperf/experiment_files/toolchain b/crosperf/experiment_files/toolchain index c6790505..9156998b 100644 --- a/crosperf/experiment_files/toolchain +++ b/crosperf/experiment_files/toolchain @@ -5,11 +5,12 @@ benchmark: bvt { } benchmark: suite_Smoke { - auotest_name: suite:smoke + autotest_name: suite:smoke } benchmark: PyAutoPerfTests { } -benchmark: AndroidBench { +benchmark: BootPerfServer { + autotest_name: ^server/site_tests/platform_BootPerfServer/control$ } diff --git a/crosperf/experiment_runner.py b/crosperf/experiment_runner.py index c5cd1ada..4219c435 100644 --- a/crosperf/experiment_runner.py +++ b/crosperf/experiment_runner.py @@ -53,7 +53,7 @@ class ExperimentRunner(object): if not benchmark_run.cache_hit: send_mail = True break - if not send_mail: + if not send_mail and not experiment.email_to: return label_names = [] @@ -61,11 +61,12 @@ class ExperimentRunner(object): label_names.append(label.name) subject = "%s: %s" % (experiment.name, " vs. ".join(label_names)) - text_report = TextResultsReport(experiment).GetReport() + text_report = TextResultsReport(experiment, True).GetReport() text_report = "<pre style='font-size: 13px'>%s</pre>" % text_report html_report = HTMLResultsReport(experiment).GetReport() attachment = EmailSender.Attachment("report.html", html_report) - EmailSender().SendEmail([getpass.getuser()], + email_to = [getpass.getuser()] + experiment.email_to + EmailSender().SendEmail(email_to, subject, text_report, attachments=[attachment], @@ -89,10 +90,11 @@ class ExperimentRunner(object): self.l.LogOutput("Storing results of each benchmark run.") for benchmark_run in experiment.benchmark_runs: - benchmark_run_name = filter(str.isalnum, benchmark_run.name) - benchmark_run_path = os.path.join(results_directory, - benchmark_run_name) - benchmark_run.result.CopyResultsTo(benchmark_run_path) + if benchmark_run.result: + benchmark_run_name = filter(str.isalnum, benchmark_run.name) + benchmark_run_path = os.path.join(results_directory, + benchmark_run_name) + benchmark_run.result.CopyResultsTo(benchmark_run_path) def Run(self): self._Run(self._experiment) diff --git a/crosperf/machine_manager.py b/crosperf/machine_manager.py index eb0b7539..8562e929 100644 --- a/crosperf/machine_manager.py +++ b/crosperf/machine_manager.py @@ -1,9 +1,12 @@ +import hashlib +import image_chromeos +import lock_machine +import math +import os.path import sys import threading import time from image_checksummer import ImageChecksummer -import image_chromeos -import lock_machine from utils import command_executer from utils import logger from utils.file_utils import FileUtils @@ -12,13 +15,90 @@ CHECKSUM_FILE = "/usr/local/osimage_checksum_file" class CrosMachine(object): - def __init__(self, name): + def __init__(self, name, chromeos_root): self.name = name self.image = None self.checksum = None self.locked = False self.released_time = time.time() self.autotest_run = None + self.chromeos_root = chromeos_root + self._GetMemoryInfo() + self._GetCPUInfo() + self._ComputeMachineChecksumString() + self._ComputeMachineChecksum() + + def _ParseMemoryInfo(self): + line = self.meminfo.splitlines()[0] + usable_kbytes = int(line.split()[1]) + # This code is from src/third_party/autotest/files/client/bin/base_utils.py + # usable_kbytes is system's usable DRAM in kbytes, + # as reported by memtotal() from device /proc/meminfo memtotal + # after Linux deducts 1.5% to 9.5% for system table overhead + # Undo the unknown actual deduction by rounding up + # to next small multiple of a big power-of-two + # eg 12GB - 5.1% gets rounded back up to 12GB + mindeduct = 0.005 # 0.5 percent + maxdeduct = 0.095 # 9.5 percent + # deduction range 1.5% .. 9.5% supports physical mem sizes + # 6GB .. 12GB in steps of .5GB + # 12GB .. 24GB in steps of 1 GB + # 24GB .. 48GB in steps of 2 GB ... + # Finer granularity in physical mem sizes would require + # tighter spread between min and max possible deductions + + # increase mem size by at least min deduction, without rounding + min_kbytes = int(usable_kbytes / (1.0 - mindeduct)) + # increase mem size further by 2**n rounding, by 0..roundKb or more + round_kbytes = int(usable_kbytes / (1.0 - maxdeduct)) - min_kbytes + # find least binary roundup 2**n that covers worst-cast roundKb + mod2n = 1 << int(math.ceil(math.log(round_kbytes, 2))) + # have round_kbytes <= mod2n < round_kbytes*2 + # round min_kbytes up to next multiple of mod2n + phys_kbytes = min_kbytes + mod2n - 1 + phys_kbytes -= phys_kbytes % mod2n # clear low bits + self.phys_kbytes = phys_kbytes + + def _GetMemoryInfo(self): + #TODO yunlian: when the machine in rebooting, it will not return + #meminfo, the assert does not catch it either + ce = command_executer.GetCommandExecuter() + command = "cat /proc/meminfo" + ret, self.meminfo, _ = ce.CrosRunCommand( + command, return_output=True, + machine=self.name, username="root", chromeos_root=self.chromeos_root) + assert ret == 0, "Could not get meminfo from machine: %s" % self.name + if ret == 0: + self._ParseMemoryInfo() + + #cpuinfo format is different across architecture + #need to find a better way to parse it. + def _ParseCPUInfo(self,cpuinfo): + return 0 + + def _GetCPUInfo(self): + ce = command_executer.GetCommandExecuter() + command = "cat /proc/cpuinfo" + ret, self.cpuinfo, _ = ce.CrosRunCommand( + command, return_output=True, + machine=self.name, username="root", chromeos_root=self.chromeos_root) + assert ret == 0, "Could not get cpuinfo from machine: %s" % self.name + if ret == 0: + self._ParseCPUInfo(self.cpuinfo) + + def _ComputeMachineChecksumString(self): + self.checksum_string = "" + exclude_lines_list = ["MHz", "BogoMIPS", "bogomips"] + for line in self.cpuinfo.splitlines(): + if not any([e in line for e in exclude_lines_list]): + self.checksum_string += line + self.checksum_string += " " + str(self.phys_kbytes) + + def _ComputeMachineChecksum(self): + if self.checksum_string: + self.machine_checksum = hashlib.md5(self.checksum_string).hexdigest() + else: + self.machine_checksum = "" def __str__(self): l = [] @@ -38,7 +118,10 @@ class MachineManager(object): self.image_lock = threading.Lock() self.num_reimages = 0 self.chromeos_root = None - self.no_lock = False + if os.path.isdir(lock_machine.FileLock.LOCKS_DIR): + self.no_lock = False + else: + self.no_lock = True self.initialized = False self.chromeos_root = chromeos_root @@ -69,6 +152,20 @@ class MachineManager(object): return retval + def ComputeCommonCheckSum(self): + self.machine_checksum = "" + for machine in self.GetMachines(): + if machine.machine_checksum: + self.machine_checksum = machine.machine_checksum + break + + def ComputeCommonCheckSumString(self): + self.machine_checksum_string = "" + for machine in self.GetMachines(): + if machine.checksum_string: + self.machine_checksum_string = machine.checksum_string + break + def _TryToLockMachine(self, cros_machine): with self._lock: assert cros_machine, "Machine can't be None" @@ -96,7 +193,14 @@ class MachineManager(object): with self._lock: for m in self._all_machines: assert m.name != machine_name, "Tried to double-add %s" % machine_name - self._all_machines.append(CrosMachine(machine_name)) + cm = CrosMachine(machine_name, self.chromeos_root) + assert cm.machine_checksum, ("Could not find checksum for machine %s" % + machine_name) + self._all_machines.append(cm) + + def AreAllMachineSame(self): + checksums = [m.machine_checksum for m in self.GetMachines()] + return len(set(checksums)) == 1 def AcquireMachine(self, chromeos_image): image_checksum = ImageChecksummer().Checksum(chromeos_image) @@ -109,12 +213,15 @@ class MachineManager(object): for m in self._all_machines: m.released_time = time.time() + if not self.AreAllMachineSame(): + logger.GetLogger().LogFatal("-- not all the machine are identical") if not self._machines: machine_names = [] for machine in self._all_machines: machine_names.append(machine.name) - raise Exception("Could not acquire any of the following machines: '%s'" - % ", ".join(machine_names)) + logger.GetLogger().LogFatal("Could not acquire any of the" + "following machines: '%s'" + % ", ".join(machine_names)) ### for m in self._machines: ### if (m.locked and time.time() - m.released_time < 10 and diff --git a/crosperf/results_cache.py b/crosperf/results_cache.py index 6fdca550..1c33e720 100644 --- a/crosperf/results_cache.py +++ b/crosperf/results_cache.py @@ -10,14 +10,15 @@ import pickle import re import tempfile -from image_checksummer import ImageChecksummer from utils import command_executer from utils import logger from utils import misc +from image_checksummer import ImageChecksummer SCRATCH_DIR = "/home/%s/cros_scratch" % getpass.getuser() RESULTS_FILE = "results.txt" +MACHINE_FILE = "machine.txt" AUTOTEST_TARBALL = "autotest.tbz2" PERF_RESULTS_FILE = "perf-results.txt" @@ -65,7 +66,7 @@ class Result(object): [ret, out, err] = self._ce.RunCommand(command, return_output=True) keyvals_dict = {} for line in out.splitlines(): - tokens = line.split(",") + tokens = re.split("=|,", line) key = tokens[-2] if key.startswith(self.results_dir): key = key[len(self.results_dir) + 1:] @@ -109,6 +110,7 @@ class Result(object): chroot_perf_report_file = misc.GetInsideChrootPath(self._chromeos_root, perf_report_file) command = ("/usr/sbin/perf report " + "-n " "--symfs /build/%s " "--vmlinux /build/%s/usr/lib/debug/boot/vmlinux " "--kallsyms /build/%s/boot/System.map-* " @@ -186,6 +188,7 @@ class Result(object): self.perf_data_files = self._GetPerfDataFiles() self.perf_report_files = self._GetPerfReportFiles() self._ProcessResults() + self.CleanUp() def CleanUp(self): if self._temp_dir: @@ -193,7 +196,7 @@ class Result(object): self._ce.RunCommand(command) - def StoreToCacheDir(self, cache_dir): + def StoreToCacheDir(self, cache_dir, machine_manager): # Create the dir if it doesn't exist. command = "mkdir -p %s" % cache_dir ret = self._ce.RunCommand(command) @@ -206,10 +209,19 @@ class Result(object): pickle.dump(self.retval, f) tarball = os.path.join(cache_dir, AUTOTEST_TARBALL) - command = ("cd %s && tar cjf %s ." % (self.results_dir, tarball)) + command = ("cd %s && " + "tar " + "--exclude=var/spool " + "--exclude=var/log " + "-cjf %s ." % (self.results_dir, tarball)) ret = self._ce.RunCommand(command) if ret: raise Exception("Couldn't store autotest output directory.") + # Store machine info. + # TODO(asharif): Make machine_manager a singleton, and don't pass it into + # this function. + with open(os.path.join(cache_dir, MACHINE_FILE), "w") as f: + f.write(machine_manager.machine_checksum_string) @classmethod def CreateFromRun(cls, logger, chromeos_root, board, out, err, retval): @@ -232,8 +244,9 @@ class CacheConditions(object): # Cache hit only if the result file exists. CACHE_FILE_EXISTS = 0 - # Cache hit if the ip address of the cached result and the new run match. - REMOTES_MATCH = 1 + # Cache hit if the checksum of cpuinfo and totalmem of + # the cached result and the new run match. + MACHINES_MATCH = 1 # Cache hit if the image checksum of the cached result and the new run match. CHECKSUMS_MATCH = 2 @@ -253,18 +266,19 @@ class ResultsCache(object): is exactly stored (value). The value generation is handled by the Results class. """ - CACHE_VERSION = 3 + CACHE_VERSION = 5 + def Init(self, chromeos_image, chromeos_root, autotest_name, iteration, - autotest_args, remote, board, cache_conditions, + autotest_args, machine_manager, board, cache_conditions, logger_to_use): self.chromeos_image = chromeos_image self.chromeos_root = chromeos_root self.autotest_name = autotest_name self.iteration = iteration self.autotest_args = autotest_args, - self.remote = remote self.board = board self.cache_conditions = cache_conditions + self.machine_manager = machine_manager self._logger = logger_to_use self._ce = command_executer.GetCommandExecuter(self._logger) @@ -291,10 +305,10 @@ class ResultsCache(object): return cache_path def _GetCacheKeyList(self, read): - if read and CacheConditions.REMOTES_MATCH not in self.cache_conditions: - remote = "*" + if read and CacheConditions.MACHINES_MATCH not in self.cache_conditions: + machine_checksum = "*" else: - remote = self.remote + machine_checksum = self.machine_manager.machine_checksum if read and CacheConditions.CHECKSUMS_MATCH not in self.cache_conditions: checksum = "*" else: @@ -307,12 +321,11 @@ class ResultsCache(object): autotest_args_checksum = hashlib.md5( "".join(self.autotest_args)).hexdigest() - return (image_path_checksum, self.autotest_name, str(self.iteration), autotest_args_checksum, checksum, - remote, + machine_checksum, str(self.CACHE_VERSION)) def ReadResult(self): @@ -342,7 +355,7 @@ class ResultsCache(object): def StoreResult(self, result): cache_dir = self._GetCacheDirForWrite() - result.StoreToCacheDir(cache_dir) + result.StoreToCacheDir(cache_dir, self.machine_manager) class MockResultsCache(object): diff --git a/crosperf/results_columns.py b/crosperf/results_columns.py deleted file mode 100644 index 09e97d0b..00000000 --- a/crosperf/results_columns.py +++ /dev/null @@ -1,152 +0,0 @@ -#!/usr/bin/python - -# Copyright 2011 Google Inc. All Rights Reserved. - -import math - - -class Column(object): - def __init__(self, name): - self.name = name - - def _ContainsString(self, results): - for result in results: - if isinstance(result, str): - return True - return False - - def _StripNone(self, results): - res = [] - for result in results: - if result is not None: - res.append(result) - return res - - -class MinColumn(Column): - def Compute(self, results, baseline_results): - if self._ContainsString(results): - return "-" - results = self._StripNone(results) - if not results: - return "-" - return min(results) - - -class MaxColumn(Column): - def Compute(self, results, baseline_results): - if self._ContainsString(results): - return "-" - results = self._StripNone(results) - if not results: - return "-" - return max(results) - - -class MeanColumn(Column): - def Compute(self, results, baseline_results): - all_pass = True - all_fail = True - if self._ContainsString(results): - for result in results: - if result != "PASSED": - all_pass = False - if result != "FAILED": - all_fail = False - - if all_pass: - return "ALL PASS" - elif all_fail: - return "ALL FAIL" - else: - return "-" - - results = self._StripNone(results) - if not results: - return "-" - return float(sum(results)) / len(results) - - -class StandardDeviationColumn(Column): - def __init__(self, name): - super(StandardDeviationColumn, self).__init__(name) - - def Compute(self, results, baseline_results): - if self._ContainsString(results): - return "-" - - results = self._StripNone(results) - if not results: - return "-" - n = len(results) - average = sum(results) / n - total = 0 - for result in results: - total += (result - average) ** 2 - - return math.sqrt(total / n) - - -class RatioColumn(Column): - def __init__(self, name): - super(RatioColumn, self).__init__(name) - - def Compute(self, results, baseline_results): - if self._ContainsString(results) or self._ContainsString(baseline_results): - return "-" - - results = self._StripNone(results) - baseline_results = self._StripNone(baseline_results) - if not results or not baseline_results: - return "-" - result_mean = sum(results) / len(results) - baseline_mean = sum(baseline_results) / len(baseline_results) - - if not baseline_mean: - return "-" - - return result_mean / baseline_mean - - -class DeltaColumn(Column): - def __init__(self, name): - super(DeltaColumn, self).__init__(name) - - def Compute(self, results, baseline_results): - if self._ContainsString(results) or self._ContainsString(baseline_results): - return "-" - - results = self._StripNone(results) - baseline_results = self._StripNone(baseline_results) - if not results or not baseline_results: - return "-" - result_mean = sum(results) / len(results) - baseline_mean = sum(baseline_results) / len(baseline_results) - - if not baseline_mean: - return "-" - - res = 100 * (result_mean - baseline_mean) / baseline_mean - return res - - -class IterationsCompleteColumn(Column): - def __init__(self, name): - super(IterationsCompleteColumn, self).__init__(name) - - def Compute(self, results, baseline_results): - return len(self._StripNone(results)) - - -class IterationColumn(Column): - def __init__(self, name, iteration): - super(IterationColumn, self).__init__(name) - self.iteration = iteration - - def Compute(self, results, baseline_results): - if self.iteration > len(results): - return "" - res = results[self.iteration - 1] - if not res: - return "-" - return res diff --git a/crosperf/results_organizer.py b/crosperf/results_organizer.py new file mode 100644 index 00000000..0071387b --- /dev/null +++ b/crosperf/results_organizer.py @@ -0,0 +1,42 @@ +#!/usr/bin/python + +# Copyright 2012 Google Inc. All Rights Reserved. + + +class ResultOrganizer(object): + """Create a dict from benchmark_runs. + + The structure of the output dict is as follows: + {"benchmark_1":[ + [{"key1":"v1", "key2":"v2"},{"key1":"v1", "key2","v2"}] + #one label + [] + #the other label + ] + "benchmark_2": + [ + ]}. + """ + + def __init__(self, benchmark_runs, labels): + self.result = {} + self.labels = [] + for label in labels: + self.labels.append(label.name) + for benchmark_run in benchmark_runs: + benchmark_name = benchmark_run.benchmark_name + if benchmark_name not in self.result: + self.result[benchmark_name] = [] + while len(self.result[benchmark_name]) < len(labels): + self.result[benchmark_name].append([]) + label_index = self.labels.index(benchmark_run.label_name) + cur_table = self.result[benchmark_name][label_index] + index = benchmark_run.iteration - 1 + while index >= len(cur_table): + cur_table.append({}) + cur_dict = cur_table[index] + if not benchmark_run.result: + continue + for autotest_key in benchmark_run.result.keyvals: + result_value = benchmark_run.result.keyvals[autotest_key] + cur_dict[autotest_key] = result_value diff --git a/crosperf/results_report.py b/crosperf/results_report.py index 0cd46ed4..b591370a 100644 --- a/crosperf/results_report.py +++ b/crosperf/results_report.py @@ -2,20 +2,14 @@ # Copyright 2011 Google Inc. All Rights Reserved. +import math from column_chart import ColumnChart -from results_columns import IterationColumn -from results_columns import IterationsCompleteColumn -from results_columns import MaxColumn -from results_columns import MeanColumn -from results_columns import MinColumn -from results_columns import RatioColumn -from results_columns import StandardDeviationColumn from results_sorter import ResultSorter -from table import Table - +from results_organizer import ResultOrganizer +from utils.tabulator import * class ResultsReport(object): - DELTA_COLUMN_NAME = "Change" + MAX_COLOR_CODE = 255 def __init__(self, experiment): self.experiment = experiment @@ -32,85 +26,106 @@ class ResultsReport(object): labels[benchmark_run.label_name].append(benchmark_run) return labels - def GetFullTable(self): - full_columns = [] - max_iterations = 0 - for benchmark in self.benchmarks: - if benchmark.iterations > max_iterations: - max_iterations = benchmark.iterations - - for i in range(1, max_iterations + 1): - full_columns.append(IterationColumn(str(i), i)) - - full_columns.append(IterationsCompleteColumn("Completed")) - full_columns.append(MinColumn("Min")) - full_columns.append(MaxColumn("Max")) - full_columns.append(MeanColumn("Avg")) - full_columns.append(StandardDeviationColumn("Std Dev")) - full_columns.append(RatioColumn(self.DELTA_COLUMN_NAME)) - return self._GetTable(self.labels, self.benchmarks, self.benchmark_runs, - full_columns) - - def GetSummaryTable(self): - summary_columns = [MeanColumn("Average"), - RatioColumn(self.DELTA_COLUMN_NAME)] - return self._GetTable(self.labels, self.benchmarks, self.benchmark_runs, - summary_columns) - - def _GetTable(self, labels, benchmarks, benchmark_runs, columns): - table = Table("box-table-a") - label_headings = [Table.Cell("", hidden=True, colspan=2, header=True)] - for label in labels: - colspan = len(columns) - if label.name == self.baseline.name: - colspan -= 1 - label_headings.append(Table.Cell(label.name, colspan=colspan, - header=True)) - - table.AddRow(label_headings) - - column_headings = [Table.Cell("Autotest Key", header=True), - Table.Cell("Iterations", header=True)] - for label in labels: - for column in columns: - if (label.name == self.baseline.name and - column.name == self.DELTA_COLUMN_NAME): - continue - column_headings.append(Table.Cell(column.name, header=True)) - - table.AddRow(column_headings) - - sorter = ResultSorter(benchmark_runs) - - for benchmark in benchmarks: - table.AddRow([Table.Cell(benchmark.name)]) - autotest_keys = sorter.GetAutotestKeys(benchmark.name) - for autotest_key in autotest_keys: - row = [Table.Cell(autotest_key), - Table.Cell(benchmark.iterations)] - for label in labels: - for column in columns: - if (label.name == self.baseline.name and - column.name == self.DELTA_COLUMN_NAME): - continue - results = sorter.GetResults(benchmark.name, - autotest_key, label.name) - baseline_results = sorter.GetResults(benchmark.name, - autotest_key, - self.baseline.name) - value = column.Compute(results, baseline_results) - if isinstance(value, float): - value_string = "%.2f" % value - else: - value_string = value - - row.append(Table.Cell(value_string)) - - table.AddRow(row) - - return table - - + def GetFullTables(self): + columns = [Column(NonEmptyCountResult(), + Format(), + "Completed"), + Column(RawResult(), + Format()), + Column(MinResult(), + Format()), + Column(MaxResult(), + Format()), + Column(AmeanResult(), + Format()), + Column(StdResult(), + Format()) + ] + return self._GetTables(self.labels, self.benchmark_runs, columns) + + def GetSummaryTables(self): + columns = [Column(AmeanResult(), + Format()), + Column(StdResult(), + Format(), "StdDev"), + Column(CoeffVarResult(), + CoeffVarFormat(), "Mean/StdDev"), + Column(GmeanRatioResult(), + RatioFormat(), "GmeanSpeedup"), + Column(GmeanRatioResult(), + ColorBoxFormat(), " "), + Column(StatsSignificant(), + Format(), "p-value") + ] + return self._GetTables(self.labels, self.benchmark_runs, columns) + + def _ParseColumn(self, columns, iteration): + new_column = [] + for column in columns: + if column.result.__class__.__name__ != "RawResult": + #TODO(asharif): tabulator should support full table natively. + new_column.append(column) + else: + for i in range(iteration): + cc = Column(LiteralResult(i), Format(), str(i+1)) + new_column.append(cc) + return new_column + + def _AreAllRunsEmpty(self, runs): + for label in runs: + for dictionary in label: + if dictionary: + return False + return True + + def _GetTables(self, labels, benchmark_runs, columns): + tables = [] + ro = ResultOrganizer(benchmark_runs, labels) + result = ro.result + label_name = ro.labels + for item in result: + runs = result[item] + for benchmark in self.benchmarks: + if benchmark.name == item: + break + benchmark_info = ("Benchmark: {0}; Iterations: {1}" + .format(benchmark.name, benchmark.iterations)) + cell = Cell() + cell.string_value = benchmark_info + ben_table = [[cell]] + + if self._AreAllRunsEmpty(runs): + cell = Cell() + cell.string_value = ("This benchmark contains no result." + " Is the benchmark name valid?") + cell_table = [[cell]] + else: + tg = TableGenerator(runs, label_name) + table = tg.GetTable() + parsed_columns = self._ParseColumn(columns, benchmark.iterations) + tf = TableFormatter(table, parsed_columns) + cell_table = tf.GetCellTable() + tables.append(ben_table) + tables.append(cell_table) + return tables + + def PrintTables(self, tables, out_to): + output = "" + for table in tables: + if out_to == "HTML": + tp = TablePrinter(table, TablePrinter.HTML) + elif out_to == "PLAIN": + tp = TablePrinter(table, TablePrinter.PLAIN) + elif out_to == "CONSOLE": + tp = TablePrinter(table, TablePrinter.CONSOLE) + elif out_to == "TSV": + tp = TablePrinter(table, TablePrinter.TSV) + elif out_to == "EMAIL": + tp = TablePrinter(table, TablePrinter.EMAIL) + else: + pass + output += tp.Print() + return output class TextResultsReport(ResultsReport): TEXT = """ =========================================== @@ -118,50 +133,36 @@ Results report for: '%s' =========================================== ------------------------------------------- -Benchmark Run Status -------------------------------------------- -%s - -Number re-images: %s - -------------------------------------------- Summary ------------------------------------------- %s ------------------------------------------- -Full Table -------------------------------------------- -%s - -------------------------------------------- Experiment File ------------------------------------------- %s =========================================== """ - def __init__(self, experiment): + def __init__(self, experiment, email=False): super(TextResultsReport, self).__init__(experiment) - - def GetStatusTable(self): - status_table = Table("status") - for benchmark_run in self.benchmark_runs: - status_table.AddRow([Table.Cell(benchmark_run.name), - Table.Cell(benchmark_run.status), - Table.Cell(benchmark_run.failure_reason)]) - return status_table + self.email = email def GetReport(self): + summary_table = self.GetSummaryTables() + full_table = self.GetFullTables() + if not self.email: + return self.TEXT % (self.experiment.name, + self.PrintTables(summary_table, "CONSOLE"), + self.experiment.experiment_file) + return self.TEXT % (self.experiment.name, - self.GetStatusTable().ToText(), - self.experiment.machine_manager.num_reimages, - self.GetSummaryTable().ToText(80), - self.GetFullTable().ToText(80), + self.PrintTables(summary_table, "EMAIL"), self.experiment.experiment_file) class HTMLResultsReport(ResultsReport): + HTML = """ <html> <head> @@ -303,36 +304,49 @@ pre { def GetReport(self): chart_javascript = "" - charts = self._GetCharts(self.labels, self.benchmarks, self.benchmark_runs) + charts = self._GetCharts(self.labels, self.benchmark_runs) for chart in charts: chart_javascript += chart.GetJavascript() chart_divs = "" for chart in charts: chart_divs += chart.GetDiv() - summary_table = self.GetSummaryTable() - full_table = self.GetFullTable() + summary_table = self.GetSummaryTables() + full_table = self.GetFullTables() return self.HTML % (chart_javascript, - summary_table.ToHTML(), - summary_table.ToText(), - summary_table.ToTSV(), + self.PrintTables(summary_table, "HTML"), + self.PrintTables(summary_table, "PLAIN"), + self.PrintTables(summary_table, "TSV"), self._GetTabMenuHTML("summary"), chart_divs, - full_table.ToHTML(), - full_table.ToText(), - full_table.ToTSV(), + self.PrintTables(full_table, "HTML"), + self.PrintTables(full_table, "PLAIN"), + self.PrintTables(full_table, "TSV"), self._GetTabMenuHTML("full"), self.experiment.experiment_file) - def _GetCharts(self, labels, benchmarks, benchmark_runs): + def _GetCharts(self, labels, benchmark_runs): charts = [] - sorter = ResultSorter(benchmark_runs) - - for benchmark in benchmarks: - autotest_keys = sorter.GetAutotestKeys(benchmark.name) - - for autotest_key in autotest_keys: - title = "%s: %s" % (benchmark.name, autotest_key.replace("/", " ")) + ro = ResultOrganizer(benchmark_runs, labels) + result = ro.result + for item in result: + runs = result[item] + tg = TableGenerator(runs, ro.labels) + table = tg.GetTable() + columns = [Column(AmeanResult(), + Format()), + Column(MinResult(), + Format()), + Column(MaxResult(), + Format()) + ] + tf = TableFormatter(table, columns) + data_table = tf.GetCellTable() + + for i in range(2, len(data_table)): + cur_row_data = data_table[i] + autotest_key = cur_row_data[0].string_value + title = "{0}: {1}".format(item, autotest_key.replace("/", "")) chart = ColumnChart(title, 300, 200) chart.AddColumn("Label", "string") chart.AddColumn("Average", "number") @@ -340,17 +354,15 @@ pre { chart.AddColumn("Max", "number") chart.AddSeries("Min", "line", "black") chart.AddSeries("Max", "line", "black") - - for label in labels: - res = sorter.GetResults(benchmark.name, autotest_key, label.name) - avg_val = MeanColumn("").Compute(res, None) - min_val = MinColumn("").Compute(res, None) - max_val = MaxColumn("").Compute(res, None) - chart.AddRow([label.name, avg_val, min_val, max_val]) - if isinstance(avg_val, str): + cur_index = 1 + for label in ro.labels: + chart.AddRow([label, cur_row_data[cur_index].value, + cur_row_data[cur_index + 1].value, + cur_row_data[cur_index + 2].value]) + if isinstance(cur_row_data[cur_index].value, str): chart = None break - + cur_index += 3 if chart: charts.append(chart) return charts diff --git a/crosperf/results_sorter.py b/crosperf/results_sorter.py index 0567ef7b..985a91fb 100644 --- a/crosperf/results_sorter.py +++ b/crosperf/results_sorter.py @@ -9,6 +9,8 @@ class ResultSorter(object): for benchmark_run in benchmark_runs: benchmark_name = benchmark_run.benchmark_name label_name = benchmark_run.label_name + if not benchmark_run.result: + continue for autotest_key in benchmark_run.result.keyvals: result_tuple = (benchmark_name, autotest_key, label_name) if result_tuple not in self.table: @@ -32,6 +34,8 @@ class ResultSorter(object): benchmark_name = benchmark_run.benchmark_name if benchmark_name not in self.autotest_keys: self.autotest_keys[benchmark_name] = {} + if not benchmark_run.result: + continue for autotest_key in benchmark_run.result.keyvals: self.autotest_keys[benchmark_name][autotest_key] = True diff --git a/crosperf/settings_factory.py b/crosperf/settings_factory.py index 95ceb0fd..782f0dd3 100644 --- a/crosperf/settings_factory.py +++ b/crosperf/settings_factory.py @@ -26,15 +26,11 @@ class BenchmarkSettings(Settings): self.AddField(FloatField("outlier_range", default=0.2, description="The percentage of highest/lowest " "values to omit when computing the average.")) - self.AddField(ListField("profile_counters", - default=["cycles"], - description="A list of profile counters to " - "collect.")) - self.AddField(EnumField("profile_type", - description="The type of profile to collect. " - "Either 'stat', 'record' or ''.", - options=["stat", "record", ""], - default="")) + self.AddField(TextField("perf_args", default="", + description="The optional profile command. It " + "enables perf commands to record perforamance " + "related counters. It must start with perf " + "command record or stat followed by arguments.")) class LabelSettings(Settings): @@ -66,10 +62,12 @@ class GlobalSettings(Settings): self.AddField(BooleanField("rerun_if_failed", description="Whether to " "re-run failed autotest runs or not.", default=False)) + self.AddField(ListField("email", description="Space-seperated" + "list of email addresses to send email to.")) self.AddField(BooleanField("rerun", description="Whether to ignore the " "cache and for autotests to be re-run.", default=False)) - self.AddField(BooleanField("exact_remote", default=False, + self.AddField(BooleanField("exact_remote", default=True, description="Ensure cached runs are run on the " "same device that is specified as a remote.")) self.AddField(IntegerField("iterations", default=1, @@ -80,14 +78,11 @@ class GlobalSettings(Settings): "contains a src/scripts directory. Defaults to " "the chromeos checkout which contains the " "chromeos_image.")) - self.AddField(ListField("profile_counters", - default=["cycles"], - description="A list of profile counters to " - "collect.")) - self.AddField(EnumField("profile_type", - description="The type of profile to collect. " - "Either 'stat', 'record' or ''.", - options=["stat", "record", ""])) + self.AddField(TextField("perf_args", default="", + description="The optional profile command. It " + "enables perf commands to record perforamance " + "related counters. It must start with perf " + "command record or stat followed by arguments.")) class SettingsFactory(object): diff --git a/crosperf/table.py b/crosperf/table.py deleted file mode 100644 index 84eb21ae..00000000 --- a/crosperf/table.py +++ /dev/null @@ -1,84 +0,0 @@ -#!/usr/bin/python - -# Copyright 2011 Google Inc. All Rights Reserved. - -import math - - -class Table(object): - class Cell(object): - def __init__(self, value, colspan=1, hidden=False, header=False): - self.value = value - self.colspan = colspan - self.hidden = hidden - self.header = header - - def __init__(self, table_id): - self.table_id = table_id - self.rows = [] - - def AddRow(self, row): - self.rows.append(row) - - def ToHTML(self): - res = "<table id='%s'>\n" % self.table_id - for row in self.rows: - res += "<tr>" - for cell in row: - if cell.header: - tag = "th" - else: - tag = "td" - cell_class = "" - if cell.hidden: - cell_class = "class='hidden'" - res += "<%s colspan='%s' %s>%s</%s>" % (tag, cell.colspan, cell_class, - cell.value, tag) - res += "</tr>\n" - res += "</table>" - return res - - def ToText(self, max_column_width=None): - col_spacing = 2 - max_widths = [] - for row in self.rows: - column = 0 - for cell in row: - text_width = len(str(cell.value)) - per_column_width = int(math.ceil(float(text_width) / cell.colspan)) - if max_column_width: - per_column_width = min(max_column_width, per_column_width) - for i in range(column, column + cell.colspan): - while i >= len(max_widths): - max_widths.append(0) - max_widths[i] = max(per_column_width, max_widths[i]) - column += cell.colspan - - res = "" - for row in self.rows: - column = 0 - for cell in row: - val = str(cell.value) - if max_column_width: - if len(val) > max_column_width: - val = val[:2] + ".." + val[len(val) - (max_column_width - 4):] - res += val - space_to_use = (sum(max_widths[column:column + cell.colspan]) + - (cell.colspan * col_spacing)) - whitespace_length = space_to_use - len(val) - res += " " * whitespace_length - # Add space b/w columns - column += cell.colspan - res += "\n" - return res - - def ToTSV(self): - res = "" - column = 0 - for row in self.rows: - for cell in row: - res += str(cell.value).replace("\t", " ") - for _ in range(column, column + cell.colspan): - res += "\t" - res += "\n" - return res diff --git a/image_chromeos.py b/image_chromeos.py index f1497fb9..380a94f7 100755 --- a/image_chromeos.py +++ b/image_chromeos.py @@ -9,10 +9,12 @@ This script images a remote ChromeOS device with a specific image." __author__ = "asharif@google.com (Ahmad Sharif)" +import fcntl import filecmp import glob import optparse import os +import re import shutil import sys import tempfile @@ -22,15 +24,25 @@ from utils import misc from utils.file_utils import FileUtils checksum_file = "/usr/local/osimage_checksum_file" - +lock_file = "/tmp/lock_image_chromeos" def Usage(parser, message): print "ERROR: " + message parser.print_help() sys.exit(0) + def Main(argv): """Build ChromeOS.""" + #Get lock for the host + f = open(lock_file, "w+a") + try: + fcntl.lockf(f, fcntl.LOCK_EX|fcntl.LOCK_NB) + except IOError: + f.close() + print ("You can not run two instances of image_chromes at the same time." + "\nTry again. Exiting ....") + exit(0) # Common initializations cmd_executer = command_executer.GetCommandExecuter() l = logger.GetLogger() @@ -150,6 +162,8 @@ def Main(argv): else: l.LogOutput("Checksums match. Skipping reimage") + fcntl.lockf(f, fcntl.LOCK_UN) + f.close() return retval @@ -219,7 +233,8 @@ def IsImageModdedForTest(chromeos_root, image): stateful_mp = tempfile.mkdtemp() MountImage(chromeos_root, image, rootfs_mp, stateful_mp) lsb_release_file = os.path.join(rootfs_mp, "etc/lsb-release") - is_test_image = "Test Build" in open(lsb_release_file).read() + lsb_release_contents = open(lsb_release_file).read() + is_test_image = re.search("test", lsb_release_contents, re.IGNORECASE) MountImage(chromeos_root, image, rootfs_mp, stateful_mp, unmount=True) return is_test_image diff --git a/utils/colortrans.py b/utils/colortrans.py new file mode 100644 index 00000000..37e91572 --- /dev/null +++ b/utils/colortrans.py @@ -0,0 +1,376 @@ +#! /usr/bin/env python + +""" Convert values between RGB hex codes and xterm-256 color codes. + +Nice long listing of all 256 colors and their codes. Useful for +developing console color themes, or even script output schemes. + +Resources: +* http://en.wikipedia.org/wiki/8-bit_color +* http://en.wikipedia.org/wiki/ANSI_escape_code +* /usr/share/X11/rgb.txt + +I'm not sure where this script was inspired from. I think I must have +written it from scratch, though it's been several years now. +""" + +__author__ = 'Micah Elliott http://MicahElliott.com' +__version__ = '0.1' +__copyright__ = 'Copyright (C) 2011 Micah Elliott. All rights reserved.' +__license__ = 'WTFPL http://sam.zoy.org/wtfpl/' + +#--------------------------------------------------------------------- + +import sys, re + +CLUT = [ # color look-up table +# 8-bit, RGB hex + + # Primary 3-bit (8 colors). Unique representation! + ('00', '000000'), + ('01', '800000'), + ('02', '008000'), + ('03', '808000'), + ('04', '000080'), + ('05', '800080'), + ('06', '008080'), + ('07', 'c0c0c0'), + + # Equivalent "bright" versions of original 8 colors. + ('08', '808080'), + ('09', 'ff0000'), + ('10', '00ff00'), + ('11', 'ffff00'), + ('12', '0000ff'), + ('13', 'ff00ff'), + ('14', '00ffff'), + ('15', 'ffffff'), + + # Strictly ascending. + ('16', '000000'), + ('17', '00005f'), + ('18', '000087'), + ('19', '0000af'), + ('20', '0000d7'), + ('21', '0000ff'), + ('22', '005f00'), + ('23', '005f5f'), + ('24', '005f87'), + ('25', '005faf'), + ('26', '005fd7'), + ('27', '005fff'), + ('28', '008700'), + ('29', '00875f'), + ('30', '008787'), + ('31', '0087af'), + ('32', '0087d7'), + ('33', '0087ff'), + ('34', '00af00'), + ('35', '00af5f'), + ('36', '00af87'), + ('37', '00afaf'), + ('38', '00afd7'), + ('39', '00afff'), + ('40', '00d700'), + ('41', '00d75f'), + ('42', '00d787'), + ('43', '00d7af'), + ('44', '00d7d7'), + ('45', '00d7ff'), + ('46', '00ff00'), + ('47', '00ff5f'), + ('48', '00ff87'), + ('49', '00ffaf'), + ('50', '00ffd7'), + ('51', '00ffff'), + ('52', '5f0000'), + ('53', '5f005f'), + ('54', '5f0087'), + ('55', '5f00af'), + ('56', '5f00d7'), + ('57', '5f00ff'), + ('58', '5f5f00'), + ('59', '5f5f5f'), + ('60', '5f5f87'), + ('61', '5f5faf'), + ('62', '5f5fd7'), + ('63', '5f5fff'), + ('64', '5f8700'), + ('65', '5f875f'), + ('66', '5f8787'), + ('67', '5f87af'), + ('68', '5f87d7'), + ('69', '5f87ff'), + ('70', '5faf00'), + ('71', '5faf5f'), + ('72', '5faf87'), + ('73', '5fafaf'), + ('74', '5fafd7'), + ('75', '5fafff'), + ('76', '5fd700'), + ('77', '5fd75f'), + ('78', '5fd787'), + ('79', '5fd7af'), + ('80', '5fd7d7'), + ('81', '5fd7ff'), + ('82', '5fff00'), + ('83', '5fff5f'), + ('84', '5fff87'), + ('85', '5fffaf'), + ('86', '5fffd7'), + ('87', '5fffff'), + ('88', '870000'), + ('89', '87005f'), + ('90', '870087'), + ('91', '8700af'), + ('92', '8700d7'), + ('93', '8700ff'), + ('94', '875f00'), + ('95', '875f5f'), + ('96', '875f87'), + ('97', '875faf'), + ('98', '875fd7'), + ('99', '875fff'), + ('100', '878700'), + ('101', '87875f'), + ('102', '878787'), + ('103', '8787af'), + ('104', '8787d7'), + ('105', '8787ff'), + ('106', '87af00'), + ('107', '87af5f'), + ('108', '87af87'), + ('109', '87afaf'), + ('110', '87afd7'), + ('111', '87afff'), + ('112', '87d700'), + ('113', '87d75f'), + ('114', '87d787'), + ('115', '87d7af'), + ('116', '87d7d7'), + ('117', '87d7ff'), + ('118', '87ff00'), + ('119', '87ff5f'), + ('120', '87ff87'), + ('121', '87ffaf'), + ('122', '87ffd7'), + ('123', '87ffff'), + ('124', 'af0000'), + ('125', 'af005f'), + ('126', 'af0087'), + ('127', 'af00af'), + ('128', 'af00d7'), + ('129', 'af00ff'), + ('130', 'af5f00'), + ('131', 'af5f5f'), + ('132', 'af5f87'), + ('133', 'af5faf'), + ('134', 'af5fd7'), + ('135', 'af5fff'), + ('136', 'af8700'), + ('137', 'af875f'), + ('138', 'af8787'), + ('139', 'af87af'), + ('140', 'af87d7'), + ('141', 'af87ff'), + ('142', 'afaf00'), + ('143', 'afaf5f'), + ('144', 'afaf87'), + ('145', 'afafaf'), + ('146', 'afafd7'), + ('147', 'afafff'), + ('148', 'afd700'), + ('149', 'afd75f'), + ('150', 'afd787'), + ('151', 'afd7af'), + ('152', 'afd7d7'), + ('153', 'afd7ff'), + ('154', 'afff00'), + ('155', 'afff5f'), + ('156', 'afff87'), + ('157', 'afffaf'), + ('158', 'afffd7'), + ('159', 'afffff'), + ('160', 'd70000'), + ('161', 'd7005f'), + ('162', 'd70087'), + ('163', 'd700af'), + ('164', 'd700d7'), + ('165', 'd700ff'), + ('166', 'd75f00'), + ('167', 'd75f5f'), + ('168', 'd75f87'), + ('169', 'd75faf'), + ('170', 'd75fd7'), + ('171', 'd75fff'), + ('172', 'd78700'), + ('173', 'd7875f'), + ('174', 'd78787'), + ('175', 'd787af'), + ('176', 'd787d7'), + ('177', 'd787ff'), + ('178', 'd7af00'), + ('179', 'd7af5f'), + ('180', 'd7af87'), + ('181', 'd7afaf'), + ('182', 'd7afd7'), + ('183', 'd7afff'), + ('184', 'd7d700'), + ('185', 'd7d75f'), + ('186', 'd7d787'), + ('187', 'd7d7af'), + ('188', 'd7d7d7'), + ('189', 'd7d7ff'), + ('190', 'd7ff00'), + ('191', 'd7ff5f'), + ('192', 'd7ff87'), + ('193', 'd7ffaf'), + ('194', 'd7ffd7'), + ('195', 'd7ffff'), + ('196', 'ff0000'), + ('197', 'ff005f'), + ('198', 'ff0087'), + ('199', 'ff00af'), + ('200', 'ff00d7'), + ('201', 'ff00ff'), + ('202', 'ff5f00'), + ('203', 'ff5f5f'), + ('204', 'ff5f87'), + ('205', 'ff5faf'), + ('206', 'ff5fd7'), + ('207', 'ff5fff'), + ('208', 'ff8700'), + ('209', 'ff875f'), + ('210', 'ff8787'), + ('211', 'ff87af'), + ('212', 'ff87d7'), + ('213', 'ff87ff'), + ('214', 'ffaf00'), + ('215', 'ffaf5f'), + ('216', 'ffaf87'), + ('217', 'ffafaf'), + ('218', 'ffafd7'), + ('219', 'ffafff'), + ('220', 'ffd700'), + ('221', 'ffd75f'), + ('222', 'ffd787'), + ('223', 'ffd7af'), + ('224', 'ffd7d7'), + ('225', 'ffd7ff'), + ('226', 'ffff00'), + ('227', 'ffff5f'), + ('228', 'ffff87'), + ('229', 'ffffaf'), + ('230', 'ffffd7'), + ('231', 'ffffff'), + + # Gray-scale range. + ('232', '080808'), + ('233', '121212'), + ('234', '1c1c1c'), + ('235', '262626'), + ('236', '303030'), + ('237', '3a3a3a'), + ('238', '444444'), + ('239', '4e4e4e'), + ('240', '585858'), + ('241', '626262'), + ('242', '6c6c6c'), + ('243', '767676'), + ('244', '808080'), + ('245', '8a8a8a'), + ('246', '949494'), + ('247', '9e9e9e'), + ('248', 'a8a8a8'), + ('249', 'b2b2b2'), + ('250', 'bcbcbc'), + ('251', 'c6c6c6'), + ('252', 'd0d0d0'), + ('253', 'dadada'), + ('254', 'e4e4e4'), + ('255', 'eeeeee'), +] + +def _str2hex(hexstr): + return int(hexstr, 16) + +def _strip_hash(rgb): + # Strip leading `#` if exists. + if rgb.startswith('#'): + rgb = rgb.lstrip('#') + return rgb + +def _create_dicts(): + short2rgb_dict = dict(CLUT) + rgb2short_dict = {} + for k, v in short2rgb_dict.items(): + rgb2short_dict[v] = k + return rgb2short_dict, short2rgb_dict + +def short2rgb(short): + return SHORT2RGB_DICT[short] + +def print_all(): + """ Print all 256 xterm color codes. + """ + for short, rgb in CLUT: + sys.stdout.write('\033[48;5;%sm%s:%s' % (short, short, rgb)) + sys.stdout.write("\033[0m ") + sys.stdout.write('\033[38;5;%sm%s:%s' % (short, short, rgb)) + sys.stdout.write("\033[0m\n") + print "Printed all codes." + print "You can translate a hex or 0-255 code by providing an argument." + +def rgb2short(rgb): + """ Find the closest xterm-256 approximation to the given RGB value. + @param rgb: Hex code representing an RGB value, eg, 'abcdef' + @returns: String between 0 and 255, compatible with xterm. + >>> rgb2short('123456') + ('23', '005f5f') + >>> rgb2short('ffffff') + ('231', 'ffffff') + >>> rgb2short('0DADD6') # vimeo logo + ('38', '00afd7') + """ + rgb = _strip_hash(rgb) + incs = (0x00, 0x5f, 0x87, 0xaf, 0xd7, 0xff) + # Break 6-char RGB code into 3 integer vals. + parts = [ int(h, 16) for h in re.split(r'(..)(..)(..)', rgb)[1:4] ] + res = [] + for part in parts: + i = 0 + while i < len(incs)-1: + s, b = incs[i], incs[i+1] # smaller, bigger + if s <= part <= b: + s1 = abs(s - part) + b1 = abs(b - part) + if s1 < b1: closest = s + else: closest = b + res.append(closest) + break + i += 1 + #print '***', res + res = ''.join([ ('%02.x' % i) for i in res ]) + equiv = RGB2SHORT_DICT[ res ] + #print '***', res, equiv + return equiv, res + +RGB2SHORT_DICT, SHORT2RGB_DICT = _create_dicts() + +#--------------------------------------------------------------------- + +if __name__ == '__main__': + import doctest + doctest.testmod() + if len(sys.argv) == 1: + print_all() + raise SystemExit + arg = sys.argv[1] + if len(arg) < 4 and int(arg) < 256: + rgb = short2rgb(arg) + sys.stdout.write('xterm color \033[38;5;%sm%s\033[0m -> RGB exact \033[38;5;%sm%s\033[0m' % (arg, arg, arg, rgb)) + sys.stdout.write("\033[0m\n") + else: + short, rgb = rgb2short(arg) + sys.stdout.write('RGB %s -> xterm color approx \033[38;5;%sm%s (%s)' % (arg, short, short, rgb)) + sys.stdout.write("\033[0m\n") diff --git a/utils/command_executer.py b/utils/command_executer.py index c9a19ff9..54c9c355 100644 --- a/utils/command_executer.py +++ b/utils/command_executer.py @@ -42,14 +42,15 @@ class CommandExecuter: def RunCommand(self, cmd, return_output=False, machine=None, username=None, command_terminator=None, command_timeout=None, - terminated_timeout=10): + terminated_timeout=10, + print_to_console=True): """Run a command.""" cmd = str(cmd) - self.logger.LogCmd(cmd, machine, username) + self.logger.LogCmd(cmd, machine, username, print_to_console) if command_terminator and command_terminator.IsTerminated(): - self.logger.LogError("Command was terminated!") + self.logger.LogError("Command was terminated!", print_to_console) if return_output: return [1, "", ""] else: @@ -81,7 +82,7 @@ class CommandExecuter: if command_terminator and command_terminator.IsTerminated(): self.RunCommand("sudo kill -9 " + str(p.pid)) wait = p.wait() - self.logger.LogError("Command was terminated!") + self.logger.LogError("Command was terminated!", print_to_console) if return_output: return (p.wait, full_stdout, full_stderr) else: @@ -91,14 +92,14 @@ class CommandExecuter: out = os.read(p.stdout.fileno(), 16384) if return_output: full_stdout += out - self.logger.LogCommandOutput(out) + self.logger.LogCommandOutput(out, print_to_console) if out == "": pipes.remove(p.stdout) if fd == p.stderr: err = os.read(p.stderr.fileno(), 16384) if return_output: full_stderr += err - self.logger.LogCommandError(err) + self.logger.LogCommandError(err, print_to_console) if err == "": pipes.remove(p.stderr) @@ -109,14 +110,14 @@ class CommandExecuter: time.time() - terminated_time > terminated_timeout): m = ("Timeout of %s seconds reached since process termination." % terminated_timeout) - self.logger.LogWarning(m) + self.logger.LogWarning(m, print_to_console) break if (command_timeout is not None and time.time() - started_time > command_timeout): m = ("Timeout of %s seconds reached since process started." % command_timeout) - self.logger.LogWarning(m) + self.logger.LogWarning(m, print_to_console) self.RunCommand("kill %d || sudo kill %d || sudo kill -9 %d" % (p.pid, p.pid, p.pid)) break @@ -161,9 +162,10 @@ class CommandExecuter: def CrosRunCommand(self, cmd, return_output=False, machine=None, username=None, command_terminator=None, chromeos_root=None, command_timeout=None, - terminated_timeout=10): + terminated_timeout=10, + print_to_console=True): """Run a command on a chromeos box""" - self.logger.LogCmd(cmd) + self.logger.LogCmd(cmd, print_to_console) self.logger.LogFatalIf(not machine, "No machine provided!") self.logger.LogFatalIf(not chromeos_root, "chromeos_root not given!") chromeos_root = os.path.expanduser(chromeos_root) @@ -197,8 +199,9 @@ class CommandExecuter: def ChrootRunCommand(self, chromeos_root, command, return_output=False, command_terminator=None, command_timeout=None, - terminated_timeout=10): - self.logger.LogCmd(command) + terminated_timeout=10, + print_to_console=True): + self.logger.LogCmd(command, print_to_console) handle, command_file = tempfile.mkstemp(dir=os.path.join(chromeos_root, "src/scripts"), diff --git a/utils/constants.py b/utils/constants.py new file mode 100644 index 00000000..33875d66 --- /dev/null +++ b/utils/constants.py @@ -0,0 +1,10 @@ +#!/usr/bin/python2.6 +# +# Copyright 2010 Google Inc. All Rights Reserved. + +"""Generic constants used accross modules. +""" + +__author__ = "shenhan@google.com (Han Shen)" + +mounted_toolchain_root='/usr/local/toolchain_root' diff --git a/utils/file_utils.py b/utils/file_utils.py index 987e88ad..24809333 100644 --- a/utils/file_utils.py +++ b/utils/file_utils.py @@ -6,6 +6,7 @@ import errno import hashlib import os import shutil +import command_executer class FileUtils(object): @@ -28,21 +29,18 @@ class FileUtils(object): return cls._instance def Md5File(self, filename, block_size=2 ** 10): - md5 = hashlib.md5() + command = "md5sum %s" % filename + ce = command_executer.GetCommandExecuter() + ret, out, err = ce.RunCommand(command, return_output=True) + if ret: + raise Exception("Could not run md5sum on: %s" % filename) - with open(filename) as f: - while True: - data = f.read(block_size) - if not data: - break - md5.update(data) - - return md5.hexdigest() + return out.strip().split()[0] def CanonicalizeChromeOSRoot(self, chromeos_root): chromeos_root = os.path.expanduser(chromeos_root) if os.path.isfile(os.path.join(chromeos_root, - "src/scripts/enter_chroot.sh")): + "src/scripts/run_remote_tests.sh")): return chromeos_root else: return None diff --git a/utils/logger.py b/utils/logger.py index 1ef996eb..a8f0a6f0 100644 --- a/utils/logger.py +++ b/utils/logger.py @@ -56,13 +56,21 @@ class Logger(object): # self._AddSuffix(basename, suffix)) return suffix + def _CreateLogFileHandle(self, name): + fd = None + try: + fd = open(name, "w") + except IOError: + print "Warning: could not open %s for writing." % name + return fd + def _CreateLogFileHandles(self, basename): suffix = self._FindSuffix(basename) suffixed_basename = self._AddSuffix(basename, suffix) - self.cmdfd = open("%s.cmd" % suffixed_basename, "w", 0755) - self.stdout = open("%s.out" % suffixed_basename, "w") - self.stderr = open("%s.err" % suffixed_basename, "w") + self.cmdfd = self._CreateLogFileHandle("%s.cmd" % suffixed_basename) + self.stdout = self._CreateLogFileHandle("%s.out" % suffixed_basename) + self.stderr = self._CreateLogFileHandle("%s.err" % suffixed_basename) self._CreateLogFileSymlinks(basename, suffixed_basename) @@ -75,40 +83,58 @@ class Logger(object): if os.path.exists(dest_file): os.remove(dest_file) os.symlink(src_file, dest_file) - except IOError as ex: - self.LogFatal(str(ex)) + except Exception as ex: + print "Exception while creating symlinks: %s" % str(ex) def _WriteTo(self, fd, msg, flush): - fd.write(msg) - if flush: - fd.flush() + if fd: + fd.write(msg) + if flush: + fd.flush() def _LogMsg(self, file_fd, term_fd, msg, flush=True): - self._WriteTo(file_fd, msg, flush) + if file_fd: + self._WriteTo(file_fd, msg, flush) if self.print_console: self._WriteTo(term_fd, msg, flush) - def LogCmd(self, cmd, machine="", user=None): + def _GetStdout(self, print_to_console): + if print_to_console: + return sys.stdout + return None + + def _GetStderr(self, print_to_console): + if print_to_console: + return sys.stderr + return None + + def LogCmd(self, cmd, machine="", user=None, print_to_console=True): if user: host = "%s@%s" % (user, machine) else: host = machine - self._LogMsg(self.cmdfd, sys.stdout, "CMD (%s): %s\n" % (host, cmd)) + self._LogMsg(self.cmdfd, self._GetStdout(print_to_console), + "CMD (%s): %s\n" % (host, cmd)) - def LogFatal(self, msg): - self._LogMsg(self.stderr, sys.stderr, "FATAL: %s\n" % msg) - self._LogMsg(self.stderr, sys.stderr, "\n".join(traceback.format_stack())) + def LogFatal(self, msg, print_to_console=True): + self._LogMsg(self.stderr, self._GetStderr(print_to_console), + "FATAL: %s\n" % msg) + self._LogMsg(self.stderr, self._GetStderr(print_to_console), + "\n".join(traceback.format_stack())) sys.exit(1) - def LogError(self, msg): - self._LogMsg(self.stderr, sys.stderr, "ERROR: %s\n" % msg) + def LogError(self, msg, print_to_console=True): + self._LogMsg(self.stderr, self._GetStderr(print_to_console), + "ERROR: %s\n" % msg) - def LogWarning(self, msg): - self._LogMsg(self.stderr, sys.stderr, "WARNING: %s\n" % msg) + def LogWarning(self, msg, print_to_console=True): + self._LogMsg(self.stderr, self._GetStderr(print_to_console), + "WARNING: %s\n" % msg) - def LogOutput(self, msg): - self._LogMsg(self.stdout, sys.stdout, "OUTPUT: %s\n" % msg) + def LogOutput(self, msg, print_to_console=True): + self._LogMsg(self.stdout, self._GetStdout(print_to_console), + "OUTPUT: %s\n" % msg) def LogFatalIf(self, condition, msg): if condition: @@ -122,11 +148,13 @@ class Logger(object): if condition: self.LogWarning(msg) - def LogCommandOutput(self, msg): - self._LogMsg(self.stdout, sys.stdout, msg, flush=False) + def LogCommandOutput(self, msg, print_to_console=True): + self._LogMsg(self.stdout, self._GetStdout(print_to_console), + msg, flush=False) - def LogCommandError(self, msg): - self._LogMsg(self.stderr, sys.stderr, msg, flush=False) + def LogCommandError(self, msg, print_to_console=True): + self._LogMsg(self.stderr, self._GetStderr(print_to_console), + msg, flush=False) def Flush(self): self.cmdfd.flush() diff --git a/utils/misc.py b/utils/misc.py index 43ef09dc..cb9acbfa 100644 --- a/utils/misc.py +++ b/utils/misc.py @@ -6,14 +6,34 @@ __author__ = "asharif@google.com (Ahmad Sharif)" -import hashlib +from contextlib import contextmanager import os import re -import stat + import command_executer import logger -import tempfile -from contextlib import contextmanager + + +def GetChromeOSVersionFromLSBVersion(lsb_version): + ce = command_executer.GetCommandExecuter() + command = "git ls-remote http://git.chromium.org/chromiumos/manifest.git" + ret, out, _ = ce.RunCommand(command, return_output=True, + print_to_console=False) + assert ret == 0, "Command %s failed" % command + lower = [] + for line in out.splitlines(): + mo = re.search("refs/heads/release-R(\d+)-(\d+)\.B", line) + if mo: + revision = int(mo.group(1)) + build = int(mo.group(2)) + lsb_build = int(lsb_version.split(".")[0]) + if lsb_build > build: + lower.append(revision) + lower = sorted(lower) + if lower: + return "R%d-%s" % (lower[-1] + 1, lsb_version) + else: + return "Unknown" def ApplySubs(string, *substitutions): @@ -27,9 +47,11 @@ def UnitToNumber(string, base=1000): "mega": base**2, "giga": base**3} string = string.lower() - mo = re.search("(\d*)(.+)", string) + mo = re.search("(\d*)(.+)?", string) number = mo.group(1) unit = mo.group(2) + if not unit: + return float(number) for k, v in unit_dict.items(): if k.startswith(unit): return float(number) * v @@ -42,8 +64,8 @@ def GetFilenameFromString(string): return ApplySubs(string, ("/", "__"), ("\s", "_"), - ("=", ""), - ("\"", "")) + ("[\^\$=\"\\\?]", ""), + ) def GetRoot(scr_name): @@ -154,14 +176,21 @@ def GetCtargetFromBoard(board, chromeos_root): "../platform/dev/toolchain_utils.sh; get_ctarget_from_board %s" % base_board) ce = command_executer.GetCommandExecuter() - ret, out, err = ce.ChrootRunCommand(chromeos_root, - command, - return_output=True) + ret, out, _ = ce.ChrootRunCommand(chromeos_root, + command, + return_output=True) if ret != 0: raise ValueError("Board %s is invalid!" % board) + # Remove ANSI escape sequences. + out = StripANSIEscapeSequences(out) return out.strip() +def StripANSIEscapeSequences(string): + string = re.sub("\x1b\[[0-9]*[a-zA-Z]", "", string) + return string + + def GetChromeSrcDir(): return "var/cache/distfiles/target/chrome-src/src" @@ -170,11 +199,30 @@ def GetEnvStringFromDict(env_dict): return " ".join(["%s=\"%s\"" % var for var in env_dict.items()]) +def MergeEnvStringWithDict(env_string, env_dict, prepend=True): + if not env_string.strip(): + return GetEnvStringFromDict(env_dict) + override_env_list = [] + ce = command_executer.GetCommandExecuter() + for k, v in env_dict.items(): + v = v.strip("\"'") + if prepend: + new_env = "%s=\"%s $%s\"" % (k, v, k) + else: + new_env = "%s=\"$%s %s\"" % (k, k, v) + command = "; ".join([env_string, new_env, "echo $%s" % k]) + ret, out, _ = ce.RunCommand(command, return_output=True) + override_env_list.append("%s=%r" % (k, out.strip())) + ret = env_string + " " + " ".join(override_env_list) + return ret.strip() + + def GetAllImages(chromeos_root, board): ce = command_executer.GetCommandExecuter() command = ("find %s/src/build/images/%s -name chromiumos_test_image.bin" % (chromeos_root, board)) - ret, out, err = ce.RunCommand(command, return_output=True) + ret, out, _ = ce.RunCommand(command, return_output=True) + assert ret == 0, "Could not run command: %s" % command return out.splitlines() diff --git a/utils/misc_test.py b/utils/misc_test.py index 62742e3b..e234332f 100644 --- a/utils/misc_test.py +++ b/utils/misc_test.py @@ -14,9 +14,37 @@ import misc class UtilsTest(unittest.TestCase): def testGetFilenameFromString(self): - string = 'a /b=c"d' + string = 'a /b=c"d^$?\\' filename = misc.GetFilenameFromString(string) self.assertTrue(filename == 'a___bcd') + def testPrependMergeEnv(self): + var = 'USE' + use_flags = 'hello 123' + added_use_flags = 'bla bla' + env_string = '%s=%r' % (var, use_flags) + new_env_string = misc.MergeEnvStringWithDict(env_string, + {var: added_use_flags}) + expected_new_env = '%s=%r' % (var, ' '.join([added_use_flags, use_flags])) + self.assertTrue(new_env_string == ' '.join([env_string, expected_new_env])) + + def testGetChromeOSVersionFromLSBVersion(self): + versions_dict = {"2630.0.0": "22", + "2030.0.0": "19"} + f = misc.GetChromeOSVersionFromLSBVersion + for k, v in versions_dict.items(): + self.assertTrue(f(k) == "R%s-%s" % (v, k)) + + def testPostpendMergeEnv(self): + var = 'USE' + use_flags = 'hello 123' + added_use_flags = 'bla bla' + env_string = '%s=%r' % (var, use_flags) + new_env_string = misc.MergeEnvStringWithDict(env_string, + {var: added_use_flags}, + False) + expected_new_env = '%s=%r' % (var, ' '.join([use_flags, added_use_flags])) + self.assertTrue(new_env_string == ' '.join([env_string, expected_new_env])) + if __name__ == '__main__': unittest.main() diff --git a/utils/stats.py b/utils/stats.py new file mode 100644 index 00000000..aceed824 --- /dev/null +++ b/utils/stats.py @@ -0,0 +1,4528 @@ +# Copyright (c) 1999-2008 Gary Strangman; All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# +# Comments and/or additions are welcome (send e-mail to: +# strang@nmr.mgh.harvard.edu). +# +""" +stats.py module + +(Requires pstat.py module.) + +################################################# +####### Written by: Gary Strangman ########### +####### Last modified: Oct 31, 2008 ########### +################################################# + +A collection of basic statistical functions for python. The function +names appear below. + +IMPORTANT: There are really *3* sets of functions. The first set has an 'l' +prefix, which can be used with list or tuple arguments. The second set has +an 'a' prefix, which can accept NumPy array arguments. These latter +functions are defined only when NumPy is available on the system. The third +type has NO prefix (i.e., has the name that appears below). Functions of +this set are members of a "Dispatch" class, c/o David Ascher. This class +allows different functions to be called depending on the type of the passed +arguments. Thus, stats.mean is a member of the Dispatch class and +stats.mean(range(20)) will call stats.lmean(range(20)) while +stats.mean(Numeric.arange(20)) will call stats.amean(Numeric.arange(20)). +This is a handy way to keep consistent function names when different +argument types require different functions to be called. Having +implementated the Dispatch class, however, means that to get info on +a given function, you must use the REAL function name ... that is +"print stats.lmean.__doc__" or "print stats.amean.__doc__" work fine, +while "print stats.mean.__doc__" will print the doc for the Dispatch +class. NUMPY FUNCTIONS ('a' prefix) generally have more argument options +but should otherwise be consistent with the corresponding list functions. + +Disclaimers: The function list is obviously incomplete and, worse, the +functions are not optimized. All functions have been tested (some more +so than others), but they are far from bulletproof. Thus, as with any +free software, no warranty or guarantee is expressed or implied. :-) A +few extra functions that don't appear in the list below can be found by +interested treasure-hunters. These functions don't necessarily have +both list and array versions but were deemed useful + +CENTRAL TENDENCY: geometricmean + harmonicmean + mean + median + medianscore + mode + +MOMENTS: moment + variation + skew + kurtosis + skewtest (for Numpy arrays only) + kurtosistest (for Numpy arrays only) + normaltest (for Numpy arrays only) + +ALTERED VERSIONS: tmean (for Numpy arrays only) + tvar (for Numpy arrays only) + tmin (for Numpy arrays only) + tmax (for Numpy arrays only) + tstdev (for Numpy arrays only) + tsem (for Numpy arrays only) + describe + +FREQUENCY STATS: itemfreq + scoreatpercentile + percentileofscore + histogram + cumfreq + relfreq + +VARIABILITY: obrientransform + samplevar + samplestdev + signaltonoise (for Numpy arrays only) + var + stdev + sterr + sem + z + zs + zmap (for Numpy arrays only) + +TRIMMING FCNS: threshold (for Numpy arrays only) + trimboth + trim1 + round (round all vals to 'n' decimals; Numpy only) + +CORRELATION FCNS: covariance (for Numpy arrays only) + correlation (for Numpy arrays only) + paired + pearsonr + spearmanr + pointbiserialr + kendalltau + linregress + +INFERENTIAL STATS: ttest_1samp + ttest_ind + ttest_rel + chisquare + ks_2samp + mannwhitneyu + ranksums + wilcoxont + kruskalwallish + friedmanchisquare + +PROBABILITY CALCS: chisqprob + erfcc + zprob + ksprob + fprob + betacf + gammln + betai + +ANOVA FUNCTIONS: F_oneway + F_value + +SUPPORT FUNCTIONS: writecc + incr + sign (for Numpy arrays only) + sum + cumsum + ss + summult + sumdiffsquared + square_of_sums + shellsort + rankdata + outputpairedstats + findwithin +""" +## CHANGE LOG: +## =========== +## 09-07-21 ... added capability for getting the 'proportion' out of l/amannwhitneyu (but comment-disabled) +## 08-10-31 ... fixed import LinearAlgebra bug before glm fcns +## 07-11-26 ... conversion for numpy started +## 07-05-16 ... added Lin's Concordance Correlation Coefficient (alincc) and acov +## 05-08-21 ... added "Dice's coefficient" +## 04-10-26 ... added ap2t(), an ugly fcn for converting p-vals to T-vals +## 04-04-03 ... added amasslinregress() function to do regression on N-D arrays +## 03-01-03 ... CHANGED VERSION TO 0.6 +## fixed atsem() to properly handle limits=None case +## improved histogram and median functions (estbinwidth) and +## fixed atvar() function (wrong answers for neg numbers?!?) +## 02-11-19 ... fixed attest_ind and attest_rel for div-by-zero Overflows +## 02-05-10 ... fixed lchisqprob indentation (failed when df=even) +## 00-12-28 ... removed aanova() to separate module, fixed licensing to +## match Python License, fixed doc string & imports +## 00-04-13 ... pulled all "global" statements, except from aanova() +## added/fixed lots of documentation, removed io.py dependency +## changed to version 0.5 +## 99-11-13 ... added asign() function +## 99-11-01 ... changed version to 0.4 ... enough incremental changes now +## 99-10-25 ... added acovariance and acorrelation functions +## 99-10-10 ... fixed askew/akurtosis to avoid divide-by-zero errors +## added aglm function (crude, but will be improved) +## 99-10-04 ... upgraded acumsum, ass, asummult, asamplevar, avar, etc. to +## all handle lists of 'dimension's and keepdims +## REMOVED ar0, ar2, ar3, ar4 and replaced them with around +## reinserted fixes for abetai to avoid math overflows +## 99-09-05 ... rewrote achisqprob/aerfcc/aksprob/afprob/abetacf/abetai to +## handle multi-dimensional arrays (whew!) +## 99-08-30 ... fixed l/amoment, l/askew, l/akurtosis per D'Agostino (1990) +## added anormaltest per same reference +## re-wrote azprob to calc arrays of probs all at once +## 99-08-22 ... edited attest_ind printing section so arrays could be rounded +## 99-08-19 ... fixed amean and aharmonicmean for non-error(!) overflow on +## short/byte arrays (mean of #s btw 100-300 = -150??) +## 99-08-09 ... fixed asum so that the None case works for Byte arrays +## 99-08-08 ... fixed 7/3 'improvement' to handle t-calcs on N-D arrays +## 99-07-03 ... improved attest_ind, attest_rel (zero-division errortrap) +## 99-06-24 ... fixed bug(?) in attest_ind (n1=a.shape[0]) +## 04/11/99 ... added asignaltonoise, athreshold functions, changed all +## max/min in array section to N.maximum/N.minimum, +## fixed square_of_sums to prevent integer overflow +## 04/10/99 ... !!! Changed function name ... sumsquared ==> square_of_sums +## 03/18/99 ... Added ar0, ar2, ar3 and ar4 rounding functions +## 02/28/99 ... Fixed aobrientransform to return an array rather than a list +## 01/15/99 ... Essentially ceased updating list-versions of functions (!!!) +## 01/13/99 ... CHANGED TO VERSION 0.3 +## fixed bug in a/lmannwhitneyu p-value calculation +## 12/31/98 ... fixed variable-name bug in ldescribe +## 12/19/98 ... fixed bug in findwithin (fcns needed pstat. prefix) +## 12/16/98 ... changed amedianscore to return float (not array) for 1 score +## 12/14/98 ... added atmin and atmax functions +## removed umath from import line (not needed) +## l/ageometricmean modified to reduce chance of overflows (take +## nth root first, then multiply) +## 12/07/98 ... added __version__variable (now 0.2) +## removed all 'stats.' from anova() fcn +## 12/06/98 ... changed those functions (except shellsort) that altered +## arguments in-place ... cumsum, ranksort, ... +## updated (and fixed some) doc-strings +## 12/01/98 ... added anova() function (requires NumPy) +## incorporated Dispatch class +## 11/12/98 ... added functionality to amean, aharmonicmean, ageometricmean +## added 'asum' function (added functionality to N.add.reduce) +## fixed both moment and amoment (two errors) +## changed name of skewness and askewness to skew and askew +## fixed (a)histogram (which sometimes counted points <lowerlimit) + +import pstat # required 3rd party module +import math, string, copy # required python modules +from types import * + +__version__ = 0.6 + +############# DISPATCH CODE ############## + + +class Dispatch: + """ +The Dispatch class, care of David Ascher, allows different functions to +be called depending on the argument types. This way, there can be one +function name regardless of the argument type. To access function doc +in stats.py module, prefix the function with an 'l' or 'a' for list or +array arguments, respectively. That is, print stats.lmean.__doc__ or +print stats.amean.__doc__ or whatever. +""" + + def __init__(self, *tuples): + self._dispatch = {} + for func, types in tuples: + for t in types: + if t in self._dispatch.keys(): + raise ValueError, "can't have two dispatches on "+str(t) + self._dispatch[t] = func + self._types = self._dispatch.keys() + + def __call__(self, arg1, *args, **kw): + if type(arg1) not in self._types: + raise TypeError, "don't know how to dispatch %s arguments" % type(arg1) + return apply(self._dispatch[type(arg1)], (arg1,) + args, kw) + + +########################################################################## +######################## LIST-BASED FUNCTIONS ######################## +########################################################################## + +### Define these regardless + +#################################### +####### CENTRAL TENDENCY ######### +#################################### + +def lgeometricmean (inlist): + """ +Calculates the geometric mean of the values in the passed list. +That is: n-th root of (x1 * x2 * ... * xn). Assumes a '1D' list. + +Usage: lgeometricmean(inlist) +""" + mult = 1.0 + one_over_n = 1.0/len(inlist) + for item in inlist: + mult = mult * pow(item,one_over_n) + return mult + + +def lharmonicmean (inlist): + """ +Calculates the harmonic mean of the values in the passed list. +That is: n / (1/x1 + 1/x2 + ... + 1/xn). Assumes a '1D' list. + +Usage: lharmonicmean(inlist) +""" + sum = 0 + for item in inlist: + sum = sum + 1.0/item + return len(inlist) / sum + + +def lmean (inlist): + """ +Returns the arithematic mean of the values in the passed list. +Assumes a '1D' list, but will function on the 1st dim of an array(!). + +Usage: lmean(inlist) +""" + sum = 0 + for item in inlist: + sum = sum + item + return sum/float(len(inlist)) + + +def lmedian (inlist,numbins=1000): + """ +Returns the computed median value of a list of numbers, given the +number of bins to use for the histogram (more bins brings the computed value +closer to the median score, default number of bins = 1000). See G.W. +Heiman's Basic Stats (1st Edition), or CRC Probability & Statistics. + +Usage: lmedian (inlist, numbins=1000) +""" + (hist, smallest, binsize, extras) = histogram(inlist,numbins,[min(inlist),max(inlist)]) # make histog + cumhist = cumsum(hist) # make cumulative histogram + for i in range(len(cumhist)): # get 1st(!) index holding 50%ile score + if cumhist[i]>=len(inlist)/2.0: + cfbin = i + break + LRL = smallest + binsize*cfbin # get lower read limit of that bin + cfbelow = cumhist[cfbin-1] + freq = float(hist[cfbin]) # frequency IN the 50%ile bin + median = LRL + ((len(inlist)/2.0 - cfbelow)/float(freq))*binsize # median formula + return median + + +def lmedianscore (inlist): + """ +Returns the 'middle' score of the passed list. If there is an even +number of scores, the mean of the 2 middle scores is returned. + +Usage: lmedianscore(inlist) +""" + + newlist = copy.deepcopy(inlist) + newlist.sort() + if len(newlist) % 2 == 0: # if even number of scores, average middle 2 + index = len(newlist)/2 # integer division correct + median = float(newlist[index] + newlist[index-1]) /2 + else: + index = len(newlist)/2 # int divsion gives mid value when count from 0 + median = newlist[index] + return median + + +def lmode(inlist): + """ +Returns a list of the modal (most common) score(s) in the passed +list. If there is more than one such score, all are returned. The +bin-count for the mode(s) is also returned. + +Usage: lmode(inlist) +Returns: bin-count for mode(s), a list of modal value(s) +""" + + scores = pstat.unique(inlist) + scores.sort() + freq = [] + for item in scores: + freq.append(inlist.count(item)) + maxfreq = max(freq) + mode = [] + stillmore = 1 + while stillmore: + try: + indx = freq.index(maxfreq) + mode.append(scores[indx]) + del freq[indx] + del scores[indx] + except ValueError: + stillmore=0 + return maxfreq, mode + + +#################################### +############ MOMENTS ############# +#################################### + +def lmoment(inlist,moment=1): + """ +Calculates the nth moment about the mean for a sample (defaults to +the 1st moment). Used to calculate coefficients of skewness and kurtosis. + +Usage: lmoment(inlist,moment=1) +Returns: appropriate moment (r) from ... 1/n * SUM((inlist(i)-mean)**r) +""" + if moment == 1: + return 0.0 + else: + mn = mean(inlist) + n = len(inlist) + s = 0 + for x in inlist: + s = s + (x-mn)**moment + return s/float(n) + + +def lvariation(inlist): + """ +Returns the coefficient of variation, as defined in CRC Standard +Probability and Statistics, p.6. + +Usage: lvariation(inlist) +""" + return 100.0*samplestdev(inlist)/float(mean(inlist)) + + +def lskew(inlist): + """ +Returns the skewness of a distribution, as defined in Numerical +Recipies (alternate defn in CRC Standard Probability and Statistics, p.6.) + +Usage: lskew(inlist) +""" + return moment(inlist,3)/pow(moment(inlist,2),1.5) + + +def lkurtosis(inlist): + """ +Returns the kurtosis of a distribution, as defined in Numerical +Recipies (alternate defn in CRC Standard Probability and Statistics, p.6.) + +Usage: lkurtosis(inlist) +""" + return moment(inlist,4)/pow(moment(inlist,2),2.0) + + +def ldescribe(inlist): + """ +Returns some descriptive statistics of the passed list (assumed to be 1D). + +Usage: ldescribe(inlist) +Returns: n, mean, standard deviation, skew, kurtosis +""" + n = len(inlist) + mm = (min(inlist),max(inlist)) + m = mean(inlist) + sd = stdev(inlist) + sk = skew(inlist) + kurt = kurtosis(inlist) + return n, mm, m, sd, sk, kurt + + +#################################### +####### FREQUENCY STATS ########## +#################################### + +def litemfreq(inlist): + """ +Returns a list of pairs. Each pair consists of one of the scores in inlist +and it's frequency count. Assumes a 1D list is passed. + +Usage: litemfreq(inlist) +Returns: a 2D frequency table (col [0:n-1]=scores, col n=frequencies) +""" + scores = pstat.unique(inlist) + scores.sort() + freq = [] + for item in scores: + freq.append(inlist.count(item)) + return pstat.abut(scores, freq) + + +def lscoreatpercentile (inlist, percent): + """ +Returns the score at a given percentile relative to the distribution +given by inlist. + +Usage: lscoreatpercentile(inlist,percent) +""" + if percent > 1: + print "\nDividing percent>1 by 100 in lscoreatpercentile().\n" + percent = percent / 100.0 + targetcf = percent*len(inlist) + h, lrl, binsize, extras = histogram(inlist) + cumhist = cumsum(copy.deepcopy(h)) + for i in range(len(cumhist)): + if cumhist[i] >= targetcf: + break + score = binsize * ((targetcf - cumhist[i-1]) / float(h[i])) + (lrl+binsize*i) + return score + + +def lpercentileofscore (inlist, score,histbins=10,defaultlimits=None): + """ +Returns the percentile value of a score relative to the distribution +given by inlist. Formula depends on the values used to histogram the data(!). + +Usage: lpercentileofscore(inlist,score,histbins=10,defaultlimits=None) +""" + + h, lrl, binsize, extras = histogram(inlist,histbins,defaultlimits) + cumhist = cumsum(copy.deepcopy(h)) + i = int((score - lrl)/float(binsize)) + pct = (cumhist[i-1]+((score-(lrl+binsize*i))/float(binsize))*h[i])/float(len(inlist)) * 100 + return pct + + +def lhistogram (inlist,numbins=10,defaultreallimits=None,printextras=0): + """ +Returns (i) a list of histogram bin counts, (ii) the smallest value +of the histogram binning, and (iii) the bin width (the last 2 are not +necessarily integers). Default number of bins is 10. If no sequence object +is given for defaultreallimits, the routine picks (usually non-pretty) bins +spanning all the numbers in the inlist. + +Usage: lhistogram (inlist, numbins=10, defaultreallimits=None,suppressoutput=0) +Returns: list of bin values, lowerreallimit, binsize, extrapoints +""" + if (defaultreallimits <> None): + if type(defaultreallimits) not in [ListType,TupleType] or len(defaultreallimits)==1: # only one limit given, assumed to be lower one & upper is calc'd + lowerreallimit = defaultreallimits + upperreallimit = 1.000001 * max(inlist) + else: # assume both limits given + lowerreallimit = defaultreallimits[0] + upperreallimit = defaultreallimits[1] + binsize = (upperreallimit-lowerreallimit)/float(numbins) + else: # no limits given for histogram, both must be calc'd + estbinwidth=(max(inlist)-min(inlist))/float(numbins) +1e-6 #1=>cover all + binsize = ((max(inlist)-min(inlist)+estbinwidth))/float(numbins) + lowerreallimit = min(inlist) - binsize/2 #lower real limit,1st bin + bins = [0]*(numbins) + extrapoints = 0 + for num in inlist: + try: + if (num-lowerreallimit) < 0: + extrapoints = extrapoints + 1 + else: + bintoincrement = int((num-lowerreallimit)/float(binsize)) + bins[bintoincrement] = bins[bintoincrement] + 1 + except: + extrapoints = extrapoints + 1 + if (extrapoints > 0 and printextras == 1): + print '\nPoints outside given histogram range =',extrapoints + return (bins, lowerreallimit, binsize, extrapoints) + + +def lcumfreq(inlist,numbins=10,defaultreallimits=None): + """ +Returns a cumulative frequency histogram, using the histogram function. + +Usage: lcumfreq(inlist,numbins=10,defaultreallimits=None) +Returns: list of cumfreq bin values, lowerreallimit, binsize, extrapoints +""" + h,l,b,e = histogram(inlist,numbins,defaultreallimits) + cumhist = cumsum(copy.deepcopy(h)) + return cumhist,l,b,e + + +def lrelfreq(inlist,numbins=10,defaultreallimits=None): + """ +Returns a relative frequency histogram, using the histogram function. + +Usage: lrelfreq(inlist,numbins=10,defaultreallimits=None) +Returns: list of cumfreq bin values, lowerreallimit, binsize, extrapoints +""" + h,l,b,e = histogram(inlist,numbins,defaultreallimits) + for i in range(len(h)): + h[i] = h[i]/float(len(inlist)) + return h,l,b,e + + +#################################### +##### VARIABILITY FUNCTIONS ###### +#################################### + +def lobrientransform(*args): + """ +Computes a transform on input data (any number of columns). Used to +test for homogeneity of variance prior to running one-way stats. From +Maxwell and Delaney, p.112. + +Usage: lobrientransform(*args) +Returns: transformed data for use in an ANOVA +""" + TINY = 1e-10 + k = len(args) + n = [0.0]*k + v = [0.0]*k + m = [0.0]*k + nargs = [] + for i in range(k): + nargs.append(copy.deepcopy(args[i])) + n[i] = float(len(nargs[i])) + v[i] = var(nargs[i]) + m[i] = mean(nargs[i]) + for j in range(k): + for i in range(n[j]): + t1 = (n[j]-1.5)*n[j]*(nargs[j][i]-m[j])**2 + t2 = 0.5*v[j]*(n[j]-1.0) + t3 = (n[j]-1.0)*(n[j]-2.0) + nargs[j][i] = (t1-t2) / float(t3) + check = 1 + for j in range(k): + if v[j] - mean(nargs[j]) > TINY: + check = 0 + if check <> 1: + raise ValueError, 'Problem in obrientransform.' + else: + return nargs + + +def lsamplevar (inlist): + """ +Returns the variance of the values in the passed list using +N for the denominator (i.e., DESCRIBES the sample variance only). + +Usage: lsamplevar(inlist) +""" + n = len(inlist) + mn = mean(inlist) + deviations = [] + for item in inlist: + deviations.append(item-mn) + return ss(deviations)/float(n) + + +def lsamplestdev (inlist): + """ +Returns the standard deviation of the values in the passed list using +N for the denominator (i.e., DESCRIBES the sample stdev only). + +Usage: lsamplestdev(inlist) +""" + return math.sqrt(samplevar(inlist)) + + +def lcov (x,y, keepdims=0): + """ +Returns the estimated covariance of the values in the passed +array (i.e., N-1). Dimension can equal None (ravel array first), an +integer (the dimension over which to operate), or a sequence (operate +over multiple dimensions). Set keepdims=1 to return an array with the +same number of dimensions as inarray. + +Usage: lcov(x,y,keepdims=0) +""" + + n = len(x) + xmn = mean(x) + ymn = mean(y) + xdeviations = [0]*len(x) + ydeviations = [0]*len(y) + for i in range(len(x)): + xdeviations[i] = x[i] - xmn + ydeviations[i] = y[i] - ymn + ss = 0.0 + for i in range(len(xdeviations)): + ss = ss + xdeviations[i]*ydeviations[i] + return ss/float(n-1) + + +def lvar (inlist): + """ +Returns the variance of the values in the passed list using N-1 +for the denominator (i.e., for estimating population variance). + +Usage: lvar(inlist) +""" + n = len(inlist) + mn = mean(inlist) + deviations = [0]*len(inlist) + for i in range(len(inlist)): + deviations[i] = inlist[i] - mn + return ss(deviations)/float(n-1) + + +def lstdev (inlist): + """ +Returns the standard deviation of the values in the passed list +using N-1 in the denominator (i.e., to estimate population stdev). + +Usage: lstdev(inlist) +""" + return math.sqrt(var(inlist)) + + +def lsterr(inlist): + """ +Returns the standard error of the values in the passed list using N-1 +in the denominator (i.e., to estimate population standard error). + +Usage: lsterr(inlist) +""" + return stdev(inlist) / float(math.sqrt(len(inlist))) + + +def lsem (inlist): + """ +Returns the estimated standard error of the mean (sx-bar) of the +values in the passed list. sem = stdev / sqrt(n) + +Usage: lsem(inlist) +""" + sd = stdev(inlist) + n = len(inlist) + return sd/math.sqrt(n) + + +def lz (inlist, score): + """ +Returns the z-score for a given input score, given that score and the +list from which that score came. Not appropriate for population calculations. + +Usage: lz(inlist, score) +""" + z = (score-mean(inlist))/samplestdev(inlist) + return z + + +def lzs (inlist): + """ +Returns a list of z-scores, one for each score in the passed list. + +Usage: lzs(inlist) +""" + zscores = [] + for item in inlist: + zscores.append(z(inlist,item)) + return zscores + + +#################################### +####### TRIMMING FUNCTIONS ####### +#################################### + +def ltrimboth (l,proportiontocut): + """ +Slices off the passed proportion of items from BOTH ends of the passed +list (i.e., with proportiontocut=0.1, slices 'leftmost' 10% AND 'rightmost' +10% of scores. Assumes list is sorted by magnitude. Slices off LESS if +proportion results in a non-integer slice index (i.e., conservatively +slices off proportiontocut). + +Usage: ltrimboth (l,proportiontocut) +Returns: trimmed version of list l +""" + lowercut = int(proportiontocut*len(l)) + uppercut = len(l) - lowercut + return l[lowercut:uppercut] + + +def ltrim1 (l,proportiontocut,tail='right'): + """ +Slices off the passed proportion of items from ONE end of the passed +list (i.e., if proportiontocut=0.1, slices off 'leftmost' or 'rightmost' +10% of scores). Slices off LESS if proportion results in a non-integer +slice index (i.e., conservatively slices off proportiontocut). + +Usage: ltrim1 (l,proportiontocut,tail='right') or set tail='left' +Returns: trimmed version of list l +""" + if tail == 'right': + lowercut = 0 + uppercut = len(l) - int(proportiontocut*len(l)) + elif tail == 'left': + lowercut = int(proportiontocut*len(l)) + uppercut = len(l) + return l[lowercut:uppercut] + + +#################################### +##### CORRELATION FUNCTIONS ###### +#################################### + +def lpaired(x,y): + """ +Interactively determines the type of data and then runs the +appropriated statistic for paired group data. + +Usage: lpaired(x,y) +Returns: appropriate statistic name, value, and probability +""" + samples = '' + while samples not in ['i','r','I','R','c','C']: + print '\nIndependent or related samples, or correlation (i,r,c): ', + samples = raw_input() + + if samples in ['i','I','r','R']: + print '\nComparing variances ...', +# USE O'BRIEN'S TEST FOR HOMOGENEITY OF VARIANCE, Maxwell & delaney, p.112 + r = obrientransform(x,y) + f,p = F_oneway(pstat.colex(r,0),pstat.colex(r,1)) + if p<0.05: + vartype='unequal, p='+str(round(p,4)) + else: + vartype='equal' + print vartype + if samples in ['i','I']: + if vartype[0]=='e': + t,p = ttest_ind(x,y,0) + print '\nIndependent samples t-test: ', round(t,4),round(p,4) + else: + if len(x)>20 or len(y)>20: + z,p = ranksums(x,y) + print '\nRank Sums test (NONparametric, n>20): ', round(z,4),round(p,4) + else: + u,p = mannwhitneyu(x,y) + print '\nMann-Whitney U-test (NONparametric, ns<20): ', round(u,4),round(p,4) + + else: # RELATED SAMPLES + if vartype[0]=='e': + t,p = ttest_rel(x,y,0) + print '\nRelated samples t-test: ', round(t,4),round(p,4) + else: + t,p = ranksums(x,y) + print '\nWilcoxon T-test (NONparametric): ', round(t,4),round(p,4) + else: # CORRELATION ANALYSIS + corrtype = '' + while corrtype not in ['c','C','r','R','d','D']: + print '\nIs the data Continuous, Ranked, or Dichotomous (c,r,d): ', + corrtype = raw_input() + if corrtype in ['c','C']: + m,b,r,p,see = linregress(x,y) + print '\nLinear regression for continuous variables ...' + lol = [['Slope','Intercept','r','Prob','SEestimate'],[round(m,4),round(b,4),round(r,4),round(p,4),round(see,4)]] + pstat.printcc(lol) + elif corrtype in ['r','R']: + r,p = spearmanr(x,y) + print '\nCorrelation for ranked variables ...' + print "Spearman's r: ",round(r,4),round(p,4) + else: # DICHOTOMOUS + r,p = pointbiserialr(x,y) + print '\nAssuming x contains a dichotomous variable ...' + print 'Point Biserial r: ',round(r,4),round(p,4) + print '\n\n' + return None + + +def lpearsonr(x,y): + """ +Calculates a Pearson correlation coefficient and the associated +probability value. Taken from Heiman's Basic Statistics for the Behav. +Sci (2nd), p.195. + +Usage: lpearsonr(x,y) where x and y are equal-length lists +Returns: Pearson's r value, two-tailed p-value +""" + TINY = 1.0e-30 + if len(x) <> len(y): + raise ValueError, 'Input values not paired in pearsonr. Aborting.' + n = len(x) + x = map(float,x) + y = map(float,y) + xmean = mean(x) + ymean = mean(y) + r_num = n*(summult(x,y)) - sum(x)*sum(y) + r_den = math.sqrt((n*ss(x) - square_of_sums(x))*(n*ss(y)-square_of_sums(y))) + r = (r_num / r_den) # denominator already a float + df = n-2 + t = r*math.sqrt(df/((1.0-r+TINY)*(1.0+r+TINY))) + prob = betai(0.5*df,0.5,df/float(df+t*t)) + return r, prob + + +def llincc(x,y): + """ +Calculates Lin's concordance correlation coefficient. + +Usage: alincc(x,y) where x, y are equal-length arrays +Returns: Lin's CC +""" + covar = lcov(x,y)*(len(x)-1)/float(len(x)) # correct denom to n + xvar = lvar(x)*(len(x)-1)/float(len(x)) # correct denom to n + yvar = lvar(y)*(len(y)-1)/float(len(y)) # correct denom to n + lincc = (2 * covar) / ((xvar+yvar) +((amean(x)-amean(y))**2)) + return lincc + + +def lspearmanr(x,y): + """ +Calculates a Spearman rank-order correlation coefficient. Taken +from Heiman's Basic Statistics for the Behav. Sci (1st), p.192. + +Usage: lspearmanr(x,y) where x and y are equal-length lists +Returns: Spearman's r, two-tailed p-value +""" + TINY = 1e-30 + if len(x) <> len(y): + raise ValueError, 'Input values not paired in spearmanr. Aborting.' + n = len(x) + rankx = rankdata(x) + ranky = rankdata(y) + dsq = sumdiffsquared(rankx,ranky) + rs = 1 - 6*dsq / float(n*(n**2-1)) + t = rs * math.sqrt((n-2) / ((rs+1.0)*(1.0-rs))) + df = n-2 + probrs = betai(0.5*df,0.5,df/(df+t*t)) # t already a float +# probability values for rs are from part 2 of the spearman function in +# Numerical Recipies, p.510. They are close to tables, but not exact. (?) + return rs, probrs + + +def lpointbiserialr(x,y): + """ +Calculates a point-biserial correlation coefficient and the associated +probability value. Taken from Heiman's Basic Statistics for the Behav. +Sci (1st), p.194. + +Usage: lpointbiserialr(x,y) where x,y are equal-length lists +Returns: Point-biserial r, two-tailed p-value +""" + TINY = 1e-30 + if len(x) <> len(y): + raise ValueError, 'INPUT VALUES NOT PAIRED IN pointbiserialr. ABORTING.' + data = pstat.abut(x,y) + categories = pstat.unique(x) + if len(categories) <> 2: + raise ValueError, "Exactly 2 categories required for pointbiserialr()." + else: # there are 2 categories, continue + codemap = pstat.abut(categories,range(2)) + recoded = pstat.recode(data,codemap,0) + x = pstat.linexand(data,0,categories[0]) + y = pstat.linexand(data,0,categories[1]) + xmean = mean(pstat.colex(x,1)) + ymean = mean(pstat.colex(y,1)) + n = len(data) + adjust = math.sqrt((len(x)/float(n))*(len(y)/float(n))) + rpb = (ymean - xmean)/samplestdev(pstat.colex(data,1))*adjust + df = n-2 + t = rpb*math.sqrt(df/((1.0-rpb+TINY)*(1.0+rpb+TINY))) + prob = betai(0.5*df,0.5,df/(df+t*t)) # t already a float + return rpb, prob + + +def lkendalltau(x,y): + """ +Calculates Kendall's tau ... correlation of ordinal data. Adapted +from function kendl1 in Numerical Recipies. Needs good test-routine.@@@ + +Usage: lkendalltau(x,y) +Returns: Kendall's tau, two-tailed p-value +""" + n1 = 0 + n2 = 0 + iss = 0 + for j in range(len(x)-1): + for k in range(j,len(y)): + a1 = x[j] - x[k] + a2 = y[j] - y[k] + aa = a1 * a2 + if (aa): # neither list has a tie + n1 = n1 + 1 + n2 = n2 + 1 + if aa > 0: + iss = iss + 1 + else: + iss = iss -1 + else: + if (a1): + n1 = n1 + 1 + else: + n2 = n2 + 1 + tau = iss / math.sqrt(n1*n2) + svar = (4.0*len(x)+10.0) / (9.0*len(x)*(len(x)-1)) + z = tau / math.sqrt(svar) + prob = erfcc(abs(z)/1.4142136) + return tau, prob + + +def llinregress(x,y): + """ +Calculates a regression line on x,y pairs. + +Usage: llinregress(x,y) x,y are equal-length lists of x-y coordinates +Returns: slope, intercept, r, two-tailed prob, sterr-of-estimate +""" + TINY = 1.0e-20 + if len(x) <> len(y): + raise ValueError, 'Input values not paired in linregress. Aborting.' + n = len(x) + x = map(float,x) + y = map(float,y) + xmean = mean(x) + ymean = mean(y) + r_num = float(n*(summult(x,y)) - sum(x)*sum(y)) + r_den = math.sqrt((n*ss(x) - square_of_sums(x))*(n*ss(y)-square_of_sums(y))) + r = r_num / r_den + z = 0.5*math.log((1.0+r+TINY)/(1.0-r+TINY)) + df = n-2 + t = r*math.sqrt(df/((1.0-r+TINY)*(1.0+r+TINY))) + prob = betai(0.5*df,0.5,df/(df+t*t)) + slope = r_num / float(n*ss(x) - square_of_sums(x)) + intercept = ymean - slope*xmean + sterrest = math.sqrt(1-r*r)*samplestdev(y) + return slope, intercept, r, prob, sterrest + + +#################################### +##### INFERENTIAL STATISTICS ##### +#################################### + +def lttest_1samp(a,popmean,printit=0,name='Sample',writemode='a'): + """ +Calculates the t-obtained for the independent samples T-test on ONE group +of scores a, given a population mean. If printit=1, results are printed +to the screen. If printit='filename', the results are output to 'filename' +using the given writemode (default=append). Returns t-value, and prob. + +Usage: lttest_1samp(a,popmean,Name='Sample',printit=0,writemode='a') +Returns: t-value, two-tailed prob +""" + x = mean(a) + v = var(a) + n = len(a) + df = n-1 + svar = ((n-1)*v)/float(df) + t = (x-popmean)/math.sqrt(svar*(1.0/n)) + prob = betai(0.5*df,0.5,float(df)/(df+t*t)) + + if printit <> 0: + statname = 'Single-sample T-test.' + outputpairedstats(printit,writemode, + 'Population','--',popmean,0,0,0, + name,n,x,v,min(a),max(a), + statname,t,prob) + return t,prob + + +def lttest_ind (a, b, printit=0, name1='Samp1', name2='Samp2', writemode='a'): + """ +Calculates the t-obtained T-test on TWO INDEPENDENT samples of +scores a, and b. From Numerical Recipies, p.483. If printit=1, results +are printed to the screen. If printit='filename', the results are output +to 'filename' using the given writemode (default=append). Returns t-value, +and prob. + +Usage: lttest_ind(a,b,printit=0,name1='Samp1',name2='Samp2',writemode='a') +Returns: t-value, two-tailed prob +""" + x1 = mean(a) + x2 = mean(b) + v1 = stdev(a)**2 + v2 = stdev(b)**2 + n1 = len(a) + n2 = len(b) + df = n1+n2-2 + svar = ((n1-1)*v1+(n2-1)*v2)/float(df) + if not svar: + svar = 1.0e-26 + t = (x1-x2)/math.sqrt(svar*(1.0/n1 + 1.0/n2)) + prob = betai(0.5*df,0.5,df/(df+t*t)) + + if printit <> 0: + statname = 'Independent samples T-test.' + outputpairedstats(printit,writemode, + name1,n1,x1,v1,min(a),max(a), + name2,n2,x2,v2,min(b),max(b), + statname,t,prob) + return t,prob + + +def lttest_rel (a,b,printit=0,name1='Sample1',name2='Sample2',writemode='a'): + """ +Calculates the t-obtained T-test on TWO RELATED samples of scores, +a and b. From Numerical Recipies, p.483. If printit=1, results are +printed to the screen. If printit='filename', the results are output to +'filename' using the given writemode (default=append). Returns t-value, +and prob. + +Usage: lttest_rel(a,b,printit=0,name1='Sample1',name2='Sample2',writemode='a') +Returns: t-value, two-tailed prob +""" + if len(a)<>len(b): + raise ValueError, 'Unequal length lists in ttest_rel.' + x1 = mean(a) + x2 = mean(b) + v1 = var(a) + v2 = var(b) + n = len(a) + cov = 0 + for i in range(len(a)): + cov = cov + (a[i]-x1) * (b[i]-x2) + df = n-1 + cov = cov / float(df) + sd = math.sqrt((v1+v2 - 2.0*cov)/float(n)) + t = (x1-x2)/sd + prob = betai(0.5*df,0.5,df/(df+t*t)) + + if printit <> 0: + statname = 'Related samples T-test.' + outputpairedstats(printit,writemode, + name1,n,x1,v1,min(a),max(a), + name2,n,x2,v2,min(b),max(b), + statname,t,prob) + return t, prob + + +def lchisquare(f_obs,f_exp=None): + """ +Calculates a one-way chi square for list of observed frequencies and returns +the result. If no expected frequencies are given, the total N is assumed to +be equally distributed across all groups. + +Usage: lchisquare(f_obs, f_exp=None) f_obs = list of observed cell freq. +Returns: chisquare-statistic, associated p-value +""" + k = len(f_obs) # number of groups + if f_exp == None: + f_exp = [sum(f_obs)/float(k)] * len(f_obs) # create k bins with = freq. + chisq = 0 + for i in range(len(f_obs)): + chisq = chisq + (f_obs[i]-f_exp[i])**2 / float(f_exp[i]) + return chisq, chisqprob(chisq, k-1) + + +def lks_2samp (data1,data2): + """ +Computes the Kolmogorov-Smirnof statistic on 2 samples. From +Numerical Recipies in C, page 493. + +Usage: lks_2samp(data1,data2) data1&2 are lists of values for 2 conditions +Returns: KS D-value, associated p-value +""" + j1 = 0 + j2 = 0 + fn1 = 0.0 + fn2 = 0.0 + n1 = len(data1) + n2 = len(data2) + en1 = n1 + en2 = n2 + d = 0.0 + data1.sort() + data2.sort() + while j1 < n1 and j2 < n2: + d1=data1[j1] + d2=data2[j2] + if d1 <= d2: + fn1 = (j1)/float(en1) + j1 = j1 + 1 + if d2 <= d1: + fn2 = (j2)/float(en2) + j2 = j2 + 1 + dt = (fn2-fn1) + if math.fabs(dt) > math.fabs(d): + d = dt + try: + en = math.sqrt(en1*en2/float(en1+en2)) + prob = ksprob((en+0.12+0.11/en)*abs(d)) + except: + prob = 1.0 + return d, prob + + +def lmannwhitneyu(x,y): + """ +Calculates a Mann-Whitney U statistic on the provided scores and +returns the result. Use only when the n in each condition is < 20 and +you have 2 independent samples of ranks. NOTE: Mann-Whitney U is +significant if the u-obtained is LESS THAN or equal to the critical +value of U found in the tables. Equivalent to Kruskal-Wallis H with +just 2 groups. + +Usage: lmannwhitneyu(data) +Returns: u-statistic, one-tailed p-value (i.e., p(z(U))) +""" + n1 = len(x) + n2 = len(y) + ranked = rankdata(x+y) + rankx = ranked[0:n1] # get the x-ranks + ranky = ranked[n1:] # the rest are y-ranks + u1 = n1*n2 + (n1*(n1+1))/2.0 - sum(rankx) # calc U for x + u2 = n1*n2 - u1 # remainder is U for y + bigu = max(u1,u2) + smallu = min(u1,u2) + proportion = bigu/float(n1*n2) + T = math.sqrt(tiecorrect(ranked)) # correction factor for tied scores + if T == 0: + raise ValueError, 'All numbers are identical in lmannwhitneyu' + sd = math.sqrt(T*n1*n2*(n1+n2+1)/12.0) + z = abs((bigu-n1*n2/2.0) / sd) # normal approximation for prob calc + return smallu, 1.0 - zprob(z) #, proportion + + +def ltiecorrect(rankvals): + """ +Corrects for ties in Mann Whitney U and Kruskal Wallis H tests. See +Siegel, S. (1956) Nonparametric Statistics for the Behavioral Sciences. +New York: McGraw-Hill. Code adapted from |Stat rankind.c code. + +Usage: ltiecorrect(rankvals) +Returns: T correction factor for U or H +""" + sorted,posn = shellsort(rankvals) + n = len(sorted) + T = 0.0 + i = 0 + while (i<n-1): + if sorted[i] == sorted[i+1]: + nties = 1 + while (i<n-1) and (sorted[i] == sorted[i+1]): + nties = nties +1 + i = i +1 + T = T + nties**3 - nties + i = i+1 + T = T / float(n**3-n) + return 1.0 - T + + +def lranksums(x,y): + """ +Calculates the rank sums statistic on the provided scores and +returns the result. Use only when the n in each condition is > 20 and you +have 2 independent samples of ranks. + +Usage: lranksums(x,y) +Returns: a z-statistic, two-tailed p-value +""" + n1 = len(x) + n2 = len(y) + alldata = x+y + ranked = rankdata(alldata) + x = ranked[:n1] + y = ranked[n1:] + s = sum(x) + expected = n1*(n1+n2+1) / 2.0 + z = (s - expected) / math.sqrt(n1*n2*(n1+n2+1)/12.0) + prob = 2*(1.0 -zprob(abs(z))) + return z, prob + + +def lwilcoxont(x,y): + """ +Calculates the Wilcoxon T-test for related samples and returns the +result. A non-parametric T-test. + +Usage: lwilcoxont(x,y) +Returns: a t-statistic, two-tail probability estimate +""" + if len(x) <> len(y): + raise ValueError, 'Unequal N in wilcoxont. Aborting.' + d=[] + for i in range(len(x)): + diff = x[i] - y[i] + if diff <> 0: + d.append(diff) + count = len(d) + absd = map(abs,d) + absranked = rankdata(absd) + r_plus = 0.0 + r_minus = 0.0 + for i in range(len(absd)): + if d[i] < 0: + r_minus = r_minus + absranked[i] + else: + r_plus = r_plus + absranked[i] + wt = min(r_plus, r_minus) + mn = count * (count+1) * 0.25 + se = math.sqrt(count*(count+1)*(2.0*count+1.0)/24.0) + z = math.fabs(wt-mn) / se + prob = 2*(1.0 -zprob(abs(z))) + return wt, prob + + +def lkruskalwallish(*args): + """ +The Kruskal-Wallis H-test is a non-parametric ANOVA for 3 or more +groups, requiring at least 5 subjects in each group. This function +calculates the Kruskal-Wallis H-test for 3 or more independent samples +and returns the result. + +Usage: lkruskalwallish(*args) +Returns: H-statistic (corrected for ties), associated p-value +""" + args = list(args) + n = [0]*len(args) + all = [] + n = map(len,args) + for i in range(len(args)): + all = all + args[i] + ranked = rankdata(all) + T = tiecorrect(ranked) + for i in range(len(args)): + args[i] = ranked[0:n[i]] + del ranked[0:n[i]] + rsums = [] + for i in range(len(args)): + rsums.append(sum(args[i])**2) + rsums[i] = rsums[i] / float(n[i]) + ssbn = sum(rsums) + totaln = sum(n) + h = 12.0 / (totaln*(totaln+1)) * ssbn - 3*(totaln+1) + df = len(args) - 1 + if T == 0: + raise ValueError, 'All numbers are identical in lkruskalwallish' + h = h / float(T) + return h, chisqprob(h,df) + + +def lfriedmanchisquare(*args): + """ +Friedman Chi-Square is a non-parametric, one-way within-subjects +ANOVA. This function calculates the Friedman Chi-square test for repeated +measures and returns the result, along with the associated probability +value. It assumes 3 or more repeated measures. Only 3 levels requires a +minimum of 10 subjects in the study. Four levels requires 5 subjects per +level(??). + +Usage: lfriedmanchisquare(*args) +Returns: chi-square statistic, associated p-value +""" + k = len(args) + if k < 3: + raise ValueError, 'Less than 3 levels. Friedman test not appropriate.' + n = len(args[0]) + data = apply(pstat.abut,tuple(args)) + for i in range(len(data)): + data[i] = rankdata(data[i]) + ssbn = 0 + for i in range(k): + ssbn = ssbn + sum(args[i])**2 + chisq = 12.0 / (k*n*(k+1)) * ssbn - 3*n*(k+1) + return chisq, chisqprob(chisq,k-1) + + +#################################### +#### PROBABILITY CALCULATIONS #### +#################################### + +def lchisqprob(chisq,df): + """ +Returns the (1-tailed) probability value associated with the provided +chi-square value and df. Adapted from chisq.c in Gary Perlman's |Stat. + +Usage: lchisqprob(chisq,df) +""" + BIG = 20.0 + def ex(x): + BIG = 20.0 + if x < -BIG: + return 0.0 + else: + return math.exp(x) + + if chisq <=0 or df < 1: + return 1.0 + a = 0.5 * chisq + if df%2 == 0: + even = 1 + else: + even = 0 + if df > 1: + y = ex(-a) + if even: + s = y + else: + s = 2.0 * zprob(-math.sqrt(chisq)) + if (df > 2): + chisq = 0.5 * (df - 1.0) + if even: + z = 1.0 + else: + z = 0.5 + if a > BIG: + if even: + e = 0.0 + else: + e = math.log(math.sqrt(math.pi)) + c = math.log(a) + while (z <= chisq): + e = math.log(z) + e + s = s + ex(c*z-a-e) + z = z + 1.0 + return s + else: + if even: + e = 1.0 + else: + e = 1.0 / math.sqrt(math.pi) / math.sqrt(a) + c = 0.0 + while (z <= chisq): + e = e * (a/float(z)) + c = c + e + z = z + 1.0 + return (c*y+s) + else: + return s + + +def lerfcc(x): + """ +Returns the complementary error function erfc(x) with fractional +error everywhere less than 1.2e-7. Adapted from Numerical Recipies. + +Usage: lerfcc(x) +""" + z = abs(x) + t = 1.0 / (1.0+0.5*z) + ans = t * math.exp(-z*z-1.26551223 + t*(1.00002368+t*(0.37409196+t*(0.09678418+t*(-0.18628806+t*(0.27886807+t*(-1.13520398+t*(1.48851587+t*(-0.82215223+t*0.17087277))))))))) + if x >= 0: + return ans + else: + return 2.0 - ans + + +def lzprob(z): + """ +Returns the area under the normal curve 'to the left of' the given z value. +Thus, + for z<0, zprob(z) = 1-tail probability + for z>0, 1.0-zprob(z) = 1-tail probability + for any z, 2.0*(1.0-zprob(abs(z))) = 2-tail probability +Adapted from z.c in Gary Perlman's |Stat. + +Usage: lzprob(z) +""" + Z_MAX = 6.0 # maximum meaningful z-value + if z == 0.0: + x = 0.0 + else: + y = 0.5 * math.fabs(z) + if y >= (Z_MAX*0.5): + x = 1.0 + elif (y < 1.0): + w = y*y + x = ((((((((0.000124818987 * w + -0.001075204047) * w +0.005198775019) * w + -0.019198292004) * w +0.059054035642) * w + -0.151968751364) * w +0.319152932694) * w + -0.531923007300) * w +0.797884560593) * y * 2.0 + else: + y = y - 2.0 + x = (((((((((((((-0.000045255659 * y + +0.000152529290) * y -0.000019538132) * y + -0.000676904986) * y +0.001390604284) * y + -0.000794620820) * y -0.002034254874) * y + +0.006549791214) * y -0.010557625006) * y + +0.011630447319) * y -0.009279453341) * y + +0.005353579108) * y -0.002141268741) * y + +0.000535310849) * y +0.999936657524 + if z > 0.0: + prob = ((x+1.0)*0.5) + else: + prob = ((1.0-x)*0.5) + return prob + + +def lksprob(alam): + """ +Computes a Kolmolgorov-Smirnov t-test significance level. Adapted from +Numerical Recipies. + +Usage: lksprob(alam) +""" + fac = 2.0 + sum = 0.0 + termbf = 0.0 + a2 = -2.0*alam*alam + for j in range(1,201): + term = fac*math.exp(a2*j*j) + sum = sum + term + if math.fabs(term) <= (0.001*termbf) or math.fabs(term) < (1.0e-8*sum): + return sum + fac = -fac + termbf = math.fabs(term) + return 1.0 # Get here only if fails to converge; was 0.0!! + + +def lfprob (dfnum, dfden, F): + """ +Returns the (1-tailed) significance level (p-value) of an F +statistic given the degrees of freedom for the numerator (dfR-dfF) and +the degrees of freedom for the denominator (dfF). + +Usage: lfprob(dfnum, dfden, F) where usually dfnum=dfbn, dfden=dfwn +""" + p = betai(0.5*dfden, 0.5*dfnum, dfden/float(dfden+dfnum*F)) + return p + + +def lbetacf(a,b,x): + """ +This function evaluates the continued fraction form of the incomplete +Beta function, betai. (Adapted from: Numerical Recipies in C.) + +Usage: lbetacf(a,b,x) +""" + ITMAX = 200 + EPS = 3.0e-7 + + bm = az = am = 1.0 + qab = a+b + qap = a+1.0 + qam = a-1.0 + bz = 1.0-qab*x/qap + for i in range(ITMAX+1): + em = float(i+1) + tem = em + em + d = em*(b-em)*x/((qam+tem)*(a+tem)) + ap = az + d*am + bp = bz+d*bm + d = -(a+em)*(qab+em)*x/((qap+tem)*(a+tem)) + app = ap+d*az + bpp = bp+d*bz + aold = az + am = ap/bpp + bm = bp/bpp + az = app/bpp + bz = 1.0 + if (abs(az-aold)<(EPS*abs(az))): + return az + print 'a or b too big, or ITMAX too small in Betacf.' + + +def lgammln(xx): + """ +Returns the gamma function of xx. + Gamma(z) = Integral(0,infinity) of t^(z-1)exp(-t) dt. +(Adapted from: Numerical Recipies in C.) + +Usage: lgammln(xx) +""" + + coeff = [76.18009173, -86.50532033, 24.01409822, -1.231739516, + 0.120858003e-2, -0.536382e-5] + x = xx - 1.0 + tmp = x + 5.5 + tmp = tmp - (x+0.5)*math.log(tmp) + ser = 1.0 + for j in range(len(coeff)): + x = x + 1 + ser = ser + coeff[j]/x + return -tmp + math.log(2.50662827465*ser) + + +def lbetai(a,b,x): + """ +Returns the incomplete beta function: + + I-sub-x(a,b) = 1/B(a,b)*(Integral(0,x) of t^(a-1)(1-t)^(b-1) dt) + +where a,b>0 and B(a,b) = G(a)*G(b)/(G(a+b)) where G(a) is the gamma +function of a. The continued fraction formulation is implemented here, +using the betacf function. (Adapted from: Numerical Recipies in C.) + +Usage: lbetai(a,b,x) +""" + if (x<0.0 or x>1.0): + raise ValueError, 'Bad x in lbetai' + if (x==0.0 or x==1.0): + bt = 0.0 + else: + bt = math.exp(gammln(a+b)-gammln(a)-gammln(b)+a*math.log(x)+b* + math.log(1.0-x)) + if (x<(a+1.0)/(a+b+2.0)): + return bt*betacf(a,b,x)/float(a) + else: + return 1.0-bt*betacf(b,a,1.0-x)/float(b) + + +#################################### +####### ANOVA CALCULATIONS ####### +#################################### + +def lF_oneway(*lists): + """ +Performs a 1-way ANOVA, returning an F-value and probability given +any number of groups. From Heiman, pp.394-7. + +Usage: F_oneway(*lists) where *lists is any number of lists, one per + treatment group +Returns: F value, one-tailed p-value +""" + a = len(lists) # ANOVA on 'a' groups, each in it's own list + means = [0]*a + vars = [0]*a + ns = [0]*a + alldata = [] + tmp = map(N.array,lists) + means = map(amean,tmp) + vars = map(avar,tmp) + ns = map(len,lists) + for i in range(len(lists)): + alldata = alldata + lists[i] + alldata = N.array(alldata) + bign = len(alldata) + sstot = ass(alldata)-(asquare_of_sums(alldata)/float(bign)) + ssbn = 0 + for list in lists: + ssbn = ssbn + asquare_of_sums(N.array(list))/float(len(list)) + ssbn = ssbn - (asquare_of_sums(alldata)/float(bign)) + sswn = sstot-ssbn + dfbn = a-1 + dfwn = bign - a + msb = ssbn/float(dfbn) + msw = sswn/float(dfwn) + f = msb/msw + prob = fprob(dfbn,dfwn,f) + return f, prob + + +def lF_value (ER,EF,dfnum,dfden): + """ +Returns an F-statistic given the following: + ER = error associated with the null hypothesis (the Restricted model) + EF = error associated with the alternate hypothesis (the Full model) + dfR-dfF = degrees of freedom of the numerator + dfF = degrees of freedom associated with the denominator/Full model + +Usage: lF_value(ER,EF,dfnum,dfden) +""" + return ((ER-EF)/float(dfnum) / (EF/float(dfden))) + + +#################################### +######## SUPPORT FUNCTIONS ####### +#################################### + +def writecc (listoflists,file,writetype='w',extra=2): + """ +Writes a list of lists to a file in columns, customized by the max +size of items within the columns (max size of items in col, +2 characters) +to specified file. File-overwrite is the default. + +Usage: writecc (listoflists,file,writetype='w',extra=2) +Returns: None +""" + if type(listoflists[0]) not in [ListType,TupleType]: + listoflists = [listoflists] + outfile = open(file,writetype) + rowstokill = [] + list2print = copy.deepcopy(listoflists) + for i in range(len(listoflists)): + if listoflists[i] == ['\n'] or listoflists[i]=='\n' or listoflists[i]=='dashes': + rowstokill = rowstokill + [i] + rowstokill.reverse() + for row in rowstokill: + del list2print[row] + maxsize = [0]*len(list2print[0]) + for col in range(len(list2print[0])): + items = pstat.colex(list2print,col) + items = map(pstat.makestr,items) + maxsize[col] = max(map(len,items)) + extra + for row in listoflists: + if row == ['\n'] or row == '\n': + outfile.write('\n') + elif row == ['dashes'] or row == 'dashes': + dashes = [0]*len(maxsize) + for j in range(len(maxsize)): + dashes[j] = '-'*(maxsize[j]-2) + outfile.write(pstat.lineincustcols(dashes,maxsize)) + else: + outfile.write(pstat.lineincustcols(row,maxsize)) + outfile.write('\n') + outfile.close() + return None + + +def lincr(l,cap): # to increment a list up to a max-list of 'cap' + """ +Simulate a counting system from an n-dimensional list. + +Usage: lincr(l,cap) l=list to increment, cap=max values for each list pos'n +Returns: next set of values for list l, OR -1 (if overflow) +""" + l[0] = l[0] + 1 # e.g., [0,0,0] --> [2,4,3] (=cap) + for i in range(len(l)): + if l[i] > cap[i] and i < len(l)-1: # if carryover AND not done + l[i] = 0 + l[i+1] = l[i+1] + 1 + elif l[i] > cap[i] and i == len(l)-1: # overflow past last column, must be finished + l = -1 + return l + + +def lsum (inlist): + """ +Returns the sum of the items in the passed list. + +Usage: lsum(inlist) +""" + s = 0 + for item in inlist: + s = s + item + return s + + +def lcumsum (inlist): + """ +Returns a list consisting of the cumulative sum of the items in the +passed list. + +Usage: lcumsum(inlist) +""" + newlist = copy.deepcopy(inlist) + for i in range(1,len(newlist)): + newlist[i] = newlist[i] + newlist[i-1] + return newlist + + +def lss(inlist): + """ +Squares each value in the passed list, adds up these squares and +returns the result. + +Usage: lss(inlist) +""" + ss = 0 + for item in inlist: + ss = ss + item*item + return ss + + +def lsummult (list1,list2): + """ +Multiplies elements in list1 and list2, element by element, and +returns the sum of all resulting multiplications. Must provide equal +length lists. + +Usage: lsummult(list1,list2) +""" + if len(list1) <> len(list2): + raise ValueError, "Lists not equal length in summult." + s = 0 + for item1,item2 in pstat.abut(list1,list2): + s = s + item1*item2 + return s + + +def lsumdiffsquared(x,y): + """ +Takes pairwise differences of the values in lists x and y, squares +these differences, and returns the sum of these squares. + +Usage: lsumdiffsquared(x,y) +Returns: sum[(x[i]-y[i])**2] +""" + sds = 0 + for i in range(len(x)): + sds = sds + (x[i]-y[i])**2 + return sds + + +def lsquare_of_sums(inlist): + """ +Adds the values in the passed list, squares the sum, and returns +the result. + +Usage: lsquare_of_sums(inlist) +Returns: sum(inlist[i])**2 +""" + s = sum(inlist) + return float(s)*s + + +def lshellsort(inlist): + """ +Shellsort algorithm. Sorts a 1D-list. + +Usage: lshellsort(inlist) +Returns: sorted-inlist, sorting-index-vector (for original list) +""" + n = len(inlist) + svec = copy.deepcopy(inlist) + ivec = range(n) + gap = n/2 # integer division needed + while gap >0: + for i in range(gap,n): + for j in range(i-gap,-1,-gap): + while j>=0 and svec[j]>svec[j+gap]: + temp = svec[j] + svec[j] = svec[j+gap] + svec[j+gap] = temp + itemp = ivec[j] + ivec[j] = ivec[j+gap] + ivec[j+gap] = itemp + gap = gap / 2 # integer division needed +# svec is now sorted inlist, and ivec has the order svec[i] = vec[ivec[i]] + return svec, ivec + + +def lrankdata(inlist): + """ +Ranks the data in inlist, dealing with ties appropritely. Assumes +a 1D inlist. Adapted from Gary Perlman's |Stat ranksort. + +Usage: lrankdata(inlist) +Returns: a list of length equal to inlist, containing rank scores +""" + n = len(inlist) + svec, ivec = shellsort(inlist) + sumranks = 0 + dupcount = 0 + newlist = [0]*n + for i in range(n): + sumranks = sumranks + i + dupcount = dupcount + 1 + if i==n-1 or svec[i] <> svec[i+1]: + averank = sumranks / float(dupcount) + 1 + for j in range(i-dupcount+1,i+1): + newlist[ivec[j]] = averank + sumranks = 0 + dupcount = 0 + return newlist + + +def outputpairedstats(fname,writemode,name1,n1,m1,se1,min1,max1,name2,n2,m2,se2,min2,max2,statname,stat,prob): + """ +Prints or write to a file stats for two groups, using the name, n, +mean, sterr, min and max for each group, as well as the statistic name, +its value, and the associated p-value. + +Usage: outputpairedstats(fname,writemode, + name1,n1,mean1,stderr1,min1,max1, + name2,n2,mean2,stderr2,min2,max2, + statname,stat,prob) +Returns: None +""" + suffix = '' # for *s after the p-value + try: + x = prob.shape + prob = prob[0] + except: + pass + if prob < 0.001: suffix = ' ***' + elif prob < 0.01: suffix = ' **' + elif prob < 0.05: suffix = ' *' + title = [['Name','N','Mean','SD','Min','Max']] + lofl = title+[[name1,n1,round(m1,3),round(math.sqrt(se1),3),min1,max1], + [name2,n2,round(m2,3),round(math.sqrt(se2),3),min2,max2]] + if type(fname)<>StringType or len(fname)==0: + print + print statname + print + pstat.printcc(lofl) + print + try: + if stat.shape == (): + stat = stat[0] + if prob.shape == (): + prob = prob[0] + except: + pass + print 'Test statistic = ',round(stat,3),' p = ',round(prob,3),suffix + print + else: + file = open(fname,writemode) + file.write('\n'+statname+'\n\n') + file.close() + writecc(lofl,fname,'a') + file = open(fname,'a') + try: + if stat.shape == (): + stat = stat[0] + if prob.shape == (): + prob = prob[0] + except: + pass + file.write(pstat.list2string(['\nTest statistic = ',round(stat,4),' p = ',round(prob,4),suffix,'\n\n'])) + file.close() + return None + + +def lfindwithin (data): + """ +Returns an integer representing a binary vector, where 1=within- +subject factor, 0=between. Input equals the entire data 2D list (i.e., +column 0=random factor, column -1=measured values (those two are skipped). +Note: input data is in |Stat format ... a list of lists ("2D list") with +one row per measured value, first column=subject identifier, last column= +score, one in-between column per factor (these columns contain level +designations on each factor). See also stats.anova.__doc__. + +Usage: lfindwithin(data) data in |Stat format +""" + + numfact = len(data[0])-1 + withinvec = 0 + for col in range(1,numfact): + examplelevel = pstat.unique(pstat.colex(data,col))[0] + rows = pstat.linexand(data,col,examplelevel) # get 1 level of this factor + factsubjs = pstat.unique(pstat.colex(rows,0)) + allsubjs = pstat.unique(pstat.colex(data,0)) + if len(factsubjs) == len(allsubjs): # fewer Ss than scores on this factor? + withinvec = withinvec + (1 << col) + return withinvec + + +######################################################### +######################################################### +####### DISPATCH LISTS AND TUPLES TO ABOVE FCNS ######### +######################################################### +######################################################### + +## CENTRAL TENDENCY: +geometricmean = Dispatch ( (lgeometricmean, (ListType, TupleType)), ) +harmonicmean = Dispatch ( (lharmonicmean, (ListType, TupleType)), ) +mean = Dispatch ( (lmean, (ListType, TupleType)), ) +median = Dispatch ( (lmedian, (ListType, TupleType)), ) +medianscore = Dispatch ( (lmedianscore, (ListType, TupleType)), ) +mode = Dispatch ( (lmode, (ListType, TupleType)), ) + +## MOMENTS: +moment = Dispatch ( (lmoment, (ListType, TupleType)), ) +variation = Dispatch ( (lvariation, (ListType, TupleType)), ) +skew = Dispatch ( (lskew, (ListType, TupleType)), ) +kurtosis = Dispatch ( (lkurtosis, (ListType, TupleType)), ) +describe = Dispatch ( (ldescribe, (ListType, TupleType)), ) + +## FREQUENCY STATISTICS: +itemfreq = Dispatch ( (litemfreq, (ListType, TupleType)), ) +scoreatpercentile = Dispatch ( (lscoreatpercentile, (ListType, TupleType)), ) +percentileofscore = Dispatch ( (lpercentileofscore, (ListType, TupleType)), ) +histogram = Dispatch ( (lhistogram, (ListType, TupleType)), ) +cumfreq = Dispatch ( (lcumfreq, (ListType, TupleType)), ) +relfreq = Dispatch ( (lrelfreq, (ListType, TupleType)), ) + +## VARIABILITY: +obrientransform = Dispatch ( (lobrientransform, (ListType, TupleType)), ) +samplevar = Dispatch ( (lsamplevar, (ListType, TupleType)), ) +samplestdev = Dispatch ( (lsamplestdev, (ListType, TupleType)), ) +var = Dispatch ( (lvar, (ListType, TupleType)), ) +stdev = Dispatch ( (lstdev, (ListType, TupleType)), ) +sterr = Dispatch ( (lsterr, (ListType, TupleType)), ) +sem = Dispatch ( (lsem, (ListType, TupleType)), ) +z = Dispatch ( (lz, (ListType, TupleType)), ) +zs = Dispatch ( (lzs, (ListType, TupleType)), ) + +## TRIMMING FCNS: +trimboth = Dispatch ( (ltrimboth, (ListType, TupleType)), ) +trim1 = Dispatch ( (ltrim1, (ListType, TupleType)), ) + +## CORRELATION FCNS: +paired = Dispatch ( (lpaired, (ListType, TupleType)), ) +pearsonr = Dispatch ( (lpearsonr, (ListType, TupleType)), ) +spearmanr = Dispatch ( (lspearmanr, (ListType, TupleType)), ) +pointbiserialr = Dispatch ( (lpointbiserialr, (ListType, TupleType)), ) +kendalltau = Dispatch ( (lkendalltau, (ListType, TupleType)), ) +linregress = Dispatch ( (llinregress, (ListType, TupleType)), ) + +## INFERENTIAL STATS: +ttest_1samp = Dispatch ( (lttest_1samp, (ListType, TupleType)), ) +ttest_ind = Dispatch ( (lttest_ind, (ListType, TupleType)), ) +ttest_rel = Dispatch ( (lttest_rel, (ListType, TupleType)), ) +chisquare = Dispatch ( (lchisquare, (ListType, TupleType)), ) +ks_2samp = Dispatch ( (lks_2samp, (ListType, TupleType)), ) +mannwhitneyu = Dispatch ( (lmannwhitneyu, (ListType, TupleType)), ) +ranksums = Dispatch ( (lranksums, (ListType, TupleType)), ) +tiecorrect = Dispatch ( (ltiecorrect, (ListType, TupleType)), ) +wilcoxont = Dispatch ( (lwilcoxont, (ListType, TupleType)), ) +kruskalwallish = Dispatch ( (lkruskalwallish, (ListType, TupleType)), ) +friedmanchisquare = Dispatch ( (lfriedmanchisquare, (ListType, TupleType)), ) + +## PROBABILITY CALCS: +chisqprob = Dispatch ( (lchisqprob, (IntType, FloatType)), ) +zprob = Dispatch ( (lzprob, (IntType, FloatType)), ) +ksprob = Dispatch ( (lksprob, (IntType, FloatType)), ) +fprob = Dispatch ( (lfprob, (IntType, FloatType)), ) +betacf = Dispatch ( (lbetacf, (IntType, FloatType)), ) +betai = Dispatch ( (lbetai, (IntType, FloatType)), ) +erfcc = Dispatch ( (lerfcc, (IntType, FloatType)), ) +gammln = Dispatch ( (lgammln, (IntType, FloatType)), ) + +## ANOVA FUNCTIONS: +F_oneway = Dispatch ( (lF_oneway, (ListType, TupleType)), ) +F_value = Dispatch ( (lF_value, (ListType, TupleType)), ) + +## SUPPORT FUNCTIONS: +incr = Dispatch ( (lincr, (ListType, TupleType)), ) +sum = Dispatch ( (lsum, (ListType, TupleType)), ) +cumsum = Dispatch ( (lcumsum, (ListType, TupleType)), ) +ss = Dispatch ( (lss, (ListType, TupleType)), ) +summult = Dispatch ( (lsummult, (ListType, TupleType)), ) +square_of_sums = Dispatch ( (lsquare_of_sums, (ListType, TupleType)), ) +sumdiffsquared = Dispatch ( (lsumdiffsquared, (ListType, TupleType)), ) +shellsort = Dispatch ( (lshellsort, (ListType, TupleType)), ) +rankdata = Dispatch ( (lrankdata, (ListType, TupleType)), ) +findwithin = Dispatch ( (lfindwithin, (ListType, TupleType)), ) + + +#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== +#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== +#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== +#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== +#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== +#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== +#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== +#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== +#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== +#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== +#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== +#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== +#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== +#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== +#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== +#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== +#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== +#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== +#============= THE ARRAY-VERSION OF THE STATS FUNCTIONS =============== + +try: # DEFINE THESE *ONLY* IF NUMERIC IS AVAILABLE + import numpy as N + import numpy.linalg as LA + + +##################################### +######## ACENTRAL TENDENCY ######## +##################################### + + def ageometricmean (inarray,dimension=None,keepdims=0): + """ +Calculates the geometric mean of the values in the passed array. +That is: n-th root of (x1 * x2 * ... * xn). Defaults to ALL values in +the passed array. Use dimension=None to flatten array first. REMEMBER: if +dimension=0, it collapses over dimension 0 ('rows' in a 2D array) only, and +if dimension is a sequence, it collapses over all specified dimensions. If +keepdims is set to 1, the resulting array will have as many dimensions as +inarray, with only 1 'level' per dim that was collapsed over. + +Usage: ageometricmean(inarray,dimension=None,keepdims=0) +Returns: geometric mean computed over dim(s) listed in dimension +""" + inarray = N.array(inarray,N.float_) + if dimension == None: + inarray = N.ravel(inarray) + size = len(inarray) + mult = N.power(inarray,1.0/size) + mult = N.multiply.reduce(mult) + elif type(dimension) in [IntType,FloatType]: + size = inarray.shape[dimension] + mult = N.power(inarray,1.0/size) + mult = N.multiply.reduce(mult,dimension) + if keepdims == 1: + shp = list(inarray.shape) + shp[dimension] = 1 + sum = N.reshape(sum,shp) + else: # must be a SEQUENCE of dims to average over + dims = list(dimension) + dims.sort() + dims.reverse() + size = N.array(N.multiply.reduce(N.take(inarray.shape,dims)),N.float_) + mult = N.power(inarray,1.0/size) + for dim in dims: + mult = N.multiply.reduce(mult,dim) + if keepdims == 1: + shp = list(inarray.shape) + for dim in dims: + shp[dim] = 1 + mult = N.reshape(mult,shp) + return mult + + + def aharmonicmean (inarray,dimension=None,keepdims=0): + """ +Calculates the harmonic mean of the values in the passed array. +That is: n / (1/x1 + 1/x2 + ... + 1/xn). Defaults to ALL values in +the passed array. Use dimension=None to flatten array first. REMEMBER: if +dimension=0, it collapses over dimension 0 ('rows' in a 2D array) only, and +if dimension is a sequence, it collapses over all specified dimensions. If +keepdims is set to 1, the resulting array will have as many dimensions as +inarray, with only 1 'level' per dim that was collapsed over. + +Usage: aharmonicmean(inarray,dimension=None,keepdims=0) +Returns: harmonic mean computed over dim(s) in dimension +""" + inarray = inarray.astype(N.float_) + if dimension == None: + inarray = N.ravel(inarray) + size = len(inarray) + s = N.add.reduce(1.0 / inarray) + elif type(dimension) in [IntType,FloatType]: + size = float(inarray.shape[dimension]) + s = N.add.reduce(1.0/inarray, dimension) + if keepdims == 1: + shp = list(inarray.shape) + shp[dimension] = 1 + s = N.reshape(s,shp) + else: # must be a SEQUENCE of dims to average over + dims = list(dimension) + dims.sort() + nondims = [] + for i in range(len(inarray.shape)): + if i not in dims: + nondims.append(i) + tinarray = N.transpose(inarray,nondims+dims) # put keep-dims first + idx = [0] *len(nondims) + if idx == []: + size = len(N.ravel(inarray)) + s = asum(1.0 / inarray) + if keepdims == 1: + s = N.reshape([s],N.ones(len(inarray.shape))) + else: + idx[0] = -1 + loopcap = N.array(tinarray.shape[0:len(nondims)]) -1 + s = N.zeros(loopcap+1,N.float_) + while incr(idx,loopcap) <> -1: + s[idx] = asum(1.0/tinarray[idx]) + size = N.multiply.reduce(N.take(inarray.shape,dims)) + if keepdims == 1: + shp = list(inarray.shape) + for dim in dims: + shp[dim] = 1 + s = N.reshape(s,shp) + return size / s + + + def amean (inarray,dimension=None,keepdims=0): + """ +Calculates the arithmatic mean of the values in the passed array. +That is: 1/n * (x1 + x2 + ... + xn). Defaults to ALL values in the +passed array. Use dimension=None to flatten array first. REMEMBER: if +dimension=0, it collapses over dimension 0 ('rows' in a 2D array) only, and +if dimension is a sequence, it collapses over all specified dimensions. If +keepdims is set to 1, the resulting array will have as many dimensions as +inarray, with only 1 'level' per dim that was collapsed over. + +Usage: amean(inarray,dimension=None,keepdims=0) +Returns: arithematic mean calculated over dim(s) in dimension +""" + if inarray.dtype in [N.int_, N.short,N.ubyte]: + inarray = inarray.astype(N.float_) + if dimension == None: + inarray = N.ravel(inarray) + sum = N.add.reduce(inarray) + denom = float(len(inarray)) + elif type(dimension) in [IntType,FloatType]: + sum = asum(inarray,dimension) + denom = float(inarray.shape[dimension]) + if keepdims == 1: + shp = list(inarray.shape) + shp[dimension] = 1 + sum = N.reshape(sum,shp) + else: # must be a TUPLE of dims to average over + dims = list(dimension) + dims.sort() + dims.reverse() + sum = inarray *1.0 + for dim in dims: + sum = N.add.reduce(sum,dim) + denom = N.array(N.multiply.reduce(N.take(inarray.shape,dims)),N.float_) + if keepdims == 1: + shp = list(inarray.shape) + for dim in dims: + shp[dim] = 1 + sum = N.reshape(sum,shp) + return sum/denom + + + def amedian (inarray,numbins=1000): + """ +Calculates the COMPUTED median value of an array of numbers, given the +number of bins to use for the histogram (more bins approaches finding the +precise median value of the array; default number of bins = 1000). From +G.W. Heiman's Basic Stats, or CRC Probability & Statistics. +NOTE: THIS ROUTINE ALWAYS uses the entire passed array (flattens it first). + +Usage: amedian(inarray,numbins=1000) +Returns: median calculated over ALL values in inarray +""" + inarray = N.ravel(inarray) + (hist, smallest, binsize, extras) = ahistogram(inarray,numbins,[min(inarray),max(inarray)]) + cumhist = N.cumsum(hist) # make cumulative histogram + otherbins = N.greater_equal(cumhist,len(inarray)/2.0) + otherbins = list(otherbins) # list of 0/1s, 1s start at median bin + cfbin = otherbins.index(1) # get 1st(!) index holding 50%ile score + LRL = smallest + binsize*cfbin # get lower read limit of that bin + cfbelow = N.add.reduce(hist[0:cfbin]) # cum. freq. below bin + freq = hist[cfbin] # frequency IN the 50%ile bin + median = LRL + ((len(inarray)/2.0-cfbelow)/float(freq))*binsize # MEDIAN + return median + + + def amedianscore (inarray,dimension=None): + """ +Returns the 'middle' score of the passed array. If there is an even +number of scores, the mean of the 2 middle scores is returned. Can function +with 1D arrays, or on the FIRST dimension of 2D arrays (i.e., dimension can +be None, to pre-flatten the array, or else dimension must equal 0). + +Usage: amedianscore(inarray,dimension=None) +Returns: 'middle' score of the array, or the mean of the 2 middle scores +""" + if dimension == None: + inarray = N.ravel(inarray) + dimension = 0 + inarray = N.sort(inarray,dimension) + if inarray.shape[dimension] % 2 == 0: # if even number of elements + indx = inarray.shape[dimension]/2 # integer division correct + median = N.asarray(inarray[indx]+inarray[indx-1]) / 2.0 + else: + indx = inarray.shape[dimension] / 2 # integer division correct + median = N.take(inarray,[indx],dimension) + if median.shape == (1,): + median = median[0] + return median + + + def amode(a, dimension=None): + """ +Returns an array of the modal (most common) score in the passed array. +If there is more than one such score, ONLY THE FIRST is returned. +The bin-count for the modal values is also returned. Operates on whole +array (dimension=None), or on a given dimension. + +Usage: amode(a, dimension=None) +Returns: array of bin-counts for mode(s), array of corresponding modal values +""" + + if dimension == None: + a = N.ravel(a) + dimension = 0 + scores = pstat.aunique(N.ravel(a)) # get ALL unique values + testshape = list(a.shape) + testshape[dimension] = 1 + oldmostfreq = N.zeros(testshape) + oldcounts = N.zeros(testshape) + for score in scores: + template = N.equal(a,score) + counts = asum(template,dimension,1) + mostfrequent = N.where(counts>oldcounts,score,oldmostfreq) + oldcounts = N.where(counts>oldcounts,counts,oldcounts) + oldmostfreq = mostfrequent + return oldcounts, mostfrequent + + + def atmean(a,limits=None,inclusive=(1,1)): + """ +Returns the arithmetic mean of all values in an array, ignoring values +strictly outside the sequence passed to 'limits'. Note: either limit +in the sequence, or the value of limits itself, can be set to None. The +inclusive list/tuple determines whether the lower and upper limiting bounds +(respectively) are open/exclusive (0) or closed/inclusive (1). + +Usage: atmean(a,limits=None,inclusive=(1,1)) +""" + if a.dtype in [N.int_, N.short,N.ubyte]: + a = a.astype(N.float_) + if limits == None: + return mean(a) + assert type(limits) in [ListType,TupleType,N.ndarray], "Wrong type for limits in atmean" + if inclusive[0]: lowerfcn = N.greater_equal + else: lowerfcn = N.greater + if inclusive[1]: upperfcn = N.less_equal + else: upperfcn = N.less + if limits[0] > N.maximum.reduce(N.ravel(a)) or limits[1] < N.minimum.reduce(N.ravel(a)): + raise ValueError, "No array values within given limits (atmean)." + elif limits[0]==None and limits[1]<>None: + mask = upperfcn(a,limits[1]) + elif limits[0]<>None and limits[1]==None: + mask = lowerfcn(a,limits[0]) + elif limits[0]<>None and limits[1]<>None: + mask = lowerfcn(a,limits[0])*upperfcn(a,limits[1]) + s = float(N.add.reduce(N.ravel(a*mask))) + n = float(N.add.reduce(N.ravel(mask))) + return s/n + + + def atvar(a,limits=None,inclusive=(1,1)): + """ +Returns the sample variance of values in an array, (i.e., using N-1), +ignoring values strictly outside the sequence passed to 'limits'. +Note: either limit in the sequence, or the value of limits itself, +can be set to None. The inclusive list/tuple determines whether the lower +and upper limiting bounds (respectively) are open/exclusive (0) or +closed/inclusive (1). ASSUMES A FLAT ARRAY (OR ELSE PREFLATTENS). + +Usage: atvar(a,limits=None,inclusive=(1,1)) +""" + a = a.astype(N.float_) + if limits == None or limits == [None,None]: + return avar(a) + assert type(limits) in [ListType,TupleType,N.ndarray], "Wrong type for limits in atvar" + if inclusive[0]: lowerfcn = N.greater_equal + else: lowerfcn = N.greater + if inclusive[1]: upperfcn = N.less_equal + else: upperfcn = N.less + if limits[0] > N.maximum.reduce(N.ravel(a)) or limits[1] < N.minimum.reduce(N.ravel(a)): + raise ValueError, "No array values within given limits (atvar)." + elif limits[0]==None and limits[1]<>None: + mask = upperfcn(a,limits[1]) + elif limits[0]<>None and limits[1]==None: + mask = lowerfcn(a,limits[0]) + elif limits[0]<>None and limits[1]<>None: + mask = lowerfcn(a,limits[0])*upperfcn(a,limits[1]) + + a = N.compress(mask,a) # squish out excluded values + return avar(a) + + + def atmin(a,lowerlimit=None,dimension=None,inclusive=1): + """ +Returns the minimum value of a, along dimension, including only values less +than (or equal to, if inclusive=1) lowerlimit. If the limit is set to None, +all values in the array are used. + +Usage: atmin(a,lowerlimit=None,dimension=None,inclusive=1) +""" + if inclusive: lowerfcn = N.greater + else: lowerfcn = N.greater_equal + if dimension == None: + a = N.ravel(a) + dimension = 0 + if lowerlimit == None: + lowerlimit = N.minimum.reduce(N.ravel(a))-11 + biggest = N.maximum.reduce(N.ravel(a)) + ta = N.where(lowerfcn(a,lowerlimit),a,biggest) + return N.minimum.reduce(ta,dimension) + + + def atmax(a,upperlimit,dimension=None,inclusive=1): + """ +Returns the maximum value of a, along dimension, including only values greater +than (or equal to, if inclusive=1) upperlimit. If the limit is set to None, +a limit larger than the max value in the array is used. + +Usage: atmax(a,upperlimit,dimension=None,inclusive=1) +""" + if inclusive: upperfcn = N.less + else: upperfcn = N.less_equal + if dimension == None: + a = N.ravel(a) + dimension = 0 + if upperlimit == None: + upperlimit = N.maximum.reduce(N.ravel(a))+1 + smallest = N.minimum.reduce(N.ravel(a)) + ta = N.where(upperfcn(a,upperlimit),a,smallest) + return N.maximum.reduce(ta,dimension) + + + def atstdev(a,limits=None,inclusive=(1,1)): + """ +Returns the standard deviation of all values in an array, ignoring values +strictly outside the sequence passed to 'limits'. Note: either limit +in the sequence, or the value of limits itself, can be set to None. The +inclusive list/tuple determines whether the lower and upper limiting bounds +(respectively) are open/exclusive (0) or closed/inclusive (1). + +Usage: atstdev(a,limits=None,inclusive=(1,1)) +""" + return N.sqrt(tvar(a,limits,inclusive)) + + + def atsem(a,limits=None,inclusive=(1,1)): + """ +Returns the standard error of the mean for the values in an array, +(i.e., using N for the denominator), ignoring values strictly outside +the sequence passed to 'limits'. Note: either limit in the sequence, +or the value of limits itself, can be set to None. The inclusive list/tuple +determines whether the lower and upper limiting bounds (respectively) are +open/exclusive (0) or closed/inclusive (1). + +Usage: atsem(a,limits=None,inclusive=(1,1)) +""" + sd = tstdev(a,limits,inclusive) + if limits == None or limits == [None,None]: + n = float(len(N.ravel(a))) + limits = [min(a)-1, max(a)+1] + assert type(limits) in [ListType,TupleType,N.ndarray], "Wrong type for limits in atsem" + if inclusive[0]: lowerfcn = N.greater_equal + else: lowerfcn = N.greater + if inclusive[1]: upperfcn = N.less_equal + else: upperfcn = N.less + if limits[0] > N.maximum.reduce(N.ravel(a)) or limits[1] < N.minimum.reduce(N.ravel(a)): + raise ValueError, "No array values within given limits (atsem)." + elif limits[0]==None and limits[1]<>None: + mask = upperfcn(a,limits[1]) + elif limits[0]<>None and limits[1]==None: + mask = lowerfcn(a,limits[0]) + elif limits[0]<>None and limits[1]<>None: + mask = lowerfcn(a,limits[0])*upperfcn(a,limits[1]) + term1 = N.add.reduce(N.ravel(a*a*mask)) + n = float(N.add.reduce(N.ravel(mask))) + return sd/math.sqrt(n) + + +##################################### +############ AMOMENTS ############# +##################################### + + def amoment(a,moment=1,dimension=None): + """ +Calculates the nth moment about the mean for a sample (defaults to the +1st moment). Generally used to calculate coefficients of skewness and +kurtosis. Dimension can equal None (ravel array first), an integer +(the dimension over which to operate), or a sequence (operate over +multiple dimensions). + +Usage: amoment(a,moment=1,dimension=None) +Returns: appropriate moment along given dimension +""" + if dimension == None: + a = N.ravel(a) + dimension = 0 + if moment == 1: + return 0.0 + else: + mn = amean(a,dimension,1) # 1=keepdims + s = N.power((a-mn),moment) + return amean(s,dimension) + + + def avariation(a,dimension=None): + """ +Returns the coefficient of variation, as defined in CRC Standard +Probability and Statistics, p.6. Dimension can equal None (ravel array +first), an integer (the dimension over which to operate), or a +sequence (operate over multiple dimensions). + +Usage: avariation(a,dimension=None) +""" + return 100.0*asamplestdev(a,dimension)/amean(a,dimension) + + + def askew(a,dimension=None): + """ +Returns the skewness of a distribution (normal ==> 0.0; >0 means extra +weight in left tail). Use askewtest() to see if it's close enough. +Dimension can equal None (ravel array first), an integer (the +dimension over which to operate), or a sequence (operate over multiple +dimensions). + +Usage: askew(a, dimension=None) +Returns: skew of vals in a along dimension, returning ZERO where all vals equal +""" + denom = N.power(amoment(a,2,dimension),1.5) + zero = N.equal(denom,0) + if type(denom) == N.ndarray and asum(zero) <> 0: + print "Number of zeros in askew: ",asum(zero) + denom = denom + zero # prevent divide-by-zero + return N.where(zero, 0, amoment(a,3,dimension)/denom) + + + def akurtosis(a,dimension=None): + """ +Returns the kurtosis of a distribution (normal ==> 3.0; >3 means +heavier in the tails, and usually more peaked). Use akurtosistest() +to see if it's close enough. Dimension can equal None (ravel array +first), an integer (the dimension over which to operate), or a +sequence (operate over multiple dimensions). + +Usage: akurtosis(a,dimension=None) +Returns: kurtosis of values in a along dimension, and ZERO where all vals equal +""" + denom = N.power(amoment(a,2,dimension),2) + zero = N.equal(denom,0) + if type(denom) == N.ndarray and asum(zero) <> 0: + print "Number of zeros in akurtosis: ",asum(zero) + denom = denom + zero # prevent divide-by-zero + return N.where(zero,0,amoment(a,4,dimension)/denom) + + + def adescribe(inarray,dimension=None): + """ +Returns several descriptive statistics of the passed array. Dimension +can equal None (ravel array first), an integer (the dimension over +which to operate), or a sequence (operate over multiple dimensions). + +Usage: adescribe(inarray,dimension=None) +Returns: n, (min,max), mean, standard deviation, skew, kurtosis +""" + if dimension == None: + inarray = N.ravel(inarray) + dimension = 0 + n = inarray.shape[dimension] + mm = (N.minimum.reduce(inarray),N.maximum.reduce(inarray)) + m = amean(inarray,dimension) + sd = astdev(inarray,dimension) + skew = askew(inarray,dimension) + kurt = akurtosis(inarray,dimension) + return n, mm, m, sd, skew, kurt + + +##################################### +######## NORMALITY TESTS ########## +##################################### + + def askewtest(a,dimension=None): + """ +Tests whether the skew is significantly different from a normal +distribution. Dimension can equal None (ravel array first), an +integer (the dimension over which to operate), or a sequence (operate +over multiple dimensions). + +Usage: askewtest(a,dimension=None) +Returns: z-score and 2-tail z-probability +""" + if dimension == None: + a = N.ravel(a) + dimension = 0 + b2 = askew(a,dimension) + n = float(a.shape[dimension]) + y = b2 * N.sqrt(((n+1)*(n+3)) / (6.0*(n-2)) ) + beta2 = ( 3.0*(n*n+27*n-70)*(n+1)*(n+3) ) / ( (n-2.0)*(n+5)*(n+7)*(n+9) ) + W2 = -1 + N.sqrt(2*(beta2-1)) + delta = 1/N.sqrt(N.log(N.sqrt(W2))) + alpha = N.sqrt(2/(W2-1)) + y = N.where(y==0,1,y) + Z = delta*N.log(y/alpha + N.sqrt((y/alpha)**2+1)) + return Z, (1.0-zprob(Z))*2 + + + def akurtosistest(a,dimension=None): + """ +Tests whether a dataset has normal kurtosis (i.e., +kurtosis=3(n-1)/(n+1)) Valid only for n>20. Dimension can equal None +(ravel array first), an integer (the dimension over which to operate), +or a sequence (operate over multiple dimensions). + +Usage: akurtosistest(a,dimension=None) +Returns: z-score and 2-tail z-probability, returns 0 for bad pixels +""" + if dimension == None: + a = N.ravel(a) + dimension = 0 + n = float(a.shape[dimension]) + if n<20: + print "akurtosistest only valid for n>=20 ... continuing anyway, n=",n + b2 = akurtosis(a,dimension) + E = 3.0*(n-1) /(n+1) + varb2 = 24.0*n*(n-2)*(n-3) / ((n+1)*(n+1)*(n+3)*(n+5)) + x = (b2-E)/N.sqrt(varb2) + sqrtbeta1 = 6.0*(n*n-5*n+2)/((n+7)*(n+9)) * N.sqrt((6.0*(n+3)*(n+5))/ + (n*(n-2)*(n-3))) + A = 6.0 + 8.0/sqrtbeta1 *(2.0/sqrtbeta1 + N.sqrt(1+4.0/(sqrtbeta1**2))) + term1 = 1 -2/(9.0*A) + denom = 1 +x*N.sqrt(2/(A-4.0)) + denom = N.where(N.less(denom,0), 99, denom) + term2 = N.where(N.equal(denom,0), term1, N.power((1-2.0/A)/denom,1/3.0)) + Z = ( term1 - term2 ) / N.sqrt(2/(9.0*A)) + Z = N.where(N.equal(denom,99), 0, Z) + return Z, (1.0-zprob(Z))*2 + + + def anormaltest(a,dimension=None): + """ +Tests whether skew and/OR kurtosis of dataset differs from normal +curve. Can operate over multiple dimensions. Dimension can equal +None (ravel array first), an integer (the dimension over which to +operate), or a sequence (operate over multiple dimensions). + +Usage: anormaltest(a,dimension=None) +Returns: z-score and 2-tail probability +""" + if dimension == None: + a = N.ravel(a) + dimension = 0 + s,p = askewtest(a,dimension) + k,p = akurtosistest(a,dimension) + k2 = N.power(s,2) + N.power(k,2) + return k2, achisqprob(k2,2) + + +##################################### +###### AFREQUENCY FUNCTIONS ####### +##################################### + + def aitemfreq(a): + """ +Returns a 2D array of item frequencies. Column 1 contains item values, +column 2 contains their respective counts. Assumes a 1D array is passed. +@@@sorting OK? + +Usage: aitemfreq(a) +Returns: a 2D frequency table (col [0:n-1]=scores, col n=frequencies) +""" + scores = pstat.aunique(a) + scores = N.sort(scores) + freq = N.zeros(len(scores)) + for i in range(len(scores)): + freq[i] = N.add.reduce(N.equal(a,scores[i])) + return N.array(pstat.aabut(scores, freq)) + + + def ascoreatpercentile (inarray, percent): + """ +Usage: ascoreatpercentile(inarray,percent) 0<percent<100 +Returns: score at given percentile, relative to inarray distribution +""" + percent = percent / 100.0 + targetcf = percent*len(inarray) + h, lrl, binsize, extras = histogram(inarray) + cumhist = cumsum(h*1) + for i in range(len(cumhist)): + if cumhist[i] >= targetcf: + break + score = binsize * ((targetcf - cumhist[i-1]) / float(h[i])) + (lrl+binsize*i) + return score + + + def apercentileofscore (inarray,score,histbins=10,defaultlimits=None): + """ +Note: result of this function depends on the values used to histogram +the data(!). + +Usage: apercentileofscore(inarray,score,histbins=10,defaultlimits=None) +Returns: percentile-position of score (0-100) relative to inarray +""" + h, lrl, binsize, extras = histogram(inarray,histbins,defaultlimits) + cumhist = cumsum(h*1) + i = int((score - lrl)/float(binsize)) + pct = (cumhist[i-1]+((score-(lrl+binsize*i))/float(binsize))*h[i])/float(len(inarray)) * 100 + return pct + + + def ahistogram (inarray,numbins=10,defaultlimits=None,printextras=1): + """ +Returns (i) an array of histogram bin counts, (ii) the smallest value +of the histogram binning, and (iii) the bin width (the last 2 are not +necessarily integers). Default number of bins is 10. Defaultlimits +can be None (the routine picks bins spanning all the numbers in the +inarray) or a 2-sequence (lowerlimit, upperlimit). Returns all of the +following: array of bin values, lowerreallimit, binsize, extrapoints. + +Usage: ahistogram(inarray,numbins=10,defaultlimits=None,printextras=1) +Returns: (array of bin counts, bin-minimum, min-width, #-points-outside-range) +""" + inarray = N.ravel(inarray) # flatten any >1D arrays + if (defaultlimits <> None): + lowerreallimit = defaultlimits[0] + upperreallimit = defaultlimits[1] + binsize = (upperreallimit-lowerreallimit) / float(numbins) + else: + Min = N.minimum.reduce(inarray) + Max = N.maximum.reduce(inarray) + estbinwidth = float(Max - Min)/float(numbins) + 1e-6 + binsize = (Max-Min+estbinwidth)/float(numbins) + lowerreallimit = Min - binsize/2.0 #lower real limit,1st bin + bins = N.zeros(numbins) + extrapoints = 0 + for num in inarray: + try: + if (num-lowerreallimit) < 0: + extrapoints = extrapoints + 1 + else: + bintoincrement = int((num-lowerreallimit) / float(binsize)) + bins[bintoincrement] = bins[bintoincrement] + 1 + except: # point outside lower/upper limits + extrapoints = extrapoints + 1 + if (extrapoints > 0 and printextras == 1): + print '\nPoints outside given histogram range =',extrapoints + return (bins, lowerreallimit, binsize, extrapoints) + + + def acumfreq(a,numbins=10,defaultreallimits=None): + """ +Returns a cumulative frequency histogram, using the histogram function. +Defaultreallimits can be None (use all data), or a 2-sequence containing +lower and upper limits on values to include. + +Usage: acumfreq(a,numbins=10,defaultreallimits=None) +Returns: array of cumfreq bin values, lowerreallimit, binsize, extrapoints +""" + h,l,b,e = histogram(a,numbins,defaultreallimits) + cumhist = cumsum(h*1) + return cumhist,l,b,e + + + def arelfreq(a,numbins=10,defaultreallimits=None): + """ +Returns a relative frequency histogram, using the histogram function. +Defaultreallimits can be None (use all data), or a 2-sequence containing +lower and upper limits on values to include. + +Usage: arelfreq(a,numbins=10,defaultreallimits=None) +Returns: array of cumfreq bin values, lowerreallimit, binsize, extrapoints +""" + h,l,b,e = histogram(a,numbins,defaultreallimits) + h = N.array(h/float(a.shape[0])) + return h,l,b,e + + +##################################### +###### AVARIABILITY FUNCTIONS ##### +##################################### + + def aobrientransform(*args): + """ +Computes a transform on input data (any number of columns). Used to +test for homogeneity of variance prior to running one-way stats. Each +array in *args is one level of a factor. If an F_oneway() run on the +transformed data and found significant, variances are unequal. From +Maxwell and Delaney, p.112. + +Usage: aobrientransform(*args) *args = 1D arrays, one per level of factor +Returns: transformed data for use in an ANOVA +""" + TINY = 1e-10 + k = len(args) + n = N.zeros(k,N.float_) + v = N.zeros(k,N.float_) + m = N.zeros(k,N.float_) + nargs = [] + for i in range(k): + nargs.append(args[i].astype(N.float_)) + n[i] = float(len(nargs[i])) + v[i] = var(nargs[i]) + m[i] = mean(nargs[i]) + for j in range(k): + for i in range(n[j]): + t1 = (n[j]-1.5)*n[j]*(nargs[j][i]-m[j])**2 + t2 = 0.5*v[j]*(n[j]-1.0) + t3 = (n[j]-1.0)*(n[j]-2.0) + nargs[j][i] = (t1-t2) / float(t3) + check = 1 + for j in range(k): + if v[j] - mean(nargs[j]) > TINY: + check = 0 + if check <> 1: + raise ValueError, 'Lack of convergence in obrientransform.' + else: + return N.array(nargs) + + + def asamplevar (inarray,dimension=None,keepdims=0): + """ +Returns the sample standard deviation of the values in the passed +array (i.e., using N). Dimension can equal None (ravel array first), +an integer (the dimension over which to operate), or a sequence +(operate over multiple dimensions). Set keepdims=1 to return an array +with the same number of dimensions as inarray. + +Usage: asamplevar(inarray,dimension=None,keepdims=0) +""" + if dimension == None: + inarray = N.ravel(inarray) + dimension = 0 + if dimension == 1: + mn = amean(inarray,dimension)[:,N.NewAxis] + else: + mn = amean(inarray,dimension,keepdims=1) + deviations = inarray - mn + if type(dimension) == ListType: + n = 1 + for d in dimension: + n = n*inarray.shape[d] + else: + n = inarray.shape[dimension] + svar = ass(deviations,dimension,keepdims) / float(n) + return svar + + + def asamplestdev (inarray, dimension=None, keepdims=0): + """ +Returns the sample standard deviation of the values in the passed +array (i.e., using N). Dimension can equal None (ravel array first), +an integer (the dimension over which to operate), or a sequence +(operate over multiple dimensions). Set keepdims=1 to return an array +with the same number of dimensions as inarray. + +Usage: asamplestdev(inarray,dimension=None,keepdims=0) +""" + return N.sqrt(asamplevar(inarray,dimension,keepdims)) + + + def asignaltonoise(instack,dimension=0): + """ +Calculates signal-to-noise. Dimension can equal None (ravel array +first), an integer (the dimension over which to operate), or a +sequence (operate over multiple dimensions). + +Usage: asignaltonoise(instack,dimension=0): +Returns: array containing the value of (mean/stdev) along dimension, + or 0 when stdev=0 +""" + m = mean(instack,dimension) + sd = stdev(instack,dimension) + return N.where(sd==0,0,m/sd) + + + def acov (x,y, dimension=None,keepdims=0): + """ +Returns the estimated covariance of the values in the passed +array (i.e., N-1). Dimension can equal None (ravel array first), an +integer (the dimension over which to operate), or a sequence (operate +over multiple dimensions). Set keepdims=1 to return an array with the +same number of dimensions as inarray. + +Usage: acov(x,y,dimension=None,keepdims=0) +""" + if dimension == None: + x = N.ravel(x) + y = N.ravel(y) + dimension = 0 + xmn = amean(x,dimension,1) # keepdims + xdeviations = x - xmn + ymn = amean(y,dimension,1) # keepdims + ydeviations = y - ymn + if type(dimension) == ListType: + n = 1 + for d in dimension: + n = n*x.shape[d] + else: + n = x.shape[dimension] + covar = N.sum(xdeviations*ydeviations)/float(n-1) + return covar + + + def avar (inarray, dimension=None,keepdims=0): + """ +Returns the estimated population variance of the values in the passed +array (i.e., N-1). Dimension can equal None (ravel array first), an +integer (the dimension over which to operate), or a sequence (operate +over multiple dimensions). Set keepdims=1 to return an array with the +same number of dimensions as inarray. + +Usage: avar(inarray,dimension=None,keepdims=0) +""" + if dimension == None: + inarray = N.ravel(inarray) + dimension = 0 + mn = amean(inarray,dimension,1) + deviations = inarray - mn + if type(dimension) == ListType: + n = 1 + for d in dimension: + n = n*inarray.shape[d] + else: + n = inarray.shape[dimension] + var = ass(deviations,dimension,keepdims)/float(n-1) + return var + + + def astdev (inarray, dimension=None, keepdims=0): + """ +Returns the estimated population standard deviation of the values in +the passed array (i.e., N-1). Dimension can equal None (ravel array +first), an integer (the dimension over which to operate), or a +sequence (operate over multiple dimensions). Set keepdims=1 to return +an array with the same number of dimensions as inarray. + +Usage: astdev(inarray,dimension=None,keepdims=0) +""" + return N.sqrt(avar(inarray,dimension,keepdims)) + + + def asterr (inarray, dimension=None, keepdims=0): + """ +Returns the estimated population standard error of the values in the +passed array (i.e., N-1). Dimension can equal None (ravel array +first), an integer (the dimension over which to operate), or a +sequence (operate over multiple dimensions). Set keepdims=1 to return +an array with the same number of dimensions as inarray. + +Usage: asterr(inarray,dimension=None,keepdims=0) +""" + if dimension == None: + inarray = N.ravel(inarray) + dimension = 0 + return astdev(inarray,dimension,keepdims) / float(N.sqrt(inarray.shape[dimension])) + + + def asem (inarray, dimension=None, keepdims=0): + """ +Returns the standard error of the mean (i.e., using N) of the values +in the passed array. Dimension can equal None (ravel array first), an +integer (the dimension over which to operate), or a sequence (operate +over multiple dimensions). Set keepdims=1 to return an array with the +same number of dimensions as inarray. + +Usage: asem(inarray,dimension=None, keepdims=0) +""" + if dimension == None: + inarray = N.ravel(inarray) + dimension = 0 + if type(dimension) == ListType: + n = 1 + for d in dimension: + n = n*inarray.shape[d] + else: + n = inarray.shape[dimension] + s = asamplestdev(inarray,dimension,keepdims) / N.sqrt(n-1) + return s + + + def az (a, score): + """ +Returns the z-score of a given input score, given thearray from which +that score came. Not appropriate for population calculations, nor for +arrays > 1D. + +Usage: az(a, score) +""" + z = (score-amean(a)) / asamplestdev(a) + return z + + + def azs (a): + """ +Returns a 1D array of z-scores, one for each score in the passed array, +computed relative to the passed array. + +Usage: azs(a) +""" + zscores = [] + for item in a: + zscores.append(z(a,item)) + return N.array(zscores) + + + def azmap (scores, compare, dimension=0): + """ +Returns an array of z-scores the shape of scores (e.g., [x,y]), compared to +array passed to compare (e.g., [time,x,y]). Assumes collapsing over dim 0 +of the compare array. + +Usage: azs(scores, compare, dimension=0) +""" + mns = amean(compare,dimension) + sstd = asamplestdev(compare,0) + return (scores - mns) / sstd + + +##################################### +####### ATRIMMING FUNCTIONS ####### +##################################### + +## deleted around() as it's in numpy now + + def athreshold(a,threshmin=None,threshmax=None,newval=0): + """ +Like Numeric.clip() except that values <threshmid or >threshmax are replaced +by newval instead of by threshmin/threshmax (respectively). + +Usage: athreshold(a,threshmin=None,threshmax=None,newval=0) +Returns: a, with values <threshmin or >threshmax replaced with newval +""" + mask = N.zeros(a.shape) + if threshmin <> None: + mask = mask + N.where(a<threshmin,1,0) + if threshmax <> None: + mask = mask + N.where(a>threshmax,1,0) + mask = N.clip(mask,0,1) + return N.where(mask,newval,a) + + + def atrimboth (a,proportiontocut): + """ +Slices off the passed proportion of items from BOTH ends of the passed +array (i.e., with proportiontocut=0.1, slices 'leftmost' 10% AND +'rightmost' 10% of scores. You must pre-sort the array if you want +"proper" trimming. Slices off LESS if proportion results in a +non-integer slice index (i.e., conservatively slices off +proportiontocut). + +Usage: atrimboth (a,proportiontocut) +Returns: trimmed version of array a +""" + lowercut = int(proportiontocut*len(a)) + uppercut = len(a) - lowercut + return a[lowercut:uppercut] + + + def atrim1 (a,proportiontocut,tail='right'): + """ +Slices off the passed proportion of items from ONE end of the passed +array (i.e., if proportiontocut=0.1, slices off 'leftmost' or 'rightmost' +10% of scores). Slices off LESS if proportion results in a non-integer +slice index (i.e., conservatively slices off proportiontocut). + +Usage: atrim1(a,proportiontocut,tail='right') or set tail='left' +Returns: trimmed version of array a +""" + if string.lower(tail) == 'right': + lowercut = 0 + uppercut = len(a) - int(proportiontocut*len(a)) + elif string.lower(tail) == 'left': + lowercut = int(proportiontocut*len(a)) + uppercut = len(a) + return a[lowercut:uppercut] + + +##################################### +##### ACORRELATION FUNCTIONS ###### +##################################### + + def acovariance(X): + """ +Computes the covariance matrix of a matrix X. Requires a 2D matrix input. + +Usage: acovariance(X) +Returns: covariance matrix of X +""" + if len(X.shape) <> 2: + raise TypeError, "acovariance requires 2D matrices" + n = X.shape[0] + mX = amean(X,0) + return N.dot(N.transpose(X),X) / float(n) - N.multiply.outer(mX,mX) + + + def acorrelation(X): + """ +Computes the correlation matrix of a matrix X. Requires a 2D matrix input. + +Usage: acorrelation(X) +Returns: correlation matrix of X +""" + C = acovariance(X) + V = N.diagonal(C) + return C / N.sqrt(N.multiply.outer(V,V)) + + + def apaired(x,y): + """ +Interactively determines the type of data in x and y, and then runs the +appropriated statistic for paired group data. + +Usage: apaired(x,y) x,y = the two arrays of values to be compared +Returns: appropriate statistic name, value, and probability +""" + samples = '' + while samples not in ['i','r','I','R','c','C']: + print '\nIndependent or related samples, or correlation (i,r,c): ', + samples = raw_input() + + if samples in ['i','I','r','R']: + print '\nComparing variances ...', +# USE O'BRIEN'S TEST FOR HOMOGENEITY OF VARIANCE, Maxwell & delaney, p.112 + r = obrientransform(x,y) + f,p = F_oneway(pstat.colex(r,0),pstat.colex(r,1)) + if p<0.05: + vartype='unequal, p='+str(round(p,4)) + else: + vartype='equal' + print vartype + if samples in ['i','I']: + if vartype[0]=='e': + t,p = ttest_ind(x,y,None,0) + print '\nIndependent samples t-test: ', round(t,4),round(p,4) + else: + if len(x)>20 or len(y)>20: + z,p = ranksums(x,y) + print '\nRank Sums test (NONparametric, n>20): ', round(z,4),round(p,4) + else: + u,p = mannwhitneyu(x,y) + print '\nMann-Whitney U-test (NONparametric, ns<20): ', round(u,4),round(p,4) + + else: # RELATED SAMPLES + if vartype[0]=='e': + t,p = ttest_rel(x,y,0) + print '\nRelated samples t-test: ', round(t,4),round(p,4) + else: + t,p = ranksums(x,y) + print '\nWilcoxon T-test (NONparametric): ', round(t,4),round(p,4) + else: # CORRELATION ANALYSIS + corrtype = '' + while corrtype not in ['c','C','r','R','d','D']: + print '\nIs the data Continuous, Ranked, or Dichotomous (c,r,d): ', + corrtype = raw_input() + if corrtype in ['c','C']: + m,b,r,p,see = linregress(x,y) + print '\nLinear regression for continuous variables ...' + lol = [['Slope','Intercept','r','Prob','SEestimate'],[round(m,4),round(b,4),round(r,4),round(p,4),round(see,4)]] + pstat.printcc(lol) + elif corrtype in ['r','R']: + r,p = spearmanr(x,y) + print '\nCorrelation for ranked variables ...' + print "Spearman's r: ",round(r,4),round(p,4) + else: # DICHOTOMOUS + r,p = pointbiserialr(x,y) + print '\nAssuming x contains a dichotomous variable ...' + print 'Point Biserial r: ',round(r,4),round(p,4) + print '\n\n' + return None + + + def dices(x,y): + """ +Calculates Dice's coefficient ... (2*number of common terms)/(number of terms in x + +number of terms in y). Returns a value between 0 (orthogonal) and 1. + +Usage: dices(x,y) +""" + import sets + x = sets.Set(x) + y = sets.Set(y) + common = len(x.intersection(y)) + total = float(len(x) + len(y)) + return 2*common/total + + + def icc(x,y=None,verbose=0): + """ +Calculates intraclass correlation coefficients using simple, Type I sums of squares. +If only one variable is passed, assumed it's an Nx2 matrix + +Usage: icc(x,y=None,verbose=0) +Returns: icc rho, prob ####PROB IS A GUESS BASED ON PEARSON +""" + TINY = 1.0e-20 + if y: + all = N.concatenate([x,y],0) + else: + all = x+0 + x = all[:,0] + y = all[:,1] + totalss = ass(all-mean(all)) + pairmeans = (x+y)/2. + withinss = ass(x-pairmeans) + ass(y-pairmeans) + withindf = float(len(x)) + betwdf = float(len(x)-1) + withinms = withinss / withindf + betweenms = (totalss-withinss) / betwdf + rho = (betweenms-withinms)/(withinms+betweenms) + t = rho*math.sqrt(betwdf/((1.0-rho+TINY)*(1.0+rho+TINY))) + prob = abetai(0.5*betwdf,0.5,betwdf/(betwdf+t*t),verbose) + return rho, prob + + + def alincc(x,y): + """ +Calculates Lin's concordance correlation coefficient. + +Usage: alincc(x,y) where x, y are equal-length arrays +Returns: Lin's CC +""" + x = N.ravel(x) + y = N.ravel(y) + covar = acov(x,y)*(len(x)-1)/float(len(x)) # correct denom to n + xvar = avar(x)*(len(x)-1)/float(len(x)) # correct denom to n + yvar = avar(y)*(len(y)-1)/float(len(y)) # correct denom to n + lincc = (2 * covar) / ((xvar+yvar) +((amean(x)-amean(y))**2)) + return lincc + + + def apearsonr(x,y,verbose=1): + """ +Calculates a Pearson correlation coefficient and returns p. Taken +from Heiman's Basic Statistics for the Behav. Sci (2nd), p.195. + +Usage: apearsonr(x,y,verbose=1) where x,y are equal length arrays +Returns: Pearson's r, two-tailed p-value +""" + TINY = 1.0e-20 + n = len(x) + xmean = amean(x) + ymean = amean(y) + r_num = n*(N.add.reduce(x*y)) - N.add.reduce(x)*N.add.reduce(y) + r_den = math.sqrt((n*ass(x) - asquare_of_sums(x))*(n*ass(y)-asquare_of_sums(y))) + r = (r_num / r_den) + df = n-2 + t = r*math.sqrt(df/((1.0-r+TINY)*(1.0+r+TINY))) + prob = abetai(0.5*df,0.5,df/(df+t*t),verbose) + return r,prob + + + def aspearmanr(x,y): + """ +Calculates a Spearman rank-order correlation coefficient. Taken +from Heiman's Basic Statistics for the Behav. Sci (1st), p.192. + +Usage: aspearmanr(x,y) where x,y are equal-length arrays +Returns: Spearman's r, two-tailed p-value +""" + TINY = 1e-30 + n = len(x) + rankx = rankdata(x) + ranky = rankdata(y) + dsq = N.add.reduce((rankx-ranky)**2) + rs = 1 - 6*dsq / float(n*(n**2-1)) + t = rs * math.sqrt((n-2) / ((rs+1.0)*(1.0-rs))) + df = n-2 + probrs = abetai(0.5*df,0.5,df/(df+t*t)) +# probability values for rs are from part 2 of the spearman function in +# Numerical Recipies, p.510. They close to tables, but not exact.(?) + return rs, probrs + + + def apointbiserialr(x,y): + """ +Calculates a point-biserial correlation coefficient and the associated +probability value. Taken from Heiman's Basic Statistics for the Behav. +Sci (1st), p.194. + +Usage: apointbiserialr(x,y) where x,y are equal length arrays +Returns: Point-biserial r, two-tailed p-value +""" + TINY = 1e-30 + categories = pstat.aunique(x) + data = pstat.aabut(x,y) + if len(categories) <> 2: + raise ValueError, "Exactly 2 categories required (in x) for pointbiserialr()." + else: # there are 2 categories, continue + codemap = pstat.aabut(categories,N.arange(2)) + recoded = pstat.arecode(data,codemap,0) + x = pstat.alinexand(data,0,categories[0]) + y = pstat.alinexand(data,0,categories[1]) + xmean = amean(pstat.acolex(x,1)) + ymean = amean(pstat.acolex(y,1)) + n = len(data) + adjust = math.sqrt((len(x)/float(n))*(len(y)/float(n))) + rpb = (ymean - xmean)/asamplestdev(pstat.acolex(data,1))*adjust + df = n-2 + t = rpb*math.sqrt(df/((1.0-rpb+TINY)*(1.0+rpb+TINY))) + prob = abetai(0.5*df,0.5,df/(df+t*t)) + return rpb, prob + + + def akendalltau(x,y): + """ +Calculates Kendall's tau ... correlation of ordinal data. Adapted +from function kendl1 in Numerical Recipies. Needs good test-cases.@@@ + +Usage: akendalltau(x,y) +Returns: Kendall's tau, two-tailed p-value +""" + n1 = 0 + n2 = 0 + iss = 0 + for j in range(len(x)-1): + for k in range(j,len(y)): + a1 = x[j] - x[k] + a2 = y[j] - y[k] + aa = a1 * a2 + if (aa): # neither array has a tie + n1 = n1 + 1 + n2 = n2 + 1 + if aa > 0: + iss = iss + 1 + else: + iss = iss -1 + else: + if (a1): + n1 = n1 + 1 + else: + n2 = n2 + 1 + tau = iss / math.sqrt(n1*n2) + svar = (4.0*len(x)+10.0) / (9.0*len(x)*(len(x)-1)) + z = tau / math.sqrt(svar) + prob = erfcc(abs(z)/1.4142136) + return tau, prob + + + def alinregress(*args): + """ +Calculates a regression line on two arrays, x and y, corresponding to x,y +pairs. If a single 2D array is passed, alinregress finds dim with 2 levels +and splits data into x,y pairs along that dim. + +Usage: alinregress(*args) args=2 equal-length arrays, or one 2D array +Returns: slope, intercept, r, two-tailed prob, sterr-of-the-estimate, n +""" + TINY = 1.0e-20 + if len(args) == 1: # more than 1D array? + args = args[0] + if len(args) == 2: + x = args[0] + y = args[1] + else: + x = args[:,0] + y = args[:,1] + else: + x = args[0] + y = args[1] + n = len(x) + xmean = amean(x) + ymean = amean(y) + r_num = n*(N.add.reduce(x*y)) - N.add.reduce(x)*N.add.reduce(y) + r_den = math.sqrt((n*ass(x) - asquare_of_sums(x))*(n*ass(y)-asquare_of_sums(y))) + r = r_num / r_den + z = 0.5*math.log((1.0+r+TINY)/(1.0-r+TINY)) + df = n-2 + t = r*math.sqrt(df/((1.0-r+TINY)*(1.0+r+TINY))) + prob = abetai(0.5*df,0.5,df/(df+t*t)) + slope = r_num / (float(n)*ass(x) - asquare_of_sums(x)) + intercept = ymean - slope*xmean + sterrest = math.sqrt(1-r*r)*asamplestdev(y) + return slope, intercept, r, prob, sterrest, n + + def amasslinregress(*args): + """ +Calculates a regression line on one 1D array (x) and one N-D array (y). + +Returns: slope, intercept, r, two-tailed prob, sterr-of-the-estimate, n +""" + TINY = 1.0e-20 + if len(args) == 1: # more than 1D array? + args = args[0] + if len(args) == 2: + x = N.ravel(args[0]) + y = args[1] + else: + x = N.ravel(args[:,0]) + y = args[:,1] + else: + x = args[0] + y = args[1] + x = x.astype(N.float_) + y = y.astype(N.float_) + n = len(x) + xmean = amean(x) + ymean = amean(y,0) + shp = N.ones(len(y.shape)) + shp[0] = len(x) + x.shape = shp + print x.shape, y.shape + r_num = n*(N.add.reduce(x*y,0)) - N.add.reduce(x)*N.add.reduce(y,0) + r_den = N.sqrt((n*ass(x) - asquare_of_sums(x))*(n*ass(y,0)-asquare_of_sums(y,0))) + zerodivproblem = N.equal(r_den,0) + r_den = N.where(zerodivproblem,1,r_den) # avoid zero-division in 1st place + r = r_num / r_den # need to do this nicely for matrix division + r = N.where(zerodivproblem,0.0,r) + z = 0.5*N.log((1.0+r+TINY)/(1.0-r+TINY)) + df = n-2 + t = r*N.sqrt(df/((1.0-r+TINY)*(1.0+r+TINY))) + prob = abetai(0.5*df,0.5,df/(df+t*t)) + + ss = float(n)*ass(x)-asquare_of_sums(x) + s_den = N.where(ss==0,1,ss) # avoid zero-division in 1st place + slope = r_num / s_den + intercept = ymean - slope*xmean + sterrest = N.sqrt(1-r*r)*asamplestdev(y,0) + return slope, intercept, r, prob, sterrest, n + + +##################################### +##### AINFERENTIAL STATISTICS ##### +##################################### + + def attest_1samp(a,popmean,printit=0,name='Sample',writemode='a'): + """ +Calculates the t-obtained for the independent samples T-test on ONE group +of scores a, given a population mean. If printit=1, results are printed +to the screen. If printit='filename', the results are output to 'filename' +using the given writemode (default=append). Returns t-value, and prob. + +Usage: attest_1samp(a,popmean,Name='Sample',printit=0,writemode='a') +Returns: t-value, two-tailed prob +""" + if type(a) != N.ndarray: + a = N.array(a) + x = amean(a) + v = avar(a) + n = len(a) + df = n-1 + svar = ((n-1)*v) / float(df) + t = (x-popmean)/math.sqrt(svar*(1.0/n)) + prob = abetai(0.5*df,0.5,df/(df+t*t)) + + if printit <> 0: + statname = 'Single-sample T-test.' + outputpairedstats(printit,writemode, + 'Population','--',popmean,0,0,0, + name,n,x,v,N.minimum.reduce(N.ravel(a)), + N.maximum.reduce(N.ravel(a)), + statname,t,prob) + return t,prob + + + def attest_ind (a, b, dimension=None, printit=0, name1='Samp1', name2='Samp2',writemode='a'): + """ +Calculates the t-obtained T-test on TWO INDEPENDENT samples of scores +a, and b. From Numerical Recipies, p.483. If printit=1, results are +printed to the screen. If printit='filename', the results are output +to 'filename' using the given writemode (default=append). Dimension +can equal None (ravel array first), or an integer (the dimension over +which to operate on a and b). + +Usage: attest_ind (a,b,dimension=None,printit=0, + Name1='Samp1',Name2='Samp2',writemode='a') +Returns: t-value, two-tailed p-value +""" + if dimension == None: + a = N.ravel(a) + b = N.ravel(b) + dimension = 0 + x1 = amean(a,dimension) + x2 = amean(b,dimension) + v1 = avar(a,dimension) + v2 = avar(b,dimension) + n1 = a.shape[dimension] + n2 = b.shape[dimension] + df = n1+n2-2 + svar = ((n1-1)*v1+(n2-1)*v2) / float(df) + zerodivproblem = N.equal(svar,0) + svar = N.where(zerodivproblem,1,svar) # avoid zero-division in 1st place + t = (x1-x2)/N.sqrt(svar*(1.0/n1 + 1.0/n2)) # N-D COMPUTATION HERE!!!!!! + t = N.where(zerodivproblem,1.0,t) # replace NaN/wrong t-values with 1.0 + probs = abetai(0.5*df,0.5,float(df)/(df+t*t)) + + if type(t) == N.ndarray: + probs = N.reshape(probs,t.shape) + if probs.shape == (1,): + probs = probs[0] + + if printit <> 0: + if type(t) == N.ndarray: + t = t[0] + if type(probs) == N.ndarray: + probs = probs[0] + statname = 'Independent samples T-test.' + outputpairedstats(printit,writemode, + name1,n1,x1,v1,N.minimum.reduce(N.ravel(a)), + N.maximum.reduce(N.ravel(a)), + name2,n2,x2,v2,N.minimum.reduce(N.ravel(b)), + N.maximum.reduce(N.ravel(b)), + statname,t,probs) + return + return t, probs + + def ap2t(pval,df): + """ +Tries to compute a t-value from a p-value (or pval array) and associated df. +SLOW for large numbers of elements(!) as it re-computes p-values 20 times +(smaller step-sizes) at which point it decides it's done. Keeps the signs +of the input array. Returns 1000 (or -1000) if t>100. + +Usage: ap2t(pval,df) +Returns: an array of t-values with the shape of pval + """ + pval = N.array(pval) + signs = N.sign(pval) + pval = abs(pval) + t = N.ones(pval.shape,N.float_)*50 + step = N.ones(pval.shape,N.float_)*25 + print "Initial ap2t() prob calc" + prob = abetai(0.5*df,0.5,float(df)/(df+t*t)) + print 'ap2t() iter: ', + for i in range(10): + print i,' ', + t = N.where(pval<prob,t+step,t-step) + prob = abetai(0.5*df,0.5,float(df)/(df+t*t)) + step = step/2 + print + # since this is an ugly hack, we get ugly boundaries + t = N.where(t>99.9,1000,t) # hit upper-boundary + t = t+signs + return t #, prob, pval + + + def attest_rel (a,b,dimension=None,printit=0,name1='Samp1',name2='Samp2',writemode='a'): + """ +Calculates the t-obtained T-test on TWO RELATED samples of scores, a +and b. From Numerical Recipies, p.483. If printit=1, results are +printed to the screen. If printit='filename', the results are output +to 'filename' using the given writemode (default=append). Dimension +can equal None (ravel array first), or an integer (the dimension over +which to operate on a and b). + +Usage: attest_rel(a,b,dimension=None,printit=0, + name1='Samp1',name2='Samp2',writemode='a') +Returns: t-value, two-tailed p-value +""" + if dimension == None: + a = N.ravel(a) + b = N.ravel(b) + dimension = 0 + if len(a)<>len(b): + raise ValueError, 'Unequal length arrays.' + x1 = amean(a,dimension) + x2 = amean(b,dimension) + v1 = avar(a,dimension) + v2 = avar(b,dimension) + n = a.shape[dimension] + df = float(n-1) + d = (a-b).astype('d') + + denom = N.sqrt((n*N.add.reduce(d*d,dimension) - N.add.reduce(d,dimension)**2) /df) + zerodivproblem = N.equal(denom,0) + denom = N.where(zerodivproblem,1,denom) # avoid zero-division in 1st place + t = N.add.reduce(d,dimension) / denom # N-D COMPUTATION HERE!!!!!! + t = N.where(zerodivproblem,1.0,t) # replace NaN/wrong t-values with 1.0 + probs = abetai(0.5*df,0.5,float(df)/(df+t*t)) + if type(t) == N.ndarray: + probs = N.reshape(probs,t.shape) + if probs.shape == (1,): + probs = probs[0] + + if printit <> 0: + statname = 'Related samples T-test.' + outputpairedstats(printit,writemode, + name1,n,x1,v1,N.minimum.reduce(N.ravel(a)), + N.maximum.reduce(N.ravel(a)), + name2,n,x2,v2,N.minimum.reduce(N.ravel(b)), + N.maximum.reduce(N.ravel(b)), + statname,t,probs) + return + return t, probs + + + def achisquare(f_obs,f_exp=None): + """ +Calculates a one-way chi square for array of observed frequencies and returns +the result. If no expected frequencies are given, the total N is assumed to +be equally distributed across all groups. +@@@NOT RIGHT?? + +Usage: achisquare(f_obs, f_exp=None) f_obs = array of observed cell freq. +Returns: chisquare-statistic, associated p-value +""" + + k = len(f_obs) + if f_exp == None: + f_exp = N.array([sum(f_obs)/float(k)] * len(f_obs),N.float_) + f_exp = f_exp.astype(N.float_) + chisq = N.add.reduce((f_obs-f_exp)**2 / f_exp) + return chisq, achisqprob(chisq, k-1) + + + def aks_2samp (data1,data2): + """ +Computes the Kolmogorov-Smirnof statistic on 2 samples. Modified from +Numerical Recipies in C, page 493. Returns KS D-value, prob. Not ufunc- +like. + +Usage: aks_2samp(data1,data2) where data1 and data2 are 1D arrays +Returns: KS D-value, p-value +""" + j1 = 0 # N.zeros(data1.shape[1:]) TRIED TO MAKE THIS UFUNC-LIKE + j2 = 0 # N.zeros(data2.shape[1:]) + fn1 = 0.0 # N.zeros(data1.shape[1:],N.float_) + fn2 = 0.0 # N.zeros(data2.shape[1:],N.float_) + n1 = data1.shape[0] + n2 = data2.shape[0] + en1 = n1*1 + en2 = n2*1 + d = N.zeros(data1.shape[1:],N.float_) + data1 = N.sort(data1,0) + data2 = N.sort(data2,0) + while j1 < n1 and j2 < n2: + d1=data1[j1] + d2=data2[j2] + if d1 <= d2: + fn1 = (j1)/float(en1) + j1 = j1 + 1 + if d2 <= d1: + fn2 = (j2)/float(en2) + j2 = j2 + 1 + dt = (fn2-fn1) + if abs(dt) > abs(d): + d = dt +# try: + en = math.sqrt(en1*en2/float(en1+en2)) + prob = aksprob((en+0.12+0.11/en)*N.fabs(d)) +# except: +# prob = 1.0 + return d, prob + + + def amannwhitneyu(x,y): + """ +Calculates a Mann-Whitney U statistic on the provided scores and +returns the result. Use only when the n in each condition is < 20 and +you have 2 independent samples of ranks. REMEMBER: Mann-Whitney U is +significant if the u-obtained is LESS THAN or equal to the critical +value of U. + +Usage: amannwhitneyu(x,y) where x,y are arrays of values for 2 conditions +Returns: u-statistic, one-tailed p-value (i.e., p(z(U))) +""" + n1 = len(x) + n2 = len(y) + ranked = rankdata(N.concatenate((x,y))) + rankx = ranked[0:n1] # get the x-ranks + ranky = ranked[n1:] # the rest are y-ranks + u1 = n1*n2 + (n1*(n1+1))/2.0 - sum(rankx) # calc U for x + u2 = n1*n2 - u1 # remainder is U for y + bigu = max(u1,u2) + smallu = min(u1,u2) + proportion = bigu/float(n1*n2) + T = math.sqrt(tiecorrect(ranked)) # correction factor for tied scores + if T == 0: + raise ValueError, 'All numbers are identical in amannwhitneyu' + sd = math.sqrt(T*n1*n2*(n1+n2+1)/12.0) + z = abs((bigu-n1*n2/2.0) / sd) # normal approximation for prob calc + return smallu, 1.0 - azprob(z), proportion + + + def atiecorrect(rankvals): + """ +Tie-corrector for ties in Mann Whitney U and Kruskal Wallis H tests. +See Siegel, S. (1956) Nonparametric Statistics for the Behavioral +Sciences. New York: McGraw-Hill. Code adapted from |Stat rankind.c +code. + +Usage: atiecorrect(rankvals) +Returns: T correction factor for U or H +""" + sorted,posn = ashellsort(N.array(rankvals)) + n = len(sorted) + T = 0.0 + i = 0 + while (i<n-1): + if sorted[i] == sorted[i+1]: + nties = 1 + while (i<n-1) and (sorted[i] == sorted[i+1]): + nties = nties +1 + i = i +1 + T = T + nties**3 - nties + i = i+1 + T = T / float(n**3-n) + return 1.0 - T + + + def aranksums(x,y): + """ +Calculates the rank sums statistic on the provided scores and returns +the result. + +Usage: aranksums(x,y) where x,y are arrays of values for 2 conditions +Returns: z-statistic, two-tailed p-value +""" + n1 = len(x) + n2 = len(y) + alldata = N.concatenate((x,y)) + ranked = arankdata(alldata) + x = ranked[:n1] + y = ranked[n1:] + s = sum(x) + expected = n1*(n1+n2+1) / 2.0 + z = (s - expected) / math.sqrt(n1*n2*(n1+n2+1)/12.0) + prob = 2*(1.0 - azprob(abs(z))) + return z, prob + + + def awilcoxont(x,y): + """ +Calculates the Wilcoxon T-test for related samples and returns the +result. A non-parametric T-test. + +Usage: awilcoxont(x,y) where x,y are equal-length arrays for 2 conditions +Returns: t-statistic, two-tailed p-value +""" + if len(x) <> len(y): + raise ValueError, 'Unequal N in awilcoxont. Aborting.' + d = x-y + d = N.compress(N.not_equal(d,0),d) # Keep all non-zero differences + count = len(d) + absd = abs(d) + absranked = arankdata(absd) + r_plus = 0.0 + r_minus = 0.0 + for i in range(len(absd)): + if d[i] < 0: + r_minus = r_minus + absranked[i] + else: + r_plus = r_plus + absranked[i] + wt = min(r_plus, r_minus) + mn = count * (count+1) * 0.25 + se = math.sqrt(count*(count+1)*(2.0*count+1.0)/24.0) + z = math.fabs(wt-mn) / se + z = math.fabs(wt-mn) / se + prob = 2*(1.0 -zprob(abs(z))) + return wt, prob + + + def akruskalwallish(*args): + """ +The Kruskal-Wallis H-test is a non-parametric ANOVA for 3 or more +groups, requiring at least 5 subjects in each group. This function +calculates the Kruskal-Wallis H and associated p-value for 3 or more +independent samples. + +Usage: akruskalwallish(*args) args are separate arrays for 3+ conditions +Returns: H-statistic (corrected for ties), associated p-value +""" + assert len(args) == 3, "Need at least 3 groups in stats.akruskalwallish()" + args = list(args) + n = [0]*len(args) + n = map(len,args) + all = [] + for i in range(len(args)): + all = all + args[i].tolist() + ranked = rankdata(all) + T = tiecorrect(ranked) + for i in range(len(args)): + args[i] = ranked[0:n[i]] + del ranked[0:n[i]] + rsums = [] + for i in range(len(args)): + rsums.append(sum(args[i])**2) + rsums[i] = rsums[i] / float(n[i]) + ssbn = sum(rsums) + totaln = sum(n) + h = 12.0 / (totaln*(totaln+1)) * ssbn - 3*(totaln+1) + df = len(args) - 1 + if T == 0: + raise ValueError, 'All numbers are identical in akruskalwallish' + h = h / float(T) + return h, chisqprob(h,df) + + + def afriedmanchisquare(*args): + """ +Friedman Chi-Square is a non-parametric, one-way within-subjects +ANOVA. This function calculates the Friedman Chi-square test for +repeated measures and returns the result, along with the associated +probability value. It assumes 3 or more repeated measures. Only 3 +levels requires a minimum of 10 subjects in the study. Four levels +requires 5 subjects per level(??). + +Usage: afriedmanchisquare(*args) args are separate arrays for 2+ conditions +Returns: chi-square statistic, associated p-value +""" + k = len(args) + if k < 3: + raise ValueError, '\nLess than 3 levels. Friedman test not appropriate.\n' + n = len(args[0]) + data = apply(pstat.aabut,args) + data = data.astype(N.float_) + for i in range(len(data)): + data[i] = arankdata(data[i]) + ssbn = asum(asum(args,1)**2) + chisq = 12.0 / (k*n*(k+1)) * ssbn - 3*n*(k+1) + return chisq, achisqprob(chisq,k-1) + + +##################################### +#### APROBABILITY CALCULATIONS #### +##################################### + + def achisqprob(chisq,df): + """ +Returns the (1-tail) probability value associated with the provided chi-square +value and df. Heavily modified from chisq.c in Gary Perlman's |Stat. Can +handle multiple dimensions. + +Usage: achisqprob(chisq,df) chisq=chisquare stat., df=degrees of freedom +""" + BIG = 200.0 + def ex(x): + BIG = 200.0 + exponents = N.where(N.less(x,-BIG),-BIG,x) + return N.exp(exponents) + + if type(chisq) == N.ndarray: + arrayflag = 1 + else: + arrayflag = 0 + chisq = N.array([chisq]) + if df < 1: + return N.ones(chisq.shape,N.float) + probs = N.zeros(chisq.shape,N.float_) + probs = N.where(N.less_equal(chisq,0),1.0,probs) # set prob=1 for chisq<0 + a = 0.5 * chisq + if df > 1: + y = ex(-a) + if df%2 == 0: + even = 1 + s = y*1 + s2 = s*1 + else: + even = 0 + s = 2.0 * azprob(-N.sqrt(chisq)) + s2 = s*1 + if (df > 2): + chisq = 0.5 * (df - 1.0) + if even: + z = N.ones(probs.shape,N.float_) + else: + z = 0.5 *N.ones(probs.shape,N.float_) + if even: + e = N.zeros(probs.shape,N.float_) + else: + e = N.log(N.sqrt(N.pi)) *N.ones(probs.shape,N.float_) + c = N.log(a) + mask = N.zeros(probs.shape) + a_big = N.greater(a,BIG) + a_big_frozen = -1 *N.ones(probs.shape,N.float_) + totalelements = N.multiply.reduce(N.array(probs.shape)) + while asum(mask)<>totalelements: + e = N.log(z) + e + s = s + ex(c*z-a-e) + z = z + 1.0 +# print z, e, s + newmask = N.greater(z,chisq) + a_big_frozen = N.where(newmask*N.equal(mask,0)*a_big, s, a_big_frozen) + mask = N.clip(newmask+mask,0,1) + if even: + z = N.ones(probs.shape,N.float_) + e = N.ones(probs.shape,N.float_) + else: + z = 0.5 *N.ones(probs.shape,N.float_) + e = 1.0 / N.sqrt(N.pi) / N.sqrt(a) * N.ones(probs.shape,N.float_) + c = 0.0 + mask = N.zeros(probs.shape) + a_notbig_frozen = -1 *N.ones(probs.shape,N.float_) + while asum(mask)<>totalelements: + e = e * (a/z.astype(N.float_)) + c = c + e + z = z + 1.0 +# print '#2', z, e, c, s, c*y+s2 + newmask = N.greater(z,chisq) + a_notbig_frozen = N.where(newmask*N.equal(mask,0)*(1-a_big), + c*y+s2, a_notbig_frozen) + mask = N.clip(newmask+mask,0,1) + probs = N.where(N.equal(probs,1),1, + N.where(N.greater(a,BIG),a_big_frozen,a_notbig_frozen)) + return probs + else: + return s + + + def aerfcc(x): + """ +Returns the complementary error function erfc(x) with fractional error +everywhere less than 1.2e-7. Adapted from Numerical Recipies. Can +handle multiple dimensions. + +Usage: aerfcc(x) +""" + z = abs(x) + t = 1.0 / (1.0+0.5*z) + ans = t * N.exp(-z*z-1.26551223 + t*(1.00002368+t*(0.37409196+t*(0.09678418+t*(-0.18628806+t*(0.27886807+t*(-1.13520398+t*(1.48851587+t*(-0.82215223+t*0.17087277))))))))) + return N.where(N.greater_equal(x,0), ans, 2.0-ans) + + + def azprob(z): + """ +Returns the area under the normal curve 'to the left of' the given z value. +Thus, + for z<0, zprob(z) = 1-tail probability + for z>0, 1.0-zprob(z) = 1-tail probability + for any z, 2.0*(1.0-zprob(abs(z))) = 2-tail probability +Adapted from z.c in Gary Perlman's |Stat. Can handle multiple dimensions. + +Usage: azprob(z) where z is a z-value +""" + def yfunc(y): + x = (((((((((((((-0.000045255659 * y + +0.000152529290) * y -0.000019538132) * y + -0.000676904986) * y +0.001390604284) * y + -0.000794620820) * y -0.002034254874) * y + +0.006549791214) * y -0.010557625006) * y + +0.011630447319) * y -0.009279453341) * y + +0.005353579108) * y -0.002141268741) * y + +0.000535310849) * y +0.999936657524 + return x + + def wfunc(w): + x = ((((((((0.000124818987 * w + -0.001075204047) * w +0.005198775019) * w + -0.019198292004) * w +0.059054035642) * w + -0.151968751364) * w +0.319152932694) * w + -0.531923007300) * w +0.797884560593) * N.sqrt(w) * 2.0 + return x + + Z_MAX = 6.0 # maximum meaningful z-value + x = N.zeros(z.shape,N.float_) # initialize + y = 0.5 * N.fabs(z) + x = N.where(N.less(y,1.0),wfunc(y*y),yfunc(y-2.0)) # get x's + x = N.where(N.greater(y,Z_MAX*0.5),1.0,x) # kill those with big Z + prob = N.where(N.greater(z,0),(x+1)*0.5,(1-x)*0.5) + return prob + + + def aksprob(alam): + """ +Returns the probability value for a K-S statistic computed via ks_2samp. +Adapted from Numerical Recipies. Can handle multiple dimensions. + +Usage: aksprob(alam) +""" + if type(alam) == N.ndarray: + frozen = -1 *N.ones(alam.shape,N.float64) + alam = alam.astype(N.float64) + arrayflag = 1 + else: + frozen = N.array(-1.) + alam = N.array(alam,N.float64) + arrayflag = 1 + mask = N.zeros(alam.shape) + fac = 2.0 *N.ones(alam.shape,N.float_) + sum = N.zeros(alam.shape,N.float_) + termbf = N.zeros(alam.shape,N.float_) + a2 = N.array(-2.0*alam*alam,N.float64) + totalelements = N.multiply.reduce(N.array(mask.shape)) + for j in range(1,201): + if asum(mask) == totalelements: + break + exponents = (a2*j*j) + overflowmask = N.less(exponents,-746) + frozen = N.where(overflowmask,0,frozen) + mask = mask+overflowmask + term = fac*N.exp(exponents) + sum = sum + term + newmask = N.where(N.less_equal(abs(term),(0.001*termbf)) + + N.less(abs(term),1.0e-8*sum), 1, 0) + frozen = N.where(newmask*N.equal(mask,0), sum, frozen) + mask = N.clip(mask+newmask,0,1) + fac = -fac + termbf = abs(term) + if arrayflag: + return N.where(N.equal(frozen,-1), 1.0, frozen) # 1.0 if doesn't converge + else: + return N.where(N.equal(frozen,-1), 1.0, frozen)[0] # 1.0 if doesn't converge + + + def afprob (dfnum, dfden, F): + """ +Returns the 1-tailed significance level (p-value) of an F statistic +given the degrees of freedom for the numerator (dfR-dfF) and the degrees +of freedom for the denominator (dfF). Can handle multiple dims for F. + +Usage: afprob(dfnum, dfden, F) where usually dfnum=dfbn, dfden=dfwn +""" + if type(F) == N.ndarray: + return abetai(0.5*dfden, 0.5*dfnum, dfden/(1.0*dfden+dfnum*F)) + else: + return abetai(0.5*dfden, 0.5*dfnum, dfden/float(dfden+dfnum*F)) + + + def abetacf(a,b,x,verbose=1): + """ +Evaluates the continued fraction form of the incomplete Beta function, +betai. (Adapted from: Numerical Recipies in C.) Can handle multiple +dimensions for x. + +Usage: abetacf(a,b,x,verbose=1) +""" + ITMAX = 200 + EPS = 3.0e-7 + + arrayflag = 1 + if type(x) == N.ndarray: + frozen = N.ones(x.shape,N.float_) *-1 #start out w/ -1s, should replace all + else: + arrayflag = 0 + frozen = N.array([-1]) + x = N.array([x]) + mask = N.zeros(x.shape) + bm = az = am = 1.0 + qab = a+b + qap = a+1.0 + qam = a-1.0 + bz = 1.0-qab*x/qap + for i in range(ITMAX+1): + if N.sum(N.ravel(N.equal(frozen,-1)))==0: + break + em = float(i+1) + tem = em + em + d = em*(b-em)*x/((qam+tem)*(a+tem)) + ap = az + d*am + bp = bz+d*bm + d = -(a+em)*(qab+em)*x/((qap+tem)*(a+tem)) + app = ap+d*az + bpp = bp+d*bz + aold = az*1 + am = ap/bpp + bm = bp/bpp + az = app/bpp + bz = 1.0 + newmask = N.less(abs(az-aold),EPS*abs(az)) + frozen = N.where(newmask*N.equal(mask,0), az, frozen) + mask = N.clip(mask+newmask,0,1) + noconverge = asum(N.equal(frozen,-1)) + if noconverge <> 0 and verbose: + print 'a or b too big, or ITMAX too small in Betacf for ',noconverge,' elements' + if arrayflag: + return frozen + else: + return frozen[0] + + + def agammln(xx): + """ +Returns the gamma function of xx. + Gamma(z) = Integral(0,infinity) of t^(z-1)exp(-t) dt. +Adapted from: Numerical Recipies in C. Can handle multiple dims ... but +probably doesn't normally have to. + +Usage: agammln(xx) +""" + coeff = [76.18009173, -86.50532033, 24.01409822, -1.231739516, + 0.120858003e-2, -0.536382e-5] + x = xx - 1.0 + tmp = x + 5.5 + tmp = tmp - (x+0.5)*N.log(tmp) + ser = 1.0 + for j in range(len(coeff)): + x = x + 1 + ser = ser + coeff[j]/x + return -tmp + N.log(2.50662827465*ser) + + + def abetai(a,b,x,verbose=1): + """ +Returns the incomplete beta function: + + I-sub-x(a,b) = 1/B(a,b)*(Integral(0,x) of t^(a-1)(1-t)^(b-1) dt) + +where a,b>0 and B(a,b) = G(a)*G(b)/(G(a+b)) where G(a) is the gamma +function of a. The continued fraction formulation is implemented +here, using the betacf function. (Adapted from: Numerical Recipies in +C.) Can handle multiple dimensions. + +Usage: abetai(a,b,x,verbose=1) +""" + TINY = 1e-15 + if type(a) == N.ndarray: + if asum(N.less(x,0)+N.greater(x,1)) <> 0: + raise ValueError, 'Bad x in abetai' + x = N.where(N.equal(x,0),TINY,x) + x = N.where(N.equal(x,1.0),1-TINY,x) + + bt = N.where(N.equal(x,0)+N.equal(x,1), 0, -1) + exponents = ( gammln(a+b)-gammln(a)-gammln(b)+a*N.log(x)+b* + N.log(1.0-x) ) + # 746 (below) is the MAX POSSIBLE BEFORE OVERFLOW + exponents = N.where(N.less(exponents,-740),-740,exponents) + bt = N.exp(exponents) + if type(x) == N.ndarray: + ans = N.where(N.less(x,(a+1)/(a+b+2.0)), + bt*abetacf(a,b,x,verbose)/float(a), + 1.0-bt*abetacf(b,a,1.0-x,verbose)/float(b)) + else: + if x<(a+1)/(a+b+2.0): + ans = bt*abetacf(a,b,x,verbose)/float(a) + else: + ans = 1.0-bt*abetacf(b,a,1.0-x,verbose)/float(b) + return ans + + +##################################### +####### AANOVA CALCULATIONS ####### +##################################### + + import numpy.linalg, operator + LA = numpy.linalg + + def aglm(data,para): + """ +Calculates a linear model fit ... anova/ancova/lin-regress/t-test/etc. Taken +from: + Peterson et al. Statistical limitations in functional neuroimaging + I. Non-inferential methods and statistical models. Phil Trans Royal Soc + Lond B 354: 1239-1260. + +Usage: aglm(data,para) +Returns: statistic, p-value ??? +""" + if len(para) <> len(data): + print "data and para must be same length in aglm" + return + n = len(para) + p = pstat.aunique(para) + x = N.zeros((n,len(p))) # design matrix + for l in range(len(p)): + x[:,l] = N.equal(para,p[l]) + b = N.dot(N.dot(LA.inv(N.dot(N.transpose(x),x)), # i.e., b=inv(X'X)X'Y + N.transpose(x)), + data) + diffs = (data - N.dot(x,b)) + s_sq = 1./(n-len(p)) * N.dot(N.transpose(diffs), diffs) + + if len(p) == 2: # ttest_ind + c = N.array([1,-1]) + df = n-2 + fact = asum(1.0/asum(x,0)) # i.e., 1/n1 + 1/n2 + 1/n3 ... + t = N.dot(c,b) / N.sqrt(s_sq*fact) + probs = abetai(0.5*df,0.5,float(df)/(df+t*t)) + return t, probs + + + def aF_oneway(*args): + """ +Performs a 1-way ANOVA, returning an F-value and probability given +any number of groups. From Heiman, pp.394-7. + +Usage: aF_oneway (*args) where *args is 2 or more arrays, one per + treatment group +Returns: f-value, probability +""" + na = len(args) # ANOVA on 'na' groups, each in it's own array + means = [0]*na + vars = [0]*na + ns = [0]*na + alldata = [] + tmp = map(N.array,args) + means = map(amean,tmp) + vars = map(avar,tmp) + ns = map(len,args) + alldata = N.concatenate(args) + bign = len(alldata) + sstot = ass(alldata)-(asquare_of_sums(alldata)/float(bign)) + ssbn = 0 + for a in args: + ssbn = ssbn + asquare_of_sums(N.array(a))/float(len(a)) + ssbn = ssbn - (asquare_of_sums(alldata)/float(bign)) + sswn = sstot-ssbn + dfbn = na-1 + dfwn = bign - na + msb = ssbn/float(dfbn) + msw = sswn/float(dfwn) + f = msb/msw + prob = fprob(dfbn,dfwn,f) + return f, prob + + + def aF_value (ER,EF,dfR,dfF): + """ +Returns an F-statistic given the following: + ER = error associated with the null hypothesis (the Restricted model) + EF = error associated with the alternate hypothesis (the Full model) + dfR = degrees of freedom the Restricted model + dfF = degrees of freedom associated with the Restricted model +""" + return ((ER-EF)/float(dfR-dfF) / (EF/float(dfF))) + + + def outputfstats(Enum, Eden, dfnum, dfden, f, prob): + Enum = round(Enum,3) + Eden = round(Eden,3) + dfnum = round(Enum,3) + dfden = round(dfden,3) + f = round(f,3) + prob = round(prob,3) + suffix = '' # for *s after the p-value + if prob < 0.001: suffix = ' ***' + elif prob < 0.01: suffix = ' **' + elif prob < 0.05: suffix = ' *' + title = [['EF/ER','DF','Mean Square','F-value','prob','']] + lofl = title+[[Enum, dfnum, round(Enum/float(dfnum),3), f, prob, suffix], + [Eden, dfden, round(Eden/float(dfden),3),'','','']] + pstat.printcc(lofl) + return + + + def F_value_multivariate(ER, EF, dfnum, dfden): + """ +Returns an F-statistic given the following: + ER = error associated with the null hypothesis (the Restricted model) + EF = error associated with the alternate hypothesis (the Full model) + dfR = degrees of freedom the Restricted model + dfF = degrees of freedom associated with the Restricted model +where ER and EF are matrices from a multivariate F calculation. +""" + if type(ER) in [IntType, FloatType]: + ER = N.array([[ER]]) + if type(EF) in [IntType, FloatType]: + EF = N.array([[EF]]) + n_um = (LA.det(ER) - LA.det(EF)) / float(dfnum) + d_en = LA.det(EF) / float(dfden) + return n_um / d_en + + +##################################### +####### ASUPPORT FUNCTIONS ######## +##################################### + + def asign(a): + """ +Usage: asign(a) +Returns: array shape of a, with -1 where a<0 and +1 where a>=0 +""" + a = N.asarray(a) + if ((type(a) == type(1.4)) or (type(a) == type(1))): + return a-a-N.less(a,0)+N.greater(a,0) + else: + return N.zeros(N.shape(a))-N.less(a,0)+N.greater(a,0) + + + def asum (a, dimension=None,keepdims=0): + """ +An alternative to the Numeric.add.reduce function, which allows one to +(1) collapse over multiple dimensions at once, and/or (2) to retain +all dimensions in the original array (squashing one down to size. +Dimension can equal None (ravel array first), an integer (the +dimension over which to operate), or a sequence (operate over multiple +dimensions). If keepdims=1, the resulting array will have as many +dimensions as the input array. + +Usage: asum(a, dimension=None, keepdims=0) +Returns: array summed along 'dimension'(s), same _number_ of dims if keepdims=1 +""" + if type(a) == N.ndarray and a.dtype in [N.int_, N.short, N.ubyte]: + a = a.astype(N.float_) + if dimension == None: + s = N.sum(N.ravel(a)) + elif type(dimension) in [IntType,FloatType]: + s = N.add.reduce(a, dimension) + if keepdims == 1: + shp = list(a.shape) + shp[dimension] = 1 + s = N.reshape(s,shp) + else: # must be a SEQUENCE of dims to sum over + dims = list(dimension) + dims.sort() + dims.reverse() + s = a *1.0 + for dim in dims: + s = N.add.reduce(s,dim) + if keepdims == 1: + shp = list(a.shape) + for dim in dims: + shp[dim] = 1 + s = N.reshape(s,shp) + return s + + + def acumsum (a,dimension=None): + """ +Returns an array consisting of the cumulative sum of the items in the +passed array. Dimension can equal None (ravel array first), an +integer (the dimension over which to operate), or a sequence (operate +over multiple dimensions, but this last one just barely makes sense). + +Usage: acumsum(a,dimension=None) +""" + if dimension == None: + a = N.ravel(a) + dimension = 0 + if type(dimension) in [ListType, TupleType, N.ndarray]: + dimension = list(dimension) + dimension.sort() + dimension.reverse() + for d in dimension: + a = N.add.accumulate(a,d) + return a + else: + return N.add.accumulate(a,dimension) + + + def ass(inarray, dimension=None, keepdims=0): + """ +Squares each value in the passed array, adds these squares & returns +the result. Unfortunate function name. :-) Defaults to ALL values in +the array. Dimension can equal None (ravel array first), an integer +(the dimension over which to operate), or a sequence (operate over +multiple dimensions). Set keepdims=1 to maintain the original number +of dimensions. + +Usage: ass(inarray, dimension=None, keepdims=0) +Returns: sum-along-'dimension' for (inarray*inarray) +""" + if dimension == None: + inarray = N.ravel(inarray) + dimension = 0 + return asum(inarray*inarray,dimension,keepdims) + + + def asummult (array1,array2,dimension=None,keepdims=0): + """ +Multiplies elements in array1 and array2, element by element, and +returns the sum (along 'dimension') of all resulting multiplications. +Dimension can equal None (ravel array first), an integer (the +dimension over which to operate), or a sequence (operate over multiple +dimensions). A trivial function, but included for completeness. + +Usage: asummult(array1,array2,dimension=None,keepdims=0) +""" + if dimension == None: + array1 = N.ravel(array1) + array2 = N.ravel(array2) + dimension = 0 + return asum(array1*array2,dimension,keepdims) + + + def asquare_of_sums(inarray, dimension=None, keepdims=0): + """ +Adds the values in the passed array, squares that sum, and returns the +result. Dimension can equal None (ravel array first), an integer (the +dimension over which to operate), or a sequence (operate over multiple +dimensions). If keepdims=1, the returned array will have the same +NUMBER of dimensions as the original. + +Usage: asquare_of_sums(inarray, dimension=None, keepdims=0) +Returns: the square of the sum over dim(s) in dimension +""" + if dimension == None: + inarray = N.ravel(inarray) + dimension = 0 + s = asum(inarray,dimension,keepdims) + if type(s) == N.ndarray: + return s.astype(N.float_)*s + else: + return float(s)*s + + + def asumdiffsquared(a,b, dimension=None, keepdims=0): + """ +Takes pairwise differences of the values in arrays a and b, squares +these differences, and returns the sum of these squares. Dimension +can equal None (ravel array first), an integer (the dimension over +which to operate), or a sequence (operate over multiple dimensions). +keepdims=1 means the return shape = len(a.shape) = len(b.shape) + +Usage: asumdiffsquared(a,b) +Returns: sum[ravel(a-b)**2] +""" + if dimension == None: + inarray = N.ravel(a) + dimension = 0 + return asum((a-b)**2,dimension,keepdims) + + + def ashellsort(inarray): + """ +Shellsort algorithm. Sorts a 1D-array. + +Usage: ashellsort(inarray) +Returns: sorted-inarray, sorting-index-vector (for original array) +""" + n = len(inarray) + svec = inarray *1.0 + ivec = range(n) + gap = n/2 # integer division needed + while gap >0: + for i in range(gap,n): + for j in range(i-gap,-1,-gap): + while j>=0 and svec[j]>svec[j+gap]: + temp = svec[j] + svec[j] = svec[j+gap] + svec[j+gap] = temp + itemp = ivec[j] + ivec[j] = ivec[j+gap] + ivec[j+gap] = itemp + gap = gap / 2 # integer division needed +# svec is now sorted input vector, ivec has the order svec[i] = vec[ivec[i]] + return svec, ivec + + + def arankdata(inarray): + """ +Ranks the data in inarray, dealing with ties appropritely. Assumes +a 1D inarray. Adapted from Gary Perlman's |Stat ranksort. + +Usage: arankdata(inarray) +Returns: array of length equal to inarray, containing rank scores +""" + n = len(inarray) + svec, ivec = ashellsort(inarray) + sumranks = 0 + dupcount = 0 + newarray = N.zeros(n,N.float_) + for i in range(n): + sumranks = sumranks + i + dupcount = dupcount + 1 + if i==n-1 or svec[i] <> svec[i+1]: + averank = sumranks / float(dupcount) + 1 + for j in range(i-dupcount+1,i+1): + newarray[ivec[j]] = averank + sumranks = 0 + dupcount = 0 + return newarray + + + def afindwithin(data): + """ +Returns a binary vector, 1=within-subject factor, 0=between. Input +equals the entire data array (i.e., column 1=random factor, last +column = measured values. + +Usage: afindwithin(data) data in |Stat format +""" + numfact = len(data[0])-2 + withinvec = [0]*numfact + for col in range(1,numfact+1): + rows = pstat.linexand(data,col,pstat.unique(pstat.colex(data,1))[0]) # get 1 level of this factor + if len(pstat.unique(pstat.colex(rows,0))) < len(rows): # if fewer subjects than scores on this factor + withinvec[col-1] = 1 + return withinvec + + + ######################################################### + ######################################################### + ###### RE-DEFINE DISPATCHES TO INCLUDE ARRAYS ######### + ######################################################### + ######################################################### + +## CENTRAL TENDENCY: + geometricmean = Dispatch ( (lgeometricmean, (ListType, TupleType)), + (ageometricmean, (N.ndarray,)) ) + harmonicmean = Dispatch ( (lharmonicmean, (ListType, TupleType)), + (aharmonicmean, (N.ndarray,)) ) + mean = Dispatch ( (lmean, (ListType, TupleType)), + (amean, (N.ndarray,)) ) + median = Dispatch ( (lmedian, (ListType, TupleType)), + (amedian, (N.ndarray,)) ) + medianscore = Dispatch ( (lmedianscore, (ListType, TupleType)), + (amedianscore, (N.ndarray,)) ) + mode = Dispatch ( (lmode, (ListType, TupleType)), + (amode, (N.ndarray,)) ) + tmean = Dispatch ( (atmean, (N.ndarray,)) ) + tvar = Dispatch ( (atvar, (N.ndarray,)) ) + tstdev = Dispatch ( (atstdev, (N.ndarray,)) ) + tsem = Dispatch ( (atsem, (N.ndarray,)) ) + +## VARIATION: + moment = Dispatch ( (lmoment, (ListType, TupleType)), + (amoment, (N.ndarray,)) ) + variation = Dispatch ( (lvariation, (ListType, TupleType)), + (avariation, (N.ndarray,)) ) + skew = Dispatch ( (lskew, (ListType, TupleType)), + (askew, (N.ndarray,)) ) + kurtosis = Dispatch ( (lkurtosis, (ListType, TupleType)), + (akurtosis, (N.ndarray,)) ) + describe = Dispatch ( (ldescribe, (ListType, TupleType)), + (adescribe, (N.ndarray,)) ) + +## DISTRIBUTION TESTS + + skewtest = Dispatch ( (askewtest, (ListType, TupleType)), + (askewtest, (N.ndarray,)) ) + kurtosistest = Dispatch ( (akurtosistest, (ListType, TupleType)), + (akurtosistest, (N.ndarray,)) ) + normaltest = Dispatch ( (anormaltest, (ListType, TupleType)), + (anormaltest, (N.ndarray,)) ) + +## FREQUENCY STATS: + itemfreq = Dispatch ( (litemfreq, (ListType, TupleType)), + (aitemfreq, (N.ndarray,)) ) + scoreatpercentile = Dispatch ( (lscoreatpercentile, (ListType, TupleType)), + (ascoreatpercentile, (N.ndarray,)) ) + percentileofscore = Dispatch ( (lpercentileofscore, (ListType, TupleType)), + (apercentileofscore, (N.ndarray,)) ) + histogram = Dispatch ( (lhistogram, (ListType, TupleType)), + (ahistogram, (N.ndarray,)) ) + cumfreq = Dispatch ( (lcumfreq, (ListType, TupleType)), + (acumfreq, (N.ndarray,)) ) + relfreq = Dispatch ( (lrelfreq, (ListType, TupleType)), + (arelfreq, (N.ndarray,)) ) + +## VARIABILITY: + obrientransform = Dispatch ( (lobrientransform, (ListType, TupleType)), + (aobrientransform, (N.ndarray,)) ) + samplevar = Dispatch ( (lsamplevar, (ListType, TupleType)), + (asamplevar, (N.ndarray,)) ) + samplestdev = Dispatch ( (lsamplestdev, (ListType, TupleType)), + (asamplestdev, (N.ndarray,)) ) + signaltonoise = Dispatch( (asignaltonoise, (N.ndarray,)),) + var = Dispatch ( (lvar, (ListType, TupleType)), + (avar, (N.ndarray,)) ) + stdev = Dispatch ( (lstdev, (ListType, TupleType)), + (astdev, (N.ndarray,)) ) + sterr = Dispatch ( (lsterr, (ListType, TupleType)), + (asterr, (N.ndarray,)) ) + sem = Dispatch ( (lsem, (ListType, TupleType)), + (asem, (N.ndarray,)) ) + z = Dispatch ( (lz, (ListType, TupleType)), + (az, (N.ndarray,)) ) + zs = Dispatch ( (lzs, (ListType, TupleType)), + (azs, (N.ndarray,)) ) + +## TRIMMING FCNS: + threshold = Dispatch( (athreshold, (N.ndarray,)),) + trimboth = Dispatch ( (ltrimboth, (ListType, TupleType)), + (atrimboth, (N.ndarray,)) ) + trim1 = Dispatch ( (ltrim1, (ListType, TupleType)), + (atrim1, (N.ndarray,)) ) + +## CORRELATION FCNS: + paired = Dispatch ( (lpaired, (ListType, TupleType)), + (apaired, (N.ndarray,)) ) + lincc = Dispatch ( (llincc, (ListType, TupleType)), + (alincc, (N.ndarray,)) ) + pearsonr = Dispatch ( (lpearsonr, (ListType, TupleType)), + (apearsonr, (N.ndarray,)) ) + spearmanr = Dispatch ( (lspearmanr, (ListType, TupleType)), + (aspearmanr, (N.ndarray,)) ) + pointbiserialr = Dispatch ( (lpointbiserialr, (ListType, TupleType)), + (apointbiserialr, (N.ndarray,)) ) + kendalltau = Dispatch ( (lkendalltau, (ListType, TupleType)), + (akendalltau, (N.ndarray,)) ) + linregress = Dispatch ( (llinregress, (ListType, TupleType)), + (alinregress, (N.ndarray,)) ) + +## INFERENTIAL STATS: + ttest_1samp = Dispatch ( (lttest_1samp, (ListType, TupleType)), + (attest_1samp, (N.ndarray,)) ) + ttest_ind = Dispatch ( (lttest_ind, (ListType, TupleType)), + (attest_ind, (N.ndarray,)) ) + ttest_rel = Dispatch ( (lttest_rel, (ListType, TupleType)), + (attest_rel, (N.ndarray,)) ) + chisquare = Dispatch ( (lchisquare, (ListType, TupleType)), + (achisquare, (N.ndarray,)) ) + ks_2samp = Dispatch ( (lks_2samp, (ListType, TupleType)), + (aks_2samp, (N.ndarray,)) ) + mannwhitneyu = Dispatch ( (lmannwhitneyu, (ListType, TupleType)), + (amannwhitneyu, (N.ndarray,)) ) + tiecorrect = Dispatch ( (ltiecorrect, (ListType, TupleType)), + (atiecorrect, (N.ndarray,)) ) + ranksums = Dispatch ( (lranksums, (ListType, TupleType)), + (aranksums, (N.ndarray,)) ) + wilcoxont = Dispatch ( (lwilcoxont, (ListType, TupleType)), + (awilcoxont, (N.ndarray,)) ) + kruskalwallish = Dispatch ( (lkruskalwallish, (ListType, TupleType)), + (akruskalwallish, (N.ndarray,)) ) + friedmanchisquare = Dispatch ( (lfriedmanchisquare, (ListType, TupleType)), + (afriedmanchisquare, (N.ndarray,)) ) + +## PROBABILITY CALCS: + chisqprob = Dispatch ( (lchisqprob, (IntType, FloatType)), + (achisqprob, (N.ndarray,)) ) + zprob = Dispatch ( (lzprob, (IntType, FloatType)), + (azprob, (N.ndarray,)) ) + ksprob = Dispatch ( (lksprob, (IntType, FloatType)), + (aksprob, (N.ndarray,)) ) + fprob = Dispatch ( (lfprob, (IntType, FloatType)), + (afprob, (N.ndarray,)) ) + betacf = Dispatch ( (lbetacf, (IntType, FloatType)), + (abetacf, (N.ndarray,)) ) + betai = Dispatch ( (lbetai, (IntType, FloatType)), + (abetai, (N.ndarray,)) ) + erfcc = Dispatch ( (lerfcc, (IntType, FloatType)), + (aerfcc, (N.ndarray,)) ) + gammln = Dispatch ( (lgammln, (IntType, FloatType)), + (agammln, (N.ndarray,)) ) + +## ANOVA FUNCTIONS: + F_oneway = Dispatch ( (lF_oneway, (ListType, TupleType)), + (aF_oneway, (N.ndarray,)) ) + F_value = Dispatch ( (lF_value, (ListType, TupleType)), + (aF_value, (N.ndarray,)) ) + +## SUPPORT FUNCTIONS: + incr = Dispatch ( (lincr, (ListType, TupleType, N.ndarray)), ) + sum = Dispatch ( (lsum, (ListType, TupleType)), + (asum, (N.ndarray,)) ) + cumsum = Dispatch ( (lcumsum, (ListType, TupleType)), + (acumsum, (N.ndarray,)) ) + ss = Dispatch ( (lss, (ListType, TupleType)), + (ass, (N.ndarray,)) ) + summult = Dispatch ( (lsummult, (ListType, TupleType)), + (asummult, (N.ndarray,)) ) + square_of_sums = Dispatch ( (lsquare_of_sums, (ListType, TupleType)), + (asquare_of_sums, (N.ndarray,)) ) + sumdiffsquared = Dispatch ( (lsumdiffsquared, (ListType, TupleType)), + (asumdiffsquared, (N.ndarray,)) ) + shellsort = Dispatch ( (lshellsort, (ListType, TupleType)), + (ashellsort, (N.ndarray,)) ) + rankdata = Dispatch ( (lrankdata, (ListType, TupleType)), + (arankdata, (N.ndarray,)) ) + findwithin = Dispatch ( (lfindwithin, (ListType, TupleType)), + (afindwithin, (N.ndarray,)) ) + +###################### END OF NUMERIC FUNCTION BLOCK ##################### + +###################### END OF STATISTICAL FUNCTIONS ###################### + +except ImportError: + pass diff --git a/utils/tabulator.py b/utils/tabulator.py new file mode 100644 index 00000000..3c6b1839 --- /dev/null +++ b/utils/tabulator.py @@ -0,0 +1,967 @@ +#!/usr/bin/python + +# Copyright 2011 Google Inc. All Rights Reserved. + +import getpass +import math +import numpy +import colortrans +from email_sender import EmailSender + + +def _IsFloat(v): + try: + float(v) + return True + except ValueError: + return False + + +def _AllFloat(values): + return all([_IsFloat(v) for v in values]) + + +def _GetFloats(values): + return [float(v) for v in values] + + +def _StripNone(results): + res = [] + for result in results: + if result is not None: + res.append(result) + return res + + +class TableGenerator(object): + """Creates a table from a list of list of dicts. + + The main public function is called GetTable(). + """ + SORT_BY_KEYS = 0 + SORT_BY_KEYS_DESC = 1 + SORT_BY_VALUES = 2 + SORT_BY_VALUES_DESC = 3 + + MISSING_VALUE = "x" + + def __init__(self, d, l, sort=SORT_BY_KEYS): + self._runs = d + self._labels = l + self._sort = sort + + def _AggregateKeys(self): + keys = set([]) + for run_list in self._runs: + for run in run_list: + keys = keys.union(run.keys()) + return keys + + def _GetHighestValue(self, key): + values = [] + for run_list in self._runs: + for run in run_list: + if key in run: + values.append(run[key]) + ret = max(_StripNone(values)) + return ret + + def _GetLowestValue(self, key): + values = [] + for run_list in self._runs: + for run in run_list: + if key in run: + values.append(run[key]) + ret = min(_StripNone(values)) + return ret + + def _SortKeys(self, keys): + if self._sort == self.SORT_BY_KEYS: + return sorted(keys) + elif self._sort == self.SORT_BY_VALUES: + return sorted(keys, key=lambda x: self._GetLowestValue(x)) + elif self._sort == self.SORT_BY_VALUES_DESC: + return sorted(keys, key=lambda x: self._GetHighestValue(x), reverse=True) + else: + assert 0, "Unimplemented sort %s" % self._sort + + def _GetKeys(self): + keys = self._AggregateKeys() + return self._SortKeys(keys) + + def GetTable(self): + """Returns a table from a list of list of dicts. + + The list of list of dicts is passed into the constructor of TableGenerator. + This method converts that into a canonical list of lists which represents a + table of values. + + Args: + None + Returns: + A list of lists which is the table. + + Example: + We have the following runs: + [[{"k1": "v1", "k2": "v2"}, {"k1": "v3"}], + [{"k1": "v4", "k4": "v5"}]] + and the following labels: + ["vanilla", "modified"] + it will return: + [["Key", "vanilla", "modified"] + ["k1", ["v1", "v3"], ["v4"]] + ["k2", ["v2"], []] + ["k4", [], ["v5"]]] + The returned table can then be processed further by other classes in this + module. + """ + keys = self._GetKeys() + header = ["keys"] + self._labels + table = [header] + for k in keys: + row = [k] + for run_list in self._runs: + v = [] + for run in run_list: + if k in run: + v.append(run[k]) + else: + v.append(None) + row.append(v) + table.append(row) + return table + + +class Result(object): + """A class that respresents a single result. + + This single result is obtained by condensing the information from a list of + runs and a list of baseline runs. + """ + + def __init__(self): + pass + + def _AllStringsSame(self, values): + values_set = set(values) + return len(values_set) == 1 + + def NeedsBaseline(self): + return False + + def _Literal(self, cell, values, baseline_values): + cell.value = " ".join([str(v) for v in values]) + + def _ComputeFloat(self, cell, values, baseline_values): + self._Literal(cell, values, baseline_values) + + def _ComputeString(self, cell, values, baseline_values): + self._Literal(cell, values, baseline_values) + + def _InvertIfLowerIsBetter(self, cell): + pass + + def _GetGmean(self, values): + if not values: + return float("nan") + return (reduce(lambda x, y: x*y, values))**(1.0/len(values)) + + def Compute(self, cell, values, baseline_values): + """Compute the result given a list of values and baseline values. + + Args: + cell: A cell data structure to populate. + values: List of values. + baseline_values: List of baseline values. Can be none if this is the + baseline itself. + """ + all_floats = True + values = _StripNone(values) + if not len(values): + cell.value = "" + return + if _AllFloat(values): + float_values = _GetFloats(values) + else: + all_floats = False + if baseline_values: + baseline_values = _StripNone(baseline_values) + if baseline_values: + if _AllFloat(baseline_values): + float_baseline_values = _GetFloats(baseline_values) + else: + all_floats = False + else: + if self.NeedsBaseline(): + cell.value = "" + return + float_baseline_values = None + if all_floats: + self._ComputeFloat(cell, float_values, float_baseline_values) + self._InvertIfLowerIsBetter(cell) + else: + self._ComputeString(cell, values, baseline_values) + + +class LiteralResult(Result): + def __init__(self, iteration=0): + super(LiteralResult, self).__init__() + self.iteration = iteration + + def Compute(self, cell, values, baseline_values): + try: + if values[self.iteration]: + cell.value = values[self.iteration] + else: + cell.value = "-" + except IndexError: + cell.value = "-" + + +class NonEmptyCountResult(Result): + def Compute(self, cell, values, baseline_values): + cell.value = len(_StripNone(values)) + + +class StringMeanResult(Result): + def _ComputeString(self, cell, values, baseline_values): + if self._AllStringsSame(values): + cell.value = str(values[0]) + else: + cell.value = "?" + +class AmeanResult(StringMeanResult): + def _ComputeFloat(self, cell, values, baseline_values): + cell.value = numpy.mean(values) + + +class RawResult(Result): + pass + + +class MinResult(Result): + def _ComputeFloat(self, cell, values, baseline_values): + cell.value = min(values) + + def _ComputeString(self, cell, values, baseline_values): + if values: + cell.value = min(values) + else: + cell.value = "" + + +class MaxResult(Result): + def _ComputeFloat(self, cell, values, baseline_values): + cell.value = max(values) + + def _ComputeString(self, cell, values, baseline_values): + if values: + cell.value = max(values) + else: + cell.value = "" + + +class NumericalResult(Result): + def _ComputeString(self, cell, values, baseline_values): + cell.value = "?" + + +class StdResult(NumericalResult): + def _ComputeFloat(self, cell, values, baseline_values): + cell.value = numpy.std(values) + + +class CoeffVarResult(NumericalResult): + def _ComputeFloat(self, cell, values, baseline_values): + noise = numpy.std(values)/numpy.mean(values) + cell.value = noise + + +class ComparisonResult(Result): + def NeedsBaseline(self): + return True + + def _ComputeString(self, cell, values, baseline_values): + value = None + baseline_value = None + if self._AllStringsSame(values): + value = values[0] + if self._AllStringsSame(baseline_values): + baseline_value = baseline_values[0] + if value is not None and baseline_value is not None: + if value == baseline_value: + cell.value = "SAME" + else: + cell.value = "DIFFERENT" + else: + cell.value = "?" + + +class StatsSignificant(ComparisonResult): + def _ComputeFloat(self, cell, values, baseline_values): + if len(values) < 2 or len(baseline_values) < 2: + cell.value = float("nan") + return + import stats + _, cell.value = stats.lttest_ind(values, baseline_values) + + def _ComputeString(self, cell, values, baseline_values): + return float("nan") + +class KeyAwareComparisonResult(ComparisonResult): + def _IsLowerBetter(self, key): + lower_is_better_keys = ["milliseconds", "ms", "seconds", "KB", + "rdbytes", "wrbytes"] + return any([key.startswith(l + "_") for l in lower_is_better_keys]) + + def _InvertIfLowerIsBetter(self, cell): + if self._IsLowerBetter(cell.name): + if cell.value: + cell.value = 1.0/cell.value + + +class AmeanRatioResult(KeyAwareComparisonResult): + def _ComputeFloat(self, cell, values, baseline_values): + if numpy.mean(baseline_values) != 0: + cell.value = numpy.mean(values)/numpy.mean(baseline_values) + elif numpy.mean(values) != 0: + cell.value = 0.00 + # cell.value = 0 means the values and baseline_values have big difference + else: + cell.value = 1.00 + # no difference if both values and baseline_values are 0 + + +class GmeanRatioResult(KeyAwareComparisonResult): + def _ComputeFloat(self, cell, values, baseline_values): + if self._GetGmean(baseline_values) != 0: + cell.value = self._GetGmean(values)/self._GetGmean(baseline_values) + elif self._GetGmean(values) != 0: + cell.value = 0.00 + else: + cell.value = 1.00 + + +class Color(object): + """Class that represents color in RGBA format.""" + + def __init__(self, r=0, g=0, b=0, a=0): + self.r = r + self.g = g + self.b = b + self.a = a + + def __str__(self): + return "r: %s g: %s: b: %s: a: %s" % (self.r, self.g, self.b, self.a) + + def Round(self): + """Round RGBA values to the nearest integer.""" + self.r = int(self.r) + self.g = int(self.g) + self.b = int(self.b) + self.a = int(self.a) + + def GetRGB(self): + """Get a hex representation of the color.""" + return "%02x%02x%02x" % (self.r, self.g, self.b) + + @classmethod + def Lerp(cls, ratio, a, b): + """Perform linear interpolation between two colors. + + Args: + ratio: The ratio to use for linear polation. + a: The first color object (used when ratio is 0). + b: The second color object (used when ratio is 1). + + Returns: + Linearly interpolated color. + """ + ret = cls() + ret.r = (b.r - a.r)*ratio + a.r + ret.g = (b.g - a.g)*ratio + a.g + ret.b = (b.b - a.b)*ratio + a.b + ret.a = (b.a - a.a)*ratio + a.a + return ret + + +class Format(object): + """A class that represents the format of a column.""" + + def __init__(self): + pass + + def Compute(self, cell): + """Computes the attributes of a cell based on its value. + + Attributes typically are color, width, etc. + + Args: + cell: The cell whose attributes are to be populated. + """ + if cell.value is None: + cell.string_value = "" + if isinstance(cell.value, float): + self._ComputeFloat(cell) + else: + self._ComputeString(cell) + + def _ComputeFloat(self, cell): + cell.string_value = "{0:.2f}".format(cell.value) + + def _ComputeString(self, cell): + cell.string_value = str(cell.value) + + def _GetColor(self, value, low, mid, high, power=6, mid_value=1.0): + min_value = 0.0 + max_value = 2.0 + if math.isnan(value): + return mid + if value > mid_value: + value = max_value - mid_value/value + + return self._GetColorBetweenRange(value, min_value, mid_value, max_value, + low, mid, high, power) + + def _GetColorBetweenRange(self, + value, + min_value, mid_value, max_value, + low_color, mid_color, high_color, + power): + assert value <= max_value + assert value >= min_value + if value > mid_value: + value = (max_value - value)/(max_value - mid_value) + value **= power + ret = Color.Lerp(value, high_color, mid_color) + else: + value = (value - min_value)/(mid_value - min_value) + value **= power + ret = Color.Lerp(value, low_color, mid_color) + ret.Round() + return ret + + +class StorageFormat(Format): + """Format the cell as a storage number. + + Example: + If the cell contains a value of 1024, the string_value will be 1.0K. + """ + + def _ComputeFloat(self, cell): + base = 1024 + suffices = ["K", "M", "G"] + v = float(cell.value) + current = 0 + while v >= base**(current + 1) and current < len(suffices): + current += 1 + + if current: + divisor = base**current + cell.string_value = "%1.1f%s" % ((v/divisor), suffices[current - 1]) + else: + cell.string_value = str(cell.value) + + +class CoeffVarFormat(Format): + """Format the cell as a percent. + + Example: + If the cell contains a value of 1.5, the string_value will be +150%. + """ + + def _ComputeFloat(self, cell): + cell.string_value = "%1.1f%%" % (float(cell.value) * 100) + cell.color = self._GetColor(cell.value, + Color(0, 255, 0, 0), + Color(0, 0, 0, 0), + Color(255, 0, 0, 0), + mid_value=0.02, + power=1) + + +class PercentFormat(Format): + """Format the cell as a percent. + + Example: + If the cell contains a value of 1.5, the string_value will be +50%. + """ + + def _ComputeFloat(self, cell): + cell.string_value = "%+1.1f%%" % ((float(cell.value) - 1) * 100) + cell.color = self._GetColor(cell.value, + Color(255, 0, 0, 0), + Color(0, 0, 0, 0), + Color(0, 255, 0, 0)) + + +class RatioFormat(Format): + """Format the cell as a ratio. + + Example: + If the cell contains a value of 1.5642, the string_value will be 1.56. + """ + + def _ComputeFloat(self, cell): + cell.string_value = "%+1.1f%%" % ((cell.value - 1) * 100) + cell.color = self._GetColor(cell.value, + Color(255, 0, 0, 0), + Color(0, 0, 0, 0), + Color(0, 255, 0, 0)) + + +class ColorBoxFormat(Format): + """Format the cell as a color box. + + Example: + If the cell contains a value of 1.5, it will get a green color. + If the cell contains a value of 0.5, it will get a red color. + The intensity of the green/red will be determined by how much above or below + 1.0 the value is. + """ + + def _ComputeFloat(self, cell): + cell.string_value = "--" + bgcolor = self._GetColor(cell.value, + Color(255, 0, 0, 0), + Color(255, 255, 255, 0), + Color(0, 255, 0, 0)) + cell.bgcolor = bgcolor + cell.color = bgcolor + + +class Cell(object): + """A class to represent a cell in a table. + + Attributes: + value: The raw value of the cell. + color: The color of the cell. + bgcolor: The background color of the cell. + string_value: The string value of the cell. + suffix: A string suffix to be attached to the value when displaying. + prefix: A string prefix to be attached to the value when displaying. + color_row: Indicates whether the whole row is to inherit this cell's color. + bgcolor_row: Indicates whether the whole row is to inherit this cell's + bgcolor. + width: Optional specifier to make a column narrower than the usual width. + The usual width of a column is the max of all its cells widths. + colspan: Set the colspan of the cell in the HTML table, this is used for + table headers. Default value is 1. + name: the test name of the cell. + """ + + def __init__(self): + self.value = None + self.color = None + self.bgcolor = None + self.string_value = None + self.suffix = None + self.prefix = None + # Entire row inherits this color. + self.color_row = False + self.bgcolor_row = False + self.width = None + self.colspan = 1 + self.name = None + + def __str__(self): + l = [] + l.append("value: %s" % self.value) + l.append("string_value: %s" % self.string_value) + return " ".join(l) + + +class Column(object): + """Class representing a column in a table. + + Attributes: + result: an object of the Result class. + fmt: an object of the Format class. + """ + + def __init__(self, result, fmt, name=""): + self.result = result + self.fmt = fmt + self.name = name + + +# Takes in: +# ["Key", "Label1", "Label2"] +# ["k", ["v", "v2"], [v3]] +# etc. +# Also takes in a format string. +# Returns a table like: +# ["Key", "Label1", "Label2"] +# ["k", avg("v", "v2"), stddev("v", "v2"), etc.]] +# according to format string +class TableFormatter(object): + """Class to convert a plain table into a cell-table. + + This class takes in a table generated by TableGenerator and a list of column + formats to apply to the table and returns a table of cells. + """ + + def __init__(self, table, columns): + """The constructor takes in a table and a list of columns. + + Args: + table: A list of lists of values. + columns: A list of column containing what to produce and how to format it. + """ + self._table = table + self._columns = columns + self._table_columns = [] + self._out_table = [] + + def _GenerateCellTable(self): + row_index = 0 + + for row in self._table[1:]: + key = Cell() + key.string_value = str(row[0]) + out_row = [key] + baseline = None + for values in row[1:]: + for column in self._columns: + cell = Cell() + cell.name = key.string_value + if column.result.NeedsBaseline(): + if baseline is not None: + column.result.Compute(cell, values, baseline) + column.fmt.Compute(cell) + out_row.append(cell) + if not row_index: + self._table_columns.append(column) + else: + column.result.Compute(cell, values, baseline) + column.fmt.Compute(cell) + out_row.append(cell) + if not row_index: + self._table_columns.append(column) + + if baseline is None: + baseline = values + self._out_table.append(out_row) + row_index += 1 + + # TODO(asharif): refactor this. + # Now generate header + key = Cell() + key.string_value = "Keys" + header = [key] + for column in self._table_columns: + cell = Cell() + if column.name: + cell.string_value = column.name + else: + result_name = column.result.__class__.__name__ + format_name = column.fmt.__class__.__name__ + + cell.string_value = "%s %s" % (result_name.replace("Result", ""), + format_name.replace("Format", "")) + + header.append(cell) + + self._out_table = [header] + self._out_table + + top_header = [] + colspan = 0 + for column in self._columns: + if not column.result.NeedsBaseline(): + colspan += 1 + for label in self._table[0]: + cell = Cell() + cell.string_value = str(label) + if cell.string_value != "keys": + cell.colspan = colspan + top_header.append(cell) + + self._out_table = [top_header] + self._out_table + + return self._out_table + + def _PrintOutTable(self): + o = "" + for row in self._out_table: + for cell in row: + o += str(cell) + " " + o += "\n" + print o + + def GetCellTable(self): + """Function to return a table of cells. + + The table (list of lists) is converted into a table of cells by this + function. + + Returns: + A table of cells with each cell having the properties and string values as + requiested by the columns passed in the constructor. + """ + # Generate the cell table, creating a list of dynamic columns on the fly. + return self._GenerateCellTable() + + +class TablePrinter(object): + """Class to print a cell table to the console, file or html.""" + PLAIN = 0 + CONSOLE = 1 + HTML = 2 + TSV = 3 + EMAIL = 4 + + def __init__(self, table, output_type): + """Constructor that stores the cell table and output type.""" + self._table = table + self._output_type = output_type + + # Compute whole-table properties like max-size, etc. + def _ComputeStyle(self): + self._row_styles = [] + for row in self._table: + row_style = Cell() + for cell in row: + if cell.color_row: + assert cell.color, "Cell color not set but color_row set!" + assert not row_style.color, "Multiple row_style.colors found!" + row_style.color = cell.color + self._row_styles.append(row_style) + + self._column_styles = [] + if len(self._table) < 2: + return + for i in range(len(self._table[1])): + column_style = Cell() + for row in self._table[1:]: + column_style.width = max(column_style.width, + len(row[i].string_value)) + self._column_styles.append(column_style) + + def _GetBGColorFix(self, color): + if self._output_type == self.CONSOLE: + rgb = color.GetRGB() + prefix, _ = colortrans.rgb2short(rgb) + prefix = "\033[48;5;%sm" % prefix + suffix = "\033[0m" + elif self._output_type in [self.EMAIL, self.HTML]: + rgb = color.GetRGB() + prefix = ("<FONT style=\"BACKGROUND-COLOR:#{0}\" color =#{0}>" + .format(rgb)) + suffix = "</FONT>" + elif self._output_type in [self.PLAIN, self.TSV]: + prefix = "" + suffix = "" + return prefix, suffix + + def _GetColorFix(self, color): + if self._output_type == self.CONSOLE: + rgb = color.GetRGB() + prefix, _ = colortrans.rgb2short(rgb) + prefix = "\033[38;5;%sm" % prefix + suffix = "\033[0m" + elif self._output_type in [self.EMAIL, self.HTML]: + rgb = color.GetRGB() + prefix = "<FONT COLOR=#{0}>".format(rgb) + suffix = "</FONT>" + elif self._output_type in [self.PLAIN, self.TSV]: + prefix = "" + suffix = "" + return prefix, suffix + + def Print(self): + """Print the table to a console, html, etc. + + Returns: + A string that contains the desired representation of the table. + """ + self._ComputeStyle() + return self._GetStringValue() + + def _GetCellValue(self, i, j): + cell = self._table[i][j] + color = None + out = cell.string_value + if self._row_styles[i].color: + color = self._row_styles[i].color + elif cell.color: + color = cell.color + + if self._row_styles[i].bgcolor: + bgcolor = self._row_styles[i].bgcolor + else: + bgcolor = cell.bgcolor + + raw_width = len(out) + + if color: + p, s = self._GetColorFix(color) + out = "%s%s%s" % (p, out, s) + + if bgcolor: + p, s = self._GetBGColorFix(bgcolor) + out = "%s%s%s" % (p, out, s) + + if self._output_type in [self.PLAIN, self.CONSOLE, self.EMAIL]: + if cell.width: + width = cell.width + else: + if self._column_styles: + width = self._column_styles[j].width + else: + width = len(cell.string_value) + if cell.colspan > 1: + width = 0 + for k in range(cell.colspan): + width += self._column_styles[1 + (j-1) * cell.colspan + k].width + if width > raw_width: + padding = ("%" + str(width - raw_width) + "s") % "" + out = padding + out + + if self._output_type == self.HTML: + if i < 2: + tag = "th" + else: + tag = "td" + out = "<{0} colspan = \"{2}\"> {1} </{0}>".format(tag, out, cell.colspan) + + return out + + def _GetHorizontalSeparator(self): + if self._output_type in [self.CONSOLE, self.PLAIN, self.EMAIL]: + return " " + if self._output_type == self.HTML: + return " " + if self._output_type == self.TSV: + return "\t" + + def _GetVerticalSeparator(self): + if self._output_type in [self.PLAIN, self.CONSOLE, self.TSV, self.EMAIL]: + return "\n" + if self._output_type == self.HTML: + return "</tr>\n<tr>" + + def _GetPrefix(self): + if self._output_type in [self.PLAIN, self.CONSOLE, self.TSV, self.EMAIL]: + return "" + if self._output_type == self.HTML: + return "<p></p><table id=\"box-table-a\">\n<tr>" + + def _GetSuffix(self): + if self._output_type in [self.PLAIN, self.CONSOLE, self.TSV, self.EMAIL]: + return "" + if self._output_type == self.HTML: + return "</tr>\n</table>" + + def _GetStringValue(self): + o = "" + o += self._GetPrefix() + for i in range(len(self._table)): + row = self._table[i] + for j in range(len(row)): + out = self._GetCellValue(i, j) + o += out + self._GetHorizontalSeparator() + o += self._GetVerticalSeparator() + o += self._GetSuffix() + return o + + +# Some common drivers +def GetSimpleTable(table, out_to=TablePrinter.CONSOLE): + """Prints a simple table. + + This is used by code that has a very simple list-of-lists and wants to produce + a table with ameans, a percentage ratio of ameans and a colorbox. + + Args: + table: a list of lists. + out_to: specify the fomat of output. Currently it supports HTML and CONSOLE. + + Returns: + A string version of the table that can be printed to the console. + + Example: + GetSimpleConsoleTable([["binary", "b1", "b2"],["size", "300", "400"]]) + will produce a colored table that can be printed to the console. + """ + columns = [ + Column(AmeanResult(), + Format()), + Column(AmeanRatioResult(), + PercentFormat()), + Column(AmeanRatioResult(), + ColorBoxFormat()), + ] + our_table = [table[0]] + for row in table[1:]: + our_row = [row[0]] + for v in row[1:]: + our_row.append([v]) + our_table.append(our_row) + + tf = TableFormatter(our_table, columns) + cell_table = tf.GetCellTable() + tp = TablePrinter(cell_table, out_to) + return tp.Print() + + +def GetComplexTable(runs, labels, out_to=TablePrinter.CONSOLE): + tg = TableGenerator(runs, labels, TableGenerator.SORT_BY_VALUES_DESC) + table = tg.GetTable() + columns = [Column(LiteralResult(), + Format(), + "Literal"), + Column(AmeanResult(), + Format()), + Column(StdResult(), + Format()), + Column(CoeffVarResult(), + CoeffVarFormat()), + Column(NonEmptyCountResult(), + Format()), + Column(AmeanRatioResult(), + PercentFormat()), + Column(AmeanRatioResult(), + RatioFormat()), + Column(GmeanRatioResult(), + RatioFormat()), + ] + tf = TableFormatter(table, columns) + cell_table = tf.GetCellTable() + tp = TablePrinter(cell_table, out_to) + return tp.Print() + +if __name__ == "__main__": + # Run a few small tests here. + runs = [ + [ + {"k1": "10", "k2": "12", "k5": "40", "k6": "40", + "ms_1": "20", "k7": "FAIL", "k8": "PASS", "k9": "PASS", + "k10": "0"}, + {"k1": "13", "k2": "14", "k3": "15", "ms_1": "10", "k8": "PASS", + "k9": "FAIL", "k10": "0"} + ], + [ + {"k1": "50", "k2": "51", "k3": "52", "k4": "53", "k5": "35", "k6": + "45", "ms_1": "200", "ms_2": "20", "k7": "FAIL", "k8": "PASS", "k9": + "PASS"}, + ], + ] + labels = ["vanilla", "modified"] + t = GetComplexTable(runs, labels, TablePrinter.CONSOLE) + print t + email = GetComplexTable(runs, labels, TablePrinter.EMAIL) + + simple_table = [ + ["binary", "b1", "b2", "b3"], + ["size", 100, 105, 108], + ["rodata", 100, 80, 70], + ["data", 100, 100, 100], + ["debug", 100, 140, 60], + ] + t = GetSimpleTable(simple_table) + print t + email += GetSimpleTable(simple_table, TablePrinter.PLAIN) + email_to = [getpass.getuser()] + email = "<pre style='font-size: 13px'>%s</pre>" % email + EmailSender().SendEmail(email_to, "SimpleTableTest", email, msg_type="html") diff --git a/utils/tabulator_test.py b/utils/tabulator_test.py new file mode 100644 index 00000000..e1892b07 --- /dev/null +++ b/utils/tabulator_test.py @@ -0,0 +1,96 @@ +# Copyright 2012 Google Inc. All Rights Reserved. + +"""Tests for misc.""" + +__author__ = 'asharif@google.com (Ahmad Sharif)' + +# System modules +import unittest + +# Local modules +import tabulator + + +class TabulatorTest(unittest.TestCase): + def testResult(self): + table = ["k1", ["1", "3"], ["55"]] + result = tabulator.Result() + cell = tabulator.Cell() + result.Compute(cell, table[2], table[1]) + expected = " ".join([str(float(v)) for v in table[2]]) + self.assertTrue(cell.value == expected) + + result = tabulator.AmeanResult() + cell = tabulator.Cell() + result.Compute(cell, table[2], table[1]) + self.assertTrue(cell.value == float(table[2][0])) + + def testStringMean(self): + smr = tabulator.StringMeanResult() + cell = tabulator.Cell() + value = "PASS" + values = [value for _ in range(3)] + smr.Compute(cell, values, None) + self.assertTrue(cell.value == value) + values.append("FAIL") + smr.Compute(cell, values, None) + self.assertTrue(cell.value == "?") + + def testStorageFormat(self): + sf = tabulator.StorageFormat() + cell = tabulator.Cell() + base = 1024.0 + cell.value = base + sf.Compute(cell) + self.assertTrue(cell.string_value == "1.0K") + cell.value = base**2 + sf.Compute(cell) + self.assertTrue(cell.string_value == "1.0M") + cell.value = base**3 + sf.Compute(cell) + self.assertTrue(cell.string_value == "1.0G") + + def testLerp(self): + c1 = tabulator.Color(0, 0, 0, 0) + c2 = tabulator.Color(255, 0, 0, 0) + c3 = tabulator.Color.Lerp(0.5, c1, c2) + self.assertTrue(c3.r == 127.5) + self.assertTrue(c3.g == 0) + self.assertTrue(c3.b == 0) + self.assertTrue(c3.a == 0) + c3.Round() + self.assertTrue(c3.r == 127) + + def testTableGenerator(self): + runs = [[{"k1": "10", "k2": "12"}, + {"k1": "13", "k2": "14", "k3": "15"}], + [{"k1": "50", "k2": "51", "k3": "52", "k4": "53"},]] + labels = ["vanilla", "modified"] + tg = tabulator.TableGenerator(runs, labels) + table = tg.GetTable() + header = table.pop(0) + + self.assertTrue(header == ["keys", "vanilla", "modified"]) + row = table.pop(0) + self.assertTrue(row == ["k1", ["10", "13"], ["50"]]) + row = table.pop(0) + self.assertTrue(row == ["k2", ["12", "14"], ["51"]]) + row = table.pop(0) + self.assertTrue(row == ["k3", [None, "15"], ["52"]]) + row = table.pop(0) + self.assertTrue(row == ["k4", [None, None], ["53"]]) + + table = tg.GetTable() + columns = [ + tabulator.Column(tabulator.AmeanResult(), + tabulator.Format()), + tabulator.Column(tabulator.AmeanRatioResult(), + tabulator.PercentFormat()), + ] + tf = tabulator.TableFormatter(table, columns) + table = tf.GetCellTable() + self.assertTrue(table) + + +if __name__ == '__main__': + unittest.main() |