path: root/trappy/utils.py
blob: a06ff1d1cabbc10d51da885028dd13b45f51cf49
#    Copyright 2015-2017 ARM Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Generic functions that can be used in multiple places in trappy
"""

import pandas as pd
import numpy as np

def listify(to_select):
    """Utitlity function to handle both single and
    list inputs
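
    For example (illustrative event names):
    ::

        listify("sched_switch")                  # -> ["sched_switch"]
        listify(["sched_switch", "cpu_idle"])    # -> ["sched_switch", "cpu_idle"]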
    """

    if not isinstance(to_select, list):
        to_select = [to_select]

    return to_select

def handle_duplicate_index(data,
                           max_delta=0.000001):
    """Handle duplicate values in index

    :param data: The timeseries input
    :type data: :class:`pandas.Series`

    :param max_delta: Maximum interval adjustment value that
        will be added to duplicate indices
    :type max_delta: float

    Consider the following case where a series needs to be reindexed
    to a new index (which can be required when different series need to
    be combined and compared):
    ::

        import pandas
        values = [0, 1, 2, 3, 4]
        index = [0.0, 1.0, 1.0, 6.0, 7.0]
        series = pandas.Series(values, index=index)
        new_index = [0.0, 1.0, 2.0, 3.0, 4.0, 6.0, 7.0]
        series.reindex(new_index)

    The above code fails with:
    ::

        ValueError: cannot reindex from a duplicate axis

    The function :func:`handle_duplicate_index` makes the duplicate
    index values unique by adding small deltas, after which the series
    can be reindexed:
    ::

        import pandas
        from trappy.utils import handle_duplicate_index

        values = [0, 1, 2, 3, 4]
        index = [0.0, 1.0, 1.0, 6.0, 7.0]
        series = pandas.Series(values, index=index)
        series = handle_duplicate_index(series)
        print(series.index.values)
        # [ 0.        1.        1.000001  6.        7.      ]
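
    The delta for a group of duplicates is the gap to the next distinct
    index value divided by the number of duplicates, clamped to
    max_delta; each duplicate after the first is shifted by one more
    delta than the previous. A sketch with three duplicates, assuming
    the default max_delta:
    ::

        series = pandas.Series(range(4), index=[0.0, 1.0, 1.0, 1.0])
        series = handle_duplicate_index(series)
        print(series.index.values)
        # [ 0.        1.        1.000001  1.000002]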

    """

    index = data.index
    # Work on a copy so that the input's index is not mutated in place
    new_index = index.values.copy()

    # Each duplicated index value, taken once (Index.get_duplicates()
    # was removed from recent pandas)
    dups = index[index.duplicated()].unique()

    for dup in dups:
        # Leave one of the values intact
        dup_index_left = index.searchsorted(dup, side="left")
        dup_index_right = index.searchsorted(dup, side="right") - 1
        num_dups = dup_index_right - dup_index_left + 1

        # Calculate delta that needs to be added to each duplicate
        # index
        try:
            delta = (index[dup_index_right + 1] - dup) / num_dups
        except IndexError:
            # dup_index_right + 1 is outside of the series (i.e. the
            # dup is at the end of the series).
            delta = max_delta

        # Clamp the maximum delta added to max_delta
        if delta > max_delta:
            delta = max_delta

        # Shift each duplicate after the first by an increasing
        # multiple of delta, so the values stay ordered and strictly
        # below the next distinct index value (the original doubling of
        # delta on every step could overshoot the next index value and
        # reintroduce a duplicate)
        for i in range(1, num_dups):
            new_index[dup_index_left + i] += delta * i

    # Assign the de-duplicated index to a copy of the input; reindex()
    # cannot be used here because the new values are not present in the
    # old index
    data = data.copy()
    data.index = pd.Index(new_index)
    return data

# Iterate quickly over all rows in a data frame and apply fn to each row
def apply_callback(df, fn, *args):
    """Call fn once for each row of df, with the row passed as a dict
    mapping column name ('Time' for the index) to value. Any extra
    positional arguments are forwarded to fn as a tuple.
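
    A minimal usage sketch (hypothetical dataframe; 'load' is an
    illustrative column name, not something trappy defines):
    ::

        df = pd.DataFrame({"load": [1, 2]},
                          index=pd.Index([0.1, 0.2], name="Time"))
        rows = []
        apply_callback(df, rows.append)
        # rows == [{"Time": 0.1, "load": 1}, {"Time": 0.2, "load": 2}]
    """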
    iters = df.itertuples()
    event_tuple = next(iters, None)

    # Column names beginning with underscore will not be preserved in tuples
    # due to constraints on namedtuple field names, so store mappings from
    # column name to column number for each trace event.
    col_idxs = { name: idx for idx, name in enumerate(['Time'] + df.columns.tolist()) }

    while event_tuple is not None:
        event_dict = { col: event_tuple[idx] for col, idx in col_idxs.items() }

        if args:
            fn(event_dict, args)
        else:
            fn(event_dict)

        event_tuple = next(iters, None)


def merge_dfs(pr_df, sec_df, pivot):
    """Merge the primary and secondary dataframes: NaN fields of each
    primary event are filled in from the most recent secondary event
    that shares the same value of the pivot column.
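
    A minimal sketch (hypothetical data: 'cpu', 'load' and 'temp' are
    illustrative column names, and the '__line' column, which trappy
    normally adds while parsing a trace, must be present in both
    frames):
    ::

        pr_df = pd.DataFrame(
            {"cpu": [0], "load": [42], "temp": [float("nan")], "__line": [1]},
            index=pd.Index([0.2], name="Time"))
        sec_df = pd.DataFrame(
            {"cpu": [0], "temp": [55], "__line": [0]},
            index=pd.Index([0.1], name="Time"))
        merged = merge_dfs(pr_df, sec_df, "cpu")
        # merged.loc[0.2, "temp"] -> 55.0
    """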
    # Track the last secondary event seen for each pivot value
    pivot_map = {}

    # A list accumulating dicts with merged data
    merged_data = []
    def df_fn(data):
        # Store the latest secondary info
        if data['Time'][0] == 'secondary':
            pivot_map[data[pivot]] = data
            # Get rid of primary/secondary labels
            data['Time'] = data['Time'][1]
            return

        # Propagate the latest secondary info
        for key, value in data.items():
            if key == pivot:
                continue
            # Fast check for whether value is NaN (NaN != NaN; faster
            # than np.isnan wrapped in try/except)
            if value != value and data[pivot] in pivot_map:
                data[key] = pivot_map[data[pivot]][key]

        # Get rid of primary/secondary labels
        data['Time'] = data['Time'][1]
        merged_data.append(data)

    # Interleave both traces in original trace order using the '__line'
    # column that trappy adds to each trace event dataframe
    df = pd.concat([pr_df, sec_df],
                   keys=['primary', 'secondary']).sort_values(by='__line')
    apply_callback(df, df_fn)
    merged_df = pd.DataFrame(merged_data)
    merged_df.set_index('Time', inplace=True)

    return merged_df