aboutsummaryrefslogtreecommitdiff
path: root/trappy/base.py
blob: c0238cf097c6cd1011ae03826f7a06b409c3833b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
#    Copyright 2015-2017 ARM Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Base class to parse trace.dat dumps"""

import re
import pandas as pd
import warnings

from resource import getrusage, RUSAGE_SELF

def _get_free_memory_kb():
    try:
        with open("/proc/meminfo") as f:
            memfree_line = [l for l in f.readlines() if "MemFree" in l][0]
            _, num_kb, _ = memfree_line.split()
            return int(num_kb)
    except:
        # Probably either not running on Linux (no /proc/meminfo), or format has
        # changed (we didn't find num_kb).
        return None

def trace_parser_explode_array(string, array_lengths):
    """Explode an array in the trace into individual elements for easy parsing

    Basically, turn :code:`load={1 1 2 2}` into :code:`load0=1 load1=1 load2=2
    load3=2`.

    :param string: Input string from the trace
    :type string: str

    :param array_lengths: A dictionary of array names and their
        expected length.  If we get array that's shorter than the expected
        length, additional keys have to be introduced with value 0 to
        compensate.
    :type array_lengths: dict

    For example:
    ::

        trace_parser_explode_array(string="load={1 2}",
                                   array_lengths={"load": 4})
        "load0=1 load1=2 load2=0 load3=0"
    """

    def _explode(match):
        # Rebuild one "name={v0 v1 ...}" token as "name0=v0 name1=v1 ..."
        token = match.group(0)
        basename = re.match(r"([^=]+)=", token).groups()[0]
        elements = re.search(r"{(.+)}", token).groups()[0].split(' ')

        pairs = ["{}{}={}".format(basename, idx, elem)
                 for (idx, elem) in enumerate(elements)]
        # Pad short arrays with zero-valued entries up to the expected length
        pairs += ["{}{}=0".format(basename, idx)
                  for idx in range(len(elements), array_lengths[basename])]

        return ' '.join(pairs)

    # The replacement text contains no braces, so a single pass over the
    # string expands every array exactly once.
    return re.sub(r"[^ ]+={[^}]+}", _explode, string)

class Base(object):
    """Base class to parse trace.dat dumps.

    Don't use directly, create a subclass that has a unique_word class
    variable.  unique_word is a string that can uniquely identify
    lines in the trace that correspond to this event.  This is usually
    the trace_name (optionally followed by a semicolon,
    e.g. "sched_switch:") but it can be anything else for trace points
    generated using trace_printk().

    :param parse_raw: If :code:`True`, raw trace data (-R option) to
        trace-cmd will be used

    This class acts as a base class for all TRAPpy events

    """
    def __init__(self, parse_raw=False):
        # Final DataFrame, built by create_dataframe() from the arrays below
        self.data_frame = pd.DataFrame()
        # Per-event parallel arrays, filled one entry per trace line by
        # append_data() and consumed (then cleared) by create_dataframe()
        self.data_array = []
        self.time_array = []
        self.comm_array = []
        self.pid_array = []
        self.tgid_array = []
        self.cpu_array = []
        # Optional per-event callback: called as callback(time, data_dict)
        # for every line appended, in addition to the arrays above
        self.callback = None
        self.parse_raw = parse_raw

    def finalize_object(self):
        """Hook called after parsing is complete; subclasses may override."""
        pass

    def __get_trace_array_lengths(self):
        """Calculate the lengths of all arrays in the trace

        Returns a dict with the name of each array found in the trace
        as keys and their corresponding length as value

        """
        from collections import defaultdict

        pat_array = re.compile(r"([A-Za-z0-9_]+)={([^}]+)}")

        ret = defaultdict(int)

        for line in self.data_array:
            while True:
                match = re.search(pat_array, line)
                if not match:
                    break

                (array_name, array_elements) = match.groups()

                array_len = len(array_elements.split(' '))

                # Keep the maximum length seen for each array name
                if array_len > ret[array_name]:
                    ret[array_name] = array_len

                line = line[match.end():]

            # Stop scanning if the trace doesn't have arrays
            if len(ret) == 0:
                break

        return ret

    def append_data(self, time, comm, pid, tgid, cpu, data):
        """Append data parsed from a line to the corresponding arrays

        The :mod:`DataFrame` will be created from this when the whole trace
        has been parsed.

        :param time: The time for the line that was printed in the trace
        :type time: float

        :param comm: The command name or the execname from which the trace
            line originated
        :type comm: str

        :param pid: The PID of the process from which the trace
            line originated
        :type pid: int

        :param tgid: The TGID of the process from which the trace
            line originated
        :type tgid: int

        :param cpu: The CPU on which the trace line was emitted
        :type cpu: int

        :param data: The data for matching line in the trace
        :type data: str
        """

        self.time_array.append(time)
        self.comm_array.append(comm)
        self.pid_array.append(pid)
        self.tgid_array.append(tgid)
        self.cpu_array.append(cpu)
        self.data_array.append(data)

        if not self.callback:
            return
        # BUGFIX: tgid was previously missing from this call, which made
        # every invocation with a callback registered raise a TypeError
        data_dict = self.generate_data_dict(comm, pid, tgid, cpu, data)
        self.callback(time, data_dict)

    def generate_data_dict(self, comm, pid, tgid, cpu, data_str):
        """Parse a "key=value key=value ..." trace payload into a dict

        The returned dict also carries the per-event metadata under the
        reserved keys ``__comm``, ``__pid``, ``__tgid`` and ``__cpu``.
        Values that parse as integers are converted; a whitespace-separated
        token without '=' is appended to the previous (string) value, so
        e.g. ``comm=foo bar`` yields ``{"comm": "foo bar"}``.
        """
        data_dict = {"__comm": comm, "__pid": pid, "__tgid": tgid, "__cpu": cpu}
        prev_key = None
        for field in data_str.split():
            if "=" not in field:
                # A bare token continues the previous field's value.
                # Ignore it if there is no previous field to extend, or if
                # concatenation doesn't apply (only "string" values concat)
                if prev_key is None or not isinstance(data_dict[prev_key], str):
                    continue
                data_dict[prev_key] += ' ' + field
                continue
            (key, value) = field.split('=', 1)
            try:
                value = int(value)
            except ValueError:
                # Non-numeric values stay as strings
                pass
            data_dict[key] = value
            prev_key = key
        return data_dict

    def generate_parsed_data(self):
        """Yield one data dict per appended trace line

        Iterates the parallel per-event arrays in lockstep, yielding the
        dict built by :meth:`generate_data_dict` for each event, and
        periodically checks memory usage so we can warn before pandas
        hits the system limit.
        """

        # Get a rough idea of how much memory we have to play with
        CHECK_MEM_COUNT = 10000
        kb_free = _get_free_memory_kb()
        starting_maxrss = getrusage(RUSAGE_SELF).ru_maxrss
        check_memory_usage = True
        check_memory_count = 1

        for (comm, pid, tgid, cpu, data_str) in zip(self.comm_array, self.pid_array,
                                              self.tgid_array, self.cpu_array,
                                              self.data_array):
            data_dict = self.generate_data_dict(comm, pid, tgid, cpu, data_str)

            # When running out of memory, Pandas has been observed to segfault
            # rather than throwing a proper Python error.
            # Look at how much memory our process is using and warn if we seem
            # to be getting close to the system's limit, check it only once
            # in the beginning and then every CHECK_MEM_COUNT events
            check_memory_count -= 1
            if check_memory_usage and check_memory_count == 0:
                kb_used = (getrusage(RUSAGE_SELF).ru_maxrss - starting_maxrss)
                if kb_free and kb_used > kb_free * 0.9:
                    warnings.warn("TRAPpy: Appear to be low on memory. "
                                  "If errors arise, try providing more RAM")
                    # Warn only once per parse
                    check_memory_usage = False
                check_memory_count = CHECK_MEM_COUNT

            yield data_dict

    def create_dataframe(self):
        """Create the final :mod:`pandas.DataFrame`"""
        if not self.time_array:
            return

        trace_arr_lengths = self.__get_trace_array_lengths()

        # Explode any "name={v0 v1 ...}" arrays into individual columns
        # before the DataFrame is built
        if trace_arr_lengths.items():
            for (idx, val) in enumerate(self.data_array):
                expl_val = trace_parser_explode_array(val, trace_arr_lengths)
                self.data_array[idx] = expl_val

        time_idx = pd.Index(self.time_array, name="Time")
        self.data_frame = pd.DataFrame(self.generate_parsed_data(), index=time_idx)

        # Release the raw per-event arrays now that the DataFrame holds the
        # data (tgid_array was previously leaked here)
        self.time_array = []
        self.comm_array = []
        self.pid_array = []
        self.tgid_array = []
        self.cpu_array = []
        self.data_array = []

    def write_csv(self, fname):
        """Write the csv info into a CSV file

        :param fname: The name of the CSV file
        :type fname: str
        """
        self.data_frame.to_csv(fname)

    def normalize_time(self, basetime):
        """Substract basetime from the Time of the data frame

        :param basetime: The offset which needs to be subtracted from
            the time index
        :type basetime: float
        """
        # HACK: We don't normalize anymore after the fact; this is
        # intentionally a no-op kept for interface compatibility
        return