1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
|
# Copyright 2015-2017 ARM Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Base class to parse trace.dat dumps"""
import re
import pandas as pd
import warnings
from resource import getrusage, RUSAGE_SELF
def _get_free_memory_kb():
try:
with open("/proc/meminfo") as f:
memfree_line = [l for l in f.readlines() if "MemFree" in l][0]
_, num_kb, _ = memfree_line.split()
return int(num_kb)
except:
# Probably either not running on Linux (no /proc/meminfo), or format has
# changed (we didn't find num_kb).
return None
def trace_parser_explode_array(string, array_lengths):
    """Explode an array in the trace into individual elements for easy parsing

    Basically, turn :code:`load={1 1 2 2}` into :code:`load0=1 load1=1 load2=2
    load3=2`.

    :param string: Input string from the trace
    :type string: str

    :param array_lengths: A dictionary of array names and their
        expected length. If we get array that's shorter than the expected
        length, additional keys have to be introduced with value 0 to
        compensate.
    :type array_lengths: dict

    For example:
    ::

        trace_parser_explode_array(string="load={1 2}",
                                   array_lengths={"load": 4})
        "load0=1 load1=2 load2=0 load3=0"
    """
    array_pat = re.compile(r"[^ ]+={[^}]+}")

    # Rewrite one array occurrence per pass until none remain.
    found = array_pat.search(string)
    while found is not None:
        chunk = found.group()
        # "name={v1 v2 ...}": name ends at the first '=', the values sit
        # between the first '{' and the final '}'.
        basename = chunk[:chunk.index("=")]
        values = chunk[chunk.index("{") + 1:-1].split(' ')

        # One "nameN=value" per element, zero-padded up to the expected length.
        fields = ["{}{}={}".format(basename, idx, val)
                  for (idx, val) in enumerate(values)]
        fields.extend("{}{}=0".format(basename, idx)
                      for idx in range(len(values), array_lengths[basename]))

        string = string[:found.start()] + " ".join(fields) + string[found.end():]
        found = array_pat.search(string)

    return string
class Base(object):
    """Base class to parse trace.dat dumps.

    Don't use directly, create a subclass that has a unique_word class
    variable.  unique_word is a string that can uniquely identify
    lines in the trace that correspond to this event.  This is usually
    the trace_name (optionally followed by a semicolon,
    e.g. "sched_switch:") but it can be anything else for trace points
    generated using trace_printk().

    :param parse_raw: If :code:`True`, raw trace data (-R option) to
        trace-cmd will be used

    This class acts as a base class for all TRAPpy events
    """
    def __init__(self, parse_raw=False):
        self.data_frame = pd.DataFrame()
        self.data_array = []
        self.time_array = []
        self.comm_array = []
        self.pid_array = []
        self.tgid_array = []
        self.cpu_array = []
        self.callback = None
        self.parse_raw = parse_raw

    def finalize_object(self):
        pass

    def __get_trace_array_lengths(self):
        """Calculate the lengths of all arrays in the trace

        Returns a dict with the name of each array found in the trace
        as keys and their corresponding length as value
        """
        from collections import defaultdict

        pat_array = re.compile(r"([A-Za-z0-9_]+)={([^}]+)}")

        ret = defaultdict(int)

        for line in self.data_array:
            while True:
                match = re.search(pat_array, line)
                if not match:
                    break
                (array_name, array_elements) = match.groups()

                # Track the longest occurrence of each array name
                array_len = len(array_elements.split(' '))
                if array_len > ret[array_name]:
                    ret[array_name] = array_len

                line = line[match.end():]

            # Stop scanning if the trace doesn't have arrays
            if len(ret) == 0:
                break

        return ret

    def append_data(self, time, comm, pid, tgid, cpu, data):
        """Append data parsed from a line to the corresponding arrays

        The :mod:`DataFrame` will be created from this when the whole trace
        has been parsed.

        :param time: The time for the line that was printed in the trace
        :type time: float

        :param comm: The command name or the execname from which the trace
            line originated
        :type comm: str

        :param pid: The PID of the process from which the trace
            line originated
        :type pid: int

        :param tgid: The TGID of the process from which the trace
            line originated
        :type tgid: int

        :param cpu: The CPU on which the trace line was emitted
        :type cpu: int

        :param data: The data for matching line in the trace
        :type data: str
        """
        self.time_array.append(time)
        self.comm_array.append(comm)
        self.pid_array.append(pid)
        self.tgid_array.append(tgid)
        self.cpu_array.append(cpu)
        self.data_array.append(data)

        if not self.callback:
            return
        # Fix: tgid was previously dropped from this call, which raised
        # TypeError whenever a callback was registered.
        data_dict = self.generate_data_dict(comm, pid, tgid, cpu, data)
        self.callback(time, data_dict)

    def generate_data_dict(self, comm, pid, tgid, cpu, data_str):
        """Parse a trace line's "key=value ..." payload into a dict.

        Values that parse as integers are stored as ints; everything else
        stays a string.  A whitespace-separated token without "=" is
        appended to the previous string value (e.g. multi-word comm names).
        The process metadata is stored under the reserved "__comm",
        "__pid", "__tgid" and "__cpu" keys.
        """
        data_dict = {"__comm": comm, "__pid": pid, "__tgid": tgid, "__cpu": cpu}
        prev_key = None
        for field in data_str.split():
            if "=" not in field:
                # Concatenation is supported only for "string" values;
                # also skip a leading bare token (no previous key yet).
                if prev_key is None or type(data_dict[prev_key]) is not str:
                    continue
                data_dict[prev_key] += ' ' + field
                continue
            (key, value) = field.split('=', 1)
            try:
                value = int(value)
            except ValueError:
                pass
            data_dict[key] = value
            prev_key = key
        return data_dict

    def generate_parsed_data(self):
        """Yield one parsed data dict per trace line, watching memory use."""

        # Get a rough idea of how much memory we have to play with
        CHECK_MEM_COUNT = 10000
        kb_free = _get_free_memory_kb()
        starting_maxrss = getrusage(RUSAGE_SELF).ru_maxrss
        check_memory_usage = True
        check_memory_count = 1

        for (comm, pid, tgid, cpu, data_str) in zip(self.comm_array, self.pid_array,
                                                    self.tgid_array, self.cpu_array,
                                                    self.data_array):
            data_dict = self.generate_data_dict(comm, pid, tgid, cpu, data_str)

            # When running out of memory, Pandas has been observed to segfault
            # rather than throwing a proper Python error.
            # Look at how much memory our process is using and warn if we seem
            # to be getting close to the system's limit, check it only once
            # in the beginning and then every CHECK_MEM_COUNT events
            check_memory_count -= 1
            if check_memory_usage and check_memory_count == 0:
                kb_used = (getrusage(RUSAGE_SELF).ru_maxrss - starting_maxrss)
                if kb_free and kb_used > kb_free * 0.9:
                    warnings.warn("TRAPpy: Appear to be low on memory. "
                                  "If errors arise, try providing more RAM")
                    check_memory_usage = False
                check_memory_count = CHECK_MEM_COUNT

            yield data_dict

    def create_dataframe(self):
        """Create the final :mod:`pandas.DataFrame`"""
        if not self.time_array:
            return

        trace_arr_lengths = self.__get_trace_array_lengths()

        if trace_arr_lengths.items():
            for (idx, val) in enumerate(self.data_array):
                expl_val = trace_parser_explode_array(val, trace_arr_lengths)
                self.data_array[idx] = expl_val

        time_idx = pd.Index(self.time_array, name="Time")
        self.data_frame = pd.DataFrame(self.generate_parsed_data(), index=time_idx)

        # Release the raw parsing arrays; the DataFrame holds the data now.
        # Fix: tgid_array was previously not cleared here.
        self.time_array = []
        self.comm_array = []
        self.pid_array = []
        self.tgid_array = []
        self.cpu_array = []
        self.data_array = []

    def write_csv(self, fname):
        """Write the csv info into a CSV file

        :param fname: The name of the CSV file
        :type fname: str
        """
        self.data_frame.to_csv(fname)

    def normalize_time(self, basetime):
        """Substract basetime from the Time of the data frame

        :param basetime: The offset which needs to be subtracted from
            the time index
        :type basetime: float
        """
        # HACK: We don't normalize anymore after the fact
        return

        if basetime and not self.data_frame.empty:
            self.data_frame.reset_index(inplace=True)
            self.data_frame["Time"] = self.data_frame["Time"] - basetime
            self.data_frame.set_index("Time", inplace=True)
|