// Copyright 2019 The Bazel Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package starlark

// This file defines a simple execution-time profiler for Starlark.
// It measures the wall time spent executing Starlark code, and emits a
// gzipped protocol message in pprof format (github.com/google/pprof).
//
// When profiling is enabled, the interpreter calls the profiler to
// indicate the start and end of each "span" or time interval. A leaf
// function (whether Go or Starlark) has a single span. A function that
// calls another function has spans for each interval in which it is the
// top of the stack. (A LOAD instruction also ends a span.)
//
// At the start of a span, the interpreter records the current time in
// the thread's topmost frame. At the end of the span, it obtains the
// time again and subtracts the span start time. The difference is added
// to an accumulator variable in the thread. If the accumulator exceeds
// some fixed quantum (10ms, say), the profiler records the current call
// stack and sends it to the profiler goroutine, along with the number
// of complete quanta, which are then subtracted from the accumulator.
// For example, if the accumulator holds 3ms and a completed span adds
// 25ms to it, its value is 28ms, which exceeds 10ms. The profiler
// records a stack with the value 20ms (2 quanta), and the accumulator
// is left with 8ms.
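//
// A minimal sketch of that accounting (span, acc, and record are
// placeholder names, not identifiers in this file; the real logic is in
// endProfSpan below):
//
//	acc += span              // e.g. 3ms + 25ms = 28ms
//	n := acc / quantum       // 2 complete quanta
//	acc -= n * quantum       // 8ms remains in the accumulator
//	record(stack, n*quantum) // emit a 20ms sample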
//
// The profiler goroutine converts the stacks into the pprof format and
// emits a gzip-compressed protocol message to the designated output
// file. We use a hand-written streaming proto encoder to avoid
// dependencies on pprof and proto, and to avoid the need to
// materialize the profile data structure in memory.
//
// A limitation of this profiler is that it measures wall time, which
// does not necessarily correspond to CPU time. A CPU profiler requires
// that only running (not runnable) threads are sampled; this is
// commonly achieved by having the kernel deliver a SIGPROF signal to an
// arbitrary running thread, through setitimer(2). The CPU profiler in the
// Go runtime uses this mechanism, but it is not possible for a Go
// application to register a SIGPROF handler, nor is it possible for a
// Go handler for some other signal to read the stack pointer of
// the interrupted thread.
//
// Two caveats:
// (1) it is tempting to send the leaf Frame directly to the profiler
// goroutine instead of making a copy of the stack, since a Frame is a
// spaghetti stack--a linked list. However, as soon as execution
// resumes, the stack's Frame.pc values may be mutated, so Frames are
// not safe to share with the asynchronous profiler goroutine.
// (2) it is tempting to use Callables as keys in a map when tabulating
// the pprof protocol's Function entities. However, we cannot assume
// that Callables are valid map keys, and furthermore we must not
// pin function values in memory indefinitely as this may cause lambda
// values to keep their free variables live much longer than necessary.

// TODO(adonovan):
// - make Start/Stop fully thread-safe.
// - fix the pc hack.
// - experiment with other values of quantum.

import (
	"bufio"
	"bytes"
	"compress/gzip"
	"encoding/binary"
	"fmt"
	"io"
	"log"
	"reflect"
	"sync/atomic"
	"time"
	"unsafe"

	"go.starlark.net/syntax"
)

// StartProfile enables time profiling of all Starlark threads,
// and writes a profile in pprof format to w.
// It must be followed by a call to StopProfile to stop
// the profiler and finalize the profile.
//
// StartProfile returns an error if profiling was already enabled.
//
// StartProfile must not be called concurrently with Starlark execution.
func StartProfile(w io.Writer) error {
	if !atomic.CompareAndSwapUint32(&profiler.on, 0, 1) {
		return fmt.Errorf("profiler already running")
	}

	// TODO(adonovan): make the API fully concurrency-safe.
	// The main challenge is racy reads/writes of profiler.events,
	// and of send/close races on the channel it refers to.
	// It's easy to solve them with a mutex but harder to do
	// it efficiently.

	profiler.events = make(chan *profEvent, 1)
	profiler.done = make(chan error)

	go profile(w)

	return nil
}

// StopProfile stops the profiler started by a prior call to
// StartProfile and finalizes the profile. It returns an error if the
// profile could not be completed.
//
// StopProfile must not be called concurrently with Starlark execution.
func StopProfile() error {
	// Terminate the profiler goroutine and get its result.
	close(profiler.events)
	err := <-profiler.done

	profiler.done = nil
	profiler.events = nil
	atomic.StoreUint32(&profiler.on, 0)

	return err
}
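
// A typical profiling session might look like the sketch below. The file
// name, thread setup, and error handling are illustrative assumptions,
// not prescribed by this API:
//
//	f, err := os.Create("starlark.pprof")
//	if err != nil {
//		log.Fatal(err)
//	}
//	if err := starlark.StartProfile(f); err != nil {
//		log.Fatal(err)
//	}
//	thread := &starlark.Thread{Name: "main"}
//	_, execErr := starlark.ExecFile(thread, "main.star", nil, nil)
//	if err := starlark.StopProfile(); err != nil {
//		log.Fatal(err)
//	}
//	if execErr != nil {
//		log.Fatal(execErr)
//	}
//	// Inspect the output with: go tool pprof starlark.pprof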

// globals
var profiler struct {
	on     uint32          // nonzero => profiler running
	events chan *profEvent // profile events from interpreter threads
	done   chan error      // result of the profiler goroutine, sent when it finishes
}

func (thread *Thread) beginProfSpan() {
	if profiler.events == nil {
		return // profiling not enabled
	}

	thread.frameAt(0).spanStart = nanotime()
}

// TODO(adonovan): experiment with smaller values,
// which trade space and time for greater precision.
const quantum = 10 * time.Millisecond

func (thread *Thread) endProfSpan() {
	if profiler.events == nil {
		return // profiling not enabled
	}

	// Add the span to the thread's accumulator.
	thread.proftime += time.Duration(nanotime() - thread.frameAt(0).spanStart)
	if thread.proftime < quantum {
		return
	}

	// Only record complete quanta.
	n := thread.proftime / quantum
	thread.proftime -= n * quantum

	// Copy the stack.
	// (We can't save thread.frame because its pc will change.)
	ev := &profEvent{
		thread: thread,
		time:   n * quantum,
	}
	ev.stack = ev.stackSpace[:0]
	for i := range thread.stack {
		fr := thread.frameAt(i)
		ev.stack = append(ev.stack, profFrame{
			pos: fr.Position(),
			fn:  fr.Callable(),
			pc:  fr.pc,
		})
	}

	profiler.events <- ev
}

type profEvent struct {
	thread     *Thread // currently unused
	time       time.Duration
	stack      []profFrame
	stackSpace [8]profFrame // initial space for stack
}

type profFrame struct {
	fn  Callable        // don't hold this live for too long (prevents GC of lambdas)
	pc  uint32          // program counter (Starlark frames only)
	pos syntax.Position // position of pc within this frame
}

// profile is the profiler goroutine.
// It runs until StopProfile is called.
func profile(w io.Writer) {
	// Field numbers from pprof protocol.
	// See https://github.com/google/pprof/blob/master/proto/profile.proto
	const (
		Profile_sample_type    = 1  // repeated ValueType
		Profile_sample         = 2  // repeated Sample
		Profile_mapping        = 3  // repeated Mapping
		Profile_location       = 4  // repeated Location
		Profile_function       = 5  // repeated Function
		Profile_string_table   = 6  // repeated string
		Profile_time_nanos     = 9  // int64
		Profile_duration_nanos = 10 // int64
		Profile_period_type    = 11 // ValueType
		Profile_period         = 12 // int64

		ValueType_type = 1 // int64
		ValueType_unit = 2 // int64

		Sample_location_id = 1 // repeated uint64
		Sample_value       = 2 // repeated int64
		Sample_label       = 3 // repeated Label

		Label_key      = 1 // int64
		Label_str      = 2 // int64
		Label_num      = 3 // int64
		Label_num_unit = 4 // int64

		Location_id         = 1 // uint64
		Location_mapping_id = 2 // uint64
		Location_address    = 3 // uint64
		Location_line       = 4 // repeated Line

		Line_function_id = 1 // uint64
		Line_line        = 2 // int64

		Function_id          = 1 // uint64
		Function_name        = 2 // int64
		Function_system_name = 3 // int64
		Function_filename    = 4 // int64
		Function_start_line  = 5 // int64
	)

	bufw := bufio.NewWriter(w) // write file in 4KB (not 240B flate-sized) chunks
	gz := gzip.NewWriter(bufw)
	enc := protoEncoder{w: gz}

	// strings
	stringIndex := make(map[string]int64)
	str := func(s string) int64 {
		i, ok := stringIndex[s]
		if !ok {
			i = int64(len(stringIndex))
			enc.string(Profile_string_table, s)
			stringIndex[s] = i
		}
		return i
	}
	str("") // entry 0

	// functions
	//
	// function returns the ID of a Callable for use in Line.FunctionId.
	// The ID is the same as the function's logical address,
	// which is supplied by the caller to avoid the need to recompute it.
	functionId := make(map[uintptr]uint64)
	function := func(fn Callable, addr uintptr) uint64 {
		id, ok := functionId[addr]
		if !ok {
			id = uint64(addr)

			var pos syntax.Position
			if fn, ok := fn.(callableWithPosition); ok {
				pos = fn.Position()
			}

			name := fn.Name()
			if name == "<toplevel>" {
				name = pos.Filename()
			}

			nameIndex := str(name)

			fun := new(bytes.Buffer)
			funenc := protoEncoder{w: fun}
			funenc.uint(Function_id, id)
			funenc.int(Function_name, nameIndex)
			funenc.int(Function_system_name, nameIndex)
			funenc.int(Function_filename, str(pos.Filename()))
			funenc.int(Function_start_line, int64(pos.Line))
			enc.bytes(Profile_function, fun.Bytes())

			functionId[addr] = id
		}
		return id
	}

	// locations
	//
	// location returns the ID of the location denoted by fr.
	// For Starlark frames, this is the Frame pc.
	locationId := make(map[uintptr]uint64)
	location := func(fr profFrame) uint64 {
		fnAddr := profFuncAddr(fr.fn)

		// For Starlark functions, the frame position
		// represents the current PC value.
		// Mix it into the low bits of the address.
		// This is super hacky and may result in collisions
		// in large functions or if functions are numerous.
		// TODO(adonovan): fix: try making this cleaner by treating
		// each bytecode segment as a Profile.Mapping.
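		// For example (illustrative values only): a function whose
		// funcode sits at address 0x1234 and whose current pc is 7
		// gets the key (0x1234<<16)^7 = 0x12340007.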
		pcAddr := fnAddr
		if _, ok := fr.fn.(*Function); ok {
			pcAddr = (pcAddr << 16) ^ uintptr(fr.pc)
		}

		id, ok := locationId[pcAddr]
		if !ok {
			id = uint64(pcAddr)

			line := new(bytes.Buffer)
			lineenc := protoEncoder{w: line}
			lineenc.uint(Line_function_id, function(fr.fn, fnAddr))
			lineenc.int(Line_line, int64(fr.pos.Line))
			loc := new(bytes.Buffer)
			locenc := protoEncoder{w: loc}
			locenc.uint(Location_id, id)
			locenc.uint(Location_address, uint64(pcAddr))
			locenc.bytes(Location_line, line.Bytes())
			enc.bytes(Profile_location, loc.Bytes())

			locationId[pcAddr] = id
		}
		return id
	}

	wallNanos := new(bytes.Buffer)
	wnenc := protoEncoder{w: wallNanos}
	wnenc.int(ValueType_type, str("wall"))
	wnenc.int(ValueType_unit, str("nanoseconds"))

	// informational fields of Profile
	enc.bytes(Profile_sample_type, wallNanos.Bytes())
	enc.int(Profile_period, quantum.Nanoseconds())     // magnitude of sampling period
	enc.bytes(Profile_period_type, wallNanos.Bytes())  // dimension and unit of period
	enc.int(Profile_time_nanos, time.Now().UnixNano()) // start (real) time of profile

	startNano := nanotime()

	// Read profile events from the channel
	// until it is closed by StopProfile.
	for e := range profiler.events {
		sample := new(bytes.Buffer)
		sampleenc := protoEncoder{w: sample}
		sampleenc.int(Sample_value, e.time.Nanoseconds()) // wall nanoseconds
		for _, fr := range e.stack {
			sampleenc.uint(Sample_location_id, location(fr))
		}
		enc.bytes(Profile_sample, sample.Bytes())
	}

	endNano := nanotime()
	enc.int(Profile_duration_nanos, endNano-startNano)

	err := gz.Close() // Close reports any prior write error
	if flushErr := bufw.Flush(); err == nil {
		err = flushErr
	}
	profiler.done <- err
}

// nanotime returns the time in nanoseconds from the runtime's monotonic
// clock; its origin is arbitrary, so only differences are meaningful.
// It is implemented by runtime.nanotime using the linkname hack;
// runtime.nanotime is defined for all OSs/archs and uses the
// monotonic system clock, which there is no portable way to access.
// Should that function ever go away, these alternatives exist:
//
//	// POSIX only. REALTIME not MONOTONIC. 17ns.
//	var tv syscall.Timeval
//	syscall.Gettimeofday(&tv) // can't fail
//	return tv.Nano()
//
//	// Portable. REALTIME not MONOTONIC. 46ns.
//	return time.Now().UnixNano()
//
//	// POSIX only. Adds a dependency.
//	import "golang.org/x/sys/unix"
//	var ts unix.Timespec
//	unix.ClockGettime(unix.CLOCK_MONOTONIC, &ts) // can't fail
//	return unix.TimespecToNsec(ts)
//
//go:linkname nanotime runtime.nanotime
func nanotime() int64

// profFuncAddr returns the canonical "address"
// of a Callable for use by the profiler.
func profFuncAddr(fn Callable) uintptr {
	switch fn := fn.(type) {
	case *Builtin:
		return reflect.ValueOf(fn.fn).Pointer()
	case *Function:
		return uintptr(unsafe.Pointer(fn.funcode))
	}

	// User-defined callable types are typically of
	// kind pointer-to-struct. Handle them specially.
	if v := reflect.ValueOf(fn); v.Type().Kind() == reflect.Ptr {
		return v.Pointer()
	}

	// Address zero is reserved by the protocol.
	// Use 1 for callables we don't recognize.
	log.Printf("Starlark profiler: no address for Callable %T", fn)
	return 1
}

// We encode the protocol message by hand to avoid making
// the interpreter depend on both github.com/google/pprof
// and github.com/golang/protobuf.
//
// This also avoids the need to materialize a protocol message object
// tree of unbounded size and serialize it all at the end.
// The pprof format appears to have been designed to
// permit streaming implementations such as this one.
//
// See https://developers.google.com/protocol-buffers/docs/encoding.
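//
// As a worked illustration of the scheme: the call str("wall") in the
// profile goroutine above writes string-table field 6 with wire type 2
// (length-delimited), so the encoder emits the tag byte 6<<3|2 = 0x32,
// the length byte 0x04, and then the four bytes of "wall".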
type protoEncoder struct {
	w   io.Writer // *bytes.Buffer or *gzip.Writer
	tmp [binary.MaxVarintLen64]byte
}

func (e *protoEncoder) uvarint(x uint64) {
	n := binary.PutUvarint(e.tmp[:], x)
	e.w.Write(e.tmp[:n])
}

func (e *protoEncoder) tag(field, wire uint) {
	e.uvarint(uint64(field<<3 | wire))
}

func (e *protoEncoder) string(field uint, s string) {
	e.tag(field, 2) // length-delimited
	e.uvarint(uint64(len(s)))
	io.WriteString(e.w, s)
}

func (e *protoEncoder) bytes(field uint, b []byte) {
	e.tag(field, 2) // length-delimited
	e.uvarint(uint64(len(b)))
	e.w.Write(b)
}

func (e *protoEncoder) uint(field uint, x uint64) {
	e.tag(field, 0) // varint
	e.uvarint(x)
}

func (e *protoEncoder) int(field uint, x int64) {
	e.tag(field, 0) // varint
	e.uvarint(uint64(x))
}