Diffstat (limited to 'starlark/profile.go')
-rw-r--r--  starlark/profile.go  449
1 file changed, 449 insertions, 0 deletions
diff --git a/starlark/profile.go b/starlark/profile.go
new file mode 100644
index 0000000..38da2b2
--- /dev/null
+++ b/starlark/profile.go
@@ -0,0 +1,449 @@
+// Copyright 2019 The Bazel Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package starlark
+
+// This file defines a simple execution-time profiler for Starlark.
+// It measures the wall time spent executing Starlark code, and emits a
+// gzipped protocol message in pprof format (github.com/google/pprof).
+//
+// When profiling is enabled, the interpreter calls the profiler to
+// indicate the start and end of each "span" or time interval. A leaf
+// function (whether Go or Starlark) has a single span. A function that
+// calls another function has spans for each interval in which it is the
+// top of the stack. (A LOAD instruction also ends a span.)
+//
+// At the start of a span, the interpreter records the current time in
+// the thread's topmost frame. At the end of the span, it obtains the
+// time again and subtracts the span start time. The difference is added
+// to an accumulator variable in the thread. If the accumulator exceeds
+// some fixed quantum (10ms, say), the profiler records the current call
+// stack and sends it to the profiler goroutine, along with the number
+// of quanta, which are subtracted. For example, if the accumulator
+// holds 3ms and then a completed span adds 25ms to it, its value is 28ms,
+// which exceeds 10ms. The profiler records a stack with the value 20ms
+// (2 quanta), and the accumulator is left with 8ms.
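+//
+// As a rough sketch of that accumulation step (the names mirror the
+// fields used by endProfSpan below; recordStack is a hypothetical
+// stand-in for sending the stack to the profiler goroutine):
+//
+//	thread.proftime += spanDuration
+//	if thread.proftime >= quantum {
+//		n := thread.proftime / quantum // number of whole quanta
+//		thread.proftime -= n * quantum // keep only the remainder
+//		recordStack(thread, n*quantum) // hypothetical helper
+//	}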
+//
+// The profiler goroutine converts the stacks into the pprof format and
+// emits a gzip-compressed protocol message to the designated output
+// file. We use a hand-written streaming proto encoder to avoid
+// dependencies on pprof and proto, and to avoid the need to
+// materialize the profile data structure in memory.
+//
+// A limitation of this profiler is that it measures wall time, which
+// does not necessarily correspond to CPU time. A CPU profiler requires
+// that only running (not runnable) threads are sampled; this is
+// commonly achieved by having the kernel deliver a SIGPROF signal to an
+// arbitrary running thread, through setitimer(2). The CPU profiler in the
+// Go runtime uses this mechanism, but it is not possible for a Go
+// application to register a SIGPROF handler, nor is it possible for a
+// Go handler for some other signal to read the stack pointer of
+// the interrupted thread.
+//
+// Two caveats:
+// (1) it is tempting to send the leaf Frame directly to the profiler
+// goroutine instead of making a copy of the stack, since a Frame is a
+// spaghetti stack--a linked list. However, as soon as execution
+// resumes, the stack's Frame.pc values may be mutated, so Frames are
+// not safe to share with the asynchronous profiler goroutine.
+// (2) it is tempting to use Callables as keys in a map when tabulating
+// the pprof protocol's Function entities. However, we cannot assume
+// that Callables are valid map keys, and furthermore we must not
+// pin function values in memory indefinitely as this may cause lambda
+// values to keep their free variables live much longer than necessary.
+
+// TODO(adonovan):
+// - make Start/Stop fully thread-safe.
+// - fix the pc hack.
+// - experiment with other values of quantum.
+
+import (
+ "bufio"
+ "bytes"
+ "compress/gzip"
+ "encoding/binary"
+ "fmt"
+ "io"
+ "log"
+ "reflect"
+ "sync/atomic"
+ "time"
+ "unsafe"
+
+ "go.starlark.net/syntax"
+)
+
+// StartProfile enables time profiling of all Starlark threads,
+// and writes a profile in pprof format to w.
+// It must be followed by a call to StopProfile to stop
+// the profiler and finalize the profile.
+//
+// StartProfile returns an error if profiling was already enabled.
+//
+// StartProfile must not be called concurrently with Starlark execution.
+func StartProfile(w io.Writer) error {
+ if !atomic.CompareAndSwapUint32(&profiler.on, 0, 1) {
+ return fmt.Errorf("profiler already running")
+ }
+
+ // TODO(adonovan): make the API fully concurrency-safe.
+ // The main challenge is racy reads/writes of profiler.events,
+ // and of send/close races on the channel it refers to.
+ // It's easy to solve them with a mutex but harder to do
+ // it efficiently.
+
+ profiler.events = make(chan *profEvent, 1)
+ profiler.done = make(chan error)
+
+ go profile(w)
+
+ return nil
+}
+
+// StopProfile stops the profiler started by a prior call to
+// StartProfile and finalizes the profile. It returns an error if the
+// profile could not be completed.
+//
+// StopProfile must not be called concurrently with Starlark execution.
+func StopProfile() error {
+ // Terminate the profiler goroutine and get its result.
+ close(profiler.events)
+ err := <-profiler.done
+
+ profiler.done = nil
+ profiler.events = nil
+ atomic.StoreUint32(&profiler.on, 0)
+
+ return err
+}
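+
+// A typical usage pattern, shown as a sketch (error handling is
+// abbreviated; the output file name, script name, and nil predeclared
+// environment are illustrative, not required by this API):
+//
+//	f, err := os.Create("starlark.prof")
+//	if err != nil { ... }
+//	if err := StartProfile(f); err != nil { ... }
+//	thread := &Thread{Name: "main"}
+//	if _, err := ExecFile(thread, "example.star", nil, nil); err != nil { ... }
+//	if err := StopProfile(); err != nil { ... }
+//	f.Close()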
+
+// globals
+var profiler struct {
+ on uint32 // nonzero => profiler running
+ events chan *profEvent // profile events from interpreter threads
+ done chan error // result of the profiler goroutine, sent when it exits
+}
+
+func (thread *Thread) beginProfSpan() {
+ if profiler.events == nil {
+ return // profiling not enabled
+ }
+
+ thread.frameAt(0).spanStart = nanotime()
+}
+
+// TODO(adonovan): experiment with smaller values,
+// which trade space and time for greater precision.
+const quantum = 10 * time.Millisecond
+
+func (thread *Thread) endProfSpan() {
+ if profiler.events == nil {
+ return // profiling not enabled
+ }
+
+ // Add the span to the thread's accumulator.
+ thread.proftime += time.Duration(nanotime() - thread.frameAt(0).spanStart)
+ if thread.proftime < quantum {
+ return
+ }
+
+ // Only record complete quanta.
+ n := thread.proftime / quantum
+ thread.proftime -= n * quantum
+
+ // Copy the stack.
+ // (We can't retain the thread's frames because their pc values will change.)
+ ev := &profEvent{
+ thread: thread,
+ time: n * quantum,
+ }
+ ev.stack = ev.stackSpace[:0]
+ for i := range thread.stack {
+ fr := thread.frameAt(i)
+ ev.stack = append(ev.stack, profFrame{
+ pos: fr.Position(),
+ fn: fr.Callable(),
+ pc: fr.pc,
+ })
+ }
+
+ profiler.events <- ev
+}
+
+type profEvent struct {
+ thread *Thread // currently unused
+ time time.Duration
+ stack []profFrame
+ stackSpace [8]profFrame // initial space for stack
+}
+
+type profFrame struct {
+ fn Callable // don't hold this live for too long (prevents GC of lambdas)
+ pc uint32 // program counter (Starlark frames only)
+ pos syntax.Position // position of pc within this frame
+}
+
+// profile is the profiler goroutine.
+// It runs until StopProfile is called.
+func profile(w io.Writer) {
+ // Field numbers from pprof protocol.
+ // See https://github.com/google/pprof/blob/master/proto/profile.proto
+ const (
+ Profile_sample_type = 1 // repeated ValueType
+ Profile_sample = 2 // repeated Sample
+ Profile_mapping = 3 // repeated Mapping
+ Profile_location = 4 // repeated Location
+ Profile_function = 5 // repeated Function
+ Profile_string_table = 6 // repeated string
+ Profile_time_nanos = 9 // int64
+ Profile_duration_nanos = 10 // int64
+ Profile_period_type = 11 // ValueType
+ Profile_period = 12 // int64
+
+ ValueType_type = 1 // int64
+ ValueType_unit = 2 // int64
+
+ Sample_location_id = 1 // repeated uint64
+ Sample_value = 2 // repeated int64
+ Sample_label = 3 // repeated Label
+
+ Label_key = 1 // int64
+ Label_str = 2 // int64
+ Label_num = 3 // int64
+ Label_num_unit = 4 // int64
+
+ Location_id = 1 // uint64
+ Location_mapping_id = 2 // uint64
+ Location_address = 3 // uint64
+ Location_line = 4 // repeated Line
+
+ Line_function_id = 1 // uint64
+ Line_line = 2 // int64
+
+ Function_id = 1 // uint64
+ Function_name = 2 // int64
+ Function_system_name = 3 // int64
+ Function_filename = 4 // int64
+ Function_start_line = 5 // int64
+ )
+
+ bufw := bufio.NewWriter(w) // write file in 4KB (not 240B flate-sized) chunks
+ gz := gzip.NewWriter(bufw)
+ enc := protoEncoder{w: gz}
+
+ // strings
+ stringIndex := make(map[string]int64)
+ str := func(s string) int64 {
+ i, ok := stringIndex[s]
+ if !ok {
+ i = int64(len(stringIndex))
+ enc.string(Profile_string_table, s)
+ stringIndex[s] = i
+ }
+ return i
+ }
+ str("") // entry 0
+
+ // functions
+ //
+ // function returns the ID of a Callable for use in Line.FunctionId.
+ // The ID is the same as the function's logical address,
+ // which is supplied by the caller to avoid the need to recompute it.
+ functionId := make(map[uintptr]uint64)
+ function := func(fn Callable, addr uintptr) uint64 {
+ id, ok := functionId[addr]
+ if !ok {
+ id = uint64(addr)
+
+ var pos syntax.Position
+ if fn, ok := fn.(callableWithPosition); ok {
+ pos = fn.Position()
+ }
+
+ name := fn.Name()
+ if name == "<toplevel>" {
+ name = pos.Filename()
+ }
+
+ nameIndex := str(name)
+
+ fun := new(bytes.Buffer)
+ funenc := protoEncoder{w: fun}
+ funenc.uint(Function_id, id)
+ funenc.int(Function_name, nameIndex)
+ funenc.int(Function_system_name, nameIndex)
+ funenc.int(Function_filename, str(pos.Filename()))
+ funenc.int(Function_start_line, int64(pos.Line))
+ enc.bytes(Profile_function, fun.Bytes())
+
+ functionId[addr] = id
+ }
+ return id
+ }
+
+ // locations
+ //
+ // location returns the ID of the location denoted by fr.
+ // For Starlark frames, this is the Frame pc.
+ locationId := make(map[uintptr]uint64)
+ location := func(fr profFrame) uint64 {
+ fnAddr := profFuncAddr(fr.fn)
+
+ // For Starlark functions, the frame position
+ // represents the current PC value.
+ // Mix it into the low bits of the address.
+ // This is super hacky and may result in collisions
+ // in large functions or if functions are numerous.
+ // TODO(adonovan): fix; a cleaner approach may be to treat
+ // each bytecode segment as a Profile.Mapping.
+ pcAddr := fnAddr
+ if _, ok := fr.fn.(*Function); ok {
+ pcAddr = (pcAddr << 16) ^ uintptr(fr.pc)
+ }
+
+ id, ok := locationId[pcAddr]
+ if !ok {
+ id = uint64(pcAddr)
+
+ line := new(bytes.Buffer)
+ lineenc := protoEncoder{w: line}
+ lineenc.uint(Line_function_id, function(fr.fn, fnAddr))
+ lineenc.int(Line_line, int64(fr.pos.Line))
+ loc := new(bytes.Buffer)
+ locenc := protoEncoder{w: loc}
+ locenc.uint(Location_id, id)
+ locenc.uint(Location_address, uint64(pcAddr))
+ locenc.bytes(Location_line, line.Bytes())
+ enc.bytes(Profile_location, loc.Bytes())
+
+ locationId[pcAddr] = id
+ }
+ return id
+ }
+
+ wallNanos := new(bytes.Buffer)
+ wnenc := protoEncoder{w: wallNanos}
+ wnenc.int(ValueType_type, str("wall"))
+ wnenc.int(ValueType_unit, str("nanoseconds"))
+
+ // informational fields of Profile
+ enc.bytes(Profile_sample_type, wallNanos.Bytes())
+ enc.int(Profile_period, quantum.Nanoseconds()) // magnitude of sampling period
+ enc.bytes(Profile_period_type, wallNanos.Bytes()) // dimension and unit of period
+ enc.int(Profile_time_nanos, time.Now().UnixNano()) // start (real) time of profile
+
+ startNano := nanotime()
+
+ // Read profile events from the channel
+ // until it is closed by StopProfile.
+ for e := range profiler.events {
+ sample := new(bytes.Buffer)
+ sampleenc := protoEncoder{w: sample}
+ sampleenc.int(Sample_value, e.time.Nanoseconds()) // wall nanoseconds
+ for _, fr := range e.stack {
+ sampleenc.uint(Sample_location_id, location(fr))
+ }
+ enc.bytes(Profile_sample, sample.Bytes())
+ }
+
+ endNano := nanotime()
+ enc.int(Profile_duration_nanos, endNano-startNano)
+
+ err := gz.Close() // Close reports any prior write error
+ if flushErr := bufw.Flush(); err == nil {
+ err = flushErr
+ }
+ profiler.done <- err
+}
+
+// nanotime returns the time in nanoseconds since epoch.
+// It is implemented by runtime.nanotime using the linkname hack;
+// runtime.nanotime is defined for all OSs/ARCHS and uses the
+// monotonic system clock, which there is no portable way to access.
+// Should that function ever go away, these alternatives exist:
+//
+// // POSIX only. REALTIME not MONOTONIC. 17ns.
+// var tv syscall.Timeval
+// syscall.Gettimeofday(&tv) // can't fail
+// return tv.Nano()
+//
+// // Portable. REALTIME not MONOTONIC. 46ns.
+ // return time.Now().UnixNano()
+//
+// // POSIX only. Adds a dependency.
+// import "golang.org/x/sys/unix"
+// var ts unix.Timespec
+ // unix.ClockGettime(unix.CLOCK_MONOTONIC, &ts) // can't fail
+// return unix.TimespecToNsec(ts)
+//
+//go:linkname nanotime runtime.nanotime
+func nanotime() int64
+
+// profFuncAddr returns the canonical "address"
+// of a Callable for use by the profiler.
+func profFuncAddr(fn Callable) uintptr {
+ switch fn := fn.(type) {
+ case *Builtin:
+ return reflect.ValueOf(fn.fn).Pointer()
+ case *Function:
+ return uintptr(unsafe.Pointer(fn.funcode))
+ }
+
+ // User-defined callable types are typically
+ // of kind pointer-to-struct. Handle them specially.
+ if v := reflect.ValueOf(fn); v.Type().Kind() == reflect.Ptr {
+ return v.Pointer()
+ }
+
+ // Address zero is reserved by the protocol.
+ // Use 1 for callables we don't recognize.
+ log.Printf("Starlark profiler: no address for Callable %T", fn)
+ return 1
+}
+
+// We encode the protocol message by hand to avoid making
+// the interpreter depend on both github.com/google/pprof
+// and github.com/golang/protobuf.
+//
+// This also avoids the need to materialize a protocol message object
+// tree of unbounded size and serialize it all at the end.
+// The pprof format appears to have been designed to
+// permit streaming implementations such as this one.
+//
+// See https://developers.google.com/protocol-buffers/docs/encoding.
+type protoEncoder struct {
+ w io.Writer // *bytes.Buffer or *gzip.Writer
+ tmp [binary.MaxVarintLen64]byte
+}
+
+func (e *protoEncoder) uvarint(x uint64) {
+ n := binary.PutUvarint(e.tmp[:], x)
+ e.w.Write(e.tmp[:n])
+}
+
+func (e *protoEncoder) tag(field, wire uint) {
+ e.uvarint(uint64(field<<3 | wire))
+}
+
+func (e *protoEncoder) string(field uint, s string) {
+ e.tag(field, 2) // length-delimited
+ e.uvarint(uint64(len(s)))
+ io.WriteString(e.w, s)
+}
+
+func (e *protoEncoder) bytes(field uint, b []byte) {
+ e.tag(field, 2) // length-delimited
+ e.uvarint(uint64(len(b)))
+ e.w.Write(b)
+}
+
+func (e *protoEncoder) uint(field uint, x uint64) {
+ e.tag(field, 0) // varint
+ e.uvarint(x)
+}
+
+func (e *protoEncoder) int(field uint, x int64) {
+ e.tag(field, 0) // varint
+ e.uvarint(uint64(x))
+}
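+
+// For illustration, a sketch of the wire bytes produced by a string
+// table entry such as str("wall") (field Profile_string_table = 6,
+// wire type 2, length-delimited):
+//
+//	0x32                  // tag: field 6 << 3 | wire type 2
+//	0x04                  // length: 4 bytes
+//	0x77 0x61 0x6c 0x6c   // "wall"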