diff options
Diffstat (limited to 'starlark/profile.go')
-rw-r--r-- | starlark/profile.go | 449 |
1 files changed, 449 insertions, 0 deletions
diff --git a/starlark/profile.go b/starlark/profile.go new file mode 100644 index 0000000..38da2b2 --- /dev/null +++ b/starlark/profile.go @@ -0,0 +1,449 @@ +// Copyright 2019 The Bazel Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package starlark + +// This file defines a simple execution-time profiler for Starlark. +// It measures the wall time spent executing Starlark code, and emits a +// gzipped protocol message in pprof format (github.com/google/pprof). +// +// When profiling is enabled, the interpreter calls the profiler to +// indicate the start and end of each "span" or time interval. A leaf +// function (whether Go or Starlark) has a single span. A function that +// calls another function has spans for each interval in which it is the +// top of the stack. (A LOAD instruction also ends a span.) +// +// At the start of a span, the interpreter records the current time in +// the thread's topmost frame. At the end of the span, it obtains the +// time again and subtracts the span start time. The difference is added +// to an accumulator variable in the thread. If the accumulator exceeds +// some fixed quantum (10ms, say), the profiler records the current call +// stack and sends it to the profiler goroutine, along with the number +// of quanta, which are subtracted. For example, if the accumulator +// holds 3ms and then a completed span adds 25ms to it, its value is 28ms, +// which exceeeds 10ms. The profiler records a stack with the value 20ms +// (2 quanta), and the accumulator is left with 8ms. +// +// The profiler goroutine converts the stacks into the pprof format and +// emits a gzip-compressed protocol message to the designated output +// file. We use a hand-written streaming proto encoder to avoid +// dependencies on pprof and proto, and to avoid the need to +// materialize the profile data structure in memory. +// +// A limitation of this profiler is that it measures wall time, which +// does not necessarily correspond to CPU time. A CPU profiler requires +// that only running (not runnable) threads are sampled; this is +// commonly achieved by having the kernel deliver a (PROF) signal to an +// arbitrary running thread, through setitimer(2). The CPU profiler in the +// Go runtime uses this mechanism, but it is not possible for a Go +// application to register a SIGPROF handler, nor is it possible for a +// Go handler for some other signal to read the stack pointer of +// the interrupted thread. +// +// Two caveats: +// (1) it is tempting to send the leaf Frame directly to the profiler +// goroutine instead of making a copy of the stack, since a Frame is a +// spaghetti stack--a linked list. However, as soon as execution +// resumes, the stack's Frame.pc values may be mutated, so Frames are +// not safe to share with the asynchronous profiler goroutine. +// (2) it is tempting to use Callables as keys in a map when tabulating +// the pprof protocols's Function entities. However, we cannot assume +// that Callables are valid map keys, and furthermore we must not +// pin function values in memory indefinitely as this may cause lambda +// values to keep their free variables live much longer than necessary. + +// TODO(adonovan): +// - make Start/Stop fully thread-safe. +// - fix the pc hack. +// - experiment with other values of quantum. + +import ( + "bufio" + "bytes" + "compress/gzip" + "encoding/binary" + "fmt" + "io" + "log" + "reflect" + "sync/atomic" + "time" + "unsafe" + + "go.starlark.net/syntax" +) + +// StartProfile enables time profiling of all Starlark threads, +// and writes a profile in pprof format to w. +// It must be followed by a call to StopProfiler to stop +// the profiler and finalize the profile. +// +// StartProfile returns an error if profiling was already enabled. +// +// StartProfile must not be called concurrently with Starlark execution. +func StartProfile(w io.Writer) error { + if !atomic.CompareAndSwapUint32(&profiler.on, 0, 1) { + return fmt.Errorf("profiler already running") + } + + // TODO(adonovan): make the API fully concurrency-safe. + // The main challenge is racy reads/writes of profiler.events, + // and of send/close races on the channel it refers to. + // It's easy to solve them with a mutex but harder to do + // it efficiently. + + profiler.events = make(chan *profEvent, 1) + profiler.done = make(chan error) + + go profile(w) + + return nil +} + +// StopProfiler stops the profiler started by a prior call to +// StartProfile and finalizes the profile. It returns an error if the +// profile could not be completed. +// +// StopProfiler must not be called concurrently with Starlark execution. +func StopProfile() error { + // Terminate the profiler goroutine and get its result. + close(profiler.events) + err := <-profiler.done + + profiler.done = nil + profiler.events = nil + atomic.StoreUint32(&profiler.on, 0) + + return err +} + +// globals +var profiler struct { + on uint32 // nonzero => profiler running + events chan *profEvent // profile events from interpreter threads + done chan error // indicates profiler goroutine is ready +} + +func (thread *Thread) beginProfSpan() { + if profiler.events == nil { + return // profiling not enabled + } + + thread.frameAt(0).spanStart = nanotime() +} + +// TODO(adonovan): experiment with smaller values, +// which trade space and time for greater precision. +const quantum = 10 * time.Millisecond + +func (thread *Thread) endProfSpan() { + if profiler.events == nil { + return // profiling not enabled + } + + // Add the span to the thread's accumulator. + thread.proftime += time.Duration(nanotime() - thread.frameAt(0).spanStart) + if thread.proftime < quantum { + return + } + + // Only record complete quanta. + n := thread.proftime / quantum + thread.proftime -= n * quantum + + // Copy the stack. + // (We can't save thread.frame because its pc will change.) + ev := &profEvent{ + thread: thread, + time: n * quantum, + } + ev.stack = ev.stackSpace[:0] + for i := range thread.stack { + fr := thread.frameAt(i) + ev.stack = append(ev.stack, profFrame{ + pos: fr.Position(), + fn: fr.Callable(), + pc: fr.pc, + }) + } + + profiler.events <- ev +} + +type profEvent struct { + thread *Thread // currently unused + time time.Duration + stack []profFrame + stackSpace [8]profFrame // initial space for stack +} + +type profFrame struct { + fn Callable // don't hold this live for too long (prevents GC of lambdas) + pc uint32 // program counter (Starlark frames only) + pos syntax.Position // position of pc within this frame +} + +// profile is the profiler goroutine. +// It runs until StopProfiler is called. +func profile(w io.Writer) { + // Field numbers from pprof protocol. + // See https://github.com/google/pprof/blob/master/proto/profile.proto + const ( + Profile_sample_type = 1 // repeated ValueType + Profile_sample = 2 // repeated Sample + Profile_mapping = 3 // repeated Mapping + Profile_location = 4 // repeated Location + Profile_function = 5 // repeated Function + Profile_string_table = 6 // repeated string + Profile_time_nanos = 9 // int64 + Profile_duration_nanos = 10 // int64 + Profile_period_type = 11 // ValueType + Profile_period = 12 // int64 + + ValueType_type = 1 // int64 + ValueType_unit = 2 // int64 + + Sample_location_id = 1 // repeated uint64 + Sample_value = 2 // repeated int64 + Sample_label = 3 // repeated Label + + Label_key = 1 // int64 + Label_str = 2 // int64 + Label_num = 3 // int64 + Label_num_unit = 4 // int64 + + Location_id = 1 // uint64 + Location_mapping_id = 2 // uint64 + Location_address = 3 // uint64 + Location_line = 4 // repeated Line + + Line_function_id = 1 // uint64 + Line_line = 2 // int64 + + Function_id = 1 // uint64 + Function_name = 2 // int64 + Function_system_name = 3 // int64 + Function_filename = 4 // int64 + Function_start_line = 5 // int64 + ) + + bufw := bufio.NewWriter(w) // write file in 4KB (not 240B flate-sized) chunks + gz := gzip.NewWriter(bufw) + enc := protoEncoder{w: gz} + + // strings + stringIndex := make(map[string]int64) + str := func(s string) int64 { + i, ok := stringIndex[s] + if !ok { + i = int64(len(stringIndex)) + enc.string(Profile_string_table, s) + stringIndex[s] = i + } + return i + } + str("") // entry 0 + + // functions + // + // function returns the ID of a Callable for use in Line.FunctionId. + // The ID is the same as the function's logical address, + // which is supplied by the caller to avoid the need to recompute it. + functionId := make(map[uintptr]uint64) + function := func(fn Callable, addr uintptr) uint64 { + id, ok := functionId[addr] + if !ok { + id = uint64(addr) + + var pos syntax.Position + if fn, ok := fn.(callableWithPosition); ok { + pos = fn.Position() + } + + name := fn.Name() + if name == "<toplevel>" { + name = pos.Filename() + } + + nameIndex := str(name) + + fun := new(bytes.Buffer) + funenc := protoEncoder{w: fun} + funenc.uint(Function_id, id) + funenc.int(Function_name, nameIndex) + funenc.int(Function_system_name, nameIndex) + funenc.int(Function_filename, str(pos.Filename())) + funenc.int(Function_start_line, int64(pos.Line)) + enc.bytes(Profile_function, fun.Bytes()) + + functionId[addr] = id + } + return id + } + + // locations + // + // location returns the ID of the location denoted by fr. + // For Starlark frames, this is the Frame pc. + locationId := make(map[uintptr]uint64) + location := func(fr profFrame) uint64 { + fnAddr := profFuncAddr(fr.fn) + + // For Starlark functions, the frame position + // represents the current PC value. + // Mix it into the low bits of the address. + // This is super hacky and may result in collisions + // in large functions or if functions are numerous. + // TODO(adonovan): fix: try making this cleaner by treating + // each bytecode segment as a Profile.Mapping. + pcAddr := fnAddr + if _, ok := fr.fn.(*Function); ok { + pcAddr = (pcAddr << 16) ^ uintptr(fr.pc) + } + + id, ok := locationId[pcAddr] + if !ok { + id = uint64(pcAddr) + + line := new(bytes.Buffer) + lineenc := protoEncoder{w: line} + lineenc.uint(Line_function_id, function(fr.fn, fnAddr)) + lineenc.int(Line_line, int64(fr.pos.Line)) + loc := new(bytes.Buffer) + locenc := protoEncoder{w: loc} + locenc.uint(Location_id, id) + locenc.uint(Location_address, uint64(pcAddr)) + locenc.bytes(Location_line, line.Bytes()) + enc.bytes(Profile_location, loc.Bytes()) + + locationId[pcAddr] = id + } + return id + } + + wallNanos := new(bytes.Buffer) + wnenc := protoEncoder{w: wallNanos} + wnenc.int(ValueType_type, str("wall")) + wnenc.int(ValueType_unit, str("nanoseconds")) + + // informational fields of Profile + enc.bytes(Profile_sample_type, wallNanos.Bytes()) + enc.int(Profile_period, quantum.Nanoseconds()) // magnitude of sampling period + enc.bytes(Profile_period_type, wallNanos.Bytes()) // dimension and unit of period + enc.int(Profile_time_nanos, time.Now().UnixNano()) // start (real) time of profile + + startNano := nanotime() + + // Read profile events from the channel + // until it is closed by StopProfiler. + for e := range profiler.events { + sample := new(bytes.Buffer) + sampleenc := protoEncoder{w: sample} + sampleenc.int(Sample_value, e.time.Nanoseconds()) // wall nanoseconds + for _, fr := range e.stack { + sampleenc.uint(Sample_location_id, location(fr)) + } + enc.bytes(Profile_sample, sample.Bytes()) + } + + endNano := nanotime() + enc.int(Profile_duration_nanos, endNano-startNano) + + err := gz.Close() // Close reports any prior write error + if flushErr := bufw.Flush(); err == nil { + err = flushErr + } + profiler.done <- err +} + +// nanotime returns the time in nanoseconds since epoch. +// It is implemented by runtime.nanotime using the linkname hack; +// runtime.nanotime is defined for all OSs/ARCHS and uses the +// monotonic system clock, which there is no portable way to access. +// Should that function ever go away, these alternatives exist: +// +// // POSIX only. REALTIME not MONOTONIC. 17ns. +// var tv syscall.Timeval +// syscall.Gettimeofday(&tv) // can't fail +// return tv.Nano() +// +// // Portable. REALTIME not MONOTONIC. 46ns. +// return time.Now().Nanoseconds() +// +// // POSIX only. Adds a dependency. +// import "golang.org/x/sys/unix" +// var ts unix.Timespec +// unix.ClockGettime(CLOCK_MONOTONIC, &ts) // can't fail +// return unix.TimespecToNsec(ts) +// +//go:linkname nanotime runtime.nanotime +func nanotime() int64 + +// profFuncAddr returns the canonical "address" +// of a Callable for use by the profiler. +func profFuncAddr(fn Callable) uintptr { + switch fn := fn.(type) { + case *Builtin: + return reflect.ValueOf(fn.fn).Pointer() + case *Function: + return uintptr(unsafe.Pointer(fn.funcode)) + } + + // User-defined callable types are typically of + // of kind pointer-to-struct. Handle them specially. + if v := reflect.ValueOf(fn); v.Type().Kind() == reflect.Ptr { + return v.Pointer() + } + + // Address zero is reserved by the protocol. + // Use 1 for callables we don't recognize. + log.Printf("Starlark profiler: no address for Callable %T", fn) + return 1 +} + +// We encode the protocol message by hand to avoid making +// the interpreter depend on both github.com/google/pprof +// and github.com/golang/protobuf. +// +// This also avoids the need to materialize a protocol message object +// tree of unbounded size and serialize it all at the end. +// The pprof format appears to have been designed to +// permit streaming implementations such as this one. +// +// See https://developers.google.com/protocol-buffers/docs/encoding. +type protoEncoder struct { + w io.Writer // *bytes.Buffer or *gzip.Writer + tmp [binary.MaxVarintLen64]byte +} + +func (e *protoEncoder) uvarint(x uint64) { + n := binary.PutUvarint(e.tmp[:], x) + e.w.Write(e.tmp[:n]) +} + +func (e *protoEncoder) tag(field, wire uint) { + e.uvarint(uint64(field<<3 | wire)) +} + +func (e *protoEncoder) string(field uint, s string) { + e.tag(field, 2) // length-delimited + e.uvarint(uint64(len(s))) + io.WriteString(e.w, s) +} + +func (e *protoEncoder) bytes(field uint, b []byte) { + e.tag(field, 2) // length-delimited + e.uvarint(uint64(len(b))) + e.w.Write(b) +} + +func (e *protoEncoder) uint(field uint, x uint64) { + e.tag(field, 0) // varint + e.uvarint(x) +} + +func (e *protoEncoder) int(field uint, x int64) { + e.tag(field, 0) // varint + e.uvarint(uint64(x)) +} |