diff options
Diffstat (limited to 'cap/cap.go')
-rw-r--r-- | cap/cap.go | 461 |
1 files changed, 461 insertions, 0 deletions
diff --git a/cap/cap.go b/cap/cap.go new file mode 100644 index 0000000..5ccef59 --- /dev/null +++ b/cap/cap.go @@ -0,0 +1,461 @@ +// Package cap provides all the Linux Capabilities userspace library API +// bindings in native Go. +// +// Capabilities are a feature of the Linux kernel that allow fine +// grain permissions to perform privileged operations. Privileged +// operations are required to do irregular system level operations +// from code. You can read more about how Capabilities are intended to +// work here: +// +// https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/33528.pdf +// +// This package supports native Go bindings for all the features +// described in that paper as well as supporting subsequent changes to +// the kernel for other styles of inheritable Capability. +// +// Some simple things you can do with this package are: +// +// // Read and display the capabilities of the running process +// c := cap.GetProc() +// log.Printf("this process has these caps:", c) +// +// // Drop any privilege a process might have (including for root, +// // but note root 'owns' a lot of system files so a cap-limited +// // root can still do considerable damage to a running system). +// old := cap.GetProc() +// empty := cap.NewSet() +// if err := empty.SetProc(); err != nil { +// log.Fatalf("failed to drop privilege: %q -> %q: %v", old, empty, err) +// } +// now := cap.GetProc() +// if cap.Differs(now.Compare(empty)) { +// log.Fatalf("failed to fully drop privilege: have=%q, wanted=%q", now, empty) +// } +// +// See https://sites.google.com/site/fullycapable/ for recent updates, +// some more complete walk-through examples of ways of using +// 'cap.Set's etc and information on how to file bugs. +// +// For CGo linked binaries, behind the scenes, the package +// "kernel.org/pub/linux/libs/security/libcap/psx" is used to perform +// POSIX semantics system calls that manipulate thread state +// uniformly over the whole Go (and CGo linked) process runtime. +// +// Note, if the Go runtime syscall interface contains the Linux +// variant syscall.AllThreadsSyscall() API (it debuted in go1.16 see +// https://github.com/golang/go/issues/1435 for its history) then +// the "psx" package will use that to invoke Capability setting system +// calls in pure Go binaries. In such an enhanced Go runtime, to force +// this behavior, use the CGO_ENABLED=0 environment variable. +// +// +// Copyright (c) 2019-21 Andrew G. Morgan <morgan@kernel.org> +// +// The cap and psx packages are licensed with a (you choose) BSD +// 3-clause or GPL2. See LICENSE file for details. +package cap // import "kernel.org/pub/linux/libs/security/libcap/cap" + +import ( + "errors" + "sort" + "sync" + "syscall" + "unsafe" +) + +// Value is the type of a single capability (or permission) bit. +type Value uint + +// Flag is the type of one of the three Value dimensions held in a +// Set. It is also used in the (*IAB).Fill() method for changing the +// Bounding and Ambient Vectors. +type Flag uint + +// Effective, Permitted, Inheritable are the three Flags of Values +// held in a Set. +const ( + Effective Flag = iota + Permitted + Inheritable +) + +// String identifies a Flag value by its conventional "e", "p" or "i" +// string abbreviation. +func (f Flag) String() string { + switch f { + case Effective: + return "e" + case Permitted: + return "p" + case Inheritable: + return "i" + default: + return "<Error>" + } +} + +// data holds a 32-bit slice of the compressed bitmaps of capability +// sets as understood by the kernel. +type data [Inheritable + 1]uint32 + +// Set is an opaque capabilities container for a set of system +// capbilities. It holds individually addressable capability Value's +// for the three capability Flag's. See GetFlag() and SetFlag() for +// how to adjust them individually, and Clear() and ClearFlag() for +// how to do bulk operations. +// +// For admin tasks associated with managing namespace specific file +// capabilities, Set can also support a namespace-root-UID value which +// defaults to zero. See GetNSOwner() and SetNSOwner(). +type Set struct { + // mu protects all other members of a Set. + mu sync.RWMutex + + // flat holds Flag Value bitmaps for all capabilities + // associated with this Set. + flat []data + + // Linux specific + nsRoot int +} + +// Various known kernel magic values. +const ( + kv1 = 0x19980330 // First iteration of process capabilities (32 bits). + kv2 = 0x20071026 // First iteration of process and file capabilities (64 bits) - deprecated. + kv3 = 0x20080522 // Most recently supported process and file capabilities (64 bits). +) + +var ( + // starUp protects setting of the following values: magic, + // words, maxValues. + startUp sync.Once + + // magic holds the preferred magic number for the kernel ABI. + magic uint32 + + // words holds the number of uint32's associated with each + // capability Flag for this session. + words int + + // maxValues holds the number of bit values that are named by + // the running kernel. This is generally expected to match + // ValueCount which is autogenerated at packaging time. + maxValues uint +) + +type header struct { + magic uint32 + pid int32 +} + +// scwMu is used to fully serialize the write system calls. Note, this +// is generally not necesary, but in the case of Launch we get into a +// situation where the launching thread is temporarily allowed to +// deviate from the kernel state of the rest of the runtime and +// allowing other threads to perform w* syscalls will potentially +// interfere with the launching process. +var scwMu sync.Mutex + +// syscaller is a type for abstracting syscalls. The r* variants are +// for reading state, and can be parallelized, the w* variants need to +// be serialized so all OS threads can share state. +type syscaller struct { + r3 func(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err syscall.Errno) + w3 func(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err syscall.Errno) + r6 func(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err syscall.Errno) + w6 func(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err syscall.Errno) +} + +// caprcall provides a pointer etc wrapper for the system calls +// associated with getcap. +//go:uintptrescapes +func (sc *syscaller) caprcall(call uintptr, h *header, d []data) error { + x := uintptr(0) + if d != nil { + x = uintptr(unsafe.Pointer(&d[0])) + } + _, _, err := sc.r3(call, uintptr(unsafe.Pointer(h)), x, 0) + if err != 0 { + return err + } + return nil +} + +// capwcall provides a pointer etc wrapper for the system calls +// associated with setcap. +//go:uintptrescapes +func (sc *syscaller) capwcall(call uintptr, h *header, d []data) error { + x := uintptr(0) + if d != nil { + x = uintptr(unsafe.Pointer(&d[0])) + } + _, _, err := sc.w3(call, uintptr(unsafe.Pointer(h)), x, 0) + if err != 0 { + return err + } + return nil +} + +// prctlrcall provides a wrapper for the prctl systemcalls that only +// read kernel state. There is a limited number of arguments needed +// and the caller should use 0 for those not needed. +func (sc *syscaller) prctlrcall(prVal, v1, v2 uintptr) (int, error) { + r, _, err := sc.r3(syscall.SYS_PRCTL, prVal, v1, v2) + if err != 0 { + return int(r), err + } + return int(r), nil +} + +// prctlrcall6 provides a wrapper for the prctl systemcalls that only +// read kernel state and require 6 arguments - ambient cap API, I'm +// looking at you. There is a limited number of arguments needed and +// the caller should use 0 for those not needed. +func (sc *syscaller) prctlrcall6(prVal, v1, v2, v3, v4, v5 uintptr) (int, error) { + r, _, err := sc.r6(syscall.SYS_PRCTL, prVal, v1, v2, v3, v4, v5) + if err != 0 { + return int(r), err + } + return int(r), nil +} + +// prctlwcall provides a wrapper for the prctl systemcalls that +// write/modify kernel state. Where available, these will use the +// POSIX semantics fixup system calls. There is a limited number of +// arguments needed and the caller should use 0 for those not needed. +func (sc *syscaller) prctlwcall(prVal, v1, v2 uintptr) (int, error) { + r, _, err := sc.w3(syscall.SYS_PRCTL, prVal, v1, v2) + if err != 0 { + return int(r), err + } + return int(r), nil +} + +// prctlwcall6 provides a wrapper for the prctl systemcalls that +// write/modify kernel state and require 6 arguments - ambient cap +// API, I'm looking at you. (Where available, these will use the POSIX +// semantics fixup system calls). There is a limited number of +// arguments needed and the caller should use 0 for those not needed. +func (sc *syscaller) prctlwcall6(prVal, v1, v2, v3, v4, v5 uintptr) (int, error) { + r, _, err := sc.w6(syscall.SYS_PRCTL, prVal, v1, v2, v3, v4, v5) + if err != 0 { + return int(r), err + } + return int(r), nil +} + +// cInit perfoms the lazy identification of the capability vintage of +// the running system. +func (sc *syscaller) cInit() { + h := &header{ + magic: kv3, + } + sc.caprcall(syscall.SYS_CAPGET, h, nil) + magic = h.magic + switch magic { + case kv1: + words = 1 + case kv2, kv3: + words = 2 + default: + // Fall back to a known good version. + magic = kv3 + words = 2 + } + // Use the bounding set to evaluate which capabilities exist. + maxValues = uint(sort.Search(32*words, func(n int) bool { + _, err := GetBound(Value(n)) + return err != nil + })) + if maxValues == 0 { + // Fall back to using the largest value defined at build time. + maxValues = NamedCount + } +} + +// MaxBits returns the number of kernel-named capabilities discovered +// at runtime in the current system. +func MaxBits() Value { + startUp.Do(multisc.cInit) + return Value(maxValues) +} + +// NewSet returns an empty capability set. +func NewSet() *Set { + startUp.Do(multisc.cInit) + return &Set{ + flat: make([]data, words), + } +} + +// ErrBadSet indicates a nil pointer was used for a *Set, or the +// request of the Set is invalid in some way. +var ErrBadSet = errors.New("bad capability set") + +// Dup returns a copy of the specified capability set. +func (c *Set) Dup() (*Set, error) { + if c == nil || len(c.flat) == 0 { + return nil, ErrBadSet + } + n := NewSet() + c.mu.RLock() + defer c.mu.RUnlock() + copy(n.flat, c.flat) + n.nsRoot = c.nsRoot + return n, nil +} + +// GetPID returns the capability set associated with the target process +// id; pid=0 is an alias for current. +func GetPID(pid int) (*Set, error) { + v := NewSet() + if err := multisc.caprcall(syscall.SYS_CAPGET, &header{magic: magic, pid: int32(pid)}, v.flat); err != nil { + return nil, err + } + return v, nil +} + +// GetProc returns the capability Set of the current process. If the +// kernel is unable to determine the Set associated with the current +// process, the function panic()s. +func GetProc() *Set { + c, err := GetPID(0) + if err != nil { + panic(err) + } + return c +} + +func (sc *syscaller) setProc(c *Set) error { + if c == nil || len(c.flat) == 0 { + return ErrBadSet + } + return sc.capwcall(syscall.SYS_CAPSET, &header{magic: magic}, c.flat) +} + +// SetProc attempts to set the capability Set of the current +// process. The kernel will perform permission checks and an error +// will be returned if the attempt fails. Should the attempt fail +// no process capabilities will have been modified. +func (c *Set) SetProc() error { + scwMu.Lock() + defer scwMu.Unlock() + return multisc.setProc(c) +} + +// defines from uapi/linux/prctl.h +const ( + prCapBSetRead = 23 + prCapBSetDrop = 24 +) + +// GetBound determines if a specific capability is currently part of +// the local bounding set. On systems where the bounding set Value is +// not present, this function returns an error. +func GetBound(val Value) (bool, error) { + v, err := multisc.prctlrcall(prCapBSetRead, uintptr(val), 0) + if err != nil { + return false, err + } + return v > 0, nil +} + +//go:uintptrescapes +func (sc *syscaller) dropBound(val ...Value) error { + for _, v := range val { + if _, err := sc.prctlwcall(prCapBSetDrop, uintptr(v), 0); err != nil { + return err + } + } + return nil +} + +// DropBound attempts to suppress bounding set Values. The kernel will +// never allow a bounding set Value bit to be raised once successfully +// dropped. However, dropping requires the current process is +// sufficiently capable (usually via cap.SETPCAP being raised in the +// Effective flag of the process' Set). Note, the drops are performed +// in order and if one bounding value cannot be dropped, the function +// returns immediately with an error which may leave the system in an +// ill-defined state. The caller can determine where things went wrong +// using GetBound(). +func DropBound(val ...Value) error { + scwMu.Lock() + defer scwMu.Unlock() + return multisc.dropBound(val...) +} + +// defines from uapi/linux/prctl.h +const ( + prCapAmbient = 47 + + prCapAmbientIsSet = 1 + prCapAmbientRaise = 2 + prCapAmbientLower = 3 + prCapAmbientClearAll = 4 +) + +// GetAmbient determines if a specific capability is currently part of +// the local ambient set. On systems where the ambient set Value is +// not present, this function returns an error. +func GetAmbient(val Value) (bool, error) { + r, err := multisc.prctlrcall6(prCapAmbient, prCapAmbientIsSet, uintptr(val), 0, 0, 0) + return r > 0, err +} + +//go:uintptrescapes +func (sc *syscaller) setAmbient(enable bool, val ...Value) error { + dir := uintptr(prCapAmbientLower) + if enable { + dir = prCapAmbientRaise + } + for _, v := range val { + _, err := sc.prctlwcall6(prCapAmbient, dir, uintptr(v), 0, 0, 0) + if err != nil { + return err + } + } + return nil +} + +// SetAmbient attempts to set a specific Value bit to the state, +// enable. This function will return an error if insufficient +// permission is available to perform this task. The settings are +// performed in order and the function returns immediately an error is +// detected. Use GetAmbient() to unravel where things went +// wrong. Note, the cap package manages an abstraction IAB that +// captures all three inheritable vectors in a single type. Consider +// using that. +func SetAmbient(enable bool, val ...Value) error { + scwMu.Lock() + defer scwMu.Unlock() + return multisc.setAmbient(enable, val...) +} + +func (sc *syscaller) resetAmbient() error { + var v bool + var err error + + for c := Value(0); !v; c++ { + if v, err = GetAmbient(c); err != nil { + // no non-zero values found. + return nil + } + } + _, err = sc.prctlwcall6(prCapAmbient, prCapAmbientClearAll, 0, 0, 0, 0) + return err +} + +// ResetAmbient attempts to ensure the Ambient set is fully +// cleared. It works by first reading the set and if it finds any bits +// raised it will attempt a reset. The test before attempting a reset +// behavior is a workaround for situations where the Ambient API is +// locked, but a reset is not actually needed. No Ambient bit not +// already raised in both the Permitted and Inheritable Set is allowed +// to be raised by the kernel. +func ResetAmbient() error { + scwMu.Lock() + defer scwMu.Unlock() + return multisc.resetAmbient() +} |