转载

Docker的Privileged和Cap

Docker的Privileged和Cap

docker使用--privileged --cap-add --cap-drop来控制容器的权限,能控制哪些权限,是怎样实现的?

参考代码版本:docker-1.12.0

cap

首先,caplist有两个基准范围:

①默认Cap集合

[CAP_CHOWN CAP_DAC_OVERRIDE CAP_FSETID CAP_FOWNER CAP_MKNOD CAP_NET_RAW CAP_SETGID CAP_SETUID CAP_SETFCAP CAP_SETPCAP CAP_NET_BIND_SERVICE CAP_SYS_CHROOT CAP_KILL CAP_AUDIT_WRITE]

src/oci/default_linux.go

// Capability list assigned to the spec's process in the default Linux spec;
// this is the baseline that later add/drop adjustments start from.
s.Process.Capabilities = []string{
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE",
}

②最大Cap集合

[CAP_CHOWN CAP_DAC_OVERRIDE CAP_DAC_READ_SEARCH CAP_FOWNER CAP_FSETID CAP_KILL CAP_SETGID CAP_SETUID CAP_SETPCAP CAP_LINUX_IMMUTABLE CAP_NET_BIND_SERVICE CAP_NET_BROADCAST CAP_NET_ADMIN CAP_NET_RAW CAP_IPC_LOCK CAP_IPC_OWNER CAP_SYS_MODULE CAP_SYS_RAWIO CAP_SYS_CHROOT CAP_SYS_PTRACE CAP_SYS_PACCT CAP_SYS_ADMIN CAP_SYS_BOOT CAP_SYS_NICE CAP_SYS_RESOURCE CAP_SYS_TIME CAP_SYS_TTY_CONFIG CAP_MKNOD CAP_LEASE CAP_AUDIT_WRITE CAP_AUDIT_CONTROL CAP_SETFCAP CAP_MAC_OVERRIDE CAP_MAC_ADMIN CAP_SYSLOG CAP_WAKE_ALARM CAP_BLOCK_SUSPEND]

定义在这里:src/vendor/src/github.com/syndtr/gocapability/capability/enum.go

其次,怎样决定容器启动时有哪些cap?

src/daemon/oci_linux.go

// setCapabilities decides the final capability list for the container's
// process and stores it in s.Process.Capabilities.
//
// A privileged container receives the full list returned by
// caps.GetAllCapabilities. Otherwise the list already present in the spec is
// passed, together with HostConfig.CapAdd and HostConfig.CapDrop, to
// caps.TweakCapabilities. If TweakCapabilities fails, its error is returned
// and the spec's capability list is left untouched.
func setCapabilities(s *specs.Spec, c *container.Container) error {
var caplist []string
var err error
if c.HostConfig.Privileged {
// Privileged mode ignores CapAdd/CapDrop entirely.
caplist = caps.GetAllCapabilities()
} else {
caplist, err = caps.TweakCapabilities(s.Process.Capabilities, c.HostConfig.CapAdd, c.HostConfig.CapDrop)
if err != nil {
// Return before the assignment below so the spec keeps its previous list.
return err
}
}
s.Process.Capabilities = caplist
return nil
}

也就是说,如果是--privileged启动,那么容器就获得最大的cap集合;如果不是,就在默认cap集合的基础上,根据--cap-add和--cap-drop进行增减。

最后,每个cap代表什么含义:

例如CAP_SYS_ADMIN,代表着container进程允许使用clone、mount、sethostname等系统调用(下面是默认seccomp配置中与该cap对应的放行规则):

src/profile/seccomp/seccomp_default.go

// Excerpt of a switch over capability names (the switch header is not part of
// this listing). For CAP_SYS_ADMIN it sets capSysAdmin and appends one rule
// per syscall below, each with action types.ActAllow and an empty argument
// list.
case "CAP_SYS_ADMIN":
capSysAdmin = true
syscalls = append(syscalls, []*types.Syscall{
{
Name: "bpf",
Action: types.ActAllow,
Args: []*types.Arg{},
},
{
Name: "clone",
Action: types.ActAllow,
Args: []*types.Arg{},
},
{
Name: "fanotify_init",
Action: types.ActAllow,
Args: []*types.Arg{},
},
{
Name: "lookup_dcookie",
Action: types.ActAllow,
Args: []*types.Arg{},
},
{
Name: "mount",
Action: types.ActAllow,
Args: []*types.Arg{},
},
{
Name: "perf_event_open",
Action: types.ActAllow,
Args: []*types.Arg{},
},
{
Name: "setdomainname",
Action: types.ActAllow,
Args: []*types.Arg{},
},
{
Name: "sethostname",
Action: types.ActAllow,
Args: []*types.Arg{},
},
{
Name: "setns",
Action: types.ActAllow,
Args: []*types.Arg{},
},
{
Name: "umount",
Action: types.ActAllow,
Args: []*types.Arg{},
},
{
Name: "umount2",
Action: types.ActAllow,
Args: []*types.Arg{},
},
{
Name: "unshare",
Action: types.ActAllow,
Args: []*types.Arg{},
},
}...)

试一下:

[root@cc101 ~]# docker run -it autodeploy.chinacloud.com:4000/kollaglue/centos-binary-nova-compute:2.0.1 /bin/bash

()[root@a69e2a3cdea3 /]# hostname

a69e2a3cdea3

()[root@a69e2a3cdea3 /]# hostname test

hostname: you must be root to change the host name

默认权限下,不能使用sethostname系统调用

加上CAP_SYS_ADMIN

[root@cc101 ~]# docker run -it --rm -u root --cap-add SYS_ADMIN autodeploy.chinacloud.com:4000/kollaglue/centos-binary-nova-compute:2.0.1 /bin/bash

()[root@5c973256f09b /]# hostname

5c973256f09b

()[root@5c973256f09b /]# hostname test

()[root@5c973256f09b /]# hostname

test

这次sethostname成功

privileged

翻译为“特权”,代表container可以使用的最大权限,除了前述的获得最多cap这个功能外,privileged还有哪些特权呢?

sysfs和cgroup文件系统的权限ro->rw

src/daemon/oci_linux.go

// setMounts (excerpt; the "..." lines mark code omitted from this listing).
// The visible part relaxes mount restrictions: for privileged containers it
// clears the read-only flag on /sys and drops the read-only/masked path
// lists, and it clears the read-only flag on cgroup mounts when the container
// is privileged or the daemon reports a non-nil UID map.
func setMounts(daemon *Daemon, s *specs.Spec, c *container.Container, mounts []container.Mount) error {
...
if c.HostConfig.Privileged {
// /sys is only touched when the root filesystem itself is not read-only.
if !s.Root.Readonly {
// clear readonly for /sys
for i := range s.Mounts {
if s.Mounts[i].Destination == "/sys" {
clearReadOnly(&s.Mounts[i])
}
}
}
// These two resets apply to every privileged container, regardless of
// the root filesystem's read-only setting.
s.Linux.ReadonlyPaths = nil
s.Linux.MaskedPaths = nil
}

// TODO: until a kernel/mount solution exists for handling remount in a user namespace,
// we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
// The second return value of GetUIDGIDMaps is deliberately discarded here.
if uidMap, _ := daemon.GetUIDGIDMaps(); uidMap != nil || c.HostConfig.Privileged {
for i, m := range s.Mounts {
if m.Type == "cgroup" {
// Index into s.Mounts so the original entry is modified, not the loop copy m.
clearReadOnly(&s.Mounts[i])
}
}
}
...
}

获得主机的device

src/daemon/oci_linux.go

// setDevices (excerpt; the listing stops before the end of the function).
//
// Privileged containers: every device returned by devices.HostDevices is
// converted with specDevice and collected, and the device permission list is
// replaced by a single allow entry with access "rwm".
// Other containers: each entry of HostConfig.Devices is resolved through
// getDevicesFromPath, and the resulting devices and permissions are appended
// to the lists (permissions start from s.Linux.Resources.Devices).
// Any error from HostDevices or getDevicesFromPath is returned immediately.
func setDevices(s *specs.Spec, c *container.Container) error {
// Build lists of devices allowed and created within the container.
var devs []specs.Device
devPermissions := s.Linux.Resources.Devices
if c.HostConfig.Privileged {
hostDevices, err := devices.HostDevices()
if err != nil {
return err
}
for _, d := range hostDevices {
devs = append(devs, specDevice(d))
}
// A local variable is needed because Access takes a pointer to the string.
rwm := "rwm"
// Overwrites, rather than extends, the permissions taken from the spec.
devPermissions = []specs.DeviceCgroup{
{
Allow: true,
Access: &rwm,
},
}
} else {
for _, deviceMapping := range c.HostConfig.Devices {
d, dPermissions, err := getDevicesFromPath(deviceMapping)
if err != nil {
return err
}
devs = append(devs, d...)
devPermissions = append(devPermissions, dPermissions...)
}
}

试一下

[root@cc101 ~]# docker run -it --rm -u root autodeploy.chinacloud.com:4000/kollaglue/centos-binary-nova-compute:2.0.1 /bin/bash

()[root@bf2849514850 /]#

()[root@bf2849514850 /]# ls /dev

console fd full fuse kcore mqueue null ptmx pts random shm stderr stdin stdout tty urandom zero

()[root@bf2849514850 /]# exit

exit

这次加上特权

[root@cc101 ~]# docker run -it --rm -u root --privileged autodeploy.chinacloud.com:4000/kollaglue/centos-binary-nova-compute:2.0.1 /bin/bash

()[root@f30d3af44de4 /]# ls /dev

autofs dm-14 dm-27 dm-9 loop0 ppp sdb tty11 tty24 tty37 tty5 tty62 usbmon2 vcsa6

bsg dm-15 dm-28 dri loop1 ptmx sdb1 tty12 tty25 tty38 tty50 tty63 vcs vfio

btrfs-control dm-16 dm-29 fb0 loop-control ptp0 sg0 tty13 tty26 tty39 tty51 tty7 vcs1 vga_arbiter

bus dm-17 dm-3 fd mapper ptp1 sg1 tty14 tty27 tty4 tty52 tty8 vcs2 vhost-net

console dm-18 dm-30 full mcelog ptp2 shm tty15 tty28 tty40 tty53 tty9 vcs3 zero

cpu dm-19 dm-31 fuse mem ptp3 snapshot tty16 tty29 tty41 tty54 ttyS0 vcs4

cpu_dma_latency dm-2 dm-32 hidraw0 mqueue pts snd tty17 tty3 tty42 tty55 ttyS1 vcs5

crash dm-20 dm-33 hidraw1 net random stderr tty18 tty30 tty43 tty56 ttyS2 vcs6

dm-0 dm-21 dm-34 hidraw2 network_latency raw stdin tty19 tty31 tty44 tty57 ttyS3 vcsa

dm-1 dm-22 dm-4 hpet network_throughput rtc0 stdout tty2 tty32 tty45 tty58 uhid vcsa1

dm-10 dm-23 dm-5 input null sda tty tty20 tty33 tty46 tty59 uinput vcsa2

dm-11 dm-24 dm-6 kcore nvram sda1 tty0 tty21 tty34 tty47 tty6 urandom vcsa3

dm-12 dm-25 dm-7 kmsg oldmem sda2 tty1 tty22 tty35 tty48 tty60 usbmon0 vcsa4

dm-13 dm-26 dm-8 kvm port sda3 tty10 tty23 tty36 tty49 tty61 usbmon1 vcsa5

最后,相关数据结构定义:src/vendor/src/github.com/opencontainers/specs/specs-go/config.go

// Spec is the base configuration for the container.
//
// Each field's json tag gives its key in the serialized configuration; under
// encoding/json, fields tagged omitempty are left out when they are empty.
// NOTE(review): the additional `platform:"..."` tag on Linux and Solaris is
// not interpreted by encoding/json; whatever consumes it is not shown in this
// listing — confirm against the specs-go package.
type Spec struct {
// Version is the version of the specification that is supported.
Version string `json:"ociVersion"`
// Platform is the host information for OS and Arch.
Platform Platform `json:"platform"`
// Process is the container's main process.
Process Process `json:"process"`
// Root is the root information for the container's filesystem.
Root Root `json:"root"`
// Hostname is the container's host name.
Hostname string `json:"hostname,omitempty"`
// Mounts profile configuration for adding mounts to the container's filesystem.
Mounts []Mount `json:"mounts,omitempty"`
// Hooks are the commands run at various lifecycle events of the container.
Hooks Hooks `json:"hooks"`
// Annotations is an unstructured key value map that may be set by external tools to store and retrieve arbitrary metadata.
Annotations map[string]string `json:"annotations,omitempty"`

// Linux is platform specific configuration for Linux based containers.
Linux Linux `json:"linux" platform:"linux,omitempty"`
// Solaris is platform specific configuration for Solaris containers.
Solaris Solaris `json:"solaris" platform:"solaris,omitempty"`
}
...
原文  http://dockone.io/article/1622
正文到此结束
Loading...