参考:Control Groups vs. Control Groups
systemd的源码链接:https://github.com/systemd/systemd
ubuntu系统开机后,可以看到已经挂载了很多文件系统,其中有些是systemd在代码里自动挂载的,比如cgroup。
/dev/root / ext4 rw,relatime 0 0 devtmpfs /dev devtmpfs rw,relatime,size=1742432k,nr_inodes=435608,mode=755 0 0 sysfs /sys sysfs rw,nosuid,nodev,noexec,relatime 0 0 proc /proc proc rw,nosuid,nodev,noexec,relatime 0 0 selinuxfs /sys/fs/selinux selinuxfs rw,relatime 0 0 tmpfs /dev/shm tmpfs rw,nosuid,nodev 0 0 devpts /dev/pts devpts rw,nosuid,noexec,relatime,gid=5,mode=620,ptmxmode=000 0 0 tmpfs /run tmpfs rw,nosuid,nodev,size=348908k,mode=755 0 0 tmpfs /run/lock tmpfs rw,nosuid,nodev,noexec,relatime,size=5120k 0 0 tmpfs /sys/fs/cgroup tmpfs rw,nosuid,nodev,noexec,mode=755 0 0 cgroup2 /sys/fs/cgroup/unified cgroup2 rw,nosuid,nodev,noexec,relatime,nsdelegate 0 0 cgroup /sys/fs/cgroup/systemd cgroup rw,nosuid,nodev,noexec,relatime,xattr,name=systemd 0 0 none /sys/fs/bpf bpf rw,nosuid,nodev,noexec,relatime,mode=700 0 0 cgroup /sys/fs/cgroup/debug cgroup rw,nosuid,nodev,noexec,relatime,debug 0 0 cgroup /sys/fs/cgroup/hugetlb cgroup rw,nosuid,nodev,noexec,relatime,hugetlb 0 0 cgroup /sys/fs/cgroup/memory cgroup rw,nosuid,nodev,noexec,relatime,memory 0 0 cgroup /sys/fs/cgroup/rdma cgroup rw,nosuid,nodev,noexec,relatime,rdma 0 0 cgroup /sys/fs/cgroup/blkio cgroup rw,nosuid,nodev,noexec,relatime,blkio 0 0 cgroup /sys/fs/cgroup/cpu,cpuacct cgroup rw,nosuid,nodev,noexec,relatime,cpu,cpuacct 0 0 cgroup /sys/fs/cgroup/cpuset cgroup rw,nosuid,nodev,noexec,relatime,cpuset 0 0 cgroup /sys/fs/cgroup/misc cgroup rw,nosuid,nodev,noexec,relatime,misc 0 0 cgroup /sys/fs/cgroup/pids cgroup rw,nosuid,nodev,noexec,relatime,pids 0 0 cgroup /sys/fs/cgroup/perf_event cgroup rw,nosuid,nodev,noexec,relatime,perf_event 0 0 cgroup /sys/fs/cgroup/devices cgroup rw,nosuid,nodev,noexec,relatime,devices 0 0 cgroup /sys/fs/cgroup/freezer cgroup rw,nosuid,nodev,noexec,relatime,freezer 0 0 systemd-1 /proc/sys/fs/binfmt_misc autofs rw,relatime,fd=29,pgrp=1,timeout=0,minproto=5,maxproto=5,direct 0 0 debugfs /sys/kernel/debug debugfs rw,nosuid,nodev,noexec,relatime 0 0 mqueue /dev/mqueue mqueue rw,nosuid,nodev,noexec,relatime 0 0 hugetlbfs /dev/hugepages hugetlbfs rw,relatime,pagesize=2M 0 0 tracefs /sys/kernel/tracing tracefs rw,nosuid,nodev,noexec,relatime 0 0 hostshare /mnt 9p rw,sync,dirsync,relatime,access=client,trans=virtio 0 0 tmpfs /run/user/1000 tmpfs rw,nosuid,nodev,relatime,size=348904k,mode=700,uid=1000,gid=1000 0 0
下面简要分析这部分的代码:
main -> mount_setup -> mount_points_setup //遍历mount_table数组,调用mount_one挂载文件系统,systemd在挂载每个文件系统前,会执行每个数组项中指定的condition_fn,判断当前环境是否可以挂载. //比如在/sys/fs/cgroup下挂载tmpfs文件系统 -> initialize_runtime -> mount_cgroup_controllers -> cg_kernel_controllers //通过读取/proc/cgroups获取当前系统可用的cgroup子系统 -> 根据前一步获取到的cgroup子系统,挂载每个cgroup子系统,其中会调用join_with来检查需要合并挂载的cgroup 子系统,目前支持合并挂载的cgroup子系统是:cpu和cpuacct,net_cls和net_prio -> 当所有的cgroup子系统都挂载完毕后,重新将/sys/fs/cgroup挂载为只读 -> invoke_main_loop //进入事件循环
mount_table定义如下:
static const MountPoint mount_table[] = { { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL, MNT_FATAL|MNT_IN_CONTAINER|MNT_FOLLOW_SYMLINK }, { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL, MNT_FATAL|MNT_IN_CONTAINER }, { "devtmpfs", "/dev", "devtmpfs", "mode=755" TMPFS_LIMITS_DEV, MS_NOSUID|MS_STRICTATIME, NULL, MNT_FATAL|MNT_IN_CONTAINER }, { "securityfs", "/sys/kernel/security", "securityfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL, MNT_NONE }, #if ENABLE_SMACK { "smackfs", "/sys/fs/smackfs", "smackfs", "smackfsdef=*", MS_NOSUID|MS_NOEXEC|MS_NODEV, mac_smack_use, MNT_FATAL }, { "tmpfs", "/dev/shm", "tmpfs", "mode=1777,smackfsroot=*", MS_NOSUID|MS_NODEV|MS_STRICTATIME, mac_smack_use, MNT_FATAL }, #endif { "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, NULL, MNT_FATAL|MNT_IN_CONTAINER }, { "devpts", "/dev/pts", "devpts", "mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, NULL, MNT_IN_CONTAINER }, #if ENABLE_SMACK { "tmpfs", "/run", "tmpfs", "mode=755,smackfsroot=*" TMPFS_LIMITS_RUN, MS_NOSUID|MS_NODEV|MS_STRICTATIME, mac_smack_use, MNT_FATAL }, #endif { "tmpfs", "/run", "tmpfs", "mode=755" TMPFS_LIMITS_RUN, MS_NOSUID|MS_NODEV|MS_STRICTATIME, NULL, MNT_FATAL|MNT_IN_CONTAINER }, { "cgroup2", "/sys/fs/cgroup", "cgroup2", "nsdelegate,memory_recursiveprot", MS_NOSUID|MS_NOEXEC|MS_NODEV, cg_is_unified_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE }, { "cgroup2", "/sys/fs/cgroup", "cgroup2", "nsdelegate", MS_NOSUID|MS_NOEXEC|MS_NODEV, cg_is_unified_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE }, { "cgroup2", "/sys/fs/cgroup", "cgroup2", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, cg_is_unified_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE }, { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755" TMPFS_LIMITS_SYS_FS_CGROUP, MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER }, { "cgroup2", "/sys/fs/cgroup/unified", "cgroup2", "nsdelegate", MS_NOSUID|MS_NOEXEC|MS_NODEV, cg_is_hybrid_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE }, { "cgroup2", "/sys/fs/cgroup/unified", "cgroup2", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, cg_is_hybrid_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE }, { "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd,xattr", MS_NOSUID|MS_NOEXEC|MS_NODEV, cg_is_legacy_wanted, MNT_IN_CONTAINER }, { "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd", MS_NOSUID|MS_NOEXEC|MS_NODEV, cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER }, { "pstore", "/sys/fs/pstore", "pstore", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL, MNT_NONE }, #if ENABLE_EFI { "efivarfs", "/sys/firmware/efi/efivars", "efivarfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, is_efi_boot, MNT_NONE }, #endif { "bpf", "/sys/fs/bpf", "bpf", "mode=700", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL, MNT_NONE, }, };