diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 3f7ec04a2b..a40e6d5e36 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -2059,6 +2059,13 @@ func SetSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name i
 		return nil
 
 	case linux.SO_BINDTODEVICE:
+		// Linux only requires CAP_NET_RAW to change the device of a socket
+		// that is already bound to one; binding an unbound socket is
+		// unprivileged. See net/core/sock.c:sock_bindtoindex_locked().
+		if ep.SocketOptions().GetBindToDevice() != 0 &&
+			!t.HasCapabilityIn(linux.CAP_NET_RAW, t.NetworkNamespace().UserNamespace()) {
+			return syserr.ErrNotPermitted
+		}
 		n := bytes.IndexByte(optVal, 0)
 		if n == -1 {
 			n = len(optVal)
diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go
index ab6840fb9c..d7b54fd4df 100644
--- a/pkg/sentry/syscalls/linux/sys_file.go
+++ b/pkg/sentry/syscalls/linux/sys_file.go
@@ -64,6 +64,15 @@ func mknodat(t *kernel.Task, dirfd int32, addr hostarch.Addr, mode linux.FileMod
 	if mode.FileType() == 0 {
 		mode |= linux.ModeRegular
 	}
+	// Linux requires CAP_MKNOD in the initial user namespace to create block
+	// or character device nodes, except for whiteouts (character devices with
+	// device number 0). See fs/namei.c:vfs_mknod().
+	fileType := mode.FileType()
+	isDevice := fileType == linux.ModeCharacterDevice || fileType == linux.ModeBlockDevice
+	isWhiteout := fileType == linux.ModeCharacterDevice && dev == 0
+	if isDevice && !isWhiteout && !t.HasCapabilityIn(linux.CAP_MKNOD, t.Kernel().RootUserNamespace()) {
+		return linuxerr.EPERM
+	}
 	major, minor := linux.DecodeDeviceID(dev)
 	return t.Kernel().VFS().MknodAt(t, t.Credentials(), &tpop.pop, &vfs.MknodOptions{
 		Mode: mode &^ linux.FileMode(t.FSContext().Umask()),
diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go
index 415d97ed38..c948aea7ab 100644
--- a/pkg/sentry/syscalls/linux/sys_thread.go
+++ b/pkg/sentry/syscalls/linux/sys_thread.go
@@ -479,6 +479,16 @@ func SchedSetaffinity(t *kernel.Task, sysno uintptr, args arch.SyscallArguments)
 		if task == nil {
 			return 0, nil, linuxerr.ESRCH
 		}
+		// Linux requires the caller's EUID to match the target's
+		// real or effective UID, or CAP_SYS_NICE. See
+		// kernel/sched/core.c:check_same_owner().
+		creds := t.Credentials()
+		tcreds := task.Credentials()
+		if creds.EffectiveKUID != tcreds.EffectiveKUID &&
+			creds.EffectiveKUID != tcreds.RealKUID &&
+			!creds.HasCapabilityIn(linux.CAP_SYS_NICE, tcreds.UserNamespace) {
+			return 0, nil, linuxerr.EPERM
+		}
 	}
 
 	mask := sched.NewCPUSet(t.Kernel().ApplicationCores())
@@ -729,6 +739,18 @@ func Setpriority(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uin
 			return 0, nil, linuxerr.ESRCH
 		}
 
+		// Linux requires UID match or CAP_SYS_NICE to set
+		// another process's priority. See kernel/sys.c:set_one_prio().
+		if task != t {
+			creds := t.Credentials()
+			tcreds := task.Credentials()
+			if creds.EffectiveKUID != tcreds.RealKUID &&
+				creds.EffectiveKUID != tcreds.EffectiveKUID &&
+				!creds.HasCapabilityIn(linux.CAP_SYS_NICE, tcreds.UserNamespace) {
+				return 0, nil, linuxerr.EPERM
+			}
+		}
+
 		task.SetNiceness(niceval)
 	case linux.PRIO_USER:
 		fallthrough
diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD
index 6e1c6d03ed..ffebdbd1db 100644
--- a/test/syscalls/BUILD
+++ b/test/syscalls/BUILD
@@ -82,6 +82,10 @@ syscall_test(
     test = "//test/syscalls/linux:chmod_test",
 )
 
+syscall_test(
+    test = "//test/syscalls/linux:capability_checks_test",
+)
+
 syscall_test(
     size = "medium",
     add_fusefs = True,
diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD
index adce583a80..33a646a7ce 100644
--- a/test/syscalls/linux/BUILD
+++ b/test/syscalls/linux/BUILD
@@ -466,6 +466,23 @@ cc_binary(
     ],
 )
 
+cc_binary(
+    name = "capability_checks_test",
+    testonly = 1,
+    srcs = ["capability_checks.cc"],
+    linkstatic = 1,
+    malloc = "//test/util:errno_safe_allocator",
+    deps = select_gtest() + [
+        "//test/util:capability_util",
+        "//test/util:file_descriptor",
+        "//test/util:fs_util",
+        "//test/util:temp_path",
+        "//test/util:test_main",
+        "//test/util:test_util",
+        "//test/util:thread_util",
+    ],
+)
+
 cc_binary(
     name = "chown_test",
     testonly = 1,
diff --git 
a/test/syscalls/linux/capability_checks.cc b/test/syscalls/linux/capability_checks.cc
new file mode 100644
index 0000000000..08ebc40312
--- /dev/null
+++ b/test/syscalls/linux/capability_checks.cc
@@ -0,0 +1,235 @@
+// Copyright 2026 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Tests for missing capability checks that were added to match Linux behavior.
+
+#include <linux/capability.h>
+#include <sched.h>
+#include <sys/resource.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/sysmacros.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include <cerrno>
+#include <string>
+
+#include "gtest/gtest.h"
+#include "test/util/capability_util.h"
+#include "test/util/file_descriptor.h"
+#include "test/util/fs_util.h"
+#include "test/util/temp_path.h"
+#include "test/util/test_util.h"
+#include "test/util/thread_util.h"
+
+namespace gvisor {
+namespace testing {
+namespace {
+
+// Test that rebinding a socket with SO_BINDTODEVICE requires CAP_NET_RAW.
+// Linux enforces this in net/core/sock.c:sock_bindtoindex_locked().
+TEST(SoBindToDeviceCapTest, RequiresCapNetRaw) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_RAW)));
+
+  int fd;
+  ASSERT_THAT(fd = socket(AF_INET, SOCK_STREAM, 0), SyscallSucceeds());
+  FileDescriptor sock(fd);
+
+  // With CAP_NET_RAW: bind to the loopback device. Some configurations have
+  // no "lo" device, in which case the socket stays unbound and there is
+  // nothing further to check.
+  const std::string dev = "lo";
+  int ret = setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, dev.c_str(),
+                       dev.size() + 1);
+  if (ret != 0) {
+    ASSERT_EQ(errno, ENODEV);
+    GTEST_SKIP() << "device \"lo\" not found";
+  }
+
+  // Drop CAP_NET_RAW: rebinding the bound socket should fail with EPERM.
+  AutoCapability cap(CAP_NET_RAW, false);
+  EXPECT_THAT(setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, dev.c_str(),
+                         dev.size() + 1),
+              SyscallFailsWithErrno(EPERM));
+}
+
+// Test that mknod(S_IFCHR) requires CAP_MKNOD.
+// Linux enforces this in fs/namei.c:vfs_mknod().
+TEST(MknodCapTest, CharDevRequiresCapMknod) {
+  if (IsRunningOnGvisor()) {
+    GTEST_SKIP() << "runsc does not permit creating character device nodes";
+  }
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_MKNOD)));
+
+  auto path = NewTempAbsPath();
+
+  // With CAP_MKNOD: should succeed.
+  ASSERT_THAT(mknod(path.c_str(), S_IFCHR | 0666, makedev(1, 3)),
+              SyscallSucceeds());
+  ASSERT_THAT(unlink(path.c_str()), SyscallSucceeds());
+
+  // Drop CAP_MKNOD: should fail with EPERM.
+  AutoCapability cap(CAP_MKNOD, false);
+  EXPECT_THAT(mknod(path.c_str(), S_IFCHR | 0666, makedev(1, 3)),
+              SyscallFailsWithErrno(EPERM));
+}
+
+// Test that mknod(S_IFBLK) requires CAP_MKNOD.
+TEST(MknodCapTest, BlockDevRequiresCapMknod) {
+  if (IsRunningOnGvisor()) {
+    GTEST_SKIP() << "runsc does not permit creating block device nodes";
+  }
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_MKNOD)));
+
+  auto path = NewTempAbsPath();
+
+  // With CAP_MKNOD: should succeed.
+  ASSERT_THAT(mknod(path.c_str(), S_IFBLK | 0666, makedev(7, 0)),
+              SyscallSucceeds());
+  ASSERT_THAT(unlink(path.c_str()), SyscallSucceeds());
+
+  // Drop CAP_MKNOD: should fail with EPERM.
+  AutoCapability cap(CAP_MKNOD, false);
+  EXPECT_THAT(mknod(path.c_str(), S_IFBLK | 0666, makedev(7, 0)),
+              SyscallFailsWithErrno(EPERM));
+}
+
+// Test that mknod(S_IFIFO) does NOT require CAP_MKNOD.
+TEST(MknodCapTest, FifoDoesNotRequireCapMknod) {
+  auto path = NewTempAbsPath();
+  AutoCapability cap(CAP_MKNOD, false);
+  EXPECT_THAT(mknod(path.c_str(), S_IFIFO | 0666, 0), SyscallSucceeds());
+  EXPECT_THAT(unlink(path.c_str()), SyscallSucceeds());
+}
+
+// UID of the unprivileged "nobody" user that the forked children switch to.
+constexpr uid_t kNobodyUid = 65534;
+
+// Forks a child that sets all of its UIDs to kNobodyUid and then blocks until
+// the parent closes *release_fd. The caller must pass both values to
+// ReapChild() when it is done with the child.
+//
+// We cannot target PID 1 because in gvisor the test process and PID 1
+// typically share UID 0. The UID match would bypass the capability check,
+// making the test always pass.
+void ForkChildAsNobody(pid_t* child, int* release_fd) {
+  // Two pipes for synchronization:
+  // - child_ready: child signals parent after changing UID
+  // - parent_done: parent signals child to exit after testing
+  int child_ready[2];
+  int parent_done[2];
+  ASSERT_THAT(pipe(child_ready), SyscallSucceeds());
+  ASSERT_THAT(pipe(parent_done), SyscallSucceeds());
+
+  pid_t pid = fork();
+  ASSERT_THAT(pid, SyscallSucceeds());
+
+  if (pid == 0) {
+    close(child_ready[0]);
+    close(parent_done[1]);
+    // Change to nobody UID.
+    if (setresuid(kNobodyUid, kNobodyUid, kNobodyUid) != 0) {
+      _exit(1);
+    }
+    // Signal parent that we're ready.
+    char ready = 'r';
+    if (write(child_ready[1], &ready, 1) != 1) {
+      _exit(2);
+    }
+    close(child_ready[1]);
+    // Wait until the parent closes its end of the pipe (read returns 0).
+    char buf;
+    while (read(parent_done[0], &buf, 1) < 0 && errno == EINTR) {
+    }
+    close(parent_done[0]);
+    _exit(0);
+  }
+
+  close(child_ready[1]);
+  close(parent_done[0]);
+  *child = pid;
+  *release_fd = parent_done[1];
+
+  // Wait for the child to change UID. If the child exited early, this read
+  // sees EOF and the expectation fails without leaking the child.
+  char ready;
+  EXPECT_THAT(read(child_ready[0], &ready, 1), SyscallSucceedsWithValue(1));
+  close(child_ready[0]);
+}
+
+// Releases the child forked by ForkChildAsNobody() and checks that it exited
+// cleanly.
+void ReapChild(pid_t child, int release_fd) {
+  close(release_fd);
+  int status;
+  ASSERT_THAT(waitpid(child, &status, 0), SyscallSucceedsWithValue(child));
+  EXPECT_TRUE(WIFEXITED(status) && WEXITSTATUS(status) == 0)
+      << "status = " << status;
+}
+
+// Test that sched_setaffinity on another task owned by a different UID
+// requires CAP_SYS_NICE. Linux enforces this in
+// kernel/sched/core.c:check_same_owner().
+TEST(SchedSetaffinityCapTest, OtherUidRequiresCapSysNice) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_NICE)));
+  // Need CAP_SETUID to change the child's UID.
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SETUID)));
+
+  pid_t child = -1;
+  int release_fd = -1;
+  ASSERT_NO_FATAL_FAILURE(ForkChildAsNobody(&child, &release_fd));
+
+  cpu_set_t mask;
+  CPU_ZERO(&mask);
+  CPU_SET(0, &mask);
+
+  // With CAP_SYS_NICE: should succeed on a different-UID process.
+  EXPECT_THAT(sched_setaffinity(child, sizeof(mask), &mask),
+              SyscallSucceeds());
+
+  // Drop CAP_SYS_NICE: should fail with EPERM (different UID, no cap).
+  AutoCapability cap(CAP_SYS_NICE, false);
+  EXPECT_THAT(sched_setaffinity(child, sizeof(mask), &mask),
+              SyscallFailsWithErrno(EPERM));
+
+  // Clean up child.
+  ReapChild(child, release_fd);
+}
+
+// Test that setpriority on another task owned by a different UID requires
+// CAP_SYS_NICE. Linux enforces this in kernel/sys.c:set_one_prio().
+TEST(SetpriorityCapTest, OtherUidRequiresCapSysNice) {
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_NICE)));
+  SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SETUID)));
+
+  pid_t child = -1;
+  int release_fd = -1;
+  ASSERT_NO_FATAL_FAILURE(ForkChildAsNobody(&child, &release_fd));
+
+  // With CAP_SYS_NICE: should succeed.
+  EXPECT_THAT(setpriority(PRIO_PROCESS, child, 0), SyscallSucceeds());
+
+  // Drop CAP_SYS_NICE: should fail with EPERM.
+  AutoCapability cap(CAP_SYS_NICE, false);
+  EXPECT_THAT(setpriority(PRIO_PROCESS, child, 0),
+              SyscallFailsWithErrno(EPERM));
+
+  ReapChild(child, release_fd);
+}
+
+}  // namespace
+}  // namespace testing
+}  // namespace gvisor