yawkar's blog · software, linux, nix, cats... what? Light mode / Dark mode


[youki] Fixing readonly rootfs in rootless containers when NODEV/NOEXEC/NOSUID are set

TL;DR

How did youki handle root.readonly: true?

youki:crates/libcontainer/src/process/init/process.rs#L199-L213:

if matches!(args.container_type, ContainerType::InitContainer) {
  if ctx.rootfs_ro {
    ctx.syscall
      .mount(
        None,
        Path::new("/"),
        None,
        // 👀 Notice how it didn't bother about
        // 👀 any of the original mount flags
        MsFlags::MS_RDONLY | MsFlags::MS_REMOUNT | MsFlags::MS_BIND,
        None,
      )
      .map_err(|err| {
        tracing::error!(?err, "failed to remount root `/` as readonly");
        InitProcessError::SyscallOther(err)
      })?;
  }

This works perfectly for rootful containers run by a user with the CAP_SYS_ADMIN capability (youki resets the effective capability set and drops capabilities a little later in the flow).

But for rootless containers, a new user namespace is always created and Linux locks some of the mount flags in this case. It makes sense because we generally don't want a subordinate user namespace to be able to drop our security restrictions.

When these flags are locked, an attempt to remount without these flags gets EPERM from the kernel.

linux:/fs/namespace.c#L3326-L3344:

/*
 * Handle reconfiguration of the mountpoint only without alteration of the
 * superblock it refers to.  This is triggered by specifying MS_REMOUNT|MS_BIND
 * to mount(2).
 */
static int do_reconfigure_mnt(const struct path *path, unsigned int mnt_flags)
{
  struct super_block *sb = path->mnt->mnt_sb;
  struct mount *mnt = real_mount(path->mnt);
  int ret;

  if (!check_mnt(mnt))
    return -EINVAL;

  if (!path_mounted(path))
    return -EINVAL;

  // 👀
  if (!can_change_locked_flags(mnt, mnt_flags))
    return -EPERM;

linux:/fs/namespace.c#L3242-L3273

/*
 * Don't allow locked mount flags to be cleared.
 *
 * No locks need to be held here while testing the various MNT_LOCK
 * flags because those flags can never be cleared once they are set.
 *
 * @mnt:       the mount whose current flags (including MNT_LOCK_* bits) are checked
 * @mnt_flags: the flags requested by the remount
 *
 * Returns true if the requested flags keep every locked flag in place,
 * false if any locked flag would be dropped or changed.
 */
static bool can_change_locked_flags(struct mount *mnt, unsigned int mnt_flags)
{
	unsigned int fl = mnt->mnt.mnt_flags;

	/* A locked read-only mount must stay read-only. */
	if ((fl & MNT_LOCK_READONLY) &&
	  !(mnt_flags & MNT_READONLY))
		return false;

	/* A locked nodev mount must keep nodev. */
	if ((fl & MNT_LOCK_NODEV) &&
	  !(mnt_flags & MNT_NODEV))
		return false;

	/* A locked nosuid mount must keep nosuid. */
	if ((fl & MNT_LOCK_NOSUID) &&
	  !(mnt_flags & MNT_NOSUID))
		return false;

	/* A locked noexec mount must keep noexec. */
	if ((fl & MNT_LOCK_NOEXEC) &&
	  !(mnt_flags & MNT_NOEXEC))
		return false;

	/* Locked atime bits must be requested exactly as they currently are. */
	if ((fl & MNT_LOCK_ATIME) &&
	  ((fl & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK)))
		return false;

	return true;
}

linux:/fs/namespace.c#L2412-L2437:

/*
 * Walk every mount in the tree rooted at @mnt and lock its flags:
 * MNT_LOCK_ATIME is always set, and each of READONLY/NODEV/NOSUID/NOEXEC
 * that is currently set gets its matching MNT_LOCK_* bit.
 */
static void lock_mnt_tree(struct mount *mnt)
{
  struct mount *p;

  for (p = mnt; p; p = next_mnt(p, mnt)) {
    int flags = p->mnt.mnt_flags;
    /* Don't allow unprivileged users to change mount flags */
    flags |= MNT_LOCK_ATIME;

    /* Only flags that are already set get locked; unset ones stay unlocked. */
    if (flags & MNT_READONLY)
      flags |= MNT_LOCK_READONLY;

    if (flags & MNT_NODEV)
      flags |= MNT_LOCK_NODEV;

    if (flags & MNT_NOSUID)
      flags |= MNT_LOCK_NOSUID;

    if (flags & MNT_NOEXEC)
      flags |= MNT_LOCK_NOEXEC;
    /* Don't allow unprivileged users to reveal what is under a mount */
    /* Applies to every mount except the tree root, and only if its mnt_expire list is empty. */
    if (list_empty(&p->mnt_expire) && p != mnt)
      flags |= MNT_LOCKED;
    p->mnt.mnt_flags = flags;
  }
}

Usage examples show us that the lock is placed when a mount is cloned across a user namespace boundary:

How does youki handle root.readonly: true after the fix?

youki:crates/libcontainer/src/process/init/process.rs#L201-L225:

if matches!(args.container_type, ContainerType::InitContainer) {
  if ctx.rootfs_ro {
    // 👀 Here we get the original mount flags ...
    let current_flags = statfs("/")
      .map_err(|err| {
        tracing::error!(?err, "failed to statfs root '/' to get current mount flags");
        InitProcessError::SyscallOther(SyscallError::Nix(err))
      })?
      .flags()
      .bits();
    ctx.syscall
      .mount(
        None,
        Path::new("/"),
        None,
        MsFlags::MS_RDONLY
          | MsFlags::MS_REMOUNT
          | MsFlags::MS_BIND
          // 👀 ... and here we reuse them!
          | MsFlags::from_bits_truncate(current_flags),
        None,
      )
      .map_err(|err| {
        tracing::error!(?err, "failed to remount root `/` as readonly");
        InitProcessError::SyscallOther(err)
      })?;
  }

A note on youki's testing infrastructure

youki has a really nice testing framework called contest. It makes it very easy to set up fixtures and hook into a container before the init process enters it.

It also allows hooking inside the container using the static runtimetest binary, which is compiled with a set of validators. These are simple functions that run some checks and write to stderr to indicate a validation failure.

youki:/tests/contest/contest/src/tests/root_readonly_true/root_readonly_tests.rs#L31-L63:

/// Runs the `root_readonly` validator in a rootless container with
/// `root.readonly` set to `true`, after the rootfs has been bind-mounted
/// onto itself with `MS_NODEV | MS_NOSUID`.
fn root_readonly_true_in_userns_test() -> TestResult {
  // 👀 Here we get the effective user and group IDs under which the test-runner itself is running.
  // 👀 We need them to safely map the user inside the new container's user namespace to our user.
  let uid = nix::unistd::geteuid().as_raw();
  let gid = nix::unistd::getegid().as_raw();
  let mut spec = Spec::rootless(uid, gid);
  // 👀 Set readonly to `true`
  spec.set_root(RootBuilder::default().readonly(true).build().ok())
    .set_process(
      ProcessBuilder::default()
        // 👀 Here I use the `root_readonly` validator that was already written by someone else:
        // 👀 https://github.com/YawKar/youki/blob/e4b4896c6dbfd28270e11beb73e4799d7317556c/tests/contest/runtimetest/src/main.rs#L50
        .args(vec!["runtimetest".to_string(), "root_readonly".to_string()])
        .build()
        .ok(),
    );
  test_inside_container(&spec, &CreateOptions::default(), &|rootfs: &Path| {
    // Bind-mount the rootfs onto itself with MS_NODEV | MS_NOSUID, simulating a
    // filesystem that has those flags locked (the typical case in user namespaces).
    // Without the fix for #3517, the subsequent readonly remount would fail with
    // EPERM because the kernel rejects dropping these flags in a user namespace.
    // 👀 Here's why we need 2 mount calls:
    // 👀 Initially the rootfs is just a directory, not a mount point.
    // 👀 A remount (MS_REMOUNT) is rejected on a path that is not a mount point.
    // 👀 So we first MS_BIND the rootfs onto itself to make a mount point out of it.
    nix::mount::mount(
      Some(rootfs),
      rootfs,
      None::<&str>,
      MsFlags::MS_BIND,
      None::<&str>,
    )?;
    // 👀 Now that we have a mount point we are free to modify mount flags!
    nix::mount::mount(
      Some(rootfs),
      rootfs,
      None::<&str>,
      MsFlags::MS_REMOUNT | MsFlags::MS_BIND | MsFlags::MS_NODEV | MsFlags::MS_NOSUID,
      None::<&str>,
    )?;
    Ok(())
  })
}

You can read more about it here: Youki Developer docs: Contest.