[youki] Fixing readonly rootfs in rootless containers when NODEV/NOEXEC/NOSUID are set

18 May 2026

issue: #3517:[root.readonly: true] does not work on filesystems mounted with nodev or nosuid in usernamespace

fix: #3536:preserve mount flags for readonly remount of rootfs in init

TL;DR

youki forgot about the original mount flags of the filesystem it was trying to remount.
In rootless containers, Linux locks some of these mount flags down for security reasons.
When Linux sees that youki's init process tries to remount the rootfs and drop locked flags, it returns EPERM.
The solution is to include the original mount flags in the remount call.

How did youki handle `root.readonly: true`?

youki:crates/libcontainer/src/process/init/process.rs#L199-L213:

199if matches!(args.container_type, ContainerType::InitContainer) {
200  if ctx.rootfs_ro {
201    ctx.syscall
202      .mount(
203        None,
204        Path::new("/"),
205        None,
206        // 👀 Notice how it didn't bother about
207        // 👀 any of the original mount flags
208        MsFlags::MS_RDONLY | MsFlags::MS_REMOUNT | MsFlags::MS_BIND,
209        None,
210      )
211      .map_err(|err| {
212        tracing::error!(?err, "failed to remount root `/` as readonly");
213        InitProcessError::SyscallOther(err)
214      })?;
215  }

This works perfectly for rootful containers run by a user with the CAP_SYS_ADMIN capability (youki resets the effective capability set and drops capabilities a little later in the flow, so CAP_SYS_ADMIN is still held at this point).

But for rootless containers, a new user namespace is always created and Linux locks some of the mount flags in this case. It makes sense because we generally don't want a subordinate user namespace to be able to drop our security restrictions.

When these flags are locked, an attempt to remount without these flags gets EPERM from the kernel.

linux:/fs/namespace.c#L3326-L3344:

3326/*
3327 * Handle reconfiguration of the mountpoint only without alteration of the
3328 * superblock it refers to.  This is triggered by specifying MS_REMOUNT|MS_BIND
3329 * to mount(2).
3330 */
3331static int do_reconfigure_mnt(const struct path *path, unsigned int mnt_flags)
3332{
3333  struct super_block *sb = path->mnt->mnt_sb;
3334  struct mount *mnt = real_mount(path->mnt);
3335  int ret;
3336
3337  if (!check_mnt(mnt))
3338    return -EINVAL;
3339
3340  if (!path_mounted(path))
3341    return -EINVAL;
3342
3343  // 👀
3344  if (!can_change_locked_flags(mnt, mnt_flags))
3345    return -EPERM;

linux:/fs/namespace.c#L3242-L3273

3242/*
3243 * Don't allow locked mount flags to be cleared.
3244 *
3245 * No locks need to be held here while testing the various MNT_LOCK
3246 * flags because those flags can never be cleared once they are set.
3247 */
3248static bool can_change_locked_flags(struct mount *mnt, unsigned int mnt_flags)
3249{
3250	unsigned int fl = mnt->mnt.mnt_flags;
3251
3252	if ((fl & MNT_LOCK_READONLY) &&
3253	  !(mnt_flags & MNT_READONLY))
3254		return false;
3255
3256	if ((fl & MNT_LOCK_NODEV) &&
3257	  !(mnt_flags & MNT_NODEV))
3258		return false;
3259
3260	if ((fl & MNT_LOCK_NOSUID) &&
3261	  !(mnt_flags & MNT_NOSUID))
3262		return false;
3263
3264	if ((fl & MNT_LOCK_NOEXEC) &&
3265	  !(mnt_flags & MNT_NOEXEC))
3266		return false;
3267
3268	if ((fl & MNT_LOCK_ATIME) &&
3269	  ((fl & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK)))
3270		return false;
3271
3272	return true;
3273}

linux:/fs/namespace.c#L2412-L2437:

2412static void lock_mnt_tree(struct mount *mnt)
2413{
2414  struct mount *p;
2415
2416  for (p = mnt; p; p = next_mnt(p, mnt)) {
2417    int flags = p->mnt.mnt_flags;
2418    /* Don't allow unprivileged users to change mount flags */
2419    flags |= MNT_LOCK_ATIME;
2420
2421    if (flags & MNT_READONLY)
2422      flags |= MNT_LOCK_READONLY;
2423
2424    if (flags & MNT_NODEV)
2425      flags |= MNT_LOCK_NODEV;
2426
2427    if (flags & MNT_NOSUID)
2428      flags |= MNT_LOCK_NOSUID;
2429
2430    if (flags & MNT_NOEXEC)
2431      flags |= MNT_LOCK_NOEXEC;
2432    /* Don't allow unprivileged users to reveal what is under a mount */
2433    if (list_empty(&p->mnt_expire) && p != mnt)
2434      flags |= MNT_LOCKED;
2435    p->mnt.mnt_flags = flags;
2436  }
2437}

The call sites of `lock_mnt_tree` show us that the lock is placed when a mount is cloned or propagated across a user namespace boundary:

linux:/fs/namespace.c#L2637-L2639

2637/* Notice when we are propagating across user namespaces */
2638if (child->mnt_parent->mnt_ns->user_ns != user_ns)
2639  lock_mnt_tree(child);

linux:/fs/namespace.c#L3156-L3162

3156/*
3157 * now mount the detached tree on top of the copy
3158 * of the real rootfs we created.
3159 */
3160attach_mnt(mnt, new_ns_root, mp.mp);
3161if (user_ns != ns->user_ns)
3162  lock_mnt_tree(new_ns_root);

linux:/fs/namespace.c#L4266-L4270

4266if (user_ns != ns->user_ns) {
4267  guard(mount_writer)();
4268  lock_mnt_tree(new);
4269}
4270new_ns->root = new;

How does youki handle `root.readonly: true` after fix?

youki:crates/libcontainer/src/process/init/process.rs#L201-L225:

201if matches!(args.container_type, ContainerType::InitContainer) {
202  if ctx.rootfs_ro {
203    // 👀 Here we get the original mount flags ...
204    let current_flags = statfs("/")
205      .map_err(|err| {
206        tracing::error!(?err, "failed to statfs root '/' to get current mount flags");
207        InitProcessError::SyscallOther(SyscallError::Nix(err))
208      })?
209      .flags()
210      .bits();
211    ctx.syscall
212      .mount(
213        None,
214        Path::new("/"),
215        None,
216        MsFlags::MS_RDONLY
217          | MsFlags::MS_REMOUNT
218          | MsFlags::MS_BIND
219          // 👀 ... and here we reuse them!
220          | MsFlags::from_bits_truncate(current_flags),
221        None,
222      )
223      .map_err(|err| {
224        tracing::error!(?err, "failed to remount root `/` as readonly");
225        InitProcessError::SyscallOther(err)
226      })?;
227  }

A note on `youki`'s testing infrastructure

youki has a really nice testing framework called contest. It makes it very easy to set up fixtures and hook into a container before the init process enters it.

It also allows hooking inside the container using the `runtimetest` static binary, which is compiled with a set of validators. Validators are just simple functions that execute some checks and write to stderr to indicate a validation failure.

youki:/tests/contest/contest/src/tests/root_readonly_true/root_readonly_tests.rs#L31-L63:

31fn root_readonly_true_in_userns_test() -> TestResult {
32  // 👀 Here we get the effective user under which the test-runner itself is running.
33  // 👀 We need it to safely map the user inside the new container's user namespace to our user.
34  let uid = nix::unistd::geteuid().as_raw();
35  let gid = nix::unistd::getegid().as_raw();
36  let mut spec = Spec::rootless(uid, gid);
37  // 👀 Set readonly to `true`
38  spec.set_root(RootBuilder::default().readonly(true).build().ok())
39    .set_process(
40      ProcessBuilder::default()
41        // 👀 Here I use `root_readonly` validator that is already made by someone else:
42        // 👀 https://github.com/YawKar/youki/blob/e4b4896c6dbfd28270e11beb73e4799d7317556c/tests/contest/runtimetest/src/main.rs#L50
43        .args(vec!["runtimetest".to_string(), "root_readonly".to_string()])
44        .build()
45        .ok(),
46    );
47  test_inside_container(&spec, &CreateOptions::default(), &|rootfs: &Path| {
48    // Bind-mount the rootfs onto itself with MS_NODEV | MS_NOSUID, simulating a
49    // filesystem that has those flags locked (the typical case in user namespaces).
50    // Without the fix for #3517, the subsequent readonly remount would fail with
51    // EPERM because the kernel rejects dropping these flags in a user namespace.
52    // 👀 Here's why we need 2 mount calls:
53    // 👀 Initially the container's '/' (the rootfs) is just a plain directory, not a mount point.
54    // 👀 Linux rejects MS_REMOUNT on a path that is not the root of a mount (EINVAL).
55    // 👀 We need to MS_BIND the rootfs onto itself to make a mount point out of it.
56    nix::mount::mount(
57      Some(rootfs),
58      rootfs,
59      None::<&str>,
60      MsFlags::MS_BIND,
61      None::<&str>,
62    )?;
63    // 👀 Now that we have a mount point we are free to modify mount flags!
64    nix::mount::mount(
65      Some(rootfs),
66      rootfs,
67      None::<&str>,
68      MsFlags::MS_REMOUNT | MsFlags::MS_BIND | MsFlags::MS_NODEV | MsFlags::MS_NOSUID,
69      None::<&str>,
70    )?;
71    Ok(())
72  })
73}

You can read more about it here: Youki Developer docs: Contest.

TL;DR

How did youki handle root.readonly: true?

How does youki handle root.readonly: true after fix?

A note on youki's testing infrastructure

How did youki handle `root.readonly: true`?

How does youki handle `root.readonly: true` after fix?

A note on `youki`'s testing infrastructure