diff options
Diffstat (limited to 'sys')
52 files changed, 1334 insertions, 415 deletions
diff --git a/sys/amd64/conf/GENERIC b/sys/amd64/conf/GENERIC index 786edc4125c9..c98cfce8613a 100644 --- a/sys/amd64/conf/GENERIC +++ b/sys/amd64/conf/GENERIC @@ -95,8 +95,6 @@ options RCTL # Resource limits # Debugging support. Always need this: options KDB # Enable kernel debugger support. options KDB_TRACE # Print a stack trace for a panic. -# For full debugger support use (turn off in stable branch): -include "std.debug" # Kernel dump features. options EKCD # Support for encrypted kernel dumps diff --git a/sys/amd64/conf/GENERIC-NODEBUG b/sys/amd64/conf/GENERIC-NODEBUG deleted file mode 100644 index 1939b0efd352..000000000000 --- a/sys/amd64/conf/GENERIC-NODEBUG +++ /dev/null @@ -1,31 +0,0 @@ -# -# GENERIC-NODEBUG -- WITNESS and INVARIANTS free kernel configuration file -# for FreeBSD/amd64 -# -# This configuration file removes several debugging options, including -# WITNESS and INVARIANTS checking, which are known to have significant -# performance impact on running systems. When benchmarking new features -# this kernel should be used instead of the standard GENERIC. -# This kernel configuration should never appear outside of the HEAD -# of the FreeBSD tree. -# -# For more information on this file, please read the config(5) manual page, -# and/or the handbook section on Kernel Configuration Files: -# -# https://docs.freebsd.org/en/books/handbook/kernelconfig/#kernelconfig-config -# -# The handbook is also available locally in /usr/share/doc/handbook -# if you've installed the doc distribution, otherwise always see the -# FreeBSD World Wide Web server (https://www.FreeBSD.org/) for the -# latest information. -# -# An exhaustive list of options and more detailed explanations of the -# device lines is also present in the ../../conf/NOTES and NOTES files. -# If you are in doubt as to the purpose or necessity of a line, check first -# in NOTES. 
-# - -include GENERIC -include "std.nodebug" - -ident GENERIC-NODEBUG diff --git a/sys/amd64/conf/MINIMAL b/sys/amd64/conf/MINIMAL index 0baf6d6431de..ec5ab2fcaee3 100644 --- a/sys/amd64/conf/MINIMAL +++ b/sys/amd64/conf/MINIMAL @@ -73,8 +73,6 @@ options INCLUDE_CONFIG_FILE # Include this file in kernel # Debugging support. Always need this: options KDB # Enable kernel debugger support. options KDB_TRACE # Print a stack trace for a panic. -# For full debugger support use (turn off in stable branch): -include "std.debug" # Make an SMP-capable kernel by default options SMP # Symmetric MultiProcessor Kernel diff --git a/sys/amd64/conf/MINIMAL-NODEBUG b/sys/amd64/conf/MINIMAL-NODEBUG deleted file mode 100644 index 7b7c22bbcaf6..000000000000 --- a/sys/amd64/conf/MINIMAL-NODEBUG +++ /dev/null @@ -1,11 +0,0 @@ -# -# MINIMAL-NODEBUG -- Non-debug MINIMAL kernel. -# -# This is the MINIMAL equivalent to GENERIC-NODEBUG. - -#NO_UNIVERSE - -include MINIMAL -include "std.nodebug" - -ident MINIMAL-NODEBUG diff --git a/sys/arm/conf/GENERIC-NODEBUG b/sys/arm/conf/GENERIC-NODEBUG deleted file mode 100644 index 0b3199245187..000000000000 --- a/sys/arm/conf/GENERIC-NODEBUG +++ /dev/null @@ -1,31 +0,0 @@ -# -# GENERIC-NODEBUG -- WITNESS and INVARIANTS free kernel configuration file -# for FreeBSD/arm -# -# This configuration file removes several debugging options, including -# WITNESS and INVARIANTS checking, which are known to have significant -# performance impact on running systems. When benchmarking new features -# this kernel should be used instead of the standard GENERIC. -# This kernel configuration should never appear outside of the HEAD -# of the FreeBSD tree. 
-# -# For more information on this file, please read the config(5) manual page, -# and/or the handbook section on Kernel Configuration Files: -# -# https://docs.freebsd.org/en/books/handbook/kernelconfig/#kernelconfig-config -# -# The handbook is also available locally in /usr/share/doc/handbook -# if you've installed the doc distribution, otherwise always see the -# FreeBSD World Wide Web server (https://www.FreeBSD.org/) for the -# latest information. -# -# An exhaustive list of options and more detailed explanations of the -# device lines is also present in the ../../conf/NOTES and NOTES files. -# If you are in doubt as to the purpose or necessity of a line, check first -# in NOTES. -# - -include GENERIC -include "std.nodebug" - -ident GENERIC-NODEBUG diff --git a/sys/arm/conf/std.armv7 b/sys/arm/conf/std.armv7 index 4ef60c331212..15d8304ae5f1 100644 --- a/sys/arm/conf/std.armv7 +++ b/sys/arm/conf/std.armv7 @@ -65,9 +65,6 @@ options KDB_TRACE # Print a stack trace for a panic. options USB_DEBUG # Enable usb debug support code -# For full debugger support use (turn off in stable branch): -include "std.debug" - # Optional extras, never enabled by default: #options BOOTVERBOSE #options DEBUG # May result in extreme spewage diff --git a/sys/arm64/arm64/machdep.c b/sys/arm64/arm64/machdep.c index 627b02e82d34..47c701e8588c 100644 --- a/sys/arm64/arm64/machdep.c +++ b/sys/arm64/arm64/machdep.c @@ -858,7 +858,7 @@ initarm(struct arm64_bootparams *abp) cninit(); set_ttbr0(abp->kern_ttbr0); - cpu_tlb_flushID(); + pmap_s1_invalidate_all_kernel(); if (!valid) panic("Invalid bus configuration: %s", diff --git a/sys/arm64/arm64/pmap.c b/sys/arm64/arm64/pmap.c index 48b62442e68f..8a4395aa1c89 100644 --- a/sys/arm64/arm64/pmap.c +++ b/sys/arm64/arm64/pmap.c @@ -190,6 +190,8 @@ pt_entry_t __read_mostly pmap_gp_attr; #define PMAP_SAN_PTE_BITS (ATTR_AF | ATTR_S1_XN | pmap_sh_attr | \ ATTR_KERN_GP | ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | ATTR_S1_AP(ATTR_S1_AP_RW)) +static bool 
__read_mostly pmap_multiple_tlbi = false; + struct pmap_large_md_page { struct rwlock pv_lock; struct md_page pv_page; @@ -1297,7 +1299,7 @@ pmap_bootstrap_dmap(vm_size_t kernlen) } } - cpu_tlb_flushID(); + pmap_s1_invalidate_all_kernel(); bs_state.dmap_valid = true; @@ -1399,7 +1401,7 @@ pmap_bootstrap(void) /* And the l3 tables for the early devmap */ pmap_bootstrap_l3(VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE)); - cpu_tlb_flushID(); + pmap_s1_invalidate_all_kernel(); #define alloc_pages(var, np) \ (var) = bs_state.freemempos; \ @@ -1723,6 +1725,51 @@ CPU_FEAT(feat_hafdbs, "Hardware management of the Access flag and dirty state", pmap_dbm_check, pmap_dbm_has_errata, pmap_dbm_enable, CPU_FEAT_AFTER_DEV | CPU_FEAT_PER_CPU); +static cpu_feat_en +pmap_multiple_tlbi_check(const struct cpu_feat *feat __unused, u_int midr) +{ + /* + * Cortex-A55 erratum 2441007 (Cat B rare) + * Present in all revisions + */ + if (CPU_IMPL(midr) == CPU_IMPL_ARM && + CPU_PART(midr) == CPU_PART_CORTEX_A55) + return (FEAT_DEFAULT_DISABLE); + + /* + * Cortex-A76 erratum 1286807 (Cat B rare) + * Present in r0p0 - r3p0 + * Fixed in r3p1 + */ + if (midr_check_var_part_range(midr, CPU_IMPL_ARM, CPU_PART_CORTEX_A76, + 0, 0, 3, 0)) + return (FEAT_DEFAULT_DISABLE); + + /* + * Cortex-A510 erratum 2441009 (Cat B rare) + * Present in r0p0 - r1p1 + * Fixed in r1p2 + */ + if (midr_check_var_part_range(midr, CPU_IMPL_ARM, CPU_PART_CORTEX_A510, + 0, 0, 1, 1)) + return (FEAT_DEFAULT_DISABLE); + + return (FEAT_ALWAYS_DISABLE); +} + +static bool +pmap_multiple_tlbi_enable(const struct cpu_feat *feat __unused, + cpu_feat_errata errata_status, u_int *errata_list __unused, + u_int errata_count __unused) +{ + pmap_multiple_tlbi = true; + return (true); +} + +CPU_FEAT(errata_multi_tlbi, "Multiple TLBI errata", + pmap_multiple_tlbi_check, NULL, pmap_multiple_tlbi_enable, + CPU_FEAT_EARLY_BOOT | CPU_FEAT_PER_CPU); + /* * Initialize the pmap module. 
* @@ -1876,9 +1923,17 @@ pmap_s1_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only) r = TLBI_VA(va); if (pmap == kernel_pmap) { pmap_s1_invalidate_kernel(r, final_only); + if (pmap_multiple_tlbi) { + dsb(ish); + pmap_s1_invalidate_kernel(r, final_only); + } } else { r |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)); pmap_s1_invalidate_user(r, final_only); + if (pmap_multiple_tlbi) { + dsb(ish); + pmap_s1_invalidate_user(r, final_only); + } } dsb(ish); isb(); @@ -1920,12 +1975,24 @@ pmap_s1_invalidate_strided(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, end = TLBI_VA(eva); for (r = start; r < end; r += TLBI_VA(stride)) pmap_s1_invalidate_kernel(r, final_only); + + if (pmap_multiple_tlbi) { + dsb(ish); + for (r = start; r < end; r += TLBI_VA(stride)) + pmap_s1_invalidate_kernel(r, final_only); + } } else { start = end = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)); start |= TLBI_VA(sva); end |= TLBI_VA(eva); for (r = start; r < end; r += TLBI_VA(stride)) pmap_s1_invalidate_user(r, final_only); + + if (pmap_multiple_tlbi) { + dsb(ish); + for (r = start; r < end; r += TLBI_VA(stride)) + pmap_s1_invalidate_user(r, final_only); + } } dsb(ish); isb(); @@ -1961,6 +2028,19 @@ pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pmap_s2_invalidate_range(pmap, sva, eva, final_only); } +void +pmap_s1_invalidate_all_kernel(void) +{ + dsb(ishst); + __asm __volatile("tlbi vmalle1is"); + dsb(ish); + if (pmap_multiple_tlbi) { + __asm __volatile("tlbi vmalle1is"); + dsb(ish); + } + isb(); +} + /* * Invalidates all cached intermediate- and final-level TLB entries for the * given virtual address space. 
@@ -1975,9 +2055,17 @@ pmap_s1_invalidate_all(pmap_t pmap) dsb(ishst); if (pmap == kernel_pmap) { __asm __volatile("tlbi vmalle1is"); + if (pmap_multiple_tlbi) { + dsb(ish); + __asm __volatile("tlbi vmalle1is"); + } } else { r = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)); __asm __volatile("tlbi aside1is, %0" : : "r" (r)); + if (pmap_multiple_tlbi) { + dsb(ish); + __asm __volatile("tlbi aside1is, %0" : : "r" (r)); + } } dsb(ish); isb(); @@ -7965,7 +8053,7 @@ pmap_mapbios(vm_paddr_t pa, vm_size_t size) pa += L2_SIZE; } if ((old_l2e & ATTR_DESCR_VALID) != 0) - pmap_s1_invalidate_all(kernel_pmap); + pmap_s1_invalidate_all_kernel(); else { /* * Because the old entries were invalid and the new @@ -8056,7 +8144,7 @@ pmap_unmapbios(void *p, vm_size_t size) } } if (preinit_map) { - pmap_s1_invalidate_all(kernel_pmap); + pmap_s1_invalidate_all_kernel(); return; } diff --git a/sys/arm64/conf/GENERIC-MMCCAM-NODEBUG b/sys/arm64/conf/GENERIC-MMCCAM-NODEBUG deleted file mode 100644 index b2e865129012..000000000000 --- a/sys/arm64/conf/GENERIC-MMCCAM-NODEBUG +++ /dev/null @@ -1,14 +0,0 @@ -# -# GENERIC-MMCCAM-NODEBUG -# -# Custom kernel for arm64 plus MMCCAM as opposed to the prior MMC stack. It is -# present to keep it building in tree since it wouldn't work in LINT. This -# version without debugging features. -# - -#NO_UNIVERSE - -include GENERIC-MMCCAM -include "std.nodebug" - -ident GENERIC-MMCCAM-NODEBUG diff --git a/sys/arm64/conf/GENERIC-NODEBUG b/sys/arm64/conf/GENERIC-NODEBUG deleted file mode 100644 index 086942dfaab1..000000000000 --- a/sys/arm64/conf/GENERIC-NODEBUG +++ /dev/null @@ -1,31 +0,0 @@ -# -# GENERIC-NODEBUG -- WITNESS and INVARIANTS free kernel configuration file -# for FreeBSD/arm64 -# -# This configuration file removes several debugging options, including -# WITNESS and INVARIANTS checking, which are known to have significant -# performance impact on running systems. 
When benchmarking new features -# this kernel should be used instead of the standard GENERIC. -# This kernel configuration should never appear outside of the HEAD -# of the FreeBSD tree. -# -# For more information on this file, please read the config(5) manual page, -# and/or the handbook section on Kernel Configuration Files: -# -# https://docs.freebsd.org/en/books/handbook/kernelconfig/#kernelconfig-config -# -# The handbook is also available locally in /usr/share/doc/handbook -# if you've installed the doc distribution, otherwise always see the -# FreeBSD World Wide Web server (https://www.FreeBSD.org/) for the -# latest information. -# -# An exhaustive list of options and more detailed explanations of the -# device lines is also present in the ../../conf/NOTES and NOTES files. -# If you are in doubt as to the purpose or necessity of a line, check first -# in NOTES. -# - -include GENERIC -include "std.nodebug" - -ident GENERIC-NODEBUG diff --git a/sys/arm64/conf/std.arm64 b/sys/arm64/conf/std.arm64 index a0568466cfaf..58f3748e2700 100644 --- a/sys/arm64/conf/std.arm64 +++ b/sys/arm64/conf/std.arm64 @@ -74,8 +74,6 @@ options PERTHREAD_SSP # Per-thread SSP canary # Debugging support. Always need this: options KDB # Enable kernel debugger support. options KDB_TRACE # Print a stack trace for a panic. -# For full debugger support use (turn off in stable branch): -include "std.debug" # Kernel Sanitizers #options COVERAGE # Generic kernel coverage. 
Used by KCOV diff --git a/sys/arm64/include/cpu.h b/sys/arm64/include/cpu.h index f07b67d18abf..07a783138f42 100644 --- a/sys/arm64/include/cpu.h +++ b/sys/arm64/include/cpu.h @@ -193,9 +193,6 @@ (((mask) & PCPU_GET(midr)) == \ ((mask) & CPU_ID_RAW((impl), (part), (var), (rev)))) -#define CPU_MATCH_RAW(mask, devid) \ - (((mask) & PCPU_GET(midr)) == ((mask) & (devid))) - #if !defined(__ASSEMBLER__) static inline bool midr_check_var_part_range(u_int midr, u_int impl, u_int part, u_int var_low, diff --git a/sys/arm64/include/pmap.h b/sys/arm64/include/pmap.h index 357c1a0d8232..406b6e2c5e0a 100644 --- a/sys/arm64/include/pmap.h +++ b/sys/arm64/include/pmap.h @@ -175,6 +175,8 @@ int pmap_fault(pmap_t, uint64_t, uint64_t); struct pcb *pmap_switch(struct thread *); +void pmap_s1_invalidate_all_kernel(void); + extern void (*pmap_clean_stage2_tlbi)(void); extern void (*pmap_stage2_invalidate_range)(uint64_t, vm_offset_t, vm_offset_t, bool); diff --git a/sys/compat/freebsd32/freebsd32_syscall.h b/sys/compat/freebsd32/freebsd32_syscall.h index 90cd21a80923..54063150eef9 100644 --- a/sys/compat/freebsd32/freebsd32_syscall.h +++ b/sys/compat/freebsd32/freebsd32_syscall.h @@ -515,4 +515,6 @@ #define FREEBSD32_SYS_inotify_rm_watch 594 #define FREEBSD32_SYS_getgroups 595 #define FREEBSD32_SYS_setgroups 596 -#define FREEBSD32_SYS_MAXSYSCALL 597 +#define FREEBSD32_SYS_jail_attach_jd 597 +#define FREEBSD32_SYS_jail_remove_jd 598 +#define FREEBSD32_SYS_MAXSYSCALL 599 diff --git a/sys/compat/freebsd32/freebsd32_syscalls.c b/sys/compat/freebsd32/freebsd32_syscalls.c index f0f8d26554b5..f7cc4c284e4d 100644 --- a/sys/compat/freebsd32/freebsd32_syscalls.c +++ b/sys/compat/freebsd32/freebsd32_syscalls.c @@ -602,4 +602,6 @@ const char *freebsd32_syscallnames[] = { "inotify_rm_watch", /* 594 = inotify_rm_watch */ "getgroups", /* 595 = getgroups */ "setgroups", /* 596 = setgroups */ + "jail_attach_jd", /* 597 = jail_attach_jd */ + "jail_remove_jd", /* 598 = jail_remove_jd */ }; diff --git 
a/sys/compat/freebsd32/freebsd32_sysent.c b/sys/compat/freebsd32/freebsd32_sysent.c index 12f1a346c3e9..18f809ef04e3 100644 --- a/sys/compat/freebsd32/freebsd32_sysent.c +++ b/sys/compat/freebsd32/freebsd32_sysent.c @@ -664,4 +664,6 @@ struct sysent freebsd32_sysent[] = { { .sy_narg = AS(inotify_rm_watch_args), .sy_call = (sy_call_t *)sys_inotify_rm_watch, .sy_auevent = AUE_INOTIFY, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 594 = inotify_rm_watch */ { .sy_narg = AS(getgroups_args), .sy_call = (sy_call_t *)sys_getgroups, .sy_auevent = AUE_GETGROUPS, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 595 = getgroups */ { .sy_narg = AS(setgroups_args), .sy_call = (sy_call_t *)sys_setgroups, .sy_auevent = AUE_SETGROUPS, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 596 = setgroups */ + { .sy_narg = AS(jail_attach_jd_args), .sy_call = (sy_call_t *)sys_jail_attach_jd, .sy_auevent = AUE_JAIL_ATTACH, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 597 = jail_attach_jd */ + { .sy_narg = AS(jail_remove_jd_args), .sy_call = (sy_call_t *)sys_jail_remove_jd, .sy_auevent = AUE_JAIL_REMOVE, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 598 = jail_remove_jd */ }; diff --git a/sys/compat/freebsd32/freebsd32_systrace_args.c b/sys/compat/freebsd32/freebsd32_systrace_args.c index e471c5148021..29a5497e9efa 100644 --- a/sys/compat/freebsd32/freebsd32_systrace_args.c +++ b/sys/compat/freebsd32/freebsd32_systrace_args.c @@ -3413,6 +3413,20 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args) *n_args = 2; break; } + /* jail_attach_jd */ + case 597: { + struct jail_attach_jd_args *p = params; + iarg[a++] = p->fd; /* int */ + *n_args = 1; + break; + } + /* jail_remove_jd */ + case 598: { + struct jail_remove_jd_args *p = params; + iarg[a++] = p->fd; /* int */ + *n_args = 1; + break; + } default: *n_args = 0; break; @@ -9222,6 +9236,26 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) break; }; break; + /* 
jail_attach_jd */ + case 597: + switch (ndx) { + case 0: + p = "int"; + break; + default: + break; + }; + break; + /* jail_remove_jd */ + case 598: + switch (ndx) { + case 0: + p = "int"; + break; + default: + break; + }; + break; default: break; }; @@ -11130,6 +11164,16 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) if (ndx == 0 || ndx == 1) p = "int"; break; + /* jail_attach_jd */ + case 597: + if (ndx == 0 || ndx == 1) + p = "int"; + break; + /* jail_remove_jd */ + case 598: + if (ndx == 0 || ndx == 1) + p = "int"; + break; default: break; }; diff --git a/sys/conf/files b/sys/conf/files index d89813c70355..9661bafea8f9 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -3808,6 +3808,7 @@ kern/kern_hhook.c standard kern/kern_idle.c standard kern/kern_intr.c standard kern/kern_jail.c standard +kern/kern_jaildesc.c standard kern/kern_jailmeta.c standard kern/kern_kcov.c optional kcov \ compile-with "${NOSAN_C} ${MSAN_CFLAGS}" diff --git a/sys/dev/cyapa/cyapa.c b/sys/dev/cyapa/cyapa.c index 50fa4faa560a..ed755f992949 100644 --- a/sys/dev/cyapa/cyapa.c +++ b/sys/dev/cyapa/cyapa.c @@ -761,42 +761,60 @@ again: /* * Generate report */ - c0 = 0; - if (delta_x < 0) - c0 |= 0x10; - if (delta_y < 0) - c0 |= 0x20; - c0 |= 0x08; - if (but & CYAPA_FNGR_LEFT) - c0 |= 0x01; - if (but & CYAPA_FNGR_MIDDLE) - c0 |= 0x04; - if (but & CYAPA_FNGR_RIGHT) - c0 |= 0x02; - - fifo_write_char(sc, &sc->rfifo, c0); - fifo_write_char(sc, &sc->rfifo, (uint8_t)delta_x); - fifo_write_char(sc, &sc->rfifo, (uint8_t)delta_y); - switch(sc->zenabled) { - case 1: - /* Z axis all 8 bits */ - fifo_write_char(sc, &sc->rfifo, (uint8_t)delta_z); - break; - case 2: - /* - * Z axis low 4 bits + 4th button and 5th button - * (high 2 bits must be left 0). Auto-scale - * delta_z to fit to avoid a wrong-direction - * overflow (don't try to retain the remainder). 
- */ - while (delta_z > 7 || delta_z < -8) - delta_z >>= 1; - c0 = (uint8_t)delta_z & 0x0F; + if (sc->mode.level == 1) { + c0 = MOUSE_SYS_SYNC; + if (but & CYAPA_FNGR_LEFT) + c0 |= MOUSE_SYS_BUTTON1UP; + if (but & CYAPA_FNGR_MIDDLE) + c0 |= MOUSE_SYS_BUTTON2UP; + if (but & CYAPA_FNGR_RIGHT) + c0 |= MOUSE_SYS_BUTTON3UP; fifo_write_char(sc, &sc->rfifo, c0); - break; - default: - /* basic PS/2 */ - break; + fifo_write_char(sc, &sc->rfifo, delta_x >> 1); + fifo_write_char(sc, &sc->rfifo, delta_y >> 1); + fifo_write_char(sc, &sc->rfifo, delta_x - (delta_x >> 1)); + fifo_write_char(sc, &sc->rfifo, delta_y - (delta_y >> 1)); + fifo_write_char(sc, &sc->rfifo, delta_z >> 1); + fifo_write_char(sc, &sc->rfifo, delta_z - (delta_z >> 1)); + fifo_write_char(sc, &sc->rfifo, MOUSE_SYS_EXTBUTTONS); + } else { + c0 = 0; + if (delta_x < 0) + c0 |= 0x10; + if (delta_y < 0) + c0 |= 0x20; + c0 |= 0x08; + if (but & CYAPA_FNGR_LEFT) + c0 |= 0x01; + if (but & CYAPA_FNGR_MIDDLE) + c0 |= 0x04; + if (but & CYAPA_FNGR_RIGHT) + c0 |= 0x02; + + fifo_write_char(sc, &sc->rfifo, c0); + fifo_write_char(sc, &sc->rfifo, (uint8_t)delta_x); + fifo_write_char(sc, &sc->rfifo, (uint8_t)delta_y); + switch(sc->zenabled) { + case 1: + /* Z axis all 8 bits */ + fifo_write_char(sc, &sc->rfifo, (uint8_t)delta_z); + break; + case 2: + /* + * Z axis low 4 bits + 4th button and 5th button + * (high 2 bits must be left 0). Auto-scale + * delta_z to fit to avoid a wrong-direction + * overflow (don't try to retain the remainder). 
+ */ + while (delta_z > 7 || delta_z < -8) + delta_z >>= 1; + c0 = (uint8_t)delta_z & 0x0F; + fifo_write_char(sc, &sc->rfifo, c0); + break; + default: + /* basic PS/2 */ + break; + } } cyapa_notify(sc); } @@ -1205,6 +1223,11 @@ cyapaioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread ((mousemode_t *)data)->packetsize = MOUSE_PS2_PACKETSIZE; break; + case 1: + ((mousemode_t *)data)->protocol = MOUSE_PROTO_SYSMOUSE; + ((mousemode_t *)data)->packetsize = + MOUSE_SYS_PACKETSIZE; + break; case 2: ((mousemode_t *)data)->protocol = MOUSE_PROTO_PS2; ((mousemode_t *)data)->packetsize = @@ -1223,7 +1246,7 @@ cyapaioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread error = EINVAL; break; } - sc->mode.level = *(int *)data ? 2 : 0; + sc->mode.level = *(int *)data; sc->zenabled = sc->mode.level ? 1 : 0; break; diff --git a/sys/dev/sound/pci/hda/hdac.c b/sys/dev/sound/pci/hda/hdac.c index 900578b73de4..90cd74d28b3d 100644 --- a/sys/dev/sound/pci/hda/hdac.c +++ b/sys/dev/sound/pci/hda/hdac.c @@ -1773,17 +1773,17 @@ hdac_detach(device_t dev) struct hdac_softc *sc = device_get_softc(dev); int i, error; + callout_drain(&sc->poll_callout); + hdac_irq_free(sc); + taskqueue_drain(taskqueue_thread, &sc->unsolq_task); + error = bus_generic_detach(dev); if (error != 0) return (error); hdac_lock(sc); - callout_stop(&sc->poll_callout); hdac_reset(sc, false); hdac_unlock(sc); - callout_drain(&sc->poll_callout); - taskqueue_drain(taskqueue_thread, &sc->unsolq_task); - hdac_irq_free(sc); for (i = 0; i < sc->num_ss; i++) hdac_dma_free(sc, &sc->streams[i].bdl); @@ -2206,4 +2206,4 @@ static driver_t hdac_driver = { sizeof(struct hdac_softc), }; -DRIVER_MODULE(snd_hda, pci, hdac_driver, NULL, NULL); +DRIVER_MODULE_ORDERED(snd_hda, pci, hdac_driver, NULL, NULL, SI_ORDER_ANY); diff --git a/sys/dev/virtio/network/if_vtnet.c b/sys/dev/virtio/network/if_vtnet.c index 4f19af6281a3..73f27ac147ff 100644 --- a/sys/dev/virtio/network/if_vtnet.c +++ 
b/sys/dev/virtio/network/if_vtnet.c @@ -1178,6 +1178,7 @@ vtnet_setup_interface(struct vtnet_softc *sc) if (sc->vtnet_max_mtu >= ETHERMTU_JUMBO) if_setcapabilitiesbit(ifp, IFCAP_JUMBO_MTU, 0); if_setcapabilitiesbit(ifp, IFCAP_VLAN_MTU, 0); + if_setcapabilitiesbit(ifp, IFCAP_HWSTATS, 0); /* * Capabilities after here are not enabled by default. @@ -3036,16 +3037,14 @@ vtnet_get_counter(if_t ifp, ift_counter cnt) return (rxaccum.vrxs_iqdrops); case IFCOUNTER_IERRORS: return (rxaccum.vrxs_ierrors); + case IFCOUNTER_IBYTES: + return (rxaccum.vrxs_ibytes); case IFCOUNTER_OPACKETS: return (txaccum.vtxs_opackets); case IFCOUNTER_OBYTES: - if (!VTNET_ALTQ_ENABLED) - return (txaccum.vtxs_obytes); - /* FALLTHROUGH */ + return (txaccum.vtxs_obytes); case IFCOUNTER_OMCASTS: - if (!VTNET_ALTQ_ENABLED) - return (txaccum.vtxs_omcasts); - /* FALLTHROUGH */ + return (txaccum.vtxs_omcasts); default: return (if_get_counter_default(ifp, cnt)); } diff --git a/sys/i386/conf/GENERIC b/sys/i386/conf/GENERIC index 88b8967cd693..f426c3d11874 100644 --- a/sys/i386/conf/GENERIC +++ b/sys/i386/conf/GENERIC @@ -89,8 +89,6 @@ options RCTL # Resource limits # Debugging support. Always need this: options KDB # Enable kernel debugger support. options KDB_TRACE # Print a stack trace for a panic. -# For full debugger support use (turn off in stable branch): -include "std.debug" # Kernel dump features. options EKCD # Support for encrypted kernel dumps diff --git a/sys/i386/conf/GENERIC-NODEBUG b/sys/i386/conf/GENERIC-NODEBUG deleted file mode 100644 index a93304481b5f..000000000000 --- a/sys/i386/conf/GENERIC-NODEBUG +++ /dev/null @@ -1,33 +0,0 @@ -# -# GENERIC-NODEBUG -- WITNESS and INVARIANTS free kernel configuration file -# for FreeBSD/i386 -# -# This configuration file removes several debugging options, including -# WITNESS and INVARIANTS checking, which are known to have significant -# performance impact on running systems. 
When benchmarking new features -# this kernel should be used instead of the standard GENERIC. -# This kernel configuration should never appear outside of the HEAD -# of the FreeBSD tree. -# -# For more information on this file, please read the config(5) manual page, -# and/or the handbook section on Kernel Configuration Files: -# -# https://docs.freebsd.org/en/books/handbook/kernelconfig/#kernelconfig-config -# -# The handbook is also available locally in /usr/share/doc/handbook -# if you've installed the doc distribution, otherwise always see the -# FreeBSD World Wide Web server (https://www.FreeBSD.org/) for the -# latest information. -# -# An exhaustive list of options and more detailed explanations of the -# device lines is also present in the ../../conf/NOTES and NOTES files. -# If you are in doubt as to the purpose or necessity of a line, check first -# in NOTES. -# - -#NO_UNIVERSE - -include GENERIC -include "std.nodebug" - -ident GENERIC-NODEBUG diff --git a/sys/i386/conf/MINIMAL b/sys/i386/conf/MINIMAL index 8019617ca4d4..6b70c4e59825 100644 --- a/sys/i386/conf/MINIMAL +++ b/sys/i386/conf/MINIMAL @@ -83,8 +83,6 @@ options INCLUDE_CONFIG_FILE # Include this file in kernel # Debugging support. Always need this: options KDB # Enable kernel debugger support. options KDB_TRACE # Print a stack trace for a panic. -# For full debugger support use (turn off in stable branch): -include "std.debug" # Make an SMP-capable kernel by default options SMP # Symmetric MultiProcessor Kernel diff --git a/sys/i386/conf/MINIMAL-NODEBUG b/sys/i386/conf/MINIMAL-NODEBUG deleted file mode 100644 index 7b7c22bbcaf6..000000000000 --- a/sys/i386/conf/MINIMAL-NODEBUG +++ /dev/null @@ -1,11 +0,0 @@ -# -# MINIMAL-NODEBUG -- Non-debug MINIMAL kernel. -# -# This is the MINIMAL equivalent to GENERIC-NODEBUG. 
- -#NO_UNIVERSE - -include MINIMAL -include "std.nodebug" - -ident MINIMAL-NODEBUG diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c index fcd232cde21e..e42e7dcf8b44 100644 --- a/sys/kern/init_sysent.c +++ b/sys/kern/init_sysent.c @@ -663,4 +663,6 @@ struct sysent sysent[] = { { .sy_narg = AS(inotify_rm_watch_args), .sy_call = (sy_call_t *)sys_inotify_rm_watch, .sy_auevent = AUE_INOTIFY, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 594 = inotify_rm_watch */ { .sy_narg = AS(getgroups_args), .sy_call = (sy_call_t *)sys_getgroups, .sy_auevent = AUE_GETGROUPS, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 595 = getgroups */ { .sy_narg = AS(setgroups_args), .sy_call = (sy_call_t *)sys_setgroups, .sy_auevent = AUE_SETGROUPS, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 596 = setgroups */ + { .sy_narg = AS(jail_attach_jd_args), .sy_call = (sy_call_t *)sys_jail_attach_jd, .sy_auevent = AUE_JAIL_ATTACH, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 597 = jail_attach_jd */ + { .sy_narg = AS(jail_remove_jd_args), .sy_call = (sy_call_t *)sys_jail_remove_jd, .sy_auevent = AUE_JAIL_REMOVE, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 598 = jail_remove_jd */ }; diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c index a27ab33b34da..057235574eb5 100644 --- a/sys/kern/kern_descrip.c +++ b/sys/kern/kern_descrip.c @@ -5250,6 +5250,8 @@ file_type_to_name(short type) return ("eventfd"); case DTYPE_TIMERFD: return ("timerfd"); + case DTYPE_JAILDESC: + return ("jail"); default: return ("unkn"); } diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c index eb77a5064113..501adc151d44 100644 --- a/sys/kern/kern_event.c +++ b/sys/kern/kern_event.c @@ -50,6 +50,7 @@ #include <sys/filedesc.h> #include <sys/filio.h> #include <sys/fcntl.h> +#include <sys/jail.h> #include <sys/kthread.h> #include <sys/selinfo.h> #include <sys/queue.h> @@ -163,6 +164,9 @@ static int filt_kqueue(struct knote *kn, long hint); static int 
filt_procattach(struct knote *kn); static void filt_procdetach(struct knote *kn); static int filt_proc(struct knote *kn, long hint); +static int filt_jailattach(struct knote *kn); +static void filt_jaildetach(struct knote *kn); +static int filt_jail(struct knote *kn, long hint); static int filt_fileattach(struct knote *kn); static void filt_timerexpire(void *knx); static void filt_timerexpire_l(struct knote *kn, bool proc_locked); @@ -195,6 +199,12 @@ static const struct filterops proc_filtops = { .f_detach = filt_procdetach, .f_event = filt_proc, }; +static const struct filterops jail_filtops = { + .f_isfd = 0, + .f_attach = filt_jailattach, + .f_detach = filt_jaildetach, + .f_event = filt_jail, +}; static const struct filterops timer_filtops = { .f_isfd = 0, .f_attach = filt_timerattach, @@ -365,6 +375,7 @@ static struct { [~EVFILT_USER] = { &user_filtops, 1 }, [~EVFILT_SENDFILE] = { &null_filtops }, [~EVFILT_EMPTY] = { &file_filtops, 1 }, + [~EVFILT_JAIL] = { &jail_filtops, 1 }, }; /* @@ -528,7 +539,8 @@ filt_proc(struct knote *kn, long hint) * process forked. Additionally, for each knote attached to the * parent, check whether user wants to track the new process. If so * attach a new knote to it, and immediately report an event with the - * child's pid. + * child's pid. This is also called on jail creation, which is treated + * the same way by jail events. */ void knote_fork(struct knlist *list, int pid) @@ -555,6 +567,8 @@ knote_fork(struct knlist *list, int pid) /* * The same as knote(), activate the event. 
*/ + _Static_assert(NOTE_JAIL_CHILD == NOTE_FORK, + "NOTE_JAIL_CHILD should be the same as NOTE_FORK"); if ((kn->kn_sfflags & NOTE_TRACK) == 0) { if (kn->kn_fop->f_event(kn, NOTE_FORK)) KNOTE_ACTIVATE(kn, 1); @@ -614,6 +628,124 @@ knote_fork(struct knlist *list, int pid) } } +int +filt_jailattach(struct knote *kn) +{ + struct prison *pr; + bool immediate; + + immediate = false; + if (kn->kn_id == 0) { + /* Let jid=0 watch the current prison (including prison0). */ + pr = curthread->td_ucred->cr_prison; + mtx_lock(&pr->pr_mtx); + } else if (kn->kn_flags & (EV_FLAG1 | EV_FLAG2)) { + /* + * The kernel registers prisons before they are valid, + * so prison_find_child will fail. + */ + TAILQ_FOREACH(pr, &allprison, pr_list) { + if (pr->pr_id < kn->kn_id) + continue; + if (pr->pr_id > kn->kn_id) { + pr = NULL; + break; + } + mtx_lock(&pr->pr_mtx); + break; + } + if (pr == NULL) + return (ENOENT); + } else { + sx_slock(&allprison_lock); + pr = prison_find_child(curthread->td_ucred->cr_prison, + kn->kn_id); + sx_sunlock(&allprison_lock); + if (pr == NULL) + return (ENOENT); + if (!prison_isalive(pr)) { + mtx_unlock(&pr->pr_mtx); + return (ENOENT); + } + } + kn->kn_ptr.p_prison = pr; + kn->kn_flags |= EV_CLEAR; + + /* + * Internal flag indicating registration done by kernel for the + * purposes of getting a NOTE_CHILD notification. + */ + if (kn->kn_flags & EV_FLAG2) { + kn->kn_flags &= ~EV_FLAG2; + kn->kn_data = kn->kn_sdata; /* parent id */ + kn->kn_fflags = NOTE_CHILD; + kn->kn_sfflags &= ~NOTE_JAIL_CTRLMASK; + immediate = true; /* Force immediate activation of child note. */ + } + /* + * Internal flag indicating registration done by kernel (for other than + * NOTE_CHILD). + */ + if (kn->kn_flags & EV_FLAG1) { + kn->kn_flags &= ~EV_FLAG1; + } + + knlist_add(pr->pr_klist, kn, 1); + + /* Immediately activate any child notes. 
*/ + if (immediate) + KNOTE_ACTIVATE(kn, 0); + + mtx_unlock(&pr->pr_mtx); + return (0); +} + +void +filt_jaildetach(struct knote *kn) +{ + if (kn->kn_ptr.p_prison != NULL) { + knlist_remove(kn->kn_knlist, kn, 0); + kn->kn_ptr.p_prison = NULL; + } else + kn->kn_status |= KN_DETACHED; +} + +int +filt_jail(struct knote *kn, long hint) +{ + struct prison *pr; + u_int event; + + pr = kn->kn_ptr.p_prison; + if (pr == NULL) /* already activated, from attach filter */ + return (0); + + /* Mask off extra data. */ + event = (u_int)hint & NOTE_JAIL_CTRLMASK; + + /* If the user is interested in this event, record it. */ + if (kn->kn_sfflags & event) + kn->kn_fflags |= event; + + /* Report the attached process id. */ + if (event == NOTE_JAIL_ATTACH) { + if (kn->kn_data != 0) + kn->kn_fflags |= NOTE_JAIL_ATTACH_MULTI; + kn->kn_data = hint & NOTE_JAIL_DATAMASK; + } + + /* Prison is gone, so flag the event as finished. */ + if (event == NOTE_JAIL_REMOVE) { + kn->kn_flags |= EV_EOF | EV_ONESHOT; + kn->kn_ptr.p_prison = NULL; + if (kn->kn_fflags == 0) + kn->kn_flags |= EV_DROP; + return (1); + } + + return (kn->kn_fflags != 0); +} + /* * XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the * interval timer support code. @@ -1597,8 +1729,8 @@ findkn: /* * If possible, find an existing knote to use for this kevent. */ - if (kev->filter == EVFILT_PROC && - (kev->flags & (EV_FLAG1 | EV_FLAG2)) != 0) { + if ((kev->filter == EVFILT_PROC || kev->filter == EVFILT_JAIL) + && (kev->flags & (EV_FLAG1 | EV_FLAG2)) != 0) { /* This is an internal creation of a process tracking * note. Don't attempt to coalesce this with an * existing note. 
@@ -2800,6 +2932,7 @@ knote_init(void) knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); ast_register(TDA_KQUEUE, ASTR_ASTF_REQUIRED, 0, ast_kqueue); + prison0.pr_klist = knlist_alloc(&prison0.pr_mtx); } SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL); diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c index 7c9a15ae18f3..5a1fbe23ddeb 100644 --- a/sys/kern/kern_jail.c +++ b/sys/kern/kern_jail.c @@ -39,15 +39,18 @@ #include <sys/kernel.h> #include <sys/systm.h> #include <sys/errno.h> +#include <sys/file.h> #include <sys/sysproto.h> #include <sys/malloc.h> #include <sys/osd.h> #include <sys/priv.h> #include <sys/proc.h> #include <sys/epoch.h> +#include <sys/event.h> #include <sys/taskqueue.h> #include <sys/fcntl.h> #include <sys/jail.h> +#include <sys/jaildesc.h> #include <sys/linker.h> #include <sys/lock.h> #include <sys/mman.h> @@ -154,7 +157,8 @@ static void prison_complete(void *context, int pending); static void prison_deref(struct prison *pr, int flags); static void prison_deref_kill(struct prison *pr, struct prisonlist *freeprison); static int prison_lock_xlock(struct prison *pr, int flags); -static void prison_cleanup(struct prison *pr); +static void prison_cleanup_locked(struct prison *pr); +static void prison_cleanup_unlocked(struct prison *pr); static void prison_free_not_last(struct prison *pr); static void prison_proc_free_not_last(struct prison *pr); static void prison_proc_relink(struct prison *opr, struct prison *npr, @@ -167,6 +171,7 @@ static void prison_racct_attach(struct prison *pr); static void prison_racct_modify(struct prison *pr); static void prison_racct_detach(struct prison *pr); #endif +static void prison_knote(struct prison *pr, long hint); /* Flags for prison_deref */ #define PD_DEREF 0x01 /* Decrement pr_ref */ @@ -985,6 +990,8 @@ prison_ip_cnt(const struct prison *pr, const pr_family_t af) int kern_jail_set(struct thread *td, struct uio *optuio, int flags) { + struct 
file *jfp_out; + struct jaildesc *desc_in; struct nameidata nd; #ifdef INET struct prison_ip *ip4; @@ -995,6 +1002,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) struct vfsopt *opt; struct vfsoptlist *opts; struct prison *pr, *deadpr, *dinspr, *inspr, *mypr, *ppr, *tpr; + struct ucred *jdcred; struct vnode *root; char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid; char *g_path, *osrelstr; @@ -1008,7 +1016,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) int created, cuflags, descend, drflags, enforce; int error, errmsg_len, errmsg_pos; int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel; - int deadid, jid, jsys, len, level; + int deadid, jfd_in, jfd_out, jfd_pos, jid, jsys, len, level; int childmax, osreldt, rsnum, slevel; #ifdef INET int ip4s; @@ -1018,22 +1026,32 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) int ip6s; bool redo_ip6; #endif + bool maybe_changed; uint64_t pr_allow, ch_allow, pr_flags, ch_flags; uint64_t pr_allow_diff; unsigned tallow; char numbuf[12]; - error = priv_check(td, PRIV_JAIL_SET); - if (!error && (flags & JAIL_ATTACH)) - error = priv_check(td, PRIV_JAIL_ATTACH); - if (error) - return (error); mypr = td->td_ucred->cr_prison; - if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0) + if (((flags & (JAIL_CREATE | JAIL_AT_DESC)) == JAIL_CREATE) + && mypr->pr_childmax == 0) return (EPERM); if (flags & ~JAIL_SET_MASK) return (EINVAL); + if ((flags & (JAIL_USE_DESC | JAIL_AT_DESC)) + == (JAIL_USE_DESC | JAIL_AT_DESC)) + return (EINVAL); + prison_hold(mypr); +#ifdef INET + ip4 = NULL; +#endif +#ifdef INET6 + ip6 = NULL; +#endif + g_path = NULL; + jfp_out = NULL; + jfd_out = -1; /* * Check all the parameters before committing to anything. Not all * errors can be caught early, but we may as well try. 
Also, this @@ -1046,14 +1064,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) */ error = vfs_buildopts(optuio, &opts); if (error) - return (error); -#ifdef INET - ip4 = NULL; -#endif -#ifdef INET6 - ip6 = NULL; -#endif - g_path = NULL; + goto done_free; cuflags = flags & (JAIL_CREATE | JAIL_UPDATE); if (!cuflags) { @@ -1062,6 +1073,72 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) goto done_errmsg; } + error = vfs_copyopt(opts, "desc", &jfd_in, sizeof(jfd_in)); + if (error == ENOENT) { + if (flags & (JAIL_USE_DESC | JAIL_AT_DESC | JAIL_GET_DESC | + JAIL_OWN_DESC)) { + vfs_opterror(opts, "missing desc"); + goto done_errmsg; + } + jfd_in = -1; + } else if (error != 0) + goto done_free; + else { + if (!(flags & (JAIL_USE_DESC | JAIL_AT_DESC | JAIL_GET_DESC | + JAIL_OWN_DESC))) { + vfs_opterror(opts, "unexpected desc"); + goto done_errmsg; + } + if (flags & JAIL_AT_DESC) { + /* + * Look up and create jails based on the + * descriptor's prison. + */ + prison_free(mypr); + error = jaildesc_find(td, jfd_in, &desc_in, &mypr, + NULL); + if (error != 0) { + vfs_opterror(opts, error == ENOENT + ? "descriptor to dead jail" + : "not a jail descriptor"); + goto done_errmsg; + } + /* + * Check file permissions using the current + * credentials, and operation permissions + * using the descriptor's credentials. + */ + error = vaccess(VREG, desc_in->jd_mode, desc_in->jd_uid, + desc_in->jd_gid, VEXEC, td->td_ucred); + JAILDESC_UNLOCK(desc_in); + if (error != 0) + goto done_free; + if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0) { + error = EPERM; + goto done_free; + } + } + if (flags & (JAIL_GET_DESC | JAIL_OWN_DESC)) { + /* Allocate a jail descriptor to return later. */ + error = jaildesc_alloc(td, &jfp_out, &jfd_out, + flags & JAIL_OWN_DESC); + if (error) + goto done_free; + } + } + + /* + * Delay the permission check if using a jail descriptor, + * until we get the descriptor's credentials. 
+ */ + if (!(flags & JAIL_USE_DESC)) { + error = priv_check(td, PRIV_JAIL_SET); + if (error == 0 && (flags & JAIL_ATTACH)) + error = priv_check(td, PRIV_JAIL_ATTACH); + if (error) + goto done_free; + } + error = vfs_copyopt(opts, "jid", &jid, sizeof(jid)); if (error == ENOENT) jid = 0; @@ -1422,6 +1499,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) pr = NULL; inspr = NULL; deadpr = NULL; + maybe_changed = false; if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) { namelc = strrchr(name, '.'); jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10); @@ -1436,7 +1514,57 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) error = EAGAIN; goto done_deref; } - if (jid != 0) { + if (flags & JAIL_USE_DESC) { + /* Get the jail from its descriptor. */ + error = jaildesc_find(td, jfd_in, &desc_in, &pr, &jdcred); + if (error) { + vfs_opterror(opts, error == ENOENT + ? "descriptor to dead jail" + : "not a jail descriptor"); + goto done_deref; + } + drflags |= PD_DEREF; + /* + * Check file permissions using the current credentials, + * and operation permissions using the descriptor's + * credentials. + */ + error = vaccess(VREG, desc_in->jd_mode, desc_in->jd_uid, + desc_in->jd_gid, VWRITE, td->td_ucred); + if (error == 0 && (flags & JAIL_ATTACH)) + error = vaccess(VREG, desc_in->jd_mode, desc_in->jd_uid, + desc_in->jd_gid, VEXEC, td->td_ucred); + JAILDESC_UNLOCK(desc_in); + if (error == 0) + error = priv_check_cred(jdcred, PRIV_JAIL_SET); + if (error == 0 && (flags & JAIL_ATTACH)) + error = priv_check_cred(jdcred, PRIV_JAIL_ATTACH); + crfree(jdcred); + if (error) + goto done_deref; + mtx_lock(&pr->pr_mtx); + drflags |= PD_LOCKED; + if (cuflags == JAIL_CREATE) { + error = EEXIST; + vfs_opterror(opts, "jail %d already exists", + pr->pr_id); + goto done_deref; + } + if (!prison_isalive(pr)) { + /* While a jid can be resurrected, the prison + * itself cannot. 
+ */ + error = ENOENT; + vfs_opterror(opts, "jail %d is dying", pr->pr_id); + goto done_deref; + } + if (jid != 0 && jid != pr->pr_id) { + error = EINVAL; + vfs_opterror(opts, "cannot change jid"); + goto done_deref; + } + jid = pr->pr_id; + } else if (jid != 0) { if (jid < 0) { error = EINVAL; vfs_opterror(opts, "negative jid"); @@ -1570,7 +1698,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) } } } - /* Update: must provide a jid or name. */ + /* Update: must provide a desc, jid, or name. */ else if (cuflags == JAIL_UPDATE && pr == NULL) { error = ENOENT; vfs_opterror(opts, "update specified no jail"); @@ -1643,6 +1771,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling); for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent) tpr->pr_childcount++; + pr->pr_klist = knlist_alloc(&pr->pr_mtx); /* Set some default values, and inherit some from the parent. */ if (namelc == NULL) @@ -1722,8 +1851,10 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) * Grab a reference for existing prisons, to ensure they * continue to exist for the duration of the call. */ - prison_hold(pr); - drflags |= PD_DEREF; + if (!(drflags & PD_DEREF)) { + prison_hold(pr); + drflags |= PD_DEREF; + } #if defined(VIMAGE) && (defined(INET) || defined(INET6)) if ((pr->pr_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) { @@ -1880,6 +2011,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) goto done_deref; } } + maybe_changed = true; /* Set the parameters of the prison. */ #ifdef INET @@ -2112,7 +2244,12 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) * reference via persistence, or is about to gain one via attachment. 
*/ if (created) { - drflags = prison_lock_xlock(pr, drflags); + sx_assert(&allprison_lock, SX_XLOCKED); + mtx_lock(&ppr->pr_mtx); + knote_fork(ppr->pr_klist, pr->pr_id); + mtx_unlock(&ppr->pr_mtx); + mtx_lock(&pr->pr_mtx); + drflags |= PD_LOCKED; pr->pr_state = PRISON_STATE_ALIVE; } @@ -2146,10 +2283,37 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) printf("Warning jail jid=%d: mountd/nfsd requires a separate" " file system\n", pr->pr_id); + /* + * Now that the prison is fully created without error, set the + * jail descriptor if one was requested. This is the only + * parameter that is returned to the caller (except the error + * message). + */ + if (jfd_out >= 0) { + if (!(drflags & PD_LOCKED)) { + mtx_lock(&pr->pr_mtx); + drflags |= PD_LOCKED; + } + jfd_pos = 2 * vfs_getopt_pos(opts, "desc") + 1; + if (optuio->uio_segflg == UIO_SYSSPACE) + *(int*)optuio->uio_iov[jfd_pos].iov_base = jfd_out; + else + (void)copyout(&jfd_out, + optuio->uio_iov[jfd_pos].iov_base, sizeof(jfd_out)); + jaildesc_set_prison(jfp_out, pr); + } + drflags &= ~PD_KILL; td->td_retval[0] = pr->pr_id; done_deref: + /* + * Report changes to kevent. This can happen even if the + * system call fails, as changes might have been made before + * the failure. + */ + if (maybe_changed && !created) + prison_knote(pr, NOTE_JAIL_SET); /* Release any temporary prison holds and/or locks. */ if (pr != NULL) prison_deref(pr, drflags); @@ -2176,15 +2340,21 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) } } done_free: + /* Clean up other resources. 
*/ #ifdef INET prison_ip_free(ip4); #endif #ifdef INET6 prison_ip_free(ip6); #endif + if (jfp_out != NULL) + fdrop(jfp_out, td); + if (error && jfd_out >= 0) + (void)kern_close(td, jfd_out); if (g_path != NULL) free(g_path, M_TEMP); vfs_freeopts(opts); + prison_free(mypr); return (error); } @@ -2329,16 +2499,22 @@ int kern_jail_get(struct thread *td, struct uio *optuio, int flags) { struct bool_flags *bf; + struct file *jfp_out; + struct jaildesc *desc_in; struct jailsys_flags *jsf; struct prison *pr, *mypr; struct vfsopt *opt; struct vfsoptlist *opts; char *errmsg, *name; int drflags, error, errmsg_len, errmsg_pos, i, jid, len, pos; + int jfd_in, jfd_out; unsigned f; if (flags & ~JAIL_GET_MASK) return (EINVAL); + if ((flags & (JAIL_USE_DESC | JAIL_AT_DESC)) + == (JAIL_USE_DESC | JAIL_AT_DESC)) + return (EINVAL); /* Get the parameter list. */ error = vfs_buildopts(optuio, &opts); @@ -2346,13 +2522,81 @@ kern_jail_get(struct thread *td, struct uio *optuio, int flags) return (error); errmsg_pos = vfs_getopt_pos(opts, "errmsg"); mypr = td->td_ucred->cr_prison; + prison_hold(mypr); pr = NULL; + jfp_out = NULL; + jfd_out = -1; /* - * Find the prison specified by one of: lastjid, jid, name. + * Find the prison specified by one of: desc, lastjid, jid, name. */ sx_slock(&allprison_lock); drflags = PD_LIST_SLOCKED; + + error = vfs_copyopt(opts, "desc", &jfd_in, sizeof(jfd_in)); + if (error == ENOENT) { + if (flags & (JAIL_AT_DESC | JAIL_GET_DESC | JAIL_OWN_DESC)) { + vfs_opterror(opts, "missing desc"); + goto done; + } + } else if (error == 0) { + if (!(flags & (JAIL_USE_DESC | JAIL_AT_DESC | JAIL_GET_DESC | + JAIL_OWN_DESC))) { + vfs_opterror(opts, "unexpected desc"); + goto done; + } + if (flags & JAIL_USE_DESC) { + /* Get the jail from its descriptor. */ + error = jaildesc_find(td, jfd_in, &desc_in, &pr, NULL); + if (error) { + vfs_opterror(opts, error == ENOENT + ? 
"descriptor to dead jail" + : "not a jail descriptor"); + goto done; + } + drflags |= PD_DEREF; + error = vaccess(VREG, desc_in->jd_mode, desc_in->jd_uid, + desc_in->jd_gid, VREAD, td->td_ucred); + JAILDESC_UNLOCK(desc_in); + if (error != 0) + goto done; + mtx_lock(&pr->pr_mtx); + drflags |= PD_LOCKED; + if (!(prison_isalive(pr) || (flags & JAIL_DYING))) { + error = ENOENT; + vfs_opterror(opts, "jail %d is dying", + pr->pr_id); + goto done; + } + goto found_prison; + } + if (flags & JAIL_AT_DESC) { + /* Look up jails based on the descriptor's prison. */ + prison_free(mypr); + error = jaildesc_find(td, jfd_in, &desc_in, &mypr, + NULL); + if (error != 0) { + vfs_opterror(opts, error == ENOENT + ? "descriptor to dead jail" + : "not a jail descriptor"); + goto done; + } + error = vaccess(VREG, desc_in->jd_mode, desc_in->jd_uid, + desc_in->jd_gid, VEXEC, td->td_ucred); + JAILDESC_UNLOCK(desc_in); + if (error != 0) + goto done; + } + if (flags & (JAIL_GET_DESC | JAIL_OWN_DESC)) { + /* Allocate a jail descriptor to return later. */ + error = jaildesc_alloc(td, &jfp_out, &jfd_out, + flags & JAIL_OWN_DESC); + if (error) + goto done; + } + } else + goto done; + error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid)); if (error == 0) { TAILQ_FOREACH(pr, &allprison, pr_list) { @@ -2421,9 +2665,17 @@ kern_jail_get(struct thread *td, struct uio *optuio, int flags) found_prison: /* Get the parameters of the prison. 
*/ - prison_hold(pr); - drflags |= PD_DEREF; + if (!(drflags & PD_DEREF)) { + prison_hold(pr); + drflags |= PD_DEREF; + } td->td_retval[0] = pr->pr_id; + if (jfd_out >= 0) { + error = vfs_setopt(opts, "desc", &jfd_out, sizeof(jfd_out)); + if (error != 0 && error != ENOENT) + goto done; + jaildesc_set_prison(jfp_out, pr); + } error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id)); if (error != 0 && error != ENOENT) goto done; @@ -2603,6 +2855,13 @@ kern_jail_get(struct thread *td, struct uio *optuio, int flags) prison_deref(pr, drflags); else if (drflags & PD_LIST_SLOCKED) sx_sunlock(&allprison_lock); + else if (drflags & PD_LIST_XLOCKED) + sx_xunlock(&allprison_lock); + /* Clean up other resources. */ + if (jfp_out != NULL) + (void)fdrop(jfp_out, td); + if (error && jfd_out >= 0) + (void)kern_close(td, jfd_out); if (error && errmsg_pos >= 0) { /* Write the error message back to userspace. */ vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len); @@ -2619,6 +2878,7 @@ kern_jail_get(struct thread *td, struct uio *optuio, int flags) } } vfs_freeopts(opts); + prison_free(mypr); return (error); } @@ -2643,14 +2903,63 @@ sys_jail_remove(struct thread *td, struct jail_remove_args *uap) sx_xunlock(&allprison_lock); return (EINVAL); } + prison_hold(pr); + prison_remove(pr); + return (0); +} + +/* + * struct jail_remove_jd_args { + * int fd; + * }; + */ +int +sys_jail_remove_jd(struct thread *td, struct jail_remove_jd_args *uap) +{ + struct jaildesc *jd; + struct prison *pr; + struct ucred *jdcred; + int error; + + error = jaildesc_find(td, uap->fd, &jd, &pr, &jdcred); + if (error) + return (error); + /* + * Check file permissions using the current credentials, and + * operation permissions using the descriptor's credentials. 
+ */ + error = vaccess(VREG, jd->jd_mode, jd->jd_uid, jd->jd_gid, VWRITE, + td->td_ucred); + JAILDESC_UNLOCK(jd); + if (error == 0) + error = priv_check_cred(jdcred, PRIV_JAIL_REMOVE); + crfree(jdcred); + if (error) { + prison_free(pr); + return (error); + } + sx_xlock(&allprison_lock); + mtx_lock(&pr->pr_mtx); + prison_remove(pr); + return (0); +} + +/* + * Begin the removal process for a prison. The allprison lock should + * be held exclusively, and the prison should be both locked and held. + */ +void +prison_remove(struct prison *pr) +{ + sx_assert(&allprison_lock, SA_XLOCKED); + mtx_assert(&pr->pr_mtx, MA_OWNED); if (!prison_isalive(pr)) { /* Silently ignore already-dying prisons. */ mtx_unlock(&pr->pr_mtx); sx_xunlock(&allprison_lock); - return (0); + return; } - prison_deref(pr, PD_KILL | PD_LOCKED | PD_LIST_XLOCKED); - return (0); + prison_deref(pr, PD_KILL | PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED); } /* @@ -2685,6 +2994,53 @@ sys_jail_attach(struct thread *td, struct jail_attach_args *uap) return (do_jail_attach(td, pr, PD_LOCKED | PD_LIST_SLOCKED)); } +/* + * struct jail_attach_jd_args { + * int fd; + * }; + */ +int +sys_jail_attach_jd(struct thread *td, struct jail_attach_jd_args *uap) +{ + struct jaildesc *jd; + struct prison *pr; + struct ucred *jdcred; + int drflags, error; + + sx_slock(&allprison_lock); + drflags = PD_LIST_SLOCKED; + error = jaildesc_find(td, uap->fd, &jd, &pr, &jdcred); + if (error) + goto fail; + drflags |= PD_DEREF; + /* + * Check file permissions using the current credentials, and + * operation permissions using the descriptor's credentials. + */ + error = vaccess(VREG, jd->jd_mode, jd->jd_uid, jd->jd_gid, VEXEC, + td->td_ucred); + JAILDESC_UNLOCK(jd); + if (error == 0) + error = priv_check_cred(jdcred, PRIV_JAIL_ATTACH); + crfree(jdcred); + if (error) + goto fail; + mtx_lock(&pr->pr_mtx); + drflags |= PD_LOCKED; + + /* Do not allow a process to attach to a prison that is not alive. 
*/ + if (!prison_isalive(pr)) { + error = EINVAL; + goto fail; + } + + return (do_jail_attach(td, pr, drflags)); + + fail: + prison_deref(pr, drflags); + return (error); +} + static int do_jail_attach(struct thread *td, struct prison *pr, int drflags) { @@ -2703,9 +3059,12 @@ do_jail_attach(struct thread *td, struct prison *pr, int drflags) * a process root from one prison, but attached to the jail * of another. */ - prison_hold(pr); + if (!(drflags & PD_DEREF)) { + prison_hold(pr); + drflags |= PD_DEREF; + } refcount_acquire(&pr->pr_uref); - drflags |= PD_DEREF | PD_DEUREF; + drflags |= PD_DEUREF; mtx_unlock(&pr->pr_mtx); drflags &= ~PD_LOCKED; @@ -2755,6 +3114,7 @@ do_jail_attach(struct thread *td, struct prison *pr, int drflags) prison_proc_relink(oldcred->cr_prison, pr, p); prison_deref(oldcred->cr_prison, drflags); crfree(oldcred); + prison_knote(pr, NOTE_JAIL_ATTACH | td->td_proc->p_pid); /* * If the prison was killed while changing credentials, die along @@ -3182,9 +3542,10 @@ prison_deref(struct prison *pr, int flags) refcount_load(&prison0.pr_uref) > 0, ("prison0 pr_uref=0")); pr->pr_state = PRISON_STATE_DYING; + prison_cleanup_locked(pr); mtx_unlock(&pr->pr_mtx); flags &= ~PD_LOCKED; - prison_cleanup(pr); + prison_cleanup_unlocked(pr); } } } @@ -3327,8 +3688,9 @@ prison_deref_kill(struct prison *pr, struct prisonlist *freeprison) } if (!(cpr->pr_flags & PR_REMOVE)) continue; - prison_cleanup(cpr); + prison_cleanup_unlocked(cpr); mtx_lock(&cpr->pr_mtx); + prison_cleanup_locked(cpr); cpr->pr_flags &= ~PR_REMOVE; if (cpr->pr_flags & PR_PERSIST) { cpr->pr_flags &= ~PR_PERSIST; @@ -3363,8 +3725,9 @@ prison_deref_kill(struct prison *pr, struct prisonlist *freeprison) if (rpr != NULL) LIST_REMOVE(rpr, pr_sibling); - prison_cleanup(pr); + prison_cleanup_unlocked(pr); mtx_lock(&pr->pr_mtx); + prison_cleanup_locked(pr); if (pr->pr_flags & PR_PERSIST) { pr->pr_flags &= ~PR_PERSIST; prison_proc_free_not_last(pr); @@ -3411,10 +3774,22 @@ prison_lock_xlock(struct 
prison *pr, int flags) /* * Release a prison's resources when it starts dying (when the last user - * reference is dropped, or when it is killed). + * reference is dropped, or when it is killed). Two functions are called, + * for work that requires a locked prison or an unlocked one. */ static void -prison_cleanup(struct prison *pr) +prison_cleanup_locked(struct prison *pr) +{ + sx_assert(&allprison_lock, SA_XLOCKED); + mtx_assert(&pr->pr_mtx, MA_OWNED); + prison_knote(pr, NOTE_JAIL_REMOVE); + knlist_detach(pr->pr_klist); + jaildesc_prison_cleanup(pr); + pr->pr_klist = NULL; +} + +static void +prison_cleanup_unlocked(struct prison *pr) { sx_assert(&allprison_lock, SA_XLOCKED); mtx_assert(&pr->pr_mtx, MA_NOTOWNED); @@ -4616,6 +4991,7 @@ sysctl_jail_param(SYSCTL_HANDLER_ARGS) * jail creation time but cannot be changed in an existing jail. */ SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID"); +SYSCTL_JAIL_PARAM(, desc, CTLTYPE_INT | CTLFLAG_RW, "I", "Jail descriptor"); SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID"); SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name"); SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path"); @@ -5039,6 +5415,22 @@ prison_racct_detach(struct prison *pr) } #endif /* RACCT */ +/* + * Submit a knote for a prison, locking if necessary. + */ +static void +prison_knote(struct prison *pr, long hint) +{ + int locked; + + locked = mtx_owned(&pr->pr_mtx); + if (!locked) + mtx_lock(&pr->pr_mtx); + KNOTE_LOCKED(pr->pr_klist, hint); + if (!locked) + mtx_unlock(&pr->pr_mtx); +} + #ifdef DDB static void diff --git a/sys/kern/kern_jaildesc.c b/sys/kern/kern_jaildesc.c new file mode 100644 index 000000000000..e00ec9a4bfff --- /dev/null +++ b/sys/kern/kern_jaildesc.c @@ -0,0 +1,337 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 James Gritton. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/param.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/filedesc.h> +#include <sys/kernel.h> +#include <sys/jail.h> +#include <sys/jaildesc.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/mutex.h> +#include <sys/priv.h> +#include <sys/stat.h> +#include <sys/sysproto.h> +#include <sys/systm.h> +#include <sys/ucred.h> +#include <sys/vnode.h> + +MALLOC_DEFINE(M_JAILDESC, "jaildesc", "jail descriptors"); + +static fo_stat_t jaildesc_stat; +static fo_close_t jaildesc_close; +static fo_chmod_t jaildesc_chmod; +static fo_chown_t jaildesc_chown; +static fo_fill_kinfo_t jaildesc_fill_kinfo; +static fo_cmp_t jaildesc_cmp; + +static struct fileops jaildesc_ops = { + .fo_read = invfo_rdwr, + .fo_write = invfo_rdwr, + .fo_truncate = invfo_truncate, + .fo_ioctl = invfo_ioctl, + .fo_poll = invfo_poll, + .fo_kqfilter = invfo_kqfilter, + .fo_stat = jaildesc_stat, + .fo_close = jaildesc_close, + .fo_chmod = jaildesc_chmod, + .fo_chown = jaildesc_chown, + .fo_sendfile = invfo_sendfile, + .fo_fill_kinfo = jaildesc_fill_kinfo, + .fo_cmp = jaildesc_cmp, + .fo_flags = DFLAG_PASSABLE, +}; + +/* + * Given a jail descriptor number, return the jaildesc, its prison, + * and its credential. The jaildesc will be returned locked, and + * prison and the credential will be returned held. 
+ */ +int +jaildesc_find(struct thread *td, int fd, struct jaildesc **jdp, + struct prison **prp, struct ucred **ucredp) +{ + struct file *fp; + struct jaildesc *jd; + struct prison *pr; + int error; + + error = fget(td, fd, &cap_no_rights, &fp); + if (error != 0) + return (error); + if (fp->f_type != DTYPE_JAILDESC) { + error = EBADF; + goto out; + } + jd = fp->f_data; + JAILDESC_LOCK(jd); + pr = jd->jd_prison; + if (pr == NULL || !prison_isvalid(pr)) { + error = ENOENT; + JAILDESC_UNLOCK(jd); + goto out; + } + prison_hold(pr); + *prp = pr; + if (jdp != NULL) + *jdp = jd; + else + JAILDESC_UNLOCK(jd); + if (ucredp != NULL) + *ucredp = crhold(fp->f_cred); + out: + fdrop(fp, td); + return (error); +} + +/* + * Allocate a new jail decriptor, not yet associated with a prison. + * Return the file pointer (with a reference held) and the descriptor + * number. + */ +int +jaildesc_alloc(struct thread *td, struct file **fpp, int *fdp, int owning) +{ + struct file *fp; + struct jaildesc *jd; + int error; + mode_t mode; + + if (owning) { + error = priv_check(td, PRIV_JAIL_REMOVE); + if (error != 0) + return (error); + mode = S_ISTXT; + } else + mode = 0; + jd = malloc(sizeof(*jd), M_JAILDESC, M_WAITOK | M_ZERO); + error = falloc_caps(td, &fp, fdp, 0, NULL); + finit(fp, priv_check_cred(fp->f_cred, PRIV_JAIL_SET) == 0 + ? FREAD | FWRITE : FREAD, DTYPE_JAILDESC, jd, &jaildesc_ops); + if (error != 0) { + free(jd, M_JAILDESC); + return (error); + } + JAILDESC_LOCK_INIT(jd); + jd->jd_uid = fp->f_cred->cr_uid; + jd->jd_gid = fp->f_cred->cr_gid; + jd->jd_mode = S_IFREG | S_IRUSR | S_IRGRP | S_IROTH | mode + | (priv_check(td, PRIV_JAIL_SET) == 0 ? S_IWUSR | S_IXUSR : 0) + | (priv_check(td, PRIV_JAIL_ATTACH) == 0 ? S_IXUSR : 0); + *fpp = fp; + return (0); +} + +/* + * Assocate a jail descriptor with its prison. 
+ */ +void +jaildesc_set_prison(struct file *fp, struct prison *pr) +{ + struct jaildesc *jd; + + mtx_assert(&pr->pr_mtx, MA_OWNED); + jd = fp->f_data; + JAILDESC_LOCK(jd); + jd->jd_prison = pr; + LIST_INSERT_HEAD(&pr->pr_descs, jd, jd_list); + prison_hold(pr); + JAILDESC_UNLOCK(jd); +} + +/* + * Detach all jail descriptors from a prison. + */ +void +jaildesc_prison_cleanup(struct prison *pr) +{ + struct jaildesc *jd; + + mtx_assert(&pr->pr_mtx, MA_OWNED); + while ((jd = LIST_FIRST(&pr->pr_descs))) { + JAILDESC_LOCK(jd); + LIST_REMOVE(jd, jd_list); + jd->jd_prison = NULL; + JAILDESC_UNLOCK(jd); + prison_free(pr); + } +} + +static int +jaildesc_close(struct file *fp, struct thread *td) +{ + struct jaildesc *jd; + struct prison *pr; + + jd = fp->f_data; + fp->f_data = NULL; + if (jd != NULL) { + JAILDESC_LOCK(jd); + pr = jd->jd_prison; + if (pr != NULL) { + /* + * Free or remove the associated prison. + * This requires a second check after re- + * ordering locks. This jaildesc can remain + * unlocked once we have a prison reference, + * because that prison is the only place that + * still points back to it. + */ + prison_hold(pr); + JAILDESC_UNLOCK(jd); + if (jd->jd_mode & S_ISTXT) { + sx_xlock(&allprison_lock); + prison_lock(pr); + if (jd->jd_prison != NULL) { + /* + * Unlink the prison, but don't free + * it; that will be done as part 
+ */ + LIST_REMOVE(jd, jd_list); + prison_remove(pr); + } else { + prison_unlock(pr); + sx_xunlock(&allprison_lock); + } + } else { + prison_lock(pr); + if (jd->jd_prison != NULL) { + LIST_REMOVE(jd, jd_list); + prison_free(pr); + } + prison_unlock(pr); + } + prison_free(pr); + } + JAILDESC_LOCK_DESTROY(jd); + free(jd, M_JAILDESC); + } + finit(fp, 0, DTYPE_NONE, NULL, &badfileops); + return (0); +} + +static int +jaildesc_stat(struct file *fp, struct stat *sb, struct ucred *active_cred) +{ + struct jaildesc *jd; + + bzero(sb, sizeof(struct stat)); + jd = fp->f_data; + JAILDESC_LOCK(jd); + if (jd->jd_prison != NULL) { + sb->st_ino = jd->jd_prison ? jd->jd_prison->pr_id : 0; + sb->st_uid = jd->jd_uid; + sb->st_gid = jd->jd_gid; + sb->st_mode = jd->jd_mode; + } else + sb->st_mode = S_IFREG; + JAILDESC_UNLOCK(jd); + return (0); +} + +static int +jaildesc_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, + struct thread *td) +{ + struct jaildesc *jd; + int error; + + /* Reject permissions that the creator doesn't have. 
*/ + if (((mode & (S_IWUSR | S_IWGRP | S_IWOTH)) + && priv_check_cred(fp->f_cred, PRIV_JAIL_SET) != 0) + || ((mode & (S_IXUSR | S_IXGRP | S_IXOTH)) + && priv_check_cred(fp->f_cred, PRIV_JAIL_ATTACH) != 0 + && priv_check_cred(fp->f_cred, PRIV_JAIL_SET) != 0) + || ((mode & S_ISTXT) + && priv_check_cred(fp->f_cred, PRIV_JAIL_REMOVE) != 0)) + return (EPERM); + if (mode & (S_ISUID | S_ISGID)) + return (EINVAL); + jd = fp->f_data; + JAILDESC_LOCK(jd); + error = vaccess(VREG, jd->jd_mode, jd->jd_uid, jd->jd_gid, VADMIN, + active_cred); + if (error == 0) + jd->jd_mode = S_IFREG | (mode & ALLPERMS); + JAILDESC_UNLOCK(jd); + return (error); +} + +static int +jaildesc_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, + struct thread *td) +{ + struct jaildesc *jd; + int error; + + error = 0; + jd = fp->f_data; + JAILDESC_LOCK(jd); + if (uid == (uid_t)-1) + uid = jd->jd_uid; + if (gid == (gid_t)-1) + gid = jd->jd_gid; + if ((uid != jd->jd_uid && uid != active_cred->cr_uid) || + (gid != jd->jd_gid && !groupmember(gid, active_cred))) + error = priv_check_cred(active_cred, PRIV_VFS_CHOWN); + if (error == 0) { + jd->jd_uid = uid; + jd->jd_gid = gid; + } + JAILDESC_UNLOCK(jd); + return (error); +} + +static int +jaildesc_fill_kinfo(struct file *fp, struct kinfo_file *kif, + struct filedesc *fdp) +{ + return (EINVAL); +} + +static int +jaildesc_cmp(struct file *fp1, struct file *fp2, struct thread *td) +{ + struct jaildesc *jd1, *jd2; + int jid1, jid2; + + if (fp2->f_type != DTYPE_JAILDESC) + return (3); + jd1 = fp1->f_data; + JAILDESC_LOCK(jd1); + jid1 = jd1->jd_prison ? (uintptr_t)jd1->jd_prison->pr_id : 0; + JAILDESC_UNLOCK(jd1); + jd2 = fp2->f_data; + JAILDESC_LOCK(jd2); + jid2 = jd2->jd_prison ? 
(uintptr_t)jd2->jd_prison->pr_id : 0; + JAILDESC_UNLOCK(jd2); + return (kcmp_cmp(jid1, jid2)); +} diff --git a/sys/kern/syscalls.c b/sys/kern/syscalls.c index 4122f9261871..4cef89cd5219 100644 --- a/sys/kern/syscalls.c +++ b/sys/kern/syscalls.c @@ -602,4 +602,6 @@ const char *syscallnames[] = { "inotify_rm_watch", /* 594 = inotify_rm_watch */ "getgroups", /* 595 = getgroups */ "setgroups", /* 596 = setgroups */ + "jail_attach_jd", /* 597 = jail_attach_jd */ + "jail_remove_jd", /* 598 = jail_remove_jd */ }; diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master index fa64597d14a5..911f9093824b 100644 --- a/sys/kern/syscalls.master +++ b/sys/kern/syscalls.master @@ -3383,5 +3383,15 @@ _In_reads_(gidsetsize) const gid_t *gidset ); } +597 AUE_JAIL_ATTACH STD { + int jail_attach_jd( + int fd + ); + } +598 AUE_JAIL_REMOVE STD { + int jail_remove_jd( + int fd + ); + } ; vim: syntax=off diff --git a/sys/kern/systrace_args.c b/sys/kern/systrace_args.c index 2b1ea9eed8d4..e28fef931ea8 100644 --- a/sys/kern/systrace_args.c +++ b/sys/kern/systrace_args.c @@ -3500,6 +3500,20 @@ systrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args) *n_args = 2; break; } + /* jail_attach_jd */ + case 597: { + struct jail_attach_jd_args *p = params; + iarg[a++] = p->fd; /* int */ + *n_args = 1; + break; + } + /* jail_remove_jd */ + case 598: { + struct jail_remove_jd_args *p = params; + iarg[a++] = p->fd; /* int */ + *n_args = 1; + break; + } default: *n_args = 0; break; @@ -9367,6 +9381,26 @@ systrace_entry_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) break; }; break; + /* jail_attach_jd */ + case 597: + switch (ndx) { + case 0: + p = "int"; + break; + default: + break; + }; + break; + /* jail_remove_jd */ + case 598: + switch (ndx) { + case 0: + p = "int"; + break; + default: + break; + }; + break; default: break; }; @@ -11365,6 +11399,16 @@ systrace_return_setargdesc(int sysnum, int ndx, char *desc, size_t descsz) if (ndx == 0 || ndx == 1) p = "int"; 
break; + /* jail_attach_jd */ + case 597: + if (ndx == 0 || ndx == 1) + p = "int"; + break; + /* jail_remove_jd */ + case 598: + if (ndx == 0 || ndx == 1) + p = "int"; + break; default: break; }; diff --git a/sys/modules/sound/driver/hda/Makefile b/sys/modules/sound/driver/hda/Makefile index 0eec98fc53e1..1e137dc5671c 100644 --- a/sys/modules/sound/driver/hda/Makefile +++ b/sys/modules/sound/driver/hda/Makefile @@ -2,7 +2,7 @@ KMOD= snd_hda SRCS= device_if.h bus_if.h pci_if.h channel_if.h mixer_if.h hdac_if.h -SRCS+= hdaa.c hdaa.h hdaa_patches.c hdac.c hdac_if.h hdac_if.c -SRCS+= hdacc.c hdac_private.h hdac_reg.h hda_reg.h hdac.h +SRCS+= hdaa.c hdaa.h hdaa_patches.c hdacc.c hdac.c hdac_if.c +SRCS+= hdac_private.h hdac_reg.h hda_reg.h hdac.h .include <bsd.kmod.mk> diff --git a/sys/netinet/tcp_sack.c b/sys/netinet/tcp_sack.c index b6c55fac50b3..6e08ad2796a8 100644 --- a/sys/netinet/tcp_sack.c +++ b/sys/netinet/tcp_sack.c @@ -128,8 +128,25 @@ SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, enable, CTLFLAG_VNET | CTLFLAG_RW, "Enable/Disable TCP SACK support"); VNET_DEFINE(int, tcp_do_newsack) = 1; -SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, revised, CTLFLAG_VNET | CTLFLAG_RW, - &VNET_NAME(tcp_do_newsack), 0, + +static int +sysctl_net_inet_tcp_sack_revised(SYSCTL_HANDLER_ARGS) +{ + int error; + int new; + + new = V_tcp_do_newsack; + error = sysctl_handle_int(oidp, &new, 0, req); + if (error == 0 && req->newptr) { + V_tcp_do_newsack = new; + gone_in(16, "net.inet.tcp.sack.revised will be deprecated." 
+ " net.inet.tcp.sack.enable will always follow RFC6675 SACK.\n"); + } + return (error); +} + +SYSCTL_PROC(_net_inet_tcp_sack, OID_AUTO, revised, CTLFLAG_VNET | CTLFLAG_RW | CTLTYPE_INT, + &VNET_NAME(tcp_do_newsack), 0, sysctl_net_inet_tcp_sack_revised, "CU", "Use revised SACK loss recovery per RFC 6675"); VNET_DEFINE(int, tcp_do_lrd) = 1; diff --git a/sys/powerpc/conf/GENERIC b/sys/powerpc/conf/GENERIC index 7c7d2809d784..1346fa8f9476 100644 --- a/sys/powerpc/conf/GENERIC +++ b/sys/powerpc/conf/GENERIC @@ -90,8 +90,6 @@ options RCTL # Resource limits # Debugging support. Always need this: options KDB # Enable kernel debugger support. options KDB_TRACE # Print a stack trace for a panic. -# For full debugger support use (turn off in stable branch): -include "std.debug" # Kernel dump features. options EKCD # Support for encrypted kernel dumps diff --git a/sys/powerpc/conf/GENERIC-NODEBUG b/sys/powerpc/conf/GENERIC-NODEBUG deleted file mode 100644 index 0761376a8160..000000000000 --- a/sys/powerpc/conf/GENERIC-NODEBUG +++ /dev/null @@ -1,31 +0,0 @@ -# -# GENERIC-NODEBUG -- WITNESS and INVARIANTS free kernel configuration file -# for FreeBSD/powerpc -# -# This configuration file removes several debugging options, including -# WITNESS and INVARIANTS checking, which are known to have significant -# performance impact on running systems. When benchmarking new features -# this kernel should be used instead of the standard GENERIC. -# This kernel configuration should never appear outside of the HEAD -# of the FreeBSD tree. -# -# For more information on this file, please read the config(5) manual page, -# and/or the handbook section on Kernel Configuration Files: -# -# https://docs.freebsd.org/en/books/handbook/kernelconfig/#kernelconfig-config -# -# The handbook is also available locally in /usr/share/doc/handbook -# if you've installed the doc distribution, otherwise always see the -# FreeBSD World Wide Web server (https://www.FreeBSD.org/) for the -# latest information. 
-# -# An exhaustive list of options and more detailed explanations of the -# device lines is also present in the ../../conf/NOTES and NOTES files. -# If you are in doubt as to the purpose or necessity of a line, check first -# in NOTES. -# - -include GENERIC -include "std.nodebug" - -ident GENERIC-NODEBUG diff --git a/sys/powerpc/conf/GENERIC64 b/sys/powerpc/conf/GENERIC64 index 630c88b97dd7..6675a4d299f4 100644 --- a/sys/powerpc/conf/GENERIC64 +++ b/sys/powerpc/conf/GENERIC64 @@ -100,8 +100,6 @@ options RCTL # Resource limits # Debugging support. Always need this: options KDB # Enable kernel debugger support. options KDB_TRACE # Print a stack trace for a panic. -# For full debugger support use (turn off in stable branch): -include "std.debug" # Kernel dump features. options EKCD # Support for encrypted kernel dumps diff --git a/sys/powerpc/conf/GENERIC64-NODEBUG b/sys/powerpc/conf/GENERIC64-NODEBUG deleted file mode 100644 index a4c3dbd856e2..000000000000 --- a/sys/powerpc/conf/GENERIC64-NODEBUG +++ /dev/null @@ -1,31 +0,0 @@ -# -# GENERIC64-NODEBUG -- WITNESS and INVARIANTS free kernel configuration file -# for FreeBSD/powerpc -# -# This configuration file removes several debugging options, including -# WITNESS and INVARIANTS checking, which are known to have significant -# performance impact on running systems. When benchmarking new features -# this kernel should be used instead of the standard GENERIC64. -# This kernel configuration should never appear outside of the HEAD -# of the FreeBSD tree. -# -# For more information on this file, please read the config(5) manual page, -# and/or the handbook section on Kernel Configuration Files: -# -# https://docs.freebsd.org/en/books/handbook/kernelconfig/#kernelconfig-config -# -# The handbook is also available locally in /usr/share/doc/handbook -# if you've installed the doc distribution, otherwise always see the -# FreeBSD World Wide Web server (https://www.FreeBSD.org/) for the -# latest information. 
-# -# An exhaustive list of options and more detailed explanations of the -# device lines is also present in the ../../conf/NOTES and NOTES files. -# If you are in doubt as to the purpose or necessity of a line, check first -# in NOTES. -# - -include GENERIC64 -include "std.nodebug" - -ident GENERIC64-NODEBUG diff --git a/sys/powerpc/conf/GENERIC64LE b/sys/powerpc/conf/GENERIC64LE index eb9a9441425d..f6f7d44c424b 100644 --- a/sys/powerpc/conf/GENERIC64LE +++ b/sys/powerpc/conf/GENERIC64LE @@ -96,8 +96,6 @@ options RCTL # Resource limits # Debugging support. Always need this: options KDB # Enable kernel debugger support. options KDB_TRACE # Print a stack trace for a panic. -# For full debugger support use (turn off in stable branch): -include "std.debug" # Kernel dump features. options EKCD # Support for encrypted kernel dumps diff --git a/sys/powerpc/conf/GENERIC64LE-NODEBUG b/sys/powerpc/conf/GENERIC64LE-NODEBUG deleted file mode 100644 index fd2d3ca84a19..000000000000 --- a/sys/powerpc/conf/GENERIC64LE-NODEBUG +++ /dev/null @@ -1,31 +0,0 @@ -# -# GENERIC64LE-NODEBUG -- WITNESS and INVARIANTS free kernel configuration file -# for FreeBSD/powerpc -# -# This configuration file removes several debugging options, including -# WITNESS and INVARIANTS checking, which are known to have significant -# performance impact on running systems. When benchmarking new features -# this kernel should be used instead of the standard GENERIC64LE. -# This kernel configuration should never appear outside of the HEAD -# of the FreeBSD tree. 
-# -# For more information on this file, please read the config(5) manual page, -# and/or the handbook section on Kernel Configuration Files: -# -# https://docs.freebsd.org/en/books/handbook/kernelconfig/#kernelconfig-config -# -# The handbook is also available locally in /usr/share/doc/handbook -# if you've installed the doc distribution, otherwise always see the -# FreeBSD World Wide Web server (https://www.FreeBSD.org/) for the -# latest information. -# -# An exhaustive list of options and more detailed explanations of the -# device lines is also present in the ../../conf/NOTES and NOTES files. -# If you are in doubt as to the purpose or necessity of a line, check first -# in NOTES. -# - -include GENERIC64LE -include "std.nodebug" - -ident GENERIC64LE-NODEBUG diff --git a/sys/riscv/conf/GENERIC b/sys/riscv/conf/GENERIC index 2ff711e80127..187e7396e884 100644 --- a/sys/riscv/conf/GENERIC +++ b/sys/riscv/conf/GENERIC @@ -167,8 +167,6 @@ device xilinx_spi # Xilinx AXI SPI Controller # Debugging support. Always need this: options KDB # Enable kernel debugger support. options KDB_TRACE # Print a stack trace for a panic. -# For full debugger support use (turn off in stable branch): -include "std.debug" # options EARLY_PRINTF=sbi # Kernel dump features. diff --git a/sys/riscv/conf/GENERIC-NODEBUG b/sys/riscv/conf/GENERIC-NODEBUG deleted file mode 100644 index e4f4b41f2c41..000000000000 --- a/sys/riscv/conf/GENERIC-NODEBUG +++ /dev/null @@ -1,31 +0,0 @@ -# -# GENERIC-NODEBUG -- WITNESS and INVARIANTS free kernel configuration file -# for FreeBSD/riscv -# -# This configuration file removes several debugging options, including -# WITNESS and INVARIANTS checking, which are known to have significant -# performance impact on running systems. When benchmarking new features -# this kernel should be used instead of the standard GENERIC. -# This kernel configuration should never appear outside of the HEAD -# of the FreeBSD tree. 
-# -# For more information on this file, please read the config(5) manual page, -# and/or the handbook section on Kernel Configuration Files: -# -# https://docs.freebsd.org/en/books/handbook/kernelconfig/#kernelconfig-config -# -# The handbook is also available locally in /usr/share/doc/handbook -# if you've installed the doc distribution, otherwise always see the -# FreeBSD World Wide Web server (https://www.FreeBSD.org/) for the -# latest information. -# -# An exhaustive list of options and more detailed explanations of the -# device lines is also present in the ../../conf/NOTES and NOTES files. -# If you are in doubt as to the purpose or necessity of a line, check first -# in NOTES. -# - -include GENERIC -include "std.nodebug" - -ident GENERIC-NODEBUG diff --git a/sys/sys/event.h b/sys/sys/event.h index 1b30e4292de8..f161d2c938c1 100644 --- a/sys/sys/event.h +++ b/sys/sys/event.h @@ -45,7 +45,8 @@ #define EVFILT_USER (-11) /* User events */ #define EVFILT_SENDFILE (-12) /* attached to sendfile requests */ #define EVFILT_EMPTY (-13) /* empty send socket buf */ -#define EVFILT_SYSCOUNT 13 +#define EVFILT_JAIL (-14) /* attached to struct prison */ +#define EVFILT_SYSCOUNT 14 #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L #define EV_SET(kevp_, a, b, c, d, e, f) do { \ @@ -204,10 +205,19 @@ struct freebsd11_kevent32 { #define NOTE_PCTRLMASK 0xf0000000 /* mask for hint bits */ #define NOTE_PDATAMASK 0x000fffff /* mask for pid */ -/* additional flags for EVFILT_PROC */ -#define NOTE_TRACK 0x00000001 /* follow across forks */ +/* data/hint flags for EVFILT_JAIL */ +#define NOTE_JAIL_SET 0x80000000 /* jail was modified */ +#define NOTE_JAIL_CHILD 0x40000000 /* child jail was created */ +#define NOTE_JAIL_ATTACH 0x20000000 /* jail was attached to */ +#define NOTE_JAIL_REMOVE 0x10000000 /* jail was removed */ +#define NOTE_JAIL_ATTACH_MULTI 0x08000000 /* multiple procs attached */ +#define NOTE_JAIL_CTRLMASK 0xf0000000 /* mask for hint bits */ +#define 
NOTE_JAIL_DATAMASK 0x000fffff /* mask for pid */ + +/* additional flags for EVFILT_PROC and EVFILT_JAIL */ +#define NOTE_TRACK 0x00000001 /* follow across fork/create */ #define NOTE_TRACKERR 0x00000002 /* could not track child */ -#define NOTE_CHILD 0x00000004 /* am a child process */ +#define NOTE_CHILD 0x00000004 /* am a child process/jail */ /* additional flags for EVFILT_TIMER */ #define NOTE_SECONDS 0x00000001 /* data is seconds */ @@ -309,6 +319,7 @@ struct knote { struct proc *p_proc; /* proc pointer */ struct kaiocb *p_aio; /* AIO job pointer */ struct aioliojob *p_lio; /* LIO job pointer */ + struct prison *p_prison; /* prison pointer */ void *p_v; /* generic other pointer */ } kn_ptr; const struct filterops *kn_fop; diff --git a/sys/sys/file.h b/sys/sys/file.h index 63313926c4f0..cc3c733580fd 100644 --- a/sys/sys/file.h +++ b/sys/sys/file.h @@ -72,6 +72,7 @@ struct nameidata; #define DTYPE_EVENTFD 13 /* eventfd */ #define DTYPE_TIMERFD 14 /* timerfd */ #define DTYPE_INOTIFY 15 /* inotify descriptor */ +#define DTYPE_JAILDESC 16 /* jail descriptor */ #ifdef _KERNEL diff --git a/sys/sys/jail.h b/sys/sys/jail.h index d2655c52e832..e12e8c3178c9 100644 --- a/sys/sys/jail.h +++ b/sys/sys/jail.h @@ -99,8 +99,12 @@ enum prison_state { #define JAIL_UPDATE 0x02 /* Update parameters of existing jail */ #define JAIL_ATTACH 0x04 /* Attach to jail upon creation */ #define JAIL_DYING 0x08 /* Allow getting a dying jail */ -#define JAIL_SET_MASK 0x0f /* JAIL_DYING is deprecated/ignored here */ -#define JAIL_GET_MASK 0x08 +#define JAIL_USE_DESC 0x10 /* Get/set jail in descriptor */ +#define JAIL_AT_DESC 0x20 /* Find/add jail under descriptor */ +#define JAIL_GET_DESC 0x40 /* Return a new jail descriptor */ +#define JAIL_OWN_DESC 0x80 /* Return a new owning jail descriptor */ +#define JAIL_SET_MASK 0xff /* JAIL_DYING is deprecated/ignored here */ +#define JAIL_GET_MASK 0xf8 #define JAIL_SYS_DISABLE 0 #define JAIL_SYS_NEW 1 @@ -115,7 +119,9 @@ int jail(struct jail *); int 
jail_set(struct iovec *, unsigned int, int); int jail_get(struct iovec *, unsigned int, int); int jail_attach(int); +int jail_attach_jd(int); int jail_remove(int); +int jail_remove_jd(int); __END_DECLS #else /* _KERNEL */ @@ -144,6 +150,8 @@ MALLOC_DECLARE(M_PRISON); #define JAIL_META_PRIVATE "meta" #define JAIL_META_SHARED "env" +struct jaildesc; +struct knlist; struct racct; struct prison_racct; @@ -189,7 +197,9 @@ struct prison { struct vnode *pr_root; /* (c) vnode to rdir */ struct prison_ip *pr_addrs[PR_FAMILY_MAX]; /* (p,n) IPs of jail */ struct prison_racct *pr_prison_racct; /* (c) racct jail proxy */ - void *pr_sparep[3]; + struct knlist *pr_klist; /* (m) attached knotes */ + LIST_HEAD(, jaildesc) pr_descs; /* (a) attached descriptors */ + void *pr_sparep; int pr_childcount; /* (a) number of child jails */ int pr_childmax; /* (p) maximum child jails */ unsigned pr_allow; /* (p) PR_ALLOW_* flags */ @@ -425,10 +435,11 @@ SYSCTL_DECL(_security_jail_param); /* * Kernel support functions for jail(). */ -struct ucred; +struct knote; struct mount; struct sockaddr; struct statfs; +struct ucred; struct vfsconf; /* @@ -463,6 +474,7 @@ void prison_proc_free(struct prison *); void prison_proc_link(struct prison *, struct proc *); void prison_proc_unlink(struct prison *, struct proc *); void prison_proc_iterate(struct prison *, void (*)(struct proc *, void *), void *); +void prison_remove(struct prison *); void prison_set_allow(struct ucred *cred, unsigned flag, int enable); bool prison_ischild(struct prison *, struct prison *); bool prison_isalive(const struct prison *); diff --git a/sys/sys/jaildesc.h b/sys/sys/jaildesc.h new file mode 100644 index 000000000000..4bed1ab3b88a --- /dev/null +++ b/sys/sys/jaildesc.h @@ -0,0 +1,85 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2025 James Gritton. + * All rights reserved. 
+ * + * This software was developed at the University of Cambridge Computer + * Laboratory with support from a grant from Google, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _SYS_JAILDESC_H_ +#define _SYS_JAILDESC_H_ + +#ifdef _KERNEL + +#include <sys/queue.h> +#include <sys/_lock.h> +#include <sys/_mutex.h> +#include <sys/_types.h> + +struct prison; + +/*- + * struct jaildesc describes a jail descriptor, which points to a struct + * prison. struct prison in turn has a linked list of struct jaildesc. 
+ * + * Locking key: + * (c) set on creation, remains unchanged + * (d) jd_lock + * (p) jd_prison->pr_mtx + */ +struct jaildesc { + LIST_ENTRY(jaildesc) jd_list; /* (d,p) this prison's descs */ + struct prison *jd_prison; /* (d) the prison */ + struct mtx jd_lock; + uid_t jd_uid; /* (d) nominal file owner */ + gid_t jd_gid; /* (d) nominal file group */ + mode_t jd_mode; /* (d) descriptor permissions */ + unsigned jd_flags; /* (d) JDF_* flags */ +}; + +/* + * Locking macros for the jaildesc. + */ +#define JAILDESC_LOCK_DESTROY(jd) mtx_destroy(&(jd)->jd_lock) +#define JAILDESC_LOCK_INIT(jd) mtx_init(&(jd)->jd_lock, "jaildesc", \ + NULL, MTX_DEF) +#define JAILDESC_LOCK(jd) mtx_lock(&(jd)->jd_lock) +#define JAILDESC_UNLOCK(jd) mtx_unlock(&(jd)->jd_lock) + +/* + * Flags for the jd_flags field + */ +#define JDF_REMOVED 0x00000002 /* jail was removed */ + +int jaildesc_find(struct thread *td, int fd, struct jaildesc **jdp, + struct prison **prp, struct ucred **ucredp); +int jaildesc_alloc(struct thread *td, struct file **fpp, int *fdp, int owning); +void jaildesc_set_prison(struct file *jd, struct prison *pr); +void jaildesc_prison_cleanup(struct prison *pr); + +#endif /* _KERNEL */ + +#endif /* !_SYS_JAILDESC_H_ */ diff --git a/sys/sys/param.h b/sys/sys/param.h index fc2a78883f1e..c21c086e15ad 100644 --- a/sys/sys/param.h +++ b/sys/sys/param.h @@ -74,7 +74,7 @@ * cannot include sys/param.h and should only be updated here. 
*/ #undef __FreeBSD_version -#define __FreeBSD_version 1500063 +#define __FreeBSD_version 1500064 /* * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD, diff --git a/sys/sys/syscall.h b/sys/sys/syscall.h index 2d6903967e15..cff27b8be316 100644 --- a/sys/sys/syscall.h +++ b/sys/sys/syscall.h @@ -535,4 +535,6 @@ #define SYS_inotify_rm_watch 594 #define SYS_getgroups 595 #define SYS_setgroups 596 -#define SYS_MAXSYSCALL 597 +#define SYS_jail_attach_jd 597 +#define SYS_jail_remove_jd 598 +#define SYS_MAXSYSCALL 599 diff --git a/sys/sys/syscall.mk b/sys/sys/syscall.mk index d1172c2dc7bf..443dbadcfbff 100644 --- a/sys/sys/syscall.mk +++ b/sys/sys/syscall.mk @@ -438,4 +438,6 @@ MIASM = \ inotify_add_watch_at.o \ inotify_rm_watch.o \ getgroups.o \ - setgroups.o + setgroups.o \ + jail_attach_jd.o \ + jail_remove_jd.o diff --git a/sys/sys/sysproto.h b/sys/sys/sysproto.h index 98311a6dbf94..8dda4b4533ea 100644 --- a/sys/sys/sysproto.h +++ b/sys/sys/sysproto.h @@ -1901,6 +1901,12 @@ struct setgroups_args { char gidsetsize_l_[PADL_(int)]; int gidsetsize; char gidsetsize_r_[PADR_(int)]; char gidset_l_[PADL_(const gid_t *)]; const gid_t * gidset; char gidset_r_[PADR_(const gid_t *)]; }; +struct jail_attach_jd_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; +}; +struct jail_remove_jd_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; +}; int sys__exit(struct thread *, struct _exit_args *); int sys_fork(struct thread *, struct fork_args *); int sys_read(struct thread *, struct read_args *); @@ -2305,6 +2311,8 @@ int sys_inotify_add_watch_at(struct thread *, struct inotify_add_watch_at_args * int sys_inotify_rm_watch(struct thread *, struct inotify_rm_watch_args *); int sys_getgroups(struct thread *, struct getgroups_args *); int sys_setgroups(struct thread *, struct setgroups_args *); +int sys_jail_attach_jd(struct thread *, struct jail_attach_jd_args *); +int sys_jail_remove_jd(struct thread *, struct jail_remove_jd_args *); 
#ifdef COMPAT_43 @@ -3301,6 +3309,8 @@ int freebsd14_setgroups(struct thread *, struct freebsd14_setgroups_args *); #define SYS_AUE_inotify_rm_watch AUE_INOTIFY #define SYS_AUE_getgroups AUE_GETGROUPS #define SYS_AUE_setgroups AUE_SETGROUPS +#define SYS_AUE_jail_attach_jd AUE_JAIL_ATTACH +#define SYS_AUE_jail_remove_jd AUE_JAIL_REMOVE #undef PAD_ #undef PADL_ diff --git a/sys/sys/user.h b/sys/sys/user.h index 103236b6ed1b..3183f0792256 100644 --- a/sys/sys/user.h +++ b/sys/sys/user.h @@ -266,6 +266,7 @@ struct user { #define KF_TYPE_EVENTFD 13 #define KF_TYPE_TIMERFD 14 #define KF_TYPE_INOTIFY 15 +#define KF_TYPE_JAILDESC 16 #define KF_TYPE_UNKNOWN 255 #define KF_VTYPE_VNON 0 @@ -453,6 +454,9 @@ struct kinfo_file { uint64_t kf_timerfd_addr; } kf_timerfd; struct { + int32_t kf_jid; + } kf_jail; + struct { uint64_t kf_kqueue_addr; int32_t kf_kqueue_count; int32_t kf_kqueue_state; |
