/* memfd.c */
  1. /*
  2. * memfd_create system call and file sealing support
  3. *
  4. * Code was originally included in shmem.c, and broken out to facilitate
  5. * use by hugetlbfs as well as tmpfs.
  6. *
  7. * This file is released under the GPL.
  8. */
  9. #include <linux/fs.h>
  10. #include <linux/vfs.h>
  11. #include <linux/pagemap.h>
  12. #include <linux/file.h>
  13. #include <linux/mm.h>
  14. #include <linux/sched/signal.h>
  15. #include <linux/khugepaged.h>
  16. #include <linux/syscalls.h>
  17. #include <linux/hugetlb.h>
  18. #include <linux/shmem_fs.h>
  19. #include <linux/memfd.h>
  20. #include <linux/pid_namespace.h>
  21. #include <uapi/linux/memfd.h>
  22. /*
  23. * We need a tag: a new tag would expand every xa_node by 8 bytes,
  24. * so reuse a tag which we firmly believe is never set or cleared on tmpfs
  25. * or hugetlbfs because they are memory only filesystems.
  26. */
  27. #define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE
  28. #define LAST_SCAN 4 /* about 150ms max */
  29. static bool memfd_folio_has_extra_refs(struct folio *folio)
  30. {
  31. return folio_ref_count(folio) - folio_mapcount(folio) !=
  32. folio_nr_pages(folio);
  33. }
/*
 * Walk the whole mapping and tag (MEMFD_TAG_PINNED) every folio that
 * currently holds extra references, so memfd_wait_for_pins() can later
 * re-scan only the tagged entries instead of the full xarray.
 */
static void memfd_tag_pins(struct xa_state *xas)
{
	struct folio *folio;
	int latency = 0;

	/* Flush per-CPU LRU batches so stale batch refs don't look like pins. */
	lru_add_drain();

	xas_lock_irq(xas);
	xas_for_each(xas, folio, ULONG_MAX) {
		/* Value entries (e.g. swap) carry no refcount to inspect. */
		if (!xa_is_value(folio) && memfd_folio_has_extra_refs(folio))
			xas_set_mark(xas, MEMFD_TAG_PINNED);

		if (++latency < XA_CHECK_SCHED)
			continue;
		latency = 0;

		/*
		 * Periodically drop the lock with irqs enabled and allow
		 * rescheduling, keeping irq-off latency bounded; xas_pause()
		 * lets the walk resume safely afterwards.
		 */
		xas_pause(xas);
		xas_unlock_irq(xas);
		cond_resched();
		xas_lock_irq(xas);
	}
	xas_unlock_irq(xas);
}
/*
 * This is a helper function used by memfd_pin_user_pages() in GUP (gup.c).
 * It is mainly called to allocate a folio in a memfd when the caller
 * (memfd_pin_folios()) cannot find a folio in the page cache at a given
 * index in the mapping.
 *
 * Returns the new folio (unlocked, in the page cache) or an ERR_PTR().
 */
struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx)
{
#ifdef CONFIG_HUGETLB_PAGE
	struct folio *folio;
	gfp_t gfp_mask;
	int err;

	if (is_file_hugepages(memfd)) {
		/*
		 * The folio would most likely be accessed by a DMA driver,
		 * therefore, we have zone memory constraints where we can
		 * alloc from. Also, the folio will be pinned for an indefinite
		 * amount of time, so it is not expected to be migrated away.
		 */
		struct hstate *h = hstate_file(memfd);

		gfp_mask = htlb_alloc_mask(h);
		gfp_mask &= ~(__GFP_HIGHMEM | __GFP_MOVABLE);
		/* Convert the base-page index into huge-page units. */
		idx >>= huge_page_order(h);

		folio = alloc_hugetlb_folio_reserve(h,
						    numa_node_id(),
						    NULL,
						    gfp_mask);
		if (folio) {
			err = hugetlb_add_to_page_cache(folio,
							memfd->f_mapping,
							idx);
			if (err) {
				/* Drop our allocation ref; report the error. */
				folio_put(folio);
				return ERR_PTR(err);
			}
			folio_unlock(folio);
			return folio;
		}
		return ERR_PTR(-ENOMEM);
	}
#endif
	/* tmpfs-backed memfd: let shmem find or allocate the folio. */
	return shmem_read_folio(memfd->f_mapping, idx);
}
/*
 * Setting SEAL_WRITE requires us to verify there's no pending writer. However,
 * via get_user_pages(), drivers might have some pending I/O without any active
 * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all folios
 * and see whether it has an elevated ref-count. If so, we tag them and wait for
 * them to be dropped.
 * The caller must guarantee that no new user will acquire writable references
 * to those folios to avoid races.
 *
 * Returns 0 once every pin was dropped, or -EBUSY if folios remained pinned
 * after LAST_SCAN passes (all tags are cleaned up in either case).
 */
static int memfd_wait_for_pins(struct address_space *mapping)
{
	XA_STATE(xas, &mapping->i_pages, 0);
	struct folio *folio;
	int error, scan;

	/* Pass 0: tag every folio that currently has extra references. */
	memfd_tag_pins(&xas);

	error = 0;
	for (scan = 0; scan <= LAST_SCAN; scan++) {
		int latency = 0;

		/* No tagged folios remain: all pins were dropped. */
		if (!xas_marked(&xas, MEMFD_TAG_PINNED))
			break;

		if (!scan)
			lru_add_drain_all();
		else if (schedule_timeout_killable((HZ << scan) / 200))
			/* Fatal signal: jump straight to the cleanup scan. */
			scan = LAST_SCAN;

		/* Restart the walk from index 0 over tagged entries only. */
		xas_set(&xas, 0);
		xas_lock_irq(&xas);
		xas_for_each_marked(&xas, folio, ULONG_MAX, MEMFD_TAG_PINNED) {
			bool clear = true;

			if (!xa_is_value(folio) &&
			    memfd_folio_has_extra_refs(folio)) {
				/*
				 * On the last scan, we clean up all those tags
				 * we inserted; but make a note that we still
				 * found folios pinned.
				 */
				if (scan == LAST_SCAN)
					error = -EBUSY;
				else
					clear = false;
			}
			if (clear)
				xas_clear_mark(&xas, MEMFD_TAG_PINNED);

			if (++latency < XA_CHECK_SCHED)
				continue;
			latency = 0;

			/* Bound irq-off latency; resume walk after resched. */
			xas_pause(&xas);
			xas_unlock_irq(&xas);
			cond_resched();
			xas_lock_irq(&xas);
		}
		xas_unlock_irq(&xas);
	}

	return error;
}
  150. unsigned int *memfd_file_seals_ptr(struct file *file)
  151. {
  152. if (shmem_file(file))
  153. return &SHMEM_I(file_inode(file))->seals;
  154. #ifdef CONFIG_HUGETLBFS
  155. if (is_file_hugepages(file))
  156. return &HUGETLBFS_I(file_inode(file))->seals;
  157. #endif
  158. return NULL;
  159. }
  160. #define F_ALL_SEALS (F_SEAL_SEAL | \
  161. F_SEAL_EXEC | \
  162. F_SEAL_SHRINK | \
  163. F_SEAL_GROW | \
  164. F_SEAL_WRITE | \
  165. F_SEAL_FUTURE_WRITE)
/*
 * Add @seals to the file's seal mask under the inode lock.
 * Returns 0 on success, -EPERM/-EINVAL/-EBUSY on failure (see below).
 */
static int memfd_add_seals(struct file *file, unsigned int seals)
{
	struct inode *inode = file_inode(file);
	unsigned int *file_seals;
	int error;

	/*
	 * SEALING
	 * Sealing allows multiple parties to share a tmpfs or hugetlbfs file
	 * but restrict access to a specific subset of file operations. Seals
	 * can only be added, but never removed. This way, mutually untrusted
	 * parties can share common memory regions with a well-defined policy.
	 * A malicious peer can thus never perform unwanted operations on a
	 * shared object.
	 *
	 * Seals are only supported on special tmpfs or hugetlbfs files and
	 * always affect the whole underlying inode. Once a seal is set, it
	 * may prevent some kinds of access to the file. Currently, the
	 * following seals are defined:
	 *   SEAL_SEAL: Prevent further seals from being set on this file
	 *   SEAL_SHRINK: Prevent the file from shrinking
	 *   SEAL_GROW: Prevent the file from growing
	 *   SEAL_WRITE: Prevent write access to the file
	 *   SEAL_EXEC: Prevent modification of the exec bits in the file mode
	 *
	 * As we don't require any trust relationship between two parties, we
	 * must prevent seals from being removed. Therefore, sealing a file
	 * only adds a given set of seals to the file, it never touches
	 * existing seals. Furthermore, the "setting seals"-operation can be
	 * sealed itself, which basically prevents any further seal from being
	 * added.
	 *
	 * Semantics of sealing are only defined on volatile files. Only
	 * anonymous tmpfs and hugetlbfs files support sealing. More
	 * importantly, seals are never written to disk. Therefore, there's
	 * no plan to support it on other file types.
	 */

	/* Only a writable fd may add seals. */
	if (!(file->f_mode & FMODE_WRITE))
		return -EPERM;
	if (seals & ~(unsigned int)F_ALL_SEALS)
		return -EINVAL;

	inode_lock(inode);

	file_seals = memfd_file_seals_ptr(file);
	if (!file_seals) {
		/* Not a sealable (tmpfs/hugetlbfs) file. */
		error = -EINVAL;
		goto unlock;
	}

	if (*file_seals & F_SEAL_SEAL) {
		error = -EPERM;
		goto unlock;
	}

	if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) {
		/*
		 * Block new writable mappings, then wait for outstanding
		 * pins (e.g. GUP users) to drain; undo on failure.
		 */
		error = mapping_deny_writable(file->f_mapping);
		if (error)
			goto unlock;

		error = memfd_wait_for_pins(file->f_mapping);
		if (error) {
			mapping_allow_writable(file->f_mapping);
			goto unlock;
		}
	}

	/*
	 * SEAL_EXEC implies SEAL_WRITE, making W^X from the start.
	 */
	if (seals & F_SEAL_EXEC && inode->i_mode & 0111)
		seals |= F_SEAL_SHRINK|F_SEAL_GROW|F_SEAL_WRITE|F_SEAL_FUTURE_WRITE;

	*file_seals |= seals;
	error = 0;

unlock:
	inode_unlock(inode);
	return error;
}
  237. static int memfd_get_seals(struct file *file)
  238. {
  239. unsigned int *seals = memfd_file_seals_ptr(file);
  240. return seals ? *seals : -EINVAL;
  241. }
  242. long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
  243. {
  244. long error;
  245. switch (cmd) {
  246. case F_ADD_SEALS:
  247. error = memfd_add_seals(file, arg);
  248. break;
  249. case F_GET_SEALS:
  250. error = memfd_get_seals(file);
  251. break;
  252. default:
  253. error = -EINVAL;
  254. break;
  255. }
  256. return error;
  257. }
  258. #define MFD_NAME_PREFIX "memfd:"
  259. #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
  260. #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
  261. #define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_NOEXEC_SEAL | MFD_EXEC)
/*
 * Apply the pid-namespace vm.memfd_noexec policy to memfd_create()'s
 * @flags. When the caller passed neither MFD_EXEC nor MFD_NOEXEC_SEAL,
 * the sysctl chooses the default; at the "enforced" level, creation
 * without MFD_NOEXEC_SEAL is refused. No-op without CONFIG_SYSCTL.
 *
 * Returns 0 on success or -EACCES when the policy forbids the request.
 */
static int check_sysctl_memfd_noexec(unsigned int *flags)
{
#ifdef CONFIG_SYSCTL
	struct pid_namespace *ns = task_active_pid_ns(current);
	int sysctl = pidns_memfd_noexec_scope(ns);

	/* Caller left the choice to us: pick the sysctl-driven default. */
	if (!(*flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) {
		if (sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL)
			*flags |= MFD_NOEXEC_SEAL;
		else
			*flags |= MFD_EXEC;
	}

	if (!(*flags & MFD_NOEXEC_SEAL) && sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED) {
		pr_err_ratelimited(
			"%s[%d]: memfd_create() requires MFD_NOEXEC_SEAL with vm.memfd_noexec=%d\n",
			current->comm, task_pid_nr(current), sysctl);
		return -EACCES;
	}
#endif
	return 0;
}
/*
 * memfd_create(2): create an anonymous tmpfs (or hugetlbfs) file and
 * return a file descriptor for it. @uname is a debugging name shown
 * with the "memfd:" prefix; @flags selects close-on-exec, sealing,
 * huge pages, and the exec/noexec-seal policy.
 */
SYSCALL_DEFINE2(memfd_create,
		const char __user *, uname,
		unsigned int, flags)
{
	unsigned int *file_seals;
	struct file *file;
	int fd, error;
	char *name;
	long len;

	if (!(flags & MFD_HUGETLB)) {
		if (flags & ~(unsigned int)MFD_ALL_FLAGS)
			return -EINVAL;
	} else {
		/* Allow huge page size encoding in flags. */
		if (flags & ~(unsigned int)(MFD_ALL_FLAGS |
				(MFD_HUGE_MASK << MFD_HUGE_SHIFT)))
			return -EINVAL;
	}

	/* Invalid if both EXEC and NOEXEC_SEAL are set.*/
	if ((flags & MFD_EXEC) && (flags & MFD_NOEXEC_SEAL))
		return -EINVAL;

	error = check_sysctl_memfd_noexec(&flags);
	if (error < 0)
		return error;

	/* length includes terminating zero */
	len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
	if (len <= 0)
		return -EFAULT;
	if (len > MFD_NAME_MAX_LEN + 1)
		return -EINVAL;

	/* Buffer holds "memfd:" + user-supplied name (incl. its NUL). */
	name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL);
	if (!name)
		return -ENOMEM;

	strcpy(name, MFD_NAME_PREFIX);
	if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) {
		error = -EFAULT;
		goto err_name;
	}

	/* terminating-zero may have changed after strnlen_user() returned */
	if (name[len + MFD_NAME_PREFIX_LEN - 1]) {
		error = -EFAULT;
		goto err_name;
	}

	fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
	if (fd < 0) {
		error = fd;
		goto err_name;
	}

	if (flags & MFD_HUGETLB) {
		/* Huge page size is encoded in the upper flag bits. */
		file = hugetlb_file_setup(name, 0, VM_NORESERVE,
					HUGETLB_ANONHUGE_INODE,
					(flags >> MFD_HUGE_SHIFT) &
					MFD_HUGE_MASK);
	} else
		file = shmem_file_setup(name, 0, VM_NORESERVE);
	if (IS_ERR(file)) {
		error = PTR_ERR(file);
		goto err_fd;
	}
	file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
	file->f_flags |= O_LARGEFILE;

	if (flags & MFD_NOEXEC_SEAL) {
		struct inode *inode = file_inode(file);

		/* Strip exec bits and seal them; sealing stays allowed. */
		inode->i_mode &= ~0111;
		file_seals = memfd_file_seals_ptr(file);
		if (file_seals) {
			*file_seals &= ~F_SEAL_SEAL;
			*file_seals |= F_SEAL_EXEC;
		}
	} else if (flags & MFD_ALLOW_SEALING) {
		/* MFD_EXEC and MFD_ALLOW_SEALING are set */
		file_seals = memfd_file_seals_ptr(file);
		if (file_seals)
			*file_seals &= ~F_SEAL_SEAL;
	}

	fd_install(fd, file);
	kfree(name);
	return fd;

err_fd:
	put_unused_fd(fd);
err_name:
	kfree(name);
	return error;
}