vfio_compat.c 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
  3. */
  4. #include <linux/file.h>
  5. #include <linux/interval_tree.h>
  6. #include <linux/iommu.h>
  7. #include <linux/iommufd.h>
  8. #include <linux/slab.h>
  9. #include <linux/vfio.h>
  10. #include <uapi/linux/vfio.h>
  11. #include <uapi/linux/iommufd.h>
  12. #include "iommufd_private.h"
  13. static struct iommufd_ioas *get_compat_ioas(struct iommufd_ctx *ictx)
  14. {
  15. struct iommufd_ioas *ioas = ERR_PTR(-ENODEV);
  16. xa_lock(&ictx->objects);
  17. if (!ictx->vfio_ioas || !iommufd_lock_obj(&ictx->vfio_ioas->obj))
  18. goto out_unlock;
  19. ioas = ictx->vfio_ioas;
  20. out_unlock:
  21. xa_unlock(&ictx->objects);
  22. return ioas;
  23. }
  24. /**
  25. * iommufd_vfio_compat_ioas_get_id - Ensure a compat IOAS exists
  26. * @ictx: Context to operate on
  27. * @out_ioas_id: The IOAS ID of the compatibility IOAS
  28. *
  29. * Return the ID of the current compatibility IOAS. The ID can be passed into
  30. * other functions that take an ioas_id.
  31. */
  32. int iommufd_vfio_compat_ioas_get_id(struct iommufd_ctx *ictx, u32 *out_ioas_id)
  33. {
  34. struct iommufd_ioas *ioas;
  35. ioas = get_compat_ioas(ictx);
  36. if (IS_ERR(ioas))
  37. return PTR_ERR(ioas);
  38. *out_ioas_id = ioas->obj.id;
  39. iommufd_put_object(ictx, &ioas->obj);
  40. return 0;
  41. }
  42. EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_get_id, IOMMUFD_VFIO);
  43. /**
  44. * iommufd_vfio_compat_set_no_iommu - Called when a no-iommu device is attached
  45. * @ictx: Context to operate on
  46. *
  47. * This allows selecting the VFIO_NOIOMMU_IOMMU and blocks normal types.
  48. */
  49. int iommufd_vfio_compat_set_no_iommu(struct iommufd_ctx *ictx)
  50. {
  51. int ret;
  52. xa_lock(&ictx->objects);
  53. if (!ictx->vfio_ioas) {
  54. ictx->no_iommu_mode = 1;
  55. ret = 0;
  56. } else {
  57. ret = -EINVAL;
  58. }
  59. xa_unlock(&ictx->objects);
  60. return ret;
  61. }
  62. EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_set_no_iommu, IOMMUFD_VFIO);
/**
 * iommufd_vfio_compat_ioas_create - Ensure the compat IOAS is created
 * @ictx: Context to operate on
 *
 * The compatibility IOAS is the IOAS that the vfio compatibility ioctls operate
 * on since they do not have an IOAS ID input in their ABI. Only attaching a
 * group should cause a default creation of the internal ioas, this does nothing
 * if an existing ioas has already been assigned somehow.
 */
int iommufd_vfio_compat_ioas_create(struct iommufd_ctx *ictx)
{
	struct iommufd_ioas *ioas = NULL;
	int ret;

	/* Allocate before taking the lock; aborted below if we lose a race. */
	ioas = iommufd_ioas_alloc(ictx);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);

	xa_lock(&ictx->objects);
	/*
	 * VFIO won't allow attaching a container to both iommu and no iommu
	 * operation
	 */
	if (ictx->no_iommu_mode) {
		ret = -EINVAL;
		goto out_abort;
	}

	/* A live compat IOAS already exists; keep it and drop our new one. */
	if (ictx->vfio_ioas && iommufd_lock_obj(&ictx->vfio_ioas->obj)) {
		ret = 0;
		/* Release the reference iommufd_lock_obj() just took */
		iommufd_put_object(ictx, &ictx->vfio_ioas->obj);
		goto out_abort;
	}
	ictx->vfio_ioas = ioas;
	xa_unlock(&ictx->objects);

	/*
	 * An automatically created compat IOAS is treated as a userspace
	 * created object. Userspace can learn the ID via IOMMU_VFIO_IOAS_GET,
	 * and if not manually destroyed it will be destroyed automatically
	 * at iommufd release.
	 */
	iommufd_object_finalize(ictx, &ioas->obj);
	return 0;

out_abort:
	xa_unlock(&ictx->objects);
	iommufd_object_abort(ictx, &ioas->obj);
	return ret;
}
EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_create, IOMMUFD_VFIO);
  109. int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd)
  110. {
  111. struct iommu_vfio_ioas *cmd = ucmd->cmd;
  112. struct iommufd_ioas *ioas;
  113. if (cmd->__reserved)
  114. return -EOPNOTSUPP;
  115. switch (cmd->op) {
  116. case IOMMU_VFIO_IOAS_GET:
  117. ioas = get_compat_ioas(ucmd->ictx);
  118. if (IS_ERR(ioas))
  119. return PTR_ERR(ioas);
  120. cmd->ioas_id = ioas->obj.id;
  121. iommufd_put_object(ucmd->ictx, &ioas->obj);
  122. return iommufd_ucmd_respond(ucmd, sizeof(*cmd));
  123. case IOMMU_VFIO_IOAS_SET:
  124. ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id);
  125. if (IS_ERR(ioas))
  126. return PTR_ERR(ioas);
  127. xa_lock(&ucmd->ictx->objects);
  128. ucmd->ictx->vfio_ioas = ioas;
  129. xa_unlock(&ucmd->ictx->objects);
  130. iommufd_put_object(ucmd->ictx, &ioas->obj);
  131. return 0;
  132. case IOMMU_VFIO_IOAS_CLEAR:
  133. xa_lock(&ucmd->ictx->objects);
  134. ucmd->ictx->vfio_ioas = NULL;
  135. xa_unlock(&ucmd->ictx->objects);
  136. return 0;
  137. default:
  138. return -EOPNOTSUPP;
  139. }
  140. }
  141. static int iommufd_vfio_map_dma(struct iommufd_ctx *ictx, unsigned int cmd,
  142. void __user *arg)
  143. {
  144. u32 supported_flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
  145. size_t minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
  146. struct vfio_iommu_type1_dma_map map;
  147. int iommu_prot = IOMMU_CACHE;
  148. struct iommufd_ioas *ioas;
  149. unsigned long iova;
  150. int rc;
  151. if (copy_from_user(&map, arg, minsz))
  152. return -EFAULT;
  153. if (map.argsz < minsz || map.flags & ~supported_flags)
  154. return -EINVAL;
  155. if (map.flags & VFIO_DMA_MAP_FLAG_READ)
  156. iommu_prot |= IOMMU_READ;
  157. if (map.flags & VFIO_DMA_MAP_FLAG_WRITE)
  158. iommu_prot |= IOMMU_WRITE;
  159. ioas = get_compat_ioas(ictx);
  160. if (IS_ERR(ioas))
  161. return PTR_ERR(ioas);
  162. /*
  163. * Maps created through the legacy interface always use VFIO compatible
  164. * rlimit accounting. If the user wishes to use the faster user based
  165. * rlimit accounting then they must use the new interface.
  166. */
  167. iova = map.iova;
  168. rc = iopt_map_user_pages(ictx, &ioas->iopt, &iova, u64_to_user_ptr(map.vaddr),
  169. map.size, iommu_prot, 0);
  170. iommufd_put_object(ictx, &ioas->obj);
  171. return rc;
  172. }
/*
 * VFIO_IOMMU_UNMAP_DMA compatibility: unmap either everything
 * (VFIO_DMA_UNMAP_FLAG_ALL) or a specified iova/size range from the compat
 * IOAS, reporting the number of bytes actually unmapped back to userspace.
 */
static int iommufd_vfio_unmap_dma(struct iommufd_ctx *ictx, unsigned int cmd,
				  void __user *arg)
{
	size_t minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
	/*
	 * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP is obsoleted by the new
	 * dirty tracking direction:
	 * https://lore.kernel.org/kvm/20220731125503.142683-1-yishaih@nvidia.com/
	 * https://lore.kernel.org/kvm/20220428210933.3583-1-joao.m.martins@oracle.com/
	 */
	u32 supported_flags = VFIO_DMA_UNMAP_FLAG_ALL;
	struct vfio_iommu_type1_dma_unmap unmap;
	unsigned long unmapped = 0;
	struct iommufd_ioas *ioas;
	int rc;

	if (copy_from_user(&unmap, arg, minsz))
		return -EFAULT;
	if (unmap.argsz < minsz || unmap.flags & ~supported_flags)
		return -EINVAL;

	ioas = get_compat_ioas(ictx);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);

	if (unmap.flags & VFIO_DMA_UNMAP_FLAG_ALL) {
		/* Per the uAPI, UNMAP_ALL requires zero iova and size */
		if (unmap.iova != 0 || unmap.size != 0) {
			rc = -EINVAL;
			goto err_put;
		}
		rc = iopt_unmap_all(&ioas->iopt, &unmapped);
	} else {
		if (READ_ONCE(ioas->iopt.disable_large_pages)) {
			/*
			 * Create cuts at the start and last of the requested
			 * range. If the start IOVA is 0 then it doesn't need to
			 * be cut.
			 */
			unsigned long iovas[] = { unmap.iova + unmap.size - 1,
						  unmap.iova - 1 };

			rc = iopt_cut_iova(&ioas->iopt, iovas,
					   unmap.iova ? 2 : 1);
			if (rc)
				goto err_put;
		}
		rc = iopt_unmap_iova(&ioas->iopt, unmap.iova, unmap.size,
				     &unmapped);
	}
	/* Report the bytes unmapped even on partial failure, like VFIO */
	unmap.size = unmapped;
	if (copy_to_user(arg, &unmap, minsz))
		rc = -EFAULT;

err_put:
	iommufd_put_object(ictx, &ioas->obj);
	return rc;
}
  225. static int iommufd_vfio_cc_iommu(struct iommufd_ctx *ictx)
  226. {
  227. struct iommufd_hwpt_paging *hwpt_paging;
  228. struct iommufd_ioas *ioas;
  229. int rc = 1;
  230. ioas = get_compat_ioas(ictx);
  231. if (IS_ERR(ioas))
  232. return PTR_ERR(ioas);
  233. mutex_lock(&ioas->mutex);
  234. list_for_each_entry(hwpt_paging, &ioas->hwpt_list, hwpt_item) {
  235. if (!hwpt_paging->enforce_cache_coherency) {
  236. rc = 0;
  237. break;
  238. }
  239. }
  240. mutex_unlock(&ioas->mutex);
  241. iommufd_put_object(ictx, &ioas->obj);
  242. return rc;
  243. }
  244. static int iommufd_vfio_check_extension(struct iommufd_ctx *ictx,
  245. unsigned long type)
  246. {
  247. switch (type) {
  248. case VFIO_TYPE1_IOMMU:
  249. case VFIO_TYPE1v2_IOMMU:
  250. case VFIO_UNMAP_ALL:
  251. return 1;
  252. case VFIO_NOIOMMU_IOMMU:
  253. return IS_ENABLED(CONFIG_VFIO_NOIOMMU);
  254. case VFIO_DMA_CC_IOMMU:
  255. return iommufd_vfio_cc_iommu(ictx);
  256. /*
  257. * This is obsolete, and to be removed from VFIO. It was an incomplete
  258. * idea that got merged.
  259. * https://lore.kernel.org/kvm/0-v1-0093c9b0e345+19-vfio_no_nesting_jgg@nvidia.com/
  260. */
  261. case VFIO_TYPE1_NESTING_IOMMU:
  262. return 0;
  263. /*
  264. * VFIO_DMA_MAP_FLAG_VADDR
  265. * https://lore.kernel.org/kvm/1611939252-7240-1-git-send-email-steven.sistare@oracle.com/
  266. * https://lore.kernel.org/all/Yz777bJZjTyLrHEQ@nvidia.com/
  267. *
  268. * It is hard to see how this could be implemented safely.
  269. */
  270. case VFIO_UPDATE_VADDR:
  271. default:
  272. return 0;
  273. }
  274. }
  275. static int iommufd_vfio_set_iommu(struct iommufd_ctx *ictx, unsigned long type)
  276. {
  277. bool no_iommu_mode = READ_ONCE(ictx->no_iommu_mode);
  278. struct iommufd_ioas *ioas = NULL;
  279. int rc = 0;
  280. /*
  281. * Emulation for NOIOMMU is imperfect in that VFIO blocks almost all
  282. * other ioctls. We let them keep working but they mostly fail since no
  283. * IOAS should exist.
  284. */
  285. if (IS_ENABLED(CONFIG_VFIO_NOIOMMU) && type == VFIO_NOIOMMU_IOMMU &&
  286. no_iommu_mode) {
  287. if (!capable(CAP_SYS_RAWIO))
  288. return -EPERM;
  289. return 0;
  290. }
  291. if ((type != VFIO_TYPE1_IOMMU && type != VFIO_TYPE1v2_IOMMU) ||
  292. no_iommu_mode)
  293. return -EINVAL;
  294. /* VFIO fails the set_iommu if there is no group */
  295. ioas = get_compat_ioas(ictx);
  296. if (IS_ERR(ioas))
  297. return PTR_ERR(ioas);
  298. /*
  299. * The difference between TYPE1 and TYPE1v2 is the ability to unmap in
  300. * the middle of mapped ranges. This is complicated by huge page support
  301. * which creates single large IOPTEs that cannot be split by the iommu
  302. * driver. TYPE1 is very old at this point and likely nothing uses it,
  303. * however it is simple enough to emulate by simply disabling the
  304. * problematic large IOPTEs. Then we can safely unmap within any range.
  305. */
  306. if (type == VFIO_TYPE1_IOMMU)
  307. rc = iopt_disable_large_pages(&ioas->iopt);
  308. iommufd_put_object(ictx, &ioas->obj);
  309. return rc;
  310. }
  311. static unsigned long iommufd_get_pagesizes(struct iommufd_ioas *ioas)
  312. {
  313. struct io_pagetable *iopt = &ioas->iopt;
  314. unsigned long pgsize_bitmap = ULONG_MAX;
  315. struct iommu_domain *domain;
  316. unsigned long index;
  317. down_read(&iopt->domains_rwsem);
  318. xa_for_each(&iopt->domains, index, domain)
  319. pgsize_bitmap &= domain->pgsize_bitmap;
  320. /* See vfio_update_pgsize_bitmap() */
  321. if (pgsize_bitmap & ~PAGE_MASK) {
  322. pgsize_bitmap &= PAGE_MASK;
  323. pgsize_bitmap |= PAGE_SIZE;
  324. }
  325. pgsize_bitmap = max(pgsize_bitmap, ioas->iopt.iova_alignment);
  326. up_read(&iopt->domains_rwsem);
  327. return pgsize_bitmap;
  328. }
/*
 * Fill a VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE capability at @cur.
 *
 * Walks the holes of the reserved interval tree (the usable IOVA ranges),
 * copying each range into the user buffer when it fits, and writes the
 * header last so nr_iovas is final. Returns the total capability size in
 * bytes (even when @avail was too small — the caller uses that to size
 * argsz), or -EFAULT on a failed copy.
 */
static int iommufd_fill_cap_iova(struct iommufd_ioas *ioas,
				 struct vfio_info_cap_header __user *cur,
				 size_t avail)
{
	struct vfio_iommu_type1_info_cap_iova_range __user *ucap_iovas =
		container_of(cur,
			     struct vfio_iommu_type1_info_cap_iova_range __user,
			     header);
	struct vfio_iommu_type1_info_cap_iova_range cap_iovas = {
		.header = {
			.id = VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE,
			.version = 1,
		},
	};
	struct interval_tree_span_iter span;

	interval_tree_for_each_span(&span, &ioas->iopt.reserved_itree, 0,
				    ULONG_MAX) {
		struct vfio_iova_range range;

		/* Only the holes between reserved spans are usable IOVA */
		if (!span.is_hole)
			continue;
		range.start = span.start_hole;
		range.end = span.last_hole;
		/* Copy the entry only if it fits entirely within avail */
		if (avail >= struct_size(&cap_iovas, iova_ranges,
					 cap_iovas.nr_iovas + 1) &&
		    copy_to_user(&ucap_iovas->iova_ranges[cap_iovas.nr_iovas],
				 &range, sizeof(range)))
			return -EFAULT;
		cap_iovas.nr_iovas++;
	}
	/* Header written last so nr_iovas reflects the full count */
	if (avail >= struct_size(&cap_iovas, iova_ranges, cap_iovas.nr_iovas) &&
	    copy_to_user(ucap_iovas, &cap_iovas, sizeof(cap_iovas)))
		return -EFAULT;
	return struct_size(&cap_iovas, iova_ranges, cap_iovas.nr_iovas);
}
  363. static int iommufd_fill_cap_dma_avail(struct iommufd_ioas *ioas,
  364. struct vfio_info_cap_header __user *cur,
  365. size_t avail)
  366. {
  367. struct vfio_iommu_type1_info_dma_avail cap_dma = {
  368. .header = {
  369. .id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL,
  370. .version = 1,
  371. },
  372. /*
  373. * iommufd's limit is based on the cgroup's memory limit.
  374. * Normally vfio would return U16_MAX here, and provide a module
  375. * parameter to adjust it. Since S390 qemu userspace actually
  376. * pays attention and needs a value bigger than U16_MAX return
  377. * U32_MAX.
  378. */
  379. .avail = U32_MAX,
  380. };
  381. if (avail >= sizeof(cap_dma) &&
  382. copy_to_user(cur, &cap_dma, sizeof(cap_dma)))
  383. return -EFAULT;
  384. return sizeof(cap_dma);
  385. }
/*
 * VFIO_IOMMU_GET_INFO compatibility: fill struct vfio_iommu_type1_info and
 * a chained capability list (DMA_AVAIL, IOVA_RANGE) into the user buffer.
 * Capabilities are chained by back-patching the previous cap's 'next'
 * offset once the following cap's position is known.
 */
static int iommufd_vfio_iommu_get_info(struct iommufd_ctx *ictx,
				       void __user *arg)
{
	typedef int (*fill_cap_fn)(struct iommufd_ioas *ioas,
				   struct vfio_info_cap_header __user *cur,
				   size_t avail);
	static const fill_cap_fn fill_fns[] = {
		iommufd_fill_cap_dma_avail,
		iommufd_fill_cap_iova,
	};
	size_t minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
	struct vfio_info_cap_header __user *last_cap = NULL;
	struct vfio_iommu_type1_info info = {};
	struct iommufd_ioas *ioas;
	size_t total_cap_size;
	int rc;
	int i;

	if (copy_from_user(&info, arg, minsz))
		return -EFAULT;
	if (info.argsz < minsz)
		return -EINVAL;
	/* Never copy back more than the kernel's view of the struct */
	minsz = min_t(size_t, info.argsz, sizeof(info));

	ioas = get_compat_ioas(ictx);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);

	info.flags = VFIO_IOMMU_INFO_PGSIZES;
	info.iova_pgsizes = iommufd_get_pagesizes(ioas);
	info.cap_offset = 0;

	/* Hold iova_rwsem so the IOVA ranges are stable while reported */
	down_read(&ioas->iopt.iova_rwsem);
	total_cap_size = sizeof(info);
	for (i = 0; i != ARRAY_SIZE(fill_fns); i++) {
		int cap_size;

		/*
		 * When there is no room left, call with avail=0 so the fn
		 * still returns the size needed for argsz accounting.
		 */
		if (info.argsz > total_cap_size)
			cap_size = fill_fns[i](ioas, arg + total_cap_size,
					       info.argsz - total_cap_size);
		else
			cap_size = fill_fns[i](ioas, NULL, 0);
		if (cap_size < 0) {
			rc = cap_size;
			goto out_put;
		}
		/* Caps must be u64 aligned per the VFIO cap chain ABI */
		cap_size = ALIGN(cap_size, sizeof(u64));

		/* Back-patch the previous cap's 'next' to point at this one */
		if (last_cap && info.argsz >= total_cap_size &&
		    put_user(total_cap_size, &last_cap->next)) {
			rc = -EFAULT;
			goto out_put;
		}
		last_cap = arg + total_cap_size;
		total_cap_size += cap_size;
	}

	/*
	 * If the user did not provide enough space then only some caps are
	 * returned and the argsz will be updated to the correct amount to get
	 * all caps.
	 */
	if (info.argsz >= total_cap_size)
		info.cap_offset = sizeof(info);
	info.argsz = total_cap_size;
	info.flags |= VFIO_IOMMU_INFO_CAPS;
	if (copy_to_user(arg, &info, minsz)) {
		rc = -EFAULT;
		goto out_put;
	}
	rc = 0;

out_put:
	up_read(&ioas->iopt.iova_rwsem);
	iommufd_put_object(ictx, &ioas->obj);
	return rc;
}
  455. int iommufd_vfio_ioctl(struct iommufd_ctx *ictx, unsigned int cmd,
  456. unsigned long arg)
  457. {
  458. void __user *uarg = (void __user *)arg;
  459. switch (cmd) {
  460. case VFIO_GET_API_VERSION:
  461. return VFIO_API_VERSION;
  462. case VFIO_SET_IOMMU:
  463. return iommufd_vfio_set_iommu(ictx, arg);
  464. case VFIO_CHECK_EXTENSION:
  465. return iommufd_vfio_check_extension(ictx, arg);
  466. case VFIO_IOMMU_GET_INFO:
  467. return iommufd_vfio_iommu_get_info(ictx, uarg);
  468. case VFIO_IOMMU_MAP_DMA:
  469. return iommufd_vfio_map_dma(ictx, cmd, uarg);
  470. case VFIO_IOMMU_UNMAP_DMA:
  471. return iommufd_vfio_unmap_dma(ictx, cmd, uarg);
  472. case VFIO_IOMMU_DIRTY_PAGES:
  473. default:
  474. return -ENOIOCTLCMD;
  475. }
  476. return -ENOIOCTLCMD;
  477. }