/*
 * This file implements the DMA operations for NVLink devices. The NPU
 * devices all point to the same iommu table as the parent PCI device.
 *
 * Copyright Alistair Popple, IBM Corporation 2015.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 */

#include <linux/slab.h>
#include <linux/mmu_notifier.h>
#include <linux/mmu_context.h>
#include <linux/of.h>
#include <linux/export.h>
#include <linux/pci.h>
#include <linux/memblock.h>
#include <linux/iommu.h>
#include <linux/debugfs.h>

#include <asm/debugfs.h>
#include <asm/tlb.h>
#include <asm/powernv.h>
#include <asm/reg.h>
#include <asm/opal.h>
#include <asm/io.h>
#include <asm/iommu.h>
#include <asm/pnv-pci.h>
#include <asm/msi_bitmap.h>
#include <asm/opal.h>

#include "powernv.h"
#include "pci.h"

#define npu_to_phb(x) container_of(x, struct pnv_phb, npu)

/*
 * spinlock to protect initialisation of an npu_context for a particular
 * mm_struct.
 */
static DEFINE_SPINLOCK(npu_context_lock);

/*
 * When an address shootdown range exceeds this threshold we invalidate the
 * entire TLB on the GPU for the given PID rather than each specific address in
 * the range.
 */
static uint64_t atsd_threshold = 2 * 1024 * 1024;
static struct dentry *atsd_threshold_dentry;
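
/*
 * The threshold above is exposed as a writable debugfs x64 file (created in
 * pnv_npu2_init() below as "atsd_threshold" under powerpc_debugfs_root), so
 * it can be tuned at runtime; the exact path depends on where debugfs is
 * mounted, typically /sys/kernel/debug/powerpc/atsd_threshold.
 */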

/*
 * Other types of TCE cache invalidation are not functional in the
 * hardware.
 */
static struct pci_dev *get_pci_dev(struct device_node *dn)
{
	struct pci_dn *pdn = PCI_DN(dn);
	struct pci_dev *pdev;

	pdev = pci_get_domain_bus_and_slot(pci_domain_nr(pdn->phb->bus),
					   pdn->busno, pdn->devfn);

	/*
	 * pci_get_domain_bus_and_slot() increased the reference count of
	 * the PCI device, but callers don't actually need that reference
	 * as the PE already holds one to the device. Since callers aren't
	 * aware of the reference count change, call pci_dev_put() now to
	 * avoid leaks.
	 */
	if (pdev)
		pci_dev_put(pdev);

	return pdev;
}

/* Given a NPU device get the associated PCI device. */
struct pci_dev *pnv_pci_get_gpu_dev(struct pci_dev *npdev)
{
	struct device_node *dn;
	struct pci_dev *gpdev;

	if (WARN_ON(!npdev))
		return NULL;

	if (WARN_ON(!npdev->dev.of_node))
		return NULL;

	/* Get associated PCI device */
	dn = of_parse_phandle(npdev->dev.of_node, "ibm,gpu", 0);
	if (!dn)
		return NULL;

	gpdev = get_pci_dev(dn);
	of_node_put(dn);

	return gpdev;
}
EXPORT_SYMBOL(pnv_pci_get_gpu_dev);

/* Given the real PCI device get a linked NPU device. */
struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index)
{
	struct device_node *dn;
	struct pci_dev *npdev;

	if (WARN_ON(!gpdev))
		return NULL;

	/* Not all PCI devices have device-tree nodes */
	if (!gpdev->dev.of_node)
		return NULL;

	/* Get associated NPU device */
	dn = of_parse_phandle(gpdev->dev.of_node, "ibm,npu", index);
	if (!dn)
		return NULL;

	npdev = get_pci_dev(dn);
	of_node_put(dn);

	return npdev;
}
EXPORT_SYMBOL(pnv_pci_get_npu_dev);
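
/*
 * A minimal usage sketch: a caller can walk every NVLink device attached to
 * a GPU by bumping the index until pnv_pci_get_npu_dev() returns NULL, the
 * same pattern pnv_npu_try_dma_set_bypass() uses below. The function name
 * and the commented-out per-link hook are purely illustrative.
 */
static void __maybe_unused example_for_each_npu_dev(struct pci_dev *gpdev)
{
	struct pci_dev *npdev;
	int i;

	for (i = 0; (npdev = pnv_pci_get_npu_dev(gpdev, i)) != NULL; i++) {
		/* e.g. configure_link(npdev); */
	}
}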

#define NPU_DMA_OP_UNSUPPORTED()	\
	dev_err_once(dev, "%s operation unsupported for NVLink devices\n", \
		__func__)

static void *dma_npu_alloc(struct device *dev, size_t size,
			   dma_addr_t *dma_handle, gfp_t flag,
			   unsigned long attrs)
{
	NPU_DMA_OP_UNSUPPORTED();
	return NULL;
}

static void dma_npu_free(struct device *dev, size_t size,
			 void *vaddr, dma_addr_t dma_handle,
			 unsigned long attrs)
{
	NPU_DMA_OP_UNSUPPORTED();
}

static dma_addr_t dma_npu_map_page(struct device *dev, struct page *page,
				   unsigned long offset, size_t size,
				   enum dma_data_direction direction,
				   unsigned long attrs)
{
	NPU_DMA_OP_UNSUPPORTED();
	return 0;
}

static int dma_npu_map_sg(struct device *dev, struct scatterlist *sglist,
			  int nelems, enum dma_data_direction direction,
			  unsigned long attrs)
{
	NPU_DMA_OP_UNSUPPORTED();
	return 0;
}

static int dma_npu_dma_supported(struct device *dev, u64 mask)
{
	NPU_DMA_OP_UNSUPPORTED();
	return 0;
}

static u64 dma_npu_get_required_mask(struct device *dev)
{
	NPU_DMA_OP_UNSUPPORTED();
	return 0;
}

static const struct dma_map_ops dma_npu_ops = {
	.map_page		= dma_npu_map_page,
	.map_sg			= dma_npu_map_sg,
	.alloc			= dma_npu_alloc,
	.free			= dma_npu_free,
	.dma_supported		= dma_npu_dma_supported,
	.get_required_mask	= dma_npu_get_required_mask,
};

/*
 * Returns the PE associated with the PCI device of the given
 * NPU. Returns the linked pci device if gpdev != NULL.
 */
static struct pnv_ioda_pe *get_gpu_pci_dev_and_pe(struct pnv_ioda_pe *npe,
						  struct pci_dev **gpdev)
{
	struct pnv_phb *phb;
	struct pci_controller *hose;
	struct pci_dev *pdev;
	struct pnv_ioda_pe *pe;
	struct pci_dn *pdn;

	pdev = pnv_pci_get_gpu_dev(npe->pdev);
	if (!pdev)
		return NULL;

	pdn = pci_get_pdn(pdev);
	if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
		return NULL;

	hose = pci_bus_to_host(pdev->bus);
	phb = hose->private_data;
	pe = &phb->ioda.pe_array[pdn->pe_number];

	if (gpdev)
		*gpdev = pdev;

	return pe;
}

long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
			struct iommu_table *tbl)
{
	struct pnv_phb *phb = npe->phb;
	int64_t rc;
	const unsigned long size = tbl->it_indirect_levels ?
		tbl->it_level_size : tbl->it_size;
	const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
	const __u64 win_size = tbl->it_size << tbl->it_page_shift;

	pe_info(npe, "Setting up window %llx..%llx pg=%lx\n",
			start_addr, start_addr + win_size - 1,
			IOMMU_PAGE_SIZE(tbl));

	rc = opal_pci_map_pe_dma_window(phb->opal_id,
			npe->pe_number,
			npe->pe_number,
			tbl->it_indirect_levels + 1,
			__pa(tbl->it_base),
			size << 3,
			IOMMU_PAGE_SIZE(tbl));
	if (rc) {
		pe_err(npe, "Failed to configure TCE table, err %lld\n", rc);
		return rc;
	}
	pnv_pci_ioda2_tce_invalidate_entire(phb, false);

	/* Add the table to the list so its TCE cache will get invalidated */
	pnv_pci_link_table_and_group(phb->hose->node, num,
			tbl, &npe->table_group);

	return 0;
}
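
/*
 * Worked example of the window geometry above, assuming a single-level TCE
 * table with 64K IOMMU pages (it_page_shift = 16), it_offset = 0 and
 * it_size = 16384 entries: start_addr = 0 << 16 = 0 and
 * win_size = 16384 << 16 = 1GB, so the pe_info() line reports the window as
 * 0x0..0x3fffffff. The helper below just restates those two shifts and is
 * illustrative only.
 */
static inline u64 __maybe_unused example_npu_win_end(struct iommu_table *tbl)
{
	u64 start_addr = tbl->it_offset << tbl->it_page_shift;
	u64 win_size = tbl->it_size << tbl->it_page_shift;

	return start_addr + win_size - 1;	/* last DMA address covered */
}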

long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num)
{
	struct pnv_phb *phb = npe->phb;
	int64_t rc;

	pe_info(npe, "Removing DMA window\n");

	rc = opal_pci_map_pe_dma_window(phb->opal_id, npe->pe_number,
			npe->pe_number,
			0/* levels */, 0/* table address */,
			0/* table size */, 0/* page size */);
	if (rc) {
		pe_err(npe, "Unmapping failed, ret = %lld\n", rc);
		return rc;
	}
	pnv_pci_ioda2_tce_invalidate_entire(phb, false);

	pnv_pci_unlink_table_and_group(npe->table_group.tables[num],
			&npe->table_group);

	return 0;
}

/*
 * Enables 32 bit DMA on NPU.
 */
static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe)
{
	struct pci_dev *gpdev;
	struct pnv_ioda_pe *gpe;
	int64_t rc;

	/*
	 * Find the associated PCI devices and get the dma window
	 * information from there.
	 */
	if (!npe->pdev || !(npe->flags & PNV_IODA_PE_DEV))
		return;

	gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
	if (!gpe)
		return;

	rc = pnv_npu_set_window(npe, 0, gpe->table_group.tables[0]);

	/*
	 * We don't initialise npu_pe->tce32_table as we always use
	 * dma_npu_ops which are nops.
	 */
	set_dma_ops(&npe->pdev->dev, &dma_npu_ops);
}

/*
 * Enables bypass mode on the NPU. The NPU only supports one
 * window per link, so bypass needs to be explicitly enabled or
 * disabled. Unlike a PHB3, bypass and non-bypass modes can't be
 * active at the same time.
 */
static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe)
{
	struct pnv_phb *phb = npe->phb;
	int64_t rc = 0;
	phys_addr_t top = memblock_end_of_DRAM();

	if (phb->type != PNV_PHB_NPU_NVLINK || !npe->pdev)
		return -EINVAL;

	rc = pnv_npu_unset_window(npe, 0);
	if (rc != OPAL_SUCCESS)
		return rc;

	/* Enable the bypass window */
	top = roundup_pow_of_two(top);
	dev_info(&npe->pdev->dev, "Enabling bypass for PE %x\n",
			npe->pe_number);
	rc = opal_pci_map_pe_dma_window_real(phb->opal_id,
			npe->pe_number, npe->pe_number,
			0 /* bypass base */, top);

	if (rc == OPAL_SUCCESS)
		pnv_pci_ioda2_tce_invalidate_entire(phb, false);

	return rc;
}

void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass)
{
	int i;
	struct pnv_phb *phb;
	struct pci_dn *pdn;
	struct pnv_ioda_pe *npe;
	struct pci_dev *npdev;

	for (i = 0; ; ++i) {
		npdev = pnv_pci_get_npu_dev(gpdev, i);

		if (!npdev)
			break;

		pdn = pci_get_pdn(npdev);
		if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
			return;

		phb = pci_bus_to_host(npdev->bus)->private_data;

		/* We only do bypass if it's enabled on the linked device */
		npe = &phb->ioda.pe_array[pdn->pe_number];

		if (bypass) {
			dev_info(&npdev->dev,
					"Using 64-bit DMA iommu bypass\n");
			pnv_npu_dma_set_bypass(npe);
		} else {
			dev_info(&npdev->dev, "Using 32-bit DMA via iommu\n");
			pnv_npu_dma_set_32(npe);
		}
	}
}

/* Switch ownership from platform code to external user (e.g. VFIO) */
void pnv_npu_take_ownership(struct pnv_ioda_pe *npe)
{
	struct pnv_phb *phb = npe->phb;
	int64_t rc;

	/*
	 * Note: NPU has just a single TVE in the hardware which means that
	 * while used by the kernel, it can have either a 32bit window or
	 * DMA bypass but never both. So we deconfigure the 32bit window only
	 * if it was enabled at the moment of ownership change.
	 */
	if (npe->table_group.tables[0]) {
		pnv_npu_unset_window(npe, 0);
		return;
	}

	/* Disable bypass */
	rc = opal_pci_map_pe_dma_window_real(phb->opal_id,
			npe->pe_number, npe->pe_number,
			0 /* bypass base */, 0);
	if (rc) {
		pe_err(npe, "Failed to disable bypass, err %lld\n", rc);
		return;
	}
	pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false);
}

struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe)
{
	struct pnv_phb *phb = npe->phb;
	struct pci_bus *pbus = phb->hose->bus;
	struct pci_dev *npdev, *gpdev = NULL, *gptmp;
	struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);

	if (!gpe || !gpdev)
		return NULL;

	list_for_each_entry(npdev, &pbus->devices, bus_list) {
		gptmp = pnv_pci_get_gpu_dev(npdev);

		if (gptmp != gpdev)
			continue;

		pe_info(gpe, "Attached NPU %s\n", dev_name(&npdev->dev));
		iommu_group_add_device(gpe->table_group.group, &npdev->dev);
	}

	return gpe;
}

/* Maximum number of nvlinks per npu */
#define NV_MAX_LINKS 6

/* Maximum index of npu2 hosts in the system. Always < NV_MAX_NPUS */
static int max_npu2_index;

struct npu_context {
	struct mm_struct *mm;
	struct pci_dev *npdev[NV_MAX_NPUS][NV_MAX_LINKS];
	struct mmu_notifier mn;
	struct kref kref;
	bool nmmu_flush;

	/* Callback to stop translation requests on a given GPU */
	void (*release_cb)(struct npu_context *context, void *priv);

	/*
	 * Private pointer passed to the above callback for usage by
	 * device drivers.
	 */
	void *priv;
};

struct mmio_atsd_reg {
	struct npu *npu;
	int reg;
};

/*
 * Find a free MMIO ATSD register and mark it in use. Return -ENOSPC
 * if none are available.
 */
static int get_mmio_atsd_reg(struct npu *npu)
{
	int i;

	for (i = 0; i < npu->mmio_atsd_count; i++) {
		if (!test_bit(i, &npu->mmio_atsd_usage))
			if (!test_and_set_bit_lock(i, &npu->mmio_atsd_usage))
				return i;
	}

	return -ENOSPC;
}

static void put_mmio_atsd_reg(struct npu *npu, int reg)
{
	clear_bit_unlock(reg, &npu->mmio_atsd_usage);
}

/* MMIO ATSD register offsets */
#define XTS_ATSD_AVA  1
#define XTS_ATSD_STAT 2

static void mmio_launch_invalidate(struct mmio_atsd_reg *mmio_atsd_reg,
				unsigned long launch, unsigned long va)
{
	struct npu *npu = mmio_atsd_reg->npu;
	int reg = mmio_atsd_reg->reg;

	__raw_writeq_be(va, npu->mmio_atsd_regs[reg] + XTS_ATSD_AVA);
	eieio();
	__raw_writeq_be(launch, npu->mmio_atsd_regs[reg]);
}

static void mmio_invalidate_pid(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
				unsigned long pid, bool flush)
{
	int i;
	unsigned long launch;

	for (i = 0; i <= max_npu2_index; i++) {
		if (mmio_atsd_reg[i].reg < 0)
			continue;

		/* IS set to invalidate matching PID */
		launch = PPC_BIT(12);

		/* PRS set to process-scoped */
		launch |= PPC_BIT(13);

		/* AP */
		launch |= (u64)
			mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);

		/* PID */
		launch |= pid << PPC_BITLSHIFT(38);

		/* No flush */
		launch |= !flush << PPC_BITLSHIFT(39);

		/* Invalidating the entire process doesn't use a va */
		mmio_launch_invalidate(&mmio_atsd_reg[i], launch, 0);
	}
}

static void mmio_invalidate_va(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
			unsigned long va, unsigned long pid, bool flush)
{
	int i;
	unsigned long launch;

	for (i = 0; i <= max_npu2_index; i++) {
		if (mmio_atsd_reg[i].reg < 0)
			continue;

		/* IS set to invalidate target VA */
		launch = 0;

		/* PRS set to process scoped */
		launch |= PPC_BIT(13);

		/* AP */
		launch |= (u64)
			mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);

		/* PID */
		launch |= pid << PPC_BITLSHIFT(38);

		/* No flush */
		launch |= !flush << PPC_BITLSHIFT(39);

		mmio_launch_invalidate(&mmio_atsd_reg[i], launch, va);
	}
}
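
/*
 * The two helpers above open-code the same launch-word encoding; the sketch
 * below consolidates it purely for illustration, under the assumption that
 * the field layout is as commented above: bit 12 (IS) selects whole-PID
 * invalidation, bit 13 (PRS) selects process scope, the AP field ends at
 * bit 17, the PID field ends at bit 38 and "no flush" is bit 39.
 */
static unsigned long __maybe_unused
example_build_atsd_launch(unsigned long pid, bool whole_pid, bool flush)
{
	unsigned long launch = whole_pid ? PPC_BIT(12) : 0;	/* IS */

	launch |= PPC_BIT(13);					/* PRS */
	launch |= (u64)mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
	launch |= pid << PPC_BITLSHIFT(38);			/* PID */
	launch |= !flush << PPC_BITLSHIFT(39);			/* no flush */

	return launch;
}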

#define mn_to_npu_context(x) container_of(x, struct npu_context, mn)

static void mmio_invalidate_wait(
	struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
{
	struct npu *npu;
	int i, reg;

	/* Wait for all invalidations to complete */
	for (i = 0; i <= max_npu2_index; i++) {
		if (mmio_atsd_reg[i].reg < 0)
			continue;

		/* Wait for completion */
		npu = mmio_atsd_reg[i].npu;
		reg = mmio_atsd_reg[i].reg;
		while (__raw_readq(npu->mmio_atsd_regs[reg] + XTS_ATSD_STAT))
			cpu_relax();
	}
}

/*
 * Acquires all the address translation shootdown (ATSD) registers required to
 * launch an ATSD on all links this npu_context is active on.
 */
static void acquire_atsd_reg(struct npu_context *npu_context,
			struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
{
	int i, j;
	struct npu *npu;
	struct pci_dev *npdev;
	struct pnv_phb *nphb;

	for (i = 0; i <= max_npu2_index; i++) {
		mmio_atsd_reg[i].reg = -1;
		for (j = 0; j < NV_MAX_LINKS; j++) {
			/*
			 * There are no ordering requirements with respect to
			 * the setup of struct npu_context, but to ensure
			 * consistent behaviour we need to ensure npdev[][] is
			 * only read once.
			 */
			npdev = READ_ONCE(npu_context->npdev[i][j]);
			if (!npdev)
				continue;

			nphb = pci_bus_to_host(npdev->bus)->private_data;
			npu = &nphb->npu;
			mmio_atsd_reg[i].npu = npu;
			mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu);
			while (mmio_atsd_reg[i].reg < 0) {
				mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu);
				cpu_relax();
			}
			break;
		}
	}
}

/*
 * Release previously acquired ATSD registers. To avoid deadlocks the registers
 * must be released in the same order they were acquired above in
 * acquire_atsd_reg.
 */
static void release_atsd_reg(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
{
	int i;

	for (i = 0; i <= max_npu2_index; i++) {
		/*
		 * We can't rely on npu_context->npdev[][] being the same here
		 * as when acquire_atsd_reg() was called, hence we use the
		 * values stored in mmio_atsd_reg during the acquire phase
		 * rather than re-reading npdev[][].
		 */
		if (mmio_atsd_reg[i].reg < 0)
			continue;

		put_mmio_atsd_reg(mmio_atsd_reg[i].npu, mmio_atsd_reg[i].reg);
	}
}

/*
 * Invalidate either a single address or an entire PID depending on
 * the value of va.
 */
static void mmio_invalidate(struct npu_context *npu_context, int va,
			unsigned long address, bool flush)
{
	struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS];
	unsigned long pid = npu_context->mm->context.id;

	if (npu_context->nmmu_flush)
		/*
		 * Unfortunately the nest mmu does not support flushing specific
		 * addresses so we have to flush the whole mm once before
		 * shooting down the GPU translation.
		 */
		flush_all_mm(npu_context->mm);

	/*
	 * Loop over all the NPUs this process is active on and launch
	 * an invalidate.
	 */
	acquire_atsd_reg(npu_context, mmio_atsd_reg);
	if (va)
		mmio_invalidate_va(mmio_atsd_reg, address, pid, flush);
	else
		mmio_invalidate_pid(mmio_atsd_reg, pid, flush);

	mmio_invalidate_wait(mmio_atsd_reg);
	if (flush) {
		/*
		 * The GPU requires two flush ATSDs to ensure all entries have
		 * been flushed. We use PID 0 as it will never be used for a
		 * process on the GPU.
		 */
		mmio_invalidate_pid(mmio_atsd_reg, 0, true);
		mmio_invalidate_wait(mmio_atsd_reg);
		mmio_invalidate_pid(mmio_atsd_reg, 0, true);
		mmio_invalidate_wait(mmio_atsd_reg);
	}
	release_atsd_reg(mmio_atsd_reg);
}

static void pnv_npu2_mn_release(struct mmu_notifier *mn,
				struct mm_struct *mm)
{
	struct npu_context *npu_context = mn_to_npu_context(mn);

	/* Call into device driver to stop requests to the NMMU */
	if (npu_context->release_cb)
		npu_context->release_cb(npu_context, npu_context->priv);

	/*
	 * There should be no more translation requests for this PID, but we
	 * need to ensure any entries for it are removed from the TLB.
	 */
	mmio_invalidate(npu_context, 0, 0, true);
}

static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn,
				struct mm_struct *mm,
				unsigned long address,
				pte_t pte)
{
	struct npu_context *npu_context = mn_to_npu_context(mn);

	mmio_invalidate(npu_context, 1, address, true);
}

static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
					struct mm_struct *mm,
					unsigned long start, unsigned long end)
{
	struct npu_context *npu_context = mn_to_npu_context(mn);
	unsigned long address;

	if (end - start > atsd_threshold) {
		/*
		 * Just invalidate the entire PID if the address range is too
		 * large.
		 */
		mmio_invalidate(npu_context, 0, 0, true);
	} else {
		for (address = start; address < end; address += PAGE_SIZE)
			mmio_invalidate(npu_context, 1, address, false);

		/* Do the flush only on the final address == end */
		mmio_invalidate(npu_context, 1, address, true);
	}
}
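
/*
 * Worked example of the threshold above, assuming a 64K PAGE_SIZE and the
 * default 2MB atsd_threshold: a 1MB shootdown issues 16 per-address ATSDs
 * without the flush bit plus one final flushing ATSD at the end address,
 * while a 4MB shootdown exceeds the threshold and collapses into a single
 * whole-PID invalidation.
 */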

static const struct mmu_notifier_ops nv_nmmu_notifier_ops = {
	.release = pnv_npu2_mn_release,
	.change_pte = pnv_npu2_mn_change_pte,
	.invalidate_range = pnv_npu2_mn_invalidate_range,
};

/*
 * Call into OPAL to setup the nmmu context for the current task in
 * the NPU. This must be called to setup the context tables before the
 * GPU issues ATRs. pdev should be a pointer to the PCIe GPU device.
 *
 * A release callback should be registered to allow a device driver to
 * be notified that it should not launch any new translation requests
 * as the final TLB invalidate is about to occur.
 *
 * Returns an error if no contexts are currently available, or an
 * npu_context which should be passed to pnv_npu2_handle_fault().
 *
 * mmap_sem must be held in write mode and this function must not be called
 * from interrupt context.
 */
struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
			unsigned long flags,
			void (*cb)(struct npu_context *, void *),
			void *priv)
{
	int rc;
	u32 nvlink_index;
	struct device_node *nvlink_dn;
	struct mm_struct *mm = current->mm;
	struct pnv_phb *nphb;
	struct npu *npu;
	struct npu_context *npu_context;

	/*
	 * At present we don't support GPUs connected to multiple NPUs and I'm
	 * not sure the hardware does either.
	 */
	struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);

	if (!firmware_has_feature(FW_FEATURE_OPAL))
		return ERR_PTR(-ENODEV);

	if (!npdev)
		/* No nvlink associated with this GPU device */
		return ERR_PTR(-ENODEV);

	nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
	if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
							&nvlink_index)))
		return ERR_PTR(-ENODEV);

	if (!mm || mm->context.id == 0) {
		/*
		 * Kernel thread contexts are not supported and context id 0 is
		 * reserved on the GPU.
		 */
		return ERR_PTR(-EINVAL);
	}

	nphb = pci_bus_to_host(npdev->bus)->private_data;
	npu = &nphb->npu;

	/*
	 * Setup the NPU context table for a particular GPU. These need to be
	 * per-GPU as we need the tables to filter ATSDs when there are no
	 * active contexts on a particular GPU. It is safe for these to be
	 * called concurrently with destroy as the OPAL call takes appropriate
	 * locks and refcounts on init/destroy.
	 */
	rc = opal_npu_init_context(nphb->opal_id, mm->context.id, flags,
				PCI_DEVID(gpdev->bus->number, gpdev->devfn));
	if (rc < 0)
		return ERR_PTR(-ENOSPC);

	/*
	 * We store the npu pci device so we can more easily get at the
	 * associated npus.
	 */
	spin_lock(&npu_context_lock);
	npu_context = mm->context.npu_context;
	if (npu_context) {
		if (npu_context->release_cb != cb ||
			npu_context->priv != priv) {
			spin_unlock(&npu_context_lock);
			opal_npu_destroy_context(nphb->opal_id, mm->context.id,
						PCI_DEVID(gpdev->bus->number,
							gpdev->devfn));
			return ERR_PTR(-EINVAL);
		}

		WARN_ON(!kref_get_unless_zero(&npu_context->kref));
	}
	spin_unlock(&npu_context_lock);

	if (!npu_context) {
		/*
		 * We can set up these fields without holding the
		 * npu_context_lock as the npu_context hasn't been returned to
		 * the caller meaning it can't be destroyed. Parallel allocation
		 * is protected against by mmap_sem.
		 */
		rc = -ENOMEM;
		npu_context = kzalloc(sizeof(struct npu_context), GFP_KERNEL);
		if (npu_context) {
			kref_init(&npu_context->kref);
			npu_context->mm = mm;
			npu_context->mn.ops = &nv_nmmu_notifier_ops;
			rc = __mmu_notifier_register(&npu_context->mn, mm);
		}

		if (rc) {
			kfree(npu_context);
			opal_npu_destroy_context(nphb->opal_id, mm->context.id,
					PCI_DEVID(gpdev->bus->number,
						gpdev->devfn));
			return ERR_PTR(rc);
		}

		mm->context.npu_context = npu_context;
	}

	npu_context->release_cb = cb;
	npu_context->priv = priv;

	/*
	 * npdev is a pci_dev pointer setup by the PCI code. We assign it to
	 * npdev[][] to indicate to the mmu notifiers that an invalidation
	 * should also be sent over this nvlink. The notifiers don't use any
	 * other fields in npu_context, so we just need to ensure that when
	 * they dereference npu_context->npdev[][] it is either a valid pointer
	 * or NULL.
	 */
	WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], npdev);

	if (!nphb->npu.nmmu_flush) {
		/*
		 * If we're not explicitly flushing ourselves we need to mark
		 * the thread for global flushes.
		 */
		npu_context->nmmu_flush = false;
		mm_context_add_copro(mm);
	} else
		npu_context->nmmu_flush = true;

	return npu_context;
}
EXPORT_SYMBOL(pnv_npu2_init_context);

static void pnv_npu2_release_context(struct kref *kref)
{
	struct npu_context *npu_context =
		container_of(kref, struct npu_context, kref);

	if (!npu_context->nmmu_flush)
		mm_context_remove_copro(npu_context->mm);

	npu_context->mm->context.npu_context = NULL;
}

/*
 * Destroy a context on the given GPU. May free the npu_context if it is no
 * longer active on any GPUs. Must not be called from interrupt context.
 */
void pnv_npu2_destroy_context(struct npu_context *npu_context,
			struct pci_dev *gpdev)
{
	int removed;
	struct pnv_phb *nphb;
	struct npu *npu;
	struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
	struct device_node *nvlink_dn;
	u32 nvlink_index;

	if (WARN_ON(!npdev))
		return;

	if (!firmware_has_feature(FW_FEATURE_OPAL))
		return;

	nphb = pci_bus_to_host(npdev->bus)->private_data;
	npu = &nphb->npu;
	nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
	if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
							&nvlink_index)))
		return;
	WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], NULL);
	opal_npu_destroy_context(nphb->opal_id, npu_context->mm->context.id,
				PCI_DEVID(gpdev->bus->number, gpdev->devfn));
	spin_lock(&npu_context_lock);
	removed = kref_put(&npu_context->kref, pnv_npu2_release_context);
	spin_unlock(&npu_context_lock);

	/*
	 * We need to do this outside of pnv_npu2_release_context so that it is
	 * outside the spinlock as mmu_notifier_destroy uses SRCU.
	 */
	if (removed) {
		mmu_notifier_unregister(&npu_context->mn,
					npu_context->mm);

		kfree(npu_context);
	}
}
EXPORT_SYMBOL(pnv_npu2_destroy_context);

/*
 * Assumes mmap_sem is held for the context's associated mm.
 */
int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea,
			unsigned long *flags, unsigned long *status, int count)
{
	u64 rc = 0, result = 0;
	int i, is_write;
	struct page *page[1];

	/* mmap_sem should be held so the mm_struct must be present */
	struct mm_struct *mm = context->mm;

	if (!firmware_has_feature(FW_FEATURE_OPAL))
		return -ENODEV;

	WARN_ON(!rwsem_is_locked(&mm->mmap_sem));

	for (i = 0; i < count; i++) {
		is_write = flags[i] & NPU2_WRITE;
		rc = get_user_pages_remote(NULL, mm, ea[i], 1,
					is_write ? FOLL_WRITE : 0,
					page, NULL, NULL);

		/*
		 * To support virtualised environments we will have to do an
		 * access to the page to ensure it gets faulted into the
		 * hypervisor. For the moment virtualisation is not supported
		 * in other areas so leave the access out.
		 */
		if (rc != 1) {
			status[i] = rc;
			result = -EFAULT;
			continue;
		}

		status[i] = 0;
		put_page(page[0]);
	}

	return result;
}
EXPORT_SYMBOL(pnv_npu2_handle_fault);
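
/*
 * A minimal sketch of how a GPU driver might pair the exported context API,
 * assuming mm->mmap_sem is held in write mode around pnv_npu2_init_context()
 * as required above. The callback body, the flags value of 0 and the
 * "example_" names are hypothetical.
 */
static void example_stop_translations(struct npu_context *context, void *priv)
{
	/* Quiesce the GPU so it launches no new translation requests. */
}

static int __maybe_unused example_bind_mm_to_gpu(struct pci_dev *gpdev)
{
	struct npu_context *context;

	context = pnv_npu2_init_context(gpdev, 0 /* flags */,
					example_stop_translations, NULL);
	if (IS_ERR(context))
		return PTR_ERR(context);

	/*
	 * Translation faults reported by the device would be resolved here
	 * with pnv_npu2_handle_fault(context, ea, flags, status, count).
	 */

	pnv_npu2_destroy_context(context, gpdev);
	return 0;
}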

int pnv_npu2_init(struct pnv_phb *phb)
{
	unsigned int i;
	u64 mmio_atsd;
	struct device_node *dn;
	struct pci_dev *gpdev;
	static int npu_index;
	uint64_t rc = 0;

	if (!atsd_threshold_dentry) {
		atsd_threshold_dentry = debugfs_create_x64("atsd_threshold",
				0600, powerpc_debugfs_root, &atsd_threshold);
	}

	phb->npu.nmmu_flush =
		of_property_read_bool(phb->hose->dn, "ibm,nmmu-flush");
	for_each_child_of_node(phb->hose->dn, dn) {
		gpdev = pnv_pci_get_gpu_dev(get_pci_dev(dn));
		if (gpdev) {
			rc = opal_npu_map_lpar(phb->opal_id,
				PCI_DEVID(gpdev->bus->number, gpdev->devfn),
				0, 0);
			if (rc)
				dev_err(&gpdev->dev,
					"Error %lld mapping device to LPAR\n",
					rc);
		}
	}

	for (i = 0; !of_property_read_u64_index(phb->hose->dn, "ibm,mmio-atsd",
							i, &mmio_atsd); i++)
		phb->npu.mmio_atsd_regs[i] = ioremap(mmio_atsd, 32);

	pr_info("NPU%lld: Found %d MMIO ATSD registers\n", phb->opal_id, i);
	phb->npu.mmio_atsd_count = i;
	phb->npu.mmio_atsd_usage = 0;
	npu_index++;
	if (WARN_ON(npu_index >= NV_MAX_NPUS))
		return -ENOSPC;
	max_npu2_index = npu_index;
	phb->npu.index = npu_index;

	return 0;
}