// SPDX-License-Identifier: GPL-2.0
/*
 * This is a module to test the HMM (Heterogeneous Memory Management)
 * mirror and zone device private memory migration APIs of the kernel.
 * Userspace programs can register with the driver to mirror their own address
 * space and can use the device to read/write any valid virtual address.
 */
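/*
 * Illustrative userspace flow (the exact struct layout and ioctl numbers live
 * in test_hmm_uapi.h; device nodes are typically created as /dev/hmm_dmirror<N>):
 *
 *	int fd = open("/dev/hmm_dmirror0", O_RDWR);
 *	struct hmm_dmirror_cmd cmd = {
 *		.addr = (__u64)buf,		// page-aligned address to mirror
 *		.ptr = (__u64)out,		// user buffer for the result
 *		.npages = npages,
 *	};
 *	ioctl(fd, HMM_DMIRROR_READ, &cmd);	// read buf through the mirror
 *
 * See the kernel's HMM selftests (hmm-tests) for the real users of this
 * interface.
 */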
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/cdev.h>
#include <linux/device.h>
#include <linux/memremap.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/delay.h>
#include <linux/pagemap.h>
#include <linux/hmm.h>
#include <linux/vmalloc.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/sched/mm.h>
#include <linux/platform_device.h>
#include <linux/rmap.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>

#include "test_hmm_uapi.h"

#define DMIRROR_NDEVICES		4
#define DMIRROR_RANGE_FAULT_TIMEOUT	1000
#define DEVMEM_CHUNK_SIZE		(256 * 1024 * 1024U)
#define DEVMEM_CHUNKS_RESERVE		16

/*
 * For device_private pages, dpage is just a dummy struct page
 * representing a piece of device memory. dmirror_devmem_alloc_page
 * allocates a real system memory page as backing storage to fake a
 * real device. zone_device_data points to that backing page. But
 * for device_coherent memory, the struct page represents real
 * physical CPU-accessible memory that we can use directly.
 */
#define BACKING_PAGE(page) (is_device_private_page((page)) ? \
			   (page)->zone_device_data : (page))

static unsigned long spm_addr_dev0;
module_param(spm_addr_dev0, long, 0644);
MODULE_PARM_DESC(spm_addr_dev0,
		"Specify start address for SPM (special purpose memory) used for device 0. By setting this Coherent device type will be used. Make sure spm_addr_dev1 is set too. Minimum SPM size should be DEVMEM_CHUNK_SIZE.");

static unsigned long spm_addr_dev1;
module_param(spm_addr_dev1, long, 0644);
MODULE_PARM_DESC(spm_addr_dev1,
		"Specify start address for SPM (special purpose memory) used for device 1. By setting this Coherent device type will be used. Make sure spm_addr_dev0 is set too. Minimum SPM size should be DEVMEM_CHUNK_SIZE.");

static const struct dev_pagemap_ops dmirror_devmem_ops;
static const struct mmu_interval_notifier_ops dmirror_min_ops;
static dev_t dmirror_dev;

struct dmirror_device;

struct dmirror_bounce {
	void			*ptr;
	unsigned long		size;
	unsigned long		addr;
	unsigned long		cpages;
};

#define DPT_XA_TAG_ATOMIC 1UL
#define DPT_XA_TAG_WRITE 3UL

/*
 * Data structure to track address ranges and register for mmu interval
 * notifier updates.
 */
struct dmirror_interval {
	struct mmu_interval_notifier	notifier;
	struct dmirror			*dmirror;
};

/*
 * Data attached to the open device file.
 * Note that it might be shared after a fork().
 */
struct dmirror {
	struct dmirror_device		*mdevice;
	struct xarray			pt;
	struct mmu_interval_notifier	notifier;
	struct mutex			mutex;
};

/*
 * ZONE_DEVICE pages for migration and simulating device memory.
 */
struct dmirror_chunk {
	struct dev_pagemap	pagemap;
	struct dmirror_device	*mdevice;
	bool			remove;
};

/*
 * Per device data.
 */
struct dmirror_device {
	struct cdev		cdevice;
	unsigned int		zone_device_type;
	struct device		device;

	unsigned int		devmem_capacity;
	unsigned int		devmem_count;
	struct dmirror_chunk	**devmem_chunks;
	struct mutex		devmem_lock;	/* protects the above */

	unsigned long		calloc;
	unsigned long		cfree;
	struct page		*free_pages;
	spinlock_t		lock;		/* protects the above */
};

static struct dmirror_device dmirror_devices[DMIRROR_NDEVICES];

static int dmirror_bounce_init(struct dmirror_bounce *bounce,
			       unsigned long addr,
			       unsigned long size)
{
	bounce->addr = addr;
	bounce->size = size;
	bounce->cpages = 0;
	bounce->ptr = vmalloc(size);
	if (!bounce->ptr)
		return -ENOMEM;
	return 0;
}

static bool dmirror_is_private_zone(struct dmirror_device *mdevice)
{
	return (mdevice->zone_device_type ==
		HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ? true : false;
}

static enum migrate_vma_direction
dmirror_select_device(struct dmirror *dmirror)
{
	return (dmirror->mdevice->zone_device_type ==
		HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ?
		MIGRATE_VMA_SELECT_DEVICE_PRIVATE :
		MIGRATE_VMA_SELECT_DEVICE_COHERENT;
}

static void dmirror_bounce_fini(struct dmirror_bounce *bounce)
{
	vfree(bounce->ptr);
}

static int dmirror_fops_open(struct inode *inode, struct file *filp)
{
	struct cdev *cdev = inode->i_cdev;
	struct dmirror *dmirror;
	int ret;

	/* Mirror this process address space */
	dmirror = kzalloc(sizeof(*dmirror), GFP_KERNEL);
	if (dmirror == NULL)
		return -ENOMEM;

	dmirror->mdevice = container_of(cdev, struct dmirror_device, cdevice);
	mutex_init(&dmirror->mutex);
	xa_init(&dmirror->pt);

	ret = mmu_interval_notifier_insert(&dmirror->notifier, current->mm,
				0, ULONG_MAX & PAGE_MASK, &dmirror_min_ops);
	if (ret) {
		kfree(dmirror);
		return ret;
	}

	filp->private_data = dmirror;
	return 0;
}

static int dmirror_fops_release(struct inode *inode, struct file *filp)
{
	struct dmirror *dmirror = filp->private_data;

	mmu_interval_notifier_remove(&dmirror->notifier);
	xa_destroy(&dmirror->pt);
	kfree(dmirror);
	return 0;
}

static struct dmirror_chunk *dmirror_page_to_chunk(struct page *page)
{
	return container_of(page->pgmap, struct dmirror_chunk, pagemap);
}

static struct dmirror_device *dmirror_page_to_device(struct page *page)
{
	return dmirror_page_to_chunk(page)->mdevice;
}
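/*
 * Populate the driver's XArray "page table" from the hmm_pfns filled in by
 * hmm_range_fault(), tagging writable entries with DPT_XA_TAG_WRITE.
 * Called with dmirror->mutex held.
 */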
static int dmirror_do_fault(struct dmirror *dmirror, struct hmm_range *range)
{
	unsigned long *pfns = range->hmm_pfns;
	unsigned long pfn;

	for (pfn = (range->start >> PAGE_SHIFT);
	     pfn < (range->end >> PAGE_SHIFT);
	     pfn++, pfns++) {
		struct page *page;
		void *entry;

		/*
		 * Since we asked for hmm_range_fault() to populate pages,
		 * it shouldn't return an error entry on success.
		 */
		WARN_ON(*pfns & HMM_PFN_ERROR);
		WARN_ON(!(*pfns & HMM_PFN_VALID));

		page = hmm_pfn_to_page(*pfns);
		WARN_ON(!page);

		entry = page;
		if (*pfns & HMM_PFN_WRITE)
			entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE);
		else if (WARN_ON(range->default_flags & HMM_PFN_WRITE))
			return -EFAULT;
		entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
		if (xa_is_err(entry))
			return xa_err(entry);
	}

	return 0;
}

static void dmirror_do_update(struct dmirror *dmirror, unsigned long start,
			      unsigned long end)
{
	unsigned long pfn;
	void *entry;

	/*
	 * The XArray doesn't hold references to pages since it relies on
	 * the mmu notifier to clear page pointers when they become stale.
	 * Therefore, it is OK to just clear the entry.
	 */
	xa_for_each_range(&dmirror->pt, pfn, entry, start >> PAGE_SHIFT,
			  end >> PAGE_SHIFT)
		xa_erase(&dmirror->pt, pfn);
}

static bool dmirror_interval_invalidate(struct mmu_interval_notifier *mni,
				const struct mmu_notifier_range *range,
				unsigned long cur_seq)
{
	struct dmirror *dmirror = container_of(mni, struct dmirror, notifier);

	/*
	 * Ignore invalidation callbacks for device private pages since
	 * the invalidation is handled as part of the migration process.
	 */
	if (range->event == MMU_NOTIFY_MIGRATE &&
	    range->owner == dmirror->mdevice)
		return true;

	if (mmu_notifier_range_blockable(range))
		mutex_lock(&dmirror->mutex);
	else if (!mutex_trylock(&dmirror->mutex))
		return false;

	mmu_interval_set_seq(mni, cur_seq);
	dmirror_do_update(dmirror, range->start, range->end);

	mutex_unlock(&dmirror->mutex);
	return true;
}

static const struct mmu_interval_notifier_ops dmirror_min_ops = {
	.invalidate = dmirror_interval_invalidate,
};
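/*
 * Fault one range of the mirrored address space with hmm_range_fault() and
 * retry until the result is not stale (mmu_interval_read_retry()) or
 * HMM_RANGE_DEFAULT_TIMEOUT expires.
 */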
static int dmirror_range_fault(struct dmirror *dmirror,
				struct hmm_range *range)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	unsigned long timeout =
		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
	int ret;

	while (true) {
		if (time_after(jiffies, timeout)) {
			ret = -EBUSY;
			goto out;
		}

		range->notifier_seq = mmu_interval_read_begin(range->notifier);
		mmap_read_lock(mm);
		ret = hmm_range_fault(range);
		mmap_read_unlock(mm);
		if (ret) {
			if (ret == -EBUSY)
				continue;
			goto out;
		}

		mutex_lock(&dmirror->mutex);
		if (mmu_interval_read_retry(range->notifier,
					    range->notifier_seq)) {
			mutex_unlock(&dmirror->mutex);
			continue;
		}
		break;
	}

	ret = dmirror_do_fault(dmirror, range);

	mutex_unlock(&dmirror->mutex);
out:
	return ret;
}

static int dmirror_fault(struct dmirror *dmirror, unsigned long start,
			 unsigned long end, bool write)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	unsigned long addr;
	unsigned long pfns[64];
	struct hmm_range range = {
		.notifier = &dmirror->notifier,
		.hmm_pfns = pfns,
		.pfn_flags_mask = 0,
		.default_flags =
			HMM_PFN_REQ_FAULT | (write ? HMM_PFN_REQ_WRITE : 0),
		.dev_private_owner = dmirror->mdevice,
	};
	int ret = 0;

	/* Since the mm is for the mirrored process, get a reference first. */
	if (!mmget_not_zero(mm))
		return 0;

	for (addr = start; addr < end; addr = range.end) {
		range.start = addr;
		range.end = min(addr + (ARRAY_SIZE(pfns) << PAGE_SHIFT), end);

		ret = dmirror_range_fault(dmirror, &range);
		if (ret)
			break;
	}

	mmput(mm);
	return ret;
}

static int dmirror_do_read(struct dmirror *dmirror, unsigned long start,
			   unsigned long end, struct dmirror_bounce *bounce)
{
	unsigned long pfn;
	void *ptr;

	ptr = bounce->ptr + ((start - bounce->addr) & PAGE_MASK);

	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) {
		void *entry;
		struct page *page;

		entry = xa_load(&dmirror->pt, pfn);
		page = xa_untag_pointer(entry);
		if (!page)
			return -ENOENT;

		memcpy_from_page(ptr, page, 0, PAGE_SIZE);
		ptr += PAGE_SIZE;
		bounce->cpages++;
	}

	return 0;
}
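/*
 * HMM_DMIRROR_READ: copy cmd->npages starting at cmd->addr from the mirrored
 * pages into a bounce buffer and out to userspace, faulting missing pages
 * into the mirror on -ENOENT and counting each retry in cmd->faults.
 */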
static int dmirror_read(struct dmirror *dmirror, struct hmm_dmirror_cmd *cmd)
{
	struct dmirror_bounce bounce;
	unsigned long start, end;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	int ret;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	ret = dmirror_bounce_init(&bounce, start, size);
	if (ret)
		return ret;

	while (1) {
		mutex_lock(&dmirror->mutex);
		ret = dmirror_do_read(dmirror, start, end, &bounce);
		mutex_unlock(&dmirror->mutex);
		if (ret != -ENOENT)
			break;

		start = cmd->addr + (bounce.cpages << PAGE_SHIFT);
		ret = dmirror_fault(dmirror, start, end, false);
		if (ret)
			break;
		cmd->faults++;
	}

	if (ret == 0) {
		if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr,
				 bounce.size))
			ret = -EFAULT;
	}
	cmd->cpages = bounce.cpages;
	dmirror_bounce_fini(&bounce);
	return ret;
}

static int dmirror_do_write(struct dmirror *dmirror, unsigned long start,
			    unsigned long end, struct dmirror_bounce *bounce)
{
	unsigned long pfn;
	void *ptr;

	ptr = bounce->ptr + ((start - bounce->addr) & PAGE_MASK);

	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) {
		void *entry;
		struct page *page;

		entry = xa_load(&dmirror->pt, pfn);
		page = xa_untag_pointer(entry);
		if (!page || xa_pointer_tag(entry) != DPT_XA_TAG_WRITE)
			return -ENOENT;

		memcpy_to_page(page, 0, ptr, PAGE_SIZE);
		ptr += PAGE_SIZE;
		bounce->cpages++;
	}

	return 0;
}

static int dmirror_write(struct dmirror *dmirror, struct hmm_dmirror_cmd *cmd)
{
	struct dmirror_bounce bounce;
	unsigned long start, end;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	int ret;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	ret = dmirror_bounce_init(&bounce, start, size);
	if (ret)
		return ret;
	if (copy_from_user(bounce.ptr, u64_to_user_ptr(cmd->ptr),
			   bounce.size)) {
		ret = -EFAULT;
		goto fini;
	}

	while (1) {
		mutex_lock(&dmirror->mutex);
		ret = dmirror_do_write(dmirror, start, end, &bounce);
		mutex_unlock(&dmirror->mutex);
		if (ret != -ENOENT)
			break;

		start = cmd->addr + (bounce.cpages << PAGE_SHIFT);
		ret = dmirror_fault(dmirror, start, end, true);
		if (ret)
			break;
		cmd->faults++;
	}

fini:
	cmd->cpages = bounce.cpages;
	dmirror_bounce_fini(&bounce);
	return ret;
}
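/*
 * Allocate and memremap_pages() a DEVMEM_CHUNK_SIZE chunk of ZONE_DEVICE
 * memory (device private from a free iomem region, or device coherent carved
 * out of the spm_addr_dev* region) and add its pages to the device's free
 * list. If @ppage is non-NULL, one page is handed back to the caller.
 */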
static int dmirror_allocate_chunk(struct dmirror_device *mdevice,
				   struct page **ppage)
{
	struct dmirror_chunk *devmem;
	struct resource *res = NULL;
	unsigned long pfn;
	unsigned long pfn_first;
	unsigned long pfn_last;
	void *ptr;
	int ret = -ENOMEM;

	devmem = kzalloc(sizeof(*devmem), GFP_KERNEL);
	if (!devmem)
		return ret;

	switch (mdevice->zone_device_type) {
	case HMM_DMIRROR_MEMORY_DEVICE_PRIVATE:
		res = request_free_mem_region(&iomem_resource, DEVMEM_CHUNK_SIZE,
					      "hmm_dmirror");
		if (IS_ERR_OR_NULL(res))
			goto err_devmem;
		devmem->pagemap.range.start = res->start;
		devmem->pagemap.range.end = res->end;
		devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
		break;
	case HMM_DMIRROR_MEMORY_DEVICE_COHERENT:
		devmem->pagemap.range.start = (MINOR(mdevice->cdevice.dev) - 2) ?
							spm_addr_dev0 :
							spm_addr_dev1;
		devmem->pagemap.range.end = devmem->pagemap.range.start +
					    DEVMEM_CHUNK_SIZE - 1;
		devmem->pagemap.type = MEMORY_DEVICE_COHERENT;
		break;
	default:
		ret = -EINVAL;
		goto err_devmem;
	}

	devmem->pagemap.nr_range = 1;
	devmem->pagemap.ops = &dmirror_devmem_ops;
	devmem->pagemap.owner = mdevice;

	mutex_lock(&mdevice->devmem_lock);

	if (mdevice->devmem_count == mdevice->devmem_capacity) {
		struct dmirror_chunk **new_chunks;
		unsigned int new_capacity;

		new_capacity = mdevice->devmem_capacity +
				DEVMEM_CHUNKS_RESERVE;
		new_chunks = krealloc(mdevice->devmem_chunks,
				sizeof(new_chunks[0]) * new_capacity,
				GFP_KERNEL);
		if (!new_chunks)
			goto err_release;
		mdevice->devmem_capacity = new_capacity;
		mdevice->devmem_chunks = new_chunks;
	}
	ptr = memremap_pages(&devmem->pagemap, numa_node_id());
	if (IS_ERR_OR_NULL(ptr)) {
		if (ptr)
			ret = PTR_ERR(ptr);
		else
			ret = -EFAULT;
		goto err_release;
	}

	devmem->mdevice = mdevice;
	pfn_first = devmem->pagemap.range.start >> PAGE_SHIFT;
	pfn_last = pfn_first + (range_len(&devmem->pagemap.range) >> PAGE_SHIFT);
	mdevice->devmem_chunks[mdevice->devmem_count++] = devmem;
	mutex_unlock(&mdevice->devmem_lock);

	pr_info("added new %u MB chunk (total %u chunks, %u MB) PFNs [0x%lx 0x%lx)\n",
		DEVMEM_CHUNK_SIZE / (1024 * 1024),
		mdevice->devmem_count,
		mdevice->devmem_count * (DEVMEM_CHUNK_SIZE / (1024 * 1024)),
		pfn_first, pfn_last);

	spin_lock(&mdevice->lock);
	for (pfn = pfn_first; pfn < pfn_last; pfn++) {
		struct page *page = pfn_to_page(pfn);

		page->zone_device_data = mdevice->free_pages;
		mdevice->free_pages = page;
	}
	if (ppage) {
		*ppage = mdevice->free_pages;
		mdevice->free_pages = (*ppage)->zone_device_data;
		mdevice->calloc++;
	}
	spin_unlock(&mdevice->lock);

	return 0;

err_release:
	mutex_unlock(&mdevice->devmem_lock);
	if (res && devmem->pagemap.type == MEMORY_DEVICE_PRIVATE)
		release_mem_region(devmem->pagemap.range.start,
				   range_len(&devmem->pagemap.range));
err_devmem:
	kfree(devmem);

	return ret;
}

static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
{
	struct page *dpage = NULL;
	struct page *rpage = NULL;

	/*
	 * For ZONE_DEVICE private type, this is a fake device so we allocate
	 * real system memory to store our device memory.
	 * For ZONE_DEVICE coherent type we use the actual dpage to store the
	 * data and ignore rpage.
	 */
	if (dmirror_is_private_zone(mdevice)) {
		rpage = alloc_page(GFP_HIGHUSER);
		if (!rpage)
			return NULL;
	}
	spin_lock(&mdevice->lock);
	if (mdevice->free_pages) {
		dpage = mdevice->free_pages;
		mdevice->free_pages = dpage->zone_device_data;
		mdevice->calloc++;
		spin_unlock(&mdevice->lock);
	} else {
		spin_unlock(&mdevice->lock);
		if (dmirror_allocate_chunk(mdevice, &dpage))
			goto error;
	}

	zone_device_page_init(dpage);
	dpage->zone_device_data = rpage;
	return dpage;

error:
	if (rpage)
		__free_page(rpage);
	return NULL;
}
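/*
 * Allocate a device page for each source page selected by migrate_vma_setup()
 * and copy (or zero) the source data into its backing page. Runs between
 * migrate_vma_setup() and migrate_vma_pages() for the sys->dev direction.
 */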
static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args,
					   struct dmirror *dmirror)
{
	struct dmirror_device *mdevice = dmirror->mdevice;
	const unsigned long *src = args->src;
	unsigned long *dst = args->dst;
	unsigned long addr;

	for (addr = args->start; addr < args->end; addr += PAGE_SIZE,
						   src++, dst++) {
		struct page *spage;
		struct page *dpage;
		struct page *rpage;

		if (!(*src & MIGRATE_PFN_MIGRATE))
			continue;

		/*
		 * Note that spage might be NULL which is OK since it is an
		 * unallocated pte_none() or read-only zero page.
		 */
		spage = migrate_pfn_to_page(*src);
		if (WARN(spage && is_zone_device_page(spage),
		     "page already in device spage pfn: 0x%lx\n",
		     page_to_pfn(spage)))
			continue;

		dpage = dmirror_devmem_alloc_page(mdevice);
		if (!dpage)
			continue;

		rpage = BACKING_PAGE(dpage);
		if (spage)
			copy_highpage(rpage, spage);
		else
			clear_highpage(rpage);

		/*
		 * Normally, a device would use the page->zone_device_data to
		 * point to the mirror but here we use it to hold the page for
		 * the simulated device memory and that page holds the pointer
		 * to the mirror.
		 */
		rpage->zone_device_data = dmirror;

		pr_debug("migrating from sys to dev pfn src: 0x%lx pfn dst: 0x%lx\n",
			 page_to_pfn(spage), page_to_pfn(dpage));
		*dst = migrate_pfn(page_to_pfn(dpage));
		if ((*src & MIGRATE_PFN_WRITE) ||
		    (!spage && args->vma->vm_flags & VM_WRITE))
			*dst |= MIGRATE_PFN_WRITE;
	}
}

static int dmirror_check_atomic(struct dmirror *dmirror, unsigned long start,
				unsigned long end)
{
	unsigned long pfn;

	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) {
		void *entry;

		entry = xa_load(&dmirror->pt, pfn);
		if (xa_pointer_tag(entry) == DPT_XA_TAG_ATOMIC)
			return -EPERM;
	}

	return 0;
}

static int dmirror_atomic_map(unsigned long start, unsigned long end,
			      struct page **pages, struct dmirror *dmirror)
{
	unsigned long pfn, mapped = 0;
	int i;

	/* Map the migrated pages into the device's page tables. */
	mutex_lock(&dmirror->mutex);

	for (i = 0, pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++, i++) {
		void *entry;

		if (!pages[i])
			continue;

		entry = pages[i];
		entry = xa_tag_pointer(entry, DPT_XA_TAG_ATOMIC);
		entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
		if (xa_is_err(entry)) {
			mutex_unlock(&dmirror->mutex);
			return xa_err(entry);
		}

		mapped++;
	}

	mutex_unlock(&dmirror->mutex);
	return mapped;
}

static int dmirror_migrate_finalize_and_map(struct migrate_vma *args,
					    struct dmirror *dmirror)
{
	unsigned long start = args->start;
	unsigned long end = args->end;
	const unsigned long *src = args->src;
	const unsigned long *dst = args->dst;
	unsigned long pfn;

	/* Map the migrated pages into the device's page tables. */
	mutex_lock(&dmirror->mutex);

	for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++,
								src++, dst++) {
		struct page *dpage;
		void *entry;

		if (!(*src & MIGRATE_PFN_MIGRATE))
			continue;

		dpage = migrate_pfn_to_page(*dst);
		if (!dpage)
			continue;

		entry = BACKING_PAGE(dpage);
		if (*dst & MIGRATE_PFN_WRITE)
			entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE);
		entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC);
		if (xa_is_err(entry)) {
			mutex_unlock(&dmirror->mutex);
			return xa_err(entry);
		}
	}

	mutex_unlock(&dmirror->mutex);
	return 0;
}
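/*
 * HMM_DMIRROR_EXCLUSIVE: mark the requested range for exclusive (atomic)
 * device access with make_device_exclusive_range(), record the pages in the
 * mirror tagged DPT_XA_TAG_ATOMIC, and copy the data back to userspace for
 * verification.
 */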
static int dmirror_exclusive(struct dmirror *dmirror,
			     struct hmm_dmirror_cmd *cmd)
{
	unsigned long start, end, addr;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	struct mm_struct *mm = dmirror->notifier.mm;
	struct page *pages[64];
	struct dmirror_bounce bounce;
	unsigned long next;
	int ret;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	/* Since the mm is for the mirrored process, get a reference first. */
	if (!mmget_not_zero(mm))
		return -EINVAL;

	mmap_read_lock(mm);
	for (addr = start; addr < end; addr = next) {
		unsigned long mapped = 0;
		int i;

		next = min(end, addr + (ARRAY_SIZE(pages) << PAGE_SHIFT));

		ret = make_device_exclusive_range(mm, addr, next, pages, NULL);
		/*
		 * Do dmirror_atomic_map() iff all pages are marked for
		 * exclusive access to avoid accessing uninitialized
		 * fields of pages.
		 */
		if (ret == (next - addr) >> PAGE_SHIFT)
			mapped = dmirror_atomic_map(addr, next, pages, dmirror);
		for (i = 0; i < ret; i++) {
			if (pages[i]) {
				unlock_page(pages[i]);
				put_page(pages[i]);
			}
		}

		if (addr + (mapped << PAGE_SHIFT) < next) {
			mmap_read_unlock(mm);
			mmput(mm);
			return -EBUSY;
		}
	}
	mmap_read_unlock(mm);
	mmput(mm);

	/* Return the migrated data for verification. */
	ret = dmirror_bounce_init(&bounce, start, size);
	if (ret)
		return ret;
	mutex_lock(&dmirror->mutex);
	ret = dmirror_do_read(dmirror, start, end, &bounce);
	mutex_unlock(&dmirror->mutex);
	if (ret == 0) {
		if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr,
				 bounce.size))
			ret = -EFAULT;
	}

	cmd->cpages = bounce.cpages;
	dmirror_bounce_fini(&bounce);
	return ret;
}
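/*
 * Allocate a system page for each device page selected for migration and copy
 * the device data back. Used both for explicit dev->sys migration and for the
 * migrate_to_ram (CPU fault) path.
 */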
static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args,
						      struct dmirror *dmirror)
{
	const unsigned long *src = args->src;
	unsigned long *dst = args->dst;
	unsigned long start = args->start;
	unsigned long end = args->end;
	unsigned long addr;

	for (addr = start; addr < end; addr += PAGE_SIZE,
				       src++, dst++) {
		struct page *dpage, *spage;

		spage = migrate_pfn_to_page(*src);
		if (!spage || !(*src & MIGRATE_PFN_MIGRATE))
			continue;

		if (WARN_ON(!is_device_private_page(spage) &&
			    !is_device_coherent_page(spage)))
			continue;
		spage = BACKING_PAGE(spage);
		dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr);
		if (!dpage)
			continue;
		pr_debug("migrating from dev to sys pfn src: 0x%lx pfn dst: 0x%lx\n",
			 page_to_pfn(spage), page_to_pfn(dpage));

		lock_page(dpage);
		xa_erase(&dmirror->pt, addr >> PAGE_SHIFT);
		copy_highpage(dpage, spage);
		*dst = migrate_pfn(page_to_pfn(dpage));
		if (*src & MIGRATE_PFN_WRITE)
			*dst |= MIGRATE_PFN_WRITE;
	}
	return 0;
}

static unsigned long
dmirror_successful_migrated_pages(struct migrate_vma *migrate)
{
	unsigned long cpages = 0;
	unsigned long i;

	for (i = 0; i < migrate->npages; i++) {
		if (migrate->src[i] & MIGRATE_PFN_VALID &&
		    migrate->src[i] & MIGRATE_PFN_MIGRATE)
			cpages++;
	}
	return cpages;
}
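/*
 * HMM_DMIRROR_MIGRATE_TO_SYS: migrate this device's pages in the requested
 * range back to system memory, 64 pages at a time, clipped to each VMA.
 */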
static int dmirror_migrate_to_system(struct dmirror *dmirror,
				     struct hmm_dmirror_cmd *cmd)
{
	unsigned long start, end, addr;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	struct mm_struct *mm = dmirror->notifier.mm;
	struct vm_area_struct *vma;
	unsigned long src_pfns[64] = { 0 };
	unsigned long dst_pfns[64] = { 0 };
	struct migrate_vma args = { 0 };
	unsigned long next;
	int ret;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	/* Since the mm is for the mirrored process, get a reference first. */
	if (!mmget_not_zero(mm))
		return -EINVAL;

	cmd->cpages = 0;
	mmap_read_lock(mm);
	for (addr = start; addr < end; addr = next) {
		vma = vma_lookup(mm, addr);
		if (!vma || !(vma->vm_flags & VM_READ)) {
			ret = -EINVAL;
			goto out;
		}
		next = min(end, addr + (ARRAY_SIZE(src_pfns) << PAGE_SHIFT));
		if (next > vma->vm_end)
			next = vma->vm_end;

		args.vma = vma;
		args.src = src_pfns;
		args.dst = dst_pfns;
		args.start = addr;
		args.end = next;
		args.pgmap_owner = dmirror->mdevice;
		args.flags = dmirror_select_device(dmirror);

		ret = migrate_vma_setup(&args);
		if (ret)
			goto out;

		pr_debug("Migrating from device mem to sys mem\n");
		dmirror_devmem_fault_alloc_and_copy(&args, dmirror);

		migrate_vma_pages(&args);
		cmd->cpages += dmirror_successful_migrated_pages(&args);
		migrate_vma_finalize(&args);
	}
out:
	mmap_read_unlock(mm);
	mmput(mm);

	return ret;
}
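/*
 * HMM_DMIRROR_MIGRATE_TO_DEV: migrate system pages in the requested range
 * into the simulated device memory, update the mirror, and return the
 * migrated data through the bounce buffer so the test can verify it.
 */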
static int dmirror_migrate_to_device(struct dmirror *dmirror,
				struct hmm_dmirror_cmd *cmd)
{
	unsigned long start, end, addr;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	struct mm_struct *mm = dmirror->notifier.mm;
	struct vm_area_struct *vma;
	unsigned long src_pfns[64] = { 0 };
	unsigned long dst_pfns[64] = { 0 };
	struct dmirror_bounce bounce;
	struct migrate_vma args = { 0 };
	unsigned long next;
	int ret;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	/* Since the mm is for the mirrored process, get a reference first. */
	if (!mmget_not_zero(mm))
		return -EINVAL;

	mmap_read_lock(mm);
	for (addr = start; addr < end; addr = next) {
		vma = vma_lookup(mm, addr);
		if (!vma || !(vma->vm_flags & VM_READ)) {
			ret = -EINVAL;
			goto out;
		}
		next = min(end, addr + (ARRAY_SIZE(src_pfns) << PAGE_SHIFT));
		if (next > vma->vm_end)
			next = vma->vm_end;

		args.vma = vma;
		args.src = src_pfns;
		args.dst = dst_pfns;
		args.start = addr;
		args.end = next;
		args.pgmap_owner = dmirror->mdevice;
		args.flags = MIGRATE_VMA_SELECT_SYSTEM;
		ret = migrate_vma_setup(&args);
		if (ret)
			goto out;

		pr_debug("Migrating from sys mem to device mem\n");
		dmirror_migrate_alloc_and_copy(&args, dmirror);
		migrate_vma_pages(&args);
		dmirror_migrate_finalize_and_map(&args, dmirror);
		migrate_vma_finalize(&args);
	}
	mmap_read_unlock(mm);
	mmput(mm);

	/*
	 * Return the migrated data for verification.
	 * Only for pages in device zone
	 */
	ret = dmirror_bounce_init(&bounce, start, size);
	if (ret)
		return ret;
	mutex_lock(&dmirror->mutex);
	ret = dmirror_do_read(dmirror, start, end, &bounce);
	mutex_unlock(&dmirror->mutex);
	if (ret == 0) {
		if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr,
				 bounce.size))
			ret = -EFAULT;
	}
	cmd->cpages = bounce.cpages;
	dmirror_bounce_fini(&bounce);
	return ret;

out:
	mmap_read_unlock(mm);
	mmput(mm);
	return ret;
}
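/*
 * Encode one hmm_pfn entry as a HMM_DMIRROR_PROT_* permission byte for the
 * snapshot ioctl (page type, read/write, and PMD/PUD mapping size).
 */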
static void dmirror_mkentry(struct dmirror *dmirror, struct hmm_range *range,
			    unsigned char *perm, unsigned long entry)
{
	struct page *page;

	if (entry & HMM_PFN_ERROR) {
		*perm = HMM_DMIRROR_PROT_ERROR;
		return;
	}
	if (!(entry & HMM_PFN_VALID)) {
		*perm = HMM_DMIRROR_PROT_NONE;
		return;
	}

	page = hmm_pfn_to_page(entry);
	if (is_device_private_page(page)) {
		/* Is the page migrated to this device or some other? */
		if (dmirror->mdevice == dmirror_page_to_device(page))
			*perm = HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL;
		else
			*perm = HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE;
	} else if (is_device_coherent_page(page)) {
		/* Is the page migrated to this device or some other? */
		if (dmirror->mdevice == dmirror_page_to_device(page))
			*perm = HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL;
		else
			*perm = HMM_DMIRROR_PROT_DEV_COHERENT_REMOTE;
	} else if (is_zero_pfn(page_to_pfn(page)))
		*perm = HMM_DMIRROR_PROT_ZERO;
	else
		*perm = HMM_DMIRROR_PROT_NONE;
	if (entry & HMM_PFN_WRITE)
		*perm |= HMM_DMIRROR_PROT_WRITE;
	else
		*perm |= HMM_DMIRROR_PROT_READ;
	if (hmm_pfn_to_map_order(entry) + PAGE_SHIFT == PMD_SHIFT)
		*perm |= HMM_DMIRROR_PROT_PMD;
	else if (hmm_pfn_to_map_order(entry) + PAGE_SHIFT == PUD_SHIFT)
		*perm |= HMM_DMIRROR_PROT_PUD;
}

static bool dmirror_snapshot_invalidate(struct mmu_interval_notifier *mni,
				const struct mmu_notifier_range *range,
				unsigned long cur_seq)
{
	struct dmirror_interval *dmi =
		container_of(mni, struct dmirror_interval, notifier);
	struct dmirror *dmirror = dmi->dmirror;

	if (mmu_notifier_range_blockable(range))
		mutex_lock(&dmirror->mutex);
	else if (!mutex_trylock(&dmirror->mutex))
		return false;

	/*
	 * Snapshots only need to set the sequence number since any
	 * invalidation in the interval invalidates the whole snapshot.
	 */
	mmu_interval_set_seq(mni, cur_seq);

	mutex_unlock(&dmirror->mutex);
	return true;
}

static const struct mmu_interval_notifier_ops dmirror_mrn_ops = {
	.invalidate = dmirror_snapshot_invalidate,
};

static int dmirror_range_snapshot(struct dmirror *dmirror,
				  struct hmm_range *range,
				  unsigned char *perm)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	struct dmirror_interval notifier;
	unsigned long timeout =
		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
	unsigned long i;
	unsigned long n;
	int ret = 0;

	notifier.dmirror = dmirror;
	range->notifier = &notifier.notifier;

	ret = mmu_interval_notifier_insert(range->notifier, mm,
			range->start, range->end - range->start,
			&dmirror_mrn_ops);
	if (ret)
		return ret;

	while (true) {
		if (time_after(jiffies, timeout)) {
			ret = -EBUSY;
			goto out;
		}

		range->notifier_seq = mmu_interval_read_begin(range->notifier);

		mmap_read_lock(mm);
		ret = hmm_range_fault(range);
		mmap_read_unlock(mm);
		if (ret) {
			if (ret == -EBUSY)
				continue;
			goto out;
		}

		mutex_lock(&dmirror->mutex);
		if (mmu_interval_read_retry(range->notifier,
					    range->notifier_seq)) {
			mutex_unlock(&dmirror->mutex);
			continue;
		}
		break;
	}

	n = (range->end - range->start) >> PAGE_SHIFT;
	for (i = 0; i < n; i++)
		dmirror_mkentry(dmirror, range, perm + i, range->hmm_pfns[i]);

	mutex_unlock(&dmirror->mutex);
out:
	mmu_interval_notifier_remove(range->notifier);
	return ret;
}

static int dmirror_snapshot(struct dmirror *dmirror,
			    struct hmm_dmirror_cmd *cmd)
{
	struct mm_struct *mm = dmirror->notifier.mm;
	unsigned long start, end;
	unsigned long size = cmd->npages << PAGE_SHIFT;
	unsigned long addr;
	unsigned long next;
	unsigned long pfns[64];
	unsigned char perm[64];
	char __user *uptr;
	struct hmm_range range = {
		.hmm_pfns = pfns,
		.dev_private_owner = dmirror->mdevice,
	};
	int ret = 0;

	start = cmd->addr;
	end = start + size;
	if (end < start)
		return -EINVAL;

	/* Since the mm is for the mirrored process, get a reference first. */
	if (!mmget_not_zero(mm))
		return -EINVAL;

	/*
	 * Register a temporary notifier to detect invalidations even if it
	 * overlaps with other mmu_interval_notifiers.
	 */
	uptr = u64_to_user_ptr(cmd->ptr);
	for (addr = start; addr < end; addr = next) {
		unsigned long n;

		next = min(addr + (ARRAY_SIZE(pfns) << PAGE_SHIFT), end);
		range.start = addr;
		range.end = next;

		ret = dmirror_range_snapshot(dmirror, &range, perm);
		if (ret)
			break;

		n = (range.end - range.start) >> PAGE_SHIFT;
		if (copy_to_user(uptr, perm, n)) {
			ret = -EFAULT;
			break;
		}

		cmd->cpages += n;
		uptr += n;
	}
	mmput(mm);

	return ret;
}
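/*
 * Evict all data from a chunk before it is unmapped: migrate every allocated
 * device page in the chunk's PFN range back to freshly allocated system pages.
 */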
static void dmirror_device_evict_chunk(struct dmirror_chunk *chunk)
{
	unsigned long start_pfn = chunk->pagemap.range.start >> PAGE_SHIFT;
	unsigned long end_pfn = chunk->pagemap.range.end >> PAGE_SHIFT;
	unsigned long npages = end_pfn - start_pfn + 1;
	unsigned long i;
	unsigned long *src_pfns;
	unsigned long *dst_pfns;

	src_pfns = kvcalloc(npages, sizeof(*src_pfns), GFP_KERNEL | __GFP_NOFAIL);
	dst_pfns = kvcalloc(npages, sizeof(*dst_pfns), GFP_KERNEL | __GFP_NOFAIL);

	migrate_device_range(src_pfns, start_pfn, npages);
	for (i = 0; i < npages; i++) {
		struct page *dpage, *spage;

		spage = migrate_pfn_to_page(src_pfns[i]);
		if (!spage || !(src_pfns[i] & MIGRATE_PFN_MIGRATE))
			continue;

		if (WARN_ON(!is_device_private_page(spage) &&
			    !is_device_coherent_page(spage)))
			continue;
		spage = BACKING_PAGE(spage);
		dpage = alloc_page(GFP_HIGHUSER_MOVABLE | __GFP_NOFAIL);
		lock_page(dpage);
		copy_highpage(dpage, spage);
		dst_pfns[i] = migrate_pfn(page_to_pfn(dpage));
		if (src_pfns[i] & MIGRATE_PFN_WRITE)
			dst_pfns[i] |= MIGRATE_PFN_WRITE;
	}
	migrate_device_pages(src_pfns, dst_pfns, npages);
	migrate_device_finalize(src_pfns, dst_pfns, npages);
	kvfree(src_pfns);
	kvfree(dst_pfns);
}

/* Removes free pages from the free list so they can't be re-allocated */
static void dmirror_remove_free_pages(struct dmirror_chunk *devmem)
{
	struct dmirror_device *mdevice = devmem->mdevice;
	struct page *page;

	for (page = mdevice->free_pages; page; page = page->zone_device_data)
		if (dmirror_page_to_chunk(page) == devmem)
			mdevice->free_pages = page->zone_device_data;
}

static void dmirror_device_remove_chunks(struct dmirror_device *mdevice)
{
	unsigned int i;

	mutex_lock(&mdevice->devmem_lock);
	if (mdevice->devmem_chunks) {
		for (i = 0; i < mdevice->devmem_count; i++) {
			struct dmirror_chunk *devmem =
				mdevice->devmem_chunks[i];

			spin_lock(&mdevice->lock);
			devmem->remove = true;
			dmirror_remove_free_pages(devmem);
			spin_unlock(&mdevice->lock);

			dmirror_device_evict_chunk(devmem);
			memunmap_pages(&devmem->pagemap);
			if (devmem->pagemap.type == MEMORY_DEVICE_PRIVATE)
				release_mem_region(devmem->pagemap.range.start,
						   range_len(&devmem->pagemap.range));
			kfree(devmem);
		}
		mdevice->devmem_count = 0;
		mdevice->devmem_capacity = 0;
		mdevice->free_pages = NULL;
		kfree(mdevice->devmem_chunks);
		mdevice->devmem_chunks = NULL;
	}
	mutex_unlock(&mdevice->devmem_lock);
}
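/*
 * ioctl entry point: validate the user's hmm_dmirror_cmd (page-aligned,
 * non-wrapping range), dispatch to the HMM_DMIRROR_* handler, and copy the
 * updated command (cpages/faults) back to userspace.
 */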
static long dmirror_fops_unlocked_ioctl(struct file *filp,
					unsigned int command,
					unsigned long arg)
{
	void __user *uarg = (void __user *)arg;
	struct hmm_dmirror_cmd cmd;
	struct dmirror *dmirror;
	int ret;

	dmirror = filp->private_data;
	if (!dmirror)
		return -EINVAL;

	if (copy_from_user(&cmd, uarg, sizeof(cmd)))
		return -EFAULT;

	if (cmd.addr & ~PAGE_MASK)
		return -EINVAL;
	if (cmd.addr >= (cmd.addr + (cmd.npages << PAGE_SHIFT)))
		return -EINVAL;

	cmd.cpages = 0;
	cmd.faults = 0;

	switch (command) {
	case HMM_DMIRROR_READ:
		ret = dmirror_read(dmirror, &cmd);
		break;

	case HMM_DMIRROR_WRITE:
		ret = dmirror_write(dmirror, &cmd);
		break;

	case HMM_DMIRROR_MIGRATE_TO_DEV:
		ret = dmirror_migrate_to_device(dmirror, &cmd);
		break;

	case HMM_DMIRROR_MIGRATE_TO_SYS:
		ret = dmirror_migrate_to_system(dmirror, &cmd);
		break;

	case HMM_DMIRROR_EXCLUSIVE:
		ret = dmirror_exclusive(dmirror, &cmd);
		break;

	case HMM_DMIRROR_CHECK_EXCLUSIVE:
		ret = dmirror_check_atomic(dmirror, cmd.addr,
					cmd.addr + (cmd.npages << PAGE_SHIFT));
		break;

	case HMM_DMIRROR_SNAPSHOT:
		ret = dmirror_snapshot(dmirror, &cmd);
		break;

	case HMM_DMIRROR_RELEASE:
		dmirror_device_remove_chunks(dmirror->mdevice);
		ret = 0;
		break;

	default:
		return -EINVAL;
	}
	if (ret)
		return ret;

	if (copy_to_user(uarg, &cmd, sizeof(cmd)))
		return -EFAULT;

	return 0;
}

static int dmirror_fops_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long addr;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
		struct page *page;
		int ret;

		page = alloc_page(GFP_KERNEL | __GFP_ZERO);
		if (!page)
			return -ENOMEM;

		ret = vm_insert_page(vma, addr, page);
		if (ret) {
			__free_page(page);
			return ret;
		}
		put_page(page);
	}

	return 0;
}

static const struct file_operations dmirror_fops = {
	.open		= dmirror_fops_open,
	.release	= dmirror_fops_release,
	.mmap		= dmirror_fops_mmap,
	.unlocked_ioctl = dmirror_fops_unlocked_ioctl,
	.llseek		= default_llseek,
	.owner		= THIS_MODULE,
};

static void dmirror_devmem_free(struct page *page)
{
	struct page *rpage = BACKING_PAGE(page);
	struct dmirror_device *mdevice;

	if (rpage != page)
		__free_page(rpage);

	mdevice = dmirror_page_to_device(page);
	spin_lock(&mdevice->lock);

	/* Return page to our allocator if not freeing the chunk */
	if (!dmirror_page_to_chunk(page)->remove) {
		mdevice->cfree++;
		page->zone_device_data = mdevice->free_pages;
		mdevice->free_pages = page;
	}
	spin_unlock(&mdevice->lock);
}
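/*
 * migrate_to_ram callback: a CPU fault on a device page migrates that single
 * page back to system memory via migrate_vma_setup()/migrate_vma_pages().
 */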
static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
{
	struct migrate_vma args = { 0 };
	unsigned long src_pfns = 0;
	unsigned long dst_pfns = 0;
	struct page *rpage;
	struct dmirror *dmirror;
	vm_fault_t ret;

	/*
	 * Normally, a device would use the page->zone_device_data to point to
	 * the mirror but here we use it to hold the page for the simulated
	 * device memory and that page holds the pointer to the mirror.
	 */
	rpage = vmf->page->zone_device_data;
	dmirror = rpage->zone_device_data;

	/* FIXME demonstrate how we can adjust migrate range */
	args.vma = vmf->vma;
	args.start = vmf->address;
	args.end = args.start + PAGE_SIZE;
	args.src = &src_pfns;
	args.dst = &dst_pfns;
	args.pgmap_owner = dmirror->mdevice;
	args.flags = dmirror_select_device(dmirror);
	args.fault_page = vmf->page;

	if (migrate_vma_setup(&args))
		return VM_FAULT_SIGBUS;

	ret = dmirror_devmem_fault_alloc_and_copy(&args, dmirror);
	if (ret)
		return ret;
	migrate_vma_pages(&args);
	/*
	 * No device finalize step is needed since
	 * dmirror_devmem_fault_alloc_and_copy() will have already
	 * invalidated the device page table.
	 */
	migrate_vma_finalize(&args);
	return 0;
}

static const struct dev_pagemap_ops dmirror_devmem_ops = {
	.page_free	= dmirror_devmem_free,
	.migrate_to_ram	= dmirror_devmem_fault,
};

static int dmirror_device_init(struct dmirror_device *mdevice, int id)
{
	dev_t dev;
	int ret;

	dev = MKDEV(MAJOR(dmirror_dev), id);
	mutex_init(&mdevice->devmem_lock);
	spin_lock_init(&mdevice->lock);

	cdev_init(&mdevice->cdevice, &dmirror_fops);
	mdevice->cdevice.owner = THIS_MODULE;
	device_initialize(&mdevice->device);
	mdevice->device.devt = dev;

	ret = dev_set_name(&mdevice->device, "hmm_dmirror%u", id);
	if (ret)
		return ret;

	ret = cdev_device_add(&mdevice->cdevice, &mdevice->device);
	if (ret)
		return ret;

	/* Build a list of free ZONE_DEVICE struct pages */
	return dmirror_allocate_chunk(mdevice, NULL);
}

static void dmirror_device_remove(struct dmirror_device *mdevice)
{
	dmirror_device_remove_chunks(mdevice);
	cdev_device_del(&mdevice->cdevice, &mdevice->device);
}

static int __init hmm_dmirror_init(void)
{
	int ret;
	int id = 0;
	int ndevices = 0;

	ret = alloc_chrdev_region(&dmirror_dev, 0, DMIRROR_NDEVICES,
				  "HMM_DMIRROR");
	if (ret)
		goto err_unreg;

	memset(dmirror_devices, 0, DMIRROR_NDEVICES * sizeof(dmirror_devices[0]));
	dmirror_devices[ndevices++].zone_device_type =
				HMM_DMIRROR_MEMORY_DEVICE_PRIVATE;
	dmirror_devices[ndevices++].zone_device_type =
				HMM_DMIRROR_MEMORY_DEVICE_PRIVATE;
	if (spm_addr_dev0 && spm_addr_dev1) {
		dmirror_devices[ndevices++].zone_device_type =
					HMM_DMIRROR_MEMORY_DEVICE_COHERENT;
		dmirror_devices[ndevices++].zone_device_type =
					HMM_DMIRROR_MEMORY_DEVICE_COHERENT;
	}
	for (id = 0; id < ndevices; id++) {
		ret = dmirror_device_init(dmirror_devices + id, id);
		if (ret)
			goto err_chrdev;
	}

	pr_info("HMM test module loaded. This is only for testing HMM.\n");
	return 0;

err_chrdev:
	while (--id >= 0)
		dmirror_device_remove(dmirror_devices + id);
	unregister_chrdev_region(dmirror_dev, DMIRROR_NDEVICES);
err_unreg:
	return ret;
}

static void __exit hmm_dmirror_exit(void)
{
	int id;

	for (id = 0; id < DMIRROR_NDEVICES; id++)
		if (dmirror_devices[id].zone_device_type)
			dmirror_device_remove(dmirror_devices + id);
	unregister_chrdev_region(dmirror_dev, DMIRROR_NDEVICES);
}

module_init(hmm_dmirror_init);
module_exit(hmm_dmirror_exit);
MODULE_DESCRIPTION("HMM (Heterogeneous Memory Management) test module");
MODULE_LICENSE("GPL");