  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * fs/dax.c - Direct Access filesystem code
  4. * Copyright (c) 2013-2014 Intel Corporation
  5. * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
  6. * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
  7. */
  8. #include <linux/atomic.h>
  9. #include <linux/blkdev.h>
  10. #include <linux/buffer_head.h>
  11. #include <linux/dax.h>
  12. #include <linux/fs.h>
  13. #include <linux/highmem.h>
  14. #include <linux/memcontrol.h>
  15. #include <linux/mm.h>
  16. #include <linux/mutex.h>
  17. #include <linux/pagevec.h>
  18. #include <linux/sched.h>
  19. #include <linux/sched/signal.h>
  20. #include <linux/uio.h>
  21. #include <linux/vmstat.h>
  22. #include <linux/pfn_t.h>
  23. #include <linux/sizes.h>
  24. #include <linux/mmu_notifier.h>
  25. #include <linux/iomap.h>
  26. #include <linux/rmap.h>
  27. #include <asm/pgalloc.h>
  28. #define CREATE_TRACE_POINTS
  29. #include <trace/events/fs_dax.h>
/* We choose 4096 entries - same as per-zone page wait tables */
#define DAX_WAIT_TABLE_BITS 12
#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)

/* The 'colour' (ie low bits) within a PMD of a page offset. */
#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
/* Number of PAGE_SIZE pages covered by one PMD entry. */
#define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT)
  36. static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
  37. static int __init init_dax_wait_table(void)
  38. {
  39. int i;
  40. for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
  41. init_waitqueue_head(wait_table + i);
  42. return 0;
  43. }
  44. fs_initcall(init_dax_wait_table);
/*
 * DAX pagecache entries use XArray value entries so they can't be mistaken
 * for pages. We use one bit for locking, one bit for the entry size (PMD)
 * and two more to tell us if the entry is a zero page or an empty entry that
 * is just used for locking. In total four special bits.
 *
 * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
 * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
 * block allocation.
 */
#define DAX_SHIFT (4)		/* low bits reserved for the flags below */
#define DAX_LOCKED (1UL << 0)	/* entry is locked by a DAX code path */
#define DAX_PMD (1UL << 1)	/* entry covers a PMD, not a single page */
#define DAX_ZERO_PAGE (1UL << 2) /* entry is backed by the zero page */
#define DAX_EMPTY (1UL << 3)	/* placeholder entry used only for locking */
/* Extract the pfn encoded above DAX_SHIFT in an XArray value entry. */
static unsigned long dax_to_pfn(void *entry)
{
        return xa_to_value(entry) >> DAX_SHIFT;
}

/* Build an XArray value entry from a pfn plus DAX_* flag bits. */
static void *dax_make_entry(pfn_t pfn, unsigned long flags)
{
        return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT));
}

/* true if the DAX_LOCKED bit is set in the value entry */
static bool dax_is_locked(void *entry)
{
        return xa_to_value(entry) & DAX_LOCKED;
}

/* Order of the entry: PMD_ORDER for PMD entries, 0 for PTE-sized entries. */
static unsigned int dax_entry_order(void *entry)
{
        if (xa_to_value(entry) & DAX_PMD)
                return PMD_ORDER;
        return 0;
}

/* Nonzero (the DAX_PMD bit) if the entry covers a whole PMD. */
static unsigned long dax_is_pmd_entry(void *entry)
{
        return xa_to_value(entry) & DAX_PMD;
}

/* true if the entry covers a single PAGE_SIZE page */
static bool dax_is_pte_entry(void *entry)
{
        return !(xa_to_value(entry) & DAX_PMD);
}

/* Nonzero if the entry is backed by the zero page (no storage allocated). */
static int dax_is_zero_entry(void *entry)
{
        return xa_to_value(entry) & DAX_ZERO_PAGE;
}

/* Nonzero if the entry is a locking placeholder with no backing at all. */
static int dax_is_empty_entry(void *entry)
{
        return xa_to_value(entry) & DAX_EMPTY;
}
/*
 * true if the entry that was found is of a smaller order than the entry
 * we were looking for.  get_unlocked_entry() returns XA_RETRY_ENTRY as
 * the sentinel for this case.
 */
static bool dax_is_conflict(void *entry)
{
        return entry == XA_RETRY_ENTRY;
}
/*
 * DAX page cache entry locking
 */
struct exceptional_entry_key {
        struct xarray *xa;	/* the mapping's i_pages array */
        pgoff_t entry_start;	/* first index covered by the entry */
};

/* Per-waiter queue entry carrying the key used to match wakeups. */
struct wait_exceptional_entry_queue {
        wait_queue_entry_t wait;
        struct exceptional_entry_key key;
};

/**
 * enum dax_wake_mode: waitqueue wakeup behaviour
 * @WAKE_ALL: wake all waiters in the waitqueue
 * @WAKE_NEXT: wake only the first waiter in the waitqueue
 */
enum dax_wake_mode {
        WAKE_ALL,
        WAKE_NEXT,
};
/*
 * Map an entry's (xarray, index) pair to one of the hashed waitqueues in
 * wait_table, filling in @key so wakers and waiters can be matched by
 * wake_exceptional_entry_func().
 */
static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas,
                void *entry, struct exceptional_entry_key *key)
{
        unsigned long hash;
        unsigned long index = xas->xa_index;

        /*
         * If 'entry' is a PMD, align the 'index' that we use for the wait
         * queue to the start of that PMD. This ensures that all offsets in
         * the range covered by the PMD map to the same bit lock.
         */
        if (dax_is_pmd_entry(entry))
                index &= ~PG_PMD_COLOUR;
        key->xa = xas->xa;
        key->entry_start = index;

        hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS);
        return wait_table + hash;
}
/*
 * Wake callback for the hashed DAX waitqueues.  Each queue is shared by
 * many entries, so only wake waiters whose key (xarray + entry start)
 * matches the entry being unlocked.
 */
static int wake_exceptional_entry_func(wait_queue_entry_t *wait,
                unsigned int mode, int sync, void *keyp)
{
        struct exceptional_entry_key *key = keyp;
        struct wait_exceptional_entry_queue *ewait =
                container_of(wait, struct wait_exceptional_entry_queue, wait);

        if (key->xa != ewait->key.xa ||
            key->entry_start != ewait->key.entry_start)
                return 0;
        return autoremove_wake_function(wait, mode, sync, NULL);
}
/*
 * @entry may no longer be the entry at the index in the mapping.
 * The important information it's conveying is whether the entry at
 * this index used to be a PMD entry.
 */
static void dax_wake_entry(struct xa_state *xas, void *entry,
                enum dax_wake_mode mode)
{
        struct exceptional_entry_key key;
        wait_queue_head_t *wq;

        wq = dax_entry_waitqueue(xas, entry, &key);

        /*
         * Checking for locked entry and prepare_to_wait_exclusive() happens
         * under the i_pages lock, ditto for entry handling in our callers.
         * So at this point all tasks that could have seen our entry locked
         * must be in the waitqueue and the following check will see them.
         */
        if (waitqueue_active(wq))
                __wake_up(wq, TASK_NORMAL, mode == WAKE_ALL ? 0 : 1, &key);
}
/*
 * Look up entry in page cache, wait for it to become unlocked if it
 * is a DAX entry and return it. The caller must subsequently call
 * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry()
 * if it did. The entry returned may have a larger order than @order.
 * If @order is larger than the order of the entry found in i_pages, this
 * function returns a dax_is_conflict entry.
 *
 * Must be called with the i_pages lock held.
 */
static void *get_unlocked_entry(struct xa_state *xas, unsigned int order)
{
        void *entry;
        struct wait_exceptional_entry_queue ewait;
        wait_queue_head_t *wq;

        init_wait(&ewait.wait);
        ewait.wait.func = wake_exceptional_entry_func;

        for (;;) {
                entry = xas_find_conflict(xas);
                /* no entry, or a non-value entry: return it as-is */
                if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
                        return entry;
                /* found an entry smaller than requested: signal conflict */
                if (dax_entry_order(entry) < order)
                        return XA_RETRY_ENTRY;
                if (!dax_is_locked(entry))
                        return entry;

                /*
                 * Entry is locked: sleep on the hashed waitqueue, dropping
                 * the i_pages lock while we sleep and retaking it before
                 * retrying the lookup.
                 */
                wq = dax_entry_waitqueue(xas, entry, &ewait.key);
                prepare_to_wait_exclusive(wq, &ewait.wait,
                                          TASK_UNINTERRUPTIBLE);
                xas_unlock_irq(xas);
                xas_reset(xas);
                schedule();
                finish_wait(wq, &ewait.wait);
                xas_lock_irq(xas);
        }
}
/*
 * The only thing keeping the address space around is the i_pages lock
 * (it's cycled in clear_inode() after removing the entries from i_pages)
 * After we call xas_unlock_irq(), we cannot touch xas->xa.
 */
static void wait_entry_unlocked(struct xa_state *xas, void *entry)
{
        struct wait_exceptional_entry_queue ewait;
        wait_queue_head_t *wq;

        init_wait(&ewait.wait);
        ewait.wait.func = wake_exceptional_entry_func;

        wq = dax_entry_waitqueue(xas, entry, &ewait.key);
        /*
         * Unlike get_unlocked_entry() there is no guarantee that this
         * path ever successfully retrieves an unlocked entry before an
         * inode dies. Perform a non-exclusive wait in case this path
         * never successfully performs its own wake up.
         */
        prepare_to_wait(wq, &ewait.wait, TASK_UNINTERRUPTIBLE);
        xas_unlock_irq(xas);
        schedule();
        finish_wait(wq, &ewait.wait);
}
  228. static void put_unlocked_entry(struct xa_state *xas, void *entry,
  229. enum dax_wake_mode mode)
  230. {
  231. if (entry && !dax_is_conflict(entry))
  232. dax_wake_entry(xas, entry, mode);
  233. }
/*
 * We used the xa_state to get the entry, but then we locked the entry and
 * dropped the xa_lock, so we know the xa_state is stale and must be reset
 * before use.
 */
static void dax_unlock_entry(struct xa_state *xas, void *entry)
{
        void *old;

        /* @entry is the *unlocked* value to store back */
        BUG_ON(dax_is_locked(entry));
        xas_reset(xas);
        xas_lock_irq(xas);
        old = xas_store(xas, entry);
        xas_unlock_irq(xas);
        /* whatever was stored must have carried the DAX_LOCKED bit */
        BUG_ON(!dax_is_locked(old));
        dax_wake_entry(xas, entry, WAKE_NEXT);
}
/*
 * Return: The entry stored at this location before it was locked.
 *
 * Caller must hold the xa_lock; this only sets the DAX_LOCKED bit in the
 * stored value entry.
 */
static void *dax_lock_entry(struct xa_state *xas, void *entry)
{
        unsigned long v = xa_to_value(entry);

        return xas_store(xas, xa_mk_value(v | DAX_LOCKED));
}
  258. static unsigned long dax_entry_size(void *entry)
  259. {
  260. if (dax_is_zero_entry(entry))
  261. return 0;
  262. else if (dax_is_empty_entry(entry))
  263. return 0;
  264. else if (dax_is_pmd_entry(entry))
  265. return PMD_SIZE;
  266. else
  267. return PAGE_SIZE;
  268. }
/* One past the last pfn mapped by @entry (equals the first pfn if none). */
static unsigned long dax_end_pfn(void *entry)
{
        return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
}

/*
 * Iterate through all mapped pfns represented by an entry, i.e. skip
 * 'empty' and 'zero' entries.
 */
#define for_each_mapped_pfn(entry, pfn) \
        for (pfn = dax_to_pfn(entry); \
                        pfn < dax_end_pfn(entry); pfn++)
/* true if page->mapping holds the shared marker rather than a real mapping */
static inline bool dax_page_is_shared(struct page *page)
{
        return page->mapping == PAGE_MAPPING_DAX_SHARED;
}
/*
 * Set the page->mapping with PAGE_MAPPING_DAX_SHARED flag, increase the
 * refcount.
 */
static inline void dax_page_share_get(struct page *page)
{
        if (page->mapping != PAGE_MAPPING_DAX_SHARED) {
                /*
                 * Reset the index if the page was already mapped
                 * regularly before.
                 *
                 * NOTE(review): presumably page->share aliases page->index
                 * in struct page, so this initialises the share count when
                 * converting a regularly-mapped page — confirm against the
                 * struct page definition.
                 */
                if (page->mapping)
                        page->share = 1;
                page->mapping = PAGE_MAPPING_DAX_SHARED;
        }
        page->share++;
}
/* Drop one share reference; returns the remaining share count. */
static inline unsigned long dax_page_share_put(struct page *page)
{
        return --page->share;
}
/*
 * When it is called in dax_insert_entry(), the shared flag will indicate that
 * whether this entry is shared by multiple files. If so, set the page->mapping
 * PAGE_MAPPING_DAX_SHARED, and use page->share as refcount.
 */
static void dax_associate_entry(void *entry, struct address_space *mapping,
                struct vm_area_struct *vma, unsigned long address, bool shared)
{
        unsigned long size = dax_entry_size(entry), pfn, index;
        int i = 0;

        if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
                return;

        /* file index of the first page covered by this (maybe PMD) entry */
        index = linear_page_index(vma, address & ~(size - 1));
        for_each_mapped_pfn(entry, pfn) {
                struct page *page = pfn_to_page(pfn);

                if (shared) {
                        dax_page_share_get(page);
                } else {
                        /* page should not already belong to a mapping */
                        WARN_ON_ONCE(page->mapping);
                        page->mapping = mapping;
                        page->index = index + i++;
                }
        }
}
/*
 * Undo dax_associate_entry(): drop a share reference, or clear
 * page->mapping and page->index once no sharer remains.
 */
static void dax_disassociate_entry(void *entry, struct address_space *mapping,
                bool trunc)
{
        unsigned long pfn;

        if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
                return;

        for_each_mapped_pfn(entry, pfn) {
                struct page *page = pfn_to_page(pfn);

                /* on truncate nobody else should still hold a page ref */
                WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
                if (dax_page_is_shared(page)) {
                        /* keep the shared flag if this page is still shared */
                        if (dax_page_share_put(page) > 0)
                                continue;
                } else
                        WARN_ON_ONCE(page->mapping && page->mapping != mapping);
                page->mapping = NULL;
                page->index = 0;
        }
}
  348. static struct page *dax_busy_page(void *entry)
  349. {
  350. unsigned long pfn;
  351. for_each_mapped_pfn(entry, pfn) {
  352. struct page *page = pfn_to_page(pfn);
  353. if (page_ref_count(page) > 1)
  354. return page;
  355. }
  356. return NULL;
  357. }
/**
 * dax_lock_folio - Lock the DAX entry corresponding to a folio
 * @folio: The folio whose entry we want to lock
 *
 * Context: Process context.
 * Return: A cookie to pass to dax_unlock_folio() or 0 if the entry could
 * not be locked.
 */
dax_entry_t dax_lock_folio(struct folio *folio)
{
        XA_STATE(xas, NULL, 0);
        void *entry;

        /* Ensure folio->mapping isn't freed while we look at it */
        rcu_read_lock();
        for (;;) {
                struct address_space *mapping = READ_ONCE(folio->mapping);

                entry = NULL;
                if (!mapping || !dax_mapping(mapping))
                        break;

                /*
                 * In the device-dax case there's no need to lock, a
                 * struct dev_pagemap pin is sufficient to keep the
                 * inode alive, and we assume we have dev_pagemap pin
                 * otherwise we would not have a valid pfn_to_page()
                 * translation.
                 */
                entry = (void *)~0UL;
                if (S_ISCHR(mapping->host->i_mode))
                        break;

                xas.xa = &mapping->i_pages;
                xas_lock_irq(&xas);
                /* mapping may have changed while we took the lock: retry */
                if (mapping != folio->mapping) {
                        xas_unlock_irq(&xas);
                        continue;
                }
                xas_set(&xas, folio->index);
                entry = xas_load(&xas);
                if (dax_is_locked(entry)) {
                        /* drop RCU while sleeping for the entry to unlock */
                        rcu_read_unlock();
                        wait_entry_unlocked(&xas, entry);
                        rcu_read_lock();
                        continue;
                }
                dax_lock_entry(&xas, entry);
                xas_unlock_irq(&xas);
                break;
        }
        rcu_read_unlock();
        return (dax_entry_t)entry;
}
/* Release the entry lock taken by dax_lock_folio() using its cookie. */
void dax_unlock_folio(struct folio *folio, dax_entry_t cookie)
{
        struct address_space *mapping = folio->mapping;
        XA_STATE(xas, &mapping->i_pages, folio->index);

        /* device-dax (S_ISCHR) folios were never actually locked */
        if (S_ISCHR(mapping->host->i_mode))
                return;

        dax_unlock_entry(&xas, (void *)cookie);
}
/*
 * dax_lock_mapping_entry - Lock the DAX entry corresponding to a mapping
 * @mapping: the file's mapping whose entry we want to lock
 * @index: the offset within this file
 * @page: output the dax page corresponding to this dax entry
 *
 * Return: A cookie to pass to dax_unlock_mapping_entry() or 0 if the entry
 * could not be locked.
 */
dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, pgoff_t index,
                struct page **page)
{
        XA_STATE(xas, NULL, 0);
        void *entry;

        rcu_read_lock();
        for (;;) {
                entry = NULL;
                if (!dax_mapping(mapping))
                        break;

                xas.xa = &mapping->i_pages;
                xas_lock_irq(&xas);
                xas_set(&xas, index);
                entry = xas_load(&xas);
                if (dax_is_locked(entry)) {
                        /* drop RCU while sleeping for the entry to unlock */
                        rcu_read_unlock();
                        wait_entry_unlocked(&xas, entry);
                        rcu_read_lock();
                        continue;
                }
                if (!entry ||
                    dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
                        /*
                         * Because we are looking for entry from file's mapping
                         * and index, so the entry may not be inserted for now,
                         * or even a zero/empty entry. We don't think this is
                         * an error case. So, return a special value and do
                         * not output @page.
                         */
                        entry = (void *)~0UL;
                } else {
                        *page = pfn_to_page(dax_to_pfn(entry));
                        dax_lock_entry(&xas, entry);
                }
                xas_unlock_irq(&xas);
                break;
        }
        rcu_read_unlock();
        return (dax_entry_t)entry;
}
/* Release the entry lock taken by dax_lock_mapping_entry(). */
void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index,
                dax_entry_t cookie)
{
        XA_STATE(xas, &mapping->i_pages, index);

        /* ~0UL is the "no entry was locked" sentinel cookie */
        if (cookie == ~0UL)
                return;

        dax_unlock_entry(&xas, (void *)cookie);
}
/*
 * Find page cache entry at given index. If it is a DAX entry, return it
 * with the entry locked. If the page cache doesn't contain an entry at
 * that index, add a locked empty entry.
 *
 * When requesting an entry with size DAX_PMD, grab_mapping_entry() will
 * either return that locked entry or will return VM_FAULT_FALLBACK.
 * This will happen if there are any PTE entries within the PMD range
 * that we are requesting.
 *
 * We always favor PTE entries over PMD entries. There isn't a flow where we
 * evict PTE entries in order to 'upgrade' them to a PMD entry. A PMD
 * insertion will fail if it finds any PTE entries already in the tree, and a
 * PTE insertion will cause an existing PMD entry to be unmapped and
 * downgraded to PTE entries. This happens for both PMD zero pages as
 * well as PMD empty entries.
 *
 * The exception to this downgrade path is for PMD entries that have
 * real storage backing them. We will leave these real PMD entries in
 * the tree, and PTE writes will simply dirty the entire PMD entry.
 *
 * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
 * persistent memory the benefit is doubtful. We can add that later if we can
 * show it helps.
 *
 * On error, this function does not return an ERR_PTR. Instead it returns
 * a VM_FAULT code, encoded as an xarray internal entry. The ERR_PTR values
 * overlap with xarray value entries.
 */
static void *grab_mapping_entry(struct xa_state *xas,
                struct address_space *mapping, unsigned int order)
{
        unsigned long index = xas->xa_index;
        bool pmd_downgrade;	/* splitting PMD entry into PTE entries? */
        void *entry;

retry:
        pmd_downgrade = false;
        xas_lock_irq(xas);
        entry = get_unlocked_entry(xas, order);

        if (entry) {
                if (dax_is_conflict(entry))
                        goto fallback;
                if (!xa_is_value(entry)) {
                        /* a real page where a DAX entry should be: corrupt */
                        xas_set_err(xas, -EIO);
                        goto out_unlock;
                }

                if (order == 0) {
                        /*
                         * PTE request found a zero/empty PMD entry: it must
                         * be split into PTE entries (see block comment).
                         */
                        if (dax_is_pmd_entry(entry) &&
                            (dax_is_zero_entry(entry) ||
                             dax_is_empty_entry(entry))) {
                                pmd_downgrade = true;
                        }
                }
        }

        if (pmd_downgrade) {
                /*
                 * Make sure 'entry' remains valid while we drop
                 * the i_pages lock.
                 */
                dax_lock_entry(xas, entry);

                /*
                 * Besides huge zero pages the only other thing that gets
                 * downgraded are empty entries which don't need to be
                 * unmapped.
                 */
                if (dax_is_zero_entry(entry)) {
                        xas_unlock_irq(xas);
                        unmap_mapping_pages(mapping,
                                        xas->xa_index & ~PG_PMD_COLOUR,
                                        PG_PMD_NR, false);
                        xas_reset(xas);
                        xas_lock_irq(xas);
                }

                dax_disassociate_entry(entry, mapping, false);
                xas_store(xas, NULL);	/* undo the PMD join */
                dax_wake_entry(xas, entry, WAKE_ALL);
                mapping->nrpages -= PG_PMD_NR;
                entry = NULL;
                xas_set(xas, index);
        }

        if (entry) {
                dax_lock_entry(xas, entry);
        } else {
                /* no usable entry: insert a locked empty placeholder */
                unsigned long flags = DAX_EMPTY;

                if (order > 0)
                        flags |= DAX_PMD;
                entry = dax_make_entry(pfn_to_pfn_t(0), flags);
                dax_lock_entry(xas, entry);
                if (xas_error(xas))
                        goto out_unlock;
                mapping->nrpages += 1UL << order;
        }

out_unlock:
        xas_unlock_irq(xas);
        /* allocate xarray nodes outside the lock and retry if needed */
        if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM))
                goto retry;
        if (xas->xa_node == XA_ERROR(-ENOMEM))
                return xa_mk_internal(VM_FAULT_OOM);
        if (xas_error(xas))
                return xa_mk_internal(VM_FAULT_SIGBUS);
        return entry;
fallback:
        xas_unlock_irq(xas);
        return xa_mk_internal(VM_FAULT_FALLBACK);
}
/**
 * dax_layout_busy_page_range - find first pinned page in @mapping
 * @mapping: address space to scan for a page with ref count > 1
 * @start: Starting offset. Page containing 'start' is included.
 * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX,
 * pages from 'start' till the end of file are included.
 *
 * DAX requires ZONE_DEVICE mapped pages. These pages are never
 * 'onlined' to the page allocator so they are considered idle when
 * page->count == 1. A filesystem uses this interface to determine if
 * any page in the mapping is busy, i.e. for DMA, or other
 * get_user_pages() usages.
 *
 * It is expected that the filesystem is holding locks to block the
 * establishment of new mappings in this address_space. I.e. it expects
 * to be able to run unmap_mapping_range() and subsequently not race
 * mapping_mapped() becoming true.
 */
struct page *dax_layout_busy_page_range(struct address_space *mapping,
                loff_t start, loff_t end)
{
        void *entry;
        unsigned int scanned = 0;
        struct page *page = NULL;
        pgoff_t start_idx = start >> PAGE_SHIFT;
        pgoff_t end_idx;
        XA_STATE(xas, &mapping->i_pages, start_idx);

        /*
         * In the 'limited' case get_user_pages() for dax is disabled.
         */
        if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
                return NULL;

        if (!dax_mapping(mapping) || !mapping_mapped(mapping))
                return NULL;

        /* If end == LLONG_MAX, all pages from start to till end of file */
        if (end == LLONG_MAX)
                end_idx = ULONG_MAX;
        else
                end_idx = end >> PAGE_SHIFT;
        /*
         * If we race get_user_pages_fast() here either we'll see the
         * elevated page count in the iteration and wait, or
         * get_user_pages_fast() will see that the page it took a reference
         * against is no longer mapped in the page tables and bail to the
         * get_user_pages() slow path. The slow path is protected by
         * pte_lock() and pmd_lock(). New references are not taken without
         * holding those locks, and unmap_mapping_pages() will not zero the
         * pte or pmd without holding the respective lock, so we are
         * guaranteed to either see new references or prevent new
         * references from being established.
         */
        unmap_mapping_pages(mapping, start_idx, end_idx - start_idx + 1, 0);

        xas_lock_irq(&xas);
        xas_for_each(&xas, entry, end_idx) {
                if (WARN_ON_ONCE(!xa_is_value(entry)))
                        continue;
                if (unlikely(dax_is_locked(entry)))
                        entry = get_unlocked_entry(&xas, 0);
                if (entry)
                        page = dax_busy_page(entry);
                put_unlocked_entry(&xas, entry, WAKE_NEXT);
                if (page)
                        break;
                /* periodically drop the lock so we can be rescheduled */
                if (++scanned % XA_CHECK_SCHED)
                        continue;

                xas_pause(&xas);
                xas_unlock_irq(&xas);
                cond_resched();
                xas_lock_irq(&xas);
        }
        xas_unlock_irq(&xas);
        return page;
}
EXPORT_SYMBOL_GPL(dax_layout_busy_page_range);
/* Scan the whole file; see dax_layout_busy_page_range() for semantics. */
struct page *dax_layout_busy_page(struct address_space *mapping)
{
        return dax_layout_busy_page_range(mapping, 0, LLONG_MAX);
}
EXPORT_SYMBOL_GPL(dax_layout_busy_page);
/*
 * Remove the DAX entry at @index.  If @trunc is false the entry is kept
 * when it is still dirty or queued for writeback.  Returns 1 if an entry
 * was removed, 0 otherwise.
 */
static int __dax_invalidate_entry(struct address_space *mapping,
                pgoff_t index, bool trunc)
{
        XA_STATE(xas, &mapping->i_pages, index);
        int ret = 0;
        void *entry;

        xas_lock_irq(&xas);
        entry = get_unlocked_entry(&xas, 0);
        if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
                goto out;
        /* keep clean-only invalidation away from dirty/towrite entries */
        if (!trunc &&
            (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) ||
             xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE)))
                goto out;
        dax_disassociate_entry(entry, mapping, trunc);
        xas_store(&xas, NULL);
        mapping->nrpages -= 1UL << dax_entry_order(entry);
        ret = 1;
out:
        put_unlocked_entry(&xas, entry, WAKE_ALL);
        xas_unlock_irq(&xas);
        return ret;
}
/*
 * Clear the DIRTY and TOWRITE marks on all DAX entries in [start, end],
 * waiting for each locked entry first so the marks are stable when
 * cleared.  The lock is dropped periodically to allow rescheduling.
 */
static int __dax_clear_dirty_range(struct address_space *mapping,
                pgoff_t start, pgoff_t end)
{
        XA_STATE(xas, &mapping->i_pages, start);
        unsigned int scanned = 0;
        void *entry;

        xas_lock_irq(&xas);
        xas_for_each(&xas, entry, end) {
                entry = get_unlocked_entry(&xas, 0);
                xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
                xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
                put_unlocked_entry(&xas, entry, WAKE_NEXT);

                if (++scanned % XA_CHECK_SCHED)
                        continue;

                xas_pause(&xas);
                xas_unlock_irq(&xas);
                cond_resched();
                xas_lock_irq(&xas);
        }
        xas_unlock_irq(&xas);

        return 0;
}
/*
 * Delete DAX entry at @index from @mapping.  Wait for it
 * to be unlocked before deleting it.
 *
 * Returns 1 on success; 0 (with a WARN) if no entry was found.
 */
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	int ret = __dax_invalidate_entry(mapping, index, true);

	/*
	 * This gets called from truncate / punch_hole path. As such, the caller
	 * must hold locks protecting against concurrent modifications of the
	 * page cache (usually fs-private i_mmap_sem for writing). Since the
	 * caller has seen a DAX entry for this index, we better find it
	 * at that index as well...
	 */
	WARN_ON_ONCE(!ret);
	return ret;
}
/*
 * Invalidate DAX entry if it is clean.
 *
 * Returns 1 if the entry was removed, 0 if it was kept — dirty or
 * to-write entries are left in place so a later flush can find them.
 */
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
				      pgoff_t index)
{
	return __dax_invalidate_entry(mapping, index, false);
}
/*
 * Translate file position @pos within @iomap into a page offset into the
 * backing DAX device (device address of the containing page, in pages).
 */
static pgoff_t dax_iomap_pgoff(const struct iomap *iomap, loff_t pos)
{
	return PHYS_PFN(iomap->addr + (pos & PAGE_MASK) - iomap->offset);
}
/*
 * Copy one page from DAX storage at the faulting position into the
 * private COW page of @vmf. Returns 0 on success or a negative errno
 * from dax_direct_access().
 */
static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter)
{
	pgoff_t pgoff = dax_iomap_pgoff(&iter->iomap, iter->pos);
	void *vto, *kaddr;
	long rc;
	int id;

	id = dax_read_lock();
	/* Map a single page of the source so we can copy from it */
	rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, DAX_ACCESS,
				&kaddr, NULL);
	if (rc < 0) {
		dax_read_unlock(id);
		return rc;
	}
	vto = kmap_atomic(vmf->cow_page);
	copy_user_page(vto, kaddr, vmf->address, vmf->cow_page);
	kunmap_atomic(vto);
	dax_read_unlock(id);
	return 0;
}
  750. /*
  751. * MAP_SYNC on a dax mapping guarantees dirty metadata is
  752. * flushed on write-faults (non-cow), but not read-faults.
  753. */
  754. static bool dax_fault_is_synchronous(const struct iomap_iter *iter,
  755. struct vm_area_struct *vma)
  756. {
  757. return (iter->flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC) &&
  758. (iter->iomap.flags & IOMAP_F_DIRTY);
  759. }
/*
 * By this point grab_mapping_entry() has ensured that we have a locked entry
 * of the appropriate size so we don't have to worry about downgrading PMDs to
 * PTEs.  If we happen to be trying to insert a PTE and there is a PMD
 * already in the tree, we will skip the insertion and just dirty the PMD as
 * appropriate.
 *
 * Returns the entry now present in the mapping (either @entry or the
 * freshly built one), locked.
 */
static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
		const struct iomap_iter *iter, void *entry, pfn_t pfn,
		unsigned long flags)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	void *new_entry = dax_make_entry(pfn, flags);
	bool write = iter->flags & IOMAP_WRITE;
	bool dirty = write && !dax_fault_is_synchronous(iter, vmf->vma);
	bool shared = iter->iomap.flags & IOMAP_F_SHARED;

	if (dirty)
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

	if (shared || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) {
		unsigned long index = xas->xa_index;
		/* we are replacing a zero page with block mapping */
		if (dax_is_pmd_entry(entry))
			unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
					PG_PMD_NR, false);
		else /* pte entry */
			unmap_mapping_pages(mapping, index, 1, false);
	}

	xas_reset(xas);
	xas_lock_irq(xas);
	if (shared || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
		void *old;

		dax_disassociate_entry(entry, mapping, false);
		dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address,
				shared);
		/*
		 * Only swap our new entry into the page cache if the current
		 * entry is a zero page or an empty entry.  If a normal PTE or
		 * PMD entry is already in the cache, we leave it alone.  This
		 * means that if we are trying to insert a PTE and the
		 * existing entry is a PMD, we will just leave the PMD in the
		 * tree and dirty it if necessary.
		 */
		old = dax_lock_entry(xas, new_entry);
		WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) |
					DAX_LOCKED));
		entry = new_entry;
	} else {
		xas_load(xas);	/* Walk the xa_state */
	}

	if (dirty)
		xas_set_mark(xas, PAGECACHE_TAG_DIRTY);

	/* Shared (CoW) writes must be flushed before fsync completes */
	if (write && shared)
		xas_set_mark(xas, PAGECACHE_TAG_TOWRITE);

	xas_unlock_irq(xas);
	return entry;
}
/*
 * Flush one DAX entry: write-protect all userspace mappings of its pfn
 * range, flush CPU caches to the persistence domain, then clear the dirty
 * tag. Called with xas locked; temporarily drops and re-takes the lock
 * around the flush. Returns 0 on success or a negative errno.
 */
static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
		struct address_space *mapping, void *entry)
{
	unsigned long pfn, index, count, end;
	long ret = 0;
	struct vm_area_struct *vma;

	/*
	 * A page got tagged dirty in DAX mapping? Something is seriously
	 * wrong.
	 */
	if (WARN_ON(!xa_is_value(entry)))
		return -EIO;

	if (unlikely(dax_is_locked(entry))) {
		void *old_entry = entry;

		entry = get_unlocked_entry(xas, 0);

		/* Entry got punched out / reallocated? */
		if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
			goto put_unlocked;
		/*
		 * Entry got reallocated elsewhere? No need to writeback.
		 * We have to compare pfns as we must not bail out due to
		 * difference in lockbit or entry type.
		 */
		if (dax_to_pfn(old_entry) != dax_to_pfn(entry))
			goto put_unlocked;
		if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
					dax_is_zero_entry(entry))) {
			ret = -EIO;
			goto put_unlocked;
		}

		/* Another fsync thread may have already done this entry */
		if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE))
			goto put_unlocked;
	}

	/* Lock the entry to serialize with page faults */
	dax_lock_entry(xas, entry);

	/*
	 * We can clear the tag now but we have to be careful so that concurrent
	 * dax_writeback_one() calls for the same index cannot finish before we
	 * actually flush the caches. This is achieved as the calls will look
	 * at the entry only under the i_pages lock and once they do that
	 * they will see the entry locked and wait for it to unlock.
	 */
	xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE);
	xas_unlock_irq(xas);

	/*
	 * If dax_writeback_mapping_range() was given a wbc->range_start
	 * in the middle of a PMD, the 'index' we use needs to be
	 * aligned to the start of the PMD.
	 * This allows us to flush for PMD_SIZE and not have to worry about
	 * partial PMD writebacks.
	 */
	pfn = dax_to_pfn(entry);
	count = 1UL << dax_entry_order(entry);
	index = xas->xa_index & ~(count - 1);
	end = index + count - 1;

	/* Walk all mappings of a given index of a file and writeprotect them */
	i_mmap_lock_read(mapping);
	vma_interval_tree_foreach(vma, &mapping->i_mmap, index, end) {
		pfn_mkclean_range(pfn, count, index, vma);
		cond_resched();
	}
	i_mmap_unlock_read(mapping);

	dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE);
	/*
	 * After we have flushed the cache, we can clear the dirty tag. There
	 * cannot be new dirty data in the pfn after the flush has completed as
	 * the pfn mappings are writeprotected and fault waits for mapping
	 * entry lock.
	 */
	xas_reset(xas);
	xas_lock_irq(xas);
	xas_store(xas, entry);
	xas_clear_mark(xas, PAGECACHE_TAG_DIRTY);
	dax_wake_entry(xas, entry, WAKE_NEXT);

	trace_dax_writeback_one(mapping->host, index, count);
	return ret;

 put_unlocked:
	put_unlocked_entry(xas, entry, WAKE_NEXT);
	return ret;
}
/*
 * Flush the mapping to the persistent domain within the byte range of [start,
 * end]. This is required by data integrity operations to ensure file data is
 * on persistent storage prior to completion of the operation.
 *
 * Only WB_SYNC_ALL writeback is supported; other sync modes return 0
 * without doing anything. Returns 0 or the first error from
 * dax_writeback_one().
 */
int dax_writeback_mapping_range(struct address_space *mapping,
		struct dax_device *dax_dev, struct writeback_control *wbc)
{
	XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT);
	struct inode *inode = mapping->host;
	pgoff_t end_index = wbc->range_end >> PAGE_SHIFT;
	void *entry;
	int ret = 0;
	unsigned int scanned = 0;

	/* DAX requires block size == page size */
	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
		return -EIO;

	if (mapping_empty(mapping) || wbc->sync_mode != WB_SYNC_ALL)
		return 0;

	trace_dax_writeback_range(inode, xas.xa_index, end_index);

	/* Mark dirty entries TOWRITE so new dirtying doesn't stall us */
	tag_pages_for_writeback(mapping, xas.xa_index, end_index);

	xas_lock_irq(&xas);
	xas_for_each_marked(&xas, entry, end_index, PAGECACHE_TAG_TOWRITE) {
		ret = dax_writeback_one(&xas, dax_dev, mapping, entry);
		if (ret < 0) {
			mapping_set_error(mapping, ret);
			break;
		}
		/* Periodically drop the lock and reschedule */
		if (++scanned % XA_CHECK_SCHED)
			continue;

		xas_pause(&xas);
		xas_unlock_irq(&xas);
		cond_resched();
		xas_lock_irq(&xas);
	}
	xas_unlock_irq(&xas);
	trace_dax_writeback_range_done(inode, xas.xa_index, end_index);
	return ret;
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
/*
 * Resolve (@pos, @size) within @iomap to a kernel address and/or pfn via
 * dax_direct_access().
 *
 * When @pfnp is supplied the result is additionally validated: the mapped
 * length must cover @size, the pfn must be aligned to @size, and requests
 * larger than one page must be backed by devmap pages. When @kaddr is
 * supplied, a NULL mapped address is rejected. Returns 0 or -errno.
 */
static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
		size_t size, void **kaddr, pfn_t *pfnp)
{
	pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
	int id, rc = 0;
	long length;

	id = dax_read_lock();
	length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
				   DAX_ACCESS, kaddr, pfnp);
	if (length < 0) {
		rc = length;
		goto out;
	}
	if (!pfnp)
		goto out_check_addr;
	rc = -EINVAL;
	if (PFN_PHYS(length) < size)
		goto out;
	if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1))
		goto out;
	/* For larger pages we need devmap */
	if (length > 1 && !pfn_t_devmap(*pfnp))
		goto out;
	rc = 0;

out_check_addr:
	if (!kaddr)
		goto out;
	if (!*kaddr)
		rc = -EFAULT;
out:
	dax_read_unlock(id);
	return rc;
}
/**
 * dax_iomap_copy_around - Prepare for an unaligned write to a shared/cow page
 * by copying the data before and after the range to be written.
 * @pos:	address to do copy from.
 * @length:	size of copy operation.
 * @align_size:	aligned w.r.t align_size (either PMD_SIZE or PAGE_SIZE)
 * @srcmap:	iomap srcmap
 * @daddr:	destination address to copy to.
 *
 * This can be called from two places. Either during DAX write fault (page
 * aligned), to copy the length size data to daddr. Or, while doing normal DAX
 * write operation, dax_iomap_iter() might call this to do the copy of either
 * start or end unaligned address. In the latter case the rest of the copy of
 * aligned ranges is taken care by dax_iomap_iter() itself.
 * If the srcmap contains invalid data, such as HOLE and UNWRITTEN, zero the
 * area to make sure no old data remains.
 *
 * Returns 0 on success, -EIO on a machine-check-detected copy failure or
 * a block-layer-mapped errno if the source could not be accessed.
 */
static int dax_iomap_copy_around(loff_t pos, uint64_t length, size_t align_size,
		const struct iomap *srcmap, void *daddr)
{
	/* offset of @pos within its aligned block */
	loff_t head_off = pos & (align_size - 1);
	size_t size = ALIGN(head_off + length, align_size);
	loff_t end = pos + length;
	loff_t pg_end = round_up(end, align_size);
	/* copy_all is usually in page fault case */
	bool copy_all = head_off == 0 && end == pg_end;
	/* zero the edges if srcmap is a HOLE or IOMAP_UNWRITTEN */
	bool zero_edge = srcmap->flags & IOMAP_F_SHARED ||
			 srcmap->type == IOMAP_UNWRITTEN;
	void *saddr = NULL;
	int ret = 0;

	if (!zero_edge) {
		ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL);
		if (ret)
			return dax_mem2blk_err(ret);
	}

	if (copy_all) {
		if (zero_edge)
			memset(daddr, 0, size);
		else
			ret = copy_mc_to_kernel(daddr, saddr, length);
		goto out;
	}

	/* Copy the head part of the range */
	if (head_off) {
		if (zero_edge)
			memset(daddr, 0, head_off);
		else {
			ret = copy_mc_to_kernel(daddr, saddr, head_off);
			if (ret)
				return -EIO;
		}
	}

	/* Copy the tail part of the range */
	if (end < pg_end) {
		/* tail_off is relative to the start of the aligned block */
		loff_t tail_off = head_off + length;
		loff_t tail_len = pg_end - end;

		if (zero_edge)
			memset(daddr + tail_off, 0, tail_len);
		else {
			ret = copy_mc_to_kernel(daddr + tail_off,
						saddr + tail_off, tail_len);
			if (ret)
				return -EIO;
		}
	}
out:
	/* Zeroed edges were written via memset; push them to media */
	if (zero_edge)
		dax_flush(srcmap->dax_dev, daddr, size);
	return ret ? -EIO : 0;
}
/*
 * The user has performed a load from a hole in the file.  Allocating a new
 * page in the file would cause excessive storage usage for workloads with
 * sparse files.  Instead we insert a read-only mapping of the 4k zero page.
 * If this page is ever written to we will re-fault and change the mapping to
 * point to real DAX storage instead.
 *
 * @entry is updated in place to the newly inserted zero-page entry.
 */
static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf,
		const struct iomap_iter *iter, void **entry)
{
	struct inode *inode = iter->inode;
	unsigned long vaddr = vmf->address;
	pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
	vm_fault_t ret;

	*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE);

	ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
	trace_dax_load_hole(inode, vmf, ret);
	return ret;
}
  1059. #ifdef CONFIG_FS_DAX_PMD
/*
 * PMD-sized analogue of dax_load_hole(): map the shared huge zero folio
 * read-only at the faulting PMD. Falls back to PTEs if the huge zero
 * folio is unavailable or a PMD was populated concurrently.
 */
static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
		const struct iomap_iter *iter, void **entry)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	unsigned long pmd_addr = vmf->address & PMD_MASK;
	struct vm_area_struct *vma = vmf->vma;
	struct inode *inode = mapping->host;
	pgtable_t pgtable = NULL;
	struct folio *zero_folio;
	spinlock_t *ptl;
	pmd_t pmd_entry;
	pfn_t pfn;

	zero_folio = mm_get_huge_zero_folio(vmf->vma->vm_mm);

	if (unlikely(!zero_folio))
		goto fallback;

	pfn = page_to_pfn_t(&zero_folio->page);
	*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn,
				  DAX_PMD | DAX_ZERO_PAGE);

	/* Some architectures require a deposited page table for huge PMDs */
	if (arch_needs_pgtable_deposit()) {
		pgtable = pte_alloc_one(vma->vm_mm);
		if (!pgtable)
			return VM_FAULT_OOM;
	}

	ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
	/* Raced with another fault that populated the PMD: fall back */
	if (!pmd_none(*(vmf->pmd))) {
		spin_unlock(ptl);
		goto fallback;
	}

	if (pgtable) {
		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
		mm_inc_nr_ptes(vma->vm_mm);
	}
	pmd_entry = mk_pmd(&zero_folio->page, vmf->vma->vm_page_prot);
	pmd_entry = pmd_mkhuge(pmd_entry);
	set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
	spin_unlock(ptl);
	trace_dax_pmd_load_hole(inode, vmf, zero_folio, *entry);
	return VM_FAULT_NOPAGE;

fallback:
	if (pgtable)
		pte_free(vma->vm_mm, pgtable);
	trace_dax_pmd_load_hole_fallback(inode, vmf, zero_folio, *entry);
	return VM_FAULT_FALLBACK;
}
  1104. #else
/* Without CONFIG_FS_DAX_PMD, PMD faults always fall back to PTEs. */
static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
		const struct iomap_iter *iter, void **entry)
{
	return VM_FAULT_FALLBACK;
}
  1110. #endif /* CONFIG_FS_DAX_PMD */
/*
 * Break block sharing for one iomap iteration: copy the (page-aligned
 * extension of the) range from the source extent to the newly allocated
 * destination extent. Returns the number of bytes processed or a
 * negative errno (machine-check failures become -EIO via
 * dax_mem2blk_err()).
 */
static s64 dax_unshare_iter(struct iomap_iter *iter)
{
	struct iomap *iomap = &iter->iomap;
	const struct iomap *srcmap = iomap_iter_srcmap(iter);
	loff_t copy_pos = iter->pos;
	u64 copy_len = iomap_length(iter);
	u32 mod;
	int id = 0;
	s64 ret = 0;
	void *daddr = NULL, *saddr = NULL;

	if (!iomap_want_unshare_iter(iter))
		return iomap_length(iter);

	/*
	 * Extend the file range to be aligned to fsblock/pagesize, because
	 * we need to copy entire blocks, not just the byte range specified.
	 * Invalidate the mapping because we're about to CoW.
	 */
	mod = offset_in_page(copy_pos);
	if (mod) {
		copy_len += mod;
		copy_pos -= mod;
	}

	mod = offset_in_page(copy_pos + copy_len);
	if (mod)
		copy_len += PAGE_SIZE - mod;

	invalidate_inode_pages2_range(iter->inode->i_mapping,
				      copy_pos >> PAGE_SHIFT,
				      (copy_pos + copy_len - 1) >> PAGE_SHIFT);

	id = dax_read_lock();
	ret = dax_iomap_direct_access(iomap, copy_pos, copy_len, &daddr, NULL);
	if (ret < 0)
		goto out_unlock;

	ret = dax_iomap_direct_access(srcmap, copy_pos, copy_len, &saddr, NULL);
	if (ret < 0)
		goto out_unlock;

	/* copy_mc_to_kernel() returns 0 on success, bytes-not-copied on MCE */
	if (copy_mc_to_kernel(daddr, saddr, copy_len) == 0)
		ret = iomap_length(iter);
	else
		ret = -EIO;

out_unlock:
	dax_read_unlock(id);
	return dax_mem2blk_err(ret);
}
  1154. int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len,
  1155. const struct iomap_ops *ops)
  1156. {
  1157. struct iomap_iter iter = {
  1158. .inode = inode,
  1159. .pos = pos,
  1160. .flags = IOMAP_WRITE | IOMAP_UNSHARE | IOMAP_DAX,
  1161. };
  1162. loff_t size = i_size_read(inode);
  1163. int ret;
  1164. if (pos < 0 || pos >= size)
  1165. return 0;
  1166. iter.len = min(len, size - pos);
  1167. while ((ret = iomap_iter(&iter, ops)) > 0)
  1168. iter.processed = dax_unshare_iter(&iter);
  1169. return ret;
  1170. }
  1171. EXPORT_SYMBOL_GPL(dax_file_unshare);
/*
 * Zero @size bytes at @pos directly through the DAX mapping (sub-page
 * granularity). On shared (CoW) extents the rest of the page is filled
 * from the source map via dax_iomap_copy_around(); otherwise the zeroed
 * bytes are flushed to media.
 *
 * NOTE(review): on the non-shared path @ret keeps the positive value
 * returned by dax_direct_access(); the caller (dax_zero_iter) only
 * checks for < 0, so this is benign — confirm before relying on 0.
 */
static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size)
{
	const struct iomap *iomap = &iter->iomap;
	const struct iomap *srcmap = iomap_iter_srcmap(iter);
	unsigned offset = offset_in_page(pos);
	pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
	void *kaddr;
	long ret;

	ret = dax_direct_access(iomap->dax_dev, pgoff, 1, DAX_ACCESS, &kaddr,
				NULL);
	if (ret < 0)
		return dax_mem2blk_err(ret);

	memset(kaddr + offset, 0, size);
	if (iomap->flags & IOMAP_F_SHARED)
		ret = dax_iomap_copy_around(pos, size, PAGE_SIZE, srcmap,
					    kaddr);
	else
		dax_flush(iomap->dax_dev, kaddr + offset, size);
	return ret;
}
/*
 * Zero one iomap iteration's range. Holes and unwritten extents already
 * read as zeroes and need no work. Otherwise zero page by page: whole
 * pages through dax_zero_page_range(), partial pages through
 * dax_memzero(). Returns bytes zeroed or a negative errno.
 */
static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
{
	const struct iomap *iomap = &iter->iomap;
	const struct iomap *srcmap = iomap_iter_srcmap(iter);
	loff_t pos = iter->pos;
	u64 length = iomap_length(iter);
	s64 written = 0;

	/* already zeroed?  we're done. */
	if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
		return length;

	/*
	 * invalidate the pages whose sharing state is to be changed
	 * because of CoW.
	 */
	if (iomap->flags & IOMAP_F_SHARED)
		invalidate_inode_pages2_range(iter->inode->i_mapping,
					      pos >> PAGE_SHIFT,
					      (pos + length - 1) >> PAGE_SHIFT);

	do {
		unsigned offset = offset_in_page(pos);
		unsigned size = min_t(u64, PAGE_SIZE - offset, length);
		pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
		long rc;
		int id;

		id = dax_read_lock();
		if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE)
			rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
		else
			rc = dax_memzero(iter, pos, size);
		dax_read_unlock(id);

		if (rc < 0)
			return rc;
		pos += size;
		length -= size;
		written += size;
	} while (length > 0);

	if (did_zero)
		*did_zero = true;
	return written;
}
  1232. int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
  1233. const struct iomap_ops *ops)
  1234. {
  1235. struct iomap_iter iter = {
  1236. .inode = inode,
  1237. .pos = pos,
  1238. .len = len,
  1239. .flags = IOMAP_DAX | IOMAP_ZERO,
  1240. };
  1241. int ret;
  1242. while ((ret = iomap_iter(&iter, ops)) > 0)
  1243. iter.processed = dax_zero_iter(&iter, did_zero);
  1244. return ret;
  1245. }
  1246. EXPORT_SYMBOL_GPL(dax_zero_range);
  1247. int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
  1248. const struct iomap_ops *ops)
  1249. {
  1250. unsigned int blocksize = i_blocksize(inode);
  1251. unsigned int off = pos & (blocksize - 1);
  1252. /* Block boundary? Nothing to do */
  1253. if (!off)
  1254. return 0;
  1255. return dax_zero_range(inode, pos, blocksize - off, did_zero, ops);
  1256. }
  1257. EXPORT_SYMBOL_GPL(dax_truncate_page);
/*
 * Perform the actual data copy for one iomap iteration of a DAX
 * read/write. Returns the number of bytes transferred, or a negative
 * errno if nothing was transferred.
 */
static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
		struct iov_iter *iter)
{
	const struct iomap *iomap = &iomi->iomap;
	const struct iomap *srcmap = iomap_iter_srcmap(iomi);
	loff_t length = iomap_length(iomi);
	loff_t pos = iomi->pos;
	struct dax_device *dax_dev = iomap->dax_dev;
	loff_t end = pos + length, done = 0;
	bool write = iov_iter_rw(iter) == WRITE;
	bool cow = write && iomap->flags & IOMAP_F_SHARED;
	ssize_t ret = 0;
	size_t xfer;
	int id;

	if (!write) {
		/* Reads are clamped to EOF; holes read as zeroes */
		end = min(end, i_size_read(iomi->inode));
		if (pos >= end)
			return 0;

		if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
			return iov_iter_zero(min(length, end - pos), iter);
	}

	/*
	 * In DAX mode, enforce either pure overwrites of written extents, or
	 * writes to unwritten extents as part of a copy-on-write operation.
	 */
	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED &&
			!(iomap->flags & IOMAP_F_SHARED)))
		return -EIO;

	/*
	 * Write can allocate block for an area which has a hole page mapped
	 * into page tables. We have to tear down these mappings so that data
	 * written by write(2) is visible in mmap.
	 */
	if (iomap->flags & IOMAP_F_NEW || cow) {
		/*
		 * Filesystem allows CoW on non-shared extents. The src extents
		 * may have been mmapped with dirty mark before. To be able to
		 * invalidate its dax entries, we need to clear the dirty mark
		 * in advance.
		 */
		if (cow)
			__dax_clear_dirty_range(iomi->inode->i_mapping,
						pos >> PAGE_SHIFT,
						(end - 1) >> PAGE_SHIFT);
		invalidate_inode_pages2_range(iomi->inode->i_mapping,
					      pos >> PAGE_SHIFT,
					      (end - 1) >> PAGE_SHIFT);
	}

	id = dax_read_lock();
	while (pos < end) {
		unsigned offset = pos & (PAGE_SIZE - 1);
		const size_t size = ALIGN(length + offset, PAGE_SIZE);
		pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
		ssize_t map_len;
		bool recovery = false;
		void *kaddr;

		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
				DAX_ACCESS, &kaddr, NULL);
		/* Poisoned media: retry in recovery mode for writes */
		if (map_len == -EHWPOISON && iov_iter_rw(iter) == WRITE) {
			map_len = dax_direct_access(dax_dev, pgoff,
					PHYS_PFN(size), DAX_RECOVERY_WRITE,
					&kaddr, NULL);
			if (map_len > 0)
				recovery = true;
		}
		if (map_len < 0) {
			ret = dax_mem2blk_err(map_len);
			break;
		}

		/* CoW: fill the unaligned head/tail from the source extent */
		if (cow) {
			ret = dax_iomap_copy_around(pos, length, PAGE_SIZE,
						    srcmap, kaddr);
			if (ret)
				break;
		}

		map_len = PFN_PHYS(map_len);
		kaddr += offset;
		map_len -= offset;
		if (map_len > end - pos)
			map_len = end - pos;

		if (recovery)
			xfer = dax_recovery_write(dax_dev, pgoff, kaddr,
					map_len, iter);
		else if (write)
			xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
					map_len, iter);
		else
			xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
					map_len, iter);

		pos += xfer;
		length -= xfer;
		done += xfer;

		/* A short copy means the iov faulted; stop here */
		if (xfer == 0)
			ret = -EFAULT;
		if (xfer < map_len)
			break;
	}
	dax_read_unlock(id);

	return done ? done : ret;
}
/**
 * dax_iomap_rw - Perform I/O to a DAX file
 * @iocb:	The control block for this I/O
 * @iter:	The addresses to do I/O from or to
 * @ops:	iomap ops passed from the file system
 *
 * This function performs read and write operations to directly mapped
 * persistent memory.  The callers needs to take care of read/write exclusion
 * and evicting any page cache pages in the region under I/O.
 *
 * Returns the number of bytes transferred, or a negative errno if
 * nothing was transferred.
 */
ssize_t
dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
		const struct iomap_ops *ops)
{
	struct iomap_iter iomi = {
		.inode		= iocb->ki_filp->f_mapping->host,
		.pos		= iocb->ki_pos,
		.len		= iov_iter_count(iter),
		.flags		= IOMAP_DAX,
	};
	loff_t done = 0;
	int ret;

	if (!iomi.len)
		return 0;

	/* Callers must hold i_rwsem: exclusively for writes, shared for reads */
	if (iov_iter_rw(iter) == WRITE) {
		lockdep_assert_held_write(&iomi.inode->i_rwsem);
		iomi.flags |= IOMAP_WRITE;
	} else if (!sb_rdonly(iomi.inode->i_sb)) {
		lockdep_assert_held(&iomi.inode->i_rwsem);
	}

	if (iocb->ki_flags & IOCB_NOWAIT)
		iomi.flags |= IOMAP_NOWAIT;

	while ((ret = iomap_iter(&iomi, ops)) > 0)
		iomi.processed = dax_iomap_iter(&iomi, iter);

	done = iomi.pos - iocb->ki_pos;
	iocb->ki_pos = iomi.pos;
	return done ? done : ret;
}
EXPORT_SYMBOL_GPL(dax_iomap_rw);
  1401. static vm_fault_t dax_fault_return(int error)
  1402. {
  1403. if (error == 0)
  1404. return VM_FAULT_NOPAGE;
  1405. return vmf_error(error);
  1406. }
/*
 * When handling a synchronous page fault and the inode need a fsync, we can
 * insert the PTE/PMD into page tables only after that fsync happened. Skip
 * insertion for now and return the pfn so that caller can insert it after the
 * fsync is done.
 */
static vm_fault_t dax_fault_synchronous_pfnp(pfn_t *pfnp, pfn_t pfn)
{
	/* Synchronous faults require a pfn out-parameter to hand back */
	if (WARN_ON_ONCE(!pfnp))
		return VM_FAULT_SIGBUS;
	*pfnp = pfn;
	return VM_FAULT_NEEDDSYNC;
}
  1420. static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf,
  1421. const struct iomap_iter *iter)
  1422. {
  1423. vm_fault_t ret;
  1424. int error = 0;
  1425. switch (iter->iomap.type) {
  1426. case IOMAP_HOLE:
  1427. case IOMAP_UNWRITTEN:
  1428. clear_user_highpage(vmf->cow_page, vmf->address);
  1429. break;
  1430. case IOMAP_MAPPED:
  1431. error = copy_cow_page_dax(vmf, iter);
  1432. break;
  1433. default:
  1434. WARN_ON_ONCE(1);
  1435. error = -EIO;
  1436. break;
  1437. }
  1438. if (error)
  1439. return dax_fault_return(error);
  1440. __SetPageUptodate(vmf->cow_page);
  1441. ret = finish_fault(vmf);
  1442. if (!ret)
  1443. return VM_FAULT_DONE_COW;
  1444. return ret;
  1445. }
/**
 * dax_fault_iter - Common actor to handle pfn insertion in PTE/PMD fault.
 * @vmf:	vm fault instance
 * @iter:	iomap iter
 * @pfnp:	pfn to be returned
 * @xas:	the dax mapping tree of a file
 * @entry:	an unlocked dax entry to be inserted
 * @pmd:	distinguish whether it is a pmd fault
 *
 * Returns the fault result (VM_FAULT_*); synchronous faults hand the pfn
 * back via @pfnp instead of inserting it (see dax_fault_synchronous_pfnp).
 */
static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
		const struct iomap_iter *iter, pfn_t *pfnp,
		struct xa_state *xas, void **entry, bool pmd)
{
	const struct iomap *iomap = &iter->iomap;
	const struct iomap *srcmap = iomap_iter_srcmap(iter);
	size_t size = pmd ? PMD_SIZE : PAGE_SIZE;
	loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT;
	bool write = iter->flags & IOMAP_WRITE;
	unsigned long entry_flags = pmd ? DAX_PMD : 0;
	int err = 0;
	pfn_t pfn;
	void *kaddr;

	if (!pmd && vmf->cow_page)
		return dax_fault_cow_page(vmf, iter);

	/* if we are reading UNWRITTEN and HOLE, return a hole. */
	if (!write &&
	    (iomap->type == IOMAP_UNWRITTEN || iomap->type == IOMAP_HOLE)) {
		if (!pmd)
			return dax_load_hole(xas, vmf, iter, entry);
		return dax_pmd_load_hole(xas, vmf, iter, entry);
	}

	if (iomap->type != IOMAP_MAPPED && !(iomap->flags & IOMAP_F_SHARED)) {
		WARN_ON_ONCE(1);
		return pmd ? VM_FAULT_FALLBACK : VM_FAULT_SIGBUS;
	}

	err = dax_iomap_direct_access(iomap, pos, size, &kaddr, &pfn);
	if (err)
		return pmd ? VM_FAULT_FALLBACK : dax_fault_return(err);

	*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, entry_flags);

	/* CoW write fault: populate the new extent from the source first */
	if (write && iomap->flags & IOMAP_F_SHARED) {
		err = dax_iomap_copy_around(pos, size, size, srcmap, kaddr);
		if (err)
			return dax_fault_return(err);
	}

	if (dax_fault_is_synchronous(iter, vmf->vma))
		return dax_fault_synchronous_pfnp(pfnp, pfn);

	/* insert PMD pfn */
	if (pmd)
		return vmf_insert_pfn_pmd(vmf, pfn, write);

	/* insert PTE pfn */
	if (write)
		return vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
	return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
}
/*
 * Handle a PTE-sized DAX fault: grab/lock the mapping entry, iterate the
 * iomap for the faulting page, and let dax_fault_iter() insert the pfn.
 * @pfnp receives the pfn for synchronous faults; @iomap_errp, if
 * non-NULL, receives the raw iomap error for the caller.
 */
static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
			       int *iomap_errp, const struct iomap_ops *ops)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	XA_STATE(xas, &mapping->i_pages, vmf->pgoff);
	struct iomap_iter iter = {
		.inode		= mapping->host,
		.pos		= (loff_t)vmf->pgoff << PAGE_SHIFT,
		.len		= PAGE_SIZE,
		.flags		= IOMAP_DAX | IOMAP_FAULT,
	};
	vm_fault_t ret = 0;
	void *entry;
	int error;

	trace_dax_pte_fault(iter.inode, vmf, ret);
	/*
	 * Check whether offset isn't beyond end of file now. Caller is supposed
	 * to hold locks serializing us with truncate / punch hole so this is
	 * a reliable test.
	 */
	if (iter.pos >= i_size_read(iter.inode)) {
		ret = VM_FAULT_SIGBUS;
		goto out;
	}

	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
		iter.flags |= IOMAP_WRITE;

	entry = grab_mapping_entry(&xas, mapping, 0);
	if (xa_is_internal(entry)) {
		ret = xa_to_internal(entry);
		goto out;
	}

	/*
	 * It is possible, particularly with mixed reads & writes to private
	 * mappings, that we have raced with a PMD fault that overlaps with
	 * the PTE we need to set up.  If so just return and the fault will be
	 * retried.
	 */
	if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) {
		ret = VM_FAULT_NOPAGE;
		goto unlock_entry;
	}

	while ((error = iomap_iter(&iter, ops)) > 0) {
		if (WARN_ON_ONCE(iomap_length(&iter) < PAGE_SIZE)) {
			iter.processed = -EIO;	/* fs corruption? */
			continue;
		}

		ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, false);
		/* A freshly allocated block counts as a major fault */
		if (ret != VM_FAULT_SIGBUS &&
		    (iter.iomap.flags & IOMAP_F_NEW)) {
			count_vm_event(PGMAJFAULT);
			count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
			ret |= VM_FAULT_MAJOR;
		}

		if (!(ret & VM_FAULT_ERROR))
			iter.processed = PAGE_SIZE;
	}

	if (iomap_errp)
		*iomap_errp = error;
	if (!ret && error)
		ret = dax_fault_return(error);

unlock_entry:
	dax_unlock_entry(&xas, entry);
out:
	trace_dax_pte_fault_done(iter.inode, vmf, ret);
	return ret;
}
  1566. #ifdef CONFIG_FS_DAX_PMD
  1567. static bool dax_fault_check_fallback(struct vm_fault *vmf, struct xa_state *xas,
  1568. pgoff_t max_pgoff)
  1569. {
  1570. unsigned long pmd_addr = vmf->address & PMD_MASK;
  1571. bool write = vmf->flags & FAULT_FLAG_WRITE;
  1572. /*
  1573. * Make sure that the faulting address's PMD offset (color) matches
  1574. * the PMD offset from the start of the file. This is necessary so
  1575. * that a PMD range in the page table overlaps exactly with a PMD
  1576. * range in the page cache.
  1577. */
  1578. if ((vmf->pgoff & PG_PMD_COLOUR) !=
  1579. ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR))
  1580. return true;
  1581. /* Fall back to PTEs if we're going to COW */
  1582. if (write && !(vmf->vma->vm_flags & VM_SHARED))
  1583. return true;
  1584. /* If the PMD would extend outside the VMA */
  1585. if (pmd_addr < vmf->vma->vm_start)
  1586. return true;
  1587. if ((pmd_addr + PMD_SIZE) > vmf->vma->vm_end)
  1588. return true;
  1589. /* If the PMD would extend beyond the file size */
  1590. if ((xas->xa_index | PG_PMD_COLOUR) >= max_pgoff)
  1591. return true;
  1592. return false;
  1593. }
/*
 * dax_iomap_pmd_fault - handle a PMD-sized fault on a DAX file
 * @vmf:	The description of the fault
 * @pfnp:	Passed through to dax_fault_iter(); used by synchronous
 *		fault handling
 * @ops:	Iomap ops supplied by the filesystem
 *
 * Returns VM_FAULT_FALLBACK whenever a PMD mapping cannot be installed,
 * in which case the caller retries the fault at PTE granularity.
 */
static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
			       const struct iomap_ops *ops)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER);
	struct iomap_iter iter = {
		.inode		= mapping->host,
		.len		= PMD_SIZE,
		.flags		= IOMAP_DAX | IOMAP_FAULT,
	};
	vm_fault_t ret = VM_FAULT_FALLBACK;
	pgoff_t max_pgoff;
	void *entry;

	if (vmf->flags & FAULT_FLAG_WRITE)
		iter.flags |= IOMAP_WRITE;

	/*
	 * Check whether offset isn't beyond end of file now. Caller is
	 * supposed to hold locks serializing us with truncate / punch hole so
	 * this is a reliable test.
	 */
	max_pgoff = DIV_ROUND_UP(i_size_read(iter.inode), PAGE_SIZE);

	trace_dax_pmd_fault(iter.inode, vmf, max_pgoff, 0);

	if (xas.xa_index >= max_pgoff) {
		ret = VM_FAULT_SIGBUS;
		goto out;
	}

	if (dax_fault_check_fallback(vmf, &xas, max_pgoff))
		goto fallback;

	/*
	 * grab_mapping_entry() will make sure we get an empty PMD entry,
	 * a zero PMD entry or a DAX PMD. If it can't (because a PTE
	 * entry is already in the array, for instance), it will return
	 * VM_FAULT_FALLBACK.
	 */
	entry = grab_mapping_entry(&xas, mapping, PMD_ORDER);
	if (xa_is_internal(entry)) {
		ret = xa_to_internal(entry);
		goto fallback;
	}

	/*
	 * It is possible, particularly with mixed reads & writes to private
	 * mappings, that we have raced with a PTE fault that overlaps with
	 * the PMD we need to set up. If so just return and the fault will be
	 * retried.
	 */
	if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) &&
			!pmd_devmap(*vmf->pmd)) {
		ret = 0;
		goto unlock_entry;
	}

	/* xa_index is rounded down to PMD alignment by XA_STATE_ORDER. */
	iter.pos = (loff_t)xas.xa_index << PAGE_SHIFT;
	while (iomap_iter(&iter, ops) > 0) {
		if (iomap_length(&iter) < PMD_SIZE)
			continue; /* actually breaks out of the loop */

		ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true);
		if (ret != VM_FAULT_FALLBACK)
			iter.processed = PMD_SIZE;
	}

unlock_entry:
	dax_unlock_entry(&xas, entry);
fallback:
	if (ret == VM_FAULT_FALLBACK) {
		/* Pre-split the PMD so the retried PTE fault can proceed. */
		split_huge_pmd(vmf->vma, vmf->pmd, vmf->address);
		count_vm_event(THP_FAULT_FALLBACK);
	}
out:
	trace_dax_pmd_fault_done(iter.inode, vmf, max_pgoff, ret);
	return ret;
}
#else
/* Without CONFIG_FS_DAX_PMD every huge fault falls back to PTE handling. */
static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
			       const struct iomap_ops *ops)
{
	return VM_FAULT_FALLBACK;
}
#endif /* CONFIG_FS_DAX_PMD */
  1670. /**
  1671. * dax_iomap_fault - handle a page fault on a DAX file
  1672. * @vmf: The description of the fault
  1673. * @order: Order of the page to fault in
  1674. * @pfnp: PFN to insert for synchronous faults if fsync is required
  1675. * @iomap_errp: Storage for detailed error code in case of error
  1676. * @ops: Iomap ops passed from the file system
  1677. *
  1678. * When a page fault occurs, filesystems may call this helper in
  1679. * their fault handler for DAX files. dax_iomap_fault() assumes the caller
  1680. * has done all the necessary locking for page fault to proceed
  1681. * successfully.
  1682. */
  1683. vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order,
  1684. pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops)
  1685. {
  1686. if (order == 0)
  1687. return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops);
  1688. else if (order == PMD_ORDER)
  1689. return dax_iomap_pmd_fault(vmf, pfnp, ops);
  1690. else
  1691. return VM_FAULT_FALLBACK;
  1692. }
  1693. EXPORT_SYMBOL_GPL(dax_iomap_fault);
/*
 * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
 * @vmf: The description of the fault
 * @pfn: PFN to insert
 * @order: Order of entry to insert.
 *
 * This function inserts a writeable PTE or PMD entry into the page tables
 * for an mmaped DAX file. It also marks the page cache entry as dirty.
 */
static vm_fault_t
dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order);
	void *entry;
	vm_fault_t ret;

	xas_lock_irq(&xas);
	entry = get_unlocked_entry(&xas, order);
	/* Did we race with someone splitting entry or so? */
	if (!entry || dax_is_conflict(entry) ||
	    (order == 0 && !dax_is_pte_entry(entry))) {
		put_unlocked_entry(&xas, entry, WAKE_NEXT);
		xas_unlock_irq(&xas);
		trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
						      VM_FAULT_NOPAGE);
		return VM_FAULT_NOPAGE;
	}
	/* Dirty and lock the entry before dropping xa_lock. */
	xas_set_mark(&xas, PAGECACHE_TAG_DIRTY);
	dax_lock_entry(&xas, entry);
	xas_unlock_irq(&xas);
	if (order == 0)
		ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
#ifdef CONFIG_FS_DAX_PMD
	else if (order == PMD_ORDER)
		ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE);
#endif
	else
		/* Only orders 0 and PMD_ORDER are supported here. */
		ret = VM_FAULT_FALLBACK;
	dax_unlock_entry(&xas, entry);
	trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret);
	return ret;
}
  1736. /**
  1737. * dax_finish_sync_fault - finish synchronous page fault
  1738. * @vmf: The description of the fault
  1739. * @order: Order of entry to be inserted
  1740. * @pfn: PFN to insert
  1741. *
  1742. * This function ensures that the file range touched by the page fault is
  1743. * stored persistently on the media and handles inserting of appropriate page
  1744. * table entry.
  1745. */
  1746. vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order,
  1747. pfn_t pfn)
  1748. {
  1749. int err;
  1750. loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
  1751. size_t len = PAGE_SIZE << order;
  1752. err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
  1753. if (err)
  1754. return VM_FAULT_SIGBUS;
  1755. return dax_insert_pfn_mkwrite(vmf, pfn, order);
  1756. }
  1757. EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
/*
 * dax_range_compare_iter - compare one pair of mapped extents byte-by-byte
 * @it_src:	Iterator positioned on the source extent
 * @it_dest:	Iterator positioned on the destination extent
 * @len:	Maximum number of bytes to compare
 * @same:	Set to whether the compared ranges were identical
 *
 * Returns the number of bytes handled (0 once a difference is found, so the
 * caller's loop terminates), or -EIO if the DAX memory could not be accessed.
 */
static loff_t dax_range_compare_iter(struct iomap_iter *it_src,
		struct iomap_iter *it_dest, u64 len, bool *same)
{
	const struct iomap *smap = &it_src->iomap;
	const struct iomap *dmap = &it_dest->iomap;
	loff_t pos1 = it_src->pos, pos2 = it_dest->pos;
	void *saddr, *daddr;
	int id, ret;

	/* Never compare past the end of either mapping. */
	len = min(len, min(smap->length, dmap->length));

	/* Two holes compare equal over their whole length. */
	if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) {
		*same = true;
		return len;
	}

	/* A hole can never match mapped data. */
	if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) {
		*same = false;
		return 0;
	}

	id = dax_read_lock();
	ret = dax_iomap_direct_access(smap, pos1, ALIGN(pos1 + len, PAGE_SIZE),
				      &saddr, NULL);
	if (ret < 0)
		goto out_unlock;

	ret = dax_iomap_direct_access(dmap, pos2, ALIGN(pos2 + len, PAGE_SIZE),
				      &daddr, NULL);
	if (ret < 0)
		goto out_unlock;

	*same = !memcmp(saddr, daddr, len);
	if (!*same)
		len = 0;
	dax_read_unlock(id);
	return len;

out_unlock:
	/* Note: the specific error in 'ret' is collapsed into -EIO. */
	dax_read_unlock(id);
	return -EIO;
}
/*
 * dax_dedupe_file_range_compare - check whether two DAX file ranges are equal
 * @src:	Source inode
 * @srcoff:	Byte offset into @src
 * @dst:	Destination inode
 * @dstoff:	Byte offset into @dst
 * @len:	Number of bytes to compare
 * @same:	Set to whether the ranges matched
 * @ops:	Iomap ops supplied by the filesystem
 *
 * Walks both ranges in lockstep with two iomap iterators, comparing each
 * overlapping extent pair via dax_range_compare_iter().
 */
int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
		struct inode *dst, loff_t dstoff, loff_t len, bool *same,
		const struct iomap_ops *ops)
{
	struct iomap_iter src_iter = {
		.inode		= src,
		.pos		= srcoff,
		.len		= len,
		.flags		= IOMAP_DAX,
	};
	struct iomap_iter dst_iter = {
		.inode		= dst,
		.pos		= dstoff,
		.len		= len,
		.flags		= IOMAP_DAX,
	};
	int ret, compared = 0;

	while ((ret = iomap_iter(&src_iter, ops)) > 0 &&
	       (ret = iomap_iter(&dst_iter, ops)) > 0) {
		compared = dax_range_compare_iter(&src_iter, &dst_iter,
				min(src_iter.len, dst_iter.len), same);
		/*
		 * NOTE(review): on a compare error the negative 'compared'
		 * value is discarded and the positive iomap_iter() result is
		 * returned instead — confirm this is the intended contract
		 * (callers would see success rather than the -EIO).
		 */
		if (compared < 0)
			return ret;
		/* A 0 from the compare (mismatch) ends both iterations. */
		src_iter.processed = dst_iter.processed = compared;
	}
	return ret;
}
/*
 * dax_remap_file_range_prep - validate a reflink/dedupe request on DAX files
 *
 * Thin DAX wrapper: all checking and length clamping is delegated to
 * __generic_remap_file_range_prep() with the filesystem's iomap ops.
 */
int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
			      struct file *file_out, loff_t pos_out,
			      loff_t *len, unsigned int remap_flags,
			      const struct iomap_ops *ops)
{
	return __generic_remap_file_range_prep(file_in, pos_in, file_out,
					       pos_out, len, remap_flags, ops);
}
EXPORT_SYMBOL_GPL(dax_remap_file_range_prep);