dax.c

  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * dax: direct host memory access
  4. * Copyright (C) 2020 Red Hat, Inc.
  5. */
  6. #include "fuse_i.h"
  7. #include <linux/delay.h>
  8. #include <linux/dax.h>
  9. #include <linux/uio.h>
  10. #include <linux/pagemap.h>
  11. #include <linux/pfn_t.h>
  12. #include <linux/iomap.h>
  13. #include <linux/interval_tree.h>
  14. /*
  15. * Default memory range size. A power of 2 so it agrees with common FUSE_INIT
  16. * map_alignment values 4KB and 64KB.
  17. */
  18. #define FUSE_DAX_SHIFT 21
  19. #define FUSE_DAX_SZ (1 << FUSE_DAX_SHIFT)
  20. #define FUSE_DAX_PAGES (FUSE_DAX_SZ / PAGE_SIZE)
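/*
* With FUSE_DAX_SHIFT of 21 each range works out to 1 << 21 = 2 MiB, i.e.
* 512 pages with the common 4 KiB PAGE_SIZE.
*/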
  21. /* Number of ranges reclaimer will try to free in one invocation */
  22. #define FUSE_DAX_RECLAIM_CHUNK (10)
  23. /*
* DAX memory reclaim threshold as a percentage of total ranges. When the
* number of free ranges drops below this threshold, reclaim can trigger.
* Default is 20%.
  27. */
  28. #define FUSE_DAX_RECLAIM_THRESHOLD (20)
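/*
* Example: with 1000 total ranges, the threshold computed in
* __kick_dmap_free_worker() is max(1000 * 20 / 100, 1) = 200, so the
* reclaim worker is kicked once fewer than 200 ranges are free.
*/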
  29. /** Translation information for file offsets to DAX window offsets */
  30. struct fuse_dax_mapping {
  31. /* Pointer to inode where this memory range is mapped */
  32. struct inode *inode;
  33. /* Will connect in fcd->free_ranges to keep track of free memory */
  34. struct list_head list;
  35. /* For interval tree in file/inode */
  36. struct interval_tree_node itn;
/* Will connect in fcd->busy_ranges to keep track of busy memory */
  38. struct list_head busy_list;
  39. /** Position in DAX window */
  40. u64 window_offset;
  41. /** Length of mapping, in bytes */
  42. loff_t length;
  43. /* Is this mapping read-only or read-write */
  44. bool writable;
  45. /* reference count when the mapping is used by dax iomap. */
  46. refcount_t refcnt;
  47. };
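/*
* Each fuse_dax_mapping covers exactly one FUSE_DAX_SZ-aligned chunk of a
* file. The interval tree node is keyed by the chunk index (file offset
* right-shifted by FUSE_DAX_SHIFT), with itn.start == itn.last.
*/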
  48. /* Per-inode dax map */
  49. struct fuse_inode_dax {
  50. /* Semaphore to protect modifications to the dmap tree */
  51. struct rw_semaphore sem;
  52. /* Sorted rb tree of struct fuse_dax_mapping elements */
  53. struct rb_root_cached tree;
  54. unsigned long nr;
  55. };
  56. struct fuse_conn_dax {
  57. /* DAX device */
  58. struct dax_device *dev;
/* Lock protecting accesses to members of this structure */
  60. spinlock_t lock;
  61. /* List of memory ranges which are busy */
  62. unsigned long nr_busy_ranges;
  63. struct list_head busy_ranges;
  64. /* Worker to free up memory ranges */
  65. struct delayed_work free_work;
  66. /* Wait queue for a dax range to become free */
  67. wait_queue_head_t range_waitq;
  68. /* DAX Window Free Ranges */
  69. long nr_free_ranges;
  70. struct list_head free_ranges;
  71. unsigned long nr_ranges;
  72. };
  73. static inline struct fuse_dax_mapping *
  74. node_to_dmap(struct interval_tree_node *node)
  75. {
  76. if (!node)
  77. return NULL;
  78. return container_of(node, struct fuse_dax_mapping, itn);
  79. }
  80. static struct fuse_dax_mapping *
  81. alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode);
  82. static void
  83. __kick_dmap_free_worker(struct fuse_conn_dax *fcd, unsigned long delay_ms)
  84. {
  85. unsigned long free_threshold;
/* If the number of free ranges is below the threshold, start reclaim */
  87. free_threshold = max_t(unsigned long, fcd->nr_ranges * FUSE_DAX_RECLAIM_THRESHOLD / 100,
  88. 1);
  89. if (fcd->nr_free_ranges < free_threshold)
  90. queue_delayed_work(system_long_wq, &fcd->free_work,
  91. msecs_to_jiffies(delay_ms));
  92. }
  93. static void kick_dmap_free_worker(struct fuse_conn_dax *fcd,
  94. unsigned long delay_ms)
  95. {
  96. spin_lock(&fcd->lock);
  97. __kick_dmap_free_worker(fcd, delay_ms);
  98. spin_unlock(&fcd->lock);
  99. }
  100. static struct fuse_dax_mapping *alloc_dax_mapping(struct fuse_conn_dax *fcd)
  101. {
  102. struct fuse_dax_mapping *dmap;
  103. spin_lock(&fcd->lock);
  104. dmap = list_first_entry_or_null(&fcd->free_ranges,
  105. struct fuse_dax_mapping, list);
  106. if (dmap) {
  107. list_del_init(&dmap->list);
  108. WARN_ON(fcd->nr_free_ranges <= 0);
  109. fcd->nr_free_ranges--;
  110. }
  111. __kick_dmap_free_worker(fcd, 0);
  112. spin_unlock(&fcd->lock);
  113. return dmap;
  114. }
  115. /* This assumes fcd->lock is held */
  116. static void __dmap_remove_busy_list(struct fuse_conn_dax *fcd,
  117. struct fuse_dax_mapping *dmap)
  118. {
  119. list_del_init(&dmap->busy_list);
  120. WARN_ON(fcd->nr_busy_ranges == 0);
  121. fcd->nr_busy_ranges--;
  122. }
  123. static void dmap_remove_busy_list(struct fuse_conn_dax *fcd,
  124. struct fuse_dax_mapping *dmap)
  125. {
  126. spin_lock(&fcd->lock);
  127. __dmap_remove_busy_list(fcd, dmap);
  128. spin_unlock(&fcd->lock);
  129. }
  130. /* This assumes fcd->lock is held */
  131. static void __dmap_add_to_free_pool(struct fuse_conn_dax *fcd,
  132. struct fuse_dax_mapping *dmap)
  133. {
  134. list_add_tail(&dmap->list, &fcd->free_ranges);
  135. fcd->nr_free_ranges++;
  136. wake_up(&fcd->range_waitq);
  137. }
  138. static void dmap_add_to_free_pool(struct fuse_conn_dax *fcd,
  139. struct fuse_dax_mapping *dmap)
  140. {
  141. /* Return fuse_dax_mapping to free list */
  142. spin_lock(&fcd->lock);
  143. __dmap_add_to_free_pool(fcd, dmap);
  144. spin_unlock(&fcd->lock);
  145. }
  146. static int fuse_setup_one_mapping(struct inode *inode, unsigned long start_idx,
  147. struct fuse_dax_mapping *dmap, bool writable,
  148. bool upgrade)
  149. {
  150. struct fuse_mount *fm = get_fuse_mount(inode);
  151. struct fuse_conn_dax *fcd = fm->fc->dax;
  152. struct fuse_inode *fi = get_fuse_inode(inode);
  153. struct fuse_setupmapping_in inarg;
  154. loff_t offset = start_idx << FUSE_DAX_SHIFT;
  155. FUSE_ARGS(args);
  156. ssize_t err;
  157. WARN_ON(fcd->nr_free_ranges < 0);
/* Ask the fuse daemon to set up the mapping */
  159. memset(&inarg, 0, sizeof(inarg));
  160. inarg.foffset = offset;
  161. inarg.fh = -1;
  162. inarg.moffset = dmap->window_offset;
  163. inarg.len = FUSE_DAX_SZ;
  164. inarg.flags |= FUSE_SETUPMAPPING_FLAG_READ;
  165. if (writable)
  166. inarg.flags |= FUSE_SETUPMAPPING_FLAG_WRITE;
  167. args.opcode = FUSE_SETUPMAPPING;
  168. args.nodeid = fi->nodeid;
  169. args.in_numargs = 1;
  170. args.in_args[0].size = sizeof(inarg);
  171. args.in_args[0].value = &inarg;
  172. err = fuse_simple_request(fm, &args);
  173. if (err < 0)
  174. return err;
  175. dmap->writable = writable;
  176. if (!upgrade) {
  177. /*
  178. * We don't take a reference on inode. inode is valid right now
  179. * and when inode is going away, cleanup logic should first
  180. * cleanup dmap entries.
  181. */
  182. dmap->inode = inode;
  183. dmap->itn.start = dmap->itn.last = start_idx;
  184. /* Protected by fi->dax->sem */
  185. interval_tree_insert(&dmap->itn, &fi->dax->tree);
  186. fi->dax->nr++;
  187. spin_lock(&fcd->lock);
  188. list_add_tail(&dmap->busy_list, &fcd->busy_ranges);
  189. fcd->nr_busy_ranges++;
  190. spin_unlock(&fcd->lock);
  191. }
  192. return 0;
  193. }
  194. static int fuse_send_removemapping(struct inode *inode,
  195. struct fuse_removemapping_in *inargp,
  196. struct fuse_removemapping_one *remove_one)
  197. {
  198. struct fuse_inode *fi = get_fuse_inode(inode);
  199. struct fuse_mount *fm = get_fuse_mount(inode);
  200. FUSE_ARGS(args);
  201. args.opcode = FUSE_REMOVEMAPPING;
  202. args.nodeid = fi->nodeid;
  203. args.in_numargs = 2;
  204. args.in_args[0].size = sizeof(*inargp);
  205. args.in_args[0].value = inargp;
  206. args.in_args[1].size = inargp->count * sizeof(*remove_one);
  207. args.in_args[1].value = remove_one;
  208. return fuse_simple_request(fm, &args);
  209. }
  210. static int dmap_removemapping_list(struct inode *inode, unsigned int num,
  211. struct list_head *to_remove)
  212. {
  213. struct fuse_removemapping_one *remove_one, *ptr;
  214. struct fuse_removemapping_in inarg;
  215. struct fuse_dax_mapping *dmap;
  216. int ret, i = 0, nr_alloc;
  217. nr_alloc = min_t(unsigned int, num, FUSE_REMOVEMAPPING_MAX_ENTRY);
  218. remove_one = kmalloc_array(nr_alloc, sizeof(*remove_one), GFP_NOFS);
  219. if (!remove_one)
  220. return -ENOMEM;
  221. ptr = remove_one;
  222. list_for_each_entry(dmap, to_remove, list) {
  223. ptr->moffset = dmap->window_offset;
  224. ptr->len = dmap->length;
  225. ptr++;
  226. i++;
  227. num--;
  228. if (i >= nr_alloc || num == 0) {
  229. memset(&inarg, 0, sizeof(inarg));
  230. inarg.count = i;
  231. ret = fuse_send_removemapping(inode, &inarg,
  232. remove_one);
  233. if (ret)
  234. goto out;
  235. ptr = remove_one;
  236. i = 0;
  237. }
  238. }
  239. out:
  240. kfree(remove_one);
  241. return ret;
  242. }
  243. /*
  244. * Cleanup dmap entry and add back to free list. This should be called with
  245. * fcd->lock held.
  246. */
  247. static void dmap_reinit_add_to_free_pool(struct fuse_conn_dax *fcd,
  248. struct fuse_dax_mapping *dmap)
  249. {
  250. pr_debug("fuse: freeing memory range start_idx=0x%lx end_idx=0x%lx window_offset=0x%llx length=0x%llx\n",
  251. dmap->itn.start, dmap->itn.last, dmap->window_offset,
  252. dmap->length);
  253. __dmap_remove_busy_list(fcd, dmap);
  254. dmap->inode = NULL;
  255. dmap->itn.start = dmap->itn.last = 0;
  256. __dmap_add_to_free_pool(fcd, dmap);
  257. }
  258. /*
  259. * Free inode dmap entries whose range falls inside [start, end].
* Does not take any locks. At this point it should only be called from
* the evict_inode() path, where we know all dmap entries can be
* reclaimed.
  263. */
  264. static void inode_reclaim_dmap_range(struct fuse_conn_dax *fcd,
  265. struct inode *inode,
  266. loff_t start, loff_t end)
  267. {
  268. struct fuse_inode *fi = get_fuse_inode(inode);
  269. struct fuse_dax_mapping *dmap, *n;
  270. int err, num = 0;
  271. LIST_HEAD(to_remove);
  272. unsigned long start_idx = start >> FUSE_DAX_SHIFT;
  273. unsigned long end_idx = end >> FUSE_DAX_SHIFT;
  274. struct interval_tree_node *node;
  275. while (1) {
  276. node = interval_tree_iter_first(&fi->dax->tree, start_idx,
  277. end_idx);
  278. if (!node)
  279. break;
  280. dmap = node_to_dmap(node);
  281. /* inode is going away. There should not be any users of dmap */
  282. WARN_ON(refcount_read(&dmap->refcnt) > 1);
  283. interval_tree_remove(&dmap->itn, &fi->dax->tree);
  284. num++;
  285. list_add(&dmap->list, &to_remove);
  286. }
  287. /* Nothing to remove */
  288. if (list_empty(&to_remove))
  289. return;
  290. WARN_ON(fi->dax->nr < num);
  291. fi->dax->nr -= num;
  292. err = dmap_removemapping_list(inode, num, &to_remove);
  293. if (err && err != -ENOTCONN) {
  294. pr_warn("Failed to removemappings. start=0x%llx end=0x%llx\n",
  295. start, end);
  296. }
  297. spin_lock(&fcd->lock);
  298. list_for_each_entry_safe(dmap, n, &to_remove, list) {
  299. list_del_init(&dmap->list);
  300. dmap_reinit_add_to_free_pool(fcd, dmap);
  301. }
  302. spin_unlock(&fcd->lock);
  303. }
  304. static int dmap_removemapping_one(struct inode *inode,
  305. struct fuse_dax_mapping *dmap)
  306. {
  307. struct fuse_removemapping_one forget_one;
  308. struct fuse_removemapping_in inarg;
  309. memset(&inarg, 0, sizeof(inarg));
  310. inarg.count = 1;
  311. memset(&forget_one, 0, sizeof(forget_one));
  312. forget_one.moffset = dmap->window_offset;
  313. forget_one.len = dmap->length;
  314. return fuse_send_removemapping(inode, &inarg, &forget_one);
  315. }
  316. /*
* It is called from evict_inode() and by that time the inode is going away.
* So this function does not take any locks, like fi->dax->sem, while
* traversing the fuse inode interval tree. If that lock were taken, the
* lock validator would complain about a possible deadlock with the
* fs_reclaim lock.
  321. */
  322. void fuse_dax_inode_cleanup(struct inode *inode)
  323. {
  324. struct fuse_conn *fc = get_fuse_conn(inode);
  325. struct fuse_inode *fi = get_fuse_inode(inode);
  326. /*
  327. * fuse_evict_inode() has already called truncate_inode_pages_final()
  328. * before we arrive here. So we should not have to worry about any
  329. * pages/exception entries still associated with inode.
  330. */
  331. inode_reclaim_dmap_range(fc->dax, inode, 0, -1);
  332. WARN_ON(fi->dax->nr);
  333. }
  334. static void fuse_fill_iomap_hole(struct iomap *iomap, loff_t length)
  335. {
  336. iomap->addr = IOMAP_NULL_ADDR;
  337. iomap->length = length;
  338. iomap->type = IOMAP_HOLE;
  339. }
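/*
* Fill @iomap for the part of [pos, pos + length) backed by @dmap: translate
* pos into an offset within the DAX window, clamp the length to the end of
* the mapping and to i_size, and take a reference on the dmap (dropped in
* fuse_iomap_end()) so that reclaim leaves it alone. Anything past EOF is
* reported as a hole.
*/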
  340. static void fuse_fill_iomap(struct inode *inode, loff_t pos, loff_t length,
  341. struct iomap *iomap, struct fuse_dax_mapping *dmap,
  342. unsigned int flags)
  343. {
  344. loff_t offset, len;
  345. loff_t i_size = i_size_read(inode);
  346. offset = pos - (dmap->itn.start << FUSE_DAX_SHIFT);
  347. len = min(length, dmap->length - offset);
  348. /* If length is beyond end of file, truncate further */
  349. if (pos + len > i_size)
  350. len = i_size - pos;
  351. if (len > 0) {
  352. iomap->addr = dmap->window_offset + offset;
  353. iomap->length = len;
  354. if (flags & IOMAP_FAULT)
  355. iomap->length = ALIGN(len, PAGE_SIZE);
  356. iomap->type = IOMAP_MAPPED;
  357. /*
* increase refcnt so that reclaim code knows this dmap is in
* use. This assumes fi->dax->sem is held either
* shared or exclusive.
  361. */
  362. refcount_inc(&dmap->refcnt);
  363. /* iomap->private should be NULL */
  364. WARN_ON_ONCE(iomap->private);
  365. iomap->private = dmap;
  366. } else {
  367. /* Mapping beyond end of file is hole */
  368. fuse_fill_iomap_hole(iomap, length);
  369. }
  370. }
  371. static int fuse_setup_new_dax_mapping(struct inode *inode, loff_t pos,
  372. loff_t length, unsigned int flags,
  373. struct iomap *iomap)
  374. {
  375. struct fuse_inode *fi = get_fuse_inode(inode);
  376. struct fuse_conn *fc = get_fuse_conn(inode);
  377. struct fuse_conn_dax *fcd = fc->dax;
  378. struct fuse_dax_mapping *dmap, *alloc_dmap = NULL;
  379. int ret;
  380. bool writable = flags & IOMAP_WRITE;
  381. unsigned long start_idx = pos >> FUSE_DAX_SHIFT;
  382. struct interval_tree_node *node;
  383. /*
  384. * Can't do inline reclaim in fault path. We call
  385. * dax_layout_busy_page() before we free a range. And
  386. * fuse_wait_dax_page() drops mapping->invalidate_lock and requires it.
  387. * In fault path we enter with mapping->invalidate_lock held and can't
  388. * drop it. Also in fault path we hold mapping->invalidate_lock shared
  389. * and not exclusive, so that creates further issues with
  390. * fuse_wait_dax_page(). Hence return -EAGAIN and fuse_dax_fault()
  391. * will wait for a memory range to become free and retry.
  392. */
  393. if (flags & IOMAP_FAULT) {
  394. alloc_dmap = alloc_dax_mapping(fcd);
  395. if (!alloc_dmap)
  396. return -EAGAIN;
  397. } else {
  398. alloc_dmap = alloc_dax_mapping_reclaim(fcd, inode);
  399. if (IS_ERR(alloc_dmap))
  400. return PTR_ERR(alloc_dmap);
  401. }
  402. /* If we are here, we should have memory allocated */
  403. if (WARN_ON(!alloc_dmap))
  404. return -EIO;
  405. /*
* Take the write lock so that only one caller can try to set up the
* mapping and the others wait.
  408. */
  409. down_write(&fi->dax->sem);
  410. /*
* We dropped the lock. Check again whether somebody else has already
* set up the mapping.
  413. */
  414. node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx);
  415. if (node) {
  416. dmap = node_to_dmap(node);
  417. fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
  418. dmap_add_to_free_pool(fcd, alloc_dmap);
  419. up_write(&fi->dax->sem);
  420. return 0;
  421. }
  422. /* Setup one mapping */
  423. ret = fuse_setup_one_mapping(inode, pos >> FUSE_DAX_SHIFT, alloc_dmap,
  424. writable, false);
  425. if (ret < 0) {
  426. dmap_add_to_free_pool(fcd, alloc_dmap);
  427. up_write(&fi->dax->sem);
  428. return ret;
  429. }
  430. fuse_fill_iomap(inode, pos, length, iomap, alloc_dmap, flags);
  431. up_write(&fi->dax->sem);
  432. return 0;
  433. }
  434. static int fuse_upgrade_dax_mapping(struct inode *inode, loff_t pos,
  435. loff_t length, unsigned int flags,
  436. struct iomap *iomap)
  437. {
  438. struct fuse_inode *fi = get_fuse_inode(inode);
  439. struct fuse_dax_mapping *dmap;
  440. int ret;
  441. unsigned long idx = pos >> FUSE_DAX_SHIFT;
  442. struct interval_tree_node *node;
  443. /*
* Take the exclusive lock so that only one caller can try to set up the
* mapping and the others wait.
  446. */
  447. down_write(&fi->dax->sem);
  448. node = interval_tree_iter_first(&fi->dax->tree, idx, idx);
  449. /* We are holding either inode lock or invalidate_lock, and that should
  450. * ensure that dmap can't be truncated. We are holding a reference
  451. * on dmap and that should make sure it can't be reclaimed. So dmap
  452. * should still be there in tree despite the fact we dropped and
  453. * re-acquired the fi->dax->sem lock.
  454. */
  455. ret = -EIO;
  456. if (WARN_ON(!node))
  457. goto out_err;
  458. dmap = node_to_dmap(node);
/* We took an extra reference on dmap to make sure it's not reclaimed.
  460. * Now we hold fi->dax->sem lock and that reference is not needed
  461. * anymore. Drop it.
  462. */
  463. if (refcount_dec_and_test(&dmap->refcnt)) {
  464. /* refcount should not hit 0. This object only goes
  465. * away when fuse connection goes away
  466. */
  467. WARN_ON_ONCE(1);
  468. }
  469. /* Maybe another thread already upgraded mapping while we were not
  470. * holding lock.
  471. */
  472. if (dmap->writable) {
  473. ret = 0;
  474. goto out_fill_iomap;
  475. }
  476. ret = fuse_setup_one_mapping(inode, pos >> FUSE_DAX_SHIFT, dmap, true,
  477. true);
  478. if (ret < 0)
  479. goto out_err;
  480. out_fill_iomap:
  481. fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
  482. out_err:
  483. up_write(&fi->dax->sem);
  484. return ret;
  485. }
  486. /* This is just for DAX and the mapping is ephemeral, do not use it for other
  487. * purposes since there is no block device with a permanent mapping.
  488. */
  489. static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
  490. unsigned int flags, struct iomap *iomap,
  491. struct iomap *srcmap)
  492. {
  493. struct fuse_inode *fi = get_fuse_inode(inode);
  494. struct fuse_conn *fc = get_fuse_conn(inode);
  495. struct fuse_dax_mapping *dmap;
  496. bool writable = flags & IOMAP_WRITE;
  497. unsigned long start_idx = pos >> FUSE_DAX_SHIFT;
  498. struct interval_tree_node *node;
  499. /* We don't support FIEMAP */
  500. if (WARN_ON(flags & IOMAP_REPORT))
  501. return -EIO;
  502. iomap->offset = pos;
  503. iomap->flags = 0;
  504. iomap->bdev = NULL;
  505. iomap->dax_dev = fc->dax->dev;
  506. /*
* Both the read/write and mmap paths can race here. So we need something
* to make sure that if one path is setting up a mapping, the other waits.
  509. *
  510. * For now, use a semaphore for this. It probably needs to be
  511. * optimized later.
  512. */
  513. down_read(&fi->dax->sem);
  514. node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx);
  515. if (node) {
  516. dmap = node_to_dmap(node);
  517. if (writable && !dmap->writable) {
  518. /* Upgrade read-only mapping to read-write. This will
  519. * require exclusive fi->dax->sem lock as we don't want
* two threads trying to do this simultaneously for the
* same dmap. So drop the shared lock and acquire the
  522. * exclusive lock.
  523. *
  524. * Before dropping fi->dax->sem lock, take reference
* on dmap so that it's not freed by range reclaim.
  526. */
  527. refcount_inc(&dmap->refcnt);
  528. up_read(&fi->dax->sem);
  529. pr_debug("%s: Upgrading mapping at offset 0x%llx length 0x%llx\n",
  530. __func__, pos, length);
  531. return fuse_upgrade_dax_mapping(inode, pos, length,
  532. flags, iomap);
  533. } else {
  534. fuse_fill_iomap(inode, pos, length, iomap, dmap, flags);
  535. up_read(&fi->dax->sem);
  536. return 0;
  537. }
  538. } else {
  539. up_read(&fi->dax->sem);
  540. pr_debug("%s: no mapping at offset 0x%llx length 0x%llx\n",
  541. __func__, pos, length);
  542. if (pos >= i_size_read(inode))
  543. goto iomap_hole;
  544. return fuse_setup_new_dax_mapping(inode, pos, length, flags,
  545. iomap);
  546. }
  547. /*
* If a read beyond end of file happens, fs code seems to return
* it as a hole
  550. */
  551. iomap_hole:
  552. fuse_fill_iomap_hole(iomap, length);
  553. pr_debug("%s returning hole mapping. pos=0x%llx length_asked=0x%llx length_returned=0x%llx\n",
  554. __func__, pos, length, iomap->length);
  555. return 0;
  556. }
  557. static int fuse_iomap_end(struct inode *inode, loff_t pos, loff_t length,
  558. ssize_t written, unsigned int flags,
  559. struct iomap *iomap)
  560. {
  561. struct fuse_dax_mapping *dmap = iomap->private;
  562. if (dmap) {
  563. if (refcount_dec_and_test(&dmap->refcnt)) {
  564. /* refcount should not hit 0. This object only goes
  565. * away when fuse connection goes away
  566. */
  567. WARN_ON_ONCE(1);
  568. }
  569. }
  570. /* DAX writes beyond end-of-file aren't handled using iomap, so the
  571. * file size is unchanged and there is nothing to do here.
  572. */
  573. return 0;
  574. }
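/*
* These ops are consumed by the generic DAX iomap helpers: dax_iomap_rw()
* in the read/write paths below and dax_iomap_fault() in the fault path.
*/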
  575. static const struct iomap_ops fuse_iomap_ops = {
  576. .iomap_begin = fuse_iomap_begin,
  577. .iomap_end = fuse_iomap_end,
  578. };
  579. static void fuse_wait_dax_page(struct inode *inode)
  580. {
  581. filemap_invalidate_unlock(inode->i_mapping);
  582. schedule();
  583. filemap_invalidate_lock(inode->i_mapping);
  584. }
  585. /* Should be called with mapping->invalidate_lock held exclusively */
  586. static int __fuse_dax_break_layouts(struct inode *inode, bool *retry,
  587. loff_t start, loff_t end)
  588. {
  589. struct page *page;
  590. page = dax_layout_busy_page_range(inode->i_mapping, start, end);
  591. if (!page)
  592. return 0;
  593. *retry = true;
  594. return ___wait_var_event(&page->_refcount,
  595. atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
  596. 0, 0, fuse_wait_dax_page(inode));
  597. }
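/*
* Wait until no page in [dmap_start, dmap_end] has extra references (e.g.
* from get_user_pages()/DMA): each pass asks dax_layout_busy_page_range()
* for a busy page and, if one is found, sleeps until its refcount drops
* back to one, then retries.
*/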
  598. int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start,
  599. u64 dmap_end)
  600. {
  601. bool retry;
  602. int ret;
  603. do {
  604. retry = false;
  605. ret = __fuse_dax_break_layouts(inode, &retry, dmap_start,
  606. dmap_end);
  607. } while (ret == 0 && retry);
  608. return ret;
  609. }
  610. ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
  611. {
  612. struct inode *inode = file_inode(iocb->ki_filp);
  613. ssize_t ret;
  614. if (iocb->ki_flags & IOCB_NOWAIT) {
  615. if (!inode_trylock_shared(inode))
  616. return -EAGAIN;
  617. } else {
  618. inode_lock_shared(inode);
  619. }
  620. ret = dax_iomap_rw(iocb, to, &fuse_iomap_ops);
  621. inode_unlock_shared(inode);
/* TODO file_accessed(iocb->ki_filp) */
  623. return ret;
  624. }
  625. static bool file_extending_write(struct kiocb *iocb, struct iov_iter *from)
  626. {
  627. struct inode *inode = file_inode(iocb->ki_filp);
  628. return (iov_iter_rw(from) == WRITE &&
  629. ((iocb->ki_pos) >= i_size_read(inode) ||
  630. (iocb->ki_pos + iov_iter_count(from) > i_size_read(inode))));
  631. }
  632. static ssize_t fuse_dax_direct_write(struct kiocb *iocb, struct iov_iter *from)
  633. {
  634. struct inode *inode = file_inode(iocb->ki_filp);
  635. struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
  636. ssize_t ret;
  637. ret = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE);
  638. fuse_write_update_attr(inode, iocb->ki_pos, ret);
  639. return ret;
  640. }
  641. ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
  642. {
  643. struct inode *inode = file_inode(iocb->ki_filp);
  644. ssize_t ret;
  645. if (iocb->ki_flags & IOCB_NOWAIT) {
  646. if (!inode_trylock(inode))
  647. return -EAGAIN;
  648. } else {
  649. inode_lock(inode);
  650. }
  651. ret = generic_write_checks(iocb, from);
  652. if (ret <= 0)
  653. goto out;
  654. ret = file_remove_privs(iocb->ki_filp);
  655. if (ret)
  656. goto out;
  657. /* TODO file_update_time() but we don't want metadata I/O */
/* Do not use DAX for file-extending writes, as the write and the
* on-disk i_size increase would not be atomic otherwise.
  660. */
  661. if (file_extending_write(iocb, from))
  662. ret = fuse_dax_direct_write(iocb, from);
  663. else
  664. ret = dax_iomap_rw(iocb, from, &fuse_iomap_ops);
  665. out:
  666. inode_unlock(inode);
  667. if (ret > 0)
  668. ret = generic_write_sync(iocb, ret);
  669. return ret;
  670. }
  671. static int fuse_dax_writepages(struct address_space *mapping,
  672. struct writeback_control *wbc)
  673. {
  674. struct inode *inode = mapping->host;
  675. struct fuse_conn *fc = get_fuse_conn(inode);
  676. return dax_writeback_mapping_range(mapping, fc->dax->dev, wbc);
  677. }
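/*
* Common fault handler. fuse_iomap_begin() returns -EAGAIN when no free DAX
* range is available and inline reclaim is not possible in the fault path
* (see fuse_setup_new_dax_mapping()). In that case drop
* mapping->invalidate_lock, wait for a range to be freed and retry the
* fault.
*/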
  678. static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf, unsigned int order,
  679. bool write)
  680. {
  681. vm_fault_t ret;
  682. struct inode *inode = file_inode(vmf->vma->vm_file);
  683. struct super_block *sb = inode->i_sb;
  684. pfn_t pfn;
  685. int error = 0;
  686. struct fuse_conn *fc = get_fuse_conn(inode);
  687. struct fuse_conn_dax *fcd = fc->dax;
  688. bool retry = false;
  689. if (write)
  690. sb_start_pagefault(sb);
  691. retry:
  692. if (retry && !(fcd->nr_free_ranges > 0))
  693. wait_event(fcd->range_waitq, (fcd->nr_free_ranges > 0));
  694. /*
  695. * We need to serialize against not only truncate but also against
  696. * fuse dax memory range reclaim. While a range is being reclaimed,
  697. * we do not want any read/write/mmap to make progress and try
  698. * to populate page cache or access memory we are trying to free.
  699. */
  700. filemap_invalidate_lock_shared(inode->i_mapping);
  701. ret = dax_iomap_fault(vmf, order, &pfn, &error, &fuse_iomap_ops);
  702. if ((ret & VM_FAULT_ERROR) && error == -EAGAIN) {
  703. error = 0;
  704. retry = true;
  705. filemap_invalidate_unlock_shared(inode->i_mapping);
  706. goto retry;
  707. }
  708. if (ret & VM_FAULT_NEEDDSYNC)
  709. ret = dax_finish_sync_fault(vmf, order, pfn);
  710. filemap_invalidate_unlock_shared(inode->i_mapping);
  711. if (write)
  712. sb_end_pagefault(sb);
  713. return ret;
  714. }
  715. static vm_fault_t fuse_dax_fault(struct vm_fault *vmf)
  716. {
  717. return __fuse_dax_fault(vmf, 0, vmf->flags & FAULT_FLAG_WRITE);
  718. }
  719. static vm_fault_t fuse_dax_huge_fault(struct vm_fault *vmf, unsigned int order)
  720. {
  721. return __fuse_dax_fault(vmf, order, vmf->flags & FAULT_FLAG_WRITE);
  722. }
  723. static vm_fault_t fuse_dax_page_mkwrite(struct vm_fault *vmf)
  724. {
  725. return __fuse_dax_fault(vmf, 0, true);
  726. }
  727. static vm_fault_t fuse_dax_pfn_mkwrite(struct vm_fault *vmf)
  728. {
  729. return __fuse_dax_fault(vmf, 0, true);
  730. }
  731. static const struct vm_operations_struct fuse_dax_vm_ops = {
  732. .fault = fuse_dax_fault,
  733. .huge_fault = fuse_dax_huge_fault,
  734. .page_mkwrite = fuse_dax_page_mkwrite,
  735. .pfn_mkwrite = fuse_dax_pfn_mkwrite,
  736. };
  737. int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma)
  738. {
  739. file_accessed(file);
  740. vma->vm_ops = &fuse_dax_vm_ops;
  741. vm_flags_set(vma, VM_MIXEDMAP | VM_HUGEPAGE);
  742. return 0;
  743. }
  744. static int dmap_writeback_invalidate(struct inode *inode,
  745. struct fuse_dax_mapping *dmap)
  746. {
  747. int ret;
  748. loff_t start_pos = dmap->itn.start << FUSE_DAX_SHIFT;
  749. loff_t end_pos = (start_pos + FUSE_DAX_SZ - 1);
  750. ret = filemap_fdatawrite_range(inode->i_mapping, start_pos, end_pos);
  751. if (ret) {
  752. pr_debug("fuse: filemap_fdatawrite_range() failed. err=%d start_pos=0x%llx, end_pos=0x%llx\n",
  753. ret, start_pos, end_pos);
  754. return ret;
  755. }
  756. ret = invalidate_inode_pages2_range(inode->i_mapping,
  757. start_pos >> PAGE_SHIFT,
  758. end_pos >> PAGE_SHIFT);
  759. if (ret)
  760. pr_debug("fuse: invalidate_inode_pages2_range() failed err=%d\n",
  761. ret);
  762. return ret;
  763. }
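/*
* Write back and invalidate the page cache for the range covered by @dmap,
* drop it from the inode's interval tree and send FUSE_REMOVEMAPPING to the
* server. Called with fi->dax->sem held exclusively.
*/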
  764. static int reclaim_one_dmap_locked(struct inode *inode,
  765. struct fuse_dax_mapping *dmap)
  766. {
  767. int ret;
  768. struct fuse_inode *fi = get_fuse_inode(inode);
  769. /*
* igrab() was done to make sure the inode won't go away under us, and this
  771. * further avoids the race with evict().
  772. */
  773. ret = dmap_writeback_invalidate(inode, dmap);
  774. if (ret)
  775. return ret;
  776. /* Remove dax mapping from inode interval tree now */
  777. interval_tree_remove(&dmap->itn, &fi->dax->tree);
  778. fi->dax->nr--;
  779. /* It is possible that umount/shutdown has killed the fuse connection
  780. * and worker thread is trying to reclaim memory in parallel. Don't
  781. * warn in that case.
  782. */
  783. ret = dmap_removemapping_one(inode, dmap);
  784. if (ret && ret != -ENOTCONN) {
  785. pr_warn("Failed to remove mapping. offset=0x%llx len=0x%llx ret=%d\n",
  786. dmap->window_offset, dmap->length, ret);
  787. }
  788. return 0;
  789. }
/* Find the first mapped dmap for an inode that is not in use and return it.
* Caller needs to hold the fi->dax->sem lock, either shared or exclusive.
*/
  793. static struct fuse_dax_mapping *inode_lookup_first_dmap(struct inode *inode)
  794. {
  795. struct fuse_inode *fi = get_fuse_inode(inode);
  796. struct fuse_dax_mapping *dmap;
  797. struct interval_tree_node *node;
  798. for (node = interval_tree_iter_first(&fi->dax->tree, 0, -1); node;
  799. node = interval_tree_iter_next(node, 0, -1)) {
  800. dmap = node_to_dmap(node);
  801. /* still in use. */
  802. if (refcount_read(&dmap->refcnt) > 1)
  803. continue;
  804. return dmap;
  805. }
  806. return NULL;
  807. }
  808. /*
* Find the first mapping in the tree, reclaim it and return it. Do not add
* it back to the free pool.
  811. */
  812. static struct fuse_dax_mapping *
  813. inode_inline_reclaim_one_dmap(struct fuse_conn_dax *fcd, struct inode *inode,
  814. bool *retry)
  815. {
  816. struct fuse_inode *fi = get_fuse_inode(inode);
  817. struct fuse_dax_mapping *dmap;
  818. u64 dmap_start, dmap_end;
  819. unsigned long start_idx;
  820. int ret;
  821. struct interval_tree_node *node;
  822. filemap_invalidate_lock(inode->i_mapping);
  823. /* Lookup a dmap and corresponding file offset to reclaim. */
  824. down_read(&fi->dax->sem);
  825. dmap = inode_lookup_first_dmap(inode);
  826. if (dmap) {
  827. start_idx = dmap->itn.start;
  828. dmap_start = start_idx << FUSE_DAX_SHIFT;
  829. dmap_end = dmap_start + FUSE_DAX_SZ - 1;
  830. }
  831. up_read(&fi->dax->sem);
  832. if (!dmap)
  833. goto out_mmap_sem;
  834. /*
  835. * Make sure there are no references to inode pages using
  836. * get_user_pages()
  837. */
  838. ret = fuse_dax_break_layouts(inode, dmap_start, dmap_end);
  839. if (ret) {
  840. pr_debug("fuse: fuse_dax_break_layouts() failed. err=%d\n",
  841. ret);
  842. dmap = ERR_PTR(ret);
  843. goto out_mmap_sem;
  844. }
  845. down_write(&fi->dax->sem);
  846. node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx);
  847. /* Range already got reclaimed by somebody else */
  848. if (!node) {
  849. if (retry)
  850. *retry = true;
  851. goto out_write_dmap_sem;
  852. }
  853. dmap = node_to_dmap(node);
  854. /* still in use. */
  855. if (refcount_read(&dmap->refcnt) > 1) {
  856. dmap = NULL;
  857. if (retry)
  858. *retry = true;
  859. goto out_write_dmap_sem;
  860. }
  861. ret = reclaim_one_dmap_locked(inode, dmap);
  862. if (ret < 0) {
  863. dmap = ERR_PTR(ret);
  864. goto out_write_dmap_sem;
  865. }
  866. /* Clean up dmap. Do not add back to free list */
  867. dmap_remove_busy_list(fcd, dmap);
  868. dmap->inode = NULL;
  869. dmap->itn.start = dmap->itn.last = 0;
  870. pr_debug("fuse: %s: inline reclaimed memory range. inode=%p, window_offset=0x%llx, length=0x%llx\n",
  871. __func__, inode, dmap->window_offset, dmap->length);
  872. out_write_dmap_sem:
  873. up_write(&fi->dax->sem);
  874. out_mmap_sem:
  875. filemap_invalidate_unlock(inode->i_mapping);
  876. return dmap;
  877. }
  878. static struct fuse_dax_mapping *
  879. alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode)
  880. {
  881. struct fuse_dax_mapping *dmap;
  882. struct fuse_inode *fi = get_fuse_inode(inode);
  883. while (1) {
  884. bool retry = false;
  885. dmap = alloc_dax_mapping(fcd);
  886. if (dmap)
  887. return dmap;
  888. dmap = inode_inline_reclaim_one_dmap(fcd, inode, &retry);
  889. /*
* Either we got a mapping or it is an error; return in both cases.
  892. */
  893. if (dmap)
  894. return dmap;
/* If we could not reclaim a mapping because it
* had a reference or some other temporary failure,
* try again. We want to give up on inline reclaim only
* if there is no range assigned to this inode. Otherwise
* a deadlock is possible if we sleep with
* mapping->invalidate_lock held while the worker freeing memory
* can't make progress because it also needs
* mapping->invalidate_lock. So sleep only if fi->dax->nr == 0.
*/
  904. if (retry)
  905. continue;
  906. /*
  907. * There are no mappings which can be reclaimed. Wait for one.
  908. * We are not holding fi->dax->sem. So it is possible
  909. * that range gets added now. But as we are not holding
  910. * mapping->invalidate_lock, worker should still be able to
  911. * free up a range and wake us up.
  912. */
  913. if (!fi->dax->nr && !(fcd->nr_free_ranges > 0)) {
  914. if (wait_event_killable_exclusive(fcd->range_waitq,
  915. (fcd->nr_free_ranges > 0))) {
  916. return ERR_PTR(-EINTR);
  917. }
  918. }
  919. }
  920. }
  921. static int lookup_and_reclaim_dmap_locked(struct fuse_conn_dax *fcd,
  922. struct inode *inode,
  923. unsigned long start_idx)
  924. {
  925. int ret;
  926. struct fuse_inode *fi = get_fuse_inode(inode);
  927. struct fuse_dax_mapping *dmap;
  928. struct interval_tree_node *node;
/* Find the fuse dax mapping for this inode at the given file offset. */
  930. node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx);
  931. /* Range already got cleaned up by somebody else */
  932. if (!node)
  933. return 0;
  934. dmap = node_to_dmap(node);
  935. /* still in use. */
  936. if (refcount_read(&dmap->refcnt) > 1)
  937. return 0;
  938. ret = reclaim_one_dmap_locked(inode, dmap);
  939. if (ret < 0)
  940. return ret;
  941. /* Cleanup dmap entry and add back to free list */
  942. spin_lock(&fcd->lock);
  943. dmap_reinit_add_to_free_pool(fcd, dmap);
  944. spin_unlock(&fcd->lock);
  945. return ret;
  946. }
  947. /*
  948. * Free a range of memory.
  949. * Locking:
  950. * 1. Take mapping->invalidate_lock to block dax faults.
  951. * 2. Take fi->dax->sem to protect interval tree and also to make sure
  952. * read/write can not reuse a dmap which we might be freeing.
  953. */
  954. static int lookup_and_reclaim_dmap(struct fuse_conn_dax *fcd,
  955. struct inode *inode,
  956. unsigned long start_idx,
  957. unsigned long end_idx)
  958. {
  959. int ret;
  960. struct fuse_inode *fi = get_fuse_inode(inode);
  961. loff_t dmap_start = start_idx << FUSE_DAX_SHIFT;
  962. loff_t dmap_end = (dmap_start + FUSE_DAX_SZ) - 1;
  963. filemap_invalidate_lock(inode->i_mapping);
  964. ret = fuse_dax_break_layouts(inode, dmap_start, dmap_end);
  965. if (ret) {
  966. pr_debug("virtio_fs: fuse_dax_break_layouts() failed. err=%d\n",
  967. ret);
  968. goto out_mmap_sem;
  969. }
  970. down_write(&fi->dax->sem);
  971. ret = lookup_and_reclaim_dmap_locked(fcd, inode, start_idx);
  972. up_write(&fi->dax->sem);
  973. out_mmap_sem:
  974. filemap_invalidate_unlock(inode->i_mapping);
  975. return ret;
  976. }
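/*
* Background reclaim: scan fcd->busy_ranges for up to @nr_to_free ranges
* that are not currently in use and reclaim them one inode at a time,
* returning each range to the free pool.
*/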
  977. static int try_to_free_dmap_chunks(struct fuse_conn_dax *fcd,
  978. unsigned long nr_to_free)
  979. {
  980. struct fuse_dax_mapping *dmap, *pos, *temp;
  981. int ret, nr_freed = 0;
  982. unsigned long start_idx = 0, end_idx = 0;
  983. struct inode *inode = NULL;
/* Pick the first busy range and free it for now */
  985. while (1) {
  986. if (nr_freed >= nr_to_free)
  987. break;
  988. dmap = NULL;
  989. spin_lock(&fcd->lock);
  990. if (!fcd->nr_busy_ranges) {
  991. spin_unlock(&fcd->lock);
  992. return 0;
  993. }
  994. list_for_each_entry_safe(pos, temp, &fcd->busy_ranges,
  995. busy_list) {
  996. /* skip this range if it's in use. */
  997. if (refcount_read(&pos->refcnt) > 1)
  998. continue;
  999. inode = igrab(pos->inode);
  1000. /*
  1001. * This inode is going away. That will free
  1002. * up all the ranges anyway, continue to
  1003. * next range.
  1004. */
  1005. if (!inode)
  1006. continue;
  1007. /*
* Take this element off the list and add it to the tail. If
  1009. * this element can't be freed, it will help with
  1010. * selecting new element in next iteration of loop.
  1011. */
  1012. dmap = pos;
  1013. list_move_tail(&dmap->busy_list, &fcd->busy_ranges);
  1014. start_idx = end_idx = dmap->itn.start;
  1015. break;
  1016. }
  1017. spin_unlock(&fcd->lock);
  1018. if (!dmap)
  1019. return 0;
  1020. ret = lookup_and_reclaim_dmap(fcd, inode, start_idx, end_idx);
  1021. iput(inode);
  1022. if (ret)
  1023. return ret;
  1024. nr_freed++;
  1025. }
  1026. return 0;
  1027. }
  1028. static void fuse_dax_free_mem_worker(struct work_struct *work)
  1029. {
  1030. int ret;
  1031. struct fuse_conn_dax *fcd = container_of(work, struct fuse_conn_dax,
  1032. free_work.work);
  1033. ret = try_to_free_dmap_chunks(fcd, FUSE_DAX_RECLAIM_CHUNK);
  1034. if (ret) {
  1035. pr_debug("fuse: try_to_free_dmap_chunks() failed with err=%d\n",
  1036. ret);
  1037. }
/* If the number of free ranges is still below the threshold, requeue */
  1039. kick_dmap_free_worker(fcd, 1);
  1040. }
  1041. static void fuse_free_dax_mem_ranges(struct list_head *mem_list)
  1042. {
  1043. struct fuse_dax_mapping *range, *temp;
  1044. /* Free All allocated elements */
  1045. list_for_each_entry_safe(range, temp, mem_list, list) {
  1046. list_del(&range->list);
  1047. if (!list_empty(&range->busy_list))
  1048. list_del(&range->busy_list);
  1049. kfree(range);
  1050. }
  1051. }
  1052. void fuse_dax_conn_free(struct fuse_conn *fc)
  1053. {
  1054. if (fc->dax) {
  1055. fuse_free_dax_mem_ranges(&fc->dax->free_ranges);
  1056. kfree(fc->dax);
  1057. fc->dax = NULL;
  1058. }
  1059. }
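/*
* Size the DAX window via dax_direct_access() and carve it up into
* FUSE_DAX_SZ ranges, all of which start out on the free list.
*/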
  1060. static int fuse_dax_mem_range_init(struct fuse_conn_dax *fcd)
  1061. {
  1062. long nr_pages, nr_ranges;
  1063. struct fuse_dax_mapping *range;
  1064. int ret, id;
  1065. size_t dax_size = -1;
  1066. unsigned long i;
  1067. init_waitqueue_head(&fcd->range_waitq);
  1068. INIT_LIST_HEAD(&fcd->free_ranges);
  1069. INIT_LIST_HEAD(&fcd->busy_ranges);
  1070. INIT_DELAYED_WORK(&fcd->free_work, fuse_dax_free_mem_worker);
  1071. id = dax_read_lock();
  1072. nr_pages = dax_direct_access(fcd->dev, 0, PHYS_PFN(dax_size),
  1073. DAX_ACCESS, NULL, NULL);
  1074. dax_read_unlock(id);
  1075. if (nr_pages < 0) {
  1076. pr_debug("dax_direct_access() returned %ld\n", nr_pages);
  1077. return nr_pages;
  1078. }
  1079. nr_ranges = nr_pages/FUSE_DAX_PAGES;
  1080. pr_debug("%s: dax mapped %ld pages. nr_ranges=%ld\n",
  1081. __func__, nr_pages, nr_ranges);
  1082. for (i = 0; i < nr_ranges; i++) {
  1083. range = kzalloc(sizeof(struct fuse_dax_mapping), GFP_KERNEL);
  1084. ret = -ENOMEM;
  1085. if (!range)
  1086. goto out_err;
/* TODO: This offset only works if the virtio-fs driver does not
* have some memory hidden at the beginning. This needs
* better handling.
  1090. */
  1091. range->window_offset = i * FUSE_DAX_SZ;
  1092. range->length = FUSE_DAX_SZ;
  1093. INIT_LIST_HEAD(&range->busy_list);
  1094. refcount_set(&range->refcnt, 1);
  1095. list_add_tail(&range->list, &fcd->free_ranges);
  1096. }
  1097. fcd->nr_free_ranges = nr_ranges;
  1098. fcd->nr_ranges = nr_ranges;
  1099. return 0;
  1100. out_err:
  1101. /* Free All allocated elements */
  1102. fuse_free_dax_mem_ranges(&fcd->free_ranges);
  1103. return ret;
  1104. }
  1105. int fuse_dax_conn_alloc(struct fuse_conn *fc, enum fuse_dax_mode dax_mode,
  1106. struct dax_device *dax_dev)
  1107. {
  1108. struct fuse_conn_dax *fcd;
  1109. int err;
  1110. fc->dax_mode = dax_mode;
  1111. if (!dax_dev)
  1112. return 0;
  1113. fcd = kzalloc(sizeof(*fcd), GFP_KERNEL);
  1114. if (!fcd)
  1115. return -ENOMEM;
  1116. spin_lock_init(&fcd->lock);
  1117. fcd->dev = dax_dev;
  1118. err = fuse_dax_mem_range_init(fcd);
  1119. if (err) {
  1120. kfree(fcd);
  1121. return err;
  1122. }
  1123. fc->dax = fcd;
  1124. return 0;
  1125. }
  1126. bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi)
  1127. {
  1128. struct fuse_conn *fc = get_fuse_conn_super(sb);
  1129. fi->dax = NULL;
  1130. if (fc->dax) {
  1131. fi->dax = kzalloc(sizeof(*fi->dax), GFP_KERNEL_ACCOUNT);
  1132. if (!fi->dax)
  1133. return false;
  1134. init_rwsem(&fi->dax->sem);
  1135. fi->dax->tree = RB_ROOT_CACHED;
  1136. }
  1137. return true;
  1138. }
  1139. static const struct address_space_operations fuse_dax_file_aops = {
  1140. .writepages = fuse_dax_writepages,
  1141. .direct_IO = noop_direct_IO,
  1142. .dirty_folio = noop_dirty_folio,
  1143. };
  1144. static bool fuse_should_enable_dax(struct inode *inode, unsigned int flags)
  1145. {
  1146. struct fuse_conn *fc = get_fuse_conn(inode);
  1147. enum fuse_dax_mode dax_mode = fc->dax_mode;
  1148. if (dax_mode == FUSE_DAX_NEVER)
  1149. return false;
  1150. /*
* fc->dax may be NULL in 'inode' mode when the filesystem device doesn't
* support DAX, in which case it silently falls back to 'never' mode.
  1153. */
  1154. if (!fc->dax)
  1155. return false;
  1156. if (dax_mode == FUSE_DAX_ALWAYS)
  1157. return true;
  1158. /* dax_mode is FUSE_DAX_INODE* */
  1159. return fc->inode_dax && (flags & FUSE_ATTR_DAX);
  1160. }
  1161. void fuse_dax_inode_init(struct inode *inode, unsigned int flags)
  1162. {
  1163. if (!fuse_should_enable_dax(inode, flags))
  1164. return;
  1165. inode->i_flags |= S_DAX;
  1166. inode->i_data.a_ops = &fuse_dax_file_aops;
  1167. }
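/*
* In per-inode DAX mode the server may change FUSE_ATTR_DAX between lookups.
* If the attribute no longer matches the inode's S_DAX state, mark the inode
* dontcache so it is not kept cached and DAX can be re-evaluated the next
* time the inode is instantiated.
*/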
  1168. void fuse_dax_dontcache(struct inode *inode, unsigned int flags)
  1169. {
  1170. struct fuse_conn *fc = get_fuse_conn(inode);
  1171. if (fuse_is_inode_dax_mode(fc->dax_mode) &&
  1172. ((bool) IS_DAX(inode) != (bool) (flags & FUSE_ATTR_DAX)))
  1173. d_mark_dontcache(inode);
  1174. }
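/*
* map_alignment from FUSE_INIT is compared against FUSE_DAX_SHIFT, i.e. it
* is treated here as a log2 alignment: the alignment required by the server
* must not exceed the FUSE_DAX_SZ range size.
*/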
  1175. bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment)
  1176. {
  1177. if (fc->dax && (map_alignment > FUSE_DAX_SHIFT)) {
  1178. pr_warn("FUSE: map_alignment %u incompatible with dax mem range size %u\n",
  1179. map_alignment, FUSE_DAX_SZ);
  1180. return false;
  1181. }
  1182. return true;
  1183. }
  1184. void fuse_dax_cancel_work(struct fuse_conn *fc)
  1185. {
  1186. struct fuse_conn_dax *fcd = fc->dax;
  1187. if (fcd)
  1188. cancel_delayed_work_sync(&fcd->free_work);
  1189. }
  1190. EXPORT_SYMBOL_GPL(fuse_dax_cancel_work);