zdata.c

  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * Copyright (C) 2018 HUAWEI, Inc.
  4. * https://www.huawei.com/
  5. * Copyright (C) 2022 Alibaba Cloud
  6. */
  7. #include "compress.h"
  8. #include <linux/psi.h>
  9. #include <linux/cpuhotplug.h>
  10. #include <trace/events/erofs.h>
  11. #define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE)
  12. #define Z_EROFS_INLINE_BVECS 2
  13. struct z_erofs_bvec {
  14. struct page *page;
  15. int offset;
  16. unsigned int end;
  17. };
  18. #define __Z_EROFS_BVSET(name, total) \
  19. struct name { \
  20. /* point to the next page which contains the following bvecs */ \
  21. struct page *nextpage; \
  22. struct z_erofs_bvec bvec[total]; \
  23. }
  24. __Z_EROFS_BVSET(z_erofs_bvset,);
  25. __Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS);
  26. /*
  27. * Structure fields follow one of the following exclusion rules.
  28. *
  29. * I: Modifiable by initialization/destruction paths and read-only
  30. * for everyone else;
  31. *
  32. * L: Field should be protected by the pcluster lock;
  33. *
  34. * A: Field should be accessed/updated atomically for parallelized code.
  35. */
  36. struct z_erofs_pcluster {
  37. struct mutex lock;
  38. struct lockref lockref;
  39. /* A: point to next chained pcluster or TAILs */
  40. struct z_erofs_pcluster *next;
  41. /* I: start block address of this pcluster */
  42. erofs_off_t index;
  43. /* L: the maximum decompression size of this round */
  44. unsigned int length;
  45. /* L: total number of bvecs */
  46. unsigned int vcnt;
  47. /* I: pcluster size (compressed size) in bytes */
  48. unsigned int pclustersize;
  49. /* I: page offset of start position of decompression */
  50. unsigned short pageofs_out;
  51. /* I: page offset of inline compressed data */
  52. unsigned short pageofs_in;
  53. union {
  54. /* L: inline a certain number of bvec for bootstrap */
  55. struct z_erofs_bvset_inline bvset;
  56. /* I: can be used to free the pcluster by RCU. */
  57. struct rcu_head rcu;
  58. };
  59. /* I: compression algorithm format */
  60. unsigned char algorithmformat;
  61. /* L: whether partial decompression or not */
  62. bool partial;
  63. /* L: whether extra buffer allocations are best-effort */
  64. bool besteffort;
  65. /* A: compressed bvecs (can be cached or inplaced pages) */
  66. struct z_erofs_bvec compressed_bvecs[];
  67. };
  68. /* the end of a chain of pclusters */
  69. #define Z_EROFS_PCLUSTER_TAIL ((void *) 0x700 + POISON_POINTER_DELTA)
  70. struct z_erofs_decompressqueue {
  71. struct super_block *sb;
  72. struct z_erofs_pcluster *head;
  73. atomic_t pending_bios;
  74. union {
  75. struct completion done;
  76. struct work_struct work;
  77. struct kthread_work kthread_work;
  78. } u;
  79. bool eio, sync;
  80. };
  81. static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl)
  82. {
  83. return !pcl->index;
  84. }
  85. static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl)
  86. {
  87. return PAGE_ALIGN(pcl->pclustersize) >> PAGE_SHIFT;
  88. }
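/*
 * Example (assuming 4 KiB pages): a pcluster with pclustersize == 12288
 * spans PAGE_ALIGN(12288) >> PAGE_SHIFT == 3 pages; an inline
 * (ztailpacking) pcluster is recognized purely by index == 0 above.
 */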
  89. #define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping)
  90. static bool erofs_folio_is_managed(struct erofs_sb_info *sbi, struct folio *fo)
  91. {
  92. return fo->mapping == MNGD_MAPPING(sbi);
  93. }
  94. #define Z_EROFS_ONSTACK_PAGES 32
  95. /*
  96. * Since pclustersize is variable with the big pcluster feature, introduce
  97. * slab pools for the different pcluster sizes.
  98. */
  99. struct z_erofs_pcluster_slab {
  100. struct kmem_cache *slab;
  101. unsigned int maxpages;
  102. char name[48];
  103. };
  104. #define _PCLP(n) { .maxpages = n }
  105. static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = {
  106. _PCLP(1), _PCLP(4), _PCLP(16), _PCLP(64), _PCLP(128),
  107. _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES)
  108. };
  109. struct z_erofs_bvec_iter {
  110. struct page *bvpage;
  111. struct z_erofs_bvset *bvset;
  112. unsigned int nr, cur;
  113. };
  114. static struct page *z_erofs_bvec_iter_end(struct z_erofs_bvec_iter *iter)
  115. {
  116. if (iter->bvpage)
  117. kunmap_local(iter->bvset);
  118. return iter->bvpage;
  119. }
  120. static struct page *z_erofs_bvset_flip(struct z_erofs_bvec_iter *iter)
  121. {
  122. unsigned long base = (unsigned long)((struct z_erofs_bvset *)0)->bvec;
  123. /* have to access nextpage in advance, otherwise it will be unmapped */
  124. struct page *nextpage = iter->bvset->nextpage;
  125. struct page *oldpage;
  126. DBG_BUGON(!nextpage);
  127. oldpage = z_erofs_bvec_iter_end(iter);
  128. iter->bvpage = nextpage;
  129. iter->bvset = kmap_local_page(nextpage);
  130. iter->nr = (PAGE_SIZE - base) / sizeof(struct z_erofs_bvec);
  131. iter->cur = 0;
  132. return oldpage;
  133. }
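/*
 * Capacity sketch for the flip above (assuming a 64-bit build with 4 KiB
 * pages): `base` is offsetof(struct z_erofs_bvset, bvec) == 8 and
 * sizeof(struct z_erofs_bvec) == 16, so each chained bvset page holds
 * (4096 - 8) / 16 == 255 bvecs before the next flip is needed.
 */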
  134. static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter,
  135. struct z_erofs_bvset_inline *bvset,
  136. unsigned int bootstrap_nr,
  137. unsigned int cur)
  138. {
  139. *iter = (struct z_erofs_bvec_iter) {
  140. .nr = bootstrap_nr,
  141. .bvset = (struct z_erofs_bvset *)bvset,
  142. };
  143. while (cur > iter->nr) {
  144. cur -= iter->nr;
  145. z_erofs_bvset_flip(iter);
  146. }
  147. iter->cur = cur;
  148. }
  149. static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter,
  150. struct z_erofs_bvec *bvec,
  151. struct page **candidate_bvpage,
  152. struct page **pagepool)
  153. {
  154. if (iter->cur >= iter->nr) {
  155. struct page *nextpage = *candidate_bvpage;
  156. if (!nextpage) {
  157. nextpage = __erofs_allocpage(pagepool, GFP_KERNEL,
  158. true);
  159. if (!nextpage)
  160. return -ENOMEM;
  161. set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE);
  162. }
  163. DBG_BUGON(iter->bvset->nextpage);
  164. iter->bvset->nextpage = nextpage;
  165. z_erofs_bvset_flip(iter);
  166. iter->bvset->nextpage = NULL;
  167. *candidate_bvpage = NULL;
  168. }
  169. iter->bvset->bvec[iter->cur++] = *bvec;
  170. return 0;
  171. }
  172. static void z_erofs_bvec_dequeue(struct z_erofs_bvec_iter *iter,
  173. struct z_erofs_bvec *bvec,
  174. struct page **old_bvpage)
  175. {
  176. if (iter->cur == iter->nr)
  177. *old_bvpage = z_erofs_bvset_flip(iter);
  178. else
  179. *old_bvpage = NULL;
  180. *bvec = iter->bvset->bvec[iter->cur++];
  181. }
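/*
 * Usage note: enqueue and dequeue walk the same chain of bvset pages in the
 * same order. Once the Z_EROFS_INLINE_BVECS bootstrap slots are exhausted,
 * enqueue grafts a short-lived page via nextpage; dequeue later flips to
 * that page and returns the previous one through *old_bvpage so the caller
 * can recycle it into the page pool.
 */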
  182. static void z_erofs_destroy_pcluster_pool(void)
  183. {
  184. int i;
  185. for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
  186. if (!pcluster_pool[i].slab)
  187. continue;
  188. kmem_cache_destroy(pcluster_pool[i].slab);
  189. pcluster_pool[i].slab = NULL;
  190. }
  191. }
  192. static int z_erofs_create_pcluster_pool(void)
  193. {
  194. struct z_erofs_pcluster_slab *pcs;
  195. struct z_erofs_pcluster *a;
  196. unsigned int size;
  197. for (pcs = pcluster_pool;
  198. pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) {
  199. size = struct_size(a, compressed_bvecs, pcs->maxpages);
  200. sprintf(pcs->name, "erofs_pcluster-%u", pcs->maxpages);
  201. pcs->slab = kmem_cache_create(pcs->name, size, 0,
  202. SLAB_RECLAIM_ACCOUNT, NULL);
  203. if (pcs->slab)
  204. continue;
  205. z_erofs_destroy_pcluster_pool();
  206. return -ENOMEM;
  207. }
  208. return 0;
  209. }
  210. static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int size)
  211. {
  212. unsigned int nrpages = PAGE_ALIGN(size) >> PAGE_SHIFT;
  213. struct z_erofs_pcluster_slab *pcs = pcluster_pool;
  214. for (; pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) {
  215. struct z_erofs_pcluster *pcl;
  216. if (nrpages > pcs->maxpages)
  217. continue;
  218. pcl = kmem_cache_zalloc(pcs->slab, GFP_KERNEL);
  219. if (!pcl)
  220. return ERR_PTR(-ENOMEM);
  221. pcl->pclustersize = size;
  222. return pcl;
  223. }
  224. return ERR_PTR(-EINVAL);
  225. }
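/*
 * Example: a 24 KiB pcluster (6 pages with 4 KiB pages) skips the 1- and
 * 4-page pools above and is allocated from the 16-page slab; anything
 * larger than Z_EROFS_PCLUSTER_MAX_PAGES falls through to -EINVAL.
 */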
  226. static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl)
  227. {
  228. unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
  229. int i;
  230. for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
  231. struct z_erofs_pcluster_slab *pcs = pcluster_pool + i;
  232. if (pclusterpages > pcs->maxpages)
  233. continue;
  234. kmem_cache_free(pcs->slab, pcl);
  235. return;
  236. }
  237. DBG_BUGON(1);
  238. }
  239. static struct workqueue_struct *z_erofs_workqueue __read_mostly;
  240. #ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
  241. static struct kthread_worker __rcu **z_erofs_pcpu_workers;
  242. static void erofs_destroy_percpu_workers(void)
  243. {
  244. struct kthread_worker *worker;
  245. unsigned int cpu;
  246. for_each_possible_cpu(cpu) {
  247. worker = rcu_dereference_protected(
  248. z_erofs_pcpu_workers[cpu], 1);
  249. rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL);
  250. if (worker)
  251. kthread_destroy_worker(worker);
  252. }
  253. kfree(z_erofs_pcpu_workers);
  254. }
  255. static struct kthread_worker *erofs_init_percpu_worker(int cpu)
  256. {
  257. struct kthread_worker *worker =
  258. kthread_create_worker_on_cpu(cpu, 0, "erofs_worker/%u", cpu);
  259. if (IS_ERR(worker))
  260. return worker;
  261. if (IS_ENABLED(CONFIG_EROFS_FS_PCPU_KTHREAD_HIPRI))
  262. sched_set_fifo_low(worker->task);
  263. return worker;
  264. }
  265. static int erofs_init_percpu_workers(void)
  266. {
  267. struct kthread_worker *worker;
  268. unsigned int cpu;
  269. z_erofs_pcpu_workers = kcalloc(num_possible_cpus(),
  270. sizeof(struct kthread_worker *), GFP_ATOMIC);
  271. if (!z_erofs_pcpu_workers)
  272. return -ENOMEM;
  273. for_each_online_cpu(cpu) { /* could miss cpu{off,on}line? */
  274. worker = erofs_init_percpu_worker(cpu);
  275. if (!IS_ERR(worker))
  276. rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker);
  277. }
  278. return 0;
  279. }
  280. #else
  281. static inline void erofs_destroy_percpu_workers(void) {}
  282. static inline int erofs_init_percpu_workers(void) { return 0; }
  283. #endif
  284. #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_EROFS_FS_PCPU_KTHREAD)
  285. static DEFINE_SPINLOCK(z_erofs_pcpu_worker_lock);
  286. static enum cpuhp_state erofs_cpuhp_state;
  287. static int erofs_cpu_online(unsigned int cpu)
  288. {
  289. struct kthread_worker *worker, *old;
  290. worker = erofs_init_percpu_worker(cpu);
  291. if (IS_ERR(worker))
  292. return PTR_ERR(worker);
  293. spin_lock(&z_erofs_pcpu_worker_lock);
  294. old = rcu_dereference_protected(z_erofs_pcpu_workers[cpu],
  295. lockdep_is_held(&z_erofs_pcpu_worker_lock));
  296. if (!old)
  297. rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker);
  298. spin_unlock(&z_erofs_pcpu_worker_lock);
  299. if (old)
  300. kthread_destroy_worker(worker);
  301. return 0;
  302. }
  303. static int erofs_cpu_offline(unsigned int cpu)
  304. {
  305. struct kthread_worker *worker;
  306. spin_lock(&z_erofs_pcpu_worker_lock);
  307. worker = rcu_dereference_protected(z_erofs_pcpu_workers[cpu],
  308. lockdep_is_held(&z_erofs_pcpu_worker_lock));
  309. rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL);
  310. spin_unlock(&z_erofs_pcpu_worker_lock);
  311. synchronize_rcu();
  312. if (worker)
  313. kthread_destroy_worker(worker);
  314. return 0;
  315. }
  316. static int erofs_cpu_hotplug_init(void)
  317. {
  318. int state;
  319. state = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
  320. "fs/erofs:online", erofs_cpu_online, erofs_cpu_offline);
  321. if (state < 0)
  322. return state;
  323. erofs_cpuhp_state = state;
  324. return 0;
  325. }
  326. static void erofs_cpu_hotplug_destroy(void)
  327. {
  328. if (erofs_cpuhp_state)
  329. cpuhp_remove_state_nocalls(erofs_cpuhp_state);
  330. }
  331. #else /* !CONFIG_HOTPLUG_CPU || !CONFIG_EROFS_FS_PCPU_KTHREAD */
  332. static inline int erofs_cpu_hotplug_init(void) { return 0; }
  333. static inline void erofs_cpu_hotplug_destroy(void) {}
  334. #endif
  335. void z_erofs_exit_subsystem(void)
  336. {
  337. erofs_cpu_hotplug_destroy();
  338. erofs_destroy_percpu_workers();
  339. destroy_workqueue(z_erofs_workqueue);
  340. z_erofs_destroy_pcluster_pool();
  341. z_erofs_exit_decompressor();
  342. }
  343. int __init z_erofs_init_subsystem(void)
  344. {
  345. int err = z_erofs_init_decompressor();
  346. if (err)
  347. goto err_decompressor;
  348. err = z_erofs_create_pcluster_pool();
  349. if (err)
  350. goto err_pcluster_pool;
  351. z_erofs_workqueue = alloc_workqueue("erofs_worker",
  352. WQ_UNBOUND | WQ_HIGHPRI, num_possible_cpus());
  353. if (!z_erofs_workqueue) {
  354. err = -ENOMEM;
  355. goto err_workqueue_init;
  356. }
  357. err = erofs_init_percpu_workers();
  358. if (err)
  359. goto err_pcpu_worker;
  360. err = erofs_cpu_hotplug_init();
  361. if (err < 0)
  362. goto err_cpuhp_init;
  363. return err;
  364. err_cpuhp_init:
  365. erofs_destroy_percpu_workers();
  366. err_pcpu_worker:
  367. destroy_workqueue(z_erofs_workqueue);
  368. err_workqueue_init:
  369. z_erofs_destroy_pcluster_pool();
  370. err_pcluster_pool:
  371. z_erofs_exit_decompressor();
  372. err_decompressor:
  373. return err;
  374. }
  375. enum z_erofs_pclustermode {
  376. /* It has previously been linked into another processing chain */
  377. Z_EROFS_PCLUSTER_INFLIGHT,
  378. /*
  379. * A weaker form of Z_EROFS_PCLUSTER_FOLLOWED; the difference is that it
  380. * may be dispatched to the bypass queue later due to already up-to-date managed
  381. * folios. All file-backed folios related to this pcluster cannot be
  382. * reused for in-place I/O (or bvpage) since the pcluster may be decoded
  383. * in a separate queue (and thus out of order).
  384. */
  385. Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE,
  386. /*
  387. * The pcluster has just been linked to our processing chain.
  388. * File-backed folios (except for the head page) related to it can be
  389. * used for in-place I/O (or bvpage).
  390. */
  391. Z_EROFS_PCLUSTER_FOLLOWED,
  392. };
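/*
 * Note: the modes above are deliberately ordered so that numeric
 * comparisons such as "fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED" below can
 * decide whether file-backed folios may be reused for in-place I/O or as a
 * bvpage.
 */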
  393. struct z_erofs_frontend {
  394. struct inode *const inode;
  395. struct erofs_map_blocks map;
  396. struct z_erofs_bvec_iter biter;
  397. struct page *pagepool;
  398. struct page *candidate_bvpage;
  399. struct z_erofs_pcluster *pcl, *head;
  400. enum z_erofs_pclustermode mode;
  401. erofs_off_t headoffset;
  402. /* a cursor used to pick up in-place I/O pages */
  403. unsigned int icur;
  404. };
  405. #define Z_EROFS_DEFINE_FRONTEND(fe, i, ho) struct z_erofs_frontend fe = { \
  406. .inode = i, .head = Z_EROFS_PCLUSTER_TAIL, \
  407. .mode = Z_EROFS_PCLUSTER_FOLLOWED, .headoffset = ho }
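/*
 * A minimal usage sketch of the frontend (simplified from the read paths
 * later in this file; error handling and submission are omitted):
 *
 *	Z_EROFS_DEFINE_FRONTEND(f, inode, folio_pos(folio));
 *	err = z_erofs_scan_folio(&f, folio, false);
 *	z_erofs_pcluster_end(&f);
 */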
  408. static bool z_erofs_should_alloc_cache(struct z_erofs_frontend *fe)
  409. {
  410. unsigned int cachestrategy = EROFS_I_SB(fe->inode)->opt.cache_strategy;
  411. if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED)
  412. return false;
  413. if (!(fe->map.m_flags & EROFS_MAP_FULL_MAPPED))
  414. return true;
  415. if (cachestrategy >= EROFS_ZIP_CACHE_READAROUND &&
  416. fe->map.m_la < fe->headoffset)
  417. return true;
  418. return false;
  419. }
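/*
 * Decision summary for the helper above: caching is skipped entirely when
 * the cache_strategy option is "disabled"; partially-mapped extents are
 * always cached; with "readaround" or stronger, extents starting before the
 * requested offset are cached as well.
 */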
  420. static void z_erofs_bind_cache(struct z_erofs_frontend *fe)
  421. {
  422. struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode));
  423. struct z_erofs_pcluster *pcl = fe->pcl;
  424. unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
  425. bool shouldalloc = z_erofs_should_alloc_cache(fe);
  426. bool standalone = true;
  427. /*
  428. * Optimistic allocation without direct reclaim, since in-place I/O
  429. * can be used as a fallback when memory is low.
  430. */
  431. gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) |
  432. __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
  433. unsigned int i;
  434. if (i_blocksize(fe->inode) != PAGE_SIZE ||
  435. fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
  436. return;
  437. for (i = 0; i < pclusterpages; ++i) {
  438. struct page *page, *newpage;
  439. /* Inaccurate check w/o locking to avoid unneeded lookups */
  440. if (READ_ONCE(pcl->compressed_bvecs[i].page))
  441. continue;
  442. page = find_get_page(mc, pcl->index + i);
  443. if (!page) {
  444. /* I/O is needed, not possible to decompress directly */
  445. standalone = false;
  446. if (!shouldalloc)
  447. continue;
  448. /*
  449. * Try cached I/O if allocation succeeds, or fall back to
  450. * in-place I/O instead to avoid any direct reclaim.
  451. */
  452. newpage = erofs_allocpage(&fe->pagepool, gfp);
  453. if (!newpage)
  454. continue;
  455. set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
  456. }
  457. spin_lock(&pcl->lockref.lock);
  458. if (!pcl->compressed_bvecs[i].page) {
  459. pcl->compressed_bvecs[i].page = page ? page : newpage;
  460. spin_unlock(&pcl->lockref.lock);
  461. continue;
  462. }
  463. spin_unlock(&pcl->lockref.lock);
  464. if (page)
  465. put_page(page);
  466. else if (newpage)
  467. erofs_pagepool_add(&fe->pagepool, newpage);
  468. }
  469. /*
  470. * Don't do in-place I/O if all compressed pages are available in the
  471. * managed cache, since the pcluster can be moved to the bypass queue instead.
  472. */
  473. if (standalone)
  474. fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
  475. }
  476. /* (erofs_shrinker) disconnect cached encoded data with pclusters */
  477. static int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi,
  478. struct z_erofs_pcluster *pcl)
  479. {
  480. unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
  481. struct folio *folio;
  482. int i;
  483. DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
  484. /* Each cached folio contains one page unless bs > ps is supported */
  485. for (i = 0; i < pclusterpages; ++i) {
  486. if (pcl->compressed_bvecs[i].page) {
  487. folio = page_folio(pcl->compressed_bvecs[i].page);
  488. /* Avoid reclaiming or migrating this folio */
  489. if (!folio_trylock(folio))
  490. return -EBUSY;
  491. if (!erofs_folio_is_managed(sbi, folio))
  492. continue;
  493. pcl->compressed_bvecs[i].page = NULL;
  494. folio_detach_private(folio);
  495. folio_unlock(folio);
  496. }
  497. }
  498. return 0;
  499. }
  500. static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp)
  501. {
  502. struct z_erofs_pcluster *pcl = folio_get_private(folio);
  503. struct z_erofs_bvec *bvec = pcl->compressed_bvecs;
  504. struct z_erofs_bvec *end = bvec + z_erofs_pclusterpages(pcl);
  505. bool ret;
  506. if (!folio_test_private(folio))
  507. return true;
  508. ret = false;
  509. spin_lock(&pcl->lockref.lock);
  510. if (pcl->lockref.count <= 0) {
  511. DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
  512. for (; bvec < end; ++bvec) {
  513. if (bvec->page && page_folio(bvec->page) == folio) {
  514. bvec->page = NULL;
  515. folio_detach_private(folio);
  516. ret = true;
  517. break;
  518. }
  519. }
  520. }
  521. spin_unlock(&pcl->lockref.lock);
  522. return ret;
  523. }
  524. /*
  525. * This is called only on inode eviction. If there are still decompression
  526. * requests in progress, wait and reschedule for a bit here.
  527. * An extra lock could be introduced instead, but it seems unnecessary.
  528. */
  529. static void z_erofs_cache_invalidate_folio(struct folio *folio,
  530. size_t offset, size_t length)
  531. {
  532. const size_t stop = length + offset;
  533. /* Check for potential overflow in debug mode */
  534. DBG_BUGON(stop > folio_size(folio) || stop < length);
  535. if (offset == 0 && stop == folio_size(folio))
  536. while (!z_erofs_cache_release_folio(folio, 0))
  537. cond_resched();
  538. }
  539. static const struct address_space_operations z_erofs_cache_aops = {
  540. .release_folio = z_erofs_cache_release_folio,
  541. .invalidate_folio = z_erofs_cache_invalidate_folio,
  542. };
  543. int z_erofs_init_super(struct super_block *sb)
  544. {
  545. struct inode *const inode = new_inode(sb);
  546. if (!inode)
  547. return -ENOMEM;
  548. set_nlink(inode, 1);
  549. inode->i_size = OFFSET_MAX;
  550. inode->i_mapping->a_ops = &z_erofs_cache_aops;
  551. mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL);
  552. EROFS_SB(sb)->managed_cache = inode;
  553. xa_init(&EROFS_SB(sb)->managed_pslots);
  554. return 0;
  555. }
  556. /* callers must hold the pcluster lock */
  557. static int z_erofs_attach_page(struct z_erofs_frontend *fe,
  558. struct z_erofs_bvec *bvec, bool exclusive)
  559. {
  560. struct z_erofs_pcluster *pcl = fe->pcl;
  561. int ret;
  562. if (exclusive) {
  563. /* prefer file-backed pages for in-place I/O first */
  564. spin_lock(&pcl->lockref.lock);
  565. while (fe->icur > 0) {
  566. if (pcl->compressed_bvecs[--fe->icur].page)
  567. continue;
  568. pcl->compressed_bvecs[fe->icur] = *bvec;
  569. spin_unlock(&pcl->lockref.lock);
  570. return 0;
  571. }
  572. spin_unlock(&pcl->lockref.lock);
  573. /* otherwise, check if it can be used as a bvpage */
  574. if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
  575. !fe->candidate_bvpage)
  576. fe->candidate_bvpage = bvec->page;
  577. }
  578. ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage,
  579. &fe->pagepool);
  580. fe->pcl->vcnt += (ret >= 0);
  581. return ret;
  582. }
  583. static bool z_erofs_get_pcluster(struct z_erofs_pcluster *pcl)
  584. {
  585. if (lockref_get_not_zero(&pcl->lockref))
  586. return true;
  587. spin_lock(&pcl->lockref.lock);
  588. if (__lockref_is_dead(&pcl->lockref)) {
  589. spin_unlock(&pcl->lockref.lock);
  590. return false;
  591. }
  592. if (!pcl->lockref.count++)
  593. atomic_long_dec(&erofs_global_shrink_cnt);
  594. spin_unlock(&pcl->lockref.lock);
  595. return true;
  596. }
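/*
 * Note on the paths above: lockref_get_not_zero() grabs a reference
 * locklessly when the count is already positive; otherwise the spinlock is
 * taken to distinguish a dead pcluster (being freed) from one whose count
 * just reached zero, in which case reviving it also drops the global
 * shrinker counter.
 */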
  597. static int z_erofs_register_pcluster(struct z_erofs_frontend *fe)
  598. {
  599. struct erofs_map_blocks *map = &fe->map;
  600. struct super_block *sb = fe->inode->i_sb;
  601. struct erofs_sb_info *sbi = EROFS_SB(sb);
  602. bool ztailpacking = map->m_flags & EROFS_MAP_META;
  603. struct z_erofs_pcluster *pcl, *pre;
  604. int err;
  605. if (!(map->m_flags & EROFS_MAP_ENCODED) ||
  606. (!ztailpacking && !erofs_blknr(sb, map->m_pa))) {
  607. DBG_BUGON(1);
  608. return -EFSCORRUPTED;
  609. }
  610. /* no available pcluster, let's allocate one */
  611. pcl = z_erofs_alloc_pcluster(map->m_plen);
  612. if (IS_ERR(pcl))
  613. return PTR_ERR(pcl);
  614. spin_lock_init(&pcl->lockref.lock);
  615. pcl->lockref.count = 1; /* one ref for this request */
  616. pcl->algorithmformat = map->m_algorithmformat;
  617. pcl->length = 0;
  618. pcl->partial = true;
  619. pcl->next = fe->head;
  620. pcl->pageofs_out = map->m_la & ~PAGE_MASK;
  621. fe->mode = Z_EROFS_PCLUSTER_FOLLOWED;
  622. /*
  623. * Lock the pcluster before it becomes visible to others;
  624. * mutex_trylock *never* fails for a brand-new pcluster.
  625. */
  626. mutex_init(&pcl->lock);
  627. DBG_BUGON(!mutex_trylock(&pcl->lock));
  628. if (ztailpacking) {
  629. pcl->index = 0; /* which indicates ztailpacking */
  630. } else {
  631. pcl->index = erofs_blknr(sb, map->m_pa);
  632. while (1) {
  633. xa_lock(&sbi->managed_pslots);
  634. pre = __xa_cmpxchg(&sbi->managed_pslots, pcl->index,
  635. NULL, pcl, GFP_KERNEL);
  636. if (!pre || xa_is_err(pre) || z_erofs_get_pcluster(pre)) {
  637. xa_unlock(&sbi->managed_pslots);
  638. break;
  639. }
  640. /* try to legitimize the current in-tree one */
  641. xa_unlock(&sbi->managed_pslots);
  642. cond_resched();
  643. }
  644. if (xa_is_err(pre)) {
  645. err = xa_err(pre);
  646. goto err_out;
  647. } else if (pre) {
  648. fe->pcl = pre;
  649. err = -EEXIST;
  650. goto err_out;
  651. }
  652. }
  653. fe->head = fe->pcl = pcl;
  654. return 0;
  655. err_out:
  656. mutex_unlock(&pcl->lock);
  657. z_erofs_free_pcluster(pcl);
  658. return err;
  659. }
  660. static int z_erofs_pcluster_begin(struct z_erofs_frontend *fe)
  661. {
  662. struct erofs_map_blocks *map = &fe->map;
  663. struct super_block *sb = fe->inode->i_sb;
  664. erofs_blk_t blknr = erofs_blknr(sb, map->m_pa);
  665. struct z_erofs_pcluster *pcl = NULL;
  666. int ret;
  667. DBG_BUGON(fe->pcl);
  668. /* must be Z_EROFS_PCLUSTER_TAIL or point to a previous pcluster */
  669. DBG_BUGON(!fe->head);
  670. if (!(map->m_flags & EROFS_MAP_META)) {
  671. while (1) {
  672. rcu_read_lock();
  673. pcl = xa_load(&EROFS_SB(sb)->managed_pslots, blknr);
  674. if (!pcl || z_erofs_get_pcluster(pcl)) {
  675. DBG_BUGON(pcl && blknr != pcl->index);
  676. rcu_read_unlock();
  677. break;
  678. }
  679. rcu_read_unlock();
  680. }
  681. } else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) {
  682. DBG_BUGON(1);
  683. return -EFSCORRUPTED;
  684. }
  685. if (pcl) {
  686. fe->pcl = pcl;
  687. ret = -EEXIST;
  688. } else {
  689. ret = z_erofs_register_pcluster(fe);
  690. }
  691. if (ret == -EEXIST) {
  692. mutex_lock(&fe->pcl->lock);
  693. /* check if this pcluster hasn't been linked into any chain. */
  694. if (!cmpxchg(&fe->pcl->next, NULL, fe->head)) {
  695. /* .. so it can be attached to our submission chain */
  696. fe->head = fe->pcl;
  697. fe->mode = Z_EROFS_PCLUSTER_FOLLOWED;
  698. } else { /* otherwise, it belongs to an inflight chain */
  699. fe->mode = Z_EROFS_PCLUSTER_INFLIGHT;
  700. }
  701. } else if (ret) {
  702. return ret;
  703. }
  704. z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset,
  705. Z_EROFS_INLINE_BVECS, fe->pcl->vcnt);
  706. if (!z_erofs_is_inline_pcluster(fe->pcl)) {
  707. /* bind cache first when cached decompression is preferred */
  708. z_erofs_bind_cache(fe);
  709. } else {
  710. void *mptr;
  711. mptr = erofs_read_metabuf(&map->buf, sb, map->m_pa, EROFS_NO_KMAP);
  712. if (IS_ERR(mptr)) {
  713. ret = PTR_ERR(mptr);
  714. erofs_err(sb, "failed to get inline data %d", ret);
  715. return ret;
  716. }
  717. get_page(map->buf.page);
  718. WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, map->buf.page);
  719. fe->pcl->pageofs_in = map->m_pa & ~PAGE_MASK;
  720. fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
  721. }
  722. /* file-backed inplace I/O pages are traversed in reverse order */
  723. fe->icur = z_erofs_pclusterpages(fe->pcl);
  724. return 0;
  725. }
  726. static void z_erofs_rcu_callback(struct rcu_head *head)
  727. {
  728. z_erofs_free_pcluster(container_of(head, struct z_erofs_pcluster, rcu));
  729. }
  730. static bool __erofs_try_to_release_pcluster(struct erofs_sb_info *sbi,
  731. struct z_erofs_pcluster *pcl)
  732. {
  733. if (pcl->lockref.count)
  734. return false;
  735. /*
  736. * Note that all cached folios should be detached before being deleted from
  737. * the XArray; otherwise some folios could still be attached to the
  738. * orphan old pcluster when the new one is available in the tree.
  739. */
  740. if (erofs_try_to_free_all_cached_folios(sbi, pcl))
  741. return false;
  742. /*
  743. * It's impossible to fail after the pcluster is frozen, but in order
  744. * to avoid some race conditions, add a DBG_BUGON to observe this.
  745. */
  746. DBG_BUGON(__xa_erase(&sbi->managed_pslots, pcl->index) != pcl);
  747. lockref_mark_dead(&pcl->lockref);
  748. return true;
  749. }
  750. static bool erofs_try_to_release_pcluster(struct erofs_sb_info *sbi,
  751. struct z_erofs_pcluster *pcl)
  752. {
  753. bool free;
  754. spin_lock(&pcl->lockref.lock);
  755. free = __erofs_try_to_release_pcluster(sbi, pcl);
  756. spin_unlock(&pcl->lockref.lock);
  757. if (free) {
  758. atomic_long_dec(&erofs_global_shrink_cnt);
  759. call_rcu(&pcl->rcu, z_erofs_rcu_callback);
  760. }
  761. return free;
  762. }
  763. unsigned long z_erofs_shrink_scan(struct erofs_sb_info *sbi, unsigned long nr)
  764. {
  765. struct z_erofs_pcluster *pcl;
  766. unsigned long index, freed = 0;
  767. xa_lock(&sbi->managed_pslots);
  768. xa_for_each(&sbi->managed_pslots, index, pcl) {
  769. /* try to shrink each valid pcluster */
  770. if (!erofs_try_to_release_pcluster(sbi, pcl))
  771. continue;
  772. xa_unlock(&sbi->managed_pslots);
  773. ++freed;
  774. if (!--nr)
  775. return freed;
  776. xa_lock(&sbi->managed_pslots);
  777. }
  778. xa_unlock(&sbi->managed_pslots);
  779. return freed;
  780. }
  781. static void z_erofs_put_pcluster(struct erofs_sb_info *sbi,
  782. struct z_erofs_pcluster *pcl, bool try_free)
  783. {
  784. bool free = false;
  785. if (lockref_put_or_lock(&pcl->lockref))
  786. return;
  787. DBG_BUGON(__lockref_is_dead(&pcl->lockref));
  788. if (!--pcl->lockref.count) {
  789. if (try_free && xa_trylock(&sbi->managed_pslots)) {
  790. free = __erofs_try_to_release_pcluster(sbi, pcl);
  791. xa_unlock(&sbi->managed_pslots);
  792. }
  793. atomic_long_add(!free, &erofs_global_shrink_cnt);
  794. }
  795. spin_unlock(&pcl->lockref.lock);
  796. if (free)
  797. call_rcu(&pcl->rcu, z_erofs_rcu_callback);
  798. }
  799. static void z_erofs_pcluster_end(struct z_erofs_frontend *fe)
  800. {
  801. struct z_erofs_pcluster *pcl = fe->pcl;
  802. if (!pcl)
  803. return;
  804. z_erofs_bvec_iter_end(&fe->biter);
  805. mutex_unlock(&pcl->lock);
  806. if (fe->candidate_bvpage)
  807. fe->candidate_bvpage = NULL;
  808. /* Drop refcount if it doesn't belong to our processing chain */
  809. if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE)
  810. z_erofs_put_pcluster(EROFS_I_SB(fe->inode), pcl, false);
  811. fe->pcl = NULL;
  812. }
  813. static int z_erofs_read_fragment(struct super_block *sb, struct folio *folio,
  814. unsigned int cur, unsigned int end, erofs_off_t pos)
  815. {
  816. struct inode *packed_inode = EROFS_SB(sb)->packed_inode;
  817. struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
  818. unsigned int cnt;
  819. u8 *src;
  820. if (!packed_inode)
  821. return -EFSCORRUPTED;
  822. buf.mapping = packed_inode->i_mapping;
  823. for (; cur < end; cur += cnt, pos += cnt) {
  824. cnt = min(end - cur, sb->s_blocksize - erofs_blkoff(sb, pos));
  825. src = erofs_bread(&buf, pos, EROFS_KMAP);
  826. if (IS_ERR(src)) {
  827. erofs_put_metabuf(&buf);
  828. return PTR_ERR(src);
  829. }
  830. memcpy_to_folio(folio, cur, src, cnt);
  831. }
  832. erofs_put_metabuf(&buf);
  833. return 0;
  834. }
  835. static int z_erofs_scan_folio(struct z_erofs_frontend *f,
  836. struct folio *folio, bool ra)
  837. {
  838. struct inode *const inode = f->inode;
  839. struct erofs_map_blocks *const map = &f->map;
  840. const loff_t offset = folio_pos(folio);
  841. const unsigned int bs = i_blocksize(inode);
  842. unsigned int end = folio_size(folio), split = 0, cur, pgs;
  843. bool tight, excl;
  844. int err = 0;
  845. tight = (bs == PAGE_SIZE);
  846. erofs_onlinefolio_init(folio);
  847. do {
  848. if (offset + end - 1 < map->m_la ||
  849. offset + end - 1 >= map->m_la + map->m_llen) {
  850. z_erofs_pcluster_end(f);
  851. map->m_la = offset + end - 1;
  852. map->m_llen = 0;
  853. err = z_erofs_map_blocks_iter(inode, map, 0);
  854. if (err)
  855. break;
  856. }
  857. cur = offset > map->m_la ? 0 : map->m_la - offset;
  858. pgs = round_down(cur, PAGE_SIZE);
  859. /* bump split parts first to avoid several separate cases */
  860. ++split;
  861. if (!(map->m_flags & EROFS_MAP_MAPPED)) {
  862. folio_zero_segment(folio, cur, end);
  863. tight = false;
  864. } else if (map->m_flags & __EROFS_MAP_FRAGMENT) {
  865. erofs_off_t fpos = offset + cur - map->m_la;
  866. err = z_erofs_read_fragment(inode->i_sb, folio, cur,
  867. cur + min(map->m_llen - fpos, end - cur),
  868. EROFS_I(inode)->z_fragmentoff + fpos);
  869. if (err)
  870. break;
  871. tight = false;
  872. } else {
  873. if (!f->pcl) {
  874. err = z_erofs_pcluster_begin(f);
  875. if (err)
  876. break;
  877. f->pcl->besteffort |= !ra;
  878. }
  879. pgs = round_down(end - 1, PAGE_SIZE);
  880. /*
  881. * Ensure this partial page belongs to this submission chain rather
  882. * than other concurrent submission chains or no-I/O (bypass) chains,
  883. * since those chains are handled asynchronously and thus the page
  884. * cannot be used for in-place I/O or bvpage (it must be processed in
  885. * strict order).
  886. */
  887. tight &= (f->mode >= Z_EROFS_PCLUSTER_FOLLOWED);
  888. excl = false;
  889. if (cur <= pgs) {
  890. excl = (split <= 1) || tight;
  891. cur = pgs;
  892. }
  893. err = z_erofs_attach_page(f, &((struct z_erofs_bvec) {
  894. .page = folio_page(folio, pgs >> PAGE_SHIFT),
  895. .offset = offset + pgs - map->m_la,
  896. .end = end - pgs, }), excl);
  897. if (err)
  898. break;
  899. erofs_onlinefolio_split(folio);
  900. if (f->pcl->length < offset + end - map->m_la) {
  901. f->pcl->length = offset + end - map->m_la;
  902. f->pcl->pageofs_out = map->m_la & ~PAGE_MASK;
  903. }
  904. if ((map->m_flags & EROFS_MAP_FULL_MAPPED) &&
  905. !(map->m_flags & EROFS_MAP_PARTIAL_REF) &&
  906. f->pcl->length == map->m_llen)
  907. f->pcl->partial = false;
  908. }
  909. /* shorten the remaining extent to update progress */
  910. map->m_llen = offset + cur - map->m_la;
  911. map->m_flags &= ~EROFS_MAP_FULL_MAPPED;
  912. if (cur <= pgs) {
  913. split = cur < pgs;
  914. tight = (bs == PAGE_SIZE);
  915. }
  916. } while ((end = cur) > 0);
  917. erofs_onlinefolio_end(folio, err, false);
  918. return err;
  919. }
  920. static bool z_erofs_is_sync_decompress(struct erofs_sb_info *sbi,
  921. unsigned int readahead_pages)
  922. {
  923. /* auto: enable for read_folio, disable for readahead */
  924. if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) &&
  925. !readahead_pages)
  926. return true;
  927. if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_FORCE_ON) &&
  928. (readahead_pages <= sbi->opt.max_sync_decompress_pages))
  929. return true;
  930. return false;
  931. }
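/*
 * Policy example: with sync_decompress=auto, a plain read_folio() request
 * (readahead_pages == 0) is decompressed synchronously while readahead is
 * handled asynchronously; force_on additionally decompresses small
 * readahead batches (up to max_sync_decompress_pages) in the caller's
 * context.
 */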
  932. static bool z_erofs_page_is_invalidated(struct page *page)
  933. {
  934. return !page_folio(page)->mapping && !z_erofs_is_shortlived_page(page);
  935. }
  936. struct z_erofs_backend {
  937. struct page *onstack_pages[Z_EROFS_ONSTACK_PAGES];
  938. struct super_block *sb;
  939. struct z_erofs_pcluster *pcl;
  940. /* pages with the longest decompressed length for deduplication */
  941. struct page **decompressed_pages;
  942. /* pages to keep the compressed data */
  943. struct page **compressed_pages;
  944. struct list_head decompressed_secondary_bvecs;
  945. struct page **pagepool;
  946. unsigned int onstack_used, nr_pages;
  947. /* indicate if temporary copies should be preserved for later use */
  948. bool keepxcpy;
  949. };
  950. struct z_erofs_bvec_item {
  951. struct z_erofs_bvec bvec;
  952. struct list_head list;
  953. };
  954. static void z_erofs_do_decompressed_bvec(struct z_erofs_backend *be,
  955. struct z_erofs_bvec *bvec)
  956. {
  957. int poff = bvec->offset + be->pcl->pageofs_out;
  958. struct z_erofs_bvec_item *item;
  959. struct page **page;
  960. if (!(poff & ~PAGE_MASK) && (bvec->end == PAGE_SIZE ||
  961. bvec->offset + bvec->end == be->pcl->length)) {
  962. DBG_BUGON((poff >> PAGE_SHIFT) >= be->nr_pages);
  963. page = be->decompressed_pages + (poff >> PAGE_SHIFT);
  964. if (!*page) {
  965. *page = bvec->page;
  966. return;
  967. }
  968. } else {
  969. be->keepxcpy = true;
  970. }
  971. /* (cold path) one pcluster is requested multiple times */
  972. item = kmalloc(sizeof(*item), GFP_KERNEL | __GFP_NOFAIL);
  973. item->bvec = *bvec;
  974. list_add(&item->list, &be->decompressed_secondary_bvecs);
  975. }
  976. static void z_erofs_fill_other_copies(struct z_erofs_backend *be, int err)
  977. {
  978. unsigned int off0 = be->pcl->pageofs_out;
  979. struct list_head *p, *n;
  980. list_for_each_safe(p, n, &be->decompressed_secondary_bvecs) {
  981. struct z_erofs_bvec_item *bvi;
  982. unsigned int end, cur;
  983. void *dst, *src;
  984. bvi = container_of(p, struct z_erofs_bvec_item, list);
  985. cur = bvi->bvec.offset < 0 ? -bvi->bvec.offset : 0;
  986. end = min_t(unsigned int, be->pcl->length - bvi->bvec.offset,
  987. bvi->bvec.end);
  988. dst = kmap_local_page(bvi->bvec.page);
  989. while (cur < end) {
  990. unsigned int pgnr, scur, len;
  991. pgnr = (bvi->bvec.offset + cur + off0) >> PAGE_SHIFT;
  992. DBG_BUGON(pgnr >= be->nr_pages);
  993. scur = bvi->bvec.offset + cur -
  994. ((pgnr << PAGE_SHIFT) - off0);
  995. len = min_t(unsigned int, end - cur, PAGE_SIZE - scur);
  996. if (!be->decompressed_pages[pgnr]) {
  997. err = -EFSCORRUPTED;
  998. cur += len;
  999. continue;
  1000. }
  1001. src = kmap_local_page(be->decompressed_pages[pgnr]);
  1002. memcpy(dst + cur, src + scur, len);
  1003. kunmap_local(src);
  1004. cur += len;
  1005. }
  1006. kunmap_local(dst);
  1007. erofs_onlinefolio_end(page_folio(bvi->bvec.page), err, true);
  1008. list_del(p);
  1009. kfree(bvi);
  1010. }
  1011. }
  1012. static void z_erofs_parse_out_bvecs(struct z_erofs_backend *be)
  1013. {
  1014. struct z_erofs_pcluster *pcl = be->pcl;
  1015. struct z_erofs_bvec_iter biter;
  1016. struct page *old_bvpage;
  1017. int i;
  1018. z_erofs_bvec_iter_begin(&biter, &pcl->bvset, Z_EROFS_INLINE_BVECS, 0);
  1019. for (i = 0; i < pcl->vcnt; ++i) {
  1020. struct z_erofs_bvec bvec;
  1021. z_erofs_bvec_dequeue(&biter, &bvec, &old_bvpage);
  1022. if (old_bvpage)
  1023. z_erofs_put_shortlivedpage(be->pagepool, old_bvpage);
  1024. DBG_BUGON(z_erofs_page_is_invalidated(bvec.page));
  1025. z_erofs_do_decompressed_bvec(be, &bvec);
  1026. }
  1027. old_bvpage = z_erofs_bvec_iter_end(&biter);
  1028. if (old_bvpage)
  1029. z_erofs_put_shortlivedpage(be->pagepool, old_bvpage);
  1030. }
  1031. static int z_erofs_parse_in_bvecs(struct z_erofs_backend *be, bool *overlapped)
  1032. {
  1033. struct z_erofs_pcluster *pcl = be->pcl;
  1034. unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
  1035. int i, err = 0;
  1036. *overlapped = false;
  1037. for (i = 0; i < pclusterpages; ++i) {
  1038. struct z_erofs_bvec *bvec = &pcl->compressed_bvecs[i];
  1039. struct page *page = bvec->page;
  1040. /* compressed data ought to be valid when decompressing */
  1041. if (IS_ERR(page) || !page) {
  1042. bvec->page = NULL; /* clear the failure reason */
  1043. err = page ? PTR_ERR(page) : -EIO;
  1044. continue;
  1045. }
  1046. be->compressed_pages[i] = page;
  1047. if (z_erofs_is_inline_pcluster(pcl) ||
  1048. erofs_folio_is_managed(EROFS_SB(be->sb), page_folio(page))) {
  1049. if (!PageUptodate(page))
  1050. err = -EIO;
  1051. continue;
  1052. }
  1053. DBG_BUGON(z_erofs_page_is_invalidated(page));
  1054. if (z_erofs_is_shortlived_page(page))
  1055. continue;
  1056. z_erofs_do_decompressed_bvec(be, bvec);
  1057. *overlapped = true;
  1058. }
  1059. return err;
  1060. }
  1061. static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err)
  1062. {
  1063. struct erofs_sb_info *const sbi = EROFS_SB(be->sb);
  1064. struct z_erofs_pcluster *pcl = be->pcl;
  1065. unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
  1066. const struct z_erofs_decompressor *decomp =
  1067. z_erofs_decomp[pcl->algorithmformat];
  1068. int i, j, jtop, err2;
  1069. struct page *page;
  1070. bool overlapped;
  1071. bool try_free = true;
  1072. mutex_lock(&pcl->lock);
  1073. be->nr_pages = PAGE_ALIGN(pcl->length + pcl->pageofs_out) >> PAGE_SHIFT;
  1074. /* allocate (de)compressed page arrays if they cannot be kept on stack */
  1075. be->decompressed_pages = NULL;
  1076. be->compressed_pages = NULL;
  1077. be->onstack_used = 0;
  1078. if (be->nr_pages <= Z_EROFS_ONSTACK_PAGES) {
  1079. be->decompressed_pages = be->onstack_pages;
  1080. be->onstack_used = be->nr_pages;
  1081. memset(be->decompressed_pages, 0,
  1082. sizeof(struct page *) * be->nr_pages);
  1083. }
  1084. if (pclusterpages + be->onstack_used <= Z_EROFS_ONSTACK_PAGES)
  1085. be->compressed_pages = be->onstack_pages + be->onstack_used;
  1086. if (!be->decompressed_pages)
  1087. be->decompressed_pages =
  1088. kvcalloc(be->nr_pages, sizeof(struct page *),
  1089. GFP_KERNEL | __GFP_NOFAIL);
  1090. if (!be->compressed_pages)
  1091. be->compressed_pages =
  1092. kvcalloc(pclusterpages, sizeof(struct page *),
  1093. GFP_KERNEL | __GFP_NOFAIL);
  1094. z_erofs_parse_out_bvecs(be);
  1095. err2 = z_erofs_parse_in_bvecs(be, &overlapped);
  1096. if (err2)
  1097. err = err2;
  1098. if (!err)
  1099. err = decomp->decompress(&(struct z_erofs_decompress_req) {
  1100. .sb = be->sb,
  1101. .in = be->compressed_pages,
  1102. .out = be->decompressed_pages,
  1103. .pageofs_in = pcl->pageofs_in,
  1104. .pageofs_out = pcl->pageofs_out,
  1105. .inputsize = pcl->pclustersize,
  1106. .outputsize = pcl->length,
  1107. .alg = pcl->algorithmformat,
  1108. .inplace_io = overlapped,
  1109. .partial_decoding = pcl->partial,
  1110. .fillgaps = be->keepxcpy,
  1111. .gfp = pcl->besteffort ? GFP_KERNEL :
  1112. GFP_NOWAIT | __GFP_NORETRY
  1113. }, be->pagepool);
  1114. /* must handle all compressed pages before actual file pages */
  1115. if (z_erofs_is_inline_pcluster(pcl)) {
  1116. page = pcl->compressed_bvecs[0].page;
  1117. WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL);
  1118. put_page(page);
  1119. } else {
  1120. /* managed folios are still left in compressed_bvecs[] */
  1121. for (i = 0; i < pclusterpages; ++i) {
  1122. page = be->compressed_pages[i];
  1123. if (!page)
  1124. continue;
  1125. if (erofs_folio_is_managed(sbi, page_folio(page))) {
  1126. try_free = false;
  1127. continue;
  1128. }
  1129. (void)z_erofs_put_shortlivedpage(be->pagepool, page);
  1130. WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
  1131. }
  1132. }
  1133. if (be->compressed_pages < be->onstack_pages ||
  1134. be->compressed_pages >= be->onstack_pages + Z_EROFS_ONSTACK_PAGES)
  1135. kvfree(be->compressed_pages);
  1136. jtop = 0;
  1137. z_erofs_fill_other_copies(be, err);
  1138. for (i = 0; i < be->nr_pages; ++i) {
  1139. page = be->decompressed_pages[i];
  1140. if (!page)
  1141. continue;
  1142. DBG_BUGON(z_erofs_page_is_invalidated(page));
  1143. if (!z_erofs_is_shortlived_page(page)) {
  1144. erofs_onlinefolio_end(page_folio(page), err, true);
  1145. continue;
  1146. }
  1147. if (pcl->algorithmformat != Z_EROFS_COMPRESSION_LZ4) {
  1148. erofs_pagepool_add(be->pagepool, page);
  1149. continue;
  1150. }
  1151. for (j = 0; j < jtop && be->decompressed_pages[j] != page; ++j)
  1152. ;
  1153. if (j >= jtop) /* this bounce page is newly detected */
  1154. be->decompressed_pages[jtop++] = page;
  1155. }
  1156. while (jtop)
  1157. erofs_pagepool_add(be->pagepool,
  1158. be->decompressed_pages[--jtop]);
  1159. if (be->decompressed_pages != be->onstack_pages)
  1160. kvfree(be->decompressed_pages);
  1161. pcl->length = 0;
  1162. pcl->partial = true;
  1163. pcl->besteffort = false;
  1164. pcl->bvset.nextpage = NULL;
  1165. pcl->vcnt = 0;
  1166. /* pcluster lock MUST be taken before the following line */
  1167. WRITE_ONCE(pcl->next, NULL);
  1168. mutex_unlock(&pcl->lock);
  1169. if (z_erofs_is_inline_pcluster(pcl))
  1170. z_erofs_free_pcluster(pcl);
  1171. else
  1172. z_erofs_put_pcluster(sbi, pcl, try_free);
  1173. return err;
  1174. }
  1175. static int z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
  1176. struct page **pagepool)
  1177. {
  1178. struct z_erofs_backend be = {
  1179. .sb = io->sb,
  1180. .pagepool = pagepool,
  1181. .decompressed_secondary_bvecs =
  1182. LIST_HEAD_INIT(be.decompressed_secondary_bvecs),
  1183. .pcl = io->head,
  1184. };
  1185. struct z_erofs_pcluster *next;
  1186. int err = io->eio ? -EIO : 0;
  1187. for (; be.pcl != Z_EROFS_PCLUSTER_TAIL; be.pcl = next) {
  1188. DBG_BUGON(!be.pcl);
  1189. next = READ_ONCE(be.pcl->next);
  1190. err = z_erofs_decompress_pcluster(&be, err) ?: err;
  1191. }
  1192. return err;
  1193. }
  1194. static void z_erofs_decompressqueue_work(struct work_struct *work)
  1195. {
  1196. struct z_erofs_decompressqueue *bgq =
  1197. container_of(work, struct z_erofs_decompressqueue, u.work);
  1198. struct page *pagepool = NULL;
  1199. DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL);
  1200. z_erofs_decompress_queue(bgq, &pagepool);
  1201. erofs_release_pages(&pagepool);
  1202. kvfree(bgq);
  1203. }
  1204. #ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
  1205. static void z_erofs_decompressqueue_kthread_work(struct kthread_work *work)
  1206. {
  1207. z_erofs_decompressqueue_work((struct work_struct *)work);
  1208. }
  1209. #endif
  1210. /* Use (kthread_)work in atomic contexts to minimize scheduling overhead */
  1211. static inline bool z_erofs_in_atomic(void)
  1212. {
  1213. if (IS_ENABLED(CONFIG_PREEMPTION) && rcu_preempt_depth())
  1214. return true;
  1215. if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
  1216. return true;
  1217. return !preemptible();
  1218. }
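/*
 * In other words: without CONFIG_PREEMPT_COUNT the kernel cannot reliably
 * tell whether this completion context is atomic, so it is conservatively
 * treated as such and work is deferred to a workqueue/kthread; with
 * preemption enabled, being inside an RCU read-side section alone also
 * counts as atomic.
 */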
  1219. static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
  1220. int bios)
  1221. {
  1222. struct erofs_sb_info *const sbi = EROFS_SB(io->sb);
  1223. /* wake up the caller thread for sync decompression */
  1224. if (io->sync) {
  1225. if (!atomic_add_return(bios, &io->pending_bios))
  1226. complete(&io->u.done);
  1227. return;
  1228. }
  1229. if (atomic_add_return(bios, &io->pending_bios))
  1230. return;
  1231. if (z_erofs_in_atomic()) {
  1232. #ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
  1233. struct kthread_worker *worker;
  1234. rcu_read_lock();
  1235. worker = rcu_dereference(
  1236. z_erofs_pcpu_workers[raw_smp_processor_id()]);
  1237. if (!worker) {
  1238. INIT_WORK(&io->u.work, z_erofs_decompressqueue_work);
  1239. queue_work(z_erofs_workqueue, &io->u.work);
  1240. } else {
  1241. kthread_queue_work(worker, &io->u.kthread_work);
  1242. }
  1243. rcu_read_unlock();
  1244. #else
  1245. queue_work(z_erofs_workqueue, &io->u.work);
  1246. #endif
  1247. /* enable sync decompression for readahead */
  1248. if (sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO)
  1249. sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_FORCE_ON;
  1250. return;
  1251. }
  1252. z_erofs_decompressqueue_work(&io->u.work);
  1253. }
  1254. static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
  1255. struct z_erofs_frontend *f,
  1256. struct z_erofs_pcluster *pcl,
  1257. unsigned int nr,
  1258. struct address_space *mc)
  1259. {
  1260. gfp_t gfp = mapping_gfp_mask(mc);
  1261. bool tocache = false;
  1262. struct z_erofs_bvec zbv;
  1263. struct address_space *mapping;
  1264. struct folio *folio;
  1265. struct page *page;
  1266. int bs = i_blocksize(f->inode);
  1267. /* Except for inplace folios, the entire folio can be used for I/Os */
  1268. bvec->bv_offset = 0;
  1269. bvec->bv_len = PAGE_SIZE;
  1270. repeat:
  1271. spin_lock(&pcl->lockref.lock);
  1272. zbv = pcl->compressed_bvecs[nr];
  1273. spin_unlock(&pcl->lockref.lock);
  1274. if (!zbv.page)
  1275. goto out_allocfolio;
  1276. bvec->bv_page = zbv.page;
  1277. DBG_BUGON(z_erofs_is_shortlived_page(bvec->bv_page));
  1278. folio = page_folio(zbv.page);
  1279. /*
  1280. * Handle preallocated cached folios. We tried to allocate such folios
  1281. * without triggering direct reclaim. If allocation failed, inplace
  1282. * file-backed folios will be used instead.
  1283. */
  1284. if (folio->private == (void *)Z_EROFS_PREALLOCATED_PAGE) {
  1285. tocache = true;
  1286. goto out_tocache;
  1287. }
  1288. mapping = READ_ONCE(folio->mapping);
  1289. /*
  1290. * File-backed folios for in-place I/O are all kept locked here,
  1291. * therefore it is impossible for `mapping` to be NULL.
  1292. */
  1293. if (mapping && mapping != mc) {
  1294. if (zbv.offset < 0)
  1295. bvec->bv_offset = round_up(-zbv.offset, bs);
  1296. bvec->bv_len = round_up(zbv.end, bs) - bvec->bv_offset;
  1297. return;
  1298. }
  1299. folio_lock(folio);
  1300. if (likely(folio->mapping == mc)) {
  1301. /*
  1302. * The cached folio is still in managed cache but without
  1303. * a valid `->private` pcluster hint. Let's reconnect them.
  1304. */
  1305. if (!folio_test_private(folio)) {
  1306. folio_attach_private(folio, pcl);
  1307. /* compressed_bvecs[] already takes a ref before */
  1308. folio_put(folio);
  1309. }
  1310. if (likely(folio->private == pcl)) {
  1311. /* don't submit cache I/Os again if already uptodate */
  1312. if (folio_test_uptodate(folio)) {
  1313. folio_unlock(folio);
  1314. bvec->bv_page = NULL;
  1315. }
  1316. return;
  1317. }
  1318. /*
  1319. * Already linked with another pcluster, which only appears in
  1320. * crafted images by fuzzers for now. But handle this anyway.
  1321. */
  1322. tocache = false; /* use temporary short-lived pages */
  1323. } else {
  1324. DBG_BUGON(1); /* referenced managed folios can't be truncated */
  1325. tocache = true;
  1326. }
  1327. folio_unlock(folio);
  1328. folio_put(folio);
  1329. out_allocfolio:
  1330. page = __erofs_allocpage(&f->pagepool, gfp, true);
  1331. spin_lock(&pcl->lockref.lock);
  1332. if (unlikely(pcl->compressed_bvecs[nr].page != zbv.page)) {
  1333. if (page)
  1334. erofs_pagepool_add(&f->pagepool, page);
  1335. spin_unlock(&pcl->lockref.lock);
  1336. cond_resched();
  1337. goto repeat;
  1338. }
  1339. pcl->compressed_bvecs[nr].page = page ? page : ERR_PTR(-ENOMEM);
  1340. spin_unlock(&pcl->lockref.lock);
  1341. bvec->bv_page = page;
  1342. if (!page)
  1343. return;
  1344. folio = page_folio(page);
  1345. out_tocache:
  1346. if (!tocache || bs != PAGE_SIZE ||
  1347. filemap_add_folio(mc, folio, pcl->index + nr, gfp)) {
  1348. /* turn into a temporary shortlived folio (1 ref) */
  1349. folio->private = (void *)Z_EROFS_SHORTLIVED_PAGE;
  1350. return;
  1351. }
  1352. folio_attach_private(folio, pcl);
  1353. /* drop a refcount added by allocpage (then 2 refs in total here) */
  1354. folio_put(folio);
  1355. }
static struct z_erofs_decompressqueue *jobqueue_init(struct super_block *sb,
                              struct z_erofs_decompressqueue *fgq, bool *fg)
{
        struct z_erofs_decompressqueue *q;

        if (fg && !*fg) {
                q = kvzalloc(sizeof(*q), GFP_KERNEL | __GFP_NOWARN);
                if (!q) {
                        *fg = true;
                        goto fg_out;
                }
#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
                kthread_init_work(&q->u.kthread_work,
                                  z_erofs_decompressqueue_kthread_work);
#else
                INIT_WORK(&q->u.work, z_erofs_decompressqueue_work);
#endif
        } else {
fg_out:
                q = fgq;
                init_completion(&fgq->u.done);
                atomic_set(&fgq->pending_bios, 0);
                q->eio = false;
                q->sync = true;
        }
        q->sb = sb;
        q->head = Z_EROFS_PCLUSTER_TAIL;
        return q;
}

/* define decompression jobqueue types */
enum {
        JQ_BYPASS,
        JQ_SUBMIT,
        NR_JOBQUEUES,
};
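
/*
 * Detach a pcluster that needs no device read from the submission chain
 * and append it to the bypass queue, so it can be decompressed right away.
 */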
static void z_erofs_move_to_bypass_queue(struct z_erofs_pcluster *pcl,
                                         struct z_erofs_pcluster *next,
                                         struct z_erofs_pcluster **qtail[])
{
        WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL);
        WRITE_ONCE(*qtail[JQ_SUBMIT], next);
        WRITE_ONCE(*qtail[JQ_BYPASS], pcl);
        qtail[JQ_BYPASS] = &pcl->next;
}
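
/*
 * Read bio completion handler: mark managed cache folios uptodate on
 * success and unlock them either way, record any I/O error in the queue,
 * and notify the decompression queue that one more bio has completed.
 */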
static void z_erofs_endio(struct bio *bio)
{
        struct z_erofs_decompressqueue *q = bio->bi_private;
        blk_status_t err = bio->bi_status;
        struct folio_iter fi;

        bio_for_each_folio_all(fi, bio) {
                struct folio *folio = fi.folio;

                DBG_BUGON(folio_test_uptodate(folio));
                DBG_BUGON(z_erofs_page_is_invalidated(&folio->page));
                if (!erofs_folio_is_managed(EROFS_SB(q->sb), folio))
                        continue;

                if (!err)
                        folio_mark_uptodate(folio);
                folio_unlock(folio);
        }
        if (err)
                q->eio = true;
        z_erofs_decompress_kickoff(q, -1);
        if (bio->bi_bdev)
                bio_put(bio);
}
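
/*
 * Walk the pcluster chain starting at f->head: build and submit read bios
 * for pclusters that require device I/O, and move inline pclusters (or
 * those whose compressed data is already cached uptodate) to the bypass
 * queue instead.
 */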
static void z_erofs_submit_queue(struct z_erofs_frontend *f,
                                 struct z_erofs_decompressqueue *fgq,
                                 bool *force_fg, bool readahead)
{
        struct super_block *sb = f->inode->i_sb;
        struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb));
        struct z_erofs_pcluster **qtail[NR_JOBQUEUES];
        struct z_erofs_decompressqueue *q[NR_JOBQUEUES];
        struct z_erofs_pcluster *pcl, *next;
        /* bio is NULL initially, so no need to initialize last_{index,bdev} */
        erofs_off_t last_pa;
        unsigned int nr_bios = 0;
        struct bio *bio = NULL;
        unsigned long pflags;
        int memstall = 0;

        /* No need to read from device for pclusters in the bypass queue. */
        q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL);
        q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, force_fg);

        qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head;
        qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head;

        /* by default, all need io submission */
        q[JQ_SUBMIT]->head = next = f->head;

        do {
                struct erofs_map_dev mdev;
                erofs_off_t cur, end;
                struct bio_vec bvec;
                unsigned int i = 0;
                bool bypass = true;

                pcl = next;
                next = READ_ONCE(pcl->next);

                if (z_erofs_is_inline_pcluster(pcl)) {
                        z_erofs_move_to_bypass_queue(pcl, next, qtail);
                        continue;
                }

                /* no device id here, thus it will always succeed */
                mdev = (struct erofs_map_dev) {
                        .m_pa = erofs_pos(sb, pcl->index),
                };
                (void)erofs_map_dev(sb, &mdev);

                cur = mdev.m_pa;
                end = cur + pcl->pclustersize;
                do {
                        bvec.bv_page = NULL;
                        if (bio && (cur != last_pa ||
                                    bio->bi_bdev != mdev.m_bdev)) {
drain_io:
                                if (erofs_is_fileio_mode(EROFS_SB(sb)))
                                        erofs_fileio_submit_bio(bio);
                                else if (erofs_is_fscache_mode(sb))
                                        erofs_fscache_submit_bio(bio);
                                else
                                        submit_bio(bio);

                                if (memstall) {
                                        psi_memstall_leave(&pflags);
                                        memstall = 0;
                                }
                                bio = NULL;
                        }

                        if (!bvec.bv_page) {
                                z_erofs_fill_bio_vec(&bvec, f, pcl, i++, mc);
                                if (!bvec.bv_page)
                                        continue;
                                if (cur + bvec.bv_len > end)
                                        bvec.bv_len = end - cur;
                                DBG_BUGON(bvec.bv_len < sb->s_blocksize);
                        }

                        if (unlikely(PageWorkingset(bvec.bv_page)) &&
                            !memstall) {
                                psi_memstall_enter(&pflags);
                                memstall = 1;
                        }

                        if (!bio) {
                                if (erofs_is_fileio_mode(EROFS_SB(sb)))
                                        bio = erofs_fileio_bio_alloc(&mdev);
                                else if (erofs_is_fscache_mode(sb))
                                        bio = erofs_fscache_bio_alloc(&mdev);
                                else
                                        bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS,
                                                        REQ_OP_READ, GFP_NOIO);
                                bio->bi_end_io = z_erofs_endio;
                                bio->bi_iter.bi_sector = cur >> 9;
                                bio->bi_private = q[JQ_SUBMIT];
                                if (readahead)
                                        bio->bi_opf |= REQ_RAHEAD;
                                ++nr_bios;
                        }

                        if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len,
                                          bvec.bv_offset))
                                goto drain_io;
                        last_pa = cur + bvec.bv_len;
                        bypass = false;
                } while ((cur += bvec.bv_len) < end);

                if (!bypass)
                        qtail[JQ_SUBMIT] = &pcl->next;
                else
                        z_erofs_move_to_bypass_queue(pcl, next, qtail);
        } while (next != Z_EROFS_PCLUSTER_TAIL);

        if (bio) {
                if (erofs_is_fileio_mode(EROFS_SB(sb)))
                        erofs_fileio_submit_bio(bio);
                else if (erofs_is_fscache_mode(sb))
                        erofs_fscache_submit_bio(bio);
                else
                        submit_bio(bio);
        }
        if (memstall)
                psi_memstall_leave(&pflags);

        /*
         * Although background decompression is preferred, nothing is
         * actually pending for submission here; don't kick off
         * decompression, just free the submit queue directly instead.
         */
        if (!*force_fg && !nr_bios) {
                kvfree(q[JQ_SUBMIT]);
                return;
        }
        z_erofs_decompress_kickoff(q[JQ_SUBMIT], nr_bios);
}
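
/*
 * Submit all collected pclusters and run decompression: the bypass queue
 * (no read I/O needed) is decompressed immediately in the caller context;
 * the submit queue is decompressed in the caller context after its bios
 * complete when synchronous decompression is chosen, or handed off to the
 * background workers otherwise.
 */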
static int z_erofs_runqueue(struct z_erofs_frontend *f, unsigned int rapages)
{
        struct z_erofs_decompressqueue io[NR_JOBQUEUES];
        struct erofs_sb_info *sbi = EROFS_I_SB(f->inode);
        bool force_fg = z_erofs_is_sync_decompress(sbi, rapages);
        int err;

        if (f->head == Z_EROFS_PCLUSTER_TAIL)
                return 0;
        z_erofs_submit_queue(f, io, &force_fg, !!rapages);

        /* handle bypass queue (no i/o pclusters) immediately */
        err = z_erofs_decompress_queue(&io[JQ_BYPASS], &f->pagepool);
        if (!force_fg)
                return err;

        /* wait until all bios are completed */
        wait_for_completion_io(&io[JQ_SUBMIT].u.done);

        /* handle synchronous decompress queue in the caller context */
        return z_erofs_decompress_queue(&io[JQ_SUBMIT], &f->pagepool) ?: err;
}

/*
 * Since partial uptodate is still unimplemented, we have to use
 * approximate readmore strategies for now.
 */
static void z_erofs_pcluster_readmore(struct z_erofs_frontend *f,
                struct readahead_control *rac, bool backmost)
{
        struct inode *inode = f->inode;
        struct erofs_map_blocks *map = &f->map;
        erofs_off_t cur, end, headoffset = f->headoffset;
        int err;

        if (backmost) {
                if (rac)
                        end = headoffset + readahead_length(rac) - 1;
                else
                        end = headoffset + PAGE_SIZE - 1;
                map->m_la = end;
                err = z_erofs_map_blocks_iter(inode, map,
                                              EROFS_GET_BLOCKS_READMORE);
                if (err)
                        return;

                /* expand ra for the trailing edge if readahead */
                if (rac) {
                        cur = round_up(map->m_la + map->m_llen, PAGE_SIZE);
                        readahead_expand(rac, headoffset, cur - headoffset);
                        return;
                }
                end = round_up(end, PAGE_SIZE);
        } else {
                end = round_up(map->m_la, PAGE_SIZE);
                if (!map->m_llen)
                        return;
        }

        cur = map->m_la + map->m_llen - 1;
        while ((cur >= end) && (cur < i_size_read(inode))) {
                pgoff_t index = cur >> PAGE_SHIFT;
                struct folio *folio;

                folio = erofs_grab_folio_nowait(inode->i_mapping, index);
                if (!IS_ERR_OR_NULL(folio)) {
                        if (folio_test_uptodate(folio))
                                folio_unlock(folio);
                        else
                                z_erofs_scan_folio(f, folio, !!rac);
                        folio_put(folio);
                }

                if (cur < PAGE_SIZE)
                        break;
                cur = (index << PAGE_SHIFT) - 1;
        }
}
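
/*
 * ->read_folio() entry point: scan the requested folio together with its
 * readmore neighbors, then submit and decompress the collected pclusters.
 */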
static int z_erofs_read_folio(struct file *file, struct folio *folio)
{
        struct inode *const inode = folio->mapping->host;
        Z_EROFS_DEFINE_FRONTEND(f, inode, folio_pos(folio));
        int err;

        trace_erofs_read_folio(folio, false);
        z_erofs_pcluster_readmore(&f, NULL, true);
        err = z_erofs_scan_folio(&f, folio, false);
        z_erofs_pcluster_readmore(&f, NULL, false);
        z_erofs_pcluster_end(&f);

        /* if some pclusters are ready, submit them anyway */
        err = z_erofs_runqueue(&f, 0) ?: err;
        if (err && err != -EINTR)
                erofs_err(inode->i_sb, "read error %d @ %lu of nid %llu",
                          err, folio->index, EROFS_I(inode)->nid);

        erofs_put_metabuf(&f.map.buf);
        erofs_release_pages(&f.pagepool);
        return err;
}
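
/*
 * ->readahead() entry point: chain up all readahead folios and scan them
 * in reverse order for better metadata I/O locality before submitting the
 * queues.
 */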
static void z_erofs_readahead(struct readahead_control *rac)
{
        struct inode *const inode = rac->mapping->host;
        Z_EROFS_DEFINE_FRONTEND(f, inode, readahead_pos(rac));
        unsigned int nrpages = readahead_count(rac);
        struct folio *head = NULL, *folio;
        int err;

        trace_erofs_readahead(inode, readahead_index(rac), nrpages, false);

        z_erofs_pcluster_readmore(&f, rac, true);
        while ((folio = readahead_folio(rac))) {
                folio->private = head;
                head = folio;
        }

        /* traverse in reverse order for best metadata I/O performance */
        while (head) {
                folio = head;
                head = folio_get_private(folio);

                err = z_erofs_scan_folio(&f, folio, true);
                if (err && err != -EINTR)
                        erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu",
                                  folio->index, EROFS_I(inode)->nid);
        }
        z_erofs_pcluster_readmore(&f, rac, false);
        z_erofs_pcluster_end(&f);

        (void)z_erofs_runqueue(&f, nrpages);
        erofs_put_metabuf(&f.map.buf);
        erofs_release_pages(&f.pagepool);
}
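
/* address_space operations for compressed (z_erofs) inodes */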
const struct address_space_operations z_erofs_aops = {
        .read_folio = z_erofs_read_folio,
        .readahead = z_erofs_readahead,
};