/* fs/xfs/xfs_icache.c — incore inode cache management */
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Copyright (c) 2000-2005 Silicon Graphics, Inc.
  4. * All Rights Reserved.
  5. */
  6. #include "xfs.h"
  7. #include "xfs_fs.h"
  8. #include "xfs_shared.h"
  9. #include "xfs_format.h"
  10. #include "xfs_log_format.h"
  11. #include "xfs_trans_resv.h"
  12. #include "xfs_mount.h"
  13. #include "xfs_inode.h"
  14. #include "xfs_trans.h"
  15. #include "xfs_trans_priv.h"
  16. #include "xfs_inode_item.h"
  17. #include "xfs_quota.h"
  18. #include "xfs_trace.h"
  19. #include "xfs_icache.h"
  20. #include "xfs_bmap_util.h"
  21. #include "xfs_dquot_item.h"
  22. #include "xfs_dquot.h"
  23. #include "xfs_reflink.h"
  24. #include "xfs_ialloc.h"
  25. #include "xfs_ag.h"
  26. #include "xfs_log_priv.h"
  27. #include "xfs_health.h"
  28. #include <linux/iversion.h>
/* Radix tree tags for incore inode tree. */

/* inode is to be reclaimed */
#define XFS_ICI_RECLAIM_TAG	0
/* Inode has speculative preallocations (posteof or cow) to clean. */
#define XFS_ICI_BLOCKGC_TAG	1

/*
 * The goal for walking incore inodes. These can correspond with incore inode
 * radix tree tags when convenient. Avoid existing XFS_IWALK namespace.
 */
enum xfs_icwalk_goal {
	/* Goals directly associated with tagged inodes. */
	XFS_ICWALK_BLOCKGC	= XFS_ICI_BLOCKGC_TAG,
	XFS_ICWALK_RECLAIM	= XFS_ICI_RECLAIM_TAG,
};

static int xfs_icwalk(struct xfs_mount *mp,
		enum xfs_icwalk_goal goal, struct xfs_icwalk *icw);
static int xfs_icwalk_ag(struct xfs_perag *pag,
		enum xfs_icwalk_goal goal, struct xfs_icwalk *icw);

/*
 * Private inode cache walk flags for struct xfs_icwalk. Must not
 * coincide with XFS_ICWALK_FLAGS_VALID.
 */

/* Stop scanning after icw_scan_limit inodes. */
#define XFS_ICWALK_FLAG_SCAN_LIMIT	(1U << 28)

#define XFS_ICWALK_FLAG_RECLAIM_SICK	(1U << 27)
#define XFS_ICWALK_FLAG_UNION		(1U << 26) /* union filter algorithm */

#define XFS_ICWALK_PRIVATE_FLAGS	(XFS_ICWALK_FLAG_SCAN_LIMIT | \
					 XFS_ICWALK_FLAG_RECLAIM_SICK | \
					 XFS_ICWALK_FLAG_UNION)

/* Marks for the perag xarray, mirroring the per-AG radix tree tags above. */
#define XFS_PERAG_RECLAIM_MARK	XA_MARK_0
#define XFS_PERAG_BLOCKGC_MARK	XA_MARK_1
  61. static inline xa_mark_t ici_tag_to_mark(unsigned int tag)
  62. {
  63. if (tag == XFS_ICI_RECLAIM_TAG)
  64. return XFS_PERAG_RECLAIM_MARK;
  65. ASSERT(tag == XFS_ICI_BLOCKGC_TAG);
  66. return XFS_PERAG_BLOCKGC_MARK;
  67. }
/*
 * Allocate and initialise an xfs_inode.
 *
 * Returns NULL only if the VFS portion of the inode fails to initialise;
 * the slab allocation itself cannot fail because of __GFP_NOFAIL.
 */
struct xfs_inode *
xfs_inode_alloc(
	struct xfs_mount	*mp,
	xfs_ino_t		ino)
{
	struct xfs_inode	*ip;

	/*
	 * XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL
	 * and return NULL here on ENOMEM.
	 */
	ip = alloc_inode_sb(mp->m_super, xfs_inode_cache, GFP_KERNEL | __GFP_NOFAIL);

	if (inode_init_always(mp->m_super, VFS_I(ip))) {
		kmem_cache_free(xfs_inode_cache, ip);
		return NULL;
	}

	/* VFS doesn't initialise i_mode! */
	VFS_I(ip)->i_mode = 0;
	mapping_set_folio_min_order(VFS_I(ip)->i_mapping,
			M_IGEO(mp)->min_folio_order);

	XFS_STATS_INC(mp, vn_active);
	/* Slab constructor state: must come back unpinned and anonymous. */
	ASSERT(atomic_read(&ip->i_pincount) == 0);
	ASSERT(ip->i_ino == 0);

	/* initialise the xfs inode */
	ip->i_ino = ino;
	ip->i_mount = mp;
	memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
	ip->i_cowfp = NULL;
	memset(&ip->i_af, 0, sizeof(ip->i_af));
	ip->i_af.if_format = XFS_DINODE_FMT_EXTENTS;
	memset(&ip->i_df, 0, sizeof(ip->i_df));
	ip->i_flags = 0;
	ip->i_delayed_blks = 0;
	/* Inherit the mount's default on-disk flags2 for new inodes. */
	ip->i_diflags2 = mp->m_ino_geo.new_diflags2;
	ip->i_nblocks = 0;
	ip->i_forkoff = 0;
	ip->i_sick = 0;
	ip->i_checked = 0;
	INIT_WORK(&ip->i_ioend_work, xfs_end_io);
	INIT_LIST_HEAD(&ip->i_ioend_list);
	spin_lock_init(&ip->i_ioend_lock);
	ip->i_next_unlinked = NULLAGINO;
	ip->i_prev_unlinked = 0;

	return ip;
}
/*
 * RCU callback that performs the actual freeing of an xfs_inode after the
 * grace period has expired: tear down any remaining fork and log item state
 * and return the inode to the slab cache.
 */
STATIC void
xfs_inode_free_callback(
	struct rcu_head		*head)
{
	struct inode		*inode = container_of(head, struct inode, i_rcu);
	struct xfs_inode	*ip = XFS_I(inode);

	/* Only these types carry data fork state that needs destroying. */
	switch (VFS_I(ip)->i_mode & S_IFMT) {
	case S_IFREG:
	case S_IFDIR:
	case S_IFLNK:
		xfs_idestroy_fork(&ip->i_df);
		break;
	}

	xfs_ifork_zap_attr(ip);

	/* The CoW fork is allocated separately, so free it if present. */
	if (ip->i_cowfp) {
		xfs_idestroy_fork(ip->i_cowfp);
		kmem_cache_free(xfs_ifork_cache, ip->i_cowfp);
	}
	if (ip->i_itemp) {
		/* The inode log item must be off the AIL by now. */
		ASSERT(!test_bit(XFS_LI_IN_AIL,
				 &ip->i_itemp->ili_item.li_flags));
		xfs_inode_item_destroy(ip);
		ip->i_itemp = NULL;
	}

	kmem_cache_free(xfs_inode_cache, ip);
}
  141. static void
  142. __xfs_inode_free(
  143. struct xfs_inode *ip)
  144. {
  145. /* asserts to verify all state is correct here */
  146. ASSERT(atomic_read(&ip->i_pincount) == 0);
  147. ASSERT(!ip->i_itemp || list_empty(&ip->i_itemp->ili_item.li_bio_list));
  148. XFS_STATS_DEC(ip->i_mount, vn_active);
  149. call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
  150. }
/*
 * Mark an inode as being reclaimed and hand it to RCU for freeing.  Used
 * for inodes that never made it into (or are leaving) the inode cache.
 */
void
xfs_inode_free(
	struct xfs_inode	*ip)
{
	ASSERT(!xfs_iflags_test(ip, XFS_IFLUSHING));

	/*
	 * Because we use RCU freeing we need to ensure the inode always
	 * appears to be reclaimed with an invalid inode number when in the
	 * free state. The ip->i_flags_lock provides the barrier against lookup
	 * races.
	 */
	spin_lock(&ip->i_flags_lock);
	ip->i_flags = XFS_IRECLAIM;
	ip->i_ino = 0;
	spin_unlock(&ip->i_flags_lock);

	__xfs_inode_free(ip);
}
  168. /*
  169. * Queue background inode reclaim work if there are reclaimable inodes and there
  170. * isn't reclaim work already scheduled or in progress.
  171. */
  172. static void
  173. xfs_reclaim_work_queue(
  174. struct xfs_mount *mp)
  175. {
  176. rcu_read_lock();
  177. if (xa_marked(&mp->m_perags, XFS_PERAG_RECLAIM_MARK)) {
  178. queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
  179. msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
  180. }
  181. rcu_read_unlock();
  182. }
  183. /*
  184. * Background scanning to trim preallocated space. This is queued based on the
  185. * 'speculative_prealloc_lifetime' tunable (5m by default).
  186. */
  187. static inline void
  188. xfs_blockgc_queue(
  189. struct xfs_perag *pag)
  190. {
  191. struct xfs_mount *mp = pag->pag_mount;
  192. if (!xfs_is_blockgc_enabled(mp))
  193. return;
  194. rcu_read_lock();
  195. if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG))
  196. queue_delayed_work(pag->pag_mount->m_blockgc_wq,
  197. &pag->pag_blockgc_work,
  198. msecs_to_jiffies(xfs_blockgc_secs * 1000));
  199. rcu_read_unlock();
  200. }
/* Set a tag on both the AG incore inode tree and the AG radix tree. */
static void
xfs_perag_set_inode_tag(
	struct xfs_perag	*pag,
	xfs_agino_t		agino,
	unsigned int		tag)
{
	struct xfs_mount	*mp = pag->pag_mount;
	bool			was_tagged;

	/* Caller must hold the per-AG inode cache lock. */
	lockdep_assert_held(&pag->pag_ici_lock);

	was_tagged = radix_tree_tagged(&pag->pag_ici_root, tag);
	radix_tree_tag_set(&pag->pag_ici_root, agino, tag);

	/* Keep the per-AG reclaimable inode count in sync. */
	if (tag == XFS_ICI_RECLAIM_TAG)
		pag->pag_ici_reclaimable++;

	/* If the AG already had this tag, the perag mark is already set. */
	if (was_tagged)
		return;

	/* propagate the tag up into the perag radix tree */
	xa_set_mark(&mp->m_perags, pag->pag_agno, ici_tag_to_mark(tag));

	/* start background work */
	switch (tag) {
	case XFS_ICI_RECLAIM_TAG:
		xfs_reclaim_work_queue(mp);
		break;
	case XFS_ICI_BLOCKGC_TAG:
		xfs_blockgc_queue(pag);
		break;
	}

	trace_xfs_perag_set_inode_tag(pag, _RET_IP_);
}
/* Clear a tag on both the AG incore inode tree and the AG radix tree. */
static void
xfs_perag_clear_inode_tag(
	struct xfs_perag	*pag,
	xfs_agino_t		agino,
	unsigned int		tag)
{
	struct xfs_mount	*mp = pag->pag_mount;

	/* Caller must hold the per-AG inode cache lock. */
	lockdep_assert_held(&pag->pag_ici_lock);

	/*
	 * Reclaim can signal (with a null agino) that it cleared its own tag
	 * by removing the inode from the radix tree.
	 */
	if (agino != NULLAGINO)
		radix_tree_tag_clear(&pag->pag_ici_root, agino, tag);
	else
		ASSERT(tag == XFS_ICI_RECLAIM_TAG);

	/* Keep the per-AG reclaimable inode count in sync. */
	if (tag == XFS_ICI_RECLAIM_TAG)
		pag->pag_ici_reclaimable--;

	/* Other inodes in this AG still carry the tag; keep the perag mark. */
	if (radix_tree_tagged(&pag->pag_ici_root, tag))
		return;

	/* clear the tag from the perag radix tree */
	xa_clear_mark(&mp->m_perags, pag->pag_agno, ici_tag_to_mark(tag));
	trace_xfs_perag_clear_inode_tag(pag, _RET_IP_);
}
/*
 * Find the next AG after @pag, or the first AG if @pag is NULL.
 *
 * Drops the active reference on @pag (if any) and returns the next perag
 * carrying the given mark with a new active reference held, or NULL when
 * no further tagged AG exists or the candidate is being torn down.
 */
static struct xfs_perag *
xfs_perag_grab_next_tag(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	int			tag)
{
	unsigned long		index = 0;

	if (pag) {
		/* Resume the search just past the AG we finished with. */
		index = pag->pag_agno + 1;
		xfs_perag_rele(pag);
	}

	rcu_read_lock();
	pag = xa_find(&mp->m_perags, &index, ULONG_MAX, ici_tag_to_mark(tag));
	if (pag) {
		trace_xfs_perag_grab_next_tag(pag, _RET_IP_);

		/* Skip a perag whose active refcount already hit zero. */
		if (!atomic_inc_not_zero(&pag->pag_active_ref))
			pag = NULL;
	}
	rcu_read_unlock();
	return pag;
}
  279. /*
  280. * When we recycle a reclaimable inode, we need to re-initialise the VFS inode
  281. * part of the structure. This is made more complex by the fact we store
  282. * information about the on-disk values in the VFS inode and so we can't just
  283. * overwrite the values unconditionally. Hence we save the parameters we
  284. * need to retain across reinitialisation, and rewrite them into the VFS inode
  285. * after reinitialisation even if it fails.
  286. */
  287. static int
  288. xfs_reinit_inode(
  289. struct xfs_mount *mp,
  290. struct inode *inode)
  291. {
  292. int error;
  293. uint32_t nlink = inode->i_nlink;
  294. uint32_t generation = inode->i_generation;
  295. uint64_t version = inode_peek_iversion(inode);
  296. umode_t mode = inode->i_mode;
  297. dev_t dev = inode->i_rdev;
  298. kuid_t uid = inode->i_uid;
  299. kgid_t gid = inode->i_gid;
  300. unsigned long state = inode->i_state;
  301. error = inode_init_always(mp->m_super, inode);
  302. set_nlink(inode, nlink);
  303. inode->i_generation = generation;
  304. inode_set_iversion_queried(inode, version);
  305. inode->i_mode = mode;
  306. inode->i_rdev = dev;
  307. inode->i_uid = uid;
  308. inode->i_gid = gid;
  309. inode->i_state = state;
  310. mapping_set_folio_min_order(inode->i_mapping,
  311. M_IGEO(mp)->min_folio_order);
  312. return error;
  313. }
/*
 * Carefully nudge an inode whose VFS state has been torn down back into a
 * usable state. Drops the i_flags_lock and the rcu read lock.
 *
 * Returns 0 on success, -EAGAIN if the ILOCK could not be taken without
 * blocking (in which case i_flags_lock and the RCU read lock are NOT
 * dropped -- the caller's skip path unlocks), or a negative errno if
 * reinitialising the VFS inode failed.
 */
static int
xfs_iget_recycle(
	struct xfs_perag	*pag,
	struct xfs_inode	*ip) __releases(&ip->i_flags_lock)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct inode		*inode = VFS_I(ip);
	int			error;

	trace_xfs_iget_recycle(ip);

	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
		return -EAGAIN;

	/*
	 * We need to make it look like the inode is being reclaimed to prevent
	 * the actual reclaim workers from stomping over us while we recycle
	 * the inode. We can't clear the radix tree tag yet as it requires
	 * pag_ici_lock to be held exclusive.
	 */
	ip->i_flags |= XFS_IRECLAIM;

	spin_unlock(&ip->i_flags_lock);
	rcu_read_unlock();

	ASSERT(!rwsem_is_locked(&inode->i_rwsem));
	error = xfs_reinit_inode(mp, inode);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	if (error) {
		/*
		 * Re-initializing the inode failed, and we are in deep
		 * trouble. Try to re-add it to the reclaim list.
		 */
		rcu_read_lock();
		spin_lock(&ip->i_flags_lock);
		ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
		ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
		spin_unlock(&ip->i_flags_lock);
		rcu_read_unlock();

		trace_xfs_iget_recycle_fail(ip);
		return error;
	}

	spin_lock(&pag->pag_ici_lock);
	spin_lock(&ip->i_flags_lock);

	/*
	 * Clear the per-lifetime state in the inode as we are now effectively
	 * a new inode and need to return to the initial state before reuse
	 * occurs.
	 */
	ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
	ip->i_flags |= XFS_INEW;
	xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
			XFS_ICI_RECLAIM_TAG);
	inode->i_state = I_NEW;
	spin_unlock(&ip->i_flags_lock);
	spin_unlock(&pag->pag_ici_lock);

	return 0;
}
  371. /*
  372. * If we are allocating a new inode, then check what was returned is
  373. * actually a free, empty inode. If we are not allocating an inode,
  374. * then check we didn't find a free inode.
  375. *
  376. * Returns:
  377. * 0 if the inode free state matches the lookup context
  378. * -ENOENT if the inode is free and we are not allocating
  379. * -EFSCORRUPTED if there is any state mismatch at all
  380. */
  381. static int
  382. xfs_iget_check_free_state(
  383. struct xfs_inode *ip,
  384. int flags)
  385. {
  386. if (flags & XFS_IGET_CREATE) {
  387. /* should be a free inode */
  388. if (VFS_I(ip)->i_mode != 0) {
  389. xfs_warn(ip->i_mount,
  390. "Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)",
  391. ip->i_ino, VFS_I(ip)->i_mode);
  392. xfs_agno_mark_sick(ip->i_mount,
  393. XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
  394. XFS_SICK_AG_INOBT);
  395. return -EFSCORRUPTED;
  396. }
  397. if (ip->i_nblocks != 0) {
  398. xfs_warn(ip->i_mount,
  399. "Corruption detected! Free inode 0x%llx has blocks allocated!",
  400. ip->i_ino);
  401. xfs_agno_mark_sick(ip->i_mount,
  402. XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
  403. XFS_SICK_AG_INOBT);
  404. return -EFSCORRUPTED;
  405. }
  406. return 0;
  407. }
  408. /* should be an allocated inode */
  409. if (VFS_I(ip)->i_mode == 0)
  410. return -ENOENT;
  411. return 0;
  412. }
  413. /* Make all pending inactivation work start immediately. */
  414. static bool
  415. xfs_inodegc_queue_all(
  416. struct xfs_mount *mp)
  417. {
  418. struct xfs_inodegc *gc;
  419. int cpu;
  420. bool ret = false;
  421. for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
  422. gc = per_cpu_ptr(mp->m_inodegc, cpu);
  423. if (!llist_empty(&gc->list)) {
  424. mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
  425. ret = true;
  426. }
  427. }
  428. return ret;
  429. }
  430. /* Wait for all queued work and collect errors */
  431. static int
  432. xfs_inodegc_wait_all(
  433. struct xfs_mount *mp)
  434. {
  435. int cpu;
  436. int error = 0;
  437. flush_workqueue(mp->m_inodegc_wq);
  438. for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
  439. struct xfs_inodegc *gc;
  440. gc = per_cpu_ptr(mp->m_inodegc, cpu);
  441. if (gc->error && !error)
  442. error = gc->error;
  443. gc->error = 0;
  444. }
  445. return error;
  446. }
/*
 * Check the validity of the inode we just found in the cache.
 *
 * Returns 0 with a referenced, optionally locked inode; -EAGAIN if the
 * caller should retry the lookup; -ENOENT for an unlinked inode; or a
 * negative errno from the free-state check.  Always drops the RCU read
 * lock and ip->i_flags_lock before returning.
 */
static int
xfs_iget_cache_hit(
	struct xfs_perag	*pag,
	struct xfs_inode	*ip,
	xfs_ino_t		ino,
	int			flags,
	int			lock_flags) __releases(RCU)
{
	struct inode		*inode = VFS_I(ip);
	struct xfs_mount	*mp = ip->i_mount;
	int			error;

	/*
	 * check for re-use of an inode within an RCU grace period due to the
	 * radix tree nodes not being updated yet. We monitor for this by
	 * setting the inode number to zero before freeing the inode structure.
	 * If the inode has been reallocated and set up, then the inode number
	 * will not match, so check for that, too.
	 */
	spin_lock(&ip->i_flags_lock);
	if (ip->i_ino != ino)
		goto out_skip;

	/*
	 * If we are racing with another cache hit that is currently
	 * instantiating this inode or currently recycling it out of
	 * reclaimable state, wait for the initialisation to complete
	 * before continuing.
	 *
	 * If we're racing with the inactivation worker we also want to wait.
	 * If we're creating a new file, it's possible that the worker
	 * previously marked the inode as free on disk but hasn't finished
	 * updating the incore state yet. The AGI buffer will be dirty and
	 * locked to the icreate transaction, so a synchronous push of the
	 * inodegc workers would result in deadlock. For a regular iget, the
	 * worker is running already, so we might as well wait.
	 *
	 * XXX(hch): eventually we should do something equivalent to
	 * wait_on_inode to wait for these flags to be cleared
	 * instead of polling for it.
	 */
	if (ip->i_flags & (XFS_INEW | XFS_IRECLAIM | XFS_INACTIVATING))
		goto out_skip;

	if (ip->i_flags & XFS_NEED_INACTIVE) {
		/* Unlinked inodes cannot be re-grabbed. */
		if (VFS_I(ip)->i_nlink == 0) {
			error = -ENOENT;
			goto out_error;
		}
		goto out_inodegc_flush;
	}

	/*
	 * Check the inode free state is valid. This also detects lookup
	 * racing with unlinks.
	 */
	error = xfs_iget_check_free_state(ip, flags);
	if (error)
		goto out_error;

	/* Skip inodes that have no vfs state. */
	if ((flags & XFS_IGET_INCORE) &&
	    (ip->i_flags & XFS_IRECLAIMABLE))
		goto out_skip;

	/* The inode fits the selection criteria; process it. */
	if (ip->i_flags & XFS_IRECLAIMABLE) {
		/* Drops i_flags_lock and RCU read lock. */
		error = xfs_iget_recycle(pag, ip);
		if (error == -EAGAIN)
			goto out_skip;
		if (error)
			return error;
	} else {
		/* If the VFS inode is being torn down, pause and try again. */
		if (!igrab(inode))
			goto out_skip;

		/* We've got a live one. */
		spin_unlock(&ip->i_flags_lock);
		rcu_read_unlock();
		trace_xfs_iget_hit(ip);
	}

	if (lock_flags != 0)
		xfs_ilock(ip, lock_flags);

	if (!(flags & XFS_IGET_INCORE))
		xfs_iflags_clear(ip, XFS_ISTALE);
	XFS_STATS_INC(mp, xs_ig_found);

	return 0;

out_skip:
	trace_xfs_iget_skip(ip);
	XFS_STATS_INC(mp, xs_ig_frecycle);
	error = -EAGAIN;
out_error:
	spin_unlock(&ip->i_flags_lock);
	rcu_read_unlock();
	return error;

out_inodegc_flush:
	spin_unlock(&ip->i_flags_lock);
	rcu_read_unlock();
	/*
	 * Do not wait for the workers, because the caller could hold an AGI
	 * buffer lock. We're just going to sleep in a loop anyway.
	 */
	if (xfs_is_inodegc_enabled(mp))
		xfs_inodegc_queue_all(mp);
	return -EAGAIN;
}
  552. static int
  553. xfs_iget_cache_miss(
  554. struct xfs_mount *mp,
  555. struct xfs_perag *pag,
  556. xfs_trans_t *tp,
  557. xfs_ino_t ino,
  558. struct xfs_inode **ipp,
  559. int flags,
  560. int lock_flags)
  561. {
  562. struct xfs_inode *ip;
  563. int error;
  564. xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino);
  565. ip = xfs_inode_alloc(mp, ino);
  566. if (!ip)
  567. return -ENOMEM;
  568. error = xfs_imap(pag, tp, ip->i_ino, &ip->i_imap, flags);
  569. if (error)
  570. goto out_destroy;
  571. /*
  572. * For version 5 superblocks, if we are initialising a new inode and we
  573. * are not utilising the XFS_FEAT_IKEEP inode cluster mode, we can
  574. * simply build the new inode core with a random generation number.
  575. *
  576. * For version 4 (and older) superblocks, log recovery is dependent on
  577. * the i_flushiter field being initialised from the current on-disk
  578. * value and hence we must also read the inode off disk even when
  579. * initializing new inodes.
  580. */
  581. if (xfs_has_v3inodes(mp) &&
  582. (flags & XFS_IGET_CREATE) && !xfs_has_ikeep(mp)) {
  583. VFS_I(ip)->i_generation = get_random_u32();
  584. } else {
  585. struct xfs_buf *bp;
  586. error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp);
  587. if (error)
  588. goto out_destroy;
  589. error = xfs_inode_from_disk(ip,
  590. xfs_buf_offset(bp, ip->i_imap.im_boffset));
  591. if (!error)
  592. xfs_buf_set_ref(bp, XFS_INO_REF);
  593. else
  594. xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
  595. xfs_trans_brelse(tp, bp);
  596. if (error)
  597. goto out_destroy;
  598. }
  599. trace_xfs_iget_miss(ip);
  600. /*
  601. * Check the inode free state is valid. This also detects lookup
  602. * racing with unlinks.
  603. */
  604. error = xfs_iget_check_free_state(ip, flags);
  605. if (error)
  606. goto out_destroy;
  607. /*
  608. * Preload the radix tree so we can insert safely under the
  609. * write spinlock. Note that we cannot sleep inside the preload
  610. * region.
  611. */
  612. if (radix_tree_preload(GFP_KERNEL | __GFP_NOLOCKDEP)) {
  613. error = -EAGAIN;
  614. goto out_destroy;
  615. }
  616. /*
  617. * Because the inode hasn't been added to the radix-tree yet it can't
  618. * be found by another thread, so we can do the non-sleeping lock here.
  619. */
  620. if (lock_flags) {
  621. if (!xfs_ilock_nowait(ip, lock_flags))
  622. BUG();
  623. }
  624. /*
  625. * These values must be set before inserting the inode into the radix
  626. * tree as the moment it is inserted a concurrent lookup (allowed by the
  627. * RCU locking mechanism) can find it and that lookup must see that this
  628. * is an inode currently under construction (i.e. that XFS_INEW is set).
  629. * The ip->i_flags_lock that protects the XFS_INEW flag forms the
  630. * memory barrier that ensures this detection works correctly at lookup
  631. * time.
  632. */
  633. if (flags & XFS_IGET_DONTCACHE)
  634. d_mark_dontcache(VFS_I(ip));
  635. ip->i_udquot = NULL;
  636. ip->i_gdquot = NULL;
  637. ip->i_pdquot = NULL;
  638. xfs_iflags_set(ip, XFS_INEW);
  639. /* insert the new inode */
  640. spin_lock(&pag->pag_ici_lock);
  641. error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
  642. if (unlikely(error)) {
  643. WARN_ON(error != -EEXIST);
  644. XFS_STATS_INC(mp, xs_ig_dup);
  645. error = -EAGAIN;
  646. goto out_preload_end;
  647. }
  648. spin_unlock(&pag->pag_ici_lock);
  649. radix_tree_preload_end();
  650. *ipp = ip;
  651. return 0;
  652. out_preload_end:
  653. spin_unlock(&pag->pag_ici_lock);
  654. radix_tree_preload_end();
  655. if (lock_flags)
  656. xfs_iunlock(ip, lock_flags);
  657. out_destroy:
  658. __destroy_inode(VFS_I(ip));
  659. xfs_inode_free(ip);
  660. return error;
  661. }
/*
 * Look up an inode by number in the given file system. The inode is looked up
 * in the cache held in each AG. If the inode is found in the cache, initialise
 * the vfs inode if necessary.
 *
 * If it is not in core, read it in from the file system's device, add it to the
 * cache and initialise the vfs inode.
 *
 * The inode is locked according to the value of the lock_flags parameter.
 * Inode lookup is only done during metadata operations and not as part of the
 * data IO path. Hence we only allow locking of the XFS_ILOCK during lookup.
 */
int
xfs_iget(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_ino_t		ino,
	uint			flags,
	uint			lock_flags,
	struct xfs_inode	**ipp)
{
	struct xfs_inode	*ip;
	struct xfs_perag	*pag;
	xfs_agino_t		agino;
	int			error;

	/* IOLOCK is not permitted here -- see the locking note above. */
	ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);

	/* reject inode numbers outside existing AGs */
	if (!xfs_verify_ino(mp, ino))
		return -EINVAL;

	XFS_STATS_INC(mp, xs_ig_attempts);

	/* get the perag structure and ensure that it's inode capable */
	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
	agino = XFS_INO_TO_AGINO(mp, ino);

again:
	error = 0;
	rcu_read_lock();
	ip = radix_tree_lookup(&pag->pag_ici_root, agino);

	if (ip) {
		/* Drops the RCU read lock on all paths. */
		error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
		if (error)
			goto out_error_or_again;
	} else {
		rcu_read_unlock();
		/* INCORE lookups never read from disk. */
		if (flags & XFS_IGET_INCORE) {
			error = -ENODATA;
			goto out_error_or_again;
		}
		XFS_STATS_INC(mp, xs_ig_missed);

		error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
				flags, lock_flags);
		if (error)
			goto out_error_or_again;
	}
	xfs_perag_put(pag);

	*ipp = ip;

	/*
	 * If we have a real type for an on-disk inode, we can setup the inode
	 * now. If it's a new inode being created, xfs_init_new_inode will
	 * handle it.
	 */
	if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0)
		xfs_setup_existing_inode(ip);
	return 0;

out_error_or_again:
	/* Transient -EAGAIN: back off briefly and retry the lookup. */
	if (!(flags & (XFS_IGET_INCORE | XFS_IGET_NORETRY)) &&
	    error == -EAGAIN) {
		delay(1);
		goto again;
	}
	xfs_perag_put(pag);
	return error;
}
  734. /*
  735. * Grab the inode for reclaim exclusively.
  736. *
  737. * We have found this inode via a lookup under RCU, so the inode may have
  738. * already been freed, or it may be in the process of being recycled by
  739. * xfs_iget(). In both cases, the inode will have XFS_IRECLAIM set. If the inode
  740. * has been fully recycled by the time we get the i_flags_lock, XFS_IRECLAIMABLE
  741. * will not be set. Hence we need to check for both these flag conditions to
  742. * avoid inodes that are no longer reclaim candidates.
  743. *
  744. * Note: checking for other state flags here, under the i_flags_lock or not, is
  745. * racy and should be avoided. Those races should be resolved only after we have
  746. * ensured that we are able to reclaim this inode and the world can see that we
  747. * are going to reclaim it.
  748. *
  749. * Return true if we grabbed it, false otherwise.
  750. */
static bool
xfs_reclaim_igrab(
	struct xfs_inode	*ip,
	struct xfs_icwalk	*icw)	/* may be NULL (background reclaim) */
{
	/* Caller found ip via an RCU radix tree walk. */
	ASSERT(rcu_read_lock_held());

	spin_lock(&ip->i_flags_lock);
	if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
	    __xfs_iflags_test(ip, XFS_IRECLAIM)) {
		/* not a reclaim candidate. */
		spin_unlock(&ip->i_flags_lock);
		return false;
	}

	/* Don't reclaim a sick inode unless the caller asked for it. */
	if (ip->i_sick &&
	    (!icw || !(icw->icw_flags & XFS_ICWALK_FLAG_RECLAIM_SICK))) {
		spin_unlock(&ip->i_flags_lock);
		return false;
	}

	/* Claim exclusive reclaim rights before dropping the flags lock. */
	__xfs_iflags_set(ip, XFS_IRECLAIM);
	spin_unlock(&ip->i_flags_lock);
	return true;
}
  774. /*
  775. * Inode reclaim is non-blocking, so the default action if progress cannot be
  776. * made is to "requeue" the inode for reclaim by unlocking it and clearing the
  777. * XFS_IRECLAIM flag. If we are in a shutdown state, we don't care about
  778. * blocking anymore and hence we can wait for the inode to be able to reclaim
  779. * it.
  780. *
  781. * We do no IO here - if callers require inodes to be cleaned they must push the
  782. * AIL first to trigger writeback of dirty inodes. This enables writeback to be
  783. * done in the background in a non-blocking manner, and enables memory reclaim
  784. * to make progress without blocking.
  785. */
static void
xfs_reclaim_inode(
	struct xfs_inode	*ip,
	struct xfs_perag	*pag)
{
	xfs_ino_t		ino = ip->i_ino; /* for radix_tree_delete */

	/* Non-blocking: if we can't get the ILOCK, requeue for a later pass. */
	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
		goto out;
	/* Someone else is flushing this inode; don't wait for them. */
	if (xfs_iflags_test_and_set(ip, XFS_IFLUSHING))
		goto out_iunlock;

	/*
	 * Check for log shutdown because aborting the inode can move the log
	 * tail and corrupt in memory state. This is fine if the log is shut
	 * down, but if the log is still active and only the mount is shut down
	 * then the in-memory log tail movement caused by the abort can be
	 * incorrectly propagated to disk.
	 */
	if (xlog_is_shutdown(ip->i_mount->m_log)) {
		xfs_iunpin_wait(ip);
		xfs_iflush_shutdown_abort(ip);
		goto reclaim;
	}
	/* Pinned or dirty inodes can't be reclaimed without blocking I/O. */
	if (xfs_ipincount(ip))
		goto out_clear_flush;
	if (!xfs_inode_clean(ip))
		goto out_clear_flush;

	xfs_iflags_clear(ip, XFS_IFLUSHING);
reclaim:
	trace_xfs_inode_reclaiming(ip);

	/*
	 * Because we use RCU freeing we need to ensure the inode always appears
	 * to be reclaimed with an invalid inode number when in the free state.
	 * We do this as early as possible under the ILOCK so that
	 * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to
	 * detect races with us here. By doing this, we guarantee that once
	 * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that
	 * it will see either a valid inode that will serialise correctly, or it
	 * will see an invalid inode that it can skip.
	 */
	spin_lock(&ip->i_flags_lock);
	ip->i_flags = XFS_IRECLAIM;
	ip->i_ino = 0;
	ip->i_sick = 0;
	ip->i_checked = 0;
	spin_unlock(&ip->i_flags_lock);

	ASSERT(!ip->i_itemp || ip->i_itemp->ili_item.li_buf == NULL);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	XFS_STATS_INC(ip->i_mount, xs_ig_reclaims);
	/*
	 * Remove the inode from the per-AG radix tree.
	 *
	 * Because radix_tree_delete won't complain even if the item was never
	 * added to the tree assert that it's been there before to catch
	 * problems with the inode life time early on.
	 */
	spin_lock(&pag->pag_ici_lock);
	if (!radix_tree_delete(&pag->pag_ici_root,
				XFS_INO_TO_AGINO(ip->i_mount, ino)))
		ASSERT(0);
	/*
	 * NOTE(review): NULLAGINO appears to mean "adjust reclaim accounting
	 * only, no per-inode radix tag to clear" -- confirm against
	 * xfs_perag_clear_inode_tag().
	 */
	xfs_perag_clear_inode_tag(pag, NULLAGINO, XFS_ICI_RECLAIM_TAG);
	spin_unlock(&pag->pag_ici_lock);

	/*
	 * Here we do an (almost) spurious inode lock in order to coordinate
	 * with inode cache radix tree lookups. This is because the lookup
	 * can reference the inodes in the cache without taking references.
	 *
	 * We make that OK here by ensuring that we wait until the inode is
	 * unlocked after the lookup before we go ahead and free it.
	 */
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	ASSERT(!ip->i_udquot && !ip->i_gdquot && !ip->i_pdquot);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	ASSERT(xfs_inode_clean(ip));

	__xfs_inode_free(ip);
	return;

out_clear_flush:
	xfs_iflags_clear(ip, XFS_IFLUSHING);
out_iunlock:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
out:
	/* Drop our XFS_IRECLAIM claim so a later scan can retry this inode. */
	xfs_iflags_clear(ip, XFS_IRECLAIM);
}
  868. /* Reclaim sick inodes if we're unmounting or the fs went down. */
  869. static inline bool
  870. xfs_want_reclaim_sick(
  871. struct xfs_mount *mp)
  872. {
  873. return xfs_is_unmounting(mp) || xfs_has_norecovery(mp) ||
  874. xfs_is_shutdown(mp);
  875. }
  876. void
  877. xfs_reclaim_inodes(
  878. struct xfs_mount *mp)
  879. {
  880. struct xfs_icwalk icw = {
  881. .icw_flags = 0,
  882. };
  883. if (xfs_want_reclaim_sick(mp))
  884. icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK;
  885. while (xa_marked(&mp->m_perags, XFS_PERAG_RECLAIM_MARK)) {
  886. xfs_ail_push_all_sync(mp->m_ail);
  887. xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw);
  888. }
  889. }
  890. /*
  891. * The shrinker infrastructure determines how many inodes we should scan for
  892. * reclaim. We want as many clean inodes ready to reclaim as possible, so we
  893. * push the AIL here. We also want to proactively free up memory if we can to
  894. * minimise the amount of work memory reclaim has to do so we kick the
  895. * background reclaim if it isn't already scheduled.
  896. */
  897. long
  898. xfs_reclaim_inodes_nr(
  899. struct xfs_mount *mp,
  900. unsigned long nr_to_scan)
  901. {
  902. struct xfs_icwalk icw = {
  903. .icw_flags = XFS_ICWALK_FLAG_SCAN_LIMIT,
  904. .icw_scan_limit = min_t(unsigned long, LONG_MAX, nr_to_scan),
  905. };
  906. if (xfs_want_reclaim_sick(mp))
  907. icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK;
  908. /* kick background reclaimer and push the AIL */
  909. xfs_reclaim_work_queue(mp);
  910. xfs_ail_push_all(mp->m_ail);
  911. xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw);
  912. return 0;
  913. }
  914. /*
  915. * Return the number of reclaimable inodes in the filesystem for
  916. * the shrinker to determine how much to reclaim.
  917. */
long
xfs_reclaim_inodes_count(
	struct xfs_mount	*mp)
{
	XA_STATE	(xas, &mp->m_perags, 0);
	long		reclaimable = 0;
	struct xfs_perag	*pag;

	/*
	 * Lockless, approximate count: iterate only the perags marked as
	 * holding reclaimable inodes, under RCU. The counters may change
	 * underneath us; the shrinker only needs an estimate.
	 */
	rcu_read_lock();
	xas_for_each_marked(&xas, pag, ULONG_MAX, XFS_PERAG_RECLAIM_MARK) {
		trace_xfs_reclaim_inodes_count(pag, _THIS_IP_);
		reclaimable += pag->pag_ici_reclaimable;
	}
	rcu_read_unlock();

	return reclaimable;
}
  933. STATIC bool
  934. xfs_icwalk_match_id(
  935. struct xfs_inode *ip,
  936. struct xfs_icwalk *icw)
  937. {
  938. if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) &&
  939. !uid_eq(VFS_I(ip)->i_uid, icw->icw_uid))
  940. return false;
  941. if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) &&
  942. !gid_eq(VFS_I(ip)->i_gid, icw->icw_gid))
  943. return false;
  944. if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) &&
  945. ip->i_projid != icw->icw_prid)
  946. return false;
  947. return true;
  948. }
  949. /*
  950. * A union-based inode filtering algorithm. Process the inode if any of the
  951. * criteria match. This is for global/internal scans only.
  952. */
  953. STATIC bool
  954. xfs_icwalk_match_id_union(
  955. struct xfs_inode *ip,
  956. struct xfs_icwalk *icw)
  957. {
  958. if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) &&
  959. uid_eq(VFS_I(ip)->i_uid, icw->icw_uid))
  960. return true;
  961. if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) &&
  962. gid_eq(VFS_I(ip)->i_gid, icw->icw_gid))
  963. return true;
  964. if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) &&
  965. ip->i_projid == icw->icw_prid)
  966. return true;
  967. return false;
  968. }
  969. /*
  970. * Is this inode @ip eligible for eof/cow block reclamation, given some
  971. * filtering parameters @icw? The inode is eligible if @icw is null or
  972. * if the predicate functions match.
  973. */
  974. static bool
  975. xfs_icwalk_match(
  976. struct xfs_inode *ip,
  977. struct xfs_icwalk *icw)
  978. {
  979. bool match;
  980. if (!icw)
  981. return true;
  982. if (icw->icw_flags & XFS_ICWALK_FLAG_UNION)
  983. match = xfs_icwalk_match_id_union(ip, icw);
  984. else
  985. match = xfs_icwalk_match_id(ip, icw);
  986. if (!match)
  987. return false;
  988. /* skip the inode if the file size is too small */
  989. if ((icw->icw_flags & XFS_ICWALK_FLAG_MINFILESIZE) &&
  990. XFS_ISIZE(ip) < icw->icw_min_file_size)
  991. return false;
  992. return true;
  993. }
  994. /*
  995. * This is a fast pass over the inode cache to try to get reclaim moving on as
  996. * many inodes as possible in a short period of time. It kicks itself every few
  997. * seconds, as well as being kicked by the inode cache shrinker when memory
  998. * goes low.
  999. */
  1000. void
  1001. xfs_reclaim_worker(
  1002. struct work_struct *work)
  1003. {
  1004. struct xfs_mount *mp = container_of(to_delayed_work(work),
  1005. struct xfs_mount, m_reclaim_work);
  1006. xfs_icwalk(mp, XFS_ICWALK_RECLAIM, NULL);
  1007. xfs_reclaim_work_queue(mp);
  1008. }
/*
 * Try to free post-EOF speculative preallocations on @ip.  Any lock taken
 * here is recorded in *@lockflags; the caller is responsible for dropping
 * the recorded locks.  Returns 0 (done or skipped), -EAGAIN to ask a sync
 * scan to revisit the inode, or a negative error from xfs_free_eofblocks().
 */
STATIC int
xfs_inode_free_eofblocks(
	struct xfs_inode	*ip,
	struct xfs_icwalk	*icw,
	unsigned int		*lockflags)	/* in/out: locks we acquired */
{
	bool			wait;

	wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC);

	if (!xfs_iflags_test(ip, XFS_IEOFBLOCKS))
		return 0;

	/*
	 * If the mapping is dirty the operation can block and wait for some
	 * time. Unless we are waiting, skip it.
	 */
	if (!wait && mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
		return 0;

	if (!xfs_icwalk_match(ip, icw))
		return 0;

	/*
	 * If the caller is waiting, return -EAGAIN to keep the background
	 * scanner moving and revisit the inode in a subsequent pass.
	 */
	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
		if (wait)
			return -EAGAIN;
		return 0;
	}
	*lockflags |= XFS_IOLOCK_EXCL;

	if (xfs_can_free_eofblocks(ip))
		return xfs_free_eofblocks(ip);

	/* inode could be preallocated */
	trace_xfs_inode_free_eofblocks_invalid(ip);
	xfs_inode_clear_eofblocks_tag(ip);
	return 0;
}
/*
 * Set a blockgc inode flag (XFS_IEOFBLOCKS or XFS_ICOWBLOCKS) and tag the
 * inode in the per-AG tree so the background blockgc workers will find it.
 */
static void
xfs_blockgc_set_iflag(
	struct xfs_inode	*ip,
	unsigned long		iflag)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_perag	*pag;

	ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);

	/*
	 * Don't bother locking the AG and looking up in the radix trees
	 * if we already know that we have the tag set.
	 */
	if (ip->i_flags & iflag)
		return;
	spin_lock(&ip->i_flags_lock);
	ip->i_flags |= iflag;
	spin_unlock(&ip->i_flags_lock);

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	spin_lock(&pag->pag_ici_lock);

	xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
			XFS_ICI_BLOCKGC_TAG);

	spin_unlock(&pag->pag_ici_lock);
	xfs_perag_put(pag);
}
  1068. void
  1069. xfs_inode_set_eofblocks_tag(
  1070. xfs_inode_t *ip)
  1071. {
  1072. trace_xfs_inode_set_eofblocks_tag(ip);
  1073. return xfs_blockgc_set_iflag(ip, XFS_IEOFBLOCKS);
  1074. }
/*
 * Clear a blockgc inode flag; drop the per-AG blockgc tag only once neither
 * XFS_IEOFBLOCKS nor XFS_ICOWBLOCKS remains set on the inode.
 */
static void
xfs_blockgc_clear_iflag(
	struct xfs_inode	*ip,
	unsigned long		iflag)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_perag	*pag;
	bool			clear_tag;

	ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);

	spin_lock(&ip->i_flags_lock);
	ip->i_flags &= ~iflag;
	/* Decide about the tag while still holding the flags lock. */
	clear_tag = (ip->i_flags & (XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0;
	spin_unlock(&ip->i_flags_lock);

	if (!clear_tag)
		return;

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	spin_lock(&pag->pag_ici_lock);

	xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
			XFS_ICI_BLOCKGC_TAG);

	spin_unlock(&pag->pag_ici_lock);
	xfs_perag_put(pag);
}
  1097. void
  1098. xfs_inode_clear_eofblocks_tag(
  1099. xfs_inode_t *ip)
  1100. {
  1101. trace_xfs_inode_clear_eofblocks_tag(ip);
  1102. return xfs_blockgc_clear_iflag(ip, XFS_IEOFBLOCKS);
  1103. }
  1104. /*
  1105. * Prepare to free COW fork blocks from an inode.
  1106. */
  1107. static bool
  1108. xfs_prep_free_cowblocks(
  1109. struct xfs_inode *ip,
  1110. struct xfs_icwalk *icw)
  1111. {
  1112. bool sync;
  1113. sync = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC);
  1114. /*
  1115. * Just clear the tag if we have an empty cow fork or none at all. It's
  1116. * possible the inode was fully unshared since it was originally tagged.
  1117. */
  1118. if (!xfs_inode_has_cow_data(ip)) {
  1119. trace_xfs_inode_free_cowblocks_invalid(ip);
  1120. xfs_inode_clear_cowblocks_tag(ip);
  1121. return false;
  1122. }
  1123. /*
  1124. * A cowblocks trim of an inode can have a significant effect on
  1125. * fragmentation even when a reasonable COW extent size hint is set.
  1126. * Therefore, we prefer to not process cowblocks unless they are clean
  1127. * and idle. We can never process a cowblocks inode that is dirty or has
  1128. * in-flight I/O under any circumstances, because outstanding writeback
  1129. * or dio expects targeted COW fork blocks exist through write
  1130. * completion where they can be remapped into the data fork.
  1131. *
  1132. * Therefore, the heuristic used here is to never process inodes
  1133. * currently opened for write from background (i.e. non-sync) scans. For
  1134. * sync scans, use the pagecache/dio state of the inode to ensure we
  1135. * never free COW fork blocks out from under pending I/O.
  1136. */
  1137. if (!sync && inode_is_open_for_write(VFS_I(ip)))
  1138. return false;
  1139. return xfs_can_free_cowblocks(ip);
  1140. }
  1141. /*
  1142. * Automatic CoW Reservation Freeing
  1143. *
  1144. * These functions automatically garbage collect leftover CoW reservations
  1145. * that were made on behalf of a cowextsize hint when we start to run out
  1146. * of quota or when the reservations sit around for too long. If the file
  1147. * has dirty pages or is undergoing writeback, its CoW reservations will
  1148. * be retained.
  1149. *
  1150. * The actual garbage collection piggybacks off the same code that runs
  1151. * the speculative EOF preallocation garbage collector.
  1152. */
/*
 * Try to cancel leftover COW fork reservations on @ip.  Locks acquired here
 * (IOLOCK then MMAPLOCK, in that order) are recorded in *@lockflags for the
 * caller to drop.  Returns 0, -EAGAIN for sync scans that should revisit the
 * inode, or a negative error from xfs_reflink_cancel_cow_range().
 */
STATIC int
xfs_inode_free_cowblocks(
	struct xfs_inode	*ip,
	struct xfs_icwalk	*icw,
	unsigned int		*lockflags)	/* in/out: locks we acquired */
{
	bool			wait;
	int			ret = 0;

	wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC);

	if (!xfs_iflags_test(ip, XFS_ICOWBLOCKS))
		return 0;

	/* Unlocked pre-check; re-done under locks below. */
	if (!xfs_prep_free_cowblocks(ip, icw))
		return 0;

	if (!xfs_icwalk_match(ip, icw))
		return 0;

	/*
	 * If the caller is waiting, return -EAGAIN to keep the background
	 * scanner moving and revisit the inode in a subsequent pass.
	 */
	if (!(*lockflags & XFS_IOLOCK_EXCL) &&
	    !xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
		if (wait)
			return -EAGAIN;
		return 0;
	}
	*lockflags |= XFS_IOLOCK_EXCL;

	if (!xfs_ilock_nowait(ip, XFS_MMAPLOCK_EXCL)) {
		if (wait)
			return -EAGAIN;
		return 0;
	}
	*lockflags |= XFS_MMAPLOCK_EXCL;

	/*
	 * Check again, nobody else should be able to dirty blocks or change
	 * the reflink iflag now that we have the first two locks held.
	 */
	if (xfs_prep_free_cowblocks(ip, icw))
		ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false);
	return ret;
}
  1193. void
  1194. xfs_inode_set_cowblocks_tag(
  1195. xfs_inode_t *ip)
  1196. {
  1197. trace_xfs_inode_set_cowblocks_tag(ip);
  1198. return xfs_blockgc_set_iflag(ip, XFS_ICOWBLOCKS);
  1199. }
  1200. void
  1201. xfs_inode_clear_cowblocks_tag(
  1202. xfs_inode_t *ip)
  1203. {
  1204. trace_xfs_inode_clear_cowblocks_tag(ip);
  1205. return xfs_blockgc_clear_iflag(ip, XFS_ICOWBLOCKS);
  1206. }
  1207. /* Disable post-EOF and CoW block auto-reclamation. */
  1208. void
  1209. xfs_blockgc_stop(
  1210. struct xfs_mount *mp)
  1211. {
  1212. struct xfs_perag *pag;
  1213. xfs_agnumber_t agno;
  1214. if (!xfs_clear_blockgc_enabled(mp))
  1215. return;
  1216. for_each_perag(mp, agno, pag)
  1217. cancel_delayed_work_sync(&pag->pag_blockgc_work);
  1218. trace_xfs_blockgc_stop(mp, __return_address);
  1219. }
  1220. /* Enable post-EOF and CoW block auto-reclamation. */
  1221. void
  1222. xfs_blockgc_start(
  1223. struct xfs_mount *mp)
  1224. {
  1225. struct xfs_perag *pag = NULL;
  1226. if (xfs_set_blockgc_enabled(mp))
  1227. return;
  1228. trace_xfs_blockgc_start(mp, __return_address);
  1229. while ((pag = xfs_perag_grab_next_tag(mp, pag, XFS_ICI_BLOCKGC_TAG)))
  1230. xfs_blockgc_queue(pag);
  1231. }
/*
 * Don't try to run block gc on an inode that's in any of these states:
 * still being set up (INEW), queued for or undergoing inactivation, or
 * already handed to reclaim.
 */
#define XFS_BLOCKGC_NOGRAB_IFLAGS	(XFS_INEW | \
					 XFS_NEED_INACTIVE | \
					 XFS_INACTIVATING | \
					 XFS_IRECLAIMABLE | \
					 XFS_IRECLAIM)
  1238. /*
  1239. * Decide if the given @ip is eligible for garbage collection of speculative
  1240. * preallocations, and grab it if so. Returns true if it's ready to go or
  1241. * false if we should just ignore it.
  1242. */
static bool
xfs_blockgc_igrab(
	struct xfs_inode	*ip)
{
	struct inode		*inode = VFS_I(ip);

	/* Caller found ip via an RCU radix tree walk. */
	ASSERT(rcu_read_lock_held());

	/* Check for stale RCU freed inode */
	spin_lock(&ip->i_flags_lock);
	if (!ip->i_ino)
		goto out_unlock_noent;

	if (ip->i_flags & XFS_BLOCKGC_NOGRAB_IFLAGS)
		goto out_unlock_noent;
	spin_unlock(&ip->i_flags_lock);

	/* nothing to sync during shutdown */
	if (xfs_is_shutdown(ip->i_mount))
		return false;

	/* If we can't grab the inode, it must be on its way to reclaim. */
	if (!igrab(inode))
		return false;

	/* inode is valid; caller now holds a VFS reference */
	return true;

out_unlock_noent:
	spin_unlock(&ip->i_flags_lock);
	return false;
}
  1268. /* Scan one incore inode for block preallocations that we can remove. */
  1269. static int
  1270. xfs_blockgc_scan_inode(
  1271. struct xfs_inode *ip,
  1272. struct xfs_icwalk *icw)
  1273. {
  1274. unsigned int lockflags = 0;
  1275. int error;
  1276. error = xfs_inode_free_eofblocks(ip, icw, &lockflags);
  1277. if (error)
  1278. goto unlock;
  1279. error = xfs_inode_free_cowblocks(ip, icw, &lockflags);
  1280. unlock:
  1281. if (lockflags)
  1282. xfs_iunlock(ip, lockflags);
  1283. xfs_irele(ip);
  1284. return error;
  1285. }
  1286. /* Background worker that trims preallocated space. */
  1287. void
  1288. xfs_blockgc_worker(
  1289. struct work_struct *work)
  1290. {
  1291. struct xfs_perag *pag = container_of(to_delayed_work(work),
  1292. struct xfs_perag, pag_blockgc_work);
  1293. struct xfs_mount *mp = pag->pag_mount;
  1294. int error;
  1295. trace_xfs_blockgc_worker(mp, __return_address);
  1296. error = xfs_icwalk_ag(pag, XFS_ICWALK_BLOCKGC, NULL);
  1297. if (error)
  1298. xfs_info(mp, "AG %u preallocation gc worker failed, err=%d",
  1299. pag->pag_agno, error);
  1300. xfs_blockgc_queue(pag);
  1301. }
  1302. /*
  1303. * Try to free space in the filesystem by purging inactive inodes, eofblocks
  1304. * and cowblocks.
  1305. */
  1306. int
  1307. xfs_blockgc_free_space(
  1308. struct xfs_mount *mp,
  1309. struct xfs_icwalk *icw)
  1310. {
  1311. int error;
  1312. trace_xfs_blockgc_free_space(mp, icw, _RET_IP_);
  1313. error = xfs_icwalk(mp, XFS_ICWALK_BLOCKGC, icw);
  1314. if (error)
  1315. return error;
  1316. return xfs_inodegc_flush(mp);
  1317. }
  1318. /*
  1319. * Reclaim all the free space that we can by scheduling the background blockgc
  1320. * and inodegc workers immediately and waiting for them all to clear.
  1321. */
int
xfs_blockgc_flush_all(
	struct xfs_mount	*mp)
{
	struct xfs_perag	*pag = NULL;

	trace_xfs_blockgc_flush_all(mp, __return_address);

	/*
	 * For each blockgc worker, move its queue time up to now. If it wasn't
	 * queued, it will not be requeued. Then flush whatever is left.
	 */
	while ((pag = xfs_perag_grab_next_tag(mp, pag, XFS_ICI_BLOCKGC_TAG)))
		mod_delayed_work(pag->pag_mount->m_blockgc_wq,
				&pag->pag_blockgc_work, 0);

	/* pag is NULL again here, so the second pass restarts from AG 0. */
	while ((pag = xfs_perag_grab_next_tag(mp, pag, XFS_ICI_BLOCKGC_TAG)))
		flush_delayed_work(&pag->pag_blockgc_work);

	return xfs_inodegc_flush(mp);
}
  1339. /*
  1340. * Run cow/eofblocks scans on the supplied dquots. We don't know exactly which
  1341. * quota caused an allocation failure, so we make a best effort by including
  1342. * each quota under low free space conditions (less than 1% free space) in the
  1343. * scan.
  1344. *
  1345. * Callers must not hold any inode's ILOCK. If requesting a synchronous scan
  1346. * (XFS_ICWALK_FLAG_SYNC), the caller also must not hold any inode's IOLOCK or
  1347. * MMAPLOCK.
  1348. */
  1349. int
  1350. xfs_blockgc_free_dquots(
  1351. struct xfs_mount *mp,
  1352. struct xfs_dquot *udqp,
  1353. struct xfs_dquot *gdqp,
  1354. struct xfs_dquot *pdqp,
  1355. unsigned int iwalk_flags)
  1356. {
  1357. struct xfs_icwalk icw = {0};
  1358. bool do_work = false;
  1359. if (!udqp && !gdqp && !pdqp)
  1360. return 0;
  1361. /*
  1362. * Run a scan to free blocks using the union filter to cover all
  1363. * applicable quotas in a single scan.
  1364. */
  1365. icw.icw_flags = XFS_ICWALK_FLAG_UNION | iwalk_flags;
  1366. if (XFS_IS_UQUOTA_ENFORCED(mp) && udqp && xfs_dquot_lowsp(udqp)) {
  1367. icw.icw_uid = make_kuid(mp->m_super->s_user_ns, udqp->q_id);
  1368. icw.icw_flags |= XFS_ICWALK_FLAG_UID;
  1369. do_work = true;
  1370. }
  1371. if (XFS_IS_UQUOTA_ENFORCED(mp) && gdqp && xfs_dquot_lowsp(gdqp)) {
  1372. icw.icw_gid = make_kgid(mp->m_super->s_user_ns, gdqp->q_id);
  1373. icw.icw_flags |= XFS_ICWALK_FLAG_GID;
  1374. do_work = true;
  1375. }
  1376. if (XFS_IS_PQUOTA_ENFORCED(mp) && pdqp && xfs_dquot_lowsp(pdqp)) {
  1377. icw.icw_prid = pdqp->q_id;
  1378. icw.icw_flags |= XFS_ICWALK_FLAG_PRID;
  1379. do_work = true;
  1380. }
  1381. if (!do_work)
  1382. return 0;
  1383. return xfs_blockgc_free_space(mp, &icw);
  1384. }
  1385. /* Run cow/eofblocks scans on the quotas attached to the inode. */
  1386. int
  1387. xfs_blockgc_free_quota(
  1388. struct xfs_inode *ip,
  1389. unsigned int iwalk_flags)
  1390. {
  1391. return xfs_blockgc_free_dquots(ip->i_mount,
  1392. xfs_inode_dquot(ip, XFS_DQTYPE_USER),
  1393. xfs_inode_dquot(ip, XFS_DQTYPE_GROUP),
  1394. xfs_inode_dquot(ip, XFS_DQTYPE_PROJ), iwalk_flags);
  1395. }
  1396. /* XFS Inode Cache Walking Code */
  1397. /*
  1398. * The inode lookup is done in batches to keep the amount of lock traffic and
  1399. * radix tree lookups to a minimum. The batch size is a trade off between
  1400. * lookup reduction and stack usage. This is in the reclaim path, so we can't
  1401. * be too greedy.
  1402. */
  1403. #define XFS_LOOKUP_BATCH 32
  1404. /*
  1405. * Decide if we want to grab this inode in anticipation of doing work towards
  1406. * the goal.
  1407. */
  1408. static inline bool
  1409. xfs_icwalk_igrab(
  1410. enum xfs_icwalk_goal goal,
  1411. struct xfs_inode *ip,
  1412. struct xfs_icwalk *icw)
  1413. {
  1414. switch (goal) {
  1415. case XFS_ICWALK_BLOCKGC:
  1416. return xfs_blockgc_igrab(ip);
  1417. case XFS_ICWALK_RECLAIM:
  1418. return xfs_reclaim_igrab(ip, icw);
  1419. default:
  1420. return false;
  1421. }
  1422. }
  1423. /*
  1424. * Process an inode. Each processing function must handle any state changes
  1425. * made by the icwalk igrab function. Return -EAGAIN to skip an inode.
  1426. */
  1427. static inline int
  1428. xfs_icwalk_process_inode(
  1429. enum xfs_icwalk_goal goal,
  1430. struct xfs_inode *ip,
  1431. struct xfs_perag *pag,
  1432. struct xfs_icwalk *icw)
  1433. {
  1434. int error = 0;
  1435. switch (goal) {
  1436. case XFS_ICWALK_BLOCKGC:
  1437. error = xfs_blockgc_scan_inode(ip, icw);
  1438. break;
  1439. case XFS_ICWALK_RECLAIM:
  1440. xfs_reclaim_inode(ip, pag);
  1441. break;
  1442. }
  1443. return error;
  1444. }
  1445. /*
  1446. * For a given per-AG structure @pag and a goal, grab qualifying inodes and
  1447. * process them in some manner.
  1448. */
static int
xfs_icwalk_ag(
	struct xfs_perag	*pag,
	enum xfs_icwalk_goal	goal,
	struct xfs_icwalk	*icw)
{
	struct xfs_mount	*mp = pag->pag_mount;
	uint32_t		first_index;	/* lookup cursor (agino) */
	int			last_error = 0;
	int			skipped;	/* inodes that returned -EAGAIN */
	bool			done;		/* ran off the end of the AG */
	int			nr_found;

restart:
	done = false;
	skipped = 0;
	/* Reclaim resumes from a persistent per-AG cursor; others start at 0. */
	if (goal == XFS_ICWALK_RECLAIM)
		first_index = READ_ONCE(pag->pag_ici_reclaim_cursor);
	else
		first_index = 0;
	nr_found = 0;
	do {
		struct xfs_inode *batch[XFS_LOOKUP_BATCH];
		int		error = 0;
		int		i;

		rcu_read_lock();

		/* goal doubles as the radix tree tag to walk */
		nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
				(void **) batch, first_index,
				XFS_LOOKUP_BATCH, goal);
		if (!nr_found) {
			done = true;
			rcu_read_unlock();
			break;
		}

		/*
		 * Grab the inodes before we drop the lock. if we found
		 * nothing, nr == 0 and the loop will be skipped.
		 */
		for (i = 0; i < nr_found; i++) {
			struct xfs_inode *ip = batch[i];

			if (done || !xfs_icwalk_igrab(goal, ip, icw))
				batch[i] = NULL;

			/*
			 * Update the index for the next lookup. Catch
			 * overflows into the next AG range which can occur if
			 * we have inodes in the last block of the AG and we
			 * are currently pointing to the last inode.
			 *
			 * Because we may see inodes that are from the wrong AG
			 * due to RCU freeing and reallocation, only update the
			 * index if it lies in this AG. It was a race that led
			 * us to see this inode, so another lookup from the
			 * same index will not find it again.
			 */
			if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
				continue;
			first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
			if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
				done = true;
		}

		/* unlock now we've grabbed the inodes. */
		rcu_read_unlock();

		for (i = 0; i < nr_found; i++) {
			if (!batch[i])
				continue;
			error = xfs_icwalk_process_inode(goal, batch[i], pag,
					icw);
			if (error == -EAGAIN) {
				skipped++;
				continue;
			}
			/* -EFSCORRUPTED sticks; don't overwrite it */
			if (error && last_error != -EFSCORRUPTED)
				last_error = error;
		}

		/* bail out if the filesystem is corrupted. */
		if (error == -EFSCORRUPTED)
			break;

		cond_resched();

		if (icw && (icw->icw_flags & XFS_ICWALK_FLAG_SCAN_LIMIT)) {
			icw->icw_scan_limit -= XFS_LOOKUP_BATCH;
			if (icw->icw_scan_limit <= 0)
				break;
		}
	} while (nr_found && !done);

	/* Save (or reset) the reclaim cursor for the next scan. */
	if (goal == XFS_ICWALK_RECLAIM) {
		if (done)
			first_index = 0;
		WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index);
	}

	/* Retry skipped inodes after a short backoff. */
	if (skipped) {
		delay(1);
		goto restart;
	}
	return last_error;
}
  1543. /* Walk all incore inodes to achieve a given goal. */
static int
xfs_icwalk(
	struct xfs_mount	*mp,
	enum xfs_icwalk_goal	goal,
	struct xfs_icwalk	*icw)
{
	struct xfs_perag	*pag = NULL;
	int			error = 0;
	int			last_error = 0;

	/* goal doubles as the perag tag to iterate */
	while ((pag = xfs_perag_grab_next_tag(mp, pag, goal))) {
		error = xfs_icwalk_ag(pag, goal, icw);
		if (error) {
			last_error = error;
			if (error == -EFSCORRUPTED) {
				/* drop the reference the iterator holds */
				xfs_perag_rele(pag);
				break;
			}
		}
	}
	return last_error;
	/*
	 * Compile-time assertion only; deliberately placed after the return
	 * so it generates no runtime code. Checks that the private icwalk
	 * flag space does not overlap the caller-visible flags.
	 */
	BUILD_BUG_ON(XFS_ICWALK_PRIVATE_FLAGS & XFS_ICWALK_FLAGS_VALID);
}
#ifdef DEBUG
/*
 * Debug-only: warn about any delalloc extents remaining in @whichfork of
 * @ip. Called when an inode heads to reclaim still holding delayed
 * allocation blocks, which indicates a leak.
 */
static void
xfs_check_delalloc(
	struct xfs_inode	*ip,
	int			whichfork)
{
	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
	struct xfs_bmbt_irec	got;
	struct xfs_iext_cursor	icur;

	/* No fork or no extents: nothing to report. */
	if (!ifp || !xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got))
		return;
	do {
		if (isnullstartblock(got.br_startblock)) {
			xfs_warn(ip->i_mount,
	"ino %llx %s fork has delalloc extent at [0x%llx:0x%llx]",
				ip->i_ino,
				whichfork == XFS_DATA_FORK ? "data" : "cow",
				got.br_startoff, got.br_blockcount);
		}
	} while (xfs_iext_next_extent(ifp, &icur, &got));
}
#else
#define xfs_check_delalloc(ip, whichfork)	do { } while (0)
#endif
/* Schedule the inode for reclaim. */
static void
xfs_inodegc_set_reclaimable(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_perag	*pag;

	/*
	 * Delalloc blocks should all be gone by the time an inode reaches
	 * reclaim.  If any remain (and the fs isn't already shut down),
	 * dump the offending extents on debug builds and assert.
	 */
	if (!xfs_is_shutdown(mp) && ip->i_delayed_blks) {
		xfs_check_delalloc(ip, XFS_DATA_FORK);
		xfs_check_delalloc(ip, XFS_COW_FORK);
		ASSERT(0);
	}

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	/* Lock order: per-AG ici lock outside, per-inode flags lock inside. */
	spin_lock(&pag->pag_ici_lock);
	spin_lock(&ip->i_flags_lock);

	trace_xfs_inode_set_reclaimable(ip);
	/* Inactivation is finished; hand the inode to background reclaim. */
	ip->i_flags &= ~(XFS_NEED_INACTIVE | XFS_INACTIVATING);
	ip->i_flags |= XFS_IRECLAIMABLE;
	/* Tag the radix tree so the reclaim walkers can find this inode. */
	xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
			XFS_ICI_RECLAIM_TAG);

	spin_unlock(&ip->i_flags_lock);
	spin_unlock(&pag->pag_ici_lock);
	xfs_perag_put(pag);
}
  1614. /*
  1615. * Free all speculative preallocations and possibly even the inode itself.
  1616. * This is the last chance to make changes to an otherwise unreferenced file
  1617. * before incore reclamation happens.
  1618. */
  1619. static int
  1620. xfs_inodegc_inactivate(
  1621. struct xfs_inode *ip)
  1622. {
  1623. int error;
  1624. trace_xfs_inode_inactivating(ip);
  1625. error = xfs_inactive(ip);
  1626. xfs_inodegc_set_reclaimable(ip);
  1627. return error;
  1628. }
/*
 * Background worker: snapshot this CPU's deferred inactivation list and run
 * xfs_inodegc_inactivate() on every inode on it.
 */
void
xfs_inodegc_worker(
	struct work_struct	*work)
{
	struct xfs_inodegc	*gc = container_of(to_delayed_work(work),
						struct xfs_inodegc, work);
	struct llist_node	*node = llist_del_all(&gc->list);
	struct xfs_inode	*ip, *n;
	struct xfs_mount	*mp = gc->mp;
	unsigned int		nofs_flag;

	/*
	 * Clear the cpu mask bit and ensure that we have seen the latest
	 * update of the gc structure associated with this CPU. This matches
	 * with the release semantics used when setting the cpumask bit in
	 * xfs_inodegc_queue.
	 */
	cpumask_clear_cpu(gc->cpu, &mp->m_inodegc_cpumask);
	smp_mb__after_atomic();

	/* The list has been snapshotted, so reset the queue depth counter. */
	WRITE_ONCE(gc->items, 0);

	if (!node)
		return;

	/*
	 * We can allocate memory here while doing writeback on behalf of
	 * memory reclaim. To avoid memory allocation deadlocks set the
	 * task-wide nofs context for the following operations.
	 */
	nofs_flag = memalloc_nofs_save();

	ip = llist_entry(node, struct xfs_inode, i_gclist);
	trace_xfs_inodegc_worker(mp, READ_ONCE(gc->shrinker_hits));
	WRITE_ONCE(gc->shrinker_hits, 0);

	llist_for_each_entry_safe(ip, n, node, i_gclist) {
		int	error;

		xfs_iflags_set(ip, XFS_INACTIVATING);
		error = xfs_inodegc_inactivate(ip);
		/* Keep only the first error; later failures are dropped. */
		if (error && !gc->error)
			gc->error = error;
	}

	memalloc_nofs_restore(nofs_flag);
}
  1668. /*
  1669. * Expedite all pending inodegc work to run immediately. This does not wait for
  1670. * completion of the work.
  1671. */
  1672. void
  1673. xfs_inodegc_push(
  1674. struct xfs_mount *mp)
  1675. {
  1676. if (!xfs_is_inodegc_enabled(mp))
  1677. return;
  1678. trace_xfs_inodegc_push(mp, __return_address);
  1679. xfs_inodegc_queue_all(mp);
  1680. }
  1681. /*
  1682. * Force all currently queued inode inactivation work to run immediately and
  1683. * wait for the work to finish.
  1684. */
  1685. int
  1686. xfs_inodegc_flush(
  1687. struct xfs_mount *mp)
  1688. {
  1689. xfs_inodegc_push(mp);
  1690. trace_xfs_inodegc_flush(mp, __return_address);
  1691. return xfs_inodegc_wait_all(mp);
  1692. }
  1693. /*
  1694. * Flush all the pending work and then disable the inode inactivation background
  1695. * workers and wait for them to stop. Caller must hold sb->s_umount to
  1696. * coordinate changes in the inodegc_enabled state.
  1697. */
  1698. void
  1699. xfs_inodegc_stop(
  1700. struct xfs_mount *mp)
  1701. {
  1702. bool rerun;
  1703. if (!xfs_clear_inodegc_enabled(mp))
  1704. return;
  1705. /*
  1706. * Drain all pending inodegc work, including inodes that could be
  1707. * queued by racing xfs_inodegc_queue or xfs_inodegc_shrinker_scan
  1708. * threads that sample the inodegc state just prior to us clearing it.
  1709. * The inodegc flag state prevents new threads from queuing more
  1710. * inodes, so we queue pending work items and flush the workqueue until
  1711. * all inodegc lists are empty. IOWs, we cannot use drain_workqueue
  1712. * here because it does not allow other unserialized mechanisms to
  1713. * reschedule inodegc work while this draining is in progress.
  1714. */
  1715. xfs_inodegc_queue_all(mp);
  1716. do {
  1717. flush_workqueue(mp->m_inodegc_wq);
  1718. rerun = xfs_inodegc_queue_all(mp);
  1719. } while (rerun);
  1720. trace_xfs_inodegc_stop(mp, __return_address);
  1721. }
  1722. /*
  1723. * Enable the inode inactivation background workers and schedule deferred inode
  1724. * inactivation work if there is any. Caller must hold sb->s_umount to
  1725. * coordinate changes in the inodegc_enabled state.
  1726. */
  1727. void
  1728. xfs_inodegc_start(
  1729. struct xfs_mount *mp)
  1730. {
  1731. if (xfs_set_inodegc_enabled(mp))
  1732. return;
  1733. trace_xfs_inodegc_start(mp, __return_address);
  1734. xfs_inodegc_queue_all(mp);
  1735. }
#ifdef CONFIG_XFS_RT
/*
 * Should inactivation of this realtime file be expedited?  Returns true when
 * free realtime extents have fallen below the 5% low-space threshold.
 */
static inline bool
xfs_inodegc_want_queue_rt_file(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;

	if (!XFS_IS_REALTIME_INODE(ip))
		return false;

	/*
	 * Compare the free rt extent counter against the 5% threshold with
	 * the same percpu batch tolerance used for the data device checks.
	 */
	if (__percpu_counter_compare(&mp->m_frextents,
				mp->m_low_rtexts[XFS_LOWSP_5_PCNT],
				XFS_FDBLOCKS_BATCH) < 0)
		return true;

	return false;
}
#else
/* No realtime support configured: never expedite on rt state. */
# define xfs_inodegc_want_queue_rt_file(ip)	(false)
#endif /* CONFIG_XFS_RT */
  1753. /*
  1754. * Schedule the inactivation worker when:
  1755. *
  1756. * - We've accumulated more than one inode cluster buffer's worth of inodes.
  1757. * - There is less than 5% free space left.
  1758. * - Any of the quotas for this inode are near an enforcement limit.
  1759. */
  1760. static inline bool
  1761. xfs_inodegc_want_queue_work(
  1762. struct xfs_inode *ip,
  1763. unsigned int items)
  1764. {
  1765. struct xfs_mount *mp = ip->i_mount;
  1766. if (items > mp->m_ino_geo.inodes_per_cluster)
  1767. return true;
  1768. if (__percpu_counter_compare(&mp->m_fdblocks,
  1769. mp->m_low_space[XFS_LOWSP_5_PCNT],
  1770. XFS_FDBLOCKS_BATCH) < 0)
  1771. return true;
  1772. if (xfs_inodegc_want_queue_rt_file(ip))
  1773. return true;
  1774. if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_USER))
  1775. return true;
  1776. if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_GROUP))
  1777. return true;
  1778. if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_PROJ))
  1779. return true;
  1780. return false;
  1781. }
  1782. /*
  1783. * Upper bound on the number of inodes in each AG that can be queued for
  1784. * inactivation at any given time, to avoid monopolizing the workqueue.
  1785. */
  1786. #define XFS_INODEGC_MAX_BACKLOG (4 * XFS_INODES_PER_CHUNK)
  1787. /*
  1788. * Make the frontend wait for inactivations when:
  1789. *
  1790. * - Memory shrinkers queued the inactivation worker and it hasn't finished.
  1791. * - The queue depth exceeds the maximum allowable percpu backlog.
  1792. *
  1793. * Note: If we are in a NOFS context here (e.g. current thread is running a
  1794. * transaction) the we don't want to block here as inodegc progress may require
  1795. * filesystem resources we hold to make progress and that could result in a
  1796. * deadlock. Hence we skip out of here if we are in a scoped NOFS context.
  1797. */
  1798. static inline bool
  1799. xfs_inodegc_want_flush_work(
  1800. struct xfs_inode *ip,
  1801. unsigned int items,
  1802. unsigned int shrinker_hits)
  1803. {
  1804. if (current->flags & PF_MEMALLOC_NOFS)
  1805. return false;
  1806. if (shrinker_hits > 0)
  1807. return true;
  1808. if (items > XFS_INODEGC_MAX_BACKLOG)
  1809. return true;
  1810. return false;
  1811. }
  1812. /*
  1813. * Queue a background inactivation worker if there are inodes that need to be
  1814. * inactivated and higher level xfs code hasn't disabled the background
  1815. * workers.
  1816. */
  1817. static void
  1818. xfs_inodegc_queue(
  1819. struct xfs_inode *ip)
  1820. {
  1821. struct xfs_mount *mp = ip->i_mount;
  1822. struct xfs_inodegc *gc;
  1823. int items;
  1824. unsigned int shrinker_hits;
  1825. unsigned int cpu_nr;
  1826. unsigned long queue_delay = 1;
  1827. trace_xfs_inode_set_need_inactive(ip);
  1828. spin_lock(&ip->i_flags_lock);
  1829. ip->i_flags |= XFS_NEED_INACTIVE;
  1830. spin_unlock(&ip->i_flags_lock);
  1831. cpu_nr = get_cpu();
  1832. gc = this_cpu_ptr(mp->m_inodegc);
  1833. llist_add(&ip->i_gclist, &gc->list);
  1834. items = READ_ONCE(gc->items);
  1835. WRITE_ONCE(gc->items, items + 1);
  1836. shrinker_hits = READ_ONCE(gc->shrinker_hits);
  1837. /*
  1838. * Ensure the list add is always seen by anyone who finds the cpumask
  1839. * bit set. This effectively gives the cpumask bit set operation
  1840. * release ordering semantics.
  1841. */
  1842. smp_mb__before_atomic();
  1843. if (!cpumask_test_cpu(cpu_nr, &mp->m_inodegc_cpumask))
  1844. cpumask_test_and_set_cpu(cpu_nr, &mp->m_inodegc_cpumask);
  1845. /*
  1846. * We queue the work while holding the current CPU so that the work
  1847. * is scheduled to run on this CPU.
  1848. */
  1849. if (!xfs_is_inodegc_enabled(mp)) {
  1850. put_cpu();
  1851. return;
  1852. }
  1853. if (xfs_inodegc_want_queue_work(ip, items))
  1854. queue_delay = 0;
  1855. trace_xfs_inodegc_queue(mp, __return_address);
  1856. mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
  1857. queue_delay);
  1858. put_cpu();
  1859. if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) {
  1860. trace_xfs_inodegc_throttle(mp, __return_address);
  1861. flush_delayed_work(&gc->work);
  1862. }
  1863. }
  1864. /*
  1865. * We set the inode flag atomically with the radix tree tag. Once we get tag
  1866. * lookups on the radix tree, this inode flag can go away.
  1867. *
  1868. * We always use background reclaim here because even if the inode is clean, it
  1869. * still may be under IO and hence we have wait for IO completion to occur
  1870. * before we can reclaim the inode. The background reclaim path handles this
  1871. * more efficiently than we can here, so simply let background reclaim tear down
  1872. * all inodes.
  1873. */
  1874. void
  1875. xfs_inode_mark_reclaimable(
  1876. struct xfs_inode *ip)
  1877. {
  1878. struct xfs_mount *mp = ip->i_mount;
  1879. bool need_inactive;
  1880. XFS_STATS_INC(mp, vn_reclaim);
  1881. /*
  1882. * We should never get here with any of the reclaim flags already set.
  1883. */
  1884. ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_ALL_IRECLAIM_FLAGS));
  1885. need_inactive = xfs_inode_needs_inactive(ip);
  1886. if (need_inactive) {
  1887. xfs_inodegc_queue(ip);
  1888. return;
  1889. }
  1890. /* Going straight to reclaim, so drop the dquots. */
  1891. xfs_qm_dqdetach(ip);
  1892. xfs_inodegc_set_reclaimable(ip);
  1893. }
  1894. /*
  1895. * Register a phony shrinker so that we can run background inodegc sooner when
  1896. * there's memory pressure. Inactivation does not itself free any memory but
  1897. * it does make inodes reclaimable, which eventually frees memory.
  1898. *
  1899. * The count function, seek value, and batch value are crafted to trigger the
  1900. * scan function during the second round of scanning. Hopefully this means
  1901. * that we reclaimed enough memory that initiating metadata transactions won't
  1902. * make things worse.
  1903. */
  1904. #define XFS_INODEGC_SHRINKER_COUNT (1UL << DEF_PRIORITY)
  1905. #define XFS_INODEGC_SHRINKER_BATCH ((XFS_INODEGC_SHRINKER_COUNT / 2) + 1)
  1906. static unsigned long
  1907. xfs_inodegc_shrinker_count(
  1908. struct shrinker *shrink,
  1909. struct shrink_control *sc)
  1910. {
  1911. struct xfs_mount *mp = shrink->private_data;
  1912. struct xfs_inodegc *gc;
  1913. int cpu;
  1914. if (!xfs_is_inodegc_enabled(mp))
  1915. return 0;
  1916. for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
  1917. gc = per_cpu_ptr(mp->m_inodegc, cpu);
  1918. if (!llist_empty(&gc->list))
  1919. return XFS_INODEGC_SHRINKER_COUNT;
  1920. }
  1921. return 0;
  1922. }
  1923. static unsigned long
  1924. xfs_inodegc_shrinker_scan(
  1925. struct shrinker *shrink,
  1926. struct shrink_control *sc)
  1927. {
  1928. struct xfs_mount *mp = shrink->private_data;
  1929. struct xfs_inodegc *gc;
  1930. int cpu;
  1931. bool no_items = true;
  1932. if (!xfs_is_inodegc_enabled(mp))
  1933. return SHRINK_STOP;
  1934. trace_xfs_inodegc_shrinker_scan(mp, sc, __return_address);
  1935. for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
  1936. gc = per_cpu_ptr(mp->m_inodegc, cpu);
  1937. if (!llist_empty(&gc->list)) {
  1938. unsigned int h = READ_ONCE(gc->shrinker_hits);
  1939. WRITE_ONCE(gc->shrinker_hits, h + 1);
  1940. mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
  1941. no_items = false;
  1942. }
  1943. }
  1944. /*
  1945. * If there are no inodes to inactivate, we don't want the shrinker
  1946. * to think there's deferred work to call us back about.
  1947. */
  1948. if (no_items)
  1949. return LONG_MAX;
  1950. return SHRINK_STOP;
  1951. }
  1952. /* Register a shrinker so we can accelerate inodegc and throttle queuing. */
  1953. int
  1954. xfs_inodegc_register_shrinker(
  1955. struct xfs_mount *mp)
  1956. {
  1957. mp->m_inodegc_shrinker = shrinker_alloc(SHRINKER_NONSLAB,
  1958. "xfs-inodegc:%s",
  1959. mp->m_super->s_id);
  1960. if (!mp->m_inodegc_shrinker)
  1961. return -ENOMEM;
  1962. mp->m_inodegc_shrinker->count_objects = xfs_inodegc_shrinker_count;
  1963. mp->m_inodegc_shrinker->scan_objects = xfs_inodegc_shrinker_scan;
  1964. mp->m_inodegc_shrinker->seeks = 0;
  1965. mp->m_inodegc_shrinker->batch = XFS_INODEGC_SHRINKER_BATCH;
  1966. mp->m_inodegc_shrinker->private_data = mp;
  1967. shrinker_register(mp->m_inodegc_shrinker);
  1968. return 0;
  1969. }