/* fs/xfs/xfs_mount.c */
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Copyright (c) 2000-2005 Silicon Graphics, Inc.
  4. * All Rights Reserved.
  5. */
  6. #include "xfs.h"
  7. #include "xfs_fs.h"
  8. #include "xfs_shared.h"
  9. #include "xfs_format.h"
  10. #include "xfs_log_format.h"
  11. #include "xfs_trans_resv.h"
  12. #include "xfs_bit.h"
  13. #include "xfs_sb.h"
  14. #include "xfs_mount.h"
  15. #include "xfs_inode.h"
  16. #include "xfs_dir2.h"
  17. #include "xfs_ialloc.h"
  18. #include "xfs_alloc.h"
  19. #include "xfs_rtalloc.h"
  20. #include "xfs_bmap.h"
  21. #include "xfs_trans.h"
  22. #include "xfs_trans_priv.h"
  23. #include "xfs_log.h"
  24. #include "xfs_log_priv.h"
  25. #include "xfs_error.h"
  26. #include "xfs_quota.h"
  27. #include "xfs_fsops.h"
  28. #include "xfs_icache.h"
  29. #include "xfs_sysfs.h"
  30. #include "xfs_rmap_btree.h"
  31. #include "xfs_refcount_btree.h"
  32. #include "xfs_reflink.h"
  33. #include "xfs_extent_busy.h"
  34. #include "xfs_health.h"
  35. #include "xfs_trace.h"
  36. #include "xfs_ag.h"
  37. #include "xfs_rtbitmap.h"
  38. #include "scrub/stats.h"
/*
 * Global table of the UUIDs of all currently-mounted XFS filesystems, used
 * by xfs_uuid_mount()/xfs_uuid_unmount() to reject duplicate-UUID mounts.
 * Freed entries are zeroed in place and reused.  All accesses are
 * serialized by xfs_uuid_table_mutex.
 */
static DEFINE_MUTEX(xfs_uuid_table_mutex);
static int xfs_uuid_table_size;		/* number of allocated slots */
static uuid_t *xfs_uuid_table;		/* may contain null (free) entries */
  42. void
  43. xfs_uuid_table_free(void)
  44. {
  45. if (xfs_uuid_table_size == 0)
  46. return;
  47. kfree(xfs_uuid_table);
  48. xfs_uuid_table = NULL;
  49. xfs_uuid_table_size = 0;
  50. }
  51. /*
  52. * See if the UUID is unique among mounted XFS filesystems.
  53. * Mount fails if UUID is nil or a FS with the same UUID is already mounted.
  54. */
  55. STATIC int
  56. xfs_uuid_mount(
  57. struct xfs_mount *mp)
  58. {
  59. uuid_t *uuid = &mp->m_sb.sb_uuid;
  60. int hole, i;
  61. /* Publish UUID in struct super_block */
  62. super_set_uuid(mp->m_super, uuid->b, sizeof(*uuid));
  63. if (xfs_has_nouuid(mp))
  64. return 0;
  65. if (uuid_is_null(uuid)) {
  66. xfs_warn(mp, "Filesystem has null UUID - can't mount");
  67. return -EINVAL;
  68. }
  69. mutex_lock(&xfs_uuid_table_mutex);
  70. for (i = 0, hole = -1; i < xfs_uuid_table_size; i++) {
  71. if (uuid_is_null(&xfs_uuid_table[i])) {
  72. hole = i;
  73. continue;
  74. }
  75. if (uuid_equal(uuid, &xfs_uuid_table[i]))
  76. goto out_duplicate;
  77. }
  78. if (hole < 0) {
  79. xfs_uuid_table = krealloc(xfs_uuid_table,
  80. (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table),
  81. GFP_KERNEL | __GFP_NOFAIL);
  82. hole = xfs_uuid_table_size++;
  83. }
  84. xfs_uuid_table[hole] = *uuid;
  85. mutex_unlock(&xfs_uuid_table_mutex);
  86. return 0;
  87. out_duplicate:
  88. mutex_unlock(&xfs_uuid_table_mutex);
  89. xfs_warn(mp, "Filesystem has duplicate UUID %pU - can't mount", uuid);
  90. return -EINVAL;
  91. }
  92. STATIC void
  93. xfs_uuid_unmount(
  94. struct xfs_mount *mp)
  95. {
  96. uuid_t *uuid = &mp->m_sb.sb_uuid;
  97. int i;
  98. if (xfs_has_nouuid(mp))
  99. return;
  100. mutex_lock(&xfs_uuid_table_mutex);
  101. for (i = 0; i < xfs_uuid_table_size; i++) {
  102. if (uuid_is_null(&xfs_uuid_table[i]))
  103. continue;
  104. if (!uuid_equal(uuid, &xfs_uuid_table[i]))
  105. continue;
  106. memset(&xfs_uuid_table[i], 0, sizeof(uuid_t));
  107. break;
  108. }
  109. ASSERT(i < xfs_uuid_table_size);
  110. mutex_unlock(&xfs_uuid_table_mutex);
  111. }
  112. /*
  113. * Check size of device based on the (data/realtime) block count.
  114. * Note: this check is used by the growfs code as well as mount.
  115. */
  116. int
  117. xfs_sb_validate_fsb_count(
  118. xfs_sb_t *sbp,
  119. uint64_t nblocks)
  120. {
  121. uint64_t max_bytes;
  122. ASSERT(sbp->sb_blocklog >= BBSHIFT);
  123. if (check_shl_overflow(nblocks, sbp->sb_blocklog, &max_bytes))
  124. return -EFBIG;
  125. /* Limited by ULONG_MAX of page cache index */
  126. if (max_bytes >> PAGE_SHIFT > ULONG_MAX)
  127. return -EFBIG;
  128. return 0;
  129. }
/*
 * xfs_readsb
 *
 * Does the initial read of the superblock.
 *
 * On success, the incore superblock mp->m_sb is initialized and the
 * (unlocked) superblock buffer is cached in mp->m_sb_bp for the lifetime
 * of the mount.  Returns 0 or a negative errno.
 */
int
xfs_readsb(
	struct xfs_mount *mp,
	int		flags)
{
	unsigned int	sector_size;
	struct xfs_buf	*bp;
	struct xfs_sb	*sbp = &mp->m_sb;
	int		error;
	int		loud = !(flags & XFS_MFSI_QUIET);
	const struct xfs_buf_ops *buf_ops;

	ASSERT(mp->m_sb_bp == NULL);
	ASSERT(mp->m_ddev_targp != NULL);

	/*
	 * For the initial read, we must guess at the sector
	 * size based on the block device.  It's enough to
	 * get the sb_sectsize out of the superblock and
	 * then reread with the proper length.
	 * We don't verify it yet, because it may not be complete.
	 */
	sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
	buf_ops = NULL;		/* no verifier on the first, guessed-size read */

	/*
	 * Allocate a (locked) buffer to hold the superblock. This will be kept
	 * around at all times to optimize access to the superblock. Therefore,
	 * set XBF_NO_IOACCT to make sure it doesn't hold the buftarg count
	 * elevated.
	 */
reread:
	error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
				BTOBB(sector_size), XBF_NO_IOACCT, &bp,
				buf_ops);
	if (error) {
		if (loud)
			xfs_warn(mp, "SB validate failed with error %d.", error);
		/* bad CRC means corrupted metadata */
		if (error == -EFSBADCRC)
			error = -EFSCORRUPTED;
		return error;
	}

	/*
	 * Initialize the mount structure from the superblock.
	 */
	xfs_sb_from_disk(sbp, bp->b_addr);

	/*
	 * If we haven't validated the superblock, do so now before we try
	 * to check the sector size and reread the superblock appropriately.
	 */
	if (sbp->sb_magicnum != XFS_SB_MAGIC) {
		if (loud)
			xfs_warn(mp, "Invalid superblock magic number");
		error = -EINVAL;
		goto release_buf;
	}

	/*
	 * We must be able to do sector-sized and sector-aligned IO.
	 */
	if (sector_size > sbp->sb_sectsize) {
		if (loud)
			xfs_warn(mp, "device supports %u byte sectors (not %u)",
				sector_size, sbp->sb_sectsize);
		error = -ENOSYS;
		goto release_buf;
	}

	if (buf_ops == NULL) {
		/*
		 * Re-read the superblock so the buffer is correctly sized,
		 * and properly verified.  This happens exactly once: the
		 * second pass enters with buf_ops set.
		 */
		xfs_buf_relse(bp);
		sector_size = sbp->sb_sectsize;
		buf_ops = loud ? &xfs_sb_buf_ops : &xfs_sb_quiet_buf_ops;
		goto reread;
	}

	mp->m_features |= xfs_sb_version_to_features(sbp);
	xfs_reinit_percpu_counters(mp);

	/*
	 * If logged xattrs are enabled after log recovery finishes, then set
	 * the opstate so that log recovery will work properly.
	 */
	if (xfs_sb_version_haslogxattrs(&mp->m_sb))
		xfs_set_using_logged_xattrs(mp);

	/* no need to be quiet anymore, so reset the buf ops */
	bp->b_ops = &xfs_sb_buf_ops;
	mp->m_sb_bp = bp;
	xfs_buf_unlock(bp);
	return 0;

release_buf:
	xfs_buf_relse(bp);
	return error;
}
  226. /*
  227. * If the sunit/swidth change would move the precomputed root inode value, we
  228. * must reject the ondisk change because repair will stumble over that.
  229. * However, we allow the mount to proceed because we never rejected this
  230. * combination before. Returns true to update the sb, false otherwise.
  231. */
  232. static inline int
  233. xfs_check_new_dalign(
  234. struct xfs_mount *mp,
  235. int new_dalign,
  236. bool *update_sb)
  237. {
  238. struct xfs_sb *sbp = &mp->m_sb;
  239. xfs_ino_t calc_ino;
  240. calc_ino = xfs_ialloc_calc_rootino(mp, new_dalign);
  241. trace_xfs_check_new_dalign(mp, new_dalign, calc_ino);
  242. if (sbp->sb_rootino == calc_ino) {
  243. *update_sb = true;
  244. return 0;
  245. }
  246. xfs_warn(mp,
  247. "Cannot change stripe alignment; would require moving root inode.");
  248. /*
  249. * XXX: Next time we add a new incompat feature, this should start
  250. * returning -EINVAL to fail the mount. Until then, spit out a warning
  251. * that we're ignoring the administrator's instructions.
  252. */
  253. xfs_warn(mp, "Skipping superblock stripe alignment update.");
  254. *update_sb = false;
  255. return 0;
  256. }
  257. /*
  258. * If we were provided with new sunit/swidth values as mount options, make sure
  259. * that they pass basic alignment and superblock feature checks, and convert
  260. * them into the same units (FSB) that everything else expects. This step
  261. * /must/ be done before computing the inode geometry.
  262. */
  263. STATIC int
  264. xfs_validate_new_dalign(
  265. struct xfs_mount *mp)
  266. {
  267. if (mp->m_dalign == 0)
  268. return 0;
  269. /*
  270. * If stripe unit and stripe width are not multiples
  271. * of the fs blocksize turn off alignment.
  272. */
  273. if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
  274. (BBTOB(mp->m_swidth) & mp->m_blockmask)) {
  275. xfs_warn(mp,
  276. "alignment check failed: sunit/swidth vs. blocksize(%d)",
  277. mp->m_sb.sb_blocksize);
  278. return -EINVAL;
  279. }
  280. /*
  281. * Convert the stripe unit and width to FSBs.
  282. */
  283. mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
  284. if (mp->m_dalign && (mp->m_sb.sb_agblocks % mp->m_dalign)) {
  285. xfs_warn(mp,
  286. "alignment check failed: sunit/swidth vs. agsize(%d)",
  287. mp->m_sb.sb_agblocks);
  288. return -EINVAL;
  289. }
  290. if (!mp->m_dalign) {
  291. xfs_warn(mp,
  292. "alignment check failed: sunit(%d) less than bsize(%d)",
  293. mp->m_dalign, mp->m_sb.sb_blocksize);
  294. return -EINVAL;
  295. }
  296. mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
  297. if (!xfs_has_dalign(mp)) {
  298. xfs_warn(mp,
  299. "cannot change alignment: superblock does not support data alignment");
  300. return -EINVAL;
  301. }
  302. return 0;
  303. }
  304. /* Update alignment values based on mount options and sb values. */
  305. STATIC int
  306. xfs_update_alignment(
  307. struct xfs_mount *mp)
  308. {
  309. struct xfs_sb *sbp = &mp->m_sb;
  310. if (mp->m_dalign) {
  311. bool update_sb;
  312. int error;
  313. if (sbp->sb_unit == mp->m_dalign &&
  314. sbp->sb_width == mp->m_swidth)
  315. return 0;
  316. error = xfs_check_new_dalign(mp, mp->m_dalign, &update_sb);
  317. if (error || !update_sb)
  318. return error;
  319. sbp->sb_unit = mp->m_dalign;
  320. sbp->sb_width = mp->m_swidth;
  321. mp->m_update_sb = true;
  322. } else if (!xfs_has_noalign(mp) && xfs_has_dalign(mp)) {
  323. mp->m_dalign = sbp->sb_unit;
  324. mp->m_swidth = sbp->sb_width;
  325. }
  326. return 0;
  327. }
  328. /*
  329. * precalculate the low space thresholds for dynamic speculative preallocation.
  330. */
  331. void
  332. xfs_set_low_space_thresholds(
  333. struct xfs_mount *mp)
  334. {
  335. uint64_t dblocks = mp->m_sb.sb_dblocks;
  336. uint64_t rtexts = mp->m_sb.sb_rextents;
  337. int i;
  338. do_div(dblocks, 100);
  339. do_div(rtexts, 100);
  340. for (i = 0; i < XFS_LOWSP_MAX; i++) {
  341. mp->m_low_space[i] = dblocks * (i + 1);
  342. mp->m_low_rtexts[i] = rtexts * (i + 1);
  343. }
  344. }
/*
 * Check that the data (and log if separate) is an ok size.
 *
 * Each device is verified two ways: the advertised block count must survive
 * a round trip through the daddr conversion macros, and the last
 * sector/block of the device must actually be readable.  Returns 0 or a
 * negative errno.
 */
STATIC int
xfs_check_sizes(
	struct xfs_mount *mp)
{
	struct xfs_buf	*bp;
	xfs_daddr_t	d;
	int		error;

	/* Data device: the block count must round-trip exactly. */
	d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
	if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
		xfs_warn(mp, "filesystem size mismatch detected");
		return -EFBIG;
	}

	/* Prove the last sector of the data device is readable. */
	error = xfs_buf_read_uncached(mp->m_ddev_targp,
					d - XFS_FSS_TO_BB(mp, 1),
					XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
	if (error) {
		xfs_warn(mp, "last sector read failed");
		return error;
	}
	xfs_buf_relse(bp);

	/* An internal log lives on the data device; nothing more to check. */
	if (mp->m_logdev_targp == mp->m_ddev_targp)
		return 0;

	/* External log device: same round-trip check on the log size. */
	d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
	if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
		xfs_warn(mp, "log size mismatch detected");
		return -EFBIG;
	}

	/* Prove the last block of the external log device is readable. */
	error = xfs_buf_read_uncached(mp->m_logdev_targp,
					d - XFS_FSB_TO_BB(mp, 1),
					XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
	if (error) {
		xfs_warn(mp, "log device read failed");
		return error;
	}
	xfs_buf_relse(bp);
	return 0;
}
  385. /*
  386. * Clear the quotaflags in memory and in the superblock.
  387. */
  388. int
  389. xfs_mount_reset_sbqflags(
  390. struct xfs_mount *mp)
  391. {
  392. mp->m_qflags = 0;
  393. /* It is OK to look at sb_qflags in the mount path without m_sb_lock. */
  394. if (mp->m_sb.sb_qflags == 0)
  395. return 0;
  396. spin_lock(&mp->m_sb_lock);
  397. mp->m_sb.sb_qflags = 0;
  398. spin_unlock(&mp->m_sb_lock);
  399. if (!xfs_fs_writable(mp, SB_FREEZE_WRITE))
  400. return 0;
  401. return xfs_sync_sb(mp, false);
  402. }
  403. uint64_t
  404. xfs_default_resblks(xfs_mount_t *mp)
  405. {
  406. uint64_t resblks;
  407. /*
  408. * We default to 5% or 8192 fsbs of space reserved, whichever is
  409. * smaller. This is intended to cover concurrent allocation
  410. * transactions when we initially hit enospc. These each require a 4
  411. * block reservation. Hence by default we cover roughly 2000 concurrent
  412. * allocation reservations.
  413. */
  414. resblks = mp->m_sb.sb_dblocks;
  415. do_div(resblks, 20);
  416. resblks = min_t(uint64_t, resblks, 8192);
  417. return resblks;
  418. }
/*
 * Ensure the summary counts are correct.
 *
 * Called after log recovery.  Marks the summary counters sick if a clean
 * log contradicts them, recomputes them from per-AG data when they cannot
 * be trusted, and rebuilds frextents after an unclean rt mount.  Returns 0
 * or a negative errno from the recomputation paths.
 */
STATIC int
xfs_check_summary_counts(
	struct xfs_mount *mp)
{
	int		error = 0;

	/*
	 * The AG0 superblock verifier rejects in-progress filesystems,
	 * so we should never see the flag set this far into mounting.
	 */
	if (mp->m_sb.sb_inprogress) {
		xfs_err(mp, "sb_inprogress set after log recovery??");
		WARN_ON(1);
		return -EFSCORRUPTED;
	}

	/*
	 * Now the log is mounted, we know if it was an unclean shutdown or
	 * not. If it was, then the first phase of recovery has completed and
	 * we have consistent AG blocks on disk. We have not recovered EFIs
	 * yet, but they are recovered transactionally in the second recovery
	 * phase later.
	 *
	 * If the log was clean when we mounted, we can check the summary
	 * counters. If any of them are obviously incorrect, we can recompute
	 * them from the AGF headers in the next step.
	 */
	if (xfs_is_clean(mp) &&
	    (mp->m_sb.sb_fdblocks > mp->m_sb.sb_dblocks ||
	     !xfs_verify_icount(mp, mp->m_sb.sb_icount) ||
	     mp->m_sb.sb_ifree > mp->m_sb.sb_icount))
		xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS);

	/*
	 * We can safely re-initialise incore superblock counters from the
	 * per-ag data. These may not be correct if the filesystem was not
	 * cleanly unmounted, so we waited for recovery to finish before doing
	 * this.
	 *
	 * If the filesystem was cleanly unmounted or the previous check did
	 * not flag anything weird, then we can trust the values in the
	 * superblock to be correct and we don't need to do anything here.
	 * Otherwise, recalculate the summary counters.
	 */
	if ((xfs_has_lazysbcount(mp) && !xfs_is_clean(mp)) ||
	    xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS)) {
		error = xfs_initialize_perag_data(mp, mp->m_sb.sb_agcount);
		if (error)
			return error;
	}

	/*
	 * Older kernels misused sb_frextents to reflect both incore
	 * reservations made by running transactions and the actual count of
	 * free rt extents in the ondisk metadata. Transactions committed
	 * during runtime can therefore contain a superblock update that
	 * undercounts the number of free rt extents tracked in the rt bitmap.
	 * A clean unmount record will have the correct frextents value since
	 * there can be no other transactions running at that point.
	 *
	 * If we're mounting the rt volume after recovering the log, recompute
	 * frextents from the rtbitmap file to fix the inconsistency.
	 */
	if (xfs_has_realtime(mp) && !xfs_is_clean(mp)) {
		error = xfs_rtalloc_reinit_frextents(mp);
		if (error)
			return error;
	}

	return 0;
}
  486. static void
  487. xfs_unmount_check(
  488. struct xfs_mount *mp)
  489. {
  490. if (xfs_is_shutdown(mp))
  491. return;
  492. if (percpu_counter_sum(&mp->m_ifree) >
  493. percpu_counter_sum(&mp->m_icount)) {
  494. xfs_alert(mp, "ifree/icount mismatch at unmount");
  495. xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS);
  496. }
  497. }
/*
 * Flush and reclaim dirty inodes in preparation for unmount. Inodes and
 * internal inode structures can be sitting in the CIL and AIL at this point,
 * so we need to unpin them, write them back and/or reclaim them before unmount
 * can proceed. In other words, callers are required to have inactivated all
 * inodes.
 *
 * An inode cluster that has been freed can have its buffer still pinned in
 * memory because the transaction is still sitting in a iclog. The stale inodes
 * on that buffer will be pinned to the buffer until the transaction hits the
 * disk and the callbacks run. Pushing the AIL will skip the stale inodes and
 * may never see the pinned buffer, so nothing will push out the iclog and
 * unpin the buffer.
 *
 * Hence we need to force the log to unpin everything first. However, log
 * forces don't wait for the discards they issue to complete, so we have to
 * explicitly wait for them to complete here as well.
 *
 * Then we can tell the world we are unmounting so that error handling knows
 * that the filesystem is going away and we should error out anything that we
 * have been retrying in the background. This will prevent never-ending
 * retries in AIL pushing from hanging the unmount.
 *
 * Finally, we can push the AIL to clean all the remaining dirty objects, then
 * reclaim the remaining inodes that are still in memory at this point in time.
 *
 * NOTE: the call sequence below implements the ordering described above;
 * do not reorder these calls.
 */
static void
xfs_unmount_flush_inodes(
	struct xfs_mount	*mp)
{
	/* Unpin everything held in iclogs, then wait out pending discards. */
	xfs_log_force(mp, XFS_LOG_SYNC);
	xfs_extent_busy_wait_all(mp);
	flush_workqueue(xfs_discard_wq);

	/* Announce unmount so background error retries give up. */
	xfs_set_unmounting(mp);

	/* Clean the AIL, stop background work, and reclaim what remains. */
	xfs_ail_push_all_sync(mp->m_ail);
	xfs_inodegc_stop(mp);
	cancel_delayed_work_sync(&mp->m_reclaim_work);
	xfs_reclaim_inodes(mp);
	xfs_health_unmount(mp);
}
/*
 * Compute the inode geometry for this mount: where the attr fork starts
 * within the inode literal area, and the inode allocation geometry.
 */
static void
xfs_mount_setup_inode_geom(
	struct xfs_mount	*mp)
{
	struct xfs_ino_geometry	*igeo = M_IGEO(mp);

	/* The attr fork offset must fit inside the inode literal area. */
	igeo->attr_fork_offset = xfs_bmap_compute_attr_offset(mp);
	ASSERT(igeo->attr_fork_offset < XFS_LITINO(mp));

	xfs_ialloc_setup_geometry(mp);
}
  547. /* Compute maximum possible height for per-AG btree types for this fs. */
  548. static inline void
  549. xfs_agbtree_compute_maxlevels(
  550. struct xfs_mount *mp)
  551. {
  552. unsigned int levels;
  553. levels = max(mp->m_alloc_maxlevels, M_IGEO(mp)->inobt_maxlevels);
  554. levels = max(levels, mp->m_rmap_maxlevels);
  555. mp->m_agbtree_maxlevels = max(levels, mp->m_refc_maxlevels);
  556. }
  557. /*
  558. * This function does the following on an initial mount of a file system:
  559. * - reads the superblock from disk and init the mount struct
  560. * - if we're a 32-bit kernel, do a size check on the superblock
  561. * so we don't mount terabyte filesystems
  562. * - init mount struct realtime fields
  563. * - allocate inode hash table for fs
  564. * - init directory manager
  565. * - perform recovery and init the log manager
  566. */
  567. int
  568. xfs_mountfs(
  569. struct xfs_mount *mp)
  570. {
  571. struct xfs_sb *sbp = &(mp->m_sb);
  572. struct xfs_inode *rip;
  573. struct xfs_ino_geometry *igeo = M_IGEO(mp);
  574. uint quotamount = 0;
  575. uint quotaflags = 0;
  576. int error = 0;
  577. xfs_sb_mount_common(mp, sbp);
  578. /*
  579. * Check for a mismatched features2 values. Older kernels read & wrote
  580. * into the wrong sb offset for sb_features2 on some platforms due to
  581. * xfs_sb_t not being 64bit size aligned when sb_features2 was added,
  582. * which made older superblock reading/writing routines swap it as a
  583. * 64-bit value.
  584. *
  585. * For backwards compatibility, we make both slots equal.
  586. *
  587. * If we detect a mismatched field, we OR the set bits into the existing
  588. * features2 field in case it has already been modified; we don't want
  589. * to lose any features. We then update the bad location with the ORed
  590. * value so that older kernels will see any features2 flags. The
  591. * superblock writeback code ensures the new sb_features2 is copied to
  592. * sb_bad_features2 before it is logged or written to disk.
  593. */
  594. if (xfs_sb_has_mismatched_features2(sbp)) {
  595. xfs_warn(mp, "correcting sb_features alignment problem");
  596. sbp->sb_features2 |= sbp->sb_bad_features2;
  597. mp->m_update_sb = true;
  598. }
  599. /* always use v2 inodes by default now */
  600. if (!(mp->m_sb.sb_versionnum & XFS_SB_VERSION_NLINKBIT)) {
  601. mp->m_sb.sb_versionnum |= XFS_SB_VERSION_NLINKBIT;
  602. mp->m_features |= XFS_FEAT_NLINK;
  603. mp->m_update_sb = true;
  604. }
  605. /*
  606. * If we were given new sunit/swidth options, do some basic validation
  607. * checks and convert the incore dalign and swidth values to the
  608. * same units (FSB) that everything else uses. This /must/ happen
  609. * before computing the inode geometry.
  610. */
  611. error = xfs_validate_new_dalign(mp);
  612. if (error)
  613. goto out;
  614. xfs_alloc_compute_maxlevels(mp);
  615. xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK);
  616. xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK);
  617. xfs_mount_setup_inode_geom(mp);
  618. xfs_rmapbt_compute_maxlevels(mp);
  619. xfs_refcountbt_compute_maxlevels(mp);
  620. xfs_agbtree_compute_maxlevels(mp);
  621. /*
  622. * Check if sb_agblocks is aligned at stripe boundary. If sb_agblocks
  623. * is NOT aligned turn off m_dalign since allocator alignment is within
  624. * an ag, therefore ag has to be aligned at stripe boundary. Note that
  625. * we must compute the free space and rmap btree geometry before doing
  626. * this.
  627. */
  628. error = xfs_update_alignment(mp);
  629. if (error)
  630. goto out;
  631. /* enable fail_at_unmount as default */
  632. mp->m_fail_unmount = true;
  633. super_set_sysfs_name_id(mp->m_super);
  634. error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype,
  635. NULL, mp->m_super->s_id);
  636. if (error)
  637. goto out;
  638. error = xfs_sysfs_init(&mp->m_stats.xs_kobj, &xfs_stats_ktype,
  639. &mp->m_kobj, "stats");
  640. if (error)
  641. goto out_remove_sysfs;
  642. xchk_stats_register(mp->m_scrub_stats, mp->m_debugfs);
  643. error = xfs_error_sysfs_init(mp);
  644. if (error)
  645. goto out_remove_scrub_stats;
  646. error = xfs_errortag_init(mp);
  647. if (error)
  648. goto out_remove_error_sysfs;
  649. error = xfs_uuid_mount(mp);
  650. if (error)
  651. goto out_remove_errortag;
  652. /*
  653. * Update the preferred write size based on the information from the
  654. * on-disk superblock.
  655. */
  656. mp->m_allocsize_log =
  657. max_t(uint32_t, sbp->sb_blocklog, mp->m_allocsize_log);
  658. mp->m_allocsize_blocks = 1U << (mp->m_allocsize_log - sbp->sb_blocklog);
  659. /* set the low space thresholds for dynamic preallocation */
  660. xfs_set_low_space_thresholds(mp);
  661. /*
  662. * If enabled, sparse inode chunk alignment is expected to match the
  663. * cluster size. Full inode chunk alignment must match the chunk size,
  664. * but that is checked on sb read verification...
  665. */
  666. if (xfs_has_sparseinodes(mp) &&
  667. mp->m_sb.sb_spino_align !=
  668. XFS_B_TO_FSBT(mp, igeo->inode_cluster_size_raw)) {
  669. xfs_warn(mp,
  670. "Sparse inode block alignment (%u) must match cluster size (%llu).",
  671. mp->m_sb.sb_spino_align,
  672. XFS_B_TO_FSBT(mp, igeo->inode_cluster_size_raw));
  673. error = -EINVAL;
  674. goto out_remove_uuid;
  675. }
  676. /*
  677. * Check that the data (and log if separate) is an ok size.
  678. */
  679. error = xfs_check_sizes(mp);
  680. if (error)
  681. goto out_remove_uuid;
  682. /*
  683. * Initialize realtime fields in the mount structure
  684. */
  685. error = xfs_rtmount_init(mp);
  686. if (error) {
  687. xfs_warn(mp, "RT mount failed");
  688. goto out_remove_uuid;
  689. }
  690. /*
  691. * Copies the low order bits of the timestamp and the randomly
  692. * set "sequence" number out of a UUID.
  693. */
  694. mp->m_fixedfsid[0] =
  695. (get_unaligned_be16(&sbp->sb_uuid.b[8]) << 16) |
  696. get_unaligned_be16(&sbp->sb_uuid.b[4]);
  697. mp->m_fixedfsid[1] = get_unaligned_be32(&sbp->sb_uuid.b[0]);
  698. error = xfs_da_mount(mp);
  699. if (error) {
  700. xfs_warn(mp, "Failed dir/attr init: %d", error);
  701. goto out_remove_uuid;
  702. }
  703. /*
  704. * Initialize the precomputed transaction reservations values.
  705. */
  706. xfs_trans_init(mp);
  707. /*
  708. * Allocate and initialize the per-ag data.
  709. */
  710. error = xfs_initialize_perag(mp, 0, sbp->sb_agcount,
  711. mp->m_sb.sb_dblocks, &mp->m_maxagi);
  712. if (error) {
  713. xfs_warn(mp, "Failed per-ag init: %d", error);
  714. goto out_free_dir;
  715. }
  716. if (XFS_IS_CORRUPT(mp, !sbp->sb_logblocks)) {
  717. xfs_warn(mp, "no log defined");
  718. error = -EFSCORRUPTED;
  719. goto out_free_perag;
  720. }
  721. error = xfs_inodegc_register_shrinker(mp);
  722. if (error)
  723. goto out_fail_wait;
  724. /*
  725. * Log's mount-time initialization. The first part of recovery can place
  726. * some items on the AIL, to be handled when recovery is finished or
  727. * cancelled.
  728. */
  729. error = xfs_log_mount(mp, mp->m_logdev_targp,
  730. XFS_FSB_TO_DADDR(mp, sbp->sb_logstart),
  731. XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
  732. if (error) {
  733. xfs_warn(mp, "log mount failed");
  734. goto out_inodegc_shrinker;
  735. }
  736. /*
  737. * If logged xattrs are still enabled after log recovery finishes, then
  738. * they'll be available until unmount. Otherwise, turn them off.
  739. */
  740. if (xfs_sb_version_haslogxattrs(&mp->m_sb))
  741. xfs_set_using_logged_xattrs(mp);
  742. else
  743. xfs_clear_using_logged_xattrs(mp);
  744. /* Enable background inode inactivation workers. */
  745. xfs_inodegc_start(mp);
  746. xfs_blockgc_start(mp);
  747. /*
  748. * Now that we've recovered any pending superblock feature bit
  749. * additions, we can finish setting up the attr2 behaviour for the
  750. * mount. The noattr2 option overrides the superblock flag, so only
  751. * check the superblock feature flag if the mount option is not set.
  752. */
  753. if (xfs_has_noattr2(mp)) {
  754. mp->m_features &= ~XFS_FEAT_ATTR2;
  755. } else if (!xfs_has_attr2(mp) &&
  756. (mp->m_sb.sb_features2 & XFS_SB_VERSION2_ATTR2BIT)) {
  757. mp->m_features |= XFS_FEAT_ATTR2;
  758. }
  759. /*
  760. * Get and sanity-check the root inode.
  761. * Save the pointer to it in the mount structure.
  762. */
  763. error = xfs_iget(mp, NULL, sbp->sb_rootino, XFS_IGET_UNTRUSTED,
  764. XFS_ILOCK_EXCL, &rip);
  765. if (error) {
  766. xfs_warn(mp,
  767. "Failed to read root inode 0x%llx, error %d",
  768. sbp->sb_rootino, -error);
  769. goto out_log_dealloc;
  770. }
  771. ASSERT(rip != NULL);
  772. if (XFS_IS_CORRUPT(mp, !S_ISDIR(VFS_I(rip)->i_mode))) {
  773. xfs_warn(mp, "corrupted root inode %llu: not a directory",
  774. (unsigned long long)rip->i_ino);
  775. xfs_iunlock(rip, XFS_ILOCK_EXCL);
  776. error = -EFSCORRUPTED;
  777. goto out_rele_rip;
  778. }
  779. mp->m_rootip = rip; /* save it */
  780. xfs_iunlock(rip, XFS_ILOCK_EXCL);
  781. /*
  782. * Initialize realtime inode pointers in the mount structure
  783. */
  784. error = xfs_rtmount_inodes(mp);
  785. if (error) {
  786. /*
  787. * Free up the root inode.
  788. */
  789. xfs_warn(mp, "failed to read RT inodes");
  790. goto out_rele_rip;
  791. }
  792. /* Make sure the summary counts are ok. */
  793. error = xfs_check_summary_counts(mp);
  794. if (error)
  795. goto out_rtunmount;
  796. /*
  797. * If this is a read-only mount defer the superblock updates until
  798. * the next remount into writeable mode. Otherwise we would never
  799. * perform the update e.g. for the root filesystem.
  800. */
  801. if (mp->m_update_sb && !xfs_is_readonly(mp)) {
  802. error = xfs_sync_sb(mp, false);
  803. if (error) {
  804. xfs_warn(mp, "failed to write sb changes");
  805. goto out_rtunmount;
  806. }
  807. }
  808. /*
  809. * Initialise the XFS quota management subsystem for this mount
  810. */
  811. if (XFS_IS_QUOTA_ON(mp)) {
  812. error = xfs_qm_newmount(mp, &quotamount, &quotaflags);
  813. if (error)
  814. goto out_rtunmount;
  815. } else {
  816. /*
  817. * If a file system had quotas running earlier, but decided to
  818. * mount without -o uquota/pquota/gquota options, revoke the
  819. * quotachecked license.
  820. */
  821. if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) {
  822. xfs_notice(mp, "resetting quota flags");
  823. error = xfs_mount_reset_sbqflags(mp);
  824. if (error)
  825. goto out_rtunmount;
  826. }
  827. }
  828. /*
  829. * Finish recovering the file system. This part needed to be delayed
  830. * until after the root and real-time bitmap inodes were consistently
  831. * read in. Temporarily create per-AG space reservations for metadata
  832. * btree shape changes because space freeing transactions (for inode
  833. * inactivation) require the per-AG reservation in lieu of reserving
  834. * blocks.
  835. */
  836. error = xfs_fs_reserve_ag_blocks(mp);
  837. if (error && error == -ENOSPC)
  838. xfs_warn(mp,
  839. "ENOSPC reserving per-AG metadata pool, log recovery may fail.");
  840. error = xfs_log_mount_finish(mp);
  841. xfs_fs_unreserve_ag_blocks(mp);
  842. if (error) {
  843. xfs_warn(mp, "log mount finish failed");
  844. goto out_rtunmount;
  845. }
  846. /*
  847. * Now the log is fully replayed, we can transition to full read-only
  848. * mode for read-only mounts. This will sync all the metadata and clean
  849. * the log so that the recovery we just performed does not have to be
  850. * replayed again on the next mount.
  851. *
  852. * We use the same quiesce mechanism as the rw->ro remount, as they are
  853. * semantically identical operations.
  854. */
  855. if (xfs_is_readonly(mp) && !xfs_has_norecovery(mp))
  856. xfs_log_clean(mp);
  857. /*
  858. * Complete the quota initialisation, post-log-replay component.
  859. */
  860. if (quotamount) {
  861. ASSERT(mp->m_qflags == 0);
  862. mp->m_qflags = quotaflags;
  863. xfs_qm_mount_quotas(mp);
  864. }
  865. /*
  866. * Now we are mounted, reserve a small amount of unused space for
  867. * privileged transactions. This is needed so that transaction
  868. * space required for critical operations can dip into this pool
  869. * when at ENOSPC. This is needed for operations like create with
  870. * attr, unwritten extent conversion at ENOSPC, etc. Data allocations
  871. * are not allowed to use this reserved space.
  872. *
  873. * This may drive us straight to ENOSPC on mount, but that implies
  874. * we were already there on the last unmount. Warn if this occurs.
  875. */
  876. if (!xfs_is_readonly(mp)) {
  877. error = xfs_reserve_blocks(mp, xfs_default_resblks(mp));
  878. if (error)
  879. xfs_warn(mp,
  880. "Unable to allocate reserve blocks. Continuing without reserve pool.");
  881. /* Reserve AG blocks for future btree expansion. */
  882. error = xfs_fs_reserve_ag_blocks(mp);
  883. if (error && error != -ENOSPC)
  884. goto out_agresv;
  885. }
  886. return 0;
  887. out_agresv:
  888. xfs_fs_unreserve_ag_blocks(mp);
  889. xfs_qm_unmount_quotas(mp);
  890. out_rtunmount:
  891. xfs_rtunmount_inodes(mp);
  892. out_rele_rip:
  893. xfs_irele(rip);
  894. /* Clean out dquots that might be in memory after quotacheck. */
  895. xfs_qm_unmount(mp);
  896. /*
  897. * Inactivate all inodes that might still be in memory after a log
  898. * intent recovery failure so that reclaim can free them. Metadata
  899. * inodes and the root directory shouldn't need inactivation, but the
  900. * mount failed for some reason, so pull down all the state and flee.
  901. */
  902. xfs_inodegc_flush(mp);
  903. /*
  904. * Flush all inode reclamation work and flush the log.
  905. * We have to do this /after/ rtunmount and qm_unmount because those
  906. * two will have scheduled delayed reclaim for the rt/quota inodes.
  907. *
  908. * This is slightly different from the unmountfs call sequence
  909. * because we could be tearing down a partially set up mount. In
  910. * particular, if log_mount_finish fails we bail out without calling
  911. * qm_unmount_quotas and therefore rely on qm_unmount to release the
  912. * quota inodes.
  913. */
  914. xfs_unmount_flush_inodes(mp);
  915. out_log_dealloc:
  916. xfs_log_mount_cancel(mp);
  917. out_inodegc_shrinker:
  918. shrinker_free(mp->m_inodegc_shrinker);
  919. out_fail_wait:
  920. if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
  921. xfs_buftarg_drain(mp->m_logdev_targp);
  922. xfs_buftarg_drain(mp->m_ddev_targp);
  923. out_free_perag:
  924. xfs_free_perag_range(mp, 0, mp->m_sb.sb_agcount);
  925. out_free_dir:
  926. xfs_da_unmount(mp);
  927. out_remove_uuid:
  928. xfs_uuid_unmount(mp);
  929. out_remove_errortag:
  930. xfs_errortag_del(mp);
  931. out_remove_error_sysfs:
  932. xfs_error_sysfs_del(mp);
  933. out_remove_scrub_stats:
  934. xchk_stats_unregister(mp->m_scrub_stats);
  935. xfs_sysfs_del(&mp->m_stats.xs_kobj);
  936. out_remove_sysfs:
  937. xfs_sysfs_del(&mp->m_kobj);
  938. out:
  939. return error;
  940. }
  941. /*
  942. * This flushes out the inodes,dquots and the superblock, unmounts the
  943. * log and makes sure that incore structures are freed.
  944. */
void
xfs_unmountfs(
	struct xfs_mount	*mp)
{
	int			error;

	/*
	 * Perform all on-disk metadata updates required to inactivate inodes
	 * that the VFS evicted earlier in the unmount process. Freeing inodes
	 * and discarding CoW fork preallocations can cause shape changes to
	 * the free inode and refcount btrees, respectively, so we must finish
	 * this before we discard the metadata space reservations. Metadata
	 * inodes and the root directory do not require inactivation.
	 */
	xfs_inodegc_flush(mp);

	/* Stop background block GC before tearing down space reservations. */
	xfs_blockgc_stop(mp);
	xfs_fs_unreserve_ag_blocks(mp);
	xfs_qm_unmount_quotas(mp);
	xfs_rtunmount_inodes(mp);
	/* Drop the mount's reference to the root inode. */
	xfs_irele(mp->m_rootip);

	/*
	 * Flush remaining inode reclamation work after the rt/quota teardown
	 * above, which can schedule delayed reclaim for their inodes.
	 */
	xfs_unmount_flush_inodes(mp);

	xfs_qm_unmount(mp);

	/*
	 * Unreserve any blocks we have so that when we unmount we don't account
	 * the reserved free space as used. This is really only necessary for
	 * lazy superblock counting because it trusts the incore superblock
	 * counters to be absolutely correct on clean unmount.
	 *
	 * We don't bother correcting this elsewhere for lazy superblock
	 * counting because on mount of an unclean filesystem we reconstruct the
	 * correct counter value and this is irrelevant.
	 *
	 * For non-lazy counter filesystems, this doesn't matter at all because
	 * we only every apply deltas to the superblock and hence the incore
	 * value does not matter....
	 */
	error = xfs_reserve_blocks(mp, 0);
	if (error)
		xfs_warn(mp, "Unable to free reserved block pool. "
				"Freespace may not be correct on next mount.");

	xfs_unmount_check(mp);

	/*
	 * Indicate that it's ok to clear log incompat bits before cleaning
	 * the log and writing the unmount record.
	 */
	xfs_set_done_with_log_incompat(mp);
	xfs_log_unmount(mp);
	xfs_da_unmount(mp);
	xfs_uuid_unmount(mp);

#if defined(DEBUG)
	xfs_errortag_clearall(mp);
#endif
	/* Release mount-time allocations, roughly reverse order of setup. */
	shrinker_free(mp->m_inodegc_shrinker);
	xfs_free_perag_range(mp, 0, mp->m_sb.sb_agcount);
	xfs_errortag_del(mp);
	xfs_error_sysfs_del(mp);
	xchk_stats_unregister(mp->m_scrub_stats);
	xfs_sysfs_del(&mp->m_stats.xs_kobj);
	xfs_sysfs_del(&mp->m_kobj);
}
  1004. /*
  1005. * Determine whether modifications can proceed. The caller specifies the minimum
  1006. * freeze level for which modifications should not be allowed. This allows
  1007. * certain operations to proceed while the freeze sequence is in progress, if
  1008. * necessary.
  1009. */
  1010. bool
  1011. xfs_fs_writable(
  1012. struct xfs_mount *mp,
  1013. int level)
  1014. {
  1015. ASSERT(level > SB_UNFROZEN);
  1016. if ((mp->m_super->s_writers.frozen >= level) ||
  1017. xfs_is_shutdown(mp) || xfs_is_readonly(mp))
  1018. return false;
  1019. return true;
  1020. }
  1021. void
  1022. xfs_add_freecounter(
  1023. struct xfs_mount *mp,
  1024. struct percpu_counter *counter,
  1025. uint64_t delta)
  1026. {
  1027. bool has_resv_pool = (counter == &mp->m_fdblocks);
  1028. uint64_t res_used;
  1029. /*
  1030. * If the reserve pool is depleted, put blocks back into it first.
  1031. * Most of the time the pool is full.
  1032. */
  1033. if (!has_resv_pool || mp->m_resblks == mp->m_resblks_avail) {
  1034. percpu_counter_add(counter, delta);
  1035. return;
  1036. }
  1037. spin_lock(&mp->m_sb_lock);
  1038. res_used = mp->m_resblks - mp->m_resblks_avail;
  1039. if (res_used > delta) {
  1040. mp->m_resblks_avail += delta;
  1041. } else {
  1042. delta -= res_used;
  1043. mp->m_resblks_avail = mp->m_resblks;
  1044. percpu_counter_add(counter, delta);
  1045. }
  1046. spin_unlock(&mp->m_sb_lock);
  1047. }
/*
 * Decrement a free space counter (free data blocks or free rt extents) by
 * @delta, returning -ENOSPC instead of letting the counter go below the
 * unusable threshold.  If @rsvd is set (data blocks only), the reserved
 * block pool may be dipped into before giving up.
 */
int
xfs_dec_freecounter(
	struct xfs_mount	*mp,
	struct percpu_counter	*counter,
	uint64_t		delta,
	bool			rsvd)
{
	int64_t			lcounter;
	uint64_t		set_aside = 0;
	s32			batch;
	bool			has_resv_pool;

	ASSERT(counter == &mp->m_fdblocks || counter == &mp->m_frextents);
	/* Only the data block counter is backed by the reserve pool. */
	has_resv_pool = (counter == &mp->m_fdblocks);
	if (rsvd)
		ASSERT(has_resv_pool);

	/*
	 * Taking blocks away, need to be more accurate the closer we
	 * are to zero.
	 *
	 * If the counter has a value of less than 2 * max batch size,
	 * then make everything serialise as we are real close to
	 * ENOSPC.
	 */
	if (__percpu_counter_compare(counter, 2 * XFS_FDBLOCKS_BATCH,
				     XFS_FDBLOCKS_BATCH) < 0)
		batch = 1;
	else
		batch = XFS_FDBLOCKS_BATCH;

	/*
	 * Set aside allocbt blocks because these blocks are tracked as free
	 * space but not available for allocation. Technically this means that a
	 * single reservation cannot consume all remaining free space, but the
	 * ratio of allocbt blocks to usable free blocks should be rather small.
	 * The tradeoff without this is that filesystems that maintain high
	 * perag block reservations can over reserve physical block availability
	 * and fail physical allocation, which leads to much more serious
	 * problems (i.e. transaction abort, pagecache discards, etc.) than
	 * slightly premature -ENOSPC.
	 */
	if (has_resv_pool)
		set_aside = xfs_fdblocks_unavailable(mp);
	/* Speculatively subtract; we undo this below if it overdrew. */
	percpu_counter_add_batch(counter, -((int64_t)delta), batch);
	if (__percpu_counter_compare(counter, set_aside,
			XFS_FDBLOCKS_BATCH) >= 0) {
		/* we had space! */
		return 0;
	}

	/*
	 * lock up the sb for dipping into reserves before releasing the space
	 * that took us to ENOSPC.
	 */
	spin_lock(&mp->m_sb_lock);
	/* Undo the speculative subtraction while holding the lock. */
	percpu_counter_add(counter, delta);
	if (!has_resv_pool || !rsvd)
		goto fdblocks_enospc;

	/* Try to satisfy the request from the reserved block pool. */
	lcounter = (long long)mp->m_resblks_avail - delta;
	if (lcounter >= 0) {
		mp->m_resblks_avail = lcounter;
		spin_unlock(&mp->m_sb_lock);
		return 0;
	}
	xfs_warn_once(mp,
"Reserve blocks depleted! Consider increasing reserve pool size.");

fdblocks_enospc:
	spin_unlock(&mp->m_sb_lock);
	return -ENOSPC;
}
  1115. /*
  1116. * Used to free the superblock along various error paths.
  1117. */
  1118. void
  1119. xfs_freesb(
  1120. struct xfs_mount *mp)
  1121. {
  1122. struct xfs_buf *bp = mp->m_sb_bp;
  1123. xfs_buf_lock(bp);
  1124. mp->m_sb_bp = NULL;
  1125. xfs_buf_relse(bp);
  1126. }
  1127. /*
  1128. * If the underlying (data/log/rt) device is readonly, there are some
  1129. * operations that cannot proceed.
  1130. */
  1131. int
  1132. xfs_dev_is_read_only(
  1133. struct xfs_mount *mp,
  1134. char *message)
  1135. {
  1136. if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
  1137. xfs_readonly_buftarg(mp->m_logdev_targp) ||
  1138. (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
  1139. xfs_notice(mp, "%s required on read-only device.", message);
  1140. xfs_notice(mp, "write access unavailable, cannot proceed.");
  1141. return -EROFS;
  1142. }
  1143. return 0;
  1144. }
  1145. /* Force the summary counters to be recalculated at next mount. */
  1146. void
  1147. xfs_force_summary_recalc(
  1148. struct xfs_mount *mp)
  1149. {
  1150. if (!xfs_has_lazysbcount(mp))
  1151. return;
  1152. xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS);
  1153. }
  1154. /*
  1155. * Enable a log incompat feature flag in the primary superblock. The caller
  1156. * cannot have any other transactions in progress.
  1157. */
int
xfs_add_incompat_log_feature(
	struct xfs_mount	*mp,
	uint32_t		feature)
{
	struct xfs_dsb		*dsb;
	int			error;

	/* Exactly one known log-incompat bit may be set per call. */
	ASSERT(hweight32(feature) == 1);
	ASSERT(!(feature & XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));

	/*
	 * Force the log to disk and kick the background AIL thread to reduce
	 * the chances that the bwrite will stall waiting for the AIL to unpin
	 * the primary superblock buffer. This isn't a data integrity
	 * operation, so we don't need a synchronous push.
	 */
	error = xfs_log_force(mp, XFS_LOG_SYNC);
	if (error)
		return error;
	xfs_ail_push_all(mp->m_ail);

	/*
	 * Lock the primary superblock buffer to serialize all callers that
	 * are trying to set feature bits.
	 */
	xfs_buf_lock(mp->m_sb_bp);
	xfs_buf_hold(mp->m_sb_bp);

	if (xfs_is_shutdown(mp)) {
		error = -EIO;
		goto rele;
	}

	/* Nothing to do if another caller already set this feature bit. */
	if (xfs_sb_has_incompat_log_feature(&mp->m_sb, feature))
		goto rele;

	/*
	 * Write the primary superblock to disk immediately, because we need
	 * the log_incompat bit to be set in the primary super now to protect
	 * the log items that we're going to commit later.
	 */
	dsb = mp->m_sb_bp->b_addr;
	xfs_sb_to_disk(dsb, &mp->m_sb);
	dsb->sb_features_log_incompat |= cpu_to_be32(feature);
	error = xfs_bwrite(mp->m_sb_bp);
	if (error)
		goto shutdown;

	/*
	 * Add the feature bits to the incore superblock before we unlock the
	 * buffer.
	 */
	xfs_sb_add_incompat_log_features(&mp->m_sb, feature);
	xfs_buf_relse(mp->m_sb_bp);

	/* Log the superblock to disk. */
	return xfs_sync_sb(mp, false);
shutdown:
	/* Failing to write the primary super is fatal to consistency. */
	xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
rele:
	xfs_buf_relse(mp->m_sb_bp);
	return error;
}
  1214. /*
  1215. * Clear all the log incompat flags from the superblock.
  1216. *
  1217. * The caller cannot be in a transaction, must ensure that the log does not
  1218. * contain any log items protected by any log incompat bit, and must ensure
  1219. * that there are no other threads that depend on the state of the log incompat
  1220. * feature flags in the primary super.
  1221. *
  1222. * Returns true if the superblock is dirty.
  1223. */
  1224. bool
  1225. xfs_clear_incompat_log_features(
  1226. struct xfs_mount *mp)
  1227. {
  1228. bool ret = false;
  1229. if (!xfs_has_crc(mp) ||
  1230. !xfs_sb_has_incompat_log_feature(&mp->m_sb,
  1231. XFS_SB_FEAT_INCOMPAT_LOG_ALL) ||
  1232. xfs_is_shutdown(mp) ||
  1233. !xfs_is_done_with_log_incompat(mp))
  1234. return false;
  1235. /*
  1236. * Update the incore superblock. We synchronize on the primary super
  1237. * buffer lock to be consistent with the add function, though at least
  1238. * in theory this shouldn't be necessary.
  1239. */
  1240. xfs_buf_lock(mp->m_sb_bp);
  1241. xfs_buf_hold(mp->m_sb_bp);
  1242. if (xfs_sb_has_incompat_log_feature(&mp->m_sb,
  1243. XFS_SB_FEAT_INCOMPAT_LOG_ALL)) {
  1244. xfs_sb_remove_incompat_log_features(&mp->m_sb);
  1245. ret = true;
  1246. }
  1247. xfs_buf_relse(mp->m_sb_bp);
  1248. return ret;
  1249. }
  1250. /*
  1251. * Update the in-core delayed block counter.
  1252. *
  1253. * We prefer to update the counter without having to take a spinlock for every
  1254. * counter update (i.e. batching). Each change to delayed allocation
  1255. * reservations can change can easily exceed the default percpu counter
  1256. * batching, so we use a larger batch factor here.
  1257. *
  1258. * Note that we don't currently have any callers requiring fast summation
  1259. * (e.g. percpu_counter_read) so we can use a big batch value here.
  1260. */
  1261. #define XFS_DELALLOC_BATCH (4096)
  1262. void
  1263. xfs_mod_delalloc(
  1264. struct xfs_inode *ip,
  1265. int64_t data_delta,
  1266. int64_t ind_delta)
  1267. {
  1268. struct xfs_mount *mp = ip->i_mount;
  1269. if (XFS_IS_REALTIME_INODE(ip)) {
  1270. percpu_counter_add_batch(&mp->m_delalloc_rtextents,
  1271. xfs_rtb_to_rtx(mp, data_delta),
  1272. XFS_DELALLOC_BATCH);
  1273. if (!ind_delta)
  1274. return;
  1275. data_delta = 0;
  1276. }
  1277. percpu_counter_add_batch(&mp->m_delalloc_blks, data_delta + ind_delta,
  1278. XFS_DELALLOC_BATCH);
  1279. }