backing-dev.c

// SPDX-License-Identifier: GPL-2.0-only

#include <linux/blkdev.h>
#include <linux/wait.h>
#include <linux/rbtree.h>
#include <linux/kthread.h>
#include <linux/backing-dev.h>
#include <linux/blk-cgroup.h>
#include <linux/freezer.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/writeback.h>
#include <linux/device.h>
#include <trace/events/writeback.h>
#include "internal.h"

struct backing_dev_info noop_backing_dev_info;
EXPORT_SYMBOL_GPL(noop_backing_dev_info);

static const char *bdi_unknown_name = "(unknown)";

/*
 * bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU
 * reader side locking.
 */
DEFINE_SPINLOCK(bdi_lock);
static u64 bdi_id_cursor;
static struct rb_root bdi_tree = RB_ROOT;
LIST_HEAD(bdi_list);

/* bdi_wq serves all asynchronous writeback tasks */
struct workqueue_struct *bdi_wq;

#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#include <linux/seq_file.h>

struct wb_stats {
	unsigned long nr_dirty;
	unsigned long nr_io;
	unsigned long nr_more_io;
	unsigned long nr_dirty_time;
	unsigned long nr_writeback;
	unsigned long nr_reclaimable;
	unsigned long nr_dirtied;
	unsigned long nr_written;
	unsigned long dirty_thresh;
	unsigned long wb_thresh;
};

static struct dentry *bdi_debug_root;

static void bdi_debug_init(void)
{
	bdi_debug_root = debugfs_create_dir("bdi", NULL);
}

static void collect_wb_stats(struct wb_stats *stats,
			     struct bdi_writeback *wb)
{
	struct inode *inode;

	spin_lock(&wb->list_lock);
	list_for_each_entry(inode, &wb->b_dirty, i_io_list)
		stats->nr_dirty++;
	list_for_each_entry(inode, &wb->b_io, i_io_list)
		stats->nr_io++;
	list_for_each_entry(inode, &wb->b_more_io, i_io_list)
		stats->nr_more_io++;
	list_for_each_entry(inode, &wb->b_dirty_time, i_io_list)
		if (inode->i_state & I_DIRTY_TIME)
			stats->nr_dirty_time++;
	spin_unlock(&wb->list_lock);

	stats->nr_writeback += wb_stat(wb, WB_WRITEBACK);
	stats->nr_reclaimable += wb_stat(wb, WB_RECLAIMABLE);
	stats->nr_dirtied += wb_stat(wb, WB_DIRTIED);
	stats->nr_written += wb_stat(wb, WB_WRITTEN);
	stats->wb_thresh += wb_calc_thresh(wb, stats->dirty_thresh);
}

#ifdef CONFIG_CGROUP_WRITEBACK
static void bdi_collect_stats(struct backing_dev_info *bdi,
			      struct wb_stats *stats)
{
	struct bdi_writeback *wb;

	rcu_read_lock();
	list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) {
		if (!wb_tryget(wb))
			continue;

		collect_wb_stats(stats, wb);
		wb_put(wb);
	}
	rcu_read_unlock();
}
#else
static void bdi_collect_stats(struct backing_dev_info *bdi,
			      struct wb_stats *stats)
{
	collect_wb_stats(stats, &bdi->wb);
}
#endif

static int bdi_debug_stats_show(struct seq_file *m, void *v)
{
	struct backing_dev_info *bdi = m->private;
	unsigned long background_thresh;
	unsigned long dirty_thresh;
	struct wb_stats stats;
	unsigned long tot_bw;

	global_dirty_limits(&background_thresh, &dirty_thresh);

	memset(&stats, 0, sizeof(stats));
	stats.dirty_thresh = dirty_thresh;
	bdi_collect_stats(bdi, &stats);
	tot_bw = atomic_long_read(&bdi->tot_write_bandwidth);

	seq_printf(m,
		   "BdiWriteback:       %10lu kB\n"
		   "BdiReclaimable:     %10lu kB\n"
		   "BdiDirtyThresh:     %10lu kB\n"
		   "DirtyThresh:        %10lu kB\n"
		   "BackgroundThresh:   %10lu kB\n"
		   "BdiDirtied:         %10lu kB\n"
		   "BdiWritten:         %10lu kB\n"
		   "BdiWriteBandwidth:  %10lu kBps\n"
		   "b_dirty:            %10lu\n"
		   "b_io:               %10lu\n"
		   "b_more_io:          %10lu\n"
		   "b_dirty_time:       %10lu\n"
		   "bdi_list:           %10u\n"
		   "state:              %10lx\n",
		   K(stats.nr_writeback),
		   K(stats.nr_reclaimable),
		   K(stats.wb_thresh),
		   K(dirty_thresh),
		   K(background_thresh),
		   K(stats.nr_dirtied),
		   K(stats.nr_written),
		   K(tot_bw),
		   stats.nr_dirty,
		   stats.nr_io,
		   stats.nr_more_io,
		   stats.nr_dirty_time,
		   !list_empty(&bdi->bdi_list), bdi->wb.state);

	return 0;
}
DEFINE_SHOW_ATTRIBUTE(bdi_debug_stats);

static void wb_stats_show(struct seq_file *m, struct bdi_writeback *wb,
			  struct wb_stats *stats)
{
	seq_printf(m,
		   "WbCgIno:           %10lu\n"
		   "WbWriteback:       %10lu kB\n"
		   "WbReclaimable:     %10lu kB\n"
		   "WbDirtyThresh:     %10lu kB\n"
		   "WbDirtied:         %10lu kB\n"
		   "WbWritten:         %10lu kB\n"
		   "WbWriteBandwidth:  %10lu kBps\n"
		   "b_dirty:           %10lu\n"
		   "b_io:              %10lu\n"
		   "b_more_io:         %10lu\n"
		   "b_dirty_time:      %10lu\n"
		   "state:             %10lx\n\n",
#ifdef CONFIG_CGROUP_WRITEBACK
		   cgroup_ino(wb->memcg_css->cgroup),
#else
		   1ul,
#endif
		   K(stats->nr_writeback),
		   K(stats->nr_reclaimable),
		   K(stats->wb_thresh),
		   K(stats->nr_dirtied),
		   K(stats->nr_written),
		   K(wb->avg_write_bandwidth),
		   stats->nr_dirty,
		   stats->nr_io,
		   stats->nr_more_io,
		   stats->nr_dirty_time,
		   wb->state);
}

static int cgwb_debug_stats_show(struct seq_file *m, void *v)
{
	struct backing_dev_info *bdi = m->private;
	unsigned long background_thresh;
	unsigned long dirty_thresh;
	struct bdi_writeback *wb;

	global_dirty_limits(&background_thresh, &dirty_thresh);

	rcu_read_lock();
	list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) {
		struct wb_stats stats = { .dirty_thresh = dirty_thresh };

		if (!wb_tryget(wb))
			continue;

		collect_wb_stats(&stats, wb);

		/*
		 * Calculate thresh of wb in writeback cgroup which is min of
		 * thresh in global domain and thresh in cgroup domain. Drop
		 * rcu lock because cgwb_calc_thresh may sleep in
		 * cgroup_rstat_flush. We can do so here because we have a ref.
		 */
		if (mem_cgroup_wb_domain(wb)) {
			rcu_read_unlock();
			stats.wb_thresh = min(stats.wb_thresh, cgwb_calc_thresh(wb));
			rcu_read_lock();
		}

		wb_stats_show(m, wb, &stats);

		wb_put(wb);
	}
	rcu_read_unlock();

	return 0;
}
DEFINE_SHOW_ATTRIBUTE(cgwb_debug_stats);

static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
{
	bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);

	debugfs_create_file("stats", 0444, bdi->debug_dir, bdi,
			    &bdi_debug_stats_fops);
	debugfs_create_file("wb_stats", 0444, bdi->debug_dir, bdi,
			    &cgwb_debug_stats_fops);
}
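
/*
 * Illustrative note (editor's sketch, not from the original file): with
 * debugfs mounted at the usual /sys/kernel/debug, the files created above
 * appear as /sys/kernel/debug/bdi/<bdi-name>/stats and .../wb_stats (the
 * per-cgroup writeback breakdown), where <bdi-name> is dev_name() of the
 * bdi device registered in bdi_register_va() below.
 */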

static void bdi_debug_unregister(struct backing_dev_info *bdi)
{
	debugfs_remove_recursive(bdi->debug_dir);
}
#else	/* CONFIG_DEBUG_FS */
static inline void bdi_debug_init(void)
{
}
static inline void bdi_debug_register(struct backing_dev_info *bdi,
				      const char *name)
{
}
static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
{
}
#endif	/* CONFIG_DEBUG_FS */

static ssize_t read_ahead_kb_store(struct device *dev,
				   struct device_attribute *attr,
				   const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned long read_ahead_kb;
	ssize_t ret;

	ret = kstrtoul(buf, 10, &read_ahead_kb);
	if (ret < 0)
		return ret;

	bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10);

	return count;
}

#define BDI_SHOW(name, expr)						\
static ssize_t name##_show(struct device *dev,				\
			   struct device_attribute *attr, char *buf)	\
{									\
	struct backing_dev_info *bdi = dev_get_drvdata(dev);		\
									\
	return sysfs_emit(buf, "%lld\n", (long long)expr);		\
}									\
static DEVICE_ATTR_RW(name);

BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))
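
/*
 * Worked example (editor's sketch, assuming 4 KiB pages, PAGE_SHIFT == 12):
 * writing "128" to read_ahead_kb stores 128 >> (12 - 10) = 32 pages in
 * ->ra_pages, and the show side converts back with K(32) = 32 << 2 = 128 kB.
 */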

static ssize_t min_ratio_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned int ratio;
	ssize_t ret;

	ret = kstrtouint(buf, 10, &ratio);
	if (ret < 0)
		return ret;

	ret = bdi_set_min_ratio(bdi, ratio);
	if (!ret)
		ret = count;

	return ret;
}
BDI_SHOW(min_ratio, bdi->min_ratio / BDI_RATIO_SCALE)

static ssize_t min_ratio_fine_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned int ratio;
	ssize_t ret;

	ret = kstrtouint(buf, 10, &ratio);
	if (ret < 0)
		return ret;

	ret = bdi_set_min_ratio_no_scale(bdi, ratio);
	if (!ret)
		ret = count;

	return ret;
}
BDI_SHOW(min_ratio_fine, bdi->min_ratio)

static ssize_t max_ratio_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned int ratio;
	ssize_t ret;

	ret = kstrtouint(buf, 10, &ratio);
	if (ret < 0)
		return ret;

	ret = bdi_set_max_ratio(bdi, ratio);
	if (!ret)
		ret = count;

	return ret;
}
BDI_SHOW(max_ratio, bdi->max_ratio / BDI_RATIO_SCALE)

static ssize_t max_ratio_fine_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned int ratio;
	ssize_t ret;

	ret = kstrtouint(buf, 10, &ratio);
	if (ret < 0)
		return ret;

	ret = bdi_set_max_ratio_no_scale(bdi, ratio);
	if (!ret)
		ret = count;

	return ret;
}
BDI_SHOW(max_ratio_fine, bdi->max_ratio)

static ssize_t min_bytes_show(struct device *dev,
			      struct device_attribute *attr,
			      char *buf)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);

	return sysfs_emit(buf, "%llu\n", bdi_get_min_bytes(bdi));
}

static ssize_t min_bytes_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	u64 bytes;
	ssize_t ret;

	ret = kstrtoull(buf, 10, &bytes);
	if (ret < 0)
		return ret;

	ret = bdi_set_min_bytes(bdi, bytes);
	if (!ret)
		ret = count;

	return ret;
}
static DEVICE_ATTR_RW(min_bytes);

static ssize_t max_bytes_show(struct device *dev,
			      struct device_attribute *attr,
			      char *buf)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);

	return sysfs_emit(buf, "%llu\n", bdi_get_max_bytes(bdi));
}

static ssize_t max_bytes_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	u64 bytes;
	ssize_t ret;

	ret = kstrtoull(buf, 10, &bytes);
	if (ret < 0)
		return ret;

	ret = bdi_set_max_bytes(bdi, bytes);
	if (!ret)
		ret = count;

	return ret;
}
static DEVICE_ATTR_RW(max_bytes);

static ssize_t stable_pages_required_show(struct device *dev,
					  struct device_attribute *attr,
					  char *buf)
{
	dev_warn_once(dev,
		"the stable_pages_required attribute has been removed. Use the stable_writes queue attribute instead.\n");
	return sysfs_emit(buf, "%d\n", 0);
}
static DEVICE_ATTR_RO(stable_pages_required);

static ssize_t strict_limit_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned int strict_limit;
	ssize_t ret;

	ret = kstrtouint(buf, 10, &strict_limit);
	if (ret < 0)
		return ret;

	ret = bdi_set_strict_limit(bdi, strict_limit);
	if (!ret)
		ret = count;

	return ret;
}

static ssize_t strict_limit_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);

	return sysfs_emit(buf, "%d\n",
			!!(bdi->capabilities & BDI_CAP_STRICTLIMIT));
}
static DEVICE_ATTR_RW(strict_limit);

static struct attribute *bdi_dev_attrs[] = {
	&dev_attr_read_ahead_kb.attr,
	&dev_attr_min_ratio.attr,
	&dev_attr_min_ratio_fine.attr,
	&dev_attr_max_ratio.attr,
	&dev_attr_max_ratio_fine.attr,
	&dev_attr_min_bytes.attr,
	&dev_attr_max_bytes.attr,
	&dev_attr_stable_pages_required.attr,
	&dev_attr_strict_limit.attr,
	NULL,
};
ATTRIBUTE_GROUPS(bdi_dev);

static const struct class bdi_class = {
	.name		= "bdi",
	.dev_groups	= bdi_dev_groups,
};
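
/*
 * Illustrative note (editor's sketch): devices registered in this class
 * expose the attributes above under sysfs, e.g.
 * /sys/class/bdi/<bdi-name>/read_ahead_kb or .../min_ratio, where
 * <bdi-name> is the name the driver passes to bdi_register().
 */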

static __init int bdi_class_init(void)
{
	int ret;

	ret = class_register(&bdi_class);
	if (ret)
		return ret;

	bdi_debug_init();

	return 0;
}
postcore_initcall(bdi_class_init);

static int __init default_bdi_init(void)
{
	bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_UNBOUND |
				 WQ_SYSFS, 0);
	if (!bdi_wq)
		return -ENOMEM;
	return 0;
}
subsys_initcall(default_bdi_init);

static void wb_update_bandwidth_workfn(struct work_struct *work)
{
	struct bdi_writeback *wb = container_of(to_delayed_work(work),
						struct bdi_writeback, bw_dwork);

	wb_update_bandwidth(wb);
}

/*
 * Initial write bandwidth: 100 MB/s
 */
#define INIT_BW		(100 << (20 - PAGE_SHIFT))
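
/*
 * Worked example (editor's sketch, assuming 4 KiB pages, PAGE_SHIFT == 12):
 * INIT_BW = 100 << (20 - 12) = 25600 pages per second, i.e. the 100 MB/s
 * starting estimate expressed in pages, used below until real measurements
 * replace it.
 */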

static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
		   gfp_t gfp)
{
	int err;

	memset(wb, 0, sizeof(*wb));

	wb->bdi = bdi;
	wb->last_old_flush = jiffies;
	INIT_LIST_HEAD(&wb->b_dirty);
	INIT_LIST_HEAD(&wb->b_io);
	INIT_LIST_HEAD(&wb->b_more_io);
	INIT_LIST_HEAD(&wb->b_dirty_time);
	spin_lock_init(&wb->list_lock);

	atomic_set(&wb->writeback_inodes, 0);
	wb->bw_time_stamp = jiffies;
	wb->balanced_dirty_ratelimit = INIT_BW;
	wb->dirty_ratelimit = INIT_BW;
	wb->write_bandwidth = INIT_BW;
	wb->avg_write_bandwidth = INIT_BW;

	spin_lock_init(&wb->work_lock);
	INIT_LIST_HEAD(&wb->work_list);
	INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
	INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);

	err = fprop_local_init_percpu(&wb->completions, gfp);
	if (err)
		return err;

	err = percpu_counter_init_many(wb->stat, 0, gfp, NR_WB_STAT_ITEMS);
	if (err)
		fprop_local_destroy_percpu(&wb->completions);

	return err;
}

static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb);

/*
 * Remove bdi from the global list and shutdown any threads we have running
 */
static void wb_shutdown(struct bdi_writeback *wb)
{
	/* Make sure nobody queues further work */
	spin_lock_irq(&wb->work_lock);
	if (!test_and_clear_bit(WB_registered, &wb->state)) {
		spin_unlock_irq(&wb->work_lock);
		return;
	}
	spin_unlock_irq(&wb->work_lock);

	cgwb_remove_from_bdi_list(wb);
	/*
	 * Drain work list and shutdown the delayed_work. !WB_registered
	 * tells wb_workfn() that @wb is dying and its work_list needs to
	 * be drained no matter what.
	 */
	mod_delayed_work(bdi_wq, &wb->dwork, 0);
	flush_delayed_work(&wb->dwork);
	WARN_ON(!list_empty(&wb->work_list));
	flush_delayed_work(&wb->bw_dwork);
}

static void wb_exit(struct bdi_writeback *wb)
{
	WARN_ON(delayed_work_pending(&wb->dwork));
	percpu_counter_destroy_many(wb->stat, NR_WB_STAT_ITEMS);
	fprop_local_destroy_percpu(&wb->completions);
}

#ifdef CONFIG_CGROUP_WRITEBACK
#include <linux/memcontrol.h>

/*
 * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, offline_cgwbs and
 * memcg->cgwb_list. bdi->cgwb_tree is also RCU protected.
 */
static DEFINE_SPINLOCK(cgwb_lock);
static struct workqueue_struct *cgwb_release_wq;

static LIST_HEAD(offline_cgwbs);
static void cleanup_offline_cgwbs_workfn(struct work_struct *work);
static DECLARE_WORK(cleanup_offline_cgwbs_work, cleanup_offline_cgwbs_workfn);

static void cgwb_free_rcu(struct rcu_head *rcu_head)
{
	struct bdi_writeback *wb = container_of(rcu_head,
			struct bdi_writeback, rcu);

	percpu_ref_exit(&wb->refcnt);
	kfree(wb);
}

static void cgwb_release_workfn(struct work_struct *work)
{
	struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
						release_work);
	struct backing_dev_info *bdi = wb->bdi;

	mutex_lock(&wb->bdi->cgwb_release_mutex);
	wb_shutdown(wb);

	css_put(wb->memcg_css);
	css_put(wb->blkcg_css);
	mutex_unlock(&wb->bdi->cgwb_release_mutex);

	/* triggers blkg destruction if no online users left */
	blkcg_unpin_online(wb->blkcg_css);

	fprop_local_destroy_percpu(&wb->memcg_completions);

	spin_lock_irq(&cgwb_lock);
	list_del(&wb->offline_node);
	spin_unlock_irq(&cgwb_lock);

	wb_exit(wb);
	bdi_put(bdi);
	WARN_ON_ONCE(!list_empty(&wb->b_attached));
	call_rcu(&wb->rcu, cgwb_free_rcu);
}

static void cgwb_release(struct percpu_ref *refcnt)
{
	struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback,
						refcnt);
	queue_work(cgwb_release_wq, &wb->release_work);
}

static void cgwb_kill(struct bdi_writeback *wb)
{
	lockdep_assert_held(&cgwb_lock);

	WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id));
	list_del(&wb->memcg_node);
	list_del(&wb->blkcg_node);
	list_add(&wb->offline_node, &offline_cgwbs);
	percpu_ref_kill(&wb->refcnt);
}

static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
{
	spin_lock_irq(&cgwb_lock);
	list_del_rcu(&wb->bdi_node);
	spin_unlock_irq(&cgwb_lock);
}

static int cgwb_create(struct backing_dev_info *bdi,
		       struct cgroup_subsys_state *memcg_css, gfp_t gfp)
{
	struct mem_cgroup *memcg;
	struct cgroup_subsys_state *blkcg_css;
	struct list_head *memcg_cgwb_list, *blkcg_cgwb_list;
	struct bdi_writeback *wb;
	unsigned long flags;
	int ret = 0;

	memcg = mem_cgroup_from_css(memcg_css);
	blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
	memcg_cgwb_list = &memcg->cgwb_list;
	blkcg_cgwb_list = blkcg_get_cgwb_list(blkcg_css);

	/* look up again under lock and discard on blkcg mismatch */
	spin_lock_irqsave(&cgwb_lock, flags);
	wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
	if (wb && wb->blkcg_css != blkcg_css) {
		cgwb_kill(wb);
		wb = NULL;
	}
	spin_unlock_irqrestore(&cgwb_lock, flags);
	if (wb)
		goto out_put;

	/* need to create a new one */
	wb = kmalloc(sizeof(*wb), gfp);
	if (!wb) {
		ret = -ENOMEM;
		goto out_put;
	}

	ret = wb_init(wb, bdi, gfp);
	if (ret)
		goto err_free;

	ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp);
	if (ret)
		goto err_wb_exit;

	ret = fprop_local_init_percpu(&wb->memcg_completions, gfp);
	if (ret)
		goto err_ref_exit;

	wb->memcg_css = memcg_css;
	wb->blkcg_css = blkcg_css;
	INIT_LIST_HEAD(&wb->b_attached);
	INIT_WORK(&wb->release_work, cgwb_release_workfn);
	set_bit(WB_registered, &wb->state);
	bdi_get(bdi);

	/*
	 * The root wb determines the registered state of the whole bdi and
	 * memcg_cgwb_list and blkcg_cgwb_list's next pointers indicate
	 * whether they're still online. Don't link @wb if any is dead.
	 * See wb_memcg_offline() and wb_blkcg_offline().
	 */
	ret = -ENODEV;
	spin_lock_irqsave(&cgwb_lock, flags);
	if (test_bit(WB_registered, &bdi->wb.state) &&
	    blkcg_cgwb_list->next && memcg_cgwb_list->next) {
		/* we might have raced another instance of this function */
		ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
		if (!ret) {
			list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
			list_add(&wb->memcg_node, memcg_cgwb_list);
			list_add(&wb->blkcg_node, blkcg_cgwb_list);
			blkcg_pin_online(blkcg_css);
			css_get(memcg_css);
			css_get(blkcg_css);
		}
	}
	spin_unlock_irqrestore(&cgwb_lock, flags);
	if (ret) {
		if (ret == -EEXIST)
			ret = 0;
		goto err_fprop_exit;
	}
	goto out_put;

err_fprop_exit:
	bdi_put(bdi);
	fprop_local_destroy_percpu(&wb->memcg_completions);
err_ref_exit:
	percpu_ref_exit(&wb->refcnt);
err_wb_exit:
	wb_exit(wb);
err_free:
	kfree(wb);
out_put:
	css_put(blkcg_css);
	return ret;
}

/**
 * wb_get_lookup - get wb for a given memcg
 * @bdi: target bdi
 * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
 *
 * Try to get the wb for @memcg_css on @bdi. The returned wb has its
 * refcount incremented.
 *
 * This function uses css_get() on @memcg_css and thus expects its refcnt
 * to be positive on invocation. IOW, rcu_read_lock() protection on
 * @memcg_css isn't enough. try_get it before calling this function.
 *
 * A wb is keyed by its associated memcg. As blkcg implicitly enables
 * memcg on the default hierarchy, memcg association is guaranteed to be
 * more specific (equal or descendant to the associated blkcg) and thus can
 * identify both the memcg and blkcg associations.
 *
 * Because the blkcg associated with a memcg may change as blkcg is enabled
 * and disabled closer to root in the hierarchy, each wb keeps track of
 * both the memcg and blkcg associated with it and verifies the blkcg on
 * each lookup. On mismatch, the existing wb is discarded and a new one is
 * created.
 */
struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi,
				    struct cgroup_subsys_state *memcg_css)
{
	struct bdi_writeback *wb;

	if (!memcg_css->parent)
		return &bdi->wb;

	rcu_read_lock();
	wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
	if (wb) {
		struct cgroup_subsys_state *blkcg_css;

		/* see whether the blkcg association has changed */
		blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
		if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb)))
			wb = NULL;
		css_put(blkcg_css);
	}
	rcu_read_unlock();

	return wb;
}

/**
 * wb_get_create - get wb for a given memcg, create if necessary
 * @bdi: target bdi
 * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
 * @gfp: allocation mask to use
 *
 * Try to get the wb for @memcg_css on @bdi. If it doesn't exist, try to
 * create one. See wb_get_lookup() for more details.
 */
struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
				    struct cgroup_subsys_state *memcg_css,
				    gfp_t gfp)
{
	struct bdi_writeback *wb;

	might_alloc(gfp);

	do {
		wb = wb_get_lookup(bdi, memcg_css);
	} while (!wb && !cgwb_create(bdi, memcg_css, gfp));

	return wb;
}
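
/*
 * Illustrative usage sketch (editor's example; the caller context, gfp
 * choice and the "use wb" step are placeholders, error handling elided):
 *
 *	struct bdi_writeback *wb;
 *
 *	wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
 *	if (wb) {
 *		... use wb ...
 *		wb_put(wb);
 *	}
 *
 * The lookup/create loop above retries until either a wb matching the
 * current memcg/blkcg association is found or cgwb_create() fails.
 */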

static int cgwb_bdi_init(struct backing_dev_info *bdi)
{
	int ret;

	INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
	mutex_init(&bdi->cgwb_release_mutex);
	init_rwsem(&bdi->wb_switch_rwsem);

	ret = wb_init(&bdi->wb, bdi, GFP_KERNEL);
	if (!ret) {
		bdi->wb.memcg_css = &root_mem_cgroup->css;
		bdi->wb.blkcg_css = blkcg_root_css;
	}
	return ret;
}

static void cgwb_bdi_unregister(struct backing_dev_info *bdi)
{
	struct radix_tree_iter iter;
	void **slot;
	struct bdi_writeback *wb;

	WARN_ON(test_bit(WB_registered, &bdi->wb.state));

	spin_lock_irq(&cgwb_lock);
	radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
		cgwb_kill(*slot);
	spin_unlock_irq(&cgwb_lock);

	mutex_lock(&bdi->cgwb_release_mutex);
	spin_lock_irq(&cgwb_lock);
	while (!list_empty(&bdi->wb_list)) {
		wb = list_first_entry(&bdi->wb_list, struct bdi_writeback,
				      bdi_node);
		spin_unlock_irq(&cgwb_lock);
		wb_shutdown(wb);
		spin_lock_irq(&cgwb_lock);
	}
	spin_unlock_irq(&cgwb_lock);
	mutex_unlock(&bdi->cgwb_release_mutex);
}

/*
 * cleanup_offline_cgwbs_workfn - try to release dying cgwbs
 *
 * Try to release dying cgwbs by switching attached inodes to the nearest
 * living ancestor's writeback. Processed wbs are placed at the end
 * of the list to guarantee the forward progress.
 */
static void cleanup_offline_cgwbs_workfn(struct work_struct *work)
{
	struct bdi_writeback *wb;
	LIST_HEAD(processed);

	spin_lock_irq(&cgwb_lock);

	while (!list_empty(&offline_cgwbs)) {
		wb = list_first_entry(&offline_cgwbs, struct bdi_writeback,
				      offline_node);
		list_move(&wb->offline_node, &processed);

		/*
		 * If wb is dirty, cleaning up the writeback by switching
		 * attached inodes will result in an effective removal of any
		 * bandwidth restrictions, which isn't the goal. Instead,
		 * it can be postponed until the next time, when all io
		 * will be likely completed. If in the meantime some inodes
		 * will get re-dirtied, they should be eventually switched to
		 * a new cgwb.
		 */
		if (wb_has_dirty_io(wb))
			continue;

		if (!wb_tryget(wb))
			continue;

		spin_unlock_irq(&cgwb_lock);
		while (cleanup_offline_cgwb(wb))
			cond_resched();
		spin_lock_irq(&cgwb_lock);

		wb_put(wb);
	}

	if (!list_empty(&processed))
		list_splice_tail(&processed, &offline_cgwbs);

	spin_unlock_irq(&cgwb_lock);
}

/**
 * wb_memcg_offline - kill all wb's associated with a memcg being offlined
 * @memcg: memcg being offlined
 *
 * Also prevents creation of any new wb's associated with @memcg.
 */
void wb_memcg_offline(struct mem_cgroup *memcg)
{
	struct list_head *memcg_cgwb_list = &memcg->cgwb_list;
	struct bdi_writeback *wb, *next;

	spin_lock_irq(&cgwb_lock);
	list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node)
		cgwb_kill(wb);
	memcg_cgwb_list->next = NULL;	/* prevent new wb's */
	spin_unlock_irq(&cgwb_lock);

	queue_work(system_unbound_wq, &cleanup_offline_cgwbs_work);
}

/**
 * wb_blkcg_offline - kill all wb's associated with a blkcg being offlined
 * @css: blkcg being offlined
 *
 * Also prevents creation of any new wb's associated with @blkcg.
 */
void wb_blkcg_offline(struct cgroup_subsys_state *css)
{
	struct bdi_writeback *wb, *next;
	struct list_head *list = blkcg_get_cgwb_list(css);

	spin_lock_irq(&cgwb_lock);
	list_for_each_entry_safe(wb, next, list, blkcg_node)
		cgwb_kill(wb);
	list->next = NULL;	/* prevent new wb's */
	spin_unlock_irq(&cgwb_lock);
}

static void cgwb_bdi_register(struct backing_dev_info *bdi)
{
	spin_lock_irq(&cgwb_lock);
	list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
	spin_unlock_irq(&cgwb_lock);
}

static int __init cgwb_init(void)
{
	/*
	 * There can be many concurrent release work items overwhelming
	 * system_wq. Put them in a separate wq and limit concurrency.
	 * There's no point in executing many of these in parallel.
	 */
	cgwb_release_wq = alloc_workqueue("cgwb_release", 0, 1);
	if (!cgwb_release_wq)
		return -ENOMEM;

	return 0;
}
subsys_initcall(cgwb_init);

#else	/* CONFIG_CGROUP_WRITEBACK */

static int cgwb_bdi_init(struct backing_dev_info *bdi)
{
	return wb_init(&bdi->wb, bdi, GFP_KERNEL);
}

static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { }

static void cgwb_bdi_register(struct backing_dev_info *bdi)
{
	list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
}

static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
{
	list_del_rcu(&wb->bdi_node);
}

#endif	/* CONFIG_CGROUP_WRITEBACK */

int bdi_init(struct backing_dev_info *bdi)
{
	bdi->dev = NULL;

	kref_init(&bdi->refcnt);
	bdi->min_ratio = 0;
	bdi->max_ratio = 100 * BDI_RATIO_SCALE;
	bdi->max_prop_frac = FPROP_FRAC_BASE;
	INIT_LIST_HEAD(&bdi->bdi_list);
	INIT_LIST_HEAD(&bdi->wb_list);
	init_waitqueue_head(&bdi->wb_waitq);
	bdi->last_bdp_sleep = jiffies;

	return cgwb_bdi_init(bdi);
}

struct backing_dev_info *bdi_alloc(int node_id)
{
	struct backing_dev_info *bdi;

	bdi = kzalloc_node(sizeof(*bdi), GFP_KERNEL, node_id);
	if (!bdi)
		return NULL;

	if (bdi_init(bdi)) {
		kfree(bdi);
		return NULL;
	}
	bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT;
	bdi->ra_pages = VM_READAHEAD_PAGES;
	bdi->io_pages = VM_READAHEAD_PAGES;
	timer_setup(&bdi->laptop_mode_wb_timer, laptop_mode_timer_fn, 0);
	return bdi;
}
EXPORT_SYMBOL(bdi_alloc);

static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp)
{
	struct rb_node **p = &bdi_tree.rb_node;
	struct rb_node *parent = NULL;
	struct backing_dev_info *bdi;

	lockdep_assert_held(&bdi_lock);

	while (*p) {
		parent = *p;
		bdi = rb_entry(parent, struct backing_dev_info, rb_node);

		if (bdi->id > id)
			p = &(*p)->rb_left;
		else if (bdi->id < id)
			p = &(*p)->rb_right;
		else
			break;
	}

	if (parentp)
		*parentp = parent;
	return p;
}

/**
 * bdi_get_by_id - lookup and get bdi from its id
 * @id: bdi id to lookup
 *
 * Find bdi matching @id and get it. Returns NULL if the matching bdi
 * doesn't exist or is already unregistered.
 */
struct backing_dev_info *bdi_get_by_id(u64 id)
{
	struct backing_dev_info *bdi = NULL;
	struct rb_node **p;

	spin_lock_bh(&bdi_lock);
	p = bdi_lookup_rb_node(id, NULL);
	if (*p) {
		bdi = rb_entry(*p, struct backing_dev_info, rb_node);
		bdi_get(bdi);
	}
	spin_unlock_bh(&bdi_lock);

	return bdi;
}

int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
{
	struct device *dev;
	struct rb_node *parent, **p;

	if (bdi->dev)	/* The driver needs to use separate queues per device */
		return 0;

	vsnprintf(bdi->dev_name, sizeof(bdi->dev_name), fmt, args);
	dev = device_create(&bdi_class, NULL, MKDEV(0, 0), bdi, bdi->dev_name);
	if (IS_ERR(dev))
		return PTR_ERR(dev);

	cgwb_bdi_register(bdi);
	bdi->dev = dev;

	bdi_debug_register(bdi, dev_name(dev));
	set_bit(WB_registered, &bdi->wb.state);

	spin_lock_bh(&bdi_lock);

	bdi->id = ++bdi_id_cursor;

	p = bdi_lookup_rb_node(bdi->id, &parent);
	rb_link_node(&bdi->rb_node, parent, p);
	rb_insert_color(&bdi->rb_node, &bdi_tree);

	list_add_tail_rcu(&bdi->bdi_list, &bdi_list);

	spin_unlock_bh(&bdi_lock);

	trace_writeback_bdi_register(bdi);
	return 0;
}

int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...)
{
	va_list args;
	int ret;

	va_start(args, fmt);
	ret = bdi_register_va(bdi, fmt, args);
	va_end(args);
	return ret;
}
EXPORT_SYMBOL(bdi_register);
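
/*
 * Illustrative sketch of the usual driver-side lifecycle (editor's example;
 * the format string and minor variable are placeholders, error handling
 * abbreviated):
 *
 *	struct backing_dev_info *bdi = bdi_alloc(NUMA_NO_NODE);
 *
 *	if (!bdi)
 *		return -ENOMEM;
 *	if (bdi_register(bdi, "mydev%d", minor)) {
 *		bdi_put(bdi);
 *		return -ENODEV;
 *	}
 *	...
 *	bdi_unregister(bdi);
 *	bdi_put(bdi);
 */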

void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner)
{
	WARN_ON_ONCE(bdi->owner);
	bdi->owner = owner;
	get_device(owner);
}

/*
 * Remove bdi from bdi_list, and ensure that it is no longer visible
 */
static void bdi_remove_from_list(struct backing_dev_info *bdi)
{
	spin_lock_bh(&bdi_lock);
	rb_erase(&bdi->rb_node, &bdi_tree);
	list_del_rcu(&bdi->bdi_list);
	spin_unlock_bh(&bdi_lock);

	synchronize_rcu_expedited();
}

void bdi_unregister(struct backing_dev_info *bdi)
{
	del_timer_sync(&bdi->laptop_mode_wb_timer);

	/* make sure nobody finds us on the bdi_list anymore */
	bdi_remove_from_list(bdi);
	wb_shutdown(&bdi->wb);
	cgwb_bdi_unregister(bdi);

	/*
	 * If this BDI's min ratio has been set, use bdi_set_min_ratio() to
	 * update the global bdi_min_ratio.
	 */
	if (bdi->min_ratio)
		bdi_set_min_ratio(bdi, 0);

	if (bdi->dev) {
		bdi_debug_unregister(bdi);
		device_unregister(bdi->dev);
		bdi->dev = NULL;
	}

	if (bdi->owner) {
		put_device(bdi->owner);
		bdi->owner = NULL;
	}
}
EXPORT_SYMBOL(bdi_unregister);

static void release_bdi(struct kref *ref)
{
	struct backing_dev_info *bdi =
			container_of(ref, struct backing_dev_info, refcnt);

	WARN_ON_ONCE(test_bit(WB_registered, &bdi->wb.state));
	WARN_ON_ONCE(bdi->dev);
	wb_exit(&bdi->wb);
	kfree(bdi);
}

void bdi_put(struct backing_dev_info *bdi)
{
	kref_put(&bdi->refcnt, release_bdi);
}
EXPORT_SYMBOL(bdi_put);

struct backing_dev_info *inode_to_bdi(struct inode *inode)
{
	struct super_block *sb;

	if (!inode)
		return &noop_backing_dev_info;

	sb = inode->i_sb;
#ifdef CONFIG_BLOCK
	if (sb_is_blkdev_sb(sb))
		return I_BDEV(inode)->bd_disk->bdi;
#endif
	return sb->s_bdi;
}
EXPORT_SYMBOL(inode_to_bdi);

const char *bdi_dev_name(struct backing_dev_info *bdi)
{
	if (!bdi || !bdi->dev)
		return bdi_unknown_name;
	return bdi->dev_name;
}
EXPORT_SYMBOL_GPL(bdi_dev_name);