  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Simple file system for zoned block devices exposing zones as files.
  4. *
  5. * Copyright (C) 2022 Western Digital Corporation or its affiliates.
  6. */
  7. #include <linux/module.h>
  8. #include <linux/pagemap.h>
  9. #include <linux/iomap.h>
  10. #include <linux/init.h>
  11. #include <linux/slab.h>
  12. #include <linux/blkdev.h>
  13. #include <linux/statfs.h>
  14. #include <linux/writeback.h>
  15. #include <linux/quotaops.h>
  16. #include <linux/seq_file.h>
  17. #include <linux/parser.h>
  18. #include <linux/uio.h>
  19. #include <linux/mman.h>
  20. #include <linux/sched/mm.h>
  21. #include <linux/task_io_accounting_ops.h>
  22. #include "zonefs.h"
  23. #include "trace.h"
/*
 * iomap_begin callback for read operations: map a file range to the
 * corresponding device sector range of the file's zone.
 */
static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset,
				   loff_t length, unsigned int flags,
				   struct iomap *iomap, struct iomap *srcmap)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct zonefs_zone *z = zonefs_inode_zone(inode);
	struct super_block *sb = inode->i_sb;
	loff_t isize;

	/*
	 * All blocks are always mapped below EOF. If reading past EOF,
	 * act as if there is a hole up to the file maximum size.
	 *
	 * The inode size is sampled under i_truncate_mutex so that a
	 * concurrent truncate cannot change it while the mapping is built.
	 */
	mutex_lock(&zi->i_truncate_mutex);
	iomap->bdev = inode->i_sb->s_bdev;
	/* iomap offsets must be block aligned */
	iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
	isize = i_size_read(inode);
	if (iomap->offset >= isize) {
		/* At or beyond EOF: report a hole covering the request */
		iomap->type = IOMAP_HOLE;
		iomap->addr = IOMAP_NULL_ADDR;
		iomap->length = length;
	} else {
		/*
		 * Written data: directly mapped at the zone start sector
		 * plus the in-file offset, up to EOF.
		 */
		iomap->type = IOMAP_MAPPED;
		iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset;
		iomap->length = isize - iomap->offset;
	}
	mutex_unlock(&zi->i_truncate_mutex);

	trace_zonefs_iomap_begin(inode, iomap);

	return 0;
}
/* iomap operations used for all read IOs and page cache readahead */
static const struct iomap_ops zonefs_read_iomap_ops = {
	.iomap_begin = zonefs_read_iomap_begin,
};
/*
 * iomap_begin callback for write operations: map a file range to the
 * corresponding device sector range of the file's zone, enforcing the
 * zone capacity limit and the direct-IO-only rule for sequential zones.
 */
static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset,
				    loff_t length, unsigned int flags,
				    struct iomap *iomap, struct iomap *srcmap)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct zonefs_zone *z = zonefs_inode_zone(inode);
	struct super_block *sb = inode->i_sb;
	loff_t isize;

	/* All write I/Os should always be within the file maximum size */
	if (WARN_ON_ONCE(offset + length > z->z_capacity))
		return -EIO;

	/*
	 * Sequential zones can only accept direct writes. This is already
	 * checked when writes are issued, so warn if we see a page writeback
	 * operation.
	 */
	if (WARN_ON_ONCE(zonefs_zone_is_seq(z) && !(flags & IOMAP_DIRECT)))
		return -EIO;

	/*
	 * For conventional zones, all blocks are always mapped. For sequential
	 * zones, all blocks are always mapped below the inode size (zone
	 * write pointer) and unwritten beyond.
	 */
	mutex_lock(&zi->i_truncate_mutex);
	iomap->bdev = inode->i_sb->s_bdev;
	iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
	iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset;
	isize = i_size_read(inode);
	if (iomap->offset >= isize) {
		/* Beyond the write pointer: unwritten up to the zone capacity */
		iomap->type = IOMAP_UNWRITTEN;
		iomap->length = z->z_capacity - iomap->offset;
	} else {
		iomap->type = IOMAP_MAPPED;
		iomap->length = isize - iomap->offset;
	}
	mutex_unlock(&zi->i_truncate_mutex);

	trace_zonefs_iomap_begin(inode, iomap);

	return 0;
}
/* iomap operations used for write IOs and page writeback mapping */
static const struct iomap_ops zonefs_write_iomap_ops = {
	.iomap_begin = zonefs_write_iomap_begin,
};
  98. static int zonefs_read_folio(struct file *unused, struct folio *folio)
  99. {
  100. return iomap_read_folio(folio, &zonefs_read_iomap_ops);
  101. }
  102. static void zonefs_readahead(struct readahead_control *rac)
  103. {
  104. iomap_readahead(rac, &zonefs_read_iomap_ops);
  105. }
/*
 * Map blocks for page writeback. This is used only on conventional zone files,
 * which implies that the page range can only be within the fixed inode size.
 */
static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc,
				   struct inode *inode, loff_t offset,
				   unsigned int len)
{
	struct zonefs_zone *z = zonefs_inode_zone(inode);

	/* Writeback of a sequential zone file is a bug: they are direct IO only */
	if (WARN_ON_ONCE(zonefs_zone_is_seq(z)))
		return -EIO;
	/* Conventional zone files have a fixed size: never write beyond it */
	if (WARN_ON_ONCE(offset >= i_size_read(inode)))
		return -EIO;

	/* If the mapping is already OK, nothing needs to be done */
	if (offset >= wpc->iomap.offset &&
	    offset < wpc->iomap.offset + wpc->iomap.length)
		return 0;

	/* Build a new mapping from "offset" up to the end of the zone capacity */
	return zonefs_write_iomap_begin(inode, offset,
					z->z_capacity - offset,
					IOMAP_WRITE, &wpc->iomap, NULL);
}
/* Writeback operations: block mapping for conventional zone page writeback */
static const struct iomap_writeback_ops zonefs_writeback_ops = {
	.map_blocks = zonefs_write_map_blocks,
};
/* Write back dirty pages of a (conventional zone) file through iomap. */
static int zonefs_writepages(struct address_space *mapping,
			     struct writeback_control *wbc)
{
	/* Fresh context: mappings are (re)built by zonefs_write_map_blocks() */
	struct iomap_writepage_ctx wpc = { };

	return iomap_writepages(mapping, wbc, &wpc, &zonefs_writeback_ops);
}
  136. static int zonefs_swap_activate(struct swap_info_struct *sis,
  137. struct file *swap_file, sector_t *span)
  138. {
  139. struct inode *inode = file_inode(swap_file);
  140. if (zonefs_inode_is_seq(inode)) {
  141. zonefs_err(inode->i_sb,
  142. "swap file: not a conventional zone file\n");
  143. return -EINVAL;
  144. }
  145. return iomap_swapfile_activate(sis, swap_file, span,
  146. &zonefs_read_iomap_ops);
  147. }
/* Address space operations shared by all zonefs regular (zone) files */
const struct address_space_operations zonefs_file_aops = {
	.read_folio = zonefs_read_folio,
	.readahead = zonefs_readahead,
	.writepages = zonefs_writepages,
	.dirty_folio = iomap_dirty_folio,
	.release_folio = iomap_release_folio,
	.invalidate_folio = iomap_invalidate_folio,
	.migrate_folio = filemap_migrate_folio,
	.is_partially_uptodate = iomap_is_partially_uptodate,
	.error_remove_folio = generic_error_remove_folio,
	.swap_activate = zonefs_swap_activate,
};
/*
 * Truncate a sequential zone file, which translates to a zone reset
 * (isize == 0) or a zone finish (isize == zone capacity).
 *
 * Returns 0 on success or a negative error code.
 */
int zonefs_file_truncate(struct inode *inode, loff_t isize)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct zonefs_zone *z = zonefs_inode_zone(inode);
	loff_t old_isize;
	enum req_op op;
	int ret = 0;

	/*
	 * Only sequential zone files can be truncated and truncation is allowed
	 * only down to a 0 size, which is equivalent to a zone reset, and to
	 * the maximum file size, which is equivalent to a zone finish.
	 */
	if (!zonefs_zone_is_seq(z))
		return -EPERM;

	if (!isize)
		op = REQ_OP_ZONE_RESET;
	else if (isize == z->z_capacity)
		op = REQ_OP_ZONE_FINISH;
	else
		return -EPERM;

	/* Wait for in-flight direct IOs before changing the zone state */
	inode_dio_wait(inode);

	/* Serialize against page faults */
	filemap_invalidate_lock(inode->i_mapping);

	/* Serialize against zonefs_iomap_begin() */
	mutex_lock(&zi->i_truncate_mutex);

	/* Nothing to do if the size does not change */
	old_isize = i_size_read(inode);
	if (isize == old_isize)
		goto unlock;

	ret = zonefs_inode_zone_mgmt(inode, op);
	if (ret)
		goto unlock;

	/*
	 * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set,
	 * take care of open zones.
	 */
	if (z->z_flags & ZONEFS_ZONE_OPEN) {
		/*
		 * Truncating a zone to EMPTY or FULL is the equivalent of
		 * closing the zone. For a truncation to 0, we need to
		 * re-open the zone to ensure new writes can be processed.
		 * For a truncation to the maximum file size, the zone is
		 * closed and writes cannot be accepted anymore, so clear
		 * the open flag.
		 */
		if (!isize)
			ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
		else
			z->z_flags &= ~ZONEFS_ZONE_OPEN;
	}

	/* Update accounting, inode size and the zone write pointer offset */
	zonefs_update_stats(inode, isize);
	truncate_setsize(inode, isize);
	z->z_wpoffset = isize;
	zonefs_inode_account_active(inode);

unlock:
	mutex_unlock(&zi->i_truncate_mutex);
	filemap_invalidate_unlock(inode->i_mapping);

	return ret;
}
/*
 * Sync a zone file: flush the page cache for conventional zone files,
 * then issue a device cache flush. An error triggers IO error recovery.
 */
static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end,
			     int datasync)
{
	struct inode *inode = file_inode(file);
	int ret = 0;

	/* Immutable inodes (e.g. after a severe zone error) refuse fsync */
	if (unlikely(IS_IMMUTABLE(inode)))
		return -EPERM;

	/*
	 * Since only direct writes are allowed in sequential files, page cache
	 * flush is needed only for conventional zone files.
	 */
	if (zonefs_inode_is_cnv(inode))
		ret = file_write_and_wait_range(file, start, end);
	if (!ret)
		ret = blkdev_issue_flush(inode->i_sb->s_bdev);

	if (ret)
		zonefs_io_error(inode, true);

	return ret;
}
/*
 * Handle a write page fault on a shared writable mapping of a
 * conventional zone file.
 */
static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vmf->vma->vm_file);
	vm_fault_t ret;

	/* Writes to immutable inodes are refused */
	if (unlikely(IS_IMMUTABLE(inode)))
		return VM_FAULT_SIGBUS;

	/*
	 * Sanity check: only conventional zone files can have shared
	 * writeable mappings.
	 */
	if (zonefs_inode_is_seq(inode))
		return VM_FAULT_NOPAGE;

	sb_start_pagefault(inode->i_sb);
	file_update_time(vmf->vma->vm_file);

	/* Serialize against truncates */
	filemap_invalidate_lock_shared(inode->i_mapping);
	ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops);
	filemap_invalidate_unlock_shared(inode->i_mapping);

	sb_end_pagefault(inode->i_sb);

	return ret;
}
/* VM operations: only ->page_mkwrite is zonefs specific */
static const struct vm_operations_struct zonefs_file_vm_ops = {
	.fault = filemap_fault,
	.map_pages = filemap_map_pages,
	.page_mkwrite = zonefs_filemap_page_mkwrite,
};
  263. static int zonefs_file_mmap(struct file *file, struct vm_area_struct *vma)
  264. {
  265. /*
  266. * Conventional zones accept random writes, so their files can support
  267. * shared writable mappings. For sequential zone files, only read
  268. * mappings are possible since there are no guarantees for write
  269. * ordering between msync() and page cache writeback.
  270. */
  271. if (zonefs_inode_is_seq(file_inode(file)) &&
  272. (vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
  273. return -EINVAL;
  274. file_accessed(file);
  275. vma->vm_ops = &zonefs_file_vm_ops;
  276. return 0;
  277. }
  278. static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence)
  279. {
  280. loff_t isize = i_size_read(file_inode(file));
  281. /*
  282. * Seeks are limited to below the zone size for conventional zones
  283. * and below the zone write pointer for sequential zones. In both
  284. * cases, this limit is the inode size.
  285. */
  286. return generic_file_llseek_size(file, offset, whence, isize, isize);
  287. }
/*
 * Direct write IO completion: on success, advance the inode size of a
 * sequential zone file to the end of the completed write; on failure,
 * run IO error recovery for async IOs.
 */
static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
					int error, unsigned int flags)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct zonefs_inode_info *zi = ZONEFS_I(inode);

	if (error) {
		/*
		 * For Sync IOs, error recovery is called from
		 * zonefs_file_dio_write().
		 */
		if (!is_sync_kiocb(iocb))
			zonefs_io_error(inode, true);
		return error;
	}

	if (size && zonefs_inode_is_seq(inode)) {
		/*
		 * Note that we may be seeing completions out of order,
		 * but that is not a problem since a write completed
		 * successfully necessarily means that all preceding writes
		 * were also successful. So we can safely increase the inode
		 * size to the write end location.
		 */
		mutex_lock(&zi->i_truncate_mutex);
		if (i_size_read(inode) < iocb->ki_pos + size) {
			zonefs_update_stats(inode, iocb->ki_pos + size);
			zonefs_i_size_write(inode, iocb->ki_pos + size);
		}
		mutex_unlock(&zi->i_truncate_mutex);
	}

	return 0;
}
/* Direct IO operations for writes: completion handling only */
static const struct iomap_dio_ops zonefs_write_dio_ops = {
	.end_io = zonefs_file_write_dio_end_io,
};
  322. /*
  323. * Do not exceed the LFS limits nor the file zone size. If pos is under the
  324. * limit it becomes a short access. If it exceeds the limit, return -EFBIG.
  325. */
  326. static loff_t zonefs_write_check_limits(struct file *file, loff_t pos,
  327. loff_t count)
  328. {
  329. struct inode *inode = file_inode(file);
  330. struct zonefs_zone *z = zonefs_inode_zone(inode);
  331. loff_t limit = rlimit(RLIMIT_FSIZE);
  332. loff_t max_size = z->z_capacity;
  333. if (limit != RLIM_INFINITY) {
  334. if (pos >= limit) {
  335. send_sig(SIGXFSZ, current, 0);
  336. return -EFBIG;
  337. }
  338. count = min(count, limit - pos);
  339. }
  340. if (!(file->f_flags & O_LARGEFILE))
  341. max_size = min_t(loff_t, MAX_NON_LFS, max_size);
  342. if (unlikely(pos >= max_size))
  343. return -EFBIG;
  344. return min(count, max_size - pos);
  345. }
/*
 * Common checks for buffered and direct writes: reject writes to swap
 * files and invalid flag combinations, handle O_APPEND for sequential
 * zone files and truncate the iterator to the allowed write size.
 *
 * Returns the number of bytes that may be written (possibly 0) or a
 * negative error code.
 */
static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct zonefs_zone *z = zonefs_inode_zone(inode);
	loff_t count;

	if (IS_SWAPFILE(inode))
		return -ETXTBSY;

	if (!iov_iter_count(from))
		return 0;

	/* IOCB_NOWAIT is only supported together with IOCB_DIRECT */
	if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
		return -EINVAL;

	if (iocb->ki_flags & IOCB_APPEND) {
		/* Appending only makes sense for sequential zone files... */
		if (zonefs_zone_is_cnv(z))
			return -EINVAL;
		/* ...and means writing at the zone write pointer offset */
		mutex_lock(&zi->i_truncate_mutex);
		iocb->ki_pos = z->z_wpoffset;
		mutex_unlock(&zi->i_truncate_mutex);
	}

	count = zonefs_write_check_limits(file, iocb->ki_pos,
					  iov_iter_count(from));
	if (count < 0)
		return count;

	iov_iter_truncate(from, count);
	return iov_iter_count(from);
}
/*
 * Handle direct writes. For sequential zone files, this is the only possible
 * write path. For these files, check that the user is issuing writes
 * sequentially from the end of the file. This code assumes that the block layer
 * delivers write requests to the device in sequential order. This is always the
 * case if a block IO scheduler implementing the ELEVATOR_F_ZBD_SEQ_WRITE
 * elevator feature is being used (e.g. mq-deadline). The block layer always
 * automatically selects such an elevator for zoned block devices during the
 * device initialization.
 */
static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct zonefs_zone *z = zonefs_inode_zone(inode);
	struct super_block *sb = inode->i_sb;
	ssize_t ret, count;

	/*
	 * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT
	 * as this can cause write reordering (e.g. the first aio gets EAGAIN
	 * on the inode lock but the second goes through but is now unaligned).
	 */
	if (zonefs_zone_is_seq(z) && !is_sync_kiocb(iocb) &&
	    (iocb->ki_flags & IOCB_NOWAIT))
		return -EOPNOTSUPP;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!inode_trylock(inode))
			return -EAGAIN;
	} else {
		inode_lock(inode);
	}

	count = zonefs_write_checks(iocb, from);
	if (count <= 0) {
		ret = count;
		goto inode_unlock;
	}

	/* Direct writes must be block aligned in both position and size */
	if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) {
		ret = -EINVAL;
		goto inode_unlock;
	}

	/* Enforce sequential writes (append only) in sequential zones */
	if (zonefs_zone_is_seq(z)) {
		mutex_lock(&zi->i_truncate_mutex);
		if (iocb->ki_pos != z->z_wpoffset) {
			mutex_unlock(&zi->i_truncate_mutex);
			ret = -EINVAL;
			goto inode_unlock;
		}
		/*
		 * Advance the zone write pointer offset. This assumes that the
		 * IO will succeed, which is OK to do because we do not allow
		 * partial writes (IOMAP_DIO_PARTIAL is not set) and if the IO
		 * fails, the error path will correct the write pointer offset.
		 */
		z->z_wpoffset += count;
		zonefs_inode_account_active(inode);
		mutex_unlock(&zi->i_truncate_mutex);
	}

	/*
	 * iomap_dio_rw() may return ENOTBLK if there was an issue with
	 * page invalidation. Overwrite that error code with EBUSY so that
	 * the user can make sense of the error.
	 */
	ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops,
			   &zonefs_write_dio_ops, 0, NULL, 0);
	if (ret == -ENOTBLK)
		ret = -EBUSY;

	/*
	 * For a failed IO or partial completion, trigger error recovery
	 * to update the zone write pointer offset to a correct value.
	 * For asynchronous IOs, zonefs_file_write_dio_end_io() may already
	 * have executed error recovery if the IO already completed when we
	 * reach here. However, we cannot know that and execute error recovery
	 * again (that will not change anything).
	 */
	if (zonefs_zone_is_seq(z)) {
		if (ret > 0 && ret != count)
			ret = -EIO;
		if (ret < 0 && ret != -EIOCBQUEUED)
			zonefs_io_error(inode, true);
	}

inode_unlock:
	inode_unlock(inode);

	return ret;
}
/*
 * Handle buffered writes. Only conventional zone files may use this path.
 */
static ssize_t zonefs_file_buffered_write(struct kiocb *iocb,
					  struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	/*
	 * Direct IO writes are mandatory for sequential zone files so that the
	 * write IO issuing order is preserved.
	 */
	if (zonefs_inode_is_seq(inode))
		return -EIO;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!inode_trylock(inode))
			return -EAGAIN;
	} else {
		inode_lock(inode);
	}

	ret = zonefs_write_checks(iocb, from);
	if (ret <= 0)
		goto inode_unlock;

	ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops, NULL);
	if (ret == -EIO)
		zonefs_io_error(inode, true);

inode_unlock:
	inode_unlock(inode);
	/* Honor O_SYNC/O_DSYNC for a successful write */
	if (ret > 0)
		ret = generic_write_sync(iocb, ret);

	return ret;
}
/*
 * Entry point for all writes: validate the target file state, then
 * dispatch to the direct or buffered write path.
 */
static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct zonefs_zone *z = zonefs_inode_zone(inode);

	/* Writes to immutable inodes are refused */
	if (unlikely(IS_IMMUTABLE(inode)))
		return -EPERM;

	if (sb_rdonly(inode->i_sb))
		return -EROFS;

	/* Write operations beyond the zone capacity are not allowed */
	if (iocb->ki_pos >= z->z_capacity)
		return -EFBIG;

	if (iocb->ki_flags & IOCB_DIRECT) {
		ssize_t ret = zonefs_file_dio_write(iocb, from);

		/* -ENOTBLK means "fall back to buffered IO" */
		if (ret != -ENOTBLK)
			return ret;
	}

	return zonefs_file_buffered_write(iocb, from);
}
  505. static int zonefs_file_read_dio_end_io(struct kiocb *iocb, ssize_t size,
  506. int error, unsigned int flags)
  507. {
  508. if (error) {
  509. zonefs_io_error(file_inode(iocb->ki_filp), false);
  510. return error;
  511. }
  512. return 0;
  513. }
/* Direct IO operations for reads: completion handling only */
static const struct iomap_dio_ops zonefs_read_dio_ops = {
	.end_io = zonefs_file_read_dio_end_io,
};
/*
 * Entry point for all reads: limit the read range to written data and
 * dispatch to the direct or buffered read path.
 */
static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct zonefs_zone *z = zonefs_inode_zone(inode);
	struct super_block *sb = inode->i_sb;
	loff_t isize;
	ssize_t ret;

	/* Offline zones cannot be read */
	if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777)))
		return -EPERM;

	/* Reads starting at or beyond the zone capacity return 0 (EOF) */
	if (iocb->ki_pos >= z->z_capacity)
		return 0;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!inode_trylock_shared(inode))
			return -EAGAIN;
	} else {
		inode_lock_shared(inode);
	}

	/* Limit read operations to written data */
	mutex_lock(&zi->i_truncate_mutex);
	isize = i_size_read(inode);
	if (iocb->ki_pos >= isize) {
		mutex_unlock(&zi->i_truncate_mutex);
		ret = 0;
		goto inode_unlock;
	}
	iov_iter_truncate(to, isize - iocb->ki_pos);
	mutex_unlock(&zi->i_truncate_mutex);

	if (iocb->ki_flags & IOCB_DIRECT) {
		size_t count = iov_iter_count(to);

		/* Direct reads must be block aligned in position and size */
		if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) {
			ret = -EINVAL;
			goto inode_unlock;
		}
		file_accessed(iocb->ki_filp);
		ret = iomap_dio_rw(iocb, to, &zonefs_read_iomap_ops,
				   &zonefs_read_dio_ops, 0, NULL, 0);
	} else {
		ret = generic_file_read_iter(iocb, to);
		if (ret == -EIO)
			zonefs_io_error(inode, false);
	}

inode_unlock:
	inode_unlock_shared(inode);

	return ret;
}
/*
 * Splice data from a zone file into a pipe, limiting the read range to
 * written data (below the inode size).
 */
static ssize_t zonefs_file_splice_read(struct file *in, loff_t *ppos,
				       struct pipe_inode_info *pipe,
				       size_t len, unsigned int flags)
{
	struct inode *inode = file_inode(in);
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct zonefs_zone *z = zonefs_inode_zone(inode);
	loff_t isize;
	ssize_t ret = 0;

	/* Offline zones cannot be read */
	if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777)))
		return -EPERM;

	/* Reads starting at or beyond the zone capacity return 0 (EOF) */
	if (*ppos >= z->z_capacity)
		return 0;

	inode_lock_shared(inode);

	/* Limit read operations to written data */
	mutex_lock(&zi->i_truncate_mutex);
	isize = i_size_read(inode);
	if (*ppos >= isize)
		len = 0;
	else
		len = min_t(loff_t, len, isize - *ppos);
	mutex_unlock(&zi->i_truncate_mutex);

	if (len > 0) {
		ret = filemap_splice_read(in, ppos, pipe, len, flags);
		if (ret == -EIO)
			zonefs_io_error(inode, false);
	}

	inode_unlock_shared(inode);

	return ret;
}
  595. /*
  596. * Write open accounting is done only for sequential files.
  597. */
  598. static inline bool zonefs_seq_file_need_wro(struct inode *inode,
  599. struct file *file)
  600. {
  601. if (zonefs_inode_is_cnv(inode))
  602. return false;
  603. if (!(file->f_mode & FMODE_WRITE))
  604. return false;
  605. return true;
  606. }
/*
 * Account a new writer on a sequential zone file and, with the
 * explicit-open mount option, explicitly open the zone on the device.
 *
 * Returns 0 on success, -EBUSY if the maximum number of write-open
 * sequential files is reached, or the zone management error.
 */
static int zonefs_seq_file_write_open(struct inode *inode)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct zonefs_zone *z = zonefs_inode_zone(inode);
	int ret = 0;

	mutex_lock(&zi->i_truncate_mutex);

	if (!zi->i_wr_refcnt) {
		/* First writer on this file */
		struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
		unsigned int wro = atomic_inc_return(&sbi->s_wro_seq_files);

		if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {

			/* Enforce the limit on write-open sequential files */
			if (sbi->s_max_wro_seq_files
			    && wro > sbi->s_max_wro_seq_files) {
				atomic_dec(&sbi->s_wro_seq_files);
				ret = -EBUSY;
				goto unlock;
			}

			/* A full file (isize == capacity) needs no zone open */
			if (i_size_read(inode) < z->z_capacity) {
				ret = zonefs_inode_zone_mgmt(inode,
							     REQ_OP_ZONE_OPEN);
				if (ret) {
					/* Roll back the accounting on failure */
					atomic_dec(&sbi->s_wro_seq_files);
					goto unlock;
				}
				z->z_flags |= ZONEFS_ZONE_OPEN;
				zonefs_inode_account_active(inode);
			}
		}
	}

	zi->i_wr_refcnt++;

unlock:
	mutex_unlock(&zi->i_truncate_mutex);

	return ret;
}
  640. static int zonefs_file_open(struct inode *inode, struct file *file)
  641. {
  642. int ret;
  643. file->f_mode |= FMODE_CAN_ODIRECT;
  644. ret = generic_file_open(inode, file);
  645. if (ret)
  646. return ret;
  647. if (zonefs_seq_file_need_wro(inode, file))
  648. return zonefs_seq_file_write_open(inode);
  649. return 0;
  650. }
/*
 * Drop a writer reference on a sequential zone file. When the last
 * writer goes away, explicitly close the zone if it is still open and
 * update the write-open accounting.
 */
static void zonefs_seq_file_write_close(struct inode *inode)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct zonefs_zone *z = zonefs_inode_zone(inode);
	struct super_block *sb = inode->i_sb;
	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
	int ret = 0;

	mutex_lock(&zi->i_truncate_mutex);

	zi->i_wr_refcnt--;
	if (zi->i_wr_refcnt)
		goto unlock;

	/*
	 * The file zone may not be open anymore (e.g. the file was truncated to
	 * its maximum size or it was fully written). For this case, we only
	 * need to decrement the write open count.
	 */
	if (z->z_flags & ZONEFS_ZONE_OPEN) {
		ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
		if (ret) {
			__zonefs_io_error(inode, false);
			/*
			 * Leaving zones explicitly open may lead to a state
			 * where most zones cannot be written (zone resources
			 * exhausted). So take preventive action by remounting
			 * read-only.
			 */
			if (z->z_flags & ZONEFS_ZONE_OPEN &&
			    !(sb->s_flags & SB_RDONLY)) {
				zonefs_warn(sb,
					"closing zone at %llu failed %d\n",
					z->z_sector, ret);
				zonefs_warn(sb,
					"remounting filesystem read-only\n");
				sb->s_flags |= SB_RDONLY;
			}
			goto unlock;
		}

		z->z_flags &= ~ZONEFS_ZONE_OPEN;
		zonefs_inode_account_active(inode);
	}

	atomic_dec(&sbi->s_wro_seq_files);

unlock:
	mutex_unlock(&zi->i_truncate_mutex);
}
  695. static int zonefs_file_release(struct inode *inode, struct file *file)
  696. {
  697. /*
  698. * If we explicitly open a zone we must close it again as well, but the
  699. * zone management operation can fail (either due to an IO error or as
  700. * the zone has gone offline or read-only). Make sure we don't fail the
  701. * close(2) for user-space.
  702. */
  703. if (zonefs_seq_file_need_wro(inode, file))
  704. zonefs_seq_file_write_close(inode);
  705. return 0;
  706. }
/* File operations for all zonefs regular (zone) files */
const struct file_operations zonefs_file_operations = {
	.open = zonefs_file_open,
	.release = zonefs_file_release,
	.fsync = zonefs_file_fsync,
	.mmap = zonefs_file_mmap,
	.llseek = zonefs_file_llseek,
	.read_iter = zonefs_file_read_iter,
	.write_iter = zonefs_file_write_iter,
	.splice_read = zonefs_file_splice_read,
	.splice_write = iter_file_splice_write,
	.iopoll = iocb_bio_iopoll,
};