file.c 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * (C) 2001 Clemson University and The University of Chicago
  4. * Copyright 2018 Omnibond Systems, L.L.C.
  5. *
  6. * See COPYING in top-level directory.
  7. */
  8. /*
  9. * Linux VFS file operations.
  10. */
  11. #include "protocol.h"
  12. #include "orangefs-kernel.h"
  13. #include "orangefs-bufmap.h"
  14. #include <linux/fs.h>
  15. #include <linux/filelock.h>
  16. #include <linux/pagemap.h>
  17. static int flush_racache(struct inode *inode)
  18. {
  19. struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
  20. struct orangefs_kernel_op_s *new_op;
  21. int ret;
  22. gossip_debug(GOSSIP_UTILS_DEBUG,
  23. "%s: %pU: Handle is %pU | fs_id %d\n", __func__,
  24. get_khandle_from_ino(inode), &orangefs_inode->refn.khandle,
  25. orangefs_inode->refn.fs_id);
  26. new_op = op_alloc(ORANGEFS_VFS_OP_RA_FLUSH);
  27. if (!new_op)
  28. return -ENOMEM;
  29. new_op->upcall.req.ra_cache_flush.refn = orangefs_inode->refn;
  30. ret = service_operation(new_op, "orangefs_flush_racache",
  31. get_interruptible_flag(inode));
  32. gossip_debug(GOSSIP_UTILS_DEBUG, "%s: got return value of %d\n",
  33. __func__, ret);
  34. op_release(new_op);
  35. return ret;
  36. }
  37. /*
  38. * Post and wait for the I/O upcall to finish
  39. */
  40. ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode,
  41. loff_t *offset, struct iov_iter *iter, size_t total_size,
  42. loff_t readahead_size, struct orangefs_write_range *wr,
  43. int *index_return, struct file *file)
  44. {
  45. struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
  46. struct orangefs_khandle *handle = &orangefs_inode->refn.khandle;
  47. struct orangefs_kernel_op_s *new_op = NULL;
  48. int buffer_index;
  49. ssize_t ret;
  50. size_t copy_amount;
  51. int open_for_read;
  52. int open_for_write;
  53. new_op = op_alloc(ORANGEFS_VFS_OP_FILE_IO);
  54. if (!new_op)
  55. return -ENOMEM;
  56. /* synchronous I/O */
  57. new_op->upcall.req.io.readahead_size = readahead_size;
  58. new_op->upcall.req.io.io_type = type;
  59. new_op->upcall.req.io.refn = orangefs_inode->refn;
  60. populate_shared_memory:
  61. /* get a shared buffer index */
  62. buffer_index = orangefs_bufmap_get();
  63. if (buffer_index < 0) {
  64. ret = buffer_index;
  65. gossip_debug(GOSSIP_FILE_DEBUG,
  66. "%s: orangefs_bufmap_get failure (%zd)\n",
  67. __func__, ret);
  68. goto out;
  69. }
  70. gossip_debug(GOSSIP_FILE_DEBUG,
  71. "%s(%pU): GET op %p -> buffer_index %d\n",
  72. __func__,
  73. handle,
  74. new_op,
  75. buffer_index);
  76. new_op->uses_shared_memory = 1;
  77. new_op->upcall.req.io.buf_index = buffer_index;
  78. new_op->upcall.req.io.count = total_size;
  79. new_op->upcall.req.io.offset = *offset;
  80. if (type == ORANGEFS_IO_WRITE && wr) {
  81. new_op->upcall.uid = from_kuid(&init_user_ns, wr->uid);
  82. new_op->upcall.gid = from_kgid(&init_user_ns, wr->gid);
  83. }
  84. /*
  85. * Orangefs has no open, and orangefs checks file permissions
  86. * on each file access. Posix requires that file permissions
  87. * be checked on open and nowhere else. Orangefs-through-the-kernel
  88. * needs to seem posix compliant.
  89. *
  90. * The VFS opens files, even if the filesystem provides no
  91. * method. We can see if a file was successfully opened for
  92. * read and or for write by looking at file->f_mode.
  93. *
  94. * When writes are flowing from the page cache, file is no
  95. * longer available. We can trust the VFS to have checked
  96. * file->f_mode before writing to the page cache.
  97. *
  98. * The mode of a file might change between when it is opened
  99. * and IO commences, or it might be created with an arbitrary mode.
  100. *
  101. * We'll make sure we don't hit EACCES during the IO stage by
  102. * using UID 0. Some of the time we have access without changing
  103. * to UID 0 - how to check?
  104. */
  105. if (file) {
  106. open_for_write = file->f_mode & FMODE_WRITE;
  107. open_for_read = file->f_mode & FMODE_READ;
  108. } else {
  109. open_for_write = 1;
  110. open_for_read = 0; /* not relevant? */
  111. }
  112. if ((type == ORANGEFS_IO_WRITE) && open_for_write)
  113. new_op->upcall.uid = 0;
  114. if ((type == ORANGEFS_IO_READ) && open_for_read)
  115. new_op->upcall.uid = 0;
  116. gossip_debug(GOSSIP_FILE_DEBUG,
  117. "%s(%pU): offset: %llu total_size: %zd\n",
  118. __func__,
  119. handle,
  120. llu(*offset),
  121. total_size);
  122. /*
  123. * Stage 1: copy the buffers into client-core's address space
  124. */
  125. if (type == ORANGEFS_IO_WRITE && total_size) {
  126. ret = orangefs_bufmap_copy_from_iovec(iter, buffer_index,
  127. total_size);
  128. if (ret < 0) {
  129. gossip_err("%s: Failed to copy-in buffers. Please make sure that the pvfs2-client is running. %ld\n",
  130. __func__, (long)ret);
  131. goto out;
  132. }
  133. }
  134. gossip_debug(GOSSIP_FILE_DEBUG,
  135. "%s(%pU): Calling post_io_request with tag (%llu)\n",
  136. __func__,
  137. handle,
  138. llu(new_op->tag));
  139. /* Stage 2: Service the I/O operation */
  140. ret = service_operation(new_op,
  141. type == ORANGEFS_IO_WRITE ?
  142. "file_write" :
  143. "file_read",
  144. get_interruptible_flag(inode));
  145. /*
  146. * If service_operation() returns -EAGAIN #and# the operation was
  147. * purged from orangefs_request_list or htable_ops_in_progress, then
  148. * we know that the client was restarted, causing the shared memory
  149. * area to be wiped clean. To restart a write operation in this
  150. * case, we must re-copy the data from the user's iovec to a NEW
  151. * shared memory location. To restart a read operation, we must get
  152. * a new shared memory location.
  153. */
  154. if (ret == -EAGAIN && op_state_purged(new_op)) {
  155. orangefs_bufmap_put(buffer_index);
  156. if (type == ORANGEFS_IO_WRITE)
  157. iov_iter_revert(iter, total_size);
  158. gossip_debug(GOSSIP_FILE_DEBUG,
  159. "%s:going to repopulate_shared_memory.\n",
  160. __func__);
  161. goto populate_shared_memory;
  162. }
  163. if (ret < 0) {
  164. if (ret == -EINTR) {
  165. /*
  166. * We can't return EINTR if any data was written,
  167. * it's not POSIX. It is minimally acceptable
  168. * to give a partial write, the way NFS does.
  169. *
  170. * It would be optimal to return all or nothing,
  171. * but if a userspace write is bigger than
  172. * an IO buffer, and the interrupt occurs
  173. * between buffer writes, that would not be
  174. * possible.
  175. */
  176. switch (new_op->op_state - OP_VFS_STATE_GIVEN_UP) {
  177. /*
  178. * If the op was waiting when the interrupt
  179. * occurred, then the client-core did not
  180. * trigger the write.
  181. */
  182. case OP_VFS_STATE_WAITING:
  183. if (*offset == 0)
  184. ret = -EINTR;
  185. else
  186. ret = 0;
  187. break;
  188. /*
  189. * If the op was in progress when the interrupt
  190. * occurred, then the client-core was able to
  191. * trigger the write.
  192. */
  193. case OP_VFS_STATE_INPROGR:
  194. if (type == ORANGEFS_IO_READ)
  195. ret = -EINTR;
  196. else
  197. ret = total_size;
  198. break;
  199. default:
  200. gossip_err("%s: unexpected op state :%d:.\n",
  201. __func__,
  202. new_op->op_state);
  203. ret = 0;
  204. break;
  205. }
  206. gossip_debug(GOSSIP_FILE_DEBUG,
  207. "%s: got EINTR, state:%d: %p\n",
  208. __func__,
  209. new_op->op_state,
  210. new_op);
  211. } else {
  212. gossip_err("%s: error in %s handle %pU, returning %zd\n",
  213. __func__,
  214. type == ORANGEFS_IO_READ ?
  215. "read from" : "write to",
  216. handle, ret);
  217. }
  218. if (orangefs_cancel_op_in_progress(new_op))
  219. return ret;
  220. goto out;
  221. }
  222. /*
  223. * Stage 3: Post copy buffers from client-core's address space
  224. */
  225. if (type == ORANGEFS_IO_READ && new_op->downcall.resp.io.amt_complete) {
  226. /*
  227. * NOTE: the iovector can either contain addresses which
  228. * can futher be kernel-space or user-space addresses.
  229. * or it can pointers to struct page's
  230. */
  231. copy_amount = new_op->downcall.resp.io.amt_complete;
  232. ret = orangefs_bufmap_copy_to_iovec(iter, buffer_index,
  233. copy_amount);
  234. if (ret < 0) {
  235. gossip_err("%s: Failed to copy-out buffers. Please make sure that the pvfs2-client is running (%ld)\n",
  236. __func__, (long)ret);
  237. goto out;
  238. }
  239. }
  240. gossip_debug(GOSSIP_FILE_DEBUG,
  241. "%s(%pU): Amount %s, returned by the sys-io call:%d\n",
  242. __func__,
  243. handle,
  244. type == ORANGEFS_IO_READ ? "read" : "written",
  245. (int)new_op->downcall.resp.io.amt_complete);
  246. ret = new_op->downcall.resp.io.amt_complete;
  247. out:
  248. if (buffer_index >= 0) {
  249. orangefs_bufmap_put(buffer_index);
  250. gossip_debug(GOSSIP_FILE_DEBUG,
  251. "%s(%pU): PUT buffer_index %d\n",
  252. __func__, handle, buffer_index);
  253. }
  254. op_release(new_op);
  255. return ret;
  256. }
  257. int orangefs_revalidate_mapping(struct inode *inode)
  258. {
  259. struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
  260. struct address_space *mapping = inode->i_mapping;
  261. unsigned long *bitlock = &orangefs_inode->bitlock;
  262. int ret;
  263. while (1) {
  264. ret = wait_on_bit(bitlock, 1, TASK_KILLABLE);
  265. if (ret)
  266. return ret;
  267. spin_lock(&inode->i_lock);
  268. if (test_bit(1, bitlock)) {
  269. spin_unlock(&inode->i_lock);
  270. continue;
  271. }
  272. if (!time_before(jiffies, orangefs_inode->mapping_time))
  273. break;
  274. spin_unlock(&inode->i_lock);
  275. return 0;
  276. }
  277. set_bit(1, bitlock);
  278. smp_wmb();
  279. spin_unlock(&inode->i_lock);
  280. unmap_mapping_range(mapping, 0, 0, 0);
  281. ret = filemap_write_and_wait(mapping);
  282. if (!ret)
  283. ret = invalidate_inode_pages2(mapping);
  284. orangefs_inode->mapping_time = jiffies +
  285. orangefs_cache_timeout_msecs*HZ/1000;
  286. clear_bit(1, bitlock);
  287. smp_mb__after_atomic();
  288. wake_up_bit(bitlock, 1);
  289. return ret;
  290. }
  291. static ssize_t orangefs_file_read_iter(struct kiocb *iocb,
  292. struct iov_iter *iter)
  293. {
  294. int ret;
  295. orangefs_stats.reads++;
  296. down_read(&file_inode(iocb->ki_filp)->i_rwsem);
  297. ret = orangefs_revalidate_mapping(file_inode(iocb->ki_filp));
  298. if (ret)
  299. goto out;
  300. ret = generic_file_read_iter(iocb, iter);
  301. out:
  302. up_read(&file_inode(iocb->ki_filp)->i_rwsem);
  303. return ret;
  304. }
  305. static ssize_t orangefs_file_splice_read(struct file *in, loff_t *ppos,
  306. struct pipe_inode_info *pipe,
  307. size_t len, unsigned int flags)
  308. {
  309. struct inode *inode = file_inode(in);
  310. ssize_t ret;
  311. orangefs_stats.reads++;
  312. down_read(&inode->i_rwsem);
  313. ret = orangefs_revalidate_mapping(inode);
  314. if (ret)
  315. goto out;
  316. ret = filemap_splice_read(in, ppos, pipe, len, flags);
  317. out:
  318. up_read(&inode->i_rwsem);
  319. return ret;
  320. }
  321. static ssize_t orangefs_file_write_iter(struct kiocb *iocb,
  322. struct iov_iter *iter)
  323. {
  324. int ret;
  325. orangefs_stats.writes++;
  326. if (iocb->ki_pos > i_size_read(file_inode(iocb->ki_filp))) {
  327. ret = orangefs_revalidate_mapping(file_inode(iocb->ki_filp));
  328. if (ret)
  329. return ret;
  330. }
  331. ret = generic_file_write_iter(iocb, iter);
  332. return ret;
  333. }
  334. static vm_fault_t orangefs_fault(struct vm_fault *vmf)
  335. {
  336. struct file *file = vmf->vma->vm_file;
  337. int ret;
  338. ret = orangefs_inode_getattr(file->f_mapping->host,
  339. ORANGEFS_GETATTR_SIZE);
  340. if (ret == -ESTALE)
  341. ret = -EIO;
  342. if (ret) {
  343. gossip_err("%s: orangefs_inode_getattr failed, "
  344. "ret:%d:.\n", __func__, ret);
  345. return VM_FAULT_SIGBUS;
  346. }
  347. return filemap_fault(vmf);
  348. }
  349. static const struct vm_operations_struct orangefs_file_vm_ops = {
  350. .fault = orangefs_fault,
  351. .map_pages = filemap_map_pages,
  352. .page_mkwrite = orangefs_page_mkwrite,
  353. };
  354. /*
  355. * Memory map a region of a file.
  356. */
  357. static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma)
  358. {
  359. int ret;
  360. ret = orangefs_revalidate_mapping(file_inode(file));
  361. if (ret)
  362. return ret;
  363. gossip_debug(GOSSIP_FILE_DEBUG,
  364. "orangefs_file_mmap: called on %pD\n", file);
  365. /* set the sequential readahead hint */
  366. vm_flags_mod(vma, VM_SEQ_READ, VM_RAND_READ);
  367. file_accessed(file);
  368. vma->vm_ops = &orangefs_file_vm_ops;
  369. return 0;
  370. }
  371. #define mapping_nrpages(idata) ((idata)->nrpages)
  372. /*
  373. * Called to notify the module that there are no more references to
  374. * this file (i.e. no processes have it open).
  375. *
  376. * \note Not called when each file is closed.
  377. */
  378. static int orangefs_file_release(struct inode *inode, struct file *file)
  379. {
  380. gossip_debug(GOSSIP_FILE_DEBUG,
  381. "orangefs_file_release: called on %pD\n",
  382. file);
  383. /*
  384. * remove all associated inode pages from the page cache and
  385. * readahead cache (if any); this forces an expensive refresh of
  386. * data for the next caller of mmap (or 'get_block' accesses)
  387. */
  388. if (mapping_nrpages(file->f_mapping)) {
  389. if (orangefs_features & ORANGEFS_FEATURE_READAHEAD) {
  390. gossip_debug(GOSSIP_INODE_DEBUG,
  391. "calling flush_racache on %pU\n",
  392. get_khandle_from_ino(inode));
  393. flush_racache(inode);
  394. gossip_debug(GOSSIP_INODE_DEBUG,
  395. "flush_racache finished\n");
  396. }
  397. }
  398. return 0;
  399. }
  400. /*
  401. * Push all data for a specific file onto permanent storage.
  402. */
  403. static int orangefs_fsync(struct file *file,
  404. loff_t start,
  405. loff_t end,
  406. int datasync)
  407. {
  408. int ret;
  409. struct orangefs_inode_s *orangefs_inode =
  410. ORANGEFS_I(file_inode(file));
  411. struct orangefs_kernel_op_s *new_op = NULL;
  412. ret = filemap_write_and_wait_range(file_inode(file)->i_mapping,
  413. start, end);
  414. if (ret < 0)
  415. return ret;
  416. new_op = op_alloc(ORANGEFS_VFS_OP_FSYNC);
  417. if (!new_op)
  418. return -ENOMEM;
  419. new_op->upcall.req.fsync.refn = orangefs_inode->refn;
  420. ret = service_operation(new_op,
  421. "orangefs_fsync",
  422. get_interruptible_flag(file_inode(file)));
  423. gossip_debug(GOSSIP_FILE_DEBUG,
  424. "orangefs_fsync got return value of %d\n",
  425. ret);
  426. op_release(new_op);
  427. return ret;
  428. }
  429. /*
  430. * Change the file pointer position for an instance of an open file.
  431. *
  432. * \note If .llseek is overriden, we must acquire lock as described in
  433. * Documentation/filesystems/locking.rst.
  434. *
  435. * Future upgrade could support SEEK_DATA and SEEK_HOLE but would
  436. * require much changes to the FS
  437. */
  438. static loff_t orangefs_file_llseek(struct file *file, loff_t offset, int origin)
  439. {
  440. int ret = -EINVAL;
  441. struct inode *inode = file_inode(file);
  442. if (origin == SEEK_END) {
  443. /*
  444. * revalidate the inode's file size.
  445. * NOTE: We are only interested in file size here,
  446. * so we set mask accordingly.
  447. */
  448. ret = orangefs_inode_getattr(file->f_mapping->host,
  449. ORANGEFS_GETATTR_SIZE);
  450. if (ret == -ESTALE)
  451. ret = -EIO;
  452. if (ret) {
  453. gossip_debug(GOSSIP_FILE_DEBUG,
  454. "%s:%s:%d calling make bad inode\n",
  455. __FILE__,
  456. __func__,
  457. __LINE__);
  458. return ret;
  459. }
  460. }
  461. gossip_debug(GOSSIP_FILE_DEBUG,
  462. "orangefs_file_llseek: offset is %ld | origin is %d"
  463. " | inode size is %lu\n",
  464. (long)offset,
  465. origin,
  466. (unsigned long)i_size_read(inode));
  467. return generic_file_llseek(file, offset, origin);
  468. }
  469. /*
  470. * Support local locks (locks that only this kernel knows about)
  471. * if Orangefs was mounted -o local_lock.
  472. */
  473. static int orangefs_lock(struct file *filp, int cmd, struct file_lock *fl)
  474. {
  475. int rc = -EINVAL;
  476. if (ORANGEFS_SB(file_inode(filp)->i_sb)->flags & ORANGEFS_OPT_LOCAL_LOCK) {
  477. if (cmd == F_GETLK) {
  478. rc = 0;
  479. posix_test_lock(filp, fl);
  480. } else {
  481. rc = posix_lock_file(filp, fl, NULL);
  482. }
  483. }
  484. return rc;
  485. }
  486. static int orangefs_flush(struct file *file, fl_owner_t id)
  487. {
  488. /*
  489. * This is vfs_fsync_range(file, 0, LLONG_MAX, 0) without the
  490. * service_operation in orangefs_fsync.
  491. *
  492. * Do not send fsync to OrangeFS server on a close. Do send fsync
  493. * on an explicit fsync call. This duplicates historical OrangeFS
  494. * behavior.
  495. */
  496. int r;
  497. r = filemap_write_and_wait_range(file->f_mapping, 0, LLONG_MAX);
  498. if (r > 0)
  499. return 0;
  500. else
  501. return r;
  502. }
  503. /** ORANGEFS implementation of VFS file operations */
  504. const struct file_operations orangefs_file_operations = {
  505. .llseek = orangefs_file_llseek,
  506. .read_iter = orangefs_file_read_iter,
  507. .write_iter = orangefs_file_write_iter,
  508. .lock = orangefs_lock,
  509. .mmap = orangefs_file_mmap,
  510. .open = generic_file_open,
  511. .splice_read = orangefs_file_splice_read,
  512. .splice_write = iter_file_splice_write,
  513. .flush = orangefs_flush,
  514. .release = orangefs_file_release,
  515. .fsync = orangefs_fsync,
  516. };