/* fs/splice.c (extraction artifact: page header and line-number gutter removed) */
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * "splice": joining two ropes together by interweaving their strands.
  4. *
  5. * This is the "extended pipe" functionality, where a pipe is used as
  6. * an arbitrary in-memory buffer. Think of a pipe as a small kernel
  7. * buffer that you can use to transfer data from one end to the other.
  8. *
  9. * The traditional unix read/write is extended with a "splice()" operation
  10. * that transfers data buffers to or from a pipe buffer.
  11. *
  12. * Named by Larry McVoy, original implementation from Linus, extended by
  13. * Jens to support splicing to files, network, direct splicing, etc and
  14. * fixing lots of bugs.
  15. *
  16. * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
  17. * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
  18. * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
  19. *
  20. */
  21. #include <linux/bvec.h>
  22. #include <linux/fs.h>
  23. #include <linux/file.h>
  24. #include <linux/pagemap.h>
  25. #include <linux/splice.h>
  26. #include <linux/memcontrol.h>
  27. #include <linux/mm_inline.h>
  28. #include <linux/swap.h>
  29. #include <linux/writeback.h>
  30. #include <linux/export.h>
  31. #include <linux/syscalls.h>
  32. #include <linux/uio.h>
  33. #include <linux/fsnotify.h>
  34. #include <linux/security.h>
  35. #include <linux/gfp.h>
  36. #include <linux/net.h>
  37. #include <linux/socket.h>
  38. #include <linux/sched/signal.h>
  39. #include "internal.h"
  40. /*
  41. * Splice doesn't support FMODE_NOWAIT. Since pipes may set this flag to
  42. * indicate they support non-blocking reads or writes, we must clear it
  43. * here if set to avoid blocking other users of this pipe if splice is
  44. * being done on it.
  45. */
  46. static noinline void pipe_clear_nowait(struct file *file)
  47. {
  48. fmode_t fmode = READ_ONCE(file->f_mode);
  49. do {
  50. if (!(fmode & FMODE_NOWAIT))
  51. break;
  52. } while (!try_cmpxchg(&file->f_mode, &fmode, fmode & ~FMODE_NOWAIT));
  53. }
  54. /*
  55. * Attempt to steal a page from a pipe buffer. This should perhaps go into
  56. * a vm helper function, it's already simplified quite a bit by the
  57. * addition of remove_mapping(). If success is returned, the caller may
  58. * attempt to reuse this page for another destination.
  59. */
static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe,
                                          struct pipe_buffer *buf)
{
    struct folio *folio = page_folio(buf->page);
    struct address_space *mapping;

    /* Lock the folio so its mapping cannot change under us. */
    folio_lock(folio);

    mapping = folio_mapping(folio);
    if (mapping) {
        WARN_ON(!folio_test_uptodate(folio));

        /*
         * At least for ext2 with nobh option, we need to wait on
         * writeback completing on this folio, since we'll remove it
         * from the pagecache. Otherwise truncate wont wait on the
         * folio, allowing the disk blocks to be reused by someone else
         * before we actually wrote our data to them. fs corruption
         * ensues.
         */
        folio_wait_writeback(folio);

        /* Drop any fs-private state attached to the folio first. */
        if (!filemap_release_folio(folio, GFP_KERNEL))
            goto out_unlock;

        /*
         * If we succeeded in removing the mapping, set LRU flag
         * and return good.  Note: on success the folio is left locked
         * for the caller.
         */
        if (remove_mapping(mapping, folio)) {
            buf->flags |= PIPE_BUF_FLAG_LRU;
            return true;
        }
    }

    /*
     * Raced with truncate or failed to remove folio from current
     * address space, unlock and return failure.
     */
out_unlock:
    folio_unlock(folio);
    return false;
}
/*
 * Release a page-cache-backed pipe buffer: drop our page reference and
 * clear the LRU flag that try_steal may have set on the buffer.
 */
static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
                                        struct pipe_buffer *buf)
{
    put_page(buf->page);
    buf->flags &= ~PIPE_BUF_FLAG_LRU;
}
  103. /*
  104. * Check whether the contents of buf is OK to access. Since the content
  105. * is a page cache page, IO may be in flight.
  106. */
static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
                                       struct pipe_buffer *buf)
{
    struct folio *folio = page_folio(buf->page);
    int err;

    if (!folio_test_uptodate(folio)) {
        /* Lock so the mapping/uptodate checks below are stable. */
        folio_lock(folio);

        /*
         * Folio got truncated/unhashed. This will cause a 0-byte
         * splice, if this is the first page.
         */
        if (!folio->mapping) {
            err = -ENODATA;
            goto error;
        }

        /*
         * Uh oh, read-error from disk.
         */
        if (!folio_test_uptodate(folio)) {
            err = -EIO;
            goto error;
        }

        /* Folio is ok after all, we are done */
        folio_unlock(folio);
    }

    return 0;
error:
    folio_unlock(folio);
    return err;
}
/* Buffer ops for pages spliced straight out of the page cache. */
const struct pipe_buf_operations page_cache_pipe_buf_ops = {
    .confirm   = page_cache_pipe_buf_confirm,
    .release   = page_cache_pipe_buf_release,
    .try_steal = page_cache_pipe_buf_try_steal,
    .get       = generic_pipe_buf_get,
};
  143. static bool user_page_pipe_buf_try_steal(struct pipe_inode_info *pipe,
  144. struct pipe_buffer *buf)
  145. {
  146. if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
  147. return false;
  148. buf->flags |= PIPE_BUF_FLAG_LRU;
  149. return generic_pipe_buf_try_steal(pipe, buf);
  150. }
/* Buffer ops for user pages (e.g. vmsplice'd data); steal only if gifted. */
static const struct pipe_buf_operations user_page_pipe_buf_ops = {
    .release   = page_cache_pipe_buf_release,
    .try_steal = user_page_pipe_buf_try_steal,
    .get       = generic_pipe_buf_get,
};
/*
 * Wake any readers sleeping on the pipe and notify async (SIGIO) waiters.
 */
static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
{
    /*
     * Full barrier before the lockless waitqueue_active() check —
     * standard wakeup idiom; presumably pairs with a barrier on the
     * sleeper side (see pipe wait code).
     */
    smp_mb();
    if (waitqueue_active(&pipe->rd_wait))
        wake_up_interruptible(&pipe->rd_wait);
    kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}
  163. /**
  164. * splice_to_pipe - fill passed data into a pipe
  165. * @pipe: pipe to fill
  166. * @spd: data to fill
  167. *
  168. * Description:
  169. * @spd contains a map of pages and len/offset tuples, along with
  170. * the struct pipe_buf_operations associated with these pages. This
  171. * function will link that data to the pipe.
  172. *
  173. */
ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
                       struct splice_pipe_desc *spd)
{
    unsigned int spd_pages = spd->nr_pages;
    unsigned int tail = pipe->tail;
    unsigned int head = pipe->head;
    unsigned int mask = pipe->ring_size - 1;
    ssize_t ret = 0;
    int page_nr = 0;

    if (!spd_pages)
        return 0;

    /* No readers left: raise SIGPIPE and fail with -EPIPE. */
    if (unlikely(!pipe->readers)) {
        send_sig(SIGPIPE, current, 0);
        ret = -EPIPE;
        goto out;
    }

    /* Fill ring slots until the pipe is full or @spd is exhausted. */
    while (!pipe_full(head, tail, pipe->max_usage)) {
        struct pipe_buffer *buf = &pipe->bufs[head & mask];

        buf->page = spd->pages[page_nr];
        buf->offset = spd->partial[page_nr].offset;
        buf->len = spd->partial[page_nr].len;
        buf->private = spd->partial[page_nr].private;
        buf->ops = spd->ops;
        buf->flags = 0;

        head++;
        pipe->head = head;
        page_nr++;
        ret += buf->len;

        if (!--spd->nr_pages)
            break;
    }

    /* Nothing fitted in the pipe at all. */
    if (!ret)
        ret = -EAGAIN;

out:
    /* Release any pages that were not linked into the pipe. */
    while (page_nr < spd_pages)
        spd->spd_release(spd, page_nr++);

    return ret;
}
EXPORT_SYMBOL_GPL(splice_to_pipe);
/*
 * Add a single pipe_buffer to the pipe.  On success ownership of the
 * buffer's page moves into the ring and the buffer's length is returned;
 * on failure the buffer is released and -EPIPE/-EAGAIN is returned.
 */
ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
    unsigned int head = pipe->head;
    unsigned int tail = pipe->tail;
    unsigned int mask = pipe->ring_size - 1;
    int ret;

    if (unlikely(!pipe->readers)) {
        /* Nobody will ever read this: signal broken pipe. */
        send_sig(SIGPIPE, current, 0);
        ret = -EPIPE;
    } else if (pipe_full(head, tail, pipe->max_usage)) {
        ret = -EAGAIN;
    } else {
        pipe->bufs[head & mask] = *buf;
        pipe->head = head + 1;
        return buf->len;
    }

    /* Failure path: drop the buffer we were handed. */
    pipe_buf_release(pipe, buf);
    return ret;
}
EXPORT_SYMBOL(add_to_pipe);
  233. /*
  234. * Check if we need to grow the arrays holding pages and partial page
  235. * descriptions.
  236. */
  237. int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
  238. {
  239. unsigned int max_usage = READ_ONCE(pipe->max_usage);
  240. spd->nr_pages_max = max_usage;
  241. if (max_usage <= PIPE_DEF_BUFFERS)
  242. return 0;
  243. spd->pages = kmalloc_array(max_usage, sizeof(struct page *), GFP_KERNEL);
  244. spd->partial = kmalloc_array(max_usage, sizeof(struct partial_page),
  245. GFP_KERNEL);
  246. if (spd->pages && spd->partial)
  247. return 0;
  248. kfree(spd->pages);
  249. kfree(spd->partial);
  250. return -ENOMEM;
  251. }
  252. void splice_shrink_spd(struct splice_pipe_desc *spd)
  253. {
  254. if (spd->nr_pages_max <= PIPE_DEF_BUFFERS)
  255. return;
  256. kfree(spd->pages);
  257. kfree(spd->partial);
  258. }
  259. /**
  260. * copy_splice_read - Copy data from a file and splice the copy into a pipe
  261. * @in: The file to read from
  262. * @ppos: Pointer to the file position to read from
  263. * @pipe: The pipe to splice into
  264. * @len: The amount to splice
  265. * @flags: The SPLICE_F_* flags
  266. *
  267. * This function allocates a bunch of pages sufficient to hold the requested
  268. * amount of data (but limited by the remaining pipe capacity), passes it to
  269. * the file's ->read_iter() to read into and then splices the used pages into
  270. * the pipe.
  271. *
  272. * Return: On success, the number of bytes read will be returned and *@ppos
  273. * will be updated if appropriate; 0 will be returned if there is no more data
  274. * to be read; -EAGAIN will be returned if the pipe had no space, and some
  275. * other negative error code will be returned on error. A short read may occur
  276. * if the pipe has insufficient space, we reach the end of the data or we hit a
  277. * hole.
  278. */
ssize_t copy_splice_read(struct file *in, loff_t *ppos,
                         struct pipe_inode_info *pipe,
                         size_t len, unsigned int flags)
{
    struct iov_iter to;
    struct bio_vec *bv;
    struct kiocb kiocb;
    struct page **pages;
    ssize_t ret;
    size_t used, npages, chunk, remain, keep = 0;
    int i;

    /* Work out how much data we can actually add into the pipe */
    used = pipe_occupancy(pipe->head, pipe->tail);
    npages = max_t(ssize_t, pipe->max_usage - used, 0);
    len = min_t(size_t, len, npages * PAGE_SIZE);
    npages = DIV_ROUND_UP(len, PAGE_SIZE);

    /* One allocation carries both the bio_vecs and the page pointers. */
    bv = kzalloc(array_size(npages, sizeof(bv[0])) +
                 array_size(npages, sizeof(struct page *)), GFP_KERNEL);
    if (!bv)
        return -ENOMEM;

    pages = (struct page **)(bv + npages);
    npages = alloc_pages_bulk_array(GFP_USER, npages, pages);
    if (!npages) {
        kfree(bv);
        return -ENOMEM;
    }

    /* Bulk allocation may have returned fewer pages than requested. */
    remain = len = min_t(size_t, len, npages * PAGE_SIZE);

    for (i = 0; i < npages; i++) {
        chunk = min_t(size_t, PAGE_SIZE, remain);
        bv[i].bv_page = pages[i];
        bv[i].bv_offset = 0;
        bv[i].bv_len = chunk;
        remain -= chunk;
    }

    /* Do the I/O */
    iov_iter_bvec(&to, ITER_DEST, bv, npages, len);
    init_sync_kiocb(&kiocb, in);
    kiocb.ki_pos = *ppos;
    ret = in->f_op->read_iter(&kiocb, &to);

    if (ret > 0) {
        /* Number of pages that actually received data. */
        keep = DIV_ROUND_UP(ret, PAGE_SIZE);
        *ppos = kiocb.ki_pos;
    }

    /*
     * Callers of ->splice_read() expect -EAGAIN on "can't put anything in
     * there", rather than -EFAULT.
     */
    if (ret == -EFAULT)
        ret = -EAGAIN;

    /* Free any pages that didn't get touched at all. */
    if (keep < npages)
        release_pages(pages + keep, npages - keep);

    /* Push the remaining pages into the pipe. */
    remain = ret;
    for (i = 0; i < keep; i++) {
        struct pipe_buffer *buf = pipe_head_buf(pipe);

        chunk = min_t(size_t, remain, PAGE_SIZE);
        *buf = (struct pipe_buffer) {
            .ops    = &default_pipe_buf_ops,
            .page   = bv[i].bv_page,
            .offset = 0,
            .len    = chunk,
        };
        pipe->head++;
        remain -= chunk;
    }

    kfree(bv);
    return ret;
}
EXPORT_SYMBOL(copy_splice_read);
/* Generic buffer ops for pages that splice allocated itself. */
const struct pipe_buf_operations default_pipe_buf_ops = {
    .release   = generic_pipe_buf_release,
    .try_steal = generic_pipe_buf_try_steal,
    .get       = generic_pipe_buf_get,
};
  354. /* Pipe buffer operations for a socket and similar. */
/* Like default_pipe_buf_ops, but with no .try_steal: pages stay put. */
const struct pipe_buf_operations nosteal_pipe_buf_ops = {
    .release = generic_pipe_buf_release,
    .get     = generic_pipe_buf_get,
};
EXPORT_SYMBOL(nosteal_pipe_buf_ops);
/*
 * Wake any writers sleeping on the pipe and notify async (SIGIO) waiters.
 */
static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
{
    /*
     * Full barrier before the lockless waitqueue_active() check —
     * standard wakeup idiom; presumably pairs with a barrier on the
     * sleeper side.
     */
    smp_mb();
    if (waitqueue_active(&pipe->wr_wait))
        wake_up_interruptible(&pipe->wr_wait);
    kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
  367. /**
  368. * splice_from_pipe_feed - feed available data from a pipe to a file
  369. * @pipe: pipe to splice from
  370. * @sd: information to @actor
  371. * @actor: handler that splices the data
  372. *
  373. * Description:
  374. * This function loops over the pipe and calls @actor to do the
  375. * actual moving of a single struct pipe_buffer to the desired
  376. * destination. It returns when there's no more buffers left in
  377. * the pipe or if the requested number of bytes (@sd->total_len)
  378. * have been copied. It returns a positive number (one) if the
  379. * pipe needs to be filled with more data, zero if the required
  380. * number of bytes have been copied and -errno on error.
  381. *
  382. * This, together with splice_from_pipe_{begin,end,next}, may be
  383. * used to implement the functionality of __splice_from_pipe() when
  384. * locking is required around copying the pipe buffers to the
  385. * destination.
  386. */
static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
                                 splice_actor *actor)
{
    unsigned int head = pipe->head;
    unsigned int tail = pipe->tail;
    unsigned int mask = pipe->ring_size - 1;
    int ret;

    while (!pipe_empty(head, tail)) {
        struct pipe_buffer *buf = &pipe->bufs[tail & mask];

        /* Cap this step at the remaining request size. */
        sd->len = buf->len;
        if (sd->len > sd->total_len)
            sd->len = sd->total_len;

        /* Make sure the buffer contents are ready (I/O completed). */
        ret = pipe_buf_confirm(pipe, buf);
        if (unlikely(ret)) {
            if (ret == -ENODATA)
                ret = 0;
            return ret;
        }

        ret = actor(pipe, buf, sd);
        if (ret <= 0)
            return ret;

        /* Advance buffer position and splice bookkeeping by @ret. */
        buf->offset += ret;
        buf->len -= ret;

        sd->num_spliced += ret;
        sd->len -= ret;
        sd->pos += ret;
        sd->total_len -= ret;

        if (!buf->len) {
            /* Buffer fully consumed: release it and retire the slot. */
            pipe_buf_release(pipe, buf);
            tail++;
            pipe->tail = tail;
            if (pipe->files)
                sd->need_wakeup = true;
        }

        if (!sd->total_len)
            return 0;
    }

    return 1;
}
  426. /* We know we have a pipe buffer, but maybe it's empty? */
  427. static inline bool eat_empty_buffer(struct pipe_inode_info *pipe)
  428. {
  429. unsigned int tail = pipe->tail;
  430. unsigned int mask = pipe->ring_size - 1;
  431. struct pipe_buffer *buf = &pipe->bufs[tail & mask];
  432. if (unlikely(!buf->len)) {
  433. pipe_buf_release(pipe, buf);
  434. pipe->tail = tail+1;
  435. return true;
  436. }
  437. return false;
  438. }
  439. /**
  440. * splice_from_pipe_next - wait for some data to splice from
  441. * @pipe: pipe to splice from
  442. * @sd: information about the splice operation
  443. *
  444. * Description:
  445. * This function will wait for some data and return a positive
  446. * value (one) if pipe buffers are available. It will return zero
  447. * or -errno if no more data needs to be spliced.
  448. */
static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
{
    /*
     * Check for signal early to make process killable when there are
     * always buffers available
     */
    if (signal_pending(current))
        return -ERESTARTSYS;

repeat:
    while (pipe_empty(pipe->head, pipe->tail)) {
        /* Writer side gone: no more data will ever arrive. */
        if (!pipe->writers)
            return 0;

        /* Already spliced something: return that instead of sleeping. */
        if (sd->num_spliced)
            return 0;

        if (sd->flags & SPLICE_F_NONBLOCK)
            return -EAGAIN;

        if (signal_pending(current))
            return -ERESTARTSYS;

        /* Let writers refill the pipe before we go to sleep. */
        if (sd->need_wakeup) {
            wakeup_pipe_writers(pipe);
            sd->need_wakeup = false;
        }

        pipe_wait_readable(pipe);
    }

    /* Skip zero-length buffers, then re-check for emptiness. */
    if (eat_empty_buffer(pipe))
        goto repeat;

    return 1;
}
  477. /**
  478. * splice_from_pipe_begin - start splicing from pipe
  479. * @sd: information about the splice operation
  480. *
  481. * Description:
  482. * This function should be called before a loop containing
  483. * splice_from_pipe_next() and splice_from_pipe_feed() to
  484. * initialize the necessary fields of @sd.
  485. */
static void splice_from_pipe_begin(struct splice_desc *sd)
{
    /* Reset per-operation bookkeeping before the splice loop starts. */
    sd->num_spliced = 0;
    sd->need_wakeup = false;
}
  491. /**
  492. * splice_from_pipe_end - finish splicing from pipe
  493. * @pipe: pipe to splice from
  494. * @sd: information about the splice operation
  495. *
  496. * Description:
  497. * This function will wake up pipe writers if necessary. It should
  498. * be called after a loop containing splice_from_pipe_next() and
  499. * splice_from_pipe_feed().
  500. */
static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
{
    /* Deferred wakeup: only poke writers if the feed loop retired buffers. */
    if (sd->need_wakeup)
        wakeup_pipe_writers(pipe);
}
  506. /**
  507. * __splice_from_pipe - splice data from a pipe to given actor
  508. * @pipe: pipe to splice from
  509. * @sd: information to @actor
  510. * @actor: handler that splices the data
  511. *
  512. * Description:
  513. * This function does little more than loop over the pipe and call
  514. * @actor to do the actual moving of a single struct pipe_buffer to
  515. * the desired destination. See pipe_to_file, pipe_to_sendmsg, or
  516. * pipe_to_user.
  517. *
  518. */
  519. ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
  520. splice_actor *actor)
  521. {
  522. int ret;
  523. splice_from_pipe_begin(sd);
  524. do {
  525. cond_resched();
  526. ret = splice_from_pipe_next(pipe, sd);
  527. if (ret > 0)
  528. ret = splice_from_pipe_feed(pipe, sd, actor);
  529. } while (ret > 0);
  530. splice_from_pipe_end(pipe, sd);
  531. return sd->num_spliced ? sd->num_spliced : ret;
  532. }
  533. EXPORT_SYMBOL(__splice_from_pipe);
  534. /**
  535. * splice_from_pipe - splice data from a pipe to a file
  536. * @pipe: pipe to splice from
  537. * @out: file to splice to
  538. * @ppos: position in @out
  539. * @len: how many bytes to splice
  540. * @flags: splice modifier flags
  541. * @actor: handler that splices the data
  542. *
  543. * Description:
  544. * See __splice_from_pipe. This function locks the pipe inode,
  545. * otherwise it's identical to __splice_from_pipe().
  546. *
  547. */
  548. ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
  549. loff_t *ppos, size_t len, unsigned int flags,
  550. splice_actor *actor)
  551. {
  552. ssize_t ret;
  553. struct splice_desc sd = {
  554. .total_len = len,
  555. .flags = flags,
  556. .pos = *ppos,
  557. .u.file = out,
  558. };
  559. pipe_lock(pipe);
  560. ret = __splice_from_pipe(pipe, &sd, actor);
  561. pipe_unlock(pipe);
  562. return ret;
  563. }
  564. /**
  565. * iter_file_splice_write - splice data from a pipe to a file
  566. * @pipe: pipe info
  567. * @out: file to write to
  568. * @ppos: position in @out
  569. * @len: number of bytes to splice
  570. * @flags: splice modifier flags
  571. *
  572. * Description:
  573. * Will either move or copy pages (determined by @flags options) from
  574. * the given pipe inode to the given file.
  575. * This one is ->write_iter-based.
  576. *
  577. */
ssize_t
iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
                       loff_t *ppos, size_t len, unsigned int flags)
{
    struct splice_desc sd = {
        .total_len = len,
        .flags = flags,
        .pos = *ppos,
        .u.file = out,
    };
    int nbufs = pipe->max_usage;
    struct bio_vec *array;
    ssize_t ret;

    if (!out->f_op->write_iter)
        return -EINVAL;

    array = kcalloc(nbufs, sizeof(struct bio_vec), GFP_KERNEL);
    if (unlikely(!array))
        return -ENOMEM;

    pipe_lock(pipe);

    splice_from_pipe_begin(&sd);
    while (sd.total_len) {
        struct kiocb kiocb;
        struct iov_iter from;
        unsigned int head, tail, mask;
        size_t left;
        int n;

        ret = splice_from_pipe_next(pipe, &sd);
        if (ret <= 0)
            break;

        /* Pipe may have been resized: grow our bvec array to match. */
        if (unlikely(nbufs < pipe->max_usage)) {
            kfree(array);
            nbufs = pipe->max_usage;
            array = kcalloc(nbufs, sizeof(struct bio_vec),
                            GFP_KERNEL);
            if (!array) {
                ret = -ENOMEM;
                break;
            }
        }

        head = pipe->head;
        tail = pipe->tail;
        mask = pipe->ring_size - 1;

        /* build the vector */
        left = sd.total_len;
        for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++) {
            struct pipe_buffer *buf = &pipe->bufs[tail & mask];
            size_t this_len = buf->len;

            /* zero-length bvecs are not supported, skip them */
            if (!this_len)
                continue;
            this_len = min(this_len, left);

            ret = pipe_buf_confirm(pipe, buf);
            if (unlikely(ret)) {
                if (ret == -ENODATA)
                    ret = 0;
                goto done;
            }

            bvec_set_page(&array[n], buf->page, this_len,
                          buf->offset);
            left -= this_len;
            n++;
        }

        iov_iter_bvec(&from, ITER_SOURCE, array, n, sd.total_len - left);
        init_sync_kiocb(&kiocb, out);
        kiocb.ki_pos = sd.pos;
        ret = out->f_op->write_iter(&kiocb, &from);
        sd.pos = kiocb.ki_pos;
        if (ret <= 0)
            break;

        WARN_ONCE(ret > sd.total_len - left,
                  "Splice Exceeded! ret=%zd tot=%zu left=%zu\n",
                  ret, sd.total_len, left);

        sd.num_spliced += ret;
        sd.total_len -= ret;
        *ppos = sd.pos;

        /* dismiss the fully eaten buffers, adjust the partial one */
        tail = pipe->tail;
        while (ret) {
            struct pipe_buffer *buf = &pipe->bufs[tail & mask];

            if (ret >= buf->len) {
                /* Buffer entirely written: release and retire it. */
                ret -= buf->len;
                buf->len = 0;
                pipe_buf_release(pipe, buf);
                tail++;
                pipe->tail = tail;
                if (pipe->files)
                    sd.need_wakeup = true;
            } else {
                /* Partial write: keep the unwritten remainder. */
                buf->offset += ret;
                buf->len -= ret;
                ret = 0;
            }
        }
    }
done:
    kfree(array);
    splice_from_pipe_end(pipe, &sd);

    pipe_unlock(pipe);

    if (sd.num_spliced)
        ret = sd.num_spliced;

    return ret;
}
EXPORT_SYMBOL(iter_file_splice_write);
  681. #ifdef CONFIG_NET
  682. /**
  683. * splice_to_socket - splice data from a pipe to a socket
  684. * @pipe: pipe to splice from
  685. * @out: socket to write to
  686. * @ppos: position in @out
  687. * @len: number of bytes to splice
  688. * @flags: splice modifier flags
  689. *
  690. * Description:
  691. * Will send @len bytes from the pipe to a network socket. No data copying
  692. * is involved.
  693. *
  694. */
ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out,
                         loff_t *ppos, size_t len, unsigned int flags)
{
    struct socket *sock = sock_from_file(out);
    struct bio_vec bvec[16];
    struct msghdr msg = {};
    ssize_t ret = 0;
    size_t spliced = 0;
    bool need_wakeup = false;

    pipe_lock(pipe);

    while (len > 0) {
        unsigned int head, tail, mask, bc = 0;
        size_t remain = len;

        /*
         * Check for signal early to make process killable when there
         * are always buffers available
         */
        ret = -ERESTARTSYS;
        if (signal_pending(current))
            break;

        /* Wait for data, honouring non-blocking mode and signals. */
        while (pipe_empty(pipe->head, pipe->tail)) {
            ret = 0;
            if (!pipe->writers)
                goto out;
            if (spliced)
                goto out;
            ret = -EAGAIN;
            if (flags & SPLICE_F_NONBLOCK)
                goto out;
            ret = -ERESTARTSYS;
            if (signal_pending(current))
                goto out;
            if (need_wakeup) {
                wakeup_pipe_writers(pipe);
                need_wakeup = false;
            }
            pipe_wait_readable(pipe);
        }

        head = pipe->head;
        tail = pipe->tail;
        mask = pipe->ring_size - 1;

        /* Batch up to ARRAY_SIZE(bvec) non-empty buffers per sendmsg. */
        while (!pipe_empty(head, tail)) {
            struct pipe_buffer *buf = &pipe->bufs[tail & mask];
            size_t seg;

            if (!buf->len) {
                tail++;
                continue;
            }

            seg = min_t(size_t, remain, buf->len);

            ret = pipe_buf_confirm(pipe, buf);
            if (unlikely(ret)) {
                if (ret == -ENODATA)
                    ret = 0;
                break;
            }

            bvec_set_page(&bvec[bc++], buf->page, seg, buf->offset);
            remain -= seg;
            if (remain == 0 || bc >= ARRAY_SIZE(bvec))
                break;
            tail++;
        }

        if (!bc)
            break;

        msg.msg_flags = MSG_SPLICE_PAGES;
        if (flags & SPLICE_F_MORE)
            msg.msg_flags |= MSG_MORE;
        /* More data still queued in the pipe: hint the protocol. */
        if (remain && pipe_occupancy(pipe->head, tail) > 0)
            msg.msg_flags |= MSG_MORE;
        if (out->f_flags & O_NONBLOCK)
            msg.msg_flags |= MSG_DONTWAIT;

        iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, bvec, bc,
                      len - remain);
        ret = sock_sendmsg(sock, &msg);
        if (ret <= 0)
            break;

        spliced += ret;
        len -= ret;
        tail = pipe->tail;

        /* Consume what the socket accepted; keep any partial buffer. */
        while (ret > 0) {
            struct pipe_buffer *buf = &pipe->bufs[tail & mask];
            size_t seg = min_t(size_t, ret, buf->len);

            buf->offset += seg;
            buf->len -= seg;
            ret -= seg;

            if (!buf->len) {
                pipe_buf_release(pipe, buf);
                tail++;
            }
        }

        if (tail != pipe->tail) {
            pipe->tail = tail;
            if (pipe->files)
                need_wakeup = true;
        }
    }

out:
    pipe_unlock(pipe);
    if (need_wakeup)
        wakeup_pipe_writers(pipe);
    /* Bytes sent if any, otherwise the last error/zero result. */
    return spliced ?: ret;
}
  796. #endif
/*
 * Complain (rate-limited, debug level) that @file lacks the requested
 * splice method @op, and fail with -EINVAL.
 */
static int warn_unsupported(struct file *file, const char *op)
{
	pr_debug_ratelimited(
		"splice %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
		op, file, current->pid, current->comm);
	return -EINVAL;
}
  804. /*
  805. * Attempt to initiate a splice from pipe to file.
  806. */
  807. static ssize_t do_splice_from(struct pipe_inode_info *pipe, struct file *out,
  808. loff_t *ppos, size_t len, unsigned int flags)
  809. {
  810. if (unlikely(!out->f_op->splice_write))
  811. return warn_unsupported(out, "write");
  812. return out->f_op->splice_write(pipe, out, ppos, len, flags);
  813. }
  814. /*
  815. * Indicate to the caller that there was a premature EOF when reading from the
  816. * source and the caller didn't indicate they would be sending more data after
  817. * this.
  818. */
  819. static void do_splice_eof(struct splice_desc *sd)
  820. {
  821. if (sd->splice_eof)
  822. sd->splice_eof(sd);
  823. }
/*
 * Callers already called rw_verify_area() on the entire range.
 * No need to call it for sub ranges.
 */
static ssize_t do_splice_read(struct file *in, loff_t *ppos,
			      struct pipe_inode_info *pipe, size_t len,
			      unsigned int flags)
{
	unsigned int p_space;

	if (unlikely(!(in->f_mode & FMODE_READ)))
		return -EBADF;
	if (!len)
		return 0;

	/* Don't try to read more than the pipe has space for. */
	p_space = pipe->max_usage - pipe_occupancy(pipe->head, pipe->tail);
	len = min_t(size_t, len, p_space << PAGE_SHIFT);

	/* Also cap a single splice at the generic read/write limit. */
	if (unlikely(len > MAX_RW_COUNT))
		len = MAX_RW_COUNT;

	if (unlikely(!in->f_op->splice_read))
		return warn_unsupported(in, "read");
	/*
	 * O_DIRECT and DAX don't deal with the pagecache, so we allocate a
	 * buffer, copy into it and splice that into the pipe.
	 */
	if ((in->f_flags & O_DIRECT) || IS_DAX(in->f_mapping->host))
		return copy_splice_read(in, ppos, pipe, len, flags);
	return in->f_op->splice_read(in, ppos, pipe, len, flags);
}
  852. /**
  853. * vfs_splice_read - Read data from a file and splice it into a pipe
  854. * @in: File to splice from
  855. * @ppos: Input file offset
  856. * @pipe: Pipe to splice to
  857. * @len: Number of bytes to splice
  858. * @flags: Splice modifier flags (SPLICE_F_*)
  859. *
  860. * Splice the requested amount of data from the input file to the pipe. This
  861. * is synchronous as the caller must hold the pipe lock across the entire
  862. * operation.
  863. *
  864. * If successful, it returns the amount of data spliced, 0 if it hit the EOF or
  865. * a hole and a negative error code otherwise.
  866. */
  867. ssize_t vfs_splice_read(struct file *in, loff_t *ppos,
  868. struct pipe_inode_info *pipe, size_t len,
  869. unsigned int flags)
  870. {
  871. ssize_t ret;
  872. ret = rw_verify_area(READ, in, ppos, len);
  873. if (unlikely(ret < 0))
  874. return ret;
  875. return do_splice_read(in, ppos, pipe, len, flags);
  876. }
  877. EXPORT_SYMBOL_GPL(vfs_splice_read);
/**
 * splice_direct_to_actor - splices data directly between two non-pipes
 * @in: file to splice from
 * @sd: actor information on where to splice to
 * @actor: handles the data splicing
 *
 * Description:
 *    This is a special case helper to splice directly between two
 *    points, without requiring an explicit pipe. Internally an allocated
 *    pipe is cached in the process, and reused during the lifetime of
 *    that process.
 *
 */
ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
			       splice_direct_actor *actor)
{
	struct pipe_inode_info *pipe;
	ssize_t ret, bytes;
	size_t len;
	int i, flags, more;

	/*
	 * We require the input to be seekable, as we don't want to randomly
	 * drop data for eg socket -> socket splicing. Use the piped splicing
	 * for that!
	 */
	if (unlikely(!(in->f_mode & FMODE_LSEEK)))
		return -EINVAL;

	/*
	 * neither in nor out is a pipe, setup an internal pipe attached to
	 * 'out' and transfer the wanted data from 'in' to 'out' through that
	 */
	pipe = current->splice_pipe;
	if (unlikely(!pipe)) {
		pipe = alloc_pipe_info();
		if (!pipe)
			return -ENOMEM;

		/*
		 * We don't have an immediate reader, but we'll read the stuff
		 * out of the pipe right after the splice_to_pipe(). So set
		 * PIPE_READERS appropriately.
		 */
		pipe->readers = 1;

		/* Cache the pipe on the task for reuse. */
		current->splice_pipe = pipe;
	}

	/*
	 * Do the splice.
	 */
	bytes = 0;
	len = sd->total_len;

	/* Don't block on output, we have to drain the direct pipe. */
	flags = sd->flags;
	sd->flags &= ~SPLICE_F_NONBLOCK;

	/*
	 * We signal MORE until we've read sufficient data to fulfill the
	 * request and we keep signalling it if the caller set it.
	 */
	more = sd->flags & SPLICE_F_MORE;
	sd->flags |= SPLICE_F_MORE;

	WARN_ON_ONCE(!pipe_empty(pipe->head, pipe->tail));

	while (len) {
		size_t read_len;
		loff_t pos = sd->pos, prev_pos = pos;

		/* Fill the internal pipe from @in ... */
		ret = do_splice_read(in, &pos, pipe, len, flags);
		if (unlikely(ret <= 0))
			goto read_failure;

		read_len = ret;
		sd->total_len = read_len;

		/*
		 * If we now have sufficient data to fulfill the request then
		 * we clear SPLICE_F_MORE if it was not set initially.
		 */
		if (read_len >= len && !more)
			sd->flags &= ~SPLICE_F_MORE;

		/*
		 * NOTE: nonblocking mode only applies to the input. We
		 * must not do the output in nonblocking mode as then we
		 * could get stuck data in the internal pipe:
		 */
		ret = actor(pipe, sd);
		if (unlikely(ret <= 0)) {
			sd->pos = prev_pos;
			goto out_release;
		}

		bytes += ret;
		len -= ret;
		sd->pos = pos;

		/* Short write by the actor: rewind position and stop. */
		if (ret < read_len) {
			sd->pos = prev_pos + ret;
			goto out_release;
		}
	}

done:
	/* The internal pipe is fully drained; reset its ring. */
	pipe->tail = pipe->head = 0;
	file_accessed(in);
	return bytes;

read_failure:
	/*
	 * If the user did *not* set SPLICE_F_MORE *and* we didn't hit that
	 * "use all of len" case that cleared SPLICE_F_MORE, *and* we did a
	 * "->splice_in()" that returned EOF (ie zero) *and* we have sent at
	 * least 1 byte *then* we will also do the ->splice_eof() call.
	 */
	if (ret == 0 && !more && len > 0 && bytes)
		do_splice_eof(sd);
out_release:
	/*
	 * If we did an incomplete transfer we must release
	 * the pipe buffers in question:
	 */
	for (i = 0; i < pipe->ring_size; i++) {
		struct pipe_buffer *buf = &pipe->bufs[i];

		if (buf->ops)
			pipe_buf_release(pipe, buf);
	}

	if (!bytes)
		bytes = ret;

	goto done;
}
EXPORT_SYMBOL(splice_direct_to_actor);
  997. static int direct_splice_actor(struct pipe_inode_info *pipe,
  998. struct splice_desc *sd)
  999. {
  1000. struct file *file = sd->u.file;
  1001. long ret;
  1002. file_start_write(file);
  1003. ret = do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags);
  1004. file_end_write(file);
  1005. return ret;
  1006. }
  1007. static int splice_file_range_actor(struct pipe_inode_info *pipe,
  1008. struct splice_desc *sd)
  1009. {
  1010. struct file *file = sd->u.file;
  1011. return do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags);
  1012. }
  1013. static void direct_file_splice_eof(struct splice_desc *sd)
  1014. {
  1015. struct file *file = sd->u.file;
  1016. if (file->f_op->splice_eof)
  1017. file->f_op->splice_eof(file);
  1018. }
  1019. static ssize_t do_splice_direct_actor(struct file *in, loff_t *ppos,
  1020. struct file *out, loff_t *opos,
  1021. size_t len, unsigned int flags,
  1022. splice_direct_actor *actor)
  1023. {
  1024. struct splice_desc sd = {
  1025. .len = len,
  1026. .total_len = len,
  1027. .flags = flags,
  1028. .pos = *ppos,
  1029. .u.file = out,
  1030. .splice_eof = direct_file_splice_eof,
  1031. .opos = opos,
  1032. };
  1033. ssize_t ret;
  1034. if (unlikely(!(out->f_mode & FMODE_WRITE)))
  1035. return -EBADF;
  1036. if (unlikely(out->f_flags & O_APPEND))
  1037. return -EINVAL;
  1038. ret = splice_direct_to_actor(in, &sd, actor);
  1039. if (ret > 0)
  1040. *ppos = sd.pos;
  1041. return ret;
  1042. }
  1043. /**
  1044. * do_splice_direct - splices data directly between two files
  1045. * @in: file to splice from
  1046. * @ppos: input file offset
  1047. * @out: file to splice to
  1048. * @opos: output file offset
  1049. * @len: number of bytes to splice
  1050. * @flags: splice modifier flags
  1051. *
  1052. * Description:
  1053. * For use by do_sendfile(). splice can easily emulate sendfile, but
  1054. * doing it in the application would incur an extra system call
  1055. * (splice in + splice out, as compared to just sendfile()). So this helper
  1056. * can splice directly through a process-private pipe.
  1057. *
  1058. * Callers already called rw_verify_area() on the entire range.
  1059. */
  1060. ssize_t do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
  1061. loff_t *opos, size_t len, unsigned int flags)
  1062. {
  1063. return do_splice_direct_actor(in, ppos, out, opos, len, flags,
  1064. direct_splice_actor);
  1065. }
  1066. EXPORT_SYMBOL(do_splice_direct);
  1067. /**
  1068. * splice_file_range - splices data between two files for copy_file_range()
  1069. * @in: file to splice from
  1070. * @ppos: input file offset
  1071. * @out: file to splice to
  1072. * @opos: output file offset
  1073. * @len: number of bytes to splice
  1074. *
  1075. * Description:
  1076. * For use by ->copy_file_range() methods.
  1077. * Like do_splice_direct(), but vfs_copy_file_range() already holds
  1078. * start_file_write() on @out file.
  1079. *
  1080. * Callers already called rw_verify_area() on the entire range.
  1081. */
  1082. ssize_t splice_file_range(struct file *in, loff_t *ppos, struct file *out,
  1083. loff_t *opos, size_t len)
  1084. {
  1085. lockdep_assert(file_write_started(out));
  1086. return do_splice_direct_actor(in, ppos, out, opos,
  1087. min_t(size_t, len, MAX_RW_COUNT),
  1088. 0, splice_file_range_actor);
  1089. }
  1090. EXPORT_SYMBOL(splice_file_range);
/*
 * Wait until the pipe (locked by the caller) has room for at least one
 * buffer. Returns 0 with room available, -EPIPE (plus SIGPIPE) if there
 * are no readers, -EAGAIN in nonblocking mode, or -ERESTARTSYS when a
 * signal is pending.
 */
static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
{
	for (;;) {
		/* Readers must be checked first, even if there is room. */
		if (unlikely(!pipe->readers)) {
			send_sig(SIGPIPE, current, 0);
			return -EPIPE;
		}
		if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
			return 0;
		if (flags & SPLICE_F_NONBLOCK)
			return -EAGAIN;
		if (signal_pending(current))
			return -ERESTARTSYS;
		pipe_wait_writable(pipe);
	}
}
  1107. static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
  1108. struct pipe_inode_info *opipe,
  1109. size_t len, unsigned int flags);
  1110. ssize_t splice_file_to_pipe(struct file *in,
  1111. struct pipe_inode_info *opipe,
  1112. loff_t *offset,
  1113. size_t len, unsigned int flags)
  1114. {
  1115. ssize_t ret;
  1116. pipe_lock(opipe);
  1117. ret = wait_for_space(opipe, flags);
  1118. if (!ret)
  1119. ret = do_splice_read(in, offset, opipe, len, flags);
  1120. pipe_unlock(opipe);
  1121. if (ret > 0)
  1122. wakeup_pipe_readers(opipe);
  1123. return ret;
  1124. }
/*
 * Determine where to splice to/from: dispatch to pipe->pipe, pipe->file
 * or file->pipe transfer depending on which side(s) are pipes.
 */
ssize_t do_splice(struct file *in, loff_t *off_in, struct file *out,
		  loff_t *off_out, size_t len, unsigned int flags)
{
	struct pipe_inode_info *ipipe;
	struct pipe_inode_info *opipe;
	loff_t offset;
	ssize_t ret;

	if (unlikely(!(in->f_mode & FMODE_READ) ||
		     !(out->f_mode & FMODE_WRITE)))
		return -EBADF;

	ipipe = get_pipe_info(in, true);
	opipe = get_pipe_info(out, true);

	if (ipipe && opipe) {
		/* pipe -> pipe: explicit offsets make no sense. */
		if (off_in || off_out)
			return -ESPIPE;

		/* Splicing to self would be fun, but... */
		if (ipipe == opipe)
			return -EINVAL;

		if ((in->f_flags | out->f_flags) & O_NONBLOCK)
			flags |= SPLICE_F_NONBLOCK;

		ret = splice_pipe_to_pipe(ipipe, opipe, len, flags);
	} else if (ipipe) {
		/* pipe -> file */
		if (off_in)
			return -ESPIPE;
		if (off_out) {
			if (!(out->f_mode & FMODE_PWRITE))
				return -EINVAL;
			offset = *off_out;
		} else {
			offset = out->f_pos;
		}
		if (unlikely(out->f_flags & O_APPEND))
			return -EINVAL;

		ret = rw_verify_area(WRITE, out, &offset, len);
		if (unlikely(ret < 0))
			return ret;

		if (in->f_flags & O_NONBLOCK)
			flags |= SPLICE_F_NONBLOCK;

		file_start_write(out);
		ret = do_splice_from(ipipe, out, &offset, len, flags);
		file_end_write(out);

		/* Propagate the advanced offset back to the caller. */
		if (!off_out)
			out->f_pos = offset;
		else
			*off_out = offset;
	} else if (opipe) {
		/* file -> pipe */
		if (off_out)
			return -ESPIPE;
		if (off_in) {
			if (!(in->f_mode & FMODE_PREAD))
				return -EINVAL;
			offset = *off_in;
		} else {
			offset = in->f_pos;
		}

		ret = rw_verify_area(READ, in, &offset, len);
		if (unlikely(ret < 0))
			return ret;

		if (out->f_flags & O_NONBLOCK)
			flags |= SPLICE_F_NONBLOCK;

		ret = splice_file_to_pipe(in, opipe, &offset, len, flags);

		if (!off_in)
			in->f_pos = offset;
		else
			*off_in = offset;
	} else {
		/* Neither side is a pipe: not a splice. */
		ret = -EINVAL;
	}

	if (ret > 0) {
		/*
		 * Generate modify out before access in:
		 * do_splice_from() may've already sent modify out,
		 * and this ensures the events get merged.
		 */
		fsnotify_modify(out);
		fsnotify_access(in);
	}

	return ret;
}
/*
 * Userspace-facing wrapper for do_splice(): copies the offsets in from and
 * back out to user memory, and rejects/clears O_NONBLOCK-style "nowait" on
 * pipe ends.
 *
 * Note: __off_in and __off_out alias the same local @offset. That is safe
 * because do_splice() only consumes an offset for the non-pipe side, and
 * the checks below guarantee an offset is never supplied for a pipe end,
 * so at most one of the two is ever live.
 */
static ssize_t __do_splice(struct file *in, loff_t __user *off_in,
			   struct file *out, loff_t __user *off_out,
			   size_t len, unsigned int flags)
{
	struct pipe_inode_info *ipipe;
	struct pipe_inode_info *opipe;
	loff_t offset, *__off_in = NULL, *__off_out = NULL;
	ssize_t ret;

	ipipe = get_pipe_info(in, true);
	opipe = get_pipe_info(out, true);

	/* Offsets are meaningless on a pipe end. */
	if (ipipe) {
		if (off_in)
			return -ESPIPE;
		pipe_clear_nowait(in);
	}
	if (opipe) {
		if (off_out)
			return -ESPIPE;
		pipe_clear_nowait(out);
	}

	if (off_out) {
		if (copy_from_user(&offset, off_out, sizeof(loff_t)))
			return -EFAULT;
		__off_out = &offset;
	}
	if (off_in) {
		if (copy_from_user(&offset, off_in, sizeof(loff_t)))
			return -EFAULT;
		__off_in = &offset;
	}

	ret = do_splice(in, __off_in, out, __off_out, len, flags);
	if (ret < 0)
		return ret;

	/* Write the updated offsets back to userspace. */
	if (__off_out && copy_to_user(off_out, __off_out, sizeof(loff_t)))
		return -EFAULT;
	if (__off_in && copy_to_user(off_in, __off_in, sizeof(loff_t)))
		return -EFAULT;

	return ret;
}
/*
 * Pin the user pages described by @from and link them into @pipe as
 * user_page_pipe_buf buffers. Returns the number of bytes spliced, or a
 * negative error if nothing was transferred at all.
 */
static ssize_t iter_to_pipe(struct iov_iter *from,
			    struct pipe_inode_info *pipe,
			    unsigned int flags)
{
	struct pipe_buffer buf = {
		.ops = &user_page_pipe_buf_ops,
		.flags = flags
	};
	size_t total = 0;
	ssize_t ret = 0;

	while (iov_iter_count(from)) {
		struct page *pages[16];
		ssize_t left;
		size_t start;
		int i, n;

		/* Grab up to 16 pinned pages from the iterator. */
		left = iov_iter_get_pages2(from, pages, ~0UL, 16, &start);
		if (left <= 0) {
			ret = left;
			break;
		}

		n = DIV_ROUND_UP(left + start, PAGE_SIZE);
		for (i = 0; i < n; i++) {
			/* First page may start mid-page; later ones don't. */
			int size = min_t(int, left, PAGE_SIZE - start);

			buf.page = pages[i];
			buf.offset = start;
			buf.len = size;
			ret = add_to_pipe(pipe, &buf);
			if (unlikely(ret < 0)) {
				/* Undo the iterator advance for unsent data. */
				iov_iter_revert(from, left);
				// this one got dropped by add_to_pipe()
				while (++i < n)
					put_page(pages[i]);
				goto out;
			}
			total += ret;
			left -= size;
			start = 0;
		}
	}
out:
	return total ? total : ret;
}
  1288. static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
  1289. struct splice_desc *sd)
  1290. {
  1291. int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data);
  1292. return n == sd->len ? n : -EFAULT;
  1293. }
/*
 * For lack of a better implementation, implement vmsplice() to userspace
 * as a simple copy of the pipes pages to the user iov.
 */
static ssize_t vmsplice_to_user(struct file *file, struct iov_iter *iter,
				unsigned int flags)
{
	struct pipe_inode_info *pipe = get_pipe_info(file, true);
	struct splice_desc sd = {
		.total_len = iov_iter_count(iter),
		.flags = flags,
		.u.data = iter
	};
	ssize_t ret = 0;

	if (!pipe)
		return -EBADF;

	pipe_clear_nowait(file);

	/* Nothing requested? Skip taking the lock entirely. */
	if (sd.total_len) {
		pipe_lock(pipe);
		ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
		pipe_unlock(pipe);
	}

	if (ret > 0)
		fsnotify_access(file);

	return ret;
}
/*
 * vmsplice splices a user address range into a pipe. It can be thought of
 * as splice-from-memory, where the regular splice is splice-from-file (or
 * to file). In both cases the output is a pipe, naturally.
 */
static ssize_t vmsplice_to_pipe(struct file *file, struct iov_iter *iter,
				unsigned int flags)
{
	struct pipe_inode_info *pipe;
	ssize_t ret = 0;
	unsigned buf_flag = 0;

	/* SPLICE_F_GIFT lets the pipe steal the pages later. */
	if (flags & SPLICE_F_GIFT)
		buf_flag = PIPE_BUF_FLAG_GIFT;

	pipe = get_pipe_info(file, true);
	if (!pipe)
		return -EBADF;

	pipe_clear_nowait(file);

	pipe_lock(pipe);
	ret = wait_for_space(pipe, flags);
	if (!ret)
		ret = iter_to_pipe(iter, pipe, buf_flag);
	pipe_unlock(pipe);

	if (ret > 0) {
		wakeup_pipe_readers(pipe);
		fsnotify_modify(file);
	}

	return ret;
}
  1348. static int vmsplice_type(struct fd f, int *type)
  1349. {
  1350. if (!fd_file(f))
  1351. return -EBADF;
  1352. if (fd_file(f)->f_mode & FMODE_WRITE) {
  1353. *type = ITER_SOURCE;
  1354. } else if (fd_file(f)->f_mode & FMODE_READ) {
  1355. *type = ITER_DEST;
  1356. } else {
  1357. fdput(f);
  1358. return -EBADF;
  1359. }
  1360. return 0;
  1361. }
/*
 * Note that vmsplice only really supports true splicing _from_ user memory
 * to a pipe, not the other way around. Splicing from user memory is a simple
 * operation that can be supported without any funky alignment restrictions
 * or nasty vm tricks. We simply map in the user memory and fill them into
 * a pipe. The reverse isn't quite as easy, though. There are two possible
 * solutions for that:
 *
 *	- memcpy() the data internally, at which point we might as well just
 *	  do a regular read() on the buffer anyway.
 *	- Lots of nasty vm tricks, that are neither fast nor flexible (it
 *	  has restriction limitations on both ends of the pipe).
 *
 * Currently we punt and implement it as a normal copy, see pipe_to_user().
 *
 */
SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
		unsigned long, nr_segs, unsigned int, flags)
{
	struct iovec iovstack[UIO_FASTIOV];
	struct iovec *iov = iovstack;
	struct iov_iter iter;
	ssize_t error;
	struct fd f;
	int type;

	if (unlikely(flags & ~SPLICE_F_ALL))
		return -EINVAL;

	f = fdget(fd);
	/* vmsplice_type() drops the fd itself on a bad mode. */
	error = vmsplice_type(f, &type);
	if (error)
		return error;

	error = import_iovec(type, uiov, nr_segs,
			     ARRAY_SIZE(iovstack), &iov, &iter);
	if (error < 0)
		goto out_fdput;

	if (!iov_iter_count(&iter))
		error = 0;
	else if (type == ITER_SOURCE)
		error = vmsplice_to_pipe(fd_file(f), &iter, flags);
	else
		error = vmsplice_to_user(fd_file(f), &iter, flags);

	/* import_iovec() may have allocated a larger iov array. */
	kfree(iov);
out_fdput:
	fdput(f);
	return error;
}
/*
 * splice(2): look up both fds and hand off to __do_splice(), which copies
 * the user offsets and dispatches on the pipe/file combination.
 */
SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
		int, fd_out, loff_t __user *, off_out,
		size_t, len, unsigned int, flags)
{
	struct fd in, out;
	ssize_t error;

	if (unlikely(!len))
		return 0;

	if (unlikely(flags & ~SPLICE_F_ALL))
		return -EINVAL;

	error = -EBADF;
	in = fdget(fd_in);
	if (fd_file(in)) {
		out = fdget(fd_out);
		if (fd_file(out)) {
			error = __do_splice(fd_file(in), off_in, fd_file(out), off_out,
					    len, flags);
			fdput(out);
		}
		fdput(in);
	}
	return error;
}
/*
 * Make sure there's data to read. Wait for input if we can, otherwise
 * return an appropriate error.
 */
static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
	int ret;

	/*
	 * Check the pipe occupancy without the inode lock first. This function
	 * is speculative anyways, so missing one is ok.
	 */
	if (!pipe_empty(pipe->head, pipe->tail))
		return 0;

	ret = 0;
	pipe_lock(pipe);

	while (pipe_empty(pipe->head, pipe->tail)) {
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		/* No writers left: EOF, report success with an empty pipe. */
		if (!pipe->writers)
			break;
		if (flags & SPLICE_F_NONBLOCK) {
			ret = -EAGAIN;
			break;
		}
		pipe_wait_readable(pipe);
	}

	pipe_unlock(pipe);
	return ret;
}
/*
 * Make sure there's writeable room. Wait for room if we can, otherwise
 * return an appropriate error.
 */
static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
	int ret;

	/*
	 * Check pipe occupancy without the inode lock first. This function
	 * is speculative anyways, so missing one is ok.
	 */
	if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
		return 0;

	ret = 0;
	pipe_lock(pipe);

	while (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
		/* Nobody will ever drain a reader-less pipe: broken pipe. */
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			ret = -EPIPE;
			break;
		}
		if (flags & SPLICE_F_NONBLOCK) {
			ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		pipe_wait_writable(pipe);
	}

	pipe_unlock(pipe);
	return ret;
}
/*
 * Splice contents of ipipe to opipe: whole buffers are moved across,
 * a final partial buffer is duplicated (with a reference) and split.
 */
static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
			       struct pipe_inode_info *opipe,
			       size_t len, unsigned int flags)
{
	struct pipe_buffer *ibuf, *obuf;
	unsigned int i_head, o_head;
	unsigned int i_tail, o_tail;
	unsigned int i_mask, o_mask;
	int ret = 0;
	bool input_wakeup = false;

retry:
	ret = ipipe_prep(ipipe, flags);
	if (ret)
		return ret;

	ret = opipe_prep(opipe, flags);
	if (ret)
		return ret;

	/*
	 * Potential ABBA deadlock, work around it by ordering lock
	 * grabbing by pipe info address. Otherwise two different processes
	 * could deadlock (one doing tee from A -> B, the other from B -> A).
	 */
	pipe_double_lock(ipipe, opipe);

	i_tail = ipipe->tail;
	i_mask = ipipe->ring_size - 1;
	o_head = opipe->head;
	o_mask = opipe->ring_size - 1;

	do {
		size_t o_len;

		if (!opipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		/* Re-read the ends that the other side may have moved. */
		i_head = ipipe->head;
		o_tail = opipe->tail;

		/* Input drained and no writer will refill it: done. */
		if (pipe_empty(i_head, i_tail) && !ipipe->writers)
			break;

		/*
		 * Cannot make any progress, because either the input
		 * pipe is empty or the output pipe is full.
		 */
		if (pipe_empty(i_head, i_tail) ||
		    pipe_full(o_head, o_tail, opipe->max_usage)) {
			/* Already processed some buffers, break */
			if (ret)
				break;

			if (flags & SPLICE_F_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}

			/*
			 * We raced with another reader/writer and haven't
			 * managed to process any buffers. A zero return
			 * value means EOF, so retry instead.
			 */
			pipe_unlock(ipipe);
			pipe_unlock(opipe);
			goto retry;
		}

		ibuf = &ipipe->bufs[i_tail & i_mask];
		obuf = &opipe->bufs[o_head & o_mask];

		if (len >= ibuf->len) {
			/*
			 * Simply move the whole buffer from ipipe to opipe
			 */
			*obuf = *ibuf;
			ibuf->ops = NULL;
			i_tail++;
			ipipe->tail = i_tail;
			input_wakeup = true;
			o_len = obuf->len;
			o_head++;
			opipe->head = o_head;
		} else {
			/*
			 * Get a reference to this pipe buffer,
			 * so we can copy the contents over.
			 */
			if (!pipe_buf_get(ipipe, ibuf)) {
				if (ret == 0)
					ret = -EFAULT;
				break;
			}
			*obuf = *ibuf;

			/*
			 * Don't inherit the gift and merge flags, we need to
			 * prevent multiple steals of this page.
			 */
			obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
			obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;

			/* Split: first @len bytes go out, rest stays in. */
			obuf->len = len;
			ibuf->offset += len;
			ibuf->len -= len;
			o_len = len;
			o_head++;
			opipe->head = o_head;
		}
		ret += o_len;
		len -= o_len;
	} while (len);

	pipe_unlock(ipipe);
	pipe_unlock(opipe);

	/*
	 * If we put data in the output pipe, wakeup any potential readers.
	 */
	if (ret > 0)
		wakeup_pipe_readers(opipe);

	if (input_wakeup)
		wakeup_pipe_writers(ipipe);

	return ret;
}
/*
 * Link contents of ipipe to opipe: duplicate references to the input
 * buffers onto the output ring without consuming them (tee semantics).
 */
static ssize_t link_pipe(struct pipe_inode_info *ipipe,
			 struct pipe_inode_info *opipe,
			 size_t len, unsigned int flags)
{
	struct pipe_buffer *ibuf, *obuf;
	unsigned int i_head, o_head;
	unsigned int i_tail, o_tail;
	unsigned int i_mask, o_mask;
	ssize_t ret = 0;

	/*
	 * Potential ABBA deadlock, work around it by ordering lock
	 * grabbing by pipe info address. Otherwise two different processes
	 * could deadlock (one doing tee from A -> B, the other from B -> A).
	 */
	pipe_double_lock(ipipe, opipe);

	i_tail = ipipe->tail;
	i_mask = ipipe->ring_size - 1;
	o_head = opipe->head;
	o_mask = opipe->ring_size - 1;

	do {
		if (!opipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		i_head = ipipe->head;
		o_tail = opipe->tail;

		/*
		 * If we have iterated all input buffers or run out of
		 * output room, break.
		 */
		if (pipe_empty(i_head, i_tail) ||
		    pipe_full(o_head, o_tail, opipe->max_usage))
			break;

		ibuf = &ipipe->bufs[i_tail & i_mask];
		obuf = &opipe->bufs[o_head & o_mask];

		/*
		 * Get a reference to this pipe buffer,
		 * so we can copy the contents over.
		 */
		if (!pipe_buf_get(ipipe, ibuf)) {
			if (ret == 0)
				ret = -EFAULT;
			break;
		}

		*obuf = *ibuf;

		/*
		 * Don't inherit the gift and merge flag, we need to prevent
		 * multiple steals of this page.
		 */
		obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
		obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;

		/* Trim the duplicated buffer to the remaining request. */
		if (obuf->len > len)
			obuf->len = len;
		ret += obuf->len;
		len -= obuf->len;

		o_head++;
		opipe->head = o_head;
		i_tail++;
	} while (len);

	pipe_unlock(ipipe);
	pipe_unlock(opipe);

	/*
	 * If we put data in the output pipe, wakeup any potential readers.
	 */
	if (ret > 0)
		wakeup_pipe_readers(opipe);

	return ret;
}
/*
 * This is a tee(1) implementation that works on pipes. It doesn't copy
 * any data, it simply references the 'in' pages on the 'out' pipe.
 * The 'flags' used are the SPLICE_F_* variants, currently the only
 * applicable one is SPLICE_F_NONBLOCK.
 */
ssize_t do_tee(struct file *in, struct file *out, size_t len,
	       unsigned int flags)
{
	struct pipe_inode_info *ipipe = get_pipe_info(in, true);
	struct pipe_inode_info *opipe = get_pipe_info(out, true);
	ssize_t ret = -EINVAL;

	if (unlikely(!(in->f_mode & FMODE_READ) ||
		     !(out->f_mode & FMODE_WRITE)))
		return -EBADF;

	/*
	 * Duplicate the contents of ipipe to opipe without actually
	 * copying the data. Requires two distinct pipes.
	 */
	if (ipipe && opipe && ipipe != opipe) {
		if ((in->f_flags | out->f_flags) & O_NONBLOCK)
			flags |= SPLICE_F_NONBLOCK;

		/*
		 * Keep going, unless we encounter an error. The ipipe/opipe
		 * ordering doesn't really matter.
		 */
		ret = ipipe_prep(ipipe, flags);
		if (!ret) {
			ret = opipe_prep(opipe, flags);
			if (!ret)
				ret = link_pipe(ipipe, opipe, len, flags);
		}
	}

	if (ret > 0) {
		fsnotify_access(in);
		fsnotify_modify(out);
	}

	return ret;
}
/*
 * tee(2): look up both fds and duplicate pipe contents via do_tee().
 */
SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
{
	struct fd in, out;
	ssize_t error;

	if (unlikely(flags & ~SPLICE_F_ALL))
		return -EINVAL;

	if (unlikely(!len))
		return 0;

	error = -EBADF;
	in = fdget(fdin);
	if (fd_file(in)) {
		out = fdget(fdout);
		if (fd_file(out)) {
			error = do_tee(fd_file(in), fd_file(out), len, flags);
			fdput(out);
		}
		fdput(in);
	}

	return error;
}