pipe.c 37 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * linux/fs/pipe.c
  4. *
  5. * Copyright (C) 1991, 1992, 1999 Linus Torvalds
  6. */
  7. #include <linux/mm.h>
  8. #include <linux/file.h>
  9. #include <linux/poll.h>
  10. #include <linux/slab.h>
  11. #include <linux/module.h>
  12. #include <linux/init.h>
  13. #include <linux/fs.h>
  14. #include <linux/log2.h>
  15. #include <linux/mount.h>
  16. #include <linux/pseudo_fs.h>
  17. #include <linux/magic.h>
  18. #include <linux/pipe_fs_i.h>
  19. #include <linux/uio.h>
  20. #include <linux/highmem.h>
  21. #include <linux/pagemap.h>
  22. #include <linux/audit.h>
  23. #include <linux/syscalls.h>
  24. #include <linux/fcntl.h>
  25. #include <linux/memcontrol.h>
  26. #include <linux/watch_queue.h>
  27. #include <linux/sysctl.h>
  28. #include <linux/uaccess.h>
  29. #include <asm/ioctls.h>
  30. #include "internal.h"
  31. /*
  32. * New pipe buffers will be restricted to this size while the user is exceeding
  33. * their pipe buffer quota. The general pipe use case needs at least two
  34. * buffers: one for data yet to be read, and one for new data. If this is less
  35. * than two, then a write to a non-empty pipe may block even if the pipe is not
  36. * full. This can occur with GNU make jobserver or similar uses of pipes as
  37. * semaphores: multiple processes may be waiting to write tokens back to the
  38. * pipe before reading tokens: https://lore.kernel.org/lkml/1628086770.5rn8p04n6j.none@localhost/.
  39. *
  40. * Users can reduce their pipe buffers with F_SETPIPE_SZ below this at their
  41. * own risk, namely: pipe writes to non-full pipes may block until the pipe is
  42. * emptied.
  43. */
  44. #define PIPE_MIN_DEF_BUFFERS 2
  45. /*
  46. * The max size that a non-root user is allowed to grow the pipe. Can
  47. * be set by root in /proc/sys/fs/pipe-max-size
  48. */
  49. static unsigned int pipe_max_size = 1048576;
  50. /* Maximum allocatable pages per user. Hard limit is unset by default, soft
  51. * matches default values.
  52. */
  53. static unsigned long pipe_user_pages_hard;
  54. static unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;
  55. /*
  56. * We use head and tail indices that aren't masked off, except at the point of
  57. * dereference, but rather they're allowed to wrap naturally. This means there
  58. * isn't a dead spot in the buffer, but the ring has to be a power of two and
  59. * <= 2^31.
  60. * -- David Howells 2019-09-23.
  61. *
  62. * Reads with count = 0 should always return 0.
  63. * -- Julian Bradfield 1999-06-07.
  64. *
  65. * FIFOs and Pipes now generate SIGIO for both readers and writers.
  66. * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
  67. *
  68. * pipe_read & write cleanup
  69. * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
  70. */
  71. #define cmp_int(l, r) ((l > r) - (l < r))
  72. #ifdef CONFIG_PROVE_LOCKING
  73. static int pipe_lock_cmp_fn(const struct lockdep_map *a,
  74. const struct lockdep_map *b)
  75. {
  76. return cmp_int((unsigned long) a, (unsigned long) b);
  77. }
  78. #endif
  79. void pipe_lock(struct pipe_inode_info *pipe)
  80. {
  81. if (pipe->files)
  82. mutex_lock(&pipe->mutex);
  83. }
  84. EXPORT_SYMBOL(pipe_lock);
  85. void pipe_unlock(struct pipe_inode_info *pipe)
  86. {
  87. if (pipe->files)
  88. mutex_unlock(&pipe->mutex);
  89. }
  90. EXPORT_SYMBOL(pipe_unlock);
  91. void pipe_double_lock(struct pipe_inode_info *pipe1,
  92. struct pipe_inode_info *pipe2)
  93. {
  94. BUG_ON(pipe1 == pipe2);
  95. if (pipe1 > pipe2)
  96. swap(pipe1, pipe2);
  97. pipe_lock(pipe1);
  98. pipe_lock(pipe2);
  99. }
  100. static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
  101. struct pipe_buffer *buf)
  102. {
  103. struct page *page = buf->page;
  104. /*
  105. * If nobody else uses this page, and we don't already have a
  106. * temporary page, let's keep track of it as a one-deep
  107. * allocation cache. (Otherwise just release our reference to it)
  108. */
  109. if (page_count(page) == 1 && !pipe->tmp_page)
  110. pipe->tmp_page = page;
  111. else
  112. put_page(page);
  113. }
  114. static bool anon_pipe_buf_try_steal(struct pipe_inode_info *pipe,
  115. struct pipe_buffer *buf)
  116. {
  117. struct page *page = buf->page;
  118. if (page_count(page) != 1)
  119. return false;
  120. memcg_kmem_uncharge_page(page, 0);
  121. __SetPageLocked(page);
  122. return true;
  123. }
  124. /**
  125. * generic_pipe_buf_try_steal - attempt to take ownership of a &pipe_buffer
  126. * @pipe: the pipe that the buffer belongs to
  127. * @buf: the buffer to attempt to steal
  128. *
  129. * Description:
  130. * This function attempts to steal the &struct page attached to
  131. * @buf. If successful, this function returns 0 and returns with
  132. * the page locked. The caller may then reuse the page for whatever
  133. * he wishes; the typical use is insertion into a different file
  134. * page cache.
  135. */
  136. bool generic_pipe_buf_try_steal(struct pipe_inode_info *pipe,
  137. struct pipe_buffer *buf)
  138. {
  139. struct page *page = buf->page;
  140. /*
  141. * A reference of one is golden, that means that the owner of this
  142. * page is the only one holding a reference to it. lock the page
  143. * and return OK.
  144. */
  145. if (page_count(page) == 1) {
  146. lock_page(page);
  147. return true;
  148. }
  149. return false;
  150. }
  151. EXPORT_SYMBOL(generic_pipe_buf_try_steal);
  152. /**
  153. * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
  154. * @pipe: the pipe that the buffer belongs to
  155. * @buf: the buffer to get a reference to
  156. *
  157. * Description:
  158. * This function grabs an extra reference to @buf. It's used in
  159. * the tee() system call, when we duplicate the buffers in one
  160. * pipe into another.
  161. */
  162. bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
  163. {
  164. return try_get_page(buf->page);
  165. }
  166. EXPORT_SYMBOL(generic_pipe_buf_get);
  167. /**
  168. * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
  169. * @pipe: the pipe that the buffer belongs to
  170. * @buf: the buffer to put a reference to
  171. *
  172. * Description:
  173. * This function releases a reference to @buf.
  174. */
  175. void generic_pipe_buf_release(struct pipe_inode_info *pipe,
  176. struct pipe_buffer *buf)
  177. {
  178. put_page(buf->page);
  179. }
  180. EXPORT_SYMBOL(generic_pipe_buf_release);
/*
 * Buffer operations for plain anonymous pipe pages.  No ->confirm
 * callback is set, so pipe_buf_confirm() treats these buffers as
 * always up to date.
 */
static const struct pipe_buf_operations anon_pipe_buf_ops = {
	.release	= anon_pipe_buf_release,
	.try_steal	= anon_pipe_buf_try_steal,
	.get		= generic_pipe_buf_get,
};
  186. /* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
  187. static inline bool pipe_readable(const struct pipe_inode_info *pipe)
  188. {
  189. unsigned int head = READ_ONCE(pipe->head);
  190. unsigned int tail = READ_ONCE(pipe->tail);
  191. unsigned int writers = READ_ONCE(pipe->writers);
  192. return !pipe_empty(head, tail) || !writers;
  193. }
/*
 * Release @buf and advance the pipe's (unmasked) tail index past it,
 * returning the new tail.  Callers in pipe_read() hold pipe->mutex.
 */
static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe,
					    struct pipe_buffer *buf,
					    unsigned int tail)
{
	pipe_buf_release(pipe, buf);

	/*
	 * If the pipe has a watch_queue, we need additional protection
	 * by the spinlock because notifications get posted with only
	 * this spinlock, no mutex
	 */
	if (pipe_has_watch_queue(pipe)) {
		spin_lock_irq(&pipe->rd_wait.lock);
#ifdef CONFIG_WATCH_QUEUE
		/* Propagate a recorded notification overrun to the reader. */
		if (buf->flags & PIPE_BUF_FLAG_LOSS)
			pipe->note_loss = true;
#endif
		pipe->tail = ++tail;
		spin_unlock_irq(&pipe->rd_wait.lock);
		return tail;
	}

	/*
	 * Without a watch_queue, we can simply increment the tail
	 * without the spinlock - the mutex is enough.
	 */
	pipe->tail = ++tail;
	return tail;
}
/*
 * read() implementation for pipes/FIFOs.
 *
 * Copies buffered data into @to under pipe->mutex, consuming ring
 * buffers via pipe_update_tail() as they drain.  Blocks (unless
 * O_NONBLOCK/IOCB_NOWAIT) while the pipe is empty and writers remain;
 * the mutex is dropped around the wait, with writer wakeups issued
 * before sleeping.  Returns the number of bytes read, 0 at EOF (no
 * writers), or a negative errno.
 */
static ssize_t
pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
	size_t total_len = iov_iter_count(to);
	struct file *filp = iocb->ki_filp;
	struct pipe_inode_info *pipe = filp->private_data;
	bool was_full, wake_next_reader = false;
	ssize_t ret;

	/* Null read succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	ret = 0;
	mutex_lock(&pipe->mutex);

	/*
	 * We only wake up writers if the pipe was full when we started
	 * reading in order to avoid unnecessary wakeups.
	 *
	 * But when we do wake up writers, we do so using a sync wakeup
	 * (WF_SYNC), because we want them to get going and generate more
	 * data for us.
	 */
	was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
	for (;;) {
		/* Read ->head with a barrier vs post_one_notification() */
		unsigned int head = smp_load_acquire(&pipe->head);
		unsigned int tail = pipe->tail;
		unsigned int mask = pipe->ring_size - 1;

#ifdef CONFIG_WATCH_QUEUE
		/*
		 * A pending loss notification is delivered to the reader
		 * before any further buffered data.
		 */
		if (pipe->note_loss) {
			struct watch_notification n;

			if (total_len < 8) {
				if (ret == 0)
					ret = -ENOBUFS;
				break;
			}

			n.type = WATCH_TYPE_META;
			n.subtype = WATCH_META_LOSS_NOTIFICATION;
			n.info = watch_sizeof(n);
			if (copy_to_iter(&n, sizeof(n), to) != sizeof(n)) {
				if (ret == 0)
					ret = -EFAULT;
				break;
			}
			ret += sizeof(n);
			total_len -= sizeof(n);
			pipe->note_loss = false;
		}
#endif

		if (!pipe_empty(head, tail)) {
			struct pipe_buffer *buf = &pipe->bufs[tail & mask];
			size_t chars = buf->len;
			size_t written;
			int error;

			if (chars > total_len) {
				/* WHOLE buffers must not be split across reads. */
				if (buf->flags & PIPE_BUF_FLAG_WHOLE) {
					if (ret == 0)
						ret = -ENOBUFS;
					break;
				}
				chars = total_len;
			}

			error = pipe_buf_confirm(pipe, buf);
			if (error) {
				if (!ret)
					ret = error;
				break;
			}

			written = copy_page_to_iter(buf->page, buf->offset, chars, to);
			if (unlikely(written < chars)) {
				if (!ret)
					ret = -EFAULT;
				break;
			}
			ret += chars;
			buf->offset += chars;
			buf->len -= chars;

			/* Was it a packet buffer? Clean up and exit */
			if (buf->flags & PIPE_BUF_FLAG_PACKET) {
				total_len = chars;
				buf->len = 0;
			}

			if (!buf->len)
				tail = pipe_update_tail(pipe, buf, tail);
			total_len -= chars;
			if (!total_len)
				break;	/* common path: read succeeded */
			if (!pipe_empty(head, tail))	/* More to do? */
				continue;
		}

		/* No writers left: EOF (return whatever was read, or 0). */
		if (!pipe->writers)
			break;
		if (ret)
			break;
		if ((filp->f_flags & O_NONBLOCK) ||
		    (iocb->ki_flags & IOCB_NOWAIT)) {
			ret = -EAGAIN;
			break;
		}
		mutex_unlock(&pipe->mutex);

		/*
		 * We only get here if we didn't actually read anything.
		 *
		 * However, we could have seen (and removed) a zero-sized
		 * pipe buffer, and might have made space in the buffers
		 * that way.
		 *
		 * You can't make zero-sized pipe buffers by doing an empty
		 * write (not even in packet mode), but they can happen if
		 * the writer gets an EFAULT when trying to fill a buffer
		 * that already got allocated and inserted in the buffer
		 * array.
		 *
		 * So we still need to wake up any pending writers in the
		 * _very_ unlikely case that the pipe was full, but we got
		 * no data.
		 */
		if (unlikely(was_full))
			wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);

		/*
		 * But because we didn't read anything, at this point we can
		 * just return directly with -ERESTARTSYS if we're interrupted,
		 * since we've done any required wakeups and there's no need
		 * to mark anything accessed. And we've dropped the lock.
		 */
		if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
			return -ERESTARTSYS;

		mutex_lock(&pipe->mutex);
		was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
		wake_next_reader = true;
	}
	if (pipe_empty(pipe->head, pipe->tail))
		wake_next_reader = false;
	mutex_unlock(&pipe->mutex);

	if (was_full)
		wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
	/* Pass the exclusive wait on to the next queued reader, if any. */
	if (wake_next_reader)
		wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
	kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	if (ret > 0)
		file_accessed(filp);
	return ret;
}
  364. static inline int is_packetized(struct file *file)
  365. {
  366. return (file->f_flags & O_DIRECT) != 0;
  367. }
  368. /* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
  369. static inline bool pipe_writable(const struct pipe_inode_info *pipe)
  370. {
  371. unsigned int head = READ_ONCE(pipe->head);
  372. unsigned int tail = READ_ONCE(pipe->tail);
  373. unsigned int max_usage = READ_ONCE(pipe->max_usage);
  374. return !pipe_full(head, tail, max_usage) ||
  375. !READ_ONCE(pipe->readers);
  376. }
/*
 * write() implementation for pipes/FIFOs.
 *
 * Under pipe->mutex: first tries to merge a sub-page write into the
 * last buffer (CAN_MERGE), then fills fresh pages into reserved ring
 * slots.  A slot is published (pipe->head advanced) before the copy so
 * a fault leaves an empty-but-valid buffer behind.  Blocks (unless
 * O_NONBLOCK/IOCB_NOWAIT) while the ring is full and readers remain,
 * dropping the mutex and waking readers before sleeping.  Raises
 * SIGPIPE and returns -EPIPE when no readers are left.
 */
static ssize_t
pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *filp = iocb->ki_filp;
	struct pipe_inode_info *pipe = filp->private_data;
	unsigned int head;
	ssize_t ret = 0;
	size_t total_len = iov_iter_count(from);
	ssize_t chars;
	bool was_empty = false;
	bool wake_next_writer = false;

	/*
	 * Reject writing to watch queue pipes before the point where we lock
	 * the pipe.
	 * Otherwise, lockdep would be unhappy if the caller already has another
	 * pipe locked.
	 * If we had to support locking a normal pipe and a notification pipe at
	 * the same time, we could set up lockdep annotations for that, but
	 * since we don't actually need that, it's simpler to just bail here.
	 */
	if (pipe_has_watch_queue(pipe))
		return -EXDEV;

	/* Null write succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	mutex_lock(&pipe->mutex);

	if (!pipe->readers) {
		send_sig(SIGPIPE, current, 0);
		ret = -EPIPE;
		goto out;
	}

	/*
	 * If it wasn't empty we try to merge new data into
	 * the last buffer.
	 *
	 * That naturally merges small writes, but it also
	 * page-aligns the rest of the writes for large writes
	 * spanning multiple pages.
	 */
	head = pipe->head;
	was_empty = pipe_empty(head, pipe->tail);
	chars = total_len & (PAGE_SIZE-1);
	if (chars && !was_empty) {
		unsigned int mask = pipe->ring_size - 1;
		struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
		int offset = buf->offset + buf->len;

		if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) &&
		    offset + chars <= PAGE_SIZE) {
			ret = pipe_buf_confirm(pipe, buf);
			if (ret)
				goto out;

			ret = copy_page_from_iter(buf->page, offset, chars, from);
			if (unlikely(ret < chars)) {
				ret = -EFAULT;
				goto out;
			}

			buf->len += ret;
			if (!iov_iter_count(from))
				goto out;
		}
	}

	for (;;) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		head = pipe->head;
		if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
			unsigned int mask = pipe->ring_size - 1;
			struct pipe_buffer *buf;
			struct page *page = pipe->tmp_page;
			int copied;

			/* Use the cached tmp_page if present, else allocate. */
			if (!page) {
				page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
				if (unlikely(!page)) {
					ret = ret ? : -ENOMEM;
					break;
				}
				pipe->tmp_page = page;
			}

			/* Allocate a slot in the ring in advance and attach an
			 * empty buffer. If we fault or otherwise fail to use
			 * it, either the reader will consume it or it'll still
			 * be there for the next write.
			 */
			pipe->head = head + 1;

			/* Insert it into the buffer array */
			buf = &pipe->bufs[head & mask];
			buf->page = page;
			buf->ops = &anon_pipe_buf_ops;
			buf->offset = 0;
			buf->len = 0;
			if (is_packetized(filp))
				buf->flags = PIPE_BUF_FLAG_PACKET;
			else
				buf->flags = PIPE_BUF_FLAG_CAN_MERGE;
			pipe->tmp_page = NULL;

			copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
			if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
				if (!ret)
					ret = -EFAULT;
				break;
			}
			ret += copied;
			buf->len = copied;

			if (!iov_iter_count(from))
				break;
		}

		if (!pipe_full(head, pipe->tail, pipe->max_usage))
			continue;

		/* Wait for buffer space to become available. */
		if ((filp->f_flags & O_NONBLOCK) ||
		    (iocb->ki_flags & IOCB_NOWAIT)) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		/*
		 * We're going to release the pipe lock and wait for more
		 * space. We wake up any readers if necessary, and then
		 * after waiting we need to re-check whether the pipe
		 * become empty while we dropped the lock.
		 */
		mutex_unlock(&pipe->mutex);
		if (was_empty)
			wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
		mutex_lock(&pipe->mutex);
		was_empty = pipe_empty(pipe->head, pipe->tail);
		wake_next_writer = true;
	}
out:
	if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
		wake_next_writer = false;
	mutex_unlock(&pipe->mutex);

	/*
	 * If we do do a wakeup event, we do a 'sync' wakeup, because we
	 * want the reader to start processing things asap, rather than
	 * leave the data pending.
	 *
	 * This is particularly important for small writes, because of
	 * how (for example) the GNU make jobserver uses small writes to
	 * wake up pending jobs
	 *
	 * Epoll nonsensically wants a wakeup whether the pipe
	 * was already empty or not.
	 */
	if (was_empty || pipe->poll_usage)
		wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
	kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
	/* Pass the exclusive wait on to the next queued writer, if any. */
	if (wake_next_writer)
		wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
	/* Best-effort mtime/ctime update; skipped if the sb is frozen. */
	if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
		int err = file_update_time(filp);

		if (err)
			ret = err;
		sb_end_write(file_inode(filp)->i_sb);
	}
	return ret;
}
  545. static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
  546. {
  547. struct pipe_inode_info *pipe = filp->private_data;
  548. unsigned int count, head, tail, mask;
  549. switch (cmd) {
  550. case FIONREAD:
  551. mutex_lock(&pipe->mutex);
  552. count = 0;
  553. head = pipe->head;
  554. tail = pipe->tail;
  555. mask = pipe->ring_size - 1;
  556. while (tail != head) {
  557. count += pipe->bufs[tail & mask].len;
  558. tail++;
  559. }
  560. mutex_unlock(&pipe->mutex);
  561. return put_user(count, (int __user *)arg);
  562. #ifdef CONFIG_WATCH_QUEUE
  563. case IOC_WATCH_QUEUE_SET_SIZE: {
  564. int ret;
  565. mutex_lock(&pipe->mutex);
  566. ret = watch_queue_set_size(pipe, arg);
  567. mutex_unlock(&pipe->mutex);
  568. return ret;
  569. }
  570. case IOC_WATCH_QUEUE_SET_FILTER:
  571. return watch_queue_set_filter(
  572. pipe, (struct watch_notification_filter __user *)arg);
  573. #endif
  574. default:
  575. return -ENOIOCTLCMD;
  576. }
  577. }
/* No kernel lock held - fine */
/*
 * poll()/select() support.  Pipe state is read locklessly; the
 * poll_wait() registrations are done *before* the state reads so that
 * a racing wakeup corrects any stale result (see comments below).
 */
static __poll_t
pipe_poll(struct file *filp, poll_table *wait)
{
	__poll_t mask;
	struct pipe_inode_info *pipe = filp->private_data;
	unsigned int head, tail;

	/* Epoll has some historical nasty semantics, this enables them */
	WRITE_ONCE(pipe->poll_usage, true);

	/*
	 * Reading pipe state only -- no need for acquiring the semaphore.
	 *
	 * But because this is racy, the code has to add the
	 * entry to the poll table _first_ ..
	 */
	if (filp->f_mode & FMODE_READ)
		poll_wait(filp, &pipe->rd_wait, wait);
	if (filp->f_mode & FMODE_WRITE)
		poll_wait(filp, &pipe->wr_wait, wait);

	/*
	 * .. and only then can you do the racy tests. That way,
	 * if something changes and you got it wrong, the poll
	 * table entry will wake you up and fix it.
	 */
	head = READ_ONCE(pipe->head);
	tail = READ_ONCE(pipe->tail);

	mask = 0;
	if (filp->f_mode & FMODE_READ) {
		if (!pipe_empty(head, tail))
			mask |= EPOLLIN | EPOLLRDNORM;
		/* Hang-up only once the writer generation has moved on. */
		if (!pipe->writers && filp->f_pipe != pipe->w_counter)
			mask |= EPOLLHUP;
	}

	if (filp->f_mode & FMODE_WRITE) {
		if (!pipe_full(head, tail, pipe->max_usage))
			mask |= EPOLLOUT | EPOLLWRNORM;
		/*
		 * Most Unices do not set EPOLLERR for FIFOs but on Linux they
		 * behave exactly like pipes for poll().
		 */
		if (!pipe->readers)
			mask |= EPOLLERR;
	}

	return mask;
}
  623. static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe)
  624. {
  625. int kill = 0;
  626. spin_lock(&inode->i_lock);
  627. if (!--pipe->files) {
  628. inode->i_pipe = NULL;
  629. kill = 1;
  630. }
  631. spin_unlock(&inode->i_lock);
  632. if (kill)
  633. free_pipe_info(pipe);
  634. }
  635. static int
  636. pipe_release(struct inode *inode, struct file *file)
  637. {
  638. struct pipe_inode_info *pipe = file->private_data;
  639. mutex_lock(&pipe->mutex);
  640. if (file->f_mode & FMODE_READ)
  641. pipe->readers--;
  642. if (file->f_mode & FMODE_WRITE)
  643. pipe->writers--;
  644. /* Was that the last reader or writer, but not the other side? */
  645. if (!pipe->readers != !pipe->writers) {
  646. wake_up_interruptible_all(&pipe->rd_wait);
  647. wake_up_interruptible_all(&pipe->wr_wait);
  648. kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
  649. kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
  650. }
  651. mutex_unlock(&pipe->mutex);
  652. put_pipe_info(inode, pipe);
  653. return 0;
  654. }
/*
 * fcntl(F_SETFL, O_ASYNC) support: register or unregister @filp for
 * SIGIO delivery.  A file open for both read and write registers on
 * both lists; if the writer-side registration fails after the reader
 * side succeeded, the reader registration is unwound so the two lists
 * stay consistent.
 */
static int
pipe_fasync(int fd, struct file *filp, int on)
{
	struct pipe_inode_info *pipe = filp->private_data;
	int retval = 0;

	mutex_lock(&pipe->mutex);
	if (filp->f_mode & FMODE_READ)
		retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
	if ((filp->f_mode & FMODE_WRITE) && retval >= 0) {
		retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
		if (retval < 0 && (filp->f_mode & FMODE_READ))
			/* this can happen only if on == T */
			fasync_helper(-1, filp, 0, &pipe->fasync_readers);
	}
	mutex_unlock(&pipe->mutex);
	return retval;
}
  672. unsigned long account_pipe_buffers(struct user_struct *user,
  673. unsigned long old, unsigned long new)
  674. {
  675. return atomic_long_add_return(new - old, &user->pipe_bufs);
  676. }
  677. bool too_many_pipe_buffers_soft(unsigned long user_bufs)
  678. {
  679. unsigned long soft_limit = READ_ONCE(pipe_user_pages_soft);
  680. return soft_limit && user_bufs > soft_limit;
  681. }
  682. bool too_many_pipe_buffers_hard(unsigned long user_bufs)
  683. {
  684. unsigned long hard_limit = READ_ONCE(pipe_user_pages_hard);
  685. return hard_limit && user_bufs > hard_limit;
  686. }
  687. bool pipe_is_unprivileged_user(void)
  688. {
  689. return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
  690. }
/*
 * Allocate and initialize a new pipe_inode_info.
 *
 * The default ring size (PIPE_DEF_BUFFERS) is clamped against the
 * pipe-max-size sysctl for callers without CAP_SYS_RESOURCE, and
 * reduced to PIPE_MIN_DEF_BUFFERS when the user is over the soft page
 * quota.  Returns NULL on allocation failure or when an unprivileged
 * user exceeds the hard quota; accounting is reverted on all failure
 * paths.
 */
struct pipe_inode_info *alloc_pipe_info(void)
{
	struct pipe_inode_info *pipe;
	unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
	struct user_struct *user = get_current_user();
	unsigned long user_bufs;
	unsigned int max_size = READ_ONCE(pipe_max_size);

	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT);
	if (pipe == NULL)
		goto out_free_uid;

	/* Unprivileged users don't get a ring bigger than pipe-max-size. */
	if (pipe_bufs * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE))
		pipe_bufs = max_size >> PAGE_SHIFT;

	user_bufs = account_pipe_buffers(user, 0, pipe_bufs);

	/* Over the soft quota: fall back to the two-buffer minimum. */
	if (too_many_pipe_buffers_soft(user_bufs) && pipe_is_unprivileged_user()) {
		user_bufs = account_pipe_buffers(user, pipe_bufs, PIPE_MIN_DEF_BUFFERS);
		pipe_bufs = PIPE_MIN_DEF_BUFFERS;
	}

	if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user())
		goto out_revert_acct;

	pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
			     GFP_KERNEL_ACCOUNT);

	if (pipe->bufs) {
		init_waitqueue_head(&pipe->rd_wait);
		init_waitqueue_head(&pipe->wr_wait);
		pipe->r_counter = pipe->w_counter = 1;
		pipe->max_usage = pipe_bufs;
		pipe->ring_size = pipe_bufs;
		pipe->nr_accounted = pipe_bufs;
		pipe->user = user;
		mutex_init(&pipe->mutex);
		lock_set_cmp_fn(&pipe->mutex, pipe_lock_cmp_fn, NULL);
		return pipe;
	}

out_revert_acct:
	(void) account_pipe_buffers(user, pipe_bufs, 0);
	kfree(pipe);
out_free_uid:
	free_uid(user);
	return NULL;
}
/*
 * Tear down a pipe_inode_info: clear any watch queue, revert the
 * per-user page accounting, release every still-attached ring buffer
 * and the cached tmp_page, then free the ring and the structure
 * itself.
 */
void free_pipe_info(struct pipe_inode_info *pipe)
{
	unsigned int i;

#ifdef CONFIG_WATCH_QUEUE
	if (pipe->watch_queue)
		watch_queue_clear(pipe->watch_queue);
#endif

	(void) account_pipe_buffers(pipe->user, pipe->nr_accounted, 0);
	free_uid(pipe->user);
	for (i = 0; i < pipe->ring_size; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;

		/* Only slots with ops set still hold a live buffer. */
		if (buf->ops)
			pipe_buf_release(pipe, buf);
	}
#ifdef CONFIG_WATCH_QUEUE
	if (pipe->watch_queue)
		put_watch_queue(pipe->watch_queue);
#endif
	if (pipe->tmp_page)
		__free_page(pipe->tmp_page);
	kfree(pipe->bufs);
	kfree(pipe);
}
/* Kernel-internal pipefs mount; set during init, read-only afterwards. */
static struct vfsmount *pipe_mnt __ro_after_init;
  755. /*
  756. * pipefs_dname() is called from d_path().
  757. */
  758. static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
  759. {
  760. return dynamic_dname(buffer, buflen, "pipe:[%lu]",
  761. d_inode(dentry)->i_ino);
  762. }
/* Dentry ops for pipefs: only custom "pipe:[ino]" name generation. */
static const struct dentry_operations pipefs_dentry_operations = {
	.d_dname	= pipefs_dname,
};
/*
 * Allocate a pipefs inode with an attached pipe_inode_info, set up for
 * one reader and one writer (two files).  Returns NULL on failure;
 * a partially built inode is released via iput().
 */
static struct inode * get_pipe_inode(void)
{
	struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
	struct pipe_inode_info *pipe;

	if (!inode)
		goto fail_inode;

	inode->i_ino = get_next_ino();

	pipe = alloc_pipe_info();
	if (!pipe)
		goto fail_iput;

	inode->i_pipe = pipe;
	pipe->files = 2;
	pipe->readers = pipe->writers = 1;
	inode->i_fop = &pipefifo_fops;

	/*
	 * Mark the inode dirty from the very beginning,
	 * that way it will never be moved to the dirty
	 * list because "mark_inode_dirty()" will think
	 * that it already _is_ on the dirty list.
	 */
	inode->i_state = I_DIRTY;
	inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
	inode->i_uid = current_fsuid();
	inode->i_gid = current_fsgid();
	simple_inode_init_ts(inode);

	return inode;

fail_iput:
	iput(inode);

fail_inode:
	return NULL;
}
/*
 * Create the two ends of a pipe: res[0] is the read side, res[1] the
 * write side.  Both files share one pipefs inode and pipe_inode_info.
 *
 * @res:   two-entry array receiving the read and write struct file
 * @flags: O_NONBLOCK / O_DIRECT / O_NOTIFICATION_PIPE modifiers
 *
 * Returns 0 on success or a negative errno; on failure nothing is
 * left allocated.
 */
int create_pipe_files(struct file **res, int flags)
{
	struct inode *inode = get_pipe_inode();
	struct file *f;
	int error;

	if (!inode)
		return -ENFILE;

	if (flags & O_NOTIFICATION_PIPE) {
		error = watch_queue_init(inode->i_pipe);
		if (error) {
			free_pipe_info(inode->i_pipe);
			iput(inode);
			return error;
		}
	}

	/* Write end first; the read end below is cloned from it. */
	f = alloc_file_pseudo(inode, pipe_mnt, "",
				O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)),
				&pipefifo_fops);
	if (IS_ERR(f)) {
		free_pipe_info(inode->i_pipe);
		iput(inode);
		return PTR_ERR(f);
	}

	f->private_data = inode->i_pipe;
	f->f_pipe = 0;

	res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK),
				  &pipefifo_fops);
	if (IS_ERR(res[0])) {
		/* The clone failed: f owns the inode refs from here on. */
		put_pipe_info(inode, inode->i_pipe);
		fput(f);
		return PTR_ERR(res[0]);
	}
	res[0]->private_data = inode->i_pipe;
	res[0]->f_pipe = 0;
	res[1] = f;
	/* Pipes are streams: no seeking on either end. */
	stream_open(inode, res[0]);
	stream_open(inode, res[1]);
	return 0;
}
/*
 * Common worker for pipe creation: build the two pipe files and reserve
 * two fds for them.  The fds are reserved but NOT installed; the caller
 * either installs them with fd_install() or unwinds on failure.
 */
static int __do_pipe_flags(int *fd, struct file **files, int flags)
{
	int error;
	int fdw, fdr;

	/* Only these flags are accepted by pipe2(2). */
	if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE))
		return -EINVAL;

	error = create_pipe_files(files, flags);
	if (error)
		return error;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_read_pipe;
	fdr = error;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_fdr;
	fdw = error;

	audit_fd_pair(fdr, fdw);
	fd[0] = fdr;
	fd[1] = fdw;
	/* pipe groks IOCB_NOWAIT */
	files[0]->f_mode |= FMODE_NOWAIT;
	files[1]->f_mode |= FMODE_NOWAIT;
	return 0;

err_fdr:
	put_unused_fd(fdr);
err_read_pipe:
	fput(files[0]);
	fput(files[1]);
	return error;
}
  867. int do_pipe_flags(int *fd, int flags)
  868. {
  869. struct file *files[2];
  870. int error = __do_pipe_flags(fd, files, flags);
  871. if (!error) {
  872. fd_install(fd[0], files[0]);
  873. fd_install(fd[1], files[1]);
  874. }
  875. return error;
  876. }
  877. /*
  878. * sys_pipe() is the normal C calling standard for creating
  879. * a pipe. It's not the way Unix traditionally does this, though.
  880. */
  881. static int do_pipe2(int __user *fildes, int flags)
  882. {
  883. struct file *files[2];
  884. int fd[2];
  885. int error;
  886. error = __do_pipe_flags(fd, files, flags);
  887. if (!error) {
  888. if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) {
  889. fput(files[0]);
  890. fput(files[1]);
  891. put_unused_fd(fd[0]);
  892. put_unused_fd(fd[1]);
  893. error = -EFAULT;
  894. } else {
  895. fd_install(fd[0], files[0]);
  896. fd_install(fd[1], files[1]);
  897. }
  898. }
  899. return error;
  900. }
SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
{
	return do_pipe2(fildes, flags);
}

/* Classic pipe(2) is just pipe2(2) with no flags. */
SYSCALL_DEFINE1(pipe, int __user *, fildes)
{
	return do_pipe2(fildes, 0);
}
/*
 * This is the stupid "wait for pipe to be readable or writable"
 * model.
 *
 * See pipe_read/write() for the proper kind of exclusive wait,
 * but that requires that we wake up any other readers/writers
 * if we then do not end up reading everything (ie the whole
 * "wake_next_reader/writer" logic in pipe_read/write()).
 */
void pipe_wait_readable(struct pipe_inode_info *pipe)
{
	/* Drop the pipe lock while sleeping; retake it before returning. */
	pipe_unlock(pipe);
	wait_event_interruptible(pipe->rd_wait, pipe_readable(pipe));
	pipe_lock(pipe);
}
/* Counterpart of pipe_wait_readable() for writers; same locking dance. */
void pipe_wait_writable(struct pipe_inode_info *pipe)
{
	pipe_unlock(pipe);
	wait_event_interruptible(pipe->wr_wait, pipe_writable(pipe));
	pipe_lock(pipe);
}
/*
 * This depends on both the wait (here) and the wakeup (wake_up_partner)
 * holding the pipe lock, so "*cnt" is stable and we know a wakeup cannot
 * race with the count check and waitqueue prep.
 *
 * Normally in order to avoid races, you'd do the prepare_to_wait() first,
 * then check the condition you're waiting for, and only then sleep. But
 * because of the pipe lock, we can check the condition before being on
 * the wait queue.
 *
 * We use the 'rd_wait' waitqueue for pipe partner waiting.
 */
static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
{
	DEFINE_WAIT(rdwait);
	int cur = *cnt;

	/* Sleep until the partner bumps *cnt or a signal arrives. */
	while (cur == *cnt) {
		prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE);
		pipe_unlock(pipe);
		schedule();
		finish_wait(&pipe->rd_wait, &rdwait);
		pipe_lock(pipe);
		if (signal_pending(current))
			break;
	}
	/* Signalled before the counter changed => restart the syscall. */
	return cur == *cnt ? -ERESTARTSYS : 0;
}
/* Wake everyone in wait_for_partner(); partners all sleep on rd_wait. */
static void wake_up_partner(struct pipe_inode_info *pipe)
{
	wake_up_interruptible_all(&pipe->rd_wait);
}
/*
 * Open handler shared by FIFOs and anonymous pipes (pipefs inodes).
 *
 * Attaches a pipe_inode_info to the inode if none exists yet, bumps
 * the reader/writer counts according to the access mode, and for real
 * FIFOs implements the POSIX open-blocking semantics (wait for the
 * other end unless O_NONBLOCK).  Returns 0 or a negative errno.
 */
static int fifo_open(struct inode *inode, struct file *filp)
{
	struct pipe_inode_info *pipe;
	/* Anonymous pipes (pipefs) never block in open, FIFOs may. */
	bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC;
	int ret;

	filp->f_pipe = 0;

	spin_lock(&inode->i_lock);
	if (inode->i_pipe) {
		pipe = inode->i_pipe;
		pipe->files++;
		spin_unlock(&inode->i_lock);
	} else {
		/* Allocate outside i_lock, then re-check for a racing open. */
		spin_unlock(&inode->i_lock);
		pipe = alloc_pipe_info();
		if (!pipe)
			return -ENOMEM;
		pipe->files = 1;
		spin_lock(&inode->i_lock);
		if (unlikely(inode->i_pipe)) {
			/* Lost the race: use the winner's pipe, free ours. */
			inode->i_pipe->files++;
			spin_unlock(&inode->i_lock);
			free_pipe_info(pipe);
			pipe = inode->i_pipe;
		} else {
			inode->i_pipe = pipe;
			spin_unlock(&inode->i_lock);
		}
	}
	filp->private_data = pipe;
	/* OK, we have a pipe and it's pinned down */

	mutex_lock(&pipe->mutex);

	/* We can only do regular read/write on fifos */
	stream_open(inode, filp);

	switch (filp->f_mode & (FMODE_READ | FMODE_WRITE)) {
	case FMODE_READ:
	/*
	 *  O_RDONLY
	 *  POSIX.1 says that O_NONBLOCK means return with the FIFO
	 *  opened, even when there is no process writing the FIFO.
	 */
		pipe->r_counter++;
		if (pipe->readers++ == 0)
			wake_up_partner(pipe);

		if (!is_pipe && !pipe->writers) {
			if ((filp->f_flags & O_NONBLOCK)) {
				/* suppress EPOLLHUP until we have
				 * seen a writer */
				filp->f_pipe = pipe->w_counter;
			} else {
				if (wait_for_partner(pipe, &pipe->w_counter))
					goto err_rd;
			}
		}
		break;

	case FMODE_WRITE:
	/*
	 *  O_WRONLY
	 *  POSIX.1 says that O_NONBLOCK means return -1 with
	 *  errno=ENXIO when there is no process reading the FIFO.
	 */
		ret = -ENXIO;
		if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers)
			goto err;

		pipe->w_counter++;
		if (!pipe->writers++)
			wake_up_partner(pipe);

		if (!is_pipe && !pipe->readers) {
			if (wait_for_partner(pipe, &pipe->r_counter))
				goto err_wr;
		}
		break;

	case FMODE_READ | FMODE_WRITE:
	/*
	 *  O_RDWR
	 *  POSIX.1 leaves this case "undefined" when O_NONBLOCK is set.
	 *  This implementation will NEVER block on a O_RDWR open, since
	 *  the process can at least talk to itself.
	 */
		pipe->readers++;
		pipe->writers++;
		pipe->r_counter++;
		pipe->w_counter++;
		if (pipe->readers == 1 || pipe->writers == 1)
			wake_up_partner(pipe);
		break;

	default:
		ret = -EINVAL;
		goto err;
	}

	/* Ok! */
	mutex_unlock(&pipe->mutex);
	return 0;

err_rd:
	/* Interrupted read-open: undo the readers bump, wake writers. */
	if (!--pipe->readers)
		wake_up_interruptible(&pipe->wr_wait);
	ret = -ERESTARTSYS;
	goto err;

err_wr:
	/* Interrupted write-open: undo the writers bump, wake readers. */
	if (!--pipe->writers)
		wake_up_interruptible_all(&pipe->rd_wait);
	ret = -ERESTARTSYS;
	goto err;

err:
	mutex_unlock(&pipe->mutex);
	put_pipe_info(inode, pipe);
	return ret;
}
/*
 * File operations shared by anonymous pipes and FIFOs.  No ->llseek:
 * pipes are streams (stream_open() is called at open time).
 */
const struct file_operations pipefifo_fops = {
	.open		= fifo_open,
	.read_iter	= pipe_read,
	.write_iter	= pipe_write,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.release	= pipe_release,
	.fasync		= pipe_fasync,
	.splice_write	= iter_file_splice_write,
};
  1078. /*
  1079. * Currently we rely on the pipe array holding a power-of-2 number
  1080. * of pages. Returns 0 on error.
  1081. */
  1082. unsigned int round_pipe_size(unsigned int size)
  1083. {
  1084. if (size > (1U << 31))
  1085. return 0;
  1086. /* Minimum pipe size, as required by POSIX */
  1087. if (size < PAGE_SIZE)
  1088. return PAGE_SIZE;
  1089. return roundup_pow_of_two(size);
  1090. }
/*
 * Resize the pipe ring to a number of slots.
 *
 * Note the pipe can be reduced in capacity, but only if the current
 * occupancy doesn't exceed nr_slots; if it does, EBUSY will be
 * returned instead.
 */
int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
{
	struct pipe_buffer *bufs;
	unsigned int head, tail, mask, n;

	/* Allocate the replacement ring before taking the lock. */
	bufs = kcalloc(nr_slots, sizeof(*bufs),
		       GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
	if (unlikely(!bufs))
		return -ENOMEM;

	/* rd_wait.lock guards the ring fields against concurrent access. */
	spin_lock_irq(&pipe->rd_wait.lock);
	mask = pipe->ring_size - 1;
	head = pipe->head;
	tail = pipe->tail;

	n = pipe_occupancy(head, tail);
	if (nr_slots < n) {
		/* Can't shrink below the number of buffers in flight. */
		spin_unlock_irq(&pipe->rd_wait.lock);
		kfree(bufs);
		return -EBUSY;
	}

	/*
	 * The pipe array wraps around, so just start the new one at zero
	 * and adjust the indices.
	 */
	if (n > 0) {
		unsigned int h = head & mask;
		unsigned int t = tail & mask;
		if (h > t) {
			/* Occupied region is contiguous: single copy. */
			memcpy(bufs, pipe->bufs + t,
			       n * sizeof(struct pipe_buffer));
		} else {
			/* Occupied region wraps: copy the two pieces. */
			unsigned int tsize = pipe->ring_size - t;
			if (h > 0)
				memcpy(bufs + tsize, pipe->bufs,
				       h * sizeof(struct pipe_buffer));
			memcpy(bufs, pipe->bufs + t,
			       tsize * sizeof(struct pipe_buffer));
		}
	}

	head = n;
	tail = 0;

	kfree(pipe->bufs);
	pipe->bufs = bufs;
	pipe->ring_size = nr_slots;
	if (pipe->max_usage > nr_slots)
		pipe->max_usage = nr_slots;
	pipe->tail = tail;
	pipe->head = head;

	/* Watch-queue pipes keep their usage/accounting limits unchanged. */
	if (!pipe_has_watch_queue(pipe)) {
		pipe->max_usage = nr_slots;
		pipe->nr_accounted = nr_slots;
	}

	spin_unlock_irq(&pipe->rd_wait.lock);

	/* This might have made more room for writers */
	wake_up_interruptible(&pipe->wr_wait);
	return 0;
}
/*
 * Allocate a new array of pipe buffers and copy the info over. Returns the
 * pipe size if successful, or return -ERROR on error.
 *
 * Called with pipe->mutex held (from pipe_fcntl() for F_SETPIPE_SZ).
 */
static long pipe_set_size(struct pipe_inode_info *pipe, unsigned int arg)
{
	unsigned long user_bufs;
	unsigned int nr_slots, size;
	long ret = 0;

	/* Notification pipes cannot be resized. */
	if (pipe_has_watch_queue(pipe))
		return -EBUSY;

	size = round_pipe_size(arg);
	nr_slots = size >> PAGE_SHIFT;

	if (!nr_slots)
		return -EINVAL;

	/*
	 * If trying to increase the pipe capacity, check that an
	 * unprivileged user is not trying to exceed various limits
	 * (soft limit check here, hard limit check just below).
	 * Decreasing the pipe capacity is always permitted, even
	 * if the user is currently over a limit.
	 */
	if (nr_slots > pipe->max_usage &&
			size > pipe_max_size && !capable(CAP_SYS_RESOURCE))
		return -EPERM;

	/* Charge the new size up front; reverted below on any failure. */
	user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_slots);

	if (nr_slots > pipe->max_usage &&
			(too_many_pipe_buffers_hard(user_bufs) ||
			 too_many_pipe_buffers_soft(user_bufs)) &&
			pipe_is_unprivileged_user()) {
		ret = -EPERM;
		goto out_revert_acct;
	}

	ret = pipe_resize_ring(pipe, nr_slots);
	if (ret < 0)
		goto out_revert_acct;

	return pipe->max_usage * PAGE_SIZE;

out_revert_acct:
	/* Undo the accounting change made above (swap the arguments back). */
	(void) account_pipe_buffers(pipe->user, nr_slots, pipe->nr_accounted);
	return ret;
}
  1194. /*
  1195. * Note that i_pipe and i_cdev share the same location, so checking ->i_pipe is
  1196. * not enough to verify that this is a pipe.
  1197. */
  1198. struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice)
  1199. {
  1200. struct pipe_inode_info *pipe = file->private_data;
  1201. if (file->f_op != &pipefifo_fops || !pipe)
  1202. return NULL;
  1203. if (for_splice && pipe_has_watch_queue(pipe))
  1204. return NULL;
  1205. return pipe;
  1206. }
/*
 * fcntl() support for F_SETPIPE_SZ / F_GETPIPE_SZ, serialized against
 * other pipe operations by pipe->mutex.
 */
long pipe_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
{
	struct pipe_inode_info *pipe;
	long ret;

	pipe = get_pipe_info(file, false);
	if (!pipe)
		return -EBADF;

	mutex_lock(&pipe->mutex);

	switch (cmd) {
	case F_SETPIPE_SZ:
		ret = pipe_set_size(pipe, arg);
		break;
	case F_GETPIPE_SZ:
		/* Report the capacity in bytes, not ring slots. */
		ret = pipe->max_usage * PAGE_SIZE;
		break;
	default:
		ret = -EINVAL;
		break;
	}

	mutex_unlock(&pipe->mutex);
	return ret;
}
/* Minimal super_operations for the internal pipefs mount. */
static const struct super_operations pipefs_ops = {
	.destroy_inode = free_inode_nonrcu,
	.statfs = simple_statfs,
};
  1233. /*
  1234. * pipefs should _never_ be mounted by userland - too much of security hassle,
  1235. * no real gain from having the whole file system mounted. So we don't need
  1236. * any operations on the root directory. However, we need a non-trivial
  1237. * d_name - pipe: will go nicely and kill the special-casing in procfs.
  1238. */
  1239. static int pipefs_init_fs_context(struct fs_context *fc)
  1240. {
  1241. struct pseudo_fs_context *ctx = init_pseudo(fc, PIPEFS_MAGIC);
  1242. if (!ctx)
  1243. return -ENOMEM;
  1244. ctx->ops = &pipefs_ops;
  1245. ctx->dops = &pipefs_dentry_operations;
  1246. return 0;
  1247. }
/* The kernel-internal "pipefs" filesystem backing anonymous pipes. */
static struct file_system_type pipe_fs_type = {
	.name		= "pipefs",
	.init_fs_context = pipefs_init_fs_context,
	.kill_sb	= kill_anon_super,
};
  1253. #ifdef CONFIG_SYSCTL
  1254. static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
  1255. unsigned int *valp,
  1256. int write, void *data)
  1257. {
  1258. if (write) {
  1259. unsigned int val;
  1260. val = round_pipe_size(*lvalp);
  1261. if (val == 0)
  1262. return -EINVAL;
  1263. *valp = val;
  1264. } else {
  1265. unsigned int val = *valp;
  1266. *lvalp = (unsigned long) val;
  1267. }
  1268. return 0;
  1269. }
  1270. static int proc_dopipe_max_size(const struct ctl_table *table, int write,
  1271. void *buffer, size_t *lenp, loff_t *ppos)
  1272. {
  1273. return do_proc_douintvec(table, write, buffer, lenp, ppos,
  1274. do_proc_dopipe_max_size_conv, NULL);
  1275. }
  1276. static struct ctl_table fs_pipe_sysctls[] = {
  1277. {
  1278. .procname = "pipe-max-size",
  1279. .data = &pipe_max_size,
  1280. .maxlen = sizeof(pipe_max_size),
  1281. .mode = 0644,
  1282. .proc_handler = proc_dopipe_max_size,
  1283. },
  1284. {
  1285. .procname = "pipe-user-pages-hard",
  1286. .data = &pipe_user_pages_hard,
  1287. .maxlen = sizeof(pipe_user_pages_hard),
  1288. .mode = 0644,
  1289. .proc_handler = proc_doulongvec_minmax,
  1290. },
  1291. {
  1292. .procname = "pipe-user-pages-soft",
  1293. .data = &pipe_user_pages_soft,
  1294. .maxlen = sizeof(pipe_user_pages_soft),
  1295. .mode = 0644,
  1296. .proc_handler = proc_doulongvec_minmax,
  1297. },
  1298. };
  1299. #endif
/*
 * Register pipefs, mount it internally, and register the fs.* pipe
 * sysctls.  Runs once at boot via fs_initcall().
 */
static int __init init_pipe_fs(void)
{
	int err = register_filesystem(&pipe_fs_type);

	if (!err) {
		pipe_mnt = kern_mount(&pipe_fs_type);
		if (IS_ERR(pipe_mnt)) {
			err = PTR_ERR(pipe_mnt);
			unregister_filesystem(&pipe_fs_type);
		}
	}
#ifdef CONFIG_SYSCTL
	/*
	 * NOTE(review): sysctls are registered even when registration or
	 * mount above failed — appears intentional, but worth confirming.
	 */
	register_sysctl_init("fs", fs_pipe_sysctls);
#endif
	return err;
}
fs_initcall(init_pipe_fs);