tcp_mmap.c 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447
  1. /*
  2. * Copyright 2018 Google Inc.
  3. * Author: Eric Dumazet (edumazet@google.com)
  4. *
  5. * Reference program demonstrating tcp mmap() usage,
  6. * and SO_RCVLOWAT hints for receiver.
  7. *
  8. * Note : NIC with header split is needed to use mmap() on TCP :
  9. * Each incoming frame must be a multiple of PAGE_SIZE bytes of TCP payload.
  10. *
  11. * How to use on loopback interface :
  12. *
  13. * ifconfig lo mtu 61512 # 15*4096 + 40 (ipv6 header) + 32 (TCP with TS option header)
  14. * tcp_mmap -s -z &
  15. * tcp_mmap -H ::1 -z
  16. *
  17. * Or leave default lo mtu, but use -M option to set TCP_MAXSEG option to (4096 + 12)
  18. * (4096 : page size on x86, 12: TCP TS option length)
  19. * tcp_mmap -s -z -M $((4096+12)) &
  20. * tcp_mmap -H ::1 -z -M $((4096+12))
  21. *
  22. * Note: -z option on sender uses MSG_ZEROCOPY, which forces a copy when packets go through loopback interface.
  23. * We might use sendfile() instead, but really this test program is about mmap(), for receivers ;)
  24. *
  25. * $ ./tcp_mmap -s & # Without mmap()
  26. * $ for i in {1..4}; do ./tcp_mmap -H ::1 -z ; done
  27. * received 32768 MB (0 % mmap'ed) in 14.1157 s, 19.4732 Gbit
  28. * cpu usage user:0.057 sys:7.815, 240.234 usec per MB, 65531 c-switches
  29. * received 32768 MB (0 % mmap'ed) in 14.6833 s, 18.7204 Gbit
  30. * cpu usage user:0.043 sys:8.103, 248.596 usec per MB, 65524 c-switches
  31. * received 32768 MB (0 % mmap'ed) in 11.143 s, 24.6682 Gbit
  32. * cpu usage user:0.044 sys:6.576, 202.026 usec per MB, 65519 c-switches
  33. * received 32768 MB (0 % mmap'ed) in 14.9056 s, 18.4413 Gbit
  34. * cpu usage user:0.036 sys:8.193, 251.129 usec per MB, 65530 c-switches
  35. * $ kill %1 # kill tcp_mmap server
  36. *
  37. * $ ./tcp_mmap -s -z & # With mmap()
  38. * $ for i in {1..4}; do ./tcp_mmap -H ::1 -z ; done
  39. * received 32768 MB (99.9939 % mmap'ed) in 6.73792 s, 40.7956 Gbit
  40. * cpu usage user:0.045 sys:2.827, 87.6465 usec per MB, 65532 c-switches
  41. * received 32768 MB (99.9939 % mmap'ed) in 7.26732 s, 37.8238 Gbit
  42. * cpu usage user:0.037 sys:3.087, 95.3369 usec per MB, 65532 c-switches
  43. * received 32768 MB (99.9939 % mmap'ed) in 7.61661 s, 36.0893 Gbit
  44. * cpu usage user:0.046 sys:3.559, 110.016 usec per MB, 65529 c-switches
  45. * received 32768 MB (99.9939 % mmap'ed) in 7.43764 s, 36.9577 Gbit
  46. * cpu usage user:0.035 sys:3.467, 106.873 usec per MB, 65530 c-switches
  47. *
  48. * License (GPLv2):
  49. *
  50. * This program is free software; you can redistribute it and/or modify it
  51. * under the terms and conditions of the GNU General Public License,
  52. * version 2, as published by the Free Software Foundation.
  53. *
  54. * This program is distributed in the hope it will be useful, but WITHOUT
  55. * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  56. * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
  57. * more details.
  58. *
  59. * You should have received a copy of the GNU General Public License along with
  60. * this program; if not, write to the Free Software Foundation, Inc.,
  61. * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
  62. */
  63. #define _GNU_SOURCE
  64. #include <pthread.h>
  65. #include <sys/types.h>
  66. #include <fcntl.h>
  67. #include <error.h>
  68. #include <sys/socket.h>
  69. #include <sys/mman.h>
  70. #include <sys/resource.h>
  71. #include <unistd.h>
  72. #include <string.h>
  73. #include <stdlib.h>
  74. #include <stdio.h>
  75. #include <errno.h>
  76. #include <time.h>
  77. #include <sys/time.h>
  78. #include <netinet/in.h>
  79. #include <arpa/inet.h>
  80. #include <poll.h>
  81. #include <linux/tcp.h>
  82. #include <assert.h>
  83. #ifndef MSG_ZEROCOPY
  84. #define MSG_ZEROCOPY 0x4000000
  85. #endif
/* Number of bytes the client sends before it stops (1UL << 35 = 32 GB); the shift needs an unsigned long wider than 35 bits. */
  86. #define FILE_SZ (1UL << 35)
/* Address family and the matching sockaddr length; switched together by the -4 / -6 options. */
  87. static int cfg_family = AF_INET6;
  88. static socklen_t cfg_alen = sizeof(struct sockaddr_in6);
/* TCP port used for both listening and connecting. Can be set with -p <integer> option */
  89. static int cfg_port = 8787;
  90. static int rcvbuf; /* Default: autotuning. Can be set with -r <integer> option */
  91. static int sndbuf; /* Default: autotuning. Can be set with -w <integer> option */
  92. static int zflg; /* zero copy option (-z). (MSG_ZEROCOPY for sender, mmap() for receiver) */
  93. static int xflg; /* hash received data (simple xor) (-x option) */
/* NOTE(review): keepflag is set by -k but never read anywhere in the visible source. */
  94. static int keepflag; /* -k option: receiver shall keep all received file in memory (no munmap() calls) */
/* Size of the receive buffer / mapping, the SO_RCVLOWAT value and the largest single send(). */
  95. static int chunk_size = 512*1024;
  96. unsigned long htotal;
  97. static inline void prefetch(const void *x)
  98. {
  99. #if defined(__x86_64__)
  100. asm volatile("prefetcht0 %P0" : : "m" (*(const char *)x));
  101. #endif
  102. }
  103. void hash_zone(void *zone, unsigned int length)
  104. {
  105. unsigned long temp = htotal;
  106. while (length >= 8*sizeof(long)) {
  107. prefetch(zone + 384);
  108. temp ^= *(unsigned long *)zone;
  109. temp ^= *(unsigned long *)(zone + sizeof(long));
  110. temp ^= *(unsigned long *)(zone + 2*sizeof(long));
  111. temp ^= *(unsigned long *)(zone + 3*sizeof(long));
  112. temp ^= *(unsigned long *)(zone + 4*sizeof(long));
  113. temp ^= *(unsigned long *)(zone + 5*sizeof(long));
  114. temp ^= *(unsigned long *)(zone + 6*sizeof(long));
  115. temp ^= *(unsigned long *)(zone + 7*sizeof(long));
  116. zone += 8*sizeof(long);
  117. length -= 8*sizeof(long);
  118. }
  119. while (length >= 1) {
  120. temp ^= *(unsigned char *)zone;
  121. zone += 1;
  122. length--;
  123. }
  124. htotal = temp;
  125. }
  126. void *child_thread(void *arg)
  127. {
  128. unsigned long total_mmap = 0, total = 0;
  129. struct tcp_zerocopy_receive zc;
  130. unsigned long delta_usec;
  131. int flags = MAP_SHARED;
  132. struct timeval t0, t1;
  133. char *buffer = NULL;
  134. void *addr = NULL;
  135. double throughput;
  136. struct rusage ru;
  137. int lu, fd;
  138. fd = (int)(unsigned long)arg;
  139. gettimeofday(&t0, NULL);
  140. fcntl(fd, F_SETFL, O_NDELAY);
  141. buffer = malloc(chunk_size);
  142. if (!buffer) {
  143. perror("malloc");
  144. goto error;
  145. }
  146. if (zflg) {
  147. addr = mmap(NULL, chunk_size, PROT_READ, flags, fd, 0);
  148. if (addr == (void *)-1)
  149. zflg = 0;
  150. }
  151. while (1) {
  152. struct pollfd pfd = { .fd = fd, .events = POLLIN, };
  153. int sub;
  154. poll(&pfd, 1, 10000);
  155. if (zflg) {
  156. socklen_t zc_len = sizeof(zc);
  157. int res;
  158. zc.address = (__u64)addr;
  159. zc.length = chunk_size;
  160. zc.recv_skip_hint = 0;
  161. res = getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE,
  162. &zc, &zc_len);
  163. if (res == -1)
  164. break;
  165. if (zc.length) {
  166. assert(zc.length <= chunk_size);
  167. total_mmap += zc.length;
  168. if (xflg)
  169. hash_zone(addr, zc.length);
  170. total += zc.length;
  171. }
  172. if (zc.recv_skip_hint) {
  173. assert(zc.recv_skip_hint <= chunk_size);
  174. lu = read(fd, buffer, zc.recv_skip_hint);
  175. if (lu > 0) {
  176. if (xflg)
  177. hash_zone(buffer, lu);
  178. total += lu;
  179. }
  180. }
  181. continue;
  182. }
  183. sub = 0;
  184. while (sub < chunk_size) {
  185. lu = read(fd, buffer + sub, chunk_size - sub);
  186. if (lu == 0)
  187. goto end;
  188. if (lu < 0)
  189. break;
  190. if (xflg)
  191. hash_zone(buffer + sub, lu);
  192. total += lu;
  193. sub += lu;
  194. }
  195. }
  196. end:
  197. gettimeofday(&t1, NULL);
  198. delta_usec = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
  199. throughput = 0;
  200. if (delta_usec)
  201. throughput = total * 8.0 / (double)delta_usec / 1000.0;
  202. getrusage(RUSAGE_THREAD, &ru);
  203. if (total > 1024*1024) {
  204. unsigned long total_usec;
  205. unsigned long mb = total >> 20;
  206. total_usec = 1000000*ru.ru_utime.tv_sec + ru.ru_utime.tv_usec +
  207. 1000000*ru.ru_stime.tv_sec + ru.ru_stime.tv_usec;
  208. printf("received %lg MB (%lg %% mmap'ed) in %lg s, %lg Gbit\n"
  209. " cpu usage user:%lg sys:%lg, %lg usec per MB, %lu c-switches\n",
  210. total / (1024.0 * 1024.0),
  211. 100.0*total_mmap/total,
  212. (double)delta_usec / 1000000.0,
  213. throughput,
  214. (double)ru.ru_utime.tv_sec + (double)ru.ru_utime.tv_usec / 1000000.0,
  215. (double)ru.ru_stime.tv_sec + (double)ru.ru_stime.tv_usec / 1000000.0,
  216. (double)total_usec/mb,
  217. ru.ru_nvcsw);
  218. }
  219. error:
  220. free(buffer);
  221. close(fd);
  222. if (zflg)
  223. munmap(addr, chunk_size);
  224. pthread_exit(0);
  225. }
  226. static void apply_rcvsnd_buf(int fd)
  227. {
  228. if (rcvbuf && setsockopt(fd, SOL_SOCKET,
  229. SO_RCVBUF, &rcvbuf, sizeof(rcvbuf)) == -1) {
  230. perror("setsockopt SO_RCVBUF");
  231. }
  232. if (sndbuf && setsockopt(fd, SOL_SOCKET,
  233. SO_SNDBUF, &sndbuf, sizeof(sndbuf)) == -1) {
  234. perror("setsockopt SO_SNDBUF");
  235. }
  236. }
  237. static void setup_sockaddr(int domain, const char *str_addr,
  238. struct sockaddr_storage *sockaddr)
  239. {
  240. struct sockaddr_in6 *addr6 = (void *) sockaddr;
  241. struct sockaddr_in *addr4 = (void *) sockaddr;
  242. switch (domain) {
  243. case PF_INET:
  244. memset(addr4, 0, sizeof(*addr4));
  245. addr4->sin_family = AF_INET;
  246. addr4->sin_port = htons(cfg_port);
  247. if (str_addr &&
  248. inet_pton(AF_INET, str_addr, &(addr4->sin_addr)) != 1)
  249. error(1, 0, "ipv4 parse error: %s", str_addr);
  250. break;
  251. case PF_INET6:
  252. memset(addr6, 0, sizeof(*addr6));
  253. addr6->sin6_family = AF_INET6;
  254. addr6->sin6_port = htons(cfg_port);
  255. if (str_addr &&
  256. inet_pton(AF_INET6, str_addr, &(addr6->sin6_addr)) != 1)
  257. error(1, 0, "ipv6 parse error: %s", str_addr);
  258. break;
  259. default:
  260. error(1, 0, "illegal domain");
  261. }
  262. }
  263. static void do_accept(int fdlisten)
  264. {
  265. if (setsockopt(fdlisten, SOL_SOCKET, SO_RCVLOWAT,
  266. &chunk_size, sizeof(chunk_size)) == -1) {
  267. perror("setsockopt SO_RCVLOWAT");
  268. }
  269. apply_rcvsnd_buf(fdlisten);
  270. while (1) {
  271. struct sockaddr_in addr;
  272. socklen_t addrlen = sizeof(addr);
  273. pthread_t th;
  274. int fd, res;
  275. fd = accept(fdlisten, (struct sockaddr *)&addr, &addrlen);
  276. if (fd == -1) {
  277. perror("accept");
  278. continue;
  279. }
  280. res = pthread_create(&th, NULL, child_thread,
  281. (void *)(unsigned long)fd);
  282. if (res) {
  283. errno = res;
  284. perror("pthread_create");
  285. close(fd);
  286. }
  287. }
  288. }
  289. int main(int argc, char *argv[])
  290. {
  291. struct sockaddr_storage listenaddr, addr;
  292. unsigned int max_pacing_rate = 0;
  293. unsigned long total = 0;
  294. char *host = NULL;
  295. int fd, c, on = 1;
  296. char *buffer;
  297. int sflg = 0;
  298. int mss = 0;
  299. while ((c = getopt(argc, argv, "46p:svr:w:H:zxkP:M:")) != -1) {
  300. switch (c) {
  301. case '4':
  302. cfg_family = PF_INET;
  303. cfg_alen = sizeof(struct sockaddr_in);
  304. break;
  305. case '6':
  306. cfg_family = PF_INET6;
  307. cfg_alen = sizeof(struct sockaddr_in6);
  308. break;
  309. case 'p':
  310. cfg_port = atoi(optarg);
  311. break;
  312. case 'H':
  313. host = optarg;
  314. break;
  315. case 's': /* server : listen for incoming connections */
  316. sflg++;
  317. break;
  318. case 'r':
  319. rcvbuf = atoi(optarg);
  320. break;
  321. case 'w':
  322. sndbuf = atoi(optarg);
  323. break;
  324. case 'z':
  325. zflg = 1;
  326. break;
  327. case 'M':
  328. mss = atoi(optarg);
  329. break;
  330. case 'x':
  331. xflg = 1;
  332. break;
  333. case 'k':
  334. keepflag = 1;
  335. break;
  336. case 'P':
  337. max_pacing_rate = atoi(optarg) ;
  338. break;
  339. default:
  340. exit(1);
  341. }
  342. }
  343. if (sflg) {
  344. int fdlisten = socket(cfg_family, SOCK_STREAM, 0);
  345. if (fdlisten == -1) {
  346. perror("socket");
  347. exit(1);
  348. }
  349. apply_rcvsnd_buf(fdlisten);
  350. setsockopt(fdlisten, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));
  351. setup_sockaddr(cfg_family, host, &listenaddr);
  352. if (mss &&
  353. setsockopt(fdlisten, IPPROTO_TCP, TCP_MAXSEG,
  354. &mss, sizeof(mss)) == -1) {
  355. perror("setsockopt TCP_MAXSEG");
  356. exit(1);
  357. }
  358. if (bind(fdlisten, (const struct sockaddr *)&listenaddr, cfg_alen) == -1) {
  359. perror("bind");
  360. exit(1);
  361. }
  362. if (listen(fdlisten, 128) == -1) {
  363. perror("listen");
  364. exit(1);
  365. }
  366. do_accept(fdlisten);
  367. }
  368. buffer = mmap(NULL, chunk_size, PROT_READ | PROT_WRITE,
  369. MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  370. if (buffer == (char *)-1) {
  371. perror("mmap");
  372. exit(1);
  373. }
  374. fd = socket(cfg_family, SOCK_STREAM, 0);
  375. if (fd == -1) {
  376. perror("socket");
  377. exit(1);
  378. }
  379. apply_rcvsnd_buf(fd);
  380. setup_sockaddr(cfg_family, host, &addr);
  381. if (mss &&
  382. setsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == -1) {
  383. perror("setsockopt TCP_MAXSEG");
  384. exit(1);
  385. }
  386. if (connect(fd, (const struct sockaddr *)&addr, cfg_alen) == -1) {
  387. perror("connect");
  388. exit(1);
  389. }
  390. if (max_pacing_rate &&
  391. setsockopt(fd, SOL_SOCKET, SO_MAX_PACING_RATE,
  392. &max_pacing_rate, sizeof(max_pacing_rate)) == -1)
  393. perror("setsockopt SO_MAX_PACING_RATE");
  394. if (zflg && setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY,
  395. &on, sizeof(on)) == -1) {
  396. perror("setsockopt SO_ZEROCOPY, (-z option disabled)");
  397. zflg = 0;
  398. }
  399. while (total < FILE_SZ) {
  400. long wr = FILE_SZ - total;
  401. if (wr > chunk_size)
  402. wr = chunk_size;
  403. /* Note : we just want to fill the pipe with 0 bytes */
  404. wr = send(fd, buffer, wr, zflg ? MSG_ZEROCOPY : 0);
  405. if (wr <= 0)
  406. break;
  407. total += wr;
  408. }
  409. close(fd);
  410. munmap(buffer, chunk_size);
  411. return 0;
  412. }