/* user-trap.c: sample supervisor that traps mount(2) via seccomp user notification. */
  1. #include <signal.h>
  2. #include <stdio.h>
  3. #include <stdlib.h>
  4. #include <unistd.h>
  5. #include <errno.h>
  6. #include <fcntl.h>
  7. #include <string.h>
  8. #include <stddef.h>
  9. #include <sys/sysmacros.h>
  10. #include <sys/types.h>
  11. #include <sys/wait.h>
  12. #include <sys/socket.h>
  13. #include <sys/stat.h>
  14. #include <sys/mman.h>
  15. #include <sys/syscall.h>
  16. #include <sys/user.h>
  17. #include <sys/ioctl.h>
  18. #include <sys/ptrace.h>
  19. #include <sys/mount.h>
  20. #include <linux/limits.h>
  21. #include <linux/filter.h>
  22. #include <linux/seccomp.h>
  23. #define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
  24. static int seccomp(unsigned int op, unsigned int flags, void *args)
  25. {
  26. errno = 0;
  27. return syscall(__NR_seccomp, op, flags, args);
  28. }
  29. static int send_fd(int sock, int fd)
  30. {
  31. struct msghdr msg = {};
  32. struct cmsghdr *cmsg;
  33. int *fd_ptr;
  34. char buf[CMSG_SPACE(sizeof(int))] = {0}, c = 'c';
  35. struct iovec io = {
  36. .iov_base = &c,
  37. .iov_len = 1,
  38. };
  39. msg.msg_iov = &io;
  40. msg.msg_iovlen = 1;
  41. msg.msg_control = buf;
  42. msg.msg_controllen = sizeof(buf);
  43. cmsg = CMSG_FIRSTHDR(&msg);
  44. cmsg->cmsg_level = SOL_SOCKET;
  45. cmsg->cmsg_type = SCM_RIGHTS;
  46. cmsg->cmsg_len = CMSG_LEN(sizeof(int));
  47. fd_ptr = (int *)CMSG_DATA(cmsg);
  48. *fd_ptr = fd;
  49. msg.msg_controllen = cmsg->cmsg_len;
  50. if (sendmsg(sock, &msg, 0) < 0) {
  51. perror("sendmsg");
  52. return -1;
  53. }
  54. return 0;
  55. }
  56. static int recv_fd(int sock)
  57. {
  58. struct msghdr msg = {};
  59. struct cmsghdr *cmsg;
  60. int *fd_ptr;
  61. char buf[CMSG_SPACE(sizeof(int))] = {0}, c = 'c';
  62. struct iovec io = {
  63. .iov_base = &c,
  64. .iov_len = 1,
  65. };
  66. msg.msg_iov = &io;
  67. msg.msg_iovlen = 1;
  68. msg.msg_control = buf;
  69. msg.msg_controllen = sizeof(buf);
  70. if (recvmsg(sock, &msg, 0) < 0) {
  71. perror("recvmsg");
  72. return -1;
  73. }
  74. cmsg = CMSG_FIRSTHDR(&msg);
  75. fd_ptr = (int *)CMSG_DATA(cmsg);
  76. return *fd_ptr;
  77. }
  78. static int user_trap_syscall(int nr, unsigned int flags)
  79. {
  80. struct sock_filter filter[] = {
  81. BPF_STMT(BPF_LD+BPF_W+BPF_ABS,
  82. offsetof(struct seccomp_data, nr)),
  83. BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, nr, 0, 1),
  84. BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_USER_NOTIF),
  85. BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
  86. };
  87. struct sock_fprog prog = {
  88. .len = (unsigned short)ARRAY_SIZE(filter),
  89. .filter = filter,
  90. };
  91. return seccomp(SECCOMP_SET_MODE_FILTER, flags, &prog);
  92. }
  93. static int handle_req(struct seccomp_notif *req,
  94. struct seccomp_notif_resp *resp, int listener)
  95. {
  96. char path[PATH_MAX], source[PATH_MAX], target[PATH_MAX];
  97. int ret = -1, mem;
  98. resp->id = req->id;
  99. resp->error = -EPERM;
  100. resp->val = 0;
  101. if (req->data.nr != __NR_mount) {
  102. fprintf(stderr, "huh? trapped something besides mount? %d\n", req->data.nr);
  103. return -1;
  104. }
  105. /* Only allow bind mounts. */
  106. if (!(req->data.args[3] & MS_BIND))
  107. return 0;
  108. /*
  109. * Ok, let's read the task's memory to see where they wanted their
  110. * mount to go.
  111. */
  112. snprintf(path, sizeof(path), "/proc/%d/mem", req->pid);
  113. mem = open(path, O_RDONLY);
  114. if (mem < 0) {
  115. perror("open mem");
  116. return -1;
  117. }
  118. /*
  119. * Now we avoid a TOCTOU: we referred to a pid by its pid, but since
  120. * the pid that made the syscall may have died, we need to confirm that
  121. * the pid is still valid after we open its /proc/pid/mem file. We can
  122. * ask the listener fd this as follows.
  123. *
  124. * Note that this check should occur *after* any task-specific
  125. * resources are opened, to make sure that the task has not died and
  126. * we're not wrongly reading someone else's state in order to make
  127. * decisions.
  128. */
  129. if (ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req->id) < 0) {
  130. fprintf(stderr, "task died before we could map its memory\n");
  131. goto out;
  132. }
  133. /*
  134. * Phew, we've got the right /proc/pid/mem. Now we can read it. Note
  135. * that to avoid another TOCTOU, we should read all of the pointer args
  136. * before we decide to allow the syscall.
  137. */
  138. if (lseek(mem, req->data.args[0], SEEK_SET) < 0) {
  139. perror("seek");
  140. goto out;
  141. }
  142. ret = read(mem, source, sizeof(source));
  143. if (ret < 0) {
  144. perror("read");
  145. goto out;
  146. }
  147. if (lseek(mem, req->data.args[1], SEEK_SET) < 0) {
  148. perror("seek");
  149. goto out;
  150. }
  151. ret = read(mem, target, sizeof(target));
  152. if (ret < 0) {
  153. perror("read");
  154. goto out;
  155. }
  156. /*
  157. * Our policy is to only allow bind mounts inside /tmp. This isn't very
  158. * interesting, because we could do unprivlieged bind mounts with user
  159. * namespaces already, but you get the idea.
  160. */
  161. if (!strncmp(source, "/tmp/", 5) && !strncmp(target, "/tmp/", 5)) {
  162. if (mount(source, target, NULL, req->data.args[3], NULL) < 0) {
  163. ret = -1;
  164. perror("actual mount");
  165. goto out;
  166. }
  167. resp->error = 0;
  168. }
  169. /* Even if we didn't allow it because of policy, generating the
  170. * response was be a success, because we want to tell the worker EPERM.
  171. */
  172. ret = 0;
  173. out:
  174. close(mem);
  175. return ret;
  176. }
  177. int main(void)
  178. {
  179. int sk_pair[2], ret = 1, status, listener;
  180. pid_t worker = 0 , tracer = 0;
  181. if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair) < 0) {
  182. perror("socketpair");
  183. return 1;
  184. }
  185. worker = fork();
  186. if (worker < 0) {
  187. perror("fork");
  188. goto close_pair;
  189. }
  190. if (worker == 0) {
  191. listener = user_trap_syscall(__NR_mount,
  192. SECCOMP_FILTER_FLAG_NEW_LISTENER);
  193. if (listener < 0) {
  194. perror("seccomp");
  195. exit(1);
  196. }
  197. /*
  198. * Drop privileges. We definitely can't mount as uid 1000.
  199. */
  200. if (setuid(1000) < 0) {
  201. perror("setuid");
  202. exit(1);
  203. }
  204. /*
  205. * Send the listener to the parent; also serves as
  206. * synchronization.
  207. */
  208. if (send_fd(sk_pair[1], listener) < 0)
  209. exit(1);
  210. close(listener);
  211. if (mkdir("/tmp/foo", 0755) < 0) {
  212. perror("mkdir");
  213. exit(1);
  214. }
  215. /*
  216. * Try a bad mount just for grins.
  217. */
  218. if (mount("/dev/sda", "/tmp/foo", NULL, 0, NULL) != -1) {
  219. fprintf(stderr, "huh? mounted /dev/sda?\n");
  220. exit(1);
  221. }
  222. if (errno != EPERM) {
  223. perror("bad error from mount");
  224. exit(1);
  225. }
  226. /*
  227. * Ok, we expect this one to succeed.
  228. */
  229. if (mount("/tmp/foo", "/tmp/foo", NULL, MS_BIND, NULL) < 0) {
  230. perror("mount");
  231. exit(1);
  232. }
  233. exit(0);
  234. }
  235. /*
  236. * Get the listener from the child.
  237. */
  238. listener = recv_fd(sk_pair[0]);
  239. if (listener < 0)
  240. goto out_kill;
  241. /*
  242. * Fork a task to handle the requests. This isn't strictly necessary,
  243. * but it makes the particular writing of this sample easier, since we
  244. * can just wait ofr the tracee to exit and kill the tracer.
  245. */
  246. tracer = fork();
  247. if (tracer < 0) {
  248. perror("fork");
  249. goto out_kill;
  250. }
  251. if (tracer == 0) {
  252. struct seccomp_notif *req;
  253. struct seccomp_notif_resp *resp;
  254. struct seccomp_notif_sizes sizes;
  255. if (seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes) < 0) {
  256. perror("seccomp(GET_NOTIF_SIZES)");
  257. goto out_close;
  258. }
  259. req = malloc(sizes.seccomp_notif);
  260. if (!req)
  261. goto out_close;
  262. resp = malloc(sizes.seccomp_notif_resp);
  263. if (!resp)
  264. goto out_req;
  265. memset(resp, 0, sizes.seccomp_notif_resp);
  266. while (1) {
  267. memset(req, 0, sizes.seccomp_notif);
  268. if (ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, req)) {
  269. perror("ioctl recv");
  270. goto out_resp;
  271. }
  272. if (handle_req(req, resp, listener) < 0)
  273. goto out_resp;
  274. /*
  275. * ENOENT here means that the task may have gotten a
  276. * signal and restarted the syscall. It's up to the
  277. * handler to decide what to do in this case, but for
  278. * the sample code, we just ignore it. Probably
  279. * something better should happen, like undoing the
  280. * mount, or keeping track of the args to make sure we
  281. * don't do it again.
  282. */
  283. if (ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, resp) < 0 &&
  284. errno != ENOENT) {
  285. perror("ioctl send");
  286. goto out_resp;
  287. }
  288. }
  289. out_resp:
  290. free(resp);
  291. out_req:
  292. free(req);
  293. out_close:
  294. close(listener);
  295. exit(1);
  296. }
  297. close(listener);
  298. if (waitpid(worker, &status, 0) != worker) {
  299. perror("waitpid");
  300. goto out_kill;
  301. }
  302. if (umount2("/tmp/foo", MNT_DETACH) < 0 && errno != EINVAL) {
  303. perror("umount2");
  304. goto out_kill;
  305. }
  306. if (remove("/tmp/foo") < 0 && errno != ENOENT) {
  307. perror("remove");
  308. exit(1);
  309. }
  310. if (!WIFEXITED(status) || WEXITSTATUS(status)) {
  311. fprintf(stderr, "worker exited nonzero\n");
  312. goto out_kill;
  313. }
  314. ret = 0;
  315. out_kill:
  316. if (tracer > 0)
  317. kill(tracer, SIGKILL);
  318. if (worker > 0)
  319. kill(worker, SIGKILL);
  320. close_pair:
  321. close(sk_pair[0]);
  322. close(sk_pair[1]);
  323. return ret;
  324. }