protection_keys.c 38 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Tests x86 Memory Protection Keys (see Documentation/x86/protection-keys.txt)
  4. *
  5. * There are examples in here of:
  6. * * how to set protection keys on memory
  7. * * how to set/clear bits in PKRU (the rights register)
  8. * * how to handle SEGV_PKRU signals and extract pkey-relevant
  9. * information from the siginfo
  10. *
  11. * Things to add:
  12. * make sure KSM and KSM COW breaking works
  13. * prefault pages in at malloc, or not
  14. * protect MPX bounds tables with protection keys?
  15. * make sure VMA splitting/merging is working correctly
  16. * OOMs can destroy mm->mmap (see exit_mmap()), so make sure it is immune to pkeys
  17. * look for pkey "leaks" where it is still set on a VMA but "freed" back to the kernel
  18. * do a plain mprotect() to a mprotect_pkey() area and make sure the pkey sticks
  19. *
  20. * Compile like this:
  21. * gcc -o protection_keys -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm
  22. * gcc -m32 -o protection_keys_32 -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm
  23. */
  24. #define _GNU_SOURCE
  25. #include <errno.h>
  26. #include <linux/futex.h>
  27. #include <time.h>
  28. #include <sys/time.h>
  29. #include <sys/syscall.h>
  30. #include <string.h>
  31. #include <stdio.h>
  32. #include <stdint.h>
  33. #include <stdbool.h>
  34. #include <signal.h>
  35. #include <assert.h>
  36. #include <stdlib.h>
  37. #include <ucontext.h>
  38. #include <sys/mman.h>
  39. #include <sys/types.h>
  40. #include <sys/wait.h>
  41. #include <sys/stat.h>
  42. #include <fcntl.h>
  43. #include <unistd.h>
  44. #include <sys/ptrace.h>
  45. #include <setjmp.h>
  46. #include "pkey-helpers.h"
/* Printed by pkey_assert() as "iteration"; starts at 1. */
int iteration_nr = 1;
/* Printed by pkey_assert() as "test_nr". */
int test_nr;
/*
 * Software copy of the PKRU rights bits.  It is updated by
 * pkey_disable_set(), pkey_disable_clear() and alloc_pkey() alongside
 * the hardware register.
 */
unsigned int shadow_pkru;
/* 1 << 21 bytes == 2MB */
#define HPAGE_SIZE (1UL<<21)
/* Number of elements in a true array (not a pointer). */
#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
/* Round up/down to a multiple of 'align_to' (must be a power of two). */
#define ALIGN_UP(x, align_to) (((x) + ((align_to)-1)) & ~((align_to)-1))
#define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1))
/* Pointer flavours of the above; the result keeps the type of 'p'. */
#define ALIGN_PTR_UP(p, ptr_align_to) ((typeof(p))ALIGN_UP((unsigned long)(p), ptr_align_to))
#define ALIGN_PTR_DOWN(p, ptr_align_to) ((typeof(p))ALIGN_DOWN((unsigned long)(p), ptr_align_to))
/* Two-level stringification so that macro arguments are expanded first. */
#define __stringify_1(x...) #x
#define __stringify(x...) __stringify_1(x)
/*
 * Sentinel pointer returned by an allocator that is not usable on this
 * system (see malloc_pkey_hugetlb()); malloc_pkey() retries on it.
 */
#define PTR_ERR_ENOTSUP ((void *)-ENOTSUP)
/* Set to 1 on entry to the signal handlers in this file, 0 on exit. */
int dprint_in_signal;
/* NOTE(review): presumably the scratch buffer the dprintf*() helpers use
 * while dprint_in_signal is set -- confirm in pkey-helpers.h. */
char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
/* Forward declaration: pkey_assert() calls this before it is defined. */
extern void abort_hooks(void);
/*
 * Test assertion.  When 'condition' is false this logs the source
 * location together with test_nr, iteration_nr and errno, runs
 * abort_hooks(), and terminates the process with the line number of the
 * failing assertion as the exit status.
 */
#define pkey_assert(condition) do { \
	if (!(condition)) { \
		dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \
			__FILE__, __LINE__, \
			test_nr, iteration_nr); \
		dprintf0("errno at assert: %d", errno); \
		abort_hooks(); \
		exit(__LINE__); \
	} \
} while (0)
  72. void cat_into_file(char *str, char *file)
  73. {
  74. int fd = open(file, O_RDWR);
  75. int ret;
  76. dprintf2("%s(): writing '%s' to '%s'\n", __func__, str, file);
  77. /*
  78. * these need to be raw because they are called under
  79. * pkey_assert()
  80. */
  81. if (fd < 0) {
  82. fprintf(stderr, "error opening '%s'\n", str);
  83. perror("error: ");
  84. exit(__LINE__);
  85. }
  86. ret = write(fd, str, strlen(str));
  87. if (ret != strlen(str)) {
  88. perror("write to file failed");
  89. fprintf(stderr, "filename: '%s' str: '%s'\n", file, str);
  90. exit(__LINE__);
  91. }
  92. close(fd);
  93. }
  94. #if CONTROL_TRACING > 0
  95. static int warned_tracing;
  96. int tracing_root_ok(void)
  97. {
  98. if (geteuid() != 0) {
  99. if (!warned_tracing)
  100. fprintf(stderr, "WARNING: not run as root, "
  101. "can not do tracing control\n");
  102. warned_tracing = 1;
  103. return 0;
  104. }
  105. return 1;
  106. }
  107. #endif
  108. void tracing_on(void)
  109. {
  110. #if CONTROL_TRACING > 0
  111. #define TRACEDIR "/sys/kernel/debug/tracing"
  112. char pidstr[32];
  113. if (!tracing_root_ok())
  114. return;
  115. sprintf(pidstr, "%d", getpid());
  116. cat_into_file("0", TRACEDIR "/tracing_on");
  117. cat_into_file("\n", TRACEDIR "/trace");
  118. if (1) {
  119. cat_into_file("function_graph", TRACEDIR "/current_tracer");
  120. cat_into_file("1", TRACEDIR "/options/funcgraph-proc");
  121. } else {
  122. cat_into_file("nop", TRACEDIR "/current_tracer");
  123. }
  124. cat_into_file(pidstr, TRACEDIR "/set_ftrace_pid");
  125. cat_into_file("1", TRACEDIR "/tracing_on");
  126. dprintf1("enabled tracing\n");
  127. #endif
  128. }
  129. void tracing_off(void)
  130. {
  131. #if CONTROL_TRACING > 0
  132. if (!tracing_root_ok())
  133. return;
  134. cat_into_file("0", "/sys/kernel/debug/tracing/tracing_on");
  135. #endif
  136. }
  137. void abort_hooks(void)
  138. {
  139. fprintf(stderr, "running %s()...\n", __func__);
  140. tracing_off();
  141. #ifdef SLEEP_ON_ABORT
  142. sleep(SLEEP_ON_ABORT);
  143. #endif
  144. }
/*
 * Emit 512 copies of a long-form NOP directly into the instruction
 * stream.  Used by lots_o_noops_around_write() as padding on either
 * side of its store.
 */
static inline void __page_o_noops(void)
{
	/* 8 bytes of instruction * 512 repetitions = 4096 bytes = 1 page */
	asm(".rept 512 ; nopl 0x7eeeeeee(%eax) ; .endr");
}
/*
 * This attempts to have roughly a page of instructions followed by a few
 * instructions that do a write, and another page of instructions. That
 * way, we are pretty sure that the write is in the second page of
 * instructions and has at least a page of padding behind it.
 *
 * *That* lets us be sure to madvise() away the write instruction, which
 * will then fault, which makes sure that the fault code handles
 * execute-only memory properly.
 *
 * @write_to_me: location that receives the store; the value written is
 *               the source line number of the store itself.
 */
__attribute__((__aligned__(PAGE_SIZE)))
void lots_o_noops_around_write(int *write_to_me)
{
	dprintf3("running %s()\n", __func__);
	/* first page of padding, in front of the store */
	__page_o_noops();
	/* Assume this happens in the second page of instructions: */
	*write_to_me = __LINE__;
	/* pad out by another page: */
	__page_o_noops();
	dprintf3("%s() done\n", __func__);
}
/* Define some kernel-like types */
#define u8 uint8_t
#define u16 uint16_t
#define u32 uint32_t
#define u64 uint64_t
/*
 * Per-architecture constants.  The syscall numbers are fallbacks used
 * only when the system headers do not provide them.  Note that
 * SYS_pkey_free has no guard of its own: it is defined together with
 * SYS_pkey_alloc.
 *
 * REG_IP_IDX is the gregs[] index of the instruction pointer and
 * si_pkey_offset is the byte offset into siginfo_t from which
 * signal_handler() reads the faulting pkey.
 */
#ifdef __i386__
#ifndef SYS_mprotect_key
# define SYS_mprotect_key 380
#endif
#ifndef SYS_pkey_alloc
# define SYS_pkey_alloc 381
# define SYS_pkey_free 382
#endif
#define REG_IP_IDX REG_EIP
#define si_pkey_offset 0x14
#else
#ifndef SYS_mprotect_key
# define SYS_mprotect_key 329
#endif
#ifndef SYS_pkey_alloc
# define SYS_pkey_alloc 330
# define SYS_pkey_free 331
#endif
#define REG_IP_IDX REG_RIP
#define si_pkey_offset 0x20
#endif
  197. void dump_mem(void *dumpme, int len_bytes)
  198. {
  199. char *c = (void *)dumpme;
  200. int i;
  201. for (i = 0; i < len_bytes; i += sizeof(u64)) {
  202. u64 *ptr = (u64 *)(c + i);
  203. dprintf1("dump[%03d][@%p]: %016jx\n", i, ptr, *ptr);
  204. }
  205. }
  206. /* Failed address bound checks: */
  207. #ifndef SEGV_BNDERR
  208. # define SEGV_BNDERR 3
  209. #endif
  210. #ifndef SEGV_PKUERR
  211. # define SEGV_PKUERR 4
  212. #endif
  213. static char *si_code_str(int si_code)
  214. {
  215. if (si_code == SEGV_MAPERR)
  216. return "SEGV_MAPERR";
  217. if (si_code == SEGV_ACCERR)
  218. return "SEGV_ACCERR";
  219. if (si_code == SEGV_BNDERR)
  220. return "SEGV_BNDERR";
  221. if (si_code == SEGV_PKUERR)
  222. return "SEGV_PKUERR";
  223. return "UNKNOWN";
  224. }
/* Incremented once for every fault that signal_handler() accepts. */
int pkru_faults;
/* pkey read from siginfo by the most recent accepted fault; -1 = none yet */
int last_si_pkey = -1;
/*
 * SA_SIGINFO-style handler installed by setup_sigsegv_handler().
 *
 * It locates the PKRU value saved in the signal frame, checks that the
 * fault carries a pkey-style si_code, records the faulting pkey in
 * last_si_pkey, bumps pkru_faults, and finally zeroes the saved PKRU so
 * that the faulting instruction can proceed once the handler returns.
 *
 * @signum:    signal number (unused)
 * @si:        siginfo for the fault
 * @vucontext: ucontext_t * describing the interrupted context
 */
void signal_handler(int signum, siginfo_t *si, void *vucontext)
{
	ucontext_t *uctxt = vucontext;
	int trapno;
	unsigned long ip;
	char *fpregs;
	u32 *pkru_ptr;
	u64 siginfo_pkey;
	u32 *si_pkey_ptr;
	int pkru_offset;
	fpregset_t fpregset;
	dprint_in_signal = 1;
	dprintf1(">>>>===============SIGSEGV============================\n");
	dprintf1("%s()::%d, pkru: 0x%x shadow: %x\n", __func__, __LINE__,
			__rdpkru(), shadow_pkru);
	/* pull trap number, instruction pointer and FPU state out of the frame */
	trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO];
	ip = uctxt->uc_mcontext.gregs[REG_IP_IDX];
	fpregset = uctxt->uc_mcontext.fpregs;
	fpregs = (void *)fpregset;
	dprintf2("%s() trapno: %d ip: 0x%lx info->si_code: %s/%d\n", __func__,
			trapno, ip, si_code_str(si->si_code), si->si_code);
#ifdef __i386__
	/*
	 * 32-bit has some extra padding so that userspace can tell whether
	 * the XSTATE header is present in addition to the "legacy" FPU
	 * state. We just assume that it is here.
	 */
	fpregs += 0x70;
#endif
	/* the saved PKRU lives pkru_xstate_offset() bytes into the FPU state */
	pkru_offset = pkru_xstate_offset();
	pkru_ptr = (void *)(&fpregs[pkru_offset]);
	dprintf1("siginfo: %p\n", si);
	dprintf1(" fpregs: %p\n", fpregs);
	/*
	 * If we got a PKRU fault, we *HAVE* to have at least one bit set in
	 * here.
	 */
	dprintf1("pkru_xstate_offset: %d\n", pkru_xstate_offset());
	/* pkru_ptr is a u32 *, so "- 128" steps back 512 bytes */
	if (DEBUG_LEVEL > 4)
		dump_mem(pkru_ptr - 128, 256);
	/* note: this assert runs before the si_code check below */
	pkey_assert(*pkru_ptr);
	if ((si->si_code == SEGV_MAPERR) ||
	    (si->si_code == SEGV_ACCERR) ||
	    (si->si_code == SEGV_BNDERR)) {
		printf("non-PK si_code, exiting...\n");
		exit(4);
	}
	/* read the pkey straight out of siginfo at a fixed per-arch offset */
	si_pkey_ptr = (u32 *)(((u8 *)si) + si_pkey_offset);
	dprintf1("si_pkey_ptr: %p\n", si_pkey_ptr);
	dump_mem((u8 *)si_pkey_ptr - 8, 24);
	siginfo_pkey = *si_pkey_ptr;
	pkey_assert(siginfo_pkey < NR_PKEYS);
	last_si_pkey = siginfo_pkey;
	dprintf1("signal pkru from xsave: %08x\n", *pkru_ptr);
	/* need __rdpkru() version so we do not do shadow_pkru checking */
	dprintf1("signal pkru from pkru: %08x\n", __rdpkru());
	dprintf1("pkey from siginfo: %jx\n", siginfo_pkey);
	/*
	 * Clear the saved PKRU so the faulting access is allowed on return.
	 * NOTE(review): this stores 8 bytes through a u64 cast although
	 * pkru_ptr is a u32 pointer -- confirm that is intended.
	 */
	*(u64 *)pkru_ptr = 0x00000000;
	dprintf1("WARNING: set PRKU=0 to allow faulting instruction to continue\n");
	pkru_faults++;
	dprintf1("<<<<==================================================\n");
	dprint_in_signal = 0;
}
  290. int wait_all_children(void)
  291. {
  292. int status;
  293. return waitpid(-1, &status, 0);
  294. }
  295. void sig_chld(int x)
  296. {
  297. dprint_in_signal = 1;
  298. dprintf2("[%d] SIGCHLD: %d\n", getpid(), x);
  299. dprint_in_signal = 0;
  300. }
  301. void setup_sigsegv_handler(void)
  302. {
  303. int r, rs;
  304. struct sigaction newact;
  305. struct sigaction oldact;
  306. /* #PF is mapped to sigsegv */
  307. int signum = SIGSEGV;
  308. newact.sa_handler = 0;
  309. newact.sa_sigaction = signal_handler;
  310. /*sigset_t - signals to block while in the handler */
  311. /* get the old signal mask. */
  312. rs = sigprocmask(SIG_SETMASK, 0, &newact.sa_mask);
  313. pkey_assert(rs == 0);
  314. /* call sa_sigaction, not sa_handler*/
  315. newact.sa_flags = SA_SIGINFO;
  316. newact.sa_restorer = 0; /* void(*)(), obsolete */
  317. r = sigaction(signum, &newact, &oldact);
  318. r = sigaction(SIGALRM, &newact, &oldact);
  319. pkey_assert(r == 0);
  320. }
  321. void setup_handlers(void)
  322. {
  323. signal(SIGCHLD, &sig_chld);
  324. setup_sigsegv_handler();
  325. }
  326. pid_t fork_lazy_child(void)
  327. {
  328. pid_t forkret;
  329. forkret = fork();
  330. pkey_assert(forkret >= 0);
  331. dprintf3("[%d] fork() ret: %d\n", getpid(), forkret);
  332. if (!forkret) {
  333. /* in the child */
  334. while (1) {
  335. dprintf1("child sleeping...\n");
  336. sleep(30);
  337. }
  338. }
  339. return forkret;
  340. }
  341. #ifndef PKEY_DISABLE_ACCESS
  342. # define PKEY_DISABLE_ACCESS 0x1
  343. #endif
  344. #ifndef PKEY_DISABLE_WRITE
  345. # define PKEY_DISABLE_WRITE 0x2
  346. #endif
  347. static u32 hw_pkey_get(int pkey, unsigned long flags)
  348. {
  349. u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE);
  350. u32 pkru = __rdpkru();
  351. u32 shifted_pkru;
  352. u32 masked_pkru;
  353. dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n",
  354. __func__, pkey, flags, 0, 0);
  355. dprintf2("%s() raw pkru: %x\n", __func__, pkru);
  356. shifted_pkru = (pkru >> (pkey * PKRU_BITS_PER_PKEY));
  357. dprintf2("%s() shifted_pkru: %x\n", __func__, shifted_pkru);
  358. masked_pkru = shifted_pkru & mask;
  359. dprintf2("%s() masked pkru: %x\n", __func__, masked_pkru);
  360. /*
  361. * shift down the relevant bits to the lowest two, then
  362. * mask off all the other high bits.
  363. */
  364. return masked_pkru;
  365. }
  366. static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags)
  367. {
  368. u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE);
  369. u32 old_pkru = __rdpkru();
  370. u32 new_pkru;
  371. /* make sure that 'rights' only contains the bits we expect: */
  372. assert(!(rights & ~mask));
  373. /* copy old pkru */
  374. new_pkru = old_pkru;
  375. /* mask out bits from pkey in old value: */
  376. new_pkru &= ~(mask << (pkey * PKRU_BITS_PER_PKEY));
  377. /* OR in new bits for pkey: */
  378. new_pkru |= (rights << (pkey * PKRU_BITS_PER_PKEY));
  379. __wrpkru(new_pkru);
  380. dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x pkru now: %x old_pkru: %x\n",
  381. __func__, pkey, rights, flags, 0, __rdpkru(), old_pkru);
  382. return 0;
  383. }
  384. void pkey_disable_set(int pkey, int flags)
  385. {
  386. unsigned long syscall_flags = 0;
  387. int ret;
  388. int pkey_rights;
  389. u32 orig_pkru = rdpkru();
  390. dprintf1("START->%s(%d, 0x%x)\n", __func__,
  391. pkey, flags);
  392. pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
  393. pkey_rights = hw_pkey_get(pkey, syscall_flags);
  394. dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
  395. pkey, pkey, pkey_rights);
  396. pkey_assert(pkey_rights >= 0);
  397. pkey_rights |= flags;
  398. ret = hw_pkey_set(pkey, pkey_rights, syscall_flags);
  399. assert(!ret);
  400. /*pkru and flags have the same format */
  401. shadow_pkru |= flags << (pkey * 2);
  402. dprintf1("%s(%d) shadow: 0x%x\n", __func__, pkey, shadow_pkru);
  403. pkey_assert(ret >= 0);
  404. pkey_rights = hw_pkey_get(pkey, syscall_flags);
  405. dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
  406. pkey, pkey, pkey_rights);
  407. dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru());
  408. if (flags)
  409. pkey_assert(rdpkru() > orig_pkru);
  410. dprintf1("END<---%s(%d, 0x%x)\n", __func__,
  411. pkey, flags);
  412. }
  413. void pkey_disable_clear(int pkey, int flags)
  414. {
  415. unsigned long syscall_flags = 0;
  416. int ret;
  417. int pkey_rights = hw_pkey_get(pkey, syscall_flags);
  418. u32 orig_pkru = rdpkru();
  419. pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
  420. dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
  421. pkey, pkey, pkey_rights);
  422. pkey_assert(pkey_rights >= 0);
  423. pkey_rights |= flags;
  424. ret = hw_pkey_set(pkey, pkey_rights, 0);
  425. /* pkru and flags have the same format */
  426. shadow_pkru &= ~(flags << (pkey * 2));
  427. pkey_assert(ret >= 0);
  428. pkey_rights = hw_pkey_get(pkey, syscall_flags);
  429. dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
  430. pkey, pkey, pkey_rights);
  431. dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru());
  432. if (flags)
  433. assert(rdpkru() > orig_pkru);
  434. }
  435. void pkey_write_allow(int pkey)
  436. {
  437. pkey_disable_clear(pkey, PKEY_DISABLE_WRITE);
  438. }
  439. void pkey_write_deny(int pkey)
  440. {
  441. pkey_disable_set(pkey, PKEY_DISABLE_WRITE);
  442. }
  443. void pkey_access_allow(int pkey)
  444. {
  445. pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS);
  446. }
  447. void pkey_access_deny(int pkey)
  448. {
  449. pkey_disable_set(pkey, PKEY_DISABLE_ACCESS);
  450. }
  451. int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
  452. unsigned long pkey)
  453. {
  454. int sret;
  455. dprintf2("%s(0x%p, %zx, prot=%lx, pkey=%lx)\n", __func__,
  456. ptr, size, orig_prot, pkey);
  457. errno = 0;
  458. sret = syscall(SYS_mprotect_key, ptr, size, orig_prot, pkey);
  459. if (errno) {
  460. dprintf2("SYS_mprotect_key sret: %d\n", sret);
  461. dprintf2("SYS_mprotect_key prot: 0x%lx\n", orig_prot);
  462. dprintf2("SYS_mprotect_key failed, errno: %d\n", errno);
  463. if (DEBUG_LEVEL >= 2)
  464. perror("SYS_mprotect_pkey");
  465. }
  466. return sret;
  467. }
  468. int sys_pkey_alloc(unsigned long flags, unsigned long init_val)
  469. {
  470. int ret = syscall(SYS_pkey_alloc, flags, init_val);
  471. dprintf1("%s(flags=%lx, init_val=%lx) syscall ret: %d errno: %d\n",
  472. __func__, flags, init_val, ret, errno);
  473. return ret;
  474. }
  475. int alloc_pkey(void)
  476. {
  477. int ret;
  478. unsigned long init_val = 0x0;
  479. dprintf1("alloc_pkey()::%d, pkru: 0x%x shadow: %x\n",
  480. __LINE__, __rdpkru(), shadow_pkru);
  481. ret = sys_pkey_alloc(0, init_val);
  482. /*
  483. * pkey_alloc() sets PKRU, so we need to reflect it in
  484. * shadow_pkru:
  485. */
  486. dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n",
  487. __LINE__, ret, __rdpkru(), shadow_pkru);
  488. if (ret) {
  489. /* clear both the bits: */
  490. shadow_pkru &= ~(0x3 << (ret * 2));
  491. dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n",
  492. __LINE__, ret, __rdpkru(), shadow_pkru);
  493. /*
  494. * move the new state in from init_val
  495. * (remember, we cheated and init_val == pkru format)
  496. */
  497. shadow_pkru |= (init_val << (ret * 2));
  498. }
  499. dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n",
  500. __LINE__, ret, __rdpkru(), shadow_pkru);
  501. dprintf1("alloc_pkey()::%d errno: %d\n", __LINE__, errno);
  502. /* for shadow checking: */
  503. rdpkru();
  504. dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n",
  505. __LINE__, ret, __rdpkru(), shadow_pkru);
  506. return ret;
  507. }
  508. int sys_pkey_free(unsigned long pkey)
  509. {
  510. int ret = syscall(SYS_pkey_free, pkey);
  511. dprintf1("%s(pkey=%ld) syscall ret: %d\n", __func__, pkey, ret);
  512. return ret;
  513. }
  514. /*
  515. * I had a bug where pkey bits could be set by mprotect() but
  516. * not cleared. This ensures we get lots of random bit sets
  517. * and clears on the vma and pte pkey bits.
  518. */
  519. int alloc_random_pkey(void)
  520. {
  521. int max_nr_pkey_allocs;
  522. int ret;
  523. int i;
  524. int alloced_pkeys[NR_PKEYS];
  525. int nr_alloced = 0;
  526. int random_index;
  527. memset(alloced_pkeys, 0, sizeof(alloced_pkeys));
  528. srand((unsigned int)time(NULL));
  529. /* allocate every possible key and make a note of which ones we got */
  530. max_nr_pkey_allocs = NR_PKEYS;
  531. for (i = 0; i < max_nr_pkey_allocs; i++) {
  532. int new_pkey = alloc_pkey();
  533. if (new_pkey < 0)
  534. break;
  535. alloced_pkeys[nr_alloced++] = new_pkey;
  536. }
  537. pkey_assert(nr_alloced > 0);
  538. /* select a random one out of the allocated ones */
  539. random_index = rand() % nr_alloced;
  540. ret = alloced_pkeys[random_index];
  541. /* now zero it out so we don't free it next */
  542. alloced_pkeys[random_index] = 0;
  543. /* go through the allocated ones that we did not want and free them */
  544. for (i = 0; i < nr_alloced; i++) {
  545. int free_ret;
  546. if (!alloced_pkeys[i])
  547. continue;
  548. free_ret = sys_pkey_free(alloced_pkeys[i]);
  549. pkey_assert(!free_ret);
  550. }
  551. dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__,
  552. __LINE__, ret, __rdpkru(), shadow_pkru);
  553. return ret;
  554. }
  555. int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
  556. unsigned long pkey)
  557. {
  558. int nr_iterations = random() % 100;
  559. int ret;
  560. while (0) {
  561. int rpkey = alloc_random_pkey();
  562. ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey);
  563. dprintf1("sys_mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n",
  564. ptr, size, orig_prot, pkey, ret);
  565. if (nr_iterations-- < 0)
  566. break;
  567. dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__,
  568. __LINE__, ret, __rdpkru(), shadow_pkru);
  569. sys_pkey_free(rpkey);
  570. dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__,
  571. __LINE__, ret, __rdpkru(), shadow_pkru);
  572. }
  573. pkey_assert(pkey < NR_PKEYS);
  574. ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey);
  575. dprintf1("mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n",
  576. ptr, size, orig_prot, pkey, ret);
  577. pkey_assert(!ret);
  578. dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__,
  579. __LINE__, ret, __rdpkru(), shadow_pkru);
  580. return ret;
  581. }
  582. struct pkey_malloc_record {
  583. void *ptr;
  584. long size;
  585. int prot;
  586. };
  587. struct pkey_malloc_record *pkey_malloc_records;
  588. struct pkey_malloc_record *pkey_last_malloc_record;
  589. long nr_pkey_malloc_records;
  590. void record_pkey_malloc(void *ptr, long size, int prot)
  591. {
  592. long i;
  593. struct pkey_malloc_record *rec = NULL;
  594. for (i = 0; i < nr_pkey_malloc_records; i++) {
  595. rec = &pkey_malloc_records[i];
  596. /* find a free record */
  597. if (rec)
  598. break;
  599. }
  600. if (!rec) {
  601. /* every record is full */
  602. size_t old_nr_records = nr_pkey_malloc_records;
  603. size_t new_nr_records = (nr_pkey_malloc_records * 2 + 1);
  604. size_t new_size = new_nr_records * sizeof(struct pkey_malloc_record);
  605. dprintf2("new_nr_records: %zd\n", new_nr_records);
  606. dprintf2("new_size: %zd\n", new_size);
  607. pkey_malloc_records = realloc(pkey_malloc_records, new_size);
  608. pkey_assert(pkey_malloc_records != NULL);
  609. rec = &pkey_malloc_records[nr_pkey_malloc_records];
  610. /*
  611. * realloc() does not initialize memory, so zero it from
  612. * the first new record all the way to the end.
  613. */
  614. for (i = 0; i < new_nr_records - old_nr_records; i++)
  615. memset(rec + i, 0, sizeof(*rec));
  616. }
  617. dprintf3("filling malloc record[%d/%p]: {%p, %ld}\n",
  618. (int)(rec - pkey_malloc_records), rec, ptr, size);
  619. rec->ptr = ptr;
  620. rec->size = size;
  621. rec->prot = prot;
  622. pkey_last_malloc_record = rec;
  623. nr_pkey_malloc_records++;
  624. }
  625. void free_pkey_malloc(void *ptr)
  626. {
  627. long i;
  628. int ret;
  629. dprintf3("%s(%p)\n", __func__, ptr);
  630. for (i = 0; i < nr_pkey_malloc_records; i++) {
  631. struct pkey_malloc_record *rec = &pkey_malloc_records[i];
  632. dprintf4("looking for ptr %p at record[%ld/%p]: {%p, %ld}\n",
  633. ptr, i, rec, rec->ptr, rec->size);
  634. if ((ptr < rec->ptr) ||
  635. (ptr >= rec->ptr + rec->size))
  636. continue;
  637. dprintf3("found ptr %p at record[%ld/%p]: {%p, %ld}\n",
  638. ptr, i, rec, rec->ptr, rec->size);
  639. nr_pkey_malloc_records--;
  640. ret = munmap(rec->ptr, rec->size);
  641. dprintf3("munmap ret: %d\n", ret);
  642. pkey_assert(!ret);
  643. dprintf3("clearing rec->ptr, rec: %p\n", rec);
  644. rec->ptr = NULL;
  645. dprintf3("done clearing rec->ptr, rec: %p\n", rec);
  646. return;
  647. }
  648. pkey_assert(false);
  649. }
  650. void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey)
  651. {
  652. void *ptr;
  653. int ret;
  654. rdpkru();
  655. dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
  656. size, prot, pkey);
  657. pkey_assert(pkey < NR_PKEYS);
  658. ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
  659. pkey_assert(ptr != (void *)-1);
  660. ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey);
  661. pkey_assert(!ret);
  662. record_pkey_malloc(ptr, size, prot);
  663. rdpkru();
  664. dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr);
  665. return ptr;
  666. }
  667. void *malloc_pkey_anon_huge(long size, int prot, u16 pkey)
  668. {
  669. int ret;
  670. void *ptr;
  671. dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
  672. size, prot, pkey);
  673. /*
  674. * Guarantee we can fit at least one huge page in the resulting
  675. * allocation by allocating space for 2:
  676. */
  677. size = ALIGN_UP(size, HPAGE_SIZE * 2);
  678. ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
  679. pkey_assert(ptr != (void *)-1);
  680. record_pkey_malloc(ptr, size, prot);
  681. mprotect_pkey(ptr, size, prot, pkey);
  682. dprintf1("unaligned ptr: %p\n", ptr);
  683. ptr = ALIGN_PTR_UP(ptr, HPAGE_SIZE);
  684. dprintf1(" aligned ptr: %p\n", ptr);
  685. ret = madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE);
  686. dprintf1("MADV_HUGEPAGE ret: %d\n", ret);
  687. ret = madvise(ptr, HPAGE_SIZE, MADV_WILLNEED);
  688. dprintf1("MADV_WILLNEED ret: %d\n", ret);
  689. memset(ptr, 0, HPAGE_SIZE);
  690. dprintf1("mmap()'d thp for pkey %d @ %p\n", pkey, ptr);
  691. return ptr;
  692. }
  693. int hugetlb_setup_ok;
  694. #define GET_NR_HUGE_PAGES 10
  695. void setup_hugetlbfs(void)
  696. {
  697. int err;
  698. int fd;
  699. char buf[] = "123";
  700. if (geteuid() != 0) {
  701. fprintf(stderr, "WARNING: not run as root, can not do hugetlb test\n");
  702. return;
  703. }
  704. cat_into_file(__stringify(GET_NR_HUGE_PAGES), "/proc/sys/vm/nr_hugepages");
  705. /*
  706. * Now go make sure that we got the pages and that they
  707. * are 2M pages. Someone might have made 1G the default.
  708. */
  709. fd = open("/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages", O_RDONLY);
  710. if (fd < 0) {
  711. perror("opening sysfs 2M hugetlb config");
  712. return;
  713. }
  714. /* -1 to guarantee leaving the trailing \0 */
  715. err = read(fd, buf, sizeof(buf)-1);
  716. close(fd);
  717. if (err <= 0) {
  718. perror("reading sysfs 2M hugetlb config");
  719. return;
  720. }
  721. if (atoi(buf) != GET_NR_HUGE_PAGES) {
  722. fprintf(stderr, "could not confirm 2M pages, got: '%s' expected %d\n",
  723. buf, GET_NR_HUGE_PAGES);
  724. return;
  725. }
  726. hugetlb_setup_ok = 1;
  727. }
  728. void *malloc_pkey_hugetlb(long size, int prot, u16 pkey)
  729. {
  730. void *ptr;
  731. int flags = MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB;
  732. if (!hugetlb_setup_ok)
  733. return PTR_ERR_ENOTSUP;
  734. dprintf1("doing %s(%ld, %x, %x)\n", __func__, size, prot, pkey);
  735. size = ALIGN_UP(size, HPAGE_SIZE * 2);
  736. pkey_assert(pkey < NR_PKEYS);
  737. ptr = mmap(NULL, size, PROT_NONE, flags, -1, 0);
  738. pkey_assert(ptr != (void *)-1);
  739. mprotect_pkey(ptr, size, prot, pkey);
  740. record_pkey_malloc(ptr, size, prot);
  741. dprintf1("mmap()'d hugetlbfs for pkey %d @ %p\n", pkey, ptr);
  742. return ptr;
  743. }
  744. void *malloc_pkey_mmap_dax(long size, int prot, u16 pkey)
  745. {
  746. void *ptr;
  747. int fd;
  748. dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
  749. size, prot, pkey);
  750. pkey_assert(pkey < NR_PKEYS);
  751. fd = open("/dax/foo", O_RDWR);
  752. pkey_assert(fd >= 0);
  753. ptr = mmap(0, size, prot, MAP_SHARED, fd, 0);
  754. pkey_assert(ptr != (void *)-1);
  755. mprotect_pkey(ptr, size, prot, pkey);
  756. record_pkey_malloc(ptr, size, prot);
  757. dprintf1("mmap()'d for pkey %d @ %p\n", pkey, ptr);
  758. close(fd);
  759. return ptr;
  760. }
/*
 * Table of allocation back-ends that malloc_pkey() picks from.  Every
 * entry shares the (size, prot, pkey) signature and returns the mapped
 * address.
 */
void *(*pkey_malloc[])(long size, int prot, u16 pkey) = {

	malloc_pkey_with_mprotect,
	malloc_pkey_anon_huge,
	malloc_pkey_hugetlb
/* can not do direct with the pkey_mprotect() API:
	malloc_pkey_mmap_direct,
	malloc_pkey_mmap_dax,
*/
};
  770. void *malloc_pkey(long size, int prot, u16 pkey)
  771. {
  772. void *ret;
  773. static int malloc_type;
  774. int nr_malloc_types = ARRAY_SIZE(pkey_malloc);
  775. pkey_assert(pkey < NR_PKEYS);
  776. while (1) {
  777. pkey_assert(malloc_type < nr_malloc_types);
  778. ret = pkey_malloc[malloc_type](size, prot, pkey);
  779. pkey_assert(ret != (void *)-1);
  780. malloc_type++;
  781. if (malloc_type >= nr_malloc_types)
  782. malloc_type = (random()%nr_malloc_types);
  783. /* try again if the malloc_type we tried is unsupported */
  784. if (ret == PTR_ERR_ENOTSUP)
  785. continue;
  786. break;
  787. }
  788. dprintf3("%s(%ld, prot=%x, pkey=%x) returning: %p\n", __func__,
  789. size, prot, pkey, ret);
  790. return ret;
  791. }
  792. int last_pkru_faults;
  793. #define UNKNOWN_PKEY -2
  794. void expected_pk_fault(int pkey)
  795. {
  796. dprintf2("%s(): last_pkru_faults: %d pkru_faults: %d\n",
  797. __func__, last_pkru_faults, pkru_faults);
  798. dprintf2("%s(%d): last_si_pkey: %d\n", __func__, pkey, last_si_pkey);
  799. pkey_assert(last_pkru_faults + 1 == pkru_faults);
  800. /*
  801. * For exec-only memory, we do not know the pkey in
  802. * advance, so skip this check.
  803. */
  804. if (pkey != UNKNOWN_PKEY)
  805. pkey_assert(last_si_pkey == pkey);
  806. /*
  807. * The signal handler shold have cleared out PKRU to let the
  808. * test program continue. We now have to restore it.
  809. */
  810. if (__rdpkru() != 0)
  811. pkey_assert(0);
  812. __wrpkru(shadow_pkru);
  813. dprintf1("%s() set PKRU=%x to restore state after signal nuked it\n",
  814. __func__, shadow_pkru);
  815. last_pkru_faults = pkru_faults;
  816. last_si_pkey = -1;
  817. }
/*
 * Assert that pkru_faults has not moved since last_pkru_faults was last
 * updated.  When it has, 'msg' is logged at debug level 0 before the
 * assertion fires, so the failure can be attributed to a test step.
 */
#define do_not_expect_pk_fault(msg)	do {			\
	if (last_pkru_faults != pkru_faults)			\
		dprintf0("unexpected PK fault: %s\n", msg);	\
	pkey_assert(last_pkru_faults == pkru_faults);		\
} while (0)
  823. int test_fds[10] = { -1 };
  824. int nr_test_fds;
  825. void __save_test_fd(int fd)
  826. {
  827. pkey_assert(fd >= 0);
  828. pkey_assert(nr_test_fds < ARRAY_SIZE(test_fds));
  829. test_fds[nr_test_fds] = fd;
  830. nr_test_fds++;
  831. }
  832. int get_test_read_fd(void)
  833. {
  834. int test_fd = open("/etc/passwd", O_RDONLY);
  835. __save_test_fd(test_fd);
  836. return test_fd;
  837. }
  838. void close_test_fds(void)
  839. {
  840. int i;
  841. for (i = 0; i < nr_test_fds; i++) {
  842. if (test_fds[i] < 0)
  843. continue;
  844. close(test_fds[i]);
  845. test_fds[i] = -1;
  846. }
  847. nr_test_fds = 0;
  848. }
  849. #define barrier() __asm__ __volatile__("": : :"memory")
  850. __attribute__((noinline)) int read_ptr(int *ptr)
  851. {
  852. /*
  853. * Keep GCC from optimizing this away somehow
  854. */
  855. barrier();
  856. return *ptr;
  857. }
  858. void test_read_of_write_disabled_region(int *ptr, u16 pkey)
  859. {
  860. int ptr_contents;
  861. dprintf1("disabling write access to PKEY[1], doing read\n");
  862. pkey_write_deny(pkey);
  863. ptr_contents = read_ptr(ptr);
  864. dprintf1("*ptr: %d\n", ptr_contents);
  865. dprintf1("\n");
  866. }
  867. void test_read_of_access_disabled_region(int *ptr, u16 pkey)
  868. {
  869. int ptr_contents;
  870. dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", pkey, ptr);
  871. rdpkru();
  872. pkey_access_deny(pkey);
  873. ptr_contents = read_ptr(ptr);
  874. dprintf1("*ptr: %d\n", ptr_contents);
  875. expected_pk_fault(pkey);
  876. }
/*
 * Deny writes through 'pkey', store to 'ptr' and require that the store
 * produced exactly one protection-key fault against 'pkey'.
 */
void test_write_of_write_disabled_region(int *ptr, u16 pkey)
{
	dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey);
	pkey_write_deny(pkey);
	/* The stored value is arbitrary; only the store itself matters. */
	*ptr = __LINE__;
	expected_pk_fault(pkey);
}
/*
 * Deny all access through 'pkey', store to 'ptr' and require that the
 * store produced exactly one protection-key fault against 'pkey'.
 */
void test_write_of_access_disabled_region(int *ptr, u16 pkey)
{
	dprintf1("disabling access to PKEY[%02d], doing write\n", pkey);
	pkey_access_deny(pkey);
	/* The stored value is arbitrary; only the store itself matters. */
	*ptr = __LINE__;
	expected_pk_fault(pkey);
}
  891. void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey)
  892. {
  893. int ret;
  894. int test_fd = get_test_read_fd();
  895. dprintf1("disabling access to PKEY[%02d], "
  896. "having kernel read() to buffer\n", pkey);
  897. pkey_access_deny(pkey);
  898. ret = read(test_fd, ptr, 1);
  899. dprintf1("read ret: %d\n", ret);
  900. pkey_assert(ret);
  901. }
  902. void test_kernel_write_of_write_disabled_region(int *ptr, u16 pkey)
  903. {
  904. int ret;
  905. int test_fd = get_test_read_fd();
  906. pkey_write_deny(pkey);
  907. ret = read(test_fd, ptr, 100);
  908. dprintf1("read ret: %d\n", ret);
  909. if (ret < 0 && (DEBUG_LEVEL > 0))
  910. perror("verbose read result (OK for this to be bad)");
  911. pkey_assert(ret);
  912. }
  913. void test_kernel_gup_of_access_disabled_region(int *ptr, u16 pkey)
  914. {
  915. int pipe_ret, vmsplice_ret;
  916. struct iovec iov;
  917. int pipe_fds[2];
  918. pipe_ret = pipe(pipe_fds);
  919. pkey_assert(pipe_ret == 0);
  920. dprintf1("disabling access to PKEY[%02d], "
  921. "having kernel vmsplice from buffer\n", pkey);
  922. pkey_access_deny(pkey);
  923. iov.iov_base = ptr;
  924. iov.iov_len = PAGE_SIZE;
  925. vmsplice_ret = vmsplice(pipe_fds[1], &iov, 1, SPLICE_F_GIFT);
  926. dprintf1("vmsplice() ret: %d\n", vmsplice_ret);
  927. pkey_assert(vmsplice_ret == -1);
  928. close(pipe_fds[0]);
  929. close(pipe_fds[1]);
  930. }
  931. void test_kernel_gup_write_to_write_disabled_region(int *ptr, u16 pkey)
  932. {
  933. int ignored = 0xdada;
  934. int futex_ret;
  935. int some_int = __LINE__;
  936. dprintf1("disabling write to PKEY[%02d], "
  937. "doing futex gunk in buffer\n", pkey);
  938. *ptr = some_int;
  939. pkey_write_deny(pkey);
  940. futex_ret = syscall(SYS_futex, ptr, FUTEX_WAIT, some_int-1, NULL,
  941. &ignored, ignored);
  942. if (DEBUG_LEVEL > 0)
  943. perror("futex");
  944. dprintf1("futex() ret: %d\n", futex_ret);
  945. }
  946. /* Assumes that all pkeys other than 'pkey' are unallocated */
  947. void test_pkey_syscalls_on_non_allocated_pkey(int *ptr, u16 pkey)
  948. {
  949. int err;
  950. int i;
  951. /* Note: 0 is the default pkey, so don't mess with it */
  952. for (i = 1; i < NR_PKEYS; i++) {
  953. if (pkey == i)
  954. continue;
  955. dprintf1("trying get/set/free to non-allocated pkey: %2d\n", i);
  956. err = sys_pkey_free(i);
  957. pkey_assert(err);
  958. err = sys_pkey_free(i);
  959. pkey_assert(err);
  960. err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, i);
  961. pkey_assert(err);
  962. }
  963. }
  964. /* Assumes that all pkeys other than 'pkey' are unallocated */
  965. void test_pkey_syscalls_bad_args(int *ptr, u16 pkey)
  966. {
  967. int err;
  968. int bad_pkey = NR_PKEYS+99;
  969. /* pass a known-invalid pkey in: */
  970. err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, bad_pkey);
  971. pkey_assert(err);
  972. }
  973. void become_child(void)
  974. {
  975. pid_t forkret;
  976. forkret = fork();
  977. pkey_assert(forkret >= 0);
  978. dprintf3("[%d] fork() ret: %d\n", getpid(), forkret);
  979. if (!forkret) {
  980. /* in the child */
  981. return;
  982. }
  983. exit(0);
  984. }
  985. /* Assumes that all pkeys other than 'pkey' are unallocated */
  986. void test_pkey_alloc_exhaust(int *ptr, u16 pkey)
  987. {
  988. int err;
  989. int allocated_pkeys[NR_PKEYS] = {0};
  990. int nr_allocated_pkeys = 0;
  991. int i;
  992. for (i = 0; i < NR_PKEYS*3; i++) {
  993. int new_pkey;
  994. dprintf1("%s() alloc loop: %d\n", __func__, i);
  995. new_pkey = alloc_pkey();
  996. dprintf4("%s()::%d, err: %d pkru: 0x%x shadow: 0x%x\n", __func__,
  997. __LINE__, err, __rdpkru(), shadow_pkru);
  998. rdpkru(); /* for shadow checking */
  999. dprintf2("%s() errno: %d ENOSPC: %d\n", __func__, errno, ENOSPC);
  1000. if ((new_pkey == -1) && (errno == ENOSPC)) {
  1001. dprintf2("%s() failed to allocate pkey after %d tries\n",
  1002. __func__, nr_allocated_pkeys);
  1003. } else {
  1004. /*
  1005. * Ensure the number of successes never
  1006. * exceeds the number of keys supported
  1007. * in the hardware.
  1008. */
  1009. pkey_assert(nr_allocated_pkeys < NR_PKEYS);
  1010. allocated_pkeys[nr_allocated_pkeys++] = new_pkey;
  1011. }
  1012. /*
  1013. * Make sure that allocation state is properly
  1014. * preserved across fork().
  1015. */
  1016. if (i == NR_PKEYS*2)
  1017. become_child();
  1018. }
  1019. dprintf3("%s()::%d\n", __func__, __LINE__);
  1020. /*
  1021. * There are 16 pkeys supported in hardware. Three are
  1022. * allocated by the time we get here:
  1023. * 1. The default key (0)
  1024. * 2. One possibly consumed by an execute-only mapping.
  1025. * 3. One allocated by the test code and passed in via
  1026. * 'pkey' to this function.
  1027. * Ensure that we can allocate at least another 13 (16-3).
  1028. */
  1029. pkey_assert(i >= NR_PKEYS-3);
  1030. for (i = 0; i < nr_allocated_pkeys; i++) {
  1031. err = sys_pkey_free(allocated_pkeys[i]);
  1032. pkey_assert(!err);
  1033. rdpkru(); /* for shadow checking */
  1034. }
  1035. }
  1036. /*
  1037. * pkey 0 is special. It is allocated by default, so you do not
  1038. * have to call pkey_alloc() to use it first. Make sure that it
  1039. * is usable.
  1040. */
  1041. void test_mprotect_with_pkey_0(int *ptr, u16 pkey)
  1042. {
  1043. long size;
  1044. int prot;
  1045. assert(pkey_last_malloc_record);
  1046. size = pkey_last_malloc_record->size;
  1047. /*
  1048. * This is a bit of a hack. But mprotect() requires
  1049. * huge-page-aligned sizes when operating on hugetlbfs.
  1050. * So, make sure that we use something that's a multiple
  1051. * of a huge page when we can.
  1052. */
  1053. if (size >= HPAGE_SIZE)
  1054. size = HPAGE_SIZE;
  1055. prot = pkey_last_malloc_record->prot;
  1056. /* Use pkey 0 */
  1057. mprotect_pkey(ptr, size, prot, 0);
  1058. /* Make sure that we can set it back to the original pkey. */
  1059. mprotect_pkey(ptr, size, prot, pkey);
  1060. }
  1061. void test_ptrace_of_child(int *ptr, u16 pkey)
  1062. {
  1063. __attribute__((__unused__)) int peek_result;
  1064. pid_t child_pid;
  1065. void *ignored = 0;
  1066. long ret;
  1067. int status;
  1068. /*
  1069. * This is the "control" for our little expermient. Make sure
  1070. * we can always access it when ptracing.
  1071. */
  1072. int *plain_ptr_unaligned = malloc(HPAGE_SIZE);
  1073. int *plain_ptr = ALIGN_PTR_UP(plain_ptr_unaligned, PAGE_SIZE);
  1074. /*
  1075. * Fork a child which is an exact copy of this process, of course.
  1076. * That means we can do all of our tests via ptrace() and then plain
  1077. * memory access and ensure they work differently.
  1078. */
  1079. child_pid = fork_lazy_child();
  1080. dprintf1("[%d] child pid: %d\n", getpid(), child_pid);
  1081. ret = ptrace(PTRACE_ATTACH, child_pid, ignored, ignored);
  1082. if (ret)
  1083. perror("attach");
  1084. dprintf1("[%d] attach ret: %ld %d\n", getpid(), ret, __LINE__);
  1085. pkey_assert(ret != -1);
  1086. ret = waitpid(child_pid, &status, WUNTRACED);
  1087. if ((ret != child_pid) || !(WIFSTOPPED(status))) {
  1088. fprintf(stderr, "weird waitpid result %ld stat %x\n",
  1089. ret, status);
  1090. pkey_assert(0);
  1091. }
  1092. dprintf2("waitpid ret: %ld\n", ret);
  1093. dprintf2("waitpid status: %d\n", status);
  1094. pkey_access_deny(pkey);
  1095. pkey_write_deny(pkey);
  1096. /* Write access, untested for now:
  1097. ret = ptrace(PTRACE_POKEDATA, child_pid, peek_at, data);
  1098. pkey_assert(ret != -1);
  1099. dprintf1("poke at %p: %ld\n", peek_at, ret);
  1100. */
  1101. /*
  1102. * Try to access the pkey-protected "ptr" via ptrace:
  1103. */
  1104. ret = ptrace(PTRACE_PEEKDATA, child_pid, ptr, ignored);
  1105. /* expect it to work, without an error: */
  1106. pkey_assert(ret != -1);
  1107. /* Now access from the current task, and expect an exception: */
  1108. peek_result = read_ptr(ptr);
  1109. expected_pk_fault(pkey);
  1110. /*
  1111. * Try to access the NON-pkey-protected "plain_ptr" via ptrace:
  1112. */
  1113. ret = ptrace(PTRACE_PEEKDATA, child_pid, plain_ptr, ignored);
  1114. /* expect it to work, without an error: */
  1115. pkey_assert(ret != -1);
  1116. /* Now access from the current task, and expect NO exception: */
  1117. peek_result = read_ptr(plain_ptr);
  1118. do_not_expect_pk_fault("read plain pointer after ptrace");
  1119. ret = ptrace(PTRACE_DETACH, child_pid, ignored, 0);
  1120. pkey_assert(ret != -1);
  1121. ret = kill(child_pid, SIGKILL);
  1122. pkey_assert(ret != -1);
  1123. wait(&status);
  1124. free(plain_ptr_unaligned);
  1125. }
  1126. void *get_pointer_to_instructions(void)
  1127. {
  1128. void *p1;
  1129. p1 = ALIGN_PTR_UP(&lots_o_noops_around_write, PAGE_SIZE);
  1130. dprintf3("&lots_o_noops: %p\n", &lots_o_noops_around_write);
  1131. /* lots_o_noops_around_write should be page-aligned already */
  1132. assert(p1 == &lots_o_noops_around_write);
  1133. /* Point 'p1' at the *second* page of the function: */
  1134. p1 += PAGE_SIZE;
  1135. /*
  1136. * Try to ensure we fault this in on next touch to ensure
  1137. * we get an instruction fault as opposed to a data one
  1138. */
  1139. madvise(p1, PAGE_SIZE, MADV_DONTNEED);
  1140. return p1;
  1141. }
  1142. void test_executing_on_unreadable_memory(int *ptr, u16 pkey)
  1143. {
  1144. void *p1;
  1145. int scratch;
  1146. int ptr_contents;
  1147. int ret;
  1148. p1 = get_pointer_to_instructions();
  1149. lots_o_noops_around_write(&scratch);
  1150. ptr_contents = read_ptr(p1);
  1151. dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
  1152. ret = mprotect_pkey(p1, PAGE_SIZE, PROT_EXEC, (u64)pkey);
  1153. pkey_assert(!ret);
  1154. pkey_access_deny(pkey);
  1155. dprintf2("pkru: %x\n", rdpkru());
  1156. /*
  1157. * Make sure this is an *instruction* fault
  1158. */
  1159. madvise(p1, PAGE_SIZE, MADV_DONTNEED);
  1160. lots_o_noops_around_write(&scratch);
  1161. do_not_expect_pk_fault("executing on PROT_EXEC memory");
  1162. ptr_contents = read_ptr(p1);
  1163. dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
  1164. expected_pk_fault(pkey);
  1165. }
  1166. void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey)
  1167. {
  1168. void *p1;
  1169. int scratch;
  1170. int ptr_contents;
  1171. int ret;
  1172. dprintf1("%s() start\n", __func__);
  1173. p1 = get_pointer_to_instructions();
  1174. lots_o_noops_around_write(&scratch);
  1175. ptr_contents = read_ptr(p1);
  1176. dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
  1177. /* Use a *normal* mprotect(), not mprotect_pkey(): */
  1178. ret = mprotect(p1, PAGE_SIZE, PROT_EXEC);
  1179. pkey_assert(!ret);
  1180. dprintf2("pkru: %x\n", rdpkru());
  1181. /* Make sure this is an *instruction* fault */
  1182. madvise(p1, PAGE_SIZE, MADV_DONTNEED);
  1183. lots_o_noops_around_write(&scratch);
  1184. do_not_expect_pk_fault("executing on PROT_EXEC memory");
  1185. ptr_contents = read_ptr(p1);
  1186. dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
  1187. expected_pk_fault(UNKNOWN_PKEY);
  1188. /*
  1189. * Put the memory back to non-PROT_EXEC. Should clear the
  1190. * exec-only pkey off the VMA and allow it to be readable
  1191. * again. Go to PROT_NONE first to check for a kernel bug
  1192. * that did not clear the pkey when doing PROT_NONE.
  1193. */
  1194. ret = mprotect(p1, PAGE_SIZE, PROT_NONE);
  1195. pkey_assert(!ret);
  1196. ret = mprotect(p1, PAGE_SIZE, PROT_READ|PROT_EXEC);
  1197. pkey_assert(!ret);
  1198. ptr_contents = read_ptr(p1);
  1199. do_not_expect_pk_fault("plain read on recently PROT_EXEC area");
  1200. }
  1201. void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey)
  1202. {
  1203. int size = PAGE_SIZE;
  1204. int sret;
  1205. if (cpu_has_pku()) {
  1206. dprintf1("SKIP: %s: no CPU support\n", __func__);
  1207. return;
  1208. }
  1209. sret = syscall(SYS_mprotect_key, ptr, size, PROT_READ, pkey);
  1210. pkey_assert(sret < 0);
  1211. }
/*
 * The tests executed by run_tests_once(), in this order.  Each one is
 * called with a freshly allocated buffer and the pkey protecting it.
 */
void (*pkey_tests[])(int *ptr, u16 pkey) = {
	test_read_of_write_disabled_region,
	test_read_of_access_disabled_region,
	test_write_of_write_disabled_region,
	test_write_of_access_disabled_region,
	test_kernel_write_of_access_disabled_region,
	test_kernel_write_of_write_disabled_region,
	test_kernel_gup_of_access_disabled_region,
	test_kernel_gup_write_to_write_disabled_region,
	test_executing_on_unreadable_memory,
	test_implicit_mprotect_exec_only_memory,
	test_mprotect_with_pkey_0,
	test_ptrace_of_child,
	test_pkey_syscalls_on_non_allocated_pkey,
	test_pkey_syscalls_bad_args,
	test_pkey_alloc_exhaust,
};
  1229. void run_tests_once(void)
  1230. {
  1231. int *ptr;
  1232. int prot = PROT_READ|PROT_WRITE;
  1233. for (test_nr = 0; test_nr < ARRAY_SIZE(pkey_tests); test_nr++) {
  1234. int pkey;
  1235. int orig_pkru_faults = pkru_faults;
  1236. dprintf1("======================\n");
  1237. dprintf1("test %d preparing...\n", test_nr);
  1238. tracing_on();
  1239. pkey = alloc_random_pkey();
  1240. dprintf1("test %d starting with pkey: %d\n", test_nr, pkey);
  1241. ptr = malloc_pkey(PAGE_SIZE, prot, pkey);
  1242. dprintf1("test %d starting...\n", test_nr);
  1243. pkey_tests[test_nr](ptr, pkey);
  1244. dprintf1("freeing test memory: %p\n", ptr);
  1245. free_pkey_malloc(ptr);
  1246. sys_pkey_free(pkey);
  1247. dprintf1("pkru_faults: %d\n", pkru_faults);
  1248. dprintf1("orig_pkru_faults: %d\n", orig_pkru_faults);
  1249. tracing_off();
  1250. close_test_fds();
  1251. printf("test %2d PASSED (iteration %d)\n", test_nr, iteration_nr);
  1252. dprintf1("======================\n\n");
  1253. }
  1254. iteration_nr++;
  1255. }
/*
 * Seed shadow_pkru with the value currently returned by __rdpkru().
 */
void pkey_setup_shadow(void)
{
	shadow_pkru = __rdpkru();
}
  1260. int main(void)
  1261. {
  1262. int nr_iterations = 22;
  1263. setup_handlers();
  1264. printf("has pku: %d\n", cpu_has_pku());
  1265. if (!cpu_has_pku()) {
  1266. int size = PAGE_SIZE;
  1267. int *ptr;
  1268. printf("running PKEY tests for unsupported CPU/OS\n");
  1269. ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
  1270. assert(ptr != (void *)-1);
  1271. test_mprotect_pkey_on_unsupported_cpu(ptr, 1);
  1272. exit(0);
  1273. }
  1274. pkey_setup_shadow();
  1275. printf("startup pkru: %x\n", rdpkru());
  1276. setup_hugetlbfs();
  1277. while (nr_iterations-- > 0)
  1278. run_tests_once();
  1279. printf("done (all tests OK)\n");
  1280. return 0;
  1281. }