kexec_core.c 27 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * kexec.c - kexec system call core code.
  4. * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
  5. */
  6. #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  7. #include <linux/btf.h>
  8. #include <linux/capability.h>
  9. #include <linux/mm.h>
  10. #include <linux/file.h>
  11. #include <linux/slab.h>
  12. #include <linux/fs.h>
  13. #include <linux/kexec.h>
  14. #include <linux/mutex.h>
  15. #include <linux/list.h>
  16. #include <linux/highmem.h>
  17. #include <linux/syscalls.h>
  18. #include <linux/reboot.h>
  19. #include <linux/ioport.h>
  20. #include <linux/hardirq.h>
  21. #include <linux/elf.h>
  22. #include <linux/elfcore.h>
  23. #include <linux/utsname.h>
  24. #include <linux/numa.h>
  25. #include <linux/suspend.h>
  26. #include <linux/device.h>
  27. #include <linux/freezer.h>
  28. #include <linux/panic_notifier.h>
  29. #include <linux/pm.h>
  30. #include <linux/cpu.h>
  31. #include <linux/uaccess.h>
  32. #include <linux/io.h>
  33. #include <linux/console.h>
  34. #include <linux/vmalloc.h>
  35. #include <linux/swap.h>
  36. #include <linux/syscore_ops.h>
  37. #include <linux/compiler.h>
  38. #include <linux/hugetlb.h>
  39. #include <linux/objtool.h>
  40. #include <linux/kmsg_dump.h>
  41. #include <asm/page.h>
  42. #include <asm/sections.h>
  43. #include <crypto/hash.h>
  44. #include "kexec_internal.h"
  45. atomic_t __kexec_lock = ATOMIC_INIT(0);
  46. /* Flag to indicate we are going to kexec a new kernel */
  47. bool kexec_in_progress = false;
  48. bool kexec_file_dbg_print;
  49. /*
  50. * When kexec transitions to the new kernel there is a one-to-one
  51. * mapping between physical and virtual addresses. On processors
  52. * where you can disable the MMU this is trivial, and easy. For
  53. * others it is still a simple predictable page table to setup.
  54. *
  55. * In that environment kexec copies the new kernel to its final
  56. * resting place. This means I can only support memory whose
  57. * physical address can fit in an unsigned long. In particular
  58. * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
  59. * If the assembly stub has more restrictive requirements
  60. * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
  61. * defined more restrictively in <asm/kexec.h>.
  62. *
  63. * The code for the transition from the current kernel to the
  64. * new kernel is placed in the control_code_buffer, whose size
  65. * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single
  66. * page of memory is necessary, but some architectures require more.
  67. * Because this memory must be identity mapped in the transition from
  68. * virtual to physical addresses it must live in the range
  69. * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
  70. * modifiable.
  71. *
  72. * The assembly stub in the control code buffer is passed a linked list
  73. * of descriptor pages detailing the source pages of the new kernel,
  74. * and the destination addresses of those source pages. As this data
  75. * structure is not used in the context of the current OS, it must
  76. * be self-contained.
  77. *
  78. * The code has been made to work with highmem pages and will use a
  79. * destination page in its final resting place (if it happens
  80. * to allocate it). The end product of this is that most of the
  81. * physical address space, and most of RAM can be used.
  82. *
  83. * Future directions include:
  84. * - allocating a page table with the control code buffer identity
  85. * mapped, to simplify machine_kexec and make kexec_on_panic more
  86. * reliable.
  87. */
  88. /*
  89. * KIMAGE_NO_DEST is an impossible destination address..., for
  90. * allocating pages whose destination address we do not care about.
  91. */
  92. #define KIMAGE_NO_DEST (-1UL)
  93. #define PAGE_COUNT(x) (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
  94. static struct page *kimage_alloc_page(struct kimage *image,
  95. gfp_t gfp_mask,
  96. unsigned long dest);
  97. int sanity_check_segment_list(struct kimage *image)
  98. {
  99. int i;
  100. unsigned long nr_segments = image->nr_segments;
  101. unsigned long total_pages = 0;
  102. unsigned long nr_pages = totalram_pages();
  103. /*
  104. * Verify we have good destination addresses. The caller is
  105. * responsible for making certain we don't attempt to load
  106. * the new image into invalid or reserved areas of RAM. This
  107. * just verifies it is an address we can use.
  108. *
  109. * Since the kernel does everything in page size chunks ensure
  110. * the destination addresses are page aligned. Too many
  111. * special cases crop of when we don't do this. The most
  112. * insidious is getting overlapping destination addresses
  113. * simply because addresses are changed to page size
  114. * granularity.
  115. */
  116. for (i = 0; i < nr_segments; i++) {
  117. unsigned long mstart, mend;
  118. mstart = image->segment[i].mem;
  119. mend = mstart + image->segment[i].memsz;
  120. if (mstart > mend)
  121. return -EADDRNOTAVAIL;
  122. if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
  123. return -EADDRNOTAVAIL;
  124. if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
  125. return -EADDRNOTAVAIL;
  126. }
  127. /* Verify our destination addresses do not overlap.
  128. * If we alloed overlapping destination addresses
  129. * through very weird things can happen with no
  130. * easy explanation as one segment stops on another.
  131. */
  132. for (i = 0; i < nr_segments; i++) {
  133. unsigned long mstart, mend;
  134. unsigned long j;
  135. mstart = image->segment[i].mem;
  136. mend = mstart + image->segment[i].memsz;
  137. for (j = 0; j < i; j++) {
  138. unsigned long pstart, pend;
  139. pstart = image->segment[j].mem;
  140. pend = pstart + image->segment[j].memsz;
  141. /* Do the segments overlap ? */
  142. if ((mend > pstart) && (mstart < pend))
  143. return -EINVAL;
  144. }
  145. }
  146. /* Ensure our buffer sizes are strictly less than
  147. * our memory sizes. This should always be the case,
  148. * and it is easier to check up front than to be surprised
  149. * later on.
  150. */
  151. for (i = 0; i < nr_segments; i++) {
  152. if (image->segment[i].bufsz > image->segment[i].memsz)
  153. return -EINVAL;
  154. }
  155. /*
  156. * Verify that no more than half of memory will be consumed. If the
  157. * request from userspace is too large, a large amount of time will be
  158. * wasted allocating pages, which can cause a soft lockup.
  159. */
  160. for (i = 0; i < nr_segments; i++) {
  161. if (PAGE_COUNT(image->segment[i].memsz) > nr_pages / 2)
  162. return -EINVAL;
  163. total_pages += PAGE_COUNT(image->segment[i].memsz);
  164. }
  165. if (total_pages > nr_pages / 2)
  166. return -EINVAL;
  167. #ifdef CONFIG_CRASH_DUMP
  168. /*
  169. * Verify we have good destination addresses. Normally
  170. * the caller is responsible for making certain we don't
  171. * attempt to load the new image into invalid or reserved
  172. * areas of RAM. But crash kernels are preloaded into a
  173. * reserved area of ram. We must ensure the addresses
  174. * are in the reserved area otherwise preloading the
  175. * kernel could corrupt things.
  176. */
  177. if (image->type == KEXEC_TYPE_CRASH) {
  178. for (i = 0; i < nr_segments; i++) {
  179. unsigned long mstart, mend;
  180. mstart = image->segment[i].mem;
  181. mend = mstart + image->segment[i].memsz - 1;
  182. /* Ensure we are within the crash kernel limits */
  183. if ((mstart < phys_to_boot_phys(crashk_res.start)) ||
  184. (mend > phys_to_boot_phys(crashk_res.end)))
  185. return -EADDRNOTAVAIL;
  186. }
  187. }
  188. #endif
  189. return 0;
  190. }
  191. struct kimage *do_kimage_alloc_init(void)
  192. {
  193. struct kimage *image;
  194. /* Allocate a controlling structure */
  195. image = kzalloc(sizeof(*image), GFP_KERNEL);
  196. if (!image)
  197. return NULL;
  198. image->head = 0;
  199. image->entry = &image->head;
  200. image->last_entry = &image->head;
  201. image->control_page = ~0; /* By default this does not apply */
  202. image->type = KEXEC_TYPE_DEFAULT;
  203. /* Initialize the list of control pages */
  204. INIT_LIST_HEAD(&image->control_pages);
  205. /* Initialize the list of destination pages */
  206. INIT_LIST_HEAD(&image->dest_pages);
  207. /* Initialize the list of unusable pages */
  208. INIT_LIST_HEAD(&image->unusable_pages);
  209. #ifdef CONFIG_CRASH_HOTPLUG
  210. image->hp_action = KEXEC_CRASH_HP_NONE;
  211. image->elfcorehdr_index = -1;
  212. image->elfcorehdr_updated = false;
  213. #endif
  214. return image;
  215. }
  216. int kimage_is_destination_range(struct kimage *image,
  217. unsigned long start,
  218. unsigned long end)
  219. {
  220. unsigned long i;
  221. for (i = 0; i < image->nr_segments; i++) {
  222. unsigned long mstart, mend;
  223. mstart = image->segment[i].mem;
  224. mend = mstart + image->segment[i].memsz - 1;
  225. if ((end >= mstart) && (start <= mend))
  226. return 1;
  227. }
  228. return 0;
  229. }
  230. static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
  231. {
  232. struct page *pages;
  233. if (fatal_signal_pending(current))
  234. return NULL;
  235. pages = alloc_pages(gfp_mask & ~__GFP_ZERO, order);
  236. if (pages) {
  237. unsigned int count, i;
  238. pages->mapping = NULL;
  239. set_page_private(pages, order);
  240. count = 1 << order;
  241. for (i = 0; i < count; i++)
  242. SetPageReserved(pages + i);
  243. arch_kexec_post_alloc_pages(page_address(pages), count,
  244. gfp_mask);
  245. if (gfp_mask & __GFP_ZERO)
  246. for (i = 0; i < count; i++)
  247. clear_highpage(pages + i);
  248. }
  249. return pages;
  250. }
  251. static void kimage_free_pages(struct page *page)
  252. {
  253. unsigned int order, count, i;
  254. order = page_private(page);
  255. count = 1 << order;
  256. arch_kexec_pre_free_pages(page_address(page), count);
  257. for (i = 0; i < count; i++)
  258. ClearPageReserved(page + i);
  259. __free_pages(page, order);
  260. }
  261. void kimage_free_page_list(struct list_head *list)
  262. {
  263. struct page *page, *next;
  264. list_for_each_entry_safe(page, next, list, lru) {
  265. list_del(&page->lru);
  266. kimage_free_pages(page);
  267. }
  268. }
  269. static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
  270. unsigned int order)
  271. {
  272. /* Control pages are special, they are the intermediaries
  273. * that are needed while we copy the rest of the pages
  274. * to their final resting place. As such they must
  275. * not conflict with either the destination addresses
  276. * or memory the kernel is already using.
  277. *
  278. * The only case where we really need more than one of
  279. * these are for architectures where we cannot disable
  280. * the MMU and must instead generate an identity mapped
  281. * page table for all of the memory.
  282. *
  283. * At worst this runs in O(N) of the image size.
  284. */
  285. struct list_head extra_pages;
  286. struct page *pages;
  287. unsigned int count;
  288. count = 1 << order;
  289. INIT_LIST_HEAD(&extra_pages);
  290. /* Loop while I can allocate a page and the page allocated
  291. * is a destination page.
  292. */
  293. do {
  294. unsigned long pfn, epfn, addr, eaddr;
  295. pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
  296. if (!pages)
  297. break;
  298. pfn = page_to_boot_pfn(pages);
  299. epfn = pfn + count;
  300. addr = pfn << PAGE_SHIFT;
  301. eaddr = (epfn << PAGE_SHIFT) - 1;
  302. if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
  303. kimage_is_destination_range(image, addr, eaddr)) {
  304. list_add(&pages->lru, &extra_pages);
  305. pages = NULL;
  306. }
  307. } while (!pages);
  308. if (pages) {
  309. /* Remember the allocated page... */
  310. list_add(&pages->lru, &image->control_pages);
  311. /* Because the page is already in it's destination
  312. * location we will never allocate another page at
  313. * that address. Therefore kimage_alloc_pages
  314. * will not return it (again) and we don't need
  315. * to give it an entry in image->segment[].
  316. */
  317. }
  318. /* Deal with the destination pages I have inadvertently allocated.
  319. *
  320. * Ideally I would convert multi-page allocations into single
  321. * page allocations, and add everything to image->dest_pages.
  322. *
  323. * For now it is simpler to just free the pages.
  324. */
  325. kimage_free_page_list(&extra_pages);
  326. return pages;
  327. }
  328. #ifdef CONFIG_CRASH_DUMP
  329. static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
  330. unsigned int order)
  331. {
  332. /* Control pages are special, they are the intermediaries
  333. * that are needed while we copy the rest of the pages
  334. * to their final resting place. As such they must
  335. * not conflict with either the destination addresses
  336. * or memory the kernel is already using.
  337. *
  338. * Control pages are also the only pags we must allocate
  339. * when loading a crash kernel. All of the other pages
  340. * are specified by the segments and we just memcpy
  341. * into them directly.
  342. *
  343. * The only case where we really need more than one of
  344. * these are for architectures where we cannot disable
  345. * the MMU and must instead generate an identity mapped
  346. * page table for all of the memory.
  347. *
  348. * Given the low demand this implements a very simple
  349. * allocator that finds the first hole of the appropriate
  350. * size in the reserved memory region, and allocates all
  351. * of the memory up to and including the hole.
  352. */
  353. unsigned long hole_start, hole_end, size;
  354. struct page *pages;
  355. pages = NULL;
  356. size = (1 << order) << PAGE_SHIFT;
  357. hole_start = ALIGN(image->control_page, size);
  358. hole_end = hole_start + size - 1;
  359. while (hole_end <= crashk_res.end) {
  360. unsigned long i;
  361. cond_resched();
  362. if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
  363. break;
  364. /* See if I overlap any of the segments */
  365. for (i = 0; i < image->nr_segments; i++) {
  366. unsigned long mstart, mend;
  367. mstart = image->segment[i].mem;
  368. mend = mstart + image->segment[i].memsz - 1;
  369. if ((hole_end >= mstart) && (hole_start <= mend)) {
  370. /* Advance the hole to the end of the segment */
  371. hole_start = ALIGN(mend, size);
  372. hole_end = hole_start + size - 1;
  373. break;
  374. }
  375. }
  376. /* If I don't overlap any segments I have found my hole! */
  377. if (i == image->nr_segments) {
  378. pages = pfn_to_page(hole_start >> PAGE_SHIFT);
  379. image->control_page = hole_end + 1;
  380. break;
  381. }
  382. }
  383. /* Ensure that these pages are decrypted if SME is enabled. */
  384. if (pages)
  385. arch_kexec_post_alloc_pages(page_address(pages), 1 << order, 0);
  386. return pages;
  387. }
  388. #endif
  389. struct page *kimage_alloc_control_pages(struct kimage *image,
  390. unsigned int order)
  391. {
  392. struct page *pages = NULL;
  393. switch (image->type) {
  394. case KEXEC_TYPE_DEFAULT:
  395. pages = kimage_alloc_normal_control_pages(image, order);
  396. break;
  397. #ifdef CONFIG_CRASH_DUMP
  398. case KEXEC_TYPE_CRASH:
  399. pages = kimage_alloc_crash_control_pages(image, order);
  400. break;
  401. #endif
  402. }
  403. return pages;
  404. }
  405. static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
  406. {
  407. if (*image->entry != 0)
  408. image->entry++;
  409. if (image->entry == image->last_entry) {
  410. kimage_entry_t *ind_page;
  411. struct page *page;
  412. page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
  413. if (!page)
  414. return -ENOMEM;
  415. ind_page = page_address(page);
  416. *image->entry = virt_to_boot_phys(ind_page) | IND_INDIRECTION;
  417. image->entry = ind_page;
  418. image->last_entry = ind_page +
  419. ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
  420. }
  421. *image->entry = entry;
  422. image->entry++;
  423. *image->entry = 0;
  424. return 0;
  425. }
  426. static int kimage_set_destination(struct kimage *image,
  427. unsigned long destination)
  428. {
  429. destination &= PAGE_MASK;
  430. return kimage_add_entry(image, destination | IND_DESTINATION);
  431. }
  432. static int kimage_add_page(struct kimage *image, unsigned long page)
  433. {
  434. page &= PAGE_MASK;
  435. return kimage_add_entry(image, page | IND_SOURCE);
  436. }
  437. static void kimage_free_extra_pages(struct kimage *image)
  438. {
  439. /* Walk through and free any extra destination pages I may have */
  440. kimage_free_page_list(&image->dest_pages);
  441. /* Walk through and free any unusable pages I have cached */
  442. kimage_free_page_list(&image->unusable_pages);
  443. }
  444. void kimage_terminate(struct kimage *image)
  445. {
  446. if (*image->entry != 0)
  447. image->entry++;
  448. *image->entry = IND_DONE;
  449. }
  450. #define for_each_kimage_entry(image, ptr, entry) \
  451. for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
  452. ptr = (entry & IND_INDIRECTION) ? \
  453. boot_phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
  454. static void kimage_free_entry(kimage_entry_t entry)
  455. {
  456. struct page *page;
  457. page = boot_pfn_to_page(entry >> PAGE_SHIFT);
  458. kimage_free_pages(page);
  459. }
  460. void kimage_free(struct kimage *image)
  461. {
  462. kimage_entry_t *ptr, entry;
  463. kimage_entry_t ind = 0;
  464. if (!image)
  465. return;
  466. #ifdef CONFIG_CRASH_DUMP
  467. if (image->vmcoreinfo_data_copy) {
  468. crash_update_vmcoreinfo_safecopy(NULL);
  469. vunmap(image->vmcoreinfo_data_copy);
  470. }
  471. #endif
  472. kimage_free_extra_pages(image);
  473. for_each_kimage_entry(image, ptr, entry) {
  474. if (entry & IND_INDIRECTION) {
  475. /* Free the previous indirection page */
  476. if (ind & IND_INDIRECTION)
  477. kimage_free_entry(ind);
  478. /* Save this indirection page until we are
  479. * done with it.
  480. */
  481. ind = entry;
  482. } else if (entry & IND_SOURCE)
  483. kimage_free_entry(entry);
  484. }
  485. /* Free the final indirection page */
  486. if (ind & IND_INDIRECTION)
  487. kimage_free_entry(ind);
  488. /* Handle any machine specific cleanup */
  489. machine_kexec_cleanup(image);
  490. /* Free the kexec control pages... */
  491. kimage_free_page_list(&image->control_pages);
  492. /*
  493. * Free up any temporary buffers allocated. This might hit if
  494. * error occurred much later after buffer allocation.
  495. */
  496. if (image->file_mode)
  497. kimage_file_post_load_cleanup(image);
  498. kfree(image);
  499. }
  500. static kimage_entry_t *kimage_dst_used(struct kimage *image,
  501. unsigned long page)
  502. {
  503. kimage_entry_t *ptr, entry;
  504. unsigned long destination = 0;
  505. for_each_kimage_entry(image, ptr, entry) {
  506. if (entry & IND_DESTINATION)
  507. destination = entry & PAGE_MASK;
  508. else if (entry & IND_SOURCE) {
  509. if (page == destination)
  510. return ptr;
  511. destination += PAGE_SIZE;
  512. }
  513. }
  514. return NULL;
  515. }
  516. static struct page *kimage_alloc_page(struct kimage *image,
  517. gfp_t gfp_mask,
  518. unsigned long destination)
  519. {
  520. /*
  521. * Here we implement safeguards to ensure that a source page
  522. * is not copied to its destination page before the data on
  523. * the destination page is no longer useful.
  524. *
  525. * To do this we maintain the invariant that a source page is
  526. * either its own destination page, or it is not a
  527. * destination page at all.
  528. *
  529. * That is slightly stronger than required, but the proof
  530. * that no problems will not occur is trivial, and the
  531. * implementation is simply to verify.
  532. *
  533. * When allocating all pages normally this algorithm will run
  534. * in O(N) time, but in the worst case it will run in O(N^2)
  535. * time. If the runtime is a problem the data structures can
  536. * be fixed.
  537. */
  538. struct page *page;
  539. unsigned long addr;
  540. /*
  541. * Walk through the list of destination pages, and see if I
  542. * have a match.
  543. */
  544. list_for_each_entry(page, &image->dest_pages, lru) {
  545. addr = page_to_boot_pfn(page) << PAGE_SHIFT;
  546. if (addr == destination) {
  547. list_del(&page->lru);
  548. return page;
  549. }
  550. }
  551. page = NULL;
  552. while (1) {
  553. kimage_entry_t *old;
  554. /* Allocate a page, if we run out of memory give up */
  555. page = kimage_alloc_pages(gfp_mask, 0);
  556. if (!page)
  557. return NULL;
  558. /* If the page cannot be used file it away */
  559. if (page_to_boot_pfn(page) >
  560. (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
  561. list_add(&page->lru, &image->unusable_pages);
  562. continue;
  563. }
  564. addr = page_to_boot_pfn(page) << PAGE_SHIFT;
  565. /* If it is the destination page we want use it */
  566. if (addr == destination)
  567. break;
  568. /* If the page is not a destination page use it */
  569. if (!kimage_is_destination_range(image, addr,
  570. addr + PAGE_SIZE - 1))
  571. break;
  572. /*
  573. * I know that the page is someones destination page.
  574. * See if there is already a source page for this
  575. * destination page. And if so swap the source pages.
  576. */
  577. old = kimage_dst_used(image, addr);
  578. if (old) {
  579. /* If so move it */
  580. unsigned long old_addr;
  581. struct page *old_page;
  582. old_addr = *old & PAGE_MASK;
  583. old_page = boot_pfn_to_page(old_addr >> PAGE_SHIFT);
  584. copy_highpage(page, old_page);
  585. *old = addr | (*old & ~PAGE_MASK);
  586. /* The old page I have found cannot be a
  587. * destination page, so return it if it's
  588. * gfp_flags honor the ones passed in.
  589. */
  590. if (!(gfp_mask & __GFP_HIGHMEM) &&
  591. PageHighMem(old_page)) {
  592. kimage_free_pages(old_page);
  593. continue;
  594. }
  595. page = old_page;
  596. break;
  597. }
  598. /* Place the page on the destination list, to be used later */
  599. list_add(&page->lru, &image->dest_pages);
  600. }
  601. return page;
  602. }
  603. static int kimage_load_normal_segment(struct kimage *image,
  604. struct kexec_segment *segment)
  605. {
  606. unsigned long maddr;
  607. size_t ubytes, mbytes;
  608. int result;
  609. unsigned char __user *buf = NULL;
  610. unsigned char *kbuf = NULL;
  611. if (image->file_mode)
  612. kbuf = segment->kbuf;
  613. else
  614. buf = segment->buf;
  615. ubytes = segment->bufsz;
  616. mbytes = segment->memsz;
  617. maddr = segment->mem;
  618. result = kimage_set_destination(image, maddr);
  619. if (result < 0)
  620. goto out;
  621. while (mbytes) {
  622. struct page *page;
  623. char *ptr;
  624. size_t uchunk, mchunk;
  625. page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
  626. if (!page) {
  627. result = -ENOMEM;
  628. goto out;
  629. }
  630. result = kimage_add_page(image, page_to_boot_pfn(page)
  631. << PAGE_SHIFT);
  632. if (result < 0)
  633. goto out;
  634. ptr = kmap_local_page(page);
  635. /* Start with a clear page */
  636. clear_page(ptr);
  637. ptr += maddr & ~PAGE_MASK;
  638. mchunk = min_t(size_t, mbytes,
  639. PAGE_SIZE - (maddr & ~PAGE_MASK));
  640. uchunk = min(ubytes, mchunk);
  641. if (uchunk) {
  642. /* For file based kexec, source pages are in kernel memory */
  643. if (image->file_mode)
  644. memcpy(ptr, kbuf, uchunk);
  645. else
  646. result = copy_from_user(ptr, buf, uchunk);
  647. ubytes -= uchunk;
  648. if (image->file_mode)
  649. kbuf += uchunk;
  650. else
  651. buf += uchunk;
  652. }
  653. kunmap_local(ptr);
  654. if (result) {
  655. result = -EFAULT;
  656. goto out;
  657. }
  658. maddr += mchunk;
  659. mbytes -= mchunk;
  660. cond_resched();
  661. }
  662. out:
  663. return result;
  664. }
  665. #ifdef CONFIG_CRASH_DUMP
  666. static int kimage_load_crash_segment(struct kimage *image,
  667. struct kexec_segment *segment)
  668. {
  669. /* For crash dumps kernels we simply copy the data from
  670. * user space to it's destination.
  671. * We do things a page at a time for the sake of kmap.
  672. */
  673. unsigned long maddr;
  674. size_t ubytes, mbytes;
  675. int result;
  676. unsigned char __user *buf = NULL;
  677. unsigned char *kbuf = NULL;
  678. result = 0;
  679. if (image->file_mode)
  680. kbuf = segment->kbuf;
  681. else
  682. buf = segment->buf;
  683. ubytes = segment->bufsz;
  684. mbytes = segment->memsz;
  685. maddr = segment->mem;
  686. while (mbytes) {
  687. struct page *page;
  688. char *ptr;
  689. size_t uchunk, mchunk;
  690. page = boot_pfn_to_page(maddr >> PAGE_SHIFT);
  691. if (!page) {
  692. result = -ENOMEM;
  693. goto out;
  694. }
  695. arch_kexec_post_alloc_pages(page_address(page), 1, 0);
  696. ptr = kmap_local_page(page);
  697. ptr += maddr & ~PAGE_MASK;
  698. mchunk = min_t(size_t, mbytes,
  699. PAGE_SIZE - (maddr & ~PAGE_MASK));
  700. uchunk = min(ubytes, mchunk);
  701. if (mchunk > uchunk) {
  702. /* Zero the trailing part of the page */
  703. memset(ptr + uchunk, 0, mchunk - uchunk);
  704. }
  705. if (uchunk) {
  706. /* For file based kexec, source pages are in kernel memory */
  707. if (image->file_mode)
  708. memcpy(ptr, kbuf, uchunk);
  709. else
  710. result = copy_from_user(ptr, buf, uchunk);
  711. ubytes -= uchunk;
  712. if (image->file_mode)
  713. kbuf += uchunk;
  714. else
  715. buf += uchunk;
  716. }
  717. kexec_flush_icache_page(page);
  718. kunmap_local(ptr);
  719. arch_kexec_pre_free_pages(page_address(page), 1);
  720. if (result) {
  721. result = -EFAULT;
  722. goto out;
  723. }
  724. maddr += mchunk;
  725. mbytes -= mchunk;
  726. cond_resched();
  727. }
  728. out:
  729. return result;
  730. }
  731. #endif
  732. int kimage_load_segment(struct kimage *image,
  733. struct kexec_segment *segment)
  734. {
  735. int result = -ENOMEM;
  736. switch (image->type) {
  737. case KEXEC_TYPE_DEFAULT:
  738. result = kimage_load_normal_segment(image, segment);
  739. break;
  740. #ifdef CONFIG_CRASH_DUMP
  741. case KEXEC_TYPE_CRASH:
  742. result = kimage_load_crash_segment(image, segment);
  743. break;
  744. #endif
  745. }
  746. return result;
  747. }
  748. struct kexec_load_limit {
  749. /* Mutex protects the limit count. */
  750. struct mutex mutex;
  751. int limit;
  752. };
  753. static struct kexec_load_limit load_limit_reboot = {
  754. .mutex = __MUTEX_INITIALIZER(load_limit_reboot.mutex),
  755. .limit = -1,
  756. };
  757. static struct kexec_load_limit load_limit_panic = {
  758. .mutex = __MUTEX_INITIALIZER(load_limit_panic.mutex),
  759. .limit = -1,
  760. };
  761. struct kimage *kexec_image;
  762. struct kimage *kexec_crash_image;
  763. static int kexec_load_disabled;
  764. #ifdef CONFIG_SYSCTL
  765. static int kexec_limit_handler(const struct ctl_table *table, int write,
  766. void *buffer, size_t *lenp, loff_t *ppos)
  767. {
  768. struct kexec_load_limit *limit = table->data;
  769. int val;
  770. struct ctl_table tmp = {
  771. .data = &val,
  772. .maxlen = sizeof(val),
  773. .mode = table->mode,
  774. };
  775. int ret;
  776. if (write) {
  777. ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
  778. if (ret)
  779. return ret;
  780. if (val < 0)
  781. return -EINVAL;
  782. mutex_lock(&limit->mutex);
  783. if (limit->limit != -1 && val >= limit->limit)
  784. ret = -EINVAL;
  785. else
  786. limit->limit = val;
  787. mutex_unlock(&limit->mutex);
  788. return ret;
  789. }
  790. mutex_lock(&limit->mutex);
  791. val = limit->limit;
  792. mutex_unlock(&limit->mutex);
  793. return proc_dointvec(&tmp, write, buffer, lenp, ppos);
  794. }
  795. static struct ctl_table kexec_core_sysctls[] = {
  796. {
  797. .procname = "kexec_load_disabled",
  798. .data = &kexec_load_disabled,
  799. .maxlen = sizeof(int),
  800. .mode = 0644,
  801. /* only handle a transition from default "0" to "1" */
  802. .proc_handler = proc_dointvec_minmax,
  803. .extra1 = SYSCTL_ONE,
  804. .extra2 = SYSCTL_ONE,
  805. },
  806. {
  807. .procname = "kexec_load_limit_panic",
  808. .data = &load_limit_panic,
  809. .mode = 0644,
  810. .proc_handler = kexec_limit_handler,
  811. },
  812. {
  813. .procname = "kexec_load_limit_reboot",
  814. .data = &load_limit_reboot,
  815. .mode = 0644,
  816. .proc_handler = kexec_limit_handler,
  817. },
  818. };
  819. static int __init kexec_core_sysctl_init(void)
  820. {
  821. register_sysctl_init("kernel", kexec_core_sysctls);
  822. return 0;
  823. }
  824. late_initcall(kexec_core_sysctl_init);
  825. #endif
  826. bool kexec_load_permitted(int kexec_image_type)
  827. {
  828. struct kexec_load_limit *limit;
  829. /*
  830. * Only the superuser can use the kexec syscall and if it has not
  831. * been disabled.
  832. */
  833. if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
  834. return false;
  835. /* Check limit counter and decrease it.*/
  836. limit = (kexec_image_type == KEXEC_TYPE_CRASH) ?
  837. &load_limit_panic : &load_limit_reboot;
  838. mutex_lock(&limit->mutex);
  839. if (!limit->limit) {
  840. mutex_unlock(&limit->mutex);
  841. return false;
  842. }
  843. if (limit->limit != -1)
  844. limit->limit--;
  845. mutex_unlock(&limit->mutex);
  846. return true;
  847. }
  848. /*
  849. * Move into place and start executing a preloaded standalone
  850. * executable. If nothing was preloaded return an error.
  851. */
  852. int kernel_kexec(void)
  853. {
  854. int error = 0;
  855. if (!kexec_trylock())
  856. return -EBUSY;
  857. if (!kexec_image) {
  858. error = -EINVAL;
  859. goto Unlock;
  860. }
  861. #ifdef CONFIG_KEXEC_JUMP
  862. if (kexec_image->preserve_context) {
  863. pm_prepare_console();
  864. error = freeze_processes();
  865. if (error) {
  866. error = -EBUSY;
  867. goto Restore_console;
  868. }
  869. suspend_console();
  870. error = dpm_suspend_start(PMSG_FREEZE);
  871. if (error)
  872. goto Resume_console;
  873. /* At this point, dpm_suspend_start() has been called,
  874. * but *not* dpm_suspend_end(). We *must* call
  875. * dpm_suspend_end() now. Otherwise, drivers for
  876. * some devices (e.g. interrupt controllers) become
  877. * desynchronized with the actual state of the
  878. * hardware at resume time, and evil weirdness ensues.
  879. */
  880. error = dpm_suspend_end(PMSG_FREEZE);
  881. if (error)
  882. goto Resume_devices;
  883. error = suspend_disable_secondary_cpus();
  884. if (error)
  885. goto Enable_cpus;
  886. local_irq_disable();
  887. error = syscore_suspend();
  888. if (error)
  889. goto Enable_irqs;
  890. } else
  891. #endif
  892. {
  893. kexec_in_progress = true;
  894. kernel_restart_prepare("kexec reboot");
  895. migrate_to_reboot_cpu();
  896. syscore_shutdown();
  897. /*
  898. * migrate_to_reboot_cpu() disables CPU hotplug assuming that
  899. * no further code needs to use CPU hotplug (which is true in
  900. * the reboot case). However, the kexec path depends on using
  901. * CPU hotplug again; so re-enable it here.
  902. */
  903. cpu_hotplug_enable();
  904. pr_notice("Starting new kernel\n");
  905. machine_shutdown();
  906. }
  907. kmsg_dump(KMSG_DUMP_SHUTDOWN);
  908. machine_kexec(kexec_image);
  909. #ifdef CONFIG_KEXEC_JUMP
  910. if (kexec_image->preserve_context) {
  911. syscore_resume();
  912. Enable_irqs:
  913. local_irq_enable();
  914. Enable_cpus:
  915. suspend_enable_secondary_cpus();
  916. dpm_resume_start(PMSG_RESTORE);
  917. Resume_devices:
  918. dpm_resume_end(PMSG_RESTORE);
  919. Resume_console:
  920. resume_console();
  921. thaw_processes();
  922. Restore_console:
  923. pm_restore_console();
  924. }
  925. #endif
  926. Unlock:
  927. kexec_unlock();
  928. return error;
  929. }