crash.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483
  1. /*
  2. * Architecture specific (i386/x86_64) functions for kexec based crash dumps.
  3. *
  4. * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
  5. *
  6. * Copyright (C) IBM Corporation, 2004. All rights reserved.
  7. * Copyright (C) Red Hat Inc., 2014. All rights reserved.
  8. * Authors:
  9. * Vivek Goyal <vgoyal@redhat.com>
  10. *
  11. */
  12. #define pr_fmt(fmt) "kexec: " fmt
  13. #include <linux/types.h>
  14. #include <linux/kernel.h>
  15. #include <linux/smp.h>
  16. #include <linux/reboot.h>
  17. #include <linux/kexec.h>
  18. #include <linux/delay.h>
  19. #include <linux/elf.h>
  20. #include <linux/elfcore.h>
  21. #include <linux/export.h>
  22. #include <linux/slab.h>
  23. #include <linux/vmalloc.h>
  24. #include <asm/processor.h>
  25. #include <asm/hardirq.h>
  26. #include <asm/nmi.h>
  27. #include <asm/hw_irq.h>
  28. #include <asm/apic.h>
  29. #include <asm/e820/types.h>
  30. #include <asm/io_apic.h>
  31. #include <asm/hpet.h>
  32. #include <linux/kdebug.h>
  33. #include <asm/cpu.h>
  34. #include <asm/reboot.h>
  35. #include <asm/virtext.h>
  36. #include <asm/intel_pt.h>
  37. /* Used while preparing memory map entries for second kernel */
  38. struct crash_memmap_data {
  39. struct boot_params *params;
  40. /* Type of memory */
  41. unsigned int type;
  42. };
  43. /*
  44. * This is used to VMCLEAR all VMCSs loaded on the
  45. * processor. And when loading kvm_intel module, the
  46. * callback function pointer will be assigned.
  47. *
  48. * protected by rcu.
  49. */
  50. crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL;
  51. EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss);
  52. unsigned long crash_zero_bytes;
  53. static inline void cpu_crash_vmclear_loaded_vmcss(void)
  54. {
  55. crash_vmclear_fn *do_vmclear_operation = NULL;
  56. rcu_read_lock();
  57. do_vmclear_operation = rcu_dereference(crash_vmclear_loaded_vmcss);
  58. if (do_vmclear_operation)
  59. do_vmclear_operation();
  60. rcu_read_unlock();
  61. }
  62. #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
  63. static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
  64. {
  65. #ifdef CONFIG_X86_32
  66. struct pt_regs fixed_regs;
  67. if (!user_mode(regs)) {
  68. crash_fixup_ss_esp(&fixed_regs, regs);
  69. regs = &fixed_regs;
  70. }
  71. #endif
  72. crash_save_cpu(regs, cpu);
  73. /*
  74. * VMCLEAR VMCSs loaded on all cpus if needed.
  75. */
  76. cpu_crash_vmclear_loaded_vmcss();
  77. /* Disable VMX or SVM if needed.
  78. *
  79. * We need to disable virtualization on all CPUs.
  80. * Having VMX or SVM enabled on any CPU may break rebooting
  81. * after the kdump kernel has finished its task.
  82. */
  83. cpu_emergency_vmxoff();
  84. cpu_emergency_svm_disable();
  85. /*
  86. * Disable Intel PT to stop its logging
  87. */
  88. cpu_emergency_stop_pt();
  89. disable_local_APIC();
  90. }
  91. void kdump_nmi_shootdown_cpus(void)
  92. {
  93. nmi_shootdown_cpus(kdump_nmi_callback);
  94. disable_local_APIC();
  95. }
  96. /* Override the weak function in kernel/panic.c */
  97. void crash_smp_send_stop(void)
  98. {
  99. static int cpus_stopped;
  100. if (cpus_stopped)
  101. return;
  102. if (smp_ops.crash_stop_other_cpus)
  103. smp_ops.crash_stop_other_cpus();
  104. else
  105. smp_send_stop();
  106. cpus_stopped = 1;
  107. }
  108. #else
  109. void crash_smp_send_stop(void)
  110. {
  111. /* There are no cpus to shootdown */
  112. }
  113. #endif
  114. void native_machine_crash_shutdown(struct pt_regs *regs)
  115. {
  116. /* This function is only called after the system
  117. * has panicked or is otherwise in a critical state.
  118. * The minimum amount of code to allow a kexec'd kernel
  119. * to run successfully needs to happen here.
  120. *
  121. * In practice this means shooting down the other cpus in
  122. * an SMP system.
  123. */
  124. /* The kernel is broken so disable interrupts */
  125. local_irq_disable();
  126. crash_smp_send_stop();
  127. /*
  128. * VMCLEAR VMCSs loaded on this cpu if needed.
  129. */
  130. cpu_crash_vmclear_loaded_vmcss();
  131. /* Booting kdump kernel with VMX or SVM enabled won't work,
  132. * because (among other limitations) we can't disable paging
  133. * with the virt flags.
  134. */
  135. cpu_emergency_vmxoff();
  136. cpu_emergency_svm_disable();
  137. /*
  138. * Disable Intel PT to stop its logging
  139. */
  140. cpu_emergency_stop_pt();
  141. #ifdef CONFIG_X86_IO_APIC
  142. /* Prevent crash_kexec() from deadlocking on ioapic_lock. */
  143. ioapic_zap_locks();
  144. clear_IO_APIC();
  145. #endif
  146. lapic_shutdown();
  147. restore_boot_irq_mode();
  148. #ifdef CONFIG_HPET_TIMER
  149. hpet_disable();
  150. #endif
  151. crash_save_cpu(regs, safe_smp_processor_id());
  152. }
  153. #ifdef CONFIG_KEXEC_FILE
  154. static int get_nr_ram_ranges_callback(struct resource *res, void *arg)
  155. {
  156. unsigned int *nr_ranges = arg;
  157. (*nr_ranges)++;
  158. return 0;
  159. }
  160. /* Gather all the required information to prepare elf headers for ram regions */
  161. static struct crash_mem *fill_up_crash_elf_data(void)
  162. {
  163. unsigned int nr_ranges = 0;
  164. struct crash_mem *cmem;
  165. walk_system_ram_res(0, -1, &nr_ranges,
  166. get_nr_ram_ranges_callback);
  167. if (!nr_ranges)
  168. return NULL;
  169. /*
  170. * Exclusion of crash region and/or crashk_low_res may cause
  171. * another range split. So add extra two slots here.
  172. */
  173. nr_ranges += 2;
  174. cmem = vzalloc(sizeof(struct crash_mem) +
  175. sizeof(struct crash_mem_range) * nr_ranges);
  176. if (!cmem)
  177. return NULL;
  178. cmem->max_nr_ranges = nr_ranges;
  179. cmem->nr_ranges = 0;
  180. return cmem;
  181. }
  182. /*
  183. * Look for any unwanted ranges between mstart, mend and remove them. This
  184. * might lead to split and split ranges are put in cmem->ranges[] array
  185. */
  186. static int elf_header_exclude_ranges(struct crash_mem *cmem)
  187. {
  188. int ret = 0;
  189. /* Exclude crashkernel region */
  190. ret = crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end);
  191. if (ret)
  192. return ret;
  193. if (crashk_low_res.end) {
  194. ret = crash_exclude_mem_range(cmem, crashk_low_res.start,
  195. crashk_low_res.end);
  196. if (ret)
  197. return ret;
  198. }
  199. return ret;
  200. }
  201. static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg)
  202. {
  203. struct crash_mem *cmem = arg;
  204. cmem->ranges[cmem->nr_ranges].start = res->start;
  205. cmem->ranges[cmem->nr_ranges].end = res->end;
  206. cmem->nr_ranges++;
  207. return 0;
  208. }
  209. /* Prepare elf headers. Return addr and size */
  210. static int prepare_elf_headers(struct kimage *image, void **addr,
  211. unsigned long *sz)
  212. {
  213. struct crash_mem *cmem;
  214. Elf64_Ehdr *ehdr;
  215. Elf64_Phdr *phdr;
  216. int ret, i;
  217. cmem = fill_up_crash_elf_data();
  218. if (!cmem)
  219. return -ENOMEM;
  220. ret = walk_system_ram_res(0, -1, cmem,
  221. prepare_elf64_ram_headers_callback);
  222. if (ret)
  223. goto out;
  224. /* Exclude unwanted mem ranges */
  225. ret = elf_header_exclude_ranges(cmem);
  226. if (ret)
  227. goto out;
  228. /* By default prepare 64bit headers */
  229. ret = crash_prepare_elf64_headers(cmem,
  230. IS_ENABLED(CONFIG_X86_64), addr, sz);
  231. if (ret)
  232. goto out;
  233. /*
  234. * If a range matches backup region, adjust offset to backup
  235. * segment.
  236. */
  237. ehdr = (Elf64_Ehdr *)*addr;
  238. phdr = (Elf64_Phdr *)(ehdr + 1);
  239. for (i = 0; i < ehdr->e_phnum; phdr++, i++)
  240. if (phdr->p_type == PT_LOAD &&
  241. phdr->p_paddr == image->arch.backup_src_start &&
  242. phdr->p_memsz == image->arch.backup_src_sz) {
  243. phdr->p_offset = image->arch.backup_load_addr;
  244. break;
  245. }
  246. out:
  247. vfree(cmem);
  248. return ret;
  249. }
  250. static int add_e820_entry(struct boot_params *params, struct e820_entry *entry)
  251. {
  252. unsigned int nr_e820_entries;
  253. nr_e820_entries = params->e820_entries;
  254. if (nr_e820_entries >= E820_MAX_ENTRIES_ZEROPAGE)
  255. return 1;
  256. memcpy(&params->e820_table[nr_e820_entries], entry,
  257. sizeof(struct e820_entry));
  258. params->e820_entries++;
  259. return 0;
  260. }
  261. static int memmap_entry_callback(struct resource *res, void *arg)
  262. {
  263. struct crash_memmap_data *cmd = arg;
  264. struct boot_params *params = cmd->params;
  265. struct e820_entry ei;
  266. ei.addr = res->start;
  267. ei.size = resource_size(res);
  268. ei.type = cmd->type;
  269. add_e820_entry(params, &ei);
  270. return 0;
  271. }
  272. static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem,
  273. unsigned long long mstart,
  274. unsigned long long mend)
  275. {
  276. unsigned long start, end;
  277. int ret = 0;
  278. cmem->ranges[0].start = mstart;
  279. cmem->ranges[0].end = mend;
  280. cmem->nr_ranges = 1;
  281. /* Exclude Backup region */
  282. start = image->arch.backup_load_addr;
  283. end = start + image->arch.backup_src_sz - 1;
  284. ret = crash_exclude_mem_range(cmem, start, end);
  285. if (ret)
  286. return ret;
  287. /* Exclude elf header region */
  288. start = image->arch.elf_load_addr;
  289. end = start + image->arch.elf_headers_sz - 1;
  290. return crash_exclude_mem_range(cmem, start, end);
  291. }
  292. /* Prepare memory map for crash dump kernel */
  293. int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
  294. {
  295. int i, ret = 0;
  296. unsigned long flags;
  297. struct e820_entry ei;
  298. struct crash_memmap_data cmd;
  299. struct crash_mem *cmem;
  300. cmem = vzalloc(struct_size(cmem, ranges, 1));
  301. if (!cmem)
  302. return -ENOMEM;
  303. memset(&cmd, 0, sizeof(struct crash_memmap_data));
  304. cmd.params = params;
  305. /* Add first 640K segment */
  306. ei.addr = image->arch.backup_src_start;
  307. ei.size = image->arch.backup_src_sz;
  308. ei.type = E820_TYPE_RAM;
  309. add_e820_entry(params, &ei);
  310. /* Add ACPI tables */
  311. cmd.type = E820_TYPE_ACPI;
  312. flags = IORESOURCE_MEM | IORESOURCE_BUSY;
  313. walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1, &cmd,
  314. memmap_entry_callback);
  315. /* Add ACPI Non-volatile Storage */
  316. cmd.type = E820_TYPE_NVS;
  317. walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, &cmd,
  318. memmap_entry_callback);
  319. /* Add crashk_low_res region */
  320. if (crashk_low_res.end) {
  321. ei.addr = crashk_low_res.start;
  322. ei.size = crashk_low_res.end - crashk_low_res.start + 1;
  323. ei.type = E820_TYPE_RAM;
  324. add_e820_entry(params, &ei);
  325. }
  326. /* Exclude some ranges from crashk_res and add rest to memmap */
  327. ret = memmap_exclude_ranges(image, cmem, crashk_res.start,
  328. crashk_res.end);
  329. if (ret)
  330. goto out;
  331. for (i = 0; i < cmem->nr_ranges; i++) {
  332. ei.size = cmem->ranges[i].end - cmem->ranges[i].start + 1;
  333. /* If entry is less than a page, skip it */
  334. if (ei.size < PAGE_SIZE)
  335. continue;
  336. ei.addr = cmem->ranges[i].start;
  337. ei.type = E820_TYPE_RAM;
  338. add_e820_entry(params, &ei);
  339. }
  340. out:
  341. vfree(cmem);
  342. return ret;
  343. }
  344. static int determine_backup_region(struct resource *res, void *arg)
  345. {
  346. struct kimage *image = arg;
  347. image->arch.backup_src_start = res->start;
  348. image->arch.backup_src_sz = resource_size(res);
  349. /* Expecting only one range for backup region */
  350. return 1;
  351. }
  352. int crash_load_segments(struct kimage *image)
  353. {
  354. int ret;
  355. struct kexec_buf kbuf = { .image = image, .buf_min = 0,
  356. .buf_max = ULONG_MAX, .top_down = false };
  357. /*
  358. * Determine and load a segment for backup area. First 640K RAM
  359. * region is backup source
  360. */
  361. ret = walk_system_ram_res(KEXEC_BACKUP_SRC_START, KEXEC_BACKUP_SRC_END,
  362. image, determine_backup_region);
  363. /* Zero or postive return values are ok */
  364. if (ret < 0)
  365. return ret;
  366. /* Add backup segment. */
  367. if (image->arch.backup_src_sz) {
  368. kbuf.buffer = &crash_zero_bytes;
  369. kbuf.bufsz = sizeof(crash_zero_bytes);
  370. kbuf.memsz = image->arch.backup_src_sz;
  371. kbuf.buf_align = PAGE_SIZE;
  372. /*
  373. * Ideally there is no source for backup segment. This is
  374. * copied in purgatory after crash. Just add a zero filled
  375. * segment for now to make sure checksum logic works fine.
  376. */
  377. ret = kexec_add_buffer(&kbuf);
  378. if (ret)
  379. return ret;
  380. image->arch.backup_load_addr = kbuf.mem;
  381. pr_debug("Loaded backup region at 0x%lx backup_start=0x%lx memsz=0x%lx\n",
  382. image->arch.backup_load_addr,
  383. image->arch.backup_src_start, kbuf.memsz);
  384. }
  385. /* Prepare elf headers and add a segment */
  386. ret = prepare_elf_headers(image, &kbuf.buffer, &kbuf.bufsz);
  387. if (ret)
  388. return ret;
  389. image->arch.elf_headers = kbuf.buffer;
  390. image->arch.elf_headers_sz = kbuf.bufsz;
  391. kbuf.memsz = kbuf.bufsz;
  392. kbuf.buf_align = ELF_CORE_HEADER_ALIGN;
  393. ret = kexec_add_buffer(&kbuf);
  394. if (ret) {
  395. vfree((void *)image->arch.elf_headers);
  396. return ret;
  397. }
  398. image->arch.elf_load_addr = kbuf.mem;
  399. pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
  400. image->arch.elf_load_addr, kbuf.bufsz, kbuf.bufsz);
  401. return ret;
  402. }
  403. #endif /* CONFIG_KEXEC_FILE */