ivpu_pm.c

// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2020-2024 Intel Corporation
 */

#include <linux/highmem.h>
#include <linux/moduleparam.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/reboot.h>

#include "vpu_boot_api.h"
#include "ivpu_drv.h"
#include "ivpu_hw.h"
#include "ivpu_fw.h"
#include "ivpu_fw_log.h"
#include "ivpu_ipc.h"
#include "ivpu_job.h"
#include "ivpu_jsm_msg.h"
#include "ivpu_mmu.h"
#include "ivpu_ms.h"
#include "ivpu_pm.h"

static bool ivpu_disable_recovery;
module_param_named_unsafe(disable_recovery, ivpu_disable_recovery, bool, 0644);
MODULE_PARM_DESC(disable_recovery, "Disables recovery when NPU hang is detected");

static unsigned long ivpu_tdr_timeout_ms;
module_param_named(tdr_timeout_ms, ivpu_tdr_timeout_ms, ulong, 0644);
MODULE_PARM_DESC(tdr_timeout_ms, "Timeout for device hang detection, in milliseconds, 0 - default");

#define PM_RESCHEDULE_LIMIT 5
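
/*
 * Prepare a cold boot: reset all command queue contexts and the IPC channel,
 * reload the firmware and select the cold boot entry point.
 */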
static void ivpu_pm_prepare_cold_boot(struct ivpu_device *vdev)
{
	struct ivpu_fw_info *fw = vdev->fw;

	ivpu_cmdq_reset_all_contexts(vdev);
	ivpu_ipc_reset(vdev);
	ivpu_fw_load(vdev);
	fw->entry_point = fw->cold_boot_entry_point;
}
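
/*
 * Prepare a warm boot: resume through the save/restore return address that the
 * firmware published in the boot params. Falls back to a cold boot when no
 * save/restore entry point is available.
 */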
static void ivpu_pm_prepare_warm_boot(struct ivpu_device *vdev)
{
	struct ivpu_fw_info *fw = vdev->fw;
	struct vpu_boot_params *bp = ivpu_bo_vaddr(fw->mem);

	if (!bp->save_restore_ret_address) {
		ivpu_pm_prepare_cold_boot(vdev);
		return;
	}

	ivpu_dbg(vdev, FW_BOOT, "Save/restore entry point %llx", bp->save_restore_ret_address);
	fw->entry_point = bp->save_restore_ret_address;
}
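
/* Prepare the device for reset and shut it down; a shutdown failure is logged and returned. */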
static int ivpu_suspend(struct ivpu_device *vdev)
{
	int ret;

	ivpu_prepare_for_reset(vdev);

	ret = ivpu_shutdown(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to shutdown NPU: %d\n", ret);

	return ret;
}
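
/*
 * Power the device back up, re-enable the MMU and boot the firmware. If the
 * boot fails and this was not already a cold boot, the device is powered down,
 * a cold boot is prepared and the whole sequence is retried.
 */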
static int ivpu_resume(struct ivpu_device *vdev)
{
	int ret;

retry:
	pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D0);
	pci_restore_state(to_pci_dev(vdev->drm.dev));

	ret = ivpu_hw_power_up(vdev);
	if (ret) {
		ivpu_err(vdev, "Failed to power up HW: %d\n", ret);
		goto err_power_down;
	}

	ret = ivpu_mmu_enable(vdev);
	if (ret) {
		ivpu_err(vdev, "Failed to resume MMU: %d\n", ret);
		goto err_power_down;
	}

	ret = ivpu_boot(vdev);
	if (ret)
		goto err_mmu_disable;

	return 0;

err_mmu_disable:
	ivpu_mmu_disable(vdev);
err_power_down:
	ivpu_hw_power_down(vdev);
	pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D3hot);

	if (!ivpu_fw_is_cold_boot(vdev)) {
		ivpu_pm_prepare_cold_boot(vdev);
		goto retry;
	} else {
		ivpu_err(vdev, "Failed to resume the FW: %d\n", ret);
	}

	return ret;
}
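
/*
 * Recovery worker: resume the device, dump the firmware log, then run a full
 * suspend/cold-boot/resume cycle under reset_lock while aborting all submitted
 * jobs and cleaning up metric streamer state. Userspace is notified with an
 * IVPU_PM_EVENT=IVPU_RECOVER uevent.
 */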
static void ivpu_pm_recovery_work(struct work_struct *work)
{
	struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, recovery_work);
	struct ivpu_device *vdev = pm->vdev;
	char *evt[2] = {"IVPU_PM_EVENT=IVPU_RECOVER", NULL};
	int ret;

	ivpu_err(vdev, "Recovering the NPU (reset #%d)\n", atomic_read(&vdev->pm->reset_counter));

	ret = pm_runtime_resume_and_get(vdev->drm.dev);
	if (ret)
		ivpu_err(vdev, "Failed to resume NPU: %d\n", ret);

	ivpu_fw_log_dump(vdev);

	atomic_inc(&vdev->pm->reset_counter);
	atomic_set(&vdev->pm->reset_pending, 1);
	down_write(&vdev->pm->reset_lock);

	ivpu_suspend(vdev);
	ivpu_pm_prepare_cold_boot(vdev);
	ivpu_jobs_abort_all(vdev);
	ivpu_ms_cleanup_all(vdev);

	ret = ivpu_resume(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to resume NPU: %d\n", ret);

	up_write(&vdev->pm->reset_lock);
	atomic_set(&vdev->pm->reset_pending, 0);

	kobject_uevent_env(&vdev->drm.dev->kobj, KOBJ_CHANGE, evt);
	pm_runtime_mark_last_busy(vdev->drm.dev);
	pm_runtime_put_autosuspend(vdev->drm.dev);
}
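
/*
 * Schedule the recovery worker unless recovery is disabled via the
 * disable_recovery module parameter, the device is an FPGA platform, or a
 * reset is already pending.
 */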
void ivpu_pm_trigger_recovery(struct ivpu_device *vdev, const char *reason)
{
	ivpu_err(vdev, "Recovery triggered by %s\n", reason);

	if (ivpu_disable_recovery) {
		ivpu_err(vdev, "Recovery not available when disable_recovery param is set\n");
		return;
	}

	if (ivpu_is_fpga(vdev)) {
		ivpu_err(vdev, "Recovery not available on FPGA\n");
		return;
	}

	/* Trigger recovery if it's not in progress */
	if (atomic_cmpxchg(&vdev->pm->reset_pending, 0, 1) == 0) {
		ivpu_hw_diagnose_failure(vdev);
		ivpu_hw_irq_disable(vdev); /* Disable IRQ early to protect from IRQ storm */
		queue_work(system_long_wq, &vdev->pm->recovery_work);
	}
}

static void ivpu_job_timeout_work(struct work_struct *work)
{
	struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, job_timeout_work.work);
	struct ivpu_device *vdev = pm->vdev;

	ivpu_pm_trigger_recovery(vdev, "TDR");
}
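
/*
 * Arm the TDR (timeout detection and recovery) delayed work. The timeout is
 * taken from the tdr_timeout_ms module parameter when set, otherwise from the
 * per-platform default in vdev->timeout.tdr.
 */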
void ivpu_start_job_timeout_detection(struct ivpu_device *vdev)
{
	unsigned long timeout_ms = ivpu_tdr_timeout_ms ? ivpu_tdr_timeout_ms : vdev->timeout.tdr;

	/* No-op if already queued */
	queue_delayed_work(system_wq, &vdev->pm->job_timeout_work, msecs_to_jiffies(timeout_ms));
}

void ivpu_stop_job_timeout_detection(struct ivpu_device *vdev)
{
	cancel_delayed_work_sync(&vdev->pm->job_timeout_work);
}
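
/*
 * System suspend callback: wait (up to the TDR timeout) for the NPU to go
 * idle, request D0i3 entry, shut the device down and prepare a warm boot for
 * the following resume.
 */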
int ivpu_pm_suspend_cb(struct device *dev)
{
	struct drm_device *drm = dev_get_drvdata(dev);
	struct ivpu_device *vdev = to_ivpu_device(drm);
	unsigned long timeout;

	ivpu_dbg(vdev, PM, "Suspend..\n");

	timeout = jiffies + msecs_to_jiffies(vdev->timeout.tdr);
	while (!ivpu_hw_is_idle(vdev)) {
		cond_resched();
		if (time_after_eq(jiffies, timeout)) {
			ivpu_err(vdev, "Failed to enter idle on system suspend\n");
			return -EBUSY;
		}
	}

	ivpu_jsm_pwr_d0i3_enter(vdev);

	ivpu_suspend(vdev);
	ivpu_pm_prepare_warm_boot(vdev);

	ivpu_dbg(vdev, PM, "Suspend done.\n");

	return 0;
}

int ivpu_pm_resume_cb(struct device *dev)
{
	struct drm_device *drm = dev_get_drvdata(dev);
	struct ivpu_device *vdev = to_ivpu_device(drm);
	int ret;

	ivpu_dbg(vdev, PM, "Resume..\n");

	ret = ivpu_resume(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to resume: %d\n", ret);

	ivpu_dbg(vdev, PM, "Resume done.\n");

	return ret;
}
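
/*
 * Runtime suspend callback: disable the MMU and request D0i3 before powering
 * the device down. If the NPU was not idle or D0i3 entry failed, the firmware
 * log is dumped and a cold boot is forced for the next resume; otherwise a
 * warm boot is prepared. Always returns 0 so the runtime suspend completes.
 */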
int ivpu_pm_runtime_suspend_cb(struct device *dev)
{
	struct drm_device *drm = dev_get_drvdata(dev);
	struct ivpu_device *vdev = to_ivpu_device(drm);
	int ret, ret_d0i3;
	bool is_idle;

	drm_WARN_ON(&vdev->drm, !xa_empty(&vdev->submitted_jobs_xa));
	drm_WARN_ON(&vdev->drm, work_pending(&vdev->pm->recovery_work));

	ivpu_dbg(vdev, PM, "Runtime suspend..\n");

	ivpu_mmu_disable(vdev);

	is_idle = ivpu_hw_is_idle(vdev) || vdev->pm->dct_active_percent;
	if (!is_idle)
		ivpu_err(vdev, "NPU is not idle before autosuspend\n");

	ret_d0i3 = ivpu_jsm_pwr_d0i3_enter(vdev);
	if (ret_d0i3)
		ivpu_err(vdev, "Failed to prepare for d0i3: %d\n", ret_d0i3);

	ret = ivpu_suspend(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to suspend NPU: %d\n", ret);

	if (!is_idle || ret_d0i3) {
		ivpu_err(vdev, "Forcing cold boot due to previous errors\n");
		atomic_inc(&vdev->pm->reset_counter);
		ivpu_fw_log_dump(vdev);
		ivpu_pm_prepare_cold_boot(vdev);
	} else {
		ivpu_pm_prepare_warm_boot(vdev);
	}

	ivpu_dbg(vdev, PM, "Runtime suspend done.\n");

	return 0;
}

int ivpu_pm_runtime_resume_cb(struct device *dev)
{
	struct drm_device *drm = dev_get_drvdata(dev);
	struct ivpu_device *vdev = to_ivpu_device(drm);
	int ret;

	ivpu_dbg(vdev, PM, "Runtime resume..\n");

	ret = ivpu_resume(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to set RESUME state: %d\n", ret);

	ivpu_dbg(vdev, PM, "Runtime resume done.\n");

	return ret;
}
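
/*
 * Take a runtime PM reference, resuming the device if necessary. On failure
 * the device is marked suspended and the negative error code is returned.
 */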
int ivpu_rpm_get(struct ivpu_device *vdev)
{
	int ret;

	ret = pm_runtime_resume_and_get(vdev->drm.dev);
	if (ret < 0) {
		ivpu_err(vdev, "Failed to resume NPU: %d\n", ret);
		pm_runtime_set_suspended(vdev->drm.dev);
	}

	return ret;
}

void ivpu_rpm_put(struct ivpu_device *vdev)
{
	pm_runtime_mark_last_busy(vdev->drm.dev);
	pm_runtime_put_autosuspend(vdev->drm.dev);
}
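
/*
 * PCI reset_prepare callback: before the reset the device is woken up, reset
 * and prepared for a cold boot with all submitted jobs aborted; the matching
 * reset_done callback below resumes the device and clears reset_pending.
 */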
void ivpu_pm_reset_prepare_cb(struct pci_dev *pdev)
{
	struct ivpu_device *vdev = pci_get_drvdata(pdev);

	ivpu_dbg(vdev, PM, "Pre-reset..\n");

	atomic_inc(&vdev->pm->reset_counter);
	atomic_set(&vdev->pm->reset_pending, 1);

	pm_runtime_get_sync(vdev->drm.dev);
	down_write(&vdev->pm->reset_lock);
	ivpu_prepare_for_reset(vdev);
	ivpu_hw_reset(vdev);
	ivpu_pm_prepare_cold_boot(vdev);
	ivpu_jobs_abort_all(vdev);
	ivpu_ms_cleanup_all(vdev);

	ivpu_dbg(vdev, PM, "Pre-reset done.\n");
}

void ivpu_pm_reset_done_cb(struct pci_dev *pdev)
{
	struct ivpu_device *vdev = pci_get_drvdata(pdev);
	int ret;

	ivpu_dbg(vdev, PM, "Post-reset..\n");

	ret = ivpu_resume(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to set RESUME state: %d\n", ret);

	up_write(&vdev->pm->reset_lock);
	atomic_set(&vdev->pm->reset_pending, 0);

	ivpu_dbg(vdev, PM, "Post-reset done.\n");

	pm_runtime_mark_last_busy(vdev->drm.dev);
	pm_runtime_put_autosuspend(vdev->drm.dev);
}
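
/*
 * Initialize PM state: the reset lock and counters, the recovery and TDR work
 * items, and runtime PM autosuspend. When recovery is disabled the autosuspend
 * delay is set to -1, which keeps the device from autosuspending.
 */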
void ivpu_pm_init(struct ivpu_device *vdev)
{
	struct device *dev = vdev->drm.dev;
	struct ivpu_pm_info *pm = vdev->pm;
	int delay;

	pm->vdev = vdev;

	init_rwsem(&pm->reset_lock);
	atomic_set(&pm->reset_pending, 0);
	atomic_set(&pm->reset_counter, 0);

	INIT_WORK(&pm->recovery_work, ivpu_pm_recovery_work);
	INIT_DELAYED_WORK(&pm->job_timeout_work, ivpu_job_timeout_work);

	if (ivpu_disable_recovery)
		delay = -1;
	else
		delay = vdev->timeout.autosuspend;

	pm_runtime_use_autosuspend(dev);
	pm_runtime_set_autosuspend_delay(dev, delay);
	pm_runtime_set_active(dev);

	ivpu_dbg(vdev, PM, "Autosuspend delay = %d\n", delay);
}

void ivpu_pm_cancel_recovery(struct ivpu_device *vdev)
{
	drm_WARN_ON(&vdev->drm, delayed_work_pending(&vdev->pm->job_timeout_work));
	cancel_work_sync(&vdev->pm->recovery_work);
}

void ivpu_pm_enable(struct ivpu_device *vdev)
{
	struct device *dev = vdev->drm.dev;

	pm_runtime_allow(dev);
	pm_runtime_mark_last_busy(dev);
	pm_runtime_put_autosuspend(dev);
}

void ivpu_pm_disable(struct ivpu_device *vdev)
{
	pm_runtime_get_noresume(vdev->drm.dev);
	pm_runtime_forbid(vdev->drm.dev);
}
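
/*
 * DCT support: each DCT_PERIOD_US window is split into active and inactive
 * time according to the requested active percentage and programmed through JSM
 * messages. ivpu_pm_dct_init() re-applies a previously configured duty cycle.
 */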
int ivpu_pm_dct_init(struct ivpu_device *vdev)
{
	if (vdev->pm->dct_active_percent)
		return ivpu_pm_dct_enable(vdev, vdev->pm->dct_active_percent);

	return 0;
}

int ivpu_pm_dct_enable(struct ivpu_device *vdev, u8 active_percent)
{
	u32 active_us, inactive_us;
	int ret;

	if (active_percent == 0 || active_percent > 100)
		return -EINVAL;

	active_us = (DCT_PERIOD_US * active_percent) / 100;
	inactive_us = DCT_PERIOD_US - active_us;

	ret = ivpu_jsm_dct_enable(vdev, active_us, inactive_us);
	if (ret) {
		ivpu_err_ratelimited(vdev, "Failed to enable DCT: %d\n", ret);
		return ret;
	}

	vdev->pm->dct_active_percent = active_percent;

	ivpu_dbg(vdev, PM, "DCT set to %u%% (D0: %uus, D0i2: %uus)\n",
		 active_percent, active_us, inactive_us);
	return 0;
}

int ivpu_pm_dct_disable(struct ivpu_device *vdev)
{
	int ret;

	ret = ivpu_jsm_dct_disable(vdev);
	if (ret) {
		ivpu_err_ratelimited(vdev, "Failed to disable DCT: %d\n", ret);
		return ret;
	}

	vdev->pm->dct_active_percent = 0;

	ivpu_dbg(vdev, PM, "DCT disabled\n");
	return 0;
}
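
/*
 * Handle a DCT request interrupt from the buttress (BTRS): when DCT is
 * currently active, re-apply the default active percentage, otherwise disable
 * DCT; on success report the requested state and current percentage back to
 * the hardware.
 */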
void ivpu_pm_dct_irq_thread_handler(struct ivpu_device *vdev)
{
	bool enable;
	int ret;

	if (ivpu_hw_btrs_dct_get_request(vdev, &enable))
		return;

	if (vdev->pm->dct_active_percent)
		ret = ivpu_pm_dct_enable(vdev, DCT_DEFAULT_ACTIVE_PERCENT);
	else
		ret = ivpu_pm_dct_disable(vdev);

	if (!ret)
		ivpu_hw_btrs_dct_set_status(vdev, enable, vdev->pm->dct_active_percent);
}