err.c 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * This file implements the error recovery as a core part of PCIe error
  4. * reporting. When a PCIe error is delivered, an error message will be
  5. * collected and printed to console, then, an error recovery procedure
  6. * will be executed by following the PCI error recovery rules.
  7. *
  8. * Copyright (C) 2006 Intel Corp.
  9. * Tom Long Nguyen (tom.l.nguyen@intel.com)
  10. * Zhang Yanmin (yanmin.zhang@intel.com)
  11. */
  12. #include <linux/pci.h>
  13. #include <linux/module.h>
  14. #include <linux/pci.h>
  15. #include <linux/kernel.h>
  16. #include <linux/errno.h>
  17. #include <linux/aer.h>
  18. #include "portdrv.h"
  19. #include "../pci.h"
  20. struct aer_broadcast_data {
  21. enum pci_channel_state state;
  22. enum pci_ers_result result;
  23. };
  24. static pci_ers_result_t merge_result(enum pci_ers_result orig,
  25. enum pci_ers_result new)
  26. {
  27. if (new == PCI_ERS_RESULT_NO_AER_DRIVER)
  28. return PCI_ERS_RESULT_NO_AER_DRIVER;
  29. if (new == PCI_ERS_RESULT_NONE)
  30. return orig;
  31. switch (orig) {
  32. case PCI_ERS_RESULT_CAN_RECOVER:
  33. case PCI_ERS_RESULT_RECOVERED:
  34. orig = new;
  35. break;
  36. case PCI_ERS_RESULT_DISCONNECT:
  37. if (new == PCI_ERS_RESULT_NEED_RESET)
  38. orig = PCI_ERS_RESULT_NEED_RESET;
  39. break;
  40. default:
  41. break;
  42. }
  43. return orig;
  44. }
  45. static int report_error_detected(struct pci_dev *dev, void *data)
  46. {
  47. pci_ers_result_t vote;
  48. const struct pci_error_handlers *err_handler;
  49. struct aer_broadcast_data *result_data;
  50. result_data = (struct aer_broadcast_data *) data;
  51. device_lock(&dev->dev);
  52. dev->error_state = result_data->state;
  53. if (!dev->driver ||
  54. !dev->driver->err_handler ||
  55. !dev->driver->err_handler->error_detected) {
  56. /*
  57. * If any device in the subtree does not have an error_detected
  58. * callback, PCI_ERS_RESULT_NO_AER_DRIVER prevents subsequent
  59. * error callbacks of "any" device in the subtree, and will
  60. * exit in the disconnected error state.
  61. */
  62. if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE)
  63. vote = PCI_ERS_RESULT_NO_AER_DRIVER;
  64. else
  65. vote = PCI_ERS_RESULT_NONE;
  66. } else {
  67. err_handler = dev->driver->err_handler;
  68. vote = err_handler->error_detected(dev, result_data->state);
  69. pci_uevent_ers(dev, PCI_ERS_RESULT_NONE);
  70. }
  71. result_data->result = merge_result(result_data->result, vote);
  72. device_unlock(&dev->dev);
  73. return 0;
  74. }
  75. static int report_mmio_enabled(struct pci_dev *dev, void *data)
  76. {
  77. pci_ers_result_t vote;
  78. const struct pci_error_handlers *err_handler;
  79. struct aer_broadcast_data *result_data;
  80. result_data = (struct aer_broadcast_data *) data;
  81. device_lock(&dev->dev);
  82. if (!dev->driver ||
  83. !dev->driver->err_handler ||
  84. !dev->driver->err_handler->mmio_enabled)
  85. goto out;
  86. err_handler = dev->driver->err_handler;
  87. vote = err_handler->mmio_enabled(dev);
  88. result_data->result = merge_result(result_data->result, vote);
  89. out:
  90. device_unlock(&dev->dev);
  91. return 0;
  92. }
  93. static int report_slot_reset(struct pci_dev *dev, void *data)
  94. {
  95. pci_ers_result_t vote;
  96. const struct pci_error_handlers *err_handler;
  97. struct aer_broadcast_data *result_data;
  98. result_data = (struct aer_broadcast_data *) data;
  99. device_lock(&dev->dev);
  100. if (!dev->driver ||
  101. !dev->driver->err_handler ||
  102. !dev->driver->err_handler->slot_reset)
  103. goto out;
  104. err_handler = dev->driver->err_handler;
  105. vote = err_handler->slot_reset(dev);
  106. result_data->result = merge_result(result_data->result, vote);
  107. out:
  108. device_unlock(&dev->dev);
  109. return 0;
  110. }
  111. static int report_resume(struct pci_dev *dev, void *data)
  112. {
  113. const struct pci_error_handlers *err_handler;
  114. device_lock(&dev->dev);
  115. dev->error_state = pci_channel_io_normal;
  116. if (!dev->driver ||
  117. !dev->driver->err_handler ||
  118. !dev->driver->err_handler->resume)
  119. goto out;
  120. err_handler = dev->driver->err_handler;
  121. err_handler->resume(dev);
  122. pci_uevent_ers(dev, PCI_ERS_RESULT_RECOVERED);
  123. out:
  124. device_unlock(&dev->dev);
  125. return 0;
  126. }
  127. /**
  128. * default_reset_link - default reset function
  129. * @dev: pointer to pci_dev data structure
  130. *
  131. * Invoked when performing link reset on a Downstream Port or a
  132. * Root Port with no aer driver.
  133. */
  134. static pci_ers_result_t default_reset_link(struct pci_dev *dev)
  135. {
  136. int rc;
  137. rc = pci_bus_error_reset(dev);
  138. pci_printk(KERN_DEBUG, dev, "downstream link has been reset\n");
  139. return rc ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
  140. }
  141. static pci_ers_result_t reset_link(struct pci_dev *dev, u32 service)
  142. {
  143. pci_ers_result_t status;
  144. struct pcie_port_service_driver *driver = NULL;
  145. driver = pcie_port_find_service(dev, service);
  146. if (driver && driver->reset_link) {
  147. status = driver->reset_link(dev);
  148. } else if (dev->has_secondary_link) {
  149. status = default_reset_link(dev);
  150. } else {
  151. pci_printk(KERN_DEBUG, dev, "no link-reset support at upstream device %s\n",
  152. pci_name(dev));
  153. return PCI_ERS_RESULT_DISCONNECT;
  154. }
  155. if (status != PCI_ERS_RESULT_RECOVERED) {
  156. pci_printk(KERN_DEBUG, dev, "link reset at upstream device %s failed\n",
  157. pci_name(dev));
  158. return PCI_ERS_RESULT_DISCONNECT;
  159. }
  160. return status;
  161. }
  162. /**
  163. * broadcast_error_message - handle message broadcast to downstream drivers
  164. * @dev: pointer to from where in a hierarchy message is broadcasted down
  165. * @state: error state
  166. * @error_mesg: message to print
  167. * @cb: callback to be broadcasted
  168. *
  169. * Invoked during error recovery process. Once being invoked, the content
  170. * of error severity will be broadcasted to all downstream drivers in a
  171. * hierarchy in question.
  172. */
  173. static pci_ers_result_t broadcast_error_message(struct pci_dev *dev,
  174. enum pci_channel_state state,
  175. char *error_mesg,
  176. int (*cb)(struct pci_dev *, void *))
  177. {
  178. struct aer_broadcast_data result_data;
  179. pci_printk(KERN_DEBUG, dev, "broadcast %s message\n", error_mesg);
  180. result_data.state = state;
  181. if (cb == report_error_detected)
  182. result_data.result = PCI_ERS_RESULT_CAN_RECOVER;
  183. else
  184. result_data.result = PCI_ERS_RESULT_RECOVERED;
  185. pci_walk_bus(dev->subordinate, cb, &result_data);
  186. return result_data.result;
  187. }
  188. /**
  189. * pcie_do_fatal_recovery - handle fatal error recovery process
  190. * @dev: pointer to a pci_dev data structure of agent detecting an error
  191. *
  192. * Invoked when an error is fatal. Once being invoked, removes the devices
  193. * beneath this AER agent, followed by reset link e.g. secondary bus reset
  194. * followed by re-enumeration of devices.
  195. */
  196. void pcie_do_fatal_recovery(struct pci_dev *dev, u32 service)
  197. {
  198. struct pci_dev *udev;
  199. struct pci_bus *parent;
  200. struct pci_dev *pdev, *temp;
  201. pci_ers_result_t result;
  202. if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)
  203. udev = dev;
  204. else
  205. udev = dev->bus->self;
  206. parent = udev->subordinate;
  207. pci_lock_rescan_remove();
  208. pci_dev_get(dev);
  209. list_for_each_entry_safe_reverse(pdev, temp, &parent->devices,
  210. bus_list) {
  211. pci_dev_get(pdev);
  212. pci_dev_set_disconnected(pdev, NULL);
  213. if (pci_has_subordinate(pdev))
  214. pci_walk_bus(pdev->subordinate,
  215. pci_dev_set_disconnected, NULL);
  216. pci_stop_and_remove_bus_device(pdev);
  217. pci_dev_put(pdev);
  218. }
  219. result = reset_link(udev, service);
  220. if ((service == PCIE_PORT_SERVICE_AER) &&
  221. (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) {
  222. /*
  223. * If the error is reported by a bridge, we think this error
  224. * is related to the downstream link of the bridge, so we
  225. * do error recovery on all subordinates of the bridge instead
  226. * of the bridge and clear the error status of the bridge.
  227. */
  228. pci_aer_clear_fatal_status(dev);
  229. pci_aer_clear_device_status(dev);
  230. }
  231. if (result == PCI_ERS_RESULT_RECOVERED) {
  232. if (pcie_wait_for_link(udev, true))
  233. pci_rescan_bus(udev->bus);
  234. pci_info(dev, "Device recovery from fatal error successful\n");
  235. } else {
  236. pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
  237. pci_info(dev, "Device recovery from fatal error failed\n");
  238. }
  239. pci_dev_put(dev);
  240. pci_unlock_rescan_remove();
  241. }
  242. /**
  243. * pcie_do_nonfatal_recovery - handle nonfatal error recovery process
  244. * @dev: pointer to a pci_dev data structure of agent detecting an error
  245. *
  246. * Invoked when an error is nonfatal/fatal. Once being invoked, broadcast
  247. * error detected message to all downstream drivers within a hierarchy in
  248. * question and return the returned code.
  249. */
  250. void pcie_do_nonfatal_recovery(struct pci_dev *dev)
  251. {
  252. pci_ers_result_t status;
  253. enum pci_channel_state state;
  254. state = pci_channel_io_normal;
  255. /*
  256. * Error recovery runs on all subordinates of the first downstream port.
  257. * If the downstream port detected the error, it is cleared at the end.
  258. */
  259. if (!(pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT ||
  260. pci_pcie_type(dev) == PCI_EXP_TYPE_DOWNSTREAM))
  261. dev = dev->bus->self;
  262. status = broadcast_error_message(dev,
  263. state,
  264. "error_detected",
  265. report_error_detected);
  266. if (status == PCI_ERS_RESULT_CAN_RECOVER)
  267. status = broadcast_error_message(dev,
  268. state,
  269. "mmio_enabled",
  270. report_mmio_enabled);
  271. if (status == PCI_ERS_RESULT_NEED_RESET) {
  272. /*
  273. * TODO: Should call platform-specific
  274. * functions to reset slot before calling
  275. * drivers' slot_reset callbacks?
  276. */
  277. status = broadcast_error_message(dev,
  278. state,
  279. "slot_reset",
  280. report_slot_reset);
  281. }
  282. if (status != PCI_ERS_RESULT_RECOVERED)
  283. goto failed;
  284. broadcast_error_message(dev,
  285. state,
  286. "resume",
  287. report_resume);
  288. pci_aer_clear_device_status(dev);
  289. pci_cleanup_aer_uncorrect_error_status(dev);
  290. pci_info(dev, "AER: Device recovery successful\n");
  291. return;
  292. failed:
  293. pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
  294. /* TODO: Should kernel panic here? */
  295. pci_info(dev, "AER: Device recovery failed\n");
  296. }