pci.c 30 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /* Copyright(c) 2020 Intel Corporation. All rights reserved. */
  3. #include <linux/unaligned.h>
  4. #include <linux/io-64-nonatomic-lo-hi.h>
  5. #include <linux/moduleparam.h>
  6. #include <linux/module.h>
  7. #include <linux/delay.h>
  8. #include <linux/sizes.h>
  9. #include <linux/mutex.h>
  10. #include <linux/list.h>
  11. #include <linux/pci.h>
  12. #include <linux/aer.h>
  13. #include <linux/io.h>
  14. #include <cxl/mailbox.h>
  15. #include "cxlmem.h"
  16. #include "cxlpci.h"
  17. #include "cxl.h"
  18. #include "pmu.h"
  19. /**
  20. * DOC: cxl pci
  21. *
  22. * This implements the PCI exclusive functionality for a CXL device as it is
  23. * defined by the Compute Express Link specification. CXL devices may surface
  24. * certain functionality even if it isn't CXL enabled. While this driver is
  25. * focused around the PCI specific aspects of a CXL device, it binds to the
  26. * specific CXL memory device class code, and therefore the implementation of
  27. * cxl_pci is focused around CXL memory devices.
  28. *
  29. * The driver has several responsibilities, mainly:
  30. * - Create the memX device and register on the CXL bus.
  31. * - Enumerate device's register interface and map them.
  32. * - Registers nvdimm bridge device with cxl_core.
  33. * - Registers a CXL mailbox with cxl_core.
  34. */
/* Non-zero while the device still owns the mailbox doorbell */
#define cxl_doorbell_busy(cxlds)                                          \
	(readl((cxlds)->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET) &            \
	 CXLDEV_MBOX_CTRL_DOORBELL)

/*
 * CXL 2.0 - 8.2.8.4
 * NOTE(review): despite the _MS suffix this constant is in jiffies
 * (2 * HZ) and is only ever added to a jiffies timestamp in
 * cxl_pci_mbox_wait_for_doorbell() — confirm before using it as ms.
 */
#define CXL_MAILBOX_TIMEOUT_MS (2 * HZ)

/*
 * CXL 2.0 ECN "Add Mailbox Ready Time" defines a capability field to
 * dictate how long to wait for the mailbox to become ready. The new
 * field allows the device to tell software the amount of time to wait
 * before mailbox ready. This field per the spec theoretically allows
 * for up to 255 seconds. 255 seconds is unreasonably long, its longer
 * than the maximum SATA port link recovery wait. Default to 60 seconds
 * until someone builds a CXL device that needs more time in practice.
 */
static unsigned short mbox_ready_timeout = 60;
module_param(mbox_ready_timeout, ushort, 0644);
MODULE_PARM_DESC(mbox_ready_timeout, "seconds to wait for mailbox ready");
  52. static int cxl_pci_mbox_wait_for_doorbell(struct cxl_dev_state *cxlds)
  53. {
  54. const unsigned long start = jiffies;
  55. unsigned long end = start;
  56. while (cxl_doorbell_busy(cxlds)) {
  57. end = jiffies;
  58. if (time_after(end, start + CXL_MAILBOX_TIMEOUT_MS)) {
  59. /* Check again in case preempted before timeout test */
  60. if (!cxl_doorbell_busy(cxlds))
  61. break;
  62. return -ETIMEDOUT;
  63. }
  64. cpu_relax();
  65. }
  66. dev_dbg(cxlds->dev, "Doorbell wait took %dms",
  67. jiffies_to_msecs(end) - jiffies_to_msecs(start));
  68. return 0;
  69. }
/*
 * Ratelimited error helpers: append a human-readable decode of the
 * FATAL / FW_HALT bits from the memory device status register.
 */
#define cxl_err(dev, status, msg)                                         \
	dev_err_ratelimited(dev, msg ", device state %s%s\n",             \
			    status & CXLMDEV_DEV_FATAL ? " fatal" : "",   \
			    status & CXLMDEV_FW_HALT ? " firmware-halt" : "")

/* As cxl_err(), but also report the opcode of the failing mailbox command */
#define cxl_cmd_err(dev, cmd, status, msg)                                \
	dev_err_ratelimited(dev, msg " (opcode: %#x), device state %s%s\n", \
			    (cmd)->opcode,                                \
			    status & CXLMDEV_DEV_FATAL ? " fatal" : "",   \
			    status & CXLMDEV_FW_HALT ? " firmware-halt" : "")

/*
 * Threaded irq dev_id's must be globally unique. cxl_dev_id provides a unique
 * wrapper object for each irq within the same cxlds.
 */
struct cxl_dev_id {
	struct cxl_dev_state *cxlds;
};
  86. static int cxl_request_irq(struct cxl_dev_state *cxlds, int irq,
  87. irq_handler_t thread_fn)
  88. {
  89. struct device *dev = cxlds->dev;
  90. struct cxl_dev_id *dev_id;
  91. dev_id = devm_kzalloc(dev, sizeof(*dev_id), GFP_KERNEL);
  92. if (!dev_id)
  93. return -ENOMEM;
  94. dev_id->cxlds = cxlds;
  95. return devm_request_threaded_irq(dev, irq, NULL, thread_fn,
  96. IRQF_SHARED | IRQF_ONESHOT, NULL,
  97. dev_id);
  98. }
  99. static bool cxl_mbox_background_complete(struct cxl_dev_state *cxlds)
  100. {
  101. u64 reg;
  102. reg = readq(cxlds->regs.mbox + CXLDEV_MBOX_BG_CMD_STATUS_OFFSET);
  103. return FIELD_GET(CXLDEV_MBOX_BG_CMD_COMMAND_PCT_MASK, reg) == 100;
  104. }
/*
 * Threaded irq handler for mailbox background-command completion.
 *
 * Sanitize completions are handed off to the polling worker (which owns
 * the sysfs notification); any other background command wakes the
 * synchronous waiter parked in __cxl_pci_mbox_send_cmd().
 */
static irqreturn_t cxl_pci_mbox_irq(int irq, void *id)
{
	u64 reg;
	u16 opcode;
	struct cxl_dev_id *dev_id = id;
	struct cxl_dev_state *cxlds = dev_id->cxlds;
	struct cxl_mailbox *cxl_mbox = &cxlds->cxl_mbox;
	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);

	/* Shared line / spurious wakeup: background op not actually done */
	if (!cxl_mbox_background_complete(cxlds))
		return IRQ_NONE;

	reg = readq(cxlds->regs.mbox + CXLDEV_MBOX_BG_CMD_STATUS_OFFSET);
	opcode = FIELD_GET(CXLDEV_MBOX_BG_CMD_COMMAND_OPCODE_MASK, reg);
	if (opcode == CXL_MBOX_OP_SANITIZE) {
		mutex_lock(&cxl_mbox->mbox_mutex);
		if (mds->security.sanitize_node)
			/* Run the poll worker now so it notifies waiters */
			mod_delayed_work(system_wq, &mds->security.poll_dwork, 0);
		mutex_unlock(&cxl_mbox->mbox_mutex);
	} else {
		/* short-circuit the wait in __cxl_pci_mbox_send_cmd() */
		rcuwait_wake_up(&cxl_mbox->mbox_wait);
	}

	return IRQ_HANDLED;
}
/*
 * Sanitization operation polling mode.
 *
 * Delayed work that checks whether a background sanitize completed. On
 * completion: clear poll_tmo_secs (which re-opens the mailbox to other
 * commands, see __cxl_pci_mbox_send_cmd()) and notify sysfs poll(2)
 * waiters. Otherwise re-arm with a back-off that grows by 10s per pass,
 * capped at 15 minutes.
 */
static void cxl_mbox_sanitize_work(struct work_struct *work)
{
	struct cxl_memdev_state *mds =
		container_of(work, typeof(*mds), security.poll_dwork.work);
	struct cxl_dev_state *cxlds = &mds->cxlds;
	struct cxl_mailbox *cxl_mbox = &cxlds->cxl_mbox;

	mutex_lock(&cxl_mbox->mbox_mutex);
	if (cxl_mbox_background_complete(cxlds)) {
		mds->security.poll_tmo_secs = 0;
		if (mds->security.sanitize_node)
			sysfs_notify_dirent(mds->security.sanitize_node);
		mds->security.sanitize_active = false;

		dev_dbg(cxlds->dev, "Sanitization operation ended\n");
	} else {
		int timeout = mds->security.poll_tmo_secs + 10;

		mds->security.poll_tmo_secs = min(15 * 60, timeout);
		schedule_delayed_work(&mds->security.poll_dwork, timeout * HZ);
	}
	mutex_unlock(&cxl_mbox->mbox_mutex);
}
/**
 * __cxl_pci_mbox_send_cmd() - Execute a mailbox command
 * @cxl_mbox: CXL mailbox context
 * @mbox_cmd: Command to send to the memory device.
 *
 * Context: Any context. Expects mbox_mutex to be held.
 * Return: -ETIMEDOUT if timeout occurred waiting for completion. 0 on success.
 *         Caller should check the return code in @mbox_cmd to make sure it
 *         succeeded.
 *
 * This is a generic form of the CXL mailbox send command thus only using the
 * registers defined by the mailbox capability ID - CXL 2.0 8.2.8.4. Memory
 * devices, and perhaps other types of CXL devices may have further information
 * available upon error conditions. Driver facilities wishing to send mailbox
 * commands should use the wrapper command.
 *
 * The CXL spec allows for up to two mailboxes. The intention is for the primary
 * mailbox to be OS controlled and the secondary mailbox to be used by system
 * firmware. This allows the OS and firmware to communicate with the device and
 * not need to coordinate with each other. The driver only uses the primary
 * mailbox.
 */
static int __cxl_pci_mbox_send_cmd(struct cxl_mailbox *cxl_mbox,
				   struct cxl_mbox_cmd *mbox_cmd)
{
	struct cxl_dev_state *cxlds = mbox_to_cxlds(cxl_mbox);
	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
	void __iomem *payload = cxlds->regs.mbox + CXLDEV_MBOX_PAYLOAD_OFFSET;
	struct device *dev = cxlds->dev;
	u64 cmd_reg, status_reg;
	size_t out_len;
	int rc;

	lockdep_assert_held(&cxl_mbox->mbox_mutex);

	/*
	 * Here are the steps from 8.2.8.4 of the CXL 2.0 spec.
	 *   1. Caller reads MB Control Register to verify doorbell is clear
	 *   2. Caller writes Command Register
	 *   3. Caller writes Command Payload Registers if input payload is non-empty
	 *   4. Caller writes MB Control Register to set doorbell
	 *   5. Caller either polls for doorbell to be clear or waits for interrupt if configured
	 *   6. Caller reads MB Status Register to fetch Return code
	 *   7. If command successful, Caller reads Command Register to get Payload Length
	 *   8. If output payload is non-empty, host reads Command Payload Registers
	 *
	 * Hardware is free to do whatever it wants before the doorbell is rung,
	 * and isn't allowed to change anything after it clears the doorbell. As
	 * such, steps 2 and 3 can happen in any order, and steps 6, 7, 8 can
	 * also happen in any order (though some orders might not make sense).
	 */

	/* #1 */
	if (cxl_doorbell_busy(cxlds)) {
		u64 md_status =
			readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET);

		cxl_cmd_err(cxlds->dev, mbox_cmd, md_status,
			    "mailbox queue busy");
		return -EBUSY;
	}

	/*
	 * With sanitize polling, hardware might be done and the poller still
	 * not be in sync. Ensure no new command comes in until so. Keep the
	 * hardware semantics and only allow device health status.
	 */
	if (mds->security.poll_tmo_secs > 0) {
		if (mbox_cmd->opcode != CXL_MBOX_OP_GET_HEALTH_INFO)
			return -EBUSY;
	}

	cmd_reg = FIELD_PREP(CXLDEV_MBOX_CMD_COMMAND_OPCODE_MASK,
			     mbox_cmd->opcode);
	if (mbox_cmd->size_in) {
		if (WARN_ON(!mbox_cmd->payload_in))
			return -EINVAL;

		cmd_reg |= FIELD_PREP(CXLDEV_MBOX_CMD_PAYLOAD_LENGTH_MASK,
				      mbox_cmd->size_in);
		memcpy_toio(payload, mbox_cmd->payload_in, mbox_cmd->size_in);
	}

	/* #2, #3 */
	writeq(cmd_reg, cxlds->regs.mbox + CXLDEV_MBOX_CMD_OFFSET);

	/* #4 */
	dev_dbg(dev, "Sending command: 0x%04x\n", mbox_cmd->opcode);
	writel(CXLDEV_MBOX_CTRL_DOORBELL,
	       cxlds->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET);

	/* #5 */
	rc = cxl_pci_mbox_wait_for_doorbell(cxlds);
	if (rc == -ETIMEDOUT) {
		u64 md_status = readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET);

		cxl_cmd_err(cxlds->dev, mbox_cmd, md_status, "mailbox timeout");
		return rc;
	}

	/* #6 */
	status_reg = readq(cxlds->regs.mbox + CXLDEV_MBOX_STATUS_OFFSET);
	mbox_cmd->return_code =
		FIELD_GET(CXLDEV_MBOX_STATUS_RET_CODE_MASK, status_reg);

	/*
	 * Handle the background command in a synchronous manner.
	 *
	 * All other mailbox commands will serialize/queue on the mbox_mutex,
	 * which we currently hold. Furthermore this also guarantees that
	 * cxl_mbox_background_complete() checks are safe amongst each other,
	 * in that no new bg operation can occur in between.
	 *
	 * Background operations are timesliced in accordance with the nature
	 * of the command. In the event of timeout, the mailbox state is
	 * indeterminate until the next successful command submission and the
	 * driver can get back in sync with the hardware state.
	 */
	if (mbox_cmd->return_code == CXL_MBOX_CMD_RC_BACKGROUND) {
		u64 bg_status_reg;
		int i, timeout;

		/*
		 * Sanitization is a special case which monopolizes the device
		 * and cannot be timesliced. Handle asynchronously instead,
		 * and allow userspace to poll(2) for completion.
		 */
		if (mbox_cmd->opcode == CXL_MBOX_OP_SANITIZE) {
			if (mds->security.sanitize_active)
				return -EBUSY;

			/* give first timeout a second */
			timeout = 1;
			mds->security.poll_tmo_secs = timeout;
			mds->security.sanitize_active = true;
			schedule_delayed_work(&mds->security.poll_dwork,
					      timeout * HZ);
			dev_dbg(dev, "Sanitization operation started\n");
			goto success;
		}

		dev_dbg(dev, "Mailbox background operation (0x%04x) started\n",
			mbox_cmd->opcode);

		/*
		 * Poll (or sleep until the irq handler wakes us) in
		 * poll_interval_ms slices, up to poll_count times.
		 */
		timeout = mbox_cmd->poll_interval_ms;
		for (i = 0; i < mbox_cmd->poll_count; i++) {
			if (rcuwait_wait_event_timeout(&cxl_mbox->mbox_wait,
				       cxl_mbox_background_complete(cxlds),
				       TASK_UNINTERRUPTIBLE,
				       msecs_to_jiffies(timeout)) > 0)
				break;
		}

		if (!cxl_mbox_background_complete(cxlds)) {
			dev_err(dev, "timeout waiting for background (%d ms)\n",
				timeout * mbox_cmd->poll_count);
			return -ETIMEDOUT;
		}

		/* Background completion code lives in the bg status register */
		bg_status_reg = readq(cxlds->regs.mbox +
				      CXLDEV_MBOX_BG_CMD_STATUS_OFFSET);
		mbox_cmd->return_code =
			FIELD_GET(CXLDEV_MBOX_BG_CMD_COMMAND_RC_MASK,
				  bg_status_reg);
		dev_dbg(dev,
			"Mailbox background operation (0x%04x) completed\n",
			mbox_cmd->opcode);
	}

	if (mbox_cmd->return_code != CXL_MBOX_CMD_RC_SUCCESS) {
		dev_dbg(dev, "Mailbox operation had an error: %s\n",
			cxl_mbox_cmd_rc2str(mbox_cmd));
		return 0; /* completed but caller must check return_code */
	}

success:
	/* #7 */
	cmd_reg = readq(cxlds->regs.mbox + CXLDEV_MBOX_CMD_OFFSET);
	out_len = FIELD_GET(CXLDEV_MBOX_CMD_PAYLOAD_LENGTH_MASK, cmd_reg);

	/* #8 */
	if (out_len && mbox_cmd->payload_out) {
		/*
		 * Sanitize the copy. If hardware misbehaves, out_len per the
		 * spec can actually be greater than the max allowed size (21
		 * bits available but spec defined 1M max). The caller also may
		 * have requested less data than the hardware supplied even
		 * within spec.
		 */
		size_t n;

		n = min3(mbox_cmd->size_out, cxl_mbox->payload_size, out_len);
		memcpy_fromio(mbox_cmd->payload_out, payload, n);
		mbox_cmd->size_out = n;
	} else {
		mbox_cmd->size_out = 0;
	}

	return 0;
}
/*
 * cxl_mailbox::mbox_send() entry point: serialize all commands on
 * mbox_mutex. mutex_lock_io() accounts the wait as io-wait, since the
 * doorbell handshake can be slow on the device side.
 */
static int cxl_pci_mbox_send(struct cxl_mailbox *cxl_mbox,
			     struct cxl_mbox_cmd *cmd)
{
	int rc;

	mutex_lock_io(&cxl_mbox->mbox_mutex);
	rc = __cxl_pci_mbox_send_cmd(cxl_mbox, cmd);
	mutex_unlock(&cxl_mbox->mbox_mutex);

	return rc;
}
/*
 * Bring up the primary mailbox: wait for firmware-reported readiness,
 * drain any command left in flight by a previous driver instance, size
 * the payload buffer, and (optionally) enable background-command
 * completion interrupts.
 */
static int cxl_pci_setup_mailbox(struct cxl_memdev_state *mds, bool irq_avail)
{
	struct cxl_dev_state *cxlds = &mds->cxlds;
	struct cxl_mailbox *cxl_mbox = &cxlds->cxl_mbox;
	const int cap = readl(cxlds->regs.mbox + CXLDEV_MBOX_CAPS_OFFSET);
	struct device *dev = cxlds->dev;
	unsigned long timeout;
	int irq, msgnum;
	u64 md_status;
	u32 ctrl;

	/* Per the "Mailbox Ready Time" ECN; module param caps the wait */
	timeout = jiffies + mbox_ready_timeout * HZ;
	do {
		md_status = readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET);
		if (md_status & CXLMDEV_MBOX_IF_READY)
			break;
		if (msleep_interruptible(100))
			break;
	} while (!time_after(jiffies, timeout));

	if (!(md_status & CXLMDEV_MBOX_IF_READY)) {
		cxl_err(dev, md_status, "timeout awaiting mailbox ready");
		return -ETIMEDOUT;
	}

	/*
	 * A command may be in flight from a previous driver instance,
	 * think kexec, do one doorbell wait so that
	 * __cxl_pci_mbox_send_cmd() can assume that it is the only
	 * source for future doorbell busy events.
	 */
	if (cxl_pci_mbox_wait_for_doorbell(cxlds) != 0) {
		cxl_err(dev, md_status, "timeout awaiting mailbox idle");
		return -ETIMEDOUT;
	}

	cxl_mbox->mbox_send = cxl_pci_mbox_send;
	cxl_mbox->payload_size =
		1 << FIELD_GET(CXLDEV_MBOX_CAP_PAYLOAD_SIZE_MASK, cap);

	/*
	 * CXL 2.0 8.2.8.4.3 Mailbox Capabilities Register
	 *
	 * If the size is too small, mandatory commands will not work and so
	 * there's no point in going forward. If the size is too large, there's
	 * no harm is soft limiting it.
	 */
	cxl_mbox->payload_size = min_t(size_t, cxl_mbox->payload_size, SZ_1M);
	if (cxl_mbox->payload_size < 256) {
		dev_err(dev, "Mailbox is too small (%zub)",
			cxl_mbox->payload_size);
		return -ENXIO;
	}

	dev_dbg(dev, "Mailbox payload sized %zu", cxl_mbox->payload_size);

	INIT_DELAYED_WORK(&mds->security.poll_dwork, cxl_mbox_sanitize_work);

	/* background command interrupts are optional */
	if (!(cap & CXLDEV_MBOX_CAP_BG_CMD_IRQ) || !irq_avail)
		return 0;

	msgnum = FIELD_GET(CXLDEV_MBOX_CAP_IRQ_MSGNUM_MASK, cap);
	irq = pci_irq_vector(to_pci_dev(cxlds->dev), msgnum);
	if (irq < 0)
		return 0;

	if (cxl_request_irq(cxlds, irq, cxl_pci_mbox_irq))
		return 0;

	dev_dbg(cxlds->dev, "Mailbox interrupts enabled\n");
	/* enable background command mbox irq support */
	ctrl = readl(cxlds->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET);
	ctrl |= CXLDEV_MBOX_CTRL_BG_CMD_IRQ;
	writel(ctrl, cxlds->regs.mbox + CXLDEV_MBOX_CTRL_OFFSET);

	return 0;
}
  402. /*
  403. * Assume that any RCIEP that emits the CXL memory expander class code
  404. * is an RCD
  405. */
  406. static bool is_cxl_restricted(struct pci_dev *pdev)
  407. {
  408. return pci_pcie_type(pdev) == PCI_EXP_TYPE_RC_END;
  409. }
/*
 * Locate the Component Registers of a Restricted CXL Device via the
 * RCRB exposed through its dport, since an RCD has no Register Locator
 * DVSEC. Fills @map on success; returns -EPROBE_DEFER until the CXL
 * port hierarchy for @pdev exists.
 */
static int cxl_rcrb_get_comp_regs(struct pci_dev *pdev,
				  struct cxl_register_map *map)
{
	struct cxl_dport *dport;
	resource_size_t component_reg_phys;

	/* Pre-initialize so error paths leave @map in a safe "none" state */
	*map = (struct cxl_register_map) {
		.host = &pdev->dev,
		.resource = CXL_RESOURCE_NONE,
	};

	/* __free(): the port reference auto-drops when this scope exits */
	struct cxl_port *port __free(put_cxl_port) =
		cxl_pci_find_port(pdev, &dport);
	if (!port)
		return -EPROBE_DEFER;

	component_reg_phys = cxl_rcd_component_reg_phys(&pdev->dev, dport);
	if (component_reg_phys == CXL_RESOURCE_NONE)
		return -ENXIO;

	map->resource = component_reg_phys;
	map->reg_type = CXL_REGLOC_RBI_COMPONENT;
	map->max_size = CXL_COMPONENT_REG_BLOCK_SIZE;

	return 0;
}
  431. static int cxl_pci_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type,
  432. struct cxl_register_map *map)
  433. {
  434. int rc;
  435. rc = cxl_find_regblock(pdev, type, map);
  436. /*
  437. * If the Register Locator DVSEC does not exist, check if it
  438. * is an RCH and try to extract the Component Registers from
  439. * an RCRB.
  440. */
  441. if (rc && type == CXL_REGLOC_RBI_COMPONENT && is_cxl_restricted(pdev))
  442. rc = cxl_rcrb_get_comp_regs(pdev, map);
  443. if (rc)
  444. return rc;
  445. return cxl_setup_regs(map);
  446. }
/*
 * Unmask CXL RAS error reporting, but only for the error classes the
 * PCIe Device Control register says the OS wants reported, and only
 * when the OS (not BIOS) owns AER. Best-effort: returns 0 when RAS
 * registers are absent or BIOS retains control.
 */
static int cxl_pci_ras_unmask(struct pci_dev *pdev)
{
	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
	void __iomem *addr;
	u32 orig_val, val, mask;
	u16 cap;
	int rc;

	if (!cxlds->regs.ras) {
		dev_dbg(&pdev->dev, "No RAS registers.\n");
		return 0;
	}

	/* BIOS has PCIe AER error control */
	if (!pcie_aer_is_native(pdev))
		return 0;

	rc = pcie_capability_read_word(pdev, PCI_EXP_DEVCTL, &cap);
	if (rc)
		return rc;

	/* Unmask uncorrectable errors only if URRE is set in Device Control */
	if (cap & PCI_EXP_DEVCTL_URRE) {
		addr = cxlds->regs.ras + CXL_RAS_UNCORRECTABLE_MASK_OFFSET;
		orig_val = readl(addr);

		mask = CXL_RAS_UNCORRECTABLE_MASK_MASK |
		       CXL_RAS_UNCORRECTABLE_MASK_F256B_MASK;
		val = orig_val & ~mask;
		writel(val, addr);
		dev_dbg(&pdev->dev,
			"Uncorrectable RAS Errors Mask: %#x -> %#x\n",
			orig_val, val);
	}

	/* Likewise for correctable errors, gated on CERE */
	if (cap & PCI_EXP_DEVCTL_CERE) {
		addr = cxlds->regs.ras + CXL_RAS_CORRECTABLE_MASK_OFFSET;
		orig_val = readl(addr);
		val = orig_val & ~CXL_RAS_CORRECTABLE_MASK_MASK;
		writel(val, addr);
		dev_dbg(&pdev->dev, "Correctable RAS Errors Mask: %#x -> %#x\n",
			orig_val, val);
	}

	return 0;
}
/* devm action callback: release the shared event-log buffer */
static void free_event_buf(void *buf)
{
	kvfree(buf);
}
  489. /*
  490. * There is a single buffer for reading event logs from the mailbox. All logs
  491. * share this buffer protected by the mds->event_log_lock.
  492. */
  493. static int cxl_mem_alloc_event_buf(struct cxl_memdev_state *mds)
  494. {
  495. struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
  496. struct cxl_get_event_payload *buf;
  497. buf = kvmalloc(cxl_mbox->payload_size, GFP_KERNEL);
  498. if (!buf)
  499. return -ENOMEM;
  500. mds->event.buf = buf;
  501. return devm_add_action_or_reset(mds->cxlds.dev, free_event_buf, buf);
  502. }
  503. static bool cxl_alloc_irq_vectors(struct pci_dev *pdev)
  504. {
  505. int nvecs;
  506. /*
  507. * Per CXL 3.0 3.1.1 CXL.io Endpoint a function on a CXL device must
  508. * not generate INTx messages if that function participates in
  509. * CXL.cache or CXL.mem.
  510. *
  511. * Additionally pci_alloc_irq_vectors() handles calling
  512. * pci_free_irq_vectors() automatically despite not being called
  513. * pcim_*. See pci_setup_msi_context().
  514. */
  515. nvecs = pci_alloc_irq_vectors(pdev, 1, CXL_PCI_DEFAULT_MAX_VECTORS,
  516. PCI_IRQ_MSIX | PCI_IRQ_MSI);
  517. if (nvecs < 1) {
  518. dev_dbg(&pdev->dev, "Failed to alloc irq vectors: %d\n", nvecs);
  519. return false;
  520. }
  521. return true;
  522. }
/*
 * Threaded irq handler for event-log interrupts: drain all pending,
 * driver-known event logs until the device status reads clear.
 */
static irqreturn_t cxl_event_thread(int irq, void *id)
{
	struct cxl_dev_id *dev_id = id;
	struct cxl_dev_state *cxlds = dev_id->cxlds;
	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
	u32 status;

	do {
		/*
		 * CXL 3.0 8.2.8.3.1: The lower 32 bits are the status;
		 * ignore the reserved upper 32 bits
		 */
		status = readl(cxlds->regs.status + CXLDEV_DEV_EVENT_STATUS_OFFSET);
		/* Ignore logs unknown to the driver */
		status &= CXLDEV_EVENT_STATUS_ALL;
		if (!status)
			break;
		cxl_mem_get_event_records(mds, status);
		/* Draining can be long-running; yield between passes */
		cond_resched();
	} while (status);

	return IRQ_HANDLED;
}
  544. static int cxl_event_req_irq(struct cxl_dev_state *cxlds, u8 setting)
  545. {
  546. struct pci_dev *pdev = to_pci_dev(cxlds->dev);
  547. int irq;
  548. if (FIELD_GET(CXLDEV_EVENT_INT_MODE_MASK, setting) != CXL_INT_MSI_MSIX)
  549. return -ENXIO;
  550. irq = pci_irq_vector(pdev,
  551. FIELD_GET(CXLDEV_EVENT_INT_MSGNUM_MASK, setting));
  552. if (irq < 0)
  553. return irq;
  554. return cxl_request_irq(cxlds, irq, cxl_event_thread);
  555. }
  556. static int cxl_event_get_int_policy(struct cxl_memdev_state *mds,
  557. struct cxl_event_interrupt_policy *policy)
  558. {
  559. struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
  560. struct cxl_mbox_cmd mbox_cmd = {
  561. .opcode = CXL_MBOX_OP_GET_EVT_INT_POLICY,
  562. .payload_out = policy,
  563. .size_out = sizeof(*policy),
  564. };
  565. int rc;
  566. rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
  567. if (rc < 0)
  568. dev_err(mds->cxlds.dev,
  569. "Failed to get event interrupt policy : %d", rc);
  570. return rc;
  571. }
  572. static int cxl_event_config_msgnums(struct cxl_memdev_state *mds,
  573. struct cxl_event_interrupt_policy *policy)
  574. {
  575. struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
  576. struct cxl_mbox_cmd mbox_cmd;
  577. int rc;
  578. *policy = (struct cxl_event_interrupt_policy) {
  579. .info_settings = CXL_INT_MSI_MSIX,
  580. .warn_settings = CXL_INT_MSI_MSIX,
  581. .failure_settings = CXL_INT_MSI_MSIX,
  582. .fatal_settings = CXL_INT_MSI_MSIX,
  583. };
  584. mbox_cmd = (struct cxl_mbox_cmd) {
  585. .opcode = CXL_MBOX_OP_SET_EVT_INT_POLICY,
  586. .payload_in = policy,
  587. .size_in = sizeof(*policy),
  588. };
  589. rc = cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
  590. if (rc < 0) {
  591. dev_err(mds->cxlds.dev, "Failed to set event interrupt policy : %d",
  592. rc);
  593. return rc;
  594. }
  595. /* Retrieve final interrupt settings */
  596. return cxl_event_get_int_policy(mds, policy);
  597. }
  598. static int cxl_event_irqsetup(struct cxl_memdev_state *mds)
  599. {
  600. struct cxl_dev_state *cxlds = &mds->cxlds;
  601. struct cxl_event_interrupt_policy policy;
  602. int rc;
  603. rc = cxl_event_config_msgnums(mds, &policy);
  604. if (rc)
  605. return rc;
  606. rc = cxl_event_req_irq(cxlds, policy.info_settings);
  607. if (rc) {
  608. dev_err(cxlds->dev, "Failed to get interrupt for event Info log\n");
  609. return rc;
  610. }
  611. rc = cxl_event_req_irq(cxlds, policy.warn_settings);
  612. if (rc) {
  613. dev_err(cxlds->dev, "Failed to get interrupt for event Warn log\n");
  614. return rc;
  615. }
  616. rc = cxl_event_req_irq(cxlds, policy.failure_settings);
  617. if (rc) {
  618. dev_err(cxlds->dev, "Failed to get interrupt for event Failure log\n");
  619. return rc;
  620. }
  621. rc = cxl_event_req_irq(cxlds, policy.fatal_settings);
  622. if (rc) {
  623. dev_err(cxlds->dev, "Failed to get interrupt for event Fatal log\n");
  624. return rc;
  625. }
  626. return 0;
  627. }
  628. static bool cxl_event_int_is_fw(u8 setting)
  629. {
  630. u8 mode = FIELD_GET(CXLDEV_EVENT_INT_MODE_MASK, setting);
  631. return mode == CXL_INT_FW;
  632. }
/*
 * Configure OS-native event processing: allocate the shared event
 * buffer, verify firmware has relinquished the event logs per _OSC,
 * wire up per-log irqs, and drain any records pending from before the
 * driver loaded. No-op when BIOS owns error handling or no irqs exist.
 */
static int cxl_event_config(struct pci_host_bridge *host_bridge,
			    struct cxl_memdev_state *mds, bool irq_avail)
{
	struct cxl_event_interrupt_policy policy;
	int rc;

	/*
	 * When BIOS maintains CXL error reporting control, it will process
	 * event records. Only one agent can do so.
	 */
	if (!host_bridge->native_cxl_error)
		return 0;

	if (!irq_avail) {
		dev_info(mds->cxlds.dev, "No interrupt support, disable event processing.\n");
		return 0;
	}

	rc = cxl_mem_alloc_event_buf(mds);
	if (rc)
		return rc;

	rc = cxl_event_get_int_policy(mds, &policy);
	if (rc)
		return rc;

	/* All four logs must be OS-routed, else FW ignored the _OSC grant */
	if (cxl_event_int_is_fw(policy.info_settings) ||
	    cxl_event_int_is_fw(policy.warn_settings) ||
	    cxl_event_int_is_fw(policy.failure_settings) ||
	    cxl_event_int_is_fw(policy.fatal_settings)) {
		dev_err(mds->cxlds.dev,
			"FW still in control of Event Logs despite _OSC settings\n");
		return -EBUSY;
	}

	rc = cxl_event_irqsetup(mds);
	if (rc)
		return rc;

	/* Drain records that accumulated before irqs were enabled */
	cxl_mem_get_event_records(mds, CXLDEV_EVENT_STATUS_ALL);

	return 0;
}
  668. static int cxl_pci_type3_init_mailbox(struct cxl_dev_state *cxlds)
  669. {
  670. int rc;
  671. /*
  672. * Fail the init if there's no mailbox. For a type3 this is out of spec.
  673. */
  674. if (!cxlds->reg_map.device_map.mbox.valid)
  675. return -ENODEV;
  676. rc = cxl_mailbox_init(&cxlds->cxl_mbox, cxlds->dev);
  677. if (rc)
  678. return rc;
  679. return 0;
  680. }
/*
 * cxl_pci_probe() - bind a CXL.mem type-3 PCI device
 * @pdev: the PCI device being probed
 * @id: matched entry from cxl_mem_pci_tbl (unused beyond matching)
 *
 * Sequence: enable the device, create the memdev state, map the device
 * (mailbox) registers, best-effort map component/RAS registers, bring up
 * the mailbox, enumerate commands and device state, then register the
 * memdev and its ancillary facilities (fw upload, sanitize notifier,
 * PMUs, event handling).
 *
 * Error handling relies on managed (pcim_/devm_) resources, so the many
 * early returns below do not need explicit unwinding — presumably
 * cxl_memdev_state_create() is devm-backed as well (TODO confirm).
 *
 * Return: 0 on success, negative errno on fatal setup failure.
 */
static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
struct pci_host_bridge *host_bridge = pci_find_host_bridge(pdev->bus);
struct cxl_memdev_state *mds;
struct cxl_dev_state *cxlds;
struct cxl_register_map map;
struct cxl_memdev *cxlmd;
int i, rc, pmu_count;
bool irq_avail;
/*
* Double check the anonymous union trickery in struct cxl_regs
* FIXME switch to struct_group()
*/
BUILD_BUG_ON(offsetof(struct cxl_regs, memdev) !=
offsetof(struct cxl_regs, device_regs.memdev));
rc = pcim_enable_device(pdev);
if (rc)
return rc;
pci_set_master(pdev);
mds = cxl_memdev_state_create(&pdev->dev);
if (IS_ERR(mds))
return PTR_ERR(mds);
cxlds = &mds->cxlds;
/* drvdata is consumed by the error handlers (slot_reset etc.) below */
pci_set_drvdata(pdev, cxlds);
cxlds->rcd = is_cxl_restricted(pdev);
cxlds->serial = pci_get_dsn(pdev);
cxlds->cxl_dvsec = pci_find_dvsec_capability(
pdev, PCI_VENDOR_ID_CXL, CXL_DVSEC_PCIE_DEVICE);
/* Missing DVSEC is non-fatal: the device is still manageable */
if (!cxlds->cxl_dvsec)
dev_warn(&pdev->dev,
"Device DVSEC not present, skip CXL.mem init\n");
/* Memdev (mailbox) registers are mandatory for this driver */
rc = cxl_pci_setup_regs(pdev, CXL_REGLOC_RBI_MEMDEV, &map);
if (rc)
return rc;
rc = cxl_map_device_regs(&map, &cxlds->regs.device_regs);
if (rc)
return rc;
/*
* If the component registers can't be found, the cxl_pci driver may
* still be useful for management functions so don't return an error.
*/
rc = cxl_pci_setup_regs(pdev, CXL_REGLOC_RBI_COMPONENT,
&cxlds->reg_map);
if (rc)
dev_warn(&pdev->dev, "No component registers (%d)\n", rc);
else if (!cxlds->reg_map.component_map.ras.valid)
dev_dbg(&pdev->dev, "RAS registers not found\n");
/* RAS mapping failure is likewise tolerated (debug-level only) */
rc = cxl_map_component_regs(&cxlds->reg_map, &cxlds->regs.component,
BIT(CXL_CM_CAP_CAP_ID_RAS));
if (rc)
dev_dbg(&pdev->dev, "Failed to map RAS capability.\n");
rc = cxl_pci_type3_init_mailbox(cxlds);
if (rc)
return rc;
/* Media not ready is recorded, not fatal — regions just won't assemble */
rc = cxl_await_media_ready(cxlds);
if (rc == 0)
cxlds->media_ready = true;
else
dev_warn(&pdev->dev, "Media not active (%d)\n", rc);
/* irq_avail gates interrupt-driven mailbox and event delivery below */
irq_avail = cxl_alloc_irq_vectors(pdev);
rc = cxl_pci_setup_mailbox(mds, irq_avail);
if (rc)
return rc;
/* Mailbox is live: discover commands and device identity/ranges */
rc = cxl_enumerate_cmds(mds);
if (rc)
return rc;
rc = cxl_set_timestamp(mds);
if (rc)
return rc;
rc = cxl_poison_state_init(mds);
if (rc)
return rc;
rc = cxl_dev_state_identify(mds);
if (rc)
return rc;
rc = cxl_mem_create_range_info(mds);
if (rc)
return rc;
cxlmd = devm_cxl_add_memdev(&pdev->dev, cxlds);
if (IS_ERR(cxlmd))
return PTR_ERR(cxlmd);
rc = devm_cxl_setup_fw_upload(&pdev->dev, mds);
if (rc)
return rc;
rc = devm_cxl_sanitize_setup_notifier(&pdev->dev, cxlmd);
if (rc)
return rc;
/*
* Register any PMU register blocks. Failures only break out of the
* loop; PMU support is optional and must not fail the probe.
*/
pmu_count = cxl_count_regblock(pdev, CXL_REGLOC_RBI_PMU);
for (i = 0; i < pmu_count; i++) {
struct cxl_pmu_regs pmu_regs;
rc = cxl_find_regblock_instance(pdev, CXL_REGLOC_RBI_PMU, &map, i);
if (rc) {
dev_dbg(&pdev->dev, "Could not find PMU regblock\n");
break;
}
rc = cxl_map_pmu_regs(&map, &pmu_regs);
if (rc) {
dev_dbg(&pdev->dev, "Could not map PMU regs\n");
break;
}
rc = devm_cxl_pmu_add(cxlds->dev, &pmu_regs, cxlmd->id, i, CXL_PMU_MEMDEV);
if (rc) {
dev_dbg(&pdev->dev, "Could not add PMU instance\n");
break;
}
}
/* Overwrites any leftover rc from the PMU loop above */
rc = cxl_event_config(host_bridge, mds, irq_avail);
if (rc)
return rc;
if (cxl_pci_ras_unmask(pdev))
dev_dbg(&pdev->dev, "No RAS reporting unmasked\n");
/* Saved state is restored by cxl_slot_reset() after error recovery */
pci_save_state(pdev);
/* rc is 0 here — cxl_event_config() success was just checked */
return rc;
}
/*
 * Match any device advertising the CXL memory-device class code with the
 * CXL.mem programming interface, regardless of vendor/device ID.
 */
static const struct pci_device_id cxl_mem_pci_tbl[] = {
/* PCI class code for CXL.mem Type-3 Devices */
{ PCI_DEVICE_CLASS((PCI_CLASS_MEMORY_CXL << 8 | CXL_MEMORY_PROGIF), ~0)},
{ /* terminate list */ },
};
MODULE_DEVICE_TABLE(pci, cxl_mem_pci_tbl);
  801. static pci_ers_result_t cxl_slot_reset(struct pci_dev *pdev)
  802. {
  803. struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
  804. struct cxl_memdev *cxlmd = cxlds->cxlmd;
  805. struct device *dev = &cxlmd->dev;
  806. dev_info(&pdev->dev, "%s: restart CXL.mem after slot reset\n",
  807. dev_name(dev));
  808. pci_restore_state(pdev);
  809. if (device_attach(dev) <= 0)
  810. return PCI_ERS_RESULT_DISCONNECT;
  811. return PCI_ERS_RESULT_RECOVERED;
  812. }
  813. static void cxl_error_resume(struct pci_dev *pdev)
  814. {
  815. struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
  816. struct cxl_memdev *cxlmd = cxlds->cxlmd;
  817. struct device *dev = &cxlmd->dev;
  818. dev_info(&pdev->dev, "%s: error resume %s\n", dev_name(dev),
  819. dev->driver ? "successful" : "failed");
  820. }
/*
 * Post-reset hook: detect whether a Secondary Bus Reset wiped HDM decoder
 * state while regions were still active, and loudly warn + taint if so.
 */
static void cxl_reset_done(struct pci_dev *pdev)
{
struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
struct cxl_memdev *cxlmd = cxlds->cxlmd;
struct device *dev = &pdev->dev;
/*
* FLR does not expect to touch the HDM decoders and related
* registers. SBR, however, will wipe all device configurations.
* Issue a warning if there was an active decoder before the reset
* that no longer exists.
*/
/* Hold the memdev device lock while inspecting cxlmd->endpoint */
guard(device)(&cxlmd->dev);
if (cxlmd->endpoint &&
cxl_endpoint_decoder_reset_detected(cxlmd->endpoint)) {
dev_crit(dev, "SBR happened without memory regions removal.\n");
dev_crit(dev, "System may be unstable if regions hosted system memory.\n");
/* Taint so bug reports show the system saw an unsafe reset */
add_taint(TAINT_USER, LOCKDEP_STILL_OK);
}
}
/*
 * PCI error-recovery callbacks. error_detected/cor_error_detected are
 * defined earlier in this file; the remaining hooks are defined above.
 */
static const struct pci_error_handlers cxl_error_handlers = {
.error_detected = cxl_error_detected,
.slot_reset = cxl_slot_reset,
.resume = cxl_error_resume,
.cor_error_detected = cxl_cor_error_detected,
.reset_done = cxl_reset_done,
};
static struct pci_driver cxl_pci_driver = {
.name = KBUILD_MODNAME,
.id_table = cxl_mem_pci_tbl,
.probe = cxl_pci_probe,
.err_handler = &cxl_error_handlers,
.driver = {
/* Probe can wait on media readiness; don't serialize boot on it */
.probe_type = PROBE_PREFER_ASYNCHRONOUS,
},
};
/* Severity field in the CXL event record header flags (lowest 2 bits) */
#define CXL_EVENT_HDR_FLAGS_REC_SEVERITY GENMASK(1, 0)
/*
 * Route a firmware-first (CPER) CXL event record to the matching memdev's
 * trace infrastructure. Silently drops the record if the referenced PCI
 * device is absent, not bound to this driver, or has no drvdata yet.
 */
static void cxl_handle_cper_event(enum cxl_event_type ev_type,
struct cxl_cper_event_rec *rec)
{
struct cper_cxl_event_devid *device_id = &rec->hdr.device_id;
/* __free() drops the pci_get_* reference on every exit path */
struct pci_dev *pdev __free(pci_dev_put) = NULL;
enum cxl_event_log_type log_type;
struct cxl_dev_state *cxlds;
unsigned int devfn;
u32 hdr_flags;
pr_debug("CPER event %d for device %u:%u:%u.%u\n", ev_type,
device_id->segment_num, device_id->bus_num,
device_id->device_num, device_id->func_num);
devfn = PCI_DEVFN(device_id->device_num, device_id->func_num);
pdev = pci_get_domain_bus_and_slot(device_id->segment_num,
device_id->bus_num, devfn);
if (!pdev)
return;
/* Device lock keeps the driver binding stable across the checks below */
guard(device)(&pdev->dev);
if (pdev->driver != &cxl_pci_driver)
return;
cxlds = pci_get_drvdata(pdev);
if (!cxlds)
return;
/* Fabricate a log type */
hdr_flags = get_unaligned_le24(rec->event.generic.hdr.flags);
log_type = FIELD_GET(CXL_EVENT_HDR_FLAGS_REC_SEVERITY, hdr_flags);
cxl_event_trace_record(cxlds->cxlmd, log_type, ev_type,
&uuid_null, &rec->event);
}
  886. static void cxl_cper_work_fn(struct work_struct *work)
  887. {
  888. struct cxl_cper_work_data wd;
  889. while (cxl_cper_kfifo_get(&wd))
  890. cxl_handle_cper_event(wd.event_type, &wd.rec);
  891. }
  892. static DECLARE_WORK(cxl_cper_work, cxl_cper_work_fn);
  893. static int __init cxl_pci_driver_init(void)
  894. {
  895. int rc;
  896. rc = pci_register_driver(&cxl_pci_driver);
  897. if (rc)
  898. return rc;
  899. rc = cxl_cper_register_work(&cxl_cper_work);
  900. if (rc)
  901. pci_unregister_driver(&cxl_pci_driver);
  902. return rc;
  903. }
/*
 * Module exit: unregister the CPER hook first so no new work is queued,
 * flush any in-flight work, then tear down the PCI driver.
 */
static void __exit cxl_pci_driver_exit(void)
{
cxl_cper_unregister_work(&cxl_cper_work);
cancel_work_sync(&cxl_cper_work);
pci_unregister_driver(&cxl_pci_driver);
}
module_init(cxl_pci_driver_init);
module_exit(cxl_pci_driver_exit);
MODULE_DESCRIPTION("CXL: PCI manageability");
MODULE_LICENSE("GPL v2");
MODULE_IMPORT_NS(CXL);