v3d_sched.c 25 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932
  1. // SPDX-License-Identifier: GPL-2.0+
  2. /* Copyright (C) 2018 Broadcom */
  3. /**
  4. * DOC: Broadcom V3D scheduling
  5. *
  6. * The shared DRM GPU scheduler is used to coordinate submitting jobs
  7. * to the hardware. Each DRM fd (roughly a client process) gets its
  8. * own scheduler entity, which will process jobs in order. The GPU
  9. * scheduler will round-robin between clients to submit the next job.
  10. *
  11. * For simplicity, and in order to keep latency low for interactive
  12. * jobs when bulk background jobs are queued up, we submit a new job
  13. * to the HW only when it has completed the last one, instead of
  14. * filling up the CT[01]Q FIFOs with jobs. Similarly, we use
  15. * drm_sched_job_add_dependency() to manage the dependency between bin and
  16. * render, instead of having the clients submit jobs using the HW's
  17. * semaphores to interlock between them.
  18. */
  19. #include <linux/sched/clock.h>
  20. #include <linux/kthread.h>
  21. #include <drm/drm_syncobj.h>
  22. #include "v3d_drv.h"
  23. #include "v3d_regs.h"
  24. #include "v3d_trace.h"
  25. #define V3D_CSD_CFG012_WG_COUNT_SHIFT 16
/* Convert an embedded drm_sched_job pointer back to the v3d_job wrapping it. */
static struct v3d_job *
to_v3d_job(struct drm_sched_job *sched_job)
{
	return container_of(sched_job, struct v3d_job, base);
}

/* The per-type job structs below embed a v3d_job ("base"), which itself
 * embeds the drm_sched_job — hence the "base.base" member path.
 */
static struct v3d_bin_job *
to_bin_job(struct drm_sched_job *sched_job)
{
	return container_of(sched_job, struct v3d_bin_job, base.base);
}

static struct v3d_render_job *
to_render_job(struct drm_sched_job *sched_job)
{
	return container_of(sched_job, struct v3d_render_job, base.base);
}

static struct v3d_tfu_job *
to_tfu_job(struct drm_sched_job *sched_job)
{
	return container_of(sched_job, struct v3d_tfu_job, base.base);
}

static struct v3d_csd_job *
to_csd_job(struct drm_sched_job *sched_job)
{
	return container_of(sched_job, struct v3d_csd_job, base.base);
}

static struct v3d_cpu_job *
to_cpu_job(struct drm_sched_job *sched_job)
{
	return container_of(sched_job, struct v3d_cpu_job, base.base);
}
/* drm_sched .free_job callback: release a finished job's resources. */
static void
v3d_sched_job_free(struct drm_sched_job *sched_job)
{
	struct v3d_job *job = to_v3d_job(sched_job);

	v3d_job_cleanup(job);
}
  62. void
  63. v3d_timestamp_query_info_free(struct v3d_timestamp_query_info *query_info,
  64. unsigned int count)
  65. {
  66. if (query_info->queries) {
  67. unsigned int i;
  68. for (i = 0; i < count; i++)
  69. drm_syncobj_put(query_info->queries[i].syncobj);
  70. kvfree(query_info->queries);
  71. }
  72. }
  73. void
  74. v3d_performance_query_info_free(struct v3d_performance_query_info *query_info,
  75. unsigned int count)
  76. {
  77. if (query_info->queries) {
  78. unsigned int i;
  79. for (i = 0; i < count; i++) {
  80. drm_syncobj_put(query_info->queries[i].syncobj);
  81. kvfree(query_info->queries[i].kperfmon_ids);
  82. }
  83. kvfree(query_info->queries);
  84. }
  85. }
/* drm_sched .free_job callback for CPU jobs: releases the timestamp- and
 * performance-query bookkeeping before the common job cleanup.
 */
static void
v3d_cpu_job_free(struct drm_sched_job *sched_job)
{
	struct v3d_cpu_job *job = to_cpu_job(sched_job);

	v3d_timestamp_query_info_free(&job->timestamp_query,
				      job->timestamp_query.count);

	v3d_performance_query_info_free(&job->performance_query,
					job->performance_query.count);

	v3d_job_cleanup(&job->base);
}
/* Switch the active performance monitor to the one the job wants (which
 * may be none). Stops the currently active perfmon when it differs from
 * the job's, then starts the job's perfmon if one is set.
 * NOTE(review): assumes v3d_perfmon_stop() tolerates a NULL active
 * perfmon — confirm against v3d_perfmon.c.
 */
static void
v3d_switch_perfmon(struct v3d_dev *v3d, struct v3d_job *job)
{
	if (job->perfmon != v3d->active_perfmon)
		v3d_perfmon_stop(v3d, v3d->active_perfmon, true);

	if (job->perfmon && v3d->active_perfmon != job->perfmon)
		v3d_perfmon_start(v3d, job->perfmon);
}
/* Record the job's start timestamp in both the per-file and the global
 * per-queue stats, under each stats struct's seqcount write lock.
 */
static void
v3d_job_start_stats(struct v3d_job *job, enum v3d_queue queue)
{
	struct v3d_dev *v3d = job->v3d;
	struct v3d_file_priv *file = job->file->driver_priv;
	struct v3d_stats *global_stats = &v3d->queue[queue].stats;
	struct v3d_stats *local_stats = &file->stats[queue];
	u64 now = local_clock();
	unsigned long flags;

	/*
	 * We only need to disable local interrupts to appease lockdep who
	 * otherwise would think v3d_job_start_stats vs v3d_stats_update has an
	 * unsafe in-irq vs no-irq-off usage problem. This is a false positive
	 * because all the locks are per queue and stats type, and all jobs are
	 * completely one at a time serialised. More specifically:
	 *
	 * 1. Locks for GPU queues are updated from interrupt handlers under a
	 *    spin lock and started here with preemption disabled.
	 *
	 * 2. Locks for CPU queues are updated from the worker with preemption
	 *    disabled and equally started here with preemption disabled.
	 *
	 * Therefore both are consistent.
	 *
	 * 3. Because next job can only be queued after the previous one has
	 *    been signaled, and locks are per queue, there is also no scope for
	 *    the start part to race with the update part.
	 */
	if (IS_ENABLED(CONFIG_LOCKDEP))
		local_irq_save(flags);
	else
		preempt_disable();

	write_seqcount_begin(&local_stats->lock);
	local_stats->start_ns = now;
	write_seqcount_end(&local_stats->lock);

	write_seqcount_begin(&global_stats->lock);
	global_stats->start_ns = now;
	write_seqcount_end(&global_stats->lock);

	if (IS_ENABLED(CONFIG_LOCKDEP))
		local_irq_restore(flags);
	else
		preempt_enable();
}
/* Fold a completed job into @stats: accumulate the elapsed busy time
 * since start_ns, bump the completion counter, and clear start_ns so
 * readers can tell no job is currently running.
 */
static void
v3d_stats_update(struct v3d_stats *stats, u64 now)
{
	write_seqcount_begin(&stats->lock);
	stats->enabled_ns += now - stats->start_ns;
	stats->jobs_completed++;
	stats->start_ns = 0;
	write_seqcount_end(&stats->lock);
}
/* Update per-file and global per-queue stats when @job completes. The
 * per-file update is skipped if the DRM file has already been closed.
 */
void
v3d_job_update_stats(struct v3d_job *job, enum v3d_queue queue)
{
	struct v3d_dev *v3d = job->v3d;
	struct v3d_file_priv *file = job->file->driver_priv;
	struct v3d_stats *global_stats = &v3d->queue[queue].stats;
	u64 now = local_clock();
	unsigned long flags;

	/* See comment in v3d_job_start_stats() */
	if (IS_ENABLED(CONFIG_LOCKDEP))
		local_irq_save(flags);
	else
		preempt_disable();

	/* Don't update the local stats if the file context has already closed */
	if (file)
		v3d_stats_update(&file->stats[queue], now);
	else
		drm_dbg(&v3d->drm, "The file descriptor was closed before job completion\n");

	v3d_stats_update(global_stats, now);

	if (IS_ENABLED(CONFIG_LOCKDEP))
		local_irq_restore(flags);
	else
		preempt_enable();
}
/* drm_sched .run_job for the bin queue: program the CT0 control-list
 * registers and kick the job, returning the fence the IRQ handler will
 * signal on completion (or NULL on dependency error / fence failure).
 */
static struct dma_fence *v3d_bin_job_run(struct drm_sched_job *sched_job)
{
	struct v3d_bin_job *job = to_bin_job(sched_job);
	struct v3d_dev *v3d = job->base.v3d;
	struct drm_device *dev = &v3d->drm;
	struct dma_fence *fence;
	unsigned long irqflags;

	/* Don't run jobs whose dependencies already failed. */
	if (unlikely(job->base.base.s_fence->finished.error))
		return NULL;

	/* Lock required around bin_job update vs
	 * v3d_overflow_mem_work().
	 */
	spin_lock_irqsave(&v3d->job_lock, irqflags);
	v3d->bin_job = job;
	/* Clear out the overflow allocation, so we don't
	 * reuse the overflow attached to a previous job.
	 */
	V3D_CORE_WRITE(0, V3D_PTB_BPOS, 0);
	spin_unlock_irqrestore(&v3d->job_lock, irqflags);

	v3d_invalidate_caches(v3d);

	fence = v3d_fence_create(v3d, V3D_BIN);
	if (IS_ERR(fence))
		return NULL;

	/* Replace any fence from a previous run of this job. */
	if (job->base.irq_fence)
		dma_fence_put(job->base.irq_fence);
	job->base.irq_fence = dma_fence_get(fence);

	trace_v3d_submit_cl(dev, false, to_v3d_fence(fence)->seqno,
			    job->start, job->end);

	v3d_job_start_stats(&job->base, V3D_BIN);
	v3d_switch_perfmon(v3d, &job->base);

	/* Set the current and end address of the control list.
	 * Writing the end register is what starts the job.
	 */
	if (job->qma) {
		V3D_CORE_WRITE(0, V3D_CLE_CT0QMA, job->qma);
		V3D_CORE_WRITE(0, V3D_CLE_CT0QMS, job->qms);
	}
	if (job->qts) {
		V3D_CORE_WRITE(0, V3D_CLE_CT0QTS,
			       V3D_CLE_CT0QTS_ENABLE |
			       job->qts);
	}
	V3D_CORE_WRITE(0, V3D_CLE_CT0QBA, job->start);
	V3D_CORE_WRITE(0, V3D_CLE_CT0QEA, job->end);

	return fence;
}
/* drm_sched .run_job for the render queue: program the CT1 control-list
 * registers and kick the job, returning the fence the IRQ handler will
 * signal on completion (or NULL on dependency error / fence failure).
 */
static struct dma_fence *v3d_render_job_run(struct drm_sched_job *sched_job)
{
	struct v3d_render_job *job = to_render_job(sched_job);
	struct v3d_dev *v3d = job->base.v3d;
	struct drm_device *dev = &v3d->drm;
	struct dma_fence *fence;

	/* Don't run jobs whose dependencies already failed. */
	if (unlikely(job->base.base.s_fence->finished.error))
		return NULL;

	v3d->render_job = job;

	/* Can we avoid this flush?  We need to be careful of
	 * scheduling, though -- imagine job0 rendering to texture and
	 * job1 reading, and them being executed as bin0, bin1,
	 * render0, render1, so that render1's flush at bin time
	 * wasn't enough.
	 */
	v3d_invalidate_caches(v3d);

	fence = v3d_fence_create(v3d, V3D_RENDER);
	if (IS_ERR(fence))
		return NULL;

	/* Replace any fence from a previous run of this job. */
	if (job->base.irq_fence)
		dma_fence_put(job->base.irq_fence);
	job->base.irq_fence = dma_fence_get(fence);

	trace_v3d_submit_cl(dev, true, to_v3d_fence(fence)->seqno,
			    job->start, job->end);

	v3d_job_start_stats(&job->base, V3D_RENDER);
	v3d_switch_perfmon(v3d, &job->base);

	/* XXX: Set the QCFG */

	/* Set the current and end address of the control list.
	 * Writing the end register is what starts the job.
	 */
	V3D_CORE_WRITE(0, V3D_CLE_CT1QBA, job->start);
	V3D_CORE_WRITE(0, V3D_CLE_CT1QEA, job->end);

	return fence;
}
/* drm_sched .run_job for the TFU (texture formatting unit) queue:
 * program the TFU registers from the submitted args; the ICFG write
 * starts the job. Returns the fence the IRQ handler signals on
 * completion (or NULL on dependency error / fence failure).
 */
static struct dma_fence *
v3d_tfu_job_run(struct drm_sched_job *sched_job)
{
	struct v3d_tfu_job *job = to_tfu_job(sched_job);
	struct v3d_dev *v3d = job->base.v3d;
	struct drm_device *dev = &v3d->drm;
	struct dma_fence *fence;

	/* Don't run jobs whose dependencies already failed. */
	if (unlikely(job->base.base.s_fence->finished.error))
		return NULL;

	v3d->tfu_job = job;

	fence = v3d_fence_create(v3d, V3D_TFU);
	if (IS_ERR(fence))
		return NULL;

	/* Replace any fence from a previous run of this job. */
	if (job->base.irq_fence)
		dma_fence_put(job->base.irq_fence);
	job->base.irq_fence = dma_fence_get(fence);

	trace_v3d_submit_tfu(dev, to_v3d_fence(fence)->seqno);

	v3d_job_start_stats(&job->base, V3D_TFU);

	V3D_WRITE(V3D_TFU_IIA(v3d->ver), job->args.iia);
	V3D_WRITE(V3D_TFU_IIS(v3d->ver), job->args.iis);
	V3D_WRITE(V3D_TFU_ICA(v3d->ver), job->args.ica);
	V3D_WRITE(V3D_TFU_IUA(v3d->ver), job->args.iua);
	V3D_WRITE(V3D_TFU_IOA(v3d->ver), job->args.ioa);
	/* V3D 7.x has an extra output config register. */
	if (v3d->ver >= 71)
		V3D_WRITE(V3D_V7_TFU_IOC, job->args.v71.ioc);
	V3D_WRITE(V3D_TFU_IOS(v3d->ver), job->args.ios);
	V3D_WRITE(V3D_TFU_COEF0(v3d->ver), job->args.coef[0]);
	/* Coefficients 1-3 are only consumed when enabled (always on 7.x). */
	if (v3d->ver >= 71 || (job->args.coef[0] & V3D_TFU_COEF0_USECOEF)) {
		V3D_WRITE(V3D_TFU_COEF1(v3d->ver), job->args.coef[1]);
		V3D_WRITE(V3D_TFU_COEF2(v3d->ver), job->args.coef[2]);
		V3D_WRITE(V3D_TFU_COEF3(v3d->ver), job->args.coef[3]);
	}
	/* ICFG kicks off the job. */
	V3D_WRITE(V3D_TFU_ICFG(v3d->ver), job->args.icfg | V3D_TFU_ICFG_IOC);

	return fence;
}
/* drm_sched .run_job for the CSD (compute shader dispatch) queue:
 * program the queued-config registers; the CFG0 write starts the job.
 * Returns the fence the IRQ handler signals on completion (or NULL on
 * dependency error / fence failure).
 */
static struct dma_fence *
v3d_csd_job_run(struct drm_sched_job *sched_job)
{
	struct v3d_csd_job *job = to_csd_job(sched_job);
	struct v3d_dev *v3d = job->base.v3d;
	struct drm_device *dev = &v3d->drm;
	struct dma_fence *fence;
	int i, csd_cfg0_reg;

	/* Don't run jobs whose dependencies already failed. */
	if (unlikely(job->base.base.s_fence->finished.error))
		return NULL;

	v3d->csd_job = job;

	v3d_invalidate_caches(v3d);

	fence = v3d_fence_create(v3d, V3D_CSD);
	if (IS_ERR(fence))
		return NULL;

	/* Replace any fence from a previous run of this job. */
	if (job->base.irq_fence)
		dma_fence_put(job->base.irq_fence);
	job->base.irq_fence = dma_fence_get(fence);

	trace_v3d_submit_csd(dev, to_v3d_fence(fence)->seqno);

	v3d_job_start_stats(&job->base, V3D_CSD);
	v3d_switch_perfmon(v3d, &job->base);

	/* CFG1..CFG6 live at consecutive 4-byte offsets after CFG0;
	 * CFG0 is written last because it kicks off the job.
	 */
	csd_cfg0_reg = V3D_CSD_QUEUED_CFG0(v3d->ver);
	for (i = 1; i <= 6; i++)
		V3D_CORE_WRITE(0, csd_cfg0_reg + 4 * i, job->args.cfg[i]);

	/* Although V3D 7.1 has an eighth configuration register, we are not
	 * using it. Therefore, make sure it remains unused.
	 *
	 * XXX: Set the CFG7 register
	 */
	if (v3d->ver >= 71)
		V3D_CORE_WRITE(0, V3D_V7_CSD_QUEUED_CFG7, 0);

	/* CFG0 write kicks off the job. */
	V3D_CORE_WRITE(0, csd_cfg0_reg, job->args.cfg[0]);

	return fence;
}
  331. static void
  332. v3d_rewrite_csd_job_wg_counts_from_indirect(struct v3d_cpu_job *job)
  333. {
  334. struct v3d_indirect_csd_info *indirect_csd = &job->indirect_csd;
  335. struct v3d_bo *bo = to_v3d_bo(job->base.bo[0]);
  336. struct v3d_bo *indirect = to_v3d_bo(indirect_csd->indirect);
  337. struct drm_v3d_submit_csd *args = &indirect_csd->job->args;
  338. struct v3d_dev *v3d = job->base.v3d;
  339. u32 num_batches, *wg_counts;
  340. v3d_get_bo_vaddr(bo);
  341. v3d_get_bo_vaddr(indirect);
  342. wg_counts = (uint32_t *)(bo->vaddr + indirect_csd->offset);
  343. if (wg_counts[0] == 0 || wg_counts[1] == 0 || wg_counts[2] == 0)
  344. return;
  345. args->cfg[0] = wg_counts[0] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
  346. args->cfg[1] = wg_counts[1] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
  347. args->cfg[2] = wg_counts[2] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
  348. num_batches = DIV_ROUND_UP(indirect_csd->wg_size, 16) *
  349. (wg_counts[0] * wg_counts[1] * wg_counts[2]);
  350. /* V3D 7.1.6 and later don't subtract 1 from the number of batches */
  351. if (v3d->ver < 71 || (v3d->ver == 71 && v3d->rev < 6))
  352. args->cfg[4] = num_batches - 1;
  353. else
  354. args->cfg[4] = num_batches;
  355. WARN_ON(args->cfg[4] == ~0);
  356. for (int i = 0; i < 3; i++) {
  357. /* 0xffffffff indicates that the uniform rewrite is not needed */
  358. if (indirect_csd->wg_uniform_offsets[i] != 0xffffffff) {
  359. u32 uniform_idx = indirect_csd->wg_uniform_offsets[i];
  360. ((uint32_t *)indirect->vaddr)[uniform_idx] = wg_counts[i];
  361. }
  362. }
  363. v3d_put_bo_vaddr(indirect);
  364. v3d_put_bo_vaddr(bo);
  365. }
/* CPU job handler for timestamp queries: write the current time into the
 * first query's slot in the BO (zero for the rest) and attach the job's
 * done fence to each query's syncobj so waiters see completion.
 */
static void
v3d_timestamp_query(struct v3d_cpu_job *job)
{
	struct v3d_timestamp_query_info *timestamp_query = &job->timestamp_query;
	struct v3d_bo *bo = to_v3d_bo(job->base.bo[0]);
	u8 *value_addr;

	v3d_get_bo_vaddr(bo);

	for (int i = 0; i < timestamp_query->count; i++) {
		value_addr = ((u8 *)bo->vaddr) + timestamp_query->queries[i].offset;
		/* Only the first query records the timestamp. */
		*((u64 *)value_addr) = i == 0 ? ktime_get_ns() : 0ull;

		drm_syncobj_replace_fence(timestamp_query->queries[i].syncobj,
					  job->base.done_fence);
	}

	v3d_put_bo_vaddr(bo);
}
/* CPU job handler for resetting timestamp queries: zero each query's
 * value in the BO and clear its syncobj fence.
 */
static void
v3d_reset_timestamp_queries(struct v3d_cpu_job *job)
{
	struct v3d_timestamp_query_info *timestamp_query = &job->timestamp_query;
	struct v3d_timestamp_query *queries = timestamp_query->queries;
	struct v3d_bo *bo = to_v3d_bo(job->base.bo[0]);
	u8 *value_addr;

	v3d_get_bo_vaddr(bo);

	for (int i = 0; i < timestamp_query->count; i++) {
		value_addr = ((u8 *)bo->vaddr) + queries[i].offset;
		*((u64 *)value_addr) = 0;

		drm_syncobj_replace_fence(queries[i].syncobj, NULL);
	}

	v3d_put_bo_vaddr(bo);
}
  396. static void write_to_buffer_32(u32 *dst, unsigned int idx, u32 value)
  397. {
  398. dst[idx] = value;
  399. }
  400. static void write_to_buffer_64(u64 *dst, unsigned int idx, u64 value)
  401. {
  402. dst[idx] = value;
  403. }
  404. static void
  405. write_to_buffer(void *dst, unsigned int idx, bool do_64bit, u64 value)
  406. {
  407. if (do_64bit)
  408. write_to_buffer_64(dst, idx, value);
  409. else
  410. write_to_buffer_32(dst, idx, value);
  411. }
/* CPU job handler for copying timestamp query results from the timestamp
 * BO (bo[1]) into the destination BO (bo[0]). A query's value is copied
 * when its syncobj fence has signaled, or unconditionally for partial
 * copies; an optional availability word follows each result.
 */
static void
v3d_copy_query_results(struct v3d_cpu_job *job)
{
	struct v3d_timestamp_query_info *timestamp_query = &job->timestamp_query;
	struct v3d_timestamp_query *queries = timestamp_query->queries;
	struct v3d_bo *bo = to_v3d_bo(job->base.bo[0]);
	struct v3d_bo *timestamp = to_v3d_bo(job->base.bo[1]);
	struct v3d_copy_query_results_info *copy = &job->copy;
	struct dma_fence *fence;
	u8 *query_addr;
	bool available, write_result;
	u8 *data;
	int i;

	v3d_get_bo_vaddr(bo);
	v3d_get_bo_vaddr(timestamp);

	data = ((u8 *)bo->vaddr) + copy->offset;

	for (i = 0; i < timestamp_query->count; i++) {
		fence = drm_syncobj_fence_get(queries[i].syncobj);
		available = fence ? dma_fence_is_signaled(fence) : false;

		write_result = available || copy->do_partial;
		if (write_result) {
			query_addr = ((u8 *)timestamp->vaddr) + queries[i].offset;
			write_to_buffer(data, 0, copy->do_64bit, *((u64 *)query_addr));
		}

		/* Availability flag goes in the element after the result. */
		if (copy->availability_bit)
			write_to_buffer(data, 1, copy->do_64bit, available ? 1u : 0u);

		data += copy->stride;

		dma_fence_put(fence);
	}

	v3d_put_bo_vaddr(timestamp);
	v3d_put_bo_vaddr(bo);
}
/* CPU job handler for resetting performance queries: stop each perfmon
 * referenced by each query, zero its counter values, and clear the
 * query's syncobj fence.
 */
static void
v3d_reset_performance_queries(struct v3d_cpu_job *job)
{
	struct v3d_performance_query_info *performance_query = &job->performance_query;
	struct v3d_file_priv *v3d_priv = job->base.file->driver_priv;
	struct v3d_dev *v3d = job->base.v3d;
	struct v3d_perfmon *perfmon;

	for (int i = 0; i < performance_query->count; i++) {
		for (int j = 0; j < performance_query->nperfmons; j++) {
			perfmon = v3d_perfmon_find(v3d_priv,
						   performance_query->queries[i].kperfmon_ids[j]);
			if (!perfmon) {
				DRM_DEBUG("Failed to find perfmon.");
				continue;
			}

			/* Don't capture the counters into perfmon->values. */
			v3d_perfmon_stop(v3d, perfmon, false);

			memset(perfmon->values, 0, perfmon->ncounters * sizeof(u64));

			v3d_perfmon_put(perfmon);
		}

		drm_syncobj_replace_fence(performance_query->queries[i].syncobj, NULL);
	}
}
/* Write the counter values of performance query @query into @data: stop
 * each of the query's perfmons (capturing its counters) and copy the
 * values out, each perfmon's counters occupying a fixed-size
 * DRM_V3D_MAX_PERF_COUNTERS slot, at the element width selected by the
 * job's copy info.
 */
static void
v3d_write_performance_query_result(struct v3d_cpu_job *job, void *data,
				   unsigned int query)
{
	struct v3d_performance_query_info *performance_query =
						&job->performance_query;
	struct v3d_file_priv *v3d_priv = job->base.file->driver_priv;
	struct v3d_performance_query *perf_query =
						&performance_query->queries[query];
	struct v3d_dev *v3d = job->base.v3d;
	unsigned int i, j, offset;

	for (i = 0, offset = 0;
	     i < performance_query->nperfmons;
	     i++, offset += DRM_V3D_MAX_PERF_COUNTERS) {
		struct v3d_perfmon *perfmon;

		perfmon = v3d_perfmon_find(v3d_priv,
					   perf_query->kperfmon_ids[i]);
		if (!perfmon) {
			DRM_DEBUG("Failed to find perfmon.");
			continue;
		}

		/* Stop with capture so perfmon->values holds the counters. */
		v3d_perfmon_stop(v3d, perfmon, true);

		if (job->copy.do_64bit) {
			for (j = 0; j < perfmon->ncounters; j++)
				write_to_buffer_64(data, offset + j,
						   perfmon->values[j]);
		} else {
			for (j = 0; j < perfmon->ncounters; j++)
				write_to_buffer_32(data, offset + j,
						   perfmon->values[j]);
		}

		v3d_perfmon_put(perfmon);
	}
}
/* CPU job handler for copying performance query results into the
 * destination BO. A query's counters are copied when its syncobj fence
 * has signaled, or unconditionally for partial copies; an optional
 * availability word is written after the counter block.
 */
static void
v3d_copy_performance_query(struct v3d_cpu_job *job)
{
	struct v3d_performance_query_info *performance_query = &job->performance_query;
	struct v3d_copy_query_results_info *copy = &job->copy;
	struct v3d_bo *bo = to_v3d_bo(job->base.bo[0]);
	struct dma_fence *fence;
	bool available, write_result;
	u8 *data;

	v3d_get_bo_vaddr(bo);

	data = ((u8 *)bo->vaddr) + copy->offset;

	for (int i = 0; i < performance_query->count; i++) {
		fence = drm_syncobj_fence_get(performance_query->queries[i].syncobj);
		available = fence ? dma_fence_is_signaled(fence) : false;

		write_result = available || copy->do_partial;
		if (write_result)
			v3d_write_performance_query_result(job, data, i);

		if (copy->availability_bit)
			write_to_buffer(data, performance_query->ncounters,
					copy->do_64bit, available ? 1u : 0u);

		data += copy->stride;

		dma_fence_put(fence);
	}

	v3d_put_bo_vaddr(bo);
}
/* Dispatch table mapping a CPU job type to its handler; indexed by
 * job->job_type in v3d_cpu_job_run().
 */
static const v3d_cpu_job_fn cpu_job_function[] = {
	[V3D_CPU_JOB_TYPE_INDIRECT_CSD] = v3d_rewrite_csd_job_wg_counts_from_indirect,
	[V3D_CPU_JOB_TYPE_TIMESTAMP_QUERY] = v3d_timestamp_query,
	[V3D_CPU_JOB_TYPE_RESET_TIMESTAMP_QUERY] = v3d_reset_timestamp_queries,
	[V3D_CPU_JOB_TYPE_COPY_TIMESTAMP_QUERY] = v3d_copy_query_results,
	[V3D_CPU_JOB_TYPE_RESET_PERFORMANCE_QUERY] = v3d_reset_performance_queries,
	[V3D_CPU_JOB_TYPE_COPY_PERFORMANCE_QUERY] = v3d_copy_performance_query,
};
/* drm_sched .run_job for the CPU queue: dispatch to the handler for the
 * job's type. CPU jobs complete synchronously, so no fence is returned.
 */
static struct dma_fence *
v3d_cpu_job_run(struct drm_sched_job *sched_job)
{
	struct v3d_cpu_job *job = to_cpu_job(sched_job);
	struct v3d_dev *v3d = job->base.v3d;

	v3d->cpu_job = job;

	if (job->job_type >= ARRAY_SIZE(cpu_job_function)) {
		DRM_DEBUG_DRIVER("Unknown CPU job: %d\n", job->job_type);
		return NULL;
	}

	v3d_job_start_stats(&job->base, V3D_CPU);
	trace_v3d_cpu_job_begin(&v3d->drm, job->job_type);

	cpu_job_function[job->job_type](job);

	trace_v3d_cpu_job_end(&v3d->drm, job->job_type);
	v3d_job_update_stats(&job->base, V3D_CPU);

	return NULL;
}
  550. static struct dma_fence *
  551. v3d_cache_clean_job_run(struct drm_sched_job *sched_job)
  552. {
  553. struct v3d_job *job = to_v3d_job(sched_job);
  554. struct v3d_dev *v3d = job->v3d;
  555. v3d_job_start_stats(job, V3D_CACHE_CLEAN);
  556. v3d_clean_caches(v3d);
  557. v3d_job_update_stats(job, V3D_CACHE_CLEAN);
  558. return NULL;
  559. }
/* Full GPU reset on job timeout: stop all queue schedulers, bump the
 * guilty job's karma, reset the hardware, then resubmit and restart.
 * Serialised by reset_lock against concurrent timeouts on other queues.
 */
static enum drm_gpu_sched_stat
v3d_gpu_reset_for_timeout(struct v3d_dev *v3d, struct drm_sched_job *sched_job)
{
	enum v3d_queue q;

	mutex_lock(&v3d->reset_lock);

	/* block scheduler */
	for (q = 0; q < V3D_MAX_QUEUES; q++)
		drm_sched_stop(&v3d->queue[q].sched, sched_job);

	if (sched_job)
		drm_sched_increase_karma(sched_job);

	/* get the GPU back into the init state */
	v3d_reset(v3d);

	for (q = 0; q < V3D_MAX_QUEUES; q++)
		drm_sched_resubmit_jobs(&v3d->queue[q].sched);

	/* Unblock schedulers and restart their jobs. */
	for (q = 0; q < V3D_MAX_QUEUES; q++) {
		drm_sched_start(&v3d->queue[q].sched);
	}

	mutex_unlock(&v3d->reset_lock);

	return DRM_GPU_SCHED_STAT_NOMINAL;
}
/* Re-queue a timed-out job on the scheduler's pending list instead of
 * resetting: the scheduler removed it before calling timedout_job, so
 * putting it back lets the timeout timer re-arm and the job keep
 * running.
 */
static void
v3d_sched_skip_reset(struct drm_sched_job *sched_job)
{
	struct drm_gpu_scheduler *sched = sched_job->sched;

	spin_lock(&sched->job_list_lock);
	list_add(&sched_job->list, &sched->pending_list);
	spin_unlock(&sched->job_list_lock);
}
/* Common timeout handling for the bin/render control-list queues:
 * compare the CL current/return addresses against the values captured at
 * the previous timeout; if they moved, the job is making progress and
 * the reset is skipped.
 */
static enum drm_gpu_sched_stat
v3d_cl_job_timedout(struct drm_sched_job *sched_job, enum v3d_queue q,
		    u32 *timedout_ctca, u32 *timedout_ctra)
{
	struct v3d_job *job = to_v3d_job(sched_job);
	struct v3d_dev *v3d = job->v3d;
	u32 ctca = V3D_CORE_READ(0, V3D_CLE_CTNCA(q));
	u32 ctra = V3D_CORE_READ(0, V3D_CLE_CTNRA(q));

	/* If the current address or return address have changed, then the GPU
	 * has probably made progress and we should delay the reset.  This
	 * could fail if the GPU got in an infinite loop in the CL, but that
	 * is pretty unlikely outside of an i-g-t testcase.
	 */
	if (*timedout_ctca != ctca || *timedout_ctra != ctra) {
		*timedout_ctca = ctca;
		*timedout_ctra = ctra;

		v3d_sched_skip_reset(sched_job);
		return DRM_GPU_SCHED_STAT_NOMINAL;
	}

	return v3d_gpu_reset_for_timeout(v3d, sched_job);
}
/* Per-queue timedout_job callbacks. Bin and render track CL progress via
 * their per-job saved register values; the generic handler always
 * resets.
 */
static enum drm_gpu_sched_stat
v3d_bin_job_timedout(struct drm_sched_job *sched_job)
{
	struct v3d_bin_job *job = to_bin_job(sched_job);

	return v3d_cl_job_timedout(sched_job, V3D_BIN,
				   &job->timedout_ctca, &job->timedout_ctra);
}

static enum drm_gpu_sched_stat
v3d_render_job_timedout(struct drm_sched_job *sched_job)
{
	struct v3d_render_job *job = to_render_job(sched_job);

	return v3d_cl_job_timedout(sched_job, V3D_RENDER,
				   &job->timedout_ctca, &job->timedout_ctra);
}

static enum drm_gpu_sched_stat
v3d_generic_job_timedout(struct drm_sched_job *sched_job)
{
	struct v3d_job *job = to_v3d_job(sched_job);

	return v3d_gpu_reset_for_timeout(job->v3d, sched_job);
}
/* CSD timeout handler: use the current batch count as the progress
 * indicator — if it changed since the last timeout, skip the reset and
 * let the timer re-arm.
 */
static enum drm_gpu_sched_stat
v3d_csd_job_timedout(struct drm_sched_job *sched_job)
{
	struct v3d_csd_job *job = to_csd_job(sched_job);
	struct v3d_dev *v3d = job->base.v3d;
	u32 batches = V3D_CORE_READ(0, V3D_CSD_CURRENT_CFG4(v3d->ver));

	/* If we've made progress, skip reset, add the job to the pending
	 * list, and let the timer get rearmed.
	 */
	if (job->timedout_batches != batches) {
		job->timedout_batches = batches;

		v3d_sched_skip_reset(sched_job);
		return DRM_GPU_SCHED_STAT_NOMINAL;
	}

	return v3d_gpu_reset_for_timeout(v3d, sched_job);
}
  646. static const struct drm_sched_backend_ops v3d_bin_sched_ops = {
  647. .run_job = v3d_bin_job_run,
  648. .timedout_job = v3d_bin_job_timedout,
  649. .free_job = v3d_sched_job_free,
  650. };
  651. static const struct drm_sched_backend_ops v3d_render_sched_ops = {
  652. .run_job = v3d_render_job_run,
  653. .timedout_job = v3d_render_job_timedout,
  654. .free_job = v3d_sched_job_free,
  655. };
  656. static const struct drm_sched_backend_ops v3d_tfu_sched_ops = {
  657. .run_job = v3d_tfu_job_run,
  658. .timedout_job = v3d_generic_job_timedout,
  659. .free_job = v3d_sched_job_free,
  660. };
  661. static const struct drm_sched_backend_ops v3d_csd_sched_ops = {
  662. .run_job = v3d_csd_job_run,
  663. .timedout_job = v3d_csd_job_timedout,
  664. .free_job = v3d_sched_job_free
  665. };
  666. static const struct drm_sched_backend_ops v3d_cache_clean_sched_ops = {
  667. .run_job = v3d_cache_clean_job_run,
  668. .timedout_job = v3d_generic_job_timedout,
  669. .free_job = v3d_sched_job_free
  670. };
  671. static const struct drm_sched_backend_ops v3d_cpu_sched_ops = {
  672. .run_job = v3d_cpu_job_run,
  673. .timedout_job = v3d_generic_job_timedout,
  674. .free_job = v3d_cpu_job_free
  675. };
  676. int
  677. v3d_sched_init(struct v3d_dev *v3d)
  678. {
  679. int hw_jobs_limit = 1;
  680. int job_hang_limit = 0;
  681. int hang_limit_ms = 500;
  682. int ret;
  683. ret = drm_sched_init(&v3d->queue[V3D_BIN].sched,
  684. &v3d_bin_sched_ops, NULL,
  685. DRM_SCHED_PRIORITY_COUNT,
  686. hw_jobs_limit, job_hang_limit,
  687. msecs_to_jiffies(hang_limit_ms), NULL,
  688. NULL, "v3d_bin", v3d->drm.dev);
  689. if (ret)
  690. return ret;
  691. ret = drm_sched_init(&v3d->queue[V3D_RENDER].sched,
  692. &v3d_render_sched_ops, NULL,
  693. DRM_SCHED_PRIORITY_COUNT,
  694. hw_jobs_limit, job_hang_limit,
  695. msecs_to_jiffies(hang_limit_ms), NULL,
  696. NULL, "v3d_render", v3d->drm.dev);
  697. if (ret)
  698. goto fail;
  699. ret = drm_sched_init(&v3d->queue[V3D_TFU].sched,
  700. &v3d_tfu_sched_ops, NULL,
  701. DRM_SCHED_PRIORITY_COUNT,
  702. hw_jobs_limit, job_hang_limit,
  703. msecs_to_jiffies(hang_limit_ms), NULL,
  704. NULL, "v3d_tfu", v3d->drm.dev);
  705. if (ret)
  706. goto fail;
  707. if (v3d_has_csd(v3d)) {
  708. ret = drm_sched_init(&v3d->queue[V3D_CSD].sched,
  709. &v3d_csd_sched_ops, NULL,
  710. DRM_SCHED_PRIORITY_COUNT,
  711. hw_jobs_limit, job_hang_limit,
  712. msecs_to_jiffies(hang_limit_ms), NULL,
  713. NULL, "v3d_csd", v3d->drm.dev);
  714. if (ret)
  715. goto fail;
  716. ret = drm_sched_init(&v3d->queue[V3D_CACHE_CLEAN].sched,
  717. &v3d_cache_clean_sched_ops, NULL,
  718. DRM_SCHED_PRIORITY_COUNT,
  719. hw_jobs_limit, job_hang_limit,
  720. msecs_to_jiffies(hang_limit_ms), NULL,
  721. NULL, "v3d_cache_clean", v3d->drm.dev);
  722. if (ret)
  723. goto fail;
  724. }
  725. ret = drm_sched_init(&v3d->queue[V3D_CPU].sched,
  726. &v3d_cpu_sched_ops, NULL,
  727. DRM_SCHED_PRIORITY_COUNT,
  728. 1, job_hang_limit,
  729. msecs_to_jiffies(hang_limit_ms), NULL,
  730. NULL, "v3d_cpu", v3d->drm.dev);
  731. if (ret)
  732. goto fail;
  733. return 0;
  734. fail:
  735. v3d_sched_fini(v3d);
  736. return ret;
  737. }
  738. void
  739. v3d_sched_fini(struct v3d_dev *v3d)
  740. {
  741. enum v3d_queue q;
  742. for (q = 0; q < V3D_MAX_QUEUES; q++) {
  743. if (v3d->queue[q].sched.ready)
  744. drm_sched_fini(&v3d->queue[q].sched);
  745. }
  746. }