// SPDX-License-Identifier: GPL-2.0-only
/*
 * RDMA resource limiting controller for cgroups.
 *
 * Used to allow a cgroup hierarchy to stop processes from consuming
 * additional RDMA resources after a certain limit is reached.
 *
 * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
 */

#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/cgroup.h>
#include <linux/parser.h>
#include <linux/cgroup_rdma.h>

#define RDMACG_MAX_STR "max"

/*
 * Protects the list of resource pools maintained on a per-cgroup basis
 * and the rdma device list.
 */
static DEFINE_MUTEX(rdmacg_mutex);
static LIST_HEAD(rdmacg_devices);

enum rdmacg_file_type {
	RDMACG_RESOURCE_TYPE_MAX,
	RDMACG_RESOURCE_TYPE_STAT,
};

/*
 * Resource table definition as seen by the user. Add entries to it
 * when more resources are defined at the IB verb/core layer.
 */
static char const *rdmacg_resource_names[] = {
	[RDMACG_RESOURCE_HCA_HANDLE]	= "hca_handle",
	[RDMACG_RESOURCE_HCA_OBJECT]	= "hca_object",
};

/* resource tracker for each resource of rdma cgroup */
struct rdmacg_resource {
	int max;
	int usage;
};
/*
 * Resource pool object which represents, per cgroup and per device, the
 * tracked resources. There are multiple instances of this object per
 * cgroup, so it cannot be embedded within the rdma_cgroup structure; it
 * is maintained as a list.
 */
struct rdmacg_resource_pool {
	struct rdmacg_device	*device;
	struct rdmacg_resource	resources[RDMACG_RESOURCE_MAX];

	struct list_head	cg_node;
	struct list_head	dev_node;

	/* count of active charges (resources in use) against this pool */
	u64			usage_sum;
	/* total number of resources whose limit is set to max */
	int			num_max_cnt;
};

static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css)
{
	return container_of(css, struct rdma_cgroup, css);
}

static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg)
{
	return css_rdmacg(cg->css.parent);
}

static inline struct rdma_cgroup *get_current_rdmacg(void)
{
	return css_rdmacg(task_get_css(current, rdma_cgrp_id));
}
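
/*
 * Limit bookkeeping: num_max_cnt counts how many resources of a pool are
 * at the default limit (S32_MAX).  When every limit is at max and
 * usage_sum is zero, the pool carries no state; both the uncharge path
 * and the limit-setting path rely on this to decide when an rpool can be
 * freed.
 */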
static void set_resource_limit(struct rdmacg_resource_pool *rpool,
			       int index, int new_max)
{
	if (new_max == S32_MAX) {
		if (rpool->resources[index].max != S32_MAX)
			rpool->num_max_cnt++;
	} else {
		if (rpool->resources[index].max == S32_MAX)
			rpool->num_max_cnt--;
	}
	rpool->resources[index].max = new_max;
}

static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool)
{
	int i;

	for (i = 0; i < RDMACG_RESOURCE_MAX; i++)
		set_resource_limit(rpool, i, S32_MAX);
}

static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool)
{
	lockdep_assert_held(&rdmacg_mutex);

	list_del(&rpool->cg_node);
	list_del(&rpool->dev_node);
	kfree(rpool);
}

static struct rdmacg_resource_pool *
find_cg_rpool_locked(struct rdma_cgroup *cg,
		     struct rdmacg_device *device)
{
	struct rdmacg_resource_pool *pool;

	lockdep_assert_held(&rdmacg_mutex);

	list_for_each_entry(pool, &cg->rpools, cg_node)
		if (pool->device == device)
			return pool;

	return NULL;
}

static struct rdmacg_resource_pool *
get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device)
{
	struct rdmacg_resource_pool *rpool;

	rpool = find_cg_rpool_locked(cg, device);
	if (rpool)
		return rpool;

	rpool = kzalloc(sizeof(*rpool), GFP_KERNEL);
	if (!rpool)
		return ERR_PTR(-ENOMEM);

	rpool->device = device;
	set_all_resource_max_limit(rpool);

	INIT_LIST_HEAD(&rpool->cg_node);
	INIT_LIST_HEAD(&rpool->dev_node);
	list_add_tail(&rpool->cg_node, &cg->rpools);
	list_add_tail(&rpool->dev_node, &device->rpools);
	return rpool;
}
/**
 * uncharge_cg_locked - uncharge resource for rdma cgroup
 * @cg: pointer to cg to uncharge and all parents in hierarchy
 * @device: pointer to rdmacg device
 * @index: index of the resource to uncharge in cg (resource pool)
 *
 * It also frees the resource pool which was created as part of the
 * charging operation when there are no resources attached to the
 * resource pool.
 */
static void
uncharge_cg_locked(struct rdma_cgroup *cg,
		   struct rdmacg_device *device,
		   enum rdmacg_resource_type index)
{
	struct rdmacg_resource_pool *rpool;

	rpool = find_cg_rpool_locked(cg, device);

	/*
	 * rpool cannot be NULL at this stage. Let the kernel operate in
	 * case there is a bug in the IB stack or the rdma controller,
	 * instead of crashing the system.
	 */
	if (unlikely(!rpool)) {
		pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device);
		return;
	}

	rpool->resources[index].usage--;

	/*
	 * A negative count (or overflow) is invalid,
	 * it indicates a bug in the rdma controller.
	 */
	WARN_ON_ONCE(rpool->resources[index].usage < 0);
	rpool->usage_sum--;
	if (rpool->usage_sum == 0 &&
	    rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
		/*
		 * No user of the rpool and all entries are set to max, so
		 * it is safe to delete this rpool.
		 */
		free_cg_rpool_locked(rpool);
	}
}
/**
 * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count
 * @cg: pointer to cg to uncharge and all parents in hierarchy
 * @device: pointer to rdmacg device
 * @stop_cg: while traversing the hierarchy, stop uncharging when the
 *           stop_cg cgroup is reached
 * @index: index of the resource to uncharge in cg in given resource pool
 */
static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg,
				      struct rdmacg_device *device,
				      struct rdma_cgroup *stop_cg,
				      enum rdmacg_resource_type index)
{
	struct rdma_cgroup *p;

	mutex_lock(&rdmacg_mutex);

	for (p = cg; p != stop_cg; p = parent_rdmacg(p))
		uncharge_cg_locked(p, device, index);

	mutex_unlock(&rdmacg_mutex);

	css_put(&cg->css);
}

/**
 * rdmacg_uncharge - hierarchically uncharge rdma resource count
 * @cg: pointer to cg to uncharge and all parents in hierarchy
 * @device: pointer to rdmacg device
 * @index: index of the resource to uncharge in cgroup in given resource pool
 */
void rdmacg_uncharge(struct rdma_cgroup *cg,
		     struct rdmacg_device *device,
		     enum rdmacg_resource_type index)
{
	if (index >= RDMACG_RESOURCE_MAX)
		return;

	rdmacg_uncharge_hierarchy(cg, device, NULL, index);
}
EXPORT_SYMBOL(rdmacg_uncharge);
/**
 * rdmacg_try_charge - hierarchically try to charge the rdma resource
 * @rdmacg: pointer to rdma cgroup which will own this resource
 * @device: pointer to rdmacg device
 * @index: index of the resource to charge in cgroup (resource pool)
 *
 * This function charges the resource hierarchically. It fails if the
 * charge would cause the new value to exceed the hierarchical limit.
 * Returns 0 if the charge succeeded, otherwise -EAGAIN, -ENOMEM or -EINVAL.
 * On success, *@rdmacg points to the rdma cgroup that owns the charge.
 *
 * The charger needs to account resources on two criteria:
 * (a) per cgroup and (b) per device resource usage.
 * Per cgroup resource usage ensures that the tasks of a cgroup don't cross
 * the configured limits. Per device accounting provides granular
 * configuration in multi-device usage. A resource pool is allocated in the
 * hierarchy for each parent the first time a resource is charged; after
 * that the pool already exists, so subsequent charge/uncharge operations
 * are much faster.
 */
int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
		      struct rdmacg_device *device,
		      enum rdmacg_resource_type index)
{
	struct rdma_cgroup *cg, *p;
	struct rdmacg_resource_pool *rpool;
	s64 new;
	int ret = 0;

	if (index >= RDMACG_RESOURCE_MAX)
		return -EINVAL;

	/*
	 * Hold on to the css, as the cgroup can be removed but resource
	 * accounting happens on the css.
	 */
	cg = get_current_rdmacg();

	mutex_lock(&rdmacg_mutex);
	for (p = cg; p; p = parent_rdmacg(p)) {
		rpool = get_cg_rpool_locked(p, device);
		if (IS_ERR(rpool)) {
			ret = PTR_ERR(rpool);
			goto err;
		} else {
			new = rpool->resources[index].usage + 1;
			if (new > rpool->resources[index].max) {
				ret = -EAGAIN;
				goto err;
			} else {
				rpool->resources[index].usage = new;
				rpool->usage_sum++;
			}
		}
	}
	mutex_unlock(&rdmacg_mutex);

	*rdmacg = cg;
	return 0;

err:
	mutex_unlock(&rdmacg_mutex);
	rdmacg_uncharge_hierarchy(cg, device, p, index);
	return ret;
}
EXPORT_SYMBOL(rdmacg_try_charge);
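
/*
 * Illustrative use of the charge/uncharge pair from a resource consumer
 * such as the IB core.  A minimal sketch only: the caller-side names
 * ("ib_dev", "cg_device", "obj->owning_cg") are assumptions and not
 * defined in this file; only rdmacg_try_charge(), rdmacg_uncharge() and
 * RDMACG_RESOURCE_HCA_OBJECT come from the rdma controller:
 *
 *	struct rdma_cgroup *cg;
 *	int ret;
 *
 *	ret = rdmacg_try_charge(&cg, &ib_dev->cg_device,
 *				RDMACG_RESOURCE_HCA_OBJECT);
 *	if (ret)
 *		return ret;		// over limit (-EAGAIN) or no memory
 *	obj->owning_cg = cg;		// remember which cgroup was charged
 *	...
 *	// on object destruction, release the charge against the same
 *	// cgroup and device
 *	rdmacg_uncharge(obj->owning_cg, &ib_dev->cg_device,
 *			RDMACG_RESOURCE_HCA_OBJECT);
 */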
/**
 * rdmacg_register_device - register rdmacg device to rdma controller.
 * @device: pointer to rdmacg device whose resources need to be accounted.
 *
 * If the IB stack wishes a device to participate in rdma cgroup resource
 * tracking, it must invoke this API to register with the rdma cgroup before
 * any user space application can start using the RDMA resources.
 */
void rdmacg_register_device(struct rdmacg_device *device)
{
	INIT_LIST_HEAD(&device->dev_node);
	INIT_LIST_HEAD(&device->rpools);

	mutex_lock(&rdmacg_mutex);
	list_add_tail(&device->dev_node, &rdmacg_devices);
	mutex_unlock(&rdmacg_mutex);
}
EXPORT_SYMBOL(rdmacg_register_device);
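
/*
 * Typical device lifecycle with respect to this controller, as a sketch
 * ("ib_dev" and its embedded "cg_device" member are assumptions about the
 * caller, not defined in this file):
 *
 *	// at device setup: device->name must hold the string user space
 *	// will use in rdma.max/rdma.current, then register the device
 *	// before any RDMA resources can be created
 *	rdmacg_register_device(&ib_dev->cg_device);
 *	...
 *	// at teardown: destroy all resources of the device first, make
 *	// sure no new charges can be issued, then unregister
 *	rdmacg_unregister_device(&ib_dev->cg_device);
 */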
/**
 * rdmacg_unregister_device - unregister rdmacg device from rdma controller.
 * @device: pointer to rdmacg device which was previously registered with the
 * rdma controller using rdmacg_register_device().
 *
 * The IB stack must invoke this after all the resources of the IB device
 * are destroyed and after ensuring that no more resources will be created
 * when this API is invoked.
 */
void rdmacg_unregister_device(struct rdmacg_device *device)
{
	struct rdmacg_resource_pool *rpool, *tmp;

	/*
	 * Synchronize with any active resource settings and usage queries
	 * happening via the cgroup filesystem.
	 */
	mutex_lock(&rdmacg_mutex);
	list_del_init(&device->dev_node);

	/*
	 * Now that this device is off the cgroup list, it's safe to free
	 * all the rpool resources.
	 */
	list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node)
		free_cg_rpool_locked(rpool);

	mutex_unlock(&rdmacg_mutex);
}
EXPORT_SYMBOL(rdmacg_unregister_device);
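
/*
 * parse_resource() consumes one "name=value" token, where "name" must be
 * one of rdmacg_resource_names[] and "value" is either a non-negative
 * integer or the keyword "max" (mapped to S32_MAX, i.e. no limit).
 * Returns the resource index on success or a negative errno.
 */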
static int parse_resource(char *c, int *intval)
{
	substring_t argstr;
	char *name, *value = c;
	size_t len;
	int ret, i;

	name = strsep(&value, "=");
	if (!name || !value)
		return -EINVAL;

	i = match_string(rdmacg_resource_names, RDMACG_RESOURCE_MAX, name);
	if (i < 0)
		return i;

	len = strlen(value);

	argstr.from = value;
	argstr.to = value + len;

	ret = match_int(&argstr, intval);
	if (ret >= 0) {
		if (*intval < 0)
			return -EINVAL;
		return i;
	}
	if (strncmp(value, RDMACG_MAX_STR, len) == 0) {
		*intval = S32_MAX;
		return i;
	}
	return -EINVAL;
}

static int rdmacg_parse_limits(char *options,
			       int *new_limits, unsigned long *enables)
{
	char *c;
	int err = -EINVAL;

	/* parse resource options */
	while ((c = strsep(&options, " ")) != NULL) {
		int index, intval;

		index = parse_resource(c, &intval);
		if (index < 0)
			goto err;

		new_limits[index] = intval;
		*enables |= BIT(index);
	}
	return 0;

err:
	return err;
}

static struct rdmacg_device *rdmacg_get_device_locked(const char *name)
{
	struct rdmacg_device *device;

	lockdep_assert_held(&rdmacg_mutex);

	list_for_each_entry(device, &rdmacg_devices, dev_node)
		if (!strcmp(name, device->name))
			return device;

	return NULL;
}
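
/*
 * Writer for the "rdma.max" file.  The expected input is a device name
 * followed by space-separated "<resource>=<value|max>" tokens, for
 * example (with an illustrative device name):
 *
 *	mlx4_0 hca_handle=2 hca_object=max
 *
 * Only the resources named in the write are updated; setting a resource
 * to "max" restores the default (unlimited) value.
 */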
static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
				       char *buf, size_t nbytes, loff_t off)
{
	struct rdma_cgroup *cg = css_rdmacg(of_css(of));
	const char *dev_name;
	struct rdmacg_resource_pool *rpool;
	struct rdmacg_device *device;
	char *options = strstrip(buf);
	int *new_limits;
	unsigned long enables = 0;
	int i = 0, ret = 0;

	/* extract the device name first */
	dev_name = strsep(&options, " ");
	if (!dev_name) {
		ret = -EINVAL;
		goto err;
	}

	new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL);
	if (!new_limits) {
		ret = -ENOMEM;
		goto err;
	}

	ret = rdmacg_parse_limits(options, new_limits, &enables);
	if (ret)
		goto parse_err;

	/* acquire lock to synchronize with hot plug devices */
	mutex_lock(&rdmacg_mutex);

	device = rdmacg_get_device_locked(dev_name);
	if (!device) {
		ret = -ENODEV;
		goto dev_err;
	}

	rpool = get_cg_rpool_locked(cg, device);
	if (IS_ERR(rpool)) {
		ret = PTR_ERR(rpool);
		goto dev_err;
	}

	/* now set the new limits of the rpool */
	for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX)
		set_resource_limit(rpool, i, new_limits[i]);

	if (rpool->usage_sum == 0 &&
	    rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
		/*
		 * No user of the rpool and all entries are set to max, so
		 * it is safe to delete this rpool.
		 */
		free_cg_rpool_locked(rpool);
	}

dev_err:
	mutex_unlock(&rdmacg_mutex);

parse_err:
	kfree(new_limits);

err:
	return ret ?: nbytes;
}
static void print_rpool_values(struct seq_file *sf,
			       struct rdmacg_resource_pool *rpool)
{
	enum rdmacg_file_type sf_type;
	int i;
	u32 value;

	sf_type = seq_cft(sf)->private;

	for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
		seq_puts(sf, rdmacg_resource_names[i]);
		seq_putc(sf, '=');
		if (sf_type == RDMACG_RESOURCE_TYPE_MAX) {
			if (rpool)
				value = rpool->resources[i].max;
			else
				value = S32_MAX;
		} else {
			if (rpool)
				value = rpool->resources[i].usage;
			else
				value = 0;
		}

		if (value == S32_MAX)
			seq_puts(sf, RDMACG_MAX_STR);
		else
			seq_printf(sf, "%d", value);
		seq_putc(sf, ' ');
	}
}
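
/*
 * Reader shared by "rdma.max" and "rdma.current" (distinguished via the
 * cftype ->private field).  It emits one line per registered device, for
 * example (device names and values are illustrative):
 *
 *	mlx4_0 hca_handle=2 hca_object=2000
 *	ocrdma1 hca_handle=3 hca_object=max
 *
 * Devices without a resource pool in this cgroup report the defaults:
 * "max" for limits and 0 for usage.
 */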
static int rdmacg_resource_read(struct seq_file *sf, void *v)
{
	struct rdmacg_device *device;
	struct rdmacg_resource_pool *rpool;
	struct rdma_cgroup *cg = css_rdmacg(seq_css(sf));

	mutex_lock(&rdmacg_mutex);

	list_for_each_entry(device, &rdmacg_devices, dev_node) {
		seq_printf(sf, "%s ", device->name);

		rpool = find_cg_rpool_locked(cg, device);
		print_rpool_values(sf, rpool);

		seq_putc(sf, '\n');
	}

	mutex_unlock(&rdmacg_mutex);
	return 0;
}
static struct cftype rdmacg_files[] = {
	{
		.name = "max",
		.write = rdmacg_resource_set_max,
		.seq_show = rdmacg_resource_read,
		.private = RDMACG_RESOURCE_TYPE_MAX,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "current",
		.seq_show = rdmacg_resource_read,
		.private = RDMACG_RESOURCE_TYPE_STAT,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{ }	/* terminate */
};

static struct cgroup_subsys_state *
rdmacg_css_alloc(struct cgroup_subsys_state *parent)
{
	struct rdma_cgroup *cg;

	cg = kzalloc(sizeof(*cg), GFP_KERNEL);
	if (!cg)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD(&cg->rpools);
	return &cg->css;
}

static void rdmacg_css_free(struct cgroup_subsys_state *css)
{
	struct rdma_cgroup *cg = css_rdmacg(css);

	kfree(cg);
}

/**
 * rdmacg_css_offline - cgroup css_offline callback
 * @css: css of interest
 *
 * This function is called when @css is about to go away and is responsible
 * for shooting down all rdmacg state associated with @css. As part of that
 * it sets all the resource pool limits to the max value, so that when
 * resources are uncharged, the associated resource pool can be freed as
 * well.
 */
static void rdmacg_css_offline(struct cgroup_subsys_state *css)
{
	struct rdma_cgroup *cg = css_rdmacg(css);
	struct rdmacg_resource_pool *rpool;

	mutex_lock(&rdmacg_mutex);

	list_for_each_entry(rpool, &cg->rpools, cg_node)
		set_all_resource_max_limit(rpool);

	mutex_unlock(&rdmacg_mutex);
}

struct cgroup_subsys rdma_cgrp_subsys = {
	.css_alloc	= rdmacg_css_alloc,
	.css_free	= rdmacg_css_free,
	.css_offline	= rdmacg_css_offline,
	.legacy_cftypes	= rdmacg_files,
	.dfl_cftypes	= rdmacg_files,
};