hv-24x7.c

  1. /*
  2. * Hypervisor supplied "24x7" performance counter support
  3. *
  4. * Author: Cody P Schafer <cody@linux.vnet.ibm.com>
  5. * Copyright 2014 IBM Corporation.
  6. *
  7. * This program is free software; you can redistribute it and/or
  8. * modify it under the terms of the GNU General Public License
  9. * as published by the Free Software Foundation; either version
  10. * 2 of the License, or (at your option) any later version.
  11. */
  12. #define pr_fmt(fmt) "hv-24x7: " fmt
  13. #include <linux/perf_event.h>
  14. #include <linux/rbtree.h>
  15. #include <linux/module.h>
  16. #include <linux/slab.h>
  17. #include <linux/vmalloc.h>
  18. #include <asm/cputhreads.h>
  19. #include <asm/firmware.h>
  20. #include <asm/hvcall.h>
  21. #include <asm/io.h>
  22. #include <linux/byteorder/generic.h>
  23. #include "hv-24x7.h"
  24. #include "hv-24x7-catalog.h"
  25. #include "hv-common.h"
  26. /* Version of the 24x7 hypervisor API that we should use in this machine. */
  27. static int interface_version;
  28. /* Whether we have to aggregate result data for some domains. */
  29. static bool aggregate_result_elements;
  30. static bool domain_is_valid(unsigned domain)
  31. {
  32. switch (domain) {
  33. #define DOMAIN(n, v, x, c) \
  34. case HV_PERF_DOMAIN_##n: \
  35. /* fall through */
  36. #include "hv-24x7-domains.h"
  37. #undef DOMAIN
  38. return true;
  39. default:
  40. return false;
  41. }
  42. }
  43. static bool is_physical_domain(unsigned domain)
  44. {
  45. switch (domain) {
  46. #define DOMAIN(n, v, x, c) \
  47. case HV_PERF_DOMAIN_##n: \
  48. return c;
  49. #include "hv-24x7-domains.h"
  50. #undef DOMAIN
  51. default:
  52. return false;
  53. }
  54. }
  55. /* Domains for which more than one result element is returned for each event. */
  56. static bool domain_needs_aggregation(unsigned int domain)
  57. {
  58. return aggregate_result_elements &&
  59. (domain == HV_PERF_DOMAIN_PHYS_CORE ||
  60. (domain >= HV_PERF_DOMAIN_VCPU_HOME_CORE &&
  61. domain <= HV_PERF_DOMAIN_VCPU_REMOTE_NODE));
  62. }
  63. static const char *domain_name(unsigned domain)
  64. {
  65. if (!domain_is_valid(domain))
  66. return NULL;
  67. switch (domain) {
  68. case HV_PERF_DOMAIN_PHYS_CHIP: return "Physical Chip";
  69. case HV_PERF_DOMAIN_PHYS_CORE: return "Physical Core";
  70. case HV_PERF_DOMAIN_VCPU_HOME_CORE: return "VCPU Home Core";
  71. case HV_PERF_DOMAIN_VCPU_HOME_CHIP: return "VCPU Home Chip";
  72. case HV_PERF_DOMAIN_VCPU_HOME_NODE: return "VCPU Home Node";
  73. case HV_PERF_DOMAIN_VCPU_REMOTE_NODE: return "VCPU Remote Node";
  74. }
  75. WARN_ON_ONCE(domain);
  76. return NULL;
  77. }
  78. static bool catalog_entry_domain_is_valid(unsigned domain)
  79. {
  80. /* POWER8 doesn't support virtual domains. */
  81. if (interface_version == 1)
  82. return is_physical_domain(domain);
  83. else
  84. return domain_is_valid(domain);
  85. }
  86. /*
  87. * TODO: Merging events:
  88. * - Think of the hcall as an interface to a 4d array of counters:
  89. * - x = domains
  90. * - y = indexes in the domain (core, chip, vcpu, node, etc)
  91. * - z = offset into the counter space
  92. * - w = lpars (guest vms, "logical partitions")
  93. * - A single request is: x,y,y_last,z,z_last,w,w_last
  94. * - this means we can retrieve a rectangle of counters in y,z for a single x.
  95. *
  96. * - Things to consider (ignoring w):
  97. * - input cost_per_request = 16
  98. * - output cost_per_result(ys,zs) = 8 + 8 * ys + ys * zs
  99. * - limited number of requests per hcall (must fit into 4K bytes)
  100. * - 4k - 16 [buffer header] >= 16 [request size] * request_count
  101. * - 255 requests per hcall
  102. * - sometimes it will be more efficient to read extra data and discard
  103. */
  104. /*
  105. * Example usage:
  106. * perf stat -e 'hv_24x7/domain=2,offset=8,vcpu=0,lpar=0xffffffff/'
  107. */
  108. /* u3 0-6, one of HV_24X7_PERF_DOMAIN */
  109. EVENT_DEFINE_RANGE_FORMAT(domain, config, 0, 3);
  110. /* u16 */
  111. EVENT_DEFINE_RANGE_FORMAT(core, config, 16, 31);
  112. EVENT_DEFINE_RANGE_FORMAT(chip, config, 16, 31);
  113. EVENT_DEFINE_RANGE_FORMAT(vcpu, config, 16, 31);
  114. /* u32, see "data_offset" */
  115. EVENT_DEFINE_RANGE_FORMAT(offset, config, 32, 63);
  116. /* u16 */
  117. EVENT_DEFINE_RANGE_FORMAT(lpar, config1, 0, 15);
  118. EVENT_DEFINE_RANGE(reserved1, config, 4, 15);
  119. EVENT_DEFINE_RANGE(reserved2, config1, 16, 63);
  120. EVENT_DEFINE_RANGE(reserved3, config2, 0, 63);
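/*
 * Summarizing the ranges defined above: event->attr.config carries the
 * domain in bits 0-3, the core/chip/vcpu index in bits 16-31 and the
 * counter offset in bits 32-63; attr.config1 carries the lpar in bits
 * 0-15. The remaining bits are reserved and must be zero, which
 * h_24x7_event_init() enforces.
 */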
  121. static struct attribute *format_attrs[] = {
  122. &format_attr_domain.attr,
  123. &format_attr_offset.attr,
  124. &format_attr_core.attr,
  125. &format_attr_chip.attr,
  126. &format_attr_vcpu.attr,
  127. &format_attr_lpar.attr,
  128. NULL,
  129. };
  130. static struct attribute_group format_group = {
  131. .name = "format",
  132. .attrs = format_attrs,
  133. };
  134. static struct attribute_group event_group = {
  135. .name = "events",
  136. /* .attrs is set in init */
  137. };
  138. static struct attribute_group event_desc_group = {
  139. .name = "event_descs",
  140. /* .attrs is set in init */
  141. };
  142. static struct attribute_group event_long_desc_group = {
  143. .name = "event_long_descs",
  144. /* .attrs is set in init */
  145. };
  146. static struct kmem_cache *hv_page_cache;
  147. DEFINE_PER_CPU(int, hv_24x7_txn_flags);
  148. DEFINE_PER_CPU(int, hv_24x7_txn_err);
  149. struct hv_24x7_hw {
  150. struct perf_event *events[255];
  151. };
  152. DEFINE_PER_CPU(struct hv_24x7_hw, hv_24x7_hw);
  153. /*
  154. * request_buffer and result_buffer are not required to be 4k aligned,
  155. * but are not allowed to cross any 4k boundary. Aligning them to 4k is
  156. * the simplest way to ensure that.
  157. */
  158. #define H24x7_DATA_BUFFER_SIZE 4096
  159. DEFINE_PER_CPU(char, hv_24x7_reqb[H24x7_DATA_BUFFER_SIZE]) __aligned(4096);
  160. DEFINE_PER_CPU(char, hv_24x7_resb[H24x7_DATA_BUFFER_SIZE]) __aligned(4096);
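/*
 * Illustrative arithmetic, based on the sizes quoted in the TODO block
 * near the top of this file (16-byte buffer header, 16-byte requests for
 * interface version 1): max_num_requests() below then evaluates to
 * (4096 - 16) / 16 = 255, matching the "255 requests per hcall" note.
 * Larger request sizes in later interface versions reduce this count.
 */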
  161. static unsigned int max_num_requests(int interface_version)
  162. {
  163. return (H24x7_DATA_BUFFER_SIZE - sizeof(struct hv_24x7_request_buffer))
  164. / H24x7_REQUEST_SIZE(interface_version);
  165. }
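/*
 * Layout of ev->remainder as implied by the three accessors below (each
 * length field counts itself plus the bytes that follow it, and
 * event_name_len itself lives in the fixed portion of the entry):
 *
 *   [name: event_name_len - 2 bytes]
 *   [desc_len: __be16][desc: desc_len - 2 bytes]
 *   [long_desc_len: __be16][long desc: long_desc_len - 2 bytes]
 */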
  166. static char *event_name(struct hv_24x7_event_data *ev, int *len)
  167. {
  168. *len = be16_to_cpu(ev->event_name_len) - 2;
  169. return (char *)ev->remainder;
  170. }
  171. static char *event_desc(struct hv_24x7_event_data *ev, int *len)
  172. {
  173. unsigned nl = be16_to_cpu(ev->event_name_len);
  174. __be16 *desc_len = (__be16 *)(ev->remainder + nl - 2);
  175. *len = be16_to_cpu(*desc_len) - 2;
  176. return (char *)ev->remainder + nl;
  177. }
  178. static char *event_long_desc(struct hv_24x7_event_data *ev, int *len)
  179. {
  180. unsigned nl = be16_to_cpu(ev->event_name_len);
  181. __be16 *desc_len_ = (__be16 *)(ev->remainder + nl - 2);
  182. unsigned desc_len = be16_to_cpu(*desc_len_);
  183. __be16 *long_desc_len = (__be16 *)(ev->remainder + nl + desc_len - 2);
  184. *len = be16_to_cpu(*long_desc_len) - 2;
  185. return (char *)ev->remainder + nl + desc_len;
  186. }
  187. static bool event_fixed_portion_is_within(struct hv_24x7_event_data *ev,
  188. void *end)
  189. {
  190. void *start = ev;
  191. return (start + offsetof(struct hv_24x7_event_data, remainder)) < end;
  192. }
  193. /*
  194. * Things we don't check:
  195. * - padding for desc, name, and long/detailed desc is required to be '\0'
  196. * bytes.
  197. *
  198. * Return NULL if we pass end,
  199. * Otherwise return the address of the byte just following the event.
  200. */
  201. static void *event_end(struct hv_24x7_event_data *ev, void *end)
  202. {
  203. void *start = ev;
  204. __be16 *dl_, *ldl_;
  205. unsigned dl, ldl;
  206. unsigned nl = be16_to_cpu(ev->event_name_len);
  207. if (nl < 2) {
  208. pr_debug("%s: name length too short: %d", __func__, nl);
  209. return NULL;
  210. }
  211. if (start + nl > end) {
  212. pr_debug("%s: start=%p + nl=%u > end=%p",
  213. __func__, start, nl, end);
  214. return NULL;
  215. }
  216. dl_ = (__be16 *)(ev->remainder + nl - 2);
  217. if (!IS_ALIGNED((uintptr_t)dl_, 2))
  218. pr_warn("desc len not aligned %p", dl_);
  219. dl = be16_to_cpu(*dl_);
  220. if (dl < 2) {
  221. pr_debug("%s: desc len too short: %d", __func__, dl);
  222. return NULL;
  223. }
  224. if (start + nl + dl > end) {
  225. pr_debug("%s: (start=%p + nl=%u + dl=%u)=%p > end=%p",
  226. __func__, start, nl, dl, start + nl + dl, end);
  227. return NULL;
  228. }
  229. ldl_ = (__be16 *)(ev->remainder + nl + dl - 2);
  230. if (!IS_ALIGNED((uintptr_t)ldl_, 2))
  231. pr_warn("long desc len not aligned %p", ldl_);
  232. ldl = be16_to_cpu(*ldl_);
  233. if (ldl < 2) {
  234. pr_debug("%s: long desc len too short (ldl=%u)",
  235. __func__, ldl);
  236. return NULL;
  237. }
  238. if (start + nl + dl + ldl > end) {
  239. pr_debug("%s: start=%p + nl=%u + dl=%u + ldl=%u > end=%p",
  240. __func__, start, nl, dl, ldl, end);
  241. return NULL;
  242. }
  243. return start + nl + dl + ldl;
  244. }
  245. static long h_get_24x7_catalog_page_(unsigned long phys_4096,
  246. unsigned long version, unsigned long index)
  247. {
  248. pr_devel("h_get_24x7_catalog_page(0x%lx, %lu, %lu)",
  249. phys_4096, version, index);
  250. WARN_ON(!IS_ALIGNED(phys_4096, 4096));
  251. return plpar_hcall_norets(H_GET_24X7_CATALOG_PAGE,
  252. phys_4096, version, index);
  253. }
  254. static long h_get_24x7_catalog_page(char page[], u64 version, u32 index)
  255. {
  256. return h_get_24x7_catalog_page_(virt_to_phys(page),
  257. version, index);
  258. }
  259. /*
  260. * Each event we find in the catalog will have a sysfs entry. Format the
  261. * data for this sysfs entry based on the event's domain.
  262. *
  263. * Events belonging to the Chip domain can only be monitored in that domain.
  264. * i.e. the domain for these events is a fixed/known value.
  265. *
  266. * Events belonging to the Core domain can be monitored either in the physical
  267. * core or in one of the virtual CPU domains. So the domain value for these
  268. * events must be specified by the user (i.e. it is a required parameter). Format
  269. * the Core events with 'domain=?' so the perf-tool can error check required
  270. * parameters.
  271. *
  272. * NOTE: For the Core domain events, rather than making domain a required
  273. * parameter we could default it to PHYS_CORE and allow users to
  274. * override the domain to one of the VCPU domains.
  275. *
  276. * However, this can make the interface a little inconsistent.
  277. *
  278. * If we set domain=2 (PHYS_CORE) and allow the user to override this field,
  279. * the user may be tempted to also modify the "offset=x" field, which
  280. * can lead to confusing usage. Consider the HPM_PCYC (offset=0x18) and
  281. * HPM_INST (offset=0x20) events. With:
  282. *
  283. * perf stat -e hv_24x7/HPM_PCYC,offset=0x20/
  284. *
  285. * we end up monitoring HPM_INST, while the command line has HPM_PCYC.
  286. *
  287. * By not assigning a default value to the domain for the Core events,
  288. * we can have simple guidelines:
  289. *
  290. * - Specifying values for parameters with "=?" is required.
  291. *
  292. * - Specifying (i.e overriding) values for other parameters
  293. * is undefined.
  294. */
  295. static char *event_fmt(struct hv_24x7_event_data *event, unsigned domain)
  296. {
  297. const char *sindex;
  298. const char *lpar;
  299. const char *domain_str;
  300. char buf[8];
  301. switch (domain) {
  302. case HV_PERF_DOMAIN_PHYS_CHIP:
  303. snprintf(buf, sizeof(buf), "%d", domain);
  304. domain_str = buf;
  305. lpar = "0x0";
  306. sindex = "chip";
  307. break;
  308. case HV_PERF_DOMAIN_PHYS_CORE:
  309. domain_str = "?";
  310. lpar = "0x0";
  311. sindex = "core";
  312. break;
  313. default:
  314. domain_str = "?";
  315. lpar = "?";
  316. sindex = "vcpu";
  317. }
  318. return kasprintf(GFP_KERNEL,
  319. "domain=%s,offset=0x%x,%s=?,lpar=%s",
  320. domain_str,
  321. be16_to_cpu(event->event_counter_offs) +
  322. be16_to_cpu(event->event_group_record_offs),
  323. sindex,
  324. lpar);
  325. }
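/*
 * For illustration: a core-domain event whose summed offsets equal 0x20
 * is formatted as "domain=?,offset=0x20,core=?,lpar=0x0", while an event
 * in one of the vcpu domains becomes "domain=?,offset=0x20,vcpu=?,lpar=?".
 */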
  326. /* Avoid trusting fw to NUL terminate strings */
  327. static char *memdup_to_str(char *maybe_str, int max_len, gfp_t gfp)
  328. {
  329. return kasprintf(gfp, "%.*s", max_len, maybe_str);
  330. }
  331. static ssize_t device_show_string(struct device *dev,
  332. struct device_attribute *attr, char *buf)
  333. {
  334. struct dev_ext_attribute *d;
  335. d = container_of(attr, struct dev_ext_attribute, attr);
  336. return sprintf(buf, "%s\n", (char *)d->var);
  337. }
  338. static struct attribute *device_str_attr_create_(char *name, char *str)
  339. {
  340. struct dev_ext_attribute *attr = kzalloc(sizeof(*attr), GFP_KERNEL);
  341. if (!attr)
  342. return NULL;
  343. sysfs_attr_init(&attr->attr.attr);
  344. attr->var = str;
  345. attr->attr.attr.name = name;
  346. attr->attr.attr.mode = 0444;
  347. attr->attr.show = device_show_string;
  348. return &attr->attr.attr;
  349. }
  350. /*
  351. * Allocate and initialize strings representing event attributes.
  352. *
  353. * NOTE: The strings allocated here are never destroyed and continue to
  354. * exist till shutdown. This is to allow us to create as many events
  355. * from the catalog as possible, even if we encounter errors with some.
  356. * In case of changes to error paths in future, these may need to be
  357. * freed by the caller.
  358. */
  359. static struct attribute *device_str_attr_create(char *name, int name_max,
  360. int name_nonce,
  361. char *str, size_t str_max)
  362. {
  363. char *n;
  364. char *s = memdup_to_str(str, str_max, GFP_KERNEL);
  365. struct attribute *a;
  366. if (!s)
  367. return NULL;
  368. if (!name_nonce)
  369. n = kasprintf(GFP_KERNEL, "%.*s", name_max, name);
  370. else
  371. n = kasprintf(GFP_KERNEL, "%.*s__%d", name_max, name,
  372. name_nonce);
  373. if (!n)
  374. goto out_s;
  375. a = device_str_attr_create_(n, s);
  376. if (!a)
  377. goto out_n;
  378. return a;
  379. out_n:
  380. kfree(n);
  381. out_s:
  382. kfree(s);
  383. return NULL;
  384. }
  385. static struct attribute *event_to_attr(unsigned ix,
  386. struct hv_24x7_event_data *event,
  387. unsigned domain,
  388. int nonce)
  389. {
  390. int event_name_len;
  391. char *ev_name, *a_ev_name, *val;
  392. struct attribute *attr;
  393. if (!domain_is_valid(domain)) {
  394. pr_warn("catalog event %u has invalid domain %u\n",
  395. ix, domain);
  396. return NULL;
  397. }
  398. val = event_fmt(event, domain);
  399. if (!val)
  400. return NULL;
  401. ev_name = event_name(event, &event_name_len);
  402. if (!nonce)
  403. a_ev_name = kasprintf(GFP_KERNEL, "%.*s",
  404. (int)event_name_len, ev_name);
  405. else
  406. a_ev_name = kasprintf(GFP_KERNEL, "%.*s__%d",
  407. (int)event_name_len, ev_name, nonce);
  408. if (!a_ev_name)
  409. goto out_val;
  410. attr = device_str_attr_create_(a_ev_name, val);
  411. if (!attr)
  412. goto out_name;
  413. return attr;
  414. out_name:
  415. kfree(a_ev_name);
  416. out_val:
  417. kfree(val);
  418. return NULL;
  419. }
  420. static struct attribute *event_to_desc_attr(struct hv_24x7_event_data *event,
  421. int nonce)
  422. {
  423. int nl, dl;
  424. char *name = event_name(event, &nl);
  425. char *desc = event_desc(event, &dl);
  426. /* If there isn't a description, don't create the sysfs file */
  427. if (!dl)
  428. return NULL;
  429. return device_str_attr_create(name, nl, nonce, desc, dl);
  430. }
  431. static struct attribute *
  432. event_to_long_desc_attr(struct hv_24x7_event_data *event, int nonce)
  433. {
  434. int nl, dl;
  435. char *name = event_name(event, &nl);
  436. char *desc = event_long_desc(event, &dl);
  437. /* If there isn't a description, don't create the sysfs file */
  438. if (!dl)
  439. return NULL;
  440. return device_str_attr_create(name, nl, nonce, desc, dl);
  441. }
  442. static int event_data_to_attrs(unsigned ix, struct attribute **attrs,
  443. struct hv_24x7_event_data *event, int nonce)
  444. {
  445. *attrs = event_to_attr(ix, event, event->domain, nonce);
  446. if (!*attrs)
  447. return -1;
  448. return 0;
  449. }
  450. /* */
  451. struct event_uniq {
  452. struct rb_node node;
  453. const char *name;
  454. int nl;
  455. unsigned ct;
  456. unsigned domain;
  457. };
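/*
 * memord() orders by length first (a shorter buffer compares as greater)
 * and only falls back to memcmp() for equal lengths; ev_uniq_ord() then
 * breaks any remaining tie using the domain.
 */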
  458. static int memord(const void *d1, size_t s1, const void *d2, size_t s2)
  459. {
  460. if (s1 < s2)
  461. return 1;
  462. if (s1 > s2)
  463. return -1;
  464. return memcmp(d1, d2, s1);
  465. }
  466. static int ev_uniq_ord(const void *v1, size_t s1, unsigned d1, const void *v2,
  467. size_t s2, unsigned d2)
  468. {
  469. int r = memord(v1, s1, v2, s2);
  470. if (r)
  471. return r;
  472. if (d1 > d2)
  473. return 1;
  474. if (d2 > d1)
  475. return -1;
  476. return 0;
  477. }
  478. static int event_uniq_add(struct rb_root *root, const char *name, int nl,
  479. unsigned domain)
  480. {
  481. struct rb_node **new = &(root->rb_node), *parent = NULL;
  482. struct event_uniq *data;
  483. /* Figure out where to put new node */
  484. while (*new) {
  485. struct event_uniq *it;
  486. int result;
  487. it = container_of(*new, struct event_uniq, node);
  488. result = ev_uniq_ord(name, nl, domain, it->name, it->nl,
  489. it->domain);
  490. parent = *new;
  491. if (result < 0)
  492. new = &((*new)->rb_left);
  493. else if (result > 0)
  494. new = &((*new)->rb_right);
  495. else {
  496. it->ct++;
  497. pr_info("found a duplicate event %.*s, ct=%u\n", nl,
  498. name, it->ct);
  499. return it->ct;
  500. }
  501. }
  502. data = kmalloc(sizeof(*data), GFP_KERNEL);
  503. if (!data)
  504. return -ENOMEM;
  505. *data = (struct event_uniq) {
  506. .name = name,
  507. .nl = nl,
  508. .ct = 0,
  509. .domain = domain,
  510. };
  511. /* Add new node and rebalance tree. */
  512. rb_link_node(&data->node, parent, new);
  513. rb_insert_color(&data->node, root);
  514. /* data->ct */
  515. return 0;
  516. }
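/*
 * Example of how the returned count acts as a "nonce": if the catalog
 * contains two events with the same name and domain, the first keeps the
 * plain name in sysfs and the second is created as "<name>__1" (see
 * device_str_attr_create() and event_to_attr()).
 */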
  517. static void event_uniq_destroy(struct rb_root *root)
  518. {
  519. /*
  520. * the strings we point to are in the giant block of memory filled by
  521. * the catalog, and are freed separately.
  522. */
  523. struct event_uniq *pos, *n;
  524. rbtree_postorder_for_each_entry_safe(pos, n, root, node)
  525. kfree(pos);
  526. }
  527. /*
  528. * ensure the event structure's sizes are self consistent and don't cause us to
  529. * read outside of the event
  530. *
  531. * On success, return the event length in bytes.
  532. * Otherwise, return -1 (and print as appropriate).
  533. */
  534. static ssize_t catalog_event_len_validate(struct hv_24x7_event_data *event,
  535. size_t event_idx,
  536. size_t event_data_bytes,
  537. size_t event_entry_count,
  538. size_t offset, void *end)
  539. {
  540. ssize_t ev_len;
  541. void *ev_end, *calc_ev_end;
  542. if (offset >= event_data_bytes)
  543. return -1;
  544. if (event_idx >= event_entry_count) {
  545. pr_devel("catalog event data has %zu bytes of padding after last event\n",
  546. event_data_bytes - offset);
  547. return -1;
  548. }
  549. if (!event_fixed_portion_is_within(event, end)) {
  550. pr_warn("event %zu fixed portion is not within range\n",
  551. event_idx);
  552. return -1;
  553. }
  554. ev_len = be16_to_cpu(event->length);
  555. if (ev_len % 16)
  556. pr_info("event %zu has length %zu not divisible by 16: event=%pK\n",
  557. event_idx, ev_len, event);
  558. ev_end = (__u8 *)event + ev_len;
  559. if (ev_end > end) {
  560. pr_warn("event %zu has .length=%zu, ends after buffer end: ev_end=%pK > end=%pK, offset=%zu\n",
  561. event_idx, ev_len, ev_end, end,
  562. offset);
  563. return -1;
  564. }
  565. calc_ev_end = event_end(event, end);
  566. if (!calc_ev_end) {
  567. pr_warn("event %zu has a calculated length which exceeds buffer length %zu: event=%pK end=%pK, offset=%zu\n",
  568. event_idx, event_data_bytes, event, end,
  569. offset);
  570. return -1;
  571. }
  572. if (calc_ev_end > ev_end) {
  573. pr_warn("event %zu exceeds it's own length: event=%pK, end=%pK, offset=%zu, calc_ev_end=%pK\n",
  574. event_idx, event, ev_end, offset, calc_ev_end);
  575. return -1;
  576. }
  577. return ev_len;
  578. }
  579. #define MAX_4K (SIZE_MAX / 4096)
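/*
 * create_events_from_catalog() makes two passes over the catalog's event
 * data: the first validates each entry and counts how many attributes are
 * needed, the second allocates and fills the "events", "event_descs" and
 * "event_long_descs" attribute arrays handed back to the caller.
 */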
  580. static int create_events_from_catalog(struct attribute ***events_,
  581. struct attribute ***event_descs_,
  582. struct attribute ***event_long_descs_)
  583. {
  584. long hret;
  585. size_t catalog_len, catalog_page_len, event_entry_count,
  586. event_data_len, event_data_offs,
  587. event_data_bytes, junk_events, event_idx, event_attr_ct, i,
  588. attr_max, event_idx_last, desc_ct, long_desc_ct;
  589. ssize_t ct, ev_len;
  590. uint64_t catalog_version_num;
  591. struct attribute **events, **event_descs, **event_long_descs;
  592. struct hv_24x7_catalog_page_0 *page_0 =
  593. kmem_cache_alloc(hv_page_cache, GFP_KERNEL);
  594. void *page = page_0;
  595. void *event_data, *end;
  596. struct hv_24x7_event_data *event;
  597. struct rb_root ev_uniq = RB_ROOT;
  598. int ret = 0;
  599. if (!page) {
  600. ret = -ENOMEM;
  601. goto e_out;
  602. }
  603. hret = h_get_24x7_catalog_page(page, 0, 0);
  604. if (hret) {
  605. ret = -EIO;
  606. goto e_free;
  607. }
  608. catalog_version_num = be64_to_cpu(page_0->version);
  609. catalog_page_len = be32_to_cpu(page_0->length);
  610. if (MAX_4K < catalog_page_len) {
  611. pr_err("invalid page count: %zu\n", catalog_page_len);
  612. ret = -EIO;
  613. goto e_free;
  614. }
  615. catalog_len = catalog_page_len * 4096;
  616. event_entry_count = be16_to_cpu(page_0->event_entry_count);
  617. event_data_offs = be16_to_cpu(page_0->event_data_offs);
  618. event_data_len = be16_to_cpu(page_0->event_data_len);
  619. pr_devel("cv %llu cl %zu eec %zu edo %zu edl %zu\n",
  620. catalog_version_num, catalog_len,
  621. event_entry_count, event_data_offs, event_data_len);
  622. if ((MAX_4K < event_data_len)
  623. || (MAX_4K < event_data_offs)
  624. || (MAX_4K - event_data_offs < event_data_len)) {
  625. pr_err("invalid event data offs %zu and/or len %zu\n",
  626. event_data_offs, event_data_len);
  627. ret = -EIO;
  628. goto e_free;
  629. }
  630. if ((event_data_offs + event_data_len) > catalog_page_len) {
  631. pr_err("event data %zu-%zu does not fit inside catalog 0-%zu\n",
  632. event_data_offs,
  633. event_data_offs + event_data_len,
  634. catalog_page_len);
  635. ret = -EIO;
  636. goto e_free;
  637. }
  638. if (SIZE_MAX - 1 < event_entry_count) {
  639. pr_err("event_entry_count %zu is invalid\n", event_entry_count);
  640. ret = -EIO;
  641. goto e_free;
  642. }
  643. event_data_bytes = event_data_len * 4096;
  644. /*
  645. * event data can span several pages, events can cross between these
  646. * pages. Use vmalloc to make this easier.
  647. */
  648. event_data = vmalloc(event_data_bytes);
  649. if (!event_data) {
  650. pr_err("could not allocate event data\n");
  651. ret = -ENOMEM;
  652. goto e_free;
  653. }
  654. end = event_data + event_data_bytes;
  655. /*
  656. * using vmalloc_to_phys() like this only works if PAGE_SIZE is
  657. * divisible by 4096
  658. */
  659. BUILD_BUG_ON(PAGE_SIZE % 4096);
  660. for (i = 0; i < event_data_len; i++) {
  661. hret = h_get_24x7_catalog_page_(
  662. vmalloc_to_phys(event_data + i * 4096),
  663. catalog_version_num,
  664. i + event_data_offs);
  665. if (hret) {
  666. pr_err("Failed to get event data in page %zu: rc=%ld\n",
  667. i + event_data_offs, hret);
  668. ret = -EIO;
  669. goto e_event_data;
  670. }
  671. }
  672. /*
  673. * scan the catalog to determine the number of attributes we need, and
  674. * verify it at the same time.
  675. */
  676. for (junk_events = 0, event = event_data, event_idx = 0, attr_max = 0;
  677. ;
  678. event_idx++, event = (void *)event + ev_len) {
  679. size_t offset = (void *)event - (void *)event_data;
  680. char *name;
  681. int nl;
  682. ev_len = catalog_event_len_validate(event, event_idx,
  683. event_data_bytes,
  684. event_entry_count,
  685. offset, end);
  686. if (ev_len < 0)
  687. break;
  688. name = event_name(event, &nl);
  689. if (event->event_group_record_len == 0) {
  690. pr_devel("invalid event %zu (%.*s): group_record_len == 0, skipping\n",
  691. event_idx, nl, name);
  692. junk_events++;
  693. continue;
  694. }
  695. if (!catalog_entry_domain_is_valid(event->domain)) {
  696. pr_info("event %zu (%.*s) has invalid domain %d\n",
  697. event_idx, nl, name, event->domain);
  698. junk_events++;
  699. continue;
  700. }
  701. attr_max++;
  702. }
  703. event_idx_last = event_idx;
  704. if (event_idx_last != event_entry_count)
  705. pr_warn("event buffer ended before listed # of events were parsed (got %zu, wanted %zu, junk %zu)\n",
  706. event_idx_last, event_entry_count, junk_events);
  707. events = kmalloc_array(attr_max + 1, sizeof(*events), GFP_KERNEL);
  708. if (!events) {
  709. ret = -ENOMEM;
  710. goto e_event_data;
  711. }
  712. event_descs = kmalloc_array(event_idx + 1, sizeof(*event_descs),
  713. GFP_KERNEL);
  714. if (!event_descs) {
  715. ret = -ENOMEM;
  716. goto e_event_attrs;
  717. }
  718. event_long_descs = kmalloc_array(event_idx + 1,
  719. sizeof(*event_long_descs), GFP_KERNEL);
  720. if (!event_long_descs) {
  721. ret = -ENOMEM;
  722. goto e_event_descs;
  723. }
  724. /* Iterate over the catalog filling in the attribute vector */
  725. for (junk_events = 0, event_attr_ct = 0, desc_ct = 0, long_desc_ct = 0,
  726. event = event_data, event_idx = 0;
  727. event_idx < event_idx_last;
  728. event_idx++, ev_len = be16_to_cpu(event->length),
  729. event = (void *)event + ev_len) {
  730. char *name;
  731. int nl;
  732. int nonce;
  733. /*
  734. * these are the only "bad" events that are intermixed and that
  735. * we can ignore without issue. make sure to skip them here
  736. */
  737. if (event->event_group_record_len == 0)
  738. continue;
  739. if (!catalog_entry_domain_is_valid(event->domain))
  740. continue;
  741. name = event_name(event, &nl);
  742. nonce = event_uniq_add(&ev_uniq, name, nl, event->domain);
  743. ct = event_data_to_attrs(event_idx, events + event_attr_ct,
  744. event, nonce);
  745. if (ct < 0) {
  746. pr_warn("event %zu (%.*s) creation failure, skipping\n",
  747. event_idx, nl, name);
  748. junk_events++;
  749. } else {
  750. event_attr_ct++;
  751. event_descs[desc_ct] = event_to_desc_attr(event, nonce);
  752. if (event_descs[desc_ct])
  753. desc_ct++;
  754. event_long_descs[long_desc_ct] =
  755. event_to_long_desc_attr(event, nonce);
  756. if (event_long_descs[long_desc_ct])
  757. long_desc_ct++;
  758. }
  759. }
  760. pr_info("read %zu catalog entries, created %zu event attrs (%zu failures), %zu descs\n",
  761. event_idx, event_attr_ct, junk_events, desc_ct);
  762. events[event_attr_ct] = NULL;
  763. event_descs[desc_ct] = NULL;
  764. event_long_descs[long_desc_ct] = NULL;
  765. event_uniq_destroy(&ev_uniq);
  766. vfree(event_data);
  767. kmem_cache_free(hv_page_cache, page);
  768. *events_ = events;
  769. *event_descs_ = event_descs;
  770. *event_long_descs_ = event_long_descs;
  771. return 0;
  772. e_event_descs:
  773. kfree(event_descs);
  774. e_event_attrs:
  775. kfree(events);
  776. e_event_data:
  777. vfree(event_data);
  778. e_free:
  779. kmem_cache_free(hv_page_cache, page);
  780. e_out:
  781. *events_ = NULL;
  782. *event_descs_ = NULL;
  783. *event_long_descs_ = NULL;
  784. return ret;
  785. }
  786. static ssize_t catalog_read(struct file *filp, struct kobject *kobj,
  787. struct bin_attribute *bin_attr, char *buf,
  788. loff_t offset, size_t count)
  789. {
  790. long hret;
  791. ssize_t ret = 0;
  792. size_t catalog_len = 0, catalog_page_len = 0;
  793. loff_t page_offset = 0;
  794. loff_t offset_in_page;
  795. size_t copy_len;
  796. uint64_t catalog_version_num = 0;
  797. void *page = kmem_cache_alloc(hv_page_cache, GFP_USER);
  798. struct hv_24x7_catalog_page_0 *page_0 = page;
  799. if (!page)
  800. return -ENOMEM;
  801. hret = h_get_24x7_catalog_page(page, 0, 0);
  802. if (hret) {
  803. ret = -EIO;
  804. goto e_free;
  805. }
  806. catalog_version_num = be64_to_cpu(page_0->version);
  807. catalog_page_len = be32_to_cpu(page_0->length);
  808. catalog_len = catalog_page_len * 4096;
  809. page_offset = offset / 4096;
  810. offset_in_page = offset % 4096;
  811. if (page_offset >= catalog_page_len)
  812. goto e_free;
  813. if (page_offset != 0) {
  814. hret = h_get_24x7_catalog_page(page, catalog_version_num,
  815. page_offset);
  816. if (hret) {
  817. ret = -EIO;
  818. goto e_free;
  819. }
  820. }
  821. copy_len = 4096 - offset_in_page;
  822. if (copy_len > count)
  823. copy_len = count;
  824. memcpy(buf, page+offset_in_page, copy_len);
  825. ret = copy_len;
  826. e_free:
  827. if (hret)
  828. pr_err("h_get_24x7_catalog_page(ver=%lld, page=%lld) failed:"
  829. " rc=%ld\n",
  830. catalog_version_num, page_offset, hret);
  831. kmem_cache_free(hv_page_cache, page);
  832. pr_devel("catalog_read: offset=%lld(%lld) count=%zu "
  833. "catalog_len=%zu(%zu) => %zd\n", offset, page_offset,
  834. count, catalog_len, catalog_page_len, ret);
  835. return ret;
  836. }
  837. static ssize_t domains_show(struct device *dev, struct device_attribute *attr,
  838. char *page)
  839. {
  840. int d, n, count = 0;
  841. const char *str;
  842. for (d = 0; d < HV_PERF_DOMAIN_MAX; d++) {
  843. str = domain_name(d);
  844. if (!str)
  845. continue;
  846. n = sprintf(page, "%d: %s\n", d, str);
  847. if (n < 0)
  848. break;
  849. count += n;
  850. page += n;
  851. }
  852. return count;
  853. }
  854. #define PAGE_0_ATTR(_name, _fmt, _expr) \
  855. static ssize_t _name##_show(struct device *dev, \
  856. struct device_attribute *dev_attr, \
  857. char *buf) \
  858. { \
  859. long hret; \
  860. ssize_t ret = 0; \
  861. void *page = kmem_cache_alloc(hv_page_cache, GFP_USER); \
  862. struct hv_24x7_catalog_page_0 *page_0 = page; \
  863. if (!page) \
  864. return -ENOMEM; \
  865. hret = h_get_24x7_catalog_page(page, 0, 0); \
  866. if (hret) { \
  867. ret = -EIO; \
  868. goto e_free; \
  869. } \
  870. ret = sprintf(buf, _fmt, _expr); \
  871. e_free: \
  872. kmem_cache_free(hv_page_cache, page); \
  873. return ret; \
  874. } \
  875. static DEVICE_ATTR_RO(_name)
  876. PAGE_0_ATTR(catalog_version, "%lld\n",
  877. (unsigned long long)be64_to_cpu(page_0->version));
  878. PAGE_0_ATTR(catalog_len, "%lld\n",
  879. (unsigned long long)be32_to_cpu(page_0->length) * 4096);
  880. static BIN_ATTR_RO(catalog, 0/* real length varies */);
  881. static DEVICE_ATTR_RO(domains);
  882. static struct bin_attribute *if_bin_attrs[] = {
  883. &bin_attr_catalog,
  884. NULL,
  885. };
  886. static struct attribute *if_attrs[] = {
  887. &dev_attr_catalog_len.attr,
  888. &dev_attr_catalog_version.attr,
  889. &dev_attr_domains.attr,
  890. NULL,
  891. };
  892. static struct attribute_group if_group = {
  893. .name = "interface",
  894. .bin_attrs = if_bin_attrs,
  895. .attrs = if_attrs,
  896. };
  897. static const struct attribute_group *attr_groups[] = {
  898. &format_group,
  899. &event_group,
  900. &event_desc_group,
  901. &event_long_desc_group,
  902. &if_group,
  903. NULL,
  904. };
  905. /*
  906. * Start the process for a new H_GET_24x7_DATA hcall.
  907. */
  908. static void init_24x7_request(struct hv_24x7_request_buffer *request_buffer,
  909. struct hv_24x7_data_result_buffer *result_buffer)
  910. {
  911. memset(request_buffer, 0, H24x7_DATA_BUFFER_SIZE);
  912. memset(result_buffer, 0, H24x7_DATA_BUFFER_SIZE);
  913. request_buffer->interface_version = interface_version;
  914. /* memset above set request_buffer->num_requests to 0 */
  915. }
  916. /*
  917. * Commit (i.e perform) the H_GET_24x7_DATA hcall using the data collected
  918. * by 'init_24x7_request()' and 'add_event_to_24x7_request()'.
  919. */
  920. static int make_24x7_request(struct hv_24x7_request_buffer *request_buffer,
  921. struct hv_24x7_data_result_buffer *result_buffer)
  922. {
  923. long ret;
  924. /*
  925. * NOTE: Due to variable number of array elements in request and
  926. * result buffer(s), sizeof() is not reliable. Use the actual
  927. * allocated buffer size, H24x7_DATA_BUFFER_SIZE.
  928. */
  929. ret = plpar_hcall_norets(H_GET_24X7_DATA,
  930. virt_to_phys(request_buffer), H24x7_DATA_BUFFER_SIZE,
  931. virt_to_phys(result_buffer), H24x7_DATA_BUFFER_SIZE);
  932. if (ret) {
  933. struct hv_24x7_request *req;
  934. req = request_buffer->requests;
  935. pr_notice_ratelimited("hcall failed: [%d %#x %#x %d] => ret 0x%lx (%ld) detail=0x%x failing ix=%x\n",
  936. req->performance_domain, req->data_offset,
  937. req->starting_ix, req->starting_lpar_ix,
  938. ret, ret, result_buffer->detailed_rc,
  939. result_buffer->failing_request_ix);
  940. return -EIO;
  941. }
  942. return 0;
  943. }
  944. /*
  945. * Add the given @event to the next slot in the 24x7 request_buffer.
  946. *
  947. * Note that H_GET_24X7_DATA hcall allows reading several counters'
  948. * values in a single HCALL. We expect the caller to add events to the
  949. * request buffer one by one, make the HCALL and process the results.
  950. */
  951. static int add_event_to_24x7_request(struct perf_event *event,
  952. struct hv_24x7_request_buffer *request_buffer)
  953. {
  954. u16 idx;
  955. int i;
  956. size_t req_size;
  957. struct hv_24x7_request *req;
  958. if (request_buffer->num_requests >=
  959. max_num_requests(request_buffer->interface_version)) {
  960. pr_devel("Too many requests for 24x7 HCALL %d\n",
  961. request_buffer->num_requests);
  962. return -EINVAL;
  963. }
  964. switch (event_get_domain(event)) {
  965. case HV_PERF_DOMAIN_PHYS_CHIP:
  966. idx = event_get_chip(event);
  967. break;
  968. case HV_PERF_DOMAIN_PHYS_CORE:
  969. idx = event_get_core(event);
  970. break;
  971. default:
  972. idx = event_get_vcpu(event);
  973. }
  974. req_size = H24x7_REQUEST_SIZE(request_buffer->interface_version);
  975. i = request_buffer->num_requests++;
  976. req = (void *) request_buffer->requests + i * req_size;
  977. req->performance_domain = event_get_domain(event);
  978. req->data_size = cpu_to_be16(8);
  979. req->data_offset = cpu_to_be32(event_get_offset(event));
  980. req->starting_lpar_ix = cpu_to_be16(event_get_lpar(event));
  981. req->max_num_lpars = cpu_to_be16(1);
  982. req->starting_ix = cpu_to_be16(idx);
  983. req->max_ix = cpu_to_be16(1);
  984. if (request_buffer->interface_version > 1) {
  985. if (domain_needs_aggregation(req->performance_domain))
  986. req->max_num_thread_groups = -1;
  987. else if (req->performance_domain != HV_PERF_DOMAIN_PHYS_CHIP) {
  988. req->starting_thread_group_ix = idx % 2;
  989. req->max_num_thread_groups = 1;
  990. }
  991. }
  992. return 0;
  993. }
  994. /**
  995. * get_count_from_result - get event count from all result elements in result
  996. *
  997. * If the event corresponding to this result needs aggregation of the result
  998. * element values, then this function does that.
  999. *
  1000. * @event: Event associated with @res.
  1001. * @resb: Result buffer containing @res.
  1002. * @res: Result to work on.
  1003. * @countp: Output variable containing the event count.
  1004. * @next: Optional output variable pointing to the next result in @resb.
  1005. */
  1006. static int get_count_from_result(struct perf_event *event,
  1007. struct hv_24x7_data_result_buffer *resb,
  1008. struct hv_24x7_result *res, u64 *countp,
  1009. struct hv_24x7_result **next)
  1010. {
  1011. u16 num_elements = be16_to_cpu(res->num_elements_returned);
  1012. u16 data_size = be16_to_cpu(res->result_element_data_size);
  1013. unsigned int data_offset;
  1014. void *element_data;
  1015. int i;
  1016. u64 count;
  1017. /*
  1018. * We can bail out early if the result is empty.
  1019. */
  1020. if (!num_elements) {
  1021. pr_debug("Result of request %hhu is empty, nothing to do\n",
  1022. res->result_ix);
  1023. if (next)
  1024. *next = (struct hv_24x7_result *) res->elements;
  1025. return -ENODATA;
  1026. }
  1027. /*
  1028. * Since we always specify 1 as the maximum for the smallest resource
  1029. * we're requesting, there should be only one element per result.
  1030. * Except when an event needs aggregation, in which case there are more.
  1031. */
  1032. if (num_elements != 1 &&
  1033. !domain_needs_aggregation(event_get_domain(event))) {
  1034. pr_err("Error: result of request %hhu has %hu elements\n",
  1035. res->result_ix, num_elements);
  1036. return -EIO;
  1037. }
  1038. if (data_size != sizeof(u64)) {
  1039. pr_debug("Error: result of request %hhu has data of %hu bytes\n",
  1040. res->result_ix, data_size);
  1041. return -ENOTSUPP;
  1042. }
  1043. if (resb->interface_version == 1)
  1044. data_offset = offsetof(struct hv_24x7_result_element_v1,
  1045. element_data);
  1046. else
  1047. data_offset = offsetof(struct hv_24x7_result_element_v2,
  1048. element_data);
  1049. /* Go through the result elements in the result. */
  1050. for (i = count = 0, element_data = res->elements + data_offset;
  1051. i < num_elements;
  1052. i++, element_data += data_size + data_offset)
  1053. count += be64_to_cpu(*((u64 *) element_data));
  1054. *countp = count;
  1055. /* The next result is after the last result element. */
  1056. if (next)
  1057. *next = element_data - data_offset;
  1058. return 0;
  1059. }
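/*
 * Read one counter value synchronously using a single-request hcall.
 * h_24x7_event_init() uses this to check that the event can actually be
 * read, and h_24x7_get_value() uses it for reads done outside of a
 * PERF_PMU_TXN_READ transaction.
 */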
  1060. static int single_24x7_request(struct perf_event *event, u64 *count)
  1061. {
  1062. int ret;
  1063. struct hv_24x7_request_buffer *request_buffer;
  1064. struct hv_24x7_data_result_buffer *result_buffer;
  1065. BUILD_BUG_ON(sizeof(*request_buffer) > 4096);
  1066. BUILD_BUG_ON(sizeof(*result_buffer) > 4096);
  1067. request_buffer = (void *)get_cpu_var(hv_24x7_reqb);
  1068. result_buffer = (void *)get_cpu_var(hv_24x7_resb);
  1069. init_24x7_request(request_buffer, result_buffer);
  1070. ret = add_event_to_24x7_request(event, request_buffer);
  1071. if (ret)
  1072. goto out;
  1073. ret = make_24x7_request(request_buffer, result_buffer);
  1074. if (ret)
  1075. goto out;
  1076. /* process result from hcall */
  1077. ret = get_count_from_result(event, result_buffer,
  1078. result_buffer->results, count, NULL);
  1079. out:
  1080. put_cpu_var(hv_24x7_reqb);
  1081. put_cpu_var(hv_24x7_resb);
  1082. return ret;
  1083. }
  1084. static int h_24x7_event_init(struct perf_event *event)
  1085. {
  1086. struct hv_perf_caps caps;
  1087. unsigned domain;
  1088. unsigned long hret;
  1089. u64 ct;
  1090. /* Not our event */
  1091. if (event->attr.type != event->pmu->type)
  1092. return -ENOENT;
  1093. /* Unused areas must be 0 */
  1094. if (event_get_reserved1(event) ||
  1095. event_get_reserved2(event) ||
  1096. event_get_reserved3(event)) {
  1097. pr_devel("reserved set when forbidden 0x%llx(0x%llx) 0x%llx(0x%llx) 0x%llx(0x%llx)\n",
  1098. event->attr.config,
  1099. event_get_reserved1(event),
  1100. event->attr.config1,
  1101. event_get_reserved2(event),
  1102. event->attr.config2,
  1103. event_get_reserved3(event));
  1104. return -EINVAL;
  1105. }
  1106. /* unsupported modes and filters */
  1107. if (event->attr.exclude_user ||
  1108. event->attr.exclude_kernel ||
  1109. event->attr.exclude_hv ||
  1110. event->attr.exclude_idle ||
  1111. event->attr.exclude_host ||
  1112. event->attr.exclude_guest)
  1113. return -EINVAL;
  1114. /* no branch sampling */
  1115. if (has_branch_stack(event))
  1116. return -EOPNOTSUPP;
  1117. /* offset must be 8 byte aligned */
  1118. if (event_get_offset(event) % 8) {
  1119. pr_devel("bad alignment\n");
  1120. return -EINVAL;
  1121. }
  1122. domain = event_get_domain(event);
  1123. if (domain >= HV_PERF_DOMAIN_MAX) {
  1124. pr_devel("invalid domain %d\n", domain);
  1125. return -EINVAL;
  1126. }
  1127. hret = hv_perf_caps_get(&caps);
  1128. if (hret) {
  1129. pr_devel("could not get capabilities: rc=%ld\n", hret);
  1130. return -EIO;
  1131. }
  1132. /* Physical domains & other lpars require extra capabilities */
  1133. if (!caps.collect_privileged && (is_physical_domain(domain) ||
  1134. (event_get_lpar(event) != event_get_lpar_max()))) {
  1135. pr_devel("hv permissions disallow: is_physical_domain:%d, lpar=0x%llx\n",
  1136. is_physical_domain(domain),
  1137. event_get_lpar(event));
  1138. return -EACCES;
  1139. }
  1140. /* Get the initial value of the counter for this event */
  1141. if (single_24x7_request(event, &ct)) {
  1142. pr_devel("test hcall failed\n");
  1143. return -EIO;
  1144. }
  1145. (void)local64_xchg(&event->hw.prev_count, ct);
  1146. return 0;
  1147. }
  1148. static u64 h_24x7_get_value(struct perf_event *event)
  1149. {
  1150. u64 ct;
  1151. if (single_24x7_request(event, &ct))
  1152. /* We checked this in event init, shouldn't fail here... */
  1153. return 0;
  1154. return ct;
  1155. }
  1156. static void update_event_count(struct perf_event *event, u64 now)
  1157. {
  1158. s64 prev;
  1159. prev = local64_xchg(&event->hw.prev_count, now);
  1160. local64_add(now - prev, &event->count);
  1161. }
  1162. static void h_24x7_event_read(struct perf_event *event)
  1163. {
  1164. u64 now;
  1165. struct hv_24x7_request_buffer *request_buffer;
  1166. struct hv_24x7_hw *h24x7hw;
  1167. int txn_flags;
  1168. txn_flags = __this_cpu_read(hv_24x7_txn_flags);
  1169. /*
  1170. * If in a READ transaction, add this counter to the list of
  1171. * counters to read during the next HCALL (i.e commit_txn()).
  1172. * If not in a READ transaction, go ahead and make the HCALL
  1173. * to read this counter by itself.
  1174. */
  1175. if (txn_flags & PERF_PMU_TXN_READ) {
  1176. int i;
  1177. int ret;
  1178. if (__this_cpu_read(hv_24x7_txn_err))
  1179. return;
  1180. request_buffer = (void *)get_cpu_var(hv_24x7_reqb);
  1181. ret = add_event_to_24x7_request(event, request_buffer);
  1182. if (ret) {
  1183. __this_cpu_write(hv_24x7_txn_err, ret);
  1184. } else {
  1185. /*
  1186. * Associate the event with the HCALL request index,
  1187. * so ->commit_txn() can quickly find/update count.
  1188. */
  1189. i = request_buffer->num_requests - 1;
  1190. h24x7hw = &get_cpu_var(hv_24x7_hw);
  1191. h24x7hw->events[i] = event;
  1192. put_cpu_var(h24x7hw);
  1193. }
  1194. put_cpu_var(hv_24x7_reqb);
  1195. } else {
  1196. now = h_24x7_get_value(event);
  1197. update_event_count(event, now);
  1198. }
  1199. }
  1200. static void h_24x7_event_start(struct perf_event *event, int flags)
  1201. {
  1202. if (flags & PERF_EF_RELOAD)
  1203. local64_set(&event->hw.prev_count, h_24x7_get_value(event));
  1204. }
  1205. static void h_24x7_event_stop(struct perf_event *event, int flags)
  1206. {
  1207. h_24x7_event_read(event);
  1208. }
  1209. static int h_24x7_event_add(struct perf_event *event, int flags)
  1210. {
  1211. if (flags & PERF_EF_START)
  1212. h_24x7_event_start(event, flags);
  1213. return 0;
  1214. }
  1215. /*
  1216. * 24x7 counters only support READ transactions. They are
  1217. * always counting and don't need/support ADD transactions.
  1218. * Cache the flags, but otherwise ignore transactions that
  1219. * are not PERF_PMU_TXN_READ.
  1220. */
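/*
 * For a PERF_PMU_TXN_READ transaction the flow is: start_txn() resets the
 * per-cpu request/result buffers, each h_24x7_event_read() call queues one
 * request, and commit_txn() issues a single H_GET_24X7_DATA hcall and then
 * distributes the returned counts back to the queued events.
 */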
  1221. static void h_24x7_event_start_txn(struct pmu *pmu, unsigned int flags)
  1222. {
  1223. struct hv_24x7_request_buffer *request_buffer;
  1224. struct hv_24x7_data_result_buffer *result_buffer;
  1225. /* We should not be called if we are already in a txn */
  1226. WARN_ON_ONCE(__this_cpu_read(hv_24x7_txn_flags));
  1227. __this_cpu_write(hv_24x7_txn_flags, flags);
  1228. if (flags & ~PERF_PMU_TXN_READ)
  1229. return;
  1230. request_buffer = (void *)get_cpu_var(hv_24x7_reqb);
  1231. result_buffer = (void *)get_cpu_var(hv_24x7_resb);
  1232. init_24x7_request(request_buffer, result_buffer);
  1233. put_cpu_var(hv_24x7_resb);
  1234. put_cpu_var(hv_24x7_reqb);
  1235. }
  1236. /*
  1237. * Clean up transaction state.
  1238. *
  1239. * NOTE: Ignore state of request and result buffers for now.
  1240. * We will initialize them during the next read/txn.
  1241. */
  1242. static void reset_txn(void)
  1243. {
  1244. __this_cpu_write(hv_24x7_txn_flags, 0);
  1245. __this_cpu_write(hv_24x7_txn_err, 0);
  1246. }
  1247. /*
  1248. * 24x7 counters only support READ transactions. They are always counting
  1249. * and don't need/support ADD transactions. Clear ->txn_flags but otherwise
  1250. * ignore transactions that are not of type PERF_PMU_TXN_READ.
  1251. *
  1252. * For READ transactions, submit all pending 24x7 requests (i.e requests
  1253. * that were queued by h_24x7_event_read()), to the hypervisor and update
  1254. * the event counts.
  1255. */
  1256. static int h_24x7_event_commit_txn(struct pmu *pmu)
  1257. {
  1258. struct hv_24x7_request_buffer *request_buffer;
  1259. struct hv_24x7_data_result_buffer *result_buffer;
  1260. struct hv_24x7_result *res, *next_res;
  1261. u64 count;
  1262. int i, ret, txn_flags;
  1263. struct hv_24x7_hw *h24x7hw;
  1264. txn_flags = __this_cpu_read(hv_24x7_txn_flags);
  1265. WARN_ON_ONCE(!txn_flags);
  1266. ret = 0;
  1267. if (txn_flags & ~PERF_PMU_TXN_READ)
  1268. goto out;
  1269. ret = __this_cpu_read(hv_24x7_txn_err);
  1270. if (ret)
  1271. goto out;
  1272. request_buffer = (void *)get_cpu_var(hv_24x7_reqb);
  1273. result_buffer = (void *)get_cpu_var(hv_24x7_resb);
  1274. ret = make_24x7_request(request_buffer, result_buffer);
  1275. if (ret)
  1276. goto put_reqb;
  1277. h24x7hw = &get_cpu_var(hv_24x7_hw);
  1278. /* Go through results in the result buffer to update event counts. */
  1279. for (i = 0, res = result_buffer->results;
  1280. i < result_buffer->num_results; i++, res = next_res) {
  1281. struct perf_event *event = h24x7hw->events[res->result_ix];
  1282. ret = get_count_from_result(event, result_buffer, res, &count,
  1283. &next_res);
  1284. if (ret)
  1285. break;
  1286. update_event_count(event, count);
  1287. }
  1288. put_cpu_var(hv_24x7_hw);
  1289. put_reqb:
  1290. put_cpu_var(hv_24x7_resb);
  1291. put_cpu_var(hv_24x7_reqb);
  1292. out:
  1293. reset_txn();
  1294. return ret;
  1295. }
  1296. /*
  1297. * 24x7 counters only support READ transactions. They are always counting
  1298. * and don't need/support ADD transactions. However, regardless of the type
  1299. * of transaction, all we need to do is clean up, so we don't have to check
  1300. * the type of transaction.
  1301. */
  1302. static void h_24x7_event_cancel_txn(struct pmu *pmu)
  1303. {
  1304. WARN_ON_ONCE(!__this_cpu_read(hv_24x7_txn_flags));
  1305. reset_txn();
  1306. }
  1307. static struct pmu h_24x7_pmu = {
  1308. .task_ctx_nr = perf_invalid_context,
  1309. .name = "hv_24x7",
  1310. .attr_groups = attr_groups,
  1311. .event_init = h_24x7_event_init,
  1312. .add = h_24x7_event_add,
  1313. .del = h_24x7_event_stop,
  1314. .start = h_24x7_event_start,
  1315. .stop = h_24x7_event_stop,
  1316. .read = h_24x7_event_read,
  1317. .start_txn = h_24x7_event_start_txn,
  1318. .commit_txn = h_24x7_event_commit_txn,
  1319. .cancel_txn = h_24x7_event_cancel_txn,
  1320. };
  1321. static int hv_24x7_init(void)
  1322. {
  1323. int r;
  1324. unsigned long hret;
  1325. struct hv_perf_caps caps;
  1326. if (!firmware_has_feature(FW_FEATURE_LPAR)) {
  1327. pr_debug("not a virtualized system, not enabling\n");
  1328. return -ENODEV;
  1329. } else if (!cur_cpu_spec->oprofile_cpu_type)
  1330. return -ENODEV;
  1331. /* POWER8 only supports v1, while POWER9 only supports v2. */
  1332. if (!strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power8"))
  1333. interface_version = 1;
  1334. else {
  1335. interface_version = 2;
  1336. /* SMT8 in POWER9 needs to aggregate result elements. */
  1337. if (threads_per_core == 8)
  1338. aggregate_result_elements = true;
  1339. }
  1340. hret = hv_perf_caps_get(&caps);
  1341. if (hret) {
  1342. pr_debug("could not obtain capabilities, not enabling, rc=%ld\n",
  1343. hret);
  1344. return -ENODEV;
  1345. }
  1346. hv_page_cache = kmem_cache_create("hv-page-4096", 4096, 4096, 0, NULL);
  1347. if (!hv_page_cache)
  1348. return -ENOMEM;
  1349. /* sampling not supported */
  1350. h_24x7_pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;
  1351. r = create_events_from_catalog(&event_group.attrs,
  1352. &event_desc_group.attrs,
  1353. &event_long_desc_group.attrs);
  1354. if (r)
  1355. return r;
  1356. r = perf_pmu_register(&h_24x7_pmu, h_24x7_pmu.name, -1);
  1357. if (r)
  1358. return r;
  1359. return 0;
  1360. }
  1361. device_initcall(hv_24x7_init);