core.c

  1. /*
  2. * Performance events x86 architecture code
  3. *
  4. * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
  5. * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
  6. * Copyright (C) 2009 Jaswinder Singh Rajput
  7. * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
  8. * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra
  9. * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
  10. * Copyright (C) 2009 Google, Inc., Stephane Eranian
  11. *
  12. * For licencing details see kernel-base/COPYING
  13. */
  14. #include <linux/perf_event.h>
  15. #include <linux/capability.h>
  16. #include <linux/notifier.h>
  17. #include <linux/hardirq.h>
  18. #include <linux/kprobes.h>
  19. #include <linux/export.h>
  20. #include <linux/init.h>
  21. #include <linux/kdebug.h>
  22. #include <linux/sched/mm.h>
  23. #include <linux/sched/clock.h>
  24. #include <linux/uaccess.h>
  25. #include <linux/slab.h>
  26. #include <linux/cpu.h>
  27. #include <linux/bitops.h>
  28. #include <linux/device.h>
  29. #include <linux/nospec.h>
  30. #include <linux/static_call.h>
  31. #include <asm/apic.h>
  32. #include <asm/stacktrace.h>
  33. #include <asm/nmi.h>
  34. #include <asm/smp.h>
  35. #include <asm/alternative.h>
  36. #include <asm/mmu_context.h>
  37. #include <asm/tlbflush.h>
  38. #include <asm/timer.h>
  39. #include <asm/desc.h>
  40. #include <asm/ldt.h>
  41. #include <asm/unwind.h>
  42. #include <asm/uprobes.h>
  43. #include <asm/ibt.h>
  44. #include "perf_event.h"
  45. struct x86_pmu x86_pmu __read_mostly;
  46. static struct pmu pmu;
  47. DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
  48. .enabled = 1,
  49. .pmu = &pmu,
  50. };
  51. DEFINE_STATIC_KEY_FALSE(rdpmc_never_available_key);
  52. DEFINE_STATIC_KEY_FALSE(rdpmc_always_available_key);
  53. DEFINE_STATIC_KEY_FALSE(perf_is_hybrid);
  54. /*
  55. * This here uses DEFINE_STATIC_CALL_NULL() to get a static_call defined
  56. * from just a typename, as opposed to an actual function.
  57. */
  58. DEFINE_STATIC_CALL_NULL(x86_pmu_handle_irq, *x86_pmu.handle_irq);
  59. DEFINE_STATIC_CALL_NULL(x86_pmu_disable_all, *x86_pmu.disable_all);
  60. DEFINE_STATIC_CALL_NULL(x86_pmu_enable_all, *x86_pmu.enable_all);
  61. DEFINE_STATIC_CALL_NULL(x86_pmu_enable, *x86_pmu.enable);
  62. DEFINE_STATIC_CALL_NULL(x86_pmu_disable, *x86_pmu.disable);
  63. DEFINE_STATIC_CALL_NULL(x86_pmu_assign, *x86_pmu.assign);
  64. DEFINE_STATIC_CALL_NULL(x86_pmu_add, *x86_pmu.add);
  65. DEFINE_STATIC_CALL_NULL(x86_pmu_del, *x86_pmu.del);
  66. DEFINE_STATIC_CALL_NULL(x86_pmu_read, *x86_pmu.read);
  67. DEFINE_STATIC_CALL_NULL(x86_pmu_set_period, *x86_pmu.set_period);
  68. DEFINE_STATIC_CALL_NULL(x86_pmu_update, *x86_pmu.update);
  69. DEFINE_STATIC_CALL_NULL(x86_pmu_limit_period, *x86_pmu.limit_period);
  70. DEFINE_STATIC_CALL_NULL(x86_pmu_schedule_events, *x86_pmu.schedule_events);
  71. DEFINE_STATIC_CALL_NULL(x86_pmu_get_event_constraints, *x86_pmu.get_event_constraints);
  72. DEFINE_STATIC_CALL_NULL(x86_pmu_put_event_constraints, *x86_pmu.put_event_constraints);
  73. DEFINE_STATIC_CALL_NULL(x86_pmu_start_scheduling, *x86_pmu.start_scheduling);
  74. DEFINE_STATIC_CALL_NULL(x86_pmu_commit_scheduling, *x86_pmu.commit_scheduling);
  75. DEFINE_STATIC_CALL_NULL(x86_pmu_stop_scheduling, *x86_pmu.stop_scheduling);
  76. DEFINE_STATIC_CALL_NULL(x86_pmu_sched_task, *x86_pmu.sched_task);
  77. DEFINE_STATIC_CALL_NULL(x86_pmu_swap_task_ctx, *x86_pmu.swap_task_ctx);
  78. DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs);
  79. DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases);
  80. DEFINE_STATIC_CALL_NULL(x86_pmu_filter, *x86_pmu.filter);
  81. /*
  82. * This one is magic: it will get called even when PMU init fails (because
  83. * there is no PMU), in which case it should simply return NULL.
  84. */
  85. DEFINE_STATIC_CALL_RET0(x86_pmu_guest_get_msrs, *x86_pmu.guest_get_msrs);
  86. u64 __read_mostly hw_cache_event_ids
  87. [PERF_COUNT_HW_CACHE_MAX]
  88. [PERF_COUNT_HW_CACHE_OP_MAX]
  89. [PERF_COUNT_HW_CACHE_RESULT_MAX];
  90. u64 __read_mostly hw_cache_extra_regs
  91. [PERF_COUNT_HW_CACHE_MAX]
  92. [PERF_COUNT_HW_CACHE_OP_MAX]
  93. [PERF_COUNT_HW_CACHE_RESULT_MAX];
  94. /*
  95. * Propagate event elapsed time into the generic event.
  96. * Can only be executed on the CPU where the event is active.
  97. * Returns the new raw counter value.
  98. */
  99. u64 x86_perf_event_update(struct perf_event *event)
  100. {
  101. struct hw_perf_event *hwc = &event->hw;
  102. int shift = 64 - x86_pmu.cntval_bits;
  103. u64 prev_raw_count, new_raw_count;
  104. u64 delta;
  105. if (unlikely(!hwc->event_base))
  106. return 0;
  107. /*
  108. * Careful: an NMI might modify the previous event value.
  109. *
  110. * Our tactic to handle this is to first atomically read and
  111. * exchange a new raw count - then add that new-prev delta
  112. * count to the generic event atomically:
  113. */
  114. prev_raw_count = local64_read(&hwc->prev_count);
  115. do {
  116. rdpmcl(hwc->event_base_rdpmc, new_raw_count);
  117. } while (!local64_try_cmpxchg(&hwc->prev_count,
  118. &prev_raw_count, new_raw_count));
  119. /*
  120. * Now we have the new raw value and have updated the prev
  121. * timestamp already. We can now calculate the elapsed delta
  122. * (event-)time and add that to the generic event.
  123. *
  124. * Careful, not all hw sign-extends above the physical width
  125. * of the count.
  126. */
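  /*
   * Note: with 48-bit counters (cntval_bits == 48) the shift is 16, so
   * moving both raw values into the top bits before subtracting makes
   * the subtraction wrap correctly across a counter overflow, and the
   * final right shift recovers the 48-bit delta.
   */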
  127. delta = (new_raw_count << shift) - (prev_raw_count << shift);
  128. delta >>= shift;
  129. local64_add(delta, &event->count);
  130. local64_sub(delta, &hwc->period_left);
  131. return new_raw_count;
  132. }
  133. /*
  134. * Find and validate any extra registers to set up.
  135. */
  136. static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
  137. {
  138. struct extra_reg *extra_regs = hybrid(event->pmu, extra_regs);
  139. struct hw_perf_event_extra *reg;
  140. struct extra_reg *er;
  141. reg = &event->hw.extra_reg;
  142. if (!extra_regs)
  143. return 0;
  144. for (er = extra_regs; er->msr; er++) {
  145. if (er->event != (config & er->config_mask))
  146. continue;
  147. if (event->attr.config1 & ~er->valid_mask)
  148. return -EINVAL;
  149. /* Check if the extra MSRs can be safely accessed */
  150. if (!er->extra_msr_access)
  151. return -ENXIO;
  152. reg->idx = er->idx;
  153. reg->config = event->attr.config1;
  154. reg->reg = er->msr;
  155. break;
  156. }
  157. return 0;
  158. }
  159. static atomic_t active_events;
  160. static atomic_t pmc_refcount;
  161. static DEFINE_MUTEX(pmc_reserve_mutex);
  162. #ifdef CONFIG_X86_LOCAL_APIC
  163. static inline u64 get_possible_counter_mask(void)
  164. {
  165. u64 cntr_mask = x86_pmu.cntr_mask64;
  166. int i;
  167. if (!is_hybrid())
  168. return cntr_mask;
  169. for (i = 0; i < x86_pmu.num_hybrid_pmus; i++)
  170. cntr_mask |= x86_pmu.hybrid_pmu[i].cntr_mask64;
  171. return cntr_mask;
  172. }
  173. static bool reserve_pmc_hardware(void)
  174. {
  175. u64 cntr_mask = get_possible_counter_mask();
  176. int i, end;
  177. for_each_set_bit(i, (unsigned long *)&cntr_mask, X86_PMC_IDX_MAX) {
  178. if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
  179. goto perfctr_fail;
  180. }
  181. for_each_set_bit(i, (unsigned long *)&cntr_mask, X86_PMC_IDX_MAX) {
  182. if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
  183. goto eventsel_fail;
  184. }
  185. return true;
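  /*
   * Note: the unwind below releases only what was reserved before the
   * failure; on the eventsel_fail path every perfctr was already
   * reserved, so i is reset to X86_PMC_IDX_MAX before falling through
   * to perfctr_fail.
   */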
  186. eventsel_fail:
  187. end = i;
  188. for_each_set_bit(i, (unsigned long *)&cntr_mask, end)
  189. release_evntsel_nmi(x86_pmu_config_addr(i));
  190. i = X86_PMC_IDX_MAX;
  191. perfctr_fail:
  192. end = i;
  193. for_each_set_bit(i, (unsigned long *)&cntr_mask, end)
  194. release_perfctr_nmi(x86_pmu_event_addr(i));
  195. return false;
  196. }
  197. static void release_pmc_hardware(void)
  198. {
  199. u64 cntr_mask = get_possible_counter_mask();
  200. int i;
  201. for_each_set_bit(i, (unsigned long *)&cntr_mask, X86_PMC_IDX_MAX) {
  202. release_perfctr_nmi(x86_pmu_event_addr(i));
  203. release_evntsel_nmi(x86_pmu_config_addr(i));
  204. }
  205. }
  206. #else
  207. static bool reserve_pmc_hardware(void) { return true; }
  208. static void release_pmc_hardware(void) {}
  209. #endif
  210. bool check_hw_exists(struct pmu *pmu, unsigned long *cntr_mask,
  211. unsigned long *fixed_cntr_mask)
  212. {
  213. u64 val, val_fail = -1, val_new = ~0;
  214. int i, reg, reg_fail = -1, ret = 0;
  215. int bios_fail = 0;
  216. int reg_safe = -1;
  217. /*
  218. * Check to see if the BIOS enabled any of the counters, if so
  219. * complain and bail.
  220. */
  221. for_each_set_bit(i, cntr_mask, X86_PMC_IDX_MAX) {
  222. reg = x86_pmu_config_addr(i);
  223. ret = rdmsrl_safe(reg, &val);
  224. if (ret)
  225. goto msr_fail;
  226. if (val & ARCH_PERFMON_EVENTSEL_ENABLE) {
  227. bios_fail = 1;
  228. val_fail = val;
  229. reg_fail = reg;
  230. } else {
  231. reg_safe = i;
  232. }
  233. }
  234. if (*(u64 *)fixed_cntr_mask) {
  235. reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
  236. ret = rdmsrl_safe(reg, &val);
  237. if (ret)
  238. goto msr_fail;
  239. for_each_set_bit(i, fixed_cntr_mask, X86_PMC_IDX_MAX) {
  240. if (fixed_counter_disabled(i, pmu))
  241. continue;
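  /* Each fixed counter has a 4-bit field in FIXED_CTR_CTRL; bits 0-1 enable it. */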
  242. if (val & (0x03ULL << i*4)) {
  243. bios_fail = 1;
  244. val_fail = val;
  245. reg_fail = reg;
  246. }
  247. }
  248. }
  249. /*
  250. * If all the counters are enabled, the below test will always
  251. * fail. The tools will also become useless in this scenario.
  252. * Just fail and disable the hardware counters.
  253. */
  254. if (reg_safe == -1) {
  255. reg = reg_safe;
  256. goto msr_fail;
  257. }
  258. /*
  259. * Read the current value, change it and read it back to see if it
  260. * matches, this is needed to detect certain hardware emulators
  261. * (qemu/kvm) that don't trap on the MSR access and always return 0s.
  262. */
  263. reg = x86_pmu_event_addr(reg_safe);
  264. if (rdmsrl_safe(reg, &val))
  265. goto msr_fail;
  266. val ^= 0xffffUL;
  267. ret = wrmsrl_safe(reg, val);
  268. ret |= rdmsrl_safe(reg, &val_new);
  269. if (ret || val != val_new)
  270. goto msr_fail;
  271. /*
  272. * We still allow the PMU driver to operate:
  273. */
  274. if (bios_fail) {
  275. pr_cont("Broken BIOS detected, complain to your hardware vendor.\n");
  276. pr_err(FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n",
  277. reg_fail, val_fail);
  278. }
  279. return true;
  280. msr_fail:
  281. if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
  282. pr_cont("PMU not available due to virtualization, using software events only.\n");
  283. } else {
  284. pr_cont("Broken PMU hardware detected, using software events only.\n");
  285. pr_err("Failed to access perfctr msr (MSR %x is %Lx)\n",
  286. reg, val_new);
  287. }
  288. return false;
  289. }
  290. static void hw_perf_event_destroy(struct perf_event *event)
  291. {
  292. x86_release_hardware();
  293. atomic_dec(&active_events);
  294. }
  295. void hw_perf_lbr_event_destroy(struct perf_event *event)
  296. {
  297. hw_perf_event_destroy(event);
  298. /* undo the lbr/bts event accounting */
  299. x86_del_exclusive(x86_lbr_exclusive_lbr);
  300. }
  301. static inline int x86_pmu_initialized(void)
  302. {
  303. return x86_pmu.handle_irq != NULL;
  304. }
  305. static inline int
  306. set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
  307. {
  308. struct perf_event_attr *attr = &event->attr;
  309. unsigned int cache_type, cache_op, cache_result;
  310. u64 config, val;
  311. config = attr->config;
  312. cache_type = (config >> 0) & 0xff;
  313. if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
  314. return -EINVAL;
  315. cache_type = array_index_nospec(cache_type, PERF_COUNT_HW_CACHE_MAX);
  316. cache_op = (config >> 8) & 0xff;
  317. if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
  318. return -EINVAL;
  319. cache_op = array_index_nospec(cache_op, PERF_COUNT_HW_CACHE_OP_MAX);
  320. cache_result = (config >> 16) & 0xff;
  321. if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
  322. return -EINVAL;
  323. cache_result = array_index_nospec(cache_result, PERF_COUNT_HW_CACHE_RESULT_MAX);
  324. val = hybrid_var(event->pmu, hw_cache_event_ids)[cache_type][cache_op][cache_result];
  325. if (val == 0)
  326. return -ENOENT;
  327. if (val == -1)
  328. return -EINVAL;
  329. hwc->config |= val;
  330. attr->config1 = hybrid_var(event->pmu, hw_cache_extra_regs)[cache_type][cache_op][cache_result];
  331. return x86_pmu_extra_regs(val, event);
  332. }
  333. int x86_reserve_hardware(void)
  334. {
  335. int err = 0;
  336. if (!atomic_inc_not_zero(&pmc_refcount)) {
  337. mutex_lock(&pmc_reserve_mutex);
  338. if (atomic_read(&pmc_refcount) == 0) {
  339. if (!reserve_pmc_hardware()) {
  340. err = -EBUSY;
  341. } else {
  342. reserve_ds_buffers();
  343. reserve_lbr_buffers();
  344. }
  345. }
  346. if (!err)
  347. atomic_inc(&pmc_refcount);
  348. mutex_unlock(&pmc_reserve_mutex);
  349. }
  350. return err;
  351. }
  352. void x86_release_hardware(void)
  353. {
  354. if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) {
  355. release_pmc_hardware();
  356. release_ds_buffers();
  357. release_lbr_buffers();
  358. mutex_unlock(&pmc_reserve_mutex);
  359. }
  360. }
  361. /*
  362. * Check if we can create event of a certain type (that no conflicting events
  363. * are present).
  364. */
  365. int x86_add_exclusive(unsigned int what)
  366. {
  367. int i;
  368. /*
  369. * When lbr_pt_coexist we allow PT to coexist with either LBR or BTS.
  370. * LBR and BTS are still mutually exclusive.
  371. */
  372. if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt)
  373. goto out;
  374. if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) {
  375. mutex_lock(&pmc_reserve_mutex);
  376. for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) {
  377. if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i]))
  378. goto fail_unlock;
  379. }
  380. atomic_inc(&x86_pmu.lbr_exclusive[what]);
  381. mutex_unlock(&pmc_reserve_mutex);
  382. }
  383. out:
  384. atomic_inc(&active_events);
  385. return 0;
  386. fail_unlock:
  387. mutex_unlock(&pmc_reserve_mutex);
  388. return -EBUSY;
  389. }
  390. void x86_del_exclusive(unsigned int what)
  391. {
  392. atomic_dec(&active_events);
  393. /*
  394. * See the comment in x86_add_exclusive().
  395. */
  396. if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt)
  397. return;
  398. atomic_dec(&x86_pmu.lbr_exclusive[what]);
  399. }
  400. int x86_setup_perfctr(struct perf_event *event)
  401. {
  402. struct perf_event_attr *attr = &event->attr;
  403. struct hw_perf_event *hwc = &event->hw;
  404. u64 config;
  405. if (!is_sampling_event(event)) {
  406. hwc->sample_period = x86_pmu.max_period;
  407. hwc->last_period = hwc->sample_period;
  408. local64_set(&hwc->period_left, hwc->sample_period);
  409. }
  410. if (attr->type == event->pmu->type)
  411. return x86_pmu_extra_regs(event->attr.config, event);
  412. if (attr->type == PERF_TYPE_HW_CACHE)
  413. return set_ext_hw_attr(hwc, event);
  414. if (attr->config >= x86_pmu.max_events)
  415. return -EINVAL;
  416. attr->config = array_index_nospec((unsigned long)attr->config, x86_pmu.max_events);
  417. /*
  418. * The generic map:
  419. */
  420. config = x86_pmu.event_map(attr->config);
  421. if (config == 0)
  422. return -ENOENT;
  423. if (config == -1LL)
  424. return -EINVAL;
  425. hwc->config |= config;
  426. return 0;
  427. }
  428. /*
  429. * check that branch_sample_type is compatible with
  430. * settings needed for precise_ip > 1 which implies
  431. * using the LBR to capture ALL taken branches at the
  432. * priv levels of the measurement
  433. */
  434. static inline int precise_br_compat(struct perf_event *event)
  435. {
  436. u64 m = event->attr.branch_sample_type;
  437. u64 b = 0;
  438. /* must capture all branches */
  439. if (!(m & PERF_SAMPLE_BRANCH_ANY))
  440. return 0;
  441. m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER;
  442. if (!event->attr.exclude_user)
  443. b |= PERF_SAMPLE_BRANCH_USER;
  444. if (!event->attr.exclude_kernel)
  445. b |= PERF_SAMPLE_BRANCH_KERNEL;
  446. /*
  447. * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86
  448. */
  449. return m == b;
  450. }
  451. int x86_pmu_max_precise(void)
  452. {
  453. int precise = 0;
  454. /* Support for constant skid */
  455. if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
  456. precise++;
  457. /* Support for IP fixup */
  458. if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
  459. precise++;
  460. if (x86_pmu.pebs_prec_dist)
  461. precise++;
  462. }
  463. return precise;
  464. }
  465. int x86_pmu_hw_config(struct perf_event *event)
  466. {
  467. if (event->attr.precise_ip) {
  468. int precise = x86_pmu_max_precise();
  469. if (event->attr.precise_ip > precise)
  470. return -EOPNOTSUPP;
  471. /* There's no sense in having PEBS for non-sampling events: */
  472. if (!is_sampling_event(event))
  473. return -EINVAL;
  474. }
  475. /*
  476. * check that PEBS LBR correction does not conflict with
  477. * whatever the user is asking with attr->branch_sample_type
  478. */
  479. if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
  480. u64 *br_type = &event->attr.branch_sample_type;
  481. if (has_branch_stack(event)) {
  482. if (!precise_br_compat(event))
  483. return -EOPNOTSUPP;
  484. /* branch_sample_type is compatible */
  485. } else {
  486. /*
  487. * user did not specify branch_sample_type
  488. *
  489. * For PEBS fixups, we capture all
  490. * the branches at the priv level of the
  491. * event.
  492. */
  493. *br_type = PERF_SAMPLE_BRANCH_ANY;
  494. if (!event->attr.exclude_user)
  495. *br_type |= PERF_SAMPLE_BRANCH_USER;
  496. if (!event->attr.exclude_kernel)
  497. *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
  498. }
  499. }
  500. if (branch_sample_call_stack(event))
  501. event->attach_state |= PERF_ATTACH_TASK_DATA;
  502. /*
  503. * Generate PMC IRQs:
  504. * (keep 'enabled' bit clear for now)
  505. */
  506. event->hw.config = ARCH_PERFMON_EVENTSEL_INT;
  507. /*
  508. * Count user and OS events unless requested not to
  509. */
  510. if (!event->attr.exclude_user)
  511. event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
  512. if (!event->attr.exclude_kernel)
  513. event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
  514. if (event->attr.type == event->pmu->type)
  515. event->hw.config |= x86_pmu_get_event_config(event);
  516. if (event->attr.sample_period && x86_pmu.limit_period) {
  517. s64 left = event->attr.sample_period;
  518. x86_pmu.limit_period(event, &left);
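  /* Note: reject the period if the PMU-imposed minimum exceeds what was requested. */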
  519. if (left > event->attr.sample_period)
  520. return -EINVAL;
  521. }
  522. /* sample_regs_user never supports XMM registers */
  523. if (unlikely(event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK))
  524. return -EINVAL;
  525. /*
  526. * Besides the general purpose registers, XMM registers may
  527. * be collected in PEBS on some platforms, e.g. Icelake
  528. */
  529. if (unlikely(event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK)) {
  530. if (!(event->pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS))
  531. return -EINVAL;
  532. if (!event->attr.precise_ip)
  533. return -EINVAL;
  534. }
  535. return x86_setup_perfctr(event);
  536. }
  537. /*
  538. * Setup the hardware configuration for a given attr_type
  539. */
  540. static int __x86_pmu_event_init(struct perf_event *event)
  541. {
  542. int err;
  543. if (!x86_pmu_initialized())
  544. return -ENODEV;
  545. err = x86_reserve_hardware();
  546. if (err)
  547. return err;
  548. atomic_inc(&active_events);
  549. event->destroy = hw_perf_event_destroy;
  550. event->hw.idx = -1;
  551. event->hw.last_cpu = -1;
  552. event->hw.last_tag = ~0ULL;
  553. /* mark unused */
  554. event->hw.extra_reg.idx = EXTRA_REG_NONE;
  555. event->hw.branch_reg.idx = EXTRA_REG_NONE;
  556. return x86_pmu.hw_config(event);
  557. }
  558. void x86_pmu_disable_all(void)
  559. {
  560. struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
  561. int idx;
  562. for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) {
  563. struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
  564. u64 val;
  565. if (!test_bit(idx, cpuc->active_mask))
  566. continue;
  567. rdmsrl(x86_pmu_config_addr(idx), val);
  568. if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
  569. continue;
  570. val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
  571. wrmsrl(x86_pmu_config_addr(idx), val);
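  /* Note: a large increment event also programs the odd counter of the pair; clear its control as well. */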
  572. if (is_counter_pair(hwc))
  573. wrmsrl(x86_pmu_config_addr(idx + 1), 0);
  574. }
  575. }
  576. struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data)
  577. {
  578. return static_call(x86_pmu_guest_get_msrs)(nr, data);
  579. }
  580. EXPORT_SYMBOL_GPL(perf_guest_get_msrs);
  581. /*
  582. * A PMI may land after enabled=0. It could hit before or
  583. * after disable_all.
  584. *
  585. * If PMI hits before disable_all, the PMU will be disabled in the NMI handler.
  586. * It will not be re-enabled in the NMI handler again, because enabled=0. After
  587. * handling the NMI, disable_all will be called, which will not change the
  588. * state either. If PMI hits after disable_all, the PMU is already disabled
  589. * before entering NMI handler. The NMI handler will not change the state
  590. * either.
  591. *
  592. * So either situation is harmless.
  593. */
  594. static void x86_pmu_disable(struct pmu *pmu)
  595. {
  596. struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
  597. if (!x86_pmu_initialized())
  598. return;
  599. if (!cpuc->enabled)
  600. return;
  601. cpuc->n_added = 0;
  602. cpuc->enabled = 0;
  603. barrier();
  604. static_call(x86_pmu_disable_all)();
  605. }
  606. void x86_pmu_enable_all(int added)
  607. {
  608. struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
  609. int idx;
  610. for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) {
  611. struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
  612. if (!test_bit(idx, cpuc->active_mask))
  613. continue;
  614. __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
  615. }
  616. }
  617. static inline int is_x86_event(struct perf_event *event)
  618. {
  619. int i;
  620. if (!is_hybrid())
  621. return event->pmu == &pmu;
  622. for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
  623. if (event->pmu == &x86_pmu.hybrid_pmu[i].pmu)
  624. return true;
  625. }
  626. return false;
  627. }
  628. struct pmu *x86_get_pmu(unsigned int cpu)
  629. {
  630. struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
  631. /*
  632. * All CPUs of this hybrid type have gone offline;
  633. * x86_get_pmu() should not be invoked in that case.
  634. */
  635. if (WARN_ON_ONCE(!cpuc->pmu))
  636. return &pmu;
  637. return cpuc->pmu;
  638. }
  639. /*
  640. * Event scheduler state:
  641. *
  642. * Assign events by iterating over all events and counters, beginning
  643. * with the events of least weight. Keep the current iterator
  644. * state in struct sched_state.
  645. */
  646. struct sched_state {
  647. int weight;
  648. int event; /* event index */
  649. int counter; /* counter index */
  650. int unassigned; /* number of events to be assigned left */
  651. int nr_gp; /* number of GP counters used */
  652. u64 used;
  653. };
  654. /* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */
  655. #define SCHED_STATES_MAX 2
  656. struct perf_sched {
  657. int max_weight;
  658. int max_events;
  659. int max_gp;
  660. int saved_states;
  661. struct event_constraint **constraints;
  662. struct sched_state state;
  663. struct sched_state saved[SCHED_STATES_MAX];
  664. };
  665. /*
  666. * Initialize iterator that runs through all events and counters.
  667. */
  668. static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints,
  669. int num, int wmin, int wmax, int gpmax)
  670. {
  671. int idx;
  672. memset(sched, 0, sizeof(*sched));
  673. sched->max_events = num;
  674. sched->max_weight = wmax;
  675. sched->max_gp = gpmax;
  676. sched->constraints = constraints;
  677. for (idx = 0; idx < num; idx++) {
  678. if (constraints[idx]->weight == wmin)
  679. break;
  680. }
  681. sched->state.event = idx; /* start with min weight */
  682. sched->state.weight = wmin;
  683. sched->state.unassigned = num;
  684. }
  685. static void perf_sched_save_state(struct perf_sched *sched)
  686. {
  687. if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX))
  688. return;
  689. sched->saved[sched->saved_states] = sched->state;
  690. sched->saved_states++;
  691. }
  692. static bool perf_sched_restore_state(struct perf_sched *sched)
  693. {
  694. if (!sched->saved_states)
  695. return false;
  696. sched->saved_states--;
  697. sched->state = sched->saved[sched->saved_states];
  698. /* this assignment didn't work out */
  699. /* XXX broken vs EVENT_PAIR */
  700. sched->state.used &= ~BIT_ULL(sched->state.counter);
  701. /* try the next one */
  702. sched->state.counter++;
  703. return true;
  704. }
  705. /*
  706. * Select a counter for the current event to schedule. Return true on
  707. * success.
  708. */
  709. static bool __perf_sched_find_counter(struct perf_sched *sched)
  710. {
  711. struct event_constraint *c;
  712. int idx;
  713. if (!sched->state.unassigned)
  714. return false;
  715. if (sched->state.event >= sched->max_events)
  716. return false;
  717. c = sched->constraints[sched->state.event];
  718. /* Prefer fixed purpose counters */
  719. if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
  720. idx = INTEL_PMC_IDX_FIXED;
  721. for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
  722. u64 mask = BIT_ULL(idx);
  723. if (sched->state.used & mask)
  724. continue;
  725. sched->state.used |= mask;
  726. goto done;
  727. }
  728. }
  729. /* Grab the first unused counter starting with idx */
  730. idx = sched->state.counter;
  731. for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
  732. u64 mask = BIT_ULL(idx);
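  /* Note: a large increment (Merge) event also occupies the adjacent odd counter. */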
  733. if (c->flags & PERF_X86_EVENT_PAIR)
  734. mask |= mask << 1;
  735. if (sched->state.used & mask)
  736. continue;
  737. if (sched->state.nr_gp++ >= sched->max_gp)
  738. return false;
  739. sched->state.used |= mask;
  740. goto done;
  741. }
  742. return false;
  743. done:
  744. sched->state.counter = idx;
  745. if (c->overlap)
  746. perf_sched_save_state(sched);
  747. return true;
  748. }
  749. static bool perf_sched_find_counter(struct perf_sched *sched)
  750. {
  751. while (!__perf_sched_find_counter(sched)) {
  752. if (!perf_sched_restore_state(sched))
  753. return false;
  754. }
  755. return true;
  756. }
  757. /*
  758. * Go through all unassigned events and find the next one to schedule.
  759. * Take events with the least weight first. Return true on success.
  760. */
  761. static bool perf_sched_next_event(struct perf_sched *sched)
  762. {
  763. struct event_constraint *c;
  764. if (!sched->state.unassigned || !--sched->state.unassigned)
  765. return false;
  766. do {
  767. /* next event */
  768. sched->state.event++;
  769. if (sched->state.event >= sched->max_events) {
  770. /* next weight */
  771. sched->state.event = 0;
  772. sched->state.weight++;
  773. if (sched->state.weight > sched->max_weight)
  774. return false;
  775. }
  776. c = sched->constraints[sched->state.event];
  777. } while (c->weight != sched->state.weight);
  778. sched->state.counter = 0; /* start with first counter */
  779. return true;
  780. }
  781. /*
  782. * Assign a counter for each event.
  783. */
  784. int perf_assign_events(struct event_constraint **constraints, int n,
  785. int wmin, int wmax, int gpmax, int *assign)
  786. {
  787. struct perf_sched sched;
  788. perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax);
  789. do {
  790. if (!perf_sched_find_counter(&sched))
  791. break; /* failed */
  792. if (assign)
  793. assign[sched.state.event] = sched.state.counter;
  794. } while (perf_sched_next_event(&sched));
  795. return sched.state.unassigned;
  796. }
  797. EXPORT_SYMBOL_GPL(perf_assign_events);
  798. int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
  799. {
  800. struct event_constraint *c;
  801. struct perf_event *e;
  802. int n0, i, wmin, wmax, unsched = 0;
  803. struct hw_perf_event *hwc;
  804. u64 used_mask = 0;
  805. /*
  806. * Compute the number of events already present; see x86_pmu_add(),
  807. * validate_group() and x86_pmu_commit_txn(). For the former two
  808. * cpuc->n_events hasn't been updated yet, while for the latter
  809. * cpuc->n_txn contains the number of events added in the current
  810. * transaction.
  811. */
  812. n0 = cpuc->n_events;
  813. if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
  814. n0 -= cpuc->n_txn;
  815. static_call_cond(x86_pmu_start_scheduling)(cpuc);
  816. for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
  817. c = cpuc->event_constraint[i];
  818. /*
  819. * Previously scheduled events should have a cached constraint,
  820. * while new events should not have one.
  821. */
  822. WARN_ON_ONCE((c && i >= n0) || (!c && i < n0));
  823. /*
  824. * Request constraints for new events; or for those events that
  825. * have a dynamic constraint -- for those the constraint can
  826. * change due to external factors (sibling state, allow_tfa).
  827. */
  828. if (!c || (c->flags & PERF_X86_EVENT_DYNAMIC)) {
  829. c = static_call(x86_pmu_get_event_constraints)(cpuc, i, cpuc->event_list[i]);
  830. cpuc->event_constraint[i] = c;
  831. }
  832. wmin = min(wmin, c->weight);
  833. wmax = max(wmax, c->weight);
  834. }
  835. /*
  836. * fastpath, try to reuse previous register
  837. */
  838. for (i = 0; i < n; i++) {
  839. u64 mask;
  840. hwc = &cpuc->event_list[i]->hw;
  841. c = cpuc->event_constraint[i];
  842. /* never assigned */
  843. if (hwc->idx == -1)
  844. break;
  845. /* constraint still honored */
  846. if (!test_bit(hwc->idx, c->idxmsk))
  847. break;
  848. mask = BIT_ULL(hwc->idx);
  849. if (is_counter_pair(hwc))
  850. mask |= mask << 1;
  851. /* not already used */
  852. if (used_mask & mask)
  853. break;
  854. used_mask |= mask;
  855. if (assign)
  856. assign[i] = hwc->idx;
  857. }
  858. /* slow path */
  859. if (i != n) {
  860. int gpmax = x86_pmu_max_num_counters(cpuc->pmu);
  861. /*
  862. * Do not allow scheduling of more than half the available
  863. * generic counters.
  864. *
  865. * This helps avoid counter starvation of the sibling thread by
  866. * ensuring that at most half the counters can be in exclusive
  867. * mode. There are no designated counters for the limits; any
  868. * N/2 counters can be used. This helps with events that have
  869. * specific counter constraints.
  870. */
  871. if (is_ht_workaround_enabled() && !cpuc->is_fake &&
  872. READ_ONCE(cpuc->excl_cntrs->exclusive_present))
  873. gpmax /= 2;
  874. /*
  875. * Reduce the amount of available counters to allow fitting
  876. * the extra Merge events needed by large increment events.
  877. */
  878. if (x86_pmu.flags & PMU_FL_PAIR) {
  879. gpmax -= cpuc->n_pair;
  880. WARN_ON(gpmax <= 0);
  881. }
  882. unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
  883. wmax, gpmax, assign);
  884. }
  885. /*
  886. * In case of success (unsched = 0), mark events as committed,
  887. * so we do not put_constraint() in case new events are added
  888. * and fail to be scheduled
  889. *
  890. * We invoke the lower level commit callback to lock the resource
  891. *
  892. * We do not need to do all of this in case we are called to
  893. * validate an event group (assign == NULL)
  894. */
  895. if (!unsched && assign) {
  896. for (i = 0; i < n; i++)
  897. static_call_cond(x86_pmu_commit_scheduling)(cpuc, i, assign[i]);
  898. } else {
  899. for (i = n0; i < n; i++) {
  900. e = cpuc->event_list[i];
  901. /*
  902. * release events that failed scheduling
  903. */
  904. static_call_cond(x86_pmu_put_event_constraints)(cpuc, e);
  905. cpuc->event_constraint[i] = NULL;
  906. }
  907. }
  908. static_call_cond(x86_pmu_stop_scheduling)(cpuc);
  909. return unsched ? -EINVAL : 0;
  910. }
  911. static int add_nr_metric_event(struct cpu_hw_events *cpuc,
  912. struct perf_event *event)
  913. {
  914. if (is_metric_event(event)) {
  915. if (cpuc->n_metric == INTEL_TD_METRIC_NUM)
  916. return -EINVAL;
  917. cpuc->n_metric++;
  918. cpuc->n_txn_metric++;
  919. }
  920. return 0;
  921. }
  922. static void del_nr_metric_event(struct cpu_hw_events *cpuc,
  923. struct perf_event *event)
  924. {
  925. if (is_metric_event(event))
  926. cpuc->n_metric--;
  927. }
  928. static int collect_event(struct cpu_hw_events *cpuc, struct perf_event *event,
  929. int max_count, int n)
  930. {
  931. union perf_capabilities intel_cap = hybrid(cpuc->pmu, intel_cap);
  932. if (intel_cap.perf_metrics && add_nr_metric_event(cpuc, event))
  933. return -EINVAL;
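  /*
   * Note: topdown metric events all ride on fixed counter 3 (SLOTS) and do
   * not occupy a counter of their own, so the limit is extended by n_metric.
   */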
  934. if (n >= max_count + cpuc->n_metric)
  935. return -EINVAL;
  936. cpuc->event_list[n] = event;
  937. if (is_counter_pair(&event->hw)) {
  938. cpuc->n_pair++;
  939. cpuc->n_txn_pair++;
  940. }
  941. return 0;
  942. }
  943. /*
  944. * dogrp: true if sibling events (the group) must be collected as well
  945. * returns the total number of events, or a negative error code
  946. */
  947. static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
  948. {
  949. struct perf_event *event;
  950. int n, max_count;
  951. max_count = x86_pmu_num_counters(cpuc->pmu) + x86_pmu_num_counters_fixed(cpuc->pmu);
  952. /* current number of events already accepted */
  953. n = cpuc->n_events;
  954. if (!cpuc->n_events)
  955. cpuc->pebs_output = 0;
  956. if (!cpuc->is_fake && leader->attr.precise_ip) {
  957. /*
  958. * For PEBS->PT, if !aux_event, the group leader (PT) went
  959. * away, the group was broken down and this singleton event
  960. * can't schedule any more.
  961. */
  962. if (is_pebs_pt(leader) && !leader->aux_event)
  963. return -EINVAL;
  964. /*
  965. * pebs_output: 0: no PEBS so far, 1: PT, 2: DS
  966. */
  967. if (cpuc->pebs_output &&
  968. cpuc->pebs_output != is_pebs_pt(leader) + 1)
  969. return -EINVAL;
  970. cpuc->pebs_output = is_pebs_pt(leader) + 1;
  971. }
  972. if (is_x86_event(leader)) {
  973. if (collect_event(cpuc, leader, max_count, n))
  974. return -EINVAL;
  975. n++;
  976. }
  977. if (!dogrp)
  978. return n;
  979. for_each_sibling_event(event, leader) {
  980. if (!is_x86_event(event) || event->state <= PERF_EVENT_STATE_OFF)
  981. continue;
  982. if (collect_event(cpuc, event, max_count, n))
  983. return -EINVAL;
  984. n++;
  985. }
  986. return n;
  987. }
  988. static inline void x86_assign_hw_event(struct perf_event *event,
  989. struct cpu_hw_events *cpuc, int i)
  990. {
  991. struct hw_perf_event *hwc = &event->hw;
  992. int idx;
  993. idx = hwc->idx = cpuc->assign[i];
  994. hwc->last_cpu = smp_processor_id();
  995. hwc->last_tag = ++cpuc->tags[i];
  996. static_call_cond(x86_pmu_assign)(event, idx);
  997. switch (hwc->idx) {
  998. case INTEL_PMC_IDX_FIXED_BTS:
  999. case INTEL_PMC_IDX_FIXED_VLBR:
  1000. hwc->config_base = 0;
  1001. hwc->event_base = 0;
  1002. break;
  1003. case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END:
  1004. /* All the metric events are mapped onto the fixed counter 3. */
  1005. idx = INTEL_PMC_IDX_FIXED_SLOTS;
  1006. fallthrough;
  1007. case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS-1:
  1008. hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
  1009. hwc->event_base = x86_pmu_fixed_ctr_addr(idx - INTEL_PMC_IDX_FIXED);
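  /* Note: RDPMC selects fixed counters via bit 30 of the index (INTEL_PMC_FIXED_RDPMC_BASE). */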
  1010. hwc->event_base_rdpmc = (idx - INTEL_PMC_IDX_FIXED) |
  1011. INTEL_PMC_FIXED_RDPMC_BASE;
  1012. break;
  1013. default:
  1014. hwc->config_base = x86_pmu_config_addr(hwc->idx);
  1015. hwc->event_base = x86_pmu_event_addr(hwc->idx);
  1016. hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx);
  1017. break;
  1018. }
  1019. }
  1020. /**
  1021. * x86_perf_rdpmc_index - Return PMC counter used for event
  1022. * @event: the perf_event to which the PMC counter was assigned
  1023. *
  1024. * The counter assigned to this performance event may change if interrupts
  1025. * are enabled. This counter should thus never be used while interrupts are
  1026. * enabled. Before this function is used to obtain the assigned counter the
  1027. * event should be checked for validity using, for example,
  1028. * perf_event_read_local(), within the same interrupt disabled section in
  1029. * which this counter is planned to be used.
  1030. *
  1031. * Return: The index of the performance monitoring counter assigned to
  1032. * @perf_event.
  1033. */
  1034. int x86_perf_rdpmc_index(struct perf_event *event)
  1035. {
  1036. lockdep_assert_irqs_disabled();
  1037. return event->hw.event_base_rdpmc;
  1038. }
  1039. static inline int match_prev_assignment(struct hw_perf_event *hwc,
  1040. struct cpu_hw_events *cpuc,
  1041. int i)
  1042. {
  1043. return hwc->idx == cpuc->assign[i] &&
  1044. hwc->last_cpu == smp_processor_id() &&
  1045. hwc->last_tag == cpuc->tags[i];
  1046. }
  1047. static void x86_pmu_start(struct perf_event *event, int flags);
  1048. static void x86_pmu_enable(struct pmu *pmu)
  1049. {
  1050. struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
  1051. struct perf_event *event;
  1052. struct hw_perf_event *hwc;
  1053. int i, added = cpuc->n_added;
  1054. if (!x86_pmu_initialized())
  1055. return;
  1056. if (cpuc->enabled)
  1057. return;
  1058. if (cpuc->n_added) {
  1059. int n_running = cpuc->n_events - cpuc->n_added;
  1060. /*
  1061. * apply assignment obtained either from
  1062. * hw_perf_group_sched_in() or x86_pmu_enable()
  1063. *
  1064. * step1: save events moving to new counters
  1065. */
  1066. for (i = 0; i < n_running; i++) {
  1067. event = cpuc->event_list[i];
  1068. hwc = &event->hw;
  1069. /*
  1070. * we can avoid reprogramming counter if:
  1071. * - assigned same counter as last time
  1072. * - running on same CPU as last time
  1073. * - no other event has used the counter since
  1074. */
  1075. if (hwc->idx == -1 ||
  1076. match_prev_assignment(hwc, cpuc, i))
  1077. continue;
  1078. /*
  1079. * Ensure we don't accidentally enable a stopped
  1080. * counter simply because we rescheduled.
  1081. */
  1082. if (hwc->state & PERF_HES_STOPPED)
  1083. hwc->state |= PERF_HES_ARCH;
  1084. x86_pmu_stop(event, PERF_EF_UPDATE);
  1085. }
  1086. /*
  1087. * step2: reprogram moved events into new counters
  1088. */
  1089. for (i = 0; i < cpuc->n_events; i++) {
  1090. event = cpuc->event_list[i];
  1091. hwc = &event->hw;
  1092. if (!match_prev_assignment(hwc, cpuc, i))
  1093. x86_assign_hw_event(event, cpuc, i);
  1094. else if (i < n_running)
  1095. continue;
  1096. if (hwc->state & PERF_HES_ARCH)
  1097. continue;
  1098. /*
  1099. * if cpuc->enabled = 0, then no wrmsr as
  1100. * per x86_pmu_enable_event()
  1101. */
  1102. x86_pmu_start(event, PERF_EF_RELOAD);
  1103. }
  1104. cpuc->n_added = 0;
  1105. perf_events_lapic_init();
  1106. }
  1107. cpuc->enabled = 1;
  1108. barrier();
  1109. static_call(x86_pmu_enable_all)(added);
  1110. }
  1111. DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
  1112. /*
  1113. * Set the next IRQ period, based on the hwc->period_left value.
  1114. * To be called with the event disabled in hw:
  1115. */
  1116. int x86_perf_event_set_period(struct perf_event *event)
  1117. {
  1118. struct hw_perf_event *hwc = &event->hw;
  1119. s64 left = local64_read(&hwc->period_left);
  1120. s64 period = hwc->sample_period;
  1121. int ret = 0, idx = hwc->idx;
  1122. if (unlikely(!hwc->event_base))
  1123. return 0;
  1124. /*
  1125. * If we are way outside a reasonable range then just skip forward:
  1126. */
  1127. if (unlikely(left <= -period)) {
  1128. left = period;
  1129. local64_set(&hwc->period_left, left);
  1130. hwc->last_period = period;
  1131. ret = 1;
  1132. }
  1133. if (unlikely(left <= 0)) {
  1134. left += period;
  1135. local64_set(&hwc->period_left, left);
  1136. hwc->last_period = period;
  1137. ret = 1;
  1138. }
  1139. /*
  1140. * Quirk: certain CPUs don't like it if just 1 hw_event is left:
  1141. */
  1142. if (unlikely(left < 2))
  1143. left = 2;
  1144. if (left > x86_pmu.max_period)
  1145. left = x86_pmu.max_period;
  1146. static_call_cond(x86_pmu_limit_period)(event, &left);
  1147. this_cpu_write(pmc_prev_left[idx], left);
  1148. /*
  1149. * The hw event starts counting from this event offset,
  1150. * mark it to be able to extract future deltas:
  1151. */
  1152. local64_set(&hwc->prev_count, (u64)-left);
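  /* Note: programming -left makes the counter overflow after 'left' increments (raising a PMI for sampling events). */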
  1153. wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
  1154. /*
  1155. * Sign extend the Merge event counter's upper 16 bits since
  1156. * we currently declare a 48-bit counter width
  1157. */
  1158. if (is_counter_pair(hwc))
  1159. wrmsrl(x86_pmu_event_addr(idx + 1), 0xffff);
  1160. perf_event_update_userpage(event);
  1161. return ret;
  1162. }
  1163. void x86_pmu_enable_event(struct perf_event *event)
  1164. {
  1165. if (__this_cpu_read(cpu_hw_events.enabled))
  1166. __x86_pmu_enable_event(&event->hw,
  1167. ARCH_PERFMON_EVENTSEL_ENABLE);
  1168. }
  1169. /*
  1170. * Add a single event to the PMU.
  1171. *
  1172. * The event is added to the group of enabled events
  1173. * but only if it can be scheduled with existing events.
  1174. */
  1175. static int x86_pmu_add(struct perf_event *event, int flags)
  1176. {
  1177. struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
  1178. struct hw_perf_event *hwc;
  1179. int assign[X86_PMC_IDX_MAX];
  1180. int n, n0, ret;
  1181. hwc = &event->hw;
  1182. n0 = cpuc->n_events;
  1183. ret = n = collect_events(cpuc, event, false);
  1184. if (ret < 0)
  1185. goto out;
  1186. hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
  1187. if (!(flags & PERF_EF_START))
  1188. hwc->state |= PERF_HES_ARCH;
  1189. /*
  1190. * If group events scheduling transaction was started,
  1191. * skip the schedulability test here, it will be performed
  1192. * at commit time (->commit_txn) as a whole.
  1193. *
  1194. * If commit fails, we'll call ->del() on all events
  1195. * for which ->add() was called.
  1196. */
  1197. if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
  1198. goto done_collect;
  1199. ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign);
  1200. if (ret)
  1201. goto out;
  1202. /*
  1203. * copy the new assignment now that we know it is possible;
  1204. * it will be used by hw_perf_enable()
  1205. */
  1206. memcpy(cpuc->assign, assign, n*sizeof(int));
  1207. done_collect:
  1208. /*
  1209. * Commit the collect_events() state. See x86_pmu_del() and
  1210. * x86_pmu_*_txn().
  1211. */
  1212. cpuc->n_events = n;
  1213. cpuc->n_added += n - n0;
  1214. cpuc->n_txn += n - n0;
  1215. /*
  1216. * This is before x86_pmu_enable() will call x86_pmu_start(),
  1217. * so we enable LBRs before an event needs them etc..
  1218. */
  1219. static_call_cond(x86_pmu_add)(event);
  1220. ret = 0;
  1221. out:
  1222. return ret;
  1223. }
  1224. static void x86_pmu_start(struct perf_event *event, int flags)
  1225. {
  1226. struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
  1227. int idx = event->hw.idx;
  1228. if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
  1229. return;
  1230. if (WARN_ON_ONCE(idx == -1))
  1231. return;
  1232. if (flags & PERF_EF_RELOAD) {
  1233. WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
  1234. static_call(x86_pmu_set_period)(event);
  1235. }
  1236. event->hw.state = 0;
  1237. cpuc->events[idx] = event;
  1238. __set_bit(idx, cpuc->active_mask);
  1239. static_call(x86_pmu_enable)(event);
  1240. perf_event_update_userpage(event);
  1241. }
  1242. void perf_event_print_debug(void)
  1243. {
  1244. u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
  1245. unsigned long *cntr_mask, *fixed_cntr_mask;
  1246. struct event_constraint *pebs_constraints;
  1247. struct cpu_hw_events *cpuc;
  1248. u64 pebs, debugctl;
  1249. int cpu, idx;
  1250. guard(irqsave)();
  1251. cpu = smp_processor_id();
  1252. cpuc = &per_cpu(cpu_hw_events, cpu);
  1253. cntr_mask = hybrid(cpuc->pmu, cntr_mask);
  1254. fixed_cntr_mask = hybrid(cpuc->pmu, fixed_cntr_mask);
  1255. pebs_constraints = hybrid(cpuc->pmu, pebs_constraints);
  1256. if (!*(u64 *)cntr_mask)
  1257. return;
  1258. if (x86_pmu.version >= 2) {
  1259. rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
  1260. rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
  1261. rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
  1262. rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
  1263. pr_info("\n");
  1264. pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl);
  1265. pr_info("CPU#%d: status: %016llx\n", cpu, status);
  1266. pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
  1267. pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
  1268. if (pebs_constraints) {
  1269. rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
  1270. pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs);
  1271. }
  1272. if (x86_pmu.lbr_nr) {
  1273. rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
  1274. pr_info("CPU#%d: debugctl: %016llx\n", cpu, debugctl);
  1275. }
  1276. }
  1277. pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
  1278. for_each_set_bit(idx, cntr_mask, X86_PMC_IDX_MAX) {
  1279. rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl);
  1280. rdmsrl(x86_pmu_event_addr(idx), pmc_count);
  1281. prev_left = per_cpu(pmc_prev_left[idx], cpu);
  1282. pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
  1283. cpu, idx, pmc_ctrl);
  1284. pr_info("CPU#%d: gen-PMC%d count: %016llx\n",
  1285. cpu, idx, pmc_count);
  1286. pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
  1287. cpu, idx, prev_left);
  1288. }
  1289. for_each_set_bit(idx, fixed_cntr_mask, X86_PMC_IDX_MAX) {
  1290. if (fixed_counter_disabled(idx, cpuc->pmu))
  1291. continue;
  1292. rdmsrl(x86_pmu_fixed_ctr_addr(idx), pmc_count);
  1293. pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
  1294. cpu, idx, pmc_count);
  1295. }
  1296. }
  1297. void x86_pmu_stop(struct perf_event *event, int flags)
  1298. {
  1299. struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
  1300. struct hw_perf_event *hwc = &event->hw;
  1301. if (test_bit(hwc->idx, cpuc->active_mask)) {
  1302. static_call(x86_pmu_disable)(event);
  1303. __clear_bit(hwc->idx, cpuc->active_mask);
  1304. cpuc->events[hwc->idx] = NULL;
  1305. WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
  1306. hwc->state |= PERF_HES_STOPPED;
  1307. }
  1308. if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
		/*
		 * Drain the remaining delta count out of an event
		 * that we are disabling:
		 */
  1313. static_call(x86_pmu_update)(event);
  1314. hwc->state |= PERF_HES_UPTODATE;
  1315. }
  1316. }
  1317. static void x86_pmu_del(struct perf_event *event, int flags)
  1318. {
  1319. struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
  1320. union perf_capabilities intel_cap = hybrid(cpuc->pmu, intel_cap);
  1321. int i;
  1322. /*
  1323. * If we're called during a txn, we only need to undo x86_pmu.add.
  1324. * The events never got scheduled and ->cancel_txn will truncate
  1325. * the event_list.
  1326. *
  1327. * XXX assumes any ->del() called during a TXN will only be on
  1328. * an event added during that same TXN.
  1329. */
  1330. if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
  1331. goto do_del;
  1332. __set_bit(event->hw.idx, cpuc->dirty);
  1333. /*
  1334. * Not a TXN, therefore cleanup properly.
  1335. */
  1336. x86_pmu_stop(event, PERF_EF_UPDATE);
  1337. for (i = 0; i < cpuc->n_events; i++) {
  1338. if (event == cpuc->event_list[i])
  1339. break;
  1340. }
  1341. if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */
  1342. return;
	/* If we have a newly added event, make sure to decrease n_added. */
  1344. if (i >= cpuc->n_events - cpuc->n_added)
  1345. --cpuc->n_added;
  1346. static_call_cond(x86_pmu_put_event_constraints)(cpuc, event);
  1347. /* Delete the array entry. */
  1348. while (++i < cpuc->n_events) {
  1349. cpuc->event_list[i-1] = cpuc->event_list[i];
  1350. cpuc->event_constraint[i-1] = cpuc->event_constraint[i];
  1351. cpuc->assign[i-1] = cpuc->assign[i];
  1352. }
  1353. cpuc->event_constraint[i-1] = NULL;
  1354. --cpuc->n_events;
  1355. if (intel_cap.perf_metrics)
  1356. del_nr_metric_event(cpuc, event);
  1357. perf_event_update_userpage(event);
  1358. do_del:
	/*
	 * This is after x86_pmu_stop(), so we only disable LBRs once no
	 * event can need them any more, etc.
	 */
  1363. static_call_cond(x86_pmu_del)(event);
  1364. }
  1365. int x86_pmu_handle_irq(struct pt_regs *regs)
  1366. {
  1367. struct perf_sample_data data;
  1368. struct cpu_hw_events *cpuc;
  1369. struct perf_event *event;
  1370. int idx, handled = 0;
  1371. u64 val;
  1372. cpuc = this_cpu_ptr(&cpu_hw_events);
	/*
	 * Some chipsets need to unmask the LVTPC in a particular spot
	 * inside the NMI handler. As a result, the unmasking was pushed
	 * into all the NMI handlers.
	 *
	 * This generic handler doesn't seem to have any issues with where
	 * the unmasking occurs, so it was left at the top.
	 */
  1381. apic_write(APIC_LVTPC, APIC_DM_NMI);
  1382. for_each_set_bit(idx, x86_pmu.cntr_mask, X86_PMC_IDX_MAX) {
  1383. if (!test_bit(idx, cpuc->active_mask))
  1384. continue;
  1385. event = cpuc->events[idx];
  1386. val = static_call(x86_pmu_update)(event);
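		/*
		 * Counters are programmed with a negative initial value and
		 * count upwards; if the sign bit (bit cntval_bits - 1) is
		 * still set, the counter has not overflowed yet.
		 */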
  1387. if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
  1388. continue;
  1389. /*
  1390. * event overflow
  1391. */
  1392. handled++;
  1393. if (!static_call(x86_pmu_set_period)(event))
  1394. continue;
  1395. perf_sample_data_init(&data, 0, event->hw.last_period);
  1396. if (has_branch_stack(event))
  1397. perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL);
  1398. if (perf_event_overflow(event, &data, regs))
  1399. x86_pmu_stop(event, 0);
  1400. }
  1401. if (handled)
  1402. inc_irq_stat(apic_perf_irqs);
  1403. return handled;
  1404. }
  1405. void perf_events_lapic_init(void)
  1406. {
  1407. if (!x86_pmu.apic || !x86_pmu_initialized())
  1408. return;
  1409. /*
  1410. * Always use NMI for PMU
  1411. */
  1412. apic_write(APIC_LVTPC, APIC_DM_NMI);
  1413. }
  1414. static int
  1415. perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
  1416. {
  1417. u64 start_clock;
  1418. u64 finish_clock;
  1419. int ret;
  1420. /*
  1421. * All PMUs/events that share this PMI handler should make sure to
  1422. * increment active_events for their events.
  1423. */
  1424. if (!atomic_read(&active_events))
  1425. return NMI_DONE;
  1426. start_clock = sched_clock();
  1427. ret = static_call(x86_pmu_handle_irq)(regs);
  1428. finish_clock = sched_clock();
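	/*
	 * Feed the time spent in the handler into the perf sample-rate
	 * throttling machinery (cf. the perf_cpu_time_max_percent sysctl).
	 */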
  1429. perf_sample_event_took(finish_clock - start_clock);
  1430. return ret;
  1431. }
  1432. NOKPROBE_SYMBOL(perf_event_nmi_handler);
  1433. struct event_constraint emptyconstraint;
  1434. struct event_constraint unconstrained;
  1435. static int x86_pmu_prepare_cpu(unsigned int cpu)
  1436. {
  1437. struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
  1438. int i;
  1439. for (i = 0 ; i < X86_PERF_KFREE_MAX; i++)
  1440. cpuc->kfree_on_online[i] = NULL;
  1441. if (x86_pmu.cpu_prepare)
  1442. return x86_pmu.cpu_prepare(cpu);
  1443. return 0;
  1444. }
  1445. static int x86_pmu_dead_cpu(unsigned int cpu)
  1446. {
  1447. if (x86_pmu.cpu_dead)
  1448. x86_pmu.cpu_dead(cpu);
  1449. return 0;
  1450. }
  1451. static int x86_pmu_online_cpu(unsigned int cpu)
  1452. {
  1453. struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
  1454. int i;
  1455. for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) {
  1456. kfree(cpuc->kfree_on_online[i]);
  1457. cpuc->kfree_on_online[i] = NULL;
  1458. }
  1459. return 0;
  1460. }
  1461. static int x86_pmu_starting_cpu(unsigned int cpu)
  1462. {
  1463. if (x86_pmu.cpu_starting)
  1464. x86_pmu.cpu_starting(cpu);
  1465. return 0;
  1466. }
  1467. static int x86_pmu_dying_cpu(unsigned int cpu)
  1468. {
  1469. if (x86_pmu.cpu_dying)
  1470. x86_pmu.cpu_dying(cpu);
  1471. return 0;
  1472. }
  1473. static void __init pmu_check_apic(void)
  1474. {
  1475. if (boot_cpu_has(X86_FEATURE_APIC))
  1476. return;
  1477. x86_pmu.apic = 0;
  1478. pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
  1479. pr_info("no hardware sampling interrupt available.\n");
  1480. /*
  1481. * If we have a PMU initialized but no APIC
  1482. * interrupts, we cannot sample hardware
  1483. * events (user-space has to fall back and
  1484. * sample via a hrtimer based software event):
  1485. */
  1486. pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;
  1487. }
  1488. static struct attribute_group x86_pmu_format_group __ro_after_init = {
  1489. .name = "format",
  1490. .attrs = NULL,
  1491. };
  1492. ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page)
  1493. {
  1494. struct perf_pmu_events_attr *pmu_attr =
  1495. container_of(attr, struct perf_pmu_events_attr, attr);
  1496. u64 config = 0;
  1497. if (pmu_attr->id < x86_pmu.max_events)
  1498. config = x86_pmu.event_map(pmu_attr->id);
  1499. /* string trumps id */
  1500. if (pmu_attr->event_str)
  1501. return sprintf(page, "%s\n", pmu_attr->event_str);
  1502. return x86_pmu.events_sysfs_show(page, config);
  1503. }
  1504. EXPORT_SYMBOL_GPL(events_sysfs_show);
  1505. ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr,
  1506. char *page)
  1507. {
  1508. struct perf_pmu_events_ht_attr *pmu_attr =
  1509. container_of(attr, struct perf_pmu_events_ht_attr, attr);
  1510. /*
  1511. * Report conditional events depending on Hyper-Threading.
  1512. *
  1513. * This is overly conservative as usually the HT special
  1514. * handling is not needed if the other CPU thread is idle.
  1515. *
  1516. * Note this does not (and cannot) handle the case when thread
  1517. * siblings are invisible, for example with virtualization
  1518. * if they are owned by some other guest. The user tool
  1519. * has to re-read when a thread sibling gets onlined later.
  1520. */
  1521. return sprintf(page, "%s",
  1522. topology_max_smt_threads() > 1 ?
  1523. pmu_attr->event_str_ht :
  1524. pmu_attr->event_str_noht);
  1525. }
  1526. ssize_t events_hybrid_sysfs_show(struct device *dev,
  1527. struct device_attribute *attr,
  1528. char *page)
  1529. {
  1530. struct perf_pmu_events_hybrid_attr *pmu_attr =
  1531. container_of(attr, struct perf_pmu_events_hybrid_attr, attr);
  1532. struct x86_hybrid_pmu *pmu;
  1533. const char *str, *next_str;
  1534. int i;
  1535. if (hweight64(pmu_attr->pmu_type) == 1)
  1536. return sprintf(page, "%s", pmu_attr->event_str);
	/*
	 * Hybrid PMUs may support the same event name, but with a different
	 * event encoding, e.g., the mem-loads event on an Atom PMU has a
	 * different event encoding from a Core PMU.
	 *
	 * The event_str includes all event encodings, separated by ";".
	 * The order of the event encodings must follow the order of the
	 * hybrid PMU index.
	 */
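	/*
	 * Illustrative only: with two hybrid PMUs, event_str could look
	 * like "event=0x11,umask=0x1;event=0x22,umask=0x2", where the i-th
	 * ";"-separated encoding belongs to the i-th hybrid PMU.
	 */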
  1546. pmu = container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu);
  1547. str = pmu_attr->event_str;
  1548. for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
  1549. if (!(x86_pmu.hybrid_pmu[i].pmu_type & pmu_attr->pmu_type))
  1550. continue;
  1551. if (x86_pmu.hybrid_pmu[i].pmu_type & pmu->pmu_type) {
  1552. next_str = strchr(str, ';');
  1553. if (next_str)
  1554. return snprintf(page, next_str - str + 1, "%s", str);
  1555. else
  1556. return sprintf(page, "%s", str);
  1557. }
  1558. str = strchr(str, ';');
  1559. str++;
  1560. }
  1561. return 0;
  1562. }
  1563. EXPORT_SYMBOL_GPL(events_hybrid_sysfs_show);
  1564. EVENT_ATTR(cpu-cycles, CPU_CYCLES );
  1565. EVENT_ATTR(instructions, INSTRUCTIONS );
  1566. EVENT_ATTR(cache-references, CACHE_REFERENCES );
  1567. EVENT_ATTR(cache-misses, CACHE_MISSES );
  1568. EVENT_ATTR(branch-instructions, BRANCH_INSTRUCTIONS );
  1569. EVENT_ATTR(branch-misses, BRANCH_MISSES );
  1570. EVENT_ATTR(bus-cycles, BUS_CYCLES );
  1571. EVENT_ATTR(stalled-cycles-frontend, STALLED_CYCLES_FRONTEND );
  1572. EVENT_ATTR(stalled-cycles-backend, STALLED_CYCLES_BACKEND );
  1573. EVENT_ATTR(ref-cycles, REF_CPU_CYCLES );
  1574. static struct attribute *empty_attrs;
  1575. static struct attribute *events_attr[] = {
  1576. EVENT_PTR(CPU_CYCLES),
  1577. EVENT_PTR(INSTRUCTIONS),
  1578. EVENT_PTR(CACHE_REFERENCES),
  1579. EVENT_PTR(CACHE_MISSES),
  1580. EVENT_PTR(BRANCH_INSTRUCTIONS),
  1581. EVENT_PTR(BRANCH_MISSES),
  1582. EVENT_PTR(BUS_CYCLES),
  1583. EVENT_PTR(STALLED_CYCLES_FRONTEND),
  1584. EVENT_PTR(STALLED_CYCLES_BACKEND),
  1585. EVENT_PTR(REF_CPU_CYCLES),
  1586. NULL,
  1587. };
/*
 * Remove all undefined events (x86_pmu.event_map(id) == 0)
 * from the events_attr attributes.
 */
  1592. static umode_t
  1593. is_visible(struct kobject *kobj, struct attribute *attr, int idx)
  1594. {
  1595. struct perf_pmu_events_attr *pmu_attr;
  1596. if (idx >= x86_pmu.max_events)
  1597. return 0;
  1598. pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr.attr);
  1599. /* str trumps id */
  1600. return pmu_attr->event_str || x86_pmu.event_map(idx) ? attr->mode : 0;
  1601. }
  1602. static struct attribute_group x86_pmu_events_group __ro_after_init = {
  1603. .name = "events",
  1604. .attrs = events_attr,
  1605. .is_visible = is_visible,
  1606. };
  1607. ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event)
  1608. {
  1609. u64 umask = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
  1610. u64 cmask = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24;
  1611. bool edge = (config & ARCH_PERFMON_EVENTSEL_EDGE);
  1612. bool pc = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL);
  1613. bool any = (config & ARCH_PERFMON_EVENTSEL_ANY);
  1614. bool inv = (config & ARCH_PERFMON_EVENTSEL_INV);
  1615. ssize_t ret;
	/*
	 * We have a whole page to spend and only a little data to
	 * write, so we can safely use sprintf().
	 */
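	/*
	 * The result is the sysfs event encoding, e.g. (values purely
	 * illustrative) "event=0xc4,umask=0x01,cmask=0x02\n".
	 */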
  1620. ret = sprintf(page, "event=0x%02llx", event);
  1621. if (umask)
  1622. ret += sprintf(page + ret, ",umask=0x%02llx", umask);
  1623. if (edge)
  1624. ret += sprintf(page + ret, ",edge");
  1625. if (pc)
  1626. ret += sprintf(page + ret, ",pc");
  1627. if (any)
  1628. ret += sprintf(page + ret, ",any");
  1629. if (inv)
  1630. ret += sprintf(page + ret, ",inv");
  1631. if (cmask)
  1632. ret += sprintf(page + ret, ",cmask=0x%02llx", cmask);
  1633. ret += sprintf(page + ret, "\n");
  1634. return ret;
  1635. }
  1636. static struct attribute_group x86_pmu_attr_group;
  1637. static struct attribute_group x86_pmu_caps_group;
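/*
 * Point the x86_pmu_* static calls at the vendor hooks filled in by the
 * Intel/AMD/Zhaoxin init code; static calls avoid indirect-call overhead
 * on these hot paths.
 */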
  1638. static void x86_pmu_static_call_update(void)
  1639. {
  1640. static_call_update(x86_pmu_handle_irq, x86_pmu.handle_irq);
  1641. static_call_update(x86_pmu_disable_all, x86_pmu.disable_all);
  1642. static_call_update(x86_pmu_enable_all, x86_pmu.enable_all);
  1643. static_call_update(x86_pmu_enable, x86_pmu.enable);
  1644. static_call_update(x86_pmu_disable, x86_pmu.disable);
  1645. static_call_update(x86_pmu_assign, x86_pmu.assign);
  1646. static_call_update(x86_pmu_add, x86_pmu.add);
  1647. static_call_update(x86_pmu_del, x86_pmu.del);
  1648. static_call_update(x86_pmu_read, x86_pmu.read);
  1649. static_call_update(x86_pmu_set_period, x86_pmu.set_period);
  1650. static_call_update(x86_pmu_update, x86_pmu.update);
  1651. static_call_update(x86_pmu_limit_period, x86_pmu.limit_period);
  1652. static_call_update(x86_pmu_schedule_events, x86_pmu.schedule_events);
  1653. static_call_update(x86_pmu_get_event_constraints, x86_pmu.get_event_constraints);
  1654. static_call_update(x86_pmu_put_event_constraints, x86_pmu.put_event_constraints);
  1655. static_call_update(x86_pmu_start_scheduling, x86_pmu.start_scheduling);
  1656. static_call_update(x86_pmu_commit_scheduling, x86_pmu.commit_scheduling);
  1657. static_call_update(x86_pmu_stop_scheduling, x86_pmu.stop_scheduling);
  1658. static_call_update(x86_pmu_sched_task, x86_pmu.sched_task);
  1659. static_call_update(x86_pmu_swap_task_ctx, x86_pmu.swap_task_ctx);
  1660. static_call_update(x86_pmu_drain_pebs, x86_pmu.drain_pebs);
  1661. static_call_update(x86_pmu_pebs_aliases, x86_pmu.pebs_aliases);
  1662. static_call_update(x86_pmu_guest_get_msrs, x86_pmu.guest_get_msrs);
  1663. static_call_update(x86_pmu_filter, x86_pmu.filter);
  1664. }
  1665. static void _x86_pmu_read(struct perf_event *event)
  1666. {
  1667. static_call(x86_pmu_update)(event);
  1668. }
  1669. void x86_pmu_show_pmu_cap(struct pmu *pmu)
  1670. {
  1671. pr_info("... version: %d\n", x86_pmu.version);
  1672. pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
  1673. pr_info("... generic registers: %d\n", x86_pmu_num_counters(pmu));
  1674. pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask);
  1675. pr_info("... max period: %016Lx\n", x86_pmu.max_period);
  1676. pr_info("... fixed-purpose events: %d\n", x86_pmu_num_counters_fixed(pmu));
  1677. pr_info("... event mask: %016Lx\n", hybrid(pmu, intel_ctrl));
  1678. }
  1679. static int __init init_hw_perf_events(void)
  1680. {
  1681. struct x86_pmu_quirk *quirk;
  1682. int err;
  1683. pr_info("Performance Events: ");
  1684. switch (boot_cpu_data.x86_vendor) {
  1685. case X86_VENDOR_INTEL:
  1686. err = intel_pmu_init();
  1687. break;
  1688. case X86_VENDOR_AMD:
  1689. err = amd_pmu_init();
  1690. break;
  1691. case X86_VENDOR_HYGON:
  1692. err = amd_pmu_init();
  1693. x86_pmu.name = "HYGON";
  1694. break;
  1695. case X86_VENDOR_ZHAOXIN:
  1696. case X86_VENDOR_CENTAUR:
  1697. err = zhaoxin_pmu_init();
  1698. break;
  1699. default:
  1700. err = -ENOTSUPP;
  1701. }
  1702. if (err != 0) {
  1703. pr_cont("no PMU driver, software events only.\n");
  1704. err = 0;
  1705. goto out_bad_pmu;
  1706. }
  1707. pmu_check_apic();
  1708. /* sanity check that the hardware exists or is emulated */
  1709. if (!check_hw_exists(&pmu, x86_pmu.cntr_mask, x86_pmu.fixed_cntr_mask))
  1710. goto out_bad_pmu;
  1711. pr_cont("%s PMU driver.\n", x86_pmu.name);
  1712. x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */
  1713. for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next)
  1714. quirk->func();
  1715. if (!x86_pmu.intel_ctrl)
  1716. x86_pmu.intel_ctrl = x86_pmu.cntr_mask64;
  1717. if (!x86_pmu.config_mask)
  1718. x86_pmu.config_mask = X86_RAW_EVENT_MASK;
  1719. perf_events_lapic_init();
  1720. register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI");
  1721. unconstrained = (struct event_constraint)
  1722. __EVENT_CONSTRAINT(0, x86_pmu.cntr_mask64,
  1723. 0, x86_pmu_num_counters(NULL), 0, 0);
  1724. x86_pmu_format_group.attrs = x86_pmu.format_attrs;
  1725. if (!x86_pmu.events_sysfs_show)
  1726. x86_pmu_events_group.attrs = &empty_attrs;
  1727. pmu.attr_update = x86_pmu.attr_update;
  1728. if (!is_hybrid())
  1729. x86_pmu_show_pmu_cap(NULL);
  1730. if (!x86_pmu.read)
  1731. x86_pmu.read = _x86_pmu_read;
  1732. if (!x86_pmu.guest_get_msrs)
  1733. x86_pmu.guest_get_msrs = (void *)&__static_call_return0;
  1734. if (!x86_pmu.set_period)
  1735. x86_pmu.set_period = x86_perf_event_set_period;
  1736. if (!x86_pmu.update)
  1737. x86_pmu.update = x86_perf_event_update;
  1738. x86_pmu_static_call_update();
  1739. /*
  1740. * Install callbacks. Core will call them for each online
  1741. * cpu.
  1742. */
  1743. err = cpuhp_setup_state(CPUHP_PERF_X86_PREPARE, "perf/x86:prepare",
  1744. x86_pmu_prepare_cpu, x86_pmu_dead_cpu);
  1745. if (err)
  1746. return err;
  1747. err = cpuhp_setup_state(CPUHP_AP_PERF_X86_STARTING,
  1748. "perf/x86:starting", x86_pmu_starting_cpu,
  1749. x86_pmu_dying_cpu);
  1750. if (err)
  1751. goto out;
  1752. err = cpuhp_setup_state(CPUHP_AP_PERF_X86_ONLINE, "perf/x86:online",
  1753. x86_pmu_online_cpu, NULL);
  1754. if (err)
  1755. goto out1;
  1756. if (!is_hybrid()) {
  1757. err = perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
  1758. if (err)
  1759. goto out2;
  1760. } else {
  1761. struct x86_hybrid_pmu *hybrid_pmu;
  1762. int i, j;
  1763. for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
  1764. hybrid_pmu = &x86_pmu.hybrid_pmu[i];
  1765. hybrid_pmu->pmu = pmu;
  1766. hybrid_pmu->pmu.type = -1;
  1767. hybrid_pmu->pmu.attr_update = x86_pmu.attr_update;
  1768. hybrid_pmu->pmu.capabilities |= PERF_PMU_CAP_EXTENDED_HW_TYPE;
  1769. err = perf_pmu_register(&hybrid_pmu->pmu, hybrid_pmu->name,
  1770. (hybrid_pmu->pmu_type == hybrid_big) ? PERF_TYPE_RAW : -1);
  1771. if (err)
  1772. break;
  1773. }
  1774. if (i < x86_pmu.num_hybrid_pmus) {
  1775. for (j = 0; j < i; j++)
  1776. perf_pmu_unregister(&x86_pmu.hybrid_pmu[j].pmu);
  1777. pr_warn("Failed to register hybrid PMUs\n");
  1778. kfree(x86_pmu.hybrid_pmu);
  1779. x86_pmu.hybrid_pmu = NULL;
  1780. x86_pmu.num_hybrid_pmus = 0;
  1781. goto out2;
  1782. }
  1783. }
  1784. return 0;
  1785. out2:
  1786. cpuhp_remove_state(CPUHP_AP_PERF_X86_ONLINE);
  1787. out1:
  1788. cpuhp_remove_state(CPUHP_AP_PERF_X86_STARTING);
  1789. out:
  1790. cpuhp_remove_state(CPUHP_PERF_X86_PREPARE);
  1791. out_bad_pmu:
  1792. memset(&x86_pmu, 0, sizeof(x86_pmu));
  1793. return err;
  1794. }
  1795. early_initcall(init_hw_perf_events);
  1796. static void x86_pmu_read(struct perf_event *event)
  1797. {
  1798. static_call(x86_pmu_read)(event);
  1799. }
/*
 * Start a group-event scheduling transaction.
 * Set the flag so that ->add() skips the schedulability test; it will
 * be performed at commit time instead.
 *
 * We only support PERF_PMU_TXN_ADD transactions. Save the transaction
 * flags but otherwise ignore non-PERF_PMU_TXN_ADD transactions.
 */
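/*
 * Typical flow, as driven by the perf core (sketch):
 * start_txn(PERF_PMU_TXN_ADD), then ->add() for each group member,
 * then either commit_txn() or cancel_txn().
 */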
  1809. static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags)
  1810. {
  1811. struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
  1812. WARN_ON_ONCE(cpuc->txn_flags); /* txn already in flight */
  1813. cpuc->txn_flags = txn_flags;
  1814. if (txn_flags & ~PERF_PMU_TXN_ADD)
  1815. return;
  1816. perf_pmu_disable(pmu);
  1817. __this_cpu_write(cpu_hw_events.n_txn, 0);
  1818. __this_cpu_write(cpu_hw_events.n_txn_pair, 0);
  1819. __this_cpu_write(cpu_hw_events.n_txn_metric, 0);
  1820. }
/*
 * Cancel a group-event scheduling transaction.
 * Clear the flag again; subsequent ->add() calls will perform the
 * schedulability test as usual.
 */
  1826. static void x86_pmu_cancel_txn(struct pmu *pmu)
  1827. {
  1828. unsigned int txn_flags;
  1829. struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
  1830. WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */
  1831. txn_flags = cpuc->txn_flags;
  1832. cpuc->txn_flags = 0;
  1833. if (txn_flags & ~PERF_PMU_TXN_ADD)
  1834. return;
	/*
	 * Truncate the collected-events array by the number of events added
	 * in this transaction. See x86_pmu_add() and x86_pmu_*_txn().
	 */
  1839. __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
  1840. __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
  1841. __this_cpu_sub(cpu_hw_events.n_pair, __this_cpu_read(cpu_hw_events.n_txn_pair));
  1842. __this_cpu_sub(cpu_hw_events.n_metric, __this_cpu_read(cpu_hw_events.n_txn_metric));
  1843. perf_pmu_enable(pmu);
  1844. }
/*
 * Commit a group-event scheduling transaction.
 * Perform the group schedulability test as a whole and return 0 on
 * success.
 *
 * Does not cancel the transaction on failure; the caller is expected
 * to do that.
 */
  1852. static int x86_pmu_commit_txn(struct pmu *pmu)
  1853. {
  1854. struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
  1855. int assign[X86_PMC_IDX_MAX];
  1856. int n, ret;
  1857. WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */
  1858. if (cpuc->txn_flags & ~PERF_PMU_TXN_ADD) {
  1859. cpuc->txn_flags = 0;
  1860. return 0;
  1861. }
  1862. n = cpuc->n_events;
  1863. if (!x86_pmu_initialized())
  1864. return -EAGAIN;
  1865. ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign);
  1866. if (ret)
  1867. return ret;
	/*
	 * Copy the new assignment now that we know it is possible;
	 * it will be used by hw_perf_enable().
	 */
  1872. memcpy(cpuc->assign, assign, n*sizeof(int));
  1873. cpuc->txn_flags = 0;
  1874. perf_pmu_enable(pmu);
  1875. return 0;
  1876. }
  1877. /*
  1878. * a fake_cpuc is used to validate event groups. Due to
  1879. * the extra reg logic, we need to also allocate a fake
  1880. * per_core and per_cpu structure. Otherwise, group events
  1881. * using extra reg may conflict without the kernel being
  1882. * able to catch this when the last event gets added to
  1883. * the group.
  1884. */
  1885. static void free_fake_cpuc(struct cpu_hw_events *cpuc)
  1886. {
  1887. intel_cpuc_finish(cpuc);
  1888. kfree(cpuc);
  1889. }
  1890. static struct cpu_hw_events *allocate_fake_cpuc(struct pmu *event_pmu)
  1891. {
  1892. struct cpu_hw_events *cpuc;
  1893. int cpu;
  1894. cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
  1895. if (!cpuc)
  1896. return ERR_PTR(-ENOMEM);
  1897. cpuc->is_fake = 1;
  1898. if (is_hybrid()) {
  1899. struct x86_hybrid_pmu *h_pmu;
  1900. h_pmu = hybrid_pmu(event_pmu);
  1901. if (cpumask_empty(&h_pmu->supported_cpus))
  1902. goto error;
  1903. cpu = cpumask_first(&h_pmu->supported_cpus);
  1904. } else
  1905. cpu = raw_smp_processor_id();
  1906. cpuc->pmu = event_pmu;
  1907. if (intel_cpuc_prepare(cpuc, cpu))
  1908. goto error;
  1909. return cpuc;
  1910. error:
  1911. free_fake_cpuc(cpuc);
  1912. return ERR_PTR(-ENOMEM);
  1913. }
  1914. /*
  1915. * validate that we can schedule this event
  1916. */
  1917. static int validate_event(struct perf_event *event)
  1918. {
  1919. struct cpu_hw_events *fake_cpuc;
  1920. struct event_constraint *c;
  1921. int ret = 0;
  1922. fake_cpuc = allocate_fake_cpuc(event->pmu);
  1923. if (IS_ERR(fake_cpuc))
  1924. return PTR_ERR(fake_cpuc);
  1925. c = x86_pmu.get_event_constraints(fake_cpuc, 0, event);
  1926. if (!c || !c->weight)
  1927. ret = -EINVAL;
  1928. if (x86_pmu.put_event_constraints)
  1929. x86_pmu.put_event_constraints(fake_cpuc, event);
  1930. free_fake_cpuc(fake_cpuc);
  1931. return ret;
  1932. }
/*
 * Validate a single event group.
 *
 * Validation includes:
 * - checking that the events are compatible with each other
 * - checking that the events do not compete for the same counter
 * - checking that the number of events <= number of counters
 *
 * Validation ensures the group could be loaded onto the PMU if it
 * were the only group available.
 */
  1944. static int validate_group(struct perf_event *event)
  1945. {
  1946. struct perf_event *leader = event->group_leader;
  1947. struct cpu_hw_events *fake_cpuc;
  1948. int ret = -EINVAL, n;
  1949. /*
  1950. * Reject events from different hybrid PMUs.
  1951. */
  1952. if (is_hybrid()) {
  1953. struct perf_event *sibling;
  1954. struct pmu *pmu = NULL;
  1955. if (is_x86_event(leader))
  1956. pmu = leader->pmu;
  1957. for_each_sibling_event(sibling, leader) {
  1958. if (!is_x86_event(sibling))
  1959. continue;
  1960. if (!pmu)
  1961. pmu = sibling->pmu;
  1962. else if (pmu != sibling->pmu)
  1963. return ret;
  1964. }
  1965. }
  1966. fake_cpuc = allocate_fake_cpuc(event->pmu);
  1967. if (IS_ERR(fake_cpuc))
  1968. return PTR_ERR(fake_cpuc);
	/*
	 * The event is not yet connected with its siblings; therefore we
	 * must first collect the existing siblings, then add the new event
	 * before we can simulate the scheduling.
	 */
  1975. n = collect_events(fake_cpuc, leader, true);
  1976. if (n < 0)
  1977. goto out;
  1978. fake_cpuc->n_events = n;
  1979. n = collect_events(fake_cpuc, event, false);
  1980. if (n < 0)
  1981. goto out;
  1982. fake_cpuc->n_events = 0;
  1983. ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
  1984. out:
  1985. free_fake_cpuc(fake_cpuc);
  1986. return ret;
  1987. }
  1988. static int x86_pmu_event_init(struct perf_event *event)
  1989. {
  1990. struct x86_hybrid_pmu *pmu = NULL;
  1991. int err;
  1992. if ((event->attr.type != event->pmu->type) &&
  1993. (event->attr.type != PERF_TYPE_HARDWARE) &&
  1994. (event->attr.type != PERF_TYPE_HW_CACHE))
  1995. return -ENOENT;
  1996. if (is_hybrid() && (event->cpu != -1)) {
  1997. pmu = hybrid_pmu(event->pmu);
  1998. if (!cpumask_test_cpu(event->cpu, &pmu->supported_cpus))
  1999. return -ENOENT;
  2000. }
  2001. err = __x86_pmu_event_init(event);
  2002. if (!err) {
  2003. if (event->group_leader != event)
  2004. err = validate_group(event);
  2005. else
  2006. err = validate_event(event);
  2007. }
  2008. if (err) {
  2009. if (event->destroy)
  2010. event->destroy(event);
  2011. event->destroy = NULL;
  2012. }
  2013. if (READ_ONCE(x86_pmu.attr_rdpmc) &&
  2014. !(event->hw.flags & PERF_X86_EVENT_LARGE_PEBS))
  2015. event->hw.flags |= PERF_EVENT_FLAG_USER_READ_CNT;
  2016. return err;
  2017. }
  2018. void perf_clear_dirty_counters(void)
  2019. {
  2020. struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
  2021. int i;
	/* No need to clear the assigned counters. */
  2023. for (i = 0; i < cpuc->n_events; i++)
  2024. __clear_bit(cpuc->assign[i], cpuc->dirty);
  2025. if (bitmap_empty(cpuc->dirty, X86_PMC_IDX_MAX))
  2026. return;
  2027. for_each_set_bit(i, cpuc->dirty, X86_PMC_IDX_MAX) {
  2028. if (i >= INTEL_PMC_IDX_FIXED) {
  2029. /* Metrics and fake events don't have corresponding HW counters. */
  2030. if (!test_bit(i - INTEL_PMC_IDX_FIXED, hybrid(cpuc->pmu, fixed_cntr_mask)))
  2031. continue;
  2032. wrmsrl(x86_pmu_fixed_ctr_addr(i - INTEL_PMC_IDX_FIXED), 0);
  2033. } else {
  2034. wrmsrl(x86_pmu_event_addr(i), 0);
  2035. }
  2036. }
  2037. bitmap_zero(cpuc->dirty, X86_PMC_IDX_MAX);
  2038. }
  2039. static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
  2040. {
  2041. if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT))
  2042. return;
  2043. /*
  2044. * This function relies on not being called concurrently in two
  2045. * tasks in the same mm. Otherwise one task could observe
  2046. * perf_rdpmc_allowed > 1 and return all the way back to
  2047. * userspace with CR4.PCE clear while another task is still
  2048. * doing on_each_cpu_mask() to propagate CR4.PCE.
  2049. *
  2050. * For now, this can't happen because all callers hold mmap_lock
  2051. * for write. If this changes, we'll need a different solution.
  2052. */
  2053. mmap_assert_write_locked(mm);
  2054. if (atomic_inc_return(&mm->context.perf_rdpmc_allowed) == 1)
  2055. on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1);
  2056. }
  2057. static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm)
  2058. {
  2059. if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT))
  2060. return;
  2061. if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed))
  2062. on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1);
  2063. }
  2064. static int x86_pmu_event_idx(struct perf_event *event)
  2065. {
  2066. struct hw_perf_event *hwc = &event->hw;
  2067. if (!(hwc->flags & PERF_EVENT_FLAG_USER_READ_CNT))
  2068. return 0;
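	/*
	 * The value returned here is published as "index" in the event's
	 * mmap()'ed user page: 0 means RDPMC cannot be used for this event,
	 * otherwise userspace executes rdpmc(index - 1).
	 */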
  2069. if (is_metric_idx(hwc->idx))
  2070. return INTEL_PMC_FIXED_RDPMC_METRICS + 1;
  2071. else
  2072. return hwc->event_base_rdpmc + 1;
  2073. }
  2074. static ssize_t get_attr_rdpmc(struct device *cdev,
  2075. struct device_attribute *attr,
  2076. char *buf)
  2077. {
  2078. return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc);
  2079. }
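/*
 * Accepted rdpmc attribute values (sketch of the semantics):
 *   0 - RDPMC is never available to userspace
 *   1 - RDPMC is available to tasks with an active, mmap()'ed perf
 *       event counter (the default)
 *   2 - RDPMC is always available
 */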
  2080. static ssize_t set_attr_rdpmc(struct device *cdev,
  2081. struct device_attribute *attr,
  2082. const char *buf, size_t count)
  2083. {
  2084. static DEFINE_MUTEX(rdpmc_mutex);
  2085. unsigned long val;
  2086. ssize_t ret;
  2087. ret = kstrtoul(buf, 0, &val);
  2088. if (ret)
  2089. return ret;
  2090. if (val > 2)
  2091. return -EINVAL;
  2092. if (x86_pmu.attr_rdpmc_broken)
  2093. return -ENOTSUPP;
  2094. guard(mutex)(&rdpmc_mutex);
  2095. if (val != x86_pmu.attr_rdpmc) {
  2096. /*
  2097. * Changing into or out of never available or always available,
  2098. * aka perf-event-bypassing mode. This path is extremely slow,
  2099. * but only root can trigger it, so it's okay.
  2100. */
  2101. if (val == 0)
  2102. static_branch_inc(&rdpmc_never_available_key);
  2103. else if (x86_pmu.attr_rdpmc == 0)
  2104. static_branch_dec(&rdpmc_never_available_key);
  2105. if (val == 2)
  2106. static_branch_inc(&rdpmc_always_available_key);
  2107. else if (x86_pmu.attr_rdpmc == 2)
  2108. static_branch_dec(&rdpmc_always_available_key);
  2109. on_each_cpu(cr4_update_pce, NULL, 1);
  2110. x86_pmu.attr_rdpmc = val;
  2111. }
  2112. return count;
  2113. }
  2114. static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc);
  2115. static struct attribute *x86_pmu_attrs[] = {
  2116. &dev_attr_rdpmc.attr,
  2117. NULL,
  2118. };
  2119. static struct attribute_group x86_pmu_attr_group __ro_after_init = {
  2120. .attrs = x86_pmu_attrs,
  2121. };
  2122. static ssize_t max_precise_show(struct device *cdev,
  2123. struct device_attribute *attr,
  2124. char *buf)
  2125. {
  2126. return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu_max_precise());
  2127. }
  2128. static DEVICE_ATTR_RO(max_precise);
  2129. static struct attribute *x86_pmu_caps_attrs[] = {
  2130. &dev_attr_max_precise.attr,
  2131. NULL
  2132. };
  2133. static struct attribute_group x86_pmu_caps_group __ro_after_init = {
  2134. .name = "caps",
  2135. .attrs = x86_pmu_caps_attrs,
  2136. };
  2137. static const struct attribute_group *x86_pmu_attr_groups[] = {
  2138. &x86_pmu_attr_group,
  2139. &x86_pmu_format_group,
  2140. &x86_pmu_events_group,
  2141. &x86_pmu_caps_group,
  2142. NULL,
  2143. };
  2144. static void x86_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
  2145. {
  2146. static_call_cond(x86_pmu_sched_task)(pmu_ctx, sched_in);
  2147. }
  2148. static void x86_pmu_swap_task_ctx(struct perf_event_pmu_context *prev_epc,
  2149. struct perf_event_pmu_context *next_epc)
  2150. {
  2151. static_call_cond(x86_pmu_swap_task_ctx)(prev_epc, next_epc);
  2152. }
  2153. void perf_check_microcode(void)
  2154. {
  2155. if (x86_pmu.check_microcode)
  2156. x86_pmu.check_microcode();
  2157. }
  2158. static int x86_pmu_check_period(struct perf_event *event, u64 value)
  2159. {
  2160. if (x86_pmu.check_period && x86_pmu.check_period(event, value))
  2161. return -EINVAL;
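	/*
	 * If the driver's limit_period callback would raise the period above
	 * the requested value, the requested period cannot be honoured.
	 */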
  2162. if (value && x86_pmu.limit_period) {
  2163. s64 left = value;
  2164. x86_pmu.limit_period(event, &left);
  2165. if (left > value)
  2166. return -EINVAL;
  2167. }
  2168. return 0;
  2169. }
  2170. static int x86_pmu_aux_output_match(struct perf_event *event)
  2171. {
  2172. if (!(pmu.capabilities & PERF_PMU_CAP_AUX_OUTPUT))
  2173. return 0;
  2174. if (x86_pmu.aux_output_match)
  2175. return x86_pmu.aux_output_match(event);
  2176. return 0;
  2177. }
  2178. static bool x86_pmu_filter(struct pmu *pmu, int cpu)
  2179. {
  2180. bool ret = false;
  2181. static_call_cond(x86_pmu_filter)(pmu, cpu, &ret);
  2182. return ret;
  2183. }
  2184. static struct pmu pmu = {
  2185. .pmu_enable = x86_pmu_enable,
  2186. .pmu_disable = x86_pmu_disable,
  2187. .attr_groups = x86_pmu_attr_groups,
  2188. .event_init = x86_pmu_event_init,
  2189. .event_mapped = x86_pmu_event_mapped,
  2190. .event_unmapped = x86_pmu_event_unmapped,
  2191. .add = x86_pmu_add,
  2192. .del = x86_pmu_del,
  2193. .start = x86_pmu_start,
  2194. .stop = x86_pmu_stop,
  2195. .read = x86_pmu_read,
  2196. .start_txn = x86_pmu_start_txn,
  2197. .cancel_txn = x86_pmu_cancel_txn,
  2198. .commit_txn = x86_pmu_commit_txn,
  2199. .event_idx = x86_pmu_event_idx,
  2200. .sched_task = x86_pmu_sched_task,
  2201. .swap_task_ctx = x86_pmu_swap_task_ctx,
  2202. .check_period = x86_pmu_check_period,
  2203. .aux_output_match = x86_pmu_aux_output_match,
  2204. .filter = x86_pmu_filter,
  2205. };
  2206. void arch_perf_update_userpage(struct perf_event *event,
  2207. struct perf_event_mmap_page *userpg, u64 now)
  2208. {
  2209. struct cyc2ns_data data;
  2210. u64 offset;
  2211. userpg->cap_user_time = 0;
  2212. userpg->cap_user_time_zero = 0;
  2213. userpg->cap_user_rdpmc =
  2214. !!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT);
  2215. userpg->pmc_width = x86_pmu.cntval_bits;
  2216. if (!using_native_sched_clock() || !sched_clock_stable())
  2217. return;
  2218. cyc2ns_read_begin(&data);
  2219. offset = data.cyc2ns_offset + __sched_clock_offset;
  2220. /*
  2221. * Internal timekeeping for enabled/running/stopped times
  2222. * is always in the local_clock domain.
  2223. */
  2224. userpg->cap_user_time = 1;
  2225. userpg->time_mult = data.cyc2ns_mul;
  2226. userpg->time_shift = data.cyc2ns_shift;
  2227. userpg->time_offset = offset - now;
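	/*
	 * With these fields userspace can convert a TSC value to time, as
	 * described for perf_event_mmap_page (sketch):
	 *
	 *   quot  = cyc >> time_shift;
	 *   rem   = cyc & (((u64)1 << time_shift) - 1);
	 *   delta = time_offset + quot * time_mult +
	 *           ((rem * time_mult) >> time_shift);
	 */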
  2228. /*
  2229. * cap_user_time_zero doesn't make sense when we're using a different
  2230. * time base for the records.
  2231. */
  2232. if (!event->attr.use_clockid) {
  2233. userpg->cap_user_time_zero = 1;
  2234. userpg->time_zero = offset;
  2235. }
  2236. cyc2ns_read_end();
  2237. }
  2238. /*
  2239. * Determine whether the regs were taken from an irq/exception handler rather
  2240. * than from perf_arch_fetch_caller_regs().
  2241. */
  2242. static bool perf_hw_regs(struct pt_regs *regs)
  2243. {
  2244. return regs->flags & X86_EFLAGS_FIXED;
  2245. }
  2246. void
  2247. perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
  2248. {
  2249. struct unwind_state state;
  2250. unsigned long addr;
  2251. if (perf_guest_state()) {
		/* TODO: We don't support guest OS callchains yet. */
  2253. return;
  2254. }
  2255. if (perf_callchain_store(entry, regs->ip))
  2256. return;
  2257. if (perf_hw_regs(regs))
  2258. unwind_start(&state, current, regs, NULL);
  2259. else
  2260. unwind_start(&state, current, NULL, (void *)regs->sp);
  2261. for (; !unwind_done(&state); unwind_next_frame(&state)) {
  2262. addr = unwind_get_return_address(&state);
  2263. if (!addr || perf_callchain_store(entry, addr))
  2264. return;
  2265. }
  2266. }
  2267. static inline int
  2268. valid_user_frame(const void __user *fp, unsigned long size)
  2269. {
  2270. return __access_ok(fp, size);
  2271. }
  2272. static unsigned long get_segment_base(unsigned int segment)
  2273. {
  2274. struct desc_struct *desc;
  2275. unsigned int idx = segment >> 3;
  2276. if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
  2277. #ifdef CONFIG_MODIFY_LDT_SYSCALL
  2278. struct ldt_struct *ldt;
  2279. /* IRQs are off, so this synchronizes with smp_store_release */
  2280. ldt = READ_ONCE(current->active_mm->context.ldt);
  2281. if (!ldt || idx >= ldt->nr_entries)
  2282. return 0;
  2283. desc = &ldt->entries[idx];
  2284. #else
  2285. return 0;
  2286. #endif
  2287. } else {
  2288. if (idx >= GDT_ENTRIES)
  2289. return 0;
  2290. desc = raw_cpu_ptr(gdt_page.gdt) + idx;
  2291. }
  2292. return get_desc_base(desc);
  2293. }
  2294. #ifdef CONFIG_UPROBES
/*
 * Heuristic-based check if a uprobe is installed at the function entry.
 *
 * Under the assumption that user code is compiled with frame pointers,
 * `push %rbp/%ebp` is a good indicator that we indeed are at the
 * function entry.
 *
 * Similarly, `endbr64` (assuming 64-bit mode) is also a common pattern.
 * If we get this wrong, the captured stack trace might have one extra
 * bogus entry, but the rest of the stack trace will still be meaningful.
 */
  2305. static bool is_uprobe_at_func_entry(struct pt_regs *regs)
  2306. {
  2307. struct arch_uprobe *auprobe;
  2308. if (!current->utask)
  2309. return false;
  2310. auprobe = current->utask->auprobe;
  2311. if (!auprobe)
  2312. return false;
  2313. /* push %rbp/%ebp */
  2314. if (auprobe->insn[0] == 0x55)
  2315. return true;
  2316. /* endbr64 (64-bit only) */
  2317. if (user_64bit_mode(regs) && is_endbr(*(u32 *)auprobe->insn))
  2318. return true;
  2319. return false;
  2320. }
  2321. #else
  2322. static bool is_uprobe_at_func_entry(struct pt_regs *regs)
  2323. {
  2324. return false;
  2325. }
  2326. #endif /* CONFIG_UPROBES */
  2327. #ifdef CONFIG_IA32_EMULATION
  2328. #include <linux/compat.h>
  2329. static inline int
  2330. perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry)
  2331. {
  2332. /* 32-bit process in 64-bit kernel. */
  2333. unsigned long ss_base, cs_base;
  2334. struct stack_frame_ia32 frame;
  2335. const struct stack_frame_ia32 __user *fp;
  2336. u32 ret_addr;
  2337. if (user_64bit_mode(regs))
  2338. return 0;
  2339. cs_base = get_segment_base(regs->cs);
  2340. ss_base = get_segment_base(regs->ss);
  2341. fp = compat_ptr(ss_base + regs->bp);
  2342. pagefault_disable();
  2343. /* see perf_callchain_user() below for why we do this */
  2344. if (is_uprobe_at_func_entry(regs) &&
  2345. !get_user(ret_addr, (const u32 __user *)regs->sp))
  2346. perf_callchain_store(entry, ret_addr);
  2347. while (entry->nr < entry->max_stack) {
  2348. if (!valid_user_frame(fp, sizeof(frame)))
  2349. break;
  2350. if (__get_user(frame.next_frame, &fp->next_frame))
  2351. break;
  2352. if (__get_user(frame.return_address, &fp->return_address))
  2353. break;
  2354. perf_callchain_store(entry, cs_base + frame.return_address);
  2355. fp = compat_ptr(ss_base + frame.next_frame);
  2356. }
  2357. pagefault_enable();
  2358. return 1;
  2359. }
  2360. #else
  2361. static inline int
  2362. perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry)
  2363. {
  2364. return 0;
  2365. }
  2366. #endif
  2367. void
  2368. perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
  2369. {
  2370. struct stack_frame frame;
  2371. const struct stack_frame __user *fp;
  2372. unsigned long ret_addr;
  2373. if (perf_guest_state()) {
		/* TODO: We don't support guest OS callchains yet. */
  2375. return;
  2376. }
	/*
	 * We don't know what to do with VM86 stacks; ignore them for now.
	 */
  2380. if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM))
  2381. return;
  2382. fp = (void __user *)regs->bp;
  2383. perf_callchain_store(entry, regs->ip);
  2384. if (!nmi_uaccess_okay())
  2385. return;
  2386. if (perf_callchain_user32(regs, entry))
  2387. return;
  2388. pagefault_disable();
	/*
	 * If we are called from a uprobe handler, and we are indeed at the
	 * very entry of a user function (which is normally a `push %rbp`
	 * instruction, under the assumption that the application is compiled
	 * with frame pointers), we should read the return address from
	 * *regs->sp before proceeding to follow frame pointers; otherwise
	 * we'll skip the immediate caller, as %rbp is not yet set up.
	 */
  2397. if (is_uprobe_at_func_entry(regs) &&
  2398. !get_user(ret_addr, (const unsigned long __user *)regs->sp))
  2399. perf_callchain_store(entry, ret_addr);
  2400. while (entry->nr < entry->max_stack) {
  2401. if (!valid_user_frame(fp, sizeof(frame)))
  2402. break;
  2403. if (__get_user(frame.next_frame, &fp->next_frame))
  2404. break;
  2405. if (__get_user(frame.return_address, &fp->return_address))
  2406. break;
  2407. perf_callchain_store(entry, frame.return_address);
  2408. fp = (void __user *)frame.next_frame;
  2409. }
  2410. pagefault_enable();
  2411. }
  2412. /*
  2413. * Deal with code segment offsets for the various execution modes:
  2414. *
  2415. * VM86 - the good olde 16 bit days, where the linear address is
  2416. * 20 bits and we use regs->ip + 0x10 * regs->cs.
  2417. *
  2418. * IA32 - Where we need to look at GDT/LDT segment descriptor tables
  2419. * to figure out what the 32bit base address is.
  2420. *
  2421. * X32 - has TIF_X32 set, but is running in x86_64
  2422. *
  2423. * X86_64 - CS,DS,SS,ES are all zero based.
  2424. */
  2425. static unsigned long code_segment_base(struct pt_regs *regs)
  2426. {
  2427. /*
  2428. * For IA32 we look at the GDT/LDT segment base to convert the
  2429. * effective IP to a linear address.
  2430. */
  2431. #ifdef CONFIG_X86_32
  2432. /*
  2433. * If we are in VM86 mode, add the segment offset to convert to a
  2434. * linear address.
  2435. */
  2436. if (regs->flags & X86_VM_MASK)
  2437. return 0x10 * regs->cs;
  2438. if (user_mode(regs) && regs->cs != __USER_CS)
  2439. return get_segment_base(regs->cs);
  2440. #else
  2441. if (user_mode(regs) && !user_64bit_mode(regs) &&
  2442. regs->cs != __USER32_CS)
  2443. return get_segment_base(regs->cs);
  2444. #endif
  2445. return 0;
  2446. }
  2447. unsigned long perf_instruction_pointer(struct pt_regs *regs)
  2448. {
  2449. if (perf_guest_state())
  2450. return perf_guest_get_ip();
  2451. return regs->ip + code_segment_base(regs);
  2452. }
  2453. unsigned long perf_misc_flags(struct pt_regs *regs)
  2454. {
  2455. unsigned int guest_state = perf_guest_state();
  2456. int misc = 0;
  2457. if (guest_state) {
  2458. if (guest_state & PERF_GUEST_USER)
  2459. misc |= PERF_RECORD_MISC_GUEST_USER;
  2460. else
  2461. misc |= PERF_RECORD_MISC_GUEST_KERNEL;
  2462. } else {
  2463. if (user_mode(regs))
  2464. misc |= PERF_RECORD_MISC_USER;
  2465. else
  2466. misc |= PERF_RECORD_MISC_KERNEL;
  2467. }
  2468. if (regs->flags & PERF_EFLAGS_EXACT)
  2469. misc |= PERF_RECORD_MISC_EXACT_IP;
  2470. return misc;
  2471. }
  2472. void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
  2473. {
  2474. /* This API doesn't currently support enumerating hybrid PMUs. */
  2475. if (WARN_ON_ONCE(cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) ||
  2476. !x86_pmu_initialized()) {
  2477. memset(cap, 0, sizeof(*cap));
  2478. return;
  2479. }
  2480. /*
  2481. * Note, hybrid CPU models get tracked as having hybrid PMUs even when
  2482. * all E-cores are disabled via BIOS. When E-cores are disabled, the
  2483. * base PMU holds the correct number of counters for P-cores.
  2484. */
  2485. cap->version = x86_pmu.version;
  2486. cap->num_counters_gp = x86_pmu_num_counters(NULL);
  2487. cap->num_counters_fixed = x86_pmu_num_counters_fixed(NULL);
  2488. cap->bit_width_gp = x86_pmu.cntval_bits;
  2489. cap->bit_width_fixed = x86_pmu.cntval_bits;
  2490. cap->events_mask = (unsigned int)x86_pmu.events_maskl;
  2491. cap->events_mask_len = x86_pmu.events_mask_len;
  2492. cap->pebs_ept = x86_pmu.pebs_ept;
  2493. }
  2494. EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
  2495. u64 perf_get_hw_event_config(int hw_event)
  2496. {
  2497. int max = x86_pmu.max_events;
  2498. if (hw_event < max)
  2499. return x86_pmu.event_map(array_index_nospec(hw_event, max));
  2500. return 0;
  2501. }
  2502. EXPORT_SYMBOL_GPL(perf_get_hw_event_config);