- // SPDX-License-Identifier: GPL-2.0
- #include <linux/version.h>
- #include <linux/ptrace.h>
- #include <uapi/linux/bpf.h>
- #include "bpf_helpers.h"
- /*
- * The CPU number, cstate number and pstate number are based
- * on a 96boards HiKey board with eight Cortex-A53 CPUs.
- *
- * Every CPU has three idle states for cstate:
- * WFI, CPU_OFF, CLUSTER_OFF
- *
- * Every CPU has five operating points:
- * 208MHz, 432MHz, 729MHz, 960MHz, 1200MHz
- *
- * This code is based on these assumptions; other platforms
- * need to adjust these definitions.
- */
- #define MAX_CPU 8
- #define MAX_PSTATE_ENTRIES 5
- #define MAX_CSTATE_ENTRIES 3
- static int cpu_opps[] = { 208000, 432000, 729000, 960000, 1200000 };
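- /*
- * Note: the values above are in kHz, matching what the cpu_frequency
- * tracepoint reports in its "state" field. When porting this sample,
- * the table would typically be filled from the platform's available
- * OPPs, e.g. via cpufreq's scaling_available_frequencies sysfs file
- * (assuming a cpufreq driver that exposes a frequency table).
- */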
- /*
- * The my_map array records the current cstate and pstate index and
- * timestamp (Idx, Ts); when a new event arrives we update the pair
- * with the new state index and timestamp (Idx', Ts').
- *
- * From (Idx, Ts) and (Idx', Ts') we can calculate the time spent
- * in the previous state: Duration(Idx) = Ts' - Ts.
- *
- * Every CPU has one group of slots like the one below, recording
- * the state index and timestamp for cstate and pstate separately:
- *
- *      +--------------------------+
- *      | cstate timestamp         |
- *      +--------------------------+
- *      | cstate index             |
- *      +--------------------------+
- *      | pstate timestamp         |
- *      +--------------------------+
- *      | pstate index             |
- *      +--------------------------+
- */
- #define MAP_OFF_CSTATE_TIME 0
- #define MAP_OFF_CSTATE_IDX 1
- #define MAP_OFF_PSTATE_TIME 2
- #define MAP_OFF_PSTATE_IDX 3
- #define MAP_OFF_NUM 4
- struct bpf_map_def SEC("maps") my_map = {
- .type = BPF_MAP_TYPE_ARRAY,
- .key_size = sizeof(u32),
- .value_size = sizeof(u64),
- .max_entries = MAX_CPU * MAP_OFF_NUM,
- };
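- /*
- * Key layout example: CPU 2 owns keys 8..11 in my_map, i.e.
- * 2 * MAP_OFF_NUM + MAP_OFF_CSTATE_TIME = 8 (cstate timestamp),
- * 9 (cstate index), 10 (pstate timestamp) and 11 (pstate index).
- */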
- /* cstate_duration records duration time for every idle state per CPU */
- struct bpf_map_def SEC("maps") cstate_duration = {
- .type = BPF_MAP_TYPE_ARRAY,
- .key_size = sizeof(u32),
- .value_size = sizeof(u64),
- .max_entries = MAX_CPU * MAX_CSTATE_ENTRIES,
- };
- /* pstate_duration records duration time for every operating point per CPU */
- struct bpf_map_def SEC("maps") pstate_duration = {
- .type = BPF_MAP_TYPE_ARRAY,
- .key_size = sizeof(u32),
- .value_size = sizeof(u64),
- .max_entries = MAX_CPU * MAX_PSTATE_ENTRIES,
- };
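- /*
- * Indexing example for the duration maps: CPU 3's WFI time
- * (cstate index 0) accumulates at key 3 * MAX_CSTATE_ENTRIES + 0 = 9
- * in cstate_duration, and its 960MHz time (pstate index 3) at key
- * 3 * MAX_PSTATE_ENTRIES + 3 = 18 in pstate_duration. A user-space
- * loader can walk the same key space to dump the per-CPU totals.
- */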
- /*
- * The trace events for cpu_idle and cpu_frequency are taken from:
- * /sys/kernel/debug/tracing/events/power/cpu_idle/format
- * /sys/kernel/debug/tracing/events/power/cpu_frequency/format
- *
- * These two events have the same format, so we define one common structure.
- */
- struct cpu_args {
- u64 pad;
- u32 state;
- u32 cpu_id;
- };
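- /*
- * For reference, those format files typically list 8 bytes of common
- * trace fields followed by "unsigned int state" at offset 8 and
- * "unsigned int cpu_id" at offset 12, which the u64 pad plus the two
- * u32 members above mirror (exact offsets may vary between kernels).
- */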
- /* Calculate the pstate index; returns MAX_PSTATE_ENTRIES on failure */
- static u32 find_cpu_pstate_idx(u32 frequency)
- {
- u32 i;
- for (i = 0; i < sizeof(cpu_opps) / sizeof(u32); i++) {
- if (frequency == cpu_opps[i])
- return i;
- }
- return i;
- }
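- /*
- * For example, find_cpu_pstate_idx(729000) returns 2, while a
- * frequency not listed in cpu_opps falls through the loop and
- * returns MAX_PSTATE_ENTRIES (5), which callers treat as a failure.
- */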
- SEC("tracepoint/power/cpu_idle")
- int bpf_prog1(struct cpu_args *ctx)
- {
- u64 *cts, *pts, *cstate, *pstate, prev_state, cur_ts, delta;
- u32 key, cpu, pstate_idx;
- u64 *val;
- if (ctx->cpu_id >= MAX_CPU)
- return 0;
- cpu = ctx->cpu_id;
- key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_TIME;
- cts = bpf_map_lookup_elem(&my_map, &key);
- if (!cts)
- return 0;
- key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX;
- cstate = bpf_map_lookup_elem(&my_map, &key);
- if (!cstate)
- return 0;
- key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME;
- pts = bpf_map_lookup_elem(&my_map, &key);
- if (!pts)
- return 0;
- key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX;
- pstate = bpf_map_lookup_elem(&my_map, &key);
- if (!pstate)
- return 0;
- prev_state = *cstate;
- *cstate = ctx->state;
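- /* First cpu_idle event seen for this CPU: just record the start timestamp */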
- if (!*cts) {
- *cts = bpf_ktime_get_ns();
- return 0;
- }
- cur_ts = bpf_ktime_get_ns();
- delta = cur_ts - *cts;
- *cts = cur_ts;
- /*
- * When the state is not (u32)-1, the CPU is entering an idle
- * state; in this case we need to record the time interval spent
- * at the current pstate.
- *
- *                    OPP2
- *          +---------------------+
- *   OPP1   |                     |
- * ----------+                     |
- *                                 |  Idle state
- *                                 +---------------
- *
- *          |<- pstate duration ->|
- *          ^                     ^
- *         pts                  cur_ts
- */
- if (ctx->state != (u32)-1) {
- /* only record pstate time after the first cpu_frequency event has been seen */
- if (!*pts)
- return 0;
- delta = cur_ts - *pts;
- pstate_idx = find_cpu_pstate_idx(*pstate);
- if (pstate_idx >= MAX_PSTATE_ENTRIES)
- return 0;
- key = cpu * MAX_PSTATE_ENTRIES + pstate_idx;
- val = bpf_map_lookup_elem(&pstate_duration, &key);
- if (val)
- __sync_fetch_and_add((long *)val, delta);
- /*
- * When the state equals (u32)-1, the CPU is exiting an idle
- * state; in this case we need to record the time interval spent
- * in that cstate.
- *
- *   OPP2
- * ----------+
- *           |                       OPP1
- *           |                     +-----------
- *           |     Idle state      |
- *           +---------------------+
- *
- *           |<- cstate duration ->|
- *           ^                     ^
- *          cts                  cur_ts
- */
- } else {
- key = cpu * MAX_CSTATE_ENTRIES + prev_state;
- val = bpf_map_lookup_elem(&cstate_duration, &key);
- if (val)
- __sync_fetch_and_add((long *)val, delta);
- }
- /* Update timestamp for pstate as new start time */
- if (*pts)
- *pts = cur_ts;
- return 0;
- }
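- /*
- * Worked example (hypothetical numbers): CPU 1 has been running at
- * 960MHz since pts = 1,000,000ns and enters WFI (state 0) at
- * cur_ts = 5,000,000ns. bpf_prog1 adds 4,000,000ns to
- * pstate_duration at key 1 * MAX_PSTATE_ENTRIES + 3 and moves pts
- * forward. When the CPU later exits idle (state == (u32)-1), the
- * time spent idle since cts is added to cstate_duration at key
- * 1 * MAX_CSTATE_ENTRIES + 0.
- */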
- SEC("tracepoint/power/cpu_frequency")
- int bpf_prog2(struct cpu_args *ctx)
- {
- u64 *pts, *cstate, *pstate, prev_state, cur_ts, delta;
- u32 key, cpu, pstate_idx;
- u64 *val;
- cpu = ctx->cpu_id;
- key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME;
- pts = bpf_map_lookup_elem(&my_map, &key);
- if (!pts)
- return 0;
- key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX;
- pstate = bpf_map_lookup_elem(&my_map, &key);
- if (!pstate)
- return 0;
- key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX;
- cstate = bpf_map_lookup_elem(&my_map, &key);
- if (!cstate)
- return 0;
- prev_state = *pstate;
- *pstate = ctx->state;
- if (!*pts) {
- *pts = bpf_ktime_get_ns();
- return 0;
- }
- cur_ts = bpf_ktime_get_ns();
- delta = cur_ts - *pts;
- *pts = cur_ts;
- /* If the CPU is currently in an idle state, bail out and skip the pstate statistics */
- if (*cstate != (u32)(-1))
- return 0;
- /*
- * The CPU changes to a different OPP (in the diagram below the
- * frequency changes from OPP3 to OPP1); we need to record the
- * interval for the previous frequency OPP3 and update the
- * timestamp as the start time for the new frequency OPP1.
- *
- *                    OPP3
- *          +---------------------+
- *   OPP2   |                     |
- * ----------+                     |
- *                                 |  OPP1
- *                                 +---------------
- *
- *          |<- pstate duration ->|
- *          ^                     ^
- *         pts                  cur_ts
- */
- /* attribute the elapsed time to the previous OPP, not the new one */
- pstate_idx = find_cpu_pstate_idx(prev_state);
- if (pstate_idx >= MAX_PSTATE_ENTRIES)
- return 0;
- key = cpu * MAX_PSTATE_ENTRIES + pstate_idx;
- val = bpf_map_lookup_elem(&pstate_duration, &key);
- if (val)
- __sync_fetch_and_add((long *)val, delta);
- return 0;
- }
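- /*
- * Worked example (hypothetical numbers): CPU 0 has been running at
- * 432MHz since pts and, while not idle, switches to 1200MHz; the
- * elapsed time is added to pstate_duration at key
- * 0 * MAX_PSTATE_ENTRIES + 1 (the index of 432000 in cpu_opps).
- */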
- char _license[] SEC("license") = "GPL";
- u32 _version SEC("version") = LINUX_VERSION_CODE;