1 /*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <inttypes.h>
18 #include <signal.h>
19 #include <stdio.h>
20 #include <string.h>
21 #include <sys/prctl.h>
22
23 #include <algorithm>
24 #include <chrono>
25 #include <optional>
26 #include <set>
27 #include <string>
28 #include <string_view>
29 #include <vector>
30
31 #include <android-base/file.h>
32 #include <android-base/logging.h>
33 #include <android-base/strings.h>
34 #include <android-base/unique_fd.h>
35
36 #include "IOEventLoop.h"
37 #include "ProbeEvents.h"
38 #include "cmd_stat_impl.h"
39 #include "command.h"
40 #include "environment.h"
41 #include "event_attr.h"
42 #include "event_fd.h"
43 #include "event_selection_set.h"
44 #include "event_type.h"
45 #include "utils.h"
46 #include "workload.h"
47
48 namespace simpleperf {
49
50 using android::base::Split;
51
52 static std::vector<std::string> default_measured_event_types{
53 "cpu-cycles", "stalled-cycles-frontend", "stalled-cycles-backend",
54 "instructions", "branch-instructions", "branch-misses",
55 "task-clock", "context-switches", "page-faults",
56 };
57
58 static const std::unordered_map<std::string_view, std::pair<std::string_view, std::string_view>>
59 COMMON_EVENT_RATE_MAP = {
60 {"cache-misses", {"cache-references", "miss rate"}},
61 {"branch-misses", {"branch-instructions", "miss rate"}},
62 };
63
64 static const std::unordered_map<std::string_view, std::pair<std::string_view, std::string_view>>
65 ARM_EVENT_RATE_MAP = {
66 // Refer to "D6.10.5 Meaningful ratios between common microarchitectural events" in ARMv8
67 // specification.
68 {"raw-l1i-cache-refill", {"raw-l1i-cache", "level 1 instruction cache refill rate"}},
69 {"raw-l1i-tlb-refill", {"raw-l1i-tlb", "level 1 instruction TLB refill rate"}},
70 {"raw-l1d-cache-refill", {"raw-l1d-cache", "level 1 data or unified cache refill rate"}},
71 {"raw-l1d-tlb-refill", {"raw-l1d-tlb", "level 1 data or unified TLB refill rate"}},
72 {"raw-l2d-cache-refill", {"raw-l2d-cache", "level 2 data or unified cache refill rate"}},
73 {"raw-l2i-cache-refill", {"raw-l2i-cache", "level 2 instruction cache refill rate"}},
74 {"raw-l3d-cache-refill", {"raw-l3d-cache", "level 3 data or unified cache refill rate"}},
75 {"raw-l2d-tlb-refill", {"raw-l2d-tlb", "level 2 data or unified TLB refill rate"}},
76 {"raw-l2i-tlb-refill", {"raw-l2i-tlb", "level 2 instruction TLB refill rate"}},
77 {"raw-bus-access", {"raw-bus-cycles", "bus accesses per cycle"}},
78 {"raw-ll-cache-miss", {"raw-ll-cache", "last level data or unified cache refill rate"}},
79 {"raw-dtlb-walk", {"raw-l1d-tlb", "data TLB miss rate"}},
80 {"raw-itlb-walk", {"raw-l1i-tlb", "instruction TLB miss rate"}},
81 {"raw-ll-cache-miss-rd", {"raw-ll-cache-rd", "memory read operation miss rate"}},
82 {"raw-remote-access-rd",
83 {"raw-remote-access", "read accesses to another socket in a multi-socket system"}},
84 // Refer to "Table K3-2 Relationship between REFILL events and associated access events" in
85 // ARMv8 specification.
86 {"raw-l1d-cache-refill-rd", {"raw-l1d-cache-rd", "level 1 cache refill rate, read"}},
87 {"raw-l1d-cache-refill-wr", {"raw-l1d-cache-wr", "level 1 cache refill rate, write"}},
88 {"raw-l1d-tlb-refill-rd", {"raw-l1d-tlb-rd", "level 1 TLB refill rate, read"}},
89 {"raw-l1d-tlb-refill-wr", {"raw-l1d-tlb-wr", "level 1 TLB refill rate, write"}},
90 {"raw-l2d-cache-refill-rd", {"raw-l2d-cache-rd", "level 2 data cache refill rate, read"}},
91 {"raw-l2d-cache-refill-wr", {"raw-l2d-cache-wr", "level 2 data cache refill rate, write"}},
92 {"raw-l2d-tlb-refill-rd", {"raw-l2d-tlb-rd", "level 2 data TLB refill rate, read"}},
93 };
94
ReadableCountValue(bool csv)95 std::string CounterSummary::ReadableCountValue(bool csv) {
96 if (type_name == "cpu-clock" || type_name == "task-clock") {
97 // Convert nanoseconds to milliseconds.
98 double value = count / 1e6;
99 return android::base::StringPrintf("%lf(ms)", value);
100 }
101 if (csv) {
102 return android::base::StringPrintf("%" PRIu64, count);
103 }
104 return ReadableCount(count);
105 }
106
FindSummary(const std::string & type_name,const std::string & modifier,const ThreadInfo * thread,int cpu)107 const CounterSummary* CounterSummaries::FindSummary(const std::string& type_name,
108 const std::string& modifier,
109 const ThreadInfo* thread, int cpu) {
110 for (const auto& s : summaries_) {
111 if (s.type_name == type_name && s.modifier == modifier && s.thread == thread && s.cpu == cpu) {
112 return &s;
113 }
114 }
115 return nullptr;
116 }
117
AutoGenerateSummaries()118 void CounterSummaries::AutoGenerateSummaries() {
119 for (size_t i = 0; i < summaries_.size(); ++i) {
120 const CounterSummary& s = summaries_[i];
121 if (s.modifier == "u") {
122 const CounterSummary* other = FindSummary(s.type_name, "k", s.thread, s.cpu);
123 if (other != nullptr && other->IsMonitoredAtTheSameTime(s)) {
124 if (FindSummary(s.type_name, "", s.thread, s.cpu) == nullptr) {
125 summaries_.emplace_back(s.type_name, "", s.group_id, s.thread, s.cpu,
126 s.count + other->count, s.runtime_in_ns, s.scale, true, csv_);
127 }
128 }
129 }
130 }
131 }
132
GenerateComments(double duration_in_sec)133 void CounterSummaries::GenerateComments(double duration_in_sec) {
134 for (auto& s : summaries_) {
135 s.comment = GetCommentForSummary(s, duration_in_sec);
136 }
137 }
138
Show(FILE * fp)139 void CounterSummaries::Show(FILE* fp) {
140 bool show_thread = !summaries_.empty() && summaries_[0].thread != nullptr;
141 bool show_cpu = !summaries_.empty() && summaries_[0].cpu != -1;
142 if (csv_) {
143 ShowCSV(fp, show_thread, show_cpu);
144 } else {
145 ShowText(fp, show_thread, show_cpu);
146 }
147 }
148
ShowCSV(FILE * fp,bool show_thread,bool show_cpu)149 void CounterSummaries::ShowCSV(FILE* fp, bool show_thread, bool show_cpu) {
150 for (auto& s : summaries_) {
151 if (show_thread) {
152 fprintf(fp, "%s,%d,%d,", s.thread->name.c_str(), s.thread->pid, s.thread->tid);
153 }
154 if (show_cpu) {
155 fprintf(fp, "%d,", s.cpu);
156 }
157 fprintf(fp, "%s,%s,%s,%s\n", s.readable_count.c_str(), s.Name().c_str(), s.comment.c_str(),
158 (s.auto_generated ? "(generated)," : ""));
159 }
160 }
161
ShowText(FILE * fp,bool show_thread,bool show_cpu)162 void CounterSummaries::ShowText(FILE* fp, bool show_thread, bool show_cpu) {
163 std::vector<std::string> titles;
164
165 if (show_thread) {
166 titles = {"thread_name", "pid", "tid"};
167 }
168 if (show_cpu) {
169 titles.emplace_back("cpu");
170 }
171 titles.emplace_back("count");
172 titles.emplace_back("event_name");
173 titles.emplace_back(" # count / runtime");
174
175 std::vector<size_t> width(titles.size(), 0);
176
177 auto adjust_width = [](size_t& w, size_t size) { w = std::max(w, size); };
178
179 // The last title is too long. Don't include it for width adjustment.
180 for (size_t i = 0; i + 1 < titles.size(); i++) {
181 adjust_width(width[i], titles[i].size());
182 }
183
184 for (auto& s : summaries_) {
185 size_t i = 0;
186 if (show_thread) {
187 adjust_width(width[i++], s.thread->name.size());
188 adjust_width(width[i++], std::to_string(s.thread->pid).size());
189 adjust_width(width[i++], std::to_string(s.thread->tid).size());
190 }
191 if (show_cpu) {
192 adjust_width(width[i++], std::to_string(s.cpu).size());
193 }
194 adjust_width(width[i++], s.readable_count.size());
195 adjust_width(width[i++], s.Name().size());
196 adjust_width(width[i++], s.comment.size());
197 }
198
199 fprintf(fp, "# ");
200 for (size_t i = 0; i < titles.size(); i++) {
201 if (titles[i] == "count") {
202 fprintf(fp, "%*s", static_cast<int>(width[i]), titles[i].c_str());
203 } else {
204 fprintf(fp, "%-*s", static_cast<int>(width[i]), titles[i].c_str());
205 }
206 if (i + 1 < titles.size()) {
207 fprintf(fp, " ");
208 }
209 }
210 fprintf(fp, "\n");
211
212 for (auto& s : summaries_) {
213 size_t i = 0;
214 if (show_thread) {
215 fprintf(fp, " %-*s", static_cast<int>(width[i++]), s.thread->name.c_str());
216 fprintf(fp, " %-*d", static_cast<int>(width[i++]), s.thread->pid);
217 fprintf(fp, " %-*d", static_cast<int>(width[i++]), s.thread->tid);
218 }
219 if (show_cpu) {
220 fprintf(fp, " %-*d", static_cast<int>(width[i++]), s.cpu);
221 }
222 fprintf(fp, " %*s %-*s # %-*s%s\n", static_cast<int>(width[i]), s.readable_count.c_str(),
223 static_cast<int>(width[i + 1]), s.Name().c_str(), static_cast<int>(width[i + 2]),
224 s.comment.c_str(), (s.auto_generated ? " (generated)" : ""));
225 }
226 }
227
GetCommentForSummary(const CounterSummary & s,double duration_in_sec)228 std::string CounterSummaries::GetCommentForSummary(const CounterSummary& s,
229 double duration_in_sec) {
230 char sap_mid;
231 if (csv_) {
232 sap_mid = ',';
233 } else {
234 sap_mid = ' ';
235 }
236 if (s.type_name == "task-clock") {
237 double run_sec = s.count / 1e9;
238 double used_cpus = run_sec / duration_in_sec;
239 return android::base::StringPrintf("%f%ccpus used", used_cpus, sap_mid);
240 }
241 if (s.type_name == "cpu-clock") {
242 return "";
243 }
244 if (s.type_name == "cpu-cycles") {
245 if (s.runtime_in_ns == 0) {
246 return "";
247 }
248 double ghz = static_cast<double>(s.count) / s.runtime_in_ns;
249 return android::base::StringPrintf("%f%cGHz", ghz, sap_mid);
250 }
251 if (s.type_name == "instructions" && s.count != 0) {
252 const CounterSummary* other = FindSummary("cpu-cycles", s.modifier, s.thread, s.cpu);
253 if (other != nullptr && other->IsMonitoredAtTheSameTime(s)) {
254 double cpi = static_cast<double>(other->count) / s.count;
255 return android::base::StringPrintf("%f%ccycles per instruction", cpi, sap_mid);
256 }
257 }
258 std::string rate_comment = GetRateComment(s, sap_mid);
259 if (!rate_comment.empty()) {
260 return rate_comment;
261 }
262 if (s.runtime_in_ns == 0) {
263 return "";
264 }
265 double runtime_in_sec = static_cast<double>(s.runtime_in_ns) / 1e9;
266 double rate = s.count / runtime_in_sec;
267 if (rate >= 1e9 - 1e5) {
268 return android::base::StringPrintf("%.3f%cG/sec", rate / 1e9, sap_mid);
269 }
270 if (rate >= 1e6 - 1e2) {
271 return android::base::StringPrintf("%.3f%cM/sec", rate / 1e6, sap_mid);
272 }
273 if (rate >= 1e3) {
274 return android::base::StringPrintf("%.3f%cK/sec", rate / 1e3, sap_mid);
275 }
276 return android::base::StringPrintf("%.3f%c/sec", rate, sap_mid);
277 }
278
GetRateComment(const CounterSummary & s,char sep)279 std::string CounterSummaries::GetRateComment(const CounterSummary& s, char sep) {
280 std::string_view miss_event_name = s.type_name;
281 std::string event_name;
282 std::string rate_desc;
283 if (auto it = COMMON_EVENT_RATE_MAP.find(miss_event_name); it != COMMON_EVENT_RATE_MAP.end()) {
284 event_name = it->second.first;
285 rate_desc = it->second.second;
286 }
287 if (event_name.empty() && (GetTargetArch() == ARCH_ARM || GetTargetArch() == ARCH_ARM64)) {
288 if (auto it = ARM_EVENT_RATE_MAP.find(miss_event_name); it != ARM_EVENT_RATE_MAP.end()) {
289 event_name = it->second.first;
290 rate_desc = it->second.second;
291 }
292 }
293 if (event_name.empty() && android::base::ConsumeSuffix(&miss_event_name, "-misses")) {
294 event_name = std::string(miss_event_name) + "s";
295 rate_desc = "miss rate";
296 }
297 if (!event_name.empty()) {
298 const CounterSummary* other = FindSummary(event_name, s.modifier, s.thread, s.cpu);
299 if (other != nullptr && other->IsMonitoredAtTheSameTime(s) && other->count != 0) {
300 double miss_rate = static_cast<double>(s.count) / other->count;
301 return android::base::StringPrintf("%f%%%c%s", miss_rate * 100, sep, rate_desc.c_str());
302 }
303 }
304 return "";
305 }
306
307 namespace {
308
309 // devfreq may use performance counters to calculate memory latency (as in
310 // drivers/devfreq/arm-memlat-mon.c). Hopefully we can get more available counters by asking devfreq
311 // to not use the memory latency governor temporarily.
312 class DevfreqCounters {
313 public:
Use()314 bool Use() {
315 if (!IsRoot()) {
316 LOG(ERROR) << "--use-devfreq-counters needs root permission to set devfreq governors";
317 return false;
318 }
319 std::string devfreq_dir = "/sys/class/devfreq/";
320 for (auto& name : GetSubDirs(devfreq_dir)) {
321 std::string governor_path = devfreq_dir + name + "/governor";
322 if (IsRegularFile(governor_path)) {
323 std::string governor;
324 if (!android::base::ReadFileToString(governor_path, &governor)) {
325 LOG(ERROR) << "failed to read " << governor_path;
326 return false;
327 }
328 governor = android::base::Trim(governor);
329 if (governor == "mem_latency") {
330 if (!android::base::WriteStringToFile("performance", governor_path)) {
331 PLOG(ERROR) << "failed to write " << governor_path;
332 return false;
333 }
334 mem_latency_governor_paths_.emplace_back(std::move(governor_path));
335 }
336 }
337 }
338 return true;
339 }
340
~DevfreqCounters()341 ~DevfreqCounters() {
342 for (auto& path : mem_latency_governor_paths_) {
343 android::base::WriteStringToFile("mem_latency", path);
344 }
345 }
346
347 private:
348 std::vector<std::string> mem_latency_governor_paths_;
349 };
350
351 class StatCommand : public Command {
352 public:
StatCommand()353 StatCommand()
354 : Command(
355 "stat", "gather performance counter information",
356 // clang-format off
357 "Usage: simpleperf stat [options] [command [command-args]]\n"
358 " Gather performance counter information of running [command].\n"
359 " And -a/-p/-t option can be used to change target of counter information.\n"
360 "-a Collect system-wide information.\n"
361 #if defined(__ANDROID__)
362 "--app package_name Profile the process of an Android application.\n"
363 " On non-rooted devices, the app must be debuggable,\n"
364 " because we use run-as to switch to the app's context.\n"
365 #endif
366 "--cpu cpu_item1,cpu_item2,... Monitor events on selected cpus. cpu_item can be a number like\n"
367 " 1, or a range like 0-3. A --cpu option affects all event types\n"
368 " following it until meeting another --cpu option.\n"
369 "--csv Write report in comma separate form.\n"
370 "--duration time_in_sec Monitor for time_in_sec seconds instead of running\n"
371 " [command]. Here time_in_sec may be any positive\n"
372 " floating point number.\n"
373 "--interval time_in_ms Print stat for every time_in_ms milliseconds.\n"
374 " Here time_in_ms may be any positive floating point\n"
375 " number. Simpleperf prints total values from the\n"
376 " starting point. But this can be changed by\n"
377 " --interval-only-values.\n"
378 "--interval-only-values Print numbers of events happened in each interval.\n"
379 "-e event1[:modifier1],event2[:modifier2],...\n"
380 " Select a list of events to count. An event can be:\n"
381 " 1) an event name listed in `simpleperf list`;\n"
382 " 2) a raw PMU event in rN format. N is a hex number.\n"
383 " For example, r1b selects event number 0x1b.\n"
384 " Modifiers can be added to define how the event should be\n"
385 " monitored. Possible modifiers are:\n"
386 " u - monitor user space events only\n"
387 " k - monitor kernel space events only\n"
388 "--group event1[:modifier],event2[:modifier2],...\n"
389 " Similar to -e option. But events specified in the same --group\n"
390 " option are monitored as a group, and scheduled in and out at the\n"
391 " same time.\n"
392 "--kprobe kprobe_event1,kprobe_event2,...\n"
393 " Add kprobe events during stating. The kprobe_event format is in\n"
394 " Documentation/trace/kprobetrace.rst in the kernel. Examples:\n"
395 " 'p:myprobe do_sys_openat2 $arg2:string' - add event kprobes:myprobe\n"
396 " 'r:myretprobe do_sys_openat2 $retval:s64' - add event kprobes:myretprobe\n"
397 "--no-inherit Don't stat created child threads/processes.\n"
398 "-o output_filename Write report to output_filename instead of standard output.\n"
399 "--per-core Print counters for each cpu core.\n"
400 "--per-thread Print counters for each thread.\n"
401 "-p pid_or_process_name_regex1,pid_or_process_name_regex2,...\n"
402 " Stat events on existing processes. Processes are searched either by pid\n"
403 " or process name regex. Mutually exclusive with -a.\n"
404 "-t tid1,tid2,... Stat events on existing threads. Mutually exclusive with -a.\n"
405 "--tp-filter filter_string Set filter_string for the previous tracepoint event.\n"
406 " Format is in Documentation/trace/events.rst in the kernel.\n"
407 " An example: 'prev_comm != \"simpleperf\" && (prev_pid > 1)'.\n"
408 "--print-hw-counter Test and print CPU PMU hardware counters available on the device.\n"
409 "--sort key1,key2,... Select keys used to sort the report, used when --per-thread\n"
410 " or --per-core appears. The appearance order of keys decides\n"
411 " the order of keys used to sort the report.\n"
412 " Possible keys include:\n"
413 " count -- event count for each entry\n"
414 " count_per_thread -- event count for a thread on all cpus\n"
415 " cpu -- cpu id\n"
416 " pid -- process id\n"
417 " tid -- thread id\n"
418 " comm -- thread name\n"
419 " The default sort keys are:\n"
420 " count_per_thread,tid,cpu,count\n"
421 #if defined(__ANDROID__)
422 "--use-devfreq-counters On devices with Qualcomm SOCs, some hardware counters may be used\n"
423 " to monitor memory latency (in drivers/devfreq/arm-memlat-mon.c),\n"
424 " making fewer counters available to users. This option asks devfreq\n"
425 " to temporarily release counters by replacing memory-latency governor\n"
426 " with performance governor. It affects memory latency during profiling,\n"
427 " and may cause wedged power if simpleperf is killed in between.\n"
428 #endif
429 "--verbose Show result in verbose mode.\n"
430 #if 0
431 // Below options are only used internally and shouldn't be visible to the public.
432 "--in-app We are already running in the app's context.\n"
433 "--tracepoint-events file_name Read tracepoint events from [file_name] instead of tracefs.\n"
434 "--out-fd <fd> Write output to a file descriptor.\n"
435 "--stop-signal-fd <fd> Stop stating when fd is readable.\n"
436 #endif
437 // clang-format on
438 ),
439 verbose_mode_(false),
440 system_wide_collection_(false),
441 child_inherit_(true),
442 duration_in_sec_(0),
443 interval_in_ms_(0),
444 interval_only_values_(false),
445 event_selection_set_(true),
446 csv_(false),
447 in_app_context_(false) {
448 // Die if parent exits.
449 prctl(PR_SET_PDEATHSIG, SIGHUP, 0, 0, 0);
450 // Set default sort keys. Full key list is in BuildSummaryComparator().
451 sort_keys_ = {"count_per_thread", "tid", "cpu", "count"};
452 }
453
454 bool Run(const std::vector<std::string>& args);
455
456 private:
457 bool ParseOptions(const std::vector<std::string>& args, std::vector<std::string>* non_option_args,
458 ProbeEvents& probe_events);
459 void PrintHardwareCounters();
460 bool AddDefaultMeasuredEventTypes();
461 void SetEventSelectionFlags();
462 void MonitorEachThread();
463 void AdjustToIntervalOnlyValues(std::vector<CountersInfo>& counters);
464 bool ShowCounters(const std::vector<CountersInfo>& counters, double duration_in_sec, FILE* fp);
465 void CheckHardwareCounterMultiplexing();
466 void PrintWarningForInaccurateEvents();
467
468 bool verbose_mode_;
469 bool system_wide_collection_;
470 bool child_inherit_;
471 double duration_in_sec_;
472 double interval_in_ms_;
473 bool interval_only_values_;
474 std::vector<std::vector<CounterSum>> last_sum_values_;
475 EventSelectionSet event_selection_set_;
476 std::string output_filename_;
477 android::base::unique_fd out_fd_;
478 bool csv_;
479 std::string app_package_name_;
480 bool in_app_context_;
481 android::base::unique_fd stop_signal_fd_;
482 bool use_devfreq_counters_ = false;
483
484 bool report_per_core_ = false;
485 bool report_per_thread_ = false;
486 // used to report event count for each thread
487 std::unordered_map<pid_t, ThreadInfo> thread_info_;
488 // used to sort report
489 std::vector<std::string> sort_keys_;
490 std::optional<SummaryComparator> summary_comparator_;
491 bool print_hw_counter_ = false;
492 };
493
Run(const std::vector<std::string> & args)494 bool StatCommand::Run(const std::vector<std::string>& args) {
495 if (!CheckPerfEventLimit()) {
496 return false;
497 }
498 AllowMoreOpenedFiles();
499
500 // 1. Parse options, and use default measured event types if not given.
501 std::vector<std::string> workload_args;
502 ProbeEvents probe_events(event_selection_set_);
503 if (!ParseOptions(args, &workload_args, probe_events)) {
504 return false;
505 }
506 if (print_hw_counter_) {
507 PrintHardwareCounters();
508 return true;
509 }
510 if (!app_package_name_.empty() && !in_app_context_) {
511 if (!IsRoot()) {
512 return RunInAppContext(app_package_name_, "stat", args, workload_args.size(),
513 output_filename_, !event_selection_set_.GetTracepointEvents().empty());
514 }
515 }
516 DevfreqCounters devfreq_counters;
517 if (use_devfreq_counters_) {
518 if (!devfreq_counters.Use()) {
519 return false;
520 }
521 }
522 if (event_selection_set_.empty()) {
523 if (!AddDefaultMeasuredEventTypes()) {
524 return false;
525 }
526 }
527 SetEventSelectionFlags();
528
529 // 2. Create workload.
530 std::unique_ptr<Workload> workload;
531 if (!workload_args.empty()) {
532 workload = Workload::CreateWorkload(workload_args);
533 if (workload == nullptr) {
534 return false;
535 }
536 }
537 bool need_to_check_targets = false;
538 if (system_wide_collection_) {
539 if (report_per_thread_) {
540 event_selection_set_.AddMonitoredProcesses(GetAllProcesses());
541 } else {
542 event_selection_set_.AddMonitoredThreads({-1});
543 }
544 } else if (!event_selection_set_.HasMonitoredTarget()) {
545 if (workload != nullptr) {
546 event_selection_set_.AddMonitoredProcesses({workload->GetPid()});
547 event_selection_set_.SetEnableCondition(false, true);
548 } else if (!app_package_name_.empty()) {
549 std::set<pid_t> pids = WaitForAppProcesses(app_package_name_);
550 event_selection_set_.AddMonitoredProcesses(pids);
551 } else {
552 LOG(ERROR) << "No threads to monitor. Try `simpleperf help stat` for help\n";
553 return false;
554 }
555 } else {
556 need_to_check_targets = true;
557 }
558
559 if (report_per_thread_) {
560 MonitorEachThread();
561 }
562
563 // 3. Open perf_event_files and output file if defined.
564 if (!event_selection_set_.OpenEventFiles()) {
565 return false;
566 }
567 std::unique_ptr<FILE, decltype(&fclose)> fp_holder(nullptr, fclose);
568 if (!output_filename_.empty()) {
569 fp_holder.reset(fopen(output_filename_.c_str(), "we"));
570 if (fp_holder == nullptr) {
571 PLOG(ERROR) << "failed to open " << output_filename_;
572 return false;
573 }
574 } else if (out_fd_ != -1) {
575 fp_holder.reset(fdopen(out_fd_.release(), "we"));
576 if (fp_holder == nullptr) {
577 PLOG(ERROR) << "failed to write output.";
578 return false;
579 }
580 }
581 FILE* fp = fp_holder ? fp_holder.get() : stdout;
582
583 // 4. Add signal/periodic Events.
584 IOEventLoop* loop = event_selection_set_.GetIOEventLoop();
585 std::chrono::time_point<std::chrono::steady_clock> start_time;
586 std::vector<CountersInfo> counters;
587 if (need_to_check_targets && !event_selection_set_.StopWhenNoMoreTargets()) {
588 return false;
589 }
590 auto exit_loop_callback = [loop]() { return loop->ExitLoop(); };
591 if (!loop->AddSignalEvents({SIGCHLD, SIGINT, SIGTERM, SIGHUP}, exit_loop_callback)) {
592 return false;
593 }
594 if (stop_signal_fd_ != -1) {
595 if (!loop->AddReadEvent(stop_signal_fd_, exit_loop_callback)) {
596 return false;
597 }
598 }
599 if (duration_in_sec_ != 0) {
600 if (!loop->AddPeriodicEvent(SecondToTimeval(duration_in_sec_), exit_loop_callback)) {
601 return false;
602 }
603 }
604 auto print_counters = [&]() {
605 auto end_time = std::chrono::steady_clock::now();
606 if (!event_selection_set_.ReadCounters(&counters)) {
607 return false;
608 }
609 double duration_in_sec =
610 std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time).count();
611 if (interval_only_values_) {
612 AdjustToIntervalOnlyValues(counters);
613 }
614 if (!ShowCounters(counters, duration_in_sec, fp)) {
615 return false;
616 }
617 return true;
618 };
619
620 if (interval_in_ms_ != 0) {
621 if (!loop->AddPeriodicEvent(SecondToTimeval(interval_in_ms_ / 1000.0), print_counters)) {
622 return false;
623 }
624 }
625
626 // 5. Count events while workload running.
627 start_time = std::chrono::steady_clock::now();
628 if (workload != nullptr && !workload->Start()) {
629 return false;
630 }
631 if (!loop->RunLoop()) {
632 return false;
633 }
634
635 // 6. Read and print counters.
636 if (interval_in_ms_ == 0) {
637 if (!print_counters()) {
638 return false;
639 }
640 }
641
642 // 7. Print warnings when needed.
643 event_selection_set_.CloseEventFiles();
644 CheckHardwareCounterMultiplexing();
645 PrintWarningForInaccurateEvents();
646
647 return true;
648 }
649
ParseOptions(const std::vector<std::string> & args,std::vector<std::string> * non_option_args,ProbeEvents & probe_events)650 bool StatCommand::ParseOptions(const std::vector<std::string>& args,
651 std::vector<std::string>* non_option_args,
652 ProbeEvents& probe_events) {
653 OptionValueMap options;
654 std::vector<std::pair<OptionName, OptionValue>> ordered_options;
655
656 if (!PreprocessOptions(args, GetStatCmdOptionFormats(), &options, &ordered_options,
657 non_option_args)) {
658 return false;
659 }
660
661 // Process options.
662 system_wide_collection_ = options.PullBoolValue("-a");
663
664 if (auto value = options.PullValue("--app"); value) {
665 app_package_name_ = *value->str_value;
666 }
667 csv_ = options.PullBoolValue("--csv");
668
669 if (!options.PullDoubleValue("--duration", &duration_in_sec_, 1e-9)) {
670 return false;
671 }
672 if (!options.PullDoubleValue("--interval", &interval_in_ms_, 1e-9)) {
673 return false;
674 }
675 interval_only_values_ = options.PullBoolValue("--interval-only-values");
676
677 in_app_context_ = options.PullBoolValue("--in-app");
678 for (const OptionValue& value : options.PullValues("--kprobe")) {
679 for (const auto& cmd : Split(*value.str_value, ",")) {
680 if (!probe_events.AddKprobe(cmd)) {
681 return false;
682 }
683 }
684 }
685 child_inherit_ = !options.PullBoolValue("--no-inherit");
686
687 if (auto value = options.PullValue("-o"); value) {
688 output_filename_ = *value->str_value;
689 }
690 if (auto value = options.PullValue("--out-fd"); value) {
691 out_fd_.reset(static_cast<int>(value->uint_value));
692 }
693
694 report_per_core_ = options.PullBoolValue("--per-core");
695 report_per_thread_ = options.PullBoolValue("--per-thread");
696
697 if (auto strs = options.PullStringValues("-p"); !strs.empty()) {
698 if (auto pids = GetPidsFromStrings(strs, true, true); pids) {
699 event_selection_set_.AddMonitoredProcesses(pids.value());
700 } else {
701 return false;
702 }
703 }
704 print_hw_counter_ = options.PullBoolValue("--print-hw-counter");
705
706 if (auto value = options.PullValue("--sort"); value) {
707 sort_keys_ = Split(*value->str_value, ",");
708 }
709
710 if (auto value = options.PullValue("--stop-signal-fd"); value) {
711 stop_signal_fd_.reset(static_cast<int>(value->uint_value));
712 }
713
714 for (const OptionValue& value : options.PullValues("-t")) {
715 if (auto tids = GetTidsFromString(*value.str_value, true); tids) {
716 event_selection_set_.AddMonitoredThreads(tids.value());
717 } else {
718 return false;
719 }
720 }
721
722 if (auto value = options.PullValue("--tracepoint-events"); value) {
723 if (!EventTypeManager::Instance().ReadTracepointsFromFile(*value->str_value)) {
724 return false;
725 }
726 }
727
728 use_devfreq_counters_ = options.PullBoolValue("--use-devfreq-counters");
729 verbose_mode_ = options.PullBoolValue("--verbose");
730
731 CHECK(options.values.empty());
732
733 // Process ordered options.
734 for (const auto& pair : ordered_options) {
735 const OptionName& name = pair.first;
736 const OptionValue& value = pair.second;
737
738 if (name == "--cpu") {
739 if (auto v = GetCpusFromString(*value.str_value); v) {
740 std::set<int>& cpus = v.value();
741 event_selection_set_.SetCpusForNewEvents(std::vector<int>(cpus.begin(), cpus.end()));
742 } else {
743 return false;
744 }
745 } else if (name == "-e") {
746 for (const auto& event_type : Split(*value.str_value, ",")) {
747 if (!probe_events.CreateProbeEventIfNotExist(event_type)) {
748 return false;
749 }
750 if (!event_selection_set_.AddEventType(event_type)) {
751 return false;
752 }
753 }
754 } else if (name == "--group") {
755 std::vector<std::string> event_types = Split(*value.str_value, ",");
756 for (const auto& event_type : event_types) {
757 if (!probe_events.CreateProbeEventIfNotExist(event_type)) {
758 return false;
759 }
760 }
761 if (!event_selection_set_.AddEventGroup(event_types)) {
762 return false;
763 }
764 } else if (name == "--tp-filter") {
765 if (!event_selection_set_.SetTracepointFilter(*value.str_value)) {
766 return false;
767 }
768 } else {
769 LOG(ERROR) << "unprocessed option: " << name;
770 return false;
771 }
772 }
773
774 if (system_wide_collection_ && event_selection_set_.HasMonitoredTarget()) {
775 LOG(ERROR) << "Stat system wide and existing processes/threads can't be "
776 "used at the same time.";
777 return false;
778 }
779 if (system_wide_collection_ && !IsRoot()) {
780 LOG(ERROR) << "System wide profiling needs root privilege.";
781 return false;
782 }
783
784 if (report_per_core_ || report_per_thread_) {
785 summary_comparator_ = BuildSummaryComparator(sort_keys_, report_per_thread_, report_per_core_);
786 if (!summary_comparator_) {
787 return false;
788 }
789 }
790 return true;
791 }
792
CheckHardwareCountersOnCpu(int cpu,size_t counters)793 std::optional<bool> CheckHardwareCountersOnCpu(int cpu, size_t counters) {
794 if (counters == 0) {
795 return true;
796 }
797 const EventType* event = FindEventTypeByName("cpu-cycles", true);
798 if (event == nullptr) {
799 return std::nullopt;
800 }
801 perf_event_attr attr = CreateDefaultPerfEventAttr(*event);
802 auto workload = Workload::CreateWorkload({"sleep", "0.1"});
803 if (!workload || !workload->SetCpuAffinity(cpu)) {
804 return std::nullopt;
805 }
806 std::vector<std::unique_ptr<EventFd>> event_fds;
807 for (size_t i = 0; i < counters; i++) {
808 EventFd* group_event_fd = event_fds.empty() ? nullptr : event_fds[0].get();
809 auto event_fd =
810 EventFd::OpenEventFile(attr, workload->GetPid(), cpu, group_event_fd, "cpu-cycles", false);
811 if (!event_fd) {
812 return false;
813 }
814 event_fds.emplace_back(std::move(event_fd));
815 }
816 if (!workload->Start() || !workload->WaitChildProcess(true, nullptr)) {
817 return std::nullopt;
818 }
819 for (auto& event_fd : event_fds) {
820 PerfCounter counter;
821 if (!event_fd->ReadCounter(&counter)) {
822 return std::nullopt;
823 }
824 if (counter.time_enabled == 0 || counter.time_enabled > counter.time_running) {
825 return false;
826 }
827 }
828 return true;
829 }
830
GetHardwareCountersOnCpu(int cpu)831 std::optional<size_t> GetHardwareCountersOnCpu(int cpu) {
832 size_t available_counters = 0;
833 while (true) {
834 std::optional<bool> result = CheckHardwareCountersOnCpu(cpu, available_counters + 1);
835 if (!result.has_value()) {
836 return std::nullopt;
837 }
838 if (!result.value()) {
839 break;
840 }
841 available_counters++;
842 }
843 return available_counters;
844 }
845
PrintHardwareCounters()846 void StatCommand::PrintHardwareCounters() {
847 for (int cpu : GetOnlineCpus()) {
848 std::optional<size_t> counters = GetHardwareCountersOnCpu(cpu);
849 if (!counters) {
850 // When built as a 32-bit program, we can't set sched_affinity to a 64-bit only CPU. So we
851 // may not be able to get hardware counters on that CPU.
852 LOG(WARNING) << "Failed to get CPU PMU hardware counters on cpu " << cpu;
853 continue;
854 }
855 printf("There are %zu CPU PMU hardware counters available on cpu %d.\n", counters.value(), cpu);
856 }
857 }
858
AddDefaultMeasuredEventTypes()859 bool StatCommand::AddDefaultMeasuredEventTypes() {
860 for (auto& name : default_measured_event_types) {
861 // It is not an error when some event types in the default list are not
862 // supported by the kernel.
863 const EventType* type = FindEventTypeByName(name);
864 if (type != nullptr && IsEventAttrSupported(CreateDefaultPerfEventAttr(*type), name)) {
865 if (!event_selection_set_.AddEventType(name)) {
866 return false;
867 }
868 }
869 }
870 if (event_selection_set_.empty()) {
871 LOG(ERROR) << "Failed to add any supported default measured types";
872 return false;
873 }
874 return true;
875 }
876
SetEventSelectionFlags()877 void StatCommand::SetEventSelectionFlags() {
878 event_selection_set_.SetInherit(child_inherit_);
879 }
880
MonitorEachThread()881 void StatCommand::MonitorEachThread() {
882 std::vector<pid_t> threads;
883 for (auto pid : event_selection_set_.GetMonitoredProcesses()) {
884 for (auto tid : GetThreadsInProcess(pid)) {
885 ThreadInfo info;
886 if (GetThreadName(tid, &info.name)) {
887 info.tid = tid;
888 info.pid = pid;
889 thread_info_[tid] = std::move(info);
890 threads.push_back(tid);
891 }
892 }
893 }
894 for (auto tid : event_selection_set_.GetMonitoredThreads()) {
895 ThreadInfo info;
896 if (ReadThreadNameAndPid(tid, &info.name, &info.pid)) {
897 info.tid = tid;
898 thread_info_[tid] = std::move(info);
899 threads.push_back(tid);
900 }
901 }
902 event_selection_set_.ClearMonitoredTargets();
903 event_selection_set_.AddMonitoredThreads(threads);
904 }
905
AdjustToIntervalOnlyValues(std::vector<CountersInfo> & counters)906 void StatCommand::AdjustToIntervalOnlyValues(std::vector<CountersInfo>& counters) {
907 if (last_sum_values_.size() < counters.size()) {
908 last_sum_values_.resize(counters.size());
909 }
910 for (size_t i = 0; i < counters.size(); i++) {
911 std::vector<CounterInfo>& counters_per_event = counters[i].counters;
912 std::vector<CounterSum>& last_sum = last_sum_values_[i];
913
914 if (last_sum.size() < counters_per_event.size()) {
915 last_sum.resize(counters_per_event.size());
916 }
917 for (size_t j = 0; j < counters_per_event.size(); j++) {
918 PerfCounter& counter = counters_per_event[j].counter;
919 CounterSum new_sum;
920 new_sum.FromCounter(counter);
921 CounterSum delta = new_sum - last_sum[j];
922 delta.ToCounter(counter);
923 last_sum[j] = new_sum;
924 }
925 }
926 }
927
ShowCounters(const std::vector<CountersInfo> & counters,double duration_in_sec,FILE * fp)928 bool StatCommand::ShowCounters(const std::vector<CountersInfo>& counters, double duration_in_sec,
929 FILE* fp) {
930 if (csv_) {
931 fprintf(fp, "Performance counter statistics,\n");
932 } else {
933 fprintf(fp, "Performance counter statistics:\n\n");
934 }
935
936 if (verbose_mode_) {
937 for (auto& counters_info : counters) {
938 for (auto& counter_info : counters_info.counters) {
939 if (csv_) {
940 fprintf(fp,
941 "%s,tid,%d,cpu,%d,count,%" PRIu64 ",time_enabled,%" PRIu64
942 ",time running,%" PRIu64 ",id,%" PRIu64 ",\n",
943 counters_info.event_name.c_str(), counter_info.tid, counter_info.cpu,
944 counter_info.counter.value, counter_info.counter.time_enabled,
945 counter_info.counter.time_running, counter_info.counter.id);
946 } else {
947 fprintf(fp,
948 "%s(tid %d, cpu %d): count %" PRIu64 ", time_enabled %" PRIu64
949 ", time running %" PRIu64 ", id %" PRIu64 "\n",
950 counters_info.event_name.c_str(), counter_info.tid, counter_info.cpu,
951 counter_info.counter.value, counter_info.counter.time_enabled,
952 counter_info.counter.time_running, counter_info.counter.id);
953 }
954 }
955 }
956 }
957
958 CounterSummaryBuilder builder(report_per_thread_, report_per_core_, csv_, thread_info_,
959 summary_comparator_);
960 for (const auto& info : counters) {
961 builder.AddCountersForOneEventType(info);
962 }
963 CounterSummaries summaries(builder.Build(), csv_);
964 summaries.AutoGenerateSummaries();
965 summaries.GenerateComments(duration_in_sec);
966 summaries.Show(fp);
967
968 if (csv_) {
969 fprintf(fp, "Total test time,%lf,seconds,\n", duration_in_sec);
970 } else {
971 fprintf(fp, "\nTotal test time: %lf seconds.\n", duration_in_sec);
972 }
973 return true;
974 }
975
CheckHardwareCounterMultiplexing()976 void StatCommand::CheckHardwareCounterMultiplexing() {
977 for (const auto& [cpu, hardware_events] : event_selection_set_.GetHardwareCountersForCpus()) {
978 std::optional<bool> result = CheckHardwareCountersOnCpu(cpu, hardware_events);
979 if (result.has_value() && !result.value()) {
980 LOG(WARNING) << "It seems the number of hardware events are more than the number of\n"
981 << "available CPU PMU hardware counters. That will trigger hardware counter\n"
982 << "multiplexing. As a result, events are not counted all the time processes\n"
983 << "running, and event counts are smaller than what really happen.\n"
984 << "Use --print-hw-counter to show available hardware counters.\n"
985 #if defined(__ANDROID__)
986 << "If on a rooted device, try --use-devfreq-counters to get more counters.\n"
987 #endif
988 ;
989 break;
990 }
991 }
992 }
993
PrintWarningForInaccurateEvents()994 void StatCommand::PrintWarningForInaccurateEvents() {
995 for (const EventType* event : event_selection_set_.GetEvents()) {
996 if (event->name == "raw-l3d-cache-lmiss-rd") {
997 LOG(WARNING) << "PMU event L3D_CACHE_LMISS_RD might undercount on A510. Please use "
998 "L3D_CACHE_REFILL_RD instead.";
999 break;
1000 }
1001 }
1002 }
1003
1004 } // namespace
1005
RegisterStatCommand()1006 void RegisterStatCommand() {
1007 RegisterCommand("stat", [] { return std::unique_ptr<Command>(new StatCommand); });
1008 }
1009
1010 } // namespace simpleperf
1011