1#!/usr/bin/env python3
2#
3# Copyright (C) 2017 The Android Open Source Project
4#
5# Licensed under the Apache License, Version 2.0 (the "License");
6# you may not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9#      http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16#
17
18"""pprof_proto_generator.py: read perf.data, generate pprof.profile, which can be
19    used by pprof.
20
21  Example:
22    ./app_profiler.py
23    ./pprof_proto_generator.py
24    pprof -text pprof.profile
25"""
26
27import logging
28import os
29import os.path
30import re
31import sys
32
33from simpleperf_report_lib import GetReportLib
34from simpleperf_utils import (Addr2Nearestline, BaseArgumentParser, BinaryFinder, extant_dir,
35                              flatten_arg_list, log_exit, ReadElf, ToolFinder)
36try:
37    import profile_pb2
38except ImportError as e:
39    log_exit(f'{e}\nprotobuf package is missing or too old. Please install it like ' +
40             '`pip install protobuf==4.21`.')
41
42
# Units reported to pprof for some common event names. Events that are not listed here
# are given the unit 'count' by PprofProfileGenerator.get_sample_type_id().
EVENT_UNITS = dict((
    ('cpu-clock', 'nanoseconds'),
    ('cpu-cycles', 'cpu-cycles'),
    ('instructions', 'instructions'),
    ('task-clock', 'nanoseconds'),
))
50
51
def load_pprof_profile(filename):
    """Read a serialized pprof profile from a file.

    Args:
        filename: path of a file containing a serialized profile_pb2.Profile message.

    Returns:
        A profile_pb2.Profile message parsed from the file content.
    """
    parsed = profile_pb2.Profile()
    with open(filename, 'rb') as profile_file:
        serialized = profile_file.read()
    parsed.ParseFromString(serialized)
    return parsed
57
58
def store_pprof_profile(filename, profile):
    """Write `profile` to `filename` in serialized form.

    Args:
        filename: path of the output file. An existing file is overwritten.
        profile: an object providing SerializeToString(), e.g. a profile_pb2.Profile message.
    """
    with open(filename, 'wb') as output:
        payload = profile.SerializeToString()
        output.write(payload)
62
63
class PprofProfilePrinter(object):
    """Print the content of a pprof profile message to stdout in a readable text form.

    Entries referenced by id (locations, mappings and functions) are expanded below the
    referencing entry, indented by two more spaces. A reference with id 0, or with an id
    that no entry in the profile has, is printed as a bare id without expansion.
    """

    def __init__(self, profile):
        """
        Args:
            profile: the profile message to print. Its sample_type, sample, mapping,
                location, function, string_table and comment fields are read.
        """
        self.profile = profile
        self.string_table = profile.string_table

    def show(self):
        """Print all sections of the profile."""
        p = self.profile
        sub_space = '  '
        print('Profile {')
        print('%d sample_types' % len(p.sample_type))
        for i, sample_type in enumerate(p.sample_type):
            print('sample_type[%d] = ' % i, end='')
            self.show_value_type(sample_type)
        print('%d samples' % len(p.sample))
        for i, sample in enumerate(p.sample):
            print('sample[%d]:' % i)
            self.show_sample(sample, sub_space)
        print('%d mappings' % len(p.mapping))
        for i, mapping in enumerate(p.mapping):
            print('mapping[%d]:' % i)
            self.show_mapping(mapping, sub_space)
        print('%d locations' % len(p.location))
        for i, location in enumerate(p.location):
            print('location[%d]:' % i)
            self.show_location(location, sub_space)
        # Print the element count like every other section does.
        print('%d functions' % len(p.function))
        for i, function in enumerate(p.function):
            print('function[%d]:' % i)
            self.show_function(function, sub_space)
        print('%d strings' % len(p.string_table))
        for i, string in enumerate(p.string_table):
            print('string[%d]: %s' % (i, string))
        print('drop_frames: %s' % self.string(p.drop_frames))
        print('keep_frames: %s' % self.string(p.keep_frames))
        print('time_nanos: %u' % p.time_nanos)
        print('duration_nanos: %u' % p.duration_nanos)
        print('period_type: ', end='')
        self.show_value_type(p.period_type)
        print('period: %u' % p.period)
        for i, comment in enumerate(p.comment):
            print('comment[%d] = %s' % (i, self.string(comment)))
        print('default_sample_type: %d' % p.default_sample_type)
        print('} // Profile')
        print()

    def show_value_type(self, value_type, space=''):
        """Print the string ids and the resolved strings of a value type on one line."""
        print('%sValueType(typeID=%d, unitID=%d, type=%s, unit=%s)' %
              (space, value_type.type, value_type.unit,
               self.string(value_type.type), self.string(value_type.unit)))

    def show_sample(self, sample, space=''):
        """Print the locations (expanded), values and labels of a sample."""
        sub_space = space + '  '
        for i, location_id in enumerate(sample.location_id):
            print('%slocation_id[%d]: id %d' % (space, i, location_id))
            self.show_location_id(location_id, sub_space)
        for i, value in enumerate(sample.value):
            print('%svalue[%d] = %d' % (space, i, value))
        for i, label in enumerate(sample.label):
            print('%slabel[%d] = %s:%s' % (space, i, self.string(label.key),
                                           self.string(label.str)))

    def show_location_id(self, location_id, space=''):
        """Print the location having the given id. Print nothing if there is none."""
        location = self._find_by_id(self.profile.location, location_id)
        if location is not None:
            self.show_location(location, space)

    def show_location(self, location, space=''):
        """Print a location with its mapping and lines expanded."""
        sub_space = space + '  '
        print('%sid: %d' % (space, location.id))
        print('%smapping_id: %d' % (space, location.mapping_id))
        self.show_mapping_id(location.mapping_id, sub_space)
        print('%saddress: %x' % (space, location.address))
        for i, line in enumerate(location.line):
            print('%sline[%d]:' % (space, i))
            self.show_line(line, sub_space)

    def show_mapping_id(self, mapping_id, space=''):
        """Print the mapping having the given id. Print nothing if there is none."""
        mapping = self._find_by_id(self.profile.mapping, mapping_id)
        if mapping is not None:
            self.show_mapping(mapping, space)

    def show_mapping(self, mapping, space=''):
        """Print all fields of a mapping. String fields are shown as 'string(string id)'."""
        print('%sid: %d' % (space, mapping.id))
        print('%smemory_start: %x' % (space, mapping.memory_start))
        print('%smemory_limit: %x' % (space, mapping.memory_limit))
        print('%sfile_offset: %x' % (space, mapping.file_offset))
        print('%sfilename: %s(%d)' % (space, self.string(mapping.filename),
                                      mapping.filename))
        print('%sbuild_id: %s(%d)' % (space, self.string(mapping.build_id),
                                      mapping.build_id))
        print('%shas_functions: %s' % (space, mapping.has_functions))
        print('%shas_filenames: %s' % (space, mapping.has_filenames))
        print('%shas_line_numbers: %s' % (space, mapping.has_line_numbers))
        print('%shas_inline_frames: %s' % (space, mapping.has_inline_frames))

    def show_line(self, line, space=''):
        """Print a line entry with its function expanded."""
        sub_space = space + '  '
        print('%sfunction_id: %d' % (space, line.function_id))
        self.show_function_id(line.function_id, sub_space)
        print('%sline: %d' % (space, line.line))

    def show_function_id(self, function_id, space=''):
        """Print the function having the given id. Print nothing if there is none."""
        function = self._find_by_id(self.profile.function, function_id)
        if function is not None:
            self.show_function(function, space)

    def show_function(self, function, space=''):
        """Print all fields of a function, with string ids resolved to strings."""
        print('%sid: %d' % (space, function.id))
        print('%sname: %s' % (space, self.string(function.name)))
        print('%ssystem_name: %s' % (space, self.string(function.system_name)))
        print('%sfilename: %s' % (space, self.string(function.filename)))
        print('%sstart_line: %d' % (space, function.start_line))

    def string(self, string_id):
        """Return the string stored at index string_id of the profile's string table."""
        return self.string_table[string_id]

    @staticmethod
    def _find_by_id(items, item_id):
        """Return the entry in `items` whose id equals item_id, or None.

        Ids less than 1 are treated as "no entry", so a reference with id 0 never resolves
        to the last entry through negative indexing.
        """
        if item_id <= 0:
            return None
        # Fast path: profiles written by this script store the entry with id N at index N - 1.
        index = item_id - 1
        if index < len(items) and items[index].id == item_id:
            return items[index]
        # Fall back to a scan for profiles using another id scheme.
        for item in items:
            if item.id == item_id:
                return item
        return None
176
177
class Label(object):
    """A label of a sample, stored as a pair of string ids."""

    def __init__(self, key_id: int, str_id: int):
        # key_id: see profile.Label.key
        # str_id: see profile.Label.str
        self.key_id, self.str_id = key_id, str_id
184
185
class Sample(object):
    """A call stack with its accumulated values and labels.

    Attributes:
        location_ids: ids of the locations in the call stack, in the order they were added.
        values: dict mapping a sample type id to the value accumulated for that type.
        labels: label objects, each having key_id and str_id attributes.
    """

    def __init__(self):
        self.location_ids = []
        self.values = {}
        self.labels = []

    def add_location_id(self, location_id):
        """Append a location to the call stack."""
        self.location_ids.append(location_id)

    def add_value(self, sample_type_id, value):
        """Add `value` to the value accumulated for `sample_type_id`."""
        self.values[sample_type_id] = self.values.get(sample_type_id, 0) + value

    def add_values(self, values):
        """Add every value in the dict `values` (sample type id -> value)."""
        for sample_type_id, value in values.items():
            self.add_value(sample_type_id, value)

    @property
    def key(self):
        """Hashable key deciding which samples can be merged into one.

        The labels are part of the key. Otherwise samples having the same call stack but
        different labels would be merged, and their values would be attributed to the
        labels of whichever sample was seen first.
        """
        label_key = tuple((label.key_id, label.str_id) for label in self.labels)
        return (tuple(self.location_ids), label_key)
206
207
class Location(object):
    """An address inside a mapping, together with the line entries describing it."""

    def __init__(self, mapping_id, address, vaddr_in_dso):
        # id stays -1 until the location is registered by the profile generator.
        self.id, self.mapping_id = -1, mapping_id
        self.address, self.vaddr_in_dso = address, vaddr_in_dso
        self.lines = []

    @property
    def key(self):
        """Pair (mapping_id, address) identifying the location."""
        identity = (self.mapping_id, self.address)
        return identity
220
221
class Line(object):
    """A (function, source line) pair attached to a location. Both fields start at 0."""

    def __init__(self):
        self.function_id = self.line = 0
227
228
class Mapping(object):
    """A memory range mapped from a file. filename_id and build_id_id are string ids."""

    def __init__(self, start, end, pgoff, filename_id, build_id_id):
        # id stays -1 until the mapping is registered by the profile generator.
        self.id = -1
        self.memory_start, self.memory_limit = start, end
        self.file_offset = pgoff
        self.filename_id, self.build_id_id = filename_id, build_id_id

    @property
    def key(self):
        """Tuple of all fields except id, identifying the mapping."""
        memory_range = (self.memory_start, self.memory_limit, self.file_offset)
        return memory_range + (self.filename_id, self.build_id_id)
247
248
class Function(object):
    """A function in a binary. name_id, dso_name_id and source_filename_id are string ids."""

    def __init__(self, name_id, dso_name_id, vaddr_in_dso):
        # id stays -1 until the function is registered by the profile generator.
        self.id = -1
        self.name_id, self.dso_name_id = name_id, dso_name_id
        self.vaddr_in_dso = vaddr_in_dso
        # Source information is unknown (0) at creation.
        self.source_filename_id = self.start_line = 0

    @property
    def key(self):
        """Pair (name_id, dso_name_id) identifying the function."""
        identity = (self.name_id, self.dso_name_id)
        return identity
262
263
264# pylint: disable=no-member
class PprofProfileGenerator(object):
    """Convert simpleperf record files to a pprof profile message.

    Usage: create a generator with a config dict, call load_record_file() once for each
    record file, then call gen() to get the profile.

    Config keys read by this class:
        ndk_path: passed to ReadElf, ToolFinder and Addr2Nearestline.
        max_chain_length: max number of call chain entries converted for each sample.
        report_lib_options: passed to SetReportOptions() of the report lib.
        dso_filters (optional): names of the binaries used to filter samples.
        show_art_frames (optional): when true, ShowArtFrames() is called on the report lib.
    config['binary_cache_dir'] is overwritten by the constructor.
    """

    def __init__(self, config):
        """
        Args:
            config: dict of options, see the class docstring. It is modified in place:
                'binary_cache_dir' is set to 'binary_cache' if that directory exists in
                the current directory, otherwise to None.
        """
        self.config = config
        self.lib = None

        config['binary_cache_dir'] = 'binary_cache'
        if not os.path.isdir(config['binary_cache_dir']):
            config['binary_cache_dir'] = None
        self.dso_filter = set(config['dso_filters']) if config.get('dso_filters') else None
        self.max_chain_length = config['max_chain_length']
        self.profile = profile_pb2.Profile()
        # String id 0 is the empty string.
        self.profile.string_table.append('')
        # Map from string to its index in self.profile.string_table.
        self.string_table = {}
        # Map from event name to the id of the first of its two sample types.
        self.sample_types = {}
        # For samples, locations, mappings and functions: *_map is keyed by the object's
        # `key` property and is used to find duplicates, *_list keeps creation order.
        self.sample_map = {}
        self.sample_list = []
        self.location_map = {}
        self.location_list = []
        self.mapping_map = {}
        self.mapping_list = []
        self.function_map = {}
        self.function_list = []

        # Map from dso_name in perf.data to (binary path, build_id).
        self.binary_map = {}
        self.read_elf = ReadElf(self.config['ndk_path'])
        self.binary_finder = BinaryFinder(config['binary_cache_dir'], self.read_elf)

    def load_record_file(self, record_file):
        """Read all samples in a record file and aggregate them into self.sample_list.

        Also adds comments describing the conversion to the profile, and sets the profile
        time when the record file provides a timestamp.

        The report lib is closed and self.lib is reset to None when this method returns,
        including when it returns because of an exception.
        """
        self.lib = GetReportLib(record_file)
        try:
            self._setup_report_lib()
            self._add_meta_info()

            numbers_re = re.compile(r"\d+")
            # Process all samples in perf.data, aggregate samples.
            while True:
                report_sample = self.lib.GetNextSample()
                if report_sample is None:
                    break
                self._add_report_sample(report_sample, numbers_re)
        finally:
            # Don't leave the report lib open when reading fails halfway.
            self.lib.Close()
            self.lib = None

    def _setup_report_lib(self):
        """Apply the options in self.config to the report lib opened in self.lib."""
        binary_cache_dir = self.config['binary_cache_dir']
        if binary_cache_dir:
            self.lib.SetSymfs(binary_cache_dir)
            kallsyms = os.path.join(binary_cache_dir, 'kallsyms')
            if os.path.isfile(kallsyms):
                self.lib.SetKallsymsFile(kallsyms)

        if self.config.get('show_art_frames'):
            self.lib.ShowArtFrames()
        self.lib.SetReportOptions(self.config['report_lib_options'])

    def _add_meta_info(self):
        """Add comments and the recording timestamp of the current record file to the profile."""
        comments = [
            "Simpleperf Record Command:\n" + self.lib.GetRecordCmd(),
            "Converted to pprof with:\n" + " ".join(sys.argv),
            "Architecture:\n" + self.lib.GetArch(),
        ]
        meta_info = self.lib.MetaInfo()
        if "app_versioncode" in meta_info:
            comments.append("App Version Code:\n" + meta_info["app_versioncode"])
        for comment in comments:
            self.profile.comment.append(self.get_string_id(comment))
        if "timestamp" in meta_info:
            # The value is multiplied by 10^9 to get nanoseconds, so it is presumably in
            # seconds. NOTE(review): confirm the unit of meta_info["timestamp"].
            self.profile.time_nanos = int(meta_info["timestamp"]) * 1000 * 1000 * 1000

    def _add_report_sample(self, report_sample, numbers_re):
        """Convert the report lib's current sample to a Sample and aggregate it.

        Args:
            report_sample: the sample just returned by self.lib.GetNextSample().
            numbers_re: compiled regex matching runs of digits, used to build the
                threadpool label.
        """
        event = self.lib.GetEventOfCurrentSample()
        symbol = self.lib.GetSymbolOfCurrentSample()
        callchain = self.lib.GetCallChainOfCurrentSample()

        # Each event has two values: the number of samples and the sum of sample periods.
        sample_type_id = self.get_sample_type_id(event.name)
        sample = Sample()
        sample.add_value(sample_type_id, 1)
        sample.add_value(sample_type_id + 1, report_sample.period)

        thread_name = report_sample.thread_comm
        label_values = (
            ("thread", thread_name),
            # Heuristic: threadpools doing similar work are often named as
            # name-1, name-2, name-3. Combine threadpools into one label
            # "name-%d" if they only differ by a number.
            ("threadpool", numbers_re.sub("%d", thread_name)),
            ("pid", str(report_sample.pid)),
            ("tid", str(report_sample.tid)),
        )
        for label_key, label_value in label_values:
            sample.labels.append(
                Label(self.get_string_id(label_key), self.get_string_id(label_value)))

        # NOTE(review): the dso filter is tested only against the sampled (leaf) symbol. The
        # original loop over the call chain also tested `symbol` instead of `entry.symbol`,
        # so call chain entries are kept or dropped together with the leaf. That behavior is
        # preserved here; confirm whether filtering each entry was intended.
        if self._filter_symbol(symbol):
            sample.add_location_id(self.get_location_id(report_sample.ip, symbol))
            # Only the last max_chain_length entries of the call chain are converted.
            first_entry = max(0, callchain.nr - self.max_chain_length)
            for i in range(first_entry, callchain.nr):
                entry = callchain.entries[i]
                sample.add_location_id(self.get_location_id(entry.ip, entry.symbol))
        if sample.location_ids:
            self.add_sample(sample)

    def gen(self, jobs: int):
        """Build the profile message from the data loaded by load_record_file().

        Args:
            jobs: passed to Addr2Nearestline.convert_addrs_to_lines() when generating
                source line information.

        Returns:
            The profile_pb2.Profile message owned by this generator.
        """
        # 1. Generate line info for locations and functions.
        self.gen_source_lines(jobs)

        # 2. Produce samples/locations/functions in profile.
        for sample in self.sample_list:
            self.gen_profile_sample(sample)
        for mapping in self.mapping_list:
            self.gen_profile_mapping(mapping)
        for location in self.location_list:
            self.gen_profile_location(location)
        for function in self.function_list:
            self.gen_profile_function(function)

        return self.profile

    def _filter_symbol(self, symbol):
        """Return True if there is no dso filter or symbol.dso_name is in the filter."""
        return not self.dso_filter or symbol.dso_name in self.dso_filter

    def get_string_id(self, str_value):
        """Return the string table index of str_value, adding it to the table if needed.

        Empty (or None) strings map to index 0.
        """
        if not str_value:
            return 0
        str_id = self.string_table.get(str_value)
        if str_id is None:
            # Index 0 of the profile's string table holds '', which isn't in the dict.
            str_id = len(self.string_table) + 1
            self.string_table[str_value] = str_id
            self.profile.string_table.append(str_value)
        return str_id

    def get_string(self, str_id):
        """Return the string stored at index str_id of the profile's string table."""
        return self.profile.string_table[str_id]

    def get_sample_type_id(self, name):
        """Return the id of the first sample type of event `name`, creating both if needed.

        Two consecutive sample types are created for each event: '<name>_samples' with unit
        'samples' at the returned id, and '<name>' with the unit found in EVENT_UNITS
        ('count' if not listed) at the returned id + 1.
        """
        sample_type_id = self.sample_types.get(name)
        if sample_type_id is not None:
            return sample_type_id
        sample_type_id = len(self.profile.sample_type)
        count_type = self.profile.sample_type.add()
        count_type.type = self.get_string_id(name + '_samples')
        count_type.unit = self.get_string_id('samples')
        period_type = self.profile.sample_type.add()
        period_type.type = self.get_string_id(name)
        period_type.unit = self.get_string_id(EVENT_UNITS.get(name, 'count'))
        self.sample_types[name] = sample_type_id
        return sample_type_id

    def get_location_id(self, ip, symbol):
        """Return the id of the location for address `ip` in `symbol`, creating it if needed.

        The mapping and the function of the symbol are registered as a side effect.
        """
        binary_path, build_id = self.get_binary(symbol.dso_name)
        mapping_id = self.get_mapping_id(symbol.mapping[0], binary_path, build_id)
        location = Location(mapping_id, ip, symbol.vaddr_in_file)
        function_id = self.get_function_id(symbol.symbol_name, binary_path, symbol.symbol_addr)
        if function_id:
            # Add Line only when it has a valid function id, see http://b/36988814.
            # Default line info only contains the function name
            line = Line()
            line.function_id = function_id
            location.lines.append(line)

        exist_location = self.location_map.get(location.key)
        if exist_location is not None:
            return exist_location.id
        # location_id starts from 1
        location.id = len(self.location_list) + 1
        self.location_list.append(location)
        self.location_map[location.key] = location
        return location.id

    def get_mapping_id(self, report_mapping, filename, build_id):
        """Return the id of the mapping described by the arguments, creating it if needed.

        Args:
            report_mapping: object providing the start, end and pgoff of the mapping.
            filename: path of the mapped binary.
            build_id: build id of the mapped binary.
        """
        filename_id = self.get_string_id(filename)
        build_id_id = self.get_string_id(build_id)
        mapping = Mapping(report_mapping.start, report_mapping.end,
                          report_mapping.pgoff, filename_id, build_id_id)
        exist_mapping = self.mapping_map.get(mapping.key)
        if exist_mapping is not None:
            return exist_mapping.id
        # mapping_id starts from 1
        mapping.id = len(self.mapping_list) + 1
        self.mapping_list.append(mapping)
        self.mapping_map[mapping.key] = mapping
        return mapping.id

    def get_binary(self, dso_name):
        """ Return (binary_path, build_id) for a given dso_name. """
        cached = self.binary_map.get(dso_name)
        if cached is not None:
            return cached

        binary_path = dso_name
        build_id = self.lib.GetBuildIdForPath(dso_name)
        # Try elf_path in binary cache.
        elf_path = self.binary_finder.find_binary(dso_name, build_id)
        if elf_path:
            binary_path = str(elf_path)

        # The build ids in perf.data are padded to 20 bytes, but pprof needs without padding.
        build_id = ReadElf.unpad_build_id(build_id)
        result = (binary_path, build_id)
        self.binary_map[dso_name] = result
        return result

    def get_mapping(self, mapping_id):
        """Return the Mapping having mapping_id, or None if mapping_id isn't positive."""
        return self.mapping_list[mapping_id - 1] if mapping_id > 0 else None

    def get_function_id(self, name, dso_name, vaddr_in_file):
        """Return the id of function `name` in `dso_name`, creating the function if needed.

        Returns 0 for the name 'unknown', without creating a function.
        """
        if name == 'unknown':
            return 0
        function = Function(self.get_string_id(name), self.get_string_id(dso_name), vaddr_in_file)
        exist_function = self.function_map.get(function.key)
        if exist_function is not None:
            return exist_function.id
        # function_id starts from 1
        function.id = len(self.function_list) + 1
        self.function_list.append(function)
        self.function_map[function.key] = function
        return function.id

    def get_function(self, function_id):
        """Return the Function having function_id, or None if function_id isn't positive."""
        return self.function_list[function_id - 1] if function_id > 0 else None

    def add_sample(self, sample):
        """Add a sample, merging its values into an existing sample having the same key."""
        exist_sample = self.sample_map.get(sample.key)
        if exist_sample is not None:
            exist_sample.add_values(sample.values)
        else:
            self.sample_list.append(sample)
            self.sample_map[sample.key] = sample

    def gen_source_lines(self, jobs: int):
        """Add source file and line information to locations and functions.

        Nothing is done (except logging the reason) when there is no binary cache or when
        llvm-symbolizer can't be found.

        Args:
            jobs: passed to Addr2Nearestline.convert_addrs_to_lines().
        """
        # 1. Create Addr2line instance
        if not self.config.get('binary_cache_dir'):
            logging.info("Can't generate line information because binary_cache is missing.")
            return
        if not ToolFinder.find_tool_path('llvm-symbolizer', self.config['ndk_path']):
            logging.info("Can't generate line information because can't find llvm-symbolizer.")
            return
        # We have changed dso names to paths in binary_cache in self.get_binary(). So no need to
        # pass binary_cache_dir to BinaryFinder.
        binary_finder = BinaryFinder(None, self.read_elf)
        # NOTE(review): the meaning of the third argument (True) isn't visible in this file.
        addr2line = Addr2Nearestline(self.config['ndk_path'], binary_finder, True)

        # 2. Put all needed addresses to it.
        for location in self.location_list:
            if not location.lines:
                continue
            mapping = self.get_mapping(location.mapping_id)
            dso_name = self.get_string(mapping.filename_id)
            function = self.get_function(location.lines[0].function_id)
            addr2line.add_addr(dso_name, None, function.vaddr_in_dso, location.vaddr_in_dso)
        for function in self.function_list:
            dso_name = self.get_string(function.dso_name_id)
            addr2line.add_addr(dso_name, None, function.vaddr_in_dso, function.vaddr_in_dso)

        # 3. Generate source lines.
        addr2line.convert_addrs_to_lines(jobs)

        # 4. Annotate locations and functions.
        self._annotate_locations(addr2line)
        self._annotate_functions(addr2line)

    def _annotate_locations(self, addr2line):
        """Replace the default line info of each location with the sources found by addr2line."""
        for location in self.location_list:
            if not location.lines:
                continue
            mapping = self.get_mapping(location.mapping_id)
            dso_name = self.get_string(mapping.filename_id)
            dso = addr2line.get_dso(dso_name)
            if not dso:
                continue
            sources = addr2line.get_addr_source(dso, location.vaddr_in_dso)
            if not sources:
                continue
            for i, (source_file, source_line, function_name) in enumerate(sources):
                if i == 0:
                    # Don't override original function name from report library, which is more
                    # accurate when proguard mapping file is given.
                    function_id = location.lines[0].function_id
                    # Clear default line info.
                    location.lines.clear()
                else:
                    function_id = self.get_function_id(function_name, dso_name, 0)
                if function_id == 0:
                    continue
                location.lines.append(self.add_line(source_file, source_line, function_id))

    def _annotate_functions(self, addr2line):
        """Set the source file and start line of each function having a known address."""
        for function in self.function_list:
            if not function.vaddr_in_dso:
                continue
            dso = addr2line.get_dso(self.get_string(function.dso_name_id))
            if not dso:
                continue
            sources = addr2line.get_addr_source(dso, function.vaddr_in_dso)
            if sources:
                source_file, source_line, _ = sources[0]
                function.source_filename_id = self.get_string_id(source_file)
                function.start_line = source_line

    def add_line(self, source_file, source_line, function_id):
        """Create a Line for function_id at source_line.

        source_file is also recorded as the source file of the function.
        """
        function = self.get_function(function_id)
        function.source_filename_id = self.get_string_id(source_file)
        line = Line()
        line.function_id = function_id
        line.line = source_line
        return line

    def gen_profile_sample(self, sample):
        """Append `sample` to the profile message."""
        profile_sample = self.profile.sample.add()
        profile_sample.location_id.extend(sample.location_ids)
        # Every event owns two sample types. Types without a value in this sample get 0.
        values = [0] * (len(self.sample_types) * 2)
        for sample_type_id, value in sample.values.items():
            values[sample_type_id] = value
        profile_sample.value.extend(values)

        for sample_label in sample.labels:
            label = profile_sample.label.add()
            label.key = sample_label.key_id
            label.str = sample_label.str_id

    def gen_profile_mapping(self, mapping):
        """Append `mapping` to the profile message."""
        profile_mapping = self.profile.mapping.add()
        profile_mapping.id = mapping.id
        profile_mapping.memory_start = mapping.memory_start
        profile_mapping.memory_limit = mapping.memory_limit
        profile_mapping.file_offset = mapping.file_offset
        profile_mapping.filename = mapping.filename_id
        profile_mapping.build_id = mapping.build_id_id
        profile_mapping.has_filenames = True
        profile_mapping.has_functions = True
        # Line numbers and inline frames are reported only when a binary cache is used.
        has_line_info = bool(self.config.get('binary_cache_dir'))
        profile_mapping.has_line_numbers = has_line_info
        profile_mapping.has_inline_frames = has_line_info

    def gen_profile_location(self, location):
        """Append `location` and its lines to the profile message."""
        profile_location = self.profile.location.add()
        profile_location.id = location.id
        profile_location.mapping_id = location.mapping_id
        profile_location.address = location.address
        for location_line in location.lines:
            line = profile_location.line.add()
            line.function_id = location_line.function_id
            line.line = location_line.line

    def gen_profile_function(self, function):
        """Append `function` to the profile message. system_name is set to the same as name."""
        profile_function = self.profile.function.add()
        profile_function.id = function.id
        profile_function.name = function.name_id
        profile_function.system_name = function.name_id
        profile_function.filename = function.source_filename_id
        profile_function.start_line = function.start_line
616
def main():
    """Command line entry: print an existing pprof profile, or generate one from record files."""
    parser = BaseArgumentParser(description='Generate pprof profile data in pprof.profile.')
    parser.add_argument('--show', nargs='?', action='append', help='print existing pprof.profile.')
    parser.add_argument('-i', '--record_file', nargs='+', default=['perf.data'], help="""
        Set profiling data file to report. Default is perf.data""")
    parser.add_argument('-o', '--output_file', default='pprof.profile', help="""
        The path of generated pprof profile data.""")
    # The default of --max_chain_length is a large value standing in for infinity.
    parser.add_argument('--max_chain_length', type=int, default=1000000000, help="""
        Maximum depth of samples to be converted.""")
    parser.add_argument('--ndk_path', type=extant_dir, help='Set the path of a ndk release.')
    parser.add_argument(
        '-j', '--jobs', type=int, default=os.cpu_count(),
        help='Use multithreading to speed up source code annotation.')
    sample_filter_group = parser.add_argument_group('Sample filter options')
    sample_filter_group.add_argument('--dso', nargs='+', action='append', help="""
        Use samples only in selected binaries.""")
    parser.add_report_lib_options(sample_filter_group=sample_filter_group)
    args = parser.parse_args()

    if args.show:
        # --show without a value is stored as None, which selects the default file. Only the
        # first --show option is used.
        shown_path = args.show[0] or 'pprof.profile'
        PprofProfilePrinter(load_pprof_profile(shown_path)).show()
        return

    config = {
        'output_file': args.output_file,
        'dso_filters': flatten_arg_list(args.dso),
        'ndk_path': args.ndk_path,
        'max_chain_length': args.max_chain_length,
        'report_lib_options': args.report_lib_options,
    }
    generator = PprofProfileGenerator(config)
    # All record files are aggregated into one profile.
    for record_file in args.record_file:
        generator.load_record_file(record_file)
    store_pprof_profile(config['output_file'], generator.gen(args.jobs))
    success_message = "Report is generated at '%s' successfully." % config['output_file']
    logging.info(success_message)
    logging.info('Before uploading to the continuous PProf UI, use gzip to compress the file.')
656
657
# Run the converter only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()
660