from parser.aggregate import aggregate_times, nan_to_zero, LAYER_TOTAL from parser.naming import layers, names, phases, make_tag, subphases from parser.naming import LAYER_APPLICATION, LAYER_CPU from parser.naming import (PHASE_INITIALIZATION, PHASE_PREPARATION, PHASE_COMPILATION, PHASE_INPUTS_AND_OUTPUTS, PHASE_EXECUTION, PHASE_RESULTS, PHASE_TERMINATION, PHASE_OVERALL, PHASE_WARMUP, PHASE_BENCHMARK) import json import math import sys def print_stats(tracker_map, print_detail=True, total_times=False, per_execution=False, json_output=False, starting_mark='', sep=''): """ Prints statistics for a single Overall phase as text or json. For text output: By default prints the self-time for each layer, prints total times instead if given total_times=True. By default prints stats for all phases, prints only the Execution and its subphases (as per-execution times) if per_execution=True. If per_execution=True and the trace contains separate Warmup and Benchmark phases, prints only the Benchmark phase. For json output: The json output is internal to NNAPI and is not guaranteed stable or extensively defined. It does however contain a version field so that backwards-compativle tools can be created on top of it. The json output includes both the statistics themselves produced by aggregate_times as well as the values used to create the text output so that those are easily available. Look at the end of the function for the fields included in the json. """ PHASE_EXECUTION_LESS_IO_AND_RESULTS = "PEO" phases_to_pick = phases + [PHASE_INPUTS_AND_OUTPUTS, PHASE_RESULTS] for tracker in tracker_map.values(): if not tracker.is_complete(): sys.stderr.write("Incomplete trace, not able to print all statistics\n") return if sep: print(sep) # Select template and statistics to use times, self_times, has_warmup_and_benchmark, execution_counts = aggregate_times(tracker_map) if not per_execution: template = TEMPLATE_ALL_PHASES else: template = TEMPLATE_EXECUTION_ONLY if total_times: template = template.replace("self-times", "total time") times_to_use = times else: times_to_use = self_times if has_warmup_and_benchmark and per_execution: template = template.replace("Execution", "Benchmark") for phase in [PHASE_EXECUTION] + subphases[PHASE_EXECUTION]: for layer in layers + [LAYER_TOTAL]: times_to_use[phase][layer] = times_to_use[PHASE_BENCHMARK][phase][layer] # Rewrite template shorthand template = template.replace(":fl", ":>11.2f") template = template.replace(":f", ":>9.2f") # Gather template inputs from statistics values = dict() full_total = 0.0 has_cpu = False for layer in layers: for phase in phases_to_pick: t = times_to_use[phase][layer] values[make_tag(layer, phase)] = t if layer == LAYER_CPU: has_cpu = (has_cpu or t > 0.0) # Calculate layer totals and PHASE_EXECUTION_LESS_IO_AND_RESULTS for phase in phases_to_pick: values[make_tag(LAYER_TOTAL, phase)] = times_to_use[phase][LAYER_TOTAL] for layer in layers + [LAYER_TOTAL]: values[make_tag(layer, PHASE_EXECUTION_LESS_IO_AND_RESULTS)] = ( values[make_tag(layer, PHASE_EXECUTION)] - values[make_tag(layer, PHASE_INPUTS_AND_OUTPUTS)] - values[make_tag(layer, PHASE_RESULTS)]) values[make_tag(layer, PHASE_OVERALL)] = times_to_use[PHASE_OVERALL][layer] # Calculate layer execution percentages for layer in layers: if values[make_tag(LAYER_TOTAL, PHASE_EXECUTION)] > 0.0: values[make_tag(layer, "PEp")] = (values[make_tag(layer, PHASE_EXECUTION)] * 100.0 / values[make_tag(LAYER_TOTAL, PHASE_EXECUTION)]) else: values[make_tag(layer, "PEp")] = math.nan # Make output numbers per-execution if desired if per_execution: if has_warmup_and_benchmark: divide_by = execution_counts[PHASE_BENCHMARK] else: divide_by = execution_counts[PHASE_OVERALL] for layer in (layers + [LAYER_TOTAL]): for phase in [PHASE_INPUTS_AND_OUTPUTS, PHASE_EXECUTION_LESS_IO_AND_RESULTS, PHASE_RESULTS, PHASE_EXECUTION]: if divide_by != 0: values[layer + "_" + phase] = values[layer + "_" + phase] / divide_by else: values[layer + "_" + phase] = math.nan # Generate and print output if not json_output: # Apply template and prettify numbers output = template.format(**values) output = output.replace(" 0.00%", " -") output = output.replace(" 0.00", " -") output = output.replace(" nan", " n/a") # Print output print(starting_mark) for line in output.splitlines(): if line[0:3] == "CPU" and not has_cpu: continue print(line) if print_detail: for pid in tracker_map: tracker = tracker_map[pid] tracker.print_stats() for pid in tracker_map: tracker = tracker_map[pid] tracker.print() else: output = dict(times=times, self_times=self_times, execution_counts=execution_counts, template_inputs=values, version=1, starting_mark=starting_mark) output = json.dumps(output, indent=2, sort_keys=True) # JSON doesn't recognize NaN output = output.replace("NaN", "null") print(output) def reset_trackers(tracker_map): for pid in tracker_map: tracker = tracker_map[pid] tracker.reset() TEMPLATE_ALL_PHASES = """ =========================================================================================================================================== NNAPI timing summary (self-times, ms wall-clock) Execution ---------------------------------------------------- Initialization Preparation Compilation I/O Compute Results Ex. total Termination Total -------------- ----------- ----------- ----------- ------------ ----------- ----------- ----------- ---------- Application {LA_PI:f} {LA_PP:f} {LA_PC:f} {LA_PIO:fl} {LA_PEO:fl} {LA_PR:f} {LA_PE:f} {LA_PT:f} {LA_PO:f}* Runtime {LR_PI:f} {LR_PP:f} {LR_PC:f} {LR_PIO:fl} {LR_PEO:fl} {LR_PR:f} {LR_PE:f} {LR_PT:f} {LR_PO:f} IPC {LI_PI:f} {LI_PP:f} {LI_PC:f} {LI_PIO:fl} {LI_PEO:fl} {LI_PR:f} {LI_PE:f} {LI_PT:f} {LI_PO:f} Driver {LD_PI:f} {LD_PP:f} {LD_PC:f} {LD_PIO:fl} {LD_PEO:fl} {LD_PR:f} {LD_PE:f} {LD_PT:f} {LD_PO:f} CPU {LC_PI:f} {LC_PP:f} {LC_PC:f} {LC_PIO:fl} {LC_PEO:fl} {LC_PR:f} {LC_PE:f} {LC_PT:f} {LC_PO:f} Total {LT_PI:f}* {LT_PP:f}* {LT_PC:f}* {LT_PIO:fl}* {LT_PEO:fl}* {LT_PR:f}* {LT_PE:f}* {LT_PT:f}* {LT_PO:f}* =========================================================================================================================================== * This total ignores missing (n/a) values and thus is not necessarily consistent with the rest of the numbers """ TEMPLATE_EXECUTION_ONLY = """ ================================================================================ NNAPI timing summary (self-times, ms wall-clock) Execution ------------------------------------------------------------------ I/O Compute Results Total Percentage ----------- ------------ ----------- ----------- ----------- Application {LA_PIO:fl} {LA_PEO:fl} {LA_PR:f} {LA_PE:f} {LA_PEp:fl}% Runtime {LR_PIO:fl} {LR_PEO:fl} {LR_PR:f} {LR_PE:f} {LR_PEp:fl}% IPC {LI_PIO:fl} {LI_PEO:fl} {LI_PR:f} {LI_PE:f} {LI_PEp:fl}% Driver {LD_PIO:fl} {LD_PEO:fl} {LD_PR:f} {LD_PE:f} {LD_PEp:fl}% CPU {LC_PIO:fl} {LC_PEO:fl} {LC_PR:f} {LC_PE:f} {LC_PEp:fl}% Total {LT_PIO:fl}* {LT_PEO:fl}* {LT_PR:f}* {LT_PE:f} 100% ================================================================================ * This total ignores missing (n/a) values and thus is not necessarily consistent with the rest of the numbers """