/*
 * Copyright (C) 2019 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define LOG_TAG "Operations"

#include "Softmax.h"

#include <algorithm>
#include <cfloat>
#include <limits>
#include <vector>

#include "OperationResolver.h"
#include "Tracing.h"
#include "nnapi/Validation.h"

#ifdef NN_INCLUDE_CPU_IMPLEMENTATION
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wunused-parameter"
#pragma clang diagnostic ignored "-Wsign-compare"
#pragma clang diagnostic ignored "-Winvalid-partial-specialization"
#include <tensorflow/lite/kernels/internal/optimized/legacy_optimized_ops.h>
#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
#pragma clang diagnostic pop

#include "CpuOperationUtils.h"
#endif  // NN_INCLUDE_CPU_IMPLEMENTATION

namespace android {
namespace nn {

namespace softmax {

#ifdef NN_INCLUDE_CPU_IMPLEMENTATION
namespace {

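// Reference (slow) path: softmax along an arbitrary axis. For each slice of length axisSize it
// computes
//     output[i] = exp(beta * (input[i] - max)) / sum_j exp(beta * (input[j] - max));
// subtracting the per-slice max keeps exp() in a safe range without changing the result.
// Consecutive elements of a slice are innerSize apart in memory, hence the strided inner loops.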
inline bool softmaxSlowFloat32(const float* inputData, const Shape& inputShape, const float beta,
                               int32_t axis, float* outputData, const Shape& /*outputShape*/) {
    NNTRACE_TRANS("softmaxFloatSlow32");
    const uint32_t outerSize = getNumberOfElements(inputShape, 0, axis);
    const uint32_t axisSize = getSizeOfDimension(inputShape, axis);
    const uint32_t innerSize =
            getNumberOfElements(inputShape, axis + 1, getNumberOfDimensions(inputShape));
    for (uint32_t outer = 0; outer < outerSize; ++outer) {
        const float* inputBeg = inputData + outer * axisSize * innerSize;
        const float* inputEnd = inputBeg + axisSize * innerSize;
        float* outputBeg = outputData + outer * axisSize * innerSize;
        for (uint32_t inner = 0; inner < innerSize; ++inner, ++inputBeg, ++inputEnd, ++outputBeg) {
            // Find max
            float maxValue = -FLT_MAX;
            for (const float* p = inputBeg; p < inputEnd; p += innerSize) {
                maxValue = std::max(maxValue, *p);
            }
            // Compute sum
            float sum = 0.0f;
            for (const float* p = inputBeg; p < inputEnd; p += innerSize) {
                sum += std::exp((*p - maxValue) * beta);
            }
            // Compute result
            float* pOut = outputBeg;
            for (const float* p = inputBeg; p < inputEnd; p += innerSize, pOut += innerSize) {
                *pOut = std::exp((*p - maxValue) * beta) / sum;
            }
        }
    }
    return true;
}

bool softmaxFloat32(const float* inputData, const Shape& inputShape, const float beta, int32_t axis,
                    float* outputData, const Shape& outputShape) {
    int32_t ndim = getNumberOfDimensions(inputShape);
    NN_CHECK(handleNegativeAxis(inputShape, &axis));
    // TFLite optimized implementation only supports computation along the last axis
    if (axis == ndim - 1) {
        NNTRACE_COMP("optimized_ops::Softmax::float");
        tflite::SoftmaxParams param = {.beta = beta};
        tflite::optimized_ops::Softmax(param, convertShapeToTflshape(inputShape), inputData,
                                       convertShapeToTflshape(outputShape), outputData);
        return true;
    } else {
        return softmaxSlowFloat32(inputData, inputShape, beta, axis, outputData, outputShape);
    }
}

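// The float16 path computes in float32 for accuracy: the input is converted up, softmaxFloat32
// does the work, and the result is converted back down to float16.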
bool softmaxFloat16(const _Float16* inputData, const Shape& inputShape, const float beta,
                    int32_t axis, _Float16* outputData, const Shape& outputShape) {
    NNTRACE_TRANS("softmaxFloat16");
    std::vector<float> inputData_float32(getNumberOfElements(inputShape));
    convertFloat16ToFloat32(inputData, &inputData_float32);
    std::vector<float> outputData_float32(getNumberOfElements(outputShape));

    softmaxFloat32(inputData_float32.data(), inputShape, beta, axis, outputData_float32.data(),
                   outputShape);
    convertFloat32ToFloat16(outputData_float32, outputData);

    return true;
}

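// Quantized softmax in gemmlowp fixed-point arithmetic. Per slice: find the max, rescale each
// (non-positive) difference into the Q5.26 domain with the precomputed multiplier/shift,
// accumulate exp() of the rescaled differences, then scale each exp() term by 1/sum to obtain
// the quantized probabilities. Differences below diffMin contribute negligibly and are skipped.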
template <typename T>
bool softmaxQuant8Impl(const T* inputData, const Shape& inputShape, const float /*beta*/,
                       int32_t axis, int32_t inputMultiplier, int32_t inputLeftShift, float diffMin,
                       T* outputData, const Shape& /*outputShape*/) {
    NNTRACE_TRANS("softmaxQuant8");
    // The representation chosen for the input to the exp() function is Q5.26.
    // We need to leave extra space since values that we skip might be as large as
    // -32 before multiplying by input_beta_multiplier, and therefore as large as
    // -16 afterwards.  Note that exp(-8) is definitely not insignificant to
    // accumulation, but exp(-16) definitely is.
    static const int32_t kScaledDiffIntegerBits = 5;
    static const int kAccumulationIntegerBits = 12;
    using FixedPointScaledDiff = gemmlowp::FixedPoint<int32_t, kScaledDiffIntegerBits>;
    using FixedPointAccum = gemmlowp::FixedPoint<int32_t, kAccumulationIntegerBits>;
    using FixedPoint0 = gemmlowp::FixedPoint<int32_t, 0>;

    const uint32_t outerSize = getNumberOfElements(inputShape, 0, axis);
    const uint32_t axisSize = getSizeOfDimension(inputShape, axis);
    const uint32_t innerSize =
            getNumberOfElements(inputShape, axis + 1, getNumberOfDimensions(inputShape));
    for (uint32_t outer = 0; outer < outerSize; ++outer) {
        const T* inputBeg = inputData + outer * axisSize * innerSize;
        const T* inputEnd = inputBeg + axisSize * innerSize;
        T* outputBeg = outputData + outer * axisSize * innerSize;
        for (uint32_t inner = 0; inner < innerSize; ++inner, ++inputBeg, ++inputEnd, ++outputBeg) {
            // Find max
            T maxValue = std::is_same_v<T, int8_t> ? -128 : 0;
            for (const T* p = inputBeg; p < inputEnd; p += innerSize) {
                maxValue = std::max(maxValue, *p);
            }

            // Compute sum
            FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
            for (const T* p = inputBeg; p < inputEnd; p += innerSize) {
                int32_t input_diff = static_cast<int32_t>(*p) - maxValue;
                if (input_diff >= diffMin) {
                    const int32_t input_diff_rescaled =
                            tflite::MultiplyByQuantizedMultiplierGreaterThanOne(
                                    input_diff, inputMultiplier, inputLeftShift);
                    const auto scaled_diff_f8 = FixedPointScaledDiff::FromRaw(input_diff_rescaled);
                    sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
                                                        exp_on_negative_values(scaled_diff_f8));
                }
            }

            uint32_t fixed_sum_of_exps = static_cast<uint32_t>(sum_of_exps.raw());
            int32_t headroom_plus_one = tflite::CountLeadingZeros(fixed_sum_of_exps);
            // This is the number of bits to the left of the binary point above 1.0.
            // Consider fixed_sum_of_exps=1.25.  In that case shifted_scale=0.8 and
            // no later adjustment will be needed.
            int32_t num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one;
            int32_t shifted_sum_minus_one = static_cast<int32_t>(
                    (fixed_sum_of_exps << headroom_plus_one) - (static_cast<uint32_t>(1) << 31));

            FixedPoint0 shifted_scale = gemmlowp::one_over_one_plus_x_for_x_in_0_1(
                    FixedPoint0::FromRaw(shifted_sum_minus_one));

            // Compute result
            constexpr int32_t q_min = std::numeric_limits<T>::min();
            constexpr int32_t q_max = std::numeric_limits<T>::max();
            T* pOut = outputBeg;
            for (const T* p = inputBeg; p < inputEnd; p += innerSize, pOut += innerSize) {
                int32_t input_diff = static_cast<int32_t>(*p) - maxValue;
                if (input_diff >= diffMin) {
                    const int32_t input_diff_rescaled =
                            tflite::MultiplyByQuantizedMultiplierGreaterThanOne(
                                    input_diff, inputMultiplier, inputLeftShift);
                    const auto scaled_diff_f8 = FixedPointScaledDiff::FromRaw(input_diff_rescaled);

                    FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
                    int32_t unsat_output = gemmlowp::RoundingDivideByPOT(
                            (shifted_scale * exp_in_0).raw(), num_bits_over_unit + 31 - 8);
                    if (std::is_same_v<T, int8_t>) {
                        unsat_output -= 128;
                    }

                    *pOut = static_cast<T>(std::max(std::min(unsat_output, q_max), q_min));

                } else {
                    *pOut = std::is_same_v<T, int8_t> ? -128 : 0;
                }
            }
        }
    }
    return true;
}

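// Derives the fixed-point parameters for softmaxQuant8Impl: the real multiplier
// beta * input scale * 2^(31 - kScaledDiffIntegerBits), capped just below 2^31, is quantized
// into a multiplier/shift pair, and diffMin bounds the differences that still affect the
// result. The output operand must use scale 1/256 with zero point 0 (unsigned) or -128
// (signed), so that probabilities in [0, 1] map onto the full 8-bit range.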
template <typename T>
bool softmaxQuant8(const T* inputData, const Shape& inputShape, const float beta, int32_t axis,
                   T* outputData, const Shape& outputShape) {
    [[maybe_unused]] int32_t ndim = getNumberOfDimensions(inputShape);
    NN_CHECK(handleNegativeAxis(inputShape, &axis));

    if ((inputShape.type == OperandType::TENSOR_QUANT8_ASYMM && outputShape.offset != 0) ||
        (inputShape.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED &&
         outputShape.offset != -128) ||
        outputShape.scale != 1.f / 256) {
        LOG(ERROR) << "incorrect scale / offset for output";
        return false;
    }

    static const int32_t kScaledDiffIntegerBits = 5;
    const double input_beta_real_multiplier =
            std::min(1.0 * beta * inputShape.scale * (1 << (31 - kScaledDiffIntegerBits)),
                     (1LL << 31) - 1.0);

    int32_t inputMultiplier = 0, inputLeftShift = 0;
    if (!QuantizeMultiplierGreaterThanOne(input_beta_real_multiplier, &inputMultiplier,
                                          &inputLeftShift)) {
        return false;
    }
    int32_t diffMin = -CalculateInputRadius(kScaledDiffIntegerBits, inputLeftShift);

    return softmaxQuant8Impl(inputData, inputShape, beta, axis, inputMultiplier, inputLeftShift,
                             diffMin, outputData, outputShape);
}

}  // namespace

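// Checks the operation's inputs (rank at most 4, beta strictly positive) and gives the output
// the same dimensions as the input.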
bool prepare(IOperationExecutionContext* context) {
    Shape input = context->getInputShape(kInputTensor);
    float beta = (input.type == OperandType::TENSOR_FLOAT16)
                         ? context->getInputValue<_Float16>(kBetaScalar)
                         : context->getInputValue<float>(kBetaScalar);
    NN_RET_CHECK_LE(getNumberOfDimensions(input), 4u);
    NN_RET_CHECK_GT(beta, 0.0f);
    Shape output = context->getOutputShape(kOutputTensor);
    output.dimensions = input.dimensions;
    return context->setOutputShape(kOutputTensor, output);
}

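// Dispatches on the input tensor type. The optional axis input defaults to -1, i.e. the last
// dimension, when the model does not provide it.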
bool execute(IOperationExecutionContext* context) {
    // Bypass execution in the case of zero-sized input.
    if (getNumberOfElements(context->getOutputShape(kOutputTensor)) == 0) return true;
    int32_t axis = (context->getNumInputs() == kNumInputs)
                           ? context->getInputValue<int32_t>(kAxisScalar)
                           : -1;
    switch (context->getInputType(kInputTensor)) {
        case OperandType::TENSOR_FLOAT16:
            return softmaxFloat16(context->getInputBuffer<_Float16>(kInputTensor),
                                  context->getInputShape(kInputTensor),
                                  context->getInputValue<_Float16>(kBetaScalar), axis,
                                  context->getOutputBuffer<_Float16>(kOutputTensor),
                                  context->getOutputShape(kOutputTensor));
        case OperandType::TENSOR_FLOAT32:
            return softmaxFloat32(context->getInputBuffer<float>(kInputTensor),
                                  context->getInputShape(kInputTensor),
                                  context->getInputValue<float>(kBetaScalar), axis,
                                  context->getOutputBuffer<float>(kOutputTensor),
                                  context->getOutputShape(kOutputTensor));
        case OperandType::TENSOR_QUANT8_ASYMM:
            return softmaxQuant8(context->getInputBuffer<uint8_t>(kInputTensor),
                                 context->getInputShape(kInputTensor),
                                 context->getInputValue<float>(kBetaScalar), axis,
                                 context->getOutputBuffer<uint8_t>(kOutputTensor),
                                 context->getOutputShape(kOutputTensor));
        case OperandType::TENSOR_QUANT8_ASYMM_SIGNED:
            return softmaxQuant8(context->getInputBuffer<int8_t>(kInputTensor),
                                 context->getInputShape(kInputTensor),
                                 context->getInputValue<float>(kBetaScalar), axis,
                                 context->getOutputBuffer<int8_t>(kOutputTensor),
                                 context->getOutputShape(kOutputTensor));
        default:
            NN_RET_CHECK_FAIL() << "Unsupported tensor type for operation " << kOperationName;
    }
}
#endif  // NN_INCLUDE_CPU_IMPLEMENTATION

}  // namespace softmax

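// Registers SOFTMAX with the default validation; zero-sized inputs are allowed and are treated
// as a no-op in execute().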
NN_REGISTER_OPERATION_DEFAULT_VALIDATION(SOFTMAX, softmax::prepare, softmax::execute,
                                         .allowZeroSizedInput = true);

}  // namespace nn
}  // namespace android