/*
 * Copyright (C) 2019 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define LOG_TAG "Operations"

#include "Softmax.h"

#include <algorithm>
#include <cfloat>
#include <cmath>
#include <limits>
#include <vector>

#include "OperationResolver.h"
#include "Tracing.h"
#include "nnapi/Validation.h"

#ifdef NN_INCLUDE_CPU_IMPLEMENTATION
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wunused-parameter"
#pragma clang diagnostic ignored "-Wsign-compare"
#pragma clang diagnostic ignored "-Winvalid-partial-specialization"
#include <tensorflow/lite/kernels/internal/optimized/legacy_optimized_ops.h>
#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
#pragma clang diagnostic pop

#include "CpuOperationUtils.h"
#endif  // NN_INCLUDE_CPU_IMPLEMENTATION

namespace android {
namespace nn {

namespace softmax {

#ifdef NN_INCLUDE_CPU_IMPLEMENTATION
namespace {

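// Reference softmax along an arbitrary axis, used when the optimized TFLite kernel does not
// apply. Each slice taken along `axis` is reduced with the numerically stable form
//   output[i] = exp(beta * (input[i] - max)) / sum_j exp(beta * (input[j] - max)),
// where subtracting the per-slice maximum keeps exp() from overflowing. The tensor is traversed
// as outerSize x axisSize x innerSize, so consecutive elements of a slice are innerSize apart
// in memory.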
inline bool softmaxSlowFloat32(const float* inputData, const Shape& inputShape, const float beta,
                               int32_t axis, float* outputData, const Shape& /*outputShape*/) {
    NNTRACE_TRANS("softmaxSlowFloat32");
    const uint32_t outerSize = getNumberOfElements(inputShape, 0, axis);
    const uint32_t axisSize = getSizeOfDimension(inputShape, axis);
    const uint32_t innerSize =
            getNumberOfElements(inputShape, axis + 1, getNumberOfDimensions(inputShape));
    for (uint32_t outer = 0; outer < outerSize; ++outer) {
        const float* inputBeg = inputData + outer * axisSize * innerSize;
        const float* inputEnd = inputBeg + axisSize * innerSize;
        float* outputBeg = outputData + outer * axisSize * innerSize;
        for (uint32_t inner = 0; inner < innerSize; ++inner, ++inputBeg, ++inputEnd, ++outputBeg) {
            // Find max
            float maxValue = -FLT_MAX;
            for (const float* p = inputBeg; p < inputEnd; p += innerSize) {
                maxValue = std::max(maxValue, *p);
            }
            // Compute sum
            float sum = 0.0f;
            for (const float* p = inputBeg; p < inputEnd; p += innerSize) {
                sum += std::exp((*p - maxValue) * beta);
            }
            // Compute result
            float* pOut = outputBeg;
            for (const float* p = inputBeg; p < inputEnd; p += innerSize, pOut += innerSize) {
                *pOut = std::exp((*p - maxValue) * beta) / sum;
            }
        }
    }
    return true;
}

bool softmaxFloat32(const float* inputData, const Shape& inputShape, const float beta, int32_t axis,
                    float* outputData, const Shape& outputShape) {
    int32_t ndim = getNumberOfDimensions(inputShape);
    NN_CHECK(handleNegativeAxis(inputShape, &axis));
    // TFLite optimized implementation only supports computation along the last axis
    if (axis == ndim - 1) {
        NNTRACE_COMP("optimized_ops::Softmax::float");
        tflite::SoftmaxParams param = {.beta = beta};
        tflite::optimized_ops::Softmax(param, convertShapeToTflshape(inputShape), inputData,
                                       convertShapeToTflshape(outputShape), outputData);
        return true;
    } else {
        return softmaxSlowFloat32(inputData, inputShape, beta, axis, outputData, outputShape);
    }
}

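// fp16 softmax is computed by widening the input to fp32, running the fp32 path, and narrowing
// the result back to fp16.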
bool softmaxFloat16(const _Float16* inputData, const Shape& inputShape, const float beta,
                    int32_t axis, _Float16* outputData, const Shape& outputShape) {
    NNTRACE_TRANS("softmaxFloat16");
    std::vector<float> inputData_float32(getNumberOfElements(inputShape));
    convertFloat16ToFloat32(inputData, &inputData_float32);
    std::vector<float> outputData_float32(getNumberOfElements(outputShape));

    softmaxFloat32(inputData_float32.data(), inputShape, beta, axis, outputData_float32.data(),
                   outputShape);
    convertFloat32ToFloat16(outputData_float32, outputData);

    return true;
}

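// Quantized softmax on gemmlowp fixed-point arithmetic: each input's difference from the
// per-slice max is rescaled into Q5.26 (via inputMultiplier/inputLeftShift, precomputed from
// beta and the input scale), exponentiated with exp_on_negative_values(), and accumulated in
// Q12.19. The division by the sum is then performed with
// one_over_one_plus_x_for_x_in_0_1() on the sum normalized into [1, 2).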
template <typename T>
bool softmaxQuant8Impl(const T* inputData, const Shape& inputShape, const float /*beta*/,
                       int32_t axis, int32_t inputMultiplier, int32_t inputLeftShift, float diffMin,
                       T* outputData, const Shape& /*outputShape*/) {
    NNTRACE_TRANS("softmaxQuant8");
    // The representation chosen for the input to the exp() function is Q5.26.
    // We need to leave extra space since values that we skip might be as large as
    // -32 before multiplying by input_beta_multiplier, and therefore as large as
    // -16 afterwards. Note that exp(-8) is definitely not insignificant to
    // accumulation, but exp(-16) definitely is.
    static const int32_t kScaledDiffIntegerBits = 5;
    static const int kAccumulationIntegerBits = 12;
    using FixedPointScaledDiff = gemmlowp::FixedPoint<int32_t, kScaledDiffIntegerBits>;
    using FixedPointAccum = gemmlowp::FixedPoint<int32_t, kAccumulationIntegerBits>;
    using FixedPoint0 = gemmlowp::FixedPoint<int32_t, 0>;

    const uint32_t outerSize = getNumberOfElements(inputShape, 0, axis);
    const uint32_t axisSize = getSizeOfDimension(inputShape, axis);
    const uint32_t innerSize =
            getNumberOfElements(inputShape, axis + 1, getNumberOfDimensions(inputShape));
    for (uint32_t outer = 0; outer < outerSize; ++outer) {
        const T* inputBeg = inputData + outer * axisSize * innerSize;
        const T* inputEnd = inputBeg + axisSize * innerSize;
        T* outputBeg = outputData + outer * axisSize * innerSize;
        for (uint32_t inner = 0; inner < innerSize; ++inner, ++inputBeg, ++inputEnd, ++outputBeg) {
            // Find max
            T maxValue = std::is_same_v<T, int8_t> ? -128 : 0;
            for (const T* p = inputBeg; p < inputEnd; p += innerSize) {
                maxValue = std::max(maxValue, *p);
            }

            // Compute sum
            FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
            for (const T* p = inputBeg; p < inputEnd; p += innerSize) {
                int32_t input_diff = static_cast<int32_t>(*p) - maxValue;
                if (input_diff >= diffMin) {
                    const int32_t input_diff_rescaled =
                            tflite::MultiplyByQuantizedMultiplierGreaterThanOne(
                                    input_diff, inputMultiplier, inputLeftShift);
                    const auto scaled_diff_f8 = FixedPointScaledDiff::FromRaw(input_diff_rescaled);
                    sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
                                                        exp_on_negative_values(scaled_diff_f8));
                }
            }

            uint32_t fixed_sum_of_exps = static_cast<uint32_t>(sum_of_exps.raw());
            int32_t headroom_plus_one = tflite::CountLeadingZeros(fixed_sum_of_exps);
            // This is the number of bits to the left of the binary point above 1.0.
            // Consider fixed_sum_of_exps=1.25. In that case shifted_scale=0.8 and
            // no later adjustment will be needed.
            int32_t num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one;
            int32_t shifted_sum_minus_one = static_cast<int32_t>(
                    (fixed_sum_of_exps << headroom_plus_one) - (static_cast<uint32_t>(1) << 31));

            FixedPoint0 shifted_scale = gemmlowp::one_over_one_plus_x_for_x_in_0_1(
                    FixedPoint0::FromRaw(shifted_sum_minus_one));

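            // At this point sum_of_exps == (1 + x) * 2^num_bits_over_unit with x in [0, 1), so
            // 1 / sum_of_exps == shifted_scale * 2^-num_bits_over_unit. Each probability is
            // therefore exp(scaled_diff) * shifted_scale, rescaled from the Q0.31 product to
            // the 8-bit output range by the rounding divide below.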
            // Compute result
            constexpr int32_t q_min = std::numeric_limits<T>::min();
            constexpr int32_t q_max = std::numeric_limits<T>::max();
            T* pOut = outputBeg;
            for (const T* p = inputBeg; p < inputEnd; p += innerSize, pOut += innerSize) {
                int32_t input_diff = static_cast<int32_t>(*p) - maxValue;
                if (input_diff >= diffMin) {
                    const int32_t input_diff_rescaled =
                            tflite::MultiplyByQuantizedMultiplierGreaterThanOne(
                                    input_diff, inputMultiplier, inputLeftShift);
                    const auto scaled_diff_f8 = FixedPointScaledDiff::FromRaw(input_diff_rescaled);

                    FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
                    int32_t unsat_output = gemmlowp::RoundingDivideByPOT(
                            (shifted_scale * exp_in_0).raw(), num_bits_over_unit + 31 - 8);
                    if (std::is_same_v<T, int8_t>) {
                        unsat_output -= 128;
                    }

                    *pOut = static_cast<T>(std::max(std::min(unsat_output, q_max), q_min));
                } else {
                    *pOut = std::is_same_v<T, int8_t> ? -128 : 0;
                }
            }
        }
    }
    return true;
}

template <typename T>
bool softmaxQuant8(const T* inputData, const Shape& inputShape, const float beta, int32_t axis,
                   T* outputData, const Shape& outputShape) {
    [[maybe_unused]] int32_t ndim = getNumberOfDimensions(inputShape);
    NN_CHECK(handleNegativeAxis(inputShape, &axis));

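    // The NNAPI specification requires the softmax output to be quantized with scale 1/256 and
    // zero point 0 (unsigned) or -128 (signed), so that the full [0, 1] probability range maps
    // onto the 256 representable levels.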
    if ((inputShape.type == OperandType::TENSOR_QUANT8_ASYMM && outputShape.offset != 0) ||
        (inputShape.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED &&
         outputShape.offset != -128) ||
        outputShape.scale != 1.f / 256) {
        LOG(ERROR) << "incorrect scale / offset for output";
        return false;
    }

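    // Fold beta and the input scale into a single real multiplier that maps raw 8-bit input
    // differences into the Q5.26 domain expected by exp_on_negative_values(), then quantize it
    // into a 32-bit fixed-point multiplier plus a left shift.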
    static const int32_t kScaledDiffIntegerBits = 5;
    const double input_beta_real_multiplier =
            std::min(1.0 * beta * inputShape.scale * (1 << (31 - kScaledDiffIntegerBits)),
                     (1LL << 31) - 1.0);

    int32_t inputMultiplier = 0, inputLeftShift = 0;
    if (!QuantizeMultiplierGreaterThanOne(input_beta_real_multiplier, &inputMultiplier,
                                          &inputLeftShift)) {
        return false;
    }
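    // Differences below diffMin would overflow the Q5.26 representation after rescaling; their
    // exponentials are negligible anyway, so softmaxQuant8Impl() treats them as zero probability.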
    int32_t diffMin = -CalculateInputRadius(kScaledDiffIntegerBits, inputLeftShift);

    return softmaxQuant8Impl(inputData, inputShape, beta, axis, inputMultiplier, inputLeftShift,
                             diffMin, outputData, outputShape);
}

}  // namespace

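// Validates the inputs and derives the output shape: the input rank must be at most 4, beta must
// be strictly positive, and the output dimensions are set equal to the input dimensions.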
bool prepare(IOperationExecutionContext* context) {
    Shape input = context->getInputShape(kInputTensor);
    float beta = (input.type == OperandType::TENSOR_FLOAT16)
                         ? context->getInputValue<_Float16>(kBetaScalar)
                         : context->getInputValue<float>(kBetaScalar);
    NN_RET_CHECK_LE(getNumberOfDimensions(input), 4u);
    NN_RET_CHECK_GT(beta, 0.0f);
    Shape output = context->getOutputShape(kOutputTensor);
    output.dimensions = input.dimensions;
    return context->setOutputShape(kOutputTensor, output);
}

bool execute(IOperationExecutionContext* context) {
    // Bypass execution in the case of zero-sized input.
    if (getNumberOfElements(context->getOutputShape(kOutputTensor)) == 0) return true;
    int32_t axis = (context->getNumInputs() == kNumInputs)
                           ? context->getInputValue<int32_t>(kAxisScalar)
                           : -1;
    switch (context->getInputType(kInputTensor)) {
        case OperandType::TENSOR_FLOAT16:
            return softmaxFloat16(context->getInputBuffer<_Float16>(kInputTensor),
                                  context->getInputShape(kInputTensor),
                                  context->getInputValue<_Float16>(kBetaScalar), axis,
                                  context->getOutputBuffer<_Float16>(kOutputTensor),
                                  context->getOutputShape(kOutputTensor));
        case OperandType::TENSOR_FLOAT32:
            return softmaxFloat32(context->getInputBuffer<float>(kInputTensor),
                                  context->getInputShape(kInputTensor),
                                  context->getInputValue<float>(kBetaScalar), axis,
                                  context->getOutputBuffer<float>(kOutputTensor),
                                  context->getOutputShape(kOutputTensor));
        case OperandType::TENSOR_QUANT8_ASYMM:
            return softmaxQuant8(context->getInputBuffer<uint8_t>(kInputTensor),
                                 context->getInputShape(kInputTensor),
                                 context->getInputValue<float>(kBetaScalar), axis,
                                 context->getOutputBuffer<uint8_t>(kOutputTensor),
                                 context->getOutputShape(kOutputTensor));
        case OperandType::TENSOR_QUANT8_ASYMM_SIGNED:
            return softmaxQuant8(context->getInputBuffer<int8_t>(kInputTensor),
                                 context->getInputShape(kInputTensor),
                                 context->getInputValue<float>(kBetaScalar), axis,
                                 context->getOutputBuffer<int8_t>(kOutputTensor),
                                 context->getOutputShape(kOutputTensor));
        default:
            NN_RET_CHECK_FAIL() << "Unsupported tensor type for operation " << kOperationName;
    }
}
#endif  // NN_INCLUDE_CPU_IMPLEMENTATION

}  // namespace softmax

NN_REGISTER_OPERATION_DEFAULT_VALIDATION(SOFTMAX, softmax::prepare, softmax::execute,
                                         .allowZeroSizedInput = true);

}  // namespace nn
}  // namespace android