Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[onert-micro] Dequantize f16 to f32 #14551

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
132 changes: 132 additions & 0 deletions onert-micro/onert-micro/include/pal/common/PALDequantize.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,29 @@
* limitations under the License.
*/

/*
The MIT License (MIT)

Copyright (c) 2017 Facebook Inc.
Copyright (c) 2017 Georgia Institute of Technology
Copyright 2019 Google LLC

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
associated documentation files (the "Software"), to deal in the Software without restriction,
including without limitation the rights to use, copy, modify, merge, publish, distribute,
sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or
substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

#ifndef ONERT_MICRO_EXECUTE_PAL_DEQUANTIZE_COMMON_H
#define ONERT_MICRO_EXECUTE_PAL_DEQUANTIZE_COMMON_H

Expand All @@ -23,6 +46,101 @@
#include "PALUtils.h"

#include <bit>
#include <cmath>
#include <cstdint>
#include <cstring>

namespace
{

/// Note: the conversion routine below is adapted from the FP16 library
/// (https://github.com/Maratyszcza/FP16), which is distributed under the MIT License.

/*
 * Convert a 16-bit floating-point number in IEEE half-precision format, in bit representation, to
 * a 32-bit floating-point number in IEEE single-precision format, in bit representation.
 *
 * @param h  Bit pattern of the half-precision value.
 * @return   Bit pattern of the equivalent single-precision value (zeros, denormals, infinities and
 *           NaNs included; the sign bit is always preserved).
 *
 * @note The implementation doesn't use any floating-point operations.
 * @note Declared inline because it is defined in a header: translation units that include the
 *       header without calling it must not get an unused-function warning.
 */
inline uint32_t fp16_ieee_to_fp32_bits(uint16_t h)
{
  /*
   * Extend the half-precision floating-point number to 32 bits and shift to the upper part of the
   * 32-bit word:
   *      +---+-----+------------+-------------------+
   *      | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
   *      +---+-----+------------+-------------------+
   * Bits  31  26-30    16-25            0-15
   *
   * S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - zero bits.
   */
  const uint32_t w = static_cast<uint32_t>(h) << 16;

  /*
   * Extract the sign of the input number into the high bit of the 32-bit word:
   *
   *      +---+----------------------------------+
   *      | S |0000000 00000000 00000000 00000000|
   *      +---+----------------------------------+
   * Bits  31                 0-30
   */
  const uint32_t sign = w & UINT32_C(0x80000000);

  /*
   * Extract mantissa and biased exponent of the input number into the bits 0-30 of the 32-bit
   * word:
   *
   *      +---+-----+------------+-------------------+
   *      | 0 |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
   *      +---+-----+------------+-------------------+
   * Bits  31  26-30    16-25            0-15
   */
  const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF);

  /*
   * Renorm shift is the number of bits to shift mantissa left to make the half-precision number
   * normalized. If the initial number is normalized, at least one of its high 6 bits (sign == 0
   * and 5-bit exponent) equals one. In this case renorm_shift == 0. If the number is denormalized,
   * renorm_shift > 0. Note that if we shift denormalized nonsign by renorm_shift, the unit bit of
   * mantissa will shift into exponent, turning the biased exponent into 1, and making mantissa
   * normalized (i.e. without leading 1).
   *
   * Counting leading zeros of 0 is undefined for both __builtin_clz and _BitScanReverse, so the
   * scan is skipped for a zero input. The shift value is irrelevant in that case because
   * zero_mask below clears exponent and mantissa anyway.
   */
  uint32_t leading_zeros = 0;
  if (nonsign != 0)
  {
#ifdef _MSC_VER
    unsigned long nonsign_bsr;
    _BitScanReverse(&nonsign_bsr, static_cast<unsigned long>(nonsign));
    leading_zeros = static_cast<uint32_t>(nonsign_bsr) ^ 31;
#else
    leading_zeros = static_cast<uint32_t>(__builtin_clz(nonsign));
#endif
  }
  const uint32_t renorm_shift = leading_zeros > 5 ? leading_zeros - 5 : 0;

  /*
   * Iff the half-precision number has an all-ones exponent field (0x1F), the addition overflows
   * it into bit 31, and the subsequent arithmetic shift turns the high 9 bits into 1. Thus
   *   inf_nan_mask ==
   *                   0x7F800000 if the half-precision number was NaN or infinity
   *                   0x00000000 otherwise
   */
  const uint32_t inf_nan_mask =
    static_cast<uint32_t>(static_cast<int32_t>(nonsign + UINT32_C(0x04000000)) >> 8) &
    UINT32_C(0x7F800000);

  /*
   * Iff nonsign is 0, the subtraction wraps around to 0xFFFFFFFF, turning bit 31 into 1.
   * Otherwise, bit 31 remains 0. The signed shift right by 31 broadcasts bit 31 into all bits of
   * the zero_mask. Thus
   *   zero_mask ==
   *                0xFFFFFFFF if the half-precision number was zero (+0.0h or -0.0h)
   *                0x00000000 otherwise
   */
  const uint32_t zero_mask = static_cast<uint32_t>(static_cast<int32_t>(nonsign - 1) >> 31);

  /*
   * 1. Shift nonsign left by renorm_shift to normalize it (if the input was denormal)
   * 2. Shift nonsign right by 3 so the exponent (5 bits originally) becomes an 8-bit field and
   *    10-bit mantissa shifts into the 10 high bits of the 23-bit mantissa of IEEE
   *    single-precision number.
   * 3. Add 0x70 to the exponent (starting at bit 23) to compensate the difference in exponent bias
   *    (0x7F for single-precision number less 0xF for half-precision number).
   * 4. Subtract renorm_shift from the exponent (starting at bit 23) to account for
   *    renormalization. As renorm_shift is less than 0x70, this can be combined with step 3.
   * 5. Binary OR with inf_nan_mask to turn the exponent into 0xFF if the input was NaN or
   *    infinity.
   * 6. Binary ANDNOT with zero_mask to turn the mantissa and exponent into zero if the input was
   *    zero.
   * 7. Combine with the sign of the input number.
   */
  const uint32_t rebased = ((nonsign << renorm_shift) >> 3) + ((UINT32_C(0x70) - renorm_shift) << 23);
  return sign | ((rebased | inf_nan_mask) & ~zero_mask);
}

} // namespace

namespace onert_micro
{
Expand All @@ -46,6 +164,20 @@ OMStatus Dequantize(const core::QuantizationParams op_params, const uint32_t fla
}
return Ok;
}

/**
 * @brief Expand a buffer of IEEE half-precision values (given as raw 16-bit patterns) into
 *        single-precision floats.
 *
 * @param flat_size    Number of elements to convert.
 * @param input_data   Source buffer; element i is read as a half-precision bit pattern.
 * @param output_data  Destination buffer; element i receives the converted float.
 * @return Always Ok.
 *
 * @note Declared inline: this is a non-template function defined in a header, so without inline
 *       every translation unit including the header would emit its own external definition and
 *       the link would fail with a multiple-definition error.
 */
inline OMStatus DequantizeF16toF32(const uint32_t flat_size, const uint16_t *input_data,
                                   float *output_data)
{
  // The conversion produces a 32-bit pattern that is copied bit-for-bit into a float.
  static_assert(sizeof(float) == sizeof(uint32_t), "float must be 32 bits wide");

  for (uint32_t i = 0; i < flat_size; ++i)
  {
    const uint32_t f32_bits = fp16_ieee_to_fp32_bits(input_data[i]);
    // memcpy is the well-defined way to reinterpret the bit pattern as a float.
    std::memcpy(&output_data[i], &f32_bits, sizeof(float));
  }
  return Ok;
}

} // namespace pal
} // namespace execute
} // namespace onert_micro
Expand Down
7 changes: 7 additions & 0 deletions onert-micro/onert-micro/src/execute/kernels/Dequantize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,13 @@ OMStatus execute_kernel_CircleDequantize(const OMExecuteArgs &execute_args)
switch (input->type())
{
#ifndef DIS_FLOAT
case circle::TensorType_FLOAT16:
{
status = pal::DequantizeF16toF32(core::OMRuntimeShape(input).flatSize(),
core::utils::castInputData<uint16_t>(input_data),
core::utils::castOutputData<float>(output_data));
}
break;
case circle::TensorType_INT8:
{
assert(input->quantization() != nullptr);
Expand Down
8 changes: 4 additions & 4 deletions onert-micro/onert-micro/src/import/kernels/Dequantize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,10 +67,10 @@ OMStatus configure_kernel_CircleDequantize(const OMConfigureArgs &config_args)

// Check input quantization params
const auto *input_quantization = input->quantization();
status = utils::checkCondition(input->type() != circle::TensorType_FLOAT32 or
input_quantization != nullptr and
input_quantization->scale() != nullptr and
input_quantization->scale()->size() == 1);
status = utils::checkCondition(
input->type() != circle::TensorType_FLOAT32 or input->type() != circle::TensorType_FLOAT16 or
input_quantization != nullptr and input_quantization->scale() != nullptr and
input_quantization->scale()->size() == 1);
if (status != Ok)
return status;

Expand Down