
fp16_t.cc 38 kB

/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "common/fp16_t.h"
#include "external/register/register_types.h"
namespace {
constexpr uint16_t kManBitLength = 11;
}
namespace ge {
/// @ingroup fp16_t global field
/// @brief round mode of last valid digit
enum TagFp16RoundMode g_round_mode = kRoundToNearest;
void ExtractFp16(const uint16_t &val, uint16_t &s, int16_t &e, uint16_t &m) {
  // 1.Extract
  s = static_cast<uint16_t>(FP16_EXTRAC_SIGN(val));
  e = static_cast<int16_t>(FP16_EXTRAC_EXP(val));
  m = static_cast<uint16_t>(FP16_EXTRAC_MAN(val));
  // Denormal
  if (e == 0) {
    e = 1;
  }
}
/// @ingroup fp16_t static method
/// @param [in] man truncated mantissa
/// @param [in] trunc_len number of low-order bits truncated from man
/// @brief judge whether to add one to the result while converting fp16_t to other datatypes
/// @return Return true if one should be added, otherwise false
static bool IsRoundOne(uint64_t man, uint16_t trunc_len) {
  uint64_t mask0 = 0x4;
  uint64_t mask1 = 0x2;
  uint64_t mask2;
  uint16_t shift_out = static_cast<uint16_t>(trunc_len - kDim2);
  mask0 = mask0 << shift_out;
  mask1 = mask1 << shift_out;
  mask2 = mask1 - 1;
  bool last_bit = ((man & mask0) > 0);
  bool trunc_high = false;
  bool trunc_left = false;
  if (g_round_mode == kRoundToNearest) {
    trunc_high = ((man & mask1) > 0);
    trunc_left = ((man & mask2) > 0);
  }
  return (trunc_high && (trunc_left || last_bit));
}
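// Worked example (added commentary, not original source text): mask0 is the
// kept LSB, mask1 the guard bit, mask2 the sticky bits, so this is
// round-to-nearest-even. With man = 0b1011 and trunc_len = 2 the guard and
// sticky bits are set, so IsRoundOne returns true: 11 / 4 = 2.75 rounds up
// to 3. The tie man = 0b0010 (exactly 0.5) returns false and truncates to
// the even value 0.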
/// @ingroup fp16_t public method
/// @param [in] exp exponent of fp16_t value
/// @param [in] man mantissa of fp16_t value
/// @brief normalize fp16_t value
/// @return
static void Fp16Normalize(int16_t &exp, uint16_t &man) {
  // set to invalid data
  if (exp >= kFp16MaxExp) {
    exp = static_cast<int16_t>(kFp16MaxExp);
    man = static_cast<uint16_t>(kFp16MaxMan);
  } else if (exp == 0 && man == kFp16ManHideBit) {
    exp++;
    man = 0;
  }
}
/// @ingroup fp16_t math conversion static method
/// @param [in] fp_val uint16_t value of fp16_t object
/// @brief Convert fp16_t to float/fp32
/// @return Return float/fp32 value of fp_val, which is the value of the fp16_t object
static float Fp16ToFloat(const uint16_t &fp_val) {
  uint16_t hf_sign;
  uint16_t hf_man;
  int16_t hf_exp;
  ExtractFp16(fp_val, hf_sign, hf_exp, hf_man);
  while (hf_man && !(hf_man & kFp16ManHideBit)) {
    hf_man <<= 1;
    hf_exp--;
  }
  uint32_t e_ret;
  uint32_t m_ret;
  uint32_t s_ret = hf_sign;
  if (hf_man == 0) {
    e_ret = 0;
    m_ret = 0;
  } else {
    e_ret = hf_exp - kFp16ExpBias + kFp32ExpBias;
    m_ret = hf_man & kFp16ManMask;
    m_ret = m_ret << (kFp32ManLen - kFp16ManLen);
  }
  uint32_t f_val = FP32_CONSTRUCTOR(s_ret, e_ret, m_ret);
  auto p_ret_v = reinterpret_cast<float *>(&f_val);
  return *p_ret_v;
}
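// Worked example (added commentary): fp_val = 0x3C00 extracts as sign 0,
// exponent 15, mantissa 0x400 (hidden bit already set), so
// e_ret = 15 - 15 + 127 = 127 and the rebuilt fp32 word is 0x3F800000, i.e.
// 1.0f. The smallest subnormal, 0x0001, is renormalized by the shift loop to
// exponent -9, giving e_ret = 103 and the fp32 word 0x33800000 = 2^-24.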
/// @ingroup fp16_t math conversion static method
/// @param [in] fp_val uint16_t value of fp16_t object
/// @brief Convert fp16_t to double/fp64
/// @return Return double/fp64 value of fp_val, which is the value of the fp16_t object
static double Fp16ToDouble(const uint16_t &fp_val) {
  uint16_t hf_sign;
  uint16_t hf_man;
  int16_t hf_exp;
  ExtractFp16(fp_val, hf_sign, hf_exp, hf_man);
  while (hf_man && !(hf_man & kFp16ManHideBit)) {
    hf_man <<= 1;
    hf_exp--;
  }
  uint64_t e_ret;
  uint64_t m_ret;
  uint64_t s_ret = hf_sign;
  if (!hf_man) {
    e_ret = 0;
    m_ret = 0;
  } else {
    e_ret = hf_exp - kFp16ExpBias + kFp64ExpBias;
    m_ret = hf_man & kFp16ManMask;
    m_ret = m_ret << (kFp64ManLen - kFp16ManLen);
  }
  uint64_t f_val = (s_ret << kFp64SignIndex) | (e_ret << kFp64ManLen) | (m_ret);
  auto p_ret_v = reinterpret_cast<double *>(&f_val);
  return *p_ret_v;
}
/// @ingroup fp16_t static method
/// @param [in] s_ret sign of fp16_t value
/// @param [in] long_int_m mantissa uint64_t value of fp16_t object
/// @param [in] shift_out shift offset
/// @brief calculate uint8 value by sign, mantissa and shift offset
/// @return Return uint8 value of fp16_t object
static uint8_t GetUint8ValByMan(uint8_t s_ret, const uint64_t &long_int_m, const uint16_t &shift_out) {
  bool need_round = IsRoundOne(long_int_m, shift_out + kFp16ManLen);
  auto m_ret = static_cast<uint8_t>((long_int_m >> (kFp16ManLen + shift_out)) & kBitLen8Max);
  need_round = need_round && ((s_ret == 0 && m_ret < kInt8Max) || (s_ret == 1 && m_ret <= kInt8Max));
  if (need_round) {
    m_ret++;
  }
  if (s_ret) {
    m_ret = (~m_ret) + 1;
  }
  if (m_ret == 0) {
    s_ret = 0;
  }
  return static_cast<uint8_t>((s_ret << kBitShift7) | (m_ret));
}
/// @ingroup fp16_t math conversion static method
/// @param [in] fp_val uint16_t value of fp16_t object
/// @brief Convert fp16_t to int8_t
/// @return Return int8_t value of fp_val, which is the value of the fp16_t object
static int8_t Fp16ToInt8(const uint16_t &fp_val) {
  int8_t ret;
  uint8_t ret_v;
  // 1.get s_ret and shift it to bit0.
  uint8_t s_ret = FP16_EXTRAC_SIGN(fp_val);
  // 2.get hf_e and hf_m
  uint16_t hf_e = FP16_EXTRAC_EXP(fp_val);
  uint16_t hf_m = FP16_EXTRAC_MAN(fp_val);
  if (FP16_IS_DENORM(fp_val)) {  // Denormalized number
    ret_v = 0;
    ret = *(reinterpret_cast<int8_t *>(&ret_v));
    return ret;
  }
  uint64_t long_int_m = hf_m;
  uint8_t overflow_flag = 0;
  uint16_t shift_out = 0;
  if (FP16_IS_INVALID(fp_val)) {  // Inf or NaN
    overflow_flag = 1;
  } else {
    while (hf_e != kFp16ExpBias) {
      if (hf_e > kFp16ExpBias) {
        hf_e--;
        long_int_m = long_int_m << 1;
        if (s_ret == 1 && long_int_m >= 0x20000u) {  // sign=1, negative number(<0)
          long_int_m = 0x20000u;  // 10 0000 0000 0000 0000  10(fp16_t man) + 7(int8) = 17 bits
          overflow_flag = 1;
          break;
        } else if (s_ret != 1 && long_int_m >= 0x1FFFFu) {  // sign=0, positive number(>0)
          long_int_m = 0x1FFFFu;  // 01 1111 1111 1111 1111  10(fp16_t man) + 7(int8)
          overflow_flag = 1;
          break;
        }
      } else {
        hf_e++;
        shift_out++;
      }
    }
  }
  if (overflow_flag) {
    ret_v = kInt8Max + s_ret;
  } else {
    // Generate final result
    ret_v = GetUint8ValByMan(s_ret, long_int_m, shift_out);
  }
  ret = *(reinterpret_cast<int8_t *>(&ret_v));
  return ret;
}
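// Worked example (added commentary): 3.5 is 0x4300 (exponent 16, extracted
// mantissa 0x700). One left shift brings the exponent back to the bias and
// the integer part becomes 0xE00 >> 10 = 3; the rounding check then sees the
// guard bit with the kept LSB set, so the result rounds to 4 (ties-to-even).
// 300.0 exceeds the 17-bit positive bound and saturates to kInt8Max = 127.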
/// @ingroup fp16_t math conversion static method
/// @param [in] fp_val uint16_t value of fp16_t object
/// @brief Convert fp16_t to uint8_t
/// @return Return uint8_t value of fp_val, which is the value of the fp16_t object
static uint8_t Fp16ToUInt8(const uint16_t &fp_val) {
  uint8_t m_ret = 0;
  // 1.get s_ret and shift it to bit0.
  uint8_t s_ret = FP16_EXTRAC_SIGN(fp_val);
  // 2.get hf_e and hf_m
  uint16_t hf_e = FP16_EXTRAC_EXP(fp_val);
  uint16_t hf_m = FP16_EXTRAC_MAN(fp_val);
  if (FP16_IS_DENORM(fp_val)) {  // Denormalized number
    return 0;
  }
  if (FP16_IS_INVALID(fp_val)) {  // Inf or NaN
    m_ret = ~0;
  } else {
    uint64_t long_int_m = hf_m;
    uint8_t overflow_flag = 0;
    uint16_t shift_out = 0;
    while (hf_e != kFp16ExpBias) {
      if (hf_e > kFp16ExpBias) {
        hf_e--;
        long_int_m = long_int_m << 1;
        if (long_int_m >= 0x40000Lu) {  // overflow 0100 0000 0000 0000 0000
          long_int_m = 0x3FFFFLu;  // 11 1111 1111 1111 1111  10(fp16_t man) + 8(uint8) = 18 bits
          overflow_flag = 1;
          m_ret = ~0;
          break;
        }
      } else {
        hf_e++;
        shift_out++;
      }
    }
    if (!overflow_flag) {
      bool need_round = IsRoundOne(long_int_m, shift_out + kFp16ManLen);
      m_ret = static_cast<uint8_t>((long_int_m >> (kFp16ManLen + shift_out)) & kBitLen8Max);
      if (need_round && m_ret != kBitLen8Max) {
        m_ret++;
      }
    }
  }
  if (s_ret == 1) {  // Negative number
    m_ret = 0;
  }
  // m_ret equals the final result
  return m_ret;
}
/// @ingroup fp16_t static method
/// @param [in] s_ret sign of fp16_t value
/// @param [in] long_int_m mantissa uint64_t value of fp16_t object
/// @param [in] shift_out shift offset
/// @brief calculate uint16 value by sign, mantissa and shift offset
/// @return Return uint16 value of fp16_t object
static uint16_t GetUint16ValByMan(uint16_t s_ret, const uint64_t &long_int_m, const uint16_t &shift_out) {
  bool need_round = IsRoundOne(long_int_m, shift_out + kFp16ManLen);
  auto m_ret = static_cast<uint16_t>((long_int_m >> (kFp16ManLen + shift_out)) & kBitLen16Max);
  if (need_round && m_ret < kInt16Max) {
    m_ret++;
  }
  if (s_ret) {
    m_ret = (~m_ret) + 1;
  }
  if (m_ret == 0) {
    s_ret = 0;
  }
  return static_cast<uint16_t>((s_ret << kBitShift15) | (m_ret));
}
/// @ingroup fp16_t math conversion static method
/// @param [in] fp_val uint16_t value of fp16_t object
/// @brief Convert fp16_t to int16_t
/// @return Return int16_t value of fp_val, which is the value of the fp16_t object
static int16_t Fp16ToInt16(const uint16_t &fp_val) {
  int16_t ret;
  uint16_t ret_v;
  // 1.get s_ret and shift it to bit0.
  uint16_t s_ret = FP16_EXTRAC_SIGN(fp_val);
  // 2.get hf_e and hf_m
  uint16_t hf_e = FP16_EXTRAC_EXP(fp_val);
  uint16_t hf_m = FP16_EXTRAC_MAN(fp_val);
  if (FP16_IS_DENORM(fp_val)) {  // Denormalized number
    ret_v = 0;
    ret = *(reinterpret_cast<int16_t *>(&ret_v));
    return ret;
  }
  uint64_t long_int_m = hf_m;
  uint8_t overflow_flag = 0;
  uint16_t shift_out = 0;
  if (FP16_IS_INVALID(fp_val)) {  // Inf or NaN
    overflow_flag = 1;
  } else {
    while (hf_e != kFp16ExpBias) {
      if (hf_e > kFp16ExpBias) {
        hf_e--;
        long_int_m = long_int_m << 1;
        if (s_ret == 1 && long_int_m > 0x2000000Lu) {  // sign=1, negative number(<0)
          long_int_m = 0x2000000Lu;  // 10(fp16_t man) + 15(int16) = 25 bits
          overflow_flag = 1;
          break;
        } else if (s_ret != 1 && long_int_m >= 0x1FFFFFFLu) {  // sign=0, positive number(>0), overflow
          long_int_m = 0x1FFFFFFLu;  // 10(fp16_t man) + 15(int16) = 25 bits
          overflow_flag = 1;
          break;
        }
      } else {
        hf_e++;
        shift_out++;
      }
    }
  }
  if (overflow_flag) {
    ret_v = kInt16Max + s_ret;
  } else {
    // Generate final result
    ret_v = GetUint16ValByMan(s_ret, long_int_m, shift_out);
  }
  ret = *(reinterpret_cast<int16_t *>(&ret_v));
  return ret;
}
/// @ingroup fp16_t math conversion static method
/// @param [in] fp_val uint16_t value of fp16_t object
/// @brief Convert fp16_t to uint16_t
/// @return Return uint16_t value of fp_val, which is the value of the fp16_t object
static uint16_t Fp16ToUInt16(const uint16_t &fp_val) {
  uint16_t m_ret = 0;
  // 1.get s_ret and shift it to bit0.
  uint16_t s_ret = FP16_EXTRAC_SIGN(fp_val);
  // 2.get hf_e and hf_m
  uint16_t hf_e = FP16_EXTRAC_EXP(fp_val);
  uint16_t hf_m = FP16_EXTRAC_MAN(fp_val);
  if (FP16_IS_DENORM(fp_val)) {  // Denormalized number
    return 0;
  }
  if (FP16_IS_INVALID(fp_val)) {  // Inf or NaN
    m_ret = ~0;
  } else {
    uint64_t long_int_m = hf_m;
    uint16_t shift_out = 0;
    while (hf_e != kFp16ExpBias) {
      if (hf_e > kFp16ExpBias) {
        hf_e--;
        long_int_m = long_int_m << 1;
      } else {
        hf_e++;
        shift_out++;
      }
    }
    bool need_round = IsRoundOne(long_int_m, shift_out + kFp16ManLen);
    m_ret = static_cast<uint16_t>((long_int_m >> (kFp16ManLen + shift_out)) & kBitLen16Max);
    if (need_round && m_ret != kBitLen16Max) {
      m_ret++;
    }
  }
  if (s_ret == 1) {  // Negative number
    m_ret = 0;
  }
  // m_ret equals the final result
  return m_ret;
}
/// @ingroup fp16_t math conversion static method
/// @param [in] fp_val uint16_t value of fp16_t object
/// @brief Convert fp16_t to int32_t
/// @return Return int32_t value of fp_val, which is the value of the fp16_t object
static int32_t Fp16ToInt32(const uint16_t &fp_val) {
  uint32_t ret_v;
  // 1.get s_ret and shift it to bit0.
  uint32_t s_ret = FP16_EXTRAC_SIGN(fp_val);
  // 2.get hf_e and hf_m
  uint16_t hf_e = FP16_EXTRAC_EXP(fp_val);
  uint16_t hf_m = FP16_EXTRAC_MAN(fp_val);
  if (FP16_IS_INVALID(fp_val)) {  // Inf or NaN
    ret_v = kInt32Max + s_ret;
  } else {
    uint64_t long_int_m = hf_m;
    uint16_t shift_out = 0;
    while (hf_e != kFp16ExpBias) {
      if (hf_e > kFp16ExpBias) {
        hf_e--;
        long_int_m = long_int_m << 1;
      } else {
        hf_e++;
        shift_out++;
      }
    }
    bool need_round = IsRoundOne(long_int_m, shift_out + kFp16ManLen);
    auto m_ret = static_cast<uint32_t>((long_int_m >> (kFp16ManLen + shift_out)) & kBitLen32Max);
    if (need_round && m_ret < kInt32Max) {
      m_ret++;
    }
    if (s_ret == 1) {
      m_ret = (~m_ret) + 1;
    }
    if (m_ret == 0) {
      s_ret = 0;
    }
    // Generate final result
    ret_v = (s_ret << kBitShift31) | (m_ret);
  }
  return *(reinterpret_cast<int32_t *>(&ret_v));
}
/// @ingroup fp16_t math conversion static method
/// @param [in] fp_val uint16_t value of fp16_t object
/// @brief Convert fp16_t to uint32_t
/// @return Return uint32_t value of fp_val, which is the value of the fp16_t object
static uint32_t Fp16ToUInt32(const uint16_t &fp_val) {
  uint32_t m_ret;
  // 1.get s_ret and shift it to bit0.
  uint32_t s_ret = FP16_EXTRAC_SIGN(fp_val);
  // 2.get hf_e and hf_m
  uint16_t hf_e = FP16_EXTRAC_EXP(fp_val);
  uint16_t hf_m = FP16_EXTRAC_MAN(fp_val);
  if (FP16_IS_DENORM(fp_val)) {  // Denormalized number
    return 0u;
  }
  if (FP16_IS_INVALID(fp_val)) {  // Inf or NaN
    m_ret = ~0u;
  } else {
    uint64_t long_int_m = hf_m;
    uint16_t shift_out = 0;
    while (hf_e != kFp16ExpBias) {
      if (hf_e > kFp16ExpBias) {
        hf_e--;
        long_int_m = long_int_m << 1;
      } else {
        hf_e++;
        shift_out++;
      }
    }
    bool need_round = IsRoundOne(long_int_m, shift_out + kFp16ManLen);
    m_ret = static_cast<uint32_t>(long_int_m >> (kFp16ManLen + shift_out)) & kBitLen32Max;
    if (need_round && m_ret != kBitLen32Max) {
      m_ret++;
    }
  }
  if (s_ret == 1) {  // Negative number
    m_ret = 0;
  }
  // m_ret equals the final result
  return m_ret;
}
static uint16_t Fp16AddCalVal(uint16_t &s_ret, int16_t e_ret, uint16_t m_ret, uint32_t m_trunc, uint16_t shift_out) {
  uint16_t m_min = kFp16ManHideBit << shift_out;
  uint16_t m_max = m_min << 1;
  // Denormal
  while (m_ret < m_min && e_ret > 0) {  // the value of m_ret should not be smaller than m_min
    m_ret = m_ret << 1;
    m_ret += (kFp32SignMask & m_trunc) >> kFp32SignIndex;
    m_trunc = m_trunc << 1;
    e_ret = e_ret - 1;
  }
  while (m_ret >= m_max) {  // the value of m_ret should be smaller than m_max
    m_trunc = m_trunc >> 1;
    m_trunc = m_trunc | (kFp32SignMask * (m_ret & 1));
    m_ret = m_ret >> 1;
    e_ret = e_ret + 1;
  }
  bool b_last_bit = ((m_ret & 1) > 0);
  bool b_trunc_high = false;
  bool b_trunc_left = false;
  b_trunc_high = (kRoundToNearest == g_round_mode) && ((m_trunc & kFp32SignMask) > 0);
  b_trunc_left = (kRoundToNearest == g_round_mode) && ((m_trunc & kFp32AbsMax) > 0);
  m_ret = ManRoundToNearest(b_last_bit, b_trunc_high, b_trunc_left, m_ret, shift_out);
  while (m_ret >= m_max) {
    m_ret = m_ret >> 1;
    e_ret = e_ret + 1;
  }
  if (e_ret == 0 && m_ret <= m_max) {
    m_ret = m_ret >> 1;
  }
  Fp16Normalize(e_ret, m_ret);
  uint16_t ret = FP16_CONSTRUCTOR(s_ret, static_cast<uint16_t>(e_ret), m_ret);
  return ret;
}
/// @ingroup fp16_t math operator
/// @param [in] v_1 left operand value of fp16_t object
/// @param [in] v_2 right operand value of fp16_t object
/// @brief Performing fp16_t addition
/// @return Return fp16_t result of adding this and fp
static uint16_t Fp16Add(uint16_t v_1, uint16_t v_2) {
  uint16_t s_a;
  uint16_t s_b;
  int16_t e_a;
  int16_t e_b;
  uint32_t m_a;
  uint32_t m_b;
  uint16_t m_a_tmp;
  uint16_t m_b_tmp;
  uint16_t shift_out = 0;
  // 1.Extract
  ExtractFp16(v_1, s_a, e_a, m_a_tmp);
  ExtractFp16(v_2, s_b, e_b, m_b_tmp);
  m_a = m_a_tmp;
  m_b = m_b_tmp;
  uint16_t sum;
  uint16_t s_ret;
  if (s_a != s_b) {
    ReverseMan(s_a > 0, m_a);
    ReverseMan(s_b > 0, m_b);
    sum = static_cast<uint16_t>(GetManSum(e_a, m_a, e_b, m_b));
    s_ret = (sum & kFp16SignMask) >> kFp16SignIndex;
    ReverseMan(s_ret > 0, m_a);
    ReverseMan(s_ret > 0, m_b);
  } else {
    sum = static_cast<uint16_t>(GetManSum(e_a, m_a, e_b, m_b));
    s_ret = s_a;
  }
  if (sum == 0) {
    shift_out = 3;  // shift to left 3 bits
    m_a = m_a << shift_out;
    m_b = m_b << shift_out;
  }
  uint32_t m_trunc = 0;
  int16_t e_ret = std::max(e_a, e_b);
  int16_t e_tmp = std::abs(e_a - e_b);
  if (e_a > e_b) {
    m_trunc = (m_b << (kBitShift32 - static_cast<uint16_t>(e_tmp)));
    m_b = RightShift(m_b, e_tmp);
  } else if (e_a < e_b) {
    m_trunc = (m_a << (kBitShift32 - static_cast<uint16_t>(e_tmp)));
    m_a = RightShift(m_a, e_tmp);
  }
  // calculate mantissa
  auto m_ret = static_cast<uint16_t>(m_a + m_b);
  return Fp16AddCalVal(s_ret, e_ret, m_ret, m_trunc, shift_out);
}
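// Worked example (added commentary): 1.5 (0x3E00) + 2.25 (0x4080) extracts
// to mantissas 0x600/0x480 with exponents 15/16. The smaller-exponent
// operand is right-shifted once to align (0x600 -> 0x300, nothing truncated),
// the mantissas add to 0x780, and Fp16AddCalVal emits exponent 16 with
// mantissa 0x780, i.e. 0x4380 = 3.75.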
/// @ingroup fp16_t math operator
/// @param [in] v_1 left operand value of fp16_t object
/// @param [in] v_2 right operand value of fp16_t object
/// @brief Performing fp16_t subtraction
/// @return Return fp16_t result of subtracting fp from this
static uint16_t Fp16Sub(uint16_t v_1, uint16_t v_2) {
  // Negate v_2 by flipping its sign bit, then reuse addition
  uint16_t tmp = ((~(v_2)) & kFp16SignMask) | (v_2 & kFp16AbsMax);
  return Fp16Add(v_1, tmp);
}
/// @ingroup fp16_t math operator
/// @param [in] v_1 left operand value of fp16_t object
/// @param [in] v_2 right operand value of fp16_t object
/// @brief Performing fp16_t multiplication
/// @return Return fp16_t result of multiplying this and fp
static uint16_t Fp16Mul(uint16_t v_1, uint16_t v_2) {
  uint16_t s_a;
  uint16_t s_b;
  int16_t e_a;
  int16_t e_b;
  uint32_t m_a;
  uint32_t m_b;
  uint16_t s_ret;
  uint16_t m_ret;
  int16_t e_ret;
  uint32_t mul_m;
  uint16_t m_a_tmp;
  uint16_t m_b_tmp;
  // 1.Extract
  ExtractFp16(v_1, s_a, e_a, m_a_tmp);
  ExtractFp16(v_2, s_b, e_b, m_b_tmp);
  m_a = m_a_tmp;
  m_b = m_b_tmp;
  e_ret = e_a + e_b - kFp16ExpBias - kDim10;
  mul_m = m_a * m_b;
  s_ret = s_a ^ s_b;
  uint32_t m_min = kFp16ManHideBit;
  uint32_t m_max = m_min << 1;
  uint32_t m_trunc = 0;
  // the value of mul_m should not be smaller than m_min
  while (mul_m < m_min && e_ret > 1) {
    mul_m = mul_m << 1;
    e_ret = e_ret - 1;
  }
  while (mul_m >= m_max || e_ret < 1) {
    m_trunc = m_trunc >> 1;
    m_trunc = m_trunc | (kFp32SignMask * (mul_m & 1));
    mul_m = mul_m >> 1;
    e_ret = e_ret + 1;
  }
  bool b_last_bit = ((mul_m & 1) > 0);
  bool b_trunc_high = false;
  bool b_trunc_left = false;
  b_trunc_high = (kRoundToNearest == g_round_mode) && ((m_trunc & kFp32SignMask) > 0);
  b_trunc_left = (kRoundToNearest == g_round_mode) && ((m_trunc & kFp32AbsMax) > 0);
  mul_m = ManRoundToNearest(b_last_bit, b_trunc_high, b_trunc_left, mul_m);
  while (mul_m >= m_max || e_ret < 0) {
    mul_m = mul_m >> 1;
    e_ret = e_ret + 1;
  }
  if (e_ret == 1 && mul_m < kFp16ManHideBit) {
    e_ret = 0;
  }
  m_ret = static_cast<uint16_t>(mul_m);
  Fp16Normalize(e_ret, m_ret);
  uint16_t ret = FP16_CONSTRUCTOR(s_ret, static_cast<uint16_t>(e_ret), m_ret);
  return ret;
}
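// Worked example (added commentary): the biased exponents add, so one bias
// and the 2^10 scale of the second hidden-bit mantissa must be subtracted:
// e_ret = e_a + e_b - 15 - 10. For 1.5 * 2.25, 0x600 * 0x480 = 0x1B0000;
// ten right shifts renormalize it to 0x6C0 with e_ret = 16, giving
// 0x42C0 = 3.375.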
/// @ingroup fp16_t math operator divided
/// @param [in] v_1 left operand value of fp16_t object
/// @param [in] v_2 right operand value of fp16_t object
/// @brief Performing fp16_t division
/// @return Return fp16_t result of dividing this by fp
static uint16_t Fp16Div(uint16_t v_1, uint16_t v_2) {
  uint16_t ret;
  if (FP16_IS_ZERO(v_2)) {  // result is inf
    // throw "fp16_t division by zero.";
    uint16_t s_a;
    uint16_t s_b;
    uint16_t s_ret;
    s_a = FP16_EXTRAC_SIGN(v_1);
    s_b = FP16_EXTRAC_SIGN(v_2);
    s_ret = s_a ^ s_b;
    ret = FP16_CONSTRUCTOR(s_ret, kFp16MaxExp, 0u);
  } else if (FP16_IS_ZERO(v_1)) {
    ret = 0u;
  } else {
    uint16_t s_a;
    uint16_t s_b;
    int16_t e_a;
    int16_t e_b;
    uint64_t m_a;
    uint64_t m_b;
    float m_div;
    uint16_t m_a_tmp;
    uint16_t m_b_tmp;
    // 1.Extract
    ExtractFp16(v_1, s_a, e_a, m_a_tmp);
    ExtractFp16(v_2, s_b, e_b, m_b_tmp);
    m_a = m_a_tmp;
    m_b = m_b_tmp;
    uint64_t m_tmp;
    if (e_a > e_b) {
      m_tmp = m_a;
      uint16_t tmp;
      tmp = e_a - e_b;
      for (int i = 0; i < tmp; i++) {
        m_tmp = m_tmp << 1;
      }
      m_a = m_tmp;
    } else if (e_a < e_b) {
      m_tmp = m_b;
      uint16_t tmp = e_b - e_a;
      for (int i = 0; i < tmp; i++) {
        m_tmp = m_tmp << 1;
      }
      m_b = m_tmp;
    }
    m_div = static_cast<float>(m_a * 1.0f / m_b);
    fp16_t fp_div;
    fp_div = m_div;
    ret = fp_div.val;
    if (s_a != s_b) {
      ret |= kFp16SignMask;
    }
  }
  return ret;
}
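// Design note (added commentary): unlike addition and multiplication,
// division pre-scales the larger-exponent mantissa by the exponent
// difference, divides the two fixed-point mantissas in float, and lets
// operator=(const float &) handle the rounding; only the sign bit is
// patched in afterwards.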
// operate
fp16_t fp16_t::operator+(const fp16_t fp) {
  uint16_t ret_val = Fp16Add(val, fp.val);
  fp16_t ret(ret_val);
  return ret;
}
fp16_t fp16_t::operator-(const fp16_t fp) {
  uint16_t ret_val = Fp16Sub(val, fp.val);
  fp16_t ret(ret_val);
  return ret;
}
fp16_t fp16_t::operator*(const fp16_t fp) {
  uint16_t ret_val = Fp16Mul(val, fp.val);
  fp16_t ret(ret_val);
  return ret;
}
fp16_t fp16_t::operator/(const fp16_t fp) {
  uint16_t ret_val = Fp16Div(val, fp.val);
  fp16_t ret(ret_val);
  return ret;
}
fp16_t fp16_t::operator+=(const fp16_t fp) {
  val = Fp16Add(val, fp.val);
  return *this;
}
fp16_t fp16_t::operator-=(const fp16_t fp) {
  val = Fp16Sub(val, fp.val);
  return *this;
}
fp16_t fp16_t::operator*=(const fp16_t fp) {
  val = Fp16Mul(val, fp.val);
  return *this;
}
fp16_t fp16_t::operator/=(const fp16_t fp) {
  val = Fp16Div(val, fp.val);
  return *this;
}
// compare
bool fp16_t::operator==(const fp16_t &fp) const {
  bool result = true;
  if (FP16_IS_ZERO(val) && FP16_IS_ZERO(fp.val)) {
    result = true;
  } else {
    result = ((val & kBitLen16Max) == (fp.val & kBitLen16Max));  // bit compare
  }
  return result;
}
bool fp16_t::operator!=(const fp16_t &fp) const {
  bool result = true;
  if (FP16_IS_ZERO(val) && FP16_IS_ZERO(fp.val)) {
    result = false;
  } else {
    result = ((val & kBitLen16Max) != (fp.val & kBitLen16Max));  // bit compare
  }
  return result;
}
bool fp16_t::operator>(const fp16_t &fp) const {
  uint16_t s_a;
  uint16_t s_b;
  uint16_t e_a;
  uint16_t e_b;
  uint16_t m_a;
  uint16_t m_b;
  bool result = true;
  // 1.Extract
  s_a = FP16_EXTRAC_SIGN(val);
  s_b = FP16_EXTRAC_SIGN(fp.val);
  e_a = FP16_EXTRAC_EXP(val);
  e_b = FP16_EXTRAC_EXP(fp.val);
  m_a = FP16_EXTRAC_MAN(val);
  m_b = FP16_EXTRAC_MAN(fp.val);
  // Compare
  if ((s_a == 0) && (s_b > 0)) {  // + -
    // -0 == 0
    result = !(FP16_IS_ZERO(val) && FP16_IS_ZERO(fp.val));
  } else if ((s_a == 0) && (s_b == 0)) {  // + +
    if (e_a > e_b) {  // e_a - e_b >= 1; Va always larger than Vb
      result = true;
    } else if (e_a == e_b) {
      result = m_a > m_b;
    } else {
      result = false;
    }
  } else if ((s_a > 0) && (s_b > 0)) {  // - -, opposite to + +
    if (e_a < e_b) {
      result = true;
    } else if (e_a == e_b) {
      result = m_a < m_b;
    } else {
      result = false;
    }
  } else {  // - +
    result = false;
  }
  return result;
}
bool fp16_t::operator>=(const fp16_t &fp) const {
  bool result = true;
  if ((*this) > fp) {
    result = true;
  } else if ((*this) == fp) {
    result = true;
  } else {
    result = false;
  }
  return result;
}
bool fp16_t::operator<(const fp16_t &fp) const {
  bool result = true;
  if ((*this) >= fp) {
    result = false;
  } else {
    result = true;
  }
  return result;
}
bool fp16_t::operator<=(const fp16_t &fp) const {
  bool result = true;
  if ((*this) > fp) {
    result = false;
  } else {
    result = true;
  }
  return result;
}
// evaluation
fp16_t &fp16_t::operator=(const fp16_t &fp) {
  if (&fp == this) {
    return *this;
  }
  val = fp.val;
  return *this;
}
fp16_t &fp16_t::operator=(const float &f_val) {
  uint16_t s_ret;
  uint16_t m_ret;
  int16_t e_ret;
  uint32_t e_f;
  uint32_t m_f;
  const uint32_t ui32_v = *(reinterpret_cast<const uint32_t *>(&f_val));  // 1:8:23bit sign:exp:man
  uint32_t m_len_delta;
  s_ret = static_cast<uint16_t>((ui32_v & kFp32SignMask) >> kFp32SignIndex);  // 4 Byte -> 2 Byte
  e_f = (ui32_v & kFp32ExpMask) >> kFp32ManLen;  // 8 bit exponent
  m_f = (ui32_v & kFp32ManMask);  // 23 bit mantissa; don't need to care about denormals
  m_len_delta = kFp32ManLen - kFp16ManLen;
  bool need_round = false;
  // Exponent overflow/NaN converts to signed inf/NaN
  if (e_f > 0x8Fu) {  // 0x8Fu: 143
    e_ret = kFp16MaxExp - 1;
    m_ret = kFp16MaxMan;
  } else if (e_f <= 0x70u) {  // 0x70u: 112 = 127 - 15; exponent underflow converts to denormalized half or signed zero
    e_ret = 0;
    if (e_f >= 0x67) {  // 0x67: 103 = 127 - 24; denormal
      m_f = (m_f | kFp32ManHideBit);
      uint16_t shift_out = kFp32ManLen;
      uint64_t m_tmp = (static_cast<uint64_t>(m_f)) << (e_f - 0x67);
      need_round = IsRoundOne(m_tmp, shift_out);
      m_ret = static_cast<uint16_t>(m_tmp >> shift_out);
      if (need_round) {
        m_ret++;
      }
    } else if (e_f == 0x66 && m_f > 0) {  // 0x66: 102; denormal, 0 < f_v < min(denormal)
      m_ret = 1;
    } else {
      m_ret = 0;
    }
  } else {  // Regular case with no overflow or underflow
    e_ret = static_cast<int16_t>(e_f - 0x70u);
    need_round = IsRoundOne(m_f, static_cast<uint16_t>(m_len_delta));
    m_ret = static_cast<uint16_t>(m_f >> m_len_delta);
    if (need_round) {
      m_ret++;
    }
    if (m_ret & kFp16ManHideBit) {
      e_ret++;
    }
  }
  Fp16Normalize(e_ret, m_ret);
  val = FP16_CONSTRUCTOR(s_ret, static_cast<uint16_t>(e_ret), m_ret);
  return *this;
}
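// Worked example (added commentary): 1.0f has e_f = 127 and m_f = 0, so the
// regular branch yields e_ret = 127 - 112 = 15 and val = 0x3C00. The float
// 2^-24 has e_f = 103 = 0x67, so the denormal branch shifts the hidden-bit
// mantissa by 0 and produces the smallest fp16 subnormal, val = 0x0001.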
fp16_t &fp16_t::operator=(const int8_t &i_val) {
  uint16_t s_ret;
  uint16_t e_ret;
  uint16_t m_ret;
  s_ret = static_cast<uint16_t>(((static_cast<uint8_t>(i_val)) & 0x80) >> kDim7);
  m_ret = static_cast<uint16_t>(((static_cast<uint8_t>(i_val)) & kInt8Max));
  if (m_ret == 0) {
    e_ret = 0;
  } else {
    if (s_ret) {  // negative number(<0)
      m_ret = static_cast<uint16_t>(std::abs(i_val));  // complement
    }
    e_ret = kFp16ManLen;
    while ((m_ret & kFp16ManHideBit) == 0) {
      m_ret = m_ret << 1;
      e_ret = e_ret - 1;
    }
    e_ret = e_ret + kFp16ExpBias;
  }
  val = FP16_CONSTRUCTOR(s_ret, e_ret, m_ret);
  return *this;
}
fp16_t &fp16_t::operator=(const uint8_t &ui_val) {
  uint16_t s_ret;
  uint16_t e_ret;
  uint16_t m_ret;
  s_ret = 0;
  e_ret = 0;
  m_ret = ui_val;
  if (m_ret) {
    e_ret = kFp16ManLen;
    while ((m_ret & kFp16ManHideBit) == 0) {
      m_ret = m_ret << 1;
      e_ret = e_ret - 1;
    }
    e_ret = e_ret + kFp16ExpBias;
  }
  val = FP16_CONSTRUCTOR(s_ret, e_ret, m_ret);
  return *this;
}
static void SetValByUint16Val(const uint16_t &input_val, const uint16_t &sign, uint16_t &ret_val) {
  uint32_t m_tmp = (input_val & kFp32AbsMax);
  uint16_t m_min = kFp16ManHideBit;
  uint16_t m_max = m_min << 1;
  uint16_t len = static_cast<uint16_t>(GetManBitLength(m_tmp));
  if (m_tmp) {
    int16_t e_ret;
    if (len > kDim11) {
      e_ret = kFp16ExpBias + kFp16ManLen;
      uint16_t e_tmp = len - kDim11;
      uint32_t trunc_mask = 1;
      for (int i = 1; i < e_tmp; i++) {
        trunc_mask = (trunc_mask << 1) + 1;
      }
      uint32_t m_trunc = (m_tmp & trunc_mask) << (kBitShift32 - e_tmp);
      for (int i = 0; i < e_tmp; i++) {
        m_tmp = (m_tmp >> 1);
        e_ret = e_ret + 1;
      }
      bool b_last_bit = ((m_tmp & 1) > 0);
      bool b_trunc_high = false;
      bool b_trunc_left = false;
      if (kRoundToNearest == g_round_mode) {  // trunc
        b_trunc_high = ((m_trunc & kFp32SignMask) > 0);
        b_trunc_left = ((m_trunc & kFp32AbsMax) > 0);
      }
      m_tmp = ManRoundToNearest(b_last_bit, b_trunc_high, b_trunc_left, m_tmp);
      while (m_tmp >= m_max || e_ret < 0) {
        m_tmp = m_tmp >> 1;
        e_ret = e_ret + 1;
      }
    } else {
      e_ret = kFp16ExpBias;
      m_tmp = m_tmp << (kManBitLength - len);
      e_ret = e_ret + (len - 1);
    }
    auto m_ret = static_cast<uint16_t>(m_tmp);
    ret_val = FP16_CONSTRUCTOR(sign, static_cast<uint16_t>(e_ret), m_ret);
  }
}
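// Worked example (added commentary): for input_val = 4097 (13 significant
// bits), the two low bits must be truncated. m_trunc holds them left-aligned
// (binary 01 followed by 30 zeros), the kept mantissa is 0x400 with
// e_ret = 27, and since neither the guard bit nor the kept LSB forces a
// round-up, the result encodes 4096.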
fp16_t &fp16_t::operator=(const int16_t &i_val) {
  if (i_val == 0) {
    val = 0;
  } else {
    uint16_t ui_val = *(reinterpret_cast<const uint16_t *>(&i_val));
    auto s_ret = static_cast<uint16_t>(ui_val >> kBitShift15);
    if (s_ret) {
      int16_t iValM = -i_val;
      ui_val = *(reinterpret_cast<uint16_t *>(&iValM));
    }
    SetValByUint16Val(ui_val, s_ret, val);
  }
  return *this;
}
fp16_t &fp16_t::operator=(const uint16_t &ui_val) {
  if (ui_val == 0) {
    val = 0;
  } else {
    int16_t e_ret;
    uint16_t m_ret = ui_val;
    uint16_t m_min = kFp16ManHideBit;
    uint16_t m_max = m_min << 1;
    uint16_t len = static_cast<uint16_t>(GetManBitLength(m_ret));
    if (len > kManBitLength) {
      e_ret = kFp16ExpBias + kFp16ManLen;
      uint32_t m_trunc;
      uint32_t trunc_mask = 1;
      uint16_t e_tmp = len - kManBitLength;
      for (int i = 1; i < e_tmp; i++) {
        trunc_mask = (trunc_mask << 1) + 1;
      }
      m_trunc = (m_ret & trunc_mask) << (kBitShift32 - e_tmp);
      for (int i = 0; i < e_tmp; i++) {
        m_ret = (m_ret >> 1);
        e_ret = e_ret + 1;
      }
      bool b_last_bit = ((m_ret & 1) > 0);
      bool b_trunc_high = false;
      bool b_trunc_left = false;
      if (kRoundToNearest == g_round_mode) {  // trunc
        b_trunc_high = ((m_trunc & kFp32SignMask) > 0);
        b_trunc_left = ((m_trunc & kFp32AbsMax) > 0);
      }
      m_ret = ManRoundToNearest(b_last_bit, b_trunc_high, b_trunc_left, m_ret);
      while (m_ret >= m_max || e_ret < 0) {
        m_ret = m_ret >> 1;
        e_ret = e_ret + 1;
      }
      if (e_ret >= kFp16MaxExp) {
        e_ret = kFp16MaxExp - 1;
        m_ret = kFp16MaxMan;
      }
    } else {
      e_ret = kFp16ExpBias;
      m_ret = m_ret << (kDim11 - len);
      e_ret = e_ret + (len - 1);
    }
    val = FP16_CONSTRUCTOR(0u, static_cast<uint16_t>(e_ret), m_ret);
  }
  return *this;
}
static void SetValByUint32Val(const uint32_t &input_val, const uint16_t &sign, uint16_t &ret_val) {
  int16_t e_ret;
  uint32_t m_tmp = (input_val & kFp32AbsMax);
  uint32_t m_min = kFp16ManHideBit;
  uint32_t m_max = m_min << 1;
  uint16_t len = static_cast<uint16_t>(GetManBitLength(m_tmp));
  if (len > kDim11) {
    e_ret = kFp16ExpBias + kFp16ManLen;
    uint32_t m_trunc = 0;
    uint32_t trunc_mask = 1;
    uint16_t e_tmp = len - kDim11;
    for (int i = 1; i < e_tmp; i++) {
      trunc_mask = (trunc_mask << 1) + 1;
    }
    m_trunc = (m_tmp & trunc_mask) << (kBitShift32 - e_tmp);
    for (int i = 0; i < e_tmp; i++) {
      m_tmp = (m_tmp >> 1);
      e_ret = e_ret + 1;
    }
    bool b_last_bit = ((m_tmp & 1) > 0);
    bool b_trunc_high = false;
    bool b_trunc_left = false;
    if (kRoundToNearest == g_round_mode) {  // trunc
      b_trunc_high = ((m_trunc & kFp32SignMask) > 0);
      b_trunc_left = ((m_trunc & kFp32AbsMax) > 0);
    }
    m_tmp = ManRoundToNearest(b_last_bit, b_trunc_high, b_trunc_left, m_tmp);
    while (m_tmp >= m_max || e_ret < 0) {
      m_tmp = m_tmp >> 1;
      e_ret = e_ret + 1;
    }
    if (e_ret >= kFp16MaxExp) {
      e_ret = kFp16MaxExp - 1;
      m_tmp = kFp16MaxMan;
    }
  } else {
    e_ret = kFp16ExpBias;
    m_tmp = m_tmp << (kDim11 - len);
    e_ret = e_ret + (len - 1);
  }
  auto m_ret = static_cast<uint16_t>(m_tmp);
  ret_val = FP16_CONSTRUCTOR(sign, static_cast<uint16_t>(e_ret), m_ret);
}
fp16_t &fp16_t::operator=(const int32_t &i_val) {
  if (i_val == 0) {
    val = 0;
  } else {
    uint32_t ui_val = *(reinterpret_cast<const uint32_t *>(&i_val));
    auto s_ret = static_cast<uint16_t>(ui_val >> kBitShift31);
    if (s_ret) {
      int32_t iValM = -i_val;
      ui_val = *(reinterpret_cast<uint32_t *>(&iValM));
    }
    SetValByUint32Val(ui_val, s_ret, val);
  }
  return *this;
}
fp16_t &fp16_t::operator=(const uint32_t &ui_val) {
  if (ui_val == 0) {
    val = 0;
  } else {
    int16_t e_ret;
    uint32_t m_tmp = ui_val;
    uint32_t m_min = kFp16ManHideBit;
    uint32_t m_max = m_min << 1;
    uint16_t len = static_cast<uint16_t>(GetManBitLength(m_tmp));
    if (len > kDim11) {
      e_ret = kFp16ExpBias + kFp16ManLen;
      uint32_t m_trunc = 0;
      uint32_t trunc_mask = 1;
      uint16_t e_tmp = len - kDim11;
      for (int i = 1; i < e_tmp; i++) {
        trunc_mask = (trunc_mask << 1) + 1;
      }
      m_trunc = (m_tmp & trunc_mask) << static_cast<uint32_t>(kBitShift32 - e_tmp);
      for (uint16_t i = 0; i < e_tmp; i++) {
        m_tmp = (m_tmp >> 1);
        e_ret = e_ret + 1;
      }
      bool b_last_bit = ((m_tmp & 1) > 0);
      bool b_trunc_high = false;
      bool b_trunc_left = false;
      if (g_round_mode == kRoundToNearest) {  // trunc
        b_trunc_high = ((m_trunc & kFp32SignMask) > 0);
        b_trunc_left = ((m_trunc & kFp32AbsMax) > 0);
      }
      m_tmp = ManRoundToNearest(b_last_bit, b_trunc_high, b_trunc_left, m_tmp);
      while (m_tmp >= m_max || e_ret < 0) {
        m_tmp = m_tmp >> 1;
        e_ret = e_ret + 1;
      }
      if (e_ret >= kFp16MaxExp) {
        e_ret = kFp16MaxExp - 1;
        m_tmp = kFp16MaxMan;
      }
    } else {
      e_ret = kFp16ExpBias;
      m_tmp = m_tmp << (kDim11 - len);
      e_ret = e_ret + (len - 1);
    }
    auto m_ret = static_cast<uint16_t>(m_tmp);
    val = FP16_CONSTRUCTOR(0u, static_cast<uint16_t>(e_ret), m_ret);
  }
  return *this;
}
fp16_t &fp16_t::operator=(const double &d_val) {
  uint16_t s_ret;
  uint16_t m_ret;
  int16_t e_ret;
  uint64_t e_d;
  uint64_t m_d;
  uint64_t ui64_v = *(reinterpret_cast<const uint64_t *>(&d_val));  // 1:11:52bit sign:exp:man
  uint32_t m_len_delta;
  s_ret = static_cast<uint16_t>((ui64_v & kFp64SignMask) >> kFp64SignIndex);  // 8 Byte -> 2 Byte
  e_d = (ui64_v & kFp64ExpMask) >> kFp64ManLen;  // 11 bit exponent
  m_d = (ui64_v & kFp64ManMask);  // 52 bit mantissa
  m_len_delta = kFp64ManLen - kFp16ManLen;
  bool need_round = false;
  // Exponent overflow/NaN converts to signed inf/NaN
  if (e_d >= 0x410u) {  // 0x410u: 1040
    e_ret = kFp16MaxExp - 1;
    m_ret = kFp16MaxMan;
    val = FP16_CONSTRUCTOR(s_ret, static_cast<uint16_t>(e_ret), m_ret);
  } else if (e_d <= 0x3F0u) {  // Exponent underflow converts to denormalized half or signed zero
    // 0x3F0u: 1008 = 1023 - 15
    // Signed zeros, denormalized floats, and floats with small
    // exponents all convert to signed zero half precision.
    e_ret = 0;
    if (e_d >= 0x3E7u) {  // 0x3E7u: 999 = 1023 - 24; denormal
      // Underflows to a denormalized value
      m_d = (kFp64ManHideBit | m_d);
      uint16_t shift_out = kFp64ManLen;
      uint64_t m_tmp = (static_cast<uint64_t>(m_d)) << (e_d - 0x3E7u);
      need_round = IsRoundOne(m_tmp, shift_out);
      m_ret = static_cast<uint16_t>(m_tmp >> shift_out);
      if (need_round) {
        m_ret++;
      }
    } else if (e_d == 0x3E6u && m_d > 0) {
      m_ret = 1;
    } else {
      m_ret = 0;
    }
  } else {  // Regular case with no overflow or underflow
    e_ret = static_cast<int16_t>(e_d - 0x3F0u);
    need_round = IsRoundOne(m_d, m_len_delta);
    m_ret = static_cast<uint16_t>(m_d >> m_len_delta);
    if (need_round) {
      m_ret++;
    }
    if (m_ret & kFp16ManHideBit) {
      e_ret++;
    }
  }
  Fp16Normalize(e_ret, m_ret);
  val = FP16_CONSTRUCTOR(s_ret, static_cast<uint16_t>(e_ret), m_ret);
  return *this;
}
// convert
fp16_t::operator float() const {
  return Fp16ToFloat(val);
}
fp16_t::operator double() const {
  return Fp16ToDouble(val);
}
fp16_t::operator int8_t() const {
  return Fp16ToInt8(val);
}
fp16_t::operator uint8_t() const {
  return Fp16ToUInt8(val);
}
fp16_t::operator int16_t() const {
  return Fp16ToInt16(val);
}
fp16_t::operator uint16_t() const {
  return Fp16ToUInt16(val);
}
fp16_t::operator int32_t() const {
  return Fp16ToInt32(val);
}
fp16_t::operator uint32_t() const {
  return Fp16ToUInt32(val);
}
// Cannot be used; defined only to resolve a compile error
fp16_t::operator int64_t() const {
  return 0;
}
// Cannot be used; defined only to resolve a compile error
fp16_t::operator uint64_t() const {
  return 0;
}
int fp16_t::IsInf() {
  if ((val & kFp16AbsMax) == kFp16ExpMask) {
    if (val & kFp16SignMask) {
      return -1;
    } else {
      return 1;
    }
  } else {
    return 0;
  }
}
float fp16_t::ToFloat() const {
  return Fp16ToFloat(val);
}
double fp16_t::ToDouble() const {
  return Fp16ToDouble(val);
}
int8_t fp16_t::ToInt8() const {
  return Fp16ToInt8(val);
}
uint8_t fp16_t::ToUInt8() const {
  return Fp16ToUInt8(val);
}
int16_t fp16_t::ToInt16() const {
  return Fp16ToInt16(val);
}
uint16_t fp16_t::ToUInt16() const {
  return Fp16ToUInt16(val);
}
int32_t fp16_t::ToInt32() const {
  return Fp16ToInt32(val);
}
uint32_t fp16_t::ToUInt32() const {
  return Fp16ToUInt32(val);
}
}  // namespace ge
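A minimal usage sketch of the class defined above (not part of the original file; it assumes this translation unit is compiled together with common/fp16_t.h from the same repository and that fp16_t exposes the members shown in the listing):

#include <cstdio>
#include "common/fp16_t.h"

int main() {
  ge::fp16_t a;
  ge::fp16_t b;
  a = 1.5f;   // operator=(const float &): fp32 -> fp16, round-to-nearest
  b = 2.25f;  // exactly representable in fp16
  ge::fp16_t sum = a + b;               // Fp16Add under the hood
  std::printf("%f\n", sum.ToFloat());   // prints 3.750000
  std::printf("%d\n", sum.ToInt32());   // prints 4 (round-to-nearest)
  return 0;
}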

The Graph Engine (GE) module is a submodule of MindSpore. Implemented in C++, it sits between the front-end module ME and the underlying hardware, bridging the two. GE takes the graph delivered by ME as input, applies a series of deep graph-optimization passes, and outputs a graph that runs efficiently on the underlying hardware. GE performs optimizations tailored to the hardware architecture of the Ascend AI processor in order to fully exploit its compute power. During model training and inference, GE is invoked automatically and is transparent to the user. GE consists of two main parts, GE API and GE Core; the detailed architecture diagram is shown below.