fp16_t.cc 38 kB

/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "common/fp16_t.h"
#include "external/register/register_types.h"
namespace {
constexpr uint16_t kManBitLength = 11;
}
namespace ge {
/// @ingroup fp16_t global field
/// @brief rounding mode applied to the last valid digit
enum TagFp16RoundMode g_round_mode = kRoundToNearest;
void ExtractFp16(const uint16_t &val, uint16_t &s, int16_t &e, uint16_t &m) {
  // 1.Extract
  s = static_cast<uint16_t>(FP16_EXTRAC_SIGN(val));
  e = static_cast<int16_t>(FP16_EXTRAC_EXP(val));
  m = static_cast<uint16_t>(FP16_EXTRAC_MAN(val));
  // Denormal
  if (e == 0) {
    e = 1;
  }
}
/// @ingroup fp16_t static method
/// @param [in] man truncated mantissa
/// @param [in] trunc_len number of low-order bits to be truncated
/// @brief judge whether to add one to the result when converting fp16_t to another data type
/// @return Return true if one should be added, otherwise false
static bool IsRoundOne(uint64_t man, uint16_t trunc_len) {
  uint64_t mask0 = 0x4;
  uint64_t mask1 = 0x2;
  uint64_t mask2;
  uint16_t shift_out = static_cast<uint16_t>(trunc_len - kDim2);
  mask0 = mask0 << shift_out;
  mask1 = mask1 << shift_out;
  mask2 = mask1 - 1;
  bool last_bit = ((man & mask0) > 0);
  bool trunc_high = false;
  bool trunc_left = false;
  if (g_round_mode == kRoundToNearest) {
    trunc_high = ((man & mask1) > 0);
    trunc_left = ((man & mask2) > 0);
  }
  return (trunc_high && (trunc_left || last_bit));
}
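// A worked example of the round-to-nearest-even test above (added commentary,
// assuming kDim2 == 2): with trunc_len = 3 and man = 0b1100, mask0 = 0b1000
// selects the kept LSB, mask1 = 0b0100 the highest truncated (guard) bit and
// mask2 = 0b0011 the remaining (sticky) bits. The guard bit is set and the
// sticky bits are zero, so the value is an exact tie; the kept LSB is set
// (odd), so the function returns true and the caller rounds up to the even
// value 0b10.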
/// @ingroup fp16_t public method
/// @param [in,out] exp exponent of fp16_t value
/// @param [in,out] man mantissa of fp16_t value
/// @brief normalize fp16_t value
/// @return
static void Fp16Normalize(int16_t &exp, uint16_t &man) {
  // set to invalid data
  if (exp >= kFp16MaxExp) {
    exp = static_cast<int16_t>(kFp16MaxExp);
    man = static_cast<uint16_t>(kFp16MaxMan);
  } else if (exp == 0 && man == kFp16ManHideBit) {
    exp++;
    man = 0;
  }
}
/// @ingroup fp16_t math conversion static method
/// @param [in] fp_val uint16_t value of fp16_t object
/// @brief Convert fp16_t to float/fp32
/// @return Return float/fp32 value of fp_val, which is the value of the fp16_t object
static float Fp16ToFloat(const uint16_t &fp_val) {
  uint16_t hf_sign;
  uint16_t hf_man;
  int16_t hf_exp;
  ExtractFp16(fp_val, hf_sign, hf_exp, hf_man);
  while (hf_man && !(hf_man & kFp16ManHideBit)) {
    hf_man <<= 1;
    hf_exp--;
  }
  uint32_t e_ret;
  uint32_t m_ret;
  uint32_t s_ret = hf_sign;
  if (hf_man == 0) {
    e_ret = 0;
    m_ret = 0;
  } else {
    e_ret = hf_exp - kFp16ExpBias + kFp32ExpBias;
    m_ret = hf_man & kFp16ManMask;
    m_ret = m_ret << (kFp32ManLen - kFp16ManLen);
  }
  uint32_t f_val = FP32_CONSTRUCTOR(s_ret, e_ret, m_ret);
  auto p_ret_v = reinterpret_cast<float *>(&f_val);
  return *p_ret_v;
}
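// Example trace (added commentary, assuming FP16_EXTRAC_MAN attaches the
// hidden bit for normal numbers, which the normalization loop above relies
// on): fp_val = 0x3C00 (half 1.0) extracts s = 0, e = 15, m = 0x400; then
// e_ret = 15 - 15 + 127 = 127 and m_ret = 0, so f_val = 0x3F800000 = 1.0f.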
/// @ingroup fp16_t math conversion static method
/// @param [in] fp_val uint16_t value of fp16_t object
/// @brief Convert fp16_t to double/fp64
/// @return Return double/fp64 value of fp_val, which is the value of the fp16_t object
static double Fp16ToDouble(const uint16_t &fp_val) {
  uint16_t hf_sign;
  uint16_t hf_man;
  int16_t hf_exp;
  ExtractFp16(fp_val, hf_sign, hf_exp, hf_man);
  while (hf_man && !(hf_man & kFp16ManHideBit)) {
    hf_man <<= 1;
    hf_exp--;
  }
  uint64_t e_ret;
  uint64_t m_ret;
  uint64_t s_ret = hf_sign;
  if (!hf_man) {
    e_ret = 0;
    m_ret = 0;
  } else {
    e_ret = hf_exp - kFp16ExpBias + kFp64ExpBias;
    m_ret = hf_man & kFp16ManMask;
    m_ret = m_ret << (kFp64ManLen - kFp16ManLen);
  }
  uint64_t f_val = (s_ret << kFp64SignIndex) | (e_ret << kFp64ManLen) | (m_ret);
  auto p_ret_v = reinterpret_cast<double *>(&f_val);
  return *p_ret_v;
}
/// @ingroup fp16_t static method
/// @param [in] s_ret sign of fp16_t value
/// @param [in] long_int_m mantissa of the value, as uint64_t
/// @param [in] shift_out shift offset
/// @brief calculate uint8 value by sign, mantissa and shift offset
/// @return Return uint8 value of fp16_t object
static uint8_t GetUint8ValByMan(uint8_t s_ret, const uint64_t &long_int_m, const uint16_t &shift_out) {
  bool need_round = IsRoundOne(long_int_m, shift_out + kFp16ManLen);
  auto m_ret = static_cast<uint8_t>((long_int_m >> (kFp16ManLen + shift_out)) & kBitLen8Max);
  need_round = need_round && ((s_ret == 0 && m_ret < kInt8Max) || (s_ret == 1 && m_ret <= kInt8Max));
  if (need_round) {
    m_ret++;
  }
  if (s_ret) {
    m_ret = (~m_ret) + 1;
  }
  if (m_ret == 0) {
    s_ret = 0;
  }
  return static_cast<uint8_t>((s_ret << kBitShift7) | (m_ret));
}
/// @ingroup fp16_t math conversion static method
/// @param [in] fp_val uint16_t value of fp16_t object
/// @brief Convert fp16_t to int8_t
/// @return Return int8_t value of fp_val, which is the value of the fp16_t object
static int8_t Fp16ToInt8(const uint16_t &fp_val) {
  int8_t ret;
  uint8_t ret_v;
  // 1.get s_ret and shift it to bit0.
  uint8_t s_ret = FP16_EXTRAC_SIGN(fp_val);
  // 2.get hf_e and hf_m
  uint16_t hf_e = FP16_EXTRAC_EXP(fp_val);
  uint16_t hf_m = FP16_EXTRAC_MAN(fp_val);
  if (FP16_IS_DENORM(fp_val)) { // Denormalized number
    ret_v = 0;
    ret = *(reinterpret_cast<int8_t *>(&ret_v));
    return ret;
  }
  uint64_t long_int_m = hf_m;
  uint8_t overflow_flag = 0;
  uint16_t shift_out = 0;
  if (FP16_IS_INVALID(fp_val)) { // Inf or NaN
    overflow_flag = 1;
  } else {
    while (hf_e != kFp16ExpBias) {
      if (hf_e > kFp16ExpBias) {
        hf_e--;
        long_int_m = long_int_m << 1;
        if (s_ret == 1 && long_int_m >= 0x20000u) { // sign=1, negative number(<0)
          long_int_m = 0x20000u; // 10 0000 0000 0000 0000  10(fp16_t man)+7(int8)=17bit
          overflow_flag = 1;
          break;
        } else if (s_ret != 1 && long_int_m >= 0x1FFFFu) { // sign=0, positive number(>0)
          long_int_m = 0x1FFFFu; // 01 1111 1111 1111 1111  10(fp16_t man)+7(int8)
          overflow_flag = 1;
          break;
        }
      } else {
        hf_e++;
        shift_out++;
      }
    }
  }
  if (overflow_flag) {
    ret_v = kInt8Max + s_ret;
  } else {
    // Generate final result
    ret_v = GetUint8ValByMan(s_ret, long_int_m, shift_out);
  }
  ret = *(reinterpret_cast<int8_t *>(&ret_v));
  return ret;
}
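// Saturation note for Fp16ToInt8 above (added commentary): on overflow ret_v
// is kInt8Max + s_ret, i.e. 127 for positive values and 128 (the bit pattern
// of -128) for negative values, so out-of-range inputs clamp to the int8_t
// limits rather than wrapping arbitrarily.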
/// @ingroup fp16_t math conversion static method
/// @param [in] fp_val uint16_t value of fp16_t object
/// @brief Convert fp16_t to uint8_t
/// @return Return uint8_t value of fp_val, which is the value of the fp16_t object
static uint8_t Fp16ToUInt8(const uint16_t &fp_val) {
  uint8_t m_ret = 0;
  // 1.get s_ret and shift it to bit0.
  uint8_t s_ret = FP16_EXTRAC_SIGN(fp_val);
  // 2.get hf_e and hf_m
  uint16_t hf_e = FP16_EXTRAC_EXP(fp_val);
  uint16_t hf_m = FP16_EXTRAC_MAN(fp_val);
  if (FP16_IS_DENORM(fp_val)) { // Denormalized number
    return 0;
  }
  if (FP16_IS_INVALID(fp_val)) { // Inf or NaN
    m_ret = ~0;
  } else {
    uint64_t long_int_m = hf_m;
    uint8_t overflow_flag = 0;
    uint16_t shift_out = 0;
    while (hf_e != kFp16ExpBias) {
      if (hf_e > kFp16ExpBias) {
        hf_e--;
        long_int_m = long_int_m << 1;
        if (long_int_m >= 0x40000Lu) { // overflow 0100 0000 0000 0000 0000
          long_int_m = 0x3FFFFLu; // 11 1111 1111 1111 1111  10(fp16_t man)+8(uint8)=18bit
          overflow_flag = 1;
          m_ret = ~0;
          break;
        }
      } else {
        hf_e++;
        shift_out++;
      }
    }
    if (!overflow_flag) {
      bool need_round = IsRoundOne(long_int_m, shift_out + kFp16ManLen);
      m_ret = static_cast<uint8_t>((long_int_m >> (kFp16ManLen + shift_out)) & kBitLen8Max);
      if (need_round && m_ret != kBitLen8Max) {
        m_ret++;
      }
    }
  }
  if (s_ret == 1) { // Negative number
    m_ret = 0;
  }
  // m_ret equals the final result
  return m_ret;
}
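// Note on the unsigned conversions (added commentary): any negative input
// (s_ret == 1) is clamped to 0 at the end of Fp16ToUInt8 above, and Inf/NaN
// saturates to all-ones via m_ret = ~0; the same pattern recurs in
// Fp16ToUInt16 and Fp16ToUInt32 below.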
/// @ingroup fp16_t static method
/// @param [in] s_ret sign of fp16_t value
/// @param [in] long_int_m mantissa of the value, as uint64_t
/// @param [in] shift_out shift offset
/// @brief calculate uint16 value by sign, mantissa and shift offset
/// @return Return uint16 value of fp16_t object
static uint16_t GetUint16ValByMan(uint16_t s_ret, const uint64_t &long_int_m, const uint16_t &shift_out) {
  bool need_round = IsRoundOne(long_int_m, shift_out + kFp16ManLen);
  auto m_ret = static_cast<uint16_t>((long_int_m >> (kFp16ManLen + shift_out)) & kBitLen16Max);
  if (need_round && m_ret < kInt16Max) {
    m_ret++;
  }
  if (s_ret) {
    m_ret = (~m_ret) + 1;
  }
  if (m_ret == 0) {
    s_ret = 0;
  }
  return static_cast<uint16_t>((s_ret << kBitShift15) | (m_ret));
}
/// @ingroup fp16_t math conversion static method
/// @param [in] fp_val uint16_t value of fp16_t object
/// @brief Convert fp16_t to int16_t
/// @return Return int16_t value of fp_val, which is the value of the fp16_t object
static int16_t Fp16ToInt16(const uint16_t &fp_val) {
  int16_t ret;
  uint16_t ret_v;
  // 1.get s_ret and shift it to bit0.
  uint16_t s_ret = FP16_EXTRAC_SIGN(fp_val);
  // 2.get hf_e and hf_m
  uint16_t hf_e = FP16_EXTRAC_EXP(fp_val);
  uint16_t hf_m = FP16_EXTRAC_MAN(fp_val);
  if (FP16_IS_DENORM(fp_val)) { // Denormalized number
    ret_v = 0;
    ret = *(reinterpret_cast<int16_t *>(&ret_v));
    return ret;
  }
  uint64_t long_int_m = hf_m;
  uint8_t overflow_flag = 0;
  uint16_t shift_out = 0;
  if (FP16_IS_INVALID(fp_val)) { // Inf or NaN
    overflow_flag = 1;
  } else {
    while (hf_e != kFp16ExpBias) {
      if (hf_e > kFp16ExpBias) {
        hf_e--;
        long_int_m = long_int_m << 1;
        if (s_ret == 1 && long_int_m > 0x2000000Lu) { // sign=1, negative number(<0)
          long_int_m = 0x2000000Lu; // 10(fp16_t man)+15(int16)=25bit
          overflow_flag = 1;
          break;
        } else if (s_ret != 1 && long_int_m >= 0x1FFFFFFLu) { // sign=0, positive number(>0) Overflow
          long_int_m = 0x1FFFFFFLu; // 10(fp16_t man)+15(int16)=25bit
          overflow_flag = 1;
          break;
        }
      } else {
        hf_e++;
        shift_out++;
      }
    }
  }
  if (overflow_flag) {
    ret_v = kInt16Max + s_ret;
  } else {
    // Generate final result
    ret_v = GetUint16ValByMan(s_ret, long_int_m, shift_out);
  }
  ret = *(reinterpret_cast<int16_t *>(&ret_v));
  return ret;
}
/// @ingroup fp16_t math conversion static method
/// @param [in] fp_val uint16_t value of fp16_t object
/// @brief Convert fp16_t to uint16_t
/// @return Return uint16_t value of fp_val, which is the value of the fp16_t object
static uint16_t Fp16ToUInt16(const uint16_t &fp_val) {
  uint16_t m_ret = 0;
  // 1.get s_ret and shift it to bit0.
  uint16_t s_ret = FP16_EXTRAC_SIGN(fp_val);
  // 2.get hf_e and hf_m
  uint16_t hf_e = FP16_EXTRAC_EXP(fp_val);
  uint16_t hf_m = FP16_EXTRAC_MAN(fp_val);
  if (FP16_IS_DENORM(fp_val)) { // Denormalized number
    return 0;
  }
  if (FP16_IS_INVALID(fp_val)) { // Inf or NaN
    m_ret = ~0;
  } else {
    uint64_t long_int_m = hf_m;
    uint16_t shift_out = 0;
    while (hf_e != kFp16ExpBias) {
      if (hf_e > kFp16ExpBias) {
        hf_e--;
        long_int_m = long_int_m << 1;
      } else {
        hf_e++;
        shift_out++;
      }
    }
    bool need_round = IsRoundOne(long_int_m, shift_out + kFp16ManLen);
    m_ret = static_cast<uint16_t>((long_int_m >> (kFp16ManLen + shift_out)) & kBitLen16Max);
    if (need_round && m_ret != kBitLen16Max) {
      m_ret++;
    }
  }
  if (s_ret == 1) { // Negative number
    m_ret = 0;
  }
  // m_ret equals the final result
  return m_ret;
}
/// @ingroup fp16_t math conversion static method
/// @param [in] fp_val uint16_t value of fp16_t object
/// @brief Convert fp16_t to int32_t
/// @return Return int32_t value of fp_val, which is the value of the fp16_t object
static int32_t Fp16ToInt32(const uint16_t &fp_val) {
  uint32_t ret_v;
  // 1.get s_ret and shift it to bit0.
  uint32_t s_ret = FP16_EXTRAC_SIGN(fp_val);
  // 2.get hf_e and hf_m
  uint16_t hf_e = FP16_EXTRAC_EXP(fp_val);
  uint16_t hf_m = FP16_EXTRAC_MAN(fp_val);
  if (FP16_IS_INVALID(fp_val)) { // Inf or NaN
    ret_v = kInt32Max + s_ret;
  } else {
    uint64_t long_int_m = hf_m;
    uint16_t shift_out = 0;
    while (hf_e != kFp16ExpBias) {
      if (hf_e > kFp16ExpBias) {
        hf_e--;
        long_int_m = long_int_m << 1;
      } else {
        hf_e++;
        shift_out++;
      }
    }
    bool need_round = IsRoundOne(long_int_m, shift_out + kFp16ManLen);
    auto m_ret = static_cast<uint32_t>((long_int_m >> (kFp16ManLen + shift_out)) & kBitLen32Max);
    if (need_round && m_ret < kInt32Max) {
      m_ret++;
    }
    if (s_ret == 1) {
      m_ret = (~m_ret) + 1;
    }
    if (m_ret == 0) {
      s_ret = 0;
    }
    // Generate final result
    ret_v = (s_ret << kBitShift31) | (m_ret);
  }
  return *(reinterpret_cast<int32_t *>(&ret_v));
}
/// @ingroup fp16_t math conversion static method
/// @param [in] fp_val uint16_t value of fp16_t object
/// @brief Convert fp16_t to uint32_t
/// @return Return uint32_t value of fp_val, which is the value of the fp16_t object
static uint32_t Fp16ToUInt32(const uint16_t &fp_val) {
  uint32_t m_ret;
  // 1.get s_ret and shift it to bit0.
  uint32_t s_ret = FP16_EXTRAC_SIGN(fp_val);
  // 2.get hf_e and hf_m
  uint16_t hf_e = FP16_EXTRAC_EXP(fp_val);
  uint16_t hf_m = FP16_EXTRAC_MAN(fp_val);
  if (FP16_IS_DENORM(fp_val)) { // Denormalized number
    return 0u;
  }
  if (FP16_IS_INVALID(fp_val)) { // Inf or NaN
    m_ret = ~0u;
  } else {
    uint64_t long_int_m = hf_m;
    uint16_t shift_out = 0;
    while (hf_e != kFp16ExpBias) {
      if (hf_e > kFp16ExpBias) {
        hf_e--;
        long_int_m = long_int_m << 1;
      } else {
        hf_e++;
        shift_out++;
      }
    }
    bool need_round = IsRoundOne(long_int_m, shift_out + kFp16ManLen);
    m_ret = static_cast<uint32_t>(long_int_m >> (kFp16ManLen + shift_out)) & kBitLen32Max;
    if (need_round && m_ret != kBitLen32Max) {
      m_ret++;
    }
  }
  if (s_ret == 1) { // Negative number
    m_ret = 0;
  }
  // m_ret equals the final result
  return m_ret;
}
static uint16_t Fp16AddCalVal(uint16_t &s_ret, int16_t e_ret, uint16_t m_ret, uint32_t m_trunc, uint16_t shift_out) {
  uint16_t m_min = kFp16ManHideBit << shift_out;
  uint16_t m_max = m_min << 1;
  // Denormal
  while (m_ret < m_min && e_ret > 0) { // m_ret should not drop below the (shifted) hidden bit
    m_ret = m_ret << 1;
    m_ret += (kFp32SignMask & m_trunc) >> kFp32SignIndex;
    m_trunc = m_trunc << 1;
    e_ret = e_ret - 1;
  }
  while (m_ret >= m_max) { // m_ret should stay below twice the (shifted) hidden bit
    m_trunc = m_trunc >> 1;
    m_trunc = m_trunc | (kFp32SignMask * (m_ret & 1));
    m_ret = m_ret >> 1;
    e_ret = e_ret + 1;
  }
  bool b_last_bit = ((m_ret & 1) > 0);
  bool b_trunc_high = false;
  bool b_trunc_left = false;
  b_trunc_high = (kRoundToNearest == g_round_mode) && ((m_trunc & kFp32SignMask) > 0);
  b_trunc_left = (kRoundToNearest == g_round_mode) && ((m_trunc & kFp32AbsMax) > 0);
  m_ret = ManRoundToNearest(b_last_bit, b_trunc_high, b_trunc_left, m_ret, shift_out);
  while (m_ret >= m_max) {
    m_ret = m_ret >> 1;
    e_ret = e_ret + 1;
  }
  if (e_ret == 0 && m_ret <= m_max) {
    m_ret = m_ret >> 1;
  }
  Fp16Normalize(e_ret, m_ret);
  uint16_t ret = FP16_CONSTRUCTOR(s_ret, static_cast<uint16_t>(e_ret), m_ret);
  return ret;
}
/// @ingroup fp16_t math operator
/// @param [in] v_1 left operand, value of an fp16_t object
/// @param [in] v_2 right operand, value of an fp16_t object
/// @brief Performing fp16_t addition
/// @return Return fp16_t result of adding this and fp
static uint16_t Fp16Add(uint16_t v_1, uint16_t v_2) {
  uint16_t s_a;
  uint16_t s_b;
  int16_t e_a;
  int16_t e_b;
  uint32_t m_a;
  uint32_t m_b;
  uint16_t m_a_tmp;
  uint16_t m_b_tmp;
  uint16_t shift_out = 0;
  // 1.Extract
  ExtractFp16(v_1, s_a, e_a, m_a_tmp);
  ExtractFp16(v_2, s_b, e_b, m_b_tmp);
  m_a = m_a_tmp;
  m_b = m_b_tmp;
  uint16_t sum;
  uint16_t s_ret;
  if (s_a != s_b) {
    ReverseMan(s_a > 0, m_a);
    ReverseMan(s_b > 0, m_b);
    sum = static_cast<uint16_t>(GetManSum(e_a, m_a, e_b, m_b));
    s_ret = (sum & kFp16SignMask) >> kFp16SignIndex;
    ReverseMan(s_ret > 0, m_a);
    ReverseMan(s_ret > 0, m_b);
  } else {
    sum = static_cast<uint16_t>(GetManSum(e_a, m_a, e_b, m_b));
    s_ret = s_a;
  }
  if (sum == 0) {
    shift_out = 3; // shift to left 3 bits
    m_a = m_a << shift_out;
    m_b = m_b << shift_out;
  }
  uint32_t m_trunc = 0;
  int16_t e_ret = std::max(e_a, e_b);
  int16_t e_tmp = std::abs(e_a - e_b);
  if (e_a > e_b) {
    m_trunc = (m_b << (kBitShift32 - static_cast<uint16_t>(e_tmp)));
    m_b = RightShift(m_b, e_tmp);
  } else if (e_a < e_b) {
    m_trunc = (m_a << (kBitShift32 - static_cast<uint16_t>(e_tmp)));
    m_a = RightShift(m_a, e_tmp);
  }
  // calculate mantissa
  auto m_ret = static_cast<uint16_t>(m_a + m_b);
  return Fp16AddCalVal(s_ret, e_ret, m_ret, m_trunc, shift_out);
}
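// Expected behaviour of Fp16Add above, as a concrete example (added
// commentary): Fp16Add(0x3E00, 0x4080) should return 0x4380, i.e.
// 1.5 + 2.25 = 3.75, all three values being exactly representable in half
// precision (sign 0, biased exponents 15/16/16, mantissa fields
// 0x200/0x080/0x380).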
/// @ingroup fp16_t math operator
/// @param [in] v_1 left operand, value of an fp16_t object
/// @param [in] v_2 right operand, value of an fp16_t object
/// @brief Performing fp16_t subtraction
/// @return Return fp16_t result of subtracting fp from this
static uint16_t Fp16Sub(uint16_t v_1, uint16_t v_2) {
  // Reverse the sign of the subtrahend
  uint16_t tmp = ((~(v_2)) & kFp16SignMask) | (v_2 & kFp16AbsMax);
  return Fp16Add(v_1, tmp);
}
/// @ingroup fp16_t math operator
/// @param [in] v_1 left operand, value of an fp16_t object
/// @param [in] v_2 right operand, value of an fp16_t object
/// @brief Performing fp16_t multiplication
/// @return Return fp16_t result of multiplying this and fp
static uint16_t Fp16Mul(uint16_t v_1, uint16_t v_2) {
  uint16_t s_a;
  uint16_t s_b;
  int16_t e_a;
  int16_t e_b;
  uint32_t m_a;
  uint32_t m_b;
  uint16_t s_ret;
  uint16_t m_ret;
  int16_t e_ret;
  uint32_t mul_m;
  uint16_t m_a_tmp;
  uint16_t m_b_tmp;
  // 1.Extract
  ExtractFp16(v_1, s_a, e_a, m_a_tmp);
  ExtractFp16(v_2, s_b, e_b, m_b_tmp);
  m_a = m_a_tmp;
  m_b = m_b_tmp;
  e_ret = e_a + e_b - kFp16ExpBias - kDim10;
  mul_m = m_a * m_b;
  s_ret = s_a ^ s_b;
  uint32_t m_min = kFp16ManHideBit;
  uint32_t m_max = m_min << 1;
  uint32_t m_trunc = 0;
  // mul_m should not drop below the hidden bit
  while (mul_m < m_min && e_ret > 1) {
    mul_m = mul_m << 1;
    e_ret = e_ret - 1;
  }
  while (mul_m >= m_max || e_ret < 1) {
    m_trunc = m_trunc >> 1;
    m_trunc = m_trunc | (kFp32SignMask * (mul_m & 1));
    mul_m = mul_m >> 1;
    e_ret = e_ret + 1;
  }
  bool b_last_bit = ((mul_m & 1) > 0);
  bool b_trunc_high = false;
  bool b_trunc_left = false;
  b_trunc_high = (kRoundToNearest == g_round_mode) && ((m_trunc & kFp32SignMask) > 0);
  b_trunc_left = (kRoundToNearest == g_round_mode) && ((m_trunc & kFp32AbsMax) > 0);
  mul_m = ManRoundToNearest(b_last_bit, b_trunc_high, b_trunc_left, mul_m);
  while (mul_m >= m_max || e_ret < 0) {
    mul_m = mul_m >> 1;
    e_ret = e_ret + 1;
  }
  if (e_ret == 1 && mul_m < kFp16ManHideBit) {
    e_ret = 0;
  }
  m_ret = static_cast<uint16_t>(mul_m);
  Fp16Normalize(e_ret, m_ret);
  uint16_t ret = FP16_CONSTRUCTOR(s_ret, static_cast<uint16_t>(e_ret), m_ret);
  return ret;
}
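// Exponent bookkeeping in Fp16Mul above, traced on an example (added
// commentary, assuming kDim10 == 10): for 2.0 * 3.0 the extracted biased
// exponents are both 16 and the mantissas (with hidden bit) 0x400 and 0x600,
// so e_ret starts at 16 + 16 - 15 - 10 = 7 and mul_m = 0x180000; the second
// while loop shifts mul_m back below 0x800 while raising e_ret to 17,
// giving 0x4600 = 6.0.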
/// @ingroup fp16_t math operator
/// @param [in] v_1 left operand, value of an fp16_t object
/// @param [in] v_2 right operand, value of an fp16_t object
/// @brief Performing fp16_t division
/// @return Return fp16_t result of dividing this by fp
static uint16_t Fp16Div(uint16_t v_1, uint16_t v_2) {
  uint16_t ret;
  if (FP16_IS_ZERO(v_2)) { // result is inf
    // throw "fp16_t division by zero.";
    uint16_t s_a, s_b;
    uint16_t s_ret;
    s_a = FP16_EXTRAC_SIGN(v_1);
    s_b = FP16_EXTRAC_SIGN(v_2);
    s_ret = s_a ^ s_b;
    ret = FP16_CONSTRUCTOR(s_ret, kFp16MaxExp, 0u);
  } else if (FP16_IS_ZERO(v_1)) {
    ret = 0u;
  } else {
    uint16_t s_a;
    uint16_t s_b;
    int16_t e_a;
    int16_t e_b;
    uint64_t m_a;
    uint64_t m_b;
    float m_div;
    uint16_t m_a_tmp;
    uint16_t m_b_tmp;
    // 1.Extract
    ExtractFp16(v_1, s_a, e_a, m_a_tmp);
    ExtractFp16(v_2, s_b, e_b, m_b_tmp);
    m_a = m_a_tmp;
    m_b = m_b_tmp;
    uint64_t m_tmp;
    if (e_a > e_b) {
      m_tmp = m_a;
      uint16_t tmp;
      tmp = e_a - e_b;
      for (int i = 0; i < tmp; i++) {
        m_tmp = m_tmp << 1;
      }
      m_a = m_tmp;
    } else if (e_a < e_b) {
      m_tmp = m_b;
      uint16_t tmp = e_b - e_a;
      for (int i = 0; i < tmp; i++) {
        m_tmp = m_tmp << 1;
      }
      m_b = m_tmp;
    }
    m_div = static_cast<float>(m_a * 1.0f / m_b);
    fp16_t fp_div;
    fp_div = m_div;
    ret = fp_div.val;
    if (s_a != s_b) {
      ret |= kFp16SignMask;
    }
  }
  return ret;
}
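// Division-by-zero behaviour above (added commentary, assuming kFp16MaxExp is
// the all-ones exponent field): FP16_CONSTRUCTOR(s_ret, kFp16MaxExp, 0u)
// encodes a signed infinity, 0x7C00 or 0xFC00, rather than raising an error;
// the commented-out throw documents the alternative that was rejected.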
// operate
fp16_t fp16_t::operator+(const fp16_t fp) {
  uint16_t ret_val = Fp16Add(val, fp.val);
  fp16_t ret(ret_val);
  return ret;
}
fp16_t fp16_t::operator-(const fp16_t fp) {
  uint16_t ret_val = Fp16Sub(val, fp.val);
  fp16_t ret(ret_val);
  return ret;
}
fp16_t fp16_t::operator*(const fp16_t fp) {
  uint16_t ret_val = Fp16Mul(val, fp.val);
  fp16_t ret(ret_val);
  return ret;
}
fp16_t fp16_t::operator/(const fp16_t fp) {
  uint16_t ret_val = Fp16Div(val, fp.val);
  fp16_t ret(ret_val);
  return ret;
}
fp16_t fp16_t::operator+=(const fp16_t fp) {
  val = Fp16Add(val, fp.val);
  return *this;
}
fp16_t fp16_t::operator-=(const fp16_t fp) {
  val = Fp16Sub(val, fp.val);
  return *this;
}
fp16_t fp16_t::operator*=(const fp16_t fp) {
  val = Fp16Mul(val, fp.val);
  return *this;
}
fp16_t fp16_t::operator/=(const fp16_t fp) {
  val = Fp16Div(val, fp.val);
  return *this;
}
// compare
bool fp16_t::operator==(const fp16_t &fp) const {
  bool result = true;
  if (FP16_IS_ZERO(val) && FP16_IS_ZERO(fp.val)) {
    result = true;
  } else {
    result = ((val & kBitLen16Max) == (fp.val & kBitLen16Max)); // bit compare
  }
  return result;
}
bool fp16_t::operator!=(const fp16_t &fp) const {
  bool result = true;
  if (FP16_IS_ZERO(val) && FP16_IS_ZERO(fp.val)) {
    result = false;
  } else {
    result = ((val & kBitLen16Max) != (fp.val & kBitLen16Max)); // bit compare
  }
  return result;
}
bool fp16_t::operator>(const fp16_t &fp) const {
  uint16_t s_a;
  uint16_t s_b;
  uint16_t e_a;
  uint16_t e_b;
  uint16_t m_a;
  uint16_t m_b;
  bool result = true;
  // 1.Extract
  s_a = FP16_EXTRAC_SIGN(val);
  s_b = FP16_EXTRAC_SIGN(fp.val);
  e_a = FP16_EXTRAC_EXP(val);
  e_b = FP16_EXTRAC_EXP(fp.val);
  m_a = FP16_EXTRAC_MAN(val);
  m_b = FP16_EXTRAC_MAN(fp.val);
  // Compare
  if ((s_a == 0) && (s_b > 0)) { // + -
    // -0=0
    result = !(FP16_IS_ZERO(val) && FP16_IS_ZERO(fp.val));
  } else if ((s_a == 0) && (s_b == 0)) { // + +
    if (e_a > e_b) { // e_a - e_b >= 1; Va always larger than Vb
      result = true;
    } else if (e_a == e_b) {
      result = m_a > m_b;
    } else {
      result = false;
    }
  } else if ((s_a > 0) && (s_b > 0)) { // - - opposite to + +
    if (e_a < e_b) {
      result = true;
    } else if (e_a == e_b) {
      result = m_a < m_b;
    } else {
      result = false;
    }
  } else { // - +
    result = false;
  }
  return result;
}
bool fp16_t::operator>=(const fp16_t &fp) const {
  bool result = true;
  if ((*this) > fp) {
    result = true;
  } else if ((*this) == fp) {
    result = true;
  } else {
    result = false;
  }
  return result;
}
bool fp16_t::operator<(const fp16_t &fp) const {
  bool result = true;
  if ((*this) >= fp) {
    result = false;
  } else {
    result = true;
  }
  return result;
}
bool fp16_t::operator<=(const fp16_t &fp) const {
  bool result = true;
  if ((*this) > fp) {
    result = false;
  } else {
    result = true;
  }
  return result;
}
// evaluation
fp16_t &fp16_t::operator=(const fp16_t &fp) {
  if (&fp == this) {
    return *this;
  }
  val = fp.val;
  return *this;
}
fp16_t &fp16_t::operator=(const float &f_val) {
  uint16_t s_ret;
  uint16_t m_ret;
  int16_t e_ret;
  uint32_t e_f;
  uint32_t m_f;
  const uint32_t ui32_v = *(reinterpret_cast<const uint32_t *>(&f_val)); // 1:8:23bit sign:exp:man
  uint32_t m_len_delta;
  s_ret = static_cast<uint16_t>((ui32_v & kFp32SignMask) >> kFp32SignIndex); // 4Byte->2Byte
  e_f = (ui32_v & kFp32ExpMask) >> kFp32ManLen; // 8 bit exponent
  m_f = (ui32_v & kFp32ManMask); // 23 bit mantissa, no need to care about denormals
  m_len_delta = kFp32ManLen - kFp16ManLen;
  bool need_round = false;
  // Exponent overflow/NaN converts to signed inf/NaN
  if (e_f > 0x8Fu) { // 0x8Fu: 143; the largest half-precision biased exponent in fp32 terms is 142 = 127+15
    e_ret = kFp16MaxExp - 1;
    m_ret = kFp16MaxMan;
  } else if (e_f <= 0x70u) { // 0x70u: 112 = 127-15; exponent underflow converts to denormalized half or signed zero
    e_ret = 0;
    if (e_f >= 0x67) { // 0x67: 103 = 127-24 Denormal
      m_f = (m_f | kFp32ManHideBit);
      uint16_t shift_out = kFp32ManLen;
      uint64_t m_tmp = (static_cast<uint64_t>(m_f)) << (e_f - 0x67);
      need_round = IsRoundOne(m_tmp, shift_out);
      m_ret = static_cast<uint16_t>(m_tmp >> shift_out);
      if (need_round) {
        m_ret++;
      }
    } else if (e_f == 0x66 && m_f > 0) { // 0x66: 102; denormal, 0 < f_v < min(denormal)
      m_ret = 1;
    } else {
      m_ret = 0;
    }
  } else { // Regular case with no overflow or underflow
    e_ret = static_cast<int16_t>(e_f - 0x70u);
    need_round = IsRoundOne(m_f, static_cast<uint16_t>(m_len_delta));
    m_ret = static_cast<uint16_t>(m_f >> m_len_delta);
    if (need_round) {
      m_ret++;
    }
    if (m_ret & kFp16ManHideBit) {
      e_ret++;
    }
  }
  Fp16Normalize(e_ret, m_ret);
  val = FP16_CONSTRUCTOR(s_ret, static_cast<uint16_t>(e_ret), m_ret);
  return *this;
}
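// Example for the regular path above (added commentary): 0.5f has biased
// fp32 exponent e_f = 126, which is neither above 0x8F nor at or below 0x70,
// so e_ret = 126 - 0x70 = 14 and the 23-bit mantissa (here zero) is shifted
// right by 13 bits, yielding val = 0x3800, the half-precision encoding of 0.5.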
fp16_t &fp16_t::operator=(const int8_t &i_val) {
  uint16_t s_ret, e_ret, m_ret;
  s_ret = static_cast<uint16_t>(((static_cast<uint8_t>(i_val)) & 0x80) >> kDim7);
  m_ret = static_cast<uint16_t>(((static_cast<uint8_t>(i_val)) & kInt8Max));
  if (m_ret == 0) {
    e_ret = 0;
  } else {
    if (s_ret) { // negative number(<0)
      m_ret = static_cast<uint16_t>(std::abs(i_val)); // complement
    }
    e_ret = kFp16ManLen;
    while ((m_ret & kFp16ManHideBit) == 0) {
      m_ret = m_ret << 1;
      e_ret = e_ret - 1;
    }
    e_ret = e_ret + kFp16ExpBias;
  }
  val = FP16_CONSTRUCTOR(s_ret, e_ret, m_ret);
  return *this;
}
fp16_t &fp16_t::operator=(const uint8_t &ui_val) {
  uint16_t s_ret, e_ret, m_ret;
  s_ret = 0;
  e_ret = 0;
  m_ret = ui_val;
  if (m_ret) {
    e_ret = kFp16ManLen;
    while ((m_ret & kFp16ManHideBit) == 0) {
      m_ret = m_ret << 1;
      e_ret = e_ret - 1;
    }
    e_ret = e_ret + kFp16ExpBias;
  }
  val = FP16_CONSTRUCTOR(s_ret, e_ret, m_ret);
  return *this;
}
static void SetValByUint16Val(const uint16_t &input_val, const uint16_t &sign, uint16_t &ret_val) {
  uint32_t m_tmp = (input_val & kFp32AbsMax);
  uint16_t m_min = kFp16ManHideBit;
  uint16_t m_max = m_min << 1;
  uint16_t len = static_cast<uint16_t>(GetManBitLength(m_tmp));
  if (m_tmp) {
    int16_t e_ret;
    if (len > kDim11) {
      e_ret = kFp16ExpBias + kFp16ManLen;
      uint16_t e_tmp = len - kDim11;
      uint32_t trunc_mask = 1;
      for (int i = 1; i < e_tmp; i++) {
        trunc_mask = (trunc_mask << 1) + 1;
      }
      uint32_t m_trunc = (m_tmp & trunc_mask) << (kBitShift32 - e_tmp);
      for (int i = 0; i < e_tmp; i++) {
        m_tmp = (m_tmp >> 1);
        e_ret = e_ret + 1;
      }
      bool b_last_bit = ((m_tmp & 1) > 0);
      bool b_trunc_high = false;
      bool b_trunc_left = false;
      if (kRoundToNearest == g_round_mode) { // trunc
        b_trunc_high = ((m_trunc & kFp32SignMask) > 0);
        b_trunc_left = ((m_trunc & kFp32AbsMax) > 0);
      }
      m_tmp = ManRoundToNearest(b_last_bit, b_trunc_high, b_trunc_left, m_tmp);
      while (m_tmp >= m_max || e_ret < 0) {
        m_tmp = m_tmp >> 1;
        e_ret = e_ret + 1;
      }
    } else {
      e_ret = kFp16ExpBias;
      m_tmp = m_tmp << (kManBitLength - len);
      e_ret = e_ret + (len - 1);
    }
    auto m_ret = static_cast<uint16_t>(m_tmp);
    ret_val = FP16_CONSTRUCTOR(sign, static_cast<uint16_t>(e_ret), m_ret);
  }
}
fp16_t &fp16_t::operator=(const int16_t &i_val) {
  if (i_val == 0) {
    val = 0;
  } else {
    uint16_t ui_val = *(reinterpret_cast<const uint16_t *>(&i_val));
    auto s_ret = static_cast<uint16_t>(ui_val >> kBitShift15);
    if (s_ret) {
      int16_t iValM = -i_val;
      ui_val = *(reinterpret_cast<uint16_t *>(&iValM));
    }
    SetValByUint16Val(ui_val, s_ret, val);
  }
  return *this;
}
fp16_t &fp16_t::operator=(const uint16_t &ui_val) {
  if (ui_val == 0) {
    val = 0;
  } else {
    int16_t e_ret;
    uint16_t m_ret = ui_val;
    uint16_t m_min = kFp16ManHideBit;
    uint16_t m_max = m_min << 1;
    uint16_t len = static_cast<uint16_t>(GetManBitLength(m_ret));
    if (len > kManBitLength) {
      e_ret = kFp16ExpBias + kFp16ManLen;
      uint32_t m_trunc;
      uint32_t trunc_mask = 1;
      uint16_t e_tmp = len - kManBitLength;
      for (int i = 1; i < e_tmp; i++) {
        trunc_mask = (trunc_mask << 1) + 1;
      }
      m_trunc = (m_ret & trunc_mask) << (kBitShift32 - e_tmp);
      for (int i = 0; i < e_tmp; i++) {
        m_ret = (m_ret >> 1);
        e_ret = e_ret + 1;
      }
      bool b_last_bit = ((m_ret & 1) > 0);
      bool b_trunc_high = false;
      bool b_trunc_left = false;
      if (kRoundToNearest == g_round_mode) { // trunc
        b_trunc_high = ((m_trunc & kFp32SignMask) > 0);
        b_trunc_left = ((m_trunc & kFp32AbsMax) > 0);
      }
      m_ret = ManRoundToNearest(b_last_bit, b_trunc_high, b_trunc_left, m_ret);
      while (m_ret >= m_max || e_ret < 0) {
        m_ret = m_ret >> 1;
        e_ret = e_ret + 1;
      }
      if (FP16_IS_INVALID(val)) {
        val = kFp16Max;
      }
    } else {
      e_ret = kFp16ExpBias;
      m_ret = m_ret << (kDim11 - len);
      e_ret = e_ret + (len - 1);
    }
    val = FP16_CONSTRUCTOR(0u, static_cast<uint16_t>(e_ret), m_ret);
  }
  return *this;
}
static void SetValByUint32Val(const uint32_t &input_val, const uint16_t &sign, uint16_t &ret_val) {
  int16_t e_ret;
  uint32_t m_tmp = (input_val & kFp32AbsMax);
  uint32_t m_min = kFp16ManHideBit;
  uint32_t m_max = m_min << 1;
  uint16_t len = static_cast<uint16_t>(GetManBitLength(m_tmp));
  if (len > kDim11) {
    e_ret = kFp16ExpBias + kFp16ManLen;
    uint32_t m_trunc = 0;
    uint32_t trunc_mask = 1;
    uint16_t e_tmp = len - kDim11;
    for (int i = 1; i < e_tmp; i++) {
      trunc_mask = (trunc_mask << 1) + 1;
    }
    m_trunc = (m_tmp & trunc_mask) << (kBitShift32 - e_tmp);
    for (int i = 0; i < e_tmp; i++) {
      m_tmp = (m_tmp >> 1);
      e_ret = e_ret + 1;
    }
    bool b_last_bit = ((m_tmp & 1) > 0);
    bool b_trunc_high = false;
    bool b_trunc_left = false;
    if (kRoundToNearest == g_round_mode) { // trunc
      b_trunc_high = ((m_trunc & kFp32SignMask) > 0);
      b_trunc_left = ((m_trunc & kFp32AbsMax) > 0);
    }
    m_tmp = ManRoundToNearest(b_last_bit, b_trunc_high, b_trunc_left, m_tmp);
    while (m_tmp >= m_max || e_ret < 0) {
      m_tmp = m_tmp >> 1;
      e_ret = e_ret + 1;
    }
    if (e_ret >= kFp16MaxExp) {
      e_ret = kFp16MaxExp - 1;
      m_tmp = kFp16MaxMan;
    }
  } else {
    e_ret = kFp16ExpBias;
    m_tmp = m_tmp << (kDim11 - len);
    e_ret = e_ret + (len - 1);
  }
  auto m_ret = static_cast<uint16_t>(m_tmp);
  ret_val = FP16_CONSTRUCTOR(sign, static_cast<uint16_t>(e_ret), m_ret);
}
fp16_t &fp16_t::operator=(const int32_t &i_val) {
  if (i_val == 0) {
    val = 0;
  } else {
    uint32_t ui_val = *(reinterpret_cast<const uint32_t *>(&i_val));
    auto s_ret = static_cast<uint16_t>(ui_val >> kBitShift31);
    if (s_ret) {
      int32_t iValM = -i_val;
      ui_val = *(reinterpret_cast<uint32_t *>(&iValM));
    }
    SetValByUint32Val(ui_val, s_ret, val);
  }
  return *this;
}
fp16_t &fp16_t::operator=(const uint32_t &ui_val) {
  if (ui_val == 0) {
    val = 0;
  } else {
    int16_t e_ret;
    uint32_t m_tmp = ui_val;
    uint32_t m_min = kFp16ManHideBit;
    uint32_t m_max = m_min << 1;
    uint16_t len = static_cast<uint16_t>(GetManBitLength(m_tmp));
    if (len > kDim11) {
      e_ret = kFp16ExpBias + kFp16ManLen;
      uint32_t m_trunc = 0;
      uint32_t trunc_mask = 1;
      uint16_t e_tmp = len - kDim11;
      for (int i = 1; i < e_tmp; i++) {
        trunc_mask = (trunc_mask << 1) + 1;
      }
      m_trunc = (m_tmp & trunc_mask) << static_cast<uint32_t>(kBitShift32 - e_tmp);
      for (uint16_t i = 0; i < e_tmp; i++) {
        m_tmp = (m_tmp >> 1);
        e_ret = e_ret + 1;
      }
      bool b_last_bit = ((m_tmp & 1) > 0);
      bool b_trunc_high = false;
      bool b_trunc_left = false;
      if (g_round_mode == kRoundToNearest) { // trunc
        b_trunc_high = ((m_trunc & kFp32SignMask) > 0);
        b_trunc_left = ((m_trunc & kFp32AbsMax) > 0);
      }
      m_tmp = ManRoundToNearest(b_last_bit, b_trunc_high, b_trunc_left, m_tmp);
      while (m_tmp >= m_max || e_ret < 0) {
        m_tmp = m_tmp >> 1;
        e_ret = e_ret + 1;
      }
      if (e_ret >= kFp16MaxExp) {
        e_ret = kFp16MaxExp - 1;
        m_tmp = kFp16MaxMan;
      }
    } else {
      e_ret = kFp16ExpBias;
      m_tmp = m_tmp << (kDim11 - len);
      e_ret = e_ret + (len - 1);
    }
    auto m_ret = static_cast<uint16_t>(m_tmp);
    val = FP16_CONSTRUCTOR(0u, static_cast<uint16_t>(e_ret), m_ret);
  }
  return *this;
}
fp16_t &fp16_t::operator=(const double &d_val) {
  uint16_t s_ret;
  uint16_t m_ret;
  int16_t e_ret;
  uint64_t e_d;
  uint64_t m_d;
  uint64_t ui64_v = *(reinterpret_cast<const uint64_t *>(&d_val)); // 1:11:52bit sign:exp:man
  uint32_t m_len_delta;
  s_ret = static_cast<uint16_t>((ui64_v & kFp64SignMask) >> kFp64SignIndex); // 8Byte->2Byte
  e_d = (ui64_v & kFp64ExpMask) >> kFp64ManLen; // 11 bit exponent
  m_d = (ui64_v & kFp64ManMask); // 52 bit mantissa
  m_len_delta = kFp64ManLen - kFp16ManLen;
  bool need_round = false;
  // Exponent overflow/NaN converts to signed inf/NaN
  if (e_d >= 0x410u) { // 0x410u: 1040 = 1023+17
    e_ret = kFp16MaxExp - 1;
    m_ret = kFp16MaxMan;
    val = FP16_CONSTRUCTOR(s_ret, static_cast<uint16_t>(e_ret), m_ret);
  } else if (e_d <= 0x3F0u) { // Exponent underflow converts to denormalized half or signed zero
    // 0x3F0u: 1008 = 1023-15
    // Signed zeros, denormalized floats, and floats with small
    // exponents all convert to signed zero half precision.
    e_ret = 0;
    if (e_d >= 0x3E7u) { // 0x3E7u: 999 = 1023-24 Denormal
      // Underflows to a denormalized value
      m_d = (kFp64ManHideBit | m_d);
      uint16_t shift_out = kFp64ManLen;
      uint64_t m_tmp = (static_cast<uint64_t>(m_d)) << (e_d - 0x3E7u);
      need_round = IsRoundOne(m_tmp, shift_out);
      m_ret = static_cast<uint16_t>(m_tmp >> shift_out);
      if (need_round) {
        m_ret++;
      }
    } else if (e_d == 0x3E6u && m_d > 0) {
      m_ret = 1;
    } else {
      m_ret = 0;
    }
  } else { // Regular case with no overflow or underflow
    e_ret = static_cast<int16_t>(e_d - 0x3F0u);
    need_round = IsRoundOne(m_d, m_len_delta);
    m_ret = static_cast<uint16_t>(m_d >> m_len_delta);
    if (need_round) {
      m_ret++;
    }
    if (m_ret & kFp16ManHideBit) {
      e_ret++;
    }
  }
  Fp16Normalize(e_ret, m_ret);
  val = FP16_CONSTRUCTOR(s_ret, static_cast<uint16_t>(e_ret), m_ret);
  return *this;
}
// convert
fp16_t::operator float() const { return Fp16ToFloat(val); }
fp16_t::operator double() const { return Fp16ToDouble(val); }
fp16_t::operator int8_t() const { return Fp16ToInt8(val); }
fp16_t::operator uint8_t() const { return Fp16ToUInt8(val); }
fp16_t::operator int16_t() const { return Fp16ToInt16(val); }
fp16_t::operator uint16_t() const { return Fp16ToUInt16(val); }
fp16_t::operator int32_t() const { return Fp16ToInt32(val); }
fp16_t::operator uint32_t() const { return Fp16ToUInt32(val); }
// Cannot be used; defined only to resolve a compile error
fp16_t::operator int64_t() const { return 0; }
// Cannot be used; defined only to resolve a compile error
fp16_t::operator uint64_t() const { return 0; }
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY int fp16_t::IsInf() {
  if ((val & kFp16AbsMax) == kFp16ExpMask) {
    if (val & kFp16SignMask) {
      return -1;
    } else {
      return 1;
    }
  } else {
    return 0;
  }
}
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY float fp16_t::ToFloat() const { return Fp16ToFloat(val); }
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY double fp16_t::ToDouble() const { return Fp16ToDouble(val); }
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY int8_t fp16_t::ToInt8() const { return Fp16ToInt8(val); }
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY uint8_t fp16_t::ToUInt8() const { return Fp16ToUInt8(val); }
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY int16_t fp16_t::ToInt16() const { return Fp16ToInt16(val); }
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY uint16_t fp16_t::ToUInt16() const { return Fp16ToUInt16(val); }
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY int32_t fp16_t::ToInt32() const { return Fp16ToInt32(val); }
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY uint32_t fp16_t::ToUInt32() const { return Fp16ToUInt32(val); }
} // namespace ge
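
For orientation, here is a minimal usage sketch (not part of the original file) that exercises the assignment, arithmetic, and conversion paths above. It assumes the surrounding build provides common/fp16_t.h and links this translation unit; the expected outputs follow from the round-to-nearest rules documented in the code.

#include "common/fp16_t.h"
#include <cstdio>

int main() {
  ge::fp16_t a;
  ge::fp16_t b;
  a = 1.5f;               // operator=(const float &): 1.5 is exact in half precision
  b = 2.25f;              // also exact: 1.125 * 2^1
  ge::fp16_t c = a + b;   // Fp16Add; 3.75 is representable, no rounding needed
  std::printf("sum    = %f\n", c.ToFloat());  // expect 3.750000
  std::printf("as i32 = %d\n", c.ToInt32());  // expect 4 (round to nearest)
  ge::fp16_t d = c / b;   // Fp16Div goes through a float division internally
  std::printf("quot   = %f\n", d.ToFloat());  // expect about 1.666992 (nearest half)
  return 0;
}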

The Graph Engine (GE) module is a submodule of MindSpore. Implemented in C++, it sits between the front-end module ME and the underlying hardware and serves as the bridge between them. GE takes the graph delivered by ME as input, applies a series of deep graph-optimization passes, and finally outputs a graph that runs efficiently on the underlying hardware. GE performs optimizations tailored to the hardware architecture of the Ascend AI processor so as to fully exploit its compute power. During model training and inference, GE is invoked automatically and is transparent to the user. GE consists mainly of two parts, GE API and GE Core; the detailed architecture diagram is shown below.