
fp16_t.cc

/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "common/fp16_t.h"
#include "external/register/register_types.h"
namespace {
constexpr uint16_t kManBitLength = 11;
}
namespace ge {
/// @ingroup fp16_t global field
/// @brief round mode of the last valid digit
enum TagFp16RoundMode g_round_mode = kRoundToNearest;
void ExtractFp16(const uint16_t &val, uint16_t &s, int16_t &e, uint16_t &m) {
  // 1.Extract
  s = static_cast<uint16_t>(FP16_EXTRAC_SIGN(val));
  e = static_cast<int16_t>(FP16_EXTRAC_EXP(val));
  m = static_cast<uint16_t>(FP16_EXTRAC_MAN(val));
  // Denormal
  if (e == 0) {
    e = 1;
  }
}
/// @ingroup fp16_t static method
/// @param [in] man mantissa whose low-order bits will be truncated
/// @param [in] trunc_len number of low-order bits to be shifted out
/// @brief judge whether to add one to the result while converting fp16_t to another data type
/// @return Return true if one should be added, otherwise false
static bool IsRoundOne(uint64_t man, uint16_t trunc_len) {
  uint64_t mask0 = 0x4;
  uint64_t mask1 = 0x2;
  uint64_t mask2;
  uint16_t shift_out = static_cast<uint16_t>(trunc_len - kDim2);
  mask0 = mask0 << shift_out;
  mask1 = mask1 << shift_out;
  mask2 = mask1 - 1;
  bool last_bit = ((man & mask0) > 0);
  bool trunc_high = false;
  bool trunc_left = false;
  if (g_round_mode == kRoundToNearest) {
    trunc_high = ((man & mask1) > 0);
    trunc_left = ((man & mask2) > 0);
  }
  return (trunc_high && (trunc_left || last_bit));
}
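// Worked examples of the rounding decision above (round-to-nearest-even):
// mask1 selects the highest truncated bit, mask2 the remaining ("sticky")
// truncated bits, and mask0 the lowest kept bit. With trunc_len = 2:
//   man = 0b1011 (11): truncated bits 11 -> above the halfway point, round up (11/4 = 2.75 -> 3).
//   man = 0b0110 (6):  truncated bits 10, sticky 0 -> exact tie; the kept LSB is 1,
//                      so round up to the even value (6/4 = 1.5 -> 2).
//   man = 0b0010 (2):  exact tie with kept LSB 0 -> do not round (2/4 = 0.5 -> 0).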
/// @ingroup fp16_t public method
/// @param [in] exp exponent of fp16_t value
/// @param [in] man mantissa of fp16_t value
/// @brief normalize fp16_t value
/// @return
static void Fp16Normalize(int16_t &exp, uint16_t &man) {
  // set to invalid data
  if (exp >= kFp16MaxExp) {
    exp = static_cast<int16_t>(kFp16MaxExp);
    man = static_cast<uint16_t>(kFp16MaxMan);
  } else if (exp == 0 && man == kFp16ManHideBit) {
    exp++;
    man = 0;
  }
}
/// @ingroup fp16_t math conversion static method
/// @param [in] fp_val uint16_t value of fp16_t object
/// @brief Convert fp16_t to float/fp32
/// @return Return float/fp32 value of fp_val which is the value of fp16_t object
static float Fp16ToFloat(const uint16_t &fp_val) {
  uint16_t hf_sign;
  uint16_t hf_man;
  int16_t hf_exp;
  ExtractFp16(fp_val, hf_sign, hf_exp, hf_man);
  while (hf_man && !(hf_man & kFp16ManHideBit)) {
    hf_man <<= 1;
    hf_exp--;
  }
  uint32_t e_ret, m_ret;
  uint32_t s_ret = hf_sign;
  if (hf_man == 0) {
    e_ret = 0;
    m_ret = 0;
  } else {
    e_ret = hf_exp - kFp16ExpBias + kFp32ExpBias;
    m_ret = hf_man & kFp16ManMask;
    m_ret = m_ret << (kFp32ManLen - kFp16ManLen);
  }
  uint32_t f_val = FP32_CONSTRUCTOR(s_ret, e_ret, m_ret);
  auto p_ret_v = reinterpret_cast<float *>(&f_val);
  return *p_ret_v;
}
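// Example: fp16 0x3C00 extracts to sign 0, biased exponent 15, and mantissa
// 0x400 (hidden bit only). The normalization loop leaves it unchanged, the
// exponent is rebiased to 15 - 15 + 127 = 127, and the mantissa (hidden bit
// stripped) is shifted left by 13 bits, yielding fp32 0x3F800000, i.e. 1.0f.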
/// @ingroup fp16_t math conversion static method
/// @param [in] fp_val uint16_t value of fp16_t object
/// @brief Convert fp16_t to double/fp64
/// @return Return double/fp64 value of fp_val which is the value of fp16_t object
static double Fp16ToDouble(const uint16_t &fp_val) {
  uint16_t hf_sign;
  uint16_t hf_man;
  int16_t hf_exp;
  ExtractFp16(fp_val, hf_sign, hf_exp, hf_man);
  while (hf_man && !(hf_man & kFp16ManHideBit)) {
    hf_man <<= 1;
    hf_exp--;
  }
  uint64_t e_ret;
  uint64_t m_ret;
  uint64_t s_ret = hf_sign;
  if (!hf_man) {
    e_ret = 0;
    m_ret = 0;
  } else {
    e_ret = hf_exp - kFp16ExpBias + kFp64ExpBias;
    m_ret = hf_man & kFp16ManMask;
    m_ret = m_ret << (kFp64ManLen - kFp16ManLen);
  }
  uint64_t f_val = (s_ret << kFp64SignIndex) | (e_ret << kFp64ManLen) | (m_ret);
  auto p_ret_v = reinterpret_cast<double *>(&f_val);
  return *p_ret_v;
}
/// @ingroup fp16_t static method
/// @param [in] s_ret sign of fp16_t value
/// @param [in] long_int_m mantissa as uint64_t value of fp16_t object
/// @param [in] shift_out shift offset
/// @brief calculate uint8 value by sign, mantissa and shift offset
/// @return Return uint8 value of fp16_t object
static uint8_t GetUint8ValByMan(uint8_t s_ret, const uint64_t &long_int_m, const uint16_t &shift_out) {
  bool need_round = IsRoundOne(long_int_m, shift_out + kFp16ManLen);
  auto m_ret = static_cast<uint8_t>((long_int_m >> (kFp16ManLen + shift_out)) & kBitLen8Max);
  need_round = need_round && ((s_ret == 0 && m_ret < kInt8Max) || (s_ret == 1 && m_ret <= kInt8Max));
  if (need_round) {
    m_ret++;
  }
  if (s_ret) {
    m_ret = (~m_ret) + 1;
  }
  if (m_ret == 0) {
    s_ret = 0;
  }
  return static_cast<uint8_t>((s_ret << kBitShift7) | (m_ret));
}
/// @ingroup fp16_t math conversion static method
/// @param [in] fp_val uint16_t value of fp16_t object
/// @brief Convert fp16_t to int8_t
/// @return Return int8_t value of fp_val which is the value of fp16_t object
static int8_t Fp16ToInt8(const uint16_t &fp_val) {
  int8_t ret;
  uint8_t ret_v;
  // 1.get s_ret and shift it to bit0.
  uint8_t s_ret = FP16_EXTRAC_SIGN(fp_val);
  // 2.get hf_e and hf_m
  uint16_t hf_e = FP16_EXTRAC_EXP(fp_val);
  uint16_t hf_m = FP16_EXTRAC_MAN(fp_val);
  if (FP16_IS_DENORM(fp_val)) {  // Denormalized number
    ret_v = 0;
    ret = *(reinterpret_cast<uint8_t *>(&ret_v));
    return ret;
  }
  uint64_t long_int_m = hf_m;
  uint8_t overflow_flag = 0;
  uint16_t shift_out = 0;
  if (FP16_IS_INVALID(fp_val)) {  // Inf or NaN
    overflow_flag = 1;
  } else {
    while (hf_e != kFp16ExpBias) {
      if (hf_e > kFp16ExpBias) {
        hf_e--;
        long_int_m = long_int_m << 1;
        if (s_ret == 1 && long_int_m >= 0x20000u) {  // sign=1, negative number(<0)
          long_int_m = 0x20000u;  // 10 0000 0000 0000 0000  10(fp16_t-man)+7(int8)=17bit
          overflow_flag = 1;
          break;
        } else if (s_ret != 1 && long_int_m >= 0x1FFFFu) {  // sign=0, positive number(>0)
          long_int_m = 0x1FFFFu;  // 01 1111 1111 1111 1111  10(fp16_t-man)+7(int8)
          overflow_flag = 1;
          break;
        }
      } else {
        hf_e++;
        shift_out++;
      }
    }
  }
  if (overflow_flag) {
    ret_v = kInt8Max + s_ret;
  } else {
    // Generate final result
    ret_v = GetUint8ValByMan(s_ret, long_int_m, shift_out);
  }
  ret = *(reinterpret_cast<uint8_t *>(&ret_v));
  return ret;
}
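// Example of the saturation above: fp16 300.0 shifts its mantissa past the
// positive limit 0x1FFFF, so ret_v = kInt8Max + 0 = 127; fp16 -300.0 crosses
// 0x20000, so ret_v = kInt8Max + 1 = 128, which reads back as int8_t -128.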
/// @ingroup fp16_t math conversion static method
/// @param [in] fp_val uint16_t value of fp16_t object
/// @brief Convert fp16_t to uint8_t
/// @return Return uint8_t value of fp_val which is the value of fp16_t object
static uint8_t Fp16ToUInt8(const uint16_t &fp_val) {
  uint8_t m_ret = 0;
  // 1.get s_ret and shift it to bit0.
  uint8_t s_ret = FP16_EXTRAC_SIGN(fp_val);
  // 2.get hf_e and hf_m
  uint16_t hf_e = FP16_EXTRAC_EXP(fp_val);
  uint16_t hf_m = FP16_EXTRAC_MAN(fp_val);
  if (FP16_IS_DENORM(fp_val)) {  // Denormalized number
    return 0;
  }
  if (FP16_IS_INVALID(fp_val)) {  // Inf or NaN
    m_ret = ~0;
  } else {
    uint64_t long_int_m = hf_m;
    uint8_t overflow_flag = 0;
    uint16_t shift_out = 0;
    while (hf_e != kFp16ExpBias) {
      if (hf_e > kFp16ExpBias) {
        hf_e--;
        long_int_m = long_int_m << 1;
        if (long_int_m >= 0x40000Lu) {  // overflow 0100 0000 0000 0000 0000
          long_int_m = 0x3FFFFLu;  // 11 1111 1111 1111 1111  10(fp16_t-man)+8(uint8)=18bit
          overflow_flag = 1;
          m_ret = ~0;
          break;
        }
      } else {
        hf_e++;
        shift_out++;
      }
    }
    if (!overflow_flag) {
      bool need_round = IsRoundOne(long_int_m, shift_out + kFp16ManLen);
      m_ret = static_cast<uint8_t>((long_int_m >> (kFp16ManLen + shift_out)) & kBitLen8Max);
      if (need_round && m_ret != kBitLen8Max) {
        m_ret++;
      }
    }
  }
  if (s_ret == 1) {  // Negative number
    m_ret = 0;
  }
  // m_ret equals the final result
  return m_ret;
}
/// @ingroup fp16_t static method
/// @param [in] s_ret sign of fp16_t value
/// @param [in] long_int_m mantissa as uint64_t value of fp16_t object
/// @param [in] shift_out shift offset
/// @brief calculate uint16 value by sign, mantissa and shift offset
/// @return Return uint16 value of fp16_t object
static uint16_t GetUint16ValByMan(uint16_t s_ret, const uint64_t &long_int_m, const uint16_t &shift_out) {
  bool need_round = IsRoundOne(long_int_m, shift_out + kFp16ManLen);
  auto m_ret = static_cast<uint16_t>((long_int_m >> (kFp16ManLen + shift_out)) & kBitLen16Max);
  if (need_round && m_ret < kInt16Max) {
    m_ret++;
  }
  if (s_ret) {
    m_ret = (~m_ret) + 1;
  }
  if (m_ret == 0) {
    s_ret = 0;
  }
  return static_cast<uint16_t>((s_ret << kBitShift15) | (m_ret));
}
/// @ingroup fp16_t math conversion static method
/// @param [in] fp_val uint16_t value of fp16_t object
/// @brief Convert fp16_t to int16_t
/// @return Return int16_t value of fp_val which is the value of fp16_t object
static int16_t Fp16ToInt16(const uint16_t &fp_val) {
  int16_t ret;
  uint16_t ret_v;
  // 1.get s_ret and shift it to bit0.
  uint16_t s_ret = FP16_EXTRAC_SIGN(fp_val);
  // 2.get hf_e and hf_m
  uint16_t hf_e = FP16_EXTRAC_EXP(fp_val);
  uint16_t hf_m = FP16_EXTRAC_MAN(fp_val);
  if (FP16_IS_DENORM(fp_val)) {  // Denormalized number
    ret_v = 0;
    ret = *(reinterpret_cast<int16_t *>(&ret_v));
    return ret;
  }
  uint64_t long_int_m = hf_m;
  uint8_t overflow_flag = 0;
  uint16_t shift_out = 0;
  if (FP16_IS_INVALID(fp_val)) {  // Inf or NaN
    overflow_flag = 1;
  } else {
    while (hf_e != kFp16ExpBias) {
      if (hf_e > kFp16ExpBias) {
        hf_e--;
        long_int_m = long_int_m << 1;
        if (s_ret == 1 && long_int_m > 0x2000000Lu) {  // sign=1, negative number(<0)
          long_int_m = 0x2000000Lu;  // 10(fp16_t-man)+15(int16)=25bit
          overflow_flag = 1;
          break;
        } else if (s_ret != 1 && long_int_m >= 0x1FFFFFFLu) {  // sign=0, positive number(>0) Overflow
          long_int_m = 0x1FFFFFFLu;  // 10(fp16_t-man)+15(int16)=25bit
          overflow_flag = 1;
          break;
        }
      } else {
        hf_e++;
        shift_out++;
      }
    }
  }
  if (overflow_flag) {
    ret_v = kInt16Max + s_ret;
  } else {
    // Generate final result
    ret_v = GetUint16ValByMan(s_ret, long_int_m, shift_out);
  }
  ret = *(reinterpret_cast<int16_t *>(&ret_v));
  return ret;
}
/// @ingroup fp16_t math conversion static method
/// @param [in] fp_val uint16_t value of fp16_t object
/// @brief Convert fp16_t to uint16_t
/// @return Return uint16_t value of fp_val which is the value of fp16_t object
static uint16_t Fp16ToUInt16(const uint16_t &fp_val) {
  uint16_t m_ret = 0;
  // 1.get s_ret and shift it to bit0.
  uint16_t s_ret = FP16_EXTRAC_SIGN(fp_val);
  // 2.get hf_e and hf_m
  uint16_t hf_e = FP16_EXTRAC_EXP(fp_val);
  uint16_t hf_m = FP16_EXTRAC_MAN(fp_val);
  if (FP16_IS_DENORM(fp_val)) {  // Denormalized number
    return 0;
  }
  if (FP16_IS_INVALID(fp_val)) {  // Inf or NaN
    m_ret = ~0;
  } else {
    uint64_t long_int_m = hf_m;
    uint16_t shift_out = 0;
    while (hf_e != kFp16ExpBias) {
      if (hf_e > kFp16ExpBias) {
        hf_e--;
        long_int_m = long_int_m << 1;
      } else {
        hf_e++;
        shift_out++;
      }
    }
    bool need_round = IsRoundOne(long_int_m, shift_out + kFp16ManLen);
    m_ret = static_cast<uint16_t>((long_int_m >> (kFp16ManLen + shift_out)) & kBitLen16Max);
    if (need_round && m_ret != kBitLen16Max) {
      m_ret++;
    }
  }
  if (s_ret == 1) {  // Negative number
    m_ret = 0;
  }
  // m_ret equals the final result
  return m_ret;
}
/// @ingroup fp16_t math conversion static method
/// @param [in] fp_val uint16_t value of fp16_t object
/// @brief Convert fp16_t to int32_t
/// @return Return int32_t value of fp_val which is the value of fp16_t object
static int32_t Fp16ToInt32(const uint16_t &fp_val) {
  uint32_t ret_v;
  // 1.get s_ret and shift it to bit0.
  uint32_t s_ret = FP16_EXTRAC_SIGN(fp_val);
  // 2.get hf_e and hf_m
  uint16_t hf_e = FP16_EXTRAC_EXP(fp_val);
  uint16_t hf_m = FP16_EXTRAC_MAN(fp_val);
  if (FP16_IS_INVALID(fp_val)) {  // Inf or NaN
    ret_v = kInt32Max + s_ret;
  } else {
    uint64_t long_int_m = hf_m;
    uint16_t shift_out = 0;
    while (hf_e != kFp16ExpBias) {
      if (hf_e > kFp16ExpBias) {
        hf_e--;
        long_int_m = long_int_m << 1;
      } else {
        hf_e++;
        shift_out++;
      }
    }
    bool need_round = IsRoundOne(long_int_m, shift_out + kFp16ManLen);
    auto m_ret = static_cast<uint32_t>((long_int_m >> (kFp16ManLen + shift_out)) & kBitLen32Max);
    if (need_round && m_ret < kInt32Max) {
      m_ret++;
    }
    if (s_ret == 1) {
      m_ret = (~m_ret) + 1;
    }
    if (m_ret == 0) {
      s_ret = 0;
    }
    // Generate final result
    ret_v = (s_ret << kBitShift31) | (m_ret);
  }
  return *(reinterpret_cast<int32_t *>(&ret_v));
}
/// @ingroup fp16_t math conversion static method
/// @param [in] fp_val uint16_t value of fp16_t object
/// @brief Convert fp16_t to uint32_t
/// @return Return uint32_t value of fp_val which is the value of fp16_t object
static uint32_t Fp16ToUInt32(const uint16_t &fp_val) {
  uint32_t m_ret;
  // 1.get s_ret and shift it to bit0.
  uint32_t s_ret = FP16_EXTRAC_SIGN(fp_val);
  // 2.get hf_e and hf_m
  uint16_t hf_e = FP16_EXTRAC_EXP(fp_val);
  uint16_t hf_m = FP16_EXTRAC_MAN(fp_val);
  if (FP16_IS_DENORM(fp_val)) {  // Denormalized number
    return 0u;
  }
  if (FP16_IS_INVALID(fp_val)) {  // Inf or NaN
    m_ret = ~0u;
  } else {
    uint64_t long_int_m = hf_m;
    uint16_t shift_out = 0;
    while (hf_e != kFp16ExpBias) {
      if (hf_e > kFp16ExpBias) {
        hf_e--;
        long_int_m = long_int_m << 1;
      } else {
        hf_e++;
        shift_out++;
      }
    }
    bool need_round = IsRoundOne(long_int_m, shift_out + kFp16ManLen);
    m_ret = static_cast<uint32_t>(long_int_m >> (kFp16ManLen + shift_out)) & kBitLen32Max;
    if (need_round && m_ret != kBitLen32Max) {
      m_ret++;
    }
  }
  if (s_ret == 1) {  // Negative number
    m_ret = 0;
  }
  // m_ret equals the final result
  return m_ret;
}
static uint16_t Fp16AddCalVal(uint16_t &s_ret, int16_t e_ret, uint16_t m_ret, uint32_t m_trunc, uint16_t shift_out) {
  uint16_t m_min = kFp16ManHideBit << shift_out;
  uint16_t m_max = m_min << 1;
  // Denormal
  while (m_ret < m_min && e_ret > 0) {  // m_ret should not be smaller than m_min (the shifted hidden bit)
    m_ret = m_ret << 1;
    m_ret += (kFp32SignMask & m_trunc) >> kFp32SignIndex;
    m_trunc = m_trunc << 1;
    e_ret = e_ret - 1;
  }
  while (m_ret >= m_max) {  // m_ret should stay below m_max (twice the shifted hidden bit)
    m_trunc = m_trunc >> 1;
    m_trunc = m_trunc | (kFp32SignMask * (m_ret & 1));
    m_ret = m_ret >> 1;
    e_ret = e_ret + 1;
  }
  bool b_last_bit = ((m_ret & 1) > 0);
  bool b_trunc_high = false;
  bool b_trunc_left = false;
  b_trunc_high = (kRoundToNearest == g_round_mode) && ((m_trunc & kFp32SignMask) > 0);
  b_trunc_left = (kRoundToNearest == g_round_mode) && ((m_trunc & kFp32AbsMax) > 0);
  m_ret = ManRoundToNearest(b_last_bit, b_trunc_high, b_trunc_left, m_ret, shift_out);
  while (m_ret >= m_max) {
    m_ret = m_ret >> 1;
    e_ret = e_ret + 1;
  }
  if (e_ret == 0 && m_ret <= m_max) {
    m_ret = m_ret >> 1;
  }
  Fp16Normalize(e_ret, m_ret);
  uint16_t ret = FP16_CONSTRUCTOR(s_ret, static_cast<uint16_t>(e_ret), m_ret);
  return ret;
}
/// @ingroup fp16_t math operator
/// @param [in] v_1 left operand value of fp16_t object
/// @param [in] v_2 right operand value of fp16_t object
/// @brief Performing fp16_t addition
/// @return Return fp16_t result of adding this and fp
static uint16_t Fp16Add(uint16_t v_1, uint16_t v_2) {
  uint16_t s_a;
  uint16_t s_b;
  int16_t e_a;
  int16_t e_b;
  uint32_t m_a;
  uint32_t m_b;
  uint16_t m_a_tmp;
  uint16_t m_b_tmp;
  uint16_t shift_out = 0;
  // 1.Extract
  ExtractFp16(v_1, s_a, e_a, m_a_tmp);
  ExtractFp16(v_2, s_b, e_b, m_b_tmp);
  m_a = m_a_tmp;
  m_b = m_b_tmp;
  uint16_t sum;
  uint16_t s_ret;
  if (s_a != s_b) {
    ReverseMan(s_a > 0, m_a);
    ReverseMan(s_b > 0, m_b);
    sum = static_cast<uint16_t>(GetManSum(e_a, m_a, e_b, m_b));
    s_ret = (sum & kFp16SignMask) >> kFp16SignIndex;
    ReverseMan(s_ret > 0, m_a);
    ReverseMan(s_ret > 0, m_b);
  } else {
    sum = static_cast<uint16_t>(GetManSum(e_a, m_a, e_b, m_b));
    s_ret = s_a;
  }
  if (sum == 0) {
    shift_out = 3;  // shift to left 3 bits
    m_a = m_a << shift_out;
    m_b = m_b << shift_out;
  }
  uint32_t m_trunc = 0;
  int16_t e_ret = std::max(e_a, e_b);
  int16_t e_tmp = std::abs(e_a - e_b);
  if (e_a > e_b) {
    m_trunc = (m_b << (kBitShift32 - static_cast<uint16_t>(e_tmp)));
    m_b = RightShift(m_b, e_tmp);
  } else if (e_a < e_b) {
    m_trunc = (m_a << (kBitShift32 - static_cast<uint16_t>(e_tmp)));
    m_a = RightShift(m_a, e_tmp);
  }
  // calculate mantissa
  auto m_ret = static_cast<uint16_t>(m_a + m_b);
  return Fp16AddCalVal(s_ret, e_ret, m_ret, m_trunc, shift_out);
}
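// Example: Fp16Add(0x3C00, 0x3C00) computes 1.0 + 1.0. Both operands extract
// to exponent 15 and mantissa 0x400 (hidden bit), the aligned mantissas sum
// to 0x800, and Fp16AddCalVal renormalizes that to mantissa 0x400 with
// exponent 16, producing 0x4000, i.e. 2.0.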
/// @ingroup fp16_t math operator
/// @param [in] v_1 left operand value of fp16_t object
/// @param [in] v_2 right operand value of fp16_t object
/// @brief Performing fp16_t subtraction
/// @return Return fp16_t result of subtracting fp from this
static uint16_t Fp16Sub(uint16_t v_1, uint16_t v_2) {
  // Reverse
  uint16_t tmp = ((~(v_2)) & kFp16SignMask) | (v_2 & kFp16AbsMax);
  return Fp16Add(v_1, tmp);
}
/// @ingroup fp16_t math operator
/// @param [in] v_1 left operand value of fp16_t object
/// @param [in] v_2 right operand value of fp16_t object
/// @brief Performing fp16_t multiplication
/// @return Return fp16_t result of multiplying this and fp
static uint16_t Fp16Mul(uint16_t v_1, uint16_t v_2) {
  uint16_t s_a, s_b;
  int16_t e_a, e_b;
  uint32_t m_a, m_b;
  uint16_t s_ret;
  uint16_t m_ret;
  int16_t e_ret;
  uint32_t mul_m;
  uint16_t m_a_tmp;
  uint16_t m_b_tmp;
  // 1.Extract
  ExtractFp16(v_1, s_a, e_a, m_a_tmp);
  ExtractFp16(v_2, s_b, e_b, m_b_tmp);
  m_a = m_a_tmp;
  m_b = m_b_tmp;
  e_ret = e_a + e_b - kFp16ExpBias - kDim10;
  mul_m = m_a * m_b;
  s_ret = s_a ^ s_b;
  uint32_t m_min = kFp16ManHideBit;
  uint32_t m_max = m_min << 1;
  uint32_t m_trunc = 0;
  // normalize: mul_m should not be smaller than m_min (the hidden bit)
  while (mul_m < m_min && e_ret > 1) {
    mul_m = mul_m << 1;
    e_ret = e_ret - 1;
  }
  while (mul_m >= m_max || e_ret < 1) {
    m_trunc = m_trunc >> 1;
    m_trunc = m_trunc | (kFp32SignMask * (mul_m & 1));
    mul_m = mul_m >> 1;
    e_ret = e_ret + 1;
  }
  bool b_last_bit = ((mul_m & 1) > 0);
  bool b_trunc_high = false;
  bool b_trunc_left = false;
  b_trunc_high = (kRoundToNearest == g_round_mode) && ((m_trunc & kFp32SignMask) > 0);
  b_trunc_left = (kRoundToNearest == g_round_mode) && ((m_trunc & kFp32AbsMax) > 0);
  mul_m = ManRoundToNearest(b_last_bit, b_trunc_high, b_trunc_left, mul_m);
  while (mul_m >= m_max || e_ret < 0) {
    mul_m = mul_m >> 1;
    e_ret = e_ret + 1;
  }
  if (e_ret == 1 && mul_m < kFp16ManHideBit) {
    e_ret = 0;
  }
  m_ret = static_cast<uint16_t>(mul_m);
  Fp16Normalize(e_ret, m_ret);
  uint16_t ret = FP16_CONSTRUCTOR(s_ret, static_cast<uint16_t>(e_ret), m_ret);
  return ret;
}
/// @ingroup fp16_t math operator division
/// @param [in] v_1 left operand value of fp16_t object
/// @param [in] v_2 right operand value of fp16_t object
/// @brief Performing fp16_t division
/// @return Return fp16_t result of dividing this by fp
static uint16_t Fp16Div(uint16_t v_1, uint16_t v_2) {
  uint16_t ret;
  if (FP16_IS_ZERO(v_2)) {  // result is inf
    // throw "fp16_t division by zero.";
    uint16_t s_a;
    uint16_t s_b;
    uint16_t s_ret;
    s_a = FP16_EXTRAC_SIGN(v_1);
    s_b = FP16_EXTRAC_SIGN(v_2);
    s_ret = s_a ^ s_b;
    ret = FP16_CONSTRUCTOR(s_ret, kFp16MaxExp, 0u);
  } else if (FP16_IS_ZERO(v_1)) {
    ret = 0u;
  } else {
    uint16_t s_a;
    uint16_t s_b;
    int16_t e_a;
    int16_t e_b;
    uint64_t m_a;
    uint64_t m_b;
    float m_div;
    uint16_t m_a_tmp;
    uint16_t m_b_tmp;
    // 1.Extract
    ExtractFp16(v_1, s_a, e_a, m_a_tmp);
    ExtractFp16(v_2, s_b, e_b, m_b_tmp);
    m_a = m_a_tmp;
    m_b = m_b_tmp;
    uint64_t m_tmp;
    if (e_a > e_b) {
      m_tmp = m_a;
      uint16_t tmp;
      tmp = e_a - e_b;
      for (uint16_t i = 0; i < tmp; i++) {
        m_tmp = m_tmp << 1;
      }
      m_a = m_tmp;
    } else if (e_a < e_b) {
      m_tmp = m_b;
      uint16_t tmp = e_b - e_a;
      for (uint16_t i = 0; i < tmp; i++) {
        m_tmp = m_tmp << 1;
      }
      m_b = m_tmp;
    }
    m_div = static_cast<float>(m_a * 1.0f / m_b);
    fp16_t fp_div;
    fp_div = m_div;
    ret = fp_div.val;
    if (s_a != s_b) {
      ret |= kFp16SignMask;
    }
  }
  return ret;
}
// operate
fp16_t fp16_t::operator+(const fp16_t fp) {
  uint16_t ret_val = Fp16Add(val, fp.val);
  fp16_t ret(ret_val);
  return ret;
}
fp16_t fp16_t::operator-(const fp16_t fp) {
  uint16_t ret_val = Fp16Sub(val, fp.val);
  fp16_t ret(ret_val);
  return ret;
}
fp16_t fp16_t::operator*(const fp16_t fp) {
  uint16_t ret_val = Fp16Mul(val, fp.val);
  fp16_t ret(ret_val);
  return ret;
}
fp16_t fp16_t::operator/(const fp16_t fp) {
  uint16_t ret_val = Fp16Div(val, fp.val);
  fp16_t ret(ret_val);
  return ret;
}
fp16_t fp16_t::operator+=(const fp16_t fp) {
  val = Fp16Add(val, fp.val);
  return *this;
}
fp16_t fp16_t::operator-=(const fp16_t fp) {
  val = Fp16Sub(val, fp.val);
  return *this;
}
fp16_t fp16_t::operator*=(const fp16_t fp) {
  val = Fp16Mul(val, fp.val);
  return *this;
}
fp16_t fp16_t::operator/=(const fp16_t fp) {
  val = Fp16Div(val, fp.val);
  return *this;
}
// compare
bool fp16_t::operator==(const fp16_t &fp) const {
  bool result = true;
  if (FP16_IS_ZERO(val) && FP16_IS_ZERO(fp.val)) {
    result = true;
  } else {
    result = ((val & kBitLen16Max) == (fp.val & kBitLen16Max));  // bit compare
  }
  return result;
}
bool fp16_t::operator!=(const fp16_t &fp) const {
  bool result = true;
  if (FP16_IS_ZERO(val) && FP16_IS_ZERO(fp.val)) {
    result = false;
  } else {
    result = ((val & kBitLen16Max) != (fp.val & kBitLen16Max));  // bit compare
  }
  return result;
}
bool fp16_t::operator>(const fp16_t &fp) const {
  uint16_t s_a;
  uint16_t s_b;
  uint16_t e_a;
  uint16_t e_b;
  uint16_t m_a;
  uint16_t m_b;
  bool result = true;
  // 1.Extract
  s_a = FP16_EXTRAC_SIGN(val);
  s_b = FP16_EXTRAC_SIGN(fp.val);
  e_a = FP16_EXTRAC_EXP(val);
  e_b = FP16_EXTRAC_EXP(fp.val);
  m_a = FP16_EXTRAC_MAN(val);
  m_b = FP16_EXTRAC_MAN(fp.val);
  // Compare
  if ((s_a == 0) && (s_b > 0)) {  // + -
    // -0=0
    result = !(FP16_IS_ZERO(val) && FP16_IS_ZERO(fp.val));
  } else if ((s_a == 0) && (s_b == 0)) {  // + +
    if (e_a > e_b) {  // e_a - e_b >= 1; Va always larger than Vb
      result = true;
    } else if (e_a == e_b) {
      result = m_a > m_b;
    } else {
      result = false;
    }
  } else if ((s_a > 0) && (s_b > 0)) {  // - -  opposite to + +
    if (e_a < e_b) {
      result = true;
    } else if (e_a == e_b) {
      result = m_a < m_b;
    } else {
      result = false;
    }
  } else {  // - +
    result = false;
  }
  return result;
}
bool fp16_t::operator>=(const fp16_t &fp) const {
  bool result = true;
  if ((*this) > fp) {
    result = true;
  } else if ((*this) == fp) {
    result = true;
  } else {
    result = false;
  }
  return result;
}
bool fp16_t::operator<(const fp16_t &fp) const {
  bool result = true;
  if ((*this) >= fp) {
    result = false;
  } else {
    result = true;
  }
  return result;
}
bool fp16_t::operator<=(const fp16_t &fp) const {
  bool result = true;
  if ((*this) > fp) {
    result = false;
  } else {
    result = true;
  }
  return result;
}
// evaluation
fp16_t &fp16_t::operator=(const fp16_t &fp) {
  if (&fp == this) {
    return *this;
  }
  val = fp.val;
  return *this;
}
fp16_t &fp16_t::operator=(const float &f_val) {
  uint16_t s_ret;
  uint16_t m_ret;
  int16_t e_ret;
  uint32_t e_f;
  uint32_t m_f;
  const uint32_t ui32_v = *(reinterpret_cast<const uint32_t *>(&f_val));  // 1:8:23bit sign:exp:man
  uint32_t m_len_delta;
  s_ret = static_cast<uint16_t>((ui32_v & kFp32SignMask) >> kFp32SignIndex);  // 4Byte->2Byte
  e_f = (ui32_v & kFp32ExpMask) >> kFp32ManLen;  // 8 bit exponent
  m_f = (ui32_v & kFp32ManMask);  // 23 bit mantissa; don't need to care about denormal
  m_len_delta = kFp32ManLen - kFp16ManLen;
  bool need_round = false;
  // Exponent overflow/NaN converts to signed inf/NaN
  if (e_f > 0x8Fu) {  // 0x8Fu:142=127+15
    e_ret = kFp16MaxExp - 1;
    m_ret = kFp16MaxMan;
  } else if (e_f <= 0x70u) {  // 0x70u:112=127-15 Exponent underflow converts to denormalized half or signed zero
    e_ret = 0;
    if (e_f >= 0x67) {  // 0x67:103=127-24 Denormal
      m_f = (m_f | kFp32ManHideBit);
      uint16_t shift_out = kFp32ManLen;
      uint64_t m_tmp = (static_cast<uint64_t>(m_f)) << (e_f - 0x67);
      need_round = IsRoundOne(m_tmp, shift_out);
      m_ret = static_cast<uint16_t>(m_tmp >> shift_out);
      if (need_round) {
        m_ret++;
      }
    } else if (e_f == 0x66 && m_f > 0) {  // 0x66:102 Denormal 0<f_v<min(Denormal)
      m_ret = 1;
    } else {
      m_ret = 0;
    }
  } else {  // Regular case with no overflow or underflow
    e_ret = static_cast<int16_t>(e_f - 0x70u);
    need_round = IsRoundOne(m_f, static_cast<uint16_t>(m_len_delta));
    m_ret = static_cast<uint16_t>(m_f >> m_len_delta);
    if (need_round) {
      m_ret++;
    }
    if (m_ret & kFp16ManHideBit) {
      e_ret++;
    }
  }
  Fp16Normalize(e_ret, m_ret);
  val = FP16_CONSTRUCTOR(s_ret, static_cast<uint16_t>(e_ret), m_ret);
  return *this;
}
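// Example: assigning 1.5f (fp32 bits 0x3FC00000: e_f = 127, m_f = 0x400000)
// takes the regular branch: e_ret = 127 - 0x70 = 15, m_ret = 0x400000 >> 13
// = 0x200 with no rounding, so val = FP16_CONSTRUCTOR(0, 15, 0x200) = 0x3E00.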
fp16_t &fp16_t::operator=(const int8_t &i_val) {
  uint16_t s_ret;
  uint16_t e_ret;
  uint16_t m_ret;
  s_ret = static_cast<uint16_t>(((static_cast<uint8_t>(i_val)) & 0x80) >> kDim7);
  m_ret = static_cast<uint16_t>(((static_cast<uint8_t>(i_val)) & kInt8Max));
  if (m_ret == 0) {
    e_ret = 0;
  } else {
    if (s_ret) {  // negative number(<0)
      m_ret = static_cast<uint16_t>(std::abs(i_val));  // complement
    }
    e_ret = kFp16ManLen;
    while ((m_ret & kFp16ManHideBit) == 0) {
      m_ret = m_ret << 1;
      e_ret = e_ret - 1;
    }
    e_ret = e_ret + kFp16ExpBias;
  }
  val = FP16_CONSTRUCTOR(s_ret, e_ret, m_ret);
  return *this;
}
fp16_t &fp16_t::operator=(const uint8_t &ui_val) {
  uint16_t s_ret;
  uint16_t e_ret;
  uint16_t m_ret;
  s_ret = 0;
  e_ret = 0;
  m_ret = ui_val;
  if (m_ret) {
    e_ret = kFp16ManLen;
    while ((m_ret & kFp16ManHideBit) == 0) {
      m_ret = m_ret << 1;
      e_ret = e_ret - 1;
    }
    e_ret = e_ret + kFp16ExpBias;
  }
  val = FP16_CONSTRUCTOR(s_ret, e_ret, m_ret);
  return *this;
}
static void SetValByUint16Val(const uint16_t &input_val, const uint16_t &sign, uint16_t &ret_val) {
  uint32_t m_tmp = (input_val & kFp32AbsMax);
  uint16_t m_min = kFp16ManHideBit;
  uint16_t m_max = m_min << 1;
  uint16_t len = static_cast<uint16_t>(GetManBitLength(m_tmp));
  if (m_tmp) {
    int16_t e_ret;
    if (len > kDim11) {
      e_ret = kFp16ExpBias + kFp16ManLen;
      uint16_t e_tmp = len - kDim11;
      uint32_t trunc_mask = 1;
      for (uint16_t i = 1; i < e_tmp; i++) {
        trunc_mask = (trunc_mask << 1) + 1;
      }
      uint32_t m_trunc = (m_tmp & trunc_mask) << (kBitShift32 - e_tmp);
      for (uint16_t i = 0; i < e_tmp; i++) {
        m_tmp = (m_tmp >> 1);
        e_ret = e_ret + 1;
      }
      bool b_last_bit = ((m_tmp & 1) > 0);
      bool b_trunc_high = false;
      bool b_trunc_left = false;
      if (kRoundToNearest == g_round_mode) {  // trunc
        b_trunc_high = ((m_trunc & kFp32SignMask) > 0);
        b_trunc_left = ((m_trunc & kFp32AbsMax) > 0);
      }
      m_tmp = ManRoundToNearest(b_last_bit, b_trunc_high, b_trunc_left, m_tmp);
      while (m_tmp >= m_max || e_ret < 0) {
        m_tmp = m_tmp >> 1;
        e_ret = e_ret + 1;
      }
    } else {
      e_ret = kFp16ExpBias;
      m_tmp = m_tmp << (kManBitLength - len);
      e_ret = e_ret + (len - 1);
    }
    auto m_ret = static_cast<uint16_t>(m_tmp);
    ret_val = FP16_CONSTRUCTOR(sign, static_cast<uint16_t>(e_ret), m_ret);
  }
}
fp16_t &fp16_t::operator=(const int16_t &i_val) {
  if (i_val == 0) {
    val = 0;
  } else {
    uint16_t ui_val = *(reinterpret_cast<const uint16_t *>(&i_val));
    auto s_ret = static_cast<uint16_t>(ui_val >> kBitShift15);
    if (s_ret) {
      int16_t iValM = -i_val;
      ui_val = *(reinterpret_cast<uint16_t *>(&iValM));
    }
    SetValByUint16Val(ui_val, s_ret, val);
  }
  return *this;
}
fp16_t &fp16_t::operator=(const uint16_t &ui_val) {
  if (ui_val == 0) {
    val = 0;
  } else {
    int16_t e_ret;
    uint16_t m_ret = ui_val;
    uint16_t m_min = kFp16ManHideBit;
    uint16_t m_max = m_min << 1;
    uint16_t len = static_cast<uint16_t>(GetManBitLength(m_ret));
    if (len > kManBitLength) {
      e_ret = kFp16ExpBias + kFp16ManLen;
      uint32_t m_trunc;
      uint32_t trunc_mask = 1;
      uint16_t e_tmp = len - kManBitLength;
      for (uint16_t i = 1; i < e_tmp; i++) {
        trunc_mask = (trunc_mask << 1) + 1;
      }
      m_trunc = (m_ret & trunc_mask) << (kBitShift32 - e_tmp);
      for (uint16_t i = 0; i < e_tmp; i++) {
        m_ret = (m_ret >> 1);
        e_ret = e_ret + 1;
      }
      bool b_last_bit = ((m_ret & 1) > 0);
      bool b_trunc_high = false;
      bool b_trunc_left = false;
      if (kRoundToNearest == g_round_mode) {  // trunc
        b_trunc_high = ((m_trunc & kFp32SignMask) > 0);
        b_trunc_left = ((m_trunc & kFp32AbsMax) > 0);
      }
      m_ret = ManRoundToNearest(b_last_bit, b_trunc_high, b_trunc_left, m_ret);
      while (m_ret >= m_max || e_ret < 0) {
        m_ret = m_ret >> 1;
        e_ret = e_ret + 1;
      }
      if (e_ret >= kFp16MaxExp) {  // clamp exponent overflow to the largest normal value (as in SetValByUint32Val)
        e_ret = kFp16MaxExp - 1;
        m_ret = kFp16MaxMan;
      }
    } else {
      e_ret = kFp16ExpBias;
      m_ret = m_ret << (kDim11 - len);
      e_ret = e_ret + (len - 1);
    }
    val = FP16_CONSTRUCTOR(0u, static_cast<uint16_t>(e_ret), m_ret);
  }
  return *this;
}
static void SetValByUint32Val(const uint32_t &input_val, const uint16_t &sign, uint16_t &ret_val) {
  int16_t e_ret;
  uint32_t m_tmp = (input_val & kFp32AbsMax);
  uint32_t m_min = kFp16ManHideBit;
  uint32_t m_max = m_min << 1;
  uint16_t len = static_cast<uint16_t>(GetManBitLength(m_tmp));
  if (len > kDim11) {
    e_ret = kFp16ExpBias + kFp16ManLen;
    uint32_t m_trunc = 0;
    uint32_t trunc_mask = 1;
    uint16_t e_tmp = len - kDim11;
    for (uint16_t i = 1; i < e_tmp; i++) {
      trunc_mask = (trunc_mask << 1) + 1;
    }
    m_trunc = (m_tmp & trunc_mask) << (kBitShift32 - e_tmp);
    for (uint16_t i = 0; i < e_tmp; i++) {
      m_tmp = (m_tmp >> 1);
      e_ret = e_ret + 1;
    }
    bool b_last_bit = ((m_tmp & 1) > 0);
    bool b_trunc_high = false;
    bool b_trunc_left = false;
    if (kRoundToNearest == g_round_mode) {  // trunc
      b_trunc_high = ((m_trunc & kFp32SignMask) > 0);
      b_trunc_left = ((m_trunc & kFp32AbsMax) > 0);
    }
    m_tmp = ManRoundToNearest(b_last_bit, b_trunc_high, b_trunc_left, m_tmp);
    while (m_tmp >= m_max || e_ret < 0) {
      m_tmp = m_tmp >> 1;
      e_ret = e_ret + 1;
    }
    if (e_ret >= kFp16MaxExp) {
      e_ret = kFp16MaxExp - 1;
      m_tmp = kFp16MaxMan;
    }
  } else {
    e_ret = kFp16ExpBias;
    m_tmp = m_tmp << (kDim11 - len);
    e_ret = e_ret + (len - 1);
  }
  auto m_ret = static_cast<uint16_t>(m_tmp);
  ret_val = FP16_CONSTRUCTOR(sign, static_cast<uint16_t>(e_ret), m_ret);
}
fp16_t &fp16_t::operator=(const int32_t &i_val) {
  if (i_val == 0) {
    val = 0;
  } else {
    uint32_t ui_val = *(reinterpret_cast<const uint32_t *>(&i_val));
    auto s_ret = static_cast<uint16_t>(ui_val >> kBitShift31);
    if (s_ret) {
      int32_t iValM = -i_val;
      ui_val = *(reinterpret_cast<uint32_t *>(&iValM));
    }
    SetValByUint32Val(ui_val, s_ret, val);
  }
  return *this;
}
fp16_t &fp16_t::operator=(const uint32_t &ui_val) {
  if (ui_val == 0) {
    val = 0;
  } else {
    int16_t e_ret;
    uint32_t m_tmp = ui_val;
    uint32_t m_min = kFp16ManHideBit;
    uint32_t m_max = m_min << 1;
    uint16_t len = static_cast<uint16_t>(GetManBitLength(m_tmp));
    if (len > kDim11) {
      e_ret = kFp16ExpBias + kFp16ManLen;
      uint32_t m_trunc = 0;
      uint32_t trunc_mask = 1;
      uint16_t e_tmp = len - kDim11;
      for (uint16_t i = 1; i < e_tmp; i++) {
        trunc_mask = (trunc_mask << 1) + 1;
      }
      m_trunc = (m_tmp & trunc_mask) << static_cast<uint32_t>(kBitShift32 - e_tmp);
      for (uint16_t i = 0; i < e_tmp; i++) {
        m_tmp = (m_tmp >> 1);
        e_ret = e_ret + 1;
      }
      bool b_last_bit = ((m_tmp & 1) > 0);
      bool b_trunc_high = false;
      bool b_trunc_left = false;
      if (g_round_mode == kRoundToNearest) {  // trunc
        b_trunc_high = ((m_trunc & kFp32SignMask) > 0);
        b_trunc_left = ((m_trunc & kFp32AbsMax) > 0);
      }
      m_tmp = ManRoundToNearest(b_last_bit, b_trunc_high, b_trunc_left, m_tmp);
      while (m_tmp >= m_max || e_ret < 0) {
        m_tmp = m_tmp >> 1;
        e_ret = e_ret + 1;
      }
      if (e_ret >= kFp16MaxExp) {
        e_ret = kFp16MaxExp - 1;
        m_tmp = kFp16MaxMan;
      }
    } else {
      e_ret = kFp16ExpBias;
      m_tmp = m_tmp << (kDim11 - len);
      e_ret = e_ret + (len - 1);
    }
    auto m_ret = static_cast<uint16_t>(m_tmp);
    val = FP16_CONSTRUCTOR(0u, static_cast<uint16_t>(e_ret), m_ret);
  }
  return *this;
}
fp16_t &fp16_t::operator=(const double &d_val) {
  uint16_t s_ret;
  uint16_t m_ret;
  int16_t e_ret;
  uint64_t e_d;
  uint64_t m_d;
  uint64_t ui64_v = *(reinterpret_cast<const uint64_t *>(&d_val));  // 1:11:52bit sign:exp:man
  uint32_t m_len_delta;
  s_ret = static_cast<uint16_t>((ui64_v & kFp64SignMask) >> kFp64SignIndex);  // 8Byte->2Byte
  e_d = (ui64_v & kFp64ExpMask) >> kFp64ManLen;  // 11 bit exponent
  m_d = (ui64_v & kFp64ManMask);  // 52 bit mantissa
  m_len_delta = kFp64ManLen - kFp16ManLen;
  bool need_round = false;
  // Exponent overflow/NaN converts to signed inf/NaN
  if (e_d >= 0x410u) {  // 0x410:1040=1023+17
    e_ret = kFp16MaxExp - 1;
    m_ret = kFp16MaxMan;
    val = FP16_CONSTRUCTOR(s_ret, static_cast<uint16_t>(e_ret), m_ret);
  } else if (e_d <= 0x3F0u) {  // Exponent underflow converts to denormalized half or signed zero
    // 0x3F0:1008=1023-15
    // Signed zeros, denormalized floats, and floats with small
    // exponents all convert to signed zero half precision.
    e_ret = 0;
    if (e_d >= 0x3E7u) {  // 0x3E7u:999=1023-24 Denormal
      // Underflows to a denormalized value
      m_d = (kFp64ManHideBit | m_d);
      uint16_t shift_out = kFp64ManLen;
      uint64_t m_tmp = (static_cast<uint64_t>(m_d)) << (e_d - 0x3E7u);
      need_round = IsRoundOne(m_tmp, shift_out);
      m_ret = static_cast<uint16_t>(m_tmp >> shift_out);
      if (need_round) {
        m_ret++;
      }
    } else if (e_d == 0x3E6u && m_d > 0) {
      m_ret = 1;
    } else {
      m_ret = 0;
    }
  } else {  // Regular case with no overflow or underflow
    e_ret = static_cast<int16_t>(e_d - 0x3F0u);
    need_round = IsRoundOne(m_d, m_len_delta);
    m_ret = static_cast<uint16_t>(m_d >> m_len_delta);
    if (need_round) {
      m_ret++;
    }
    if (m_ret & kFp16ManHideBit) {
      e_ret++;
    }
  }
  Fp16Normalize(e_ret, m_ret);
  val = FP16_CONSTRUCTOR(s_ret, static_cast<uint16_t>(e_ret), m_ret);
  return *this;
}
// convert
fp16_t::operator float() const {
  return Fp16ToFloat(val);
}
fp16_t::operator double() const {
  return Fp16ToDouble(val);
}
fp16_t::operator int8_t() const {
  return Fp16ToInt8(val);
}
fp16_t::operator uint8_t() const {
  return Fp16ToUInt8(val);
}
fp16_t::operator int16_t() const {
  return Fp16ToInt16(val);
}
fp16_t::operator uint16_t() const {
  return Fp16ToUInt16(val);
}
fp16_t::operator int32_t() const {
  return Fp16ToInt32(val);
}
fp16_t::operator uint32_t() const {
  return Fp16ToUInt32(val);
}
// Not intended for use; defined only to satisfy the compiler
fp16_t::operator int64_t() const {
  return 0;
}
// Not intended for use; defined only to satisfy the compiler
fp16_t::operator uint64_t() const {
  return 0;
}
int fp16_t::IsInf() {
  if ((val & kFp16AbsMax) == kFp16ExpMask) {
    if (val & kFp16SignMask) {
      return -1;
    } else {
      return 1;
    }
  } else {
    return 0;
  }
}
float fp16_t::ToFloat() const {
  return Fp16ToFloat(val);
}
double fp16_t::ToDouble() const {
  return Fp16ToDouble(val);
}
int8_t fp16_t::ToInt8() const {
  return Fp16ToInt8(val);
}
uint8_t fp16_t::ToUInt8() const {
  return Fp16ToUInt8(val);
}
int16_t fp16_t::ToInt16() const {
  return Fp16ToInt16(val);
}
uint16_t fp16_t::ToUInt16() const {
  return Fp16ToUInt16(val);
}
int32_t fp16_t::ToInt32() const {
  return Fp16ToInt32(val);
}
uint32_t fp16_t::ToUInt32() const {
  return Fp16ToUInt32(val);
}
}  // namespace ge
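
A minimal usage sketch, not part of the file above; it assumes only the fp16_t interface actually exercised there (default construction, the public member val, operator= from float, operator+, and ToFloat()):

#include <cstdio>
#include "common/fp16_t.h"

int main() {
  ge::fp16_t a;
  ge::fp16_t b;
  a = 1.5f;   // operator=(const float &): rounds fp32 to fp16, val == 0x3E00
  b = 0.25f;  // val == 0x3400
  ge::fp16_t sum = a + b;  // Fp16Add under the hood; 1.75 is exactly representable, so val == 0x3F00
  std::printf("0x%04x -> %f\n", sum.val, sum.ToFloat());
  return 0;
}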

The Graph Engine (GE) module is a submodule of MindSpore, implemented in C++. It sits between the front-end module (ME) and the underlying hardware and serves as the bridge between them. GE takes the graph delivered by ME as input, performs a series of deep graph-optimization operations, and finally outputs a graph that can run efficiently on the underlying hardware. GE applies optimizations tailored to the hardware architecture of the Ascend AI processor to fully exploit its compute power. During model training and inference, GE is invoked automatically and is transparent to the user. GE mainly consists of two parts, GE API and GE Core, as shown in the detailed architecture diagram below.
[Architecture diagram: GE API and GE Core]