You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

strsm_kernel_RT_8x8_msa.c 54 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118
  1. /*******************************************************************************
  2. Copyright (c) 2016, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #include "common.h"
  28. #include "macros_msa.h"
  29. static void ssolve_8x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  30. {
  31. v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
  32. v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
  33. v4f32 src_b, src_b0, src_b8, src_b9, src_b16, src_b17, src_b18, src_b24;
  34. v4f32 src_b25, src_b26, src_b27, src_b32, src_b33, src_b34, src_b35;
  35. v4f32 src_b36, src_b40, src_b41, src_b42, src_b43, src_b44, src_b45;
  36. v4f32 src_b48, src_b49, src_b50, src_b51, src_b52, src_b53, src_b54;
  37. v4f32 src_b56, src_b57, src_b58, src_b59, src_b60, src_b61, src_b62, src_b63;
  38. FLOAT *c_nxt1line = c + ldc;
  39. FLOAT *c_nxt2line = c + 2 * ldc;
  40. FLOAT *c_nxt3line = c + 3 * ldc;
  41. FLOAT *c_nxt4line = c + 4 * ldc;
  42. FLOAT *c_nxt5line = c + 5 * ldc;
  43. FLOAT *c_nxt6line = c + 6 * ldc;
  44. FLOAT *c_nxt7line = c + 7 * ldc;
  45. if (bk > 0)
  46. {
  47. BLASLONG k;
  48. FLOAT *aa = a, *bb = b;
  49. v4f32 src_a0, src_a1, src_b1, src_b2, src_b3;
  50. v4f32 res0, res1, res2, res3, res4, res5, res6, res7;
  51. v4f32 res8, res9, res10, res11, res12, res13, res14, res15;
  52. LD_SP2(aa, 4, src_a0, src_a1);
  53. src_b = LD_SP(bb + 0);
  54. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  55. res0 = src_a0 * src_b0;
  56. res1 = src_a1 * src_b0;
  57. res2 = src_a0 * src_b1;
  58. res3 = src_a1 * src_b1;
  59. res4 = src_a0 * src_b2;
  60. res5 = src_a1 * src_b2;
  61. res6 = src_a0 * src_b3;
  62. res7 = src_a1 * src_b3;
  63. src_b = LD_SP(bb + 4);
  64. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  65. res8 = src_a0 * src_b0;
  66. res9 = src_a1 * src_b0;
  67. res10 = src_a0 * src_b1;
  68. res11 = src_a1 * src_b1;
  69. res12 = src_a0 * src_b2;
  70. res13 = src_a1 * src_b2;
  71. res14 = src_a0 * src_b3;
  72. res15 = src_a1 * src_b3;
  73. for (k = (bk - 1); k--;)
  74. {
  75. aa += 8;
  76. bb += 8;
  77. LD_SP2(aa, 4, src_a0, src_a1);
  78. src_b = LD_SP(bb + 0);
  79. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  80. res0 += src_a0 * src_b0;
  81. res1 += src_a1 * src_b0;
  82. res2 += src_a0 * src_b1;
  83. res3 += src_a1 * src_b1;
  84. res4 += src_a0 * src_b2;
  85. res5 += src_a1 * src_b2;
  86. res6 += src_a0 * src_b3;
  87. res7 += src_a1 * src_b3;
  88. src_b = LD_SP(bb + 4);
  89. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  90. res8 += src_a0 * src_b0;
  91. res9 += src_a1 * src_b0;
  92. res10 += src_a0 * src_b1;
  93. res11 += src_a1 * src_b1;
  94. res12 += src_a0 * src_b2;
  95. res13 += src_a1 * src_b2;
  96. res14 += src_a0 * src_b3;
  97. res15 += src_a1 * src_b3;
  98. }
  99. LD_SP2(c, 4, src_c0, src_c1);
  100. LD_SP2(c_nxt1line, 4, src_c2, src_c3);
  101. LD_SP2(c_nxt2line, 4, src_c4, src_c5);
  102. LD_SP2(c_nxt3line, 4, src_c6, src_c7);
  103. LD_SP2(c_nxt4line, 4, src_c8, src_c9);
  104. LD_SP2(c_nxt5line, 4, src_c10, src_c11);
  105. LD_SP2(c_nxt6line, 4, src_c12, src_c13);
  106. LD_SP2(c_nxt7line, 4, src_c14, src_c15);
  107. src_c0 -= res0;
  108. src_c1 -= res1;
  109. src_c2 -= res2;
  110. src_c3 -= res3;
  111. src_c4 -= res4;
  112. src_c5 -= res5;
  113. src_c6 -= res6;
  114. src_c7 -= res7;
  115. src_c8 -= res8;
  116. src_c9 -= res9;
  117. src_c10 -= res10;
  118. src_c11 -= res11;
  119. src_c12 -= res12;
  120. src_c13 -= res13;
  121. src_c14 -= res14;
  122. src_c15 -= res15;
  123. }
  124. else
  125. {
  126. LD_SP2(c, 4, src_c0, src_c1);
  127. LD_SP2(c_nxt1line, 4, src_c2, src_c3);
  128. LD_SP2(c_nxt2line, 4, src_c4, src_c5);
  129. LD_SP2(c_nxt3line, 4, src_c6, src_c7);
  130. LD_SP2(c_nxt4line, 4, src_c8, src_c9);
  131. LD_SP2(c_nxt5line, 4, src_c10, src_c11);
  132. LD_SP2(c_nxt6line, 4, src_c12, src_c13);
  133. LD_SP2(c_nxt7line, 4, src_c14, src_c15);
  134. }
  135. b -= 64;
  136. src_b = LD_SP(b + 60);
  137. SPLATI_W4_SP(src_b, src_b60, src_b61, src_b62, src_b63);
  138. src_b = LD_SP(b + 56);
  139. SPLATI_W4_SP(src_b, src_b56, src_b57, src_b58, src_b59);
  140. src_c15 *= src_b63;
  141. src_c14 *= src_b63;
  142. src_c13 -= src_c15 * src_b62;
  143. src_c12 -= src_c14 * src_b62;
  144. src_c11 -= src_c15 * src_b61;
  145. src_c10 -= src_c14 * src_b61;
  146. src_c9 -= src_c15 * src_b60;
  147. src_c8 -= src_c14 * src_b60;
  148. src_c7 -= src_c15 * src_b59;
  149. src_c6 -= src_c14 * src_b59;
  150. src_c5 -= src_c15 * src_b58;
  151. src_c4 -= src_c14 * src_b58;
  152. src_c3 -= src_c15 * src_b57;
  153. src_c2 -= src_c14 * src_b57;
  154. src_c1 -= src_c15 * src_b56;
  155. src_c0 -= src_c14 * src_b56;
  156. src_b = LD_SP(b + 48);
  157. SPLATI_W4_SP(src_b, src_b48, src_b49, src_b50, src_b51);
  158. src_b52 = LD_SP(b + 52);
  159. src_b54 = (v4f32) __msa_splati_w((v4i32) src_b52, 2);
  160. src_b53 = (v4f32) __msa_splati_w((v4i32) src_b52, 1);
  161. src_b52 = (v4f32) __msa_splati_w((v4i32) src_b52, 0);
  162. src_c12 *= src_b54;
  163. src_c13 *= src_b54;
  164. src_c10 -= src_c12 * src_b53;
  165. src_c11 -= src_c13 * src_b53;
  166. src_c8 -= src_c12 * src_b52;
  167. src_c9 -= src_c13 * src_b52;
  168. src_c6 -= src_c12 * src_b51;
  169. src_c7 -= src_c13 * src_b51;
  170. src_c4 -= src_c12 * src_b50;
  171. src_c5 -= src_c13 * src_b50;
  172. src_c2 -= src_c12 * src_b49;
  173. src_c3 -= src_c13 * src_b49;
  174. src_c0 -= src_c12 * src_b48;
  175. src_c1 -= src_c13 * src_b48;
  176. ST_SP4(src_c12, src_c13, src_c14, src_c15, a - 16, 4);
  177. ST_SP2(src_c12, src_c13, c_nxt6line, 4);
  178. ST_SP2(src_c14, src_c15, c_nxt7line, 4);
  179. src_b = LD_SP(b + 40);
  180. SPLATI_W4_SP(src_b, src_b40, src_b41, src_b42, src_b43);
  181. src_b44 = LD_SP(b + 44);
  182. src_b45 = (v4f32) __msa_splati_w((v4i32) src_b44, 1);
  183. src_b44 = (v4f32) __msa_splati_w((v4i32) src_b44, 0);
  184. src_c10 *= src_b45;
  185. src_c11 *= src_b45;
  186. src_c8 -= src_c10 * src_b44;
  187. src_c9 -= src_c11 * src_b44;
  188. src_c6 -= src_c10 * src_b43;
  189. src_c7 -= src_c11 * src_b43;
  190. src_c4 -= src_c10 * src_b42;
  191. src_c5 -= src_c11 * src_b42;
  192. src_c2 -= src_c10 * src_b41;
  193. src_c3 -= src_c11 * src_b41;
  194. src_c0 -= src_c10 * src_b40;
  195. src_c1 -= src_c11 * src_b40;
  196. src_b = LD_SP(b + 32);
  197. SPLATI_W4_SP(src_b, src_b32, src_b33, src_b34, src_b35);
  198. src_b36 = __msa_cast_to_vector_float(*(b + 36));
  199. src_b36 = (v4f32) __msa_splati_w((v4i32) src_b36, 0);
  200. src_c8 *= src_b36;
  201. src_c9 *= src_b36;
  202. src_c6 -= src_c8 * src_b35;
  203. src_c7 -= src_c9 * src_b35;
  204. src_c4 -= src_c8 * src_b34;
  205. src_c5 -= src_c9 * src_b34;
  206. src_c2 -= src_c8 * src_b33;
  207. src_c3 -= src_c9 * src_b33;
  208. src_c0 -= src_c8 * src_b32;
  209. src_c1 -= src_c9 * src_b32;
  210. ST_SP4(src_c8, src_c9, src_c10, src_c11, a - 32, 4);
  211. ST_SP2(src_c8, src_c9, c_nxt4line, 4);
  212. ST_SP2(src_c10, src_c11, c_nxt5line, 4);
  213. src_b = LD_SP(b + 24);
  214. SPLATI_W4_SP(src_b, src_b24, src_b25, src_b26, src_b27);
  215. src_c6 *= src_b27;
  216. src_c7 *= src_b27;
  217. src_c4 -= src_c6 * src_b26;
  218. src_c5 -= src_c7 * src_b26;
  219. src_c2 -= src_c6 * src_b25;
  220. src_c3 -= src_c7 * src_b25;
  221. src_c0 -= src_c6 * src_b24;
  222. src_c1 -= src_c7 * src_b24;
  223. src_b16 = LD_SP(b + 16);
  224. src_b18 = (v4f32) __msa_splati_w((v4i32) src_b16, 2);
  225. src_b17 = (v4f32) __msa_splati_w((v4i32) src_b16, 1);
  226. src_b16 = (v4f32) __msa_splati_w((v4i32) src_b16, 0);
  227. src_c4 *= src_b18;
  228. src_c5 *= src_b18;
  229. src_c2 -= src_c4 * src_b17;
  230. src_c3 -= src_c5 * src_b17;
  231. src_c0 -= src_c4 * src_b16;
  232. src_c1 -= src_c5 * src_b16;
  233. ST_SP4(src_c4, src_c5, src_c6, src_c7, a - 48, 4);
  234. ST_SP2(src_c4, src_c5, c_nxt2line, 4);
  235. ST_SP2(src_c6, src_c7, c_nxt3line, 4);
  236. src_b9 = __msa_cast_to_vector_float(*(b + 9));
  237. src_b9 = (v4f32) __msa_splati_w((v4i32) src_b9, 0);
  238. src_b8 = __msa_cast_to_vector_float(*(b + 8));
  239. src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0);
  240. src_b0 = __msa_cast_to_vector_float(*(b + 0));
  241. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  242. src_c2 *= src_b9;
  243. src_c3 *= src_b9;
  244. src_c0 -= src_c2 * src_b8;
  245. src_c1 -= src_c3 * src_b8;
  246. src_c0 *= src_b0;
  247. src_c1 *= src_b0;
  248. ST_SP4(src_c0, src_c1, src_c2, src_c3, a - 64, 4);
  249. ST_SP2(src_c0, src_c1, c, 4);
  250. ST_SP2(src_c2, src_c3, c_nxt1line, 4);
  251. }
  252. static void ssolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  253. {
  254. v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
  255. v4f32 src_b, src_b0, src_b4, src_b5, src_b8, src_b9, src_b10, src_b12;
  256. v4f32 src_b13, src_b14, src_b15;
  257. FLOAT *c_nxt1line = c + ldc;
  258. FLOAT *c_nxt2line = c + 2 * ldc;
  259. FLOAT *c_nxt3line = c + 3 * ldc;
  260. if (bk > 0)
  261. {
  262. BLASLONG k;
  263. FLOAT *aa = a, *bb = b;
  264. v4f32 src_a0, src_a1, src_b1, src_b2, src_b3;
  265. v4f32 res0, res1, res2, res3, res4, res5, res6, res7;
  266. LD_SP2(aa, 4, src_a0, src_a1);
  267. src_b = LD_SP(bb + 0);
  268. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  269. res0 = src_a0 * src_b0;
  270. res1 = src_a1 * src_b0;
  271. res2 = src_a0 * src_b1;
  272. res3 = src_a1 * src_b1;
  273. res4 = src_a0 * src_b2;
  274. res5 = src_a1 * src_b2;
  275. res6 = src_a0 * src_b3;
  276. res7 = src_a1 * src_b3;
  277. for (k = (bk - 1) / 2; k--;)
  278. {
  279. aa += 8;
  280. bb += 4;
  281. LD_SP2(aa, 4, src_a0, src_a1);
  282. src_b = LD_SP(bb + 0);
  283. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  284. res0 += src_a0 * src_b0;
  285. res1 += src_a1 * src_b0;
  286. res2 += src_a0 * src_b1;
  287. res3 += src_a1 * src_b1;
  288. res4 += src_a0 * src_b2;
  289. res5 += src_a1 * src_b2;
  290. res6 += src_a0 * src_b3;
  291. res7 += src_a1 * src_b3;
  292. aa += 8;
  293. bb += 4;
  294. LD_SP2(aa, 4, src_a0, src_a1);
  295. src_b = LD_SP(bb + 0);
  296. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  297. res0 += src_a0 * src_b0;
  298. res1 += src_a1 * src_b0;
  299. res2 += src_a0 * src_b1;
  300. res3 += src_a1 * src_b1;
  301. res4 += src_a0 * src_b2;
  302. res5 += src_a1 * src_b2;
  303. res6 += src_a0 * src_b3;
  304. res7 += src_a1 * src_b3;
  305. }
  306. if ((bk - 1) & 1)
  307. {
  308. aa += 8;
  309. bb += 4;
  310. LD_SP2(aa, 4, src_a0, src_a1);
  311. src_b = LD_SP(bb + 0);
  312. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  313. res0 += src_a0 * src_b0;
  314. res1 += src_a1 * src_b0;
  315. res2 += src_a0 * src_b1;
  316. res3 += src_a1 * src_b1;
  317. res4 += src_a0 * src_b2;
  318. res5 += src_a1 * src_b2;
  319. res6 += src_a0 * src_b3;
  320. res7 += src_a1 * src_b3;
  321. }
  322. LD_SP2(c, 4, src_c0, src_c1);
  323. LD_SP2(c_nxt1line, 4, src_c2, src_c3);
  324. LD_SP2(c_nxt2line, 4, src_c4, src_c5);
  325. LD_SP2(c_nxt3line, 4, src_c6, src_c7);
  326. src_c0 -= res0;
  327. src_c1 -= res1;
  328. src_c2 -= res2;
  329. src_c3 -= res3;
  330. src_c4 -= res4;
  331. src_c5 -= res5;
  332. src_c6 -= res6;
  333. src_c7 -= res7;
  334. }
  335. else
  336. {
  337. LD_SP2(c, 4, src_c0, src_c1);
  338. LD_SP2(c_nxt1line, 4, src_c2, src_c3);
  339. LD_SP2(c_nxt2line, 4, src_c4, src_c5);
  340. LD_SP2(c_nxt3line, 4, src_c6, src_c7);
  341. }
  342. a -= 32;
  343. b -= 16;
  344. src_b = LD_SP(b + 12);
  345. SPLATI_W4_SP(src_b, src_b12, src_b13, src_b14, src_b15);
  346. src_b8 = LD_SP(b + 8);
  347. src_b10 = (v4f32) __msa_splati_w((v4i32) src_b8, 2);
  348. src_b9 = (v4f32) __msa_splati_w((v4i32) src_b8, 1);
  349. src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0);
  350. src_b5 = __msa_cast_to_vector_float(*(b + 5));
  351. src_b5 = (v4f32) __msa_splati_w((v4i32) src_b5, 0);
  352. src_b4 = __msa_cast_to_vector_float(*(b + 4));
  353. src_b4 = (v4f32) __msa_splati_w((v4i32) src_b4, 0);
  354. src_b0 = __msa_cast_to_vector_float(*(b + 0));
  355. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  356. src_c7 *= src_b15;
  357. src_c6 *= src_b15;
  358. src_c5 -= src_c7 * src_b14;
  359. src_c4 -= src_c6 * src_b14;
  360. src_c3 -= src_c7 * src_b13;
  361. src_c2 -= src_c6 * src_b13;
  362. src_c1 -= src_c7 * src_b12;
  363. src_c0 -= src_c6 * src_b12;
  364. src_c5 *= src_b10;
  365. src_c4 *= src_b10;
  366. src_c3 -= src_c5 * src_b9;
  367. src_c2 -= src_c4 * src_b9;
  368. src_c1 -= src_c5 * src_b8;
  369. src_c0 -= src_c4 * src_b8;
  370. src_c3 *= src_b5;
  371. src_c2 *= src_b5;
  372. src_c1 -= src_c3 * src_b4;
  373. src_c0 -= src_c2 * src_b4;
  374. src_c1 *= src_b0;
  375. src_c0 *= src_b0;
  376. ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4);
  377. ST_SP4(src_c4, src_c5, src_c6, src_c7, a + 16, 4);
  378. ST_SP2(src_c0, src_c1, c, 4);
  379. ST_SP2(src_c2, src_c3, c_nxt1line, 4);
  380. ST_SP2(src_c4, src_c5, c_nxt2line, 4);
  381. ST_SP2(src_c6, src_c7, c_nxt3line, 4);
  382. }
  383. static void ssolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  384. {
  385. v4f32 src_c0, src_c1, src_c2, src_c3, src_b0, src_b2, src_b3;
  386. FLOAT *c_nxt1line = c + ldc;
  387. if (bk > 0)
  388. {
  389. BLASLONG k;
  390. FLOAT *aa = a, *bb = b;
  391. v4f32 src_a0, src_a1, src_b1, res0, res1, res2, res3;
  392. LD_SP2(aa, 4, src_a0, src_a1);
  393. src_b0 = __msa_cast_to_vector_float(*bb);
  394. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  395. src_b1 = __msa_cast_to_vector_float(*(bb + 1));
  396. src_b1 = (v4f32) __msa_splati_w((v4i32) src_b1, 0);
  397. res0 = src_a0 * src_b0;
  398. res1 = src_a1 * src_b0;
  399. res2 = src_a0 * src_b1;
  400. res3 = src_a1 * src_b1;
  401. for (k = (bk - 1) >> 1; k--;)
  402. {
  403. aa += 8;
  404. bb += 2;
  405. LD_SP2(aa, 4, src_a0, src_a1);
  406. src_b0 = __msa_cast_to_vector_float(*bb);
  407. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  408. src_b1 = __msa_cast_to_vector_float(*(bb + 1));
  409. src_b1 = (v4f32) __msa_splati_w((v4i32) src_b1, 0);
  410. res0 += src_a0 * src_b0;
  411. res1 += src_a1 * src_b0;
  412. res2 += src_a0 * src_b1;
  413. res3 += src_a1 * src_b1;
  414. aa += 8;
  415. bb += 2;
  416. LD_SP2(aa, 4, src_a0, src_a1);
  417. src_b0 = __msa_cast_to_vector_float(*bb);
  418. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  419. src_b1 = __msa_cast_to_vector_float(*(bb + 1));
  420. src_b1 = (v4f32) __msa_splati_w((v4i32) src_b1, 0);
  421. res0 += src_a0 * src_b0;
  422. res1 += src_a1 * src_b0;
  423. res2 += src_a0 * src_b1;
  424. res3 += src_a1 * src_b1;
  425. }
  426. if ((bk - 1) & 1)
  427. {
  428. aa += 8;
  429. bb += 2;
  430. LD_SP2(aa, 4, src_a0, src_a1);
  431. src_b0 = __msa_cast_to_vector_float(*bb);
  432. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  433. src_b1 = __msa_cast_to_vector_float(*(bb + 1));
  434. src_b1 = (v4f32) __msa_splati_w((v4i32) src_b1, 0);
  435. res0 += src_a0 * src_b0;
  436. res1 += src_a1 * src_b0;
  437. res2 += src_a0 * src_b1;
  438. res3 += src_a1 * src_b1;
  439. }
  440. LD_SP2(c, 4, src_c0, src_c1);
  441. LD_SP2(c_nxt1line, 4, src_c2, src_c3);
  442. src_c0 -= res0;
  443. src_c1 -= res1;
  444. src_c2 -= res2;
  445. src_c3 -= res3;
  446. }
  447. else
  448. {
  449. LD_SP2(c, 4, src_c0, src_c1);
  450. LD_SP2(c_nxt1line, 4, src_c2, src_c3);
  451. }
  452. a -= 16;
  453. b -= 4;
  454. src_b0 = __msa_cast_to_vector_float(*(b + 0));
  455. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  456. src_b2 = __msa_cast_to_vector_float(*(b + 2));
  457. src_b2 = (v4f32) __msa_splati_w((v4i32) src_b2, 0);
  458. src_b3 = __msa_cast_to_vector_float(*(b + 3));
  459. src_b3 = (v4f32) __msa_splati_w((v4i32) src_b3, 0);
  460. src_c2 *= src_b3;
  461. src_c3 *= src_b3;
  462. src_c0 -= src_c2 * src_b2;
  463. src_c1 -= src_c3 * src_b2;
  464. src_c0 *= src_b0;
  465. src_c1 *= src_b0;
  466. ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4);
  467. ST_SP2(src_c0, src_c1, c, 4);
  468. ST_SP2(src_c2, src_c3, c_nxt1line, 4);
  469. }
  470. static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
  471. {
  472. v4f32 src_c0, src_c1, src_b0;
  473. if (bk > 0)
  474. {
  475. BLASLONG k;
  476. FLOAT *aa = a, *bb = b;
  477. v4f32 src_a0, src_a1, res0, res1;
  478. LD_SP2(aa, 4, src_a0, src_a1);
  479. src_b0 = __msa_cast_to_vector_float(*bb);
  480. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  481. res0 = src_a0 * src_b0;
  482. res1 = src_a1 * src_b0;
  483. for (k = (bk - 1) >> 2; k--;)
  484. {
  485. aa += 8;
  486. bb += 1;
  487. LD_SP2(aa, 4, src_a0, src_a1);
  488. src_b0 = __msa_cast_to_vector_float(*bb);
  489. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  490. res0 += src_a0 * src_b0;
  491. res1 += src_a1 * src_b0;
  492. aa += 8;
  493. bb += 1;
  494. LD_SP2(aa, 4, src_a0, src_a1);
  495. src_b0 = __msa_cast_to_vector_float(*bb);
  496. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  497. res0 += src_a0 * src_b0;
  498. res1 += src_a1 * src_b0;
  499. aa += 8;
  500. bb += 1;
  501. LD_SP2(aa, 4, src_a0, src_a1);
  502. src_b0 = __msa_cast_to_vector_float(*bb);
  503. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  504. res0 += src_a0 * src_b0;
  505. res1 += src_a1 * src_b0;
  506. aa += 8;
  507. bb += 1;
  508. LD_SP2(aa, 4, src_a0, src_a1);
  509. src_b0 = __msa_cast_to_vector_float(*bb);
  510. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  511. res0 += src_a0 * src_b0;
  512. res1 += src_a1 * src_b0;
  513. }
  514. if ((bk - 1) & 3)
  515. {
  516. if ((bk - 1) & 2)
  517. {
  518. aa += 8;
  519. bb += 1;
  520. LD_SP2(aa, 4, src_a0, src_a1);
  521. src_b0 = __msa_cast_to_vector_float(*bb);
  522. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  523. res0 += src_a0 * src_b0;
  524. res1 += src_a1 * src_b0;
  525. aa += 8;
  526. bb += 1;
  527. LD_SP2(aa, 4, src_a0, src_a1);
  528. src_b0 = __msa_cast_to_vector_float(*bb);
  529. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  530. res0 += src_a0 * src_b0;
  531. res1 += src_a1 * src_b0;
  532. }
  533. if ((bk - 1) & 1)
  534. {
  535. aa += 8;
  536. bb += 1;
  537. LD_SP2(aa, 4, src_a0, src_a1);
  538. src_b0 = __msa_cast_to_vector_float(*bb);
  539. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  540. res0 += src_a0 * src_b0;
  541. res1 += src_a1 * src_b0;
  542. }
  543. }
  544. LD_SP2(c, 4, src_c0, src_c1);
  545. src_c0 -= res0;
  546. src_c1 -= res1;
  547. }
  548. else
  549. {
  550. LD_SP2(c, 4, src_c0, src_c1);
  551. }
  552. a -= 8;
  553. b -= 1;
  554. src_b0 = __msa_cast_to_vector_float(*(b + 0));
  555. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  556. src_c0 *= src_b0;
  557. src_c1 *= src_b0;
  558. ST_SP2(src_c0, src_c1, a, 4);
  559. ST_SP2(src_c0, src_c1, c, 4);
  560. }
  561. static void ssolve_4x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  562. {
  563. v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
  564. v4f32 src_b, src_b0, src_b8, src_b9, src_b16, src_b17, src_b18, src_b24;
  565. v4f32 src_b25, src_b26, src_b27, src_b32, src_b33, src_b34, src_b35;
  566. v4f32 src_b36, src_b40, src_b41, src_b42, src_b43, src_b44, src_b45;
  567. v4f32 src_b48, src_b49, src_b50, src_b51, src_b52, src_b53, src_b54;
  568. v4f32 src_b56, src_b57, src_b58, src_b59, src_b60, src_b61, src_b62, src_b63;
  569. FLOAT *c_nxt1line = c + ldc;
  570. FLOAT *c_nxt2line = c + 2 * ldc;
  571. FLOAT *c_nxt3line = c + 3 * ldc;
  572. FLOAT *c_nxt4line = c + 4 * ldc;
  573. FLOAT *c_nxt5line = c + 5 * ldc;
  574. FLOAT *c_nxt6line = c + 6 * ldc;
  575. FLOAT *c_nxt7line = c + 7 * ldc;
  576. if (bk > 0)
  577. {
  578. BLASLONG k;
  579. FLOAT *aa = a, *bb = b;
  580. v4f32 src_a0, src_b1, src_b2, src_b3;
  581. v4f32 res0, res1, res2, res3, res4, res5, res6, res7;
  582. src_a0 = LD_SP(aa);
  583. src_b = LD_SP(bb + 0);
  584. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  585. res0 = src_a0 * src_b0;
  586. res1 = src_a0 * src_b1;
  587. res2 = src_a0 * src_b2;
  588. res3 = src_a0 * src_b3;
  589. src_b = LD_SP(bb + 4);
  590. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  591. res4 = src_a0 * src_b0;
  592. res5 = src_a0 * src_b1;
  593. res6 = src_a0 * src_b2;
  594. res7 = src_a0 * src_b3;
  595. for (k = (bk - 1); k--;)
  596. {
  597. aa += 4;
  598. bb += 8;
  599. src_a0 = LD_SP(aa);
  600. src_b = LD_SP(bb + 0);
  601. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  602. res0 += src_a0 * src_b0;
  603. res1 += src_a0 * src_b1;
  604. res2 += src_a0 * src_b2;
  605. res3 += src_a0 * src_b3;
  606. src_b = LD_SP(bb + 4);
  607. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  608. res4 += src_a0 * src_b0;
  609. res5 += src_a0 * src_b1;
  610. res6 += src_a0 * src_b2;
  611. res7 += src_a0 * src_b3;
  612. }
  613. src_c0 = LD_SP(c);
  614. src_c1 = LD_SP(c_nxt1line);
  615. src_c2 = LD_SP(c_nxt2line);
  616. src_c3 = LD_SP(c_nxt3line);
  617. src_c4 = LD_SP(c_nxt4line);
  618. src_c5 = LD_SP(c_nxt5line);
  619. src_c6 = LD_SP(c_nxt6line);
  620. src_c7 = LD_SP(c_nxt7line);
  621. src_c0 -= res0;
  622. src_c1 -= res1;
  623. src_c2 -= res2;
  624. src_c3 -= res3;
  625. src_c4 -= res4;
  626. src_c5 -= res5;
  627. src_c6 -= res6;
  628. src_c7 -= res7;
  629. }
  630. else
  631. {
  632. src_c0 = LD_SP(c);
  633. src_c1 = LD_SP(c_nxt1line);
  634. src_c2 = LD_SP(c_nxt2line);
  635. src_c3 = LD_SP(c_nxt3line);
  636. src_c4 = LD_SP(c_nxt4line);
  637. src_c5 = LD_SP(c_nxt5line);
  638. src_c6 = LD_SP(c_nxt6line);
  639. src_c7 = LD_SP(c_nxt7line);
  640. }
  641. a -= 32;
  642. b -= 64;
  643. src_b = LD_SP(b + 60);
  644. SPLATI_W4_SP(src_b, src_b60, src_b61, src_b62, src_b63);
  645. src_b = LD_SP(b + 56);
  646. SPLATI_W4_SP(src_b, src_b56, src_b57, src_b58, src_b59);
  647. src_b = LD_SP(b + 48);
  648. SPLATI_W4_SP(src_b, src_b48, src_b49, src_b50, src_b51);
  649. src_b52 = LD_SP(b + 52);
  650. src_b54 = (v4f32) __msa_splati_w((v4i32) src_b52, 2);
  651. src_b53 = (v4f32) __msa_splati_w((v4i32) src_b52, 1);
  652. src_b52 = (v4f32) __msa_splati_w((v4i32) src_b52, 0);
  653. src_b = LD_SP(b + 40);
  654. SPLATI_W4_SP(src_b, src_b40, src_b41, src_b42, src_b43);
  655. src_b44 = LD_SP(b + 44);
  656. src_b45 = (v4f32) __msa_splati_w((v4i32) src_b44, 1);
  657. src_b44 = (v4f32) __msa_splati_w((v4i32) src_b44, 0);
  658. src_b = LD_SP(b + 32);
  659. SPLATI_W4_SP(src_b, src_b32, src_b33, src_b34, src_b35);
  660. src_b36 = __msa_cast_to_vector_float(*(b + 36));
  661. src_b36 = (v4f32) __msa_splati_w((v4i32) src_b36, 0);
  662. src_b = LD_SP(b + 24);
  663. SPLATI_W4_SP(src_b, src_b24, src_b25, src_b26, src_b27);
  664. src_b16 = LD_SP(b + 16);
  665. src_b18 = (v4f32) __msa_splati_w((v4i32) src_b16, 2);
  666. src_b17 = (v4f32) __msa_splati_w((v4i32) src_b16, 1);
  667. src_b16 = (v4f32) __msa_splati_w((v4i32) src_b16, 0);
  668. src_b9 = __msa_cast_to_vector_float(*(b + 9));
  669. src_b9 = (v4f32) __msa_splati_w((v4i32) src_b9, 0);
  670. src_b8 = __msa_cast_to_vector_float(*(b + 8));
  671. src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0);
  672. src_b0 = __msa_cast_to_vector_float(*(b + 0));
  673. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  674. src_c7 *= src_b63;
  675. src_c6 -= src_c7 * src_b62;
  676. src_c5 -= src_c7 * src_b61;
  677. src_c4 -= src_c7 * src_b60;
  678. src_c3 -= src_c7 * src_b59;
  679. src_c2 -= src_c7 * src_b58;
  680. src_c1 -= src_c7 * src_b57;
  681. src_c0 -= src_c7 * src_b56;
  682. src_c6 *= src_b54;
  683. src_c5 -= src_c6 * src_b53;
  684. src_c4 -= src_c6 * src_b52;
  685. src_c3 -= src_c6 * src_b51;
  686. src_c2 -= src_c6 * src_b50;
  687. src_c1 -= src_c6 * src_b49;
  688. src_c0 -= src_c6 * src_b48;
  689. src_c5 *= src_b45;
  690. src_c4 -= src_c5 * src_b44;
  691. src_c3 -= src_c5 * src_b43;
  692. src_c2 -= src_c5 * src_b42;
  693. src_c1 -= src_c5 * src_b41;
  694. src_c0 -= src_c5 * src_b40;
  695. src_c4 *= src_b36;
  696. src_c3 -= src_c4 * src_b35;
  697. src_c2 -= src_c4 * src_b34;
  698. src_c1 -= src_c4 * src_b33;
  699. src_c0 -= src_c4 * src_b32;
  700. src_c3 *= src_b27;
  701. src_c2 -= src_c3 * src_b26;
  702. src_c1 -= src_c3 * src_b25;
  703. src_c0 -= src_c3 * src_b24;
  704. src_c2 *= src_b18;
  705. src_c1 -= src_c2 * src_b17;
  706. src_c0 -= src_c2 * src_b16;
  707. src_c1 *= src_b9;
  708. src_c0 -= src_c1 * src_b8;
  709. src_c0 *= src_b0;
  710. ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4);
  711. ST_SP4(src_c4, src_c5, src_c6, src_c7, a + 16, 4);
  712. ST_SP(src_c0, c);
  713. ST_SP(src_c1, c_nxt1line);
  714. ST_SP(src_c2, c_nxt2line);
  715. ST_SP(src_c3, c_nxt3line);
  716. ST_SP(src_c4, c_nxt4line);
  717. ST_SP(src_c5, c_nxt5line);
  718. ST_SP(src_c6, c_nxt6line);
  719. ST_SP(src_c7, c_nxt7line);
  720. }
  721. static void ssolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  722. {
  723. v4f32 src_c0, src_c1, src_c2, src_c3, src_b;
  724. v4f32 src_b0, src_b4, src_b5, src_b8, src_b9, src_b10, src_b12, src_b13;
  725. v4f32 src_b14, src_b15;
  726. FLOAT *c_nxt1line = c + ldc;
  727. FLOAT *c_nxt2line = c + 2 * ldc;
  728. FLOAT *c_nxt3line = c + 3 * ldc;
  729. if (bk > 0)
  730. {
  731. BLASLONG k;
  732. FLOAT *aa = a, *bb = b;
  733. v4f32 src_a, src_b1, src_b2, src_b3, res0, res1, res2, res3;
  734. src_a = LD_SP(aa);
  735. src_b = LD_SP(bb);
  736. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  737. res0 = src_a * src_b0;
  738. res1 = src_a * src_b1;
  739. res2 = src_a * src_b2;
  740. res3 = src_a * src_b3;
  741. for (k = ((bk - 1) >> 1); k--;)
  742. {
  743. aa += 4;
  744. bb += 4;
  745. src_a = LD_SP(aa);
  746. src_b = LD_SP(bb);
  747. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  748. res0 += src_a * src_b0;
  749. res1 += src_a * src_b1;
  750. res2 += src_a * src_b2;
  751. res3 += src_a * src_b3;
  752. aa += 4;
  753. bb += 4;
  754. src_a = LD_SP(aa);
  755. src_b = LD_SP(bb);
  756. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  757. res0 += src_a * src_b0;
  758. res1 += src_a * src_b1;
  759. res2 += src_a * src_b2;
  760. res3 += src_a * src_b3;
  761. }
  762. if ((bk - 1) & 1)
  763. {
  764. aa += 4;
  765. bb += 4;
  766. src_a = LD_SP(aa);
  767. src_b = LD_SP(bb);
  768. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  769. res0 += src_a * src_b0;
  770. res1 += src_a * src_b1;
  771. res2 += src_a * src_b2;
  772. res3 += src_a * src_b3;
  773. }
  774. src_c0 = LD_SP(c);
  775. src_c1 = LD_SP(c_nxt1line);
  776. src_c2 = LD_SP(c_nxt2line);
  777. src_c3 = LD_SP(c_nxt3line);
  778. src_c0 -= res0;
  779. src_c1 -= res1;
  780. src_c2 -= res2;
  781. src_c3 -= res3;
  782. }
  783. else
  784. {
  785. src_c0 = LD_SP(c);
  786. src_c1 = LD_SP(c_nxt1line);
  787. src_c2 = LD_SP(c_nxt2line);
  788. src_c3 = LD_SP(c_nxt3line);
  789. }
  790. a -= 16;
  791. b -= 16;
  792. src_b = LD_SP(b + 12);
  793. SPLATI_W4_SP(src_b, src_b12, src_b13, src_b14, src_b15);
  794. src_b8 = LD_SP(b + 8);
  795. src_b10 = (v4f32) __msa_splati_w((v4i32) src_b8, 2);
  796. src_b9 = (v4f32) __msa_splati_w((v4i32) src_b8, 1);
  797. src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0);
  798. src_b5 = __msa_cast_to_vector_float(*(b + 5));
  799. src_b5 = (v4f32) __msa_splati_w((v4i32) src_b5, 0);
  800. src_b4 = __msa_cast_to_vector_float(*(b + 4));
  801. src_b4 = (v4f32) __msa_splati_w((v4i32) src_b4, 0);
  802. src_b0 = __msa_cast_to_vector_float(*(b + 0));
  803. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  804. src_c3 *= src_b15;
  805. src_c2 -= src_c3 * src_b14;
  806. src_c1 -= src_c3 * src_b13;
  807. src_c0 -= src_c3 * src_b12;
  808. src_c2 *= src_b10;
  809. src_c1 -= src_c2 * src_b9;
  810. src_c0 -= src_c2 * src_b8;
  811. src_c1 *= src_b5;
  812. src_c0 -= src_c1 * src_b4;
  813. src_c0 *= src_b0;
  814. ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4);
  815. ST_SP(src_c0, c);
  816. ST_SP(src_c1, c_nxt1line);
  817. ST_SP(src_c2, c_nxt2line);
  818. ST_SP(src_c3, c_nxt3line);
  819. }
  820. static void ssolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  821. {
  822. v4f32 src_c0, src_c1, src_b0, src_b2, src_b3;
  823. FLOAT *c_nxt1line = c + ldc;
  824. if (bk > 0)
  825. {
  826. BLASLONG k;
  827. FLOAT *aa = a, *bb = b;
  828. v4f32 src_a, src_b1, res0, res1;
  829. src_a = LD_SP(aa);
  830. src_b0 = LD_SP(bb);
  831. src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
  832. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  833. res0 = src_a * src_b0;
  834. res1 = src_a * src_b1;
  835. for (k = ((bk - 1) >> 1); k--;)
  836. {
  837. aa += 4;
  838. bb += 2;
  839. src_a = LD_SP(aa);
  840. src_b0 = LD_SP(bb);
  841. src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
  842. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  843. res0 += src_a * src_b0;
  844. res1 += src_a * src_b1;
  845. aa += 4;
  846. bb += 2;
  847. src_a = LD_SP(aa);
  848. src_b0 = LD_SP(bb);
  849. src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
  850. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  851. res0 += src_a * src_b0;
  852. res1 += src_a * src_b1;
  853. }
  854. if ((bk - 1) & 1)
  855. {
  856. aa += 4;
  857. bb += 2;
  858. src_a = LD_SP(aa);
  859. src_b0 = LD_SP(bb);
  860. src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
  861. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  862. res0 += src_a * src_b0;
  863. res1 += src_a * src_b1;
  864. }
  865. src_c0 = LD_SP(c);
  866. src_c1 = LD_SP(c_nxt1line);
  867. src_c0 -= res0;
  868. src_c1 -= res1;
  869. }
  870. else
  871. {
  872. src_c0 = LD_SP(c);
  873. src_c1 = LD_SP(c_nxt1line);
  874. }
  875. a -= 8;
  876. b -= 4;
  877. src_b3 = __msa_cast_to_vector_float(*(b + 3));
  878. src_b3 = (v4f32) __msa_splati_w((v4i32) src_b3, 0);
  879. src_b2 = __msa_cast_to_vector_float(*(b + 2));
  880. src_b2 = (v4f32) __msa_splati_w((v4i32) src_b2, 0);
  881. src_b0 = __msa_cast_to_vector_float(*(b + 0));
  882. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  883. src_c1 *= src_b3;
  884. src_c0 -= src_c1 * src_b2;
  885. src_c0 *= src_b0;
  886. ST_SP2(src_c0, src_c1, a, 4);
  887. ST_SP(src_c0, c);
  888. ST_SP(src_c1, c_nxt1line);
  889. }
  890. static void ssolve_4x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
  891. {
  892. FLOAT b0, c0, c1, c2, c3;
  893. c0 = *(c + 0);
  894. c1 = *(c + 1);
  895. c2 = *(c + 2);
  896. c3 = *(c + 3);
  897. if (bk > 0)
  898. {
  899. BLASLONG k;
  900. FLOAT *aa = a, *bb = b;
  901. FLOAT t0, t1, t2, t3;
  902. t0 = aa[0] * bb[0];
  903. t1 = aa[1] * bb[0];
  904. t2 = aa[2] * bb[0];
  905. t3 = aa[3] * bb[0];
  906. for (k = (bk - 1); k--;)
  907. {
  908. aa += 4;
  909. bb += 1;
  910. t0 += aa[0] * bb[0];
  911. t1 += aa[1] * bb[0];
  912. t2 += aa[2] * bb[0];
  913. t3 += aa[3] * bb[0];
  914. }
  915. c0 -= t0;
  916. c1 -= t1;
  917. c2 -= t2;
  918. c3 -= t3;
  919. }
  920. a -= 4;
  921. b -= 1;
  922. b0 = *b;
  923. c0 *= b0;
  924. c1 *= b0;
  925. c2 *= b0;
  926. c3 *= b0;
  927. *(a + 0) = c0;
  928. *(a + 1) = c1;
  929. *(a + 2) = c2;
  930. *(a + 3) = c3;
  931. *(c + 0) = c0;
  932. *(c + 1) = c1;
  933. *(c + 2) = c2;
  934. *(c + 3) = c3;
  935. }
  936. static void ssolve_2x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  937. {
  938. FLOAT b0, b8, b9, b16, b17, b18, b24, b25, b26, b27, b32, b33, b34, b35;
  939. FLOAT b36, b40, b41, b42, b43, b44, b45, b48, b49, b50, b51, b52, b53, b54;
  940. FLOAT b56, b57, b58, b59, b60, b61, b62, b63, c0_nxt7, c1_nxt7;
  941. FLOAT c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3;
  942. FLOAT c0_nxt4, c1_nxt4, c0_nxt5, c1_nxt5, c0_nxt6, c1_nxt6;
  943. c0 = *(c + 0);
  944. c1 = *(c + 1);
  945. c0_nxt1 = *(c + 0 + 1 * ldc);
  946. c1_nxt1 = *(c + 1 + 1 * ldc);
  947. c0_nxt2 = *(c + 0 + 2 * ldc);
  948. c1_nxt2 = *(c + 1 + 2 * ldc);
  949. c0_nxt3 = *(c + 0 + 3 * ldc);
  950. c1_nxt3 = *(c + 1 + 3 * ldc);
  951. c0_nxt4 = *(c + 0 + 4 * ldc);
  952. c1_nxt4 = *(c + 1 + 4 * ldc);
  953. c0_nxt5 = *(c + 0 + 5 * ldc);
  954. c1_nxt5 = *(c + 1 + 5 * ldc);
  955. c0_nxt6 = *(c + 0 + 6 * ldc);
  956. c1_nxt6 = *(c + 1 + 6 * ldc);
  957. c0_nxt7 = *(c + 0 + 7 * ldc);
  958. c1_nxt7 = *(c + 1 + 7 * ldc);
  959. if (bk > 0)
  960. {
  961. BLASLONG k;
  962. FLOAT *aa = a, *bb = b;
  963. FLOAT res[16];
  964. res[0] = aa[0] * bb[0];
  965. res[1] = aa[1] * bb[0];
  966. res[2] = aa[0] * bb[1];
  967. res[3] = aa[1] * bb[1];
  968. res[4] = aa[0] * bb[2];
  969. res[5] = aa[1] * bb[2];
  970. res[6] = aa[0] * bb[3];
  971. res[7] = aa[1] * bb[3];
  972. res[8] = aa[0] * bb[4];
  973. res[9] = aa[1] * bb[4];
  974. res[10] = aa[0] * bb[5];
  975. res[11] = aa[1] * bb[5];
  976. res[12] = aa[0] * bb[6];
  977. res[13] = aa[1] * bb[6];
  978. res[14] = aa[0] * bb[7];
  979. res[15] = aa[1] * bb[7];
  980. for (k = (bk - 1); k--;)
  981. {
  982. aa += 2;
  983. bb += 8;
  984. res[0] += aa[0] * bb[0];
  985. res[1] += aa[1] * bb[0];
  986. res[2] += aa[0] * bb[1];
  987. res[3] += aa[1] * bb[1];
  988. res[4] += aa[0] * bb[2];
  989. res[5] += aa[1] * bb[2];
  990. res[6] += aa[0] * bb[3];
  991. res[7] += aa[1] * bb[3];
  992. res[8] += aa[0] * bb[4];
  993. res[9] += aa[1] * bb[4];
  994. res[10] += aa[0] * bb[5];
  995. res[11] += aa[1] * bb[5];
  996. res[12] += aa[0] * bb[6];
  997. res[13] += aa[1] * bb[6];
  998. res[14] += aa[0] * bb[7];
  999. res[15] += aa[1] * bb[7];
  1000. }
  1001. c0 -= res[0];
  1002. c1 -= res[1];
  1003. c0_nxt1 -= res[2];
  1004. c1_nxt1 -= res[3];
  1005. c0_nxt2 -= res[4];
  1006. c1_nxt2 -= res[5];
  1007. c0_nxt3 -= res[6];
  1008. c1_nxt3 -= res[7];
  1009. c0_nxt4 -= res[8];
  1010. c1_nxt4 -= res[9];
  1011. c0_nxt5 -= res[10];
  1012. c1_nxt5 -= res[11];
  1013. c0_nxt6 -= res[12];
  1014. c1_nxt6 -= res[13];
  1015. c0_nxt7 -= res[14];
  1016. c1_nxt7 -= res[15];
  1017. }
  1018. a -= 16;
  1019. b -= 64;
  1020. b0 = *(b + 0);
  1021. b8 = *(b + 8);
  1022. b9 = *(b + 9);
  1023. b16 = *(b + 16);
  1024. b17 = *(b + 17);
  1025. b18 = *(b + 18);
  1026. b24 = *(b + 24);
  1027. b25 = *(b + 25);
  1028. b26 = *(b + 26);
  1029. b27 = *(b + 27);
  1030. b32 = *(b + 32);
  1031. b33 = *(b + 33);
  1032. b34 = *(b + 34);
  1033. b35 = *(b + 35);
  1034. b36 = *(b + 36);
  1035. b40 = *(b + 40);
  1036. b41 = *(b + 41);
  1037. b42 = *(b + 42);
  1038. b43 = *(b + 43);
  1039. b44 = *(b + 44);
  1040. b45 = *(b + 45);
  1041. b48 = *(b + 48);
  1042. b49 = *(b + 49);
  1043. b50 = *(b + 50);
  1044. b51 = *(b + 51);
  1045. b52 = *(b + 52);
  1046. b53 = *(b + 53);
  1047. b54 = *(b + 54);
  1048. b56 = *(b + 56);
  1049. b57 = *(b + 57);
  1050. b58 = *(b + 58);
  1051. b59 = *(b + 59);
  1052. b60 = *(b + 60);
  1053. b61 = *(b + 61);
  1054. b62 = *(b + 62);
  1055. b63 = *(b + 63);
  1056. c0_nxt7 *= b63;
  1057. c1_nxt7 *= b63;
  1058. c0_nxt6 -= c0_nxt7 * b62;
  1059. c1_nxt6 -= c1_nxt7 * b62;
  1060. c0_nxt6 *= b54;
  1061. c1_nxt6 *= b54;
  1062. c0_nxt5 -= c0_nxt7 * b61;
  1063. c1_nxt5 -= c1_nxt7 * b61;
  1064. c0_nxt5 -= c0_nxt6 * b53;
  1065. c1_nxt5 -= c1_nxt6 * b53;
  1066. c0_nxt5 *= b45;
  1067. c1_nxt5 *= b45;
  1068. c0_nxt4 -= c0_nxt7 * b60;
  1069. c1_nxt4 -= c1_nxt7 * b60;
  1070. c0_nxt4 -= c0_nxt6 * b52;
  1071. c1_nxt4 -= c1_nxt6 * b52;
  1072. c0_nxt4 -= c0_nxt5 * b44;
  1073. c1_nxt4 -= c1_nxt5 * b44;
  1074. c0_nxt4 *= b36;
  1075. c1_nxt4 *= b36;
  1076. c0_nxt3 -= c0_nxt7 * b59;
  1077. c1_nxt3 -= c1_nxt7 * b59;
  1078. c0_nxt3 -= c0_nxt6 * b51;
  1079. c1_nxt3 -= c1_nxt6 * b51;
  1080. c0_nxt3 -= c0_nxt5 * b43;
  1081. c1_nxt3 -= c1_nxt5 * b43;
  1082. c0_nxt3 -= c0_nxt4 * b35;
  1083. c1_nxt3 -= c1_nxt4 * b35;
  1084. c0_nxt3 *= b27;
  1085. c1_nxt3 *= b27;
  1086. c0_nxt2 -= c0_nxt7 * b58;
  1087. c1_nxt2 -= c1_nxt7 * b58;
  1088. c0_nxt2 -= c0_nxt6 * b50;
  1089. c1_nxt2 -= c1_nxt6 * b50;
  1090. c0_nxt2 -= c0_nxt5 * b42;
  1091. c1_nxt2 -= c1_nxt5 * b42;
  1092. c0_nxt2 -= c0_nxt4 * b34;
  1093. c1_nxt2 -= c1_nxt4 * b34;
  1094. c0_nxt2 -= c0_nxt3 * b26;
  1095. c1_nxt2 -= c1_nxt3 * b26;
  1096. c0_nxt2 *= b18;
  1097. c1_nxt2 *= b18;
  1098. c0_nxt1 -= c0_nxt7 * b57;
  1099. c1_nxt1 -= c1_nxt7 * b57;
  1100. c0_nxt1 -= c0_nxt6 * b49;
  1101. c1_nxt1 -= c1_nxt6 * b49;
  1102. c0_nxt1 -= c0_nxt5 * b41;
  1103. c1_nxt1 -= c1_nxt5 * b41;
  1104. c0_nxt1 -= c0_nxt4 * b33;
  1105. c1_nxt1 -= c1_nxt4 * b33;
  1106. c0_nxt1 -= c0_nxt3 * b25;
  1107. c1_nxt1 -= c1_nxt3 * b25;
  1108. c0_nxt1 -= c0_nxt2 * b17;
  1109. c1_nxt1 -= c1_nxt2 * b17;
  1110. c0_nxt1 *= b9;
  1111. c1_nxt1 *= b9;
  1112. c0 -= c0_nxt7 * b56;
  1113. c1 -= c1_nxt7 * b56;
  1114. c0 -= c0_nxt6 * b48;
  1115. c1 -= c1_nxt6 * b48;
  1116. c0 -= c0_nxt5 * b40;
  1117. c1 -= c1_nxt5 * b40;
  1118. c0 -= c0_nxt4 * b32;
  1119. c1 -= c1_nxt4 * b32;
  1120. c0 -= c0_nxt3 * b24;
  1121. c1 -= c1_nxt3 * b24;
  1122. c0 -= c0_nxt2 * b16;
  1123. c1 -= c1_nxt2 * b16;
  1124. c0 -= c0_nxt1 * b8;
  1125. c1 -= c1_nxt1 * b8;
  1126. c0 *= b0;
  1127. c1 *= b0;
  1128. *(a + 0) = c0;
  1129. *(a + 1) = c1;
  1130. *(a + 2) = c0_nxt1;
  1131. *(a + 3) = c1_nxt1;
  1132. *(a + 4) = c0_nxt2;
  1133. *(a + 5) = c1_nxt2;
  1134. *(a + 6) = c0_nxt3;
  1135. *(a + 7) = c1_nxt3;
  1136. *(a + 8) = c0_nxt4;
  1137. *(a + 9) = c1_nxt4;
  1138. *(a + 10) = c0_nxt5;
  1139. *(a + 11) = c1_nxt5;
  1140. *(a + 12) = c0_nxt6;
  1141. *(a + 13) = c1_nxt6;
  1142. *(a + 14) = c0_nxt7;
  1143. *(a + 15) = c1_nxt7;
  1144. *(c + 0) = c0;
  1145. *(c + 1) = c1;
  1146. *(c + 0 + 1 * ldc) = c0_nxt1;
  1147. *(c + 1 + 1 * ldc) = c1_nxt1;
  1148. *(c + 0 + 2 * ldc) = c0_nxt2;
  1149. *(c + 1 + 2 * ldc) = c1_nxt2;
  1150. *(c + 0 + 3 * ldc) = c0_nxt3;
  1151. *(c + 1 + 3 * ldc) = c1_nxt3;
  1152. *(c + 0 + 4 * ldc) = c0_nxt4;
  1153. *(c + 1 + 4 * ldc) = c1_nxt4;
  1154. *(c + 0 + 5 * ldc) = c0_nxt5;
  1155. *(c + 1 + 5 * ldc) = c1_nxt5;
  1156. *(c + 0 + 6 * ldc) = c0_nxt6;
  1157. *(c + 1 + 6 * ldc) = c1_nxt6;
  1158. *(c + 0 + 7 * ldc) = c0_nxt7;
  1159. *(c + 1 + 7 * ldc) = c1_nxt7;
  1160. }
  1161. static void ssolve_2x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  1162. {
  1163. FLOAT b0, b4, b5, b8, b9, b10, b12, b13, b14, b15;
  1164. FLOAT c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3;
  1165. c0 = *(c + 0);
  1166. c1 = *(c + 1);
  1167. c0_nxt1 = *(c + 0 + 1 * ldc);
  1168. c1_nxt1 = *(c + 1 + 1 * ldc);
  1169. c0_nxt2 = *(c + 0 + 2 * ldc);
  1170. c1_nxt2 = *(c + 1 + 2 * ldc);
  1171. c0_nxt3 = *(c + 0 + 3 * ldc);
  1172. c1_nxt3 = *(c + 1 + 3 * ldc);
  1173. if (bk > 0)
  1174. {
  1175. BLASLONG k;
  1176. FLOAT *aa = a, *bb = b;
  1177. FLOAT res[8];
  1178. res[0] = aa[0] * bb[0];
  1179. res[1] = aa[1] * bb[0];
  1180. res[2] = aa[0] * bb[1];
  1181. res[3] = aa[1] * bb[1];
  1182. res[4] = aa[0] * bb[2];
  1183. res[5] = aa[1] * bb[2];
  1184. res[6] = aa[0] * bb[3];
  1185. res[7] = aa[1] * bb[3];
  1186. for (k = (bk - 1); k--;)
  1187. {
  1188. aa += 2;
  1189. bb += 4;
  1190. res[0] += aa[0] * bb[0];
  1191. res[1] += aa[1] * bb[0];
  1192. res[2] += aa[0] * bb[1];
  1193. res[3] += aa[1] * bb[1];
  1194. res[4] += aa[0] * bb[2];
  1195. res[5] += aa[1] * bb[2];
  1196. res[6] += aa[0] * bb[3];
  1197. res[7] += aa[1] * bb[3];
  1198. }
  1199. c0 -= res[0];
  1200. c1 -= res[1];
  1201. c0_nxt1 -= res[2];
  1202. c1_nxt1 -= res[3];
  1203. c0_nxt2 -= res[4];
  1204. c1_nxt2 -= res[5];
  1205. c0_nxt3 -= res[6];
  1206. c1_nxt3 -= res[7];
  1207. }
  1208. a -= 8;
  1209. b -= 16;
  1210. b0 = *b;
  1211. b4 = *(b + 4);
  1212. b5 = *(b + 5);
  1213. b8 = *(b + 8);
  1214. b9 = *(b + 9);
  1215. b10 = *(b + 10);
  1216. b12 = *(b + 12);
  1217. b13 = *(b + 13);
  1218. b14 = *(b + 14);
  1219. b15 = *(b + 15);
  1220. c0_nxt3 *= b15;
  1221. c1_nxt3 *= b15;
  1222. c0_nxt2 = (c0_nxt2 - c0_nxt3 * b14) * b10;
  1223. c1_nxt2 = (c1_nxt2 - c1_nxt3 * b14) * b10;
  1224. c0_nxt1 = ((c0_nxt1 - c0_nxt3 * b13) - c0_nxt2 * b9) * b5;
  1225. c1_nxt1 = ((c1_nxt1 - c1_nxt3 * b13) - c1_nxt2 * b9) * b5;
  1226. c0 = (((c0 - c0_nxt3 * b12) - c0_nxt2 * b8) - c0_nxt1 * b4) * b0;
  1227. c1 = (((c1 - c1_nxt3 * b12) - c1_nxt2 * b8) - c1_nxt1 * b4) * b0;
  1228. *(a + 0) = c0;
  1229. *(a + 1) = c1;
  1230. *(a + 2) = c0_nxt1;
  1231. *(a + 3) = c1_nxt1;
  1232. *(a + 4) = c0_nxt2;
  1233. *(a + 5) = c1_nxt2;
  1234. *(a + 6) = c0_nxt3;
  1235. *(a + 7) = c1_nxt3;
  1236. *(c + 0) = c0;
  1237. *(c + 1) = c1;
  1238. *(c + 0 + 1 * ldc) = c0_nxt1;
  1239. *(c + 1 + 1 * ldc) = c1_nxt1;
  1240. *(c + 0 + 2 * ldc) = c0_nxt2;
  1241. *(c + 1 + 2 * ldc) = c1_nxt2;
  1242. *(c + 0 + 3 * ldc) = c0_nxt3;
  1243. *(c + 1 + 3 * ldc) = c1_nxt3;
  1244. }
  1245. static void ssolve_2x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  1246. {
  1247. FLOAT b0, b2, b3, c0, c1, c0_nxt, c1_nxt;
  1248. c0 = *(c + 0);
  1249. c1 = *(c + 1);
  1250. c0_nxt = *(c + 0 + ldc);
  1251. c1_nxt = *(c + 1 + ldc);
  1252. if (bk > 0)
  1253. {
  1254. BLASLONG k;
  1255. FLOAT *aa = a, *bb = b;
  1256. FLOAT res[4];
  1257. res[0] = aa[0] * bb[0];
  1258. res[1] = aa[1] * bb[0];
  1259. res[2] = aa[0] * bb[1];
  1260. res[3] = aa[1] * bb[1];
  1261. for (k = (bk - 1); k--;)
  1262. {
  1263. aa += 2;
  1264. bb += 2;
  1265. res[0] += aa[0] * bb[0];
  1266. res[1] += aa[1] * bb[0];
  1267. res[2] += aa[0] * bb[1];
  1268. res[3] += aa[1] * bb[1];
  1269. }
  1270. c0 -= res[0];
  1271. c1 -= res[1];
  1272. c0_nxt -= res[2];
  1273. c1_nxt -= res[3];
  1274. }
  1275. a -= 4;
  1276. b -= 4;
  1277. b3 = *(b + 3);
  1278. b2 = *(b + 2);
  1279. b0 = *b;
  1280. c0_nxt *= b3;
  1281. c1_nxt *= b3;
  1282. c0 -= c0_nxt * b2;
  1283. c1 -= c1_nxt * b2;
  1284. c0 *= b0;
  1285. c1 *= b0;
  1286. *(a + 0) = c0;
  1287. *(a + 1) = c1;
  1288. *(a + 2) = c0_nxt;
  1289. *(a + 3) = c1_nxt;
  1290. *(c + 0) = c0;
  1291. *(c + 1) = c1;
  1292. *(c + 0 + ldc) = c0_nxt;
  1293. *(c + 1 + ldc) = c1_nxt;
  1294. }
  1295. static void ssolve_2x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
  1296. {
  1297. FLOAT b0, c0, c1;
  1298. c0 = *(c + 0);
  1299. c1 = *(c + 1);
  1300. if (bk > 0)
  1301. {
  1302. BLASLONG k;
  1303. FLOAT *aa = a, *bb = b;
  1304. FLOAT res0, res1;
  1305. res0 = aa[0] * bb[0];
  1306. res1 = aa[1] * bb[0];
  1307. for (k = (bk - 1); k--;)
  1308. {
  1309. aa += 2;
  1310. bb += 1;
  1311. res0 += aa[0] * bb[0];
  1312. res1 += aa[1] * bb[0];
  1313. }
  1314. c0 -= res0;
  1315. c1 -= res1;
  1316. }
  1317. a -= 2;
  1318. b -= 1;
  1319. b0 = *b;
  1320. c0 *= b0;
  1321. c1 *= b0;
  1322. *(a + 0) = c0;
  1323. *(a + 1) = c1;
  1324. *(c + 0) = c0;
  1325. *(c + 1) = c1;
  1326. }
  1327. static void ssolve_1x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  1328. {
  1329. FLOAT b0, b8, b9, b16, b17, b18, b24, b25, b26, b27, b32, b33, b34, b35;
  1330. FLOAT b36, b40, b41, b42, b43, b44, b45, b48, b49, b50, b51, b52, b53, b54;
  1331. FLOAT b56, b57, b58, b59, b60, b61, b62, b63;
  1332. FLOAT c0, c1, c2, c3, c4, c5, c6, c7;
  1333. c0 = *(c + 0);
  1334. c1 = *(c + 1 * ldc);
  1335. c2 = *(c + 2 * ldc);
  1336. c3 = *(c + 3 * ldc);
  1337. c4 = *(c + 4 * ldc);
  1338. c5 = *(c + 5 * ldc);
  1339. c6 = *(c + 6 * ldc);
  1340. c7 = *(c + 7 * ldc);
  1341. if (bk > 0)
  1342. {
  1343. BLASLONG k;
  1344. FLOAT *aa = a, *bb = b;
  1345. FLOAT t0, t1, t2, t3, t4, t5, t6, t7;
  1346. t0 = aa[0] * bb[0];
  1347. t1 = aa[0] * bb[1];
  1348. t2 = aa[0] * bb[2];
  1349. t3 = aa[0] * bb[3];
  1350. t4 = aa[0] * bb[4];
  1351. t5 = aa[0] * bb[5];
  1352. t6 = aa[0] * bb[6];
  1353. t7 = aa[0] * bb[7];
  1354. for (k = (bk - 1); k--;)
  1355. {
  1356. aa += 1;
  1357. bb += 8;
  1358. t0 += aa[0] * bb[0];
  1359. t1 += aa[0] * bb[1];
  1360. t2 += aa[0] * bb[2];
  1361. t3 += aa[0] * bb[3];
  1362. t4 += aa[0] * bb[4];
  1363. t5 += aa[0] * bb[5];
  1364. t6 += aa[0] * bb[6];
  1365. t7 += aa[0] * bb[7];
  1366. }
  1367. c0 -= t0;
  1368. c1 -= t1;
  1369. c2 -= t2;
  1370. c3 -= t3;
  1371. c4 -= t4;
  1372. c5 -= t5;
  1373. c6 -= t6;
  1374. c7 -= t7;
  1375. }
  1376. a -= 8;
  1377. b -= 64;
  1378. b0 = *(b + 0);
  1379. b8 = *(b + 8);
  1380. b9 = *(b + 9);
  1381. b16 = *(b + 16);
  1382. b17 = *(b + 17);
  1383. b18 = *(b + 18);
  1384. b24 = *(b + 24);
  1385. b25 = *(b + 25);
  1386. b26 = *(b + 26);
  1387. b27 = *(b + 27);
  1388. b32 = *(b + 32);
  1389. b33 = *(b + 33);
  1390. b34 = *(b + 34);
  1391. b35 = *(b + 35);
  1392. b36 = *(b + 36);
  1393. b40 = *(b + 40);
  1394. b41 = *(b + 41);
  1395. b42 = *(b + 42);
  1396. b43 = *(b + 43);
  1397. b44 = *(b + 44);
  1398. b45 = *(b + 45);
  1399. b48 = *(b + 48);
  1400. b49 = *(b + 49);
  1401. b50 = *(b + 50);
  1402. b51 = *(b + 51);
  1403. b52 = *(b + 52);
  1404. b53 = *(b + 53);
  1405. b54 = *(b + 54);
  1406. b56 = *(b + 56);
  1407. b57 = *(b + 57);
  1408. b58 = *(b + 58);
  1409. b59 = *(b + 59);
  1410. b60 = *(b + 60);
  1411. b61 = *(b + 61);
  1412. b62 = *(b + 62);
  1413. b63 = *(b + 63);
  1414. c7 *= b63;
  1415. c6 -= c7 * b62;
  1416. c6 *= b54;
  1417. c5 -= c7 * b61;
  1418. c5 -= c6 * b53;
  1419. c5 *= b45;
  1420. c4 -= c7 * b60;
  1421. c4 -= c6 * b52;
  1422. c4 -= c5 * b44;
  1423. c4 *= b36;
  1424. c3 -= c7 * b59;
  1425. c3 -= c6 * b51;
  1426. c3 -= c5 * b43;
  1427. c3 -= c4 * b35;
  1428. c3 *= b27;
  1429. c2 -= c7 * b58;
  1430. c2 -= c6 * b50;
  1431. c2 -= c5 * b42;
  1432. c2 -= c4 * b34;
  1433. c2 -= c3 * b26;
  1434. c2 *= b18;
  1435. c1 -= c7 * b57;
  1436. c1 -= c6 * b49;
  1437. c1 -= c5 * b41;
  1438. c1 -= c4 * b33;
  1439. c1 -= c3 * b25;
  1440. c1 -= c2 * b17;
  1441. c1 *= b9;
  1442. c0 -= c7 * b56;
  1443. c0 -= c6 * b48;
  1444. c0 -= c5 * b40;
  1445. c0 -= c4 * b32;
  1446. c0 -= c3 * b24;
  1447. c0 -= c2 * b16;
  1448. c0 -= c1 * b8;
  1449. c0 *= b0;
  1450. *(a + 0) = c0;
  1451. *(a + 1) = c1;
  1452. *(a + 2) = c2;
  1453. *(a + 3) = c3;
  1454. *(a + 4) = c4;
  1455. *(a + 5) = c5;
  1456. *(a + 6) = c6;
  1457. *(a + 7) = c7;
  1458. *(c + 0) = c0;
  1459. *(c + 1 * ldc) = c1;
  1460. *(c + 2 * ldc) = c2;
  1461. *(c + 3 * ldc) = c3;
  1462. *(c + 4 * ldc) = c4;
  1463. *(c + 5 * ldc) = c5;
  1464. *(c + 6 * ldc) = c6;
  1465. *(c + 7 * ldc) = c7;
  1466. }
  1467. static void ssolve_1x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  1468. {
  1469. FLOAT b0, b4, b5, b8, b9, b10, b12, b13, b14, b15;
  1470. FLOAT c0, c1, c2, c3;
  1471. c0 = *(c + 0);
  1472. c1 = *(c + 1 * ldc);
  1473. c2 = *(c + 2 * ldc);
  1474. c3 = *(c + 3 * ldc);
  1475. if (bk > 0)
  1476. {
  1477. BLASLONG k;
  1478. FLOAT *aa = a, *bb = b;
  1479. FLOAT res0, res1, res2, res3;
  1480. res0 = aa[0] * bb[0];
  1481. res1 = aa[0] * bb[1];
  1482. res2 = aa[0] * bb[2];
  1483. res3 = aa[0] * bb[3];
  1484. for (k = (bk - 1); k--;)
  1485. {
  1486. aa += 1;
  1487. bb += 4;
  1488. res0 += aa[0] * bb[0];
  1489. res1 += aa[0] * bb[1];
  1490. res2 += aa[0] * bb[2];
  1491. res3 += aa[0] * bb[3];
  1492. }
  1493. c0 -= res0;
  1494. c1 -= res1;
  1495. c2 -= res2;
  1496. c3 -= res3;
  1497. }
  1498. a -= 4;
  1499. b -= 16;
  1500. b0 = *b;
  1501. b4 = *(b + 4);
  1502. b5 = *(b + 5);
  1503. b8 = *(b + 8);
  1504. b9 = *(b + 9);
  1505. b10 = *(b + 10);
  1506. b12 = *(b + 12);
  1507. b13 = *(b + 13);
  1508. b14 = *(b + 14);
  1509. b15 = *(b + 15);
  1510. c3 *= b15;
  1511. c2 = (c2 - c3 * b14) * b10;
  1512. c1 = ((c1 - c3 * b13) - c2 * b9) * b5;
  1513. c0 = (((c0 - c3 * b12) - c2 * b8) - c1 * b4) * b0;
  1514. *(a + 0) = c0;
  1515. *(a + 1) = c1;
  1516. *(a + 2) = c2;
  1517. *(a + 3) = c3;
  1518. *(c) = c0;
  1519. *(c + 1 * ldc) = c1;
  1520. *(c + 2 * ldc) = c2;
  1521. *(c + 3 * ldc) = c3;
  1522. }
  1523. static void ssolve_1x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  1524. {
  1525. FLOAT b0, b2, b3, c0, c1;
  1526. c0 = *(c + 0);
  1527. c1 = *(c + ldc);
  1528. if (bk > 0)
  1529. {
  1530. BLASLONG k;
  1531. FLOAT *aa = a, *bb = b;
  1532. FLOAT res0, res1;
  1533. res0 = aa[0] * bb[0];
  1534. res1 = aa[0] * bb[1];
  1535. for (k = (bk - 1); k--;)
  1536. {
  1537. aa += 1;
  1538. bb += 2;
  1539. res0 += aa[0] * bb[0];
  1540. res1 += aa[0] * bb[1];
  1541. }
  1542. c0 -= res0;
  1543. c1 -= res1;
  1544. }
  1545. a -= 2;
  1546. b -= 4;
  1547. b3 = *(b + 3);
  1548. b2 = *(b + 2);
  1549. b0 = *b;
  1550. c1 *= b3;
  1551. c0 -= c1 * b2;
  1552. c0 *= b0;
  1553. *(a + 0) = c0;
  1554. *(a + 1) = c1;
  1555. *(c + 0) = c0;
  1556. *(c + ldc) = c1;
  1557. }
  1558. static void ssolve_1x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
  1559. {
  1560. if (bk > 0)
  1561. {
  1562. BLASLONG k;
  1563. FLOAT *aa = a, *bb = b;
  1564. FLOAT res;
  1565. res = *aa * *bb;
  1566. for (k = (bk - 1); k--;)
  1567. {
  1568. aa++;
  1569. bb++;
  1570. res += *aa * *bb;
  1571. }
  1572. *c -= res;
  1573. }
  1574. *c *= *(a - 1);
  1575. *(b - 1) = *c;
  1576. }
  1577. int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
  1578. FLOAT *c, BLASLONG ldc, BLASLONG offset)
  1579. {
  1580. FLOAT *aa, *cc;
  1581. BLASLONG i, j, kk;
  1582. kk = n - offset;
  1583. c += n * ldc;
  1584. b += n * k;
  1585. if (n & 7)
  1586. {
  1587. if (n & 1)
  1588. {
  1589. aa = a;
  1590. b -= k;
  1591. c -= ldc;
  1592. cc = c;
  1593. for (i = (m >> 3); i--;)
  1594. {
  1595. ssolve_8x1_rt_msa(aa + 8 * kk, b + kk, cc, (k - kk));
  1596. aa += 8 * k;
  1597. cc += 8;
  1598. }
  1599. if (m & 7)
  1600. {
  1601. if (m & 4)
  1602. {
  1603. ssolve_4x1_rt_msa(aa + 4 * kk, b + kk, cc, (k - kk));
  1604. aa += 4 * k;
  1605. cc += 4;
  1606. }
  1607. if (m & 2)
  1608. {
  1609. ssolve_2x1_rt_msa(aa + 2 * kk, b + kk, cc, (k - kk));
  1610. aa += 2 * k;
  1611. cc += 2;
  1612. }
  1613. if (m & 1)
  1614. {
  1615. ssolve_1x1_rt_msa(b + kk, aa + kk, cc, (k - kk));
  1616. aa += k;
  1617. cc += 1;
  1618. }
  1619. }
  1620. kk -= 1;
  1621. }
  1622. if (n & 2)
  1623. {
  1624. aa = a;
  1625. b -= 2 * k;
  1626. c -= 2 * ldc;
  1627. cc = c;
  1628. for (i = (m >> 3); i--;)
  1629. {
  1630. ssolve_8x2_rt_msa(aa + 8 * kk, b + 2 * kk, cc, ldc, (k - kk));
  1631. aa += 8 * k;
  1632. cc += 8;
  1633. }
  1634. if (m & 7)
  1635. {
  1636. if (m & 4)
  1637. {
  1638. ssolve_4x2_rt_msa(aa + 4 * kk, b + 2 * kk, cc, ldc, (k - kk));
  1639. aa += 4 * k;
  1640. cc += 4;
  1641. }
  1642. if (m & 2)
  1643. {
  1644. ssolve_2x2_rt_msa(aa + 2 * kk, b + 2 * kk, cc, ldc, (k - kk));
  1645. aa += 2 * k;
  1646. cc += 2;
  1647. }
  1648. if (m & 1)
  1649. {
  1650. ssolve_1x2_rt_msa(aa + kk, b + 2 * kk, cc, ldc, (k - kk));
  1651. aa += k;
  1652. cc += 1;
  1653. }
  1654. }
  1655. kk -= 2;
  1656. }
  1657. if (n & 4)
  1658. {
  1659. aa = a;
  1660. b -= 4 * k;
  1661. c -= 4 * ldc;
  1662. cc = c;
  1663. for (i = (m >> 3); i--;)
  1664. {
  1665. ssolve_8x4_rt_msa(aa + 8 * kk, b + 4 * kk, cc, ldc, (k - kk));
  1666. aa += 8 * k;
  1667. cc += 8;
  1668. }
  1669. if (m & 7)
  1670. {
  1671. if (m & 4)
  1672. {
  1673. ssolve_4x4_rt_msa(aa + 4 * kk, b + 4 * kk, cc, ldc, (k - kk));
  1674. aa += 4 * k;
  1675. cc += 4;
  1676. }
  1677. if (m & 2)
  1678. {
  1679. ssolve_2x4_rt_msa(aa + 2 * kk, b + 4 * kk, cc, ldc, (k - kk));
  1680. aa += 2 * k;
  1681. cc += 2;
  1682. }
  1683. if (m & 1)
  1684. {
  1685. ssolve_1x4_rt_msa(aa + kk, b + 4 * kk, cc, ldc, (k - kk));
  1686. aa += k;
  1687. cc += 1;
  1688. }
  1689. }
  1690. kk -= 4;
  1691. }
  1692. }
  1693. for (j = (n >> 3); j--;)
  1694. {
  1695. aa = a;
  1696. b -= 8 * k;
  1697. c -= 8 * ldc;
  1698. cc = c;
  1699. for (i = (m >> 3); i--;)
  1700. {
  1701. ssolve_8x8_rt_msa(aa + 8 * kk, b + 8 * kk, cc, ldc, (k - kk));
  1702. aa += 8 * k;
  1703. cc += 8;
  1704. }
  1705. if (m & 7)
  1706. {
  1707. if (m & 4)
  1708. {
  1709. ssolve_4x8_rt_msa(aa + 4 * kk, b + 8 * kk, cc, ldc, (k - kk));
  1710. aa += 4 * k;
  1711. cc += 4;
  1712. }
  1713. if (m & 2)
  1714. {
  1715. ssolve_2x8_rt_msa(aa + 2 * kk, b + 8 * kk, cc, ldc, (k - kk));
  1716. aa += 2 * k;
  1717. cc += 2;
  1718. }
  1719. if (m & 1)
  1720. {
  1721. ssolve_1x8_rt_msa(aa + kk, b + 8 * kk, cc, ldc, (k - kk));
  1722. aa += k;
  1723. cc += 1;
  1724. }
  1725. }
  1726. kk -= 8;
  1727. }
  1728. return 0;
  1729. }