You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

strsm_kernel_RN_8x8_msa.c 41 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704
  1. /*******************************************************************************
  2. Copyright (c) 2016, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #include "common.h"
  28. #include "macros_msa.h"
  29. static void ssolve_8x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  30. {
  31. BLASLONG k;
  32. v4f32 src_a0, src_a1;
  33. v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
  34. v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
  35. v4f32 src_b0, src_b1, src_b2, src_b3, src_b4, src_b5, src_b6, src_b7;
  36. v4f32 src_b9, src_b10, src_b11, src_b12, src_b13, src_b14, src_b15, src_b18;
  37. v4f32 src_b19, src_b20, src_b21, src_b22, src_b23, src_b27, src_b28;
  38. v4f32 src_b29, src_b30, src_b31, src_b36, src_b37, src_b38, src_b39;
  39. v4f32 src_b45, src_b46, src_b47, src_b54, src_b55, src_b63, src_b;
  40. FLOAT *c_nxt1line = c + ldc;
  41. FLOAT *c_nxt2line = c + 2 * ldc;
  42. FLOAT *c_nxt3line = c + 3 * ldc;
  43. FLOAT *c_nxt4line = c + 4 * ldc;
  44. FLOAT *c_nxt5line = c + 5 * ldc;
  45. FLOAT *c_nxt6line = c + 6 * ldc;
  46. FLOAT *c_nxt7line = c + 7 * ldc;
  47. LD_SP2(c, 4, src_c0, src_c1);
  48. LD_SP2(c_nxt1line, 4, src_c2, src_c3);
  49. LD_SP2(c_nxt2line, 4, src_c4, src_c5);
  50. LD_SP2(c_nxt3line, 4, src_c6, src_c7);
  51. LD_SP2(c_nxt4line, 4, src_c8, src_c9);
  52. LD_SP2(c_nxt5line, 4, src_c10, src_c11);
  53. LD_SP2(c_nxt6line, 4, src_c12, src_c13);
  54. LD_SP2(c_nxt7line, 4, src_c14, src_c15);
  55. for (k = 0; k < bk; k++)
  56. {
  57. LD_SP2(a, 4, src_a0, src_a1);
  58. src_b = LD_SP(b + 0);
  59. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  60. src_c0 -= src_a0 * src_b0;
  61. src_c1 -= src_a1 * src_b0;
  62. src_c2 -= src_a0 * src_b1;
  63. src_c3 -= src_a1 * src_b1;
  64. src_c4 -= src_a0 * src_b2;
  65. src_c5 -= src_a1 * src_b2;
  66. src_c6 -= src_a0 * src_b3;
  67. src_c7 -= src_a1 * src_b3;
  68. src_b = LD_SP(b + 4);
  69. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  70. src_c8 -= src_a0 * src_b0;
  71. src_c9 -= src_a1 * src_b0;
  72. src_c10 -= src_a0 * src_b1;
  73. src_c11 -= src_a1 * src_b1;
  74. src_c12 -= src_a0 * src_b2;
  75. src_c13 -= src_a1 * src_b2;
  76. src_c14 -= src_a0 * src_b3;
  77. src_c15 -= src_a1 * src_b3;
  78. a += 8;
  79. b += 8;
  80. }
  81. src_b = LD_SP(b + 0);
  82. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  83. src_b = LD_SP(b + 4);
  84. SPLATI_W4_SP(src_b, src_b4, src_b5, src_b6, src_b7);
  85. src_b = LD_SP(b + 9);
  86. SPLATI_W4_SP(src_b, src_b9, src_b10, src_b11, src_b12);
  87. src_b13 = LD_SP(b + 13);
  88. src_b15 = (v4f32) __msa_splati_w((v4i32) src_b13, 2);
  89. src_b14 = (v4f32) __msa_splati_w((v4i32) src_b13, 1);
  90. src_b13 = (v4f32) __msa_splati_w((v4i32) src_b13, 0);
  91. src_c0 *= src_b0;
  92. src_c1 *= src_b0;
  93. src_c2 -= src_c0 * src_b1;
  94. src_c3 -= src_c1 * src_b1;
  95. src_c4 -= src_c0 * src_b2;
  96. src_c5 -= src_c1 * src_b2;
  97. src_c6 -= src_c0 * src_b3;
  98. src_c7 -= src_c1 * src_b3;
  99. src_c8 -= src_c0 * src_b4;
  100. src_c9 -= src_c1 * src_b4;
  101. src_c10 -= src_c0 * src_b5;
  102. src_c11 -= src_c1 * src_b5;
  103. src_c12 -= src_c0 * src_b6;
  104. src_c13 -= src_c1 * src_b6;
  105. src_c14 -= src_c0 * src_b7;
  106. src_c15 -= src_c1 * src_b7;
  107. ST_SP2(src_c0, src_c1, a, 4);
  108. ST_SP2(src_c0, src_c1, c, 4);
  109. src_c2 *= src_b9;
  110. src_c3 *= src_b9;
  111. src_c4 -= src_c2 * src_b10;
  112. src_c5 -= src_c3 * src_b10;
  113. src_c6 -= src_c2 * src_b11;
  114. src_c7 -= src_c3 * src_b11;
  115. src_c8 -= src_c2 * src_b12;
  116. src_c9 -= src_c3 * src_b12;
  117. src_c10 -= src_c2 * src_b13;
  118. src_c11 -= src_c3 * src_b13;
  119. src_c12 -= src_c2 * src_b14;
  120. src_c13 -= src_c3 * src_b14;
  121. src_c14 -= src_c2 * src_b15;
  122. src_c15 -= src_c3 * src_b15;
  123. ST_SP2(src_c2, src_c3, a + 8, 4);
  124. ST_SP2(src_c2, src_c3, c_nxt1line, 4);
  125. src_b = LD_SP(b + 18);
  126. SPLATI_W4_SP(src_b, src_b18, src_b19, src_b20, src_b21);
  127. src_b22 = LD_SP(b + 22);
  128. src_b23 = (v4f32) __msa_splati_w((v4i32) src_b22, 1);
  129. src_b22 = (v4f32) __msa_splati_w((v4i32) src_b22, 0);
  130. src_b = LD_SP(b + 27);
  131. SPLATI_W4_SP(src_b, src_b27, src_b28, src_b29, src_b30);
  132. COPY_FLOAT_TO_VECTOR(*(b + 31), src_b31);
  133. src_c4 *= src_b18;
  134. src_c5 *= src_b18;
  135. src_c6 -= src_c4 * src_b19;
  136. src_c7 -= src_c5 * src_b19;
  137. src_c8 -= src_c4 * src_b20;
  138. src_c9 -= src_c5 * src_b20;
  139. src_c10 -= src_c4 * src_b21;
  140. src_c11 -= src_c5 * src_b21;
  141. src_c12 -= src_c4 * src_b22;
  142. src_c13 -= src_c5 * src_b22;
  143. src_c14 -= src_c4 * src_b23;
  144. src_c15 -= src_c5 * src_b23;
  145. ST_SP2(src_c4, src_c5, a + 16, 4);
  146. ST_SP2(src_c4, src_c5, c_nxt2line, 4);
  147. src_c6 *= src_b27;
  148. src_c7 *= src_b27;
  149. src_c8 -= src_c6 * src_b28;
  150. src_c9 -= src_c7 * src_b28;
  151. src_c10 -= src_c6 * src_b29;
  152. src_c11 -= src_c7 * src_b29;
  153. src_c12 -= src_c6 * src_b30;
  154. src_c13 -= src_c7 * src_b30;
  155. src_c14 -= src_c6 * src_b31;
  156. src_c15 -= src_c7 * src_b31;
  157. ST_SP2(src_c6, src_c7, a + 24, 4);
  158. ST_SP2(src_c6, src_c7, c_nxt3line, 4);
  159. src_b = LD_SP(b + 36);
  160. SPLATI_W4_SP(src_b, src_b36, src_b37, src_b38, src_b39);
  161. src_b45 = LD_SP(b + 45);
  162. src_b47 = (v4f32) __msa_splati_w((v4i32) src_b45, 2);
  163. src_b46 = (v4f32) __msa_splati_w((v4i32) src_b45, 1);
  164. src_b45 = (v4f32) __msa_splati_w((v4i32) src_b45, 0);
  165. COPY_FLOAT_TO_VECTOR(*(b + 54), src_b54);
  166. COPY_FLOAT_TO_VECTOR(*(b + 55), src_b55);
  167. COPY_FLOAT_TO_VECTOR(*(b + 63), src_b63);
  168. src_c8 *= src_b36;
  169. src_c9 *= src_b36;
  170. src_c10 -= src_c8 * src_b37;
  171. src_c11 -= src_c9 * src_b37;
  172. src_c12 -= src_c8 * src_b38;
  173. src_c13 -= src_c9 * src_b38;
  174. src_c14 -= src_c8 * src_b39;
  175. src_c15 -= src_c9 * src_b39;
  176. ST_SP2(src_c8, src_c9, a + 32, 4);
  177. ST_SP2(src_c8, src_c9, c_nxt4line, 4);
  178. src_c10 *= src_b45;
  179. src_c11 *= src_b45;
  180. src_c12 -= src_c10 * src_b46;
  181. src_c13 -= src_c11 * src_b46;
  182. src_c14 -= src_c10 * src_b47;
  183. src_c15 -= src_c11 * src_b47;
  184. ST_SP2(src_c10, src_c11, a + 40, 4);
  185. ST_SP2(src_c10, src_c11, c_nxt5line, 4);
  186. src_c12 *= src_b54;
  187. src_c13 *= src_b54;
  188. src_c14 -= src_c12 * src_b55;
  189. src_c15 -= src_c13 * src_b55;
  190. ST_SP2(src_c12, src_c13, a + 48, 4);
  191. ST_SP2(src_c12, src_c13, c_nxt6line, 4);
  192. src_c14 *= src_b63;
  193. src_c15 *= src_b63;
  194. ST_SP2(src_c14, src_c15, a + 56, 4);
  195. ST_SP2(src_c14, src_c15, c_nxt7line, 4);
  196. }
  197. static void ssolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  198. {
  199. BLASLONG k;
  200. v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
  201. v4f32 src_b0, src_b1, src_b2, src_b3, src_b5, src_b6, src_b7;
  202. v4f32 src_b10, src_b11, src_b15, src_b, src_a0, src_a1;
  203. FLOAT *c_nxt1line = c + ldc;
  204. FLOAT *c_nxt2line = c + 2 * ldc;
  205. FLOAT *c_nxt3line = c + 3 * ldc;
  206. LD_SP2(c, 4, src_c0, src_c1);
  207. LD_SP2(c_nxt1line, 4, src_c2, src_c3);
  208. LD_SP2(c_nxt2line, 4, src_c4, src_c5);
  209. LD_SP2(c_nxt3line, 4, src_c6, src_c7);
  210. for (k = 0; k < (bk >> 1); k++)
  211. {
  212. LD_SP2(a, 4, src_a0, src_a1);
  213. src_b = LD_SP(b + 0);
  214. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  215. src_c0 -= src_a0 * src_b0;
  216. src_c1 -= src_a1 * src_b0;
  217. src_c2 -= src_a0 * src_b1;
  218. src_c3 -= src_a1 * src_b1;
  219. src_c4 -= src_a0 * src_b2;
  220. src_c5 -= src_a1 * src_b2;
  221. src_c6 -= src_a0 * src_b3;
  222. src_c7 -= src_a1 * src_b3;
  223. a += 8;
  224. b += 4;
  225. LD_SP2(a, 4, src_a0, src_a1);
  226. src_b = LD_SP(b + 0);
  227. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  228. src_c0 -= src_a0 * src_b0;
  229. src_c1 -= src_a1 * src_b0;
  230. src_c2 -= src_a0 * src_b1;
  231. src_c3 -= src_a1 * src_b1;
  232. src_c4 -= src_a0 * src_b2;
  233. src_c5 -= src_a1 * src_b2;
  234. src_c6 -= src_a0 * src_b3;
  235. src_c7 -= src_a1 * src_b3;
  236. a += 8;
  237. b += 4;
  238. }
  239. if (bk & 1)
  240. {
  241. LD_SP2(a, 4, src_a0, src_a1);
  242. src_b = LD_SP(b + 0);
  243. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  244. src_c0 -= src_a0 * src_b0;
  245. src_c1 -= src_a1 * src_b0;
  246. src_c2 -= src_a0 * src_b1;
  247. src_c3 -= src_a1 * src_b1;
  248. src_c4 -= src_a0 * src_b2;
  249. src_c5 -= src_a1 * src_b2;
  250. src_c6 -= src_a0 * src_b3;
  251. src_c7 -= src_a1 * src_b3;
  252. a += 8;
  253. b += 4;
  254. }
  255. src_b = LD_SP(b + 0);
  256. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  257. src_b5 = LD_SP(b + 5);
  258. src_b7 = (v4f32) __msa_splati_w((v4i32) src_b5, 2);
  259. src_b6 = (v4f32) __msa_splati_w((v4i32) src_b5, 1);
  260. src_b5 = (v4f32) __msa_splati_w((v4i32) src_b5, 0);
  261. COPY_FLOAT_TO_VECTOR(*(b + 10), src_b10);
  262. COPY_FLOAT_TO_VECTOR(*(b + 11), src_b11);
  263. COPY_FLOAT_TO_VECTOR(*(b + 15), src_b15);
  264. src_c0 *= src_b0;
  265. src_c1 *= src_b0;
  266. src_c2 -= src_c0 * src_b1;
  267. src_c3 -= src_c1 * src_b1;
  268. src_c4 -= src_c0 * src_b2;
  269. src_c5 -= src_c1 * src_b2;
  270. src_c6 -= src_c0 * src_b3;
  271. src_c7 -= src_c1 * src_b3;
  272. src_c2 *= src_b5;
  273. src_c3 *= src_b5;
  274. src_c4 -= src_c2 * src_b6;
  275. src_c5 -= src_c3 * src_b6;
  276. src_c6 -= src_c2 * src_b7;
  277. src_c7 -= src_c3 * src_b7;
  278. src_c4 *= src_b10;
  279. src_c5 *= src_b10;
  280. src_c6 -= src_c4 * src_b11;
  281. src_c7 -= src_c5 * src_b11;
  282. src_c6 *= src_b15;
  283. src_c7 *= src_b15;
  284. ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4);
  285. ST_SP4(src_c4, src_c5, src_c6, src_c7, a + 16, 4);
  286. ST_SP2(src_c0, src_c1, c, 4);
  287. ST_SP2(src_c2, src_c3, c_nxt1line, 4);
  288. ST_SP2(src_c4, src_c5, c_nxt2line, 4);
  289. ST_SP2(src_c6, src_c7, c_nxt3line, 4);
  290. }
  291. static void ssolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  292. {
  293. BLASLONG k;
  294. v4f32 src_a0, src_a1;
  295. v4f32 src_c0, src_c1, src_c2, src_c3, src_b0, src_b1, src_b3;
  296. FLOAT *c_nxt1line = c + ldc;
  297. LD_SP2(c, 4, src_c0, src_c1);
  298. LD_SP2(c_nxt1line, 4, src_c2, src_c3);
  299. for (k = 0; k < (bk >> 1); k++)
  300. {
  301. LD_SP2(a, 4, src_a0, src_a1);
  302. COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
  303. COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1);
  304. src_c0 -= src_a0 * src_b0;
  305. src_c1 -= src_a1 * src_b0;
  306. src_c2 -= src_a0 * src_b1;
  307. src_c3 -= src_a1 * src_b1;
  308. a += 8;
  309. b += 2;
  310. LD_SP2(a, 4, src_a0, src_a1);
  311. COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
  312. COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1);
  313. src_c0 -= src_a0 * src_b0;
  314. src_c1 -= src_a1 * src_b0;
  315. src_c2 -= src_a0 * src_b1;
  316. src_c3 -= src_a1 * src_b1;
  317. a += 8;
  318. b += 2;
  319. }
  320. if (bk & 1)
  321. {
  322. LD_SP2(a, 4, src_a0, src_a1);
  323. COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
  324. COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1);
  325. src_c0 -= src_a0 * src_b0;
  326. src_c1 -= src_a1 * src_b0;
  327. src_c2 -= src_a0 * src_b1;
  328. src_c3 -= src_a1 * src_b1;
  329. a += 8;
  330. b += 2;
  331. }
  332. COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
  333. COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1);
  334. COPY_FLOAT_TO_VECTOR(*(b + 3), src_b3);
  335. src_c0 *= src_b0;
  336. src_c1 *= src_b0;
  337. src_c2 -= src_c0 * src_b1;
  338. src_c3 -= src_c1 * src_b1;
  339. src_c2 *= src_b3;
  340. src_c3 *= src_b3;
  341. ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4);
  342. ST_SP2(src_c0, src_c1, c, 4);
  343. ST_SP2(src_c2, src_c3, c_nxt1line, 4);
  344. }
  345. static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  346. {
  347. BLASLONG k;
  348. v4f32 src_a0, src_a1, src_c0, src_c1, src_b0;
  349. LD_SP2(c, 4, src_c0, src_c1);
  350. for (k = 0; k < (bk >> 2); k++)
  351. {
  352. LD_SP2(a, 4, src_a0, src_a1);
  353. COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
  354. src_c0 -= src_a0 * src_b0;
  355. src_c1 -= src_a1 * src_b0;
  356. a += 8;
  357. b += 1;
  358. LD_SP2(a, 4, src_a0, src_a1);
  359. COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
  360. src_c0 -= src_a0 * src_b0;
  361. src_c1 -= src_a1 * src_b0;
  362. a += 8;
  363. b += 1;
  364. LD_SP2(a, 4, src_a0, src_a1);
  365. COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
  366. src_c0 -= src_a0 * src_b0;
  367. src_c1 -= src_a1 * src_b0;
  368. a += 8;
  369. b += 1;
  370. LD_SP2(a, 4, src_a0, src_a1);
  371. COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
  372. src_c0 -= src_a0 * src_b0;
  373. src_c1 -= src_a1 * src_b0;
  374. a += 8;
  375. b += 1;
  376. }
  377. if (bk & 3)
  378. {
  379. if (bk & 2)
  380. {
  381. LD_SP2(a, 4, src_a0, src_a1);
  382. COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
  383. src_c0 -= src_a0 * src_b0;
  384. src_c1 -= src_a1 * src_b0;
  385. a += 8;
  386. b += 1;
  387. LD_SP2(a, 4, src_a0, src_a1);
  388. COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
  389. src_c0 -= src_a0 * src_b0;
  390. src_c1 -= src_a1 * src_b0;
  391. a += 8;
  392. b += 1;
  393. }
  394. if (bk & 1)
  395. {
  396. LD_SP2(a, 4, src_a0, src_a1);
  397. COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
  398. src_c0 -= src_a0 * src_b0;
  399. src_c1 -= src_a1 * src_b0;
  400. a += 8;
  401. b += 1;
  402. }
  403. }
  404. COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
  405. src_c0 *= src_b0;
  406. src_c1 *= src_b0;
  407. ST_SP2(src_c0, src_c1, a, 4);
  408. ST_SP2(src_c0, src_c1, c, 4);
  409. }
  410. static void ssolve_4x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  411. {
  412. BLASLONG k;
  413. v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
  414. v4f32 src_b0, src_b1, src_b2, src_b3, src_b4, src_b5, src_b6, src_b7;
  415. v4f32 src_b9, src_b10, src_b11, src_b12, src_b13, src_b14, src_b15, src_b18;
  416. v4f32 src_b19, src_b20, src_b21, src_b22, src_b23, src_b27, src_b28;
  417. v4f32 src_b29, src_b30, src_b31, src_b36, src_b37, src_b38, src_b39;
  418. v4f32 src_b45, src_b46, src_b47, src_b54, src_b55, src_b63, src_b, src_a0;
  419. FLOAT *c_nxt1line = c + ldc;
  420. FLOAT *c_nxt2line = c + 2 * ldc;
  421. FLOAT *c_nxt3line = c + 3 * ldc;
  422. FLOAT *c_nxt4line = c + 4 * ldc;
  423. FLOAT *c_nxt5line = c + 5 * ldc;
  424. FLOAT *c_nxt6line = c + 6 * ldc;
  425. FLOAT *c_nxt7line = c + 7 * ldc;
  426. src_c0 = LD_SP(c);
  427. src_c1 = LD_SP(c_nxt1line);
  428. src_c2 = LD_SP(c_nxt2line);
  429. src_c3 = LD_SP(c_nxt3line);
  430. src_c4 = LD_SP(c_nxt4line);
  431. src_c5 = LD_SP(c_nxt5line);
  432. src_c6 = LD_SP(c_nxt6line);
  433. src_c7 = LD_SP(c_nxt7line);
  434. for (k = 0; k < bk; k++)
  435. {
  436. src_a0 = LD_SP(a);
  437. src_b = LD_SP(b + 0);
  438. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  439. src_c0 -= src_a0 * src_b0;
  440. src_c1 -= src_a0 * src_b1;
  441. src_c2 -= src_a0 * src_b2;
  442. src_c3 -= src_a0 * src_b3;
  443. src_b = LD_SP(b + 4);
  444. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  445. src_c4 -= src_a0 * src_b0;
  446. src_c5 -= src_a0 * src_b1;
  447. src_c6 -= src_a0 * src_b2;
  448. src_c7 -= src_a0 * src_b3;
  449. a += 4;
  450. b += 8;
  451. }
  452. src_b = LD_SP(b + 0);
  453. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  454. src_b = LD_SP(b + 4);
  455. SPLATI_W4_SP(src_b, src_b4, src_b5, src_b6, src_b7);
  456. src_b = LD_SP(b + 9);
  457. SPLATI_W4_SP(src_b, src_b9, src_b10, src_b11, src_b12);
  458. src_b13 = LD_SP(b + 13);
  459. src_b15 = (v4f32) __msa_splati_w((v4i32) src_b13, 2);
  460. src_b14 = (v4f32) __msa_splati_w((v4i32) src_b13, 1);
  461. src_b13 = (v4f32) __msa_splati_w((v4i32) src_b13, 0);
  462. src_b = LD_SP(b + 18);
  463. SPLATI_W4_SP(src_b, src_b18, src_b19, src_b20, src_b21);
  464. src_b22 = LD_SP(b + 22);
  465. src_b23 = (v4f32) __msa_splati_w((v4i32) src_b22, 1);
  466. src_b22 = (v4f32) __msa_splati_w((v4i32) src_b22, 0);
  467. src_b = LD_SP(b + 27);
  468. SPLATI_W4_SP(src_b, src_b27, src_b28, src_b29, src_b30);
  469. COPY_FLOAT_TO_VECTOR(*(b + 31), src_b31);
  470. src_b = LD_SP(b + 36);
  471. SPLATI_W4_SP(src_b, src_b36, src_b37, src_b38, src_b39);
  472. src_b45 = LD_SP(b + 45);
  473. src_b47 = (v4f32) __msa_splati_w((v4i32) src_b45, 2);
  474. src_b46 = (v4f32) __msa_splati_w((v4i32) src_b45, 1);
  475. src_b45 = (v4f32) __msa_splati_w((v4i32) src_b45, 0);
  476. COPY_FLOAT_TO_VECTOR(*(b + 54), src_b54);
  477. COPY_FLOAT_TO_VECTOR(*(b + 55), src_b55);
  478. COPY_FLOAT_TO_VECTOR(*(b + 63), src_b63);
  479. src_c0 *= src_b0;
  480. src_c1 -= src_c0 * src_b1;
  481. src_c2 -= src_c0 * src_b2;
  482. src_c3 -= src_c0 * src_b3;
  483. src_c4 -= src_c0 * src_b4;
  484. src_c5 -= src_c0 * src_b5;
  485. src_c6 -= src_c0 * src_b6;
  486. src_c7 -= src_c0 * src_b7;
  487. src_c1 *= src_b9;
  488. src_c2 -= src_c1 * src_b10;
  489. src_c3 -= src_c1 * src_b11;
  490. src_c4 -= src_c1 * src_b12;
  491. src_c5 -= src_c1 * src_b13;
  492. src_c6 -= src_c1 * src_b14;
  493. src_c7 -= src_c1 * src_b15;
  494. src_c2 *= src_b18;
  495. src_c3 -= src_c2 * src_b19;
  496. src_c4 -= src_c2 * src_b20;
  497. src_c5 -= src_c2 * src_b21;
  498. src_c6 -= src_c2 * src_b22;
  499. src_c7 -= src_c2 * src_b23;
  500. src_c3 *= src_b27;
  501. src_c4 -= src_c3 * src_b28;
  502. src_c5 -= src_c3 * src_b29;
  503. src_c6 -= src_c3 * src_b30;
  504. src_c7 -= src_c3 * src_b31;
  505. src_c4 *= src_b36;
  506. src_c5 -= src_c4 * src_b37;
  507. src_c6 -= src_c4 * src_b38;
  508. src_c7 -= src_c4 * src_b39;
  509. src_c5 *= src_b45;
  510. src_c6 -= src_c5 * src_b46;
  511. src_c7 -= src_c5 * src_b47;
  512. src_c6 *= src_b54;
  513. src_c7 -= src_c6 * src_b55;
  514. src_c7 *= src_b63;
  515. ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4);
  516. ST_SP4(src_c4, src_c5, src_c6, src_c7, a + 16, 4);
  517. ST_SP(src_c0, c);
  518. ST_SP(src_c1, c_nxt1line);
  519. ST_SP(src_c2, c_nxt2line);
  520. ST_SP(src_c3, c_nxt3line);
  521. ST_SP(src_c4, c_nxt4line);
  522. ST_SP(src_c5, c_nxt5line);
  523. ST_SP(src_c6, c_nxt6line);
  524. ST_SP(src_c7, c_nxt7line);
  525. }
  526. static void ssolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  527. {
  528. BLASLONG k;
  529. v4f32 src_c0, src_c1, src_c2, src_c3, src_b0, src_b1, src_b2, src_b3;
  530. v4f32 src_b5, src_b6, src_b7, src_b10, src_b11, src_b15, src_b, src_a0;
  531. FLOAT *c_nxt1line = c + ldc;
  532. FLOAT *c_nxt2line = c + 2 * ldc;
  533. FLOAT *c_nxt3line = c + 3 * ldc;
  534. src_c0 = LD_SP(c);
  535. src_c1 = LD_SP(c_nxt1line);
  536. src_c2 = LD_SP(c_nxt2line);
  537. src_c3 = LD_SP(c_nxt3line);
  538. for (k = 0; k < (bk >> 1); k++)
  539. {
  540. src_a0 = LD_SP(a);
  541. src_b = LD_SP(b + 0);
  542. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  543. src_c0 -= src_a0 * src_b0;
  544. src_c1 -= src_a0 * src_b1;
  545. src_c2 -= src_a0 * src_b2;
  546. src_c3 -= src_a0 * src_b3;
  547. a += 4;
  548. b += 4;
  549. src_a0 = LD_SP(a);
  550. src_b = LD_SP(b + 0);
  551. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  552. src_c0 -= src_a0 * src_b0;
  553. src_c1 -= src_a0 * src_b1;
  554. src_c2 -= src_a0 * src_b2;
  555. src_c3 -= src_a0 * src_b3;
  556. a += 4;
  557. b += 4;
  558. }
  559. if (bk & 1)
  560. {
  561. src_a0 = LD_SP(a);
  562. src_b = LD_SP(b + 0);
  563. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  564. src_c0 -= src_a0 * src_b0;
  565. src_c1 -= src_a0 * src_b1;
  566. src_c2 -= src_a0 * src_b2;
  567. src_c3 -= src_a0 * src_b3;
  568. a += 4;
  569. b += 4;
  570. }
  571. src_b = LD_SP(b + 0);
  572. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  573. src_b5 = LD_SP(b + 5);
  574. src_b7 = (v4f32) __msa_splati_w((v4i32) src_b5, 2);
  575. src_b6 = (v4f32) __msa_splati_w((v4i32) src_b5, 1);
  576. src_b5 = (v4f32) __msa_splati_w((v4i32) src_b5, 0);
  577. COPY_FLOAT_TO_VECTOR(*(b + 10), src_b10);
  578. COPY_FLOAT_TO_VECTOR(*(b + 11), src_b11);
  579. COPY_FLOAT_TO_VECTOR(*(b + 15), src_b15);
  580. src_c0 *= src_b0;
  581. src_c1 -= src_c0 * src_b1;
  582. src_c2 -= src_c0 * src_b2;
  583. src_c3 -= src_c0 * src_b3;
  584. src_c1 *= src_b5;
  585. src_c2 -= src_c1 * src_b6;
  586. src_c3 -= src_c1 * src_b7;
  587. src_c2 *= src_b10;
  588. src_c3 -= src_c2 * src_b11;
  589. src_c3 *= src_b15;
  590. ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4);
  591. ST_SP(src_c0, c);
  592. ST_SP(src_c1, c_nxt1line);
  593. ST_SP(src_c2, c_nxt2line);
  594. ST_SP(src_c3, c_nxt3line);
  595. }
  596. static void ssolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  597. {
  598. BLASLONG k;
  599. v4f32 src_a, src_c0, src_c1, src_b0, src_b1, src_b3;
  600. FLOAT *c_nxt1line = c + ldc;
  601. src_c0 = LD_SP(c);
  602. src_c1 = LD_SP(c_nxt1line);
  603. for (k = 0; k < (bk >> 2); k++)
  604. {
  605. src_a = LD_SP(a);
  606. src_b0 = LD_SP(b);
  607. src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
  608. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  609. src_c0 -= src_a * src_b0;
  610. src_c1 -= src_a * src_b1;
  611. a += 4;
  612. b += 2;
  613. src_a = LD_SP(a);
  614. src_b0 = LD_SP(b);
  615. src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
  616. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  617. src_c0 -= src_a * src_b0;
  618. src_c1 -= src_a * src_b1;
  619. a += 4;
  620. b += 2;
  621. src_a = LD_SP(a);
  622. src_b0 = LD_SP(b);
  623. src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
  624. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  625. src_c0 -= src_a * src_b0;
  626. src_c1 -= src_a * src_b1;
  627. a += 4;
  628. b += 2;
  629. src_a = LD_SP(a);
  630. src_b0 = LD_SP(b);
  631. src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
  632. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  633. src_c0 -= src_a * src_b0;
  634. src_c1 -= src_a * src_b1;
  635. a += 4;
  636. b += 2;
  637. }
  638. if (bk & 3)
  639. {
  640. if (bk & 2)
  641. {
  642. src_a = LD_SP(a);
  643. src_b0 = LD_SP(b);
  644. src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
  645. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  646. src_c0 -= src_a * src_b0;
  647. src_c1 -= src_a * src_b1;
  648. a += 4;
  649. b += 2;
  650. src_a = LD_SP(a);
  651. src_b0 = LD_SP(b);
  652. src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
  653. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  654. src_c0 -= src_a * src_b0;
  655. src_c1 -= src_a * src_b1;
  656. a += 4;
  657. b += 2;
  658. }
  659. if (bk & 1)
  660. {
  661. src_a = LD_SP(a);
  662. src_b0 = LD_SP(b);
  663. src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
  664. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  665. src_c0 -= src_a * src_b0;
  666. src_c1 -= src_a * src_b1;
  667. a += 4;
  668. b += 2;
  669. }
  670. }
  671. COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
  672. COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1);
  673. COPY_FLOAT_TO_VECTOR(*(b + 3), src_b3);
  674. src_c0 *= src_b0;
  675. src_c1 -= src_c0 * src_b1;
  676. src_c1 *= src_b3;
  677. ST_SP2(src_c0, src_c1, a, 4);
  678. ST_SP(src_c0, c);
  679. ST_SP(src_c1, c_nxt1line);
  680. }
  681. static void ssolve_4x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  682. {
  683. BLASLONG k;
  684. FLOAT b0, c0, c1, c2, c3;
  685. c0 = *(c + 0);
  686. c1 = *(c + 1);
  687. c2 = *(c + 2);
  688. c3 = *(c + 3);
  689. for (k = 0; k < bk; k++)
  690. {
  691. c0 -= a[0] * b[0];
  692. c1 -= a[1] * b[0];
  693. c2 -= a[2] * b[0];
  694. c3 -= a[3] * b[0];
  695. a += 4;
  696. b += 1;
  697. }
  698. b0 = *(b + 0);
  699. c0 *= b0;
  700. c1 *= b0;
  701. c2 *= b0;
  702. c3 *= b0;
  703. *(a + 0) = c0;
  704. *(a + 1) = c1;
  705. *(a + 2) = c2;
  706. *(a + 3) = c3;
  707. *(c + 0) = c0;
  708. *(c + 1) = c1;
  709. *(c + 2) = c2;
  710. *(c + 3) = c3;
  711. }
  712. static void ssolve_2x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  713. {
  714. BLASLONG k;
  715. FLOAT b0, b1, b2, b3, b4, b5, b6, b7, b9, b10, b11, b12, b13, b14, b15;
  716. FLOAT b18, b19, b20, b21, b22, b23, b27, b28, b29, b30, b31;
  717. FLOAT b36, b37, b38, b39, b45, b46, b47, b54, b55, b63;
  718. FLOAT c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3;
  719. FLOAT c0_nxt4, c1_nxt4, c0_nxt5, c1_nxt5, c0_nxt6, c1_nxt6;
  720. FLOAT c0_nxt7, c1_nxt7;
  721. c0 = *(c + 0);
  722. c1 = *(c + 1);
  723. c0_nxt1 = *(c + 0 + 1 * ldc);
  724. c1_nxt1 = *(c + 1 + 1 * ldc);
  725. c0_nxt2 = *(c + 0 + 2 * ldc);
  726. c1_nxt2 = *(c + 1 + 2 * ldc);
  727. c0_nxt3 = *(c + 0 + 3 * ldc);
  728. c1_nxt3 = *(c + 1 + 3 * ldc);
  729. c0_nxt4 = *(c + 0 + 4 * ldc);
  730. c1_nxt4 = *(c + 1 + 4 * ldc);
  731. c0_nxt5 = *(c + 0 + 5 * ldc);
  732. c1_nxt5 = *(c + 1 + 5 * ldc);
  733. c0_nxt6 = *(c + 0 + 6 * ldc);
  734. c1_nxt6 = *(c + 1 + 6 * ldc);
  735. c0_nxt7 = *(c + 0 + 7 * ldc);
  736. c1_nxt7 = *(c + 1 + 7 * ldc);
  737. for (k = 0; k < bk; k++)
  738. {
  739. c0 -= a[0] * b[0];
  740. c1 -= a[1] * b[0];
  741. c0_nxt1 -= a[0] * b[1];
  742. c1_nxt1 -= a[1] * b[1];
  743. c0_nxt2 -= a[0] * b[2];
  744. c1_nxt2 -= a[1] * b[2];
  745. c0_nxt3 -= a[0] * b[3];
  746. c1_nxt3 -= a[1] * b[3];
  747. c0_nxt4 -= a[0] * b[4];
  748. c1_nxt4 -= a[1] * b[4];
  749. c0_nxt5 -= a[0] * b[5];
  750. c1_nxt5 -= a[1] * b[5];
  751. c0_nxt6 -= a[0] * b[6];
  752. c1_nxt6 -= a[1] * b[6];
  753. c0_nxt7 -= a[0] * b[7];
  754. c1_nxt7 -= a[1] * b[7];
  755. a += 2;
  756. b += 8;
  757. }
  758. b0 = *(b + 0);
  759. b1 = *(b + 1);
  760. b2 = *(b + 2);
  761. b3 = *(b + 3);
  762. b4 = *(b + 4);
  763. b5 = *(b + 5);
  764. b6 = *(b + 6);
  765. b7 = *(b + 7);
  766. b9 = *(b + 9);
  767. b10 = *(b + 10);
  768. b11 = *(b + 11);
  769. b12 = *(b + 12);
  770. b13 = *(b + 13);
  771. b14 = *(b + 14);
  772. b15 = *(b + 15);
  773. b18 = *(b + 18);
  774. b19 = *(b + 19);
  775. b20 = *(b + 20);
  776. b21 = *(b + 21);
  777. b22 = *(b + 22);
  778. b23 = *(b + 23);
  779. b27 = *(b + 27);
  780. b28 = *(b + 28);
  781. b29 = *(b + 29);
  782. b30 = *(b + 30);
  783. b31 = *(b + 31);
  784. b36 = *(b + 36);
  785. b37 = *(b + 37);
  786. b38 = *(b + 38);
  787. b39 = *(b + 39);
  788. b45 = *(b + 45);
  789. b46 = *(b + 46);
  790. b47 = *(b + 47);
  791. b54 = *(b + 54);
  792. b55 = *(b + 55);
  793. b63 = *(b + 63);
  794. c0 *= b0;
  795. c1 *= b0;
  796. c0_nxt1 -= c0 * b1;
  797. c1_nxt1 -= c1 * b1;
  798. c0_nxt2 -= c0 * b2;
  799. c1_nxt2 -= c1 * b2;
  800. c0_nxt3 -= c0 * b3;
  801. c1_nxt3 -= c1 * b3;
  802. c0_nxt4 -= c0 * b4;
  803. c1_nxt4 -= c1 * b4;
  804. c0_nxt5 -= c0 * b5;
  805. c1_nxt5 -= c1 * b5;
  806. c0_nxt6 -= c0 * b6;
  807. c1_nxt6 -= c1 * b6;
  808. c0_nxt7 -= c0 * b7;
  809. c1_nxt7 -= c1 * b7;
  810. c0_nxt1 *= b9;
  811. c1_nxt1 *= b9;
  812. c0_nxt2 -= c0_nxt1 * b10;
  813. c1_nxt2 -= c1_nxt1 * b10;
  814. c0_nxt3 -= c0_nxt1 * b11;
  815. c1_nxt3 -= c1_nxt1 * b11;
  816. c0_nxt4 -= c0_nxt1 * b12;
  817. c1_nxt4 -= c1_nxt1 * b12;
  818. c0_nxt5 -= c0_nxt1 * b13;
  819. c1_nxt5 -= c1_nxt1 * b13;
  820. c0_nxt6 -= c0_nxt1 * b14;
  821. c1_nxt6 -= c1_nxt1 * b14;
  822. c0_nxt7 -= c0_nxt1 * b15;
  823. c1_nxt7 -= c1_nxt1 * b15;
  824. c0_nxt2 *= b18;
  825. c1_nxt2 *= b18;
  826. c0_nxt3 -= c0_nxt2 * b19;
  827. c1_nxt3 -= c1_nxt2 * b19;
  828. c0_nxt4 -= c0_nxt2 * b20;
  829. c1_nxt4 -= c1_nxt2 * b20;
  830. c0_nxt5 -= c0_nxt2 * b21;
  831. c1_nxt5 -= c1_nxt2 * b21;
  832. c0_nxt6 -= c0_nxt2 * b22;
  833. c1_nxt6 -= c1_nxt2 * b22;
  834. c0_nxt7 -= c0_nxt2 * b23;
  835. c1_nxt7 -= c1_nxt2 * b23;
  836. c0_nxt3 *= b27;
  837. c1_nxt3 *= b27;
  838. c0_nxt4 -= c0_nxt3 * b28;
  839. c1_nxt4 -= c1_nxt3 * b28;
  840. c0_nxt5 -= c0_nxt3 * b29;
  841. c1_nxt5 -= c1_nxt3 * b29;
  842. c0_nxt6 -= c0_nxt3 * b30;
  843. c1_nxt6 -= c1_nxt3 * b30;
  844. c0_nxt7 -= c0_nxt3 * b31;
  845. c1_nxt7 -= c1_nxt3 * b31;
  846. c0_nxt4 *= b36;
  847. c1_nxt4 *= b36;
  848. c0_nxt5 -= c0_nxt4 * b37;
  849. c1_nxt5 -= c1_nxt4 * b37;
  850. c0_nxt6 -= c0_nxt4 * b38;
  851. c1_nxt6 -= c1_nxt4 * b38;
  852. c0_nxt7 -= c0_nxt4 * b39;
  853. c1_nxt7 -= c1_nxt4 * b39;
  854. c0_nxt5 *= b45;
  855. c1_nxt5 *= b45;
  856. c0_nxt6 -= c0_nxt5 * b46;
  857. c1_nxt6 -= c1_nxt5 * b46;
  858. c0_nxt7 -= c0_nxt5 * b47;
  859. c1_nxt7 -= c1_nxt5 * b47;
  860. c0_nxt6 *= b54;
  861. c1_nxt6 *= b54;
  862. c0_nxt7 -= c0_nxt6 * b55;
  863. c1_nxt7 -= c1_nxt6 * b55;
  864. c0_nxt7 *= b63;
  865. c1_nxt7 *= b63;
  866. *(a + 0) = c0;
  867. *(a + 1) = c1;
  868. *(a + 2) = c0_nxt1;
  869. *(a + 3) = c1_nxt1;
  870. *(a + 4) = c0_nxt2;
  871. *(a + 5) = c1_nxt2;
  872. *(a + 6) = c0_nxt3;
  873. *(a + 7) = c1_nxt3;
  874. *(a + 8) = c0_nxt4;
  875. *(a + 9) = c1_nxt4;
  876. *(a + 10) = c0_nxt5;
  877. *(a + 11) = c1_nxt5;
  878. *(a + 12) = c0_nxt6;
  879. *(a + 13) = c1_nxt6;
  880. *(a + 14) = c0_nxt7;
  881. *(a + 15) = c1_nxt7;
  882. *(c + 0) = c0;
  883. *(c + 1) = c1;
  884. *(c + 0 + 1 * ldc) = c0_nxt1;
  885. *(c + 1 + 1 * ldc) = c1_nxt1;
  886. *(c + 0 + 2 * ldc) = c0_nxt2;
  887. *(c + 1 + 2 * ldc) = c1_nxt2;
  888. *(c + 0 + 3 * ldc) = c0_nxt3;
  889. *(c + 1 + 3 * ldc) = c1_nxt3;
  890. *(c + 0 + 4 * ldc) = c0_nxt4;
  891. *(c + 1 + 4 * ldc) = c1_nxt4;
  892. *(c + 0 + 5 * ldc) = c0_nxt5;
  893. *(c + 1 + 5 * ldc) = c1_nxt5;
  894. *(c + 0 + 6 * ldc) = c0_nxt6;
  895. *(c + 1 + 6 * ldc) = c1_nxt6;
  896. *(c + 0 + 7 * ldc) = c0_nxt7;
  897. *(c + 1 + 7 * ldc) = c1_nxt7;
  898. }
  899. static void ssolve_2x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  900. {
  901. BLASLONG k;
  902. FLOAT b0, b1, b2, b3, b5, b6, b7, b10, b11, b15, c0, c1;
  903. FLOAT c0_nxt1, c0_nxt2, c0_nxt3, c1_nxt1, c1_nxt2, c1_nxt3;
  904. c0 = *(c + 0);
  905. c1 = *(c + 1);
  906. c0_nxt1 = *(c + 0 + 1 * ldc);
  907. c1_nxt1 = *(c + 1 + 1 * ldc);
  908. c0_nxt2 = *(c + 0 + 2 * ldc);
  909. c1_nxt2 = *(c + 1 + 2 * ldc);
  910. c0_nxt3 = *(c + 0 + 3 * ldc);
  911. c1_nxt3 = *(c + 1 + 3 * ldc);
  912. for (k = 0; k < bk; k++)
  913. {
  914. c0 -= a[0] * b[0];
  915. c1 -= a[1] * b[0];
  916. c0_nxt1 -= a[0] * b[1];
  917. c1_nxt1 -= a[1] * b[1];
  918. c0_nxt2 -= a[0] * b[2];
  919. c1_nxt2 -= a[1] * b[2];
  920. c0_nxt3 -= a[0] * b[3];
  921. c1_nxt3 -= a[1] * b[3];
  922. a += 2;
  923. b += 4;
  924. }
  925. b0 = *(b + 0);
  926. b1 = *(b + 1);
  927. b2 = *(b + 2);
  928. b3 = *(b + 3);
  929. b5 = *(b + 5);
  930. b6 = *(b + 6);
  931. b7 = *(b + 7);
  932. b10 = *(b + 10);
  933. b11 = *(b + 11);
  934. b15 = *(b + 15);
  935. c0 *= b0;
  936. c1 *= b0;
  937. c0_nxt1 -= c0 * b1;
  938. c1_nxt1 -= c1 * b1;
  939. c0_nxt1 *= b5;
  940. c1_nxt1 *= b5;
  941. c0_nxt2 -= c0 * b2;
  942. c1_nxt2 -= c1 * b2;
  943. c0_nxt2 -= c0_nxt1 * b6;
  944. c1_nxt2 -= c1_nxt1 * b6;
  945. c0_nxt2 *= b10;
  946. c1_nxt2 *= b10;
  947. c0_nxt3 -= c0 * b3;
  948. c1_nxt3 -= c1 * b3;
  949. c0_nxt3 -= c0_nxt1 * b7;
  950. c1_nxt3 -= c1_nxt1 * b7;
  951. c0_nxt3 -= c0_nxt2 * b11;
  952. c1_nxt3 -= c1_nxt2 * b11;
  953. c0_nxt3 *= b15;
  954. c1_nxt3 *= b15;
  955. *(a + 0) = c0;
  956. *(a + 1) = c1;
  957. *(a + 2) = c0_nxt1;
  958. *(a + 3) = c1_nxt1;
  959. *(a + 4) = c0_nxt2;
  960. *(a + 5) = c1_nxt2;
  961. *(a + 6) = c0_nxt3;
  962. *(a + 7) = c1_nxt3;
  963. *(c + 0) = c0;
  964. *(c + 1) = c1;
  965. *(c + 1 * ldc) = c0_nxt1;
  966. *(c + 1 + 1 * ldc) = c1_nxt1;
  967. *(c + 2 * ldc) = c0_nxt2;
  968. *(c + 1 + 2 * ldc) = c1_nxt2;
  969. *(c + 3 * ldc) = c0_nxt3;
  970. *(c + 1 + 3 * ldc) = c1_nxt3;
  971. }
  972. static void ssolve_2x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  973. {
  974. BLASLONG k;
  975. FLOAT b0, b1, b3, c0, c0_nxt, c1, c1_nxt;
  976. c0 = *(c + 0);
  977. c1 = *(c + 1);
  978. c0_nxt = *(c + 0 + ldc);
  979. c1_nxt = *(c + 1 + ldc);
  980. for (k = 0; k < bk; k++)
  981. {
  982. c0 -= a[0] * b[0];
  983. c1 -= a[1] * b[0];
  984. c0_nxt -= a[0] * b[1];
  985. c1_nxt -= a[1] * b[1];
  986. a += 2;
  987. b += 2;
  988. }
  989. b0 = *(b + 0);
  990. b1 = *(b + 1);
  991. b3 = *(b + 3);
  992. c0 *= b0;
  993. c1 *= b0;
  994. c0_nxt -= c0 * b1;
  995. c1_nxt -= c1 * b1;
  996. c0_nxt *= b3;
  997. c1_nxt *= b3;
  998. *(a + 0) = c0;
  999. *(a + 1) = c1;
  1000. *(a + 2) = c0_nxt;
  1001. *(a + 3) = c1_nxt;
  1002. *(c + 0) = c0;
  1003. *(c + 1) = c1;
  1004. *(c + ldc) = c0_nxt;
  1005. *(c + 1 + ldc) = c1_nxt;
  1006. }
  1007. static void ssolve_2x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  1008. {
  1009. BLASLONG k;
  1010. FLOAT b0, c0, c1;
  1011. c0 = *(c + 0);
  1012. c1 = *(c + 1);
  1013. for (k = 0; k < bk; k++)
  1014. {
  1015. c0 -= a[0] * b[0];
  1016. c1 -= a[1] * b[0];
  1017. a += 2;
  1018. b += 1;
  1019. }
  1020. b0 = *(b + 0);
  1021. c0 *= b0;
  1022. c1 *= b0;
  1023. *(a + 0) = c0;
  1024. *(a + 1) = c1;
  1025. *(c + 0) = c0;
  1026. *(c + 1) = c1;
  1027. }
  1028. static void ssolve_1x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  1029. {
  1030. BLASLONG k;
  1031. FLOAT b0, b1, b2, b3, b4, b5, b6, b7, b9, b10, b11, b12, b13, b14, b15;
  1032. FLOAT b18, b19, b20, b21, b22, b23, b27, b28, b29, b30, b31, b36, b37, b38;
  1033. FLOAT b39, b45, b46, b47, b54, b55, b63, c0, c1, c2, c3, c4, c5, c6, c7;
  1034. c0 = *(c + 0);
  1035. c1 = *(c + 1 * ldc);
  1036. c2 = *(c + 2 * ldc);
  1037. c3 = *(c + 3 * ldc);
  1038. c4 = *(c + 4 * ldc);
  1039. c5 = *(c + 5 * ldc);
  1040. c6 = *(c + 6 * ldc);
  1041. c7 = *(c + 7 * ldc);
  1042. for (k = 0; k < bk; k++)
  1043. {
  1044. c0 -= a[0] * b[0];
  1045. c1 -= a[0] * b[1];
  1046. c2 -= a[0] * b[2];
  1047. c3 -= a[0] * b[3];
  1048. c4 -= a[0] * b[4];
  1049. c5 -= a[0] * b[5];
  1050. c6 -= a[0] * b[6];
  1051. c7 -= a[0] * b[7];
  1052. a += 1;
  1053. b += 8;
  1054. }
  1055. b0 = *(b + 0);
  1056. b1 = *(b + 1);
  1057. b2 = *(b + 2);
  1058. b3 = *(b + 3);
  1059. b4 = *(b + 4);
  1060. b5 = *(b + 5);
  1061. b6 = *(b + 6);
  1062. b7 = *(b + 7);
  1063. b9 = *(b + 9);
  1064. b10 = *(b + 10);
  1065. b11 = *(b + 11);
  1066. b12 = *(b + 12);
  1067. b13 = *(b + 13);
  1068. b14 = *(b + 14);
  1069. b15 = *(b + 15);
  1070. b18 = *(b + 18);
  1071. b19 = *(b + 19);
  1072. b20 = *(b + 20);
  1073. b21 = *(b + 21);
  1074. b22 = *(b + 22);
  1075. b23 = *(b + 23);
  1076. b27 = *(b + 27);
  1077. b28 = *(b + 28);
  1078. b29 = *(b + 29);
  1079. b30 = *(b + 30);
  1080. b31 = *(b + 31);
  1081. b36 = *(b + 36);
  1082. b37 = *(b + 37);
  1083. b38 = *(b + 38);
  1084. b39 = *(b + 39);
  1085. b45 = *(b + 45);
  1086. b46 = *(b + 46);
  1087. b47 = *(b + 47);
  1088. b54 = *(b + 54);
  1089. b55 = *(b + 55);
  1090. b63 = *(b + 63);
  1091. c0 *= b0;
  1092. c1 -= c0 * b1;
  1093. c1 *= b9;
  1094. c2 -= c0 * b2;
  1095. c2 -= c1 * b10;
  1096. c2 *= b18;
  1097. c3 -= c0 * b3;
  1098. c3 -= c1 * b11;
  1099. c3 -= c2 * b19;
  1100. c3 *= b27;
  1101. c4 -= c0 * b4;
  1102. c4 -= c1 * b12;
  1103. c4 -= c2 * b20;
  1104. c4 -= c3 * b28;
  1105. c4 *= b36;
  1106. c5 -= c0 * b5;
  1107. c5 -= c1 * b13;
  1108. c5 -= c2 * b21;
  1109. c5 -= c3 * b29;
  1110. c5 -= c4 * b37;
  1111. c5 *= b45;
  1112. c6 -= c0 * b6;
  1113. c6 -= c1 * b14;
  1114. c6 -= c2 * b22;
  1115. c6 -= c3 * b30;
  1116. c6 -= c4 * b38;
  1117. c6 -= c5 * b46;
  1118. c6 *= b54;
  1119. c7 -= c0 * b7;
  1120. c7 -= c1 * b15;
  1121. c7 -= c2 * b23;
  1122. c7 -= c3 * b31;
  1123. c7 -= c4 * b39;
  1124. c7 -= c5 * b47;
  1125. c7 -= c6 * b55;
  1126. c7 *= b63;
  1127. *(a + 0) = c0;
  1128. *(a + 1) = c1;
  1129. *(a + 2) = c2;
  1130. *(a + 3) = c3;
  1131. *(a + 4) = c4;
  1132. *(a + 5) = c5;
  1133. *(a + 6) = c6;
  1134. *(a + 7) = c7;
  1135. *(c + 0) = c0;
  1136. *(c + 1 * ldc) = c1;
  1137. *(c + 2 * ldc) = c2;
  1138. *(c + 3 * ldc) = c3;
  1139. *(c + 4 * ldc) = c4;
  1140. *(c + 5 * ldc) = c5;
  1141. *(c + 6 * ldc) = c6;
  1142. *(c + 7 * ldc) = c7;
  1143. }
  1144. static void ssolve_1x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  1145. {
  1146. BLASLONG k;
  1147. FLOAT b0, b1, b2, b3, b5, b6, b7, b10, b11, b15, c0, c1, c2, c3;
  1148. c0 = *(c + 0);
  1149. c1 = *(c + 1 * ldc);
  1150. c2 = *(c + 2 * ldc);
  1151. c3 = *(c + 3 * ldc);
  1152. for (k = 0; k < bk; k++)
  1153. {
  1154. c0 -= a[0] * b[0];
  1155. c1 -= a[0] * b[1];
  1156. c2 -= a[0] * b[2];
  1157. c3 -= a[0] * b[3];
  1158. a += 1;
  1159. b += 4;
  1160. }
  1161. b0 = *(b + 0);
  1162. b1 = *(b + 1);
  1163. b2 = *(b + 2);
  1164. b3 = *(b + 3);
  1165. b5 = *(b + 5);
  1166. b6 = *(b + 6);
  1167. b7 = *(b + 7);
  1168. b10 = *(b + 10);
  1169. b11 = *(b + 11);
  1170. b15 = *(b + 15);
  1171. c0 *= b0;
  1172. c1 -= c0 * b1;
  1173. c1 *= b5;
  1174. c2 -= c0 * b2;
  1175. c2 -= c1 * b6;
  1176. c2 *= b10;
  1177. c3 -= c0 * b3;
  1178. c3 -= c1 * b7;
  1179. c3 -= c2 * b11;
  1180. c3 *= b15;
  1181. *(a + 0) = c0;
  1182. *(a + 1) = c1;
  1183. *(a + 2) = c2;
  1184. *(a + 3) = c3;
  1185. *(c + 0) = c0;
  1186. *(c + 1 * ldc) = c1;
  1187. *(c + 2 * ldc) = c2;
  1188. *(c + 3 * ldc) = c3;
  1189. }
  1190. static void ssolve_1x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  1191. {
  1192. BLASLONG k;
  1193. FLOAT b0, b1, b3, c0, c1;
  1194. c0 = *c;
  1195. c1 = *(c + ldc);
  1196. for (k = 0; k < bk; k++)
  1197. {
  1198. c0 -= a[0] * b[0];
  1199. c1 -= a[0] * b[1];
  1200. a += 1;
  1201. b += 2;
  1202. }
  1203. b0 = *(b + 0);
  1204. b1 = *(b + 1);
  1205. b3 = *(b + 3);
  1206. c0 *= b0;
  1207. c1 -= c0 * b1;
  1208. c1 *= b3;
  1209. *(a + 0) = c0;
  1210. *(a + 1) = c1;
  1211. *(c + 0) = c0;
  1212. *(c + ldc) = c1;
  1213. }
  1214. static void ssolve_1x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
  1215. {
  1216. BLASLONG k;
  1217. for (k = 0; k < bk; k++)
  1218. {
  1219. *c -= a[0] * b[0];
  1220. a++;
  1221. b++;
  1222. }
  1223. *c *= *b;
  1224. *a = *c;
  1225. }
  1226. int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
  1227. FLOAT *c, BLASLONG ldc, BLASLONG offset)
  1228. {
  1229. FLOAT *aa, *cc;
  1230. BLASLONG i, j, kk;
  1231. kk = -offset;
  1232. for (j = (n >> 3); j--;)
  1233. {
  1234. aa = a;
  1235. cc = c;
  1236. for (i = (m >> 3); i--;)
  1237. {
  1238. ssolve_8x8_rn_msa(aa, b, cc, ldc, kk);
  1239. aa += 8 * k;
  1240. cc += 8;
  1241. }
  1242. if (m & 7)
  1243. {
  1244. if (m & 4)
  1245. {
  1246. ssolve_4x8_rn_msa(aa, b, cc, ldc, kk);
  1247. aa += 4 * k;
  1248. cc += 4;
  1249. }
  1250. if (m & 2)
  1251. {
  1252. ssolve_2x8_rn_msa(aa, b, cc, ldc, kk);
  1253. aa += 2 * k;
  1254. cc += 2;
  1255. }
  1256. if (m & 1)
  1257. {
  1258. ssolve_1x8_rn_msa(aa, b, cc, ldc, kk);
  1259. aa += k;
  1260. cc += 1;
  1261. }
  1262. }
  1263. kk += 8;
  1264. b += 8 * k;
  1265. c += 8 * ldc;
  1266. }
  1267. if (n & 7)
  1268. {
  1269. if (n & 4)
  1270. {
  1271. aa = a;
  1272. cc = c;
  1273. for (i = (m >> 3); i--;)
  1274. {
  1275. ssolve_8x4_rn_msa(aa, b, cc, ldc, kk);
  1276. aa += 8 * k;
  1277. cc += 8;
  1278. }
  1279. if (m & 7)
  1280. {
  1281. if (m & 4)
  1282. {
  1283. ssolve_4x4_rn_msa(aa, b, cc, ldc, kk);
  1284. aa += 4 * k;
  1285. cc += 4;
  1286. }
  1287. if (m & 2)
  1288. {
  1289. ssolve_2x4_rn_msa(aa, b, cc, ldc, kk);
  1290. aa += 2 * k;
  1291. cc += 2;
  1292. }
  1293. if (m & 1)
  1294. {
  1295. ssolve_1x4_rn_msa(aa, b, cc, ldc, kk);
  1296. aa += k;
  1297. cc += 1;
  1298. }
  1299. }
  1300. b += 4 * k;
  1301. c += 4 * ldc;
  1302. kk += 4;
  1303. }
  1304. if (n & 2)
  1305. {
  1306. aa = a;
  1307. cc = c;
  1308. for (i = (m >> 3); i--;)
  1309. {
  1310. ssolve_8x2_rn_msa(aa, b, cc, ldc, kk);
  1311. aa += 8 * k;
  1312. cc += 8;
  1313. }
  1314. if (m & 7)
  1315. {
  1316. if (m & 4)
  1317. {
  1318. ssolve_4x2_rn_msa(aa, b, cc, ldc, kk);
  1319. aa += 4 * k;
  1320. cc += 4;
  1321. }
  1322. if (m & 2)
  1323. {
  1324. ssolve_2x2_rn_msa(aa, b, cc, ldc, kk);
  1325. aa += 2 * k;
  1326. cc += 2;
  1327. }
  1328. if (m & 1)
  1329. {
  1330. ssolve_1x2_rn_msa(aa, b, cc, ldc, kk);
  1331. aa += k;
  1332. cc += 1;
  1333. }
  1334. }
  1335. b += 2 * k;
  1336. c += 2 * ldc;
  1337. kk += 2;
  1338. }
  1339. if (n & 1)
  1340. {
  1341. aa = a;
  1342. cc = c;
  1343. for (i = (m >> 3); i--;)
  1344. {
  1345. ssolve_8x1_rn_msa(aa, b, cc, ldc, kk);
  1346. aa += 8 * k;
  1347. cc += 8;
  1348. }
  1349. if (m & 7)
  1350. {
  1351. if (m & 4)
  1352. {
  1353. ssolve_4x1_rn_msa(aa, b, cc, ldc, kk);
  1354. aa += 4 * k;
  1355. cc += 4;
  1356. }
  1357. if (m & 2)
  1358. {
  1359. ssolve_2x1_rn_msa(aa, b, cc, ldc, kk);
  1360. aa += 2 * k;
  1361. cc += 2;
  1362. }
  1363. if (m & 1)
  1364. {
  1365. ssolve_1x1_rn_msa(aa, b, cc, kk);
  1366. aa += k;
  1367. cc += 1;
  1368. }
  1369. }
  1370. b += k;
  1371. c += ldc;
  1372. kk += 1;
  1373. }
  1374. }
  1375. return 0;
  1376. }