You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

strsm_kernel_LT_8x8_msa.c 44 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759
  1. /*******************************************************************************
  2. Copyright (c) 2016, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #include "common.h"
  28. #include "macros_msa.h"
  29. static void ssolve_8x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  30. {
  31. v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
  32. v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
  33. v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
  34. v4f32 res_c8, res_c9, res_c10, res_c11, res_c12, res_c13, res_c14, res_c15;
  35. v4f32 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7;
  36. v4f32 src_a9, src_a10, src_a11, src_a12, src_a13, src_a14, src_a15, src_a18;
  37. v4f32 src_a19, src_a20, src_a21, src_a22, src_a23, src_a27, src_a28;
  38. v4f32 src_a29, src_a30, src_a31, src_a36, src_a37, src_a38, src_a39;
  39. v4f32 src_a45, src_a46, src_a47, src_a54, src_a55, src_a63, src_a;
  40. FLOAT *c_nxt1line = c + ldc;
  41. FLOAT *c_nxt2line = c + 2 * ldc;
  42. FLOAT *c_nxt3line = c + 3 * ldc;
  43. FLOAT *c_nxt4line = c + 4 * ldc;
  44. FLOAT *c_nxt5line = c + 5 * ldc;
  45. FLOAT *c_nxt6line = c + 6 * ldc;
  46. FLOAT *c_nxt7line = c + 7 * ldc;
  47. LD_SP2(c, 4, src_c0, src_c1);
  48. LD_SP2(c_nxt1line, 4, src_c2, src_c3);
  49. LD_SP2(c_nxt2line, 4, src_c4, src_c5);
  50. LD_SP2(c_nxt3line, 4, src_c6, src_c7);
  51. LD_SP2(c_nxt4line, 4, src_c8, src_c9);
  52. LD_SP2(c_nxt5line, 4, src_c10, src_c11);
  53. LD_SP2(c_nxt6line, 4, src_c12, src_c13);
  54. LD_SP2(c_nxt7line, 4, src_c14, src_c15);
  55. if (bk > 0)
  56. {
  57. BLASLONG k, pref_offset;
  58. FLOAT *pa0_pref;
  59. v4f32 src_b0, src_b1, src_b2, src_b3, src_bb0, src_bb1;
  60. pref_offset = (uintptr_t)a & (L1_DATA_LINESIZE - 1);
  61. if (pref_offset)
  62. {
  63. pref_offset = L1_DATA_LINESIZE - pref_offset;
  64. pref_offset = pref_offset / sizeof(FLOAT);
  65. }
  66. pa0_pref = a + pref_offset;
  67. for (k = 0; k < (bk >> 1); k++)
  68. {
  69. PREF_OFFSET(pa0_pref, 64);
  70. PREF_OFFSET(pa0_pref, 96);
  71. LD_SP2_INC(a, 4, src_a0, src_a1);
  72. LD_SP2_INC(b, 4, src_bb0, src_bb1);
  73. SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
  74. src_c0 -= src_a0 * src_b0;
  75. src_c1 -= src_a1 * src_b0;
  76. src_c2 -= src_a0 * src_b1;
  77. src_c3 -= src_a1 * src_b1;
  78. src_c4 -= src_a0 * src_b2;
  79. src_c5 -= src_a1 * src_b2;
  80. src_c6 -= src_a0 * src_b3;
  81. src_c7 -= src_a1 * src_b3;
  82. SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
  83. src_c8 -= src_a0 * src_b0;
  84. src_c9 -= src_a1 * src_b0;
  85. src_c10 -= src_a0 * src_b1;
  86. src_c11 -= src_a1 * src_b1;
  87. src_c12 -= src_a0 * src_b2;
  88. src_c13 -= src_a1 * src_b2;
  89. src_c14 -= src_a0 * src_b3;
  90. src_c15 -= src_a1 * src_b3;
  91. LD_SP2_INC(a, 4, src_a0, src_a1);
  92. LD_SP2_INC(b, 4, src_bb0, src_bb1);
  93. SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
  94. src_c0 -= src_a0 * src_b0;
  95. src_c1 -= src_a1 * src_b0;
  96. src_c2 -= src_a0 * src_b1;
  97. src_c3 -= src_a1 * src_b1;
  98. src_c4 -= src_a0 * src_b2;
  99. src_c5 -= src_a1 * src_b2;
  100. src_c6 -= src_a0 * src_b3;
  101. src_c7 -= src_a1 * src_b3;
  102. SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
  103. src_c8 -= src_a0 * src_b0;
  104. src_c9 -= src_a1 * src_b0;
  105. src_c10 -= src_a0 * src_b1;
  106. src_c11 -= src_a1 * src_b1;
  107. src_c12 -= src_a0 * src_b2;
  108. src_c13 -= src_a1 * src_b2;
  109. src_c14 -= src_a0 * src_b3;
  110. src_c15 -= src_a1 * src_b3;
  111. pa0_pref += 16;
  112. }
  113. if (bk & 1)
  114. {
  115. LD_SP2_INC(a, 4, src_a0, src_a1);
  116. LD_SP2_INC(b, 4, src_bb0, src_bb1);
  117. SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
  118. src_c0 -= src_a0 * src_b0;
  119. src_c1 -= src_a1 * src_b0;
  120. src_c2 -= src_a0 * src_b1;
  121. src_c3 -= src_a1 * src_b1;
  122. src_c4 -= src_a0 * src_b2;
  123. src_c5 -= src_a1 * src_b2;
  124. src_c6 -= src_a0 * src_b3;
  125. src_c7 -= src_a1 * src_b3;
  126. SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
  127. src_c8 -= src_a0 * src_b0;
  128. src_c9 -= src_a1 * src_b0;
  129. src_c10 -= src_a0 * src_b1;
  130. src_c11 -= src_a1 * src_b1;
  131. src_c12 -= src_a0 * src_b2;
  132. src_c13 -= src_a1 * src_b2;
  133. src_c14 -= src_a0 * src_b3;
  134. src_c15 -= src_a1 * src_b3;
  135. }
  136. }
  137. TRANSPOSE4x4_SP_SP(src_c0, src_c2, src_c4, src_c6,
  138. res_c0, res_c1, res_c2, res_c3);
  139. TRANSPOSE4x4_SP_SP(src_c8, src_c10, src_c12, src_c14,
  140. res_c8, res_c9, res_c10, res_c11);
  141. TRANSPOSE4x4_SP_SP(src_c1, src_c3, src_c5, src_c7,
  142. res_c4, res_c5, res_c6, res_c7);
  143. TRANSPOSE4x4_SP_SP(src_c9, src_c11, src_c13, src_c15,
  144. res_c12, res_c13, res_c14, res_c15);
  145. src_a = LD_SP(a + 0);
  146. SPLATI_W4_SP(src_a, src_a0, src_a1, src_a2, src_a3);
  147. src_a = LD_SP(a + 4);
  148. SPLATI_W4_SP(src_a, src_a4, src_a5, src_a6, src_a7);
  149. res_c0 *= src_a0;
  150. res_c8 *= src_a0;
  151. res_c1 -= res_c0 * src_a1;
  152. res_c9 -= res_c8 * src_a1;
  153. res_c2 -= res_c0 * src_a2;
  154. res_c10 -= res_c8 * src_a2;
  155. res_c3 -= res_c0 * src_a3;
  156. res_c11 -= res_c8 * src_a3;
  157. res_c4 -= res_c0 * src_a4;
  158. res_c12 -= res_c8 * src_a4;
  159. res_c5 -= res_c0 * src_a5;
  160. res_c13 -= res_c8 * src_a5;
  161. res_c6 -= res_c0 * src_a6;
  162. res_c14 -= res_c8 * src_a6;
  163. res_c7 -= res_c0 * src_a7;
  164. res_c15 -= res_c8 * src_a7;
  165. src_a = LD_SP(a + 9);
  166. SPLATI_W4_SP(src_a, src_a9, src_a10, src_a11, src_a12);
  167. src_a13 = LD_SP(a + 13);
  168. src_a15 = (v4f32) __msa_splati_w((v4i32) src_a13, 2);
  169. src_a14 = (v4f32) __msa_splati_w((v4i32) src_a13, 1);
  170. src_a13 = (v4f32) __msa_splati_w((v4i32) src_a13, 0);
  171. res_c1 *= src_a9;
  172. res_c9 *= src_a9;
  173. res_c2 -= res_c1 * src_a10;
  174. res_c10 -= res_c9 * src_a10;
  175. res_c3 -= res_c1 * src_a11;
  176. res_c11 -= res_c9 * src_a11;
  177. res_c4 -= res_c1 * src_a12;
  178. res_c12 -= res_c9 * src_a12;
  179. res_c5 -= res_c1 * src_a13;
  180. res_c13 -= res_c9 * src_a13;
  181. res_c6 -= res_c1 * src_a14;
  182. res_c14 -= res_c9 * src_a14;
  183. res_c7 -= res_c1 * src_a15;
  184. res_c15 -= res_c9 * src_a15;
  185. src_a = LD_SP(a + 18);
  186. SPLATI_W4_SP(src_a, src_a18, src_a19, src_a20, src_a21);
  187. src_a22 = LD_SP(a + 22);
  188. src_a23 = (v4f32) __msa_splati_w((v4i32) src_a22, 1);
  189. src_a22 = (v4f32) __msa_splati_w((v4i32) src_a22, 0);
  190. res_c2 *= src_a18;
  191. res_c10 *= src_a18;
  192. res_c3 -= res_c2 * src_a19;
  193. res_c11 -= res_c10 * src_a19;
  194. res_c4 -= res_c2 * src_a20;
  195. res_c12 -= res_c10 * src_a20;
  196. res_c5 -= res_c2 * src_a21;
  197. res_c13 -= res_c10 * src_a21;
  198. res_c6 -= res_c2 * src_a22;
  199. res_c14 -= res_c10 * src_a22;
  200. res_c7 -= res_c2 * src_a23;
  201. res_c15 -= res_c10 * src_a23;
  202. src_a = LD_SP(a + 27);
  203. SPLATI_W4_SP(src_a, src_a27, src_a28, src_a29, src_a30);
  204. src_a31 = COPY_FLOAT_TO_VECTOR(*(a + 31));
  205. res_c3 *= src_a27;
  206. res_c11 *= src_a27;
  207. res_c4 -= res_c3 * src_a28;
  208. res_c12 -= res_c11 * src_a28;
  209. res_c5 -= res_c3 * src_a29;
  210. res_c13 -= res_c11 * src_a29;
  211. res_c6 -= res_c3 * src_a30;
  212. res_c14 -= res_c11 * src_a30;
  213. res_c7 -= res_c3 * src_a31;
  214. res_c15 -= res_c11 * src_a31;
  215. ST_SP4(res_c0, res_c8, res_c1, res_c9, b, 4);
  216. ST_SP4(res_c2, res_c10, res_c3, res_c11, b + 16, 4);
  217. TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3,
  218. src_c0, src_c2, src_c4, src_c6);
  219. TRANSPOSE4x4_SP_SP(res_c8, res_c9, res_c10, res_c11,
  220. src_c8, src_c10, src_c12, src_c14);
  221. ST_SP(src_c0, c);
  222. ST_SP(src_c2, c_nxt1line);
  223. ST_SP(src_c4, c_nxt2line);
  224. ST_SP(src_c6, c_nxt3line);
  225. ST_SP(src_c8, c_nxt4line);
  226. ST_SP(src_c10, c_nxt5line);
  227. ST_SP(src_c12, c_nxt6line);
  228. ST_SP(src_c14, c_nxt7line);
  229. src_a = LD_SP(a + 36);
  230. SPLATI_W4_SP(src_a, src_a36, src_a37, src_a38, src_a39);
  231. res_c4 *= src_a36;
  232. res_c12 *= src_a36;
  233. res_c5 -= res_c4 * src_a37;
  234. res_c13 -= res_c12 * src_a37;
  235. res_c6 -= res_c4 * src_a38;
  236. res_c14 -= res_c12 * src_a38;
  237. res_c7 -= res_c4 * src_a39;
  238. res_c15 -= res_c12 * src_a39;
  239. src_a45 = LD_SP(a + 45);
  240. src_a47 = (v4f32) __msa_splati_w((v4i32) src_a45, 2);
  241. src_a46 = (v4f32) __msa_splati_w((v4i32) src_a45, 1);
  242. src_a45 = (v4f32) __msa_splati_w((v4i32) src_a45, 0);
  243. res_c5 *= src_a45;
  244. res_c13 *= src_a45;
  245. res_c6 -= res_c5 * src_a46;
  246. res_c14 -= res_c13 * src_a46;
  247. res_c7 -= res_c5 * src_a47;
  248. res_c15 -= res_c13 * src_a47;
  249. src_a54 = COPY_FLOAT_TO_VECTOR(*(a + 54));
  250. src_a55 = COPY_FLOAT_TO_VECTOR(*(a + 55));
  251. src_a63 = COPY_FLOAT_TO_VECTOR(*(a + 63));
  252. res_c6 *= src_a54;
  253. res_c14 *= src_a54;
  254. res_c7 -= res_c6 * src_a55;
  255. res_c15 -= res_c14 * src_a55;
  256. res_c7 *= src_a63;
  257. res_c15 *= src_a63;
  258. ST_SP4(res_c4, res_c12, res_c5, res_c13, b + 32, 4);
  259. ST_SP4(res_c6, res_c14, res_c7, res_c15, b + 48, 4);
  260. TRANSPOSE4x4_SP_SP(res_c4, res_c5, res_c6, res_c7,
  261. src_c1, src_c3, src_c5, src_c7);
  262. TRANSPOSE4x4_SP_SP(res_c12, res_c13, res_c14, res_c15,
  263. src_c9, src_c11, src_c13, src_c15);
  264. ST_SP(src_c1, c + 4);
  265. ST_SP(src_c3, c_nxt1line + 4);
  266. ST_SP(src_c5, c_nxt2line + 4);
  267. ST_SP(src_c7, c_nxt3line + 4);
  268. ST_SP(src_c9, c_nxt4line + 4);
  269. ST_SP(src_c11, c_nxt5line + 4);
  270. ST_SP(src_c13, c_nxt6line + 4);
  271. ST_SP(src_c15, c_nxt7line + 4);
  272. }
  273. static void ssolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  274. {
  275. BLASLONG k;
  276. v4f32 src_b, src_b0, src_b1, src_b2, src_b3;
  277. v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
  278. v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
  279. v4f32 src_a0, src_a1, src_a2, src_a3, src_a4, src_a5, src_a6, src_a7;
  280. v4f32 src_a9, src_a10, src_a11, src_a12, src_a13, src_a14, src_a15, src_a18;
  281. v4f32 src_a19, src_a20, src_a21, src_a22, src_a23, src_a27, src_a28;
  282. v4f32 src_a29, src_a30, src_a31, src_a36, src_a37, src_a38, src_a39;
  283. v4f32 src_a45, src_a46, src_a47, src_a54, src_a55, src_a63, src_a;
  284. FLOAT *c_nxt1line = c + ldc;
  285. FLOAT *c_nxt2line = c + 2 * ldc;
  286. FLOAT *c_nxt3line = c + 3 * ldc;
  287. LD_SP2(c, 4, src_c0, src_c1);
  288. LD_SP2(c_nxt1line, 4, src_c2, src_c3);
  289. LD_SP2(c_nxt2line, 4, src_c4, src_c5);
  290. LD_SP2(c_nxt3line, 4, src_c6, src_c7);
  291. for (k = 0; k < bk; k++)
  292. {
  293. LD_SP2(a, 4, src_a0, src_a1);
  294. src_b = LD_SP(b + 0);
  295. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  296. src_c0 -= src_a0 * src_b0;
  297. src_c1 -= src_a1 * src_b0;
  298. src_c2 -= src_a0 * src_b1;
  299. src_c3 -= src_a1 * src_b1;
  300. src_c4 -= src_a0 * src_b2;
  301. src_c5 -= src_a1 * src_b2;
  302. src_c6 -= src_a0 * src_b3;
  303. src_c7 -= src_a1 * src_b3;
  304. a += 8;
  305. b += 4;
  306. }
  307. TRANSPOSE4x4_SP_SP(src_c0, src_c2, src_c4, src_c6,
  308. res_c0, res_c1, res_c2, res_c3);
  309. TRANSPOSE4x4_SP_SP(src_c1, src_c3, src_c5, src_c7,
  310. res_c4, res_c5, res_c6, res_c7);
  311. src_a = LD_SP(a + 0);
  312. SPLATI_W4_SP(src_a, src_a0, src_a1, src_a2, src_a3);
  313. src_a = LD_SP(a + 4);
  314. SPLATI_W4_SP(src_a, src_a4, src_a5, src_a6, src_a7);
  315. res_c0 *= src_a0;
  316. res_c1 -= res_c0 * src_a1;
  317. res_c2 -= res_c0 * src_a2;
  318. res_c3 -= res_c0 * src_a3;
  319. res_c4 -= res_c0 * src_a4;
  320. res_c5 -= res_c0 * src_a5;
  321. res_c6 -= res_c0 * src_a6;
  322. res_c7 -= res_c0 * src_a7;
  323. src_a = LD_SP(a + 9);
  324. SPLATI_W4_SP(src_a, src_a9, src_a10, src_a11, src_a12);
  325. src_a13 = LD_SP(a + 13);
  326. src_a15 = (v4f32) __msa_splati_w((v4i32) src_a13, 2);
  327. src_a14 = (v4f32) __msa_splati_w((v4i32) src_a13, 1);
  328. src_a13 = (v4f32) __msa_splati_w((v4i32) src_a13, 0);
  329. res_c1 *= src_a9;
  330. res_c2 -= res_c1 * src_a10;
  331. res_c3 -= res_c1 * src_a11;
  332. res_c4 -= res_c1 * src_a12;
  333. res_c5 -= res_c1 * src_a13;
  334. res_c6 -= res_c1 * src_a14;
  335. res_c7 -= res_c1 * src_a15;
  336. src_a = LD_SP(a + 18);
  337. SPLATI_W4_SP(src_a, src_a18, src_a19, src_a20, src_a21);
  338. src_a22 = LD_SP(a + 22);
  339. src_a23 = (v4f32) __msa_splati_w((v4i32) src_a22, 1);
  340. src_a22 = (v4f32) __msa_splati_w((v4i32) src_a22, 0);
  341. res_c2 *= src_a18;
  342. res_c3 -= res_c2 * src_a19;
  343. res_c4 -= res_c2 * src_a20;
  344. res_c5 -= res_c2 * src_a21;
  345. res_c6 -= res_c2 * src_a22;
  346. res_c7 -= res_c2 * src_a23;
  347. src_a = LD_SP(a + 27);
  348. SPLATI_W4_SP(src_a, src_a27, src_a28, src_a29, src_a30);
  349. src_a31 = COPY_FLOAT_TO_VECTOR(*(a + 31));
  350. res_c3 *= src_a27;
  351. res_c4 -= res_c3 * src_a28;
  352. res_c5 -= res_c3 * src_a29;
  353. res_c6 -= res_c3 * src_a30;
  354. res_c7 -= res_c3 * src_a31;
  355. src_a = LD_SP(a + 36);
  356. SPLATI_W4_SP(src_a, src_a36, src_a37, src_a38, src_a39);
  357. res_c4 *= src_a36;
  358. res_c5 -= res_c4 * src_a37;
  359. res_c6 -= res_c4 * src_a38;
  360. res_c7 -= res_c4 * src_a39;
  361. src_a45 = LD_SP(a + 45);
  362. src_a47 = (v4f32) __msa_splati_w((v4i32) src_a45, 2);
  363. src_a46 = (v4f32) __msa_splati_w((v4i32) src_a45, 1);
  364. src_a45 = (v4f32) __msa_splati_w((v4i32) src_a45, 0);
  365. res_c5 *= src_a45;
  366. res_c6 -= res_c5 * src_a46;
  367. res_c7 -= res_c5 * src_a47;
  368. src_a54 = COPY_FLOAT_TO_VECTOR(*(a + 54));
  369. src_a55 = COPY_FLOAT_TO_VECTOR(*(a + 55));
  370. src_a63 = COPY_FLOAT_TO_VECTOR(*(a + 63));
  371. res_c6 *= src_a54;
  372. res_c7 -= res_c6 * src_a55;
  373. res_c7 *= src_a63;
  374. ST_SP4(res_c0, res_c1, res_c2, res_c3, b, 4);
  375. b += 16;
  376. ST_SP4(res_c4, res_c5, res_c6, res_c7, b, 4);
  377. TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3,
  378. src_c0, src_c2, src_c4, src_c6);
  379. TRANSPOSE4x4_SP_SP(res_c4, res_c5, res_c6, res_c7,
  380. src_c1, src_c3, src_c5, src_c7);
  381. ST_SP2(src_c0, src_c1, c, 4);
  382. ST_SP2(src_c2, src_c3, c_nxt1line, 4);
  383. ST_SP2(src_c4, src_c5, c_nxt2line, 4);
  384. ST_SP2(src_c6, src_c7, c_nxt3line, 4);
  385. }
  386. static void ssolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  387. {
  388. BLASLONG k;
  389. FLOAT a0, a1, a2, a3, a4, a5, a6, a7, a9, a10, a11, a12, a13, a14, a15, a18;
  390. FLOAT a19, a20, a21, a22, a23, a27, a28, a29, a30, a31, a36, a37, a38, a39;
  391. FLOAT a45, a46, a47, a54, a55, a63;
  392. FLOAT c0, c1, c2, c3, c4, c5, c6, c7;
  393. FLOAT c0_nxt, c1_nxt, c2_nxt, c3_nxt, c4_nxt, c5_nxt, c6_nxt, c7_nxt;
  394. c0 = *(c + 0);
  395. c1 = *(c + 1);
  396. c2 = *(c + 2);
  397. c3 = *(c + 3);
  398. c4 = *(c + 4);
  399. c5 = *(c + 5);
  400. c6 = *(c + 6);
  401. c7 = *(c + 7);
  402. c0_nxt = *(c + 0 + ldc);
  403. c1_nxt = *(c + 1 + ldc);
  404. c2_nxt = *(c + 2 + ldc);
  405. c3_nxt = *(c + 3 + ldc);
  406. c4_nxt = *(c + 4 + ldc);
  407. c5_nxt = *(c + 5 + ldc);
  408. c6_nxt = *(c + 6 + ldc);
  409. c7_nxt = *(c + 7 + ldc);
  410. for (k = 0; k < bk; k++)
  411. {
  412. c0 -= a[0] * b[0];
  413. c1 -= a[1] * b[0];
  414. c2 -= a[2] * b[0];
  415. c3 -= a[3] * b[0];
  416. c4 -= a[4] * b[0];
  417. c5 -= a[5] * b[0];
  418. c6 -= a[6] * b[0];
  419. c7 -= a[7] * b[0];
  420. c0_nxt -= a[0] * b[1];
  421. c1_nxt -= a[1] * b[1];
  422. c2_nxt -= a[2] * b[1];
  423. c3_nxt -= a[3] * b[1];
  424. c4_nxt -= a[4] * b[1];
  425. c5_nxt -= a[5] * b[1];
  426. c6_nxt -= a[6] * b[1];
  427. c7_nxt -= a[7] * b[1];
  428. a += 8;
  429. b += 2;
  430. }
  431. a0 = *(a + 0);
  432. a1 = *(a + 1);
  433. a2 = *(a + 2);
  434. a3 = *(a + 3);
  435. a4 = *(a + 4);
  436. a5 = *(a + 5);
  437. a6 = *(a + 6);
  438. a7 = *(a + 7);
  439. a9 = *(a + 9);
  440. a10 = *(a + 10);
  441. a11 = *(a + 11);
  442. a12 = *(a + 12);
  443. a13 = *(a + 13);
  444. a14 = *(a + 14);
  445. a15 = *(a + 15);
  446. a18 = *(a + 18);
  447. a19 = *(a + 19);
  448. a20 = *(a + 20);
  449. a21 = *(a + 21);
  450. a22 = *(a + 22);
  451. a23 = *(a + 23);
  452. a27 = *(a + 27);
  453. a28 = *(a + 28);
  454. a29 = *(a + 29);
  455. a30 = *(a + 30);
  456. a31 = *(a + 31);
  457. a36 = *(a + 36);
  458. a37 = *(a + 37);
  459. a38 = *(a + 38);
  460. a39 = *(a + 39);
  461. a45 = *(a + 45);
  462. a46 = *(a + 46);
  463. a47 = *(a + 47);
  464. a54 = *(a + 54);
  465. a55 = *(a + 55);
  466. a63 = *(a + 63);
  467. c0 *= a0;
  468. c0_nxt *= a0;
  469. c1 -= c0 * a1;
  470. c1_nxt -= c0_nxt * a1;
  471. c1 *= a9;
  472. c1_nxt *= a9;
  473. c2 -= c0 * a2;
  474. c2_nxt -= c0_nxt * a2;
  475. c2 -= c1 * a10;
  476. c2_nxt -= c1_nxt * a10;
  477. c2 *= a18;
  478. c2_nxt *= a18;
  479. c3 -= c0 * a3;
  480. c3_nxt -= c0_nxt * a3;
  481. c3 -= c1 * a11;
  482. c3_nxt -= c1_nxt * a11;
  483. c3 -= c2 * a19;
  484. c3_nxt -= c2_nxt * a19;
  485. c3 *= a27;
  486. c3_nxt *= a27;
  487. c4 -= c0 * a4;
  488. c4_nxt -= c0_nxt * a4;
  489. c4 -= c1 * a12;
  490. c4_nxt -= c1_nxt * a12;
  491. c4 -= c2 * a20;
  492. c4_nxt -= c2_nxt * a20;
  493. c4 -= c3 * a28;
  494. c4_nxt -= c3_nxt * a28;
  495. c4 *= a36;
  496. c4_nxt *= a36;
  497. c5 -= c0 * a5;
  498. c5_nxt -= c0_nxt * a5;
  499. c5 -= c1 * a13;
  500. c5_nxt -= c1_nxt * a13;
  501. c5 -= c2 * a21;
  502. c5_nxt -= c2_nxt * a21;
  503. c5 -= c3 * a29;
  504. c5_nxt -= c3_nxt * a29;
  505. c5 -= c4 * a37;
  506. c5_nxt -= c4_nxt * a37;
  507. c5 *= a45;
  508. c5_nxt *= a45;
  509. c6 -= c0 * a6;
  510. c6_nxt -= c0_nxt * a6;
  511. c6 -= c1 * a14;
  512. c6_nxt -= c1_nxt * a14;
  513. c6 -= c2 * a22;
  514. c6_nxt -= c2_nxt * a22;
  515. c6 -= c3 * a30;
  516. c6_nxt -= c3_nxt * a30;
  517. c6 -= c4 * a38;
  518. c6_nxt -= c4_nxt * a38;
  519. c6 -= c5 * a46;
  520. c6_nxt -= c5_nxt * a46;
  521. c6 *= a54;
  522. c6_nxt *= a54;
  523. c7 -= c0 * a7;
  524. c7_nxt -= c0_nxt * a7;
  525. c7 -= c1 * a15;
  526. c7_nxt -= c1_nxt * a15;
  527. c7 -= c2 * a23;
  528. c7_nxt -= c2_nxt * a23;
  529. c7 -= c3 * a31;
  530. c7_nxt -= c3_nxt * a31;
  531. c7 -= c4 * a39;
  532. c7_nxt -= c4_nxt * a39;
  533. c7 -= c5 * a47;
  534. c7_nxt -= c5_nxt * a47;
  535. c7 -= c6 * a55;
  536. c7_nxt -= c6_nxt * a55;
  537. c7 *= a63;
  538. c7_nxt *= a63;
  539. *(c + 0) = c0;
  540. *(c + 1) = c1;
  541. *(c + 2) = c2;
  542. *(c + 3) = c3;
  543. *(c + 4) = c4;
  544. *(c + 5) = c5;
  545. *(c + 6) = c6;
  546. *(c + 7) = c7;
  547. *(c + 0 + ldc) = c0_nxt;
  548. *(c + 1 + ldc) = c1_nxt;
  549. *(c + 2 + ldc) = c2_nxt;
  550. *(c + 3 + ldc) = c3_nxt;
  551. *(c + 4 + ldc) = c4_nxt;
  552. *(c + 5 + ldc) = c5_nxt;
  553. *(c + 6 + ldc) = c6_nxt;
  554. *(c + 7 + ldc) = c7_nxt;
  555. *(b + 0) = c0;
  556. *(b + 1) = c0_nxt;
  557. *(b + 2) = c1;
  558. *(b + 3) = c1_nxt;
  559. *(b + 4) = c2;
  560. *(b + 5) = c2_nxt;
  561. *(b + 6) = c3;
  562. *(b + 7) = c3_nxt;
  563. *(b + 8) = c4;
  564. *(b + 9) = c4_nxt;
  565. *(b + 10) = c5;
  566. *(b + 11) = c5_nxt;
  567. *(b + 12) = c6;
  568. *(b + 13) = c6_nxt;
  569. *(b + 14) = c7;
  570. *(b + 15) = c7_nxt;
  571. }
  572. static void ssolve_8x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
  573. {
  574. BLASLONG k;
  575. FLOAT a0, a1, a2, a3, a4, a5, a6, a7, a9, a10, a11, a12, a13, a14, a15, a18;
  576. FLOAT a19, a20, a21, a22, a23, a27, a28, a29, a30, a31, a36, a37, a38, a39;
  577. FLOAT a45, a46, a47, a54, a55, a63, c0, c1, c2, c3, c4, c5, c6, c7;
  578. c0 = *(c + 0);
  579. c1 = *(c + 1);
  580. c2 = *(c + 2);
  581. c3 = *(c + 3);
  582. c4 = *(c + 4);
  583. c5 = *(c + 5);
  584. c6 = *(c + 6);
  585. c7 = *(c + 7);
  586. for (k = 0; k < bk; k++)
  587. {
  588. c0 -= a[0] * b[0];
  589. c1 -= a[1] * b[0];
  590. c2 -= a[2] * b[0];
  591. c3 -= a[3] * b[0];
  592. c4 -= a[4] * b[0];
  593. c5 -= a[5] * b[0];
  594. c6 -= a[6] * b[0];
  595. c7 -= a[7] * b[0];
  596. a += 8;
  597. b += 1;
  598. }
  599. a0 = *(a + 0);
  600. a1 = *(a + 1);
  601. a2 = *(a + 2);
  602. a3 = *(a + 3);
  603. a4 = *(a + 4);
  604. a5 = *(a + 5);
  605. a6 = *(a + 6);
  606. a7 = *(a + 7);
  607. a9 = *(a + 9);
  608. a10 = *(a + 10);
  609. a11 = *(a + 11);
  610. a12 = *(a + 12);
  611. a13 = *(a + 13);
  612. a14 = *(a + 14);
  613. a15 = *(a + 15);
  614. a18 = *(a + 18);
  615. a19 = *(a + 19);
  616. a20 = *(a + 20);
  617. a21 = *(a + 21);
  618. a22 = *(a + 22);
  619. a23 = *(a + 23);
  620. a27 = *(a + 27);
  621. a28 = *(a + 28);
  622. a29 = *(a + 29);
  623. a30 = *(a + 30);
  624. a31 = *(a + 31);
  625. a36 = *(a + 36);
  626. a37 = *(a + 37);
  627. a38 = *(a + 38);
  628. a39 = *(a + 39);
  629. a45 = *(a + 45);
  630. a46 = *(a + 46);
  631. a47 = *(a + 47);
  632. a54 = *(a + 54);
  633. a55 = *(a + 55);
  634. a63 = *(a + 63);
  635. c0 *= a0;
  636. c1 -= c0 * a1;
  637. c1 *= a9;
  638. c2 -= c0 * a2;
  639. c2 -= c1 * a10;
  640. c2 *= a18;
  641. c3 -= c0 * a3;
  642. c3 -= c1 * a11;
  643. c3 -= c2 * a19;
  644. c3 *= a27;
  645. c4 -= c0 * a4;
  646. c4 -= c1 * a12;
  647. c4 -= c2 * a20;
  648. c4 -= c3 * a28;
  649. c4 *= a36;
  650. c5 -= c0 * a5;
  651. c5 -= c1 * a13;
  652. c5 -= c2 * a21;
  653. c5 -= c3 * a29;
  654. c5 -= c4 * a37;
  655. c5 *= a45;
  656. c6 -= c0 * a6;
  657. c6 -= c1 * a14;
  658. c6 -= c2 * a22;
  659. c6 -= c3 * a30;
  660. c6 -= c4 * a38;
  661. c6 -= c5 * a46;
  662. c6 *= a54;
  663. c7 -= c0 * a7;
  664. c7 -= c1 * a15;
  665. c7 -= c2 * a23;
  666. c7 -= c3 * a31;
  667. c7 -= c4 * a39;
  668. c7 -= c5 * a47;
  669. c7 -= c6 * a55;
  670. c7 *= a63;
  671. *(c + 0) = c0;
  672. *(c + 1) = c1;
  673. *(c + 2) = c2;
  674. *(c + 3) = c3;
  675. *(c + 4) = c4;
  676. *(c + 5) = c5;
  677. *(c + 6) = c6;
  678. *(c + 7) = c7;
  679. *(b + 0) = c0;
  680. *(b + 1) = c1;
  681. *(b + 2) = c2;
  682. *(b + 3) = c3;
  683. *(b + 4) = c4;
  684. *(b + 5) = c5;
  685. *(b + 6) = c6;
  686. *(b + 7) = c7;
  687. }
  688. static void ssolve_4x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  689. {
  690. BLASLONG k;
  691. v4f32 src_b, src_b0, src_b1, src_b2, src_b3;
  692. v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
  693. v4f32 res_c0, res_c1, res_c2, res_c3, res_c4, res_c5, res_c6, res_c7;
  694. v4f32 src_a0, src_a1, src_a2, src_a3, src_a5, src_a6, src_a7;
  695. v4f32 src_a10, src_a11, src_a15, src_a;
  696. FLOAT *c_nxt1line = c + ldc;
  697. FLOAT *c_nxt2line = c + 2 * ldc;
  698. FLOAT *c_nxt3line = c + 3 * ldc;
  699. FLOAT *c_nxt4line = c + 4 * ldc;
  700. FLOAT *c_nxt5line = c + 5 * ldc;
  701. FLOAT *c_nxt6line = c + 6 * ldc;
  702. FLOAT *c_nxt7line = c + 7 * ldc;
  703. src_c0 = LD_SP(c);
  704. src_c1 = LD_SP(c_nxt1line);
  705. src_c2 = LD_SP(c_nxt2line);
  706. src_c3 = LD_SP(c_nxt3line);
  707. src_c4 = LD_SP(c_nxt4line);
  708. src_c5 = LD_SP(c_nxt5line);
  709. src_c6 = LD_SP(c_nxt6line);
  710. src_c7 = LD_SP(c_nxt7line);
  711. for (k = 0; k < (bk >> 1); k++)
  712. {
  713. src_a0 = LD_SP(a);
  714. src_b = LD_SP(b + 0);
  715. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  716. src_c0 -= src_a0 * src_b0;
  717. src_c1 -= src_a0 * src_b1;
  718. src_c2 -= src_a0 * src_b2;
  719. src_c3 -= src_a0 * src_b3;
  720. src_b = LD_SP(b + 4);
  721. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  722. src_c4 -= src_a0 * src_b0;
  723. src_c5 -= src_a0 * src_b1;
  724. src_c6 -= src_a0 * src_b2;
  725. src_c7 -= src_a0 * src_b3;
  726. a += 4;
  727. b += 8;
  728. src_a0 = LD_SP(a);
  729. src_b = LD_SP(b + 0);
  730. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  731. src_c0 -= src_a0 * src_b0;
  732. src_c1 -= src_a0 * src_b1;
  733. src_c2 -= src_a0 * src_b2;
  734. src_c3 -= src_a0 * src_b3;
  735. src_b = LD_SP(b + 4);
  736. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  737. src_c4 -= src_a0 * src_b0;
  738. src_c5 -= src_a0 * src_b1;
  739. src_c6 -= src_a0 * src_b2;
  740. src_c7 -= src_a0 * src_b3;
  741. a += 4;
  742. b += 8;
  743. }
  744. if ((bk & 1) && (bk > 0))
  745. {
  746. src_a0 = LD_SP(a);
  747. src_b = LD_SP(b + 0);
  748. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  749. src_c0 -= src_a0 * src_b0;
  750. src_c1 -= src_a0 * src_b1;
  751. src_c2 -= src_a0 * src_b2;
  752. src_c3 -= src_a0 * src_b3;
  753. src_b = LD_SP(b + 4);
  754. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  755. src_c4 -= src_a0 * src_b0;
  756. src_c5 -= src_a0 * src_b1;
  757. src_c6 -= src_a0 * src_b2;
  758. src_c7 -= src_a0 * src_b3;
  759. a += 4;
  760. b += 8;
  761. }
  762. TRANSPOSE4x4_SP_SP(src_c0, src_c1, src_c2, src_c3,
  763. res_c0, res_c1, res_c2, res_c3);
  764. TRANSPOSE4x4_SP_SP(src_c4, src_c5, src_c6, src_c7,
  765. res_c4, res_c5, res_c6, res_c7);
  766. src_a = LD_SP(a + 0);
  767. SPLATI_W4_SP(src_a, src_a0, src_a1, src_a2, src_a3);
  768. src_a5 = LD_SP(a + 5);
  769. src_a7 = (v4f32) __msa_splati_w((v4i32) src_a5, 2);
  770. src_a6 = (v4f32) __msa_splati_w((v4i32) src_a5, 1);
  771. src_a5 = (v4f32) __msa_splati_w((v4i32) src_a5, 0);
  772. src_a10 = COPY_FLOAT_TO_VECTOR(*(a + 10));
  773. src_a11 = COPY_FLOAT_TO_VECTOR(*(a + 11));
  774. src_a15 = COPY_FLOAT_TO_VECTOR(*(a + 15));
  775. res_c0 *= src_a0;
  776. res_c4 *= src_a0;
  777. res_c1 -= res_c0 * src_a1;
  778. res_c5 -= res_c4 * src_a1;
  779. res_c2 -= res_c0 * src_a2;
  780. res_c6 -= res_c4 * src_a2;
  781. res_c3 -= res_c0 * src_a3;
  782. res_c7 -= res_c4 * src_a3;
  783. res_c1 *= src_a5;
  784. res_c5 *= src_a5;
  785. res_c2 -= res_c1 * src_a6;
  786. res_c6 -= res_c5 * src_a6;
  787. res_c3 -= res_c1 * src_a7;
  788. res_c7 -= res_c5 * src_a7;
  789. res_c2 *= src_a10;
  790. res_c6 *= src_a10;
  791. res_c3 -= res_c2 * src_a11;
  792. res_c7 -= res_c6 * src_a11;
  793. res_c3 *= src_a15;
  794. res_c7 *= src_a15;
  795. ST_SP4(res_c0, res_c4, res_c1, res_c5, b, 4);
  796. ST_SP4(res_c2, res_c6, res_c3, res_c7, b + 16, 4);
  797. TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3,
  798. src_c0, src_c1, src_c2, src_c3);
  799. TRANSPOSE4x4_SP_SP(res_c4, res_c5, res_c6, res_c7,
  800. src_c4, src_c5, src_c6, src_c7);
  801. ST_SP(src_c0, c);
  802. ST_SP(src_c1, c_nxt1line);
  803. ST_SP(src_c2, c_nxt2line);
  804. ST_SP(src_c3, c_nxt3line);
  805. ST_SP(src_c4, c_nxt4line);
  806. ST_SP(src_c5, c_nxt5line);
  807. ST_SP(src_c6, c_nxt6line);
  808. ST_SP(src_c7, c_nxt7line);
  809. }
  810. static void ssolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  811. {
  812. BLASLONG k;
  813. v4f32 src_b, src_b0, src_b1, src_b2, src_b3;
  814. v4f32 src_c0, src_c1, src_c2, src_c3, res_c0, res_c1, res_c2, res_c3;
  815. v4f32 src_a0, src_a1, src_a2, src_a3, src_a5, src_a6, src_a7;
  816. v4f32 src_a10, src_a11, src_a15, src_a;
  817. FLOAT *c_nxt1line = c + ldc;
  818. FLOAT *c_nxt2line = c + 2 * ldc;
  819. FLOAT *c_nxt3line = c + 3 * ldc;
  820. src_c0 = LD_SP(c);
  821. src_c1 = LD_SP(c_nxt1line);
  822. src_c2 = LD_SP(c_nxt2line);
  823. src_c3 = LD_SP(c_nxt3line);
  824. for (k = 0; k < (bk >> 1); k++)
  825. {
  826. src_a0 = LD_SP(a);
  827. src_b = LD_SP(b + 0);
  828. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  829. src_c0 -= src_a0 * src_b0;
  830. src_c1 -= src_a0 * src_b1;
  831. src_c2 -= src_a0 * src_b2;
  832. src_c3 -= src_a0 * src_b3;
  833. a += 4;
  834. b += 4;
  835. src_a0 = LD_SP(a);
  836. src_b = LD_SP(b + 0);
  837. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  838. src_c0 -= src_a0 * src_b0;
  839. src_c1 -= src_a0 * src_b1;
  840. src_c2 -= src_a0 * src_b2;
  841. src_c3 -= src_a0 * src_b3;
  842. a += 4;
  843. b += 4;
  844. }
  845. if ((bk & 1) && (bk > 0))
  846. {
  847. src_a0 = LD_SP(a);
  848. src_b = LD_SP(b + 0);
  849. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  850. src_c0 -= src_a0 * src_b0;
  851. src_c1 -= src_a0 * src_b1;
  852. src_c2 -= src_a0 * src_b2;
  853. src_c3 -= src_a0 * src_b3;
  854. a += 4;
  855. b += 4;
  856. }
  857. TRANSPOSE4x4_SP_SP(src_c0, src_c1, src_c2, src_c3,
  858. res_c0, res_c1, res_c2, res_c3);
  859. src_a = LD_SP(a + 0);
  860. SPLATI_W4_SP(src_a, src_a0, src_a1, src_a2, src_a3);
  861. src_a5 = LD_SP(a + 5);
  862. src_a7 = (v4f32) __msa_splati_w((v4i32) src_a5, 2);
  863. src_a6 = (v4f32) __msa_splati_w((v4i32) src_a5, 1);
  864. src_a5 = (v4f32) __msa_splati_w((v4i32) src_a5, 0);
  865. src_a10 = COPY_FLOAT_TO_VECTOR(*(a + 10));
  866. src_a11 = COPY_FLOAT_TO_VECTOR(*(a + 11));
  867. src_a15 = COPY_FLOAT_TO_VECTOR(*(a + 15));
  868. res_c0 *= src_a0;
  869. res_c1 -= res_c0 * src_a1;
  870. res_c2 -= res_c0 * src_a2;
  871. res_c3 -= res_c0 * src_a3;
  872. res_c1 *= src_a5;
  873. res_c2 -= res_c1 * src_a6;
  874. res_c3 -= res_c1 * src_a7;
  875. res_c2 *= src_a10;
  876. res_c3 -= res_c2 * src_a11;
  877. res_c3 *= src_a15;
  878. ST_SP4(res_c0, res_c1, res_c2, res_c3, b, 4);
  879. TRANSPOSE4x4_SP_SP(res_c0, res_c1, res_c2, res_c3,
  880. src_c0, src_c1, src_c2, src_c3);
  881. ST_SP(src_c0, c);
  882. ST_SP(src_c1, c_nxt1line);
  883. ST_SP(src_c2, c_nxt2line);
  884. ST_SP(src_c3, c_nxt3line);
  885. }
  886. static void ssolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  887. {
  888. BLASLONG k;
  889. FLOAT c0, c1, c2, c3, c0_nxt, c1_nxt, c2_nxt, c3_nxt;
  890. FLOAT a0, a1, a2, a3, a5, a6, a7, a10, a11, a15;
  891. c0 = *(c + 0);
  892. c1 = *(c + 1);
  893. c2 = *(c + 2);
  894. c3 = *(c + 3);
  895. c0_nxt = *(c + 0 + ldc);
  896. c1_nxt = *(c + 1 + ldc);
  897. c2_nxt = *(c + 2 + ldc);
  898. c3_nxt = *(c + 3 + ldc);
  899. for (k = 0; k < bk; k++)
  900. {
  901. c0 -= a[0] * b[0];
  902. c1 -= a[1] * b[0];
  903. c2 -= a[2] * b[0];
  904. c3 -= a[3] * b[0];
  905. c0_nxt -= a[0] * b[1];
  906. c1_nxt -= a[1] * b[1];
  907. c2_nxt -= a[2] * b[1];
  908. c3_nxt -= a[3] * b[1];
  909. a += 4;
  910. b += 2;
  911. }
  912. a0 = *(a + 0);
  913. a1 = *(a + 1);
  914. a2 = *(a + 2);
  915. a3 = *(a + 3);
  916. a5 = *(a + 5);
  917. a6 = *(a + 6);
  918. a7 = *(a + 7);
  919. a10 = *(a + 10);
  920. a11 = *(a + 11);
  921. a15 = *(a + 15);
  922. c0 *= a0;
  923. c0_nxt *= a0;
  924. c1 -= c0 * a1;
  925. c1_nxt -= c0_nxt * a1;
  926. c1 *= a5;
  927. c1_nxt *= a5;
  928. c2 -= c0 * a2;
  929. c2_nxt -= c0_nxt * a2;
  930. c2 -= c1 * a6;
  931. c2_nxt -= c1_nxt * a6;
  932. c2 *= a10;
  933. c2_nxt *= a10;
  934. c3 -= c0 * a3;
  935. c3_nxt -= c0_nxt * a3;
  936. c3 -= c1 * a7;
  937. c3_nxt -= c1_nxt * a7;
  938. c3 -= c2 * a11;
  939. c3_nxt -= c2_nxt * a11;
  940. c3 *= a15;
  941. c3_nxt *= a15;
  942. *(b + 0) = c0;
  943. *(b + 1) = c0_nxt;
  944. *(b + 2) = c1;
  945. *(b + 3) = c1_nxt;
  946. *(b + 4) = c2;
  947. *(b + 5) = c2_nxt;
  948. *(b + 6) = c3;
  949. *(b + 7) = c3_nxt;
  950. *(c + 0) = c0;
  951. *(c + 1) = c1;
  952. *(c + 2) = c2;
  953. *(c + 3) = c3;
  954. *(c + 0 + ldc) = c0_nxt;
  955. *(c + 1 + ldc) = c1_nxt;
  956. *(c + 2 + ldc) = c2_nxt;
  957. *(c + 3 + ldc) = c3_nxt;
  958. }
  959. static void ssolve_4x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
  960. {
  961. BLASLONG k;
  962. FLOAT a0, a1, a2, a3, a5, a6, a7, a10, a11, a15, c0, c1, c2, c3;
  963. c0 = *(c + 0);
  964. c1 = *(c + 1);
  965. c2 = *(c + 2);
  966. c3 = *(c + 3);
  967. for (k = 0; k < bk; k++)
  968. {
  969. c0 -= a[0] * b[0];
  970. c1 -= a[1] * b[0];
  971. c2 -= a[2] * b[0];
  972. c3 -= a[3] * b[0];
  973. a += 4;
  974. b += 1;
  975. }
  976. a0 = *(a + 0);
  977. a1 = *(a + 1);
  978. a2 = *(a + 2);
  979. a3 = *(a + 3);
  980. a5 = *(a + 5);
  981. a6 = *(a + 6);
  982. a7 = *(a + 7);
  983. a10 = *(a + 10);
  984. a11 = *(a + 11);
  985. a15 = *(a + 15);
  986. c0 *= a0;
  987. c1 -= c0 * a1;
  988. c1 *= a5;
  989. c2 -= c0 * a2;
  990. c2 -= c1 * a6;
  991. c2 *= a10;
  992. c3 -= c0 * a3;
  993. c3 -= c1 * a7;
  994. c3 -= c2 * a11;
  995. c3 *= a15;
  996. *(b + 0) = c0;
  997. *(b + 1) = c1;
  998. *(b + 2) = c2;
  999. *(b + 3) = c3;
  1000. *(c + 0) = c0;
  1001. *(c + 1) = c1;
  1002. *(c + 2) = c2;
  1003. *(c + 3) = c3;
  1004. }
  1005. static void ssolve_2x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  1006. {
  1007. BLASLONG k;
  1008. FLOAT a0, a1, a3, c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2;
  1009. FLOAT c0_nxt3, c1_nxt3, c0_nxt4, c1_nxt4, c0_nxt5, c1_nxt5;
  1010. FLOAT c0_nxt6, c1_nxt6, c0_nxt7, c1_nxt7;
  1011. c0 = *(c + 0);
  1012. c1 = *(c + 1);
  1013. c0_nxt1 = *(c + ldc);
  1014. c1_nxt1 = *(c + 1 + ldc);
  1015. c0_nxt2 = *(c + 2 * ldc);
  1016. c1_nxt2 = *(c + 1 + 2 * ldc);
  1017. c0_nxt3 = *(c + 3 * ldc);
  1018. c1_nxt3 = *(c + 1 + 3 * ldc);
  1019. c0_nxt4 = *(c + 4 * ldc);
  1020. c1_nxt4 = *(c + 1 + 4 * ldc);
  1021. c0_nxt5 = *(c + 5 * ldc);
  1022. c1_nxt5 = *(c + 1 + 5 * ldc);
  1023. c0_nxt6 = *(c + 6 * ldc);
  1024. c1_nxt6 = *(c + 1 + 6 * ldc);
  1025. c0_nxt7 = *(c + 7 * ldc);
  1026. c1_nxt7 = *(c + 1 + 7 * ldc);
  1027. for (k = 0; k < bk; k++)
  1028. {
  1029. c0 -= a[0] * b[0];
  1030. c1 -= a[1] * b[0];
  1031. c0_nxt1 -= a[0] * b[1];
  1032. c1_nxt1 -= a[1] * b[1];
  1033. c0_nxt2 -= a[0] * b[2];
  1034. c1_nxt2 -= a[1] * b[2];
  1035. c0_nxt3 -= a[0] * b[3];
  1036. c1_nxt3 -= a[1] * b[3];
  1037. c0_nxt4 -= a[0] * b[4];
  1038. c1_nxt4 -= a[1] * b[4];
  1039. c0_nxt5 -= a[0] * b[5];
  1040. c1_nxt5 -= a[1] * b[5];
  1041. c0_nxt6 -= a[0] * b[6];
  1042. c1_nxt6 -= a[1] * b[6];
  1043. c0_nxt7 -= a[0] * b[7];
  1044. c1_nxt7 -= a[1] * b[7];
  1045. a += 2;
  1046. b += 8;
  1047. }
  1048. a0 = *a;
  1049. a1 = *(a + 1);
  1050. a3 = *(a + 3);
  1051. c0 = c0 * a0;
  1052. c1 = (c1 - c0 * a1) * a3;
  1053. c0_nxt1 = c0_nxt1 * a0;
  1054. c1_nxt1 = (c1_nxt1 - c0_nxt1 * a1) * a3;
  1055. c0_nxt2 = c0_nxt2 * a0;
  1056. c1_nxt2 = (c1_nxt2 - c0_nxt2 * a1) * a3;
  1057. c0_nxt3 = c0_nxt3 * a0;
  1058. c1_nxt3 = (c1_nxt3 - c0_nxt3 * a1) * a3;
  1059. c0_nxt4 = c0_nxt4 * a0;
  1060. c1_nxt4 = (c1_nxt4 - c0_nxt4 * a1) * a3;
  1061. c0_nxt5 = c0_nxt5 * a0;
  1062. c1_nxt5 = (c1_nxt5 - c0_nxt5 * a1) * a3;
  1063. c0_nxt6 = c0_nxt6 * a0;
  1064. c1_nxt6 = (c1_nxt6 - c0_nxt6 * a1) * a3;
  1065. c0_nxt7 = c0_nxt7 * a0;
  1066. c1_nxt7 = (c1_nxt7 - c0_nxt7 * a1) * a3;
  1067. *(b + 0) = c0;
  1068. *(b + 1) = c0_nxt1;
  1069. *(b + 2) = c0_nxt2;
  1070. *(b + 3) = c0_nxt3;
  1071. *(b + 4) = c0_nxt4;
  1072. *(b + 5) = c0_nxt5;
  1073. *(b + 6) = c0_nxt6;
  1074. *(b + 7) = c0_nxt7;
  1075. *(b + 8) = c1;
  1076. *(b + 9) = c1_nxt1;
  1077. *(b + 10) = c1_nxt2;
  1078. *(b + 11) = c1_nxt3;
  1079. *(b + 12) = c1_nxt4;
  1080. *(b + 13) = c1_nxt5;
  1081. *(b + 14) = c1_nxt6;
  1082. *(b + 15) = c1_nxt7;
  1083. *(c + 0) = c0;
  1084. *(c + 1) = c1;
  1085. *(c + 0 + ldc) = c0_nxt1;
  1086. *(c + 1 + ldc) = c1_nxt1;
  1087. *(c + 0 + 2 * ldc) = c0_nxt2;
  1088. *(c + 1 + 2 * ldc) = c1_nxt2;
  1089. *(c + 0 + 3 * ldc) = c0_nxt3;
  1090. *(c + 1 + 3 * ldc) = c1_nxt3;
  1091. *(c + 0 + 4 * ldc) = c0_nxt4;
  1092. *(c + 1 + 4 * ldc) = c1_nxt4;
  1093. *(c + 0 + 5 * ldc) = c0_nxt5;
  1094. *(c + 1 + 5 * ldc) = c1_nxt5;
  1095. *(c + 0 + 6 * ldc) = c0_nxt6;
  1096. *(c + 1 + 6 * ldc) = c1_nxt6;
  1097. *(c + 0 + 7 * ldc) = c0_nxt7;
  1098. *(c + 1 + 7 * ldc) = c1_nxt7;
  1099. }
  1100. static void ssolve_2x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  1101. {
  1102. BLASLONG k;
  1103. FLOAT a0, a1, a3, c0, c1, c0_nxt1, c1_nxt1;
  1104. FLOAT c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3;
  1105. c0 = *(c + 0);
  1106. c1 = *(c + 1);
  1107. c0_nxt1 = *(c + ldc);
  1108. c1_nxt1 = *(c + 1 + ldc);
  1109. c0_nxt2 = *(c + 2 * ldc);
  1110. c1_nxt2 = *(c + 1 + 2 * ldc);
  1111. c0_nxt3 = *(c + 3 * ldc);
  1112. c1_nxt3 = *(c + 1 + 3 * ldc);
  1113. for (k = 0; k < bk; k++)
  1114. {
  1115. c0 -= a[0] * b[0];
  1116. c1 -= a[1] * b[0];
  1117. c0_nxt1 -= a[0] * b[1];
  1118. c1_nxt1 -= a[1] * b[1];
  1119. c0_nxt2 -= a[0] * b[2];
  1120. c1_nxt2 -= a[1] * b[2];
  1121. c0_nxt3 -= a[0] * b[3];
  1122. c1_nxt3 -= a[1] * b[3];
  1123. a += 2;
  1124. b += 4;
  1125. }
  1126. a0 = *a;
  1127. a1 = *(a + 1);
  1128. a3 = *(a + 3);
  1129. c0 *= a0;
  1130. c0_nxt1 *= a0;
  1131. c0_nxt2 *= a0;
  1132. c0_nxt3 *= a0;
  1133. c1 -= c0 * a1;
  1134. c1_nxt1 -= c0_nxt1 * a1;
  1135. c1_nxt2 -= c0_nxt2 * a1;
  1136. c1_nxt3 -= c0_nxt3 * a1;
  1137. c1 *= a3;
  1138. c1_nxt1 *= a3;
  1139. c1_nxt2 *= a3;
  1140. c1_nxt3 *= a3;
  1141. *(b + 0) = c0;
  1142. *(b + 1) = c0_nxt1;
  1143. *(b + 2) = c0_nxt2;
  1144. *(b + 3) = c0_nxt3;
  1145. *(b + 4) = c1;
  1146. *(b + 5) = c1_nxt1;
  1147. *(b + 6) = c1_nxt2;
  1148. *(b + 7) = c1_nxt3;
  1149. *(c + 0) = c0;
  1150. *(c + 1) = c1;
  1151. *(c + 0 + ldc) = c0_nxt1;
  1152. *(c + 1 + ldc) = c1_nxt1;
  1153. *(c + 0 + 2 * ldc) = c0_nxt2;
  1154. *(c + 1 + 2 * ldc) = c1_nxt2;
  1155. *(c + 0 + 3 * ldc) = c0_nxt3;
  1156. *(c + 1 + 3 * ldc) = c1_nxt3;
  1157. }
  1158. static void ssolve_2x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  1159. {
  1160. BLASLONG k;
  1161. FLOAT a0, a1, a3, c0, c1, c0_nxt, c1_nxt;
  1162. c0 = *(c + 0);
  1163. c1 = *(c + 1);
  1164. c0_nxt = *(c + ldc);
  1165. c1_nxt = *(c + 1 + ldc);
  1166. for (k = 0; k < bk; k++)
  1167. {
  1168. c0 -= a[0] * b[0];
  1169. c1 -= a[1] * b[0];
  1170. c0_nxt -= a[0] * b[1];
  1171. c1_nxt -= a[1] * b[1];
  1172. a += 2;
  1173. b += 2;
  1174. }
  1175. a0 = *a;
  1176. a1 = *(a + 1);
  1177. a3 = *(a + 3);
  1178. c0 *= a0;
  1179. c0_nxt *= a0;
  1180. c1 -= c0 * a1;
  1181. c1_nxt -= c0_nxt * a1;
  1182. c1 *= a3;
  1183. c1_nxt *= a3;
  1184. *(b + 0) = c0;
  1185. *(b + 1) = c0_nxt;
  1186. *(b + 2) = c1;
  1187. *(b + 3) = c1_nxt;
  1188. *(c + 0) = c0;
  1189. *(c + 1) = c1;
  1190. *(c + 0 + ldc) = c0_nxt;
  1191. *(c + 1 + ldc) = c1_nxt;
  1192. }
  1193. static void ssolve_2x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
  1194. {
  1195. BLASLONG k;
  1196. FLOAT c0, c1;
  1197. c0 = *(c + 0);
  1198. c1 = *(c + 1);
  1199. for (k = 0; k < bk; k++)
  1200. {
  1201. c0 -= a[0] * b[0];
  1202. c1 -= a[1] * b[0];
  1203. a += 2;
  1204. b += 1;
  1205. }
  1206. c0 *= *(a + 0);
  1207. c1 -= c0 * *(a + 1);
  1208. c1 *= *(a + 3);
  1209. *(b + 0) = c0;
  1210. *(b + 1) = c1;
  1211. *(c + 0) = c0;
  1212. *(c + 1) = c1;
  1213. }
  1214. static void ssolve_1x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  1215. {
  1216. BLASLONG k;
  1217. FLOAT c0, c1, c2, c3, c4, c5, c6, c7;
  1218. c0 = *(c + 0);
  1219. c1 = *(c + 1 * ldc);
  1220. c2 = *(c + 2 * ldc);
  1221. c3 = *(c + 3 * ldc);
  1222. c4 = *(c + 4 * ldc);
  1223. c5 = *(c + 5 * ldc);
  1224. c6 = *(c + 6 * ldc);
  1225. c7 = *(c + 7 * ldc);
  1226. for (k = 0; k < bk; k++)
  1227. {
  1228. c0 -= a[0] * b[0];
  1229. c1 -= a[0] * b[1];
  1230. c2 -= a[0] * b[2];
  1231. c3 -= a[0] * b[3];
  1232. c4 -= a[0] * b[4];
  1233. c5 -= a[0] * b[5];
  1234. c6 -= a[0] * b[6];
  1235. c7 -= a[0] * b[7];
  1236. a += 1;
  1237. b += 8;
  1238. }
  1239. c0 *= *a;
  1240. c1 *= *a;
  1241. c2 *= *a;
  1242. c3 *= *a;
  1243. c4 *= *a;
  1244. c5 *= *a;
  1245. c6 *= *a;
  1246. c7 *= *a;
  1247. *(b + 0) = c0;
  1248. *(b + 1) = c1;
  1249. *(b + 2) = c2;
  1250. *(b + 3) = c3;
  1251. *(b + 4) = c4;
  1252. *(b + 5) = c5;
  1253. *(b + 6) = c6;
  1254. *(b + 7) = c7;
  1255. *(c + 0) = c0;
  1256. *(c + 1 * ldc) = c1;
  1257. *(c + 2 * ldc) = c2;
  1258. *(c + 3 * ldc) = c3;
  1259. *(c + 4 * ldc) = c4;
  1260. *(c + 5 * ldc) = c5;
  1261. *(c + 6 * ldc) = c6;
  1262. *(c + 7 * ldc) = c7;
  1263. }
  1264. static void ssolve_1x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  1265. {
  1266. BLASLONG k;
  1267. FLOAT c0, c1, c2, c3;
  1268. c0 = *(c + 0 * ldc);
  1269. c1 = *(c + 1 * ldc);
  1270. c2 = *(c + 2 * ldc);
  1271. c3 = *(c + 3 * ldc);
  1272. for (k = 0; k < bk; k++)
  1273. {
  1274. c0 -= a[0] * b[0];
  1275. c1 -= a[0] * b[1];
  1276. c2 -= a[0] * b[2];
  1277. c3 -= a[0] * b[3];
  1278. a += 1;
  1279. b += 4;
  1280. }
  1281. c0 *= *a;
  1282. c1 *= *a;
  1283. c2 *= *a;
  1284. c3 *= *a;
  1285. *c = c0;
  1286. *(c + ldc) = c1;
  1287. *(c + 2 * ldc) = c2;
  1288. *(c + 3 * ldc) = c3;
  1289. *b = *c;
  1290. *(b + 1) = *(c + ldc);
  1291. *(b + 2) = *(c + 2 * ldc);
  1292. *(b + 3) = *(c + 3 * ldc);
  1293. }
  1294. static void ssolve_1x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  1295. {
  1296. BLASLONG k;
  1297. FLOAT c0, c1;
  1298. c0 = *c;
  1299. c1 = *(c + ldc);
  1300. for (k = 0; k < bk; k++)
  1301. {
  1302. c0 -= a[0] * b[0];
  1303. c1 -= a[0] * b[1];
  1304. a += 1;
  1305. b += 2;
  1306. }
  1307. *c = c0 * *a;
  1308. *(c + ldc) = c1 * *a;
  1309. *b = *c;
  1310. *(b + 1) = *(c + ldc);
  1311. }
  1312. static void ssolve_1x1_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
  1313. {
  1314. BLASLONG k;
  1315. for (k = 0; k < bk; k++)
  1316. {
  1317. *c -= a[0] * b[0];
  1318. a++;
  1319. b++;
  1320. }
  1321. *c *= *a;
  1322. *b = *c;
  1323. }
  1324. int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
  1325. FLOAT *c, BLASLONG ldc, BLASLONG offset)
  1326. {
  1327. FLOAT *aa, *cc;
  1328. BLASLONG i, j, kk;
  1329. for (j = (n >> 3); j--;)
  1330. {
  1331. kk = offset;
  1332. aa = a;
  1333. cc = c;
  1334. for (i = (m >> 3); i--;)
  1335. {
  1336. ssolve_8x8_lt_msa(aa, b, cc, ldc, kk);
  1337. aa += 8 * k;
  1338. cc += 8;
  1339. kk += 8;
  1340. }
  1341. if (m & 7)
  1342. {
  1343. if (m & 4)
  1344. {
  1345. ssolve_4x8_lt_msa(aa, b, cc, ldc, kk);
  1346. aa += 4 * k;
  1347. cc += 4;
  1348. kk += 4;
  1349. }
  1350. if (m & 2)
  1351. {
  1352. ssolve_2x8_lt_msa(aa, b, cc, ldc, kk);
  1353. aa += 2 * k;
  1354. cc += 2;
  1355. kk += 2;
  1356. }
  1357. if (m & 1)
  1358. {
  1359. ssolve_1x8_lt_msa(aa, b, cc, ldc, kk);
  1360. aa += k;
  1361. cc += 1;
  1362. kk += 1;
  1363. }
  1364. }
  1365. b += 8 * k;
  1366. c += 8 * ldc;
  1367. }
  1368. if (n & 7)
  1369. {
  1370. if (n & 4)
  1371. {
  1372. kk = offset;
  1373. aa = a;
  1374. cc = c;
  1375. for (i = (m >> 3); i--;)
  1376. {
  1377. ssolve_8x4_lt_msa(aa, b, cc, ldc, kk);
  1378. aa += 8 * k;
  1379. cc += 8;
  1380. kk += 8;
  1381. }
  1382. if (m & 7)
  1383. {
  1384. if (m & 4)
  1385. {
  1386. ssolve_4x4_lt_msa(aa, b, cc, ldc, kk);
  1387. aa += 4 * k;
  1388. cc += 4;
  1389. kk += 4;
  1390. }
  1391. if (m & 2)
  1392. {
  1393. ssolve_2x4_lt_msa(aa, b, cc, ldc, kk);
  1394. aa += 2 * k;
  1395. cc += 2;
  1396. kk += 2;
  1397. }
  1398. if (m & 1)
  1399. {
  1400. ssolve_1x4_lt_msa(aa, b, cc, ldc, kk);
  1401. aa += k;
  1402. cc += 1;
  1403. kk += 1;
  1404. }
  1405. }
  1406. b += 4 * k;
  1407. c += 4 * ldc;
  1408. }
  1409. if (n & 2)
  1410. {
  1411. kk = offset;
  1412. aa = a;
  1413. cc = c;
  1414. for (i = (m >> 3); i--;)
  1415. {
  1416. ssolve_8x2_lt_msa(aa, b, cc, ldc, kk);
  1417. aa += 8 * k;
  1418. cc += 8;
  1419. kk += 8;
  1420. }
  1421. if (m & 7)
  1422. {
  1423. if (m & 4)
  1424. {
  1425. ssolve_4x2_lt_msa(aa, b, cc, ldc, kk);
  1426. aa += 4 * k;
  1427. cc += 4;
  1428. kk += 4;
  1429. }
  1430. if (m & 2)
  1431. {
  1432. ssolve_2x2_lt_msa(aa, b, cc, ldc, kk);
  1433. aa += 2 * k;
  1434. cc += 2;
  1435. kk += 2;
  1436. }
  1437. if (m & 1)
  1438. {
  1439. ssolve_1x2_lt_msa(aa, b, cc, ldc, kk);
  1440. aa += k;
  1441. cc += 1;
  1442. kk += 1;
  1443. }
  1444. }
  1445. b += 2 * k;
  1446. c += 2 * ldc;
  1447. }
  1448. if (n & 1)
  1449. {
  1450. kk = offset;
  1451. aa = a;
  1452. cc = c;
  1453. for (i = (m >> 3); i--;)
  1454. {
  1455. ssolve_8x1_lt_msa(aa, b, cc, kk);
  1456. aa += 8 * k;
  1457. cc += 8;
  1458. kk += 8;
  1459. }
  1460. if (m & 7)
  1461. {
  1462. if (m & 4)
  1463. {
  1464. ssolve_4x1_lt_msa(aa, b, cc, kk);
  1465. aa += 4 * k;
  1466. cc += 4;
  1467. kk += 4;
  1468. }
  1469. if (m & 2)
  1470. {
  1471. ssolve_2x1_lt_msa(aa, b, cc, kk);
  1472. aa += 2 * k;
  1473. cc += 2;
  1474. kk += 2;
  1475. }
  1476. if (m & 1)
  1477. {
  1478. ssolve_1x1_lt_msa(aa, b, cc, kk);
  1479. aa += k;
  1480. cc += 1;
  1481. kk += 1;
  1482. }
  1483. }
  1484. b += k;
  1485. c += ldc;
  1486. }
  1487. }
  1488. return 0;
  1489. }