You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

strsm_kernel_RN_8x8_msa.c 44 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769
  1. /*******************************************************************************
  2. Copyright (c) 2016, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #include "common.h"
  28. #include "macros_msa.h"
  29. static void ssolve_8x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  30. {
  31. v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
  32. v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
  33. v4f32 src_b0, src_b1, src_b2, src_b3, src_b4, src_b5, src_b6, src_b7;
  34. v4f32 src_b9, src_b10, src_b11, src_b12, src_b13, src_b14, src_b15, src_b18;
  35. v4f32 src_b19, src_b20, src_b21, src_b22, src_b23, src_b27, src_b28;
  36. v4f32 src_b29, src_b30, src_b31, src_b36, src_b37, src_b38, src_b39;
  37. v4f32 src_b45, src_b46, src_b47, src_b54, src_b55, src_b63, src_b;
  38. FLOAT *c_nxt1line = c + ldc;
  39. FLOAT *c_nxt2line = c + 2 * ldc;
  40. FLOAT *c_nxt3line = c + 3 * ldc;
  41. FLOAT *c_nxt4line = c + 4 * ldc;
  42. FLOAT *c_nxt5line = c + 5 * ldc;
  43. FLOAT *c_nxt6line = c + 6 * ldc;
  44. FLOAT *c_nxt7line = c + 7 * ldc;
  45. LD_SP2(c, 4, src_c0, src_c1);
  46. LD_SP2(c_nxt1line, 4, src_c2, src_c3);
  47. LD_SP2(c_nxt2line, 4, src_c4, src_c5);
  48. LD_SP2(c_nxt3line, 4, src_c6, src_c7);
  49. LD_SP2(c_nxt4line, 4, src_c8, src_c9);
  50. LD_SP2(c_nxt5line, 4, src_c10, src_c11);
  51. LD_SP2(c_nxt6line, 4, src_c12, src_c13);
  52. LD_SP2(c_nxt7line, 4, src_c14, src_c15);
  53. if (bk > 0)
  54. {
  55. BLASLONG k, pref_offset;
  56. FLOAT *pa0_pref;
  57. v4f32 src_a0, src_a1, src_bb0, src_bb1;
  58. pref_offset = (uintptr_t)a & (L1_DATA_LINESIZE - 1);
  59. if (pref_offset)
  60. {
  61. pref_offset = L1_DATA_LINESIZE - pref_offset;
  62. pref_offset = pref_offset / sizeof(FLOAT);
  63. }
  64. pa0_pref = a + pref_offset;
  65. for (k = 0; k < (bk >> 1); k++)
  66. {
  67. PREF_OFFSET(pa0_pref, 64);
  68. PREF_OFFSET(pa0_pref, 96);
  69. LD_SP2_INC(a, 4, src_a0, src_a1);
  70. LD_SP2_INC(b, 4, src_bb0, src_bb1);
  71. SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
  72. src_c0 -= src_a0 * src_b0;
  73. src_c1 -= src_a1 * src_b0;
  74. src_c2 -= src_a0 * src_b1;
  75. src_c3 -= src_a1 * src_b1;
  76. src_c4 -= src_a0 * src_b2;
  77. src_c5 -= src_a1 * src_b2;
  78. src_c6 -= src_a0 * src_b3;
  79. src_c7 -= src_a1 * src_b3;
  80. SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
  81. src_c8 -= src_a0 * src_b0;
  82. src_c9 -= src_a1 * src_b0;
  83. src_c10 -= src_a0 * src_b1;
  84. src_c11 -= src_a1 * src_b1;
  85. src_c12 -= src_a0 * src_b2;
  86. src_c13 -= src_a1 * src_b2;
  87. src_c14 -= src_a0 * src_b3;
  88. src_c15 -= src_a1 * src_b3;
  89. LD_SP2_INC(a, 4, src_a0, src_a1);
  90. LD_SP2_INC(b, 4, src_bb0, src_bb1);
  91. SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
  92. src_c0 -= src_a0 * src_b0;
  93. src_c1 -= src_a1 * src_b0;
  94. src_c2 -= src_a0 * src_b1;
  95. src_c3 -= src_a1 * src_b1;
  96. src_c4 -= src_a0 * src_b2;
  97. src_c5 -= src_a1 * src_b2;
  98. src_c6 -= src_a0 * src_b3;
  99. src_c7 -= src_a1 * src_b3;
  100. SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
  101. src_c8 -= src_a0 * src_b0;
  102. src_c9 -= src_a1 * src_b0;
  103. src_c10 -= src_a0 * src_b1;
  104. src_c11 -= src_a1 * src_b1;
  105. src_c12 -= src_a0 * src_b2;
  106. src_c13 -= src_a1 * src_b2;
  107. src_c14 -= src_a0 * src_b3;
  108. src_c15 -= src_a1 * src_b3;
  109. pa0_pref += 16;
  110. }
  111. if (bk & 1)
  112. {
  113. LD_SP2_INC(a, 4, src_a0, src_a1);
  114. LD_SP2_INC(b, 4, src_bb0, src_bb1);
  115. SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
  116. src_c0 -= src_a0 * src_b0;
  117. src_c1 -= src_a1 * src_b0;
  118. src_c2 -= src_a0 * src_b1;
  119. src_c3 -= src_a1 * src_b1;
  120. src_c4 -= src_a0 * src_b2;
  121. src_c5 -= src_a1 * src_b2;
  122. src_c6 -= src_a0 * src_b3;
  123. src_c7 -= src_a1 * src_b3;
  124. SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
  125. src_c8 -= src_a0 * src_b0;
  126. src_c9 -= src_a1 * src_b0;
  127. src_c10 -= src_a0 * src_b1;
  128. src_c11 -= src_a1 * src_b1;
  129. src_c12 -= src_a0 * src_b2;
  130. src_c13 -= src_a1 * src_b2;
  131. src_c14 -= src_a0 * src_b3;
  132. src_c15 -= src_a1 * src_b3;
  133. }
  134. }
  135. src_b = LD_SP(b + 0);
  136. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  137. src_b = LD_SP(b + 4);
  138. SPLATI_W4_SP(src_b, src_b4, src_b5, src_b6, src_b7);
  139. src_b = LD_SP(b + 9);
  140. SPLATI_W4_SP(src_b, src_b9, src_b10, src_b11, src_b12);
  141. src_b13 = LD_SP(b + 13);
  142. src_b15 = (v4f32) __msa_splati_w((v4i32) src_b13, 2);
  143. src_b14 = (v4f32) __msa_splati_w((v4i32) src_b13, 1);
  144. src_b13 = (v4f32) __msa_splati_w((v4i32) src_b13, 0);
  145. src_c0 *= src_b0;
  146. src_c1 *= src_b0;
  147. src_c2 -= src_c0 * src_b1;
  148. src_c3 -= src_c1 * src_b1;
  149. src_c4 -= src_c0 * src_b2;
  150. src_c5 -= src_c1 * src_b2;
  151. src_c6 -= src_c0 * src_b3;
  152. src_c7 -= src_c1 * src_b3;
  153. src_c8 -= src_c0 * src_b4;
  154. src_c9 -= src_c1 * src_b4;
  155. src_c10 -= src_c0 * src_b5;
  156. src_c11 -= src_c1 * src_b5;
  157. src_c12 -= src_c0 * src_b6;
  158. src_c13 -= src_c1 * src_b6;
  159. src_c14 -= src_c0 * src_b7;
  160. src_c15 -= src_c1 * src_b7;
  161. ST_SP2(src_c0, src_c1, a, 4);
  162. ST_SP2(src_c0, src_c1, c, 4);
  163. src_c2 *= src_b9;
  164. src_c3 *= src_b9;
  165. src_c4 -= src_c2 * src_b10;
  166. src_c5 -= src_c3 * src_b10;
  167. src_c6 -= src_c2 * src_b11;
  168. src_c7 -= src_c3 * src_b11;
  169. src_c8 -= src_c2 * src_b12;
  170. src_c9 -= src_c3 * src_b12;
  171. src_c10 -= src_c2 * src_b13;
  172. src_c11 -= src_c3 * src_b13;
  173. src_c12 -= src_c2 * src_b14;
  174. src_c13 -= src_c3 * src_b14;
  175. src_c14 -= src_c2 * src_b15;
  176. src_c15 -= src_c3 * src_b15;
  177. ST_SP2(src_c2, src_c3, a + 8, 4);
  178. ST_SP2(src_c2, src_c3, c_nxt1line, 4);
  179. src_b = LD_SP(b + 18);
  180. SPLATI_W4_SP(src_b, src_b18, src_b19, src_b20, src_b21);
  181. src_b22 = LD_SP(b + 22);
  182. src_b23 = (v4f32) __msa_splati_w((v4i32) src_b22, 1);
  183. src_b22 = (v4f32) __msa_splati_w((v4i32) src_b22, 0);
  184. src_b = LD_SP(b + 27);
  185. SPLATI_W4_SP(src_b, src_b27, src_b28, src_b29, src_b30);
  186. src_b31 = COPY_FLOAT_TO_VECTOR(*(b + 31));
  187. src_c4 *= src_b18;
  188. src_c5 *= src_b18;
  189. src_c6 -= src_c4 * src_b19;
  190. src_c7 -= src_c5 * src_b19;
  191. src_c8 -= src_c4 * src_b20;
  192. src_c9 -= src_c5 * src_b20;
  193. src_c10 -= src_c4 * src_b21;
  194. src_c11 -= src_c5 * src_b21;
  195. src_c12 -= src_c4 * src_b22;
  196. src_c13 -= src_c5 * src_b22;
  197. src_c14 -= src_c4 * src_b23;
  198. src_c15 -= src_c5 * src_b23;
  199. ST_SP2(src_c4, src_c5, a + 16, 4);
  200. ST_SP2(src_c4, src_c5, c_nxt2line, 4);
  201. src_c6 *= src_b27;
  202. src_c7 *= src_b27;
  203. src_c8 -= src_c6 * src_b28;
  204. src_c9 -= src_c7 * src_b28;
  205. src_c10 -= src_c6 * src_b29;
  206. src_c11 -= src_c7 * src_b29;
  207. src_c12 -= src_c6 * src_b30;
  208. src_c13 -= src_c7 * src_b30;
  209. src_c14 -= src_c6 * src_b31;
  210. src_c15 -= src_c7 * src_b31;
  211. ST_SP2(src_c6, src_c7, a + 24, 4);
  212. ST_SP2(src_c6, src_c7, c_nxt3line, 4);
  213. src_b = LD_SP(b + 36);
  214. SPLATI_W4_SP(src_b, src_b36, src_b37, src_b38, src_b39);
  215. src_b45 = LD_SP(b + 45);
  216. src_b47 = (v4f32) __msa_splati_w((v4i32) src_b45, 2);
  217. src_b46 = (v4f32) __msa_splati_w((v4i32) src_b45, 1);
  218. src_b45 = (v4f32) __msa_splati_w((v4i32) src_b45, 0);
  219. src_b54 = COPY_FLOAT_TO_VECTOR(*(b + 54));
  220. src_b55 = COPY_FLOAT_TO_VECTOR(*(b + 55));
  221. src_b63 = COPY_FLOAT_TO_VECTOR(*(b + 63));
  222. src_c8 *= src_b36;
  223. src_c9 *= src_b36;
  224. src_c10 -= src_c8 * src_b37;
  225. src_c11 -= src_c9 * src_b37;
  226. src_c12 -= src_c8 * src_b38;
  227. src_c13 -= src_c9 * src_b38;
  228. src_c14 -= src_c8 * src_b39;
  229. src_c15 -= src_c9 * src_b39;
  230. ST_SP2(src_c8, src_c9, a + 32, 4);
  231. ST_SP2(src_c8, src_c9, c_nxt4line, 4);
  232. src_c10 *= src_b45;
  233. src_c11 *= src_b45;
  234. src_c12 -= src_c10 * src_b46;
  235. src_c13 -= src_c11 * src_b46;
  236. src_c14 -= src_c10 * src_b47;
  237. src_c15 -= src_c11 * src_b47;
  238. ST_SP2(src_c10, src_c11, a + 40, 4);
  239. ST_SP2(src_c10, src_c11, c_nxt5line, 4);
  240. src_c12 *= src_b54;
  241. src_c13 *= src_b54;
  242. src_c14 -= src_c12 * src_b55;
  243. src_c15 -= src_c13 * src_b55;
  244. ST_SP2(src_c12, src_c13, a + 48, 4);
  245. ST_SP2(src_c12, src_c13, c_nxt6line, 4);
  246. src_c14 *= src_b63;
  247. src_c15 *= src_b63;
  248. ST_SP2(src_c14, src_c15, a + 56, 4);
  249. ST_SP2(src_c14, src_c15, c_nxt7line, 4);
  250. }
  251. static void ssolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  252. {
  253. BLASLONG k;
  254. v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
  255. v4f32 src_b0, src_b1, src_b2, src_b3, src_b5, src_b6, src_b7;
  256. v4f32 src_b10, src_b11, src_b15, src_b, src_a0, src_a1;
  257. FLOAT *c_nxt1line = c + ldc;
  258. FLOAT *c_nxt2line = c + 2 * ldc;
  259. FLOAT *c_nxt3line = c + 3 * ldc;
  260. LD_SP2(c, 4, src_c0, src_c1);
  261. LD_SP2(c_nxt1line, 4, src_c2, src_c3);
  262. LD_SP2(c_nxt2line, 4, src_c4, src_c5);
  263. LD_SP2(c_nxt3line, 4, src_c6, src_c7);
  264. for (k = 0; k < (bk >> 1); k++)
  265. {
  266. LD_SP2(a, 4, src_a0, src_a1);
  267. src_b = LD_SP(b + 0);
  268. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  269. src_c0 -= src_a0 * src_b0;
  270. src_c1 -= src_a1 * src_b0;
  271. src_c2 -= src_a0 * src_b1;
  272. src_c3 -= src_a1 * src_b1;
  273. src_c4 -= src_a0 * src_b2;
  274. src_c5 -= src_a1 * src_b2;
  275. src_c6 -= src_a0 * src_b3;
  276. src_c7 -= src_a1 * src_b3;
  277. a += 8;
  278. b += 4;
  279. LD_SP2(a, 4, src_a0, src_a1);
  280. src_b = LD_SP(b + 0);
  281. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  282. src_c0 -= src_a0 * src_b0;
  283. src_c1 -= src_a1 * src_b0;
  284. src_c2 -= src_a0 * src_b1;
  285. src_c3 -= src_a1 * src_b1;
  286. src_c4 -= src_a0 * src_b2;
  287. src_c5 -= src_a1 * src_b2;
  288. src_c6 -= src_a0 * src_b3;
  289. src_c7 -= src_a1 * src_b3;
  290. a += 8;
  291. b += 4;
  292. }
  293. if ((bk & 1) && (bk > 0))
  294. {
  295. LD_SP2(a, 4, src_a0, src_a1);
  296. src_b = LD_SP(b + 0);
  297. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  298. src_c0 -= src_a0 * src_b0;
  299. src_c1 -= src_a1 * src_b0;
  300. src_c2 -= src_a0 * src_b1;
  301. src_c3 -= src_a1 * src_b1;
  302. src_c4 -= src_a0 * src_b2;
  303. src_c5 -= src_a1 * src_b2;
  304. src_c6 -= src_a0 * src_b3;
  305. src_c7 -= src_a1 * src_b3;
  306. a += 8;
  307. b += 4;
  308. }
  309. src_b = LD_SP(b + 0);
  310. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  311. src_b5 = LD_SP(b + 5);
  312. src_b7 = (v4f32) __msa_splati_w((v4i32) src_b5, 2);
  313. src_b6 = (v4f32) __msa_splati_w((v4i32) src_b5, 1);
  314. src_b5 = (v4f32) __msa_splati_w((v4i32) src_b5, 0);
  315. src_b10 = COPY_FLOAT_TO_VECTOR(*(b + 10));
  316. src_b11 = COPY_FLOAT_TO_VECTOR(*(b + 11));
  317. src_b15 = COPY_FLOAT_TO_VECTOR(*(b + 15));
  318. src_c0 *= src_b0;
  319. src_c1 *= src_b0;
  320. src_c2 -= src_c0 * src_b1;
  321. src_c3 -= src_c1 * src_b1;
  322. src_c4 -= src_c0 * src_b2;
  323. src_c5 -= src_c1 * src_b2;
  324. src_c6 -= src_c0 * src_b3;
  325. src_c7 -= src_c1 * src_b3;
  326. src_c2 *= src_b5;
  327. src_c3 *= src_b5;
  328. src_c4 -= src_c2 * src_b6;
  329. src_c5 -= src_c3 * src_b6;
  330. src_c6 -= src_c2 * src_b7;
  331. src_c7 -= src_c3 * src_b7;
  332. src_c4 *= src_b10;
  333. src_c5 *= src_b10;
  334. src_c6 -= src_c4 * src_b11;
  335. src_c7 -= src_c5 * src_b11;
  336. src_c6 *= src_b15;
  337. src_c7 *= src_b15;
  338. ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4);
  339. ST_SP4(src_c4, src_c5, src_c6, src_c7, a + 16, 4);
  340. ST_SP2(src_c0, src_c1, c, 4);
  341. ST_SP2(src_c2, src_c3, c_nxt1line, 4);
  342. ST_SP2(src_c4, src_c5, c_nxt2line, 4);
  343. ST_SP2(src_c6, src_c7, c_nxt3line, 4);
  344. }
  345. static void ssolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  346. {
  347. BLASLONG k;
  348. v4f32 src_a0, src_a1;
  349. v4f32 src_c0, src_c1, src_c2, src_c3, src_b0, src_b1, src_b3;
  350. FLOAT *c_nxt1line = c + ldc;
  351. LD_SP2(c, 4, src_c0, src_c1);
  352. LD_SP2(c_nxt1line, 4, src_c2, src_c3);
  353. for (k = 0; k < (bk >> 1); k++)
  354. {
  355. LD_SP2(a, 4, src_a0, src_a1);
  356. src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
  357. src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1));
  358. src_c0 -= src_a0 * src_b0;
  359. src_c1 -= src_a1 * src_b0;
  360. src_c2 -= src_a0 * src_b1;
  361. src_c3 -= src_a1 * src_b1;
  362. a += 8;
  363. b += 2;
  364. LD_SP2(a, 4, src_a0, src_a1);
  365. src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
  366. src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1));
  367. src_c0 -= src_a0 * src_b0;
  368. src_c1 -= src_a1 * src_b0;
  369. src_c2 -= src_a0 * src_b1;
  370. src_c3 -= src_a1 * src_b1;
  371. a += 8;
  372. b += 2;
  373. }
  374. if ((bk & 1) && (bk > 0))
  375. {
  376. LD_SP2(a, 4, src_a0, src_a1);
  377. src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
  378. src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1));
  379. src_c0 -= src_a0 * src_b0;
  380. src_c1 -= src_a1 * src_b0;
  381. src_c2 -= src_a0 * src_b1;
  382. src_c3 -= src_a1 * src_b1;
  383. a += 8;
  384. b += 2;
  385. }
  386. src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
  387. src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1));
  388. src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3));
  389. src_c0 *= src_b0;
  390. src_c1 *= src_b0;
  391. src_c2 -= src_c0 * src_b1;
  392. src_c3 -= src_c1 * src_b1;
  393. src_c2 *= src_b3;
  394. src_c3 *= src_b3;
  395. ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4);
  396. ST_SP2(src_c0, src_c1, c, 4);
  397. ST_SP2(src_c2, src_c3, c_nxt1line, 4);
  398. }
  399. static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  400. {
  401. BLASLONG k;
  402. v4f32 src_a0, src_a1, src_c0, src_c1, src_b0;
  403. LD_SP2(c, 4, src_c0, src_c1);
  404. for (k = 0; k < (bk >> 2); k++)
  405. {
  406. LD_SP2(a, 4, src_a0, src_a1);
  407. src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
  408. src_c0 -= src_a0 * src_b0;
  409. src_c1 -= src_a1 * src_b0;
  410. a += 8;
  411. b += 1;
  412. LD_SP2(a, 4, src_a0, src_a1);
  413. src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
  414. src_c0 -= src_a0 * src_b0;
  415. src_c1 -= src_a1 * src_b0;
  416. a += 8;
  417. b += 1;
  418. LD_SP2(a, 4, src_a0, src_a1);
  419. src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
  420. src_c0 -= src_a0 * src_b0;
  421. src_c1 -= src_a1 * src_b0;
  422. a += 8;
  423. b += 1;
  424. LD_SP2(a, 4, src_a0, src_a1);
  425. src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
  426. src_c0 -= src_a0 * src_b0;
  427. src_c1 -= src_a1 * src_b0;
  428. a += 8;
  429. b += 1;
  430. }
  431. if ((bk & 3) && (bk > 0))
  432. {
  433. if (bk & 2)
  434. {
  435. LD_SP2(a, 4, src_a0, src_a1);
  436. src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
  437. src_c0 -= src_a0 * src_b0;
  438. src_c1 -= src_a1 * src_b0;
  439. a += 8;
  440. b += 1;
  441. LD_SP2(a, 4, src_a0, src_a1);
  442. src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
  443. src_c0 -= src_a0 * src_b0;
  444. src_c1 -= src_a1 * src_b0;
  445. a += 8;
  446. b += 1;
  447. }
  448. if (bk & 1)
  449. {
  450. LD_SP2(a, 4, src_a0, src_a1);
  451. src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
  452. src_c0 -= src_a0 * src_b0;
  453. src_c1 -= src_a1 * src_b0;
  454. a += 8;
  455. b += 1;
  456. }
  457. }
  458. src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
  459. src_c0 *= src_b0;
  460. src_c1 *= src_b0;
  461. ST_SP2(src_c0, src_c1, a, 4);
  462. ST_SP2(src_c0, src_c1, c, 4);
  463. }
  464. static void ssolve_4x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  465. {
  466. BLASLONG k;
  467. v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
  468. v4f32 src_b0, src_b1, src_b2, src_b3, src_b4, src_b5, src_b6, src_b7;
  469. v4f32 src_b9, src_b10, src_b11, src_b12, src_b13, src_b14, src_b15, src_b18;
  470. v4f32 src_b19, src_b20, src_b21, src_b22, src_b23, src_b27, src_b28;
  471. v4f32 src_b29, src_b30, src_b31, src_b36, src_b37, src_b38, src_b39;
  472. v4f32 src_b45, src_b46, src_b47, src_b54, src_b55, src_b63, src_b, src_a0;
  473. FLOAT *c_nxt1line = c + ldc;
  474. FLOAT *c_nxt2line = c + 2 * ldc;
  475. FLOAT *c_nxt3line = c + 3 * ldc;
  476. FLOAT *c_nxt4line = c + 4 * ldc;
  477. FLOAT *c_nxt5line = c + 5 * ldc;
  478. FLOAT *c_nxt6line = c + 6 * ldc;
  479. FLOAT *c_nxt7line = c + 7 * ldc;
  480. src_c0 = LD_SP(c);
  481. src_c1 = LD_SP(c_nxt1line);
  482. src_c2 = LD_SP(c_nxt2line);
  483. src_c3 = LD_SP(c_nxt3line);
  484. src_c4 = LD_SP(c_nxt4line);
  485. src_c5 = LD_SP(c_nxt5line);
  486. src_c6 = LD_SP(c_nxt6line);
  487. src_c7 = LD_SP(c_nxt7line);
  488. for (k = 0; k < bk; k++)
  489. {
  490. src_a0 = LD_SP(a);
  491. src_b = LD_SP(b + 0);
  492. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  493. src_c0 -= src_a0 * src_b0;
  494. src_c1 -= src_a0 * src_b1;
  495. src_c2 -= src_a0 * src_b2;
  496. src_c3 -= src_a0 * src_b3;
  497. src_b = LD_SP(b + 4);
  498. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  499. src_c4 -= src_a0 * src_b0;
  500. src_c5 -= src_a0 * src_b1;
  501. src_c6 -= src_a0 * src_b2;
  502. src_c7 -= src_a0 * src_b3;
  503. a += 4;
  504. b += 8;
  505. }
  506. src_b = LD_SP(b + 0);
  507. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  508. src_b = LD_SP(b + 4);
  509. SPLATI_W4_SP(src_b, src_b4, src_b5, src_b6, src_b7);
  510. src_b = LD_SP(b + 9);
  511. SPLATI_W4_SP(src_b, src_b9, src_b10, src_b11, src_b12);
  512. src_b13 = LD_SP(b + 13);
  513. src_b15 = (v4f32) __msa_splati_w((v4i32) src_b13, 2);
  514. src_b14 = (v4f32) __msa_splati_w((v4i32) src_b13, 1);
  515. src_b13 = (v4f32) __msa_splati_w((v4i32) src_b13, 0);
  516. src_b = LD_SP(b + 18);
  517. SPLATI_W4_SP(src_b, src_b18, src_b19, src_b20, src_b21);
  518. src_b22 = LD_SP(b + 22);
  519. src_b23 = (v4f32) __msa_splati_w((v4i32) src_b22, 1);
  520. src_b22 = (v4f32) __msa_splati_w((v4i32) src_b22, 0);
  521. src_b = LD_SP(b + 27);
  522. SPLATI_W4_SP(src_b, src_b27, src_b28, src_b29, src_b30);
  523. src_b31 = COPY_FLOAT_TO_VECTOR(*(b + 31));
  524. src_b = LD_SP(b + 36);
  525. SPLATI_W4_SP(src_b, src_b36, src_b37, src_b38, src_b39);
  526. src_b45 = LD_SP(b + 45);
  527. src_b47 = (v4f32) __msa_splati_w((v4i32) src_b45, 2);
  528. src_b46 = (v4f32) __msa_splati_w((v4i32) src_b45, 1);
  529. src_b45 = (v4f32) __msa_splati_w((v4i32) src_b45, 0);
  530. src_b54 = COPY_FLOAT_TO_VECTOR(*(b + 54));
  531. src_b55 = COPY_FLOAT_TO_VECTOR(*(b + 55));
  532. src_b63 = COPY_FLOAT_TO_VECTOR(*(b + 63));
  533. src_c0 *= src_b0;
  534. src_c1 -= src_c0 * src_b1;
  535. src_c2 -= src_c0 * src_b2;
  536. src_c3 -= src_c0 * src_b3;
  537. src_c4 -= src_c0 * src_b4;
  538. src_c5 -= src_c0 * src_b5;
  539. src_c6 -= src_c0 * src_b6;
  540. src_c7 -= src_c0 * src_b7;
  541. src_c1 *= src_b9;
  542. src_c2 -= src_c1 * src_b10;
  543. src_c3 -= src_c1 * src_b11;
  544. src_c4 -= src_c1 * src_b12;
  545. src_c5 -= src_c1 * src_b13;
  546. src_c6 -= src_c1 * src_b14;
  547. src_c7 -= src_c1 * src_b15;
  548. src_c2 *= src_b18;
  549. src_c3 -= src_c2 * src_b19;
  550. src_c4 -= src_c2 * src_b20;
  551. src_c5 -= src_c2 * src_b21;
  552. src_c6 -= src_c2 * src_b22;
  553. src_c7 -= src_c2 * src_b23;
  554. src_c3 *= src_b27;
  555. src_c4 -= src_c3 * src_b28;
  556. src_c5 -= src_c3 * src_b29;
  557. src_c6 -= src_c3 * src_b30;
  558. src_c7 -= src_c3 * src_b31;
  559. src_c4 *= src_b36;
  560. src_c5 -= src_c4 * src_b37;
  561. src_c6 -= src_c4 * src_b38;
  562. src_c7 -= src_c4 * src_b39;
  563. src_c5 *= src_b45;
  564. src_c6 -= src_c5 * src_b46;
  565. src_c7 -= src_c5 * src_b47;
  566. src_c6 *= src_b54;
  567. src_c7 -= src_c6 * src_b55;
  568. src_c7 *= src_b63;
  569. ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4);
  570. ST_SP4(src_c4, src_c5, src_c6, src_c7, a + 16, 4);
  571. ST_SP(src_c0, c);
  572. ST_SP(src_c1, c_nxt1line);
  573. ST_SP(src_c2, c_nxt2line);
  574. ST_SP(src_c3, c_nxt3line);
  575. ST_SP(src_c4, c_nxt4line);
  576. ST_SP(src_c5, c_nxt5line);
  577. ST_SP(src_c6, c_nxt6line);
  578. ST_SP(src_c7, c_nxt7line);
  579. }
  580. static void ssolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  581. {
  582. BLASLONG k;
  583. v4f32 src_c0, src_c1, src_c2, src_c3, src_b0, src_b1, src_b2, src_b3;
  584. v4f32 src_b5, src_b6, src_b7, src_b10, src_b11, src_b15, src_b, src_a0;
  585. FLOAT *c_nxt1line = c + ldc;
  586. FLOAT *c_nxt2line = c + 2 * ldc;
  587. FLOAT *c_nxt3line = c + 3 * ldc;
  588. src_c0 = LD_SP(c);
  589. src_c1 = LD_SP(c_nxt1line);
  590. src_c2 = LD_SP(c_nxt2line);
  591. src_c3 = LD_SP(c_nxt3line);
  592. for (k = 0; k < (bk >> 1); k++)
  593. {
  594. src_a0 = LD_SP(a);
  595. src_b = LD_SP(b + 0);
  596. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  597. src_c0 -= src_a0 * src_b0;
  598. src_c1 -= src_a0 * src_b1;
  599. src_c2 -= src_a0 * src_b2;
  600. src_c3 -= src_a0 * src_b3;
  601. a += 4;
  602. b += 4;
  603. src_a0 = LD_SP(a);
  604. src_b = LD_SP(b + 0);
  605. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  606. src_c0 -= src_a0 * src_b0;
  607. src_c1 -= src_a0 * src_b1;
  608. src_c2 -= src_a0 * src_b2;
  609. src_c3 -= src_a0 * src_b3;
  610. a += 4;
  611. b += 4;
  612. }
  613. if ((bk & 1) && (bk > 0))
  614. {
  615. src_a0 = LD_SP(a);
  616. src_b = LD_SP(b + 0);
  617. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  618. src_c0 -= src_a0 * src_b0;
  619. src_c1 -= src_a0 * src_b1;
  620. src_c2 -= src_a0 * src_b2;
  621. src_c3 -= src_a0 * src_b3;
  622. a += 4;
  623. b += 4;
  624. }
  625. src_b = LD_SP(b + 0);
  626. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  627. src_b5 = LD_SP(b + 5);
  628. src_b7 = (v4f32) __msa_splati_w((v4i32) src_b5, 2);
  629. src_b6 = (v4f32) __msa_splati_w((v4i32) src_b5, 1);
  630. src_b5 = (v4f32) __msa_splati_w((v4i32) src_b5, 0);
  631. src_b10 = COPY_FLOAT_TO_VECTOR(*(b + 10));
  632. src_b11 = COPY_FLOAT_TO_VECTOR(*(b + 11));
  633. src_b15 = COPY_FLOAT_TO_VECTOR(*(b + 15));
  634. src_c0 *= src_b0;
  635. src_c1 -= src_c0 * src_b1;
  636. src_c2 -= src_c0 * src_b2;
  637. src_c3 -= src_c0 * src_b3;
  638. src_c1 *= src_b5;
  639. src_c2 -= src_c1 * src_b6;
  640. src_c3 -= src_c1 * src_b7;
  641. src_c2 *= src_b10;
  642. src_c3 -= src_c2 * src_b11;
  643. src_c3 *= src_b15;
  644. ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4);
  645. ST_SP(src_c0, c);
  646. ST_SP(src_c1, c_nxt1line);
  647. ST_SP(src_c2, c_nxt2line);
  648. ST_SP(src_c3, c_nxt3line);
  649. }
  650. static void ssolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  651. {
  652. BLASLONG k;
  653. v4f32 src_a, src_c0, src_c1, src_b0, src_b1, src_b3;
  654. FLOAT *c_nxt1line = c + ldc;
  655. src_c0 = LD_SP(c);
  656. src_c1 = LD_SP(c_nxt1line);
  657. for (k = 0; k < (bk >> 2); k++)
  658. {
  659. src_a = LD_SP(a);
  660. src_b0 = LD_SP(b);
  661. src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
  662. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  663. src_c0 -= src_a * src_b0;
  664. src_c1 -= src_a * src_b1;
  665. a += 4;
  666. b += 2;
  667. src_a = LD_SP(a);
  668. src_b0 = LD_SP(b);
  669. src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
  670. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  671. src_c0 -= src_a * src_b0;
  672. src_c1 -= src_a * src_b1;
  673. a += 4;
  674. b += 2;
  675. src_a = LD_SP(a);
  676. src_b0 = LD_SP(b);
  677. src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
  678. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  679. src_c0 -= src_a * src_b0;
  680. src_c1 -= src_a * src_b1;
  681. a += 4;
  682. b += 2;
  683. src_a = LD_SP(a);
  684. src_b0 = LD_SP(b);
  685. src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
  686. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  687. src_c0 -= src_a * src_b0;
  688. src_c1 -= src_a * src_b1;
  689. a += 4;
  690. b += 2;
  691. }
  692. if ((bk & 3) && (bk > 0))
  693. {
  694. if (bk & 2)
  695. {
  696. src_a = LD_SP(a);
  697. src_b0 = LD_SP(b);
  698. src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
  699. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  700. src_c0 -= src_a * src_b0;
  701. src_c1 -= src_a * src_b1;
  702. a += 4;
  703. b += 2;
  704. src_a = LD_SP(a);
  705. src_b0 = LD_SP(b);
  706. src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
  707. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  708. src_c0 -= src_a * src_b0;
  709. src_c1 -= src_a * src_b1;
  710. a += 4;
  711. b += 2;
  712. }
  713. if (bk & 1)
  714. {
  715. src_a = LD_SP(a);
  716. src_b0 = LD_SP(b);
  717. src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
  718. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  719. src_c0 -= src_a * src_b0;
  720. src_c1 -= src_a * src_b1;
  721. a += 4;
  722. b += 2;
  723. }
  724. }
  725. src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
  726. src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1));
  727. src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3));
  728. src_c0 *= src_b0;
  729. src_c1 -= src_c0 * src_b1;
  730. src_c1 *= src_b3;
  731. ST_SP2(src_c0, src_c1, a, 4);
  732. ST_SP(src_c0, c);
  733. ST_SP(src_c1, c_nxt1line);
  734. }
  735. static void ssolve_4x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  736. {
  737. BLASLONG k;
  738. FLOAT b0, c0, c1, c2, c3;
  739. c0 = *(c + 0);
  740. c1 = *(c + 1);
  741. c2 = *(c + 2);
  742. c3 = *(c + 3);
  743. for (k = 0; k < bk; k++)
  744. {
  745. c0 -= a[0] * b[0];
  746. c1 -= a[1] * b[0];
  747. c2 -= a[2] * b[0];
  748. c3 -= a[3] * b[0];
  749. a += 4;
  750. b += 1;
  751. }
  752. b0 = *(b + 0);
  753. c0 *= b0;
  754. c1 *= b0;
  755. c2 *= b0;
  756. c3 *= b0;
  757. *(a + 0) = c0;
  758. *(a + 1) = c1;
  759. *(a + 2) = c2;
  760. *(a + 3) = c3;
  761. *(c + 0) = c0;
  762. *(c + 1) = c1;
  763. *(c + 2) = c2;
  764. *(c + 3) = c3;
  765. }
  766. static void ssolve_2x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  767. {
  768. BLASLONG k;
  769. FLOAT b0, b1, b2, b3, b4, b5, b6, b7, b9, b10, b11, b12, b13, b14, b15;
  770. FLOAT b18, b19, b20, b21, b22, b23, b27, b28, b29, b30, b31;
  771. FLOAT b36, b37, b38, b39, b45, b46, b47, b54, b55, b63;
  772. FLOAT c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3;
  773. FLOAT c0_nxt4, c1_nxt4, c0_nxt5, c1_nxt5, c0_nxt6, c1_nxt6;
  774. FLOAT c0_nxt7, c1_nxt7;
  775. c0 = *(c + 0);
  776. c1 = *(c + 1);
  777. c0_nxt1 = *(c + 0 + 1 * ldc);
  778. c1_nxt1 = *(c + 1 + 1 * ldc);
  779. c0_nxt2 = *(c + 0 + 2 * ldc);
  780. c1_nxt2 = *(c + 1 + 2 * ldc);
  781. c0_nxt3 = *(c + 0 + 3 * ldc);
  782. c1_nxt3 = *(c + 1 + 3 * ldc);
  783. c0_nxt4 = *(c + 0 + 4 * ldc);
  784. c1_nxt4 = *(c + 1 + 4 * ldc);
  785. c0_nxt5 = *(c + 0 + 5 * ldc);
  786. c1_nxt5 = *(c + 1 + 5 * ldc);
  787. c0_nxt6 = *(c + 0 + 6 * ldc);
  788. c1_nxt6 = *(c + 1 + 6 * ldc);
  789. c0_nxt7 = *(c + 0 + 7 * ldc);
  790. c1_nxt7 = *(c + 1 + 7 * ldc);
  791. for (k = 0; k < bk; k++)
  792. {
  793. c0 -= a[0] * b[0];
  794. c1 -= a[1] * b[0];
  795. c0_nxt1 -= a[0] * b[1];
  796. c1_nxt1 -= a[1] * b[1];
  797. c0_nxt2 -= a[0] * b[2];
  798. c1_nxt2 -= a[1] * b[2];
  799. c0_nxt3 -= a[0] * b[3];
  800. c1_nxt3 -= a[1] * b[3];
  801. c0_nxt4 -= a[0] * b[4];
  802. c1_nxt4 -= a[1] * b[4];
  803. c0_nxt5 -= a[0] * b[5];
  804. c1_nxt5 -= a[1] * b[5];
  805. c0_nxt6 -= a[0] * b[6];
  806. c1_nxt6 -= a[1] * b[6];
  807. c0_nxt7 -= a[0] * b[7];
  808. c1_nxt7 -= a[1] * b[7];
  809. a += 2;
  810. b += 8;
  811. }
  812. b0 = *(b + 0);
  813. b1 = *(b + 1);
  814. b2 = *(b + 2);
  815. b3 = *(b + 3);
  816. b4 = *(b + 4);
  817. b5 = *(b + 5);
  818. b6 = *(b + 6);
  819. b7 = *(b + 7);
  820. b9 = *(b + 9);
  821. b10 = *(b + 10);
  822. b11 = *(b + 11);
  823. b12 = *(b + 12);
  824. b13 = *(b + 13);
  825. b14 = *(b + 14);
  826. b15 = *(b + 15);
  827. b18 = *(b + 18);
  828. b19 = *(b + 19);
  829. b20 = *(b + 20);
  830. b21 = *(b + 21);
  831. b22 = *(b + 22);
  832. b23 = *(b + 23);
  833. b27 = *(b + 27);
  834. b28 = *(b + 28);
  835. b29 = *(b + 29);
  836. b30 = *(b + 30);
  837. b31 = *(b + 31);
  838. b36 = *(b + 36);
  839. b37 = *(b + 37);
  840. b38 = *(b + 38);
  841. b39 = *(b + 39);
  842. b45 = *(b + 45);
  843. b46 = *(b + 46);
  844. b47 = *(b + 47);
  845. b54 = *(b + 54);
  846. b55 = *(b + 55);
  847. b63 = *(b + 63);
  848. c0 *= b0;
  849. c1 *= b0;
  850. c0_nxt1 -= c0 * b1;
  851. c1_nxt1 -= c1 * b1;
  852. c0_nxt2 -= c0 * b2;
  853. c1_nxt2 -= c1 * b2;
  854. c0_nxt3 -= c0 * b3;
  855. c1_nxt3 -= c1 * b3;
  856. c0_nxt4 -= c0 * b4;
  857. c1_nxt4 -= c1 * b4;
  858. c0_nxt5 -= c0 * b5;
  859. c1_nxt5 -= c1 * b5;
  860. c0_nxt6 -= c0 * b6;
  861. c1_nxt6 -= c1 * b6;
  862. c0_nxt7 -= c0 * b7;
  863. c1_nxt7 -= c1 * b7;
  864. c0_nxt1 *= b9;
  865. c1_nxt1 *= b9;
  866. c0_nxt2 -= c0_nxt1 * b10;
  867. c1_nxt2 -= c1_nxt1 * b10;
  868. c0_nxt3 -= c0_nxt1 * b11;
  869. c1_nxt3 -= c1_nxt1 * b11;
  870. c0_nxt4 -= c0_nxt1 * b12;
  871. c1_nxt4 -= c1_nxt1 * b12;
  872. c0_nxt5 -= c0_nxt1 * b13;
  873. c1_nxt5 -= c1_nxt1 * b13;
  874. c0_nxt6 -= c0_nxt1 * b14;
  875. c1_nxt6 -= c1_nxt1 * b14;
  876. c0_nxt7 -= c0_nxt1 * b15;
  877. c1_nxt7 -= c1_nxt1 * b15;
  878. c0_nxt2 *= b18;
  879. c1_nxt2 *= b18;
  880. c0_nxt3 -= c0_nxt2 * b19;
  881. c1_nxt3 -= c1_nxt2 * b19;
  882. c0_nxt4 -= c0_nxt2 * b20;
  883. c1_nxt4 -= c1_nxt2 * b20;
  884. c0_nxt5 -= c0_nxt2 * b21;
  885. c1_nxt5 -= c1_nxt2 * b21;
  886. c0_nxt6 -= c0_nxt2 * b22;
  887. c1_nxt6 -= c1_nxt2 * b22;
  888. c0_nxt7 -= c0_nxt2 * b23;
  889. c1_nxt7 -= c1_nxt2 * b23;
  890. c0_nxt3 *= b27;
  891. c1_nxt3 *= b27;
  892. c0_nxt4 -= c0_nxt3 * b28;
  893. c1_nxt4 -= c1_nxt3 * b28;
  894. c0_nxt5 -= c0_nxt3 * b29;
  895. c1_nxt5 -= c1_nxt3 * b29;
  896. c0_nxt6 -= c0_nxt3 * b30;
  897. c1_nxt6 -= c1_nxt3 * b30;
  898. c0_nxt7 -= c0_nxt3 * b31;
  899. c1_nxt7 -= c1_nxt3 * b31;
  900. c0_nxt4 *= b36;
  901. c1_nxt4 *= b36;
  902. c0_nxt5 -= c0_nxt4 * b37;
  903. c1_nxt5 -= c1_nxt4 * b37;
  904. c0_nxt6 -= c0_nxt4 * b38;
  905. c1_nxt6 -= c1_nxt4 * b38;
  906. c0_nxt7 -= c0_nxt4 * b39;
  907. c1_nxt7 -= c1_nxt4 * b39;
  908. c0_nxt5 *= b45;
  909. c1_nxt5 *= b45;
  910. c0_nxt6 -= c0_nxt5 * b46;
  911. c1_nxt6 -= c1_nxt5 * b46;
  912. c0_nxt7 -= c0_nxt5 * b47;
  913. c1_nxt7 -= c1_nxt5 * b47;
  914. c0_nxt6 *= b54;
  915. c1_nxt6 *= b54;
  916. c0_nxt7 -= c0_nxt6 * b55;
  917. c1_nxt7 -= c1_nxt6 * b55;
  918. c0_nxt7 *= b63;
  919. c1_nxt7 *= b63;
  920. *(a + 0) = c0;
  921. *(a + 1) = c1;
  922. *(a + 2) = c0_nxt1;
  923. *(a + 3) = c1_nxt1;
  924. *(a + 4) = c0_nxt2;
  925. *(a + 5) = c1_nxt2;
  926. *(a + 6) = c0_nxt3;
  927. *(a + 7) = c1_nxt3;
  928. *(a + 8) = c0_nxt4;
  929. *(a + 9) = c1_nxt4;
  930. *(a + 10) = c0_nxt5;
  931. *(a + 11) = c1_nxt5;
  932. *(a + 12) = c0_nxt6;
  933. *(a + 13) = c1_nxt6;
  934. *(a + 14) = c0_nxt7;
  935. *(a + 15) = c1_nxt7;
  936. *(c + 0) = c0;
  937. *(c + 1) = c1;
  938. *(c + 0 + 1 * ldc) = c0_nxt1;
  939. *(c + 1 + 1 * ldc) = c1_nxt1;
  940. *(c + 0 + 2 * ldc) = c0_nxt2;
  941. *(c + 1 + 2 * ldc) = c1_nxt2;
  942. *(c + 0 + 3 * ldc) = c0_nxt3;
  943. *(c + 1 + 3 * ldc) = c1_nxt3;
  944. *(c + 0 + 4 * ldc) = c0_nxt4;
  945. *(c + 1 + 4 * ldc) = c1_nxt4;
  946. *(c + 0 + 5 * ldc) = c0_nxt5;
  947. *(c + 1 + 5 * ldc) = c1_nxt5;
  948. *(c + 0 + 6 * ldc) = c0_nxt6;
  949. *(c + 1 + 6 * ldc) = c1_nxt6;
  950. *(c + 0 + 7 * ldc) = c0_nxt7;
  951. *(c + 1 + 7 * ldc) = c1_nxt7;
  952. }
  953. static void ssolve_2x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  954. {
  955. BLASLONG k;
  956. FLOAT b0, b1, b2, b3, b5, b6, b7, b10, b11, b15, c0, c1;
  957. FLOAT c0_nxt1, c0_nxt2, c0_nxt3, c1_nxt1, c1_nxt2, c1_nxt3;
  958. c0 = *(c + 0);
  959. c1 = *(c + 1);
  960. c0_nxt1 = *(c + 0 + 1 * ldc);
  961. c1_nxt1 = *(c + 1 + 1 * ldc);
  962. c0_nxt2 = *(c + 0 + 2 * ldc);
  963. c1_nxt2 = *(c + 1 + 2 * ldc);
  964. c0_nxt3 = *(c + 0 + 3 * ldc);
  965. c1_nxt3 = *(c + 1 + 3 * ldc);
  966. for (k = 0; k < bk; k++)
  967. {
  968. c0 -= a[0] * b[0];
  969. c1 -= a[1] * b[0];
  970. c0_nxt1 -= a[0] * b[1];
  971. c1_nxt1 -= a[1] * b[1];
  972. c0_nxt2 -= a[0] * b[2];
  973. c1_nxt2 -= a[1] * b[2];
  974. c0_nxt3 -= a[0] * b[3];
  975. c1_nxt3 -= a[1] * b[3];
  976. a += 2;
  977. b += 4;
  978. }
  979. b0 = *(b + 0);
  980. b1 = *(b + 1);
  981. b2 = *(b + 2);
  982. b3 = *(b + 3);
  983. b5 = *(b + 5);
  984. b6 = *(b + 6);
  985. b7 = *(b + 7);
  986. b10 = *(b + 10);
  987. b11 = *(b + 11);
  988. b15 = *(b + 15);
  989. c0 *= b0;
  990. c1 *= b0;
  991. c0_nxt1 -= c0 * b1;
  992. c1_nxt1 -= c1 * b1;
  993. c0_nxt1 *= b5;
  994. c1_nxt1 *= b5;
  995. c0_nxt2 -= c0 * b2;
  996. c1_nxt2 -= c1 * b2;
  997. c0_nxt2 -= c0_nxt1 * b6;
  998. c1_nxt2 -= c1_nxt1 * b6;
  999. c0_nxt2 *= b10;
  1000. c1_nxt2 *= b10;
  1001. c0_nxt3 -= c0 * b3;
  1002. c1_nxt3 -= c1 * b3;
  1003. c0_nxt3 -= c0_nxt1 * b7;
  1004. c1_nxt3 -= c1_nxt1 * b7;
  1005. c0_nxt3 -= c0_nxt2 * b11;
  1006. c1_nxt3 -= c1_nxt2 * b11;
  1007. c0_nxt3 *= b15;
  1008. c1_nxt3 *= b15;
  1009. *(a + 0) = c0;
  1010. *(a + 1) = c1;
  1011. *(a + 2) = c0_nxt1;
  1012. *(a + 3) = c1_nxt1;
  1013. *(a + 4) = c0_nxt2;
  1014. *(a + 5) = c1_nxt2;
  1015. *(a + 6) = c0_nxt3;
  1016. *(a + 7) = c1_nxt3;
  1017. *(c + 0) = c0;
  1018. *(c + 1) = c1;
  1019. *(c + 1 * ldc) = c0_nxt1;
  1020. *(c + 1 + 1 * ldc) = c1_nxt1;
  1021. *(c + 2 * ldc) = c0_nxt2;
  1022. *(c + 1 + 2 * ldc) = c1_nxt2;
  1023. *(c + 3 * ldc) = c0_nxt3;
  1024. *(c + 1 + 3 * ldc) = c1_nxt3;
  1025. }
  1026. static void ssolve_2x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  1027. {
  1028. BLASLONG k;
  1029. FLOAT b0, b1, b3, c0, c0_nxt, c1, c1_nxt;
  1030. c0 = *(c + 0);
  1031. c1 = *(c + 1);
  1032. c0_nxt = *(c + 0 + ldc);
  1033. c1_nxt = *(c + 1 + ldc);
  1034. for (k = 0; k < bk; k++)
  1035. {
  1036. c0 -= a[0] * b[0];
  1037. c1 -= a[1] * b[0];
  1038. c0_nxt -= a[0] * b[1];
  1039. c1_nxt -= a[1] * b[1];
  1040. a += 2;
  1041. b += 2;
  1042. }
  1043. b0 = *(b + 0);
  1044. b1 = *(b + 1);
  1045. b3 = *(b + 3);
  1046. c0 *= b0;
  1047. c1 *= b0;
  1048. c0_nxt -= c0 * b1;
  1049. c1_nxt -= c1 * b1;
  1050. c0_nxt *= b3;
  1051. c1_nxt *= b3;
  1052. *(a + 0) = c0;
  1053. *(a + 1) = c1;
  1054. *(a + 2) = c0_nxt;
  1055. *(a + 3) = c1_nxt;
  1056. *(c + 0) = c0;
  1057. *(c + 1) = c1;
  1058. *(c + ldc) = c0_nxt;
  1059. *(c + 1 + ldc) = c1_nxt;
  1060. }
  1061. static void ssolve_2x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  1062. {
  1063. BLASLONG k;
  1064. FLOAT b0, c0, c1;
  1065. c0 = *(c + 0);
  1066. c1 = *(c + 1);
  1067. for (k = 0; k < bk; k++)
  1068. {
  1069. c0 -= a[0] * b[0];
  1070. c1 -= a[1] * b[0];
  1071. a += 2;
  1072. b += 1;
  1073. }
  1074. b0 = *(b + 0);
  1075. c0 *= b0;
  1076. c1 *= b0;
  1077. *(a + 0) = c0;
  1078. *(a + 1) = c1;
  1079. *(c + 0) = c0;
  1080. *(c + 1) = c1;
  1081. }
  1082. static void ssolve_1x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  1083. {
  1084. BLASLONG k;
  1085. FLOAT b0, b1, b2, b3, b4, b5, b6, b7, b9, b10, b11, b12, b13, b14, b15;
  1086. FLOAT b18, b19, b20, b21, b22, b23, b27, b28, b29, b30, b31, b36, b37, b38;
  1087. FLOAT b39, b45, b46, b47, b54, b55, b63, c0, c1, c2, c3, c4, c5, c6, c7;
  1088. c0 = *(c + 0);
  1089. c1 = *(c + 1 * ldc);
  1090. c2 = *(c + 2 * ldc);
  1091. c3 = *(c + 3 * ldc);
  1092. c4 = *(c + 4 * ldc);
  1093. c5 = *(c + 5 * ldc);
  1094. c6 = *(c + 6 * ldc);
  1095. c7 = *(c + 7 * ldc);
  1096. for (k = 0; k < bk; k++)
  1097. {
  1098. c0 -= a[0] * b[0];
  1099. c1 -= a[0] * b[1];
  1100. c2 -= a[0] * b[2];
  1101. c3 -= a[0] * b[3];
  1102. c4 -= a[0] * b[4];
  1103. c5 -= a[0] * b[5];
  1104. c6 -= a[0] * b[6];
  1105. c7 -= a[0] * b[7];
  1106. a += 1;
  1107. b += 8;
  1108. }
  1109. b0 = *(b + 0);
  1110. b1 = *(b + 1);
  1111. b2 = *(b + 2);
  1112. b3 = *(b + 3);
  1113. b4 = *(b + 4);
  1114. b5 = *(b + 5);
  1115. b6 = *(b + 6);
  1116. b7 = *(b + 7);
  1117. b9 = *(b + 9);
  1118. b10 = *(b + 10);
  1119. b11 = *(b + 11);
  1120. b12 = *(b + 12);
  1121. b13 = *(b + 13);
  1122. b14 = *(b + 14);
  1123. b15 = *(b + 15);
  1124. b18 = *(b + 18);
  1125. b19 = *(b + 19);
  1126. b20 = *(b + 20);
  1127. b21 = *(b + 21);
  1128. b22 = *(b + 22);
  1129. b23 = *(b + 23);
  1130. b27 = *(b + 27);
  1131. b28 = *(b + 28);
  1132. b29 = *(b + 29);
  1133. b30 = *(b + 30);
  1134. b31 = *(b + 31);
  1135. b36 = *(b + 36);
  1136. b37 = *(b + 37);
  1137. b38 = *(b + 38);
  1138. b39 = *(b + 39);
  1139. b45 = *(b + 45);
  1140. b46 = *(b + 46);
  1141. b47 = *(b + 47);
  1142. b54 = *(b + 54);
  1143. b55 = *(b + 55);
  1144. b63 = *(b + 63);
  1145. c0 *= b0;
  1146. c1 -= c0 * b1;
  1147. c1 *= b9;
  1148. c2 -= c0 * b2;
  1149. c2 -= c1 * b10;
  1150. c2 *= b18;
  1151. c3 -= c0 * b3;
  1152. c3 -= c1 * b11;
  1153. c3 -= c2 * b19;
  1154. c3 *= b27;
  1155. c4 -= c0 * b4;
  1156. c4 -= c1 * b12;
  1157. c4 -= c2 * b20;
  1158. c4 -= c3 * b28;
  1159. c4 *= b36;
  1160. c5 -= c0 * b5;
  1161. c5 -= c1 * b13;
  1162. c5 -= c2 * b21;
  1163. c5 -= c3 * b29;
  1164. c5 -= c4 * b37;
  1165. c5 *= b45;
  1166. c6 -= c0 * b6;
  1167. c6 -= c1 * b14;
  1168. c6 -= c2 * b22;
  1169. c6 -= c3 * b30;
  1170. c6 -= c4 * b38;
  1171. c6 -= c5 * b46;
  1172. c6 *= b54;
  1173. c7 -= c0 * b7;
  1174. c7 -= c1 * b15;
  1175. c7 -= c2 * b23;
  1176. c7 -= c3 * b31;
  1177. c7 -= c4 * b39;
  1178. c7 -= c5 * b47;
  1179. c7 -= c6 * b55;
  1180. c7 *= b63;
  1181. *(a + 0) = c0;
  1182. *(a + 1) = c1;
  1183. *(a + 2) = c2;
  1184. *(a + 3) = c3;
  1185. *(a + 4) = c4;
  1186. *(a + 5) = c5;
  1187. *(a + 6) = c6;
  1188. *(a + 7) = c7;
  1189. *(c + 0) = c0;
  1190. *(c + 1 * ldc) = c1;
  1191. *(c + 2 * ldc) = c2;
  1192. *(c + 3 * ldc) = c3;
  1193. *(c + 4 * ldc) = c4;
  1194. *(c + 5 * ldc) = c5;
  1195. *(c + 6 * ldc) = c6;
  1196. *(c + 7 * ldc) = c7;
  1197. }
  1198. static void ssolve_1x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  1199. {
  1200. BLASLONG k;
  1201. FLOAT b0, b1, b2, b3, b5, b6, b7, b10, b11, b15, c0, c1, c2, c3;
  1202. c0 = *(c + 0);
  1203. c1 = *(c + 1 * ldc);
  1204. c2 = *(c + 2 * ldc);
  1205. c3 = *(c + 3 * ldc);
  1206. for (k = 0; k < bk; k++)
  1207. {
  1208. c0 -= a[0] * b[0];
  1209. c1 -= a[0] * b[1];
  1210. c2 -= a[0] * b[2];
  1211. c3 -= a[0] * b[3];
  1212. a += 1;
  1213. b += 4;
  1214. }
  1215. b0 = *(b + 0);
  1216. b1 = *(b + 1);
  1217. b2 = *(b + 2);
  1218. b3 = *(b + 3);
  1219. b5 = *(b + 5);
  1220. b6 = *(b + 6);
  1221. b7 = *(b + 7);
  1222. b10 = *(b + 10);
  1223. b11 = *(b + 11);
  1224. b15 = *(b + 15);
  1225. c0 *= b0;
  1226. c1 -= c0 * b1;
  1227. c1 *= b5;
  1228. c2 -= c0 * b2;
  1229. c2 -= c1 * b6;
  1230. c2 *= b10;
  1231. c3 -= c0 * b3;
  1232. c3 -= c1 * b7;
  1233. c3 -= c2 * b11;
  1234. c3 *= b15;
  1235. *(a + 0) = c0;
  1236. *(a + 1) = c1;
  1237. *(a + 2) = c2;
  1238. *(a + 3) = c3;
  1239. *(c + 0) = c0;
  1240. *(c + 1 * ldc) = c1;
  1241. *(c + 2 * ldc) = c2;
  1242. *(c + 3 * ldc) = c3;
  1243. }
  1244. static void ssolve_1x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  1245. {
  1246. BLASLONG k;
  1247. FLOAT b0, b1, b3, c0, c1;
  1248. c0 = *c;
  1249. c1 = *(c + ldc);
  1250. for (k = 0; k < bk; k++)
  1251. {
  1252. c0 -= a[0] * b[0];
  1253. c1 -= a[0] * b[1];
  1254. a += 1;
  1255. b += 2;
  1256. }
  1257. b0 = *(b + 0);
  1258. b1 = *(b + 1);
  1259. b3 = *(b + 3);
  1260. c0 *= b0;
  1261. c1 -= c0 * b1;
  1262. c1 *= b3;
  1263. *(a + 0) = c0;
  1264. *(a + 1) = c1;
  1265. *(c + 0) = c0;
  1266. *(c + ldc) = c1;
  1267. }
  1268. static void ssolve_1x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
  1269. {
  1270. BLASLONG k;
  1271. for (k = 0; k < bk; k++)
  1272. {
  1273. *c -= a[0] * b[0];
  1274. a++;
  1275. b++;
  1276. }
  1277. *c *= *b;
  1278. *a = *c;
  1279. }
  1280. int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
  1281. FLOAT *c, BLASLONG ldc, BLASLONG offset)
  1282. {
  1283. FLOAT *aa, *cc;
  1284. BLASLONG i, j, kk;
  1285. kk = -offset;
  1286. for (j = (n >> 3); j--;)
  1287. {
  1288. aa = a;
  1289. cc = c;
  1290. for (i = (m >> 3); i--;)
  1291. {
  1292. ssolve_8x8_rn_msa(aa, b, cc, ldc, kk);
  1293. aa += 8 * k;
  1294. cc += 8;
  1295. }
  1296. if (m & 7)
  1297. {
  1298. if (m & 4)
  1299. {
  1300. ssolve_4x8_rn_msa(aa, b, cc, ldc, kk);
  1301. aa += 4 * k;
  1302. cc += 4;
  1303. }
  1304. if (m & 2)
  1305. {
  1306. ssolve_2x8_rn_msa(aa, b, cc, ldc, kk);
  1307. aa += 2 * k;
  1308. cc += 2;
  1309. }
  1310. if (m & 1)
  1311. {
  1312. ssolve_1x8_rn_msa(aa, b, cc, ldc, kk);
  1313. aa += k;
  1314. cc += 1;
  1315. }
  1316. }
  1317. kk += 8;
  1318. b += 8 * k;
  1319. c += 8 * ldc;
  1320. }
  1321. if (n & 7)
  1322. {
  1323. if (n & 4)
  1324. {
  1325. aa = a;
  1326. cc = c;
  1327. for (i = (m >> 3); i--;)
  1328. {
  1329. ssolve_8x4_rn_msa(aa, b, cc, ldc, kk);
  1330. aa += 8 * k;
  1331. cc += 8;
  1332. }
  1333. if (m & 7)
  1334. {
  1335. if (m & 4)
  1336. {
  1337. ssolve_4x4_rn_msa(aa, b, cc, ldc, kk);
  1338. aa += 4 * k;
  1339. cc += 4;
  1340. }
  1341. if (m & 2)
  1342. {
  1343. ssolve_2x4_rn_msa(aa, b, cc, ldc, kk);
  1344. aa += 2 * k;
  1345. cc += 2;
  1346. }
  1347. if (m & 1)
  1348. {
  1349. ssolve_1x4_rn_msa(aa, b, cc, ldc, kk);
  1350. aa += k;
  1351. cc += 1;
  1352. }
  1353. }
  1354. b += 4 * k;
  1355. c += 4 * ldc;
  1356. kk += 4;
  1357. }
  1358. if (n & 2)
  1359. {
  1360. aa = a;
  1361. cc = c;
  1362. for (i = (m >> 3); i--;)
  1363. {
  1364. ssolve_8x2_rn_msa(aa, b, cc, ldc, kk);
  1365. aa += 8 * k;
  1366. cc += 8;
  1367. }
  1368. if (m & 7)
  1369. {
  1370. if (m & 4)
  1371. {
  1372. ssolve_4x2_rn_msa(aa, b, cc, ldc, kk);
  1373. aa += 4 * k;
  1374. cc += 4;
  1375. }
  1376. if (m & 2)
  1377. {
  1378. ssolve_2x2_rn_msa(aa, b, cc, ldc, kk);
  1379. aa += 2 * k;
  1380. cc += 2;
  1381. }
  1382. if (m & 1)
  1383. {
  1384. ssolve_1x2_rn_msa(aa, b, cc, ldc, kk);
  1385. aa += k;
  1386. cc += 1;
  1387. }
  1388. }
  1389. b += 2 * k;
  1390. c += 2 * ldc;
  1391. kk += 2;
  1392. }
  1393. if (n & 1)
  1394. {
  1395. aa = a;
  1396. cc = c;
  1397. for (i = (m >> 3); i--;)
  1398. {
  1399. ssolve_8x1_rn_msa(aa, b, cc, ldc, kk);
  1400. aa += 8 * k;
  1401. cc += 8;
  1402. }
  1403. if (m & 7)
  1404. {
  1405. if (m & 4)
  1406. {
  1407. ssolve_4x1_rn_msa(aa, b, cc, ldc, kk);
  1408. aa += 4 * k;
  1409. cc += 4;
  1410. }
  1411. if (m & 2)
  1412. {
  1413. ssolve_2x1_rn_msa(aa, b, cc, ldc, kk);
  1414. aa += 2 * k;
  1415. cc += 2;
  1416. }
  1417. if (m & 1)
  1418. {
  1419. ssolve_1x1_rn_msa(aa, b, cc, kk);
  1420. aa += k;
  1421. cc += 1;
  1422. }
  1423. }
  1424. b += k;
  1425. c += ldc;
  1426. kk += 1;
  1427. }
  1428. }
  1429. return 0;
  1430. }