You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

strsm_kernel_RT_8x8_msa.c 45 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790
  1. /*******************************************************************************
  2. Copyright (c) 2016, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #include "common.h"
  28. #include "macros_msa.h"
  /*
   * ssolve_8x8_rt_msa: update a tile of C that is 8 floats wide and 8 lines
   * tall, then run a backward substitution across its 8 lines.
   *
   *   a, b : operand buffers. During the update phase 8 floats of a and
   *          8 floats of b are consumed per step, reading forward. The 8x8
   *          coefficient block is read from the 64 floats immediately before
   *          b, and the solved tile is also written to the 64 floats
   *          immediately before a.
   *   c    : tile of C; line i starts at c + i * ldc and holds 8 floats
   *          (two v4f32 vectors).
   *   ldc  : distance, in FLOATs, between consecutive lines of C.
   *   bk   : number of update steps; the update phase is skipped if bk <= 0.
   *
   * NOTE(review): LD_SP*, ST_SP*, SPLATI_W4_SP, PREF_OFFSET and
   * COPY_FLOAT_TO_VECTOR come from macros_msa.h, which is not visible here.
   * The comments below assume they load / store / broadcast / prefetch as
   * their names and usage suggest -- confirm against that header.
   */
  29. static void ssolve_8x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  30. {
  31. v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
  32. v4f32 src_c8, src_c9, src_c10, src_c11, src_c12, src_c13, src_c14, src_c15;
  33. v4f32 src_b, src_b0, src_b8, src_b9, src_b16, src_b17, src_b18, src_b24;
  34. v4f32 src_b25, src_b26, src_b27, src_b32, src_b33, src_b34, src_b35;
  35. v4f32 src_b36, src_b40, src_b41, src_b42, src_b43, src_b44, src_b45;
  36. v4f32 src_b48, src_b49, src_b50, src_b51, src_b52, src_b53, src_b54;
  37. v4f32 src_b56, src_b57, src_b58, src_b59, src_b60, src_b61, src_b62, src_b63;
  /* Start addresses of lines 1..7 of the tile (line 0 is c itself). */
  38. FLOAT *c_nxt1line = c + ldc;
  39. FLOAT *c_nxt2line = c + 2 * ldc;
  40. FLOAT *c_nxt3line = c + 3 * ldc;
  41. FLOAT *c_nxt4line = c + 4 * ldc;
  42. FLOAT *c_nxt5line = c + 5 * ldc;
  43. FLOAT *c_nxt6line = c + 6 * ldc;
  44. FLOAT *c_nxt7line = c + 7 * ldc;
  /* Line i of the tile lives in the vector pair (src_c[2i], src_c[2i+1]). */
  45. LD_SP2(c, 4, src_c0, src_c1);
  46. LD_SP2(c_nxt1line, 4, src_c2, src_c3);
  47. LD_SP2(c_nxt2line, 4, src_c4, src_c5);
  48. LD_SP2(c_nxt3line, 4, src_c6, src_c7);
  49. LD_SP2(c_nxt4line, 4, src_c8, src_c9);
  50. LD_SP2(c_nxt5line, 4, src_c10, src_c11);
  51. LD_SP2(c_nxt6line, 4, src_c12, src_c13);
  52. LD_SP2(c_nxt7line, 4, src_c14, src_c15);
  /*
   * Update phase: for each of the bk steps, line j of the tile has
   * (8 floats of a) * (j-th of the 8 floats of b) subtracted from it.
   */
  53. if (bk > 0)
  54. {
  55. BLASLONG k, pref_offset;
  56. FLOAT *aa = a, *bb = b, *pa0_pref;
  57. v4f32 src_a0, src_a1, src_b1, src_b2, src_b3, src_bb0, src_bb1;
  /*
   * pa0_pref is a rounded up to the next multiple of L1_DATA_LINESIZE bytes
   * (unchanged if a is already on such a boundary); pref_offset is that
   * distance expressed in FLOATs.
   */
  58. pref_offset = (uintptr_t)a & (L1_DATA_LINESIZE - 1);
  59. if (pref_offset)
  60. {
  61. pref_offset = L1_DATA_LINESIZE - pref_offset;
  62. pref_offset = pref_offset / sizeof(FLOAT);
  63. }
  64. pa0_pref = a + pref_offset;
  /* Main loop handles two steps per iteration. */
  65. for (k = 0; k < (bk >> 1); k++)
  66. {
  /* Presumably prefetch hints ahead of the a stream -- TODO confirm units of the offsets. */
  67. PREF_OFFSET(pa0_pref, 64);
  68. PREF_OFFSET(pa0_pref, 96);
  /* First step: the *_INC loads presumably advance aa / bb by 8 floats each. */
  69. LD_SP2_INC(aa, 4, src_a0, src_a1);
  70. LD_SP2_INC(bb, 4, src_bb0, src_bb1);
  /* b elements 0..3 update lines 0..3. */
  71. SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
  72. src_c0 -= src_a0 * src_b0;
  73. src_c1 -= src_a1 * src_b0;
  74. src_c2 -= src_a0 * src_b1;
  75. src_c3 -= src_a1 * src_b1;
  76. src_c4 -= src_a0 * src_b2;
  77. src_c5 -= src_a1 * src_b2;
  78. src_c6 -= src_a0 * src_b3;
  79. src_c7 -= src_a1 * src_b3;
  /* b elements 4..7 update lines 4..7 (src_b0..src_b3 are reused as temporaries). */
  80. SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
  81. src_c8 -= src_a0 * src_b0;
  82. src_c9 -= src_a1 * src_b0;
  83. src_c10 -= src_a0 * src_b1;
  84. src_c11 -= src_a1 * src_b1;
  85. src_c12 -= src_a0 * src_b2;
  86. src_c13 -= src_a1 * src_b2;
  87. src_c14 -= src_a0 * src_b3;
  88. src_c15 -= src_a1 * src_b3;
  /* Second step of the unrolled pair. */
  89. LD_SP2_INC(aa, 4, src_a0, src_a1);
  90. LD_SP2_INC(bb, 4, src_bb0, src_bb1);
  91. SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
  92. src_c0 -= src_a0 * src_b0;
  93. src_c1 -= src_a1 * src_b0;
  94. src_c2 -= src_a0 * src_b1;
  95. src_c3 -= src_a1 * src_b1;
  96. src_c4 -= src_a0 * src_b2;
  97. src_c5 -= src_a1 * src_b2;
  98. src_c6 -= src_a0 * src_b3;
  99. src_c7 -= src_a1 * src_b3;
  100. SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
  101. src_c8 -= src_a0 * src_b0;
  102. src_c9 -= src_a1 * src_b0;
  103. src_c10 -= src_a0 * src_b1;
  104. src_c11 -= src_a1 * src_b1;
  105. src_c12 -= src_a0 * src_b2;
  106. src_c13 -= src_a1 * src_b2;
  107. src_c14 -= src_a0 * src_b3;
  108. src_c15 -= src_a1 * src_b3;
  /* Two steps consumed 2 * 8 floats of a; move the prefetch cursor to match. */
  109. pa0_pref += 16;
  110. }
  /* Odd bk: one remaining step. */
  111. if (bk & 1)
  112. {
  113. LD_SP2_INC(aa, 4, src_a0, src_a1);
  114. LD_SP2_INC(bb, 4, src_bb0, src_bb1);
  115. SPLATI_W4_SP(src_bb0, src_b0, src_b1, src_b2, src_b3);
  116. src_c0 -= src_a0 * src_b0;
  117. src_c1 -= src_a1 * src_b0;
  118. src_c2 -= src_a0 * src_b1;
  119. src_c3 -= src_a1 * src_b1;
  120. src_c4 -= src_a0 * src_b2;
  121. src_c5 -= src_a1 * src_b2;
  122. src_c6 -= src_a0 * src_b3;
  123. src_c7 -= src_a1 * src_b3;
  124. SPLATI_W4_SP(src_bb1, src_b0, src_b1, src_b2, src_b3);
  125. src_c8 -= src_a0 * src_b0;
  126. src_c9 -= src_a1 * src_b0;
  127. src_c10 -= src_a0 * src_b1;
  128. src_c11 -= src_a1 * src_b1;
  129. src_c12 -= src_a0 * src_b2;
  130. src_c13 -= src_a1 * src_b2;
  131. src_c14 -= src_a0 * src_b3;
  132. src_c15 -= src_a1 * src_b3;
  133. }
  134. }
  /*
   * Solve phase. The coefficient block is the 64 floats just before the
   * incoming b; src_bN below is element N of that block broadcast to all
   * lanes. Lines are processed from 7 down to 0: line i is scaled by element
   * 9*i, then element 8*i + j times line i is subtracted from every line
   * j < i.
   * NOTE(review): line i is *multiplied* by element 9*i, so that element is
   * presumably stored as a reciprocal by the packing code -- confirm.
   */
  135. b -= 64;
  136. src_b = LD_SP(b + 60);
  137. SPLATI_W4_SP(src_b, src_b60, src_b61, src_b62, src_b63);
  138. src_b = LD_SP(b + 56);
  139. SPLATI_W4_SP(src_b, src_b56, src_b57, src_b58, src_b59);
  /* Line 7: scale, then eliminate from lines 6..0 with elements 62..56. */
  140. src_c15 *= src_b63;
  141. src_c14 *= src_b63;
  142. src_c13 -= src_c15 * src_b62;
  143. src_c12 -= src_c14 * src_b62;
  144. src_c11 -= src_c15 * src_b61;
  145. src_c10 -= src_c14 * src_b61;
  146. src_c9 -= src_c15 * src_b60;
  147. src_c8 -= src_c14 * src_b60;
  148. src_c7 -= src_c15 * src_b59;
  149. src_c6 -= src_c14 * src_b59;
  150. src_c5 -= src_c15 * src_b58;
  151. src_c4 -= src_c14 * src_b58;
  152. src_c3 -= src_c15 * src_b57;
  153. src_c2 -= src_c14 * src_b57;
  154. src_c1 -= src_c15 * src_b56;
  155. src_c0 -= src_c14 * src_b56;
  /* Line 6: elements 48..54 (only lanes 0..2 of the vector at b + 52 are used). */
  156. src_b = LD_SP(b + 48);
  157. SPLATI_W4_SP(src_b, src_b48, src_b49, src_b50, src_b51);
  158. src_b52 = LD_SP(b + 52);
  159. src_b54 = (v4f32) __msa_splati_w((v4i32) src_b52, 2);
  160. src_b53 = (v4f32) __msa_splati_w((v4i32) src_b52, 1);
  161. src_b52 = (v4f32) __msa_splati_w((v4i32) src_b52, 0);
  162. src_c12 *= src_b54;
  163. src_c13 *= src_b54;
  164. src_c10 -= src_c12 * src_b53;
  165. src_c11 -= src_c13 * src_b53;
  166. src_c8 -= src_c12 * src_b52;
  167. src_c9 -= src_c13 * src_b52;
  168. src_c6 -= src_c12 * src_b51;
  169. src_c7 -= src_c13 * src_b51;
  170. src_c4 -= src_c12 * src_b50;
  171. src_c5 -= src_c13 * src_b50;
  172. src_c2 -= src_c12 * src_b49;
  173. src_c3 -= src_c13 * src_b49;
  174. src_c0 -= src_c12 * src_b48;
  175. src_c1 -= src_c13 * src_b48;
  /* Lines 6 and 7 are final: write them to the 16 floats before a and back to C. */
  176. ST_SP4(src_c12, src_c13, src_c14, src_c15, a - 16, 4);
  177. ST_SP2(src_c12, src_c13, c_nxt6line, 4);
  178. ST_SP2(src_c14, src_c15, c_nxt7line, 4);
  /* Line 5: elements 40..45 (lanes 0..1 of the vector at b + 44). */
  179. src_b = LD_SP(b + 40);
  180. SPLATI_W4_SP(src_b, src_b40, src_b41, src_b42, src_b43);
  181. src_b44 = LD_SP(b + 44);
  182. src_b45 = (v4f32) __msa_splati_w((v4i32) src_b44, 1);
  183. src_b44 = (v4f32) __msa_splati_w((v4i32) src_b44, 0);
  184. src_c10 *= src_b45;
  185. src_c11 *= src_b45;
  186. src_c8 -= src_c10 * src_b44;
  187. src_c9 -= src_c11 * src_b44;
  188. src_c6 -= src_c10 * src_b43;
  189. src_c7 -= src_c11 * src_b43;
  190. src_c4 -= src_c10 * src_b42;
  191. src_c5 -= src_c11 * src_b42;
  192. src_c2 -= src_c10 * src_b41;
  193. src_c3 -= src_c11 * src_b41;
  194. src_c0 -= src_c10 * src_b40;
  195. src_c1 -= src_c11 * src_b40;
  /* Line 4: elements 32..36. */
  196. src_b = LD_SP(b + 32);
  197. SPLATI_W4_SP(src_b, src_b32, src_b33, src_b34, src_b35);
  198. src_b36 = COPY_FLOAT_TO_VECTOR(*(b + 36));
  199. src_c8 *= src_b36;
  200. src_c9 *= src_b36;
  201. src_c6 -= src_c8 * src_b35;
  202. src_c7 -= src_c9 * src_b35;
  203. src_c4 -= src_c8 * src_b34;
  204. src_c5 -= src_c9 * src_b34;
  205. src_c2 -= src_c8 * src_b33;
  206. src_c3 -= src_c9 * src_b33;
  207. src_c0 -= src_c8 * src_b32;
  208. src_c1 -= src_c9 * src_b32;
  /* Lines 4 and 5 are final. */
  209. ST_SP4(src_c8, src_c9, src_c10, src_c11, a - 32, 4);
  210. ST_SP2(src_c8, src_c9, c_nxt4line, 4);
  211. ST_SP2(src_c10, src_c11, c_nxt5line, 4);
  /* Line 3: elements 24..27. */
  212. src_b = LD_SP(b + 24);
  213. SPLATI_W4_SP(src_b, src_b24, src_b25, src_b26, src_b27);
  214. src_c6 *= src_b27;
  215. src_c7 *= src_b27;
  216. src_c4 -= src_c6 * src_b26;
  217. src_c5 -= src_c7 * src_b26;
  218. src_c2 -= src_c6 * src_b25;
  219. src_c3 -= src_c7 * src_b25;
  220. src_c0 -= src_c6 * src_b24;
  221. src_c1 -= src_c7 * src_b24;
  /* Line 2: elements 16..18 (lanes 0..2 of the vector at b + 16). */
  222. src_b16 = LD_SP(b + 16);
  223. src_b18 = (v4f32) __msa_splati_w((v4i32) src_b16, 2);
  224. src_b17 = (v4f32) __msa_splati_w((v4i32) src_b16, 1);
  225. src_b16 = (v4f32) __msa_splati_w((v4i32) src_b16, 0);
  226. src_c4 *= src_b18;
  227. src_c5 *= src_b18;
  228. src_c2 -= src_c4 * src_b17;
  229. src_c3 -= src_c5 * src_b17;
  230. src_c0 -= src_c4 * src_b16;
  231. src_c1 -= src_c5 * src_b16;
  /* Lines 2 and 3 are final. */
  232. ST_SP4(src_c4, src_c5, src_c6, src_c7, a - 48, 4);
  233. ST_SP2(src_c4, src_c5, c_nxt2line, 4);
  234. ST_SP2(src_c6, src_c7, c_nxt3line, 4);
  /* Lines 1 and 0: elements 9, 8 and 0. */
  235. src_b9 = COPY_FLOAT_TO_VECTOR(*(b + 9));
  236. src_b8 = COPY_FLOAT_TO_VECTOR(*(b + 8));
  237. src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
  238. src_c2 *= src_b9;
  239. src_c3 *= src_b9;
  240. src_c0 -= src_c2 * src_b8;
  241. src_c1 -= src_c3 * src_b8;
  242. src_c0 *= src_b0;
  243. src_c1 *= src_b0;
  /* Lines 0 and 1 are final; a - 64 is the start of the 64-float output block. */
  244. ST_SP4(src_c0, src_c1, src_c2, src_c3, a - 64, 4);
  245. ST_SP2(src_c0, src_c1, c, 4);
  246. ST_SP2(src_c2, src_c3, c_nxt1line, 4);
  247. }
  /*
   * ssolve_8x4_rt_msa: update a tile of C that is 8 floats wide and 4 lines
   * tall, then run a backward substitution across its 4 lines.
   *
   *   a, b : operand buffers. Each update step reads 8 floats of a and
   *          4 floats of b, moving forward. The 4x4 coefficient block is the
   *          16 floats immediately before b; the solved tile is also written
   *          to the 32 floats immediately before a.
   *   c    : tile of C; line i starts at c + i * ldc and holds 8 floats.
   *   ldc  : distance, in FLOATs, between consecutive lines of C.
   *   bk   : number of update steps; no update is performed if bk <= 0.
   *
   * NOTE(review): the LD_SP* / ST_SP* / SPLATI_W4_SP / COPY_FLOAT_TO_VECTOR
   * macros are defined outside this file; the comments assume they behave as
   * their names suggest.
   */
  248. static void ssolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  249. {
  250. BLASLONG k;
  251. FLOAT *aa = a, *bb = b;
  252. v4f32 src_a0, src_a1, src_b1, src_b2, src_b3;
  253. v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
  254. v4f32 src_b, src_b0, src_b4, src_b5, src_b8, src_b9, src_b10, src_b12;
  255. v4f32 src_b13, src_b14, src_b15;
  256. FLOAT *c_nxt1line = c + ldc;
  257. FLOAT *c_nxt2line = c + 2 * ldc;
  258. FLOAT *c_nxt3line = c + 3 * ldc;
  /* Line i of the tile lives in the vector pair (src_c[2i], src_c[2i+1]). */
  259. LD_SP2(c, 4, src_c0, src_c1);
  260. LD_SP2(c_nxt1line, 4, src_c2, src_c3);
  261. LD_SP2(c_nxt2line, 4, src_c4, src_c5);
  262. LD_SP2(c_nxt3line, 4, src_c6, src_c7);
  /* Update phase, two steps per iteration: line j -= (8 floats of a) * bb[j]. */
  263. for (k = 0; k < (bk >> 1); k++)
  264. {
  265. LD_SP2(aa, 4, src_a0, src_a1);
  266. src_b = LD_SP(bb + 0);
  267. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  268. src_c0 -= src_a0 * src_b0;
  269. src_c1 -= src_a1 * src_b0;
  270. src_c2 -= src_a0 * src_b1;
  271. src_c3 -= src_a1 * src_b1;
  272. src_c4 -= src_a0 * src_b2;
  273. src_c5 -= src_a1 * src_b2;
  274. src_c6 -= src_a0 * src_b3;
  275. src_c7 -= src_a1 * src_b3;
  276. aa += 8;
  277. bb += 4;
  278. LD_SP2(aa, 4, src_a0, src_a1);
  279. src_b = LD_SP(bb + 0);
  280. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  281. src_c0 -= src_a0 * src_b0;
  282. src_c1 -= src_a1 * src_b0;
  283. src_c2 -= src_a0 * src_b1;
  284. src_c3 -= src_a1 * src_b1;
  285. src_c4 -= src_a0 * src_b2;
  286. src_c5 -= src_a1 * src_b2;
  287. src_c6 -= src_a0 * src_b3;
  288. src_c7 -= src_a1 * src_b3;
  289. aa += 8;
  290. bb += 4;
  291. }
  /* Remaining step for odd, positive bk. */
  292. if ((bk & 1) && (bk > 0))
  293. {
  294. LD_SP2(aa, 4, src_a0, src_a1);
  295. src_b = LD_SP(bb + 0);
  296. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  297. src_c0 -= src_a0 * src_b0;
  298. src_c1 -= src_a1 * src_b0;
  299. src_c2 -= src_a0 * src_b1;
  300. src_c3 -= src_a1 * src_b1;
  301. src_c4 -= src_a0 * src_b2;
  302. src_c5 -= src_a1 * src_b2;
  303. src_c6 -= src_a0 * src_b3;
  304. src_c7 -= src_a1 * src_b3;
  305. }
  /*
   * Solve phase: step back to the 32-float output block before a and the
   * 16-float coefficient block before b. src_bN is element N of that block
   * broadcast to all lanes.
   */
  306. a -= 32;
  307. b -= 16;
  308. src_b = LD_SP(b + 12);
  309. SPLATI_W4_SP(src_b, src_b12, src_b13, src_b14, src_b15);
  /* Only lanes 0..2 of the vector at b + 8 are used. */
  310. src_b8 = LD_SP(b + 8);
  311. src_b10 = (v4f32) __msa_splati_w((v4i32) src_b8, 2);
  312. src_b9 = (v4f32) __msa_splati_w((v4i32) src_b8, 1);
  313. src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0);
  314. src_b5 = COPY_FLOAT_TO_VECTOR(*(b + 5));
  315. src_b4 = COPY_FLOAT_TO_VECTOR(*(b + 4));
  316. src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
  /*
   * Line 3: scale by element 15, then eliminate from lines 2..0.
   * NOTE(review): the scaling is a multiply, so elements 15/10/5/0 are
   * presumably stored as reciprocals -- confirm against the packing code.
   */
  317. src_c7 *= src_b15;
  318. src_c6 *= src_b15;
  319. src_c5 -= src_c7 * src_b14;
  320. src_c4 -= src_c6 * src_b14;
  321. src_c3 -= src_c7 * src_b13;
  322. src_c2 -= src_c6 * src_b13;
  323. src_c1 -= src_c7 * src_b12;
  324. src_c0 -= src_c6 * src_b12;
  /* Line 2: scale by element 10, eliminate from lines 1..0. */
  325. src_c5 *= src_b10;
  326. src_c4 *= src_b10;
  327. src_c3 -= src_c5 * src_b9;
  328. src_c2 -= src_c4 * src_b9;
  329. src_c1 -= src_c5 * src_b8;
  330. src_c0 -= src_c4 * src_b8;
  /* Line 1: scale by element 5, eliminate from line 0. */
  331. src_c3 *= src_b5;
  332. src_c2 *= src_b5;
  333. src_c1 -= src_c3 * src_b4;
  334. src_c0 -= src_c2 * src_b4;
  /* Line 0: scale by element 0. */
  335. src_c1 *= src_b0;
  336. src_c0 *= src_b0;
  /* Write the solved tile to the block before the original a, and back to C. */
  337. ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4);
  338. ST_SP4(src_c4, src_c5, src_c6, src_c7, a + 16, 4);
  339. ST_SP2(src_c0, src_c1, c, 4);
  340. ST_SP2(src_c2, src_c3, c_nxt1line, 4);
  341. ST_SP2(src_c4, src_c5, c_nxt2line, 4);
  342. ST_SP2(src_c6, src_c7, c_nxt3line, 4);
  343. }
  344. static void ssolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  345. {
  346. BLASLONG k;
  347. FLOAT *aa = a, *bb = b;
  348. v4f32 src_a0, src_a1, src_b1;
  349. v4f32 src_c0, src_c1, src_c2, src_c3, src_b0, src_b2, src_b3;
  350. FLOAT *c_nxt1line = c + ldc;
  351. LD_SP2(c, 4, src_c0, src_c1);
  352. LD_SP2(c_nxt1line, 4, src_c2, src_c3);
  353. for (k = 0; k < (bk >> 1); k++)
  354. {
  355. LD_SP2(aa, 4, src_a0, src_a1);
  356. src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
  357. src_b1 = COPY_FLOAT_TO_VECTOR(*(bb + 1));
  358. src_c0 -= src_a0 * src_b0;
  359. src_c1 -= src_a1 * src_b0;
  360. src_c2 -= src_a0 * src_b1;
  361. src_c3 -= src_a1 * src_b1;
  362. aa += 8;
  363. bb += 2;
  364. LD_SP2(aa, 4, src_a0, src_a1);
  365. src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
  366. src_b1 = COPY_FLOAT_TO_VECTOR(*(bb + 1));
  367. src_c0 -= src_a0 * src_b0;
  368. src_c1 -= src_a1 * src_b0;
  369. src_c2 -= src_a0 * src_b1;
  370. src_c3 -= src_a1 * src_b1;
  371. aa += 8;
  372. bb += 2;
  373. }
  374. if ((bk & 1) && (bk > 0))
  375. {
  376. LD_SP2(aa, 4, src_a0, src_a1);
  377. src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
  378. src_b1 = COPY_FLOAT_TO_VECTOR(*(bb + 1));
  379. src_c0 -= src_a0 * src_b0;
  380. src_c1 -= src_a1 * src_b0;
  381. src_c2 -= src_a0 * src_b1;
  382. src_c3 -= src_a1 * src_b1;
  383. }
  384. a -= 16;
  385. b -= 4;
  386. src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
  387. src_b2 = COPY_FLOAT_TO_VECTOR(*(b + 2));
  388. src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3));
  389. src_c2 *= src_b3;
  390. src_c3 *= src_b3;
  391. src_c0 -= src_c2 * src_b2;
  392. src_c1 -= src_c3 * src_b2;
  393. src_c0 *= src_b0;
  394. src_c1 *= src_b0;
  395. ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4);
  396. ST_SP2(src_c0, src_c1, c, 4);
  397. ST_SP2(src_c2, src_c3, c_nxt1line, 4);
  398. }
  399. static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
  400. {
  401. BLASLONG k;
  402. FLOAT *aa = a, *bb = b;
  403. v4f32 src_a0, src_a1, src_c0, src_c1, src_b0;
  404. LD_SP2(c, 4, src_c0, src_c1);
  405. for (k = 0; k < (bk >> 2); k++)
  406. {
  407. LD_SP2(aa, 4, src_a0, src_a1);
  408. src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
  409. src_c0 -= src_a0 * src_b0;
  410. src_c1 -= src_a1 * src_b0;
  411. aa += 8;
  412. bb += 1;
  413. LD_SP2(aa, 4, src_a0, src_a1);
  414. src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
  415. src_c0 -= src_a0 * src_b0;
  416. src_c1 -= src_a1 * src_b0;
  417. aa += 8;
  418. bb += 1;
  419. LD_SP2(aa, 4, src_a0, src_a1);
  420. src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
  421. src_c0 -= src_a0 * src_b0;
  422. src_c1 -= src_a1 * src_b0;
  423. aa += 8;
  424. bb += 1;
  425. LD_SP2(aa, 4, src_a0, src_a1);
  426. src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
  427. src_c0 -= src_a0 * src_b0;
  428. src_c1 -= src_a1 * src_b0;
  429. aa += 8;
  430. bb += 1;
  431. }
  432. if ((bk & 3) && (bk > 0))
  433. {
  434. if (bk & 2)
  435. {
  436. LD_SP2(aa, 4, src_a0, src_a1);
  437. src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
  438. src_c0 -= src_a0 * src_b0;
  439. src_c1 -= src_a1 * src_b0;
  440. aa += 8;
  441. bb += 1;
  442. LD_SP2(aa, 4, src_a0, src_a1);
  443. src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
  444. src_c0 -= src_a0 * src_b0;
  445. src_c1 -= src_a1 * src_b0;
  446. aa += 8;
  447. bb += 1;
  448. }
  449. if (bk & 1)
  450. {
  451. LD_SP2(aa, 4, src_a0, src_a1);
  452. src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
  453. src_c0 -= src_a0 * src_b0;
  454. src_c1 -= src_a1 * src_b0;
  455. }
  456. }
  457. a -= 8;
  458. b -= 1;
  459. src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
  460. src_c0 *= src_b0;
  461. src_c1 *= src_b0;
  462. ST_SP2(src_c0, src_c1, a, 4);
  463. ST_SP2(src_c0, src_c1, c, 4);
  464. }
  /*
   * ssolve_4x8_rt_msa: update a tile of C that is 4 floats wide and 8 lines
   * tall, then run a backward substitution across its 8 lines.
   *
   *   a, b : operand buffers. Each update step reads 4 floats of a and
   *          8 floats of b, moving forward. The 8x8 coefficient block is the
   *          64 floats immediately before b; the solved tile is also written
   *          to the 32 floats immediately before a.
   *   c    : tile of C; line i starts at c + i * ldc and holds 4 floats.
   *   ldc  : distance, in FLOATs, between consecutive lines of C.
   *   bk   : number of update steps (the loop body never runs if bk <= 0).
   *
   * NOTE(review): the LD_SP / ST_SP* / SPLATI_W4_SP / COPY_FLOAT_TO_VECTOR
   * macros are defined outside this file; the comments assume they behave as
   * their names suggest.
   */
  465. static void ssolve_4x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  466. {
  467. BLASLONG k;
  468. FLOAT *aa = a, *bb = b;
  469. v4f32 src_a0, src_b1, src_b2, src_b3;
  470. v4f32 src_c0, src_c1, src_c2, src_c3, src_c4, src_c5, src_c6, src_c7;
  471. v4f32 src_b, src_b0, src_b8, src_b9, src_b16, src_b17, src_b18, src_b24;
  472. v4f32 src_b25, src_b26, src_b27, src_b32, src_b33, src_b34, src_b35;
  473. v4f32 src_b36, src_b40, src_b41, src_b42, src_b43, src_b44, src_b45;
  474. v4f32 src_b48, src_b49, src_b50, src_b51, src_b52, src_b53, src_b54;
  475. v4f32 src_b56, src_b57, src_b58, src_b59, src_b60, src_b61, src_b62, src_b63;
  476. FLOAT *c_nxt1line = c + ldc;
  477. FLOAT *c_nxt2line = c + 2 * ldc;
  478. FLOAT *c_nxt3line = c + 3 * ldc;
  479. FLOAT *c_nxt4line = c + 4 * ldc;
  480. FLOAT *c_nxt5line = c + 5 * ldc;
  481. FLOAT *c_nxt6line = c + 6 * ldc;
  482. FLOAT *c_nxt7line = c + 7 * ldc;
  /* Line i of the tile is the single vector src_ci. */
  483. src_c0 = LD_SP(c);
  484. src_c1 = LD_SP(c_nxt1line);
  485. src_c2 = LD_SP(c_nxt2line);
  486. src_c3 = LD_SP(c_nxt3line);
  487. src_c4 = LD_SP(c_nxt4line);
  488. src_c5 = LD_SP(c_nxt5line);
  489. src_c6 = LD_SP(c_nxt6line);
  490. src_c7 = LD_SP(c_nxt7line);
  /* Update phase, one step per iteration: line j -= (4 floats of a) * bb[j]. */
  491. for (k = 0; k < bk; k++)
  492. {
  493. src_a0 = LD_SP(aa);
  /* bb[0..3] update lines 0..3. */
  494. src_b = LD_SP(bb + 0);
  495. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  496. src_c0 -= src_a0 * src_b0;
  497. src_c1 -= src_a0 * src_b1;
  498. src_c2 -= src_a0 * src_b2;
  499. src_c3 -= src_a0 * src_b3;
  /* bb[4..7] update lines 4..7 (src_b0..src_b3 reused as temporaries). */
  500. src_b = LD_SP(bb + 4);
  501. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  502. src_c4 -= src_a0 * src_b0;
  503. src_c5 -= src_a0 * src_b1;
  504. src_c6 -= src_a0 * src_b2;
  505. src_c7 -= src_a0 * src_b3;
  506. aa += 4;
  507. bb += 8;
  508. }
  /*
   * Solve phase: step back to the 32-float output block before a and the
   * 64-float coefficient block before b. All needed coefficients are
   * broadcast up front; src_bN is element N of the block. For row i only
   * elements 8*i .. 9*i are used.
   */
  509. a -= 32;
  510. b -= 64;
  511. src_b = LD_SP(b + 60);
  512. SPLATI_W4_SP(src_b, src_b60, src_b61, src_b62, src_b63);
  513. src_b = LD_SP(b + 56);
  514. SPLATI_W4_SP(src_b, src_b56, src_b57, src_b58, src_b59);
  515. src_b = LD_SP(b + 48);
  516. SPLATI_W4_SP(src_b, src_b48, src_b49, src_b50, src_b51);
  517. src_b52 = LD_SP(b + 52);
  518. src_b54 = (v4f32) __msa_splati_w((v4i32) src_b52, 2);
  519. src_b53 = (v4f32) __msa_splati_w((v4i32) src_b52, 1);
  520. src_b52 = (v4f32) __msa_splati_w((v4i32) src_b52, 0);
  521. src_b = LD_SP(b + 40);
  522. SPLATI_W4_SP(src_b, src_b40, src_b41, src_b42, src_b43);
  523. src_b44 = LD_SP(b + 44);
  524. src_b45 = (v4f32) __msa_splati_w((v4i32) src_b44, 1);
  525. src_b44 = (v4f32) __msa_splati_w((v4i32) src_b44, 0);
  526. src_b = LD_SP(b + 32);
  527. SPLATI_W4_SP(src_b, src_b32, src_b33, src_b34, src_b35);
  528. src_b36 = COPY_FLOAT_TO_VECTOR(*(b + 36));
  529. src_b = LD_SP(b + 24);
  530. SPLATI_W4_SP(src_b, src_b24, src_b25, src_b26, src_b27);
  531. src_b16 = LD_SP(b + 16);
  532. src_b18 = (v4f32) __msa_splati_w((v4i32) src_b16, 2);
  533. src_b17 = (v4f32) __msa_splati_w((v4i32) src_b16, 1);
  534. src_b16 = (v4f32) __msa_splati_w((v4i32) src_b16, 0);
  535. src_b9 = COPY_FLOAT_TO_VECTOR(*(b + 9));
  536. src_b8 = COPY_FLOAT_TO_VECTOR(*(b + 8));
  537. src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
  /*
   * Line 7: scale by element 63, then eliminate from lines 6..0.
   * NOTE(review): each line is scaled by multiplication, so elements 9*i are
   * presumably stored as reciprocals -- confirm against the packing code.
   */
  538. src_c7 *= src_b63;
  539. src_c6 -= src_c7 * src_b62;
  540. src_c5 -= src_c7 * src_b61;
  541. src_c4 -= src_c7 * src_b60;
  542. src_c3 -= src_c7 * src_b59;
  543. src_c2 -= src_c7 * src_b58;
  544. src_c1 -= src_c7 * src_b57;
  545. src_c0 -= src_c7 * src_b56;
  /* Line 6. */
  546. src_c6 *= src_b54;
  547. src_c5 -= src_c6 * src_b53;
  548. src_c4 -= src_c6 * src_b52;
  549. src_c3 -= src_c6 * src_b51;
  550. src_c2 -= src_c6 * src_b50;
  551. src_c1 -= src_c6 * src_b49;
  552. src_c0 -= src_c6 * src_b48;
  /* Line 5. */
  553. src_c5 *= src_b45;
  554. src_c4 -= src_c5 * src_b44;
  555. src_c3 -= src_c5 * src_b43;
  556. src_c2 -= src_c5 * src_b42;
  557. src_c1 -= src_c5 * src_b41;
  558. src_c0 -= src_c5 * src_b40;
  /* Line 4. */
  559. src_c4 *= src_b36;
  560. src_c3 -= src_c4 * src_b35;
  561. src_c2 -= src_c4 * src_b34;
  562. src_c1 -= src_c4 * src_b33;
  563. src_c0 -= src_c4 * src_b32;
  /* Line 3. */
  564. src_c3 *= src_b27;
  565. src_c2 -= src_c3 * src_b26;
  566. src_c1 -= src_c3 * src_b25;
  567. src_c0 -= src_c3 * src_b24;
  /* Line 2. */
  568. src_c2 *= src_b18;
  569. src_c1 -= src_c2 * src_b17;
  570. src_c0 -= src_c2 * src_b16;
  /* Lines 1 and 0. */
  571. src_c1 *= src_b9;
  572. src_c0 -= src_c1 * src_b8;
  573. src_c0 *= src_b0;
  /* Write the solved tile to the block before the original a, and back to C. */
  574. ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4);
  575. ST_SP4(src_c4, src_c5, src_c6, src_c7, a + 16, 4);
  576. ST_SP(src_c0, c);
  577. ST_SP(src_c1, c_nxt1line);
  578. ST_SP(src_c2, c_nxt2line);
  579. ST_SP(src_c3, c_nxt3line);
  580. ST_SP(src_c4, c_nxt4line);
  581. ST_SP(src_c5, c_nxt5line);
  582. ST_SP(src_c6, c_nxt6line);
  583. ST_SP(src_c7, c_nxt7line);
  584. }
  585. static void ssolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  586. {
  587. BLASLONG k;
  588. FLOAT *aa = a, *bb = b;
  589. v4f32 src_c0, src_c1, src_c2, src_c3, src_b;
  590. v4f32 src_b0, src_b4, src_b5, src_b8, src_b9, src_b10, src_b12, src_b13;
  591. v4f32 src_b14, src_b15, src_a, src_b1, src_b2, src_b3;
  592. FLOAT *c_nxt1line = c + ldc;
  593. FLOAT *c_nxt2line = c + 2 * ldc;
  594. FLOAT *c_nxt3line = c + 3 * ldc;
  595. src_c0 = LD_SP(c);
  596. src_c1 = LD_SP(c_nxt1line);
  597. src_c2 = LD_SP(c_nxt2line);
  598. src_c3 = LD_SP(c_nxt3line);
  599. for (k = 0; k < (bk >> 1); k++)
  600. {
  601. src_a = LD_SP(aa);
  602. src_b = LD_SP(bb);
  603. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  604. src_c0 -= src_a * src_b0;
  605. src_c1 -= src_a * src_b1;
  606. src_c2 -= src_a * src_b2;
  607. src_c3 -= src_a * src_b3;
  608. aa += 4;
  609. bb += 4;
  610. src_a = LD_SP(aa);
  611. src_b = LD_SP(bb);
  612. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  613. src_c0 -= src_a * src_b0;
  614. src_c1 -= src_a * src_b1;
  615. src_c2 -= src_a * src_b2;
  616. src_c3 -= src_a * src_b3;
  617. aa += 4;
  618. bb += 4;
  619. }
  620. if ((bk & 1) && (bk > 0))
  621. {
  622. src_a = LD_SP(aa);
  623. src_b = LD_SP(bb);
  624. SPLATI_W4_SP(src_b, src_b0, src_b1, src_b2, src_b3);
  625. src_c0 -= src_a * src_b0;
  626. src_c1 -= src_a * src_b1;
  627. src_c2 -= src_a * src_b2;
  628. src_c3 -= src_a * src_b3;
  629. }
  630. a -= 16;
  631. b -= 16;
  632. src_b = LD_SP(b + 12);
  633. SPLATI_W4_SP(src_b, src_b12, src_b13, src_b14, src_b15);
  634. src_b8 = LD_SP(b + 8);
  635. src_b10 = (v4f32) __msa_splati_w((v4i32) src_b8, 2);
  636. src_b9 = (v4f32) __msa_splati_w((v4i32) src_b8, 1);
  637. src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0);
  638. src_b5 = COPY_FLOAT_TO_VECTOR(*(b + 5));
  639. src_b4 = COPY_FLOAT_TO_VECTOR(*(b + 4));
  640. src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
  641. src_c3 *= src_b15;
  642. src_c2 -= src_c3 * src_b14;
  643. src_c1 -= src_c3 * src_b13;
  644. src_c0 -= src_c3 * src_b12;
  645. src_c2 *= src_b10;
  646. src_c1 -= src_c2 * src_b9;
  647. src_c0 -= src_c2 * src_b8;
  648. src_c1 *= src_b5;
  649. src_c0 -= src_c1 * src_b4;
  650. src_c0 *= src_b0;
  651. ST_SP4(src_c0, src_c1, src_c2, src_c3, a, 4);
  652. ST_SP(src_c0, c);
  653. ST_SP(src_c1, c_nxt1line);
  654. ST_SP(src_c2, c_nxt2line);
  655. ST_SP(src_c3, c_nxt3line);
  656. }
  657. static void ssolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  658. {
  659. BLASLONG k;
  660. FLOAT *aa = a, *bb = b;
  661. v4f32 src_a, src_b1, src_c0, src_c1, src_b0, src_b2, src_b3;
  662. FLOAT *c_nxt1line = c + ldc;
  663. src_c0 = LD_SP(c);
  664. src_c1 = LD_SP(c_nxt1line);
  665. for (k = 0; k < (bk >> 2); k++)
  666. {
  667. src_a = LD_SP(aa);
  668. src_b0 = LD_SP(bb);
  669. src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
  670. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  671. src_c0 -= src_a * src_b0;
  672. src_c1 -= src_a * src_b1;
  673. aa += 4;
  674. bb += 2;
  675. src_a = LD_SP(aa);
  676. src_b0 = LD_SP(bb);
  677. src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
  678. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  679. src_c0 -= src_a * src_b0;
  680. src_c1 -= src_a * src_b1;
  681. aa += 4;
  682. bb += 2;
  683. src_a = LD_SP(aa);
  684. src_b0 = LD_SP(bb);
  685. src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
  686. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  687. src_c0 -= src_a * src_b0;
  688. src_c1 -= src_a * src_b1;
  689. aa += 4;
  690. bb += 2;
  691. src_a = LD_SP(aa);
  692. src_b0 = LD_SP(bb);
  693. src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
  694. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  695. src_c0 -= src_a * src_b0;
  696. src_c1 -= src_a * src_b1;
  697. aa += 4;
  698. bb += 2;
  699. }
  700. if ((bk & 3) && (bk > 0))
  701. {
  702. if (bk & 2)
  703. {
  704. src_a = LD_SP(aa);
  705. src_b0 = LD_SP(bb);
  706. src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
  707. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  708. src_c0 -= src_a * src_b0;
  709. src_c1 -= src_a * src_b1;
  710. aa += 4;
  711. bb += 2;
  712. src_a = LD_SP(aa);
  713. src_b0 = LD_SP(bb);
  714. src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
  715. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  716. src_c0 -= src_a * src_b0;
  717. src_c1 -= src_a * src_b1;
  718. aa += 4;
  719. bb += 2;
  720. }
  721. if (bk & 1)
  722. {
  723. src_a = LD_SP(aa);
  724. src_b0 = LD_SP(bb);
  725. src_b1 = (v4f32) __msa_splati_w((v4i32) src_b0, 1);
  726. src_b0 = (v4f32) __msa_splati_w((v4i32) src_b0, 0);
  727. src_c0 -= src_a * src_b0;
  728. src_c1 -= src_a * src_b1;
  729. }
  730. }
  731. a -= 8;
  732. b -= 4;
  733. src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3));
  734. src_b2 = COPY_FLOAT_TO_VECTOR(*(b + 2));
  735. src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
  736. src_c1 *= src_b3;
  737. src_c0 -= src_c1 * src_b2;
  738. src_c0 *= src_b0;
  739. ST_SP2(src_c0, src_c1, a, 4);
  740. ST_SP(src_c0, c);
  741. ST_SP(src_c1, c_nxt1line);
  742. }
  743. static void ssolve_4x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
  744. {
  745. BLASLONG k;
  746. FLOAT *aa = a, *bb = b;
  747. FLOAT b0, c0, c1, c2, c3;
  748. c0 = *(c + 0);
  749. c1 = *(c + 1);
  750. c2 = *(c + 2);
  751. c3 = *(c + 3);
  752. for (k = 0; k < bk; k++)
  753. {
  754. c0 -= aa[0] * bb[0];
  755. c1 -= aa[1] * bb[0];
  756. c2 -= aa[2] * bb[0];
  757. c3 -= aa[3] * bb[0];
  758. aa += 4;
  759. bb += 1;
  760. }
  761. a -= 4;
  762. b -= 1;
  763. b0 = *b;
  764. c0 *= b0;
  765. c1 *= b0;
  766. c2 *= b0;
  767. c3 *= b0;
  768. *(a + 0) = c0;
  769. *(a + 1) = c1;
  770. *(a + 2) = c2;
  771. *(a + 3) = c3;
  772. *(c + 0) = c0;
  773. *(c + 1) = c1;
  774. *(c + 2) = c2;
  775. *(c + 3) = c3;
  776. }
  777. static void ssolve_2x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  778. {
  779. BLASLONG k;
  780. FLOAT *aa = a, *bb = b;
  781. FLOAT b0, b8, b9, b16, b17, b18, b24, b25, b26, b27, b32, b33, b34, b35;
  782. FLOAT b36, b40, b41, b42, b43, b44, b45, b48, b49, b50, b51, b52, b53, b54;
  783. FLOAT b56, b57, b58, b59, b60, b61, b62, b63, c0_nxt7, c1_nxt7;
  784. FLOAT c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3;
  785. FLOAT c0_nxt4, c1_nxt4, c0_nxt5, c1_nxt5, c0_nxt6, c1_nxt6;
  786. c0 = *(c + 0);
  787. c1 = *(c + 1);
  788. c0_nxt1 = *(c + 0 + 1 * ldc);
  789. c1_nxt1 = *(c + 1 + 1 * ldc);
  790. c0_nxt2 = *(c + 0 + 2 * ldc);
  791. c1_nxt2 = *(c + 1 + 2 * ldc);
  792. c0_nxt3 = *(c + 0 + 3 * ldc);
  793. c1_nxt3 = *(c + 1 + 3 * ldc);
  794. c0_nxt4 = *(c + 0 + 4 * ldc);
  795. c1_nxt4 = *(c + 1 + 4 * ldc);
  796. c0_nxt5 = *(c + 0 + 5 * ldc);
  797. c1_nxt5 = *(c + 1 + 5 * ldc);
  798. c0_nxt6 = *(c + 0 + 6 * ldc);
  799. c1_nxt6 = *(c + 1 + 6 * ldc);
  800. c0_nxt7 = *(c + 0 + 7 * ldc);
  801. c1_nxt7 = *(c + 1 + 7 * ldc);
  802. for (k = 0; k < bk; k++)
  803. {
  804. c0 -= aa[0] * bb[0];
  805. c1 -= aa[1] * bb[0];
  806. c0_nxt1 -= aa[0] * bb[1];
  807. c1_nxt1 -= aa[1] * bb[1];
  808. c0_nxt2 -= aa[0] * bb[2];
  809. c1_nxt2 -= aa[1] * bb[2];
  810. c0_nxt3 -= aa[0] * bb[3];
  811. c1_nxt3 -= aa[1] * bb[3];
  812. c0_nxt4 -= aa[0] * bb[4];
  813. c1_nxt4 -= aa[1] * bb[4];
  814. c0_nxt5 -= aa[0] * bb[5];
  815. c1_nxt5 -= aa[1] * bb[5];
  816. c0_nxt6 -= aa[0] * bb[6];
  817. c1_nxt6 -= aa[1] * bb[6];
  818. c0_nxt7 -= aa[0] * bb[7];
  819. c1_nxt7 -= aa[1] * bb[7];
  820. aa += 2;
  821. bb += 8;
  822. }
  823. a -= 16;
  824. b -= 64;
  825. b0 = *(b + 0);
  826. b8 = *(b + 8);
  827. b9 = *(b + 9);
  828. b16 = *(b + 16);
  829. b17 = *(b + 17);
  830. b18 = *(b + 18);
  831. b24 = *(b + 24);
  832. b25 = *(b + 25);
  833. b26 = *(b + 26);
  834. b27 = *(b + 27);
  835. b32 = *(b + 32);
  836. b33 = *(b + 33);
  837. b34 = *(b + 34);
  838. b35 = *(b + 35);
  839. b36 = *(b + 36);
  840. b40 = *(b + 40);
  841. b41 = *(b + 41);
  842. b42 = *(b + 42);
  843. b43 = *(b + 43);
  844. b44 = *(b + 44);
  845. b45 = *(b + 45);
  846. b48 = *(b + 48);
  847. b49 = *(b + 49);
  848. b50 = *(b + 50);
  849. b51 = *(b + 51);
  850. b52 = *(b + 52);
  851. b53 = *(b + 53);
  852. b54 = *(b + 54);
  853. b56 = *(b + 56);
  854. b57 = *(b + 57);
  855. b58 = *(b + 58);
  856. b59 = *(b + 59);
  857. b60 = *(b + 60);
  858. b61 = *(b + 61);
  859. b62 = *(b + 62);
  860. b63 = *(b + 63);
  861. c0_nxt7 *= b63;
  862. c1_nxt7 *= b63;
  863. c0_nxt6 -= c0_nxt7 * b62;
  864. c1_nxt6 -= c1_nxt7 * b62;
  865. c0_nxt6 *= b54;
  866. c1_nxt6 *= b54;
  867. c0_nxt5 -= c0_nxt7 * b61;
  868. c1_nxt5 -= c1_nxt7 * b61;
  869. c0_nxt5 -= c0_nxt6 * b53;
  870. c1_nxt5 -= c1_nxt6 * b53;
  871. c0_nxt5 *= b45;
  872. c1_nxt5 *= b45;
  873. c0_nxt4 -= c0_nxt7 * b60;
  874. c1_nxt4 -= c1_nxt7 * b60;
  875. c0_nxt4 -= c0_nxt6 * b52;
  876. c1_nxt4 -= c1_nxt6 * b52;
  877. c0_nxt4 -= c0_nxt5 * b44;
  878. c1_nxt4 -= c1_nxt5 * b44;
  879. c0_nxt4 *= b36;
  880. c1_nxt4 *= b36;
  881. c0_nxt3 -= c0_nxt7 * b59;
  882. c1_nxt3 -= c1_nxt7 * b59;
  883. c0_nxt3 -= c0_nxt6 * b51;
  884. c1_nxt3 -= c1_nxt6 * b51;
  885. c0_nxt3 -= c0_nxt5 * b43;
  886. c1_nxt3 -= c1_nxt5 * b43;
  887. c0_nxt3 -= c0_nxt4 * b35;
  888. c1_nxt3 -= c1_nxt4 * b35;
  889. c0_nxt3 *= b27;
  890. c1_nxt3 *= b27;
  891. c0_nxt2 -= c0_nxt7 * b58;
  892. c1_nxt2 -= c1_nxt7 * b58;
  893. c0_nxt2 -= c0_nxt6 * b50;
  894. c1_nxt2 -= c1_nxt6 * b50;
  895. c0_nxt2 -= c0_nxt5 * b42;
  896. c1_nxt2 -= c1_nxt5 * b42;
  897. c0_nxt2 -= c0_nxt4 * b34;
  898. c1_nxt2 -= c1_nxt4 * b34;
  899. c0_nxt2 -= c0_nxt3 * b26;
  900. c1_nxt2 -= c1_nxt3 * b26;
  901. c0_nxt2 *= b18;
  902. c1_nxt2 *= b18;
  903. c0_nxt1 -= c0_nxt7 * b57;
  904. c1_nxt1 -= c1_nxt7 * b57;
  905. c0_nxt1 -= c0_nxt6 * b49;
  906. c1_nxt1 -= c1_nxt6 * b49;
  907. c0_nxt1 -= c0_nxt5 * b41;
  908. c1_nxt1 -= c1_nxt5 * b41;
  909. c0_nxt1 -= c0_nxt4 * b33;
  910. c1_nxt1 -= c1_nxt4 * b33;
  911. c0_nxt1 -= c0_nxt3 * b25;
  912. c1_nxt1 -= c1_nxt3 * b25;
  913. c0_nxt1 -= c0_nxt2 * b17;
  914. c1_nxt1 -= c1_nxt2 * b17;
  915. c0_nxt1 *= b9;
  916. c1_nxt1 *= b9;
  917. c0 -= c0_nxt7 * b56;
  918. c1 -= c1_nxt7 * b56;
  919. c0 -= c0_nxt6 * b48;
  920. c1 -= c1_nxt6 * b48;
  921. c0 -= c0_nxt5 * b40;
  922. c1 -= c1_nxt5 * b40;
  923. c0 -= c0_nxt4 * b32;
  924. c1 -= c1_nxt4 * b32;
  925. c0 -= c0_nxt3 * b24;
  926. c1 -= c1_nxt3 * b24;
  927. c0 -= c0_nxt2 * b16;
  928. c1 -= c1_nxt2 * b16;
  929. c0 -= c0_nxt1 * b8;
  930. c1 -= c1_nxt1 * b8;
  931. c0 *= b0;
  932. c1 *= b0;
  933. *(a + 0) = c0;
  934. *(a + 1) = c1;
  935. *(a + 2) = c0_nxt1;
  936. *(a + 3) = c1_nxt1;
  937. *(a + 4) = c0_nxt2;
  938. *(a + 5) = c1_nxt2;
  939. *(a + 6) = c0_nxt3;
  940. *(a + 7) = c1_nxt3;
  941. *(a + 8) = c0_nxt4;
  942. *(a + 9) = c1_nxt4;
  943. *(a + 10) = c0_nxt5;
  944. *(a + 11) = c1_nxt5;
  945. *(a + 12) = c0_nxt6;
  946. *(a + 13) = c1_nxt6;
  947. *(a + 14) = c0_nxt7;
  948. *(a + 15) = c1_nxt7;
  949. *(c + 0) = c0;
  950. *(c + 1) = c1;
  951. *(c + 0 + 1 * ldc) = c0_nxt1;
  952. *(c + 1 + 1 * ldc) = c1_nxt1;
  953. *(c + 0 + 2 * ldc) = c0_nxt2;
  954. *(c + 1 + 2 * ldc) = c1_nxt2;
  955. *(c + 0 + 3 * ldc) = c0_nxt3;
  956. *(c + 1 + 3 * ldc) = c1_nxt3;
  957. *(c + 0 + 4 * ldc) = c0_nxt4;
  958. *(c + 1 + 4 * ldc) = c1_nxt4;
  959. *(c + 0 + 5 * ldc) = c0_nxt5;
  960. *(c + 1 + 5 * ldc) = c1_nxt5;
  961. *(c + 0 + 6 * ldc) = c0_nxt6;
  962. *(c + 1 + 6 * ldc) = c1_nxt6;
  963. *(c + 0 + 7 * ldc) = c0_nxt7;
  964. *(c + 1 + 7 * ldc) = c1_nxt7;
  965. }
  966. static void ssolve_2x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  967. {
  968. BLASLONG k;
  969. FLOAT *aa = a, *bb = b;
  970. FLOAT b0, b4, b5, b8, b9, b10, b12, b13, b14, b15;
  971. FLOAT c0, c1, c0_nxt1, c1_nxt1, c0_nxt2, c1_nxt2, c0_nxt3, c1_nxt3;
  972. c0 = *(c + 0);
  973. c1 = *(c + 1);
  974. c0_nxt1 = *(c + 0 + 1 * ldc);
  975. c1_nxt1 = *(c + 1 + 1 * ldc);
  976. c0_nxt2 = *(c + 0 + 2 * ldc);
  977. c1_nxt2 = *(c + 1 + 2 * ldc);
  978. c0_nxt3 = *(c + 0 + 3 * ldc);
  979. c1_nxt3 = *(c + 1 + 3 * ldc);
  980. for (k = 0; k < bk; k++)
  981. {
  982. c0 -= aa[0] * bb[0];
  983. c1 -= aa[1] * bb[0];
  984. c0_nxt1 -= aa[0] * bb[1];
  985. c1_nxt1 -= aa[1] * bb[1];
  986. c0_nxt2 -= aa[0] * bb[2];
  987. c1_nxt2 -= aa[1] * bb[2];
  988. c0_nxt3 -= aa[0] * bb[3];
  989. c1_nxt3 -= aa[1] * bb[3];
  990. aa += 2;
  991. bb += 4;
  992. }
  993. a -= 8;
  994. b -= 16;
  995. b0 = *b;
  996. b4 = *(b + 4);
  997. b5 = *(b + 5);
  998. b8 = *(b + 8);
  999. b9 = *(b + 9);
  1000. b10 = *(b + 10);
  1001. b12 = *(b + 12);
  1002. b13 = *(b + 13);
  1003. b14 = *(b + 14);
  1004. b15 = *(b + 15);
  1005. c0_nxt3 *= b15;
  1006. c1_nxt3 *= b15;
  1007. c0_nxt2 = (c0_nxt2 - c0_nxt3 * b14) * b10;
  1008. c1_nxt2 = (c1_nxt2 - c1_nxt3 * b14) * b10;
  1009. c0_nxt1 = ((c0_nxt1 - c0_nxt3 * b13) - c0_nxt2 * b9) * b5;
  1010. c1_nxt1 = ((c1_nxt1 - c1_nxt3 * b13) - c1_nxt2 * b9) * b5;
  1011. c0 = (((c0 - c0_nxt3 * b12) - c0_nxt2 * b8) - c0_nxt1 * b4) * b0;
  1012. c1 = (((c1 - c1_nxt3 * b12) - c1_nxt2 * b8) - c1_nxt1 * b4) * b0;
  1013. *(a + 0) = c0;
  1014. *(a + 1) = c1;
  1015. *(a + 2) = c0_nxt1;
  1016. *(a + 3) = c1_nxt1;
  1017. *(a + 4) = c0_nxt2;
  1018. *(a + 5) = c1_nxt2;
  1019. *(a + 6) = c0_nxt3;
  1020. *(a + 7) = c1_nxt3;
  1021. *(c + 0) = c0;
  1022. *(c + 1) = c1;
  1023. *(c + 0 + 1 * ldc) = c0_nxt1;
  1024. *(c + 1 + 1 * ldc) = c1_nxt1;
  1025. *(c + 0 + 2 * ldc) = c0_nxt2;
  1026. *(c + 1 + 2 * ldc) = c1_nxt2;
  1027. *(c + 0 + 3 * ldc) = c0_nxt3;
  1028. *(c + 1 + 3 * ldc) = c1_nxt3;
  1029. }
  1030. static void ssolve_2x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  1031. {
  1032. BLASLONG k;
  1033. FLOAT *aa = a, *bb = b;
  1034. FLOAT b0, b2, b3, c0, c1, c0_nxt, c1_nxt;
  1035. c0 = *(c + 0);
  1036. c1 = *(c + 1);
  1037. c0_nxt = *(c + 0 + ldc);
  1038. c1_nxt = *(c + 1 + ldc);
  1039. for (k = 0; k < bk; k++)
  1040. {
  1041. c0 -= aa[0] * bb[0];
  1042. c1 -= aa[1] * bb[0];
  1043. c0_nxt -= aa[0] * bb[1];
  1044. c1_nxt -= aa[1] * bb[1];
  1045. aa += 2;
  1046. bb += 2;
  1047. }
  1048. a -= 4;
  1049. b -= 4;
  1050. b3 = *(b + 3);
  1051. b2 = *(b + 2);
  1052. b0 = *b;
  1053. c0_nxt *= b3;
  1054. c1_nxt *= b3;
  1055. c0 -= c0_nxt * b2;
  1056. c1 -= c1_nxt * b2;
  1057. c0 *= b0;
  1058. c1 *= b0;
  1059. *(a + 0) = c0;
  1060. *(a + 1) = c1;
  1061. *(a + 2) = c0_nxt;
  1062. *(a + 3) = c1_nxt;
  1063. *(c + 0) = c0;
  1064. *(c + 1) = c1;
  1065. *(c + 0 + ldc) = c0_nxt;
  1066. *(c + 1 + ldc) = c1_nxt;
  1067. }
  1068. static void ssolve_2x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
  1069. {
  1070. BLASLONG k;
  1071. FLOAT *aa = a, *bb = b;
  1072. FLOAT b0, c0, c1;
  1073. c0 = *(c + 0);
  1074. c1 = *(c + 1);
  1075. for (k = 0; k < bk; k++)
  1076. {
  1077. c0 -= aa[0] * bb[0];
  1078. c1 -= aa[1] * bb[0];
  1079. aa += 2;
  1080. bb += 1;
  1081. }
  1082. a -= 2;
  1083. b -= 1;
  1084. b0 = *b;
  1085. c0 *= b0;
  1086. c1 *= b0;
  1087. *(a + 0) = c0;
  1088. *(a + 1) = c1;
  1089. *(c + 0) = c0;
  1090. *(c + 1) = c1;
  1091. }
  1092. static void ssolve_1x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  1093. {
  1094. BLASLONG k;
  1095. FLOAT *aa = a, *bb = b;
  1096. FLOAT b0, b8, b9, b16, b17, b18, b24, b25, b26, b27, b32, b33, b34, b35;
  1097. FLOAT b36, b40, b41, b42, b43, b44, b45, b48, b49, b50, b51, b52, b53, b54;
  1098. FLOAT b56, b57, b58, b59, b60, b61, b62, b63;
  1099. FLOAT c0, c1, c2, c3, c4, c5, c6, c7;
  1100. c0 = *(c + 0);
  1101. c1 = *(c + 1 * ldc);
  1102. c2 = *(c + 2 * ldc);
  1103. c3 = *(c + 3 * ldc);
  1104. c4 = *(c + 4 * ldc);
  1105. c5 = *(c + 5 * ldc);
  1106. c6 = *(c + 6 * ldc);
  1107. c7 = *(c + 7 * ldc);
  1108. for (k = 0; k < bk; k++)
  1109. {
  1110. c0 -= aa[0] * bb[0];
  1111. c1 -= aa[0] * bb[1];
  1112. c2 -= aa[0] * bb[2];
  1113. c3 -= aa[0] * bb[3];
  1114. c4 -= aa[0] * bb[4];
  1115. c5 -= aa[0] * bb[5];
  1116. c6 -= aa[0] * bb[6];
  1117. c7 -= aa[0] * bb[7];
  1118. aa += 1;
  1119. bb += 8;
  1120. }
  1121. a -= 8;
  1122. b -= 64;
  1123. b0 = *(b + 0);
  1124. b8 = *(b + 8);
  1125. b9 = *(b + 9);
  1126. b16 = *(b + 16);
  1127. b17 = *(b + 17);
  1128. b18 = *(b + 18);
  1129. b24 = *(b + 24);
  1130. b25 = *(b + 25);
  1131. b26 = *(b + 26);
  1132. b27 = *(b + 27);
  1133. b32 = *(b + 32);
  1134. b33 = *(b + 33);
  1135. b34 = *(b + 34);
  1136. b35 = *(b + 35);
  1137. b36 = *(b + 36);
  1138. b40 = *(b + 40);
  1139. b41 = *(b + 41);
  1140. b42 = *(b + 42);
  1141. b43 = *(b + 43);
  1142. b44 = *(b + 44);
  1143. b45 = *(b + 45);
  1144. b48 = *(b + 48);
  1145. b49 = *(b + 49);
  1146. b50 = *(b + 50);
  1147. b51 = *(b + 51);
  1148. b52 = *(b + 52);
  1149. b53 = *(b + 53);
  1150. b54 = *(b + 54);
  1151. b56 = *(b + 56);
  1152. b57 = *(b + 57);
  1153. b58 = *(b + 58);
  1154. b59 = *(b + 59);
  1155. b60 = *(b + 60);
  1156. b61 = *(b + 61);
  1157. b62 = *(b + 62);
  1158. b63 = *(b + 63);
  1159. c7 *= b63;
  1160. c6 -= c7 * b62;
  1161. c6 *= b54;
  1162. c5 -= c7 * b61;
  1163. c5 -= c6 * b53;
  1164. c5 *= b45;
  1165. c4 -= c7 * b60;
  1166. c4 -= c6 * b52;
  1167. c4 -= c5 * b44;
  1168. c4 *= b36;
  1169. c3 -= c7 * b59;
  1170. c3 -= c6 * b51;
  1171. c3 -= c5 * b43;
  1172. c3 -= c4 * b35;
  1173. c3 *= b27;
  1174. c2 -= c7 * b58;
  1175. c2 -= c6 * b50;
  1176. c2 -= c5 * b42;
  1177. c2 -= c4 * b34;
  1178. c2 -= c3 * b26;
  1179. c2 *= b18;
  1180. c1 -= c7 * b57;
  1181. c1 -= c6 * b49;
  1182. c1 -= c5 * b41;
  1183. c1 -= c4 * b33;
  1184. c1 -= c3 * b25;
  1185. c1 -= c2 * b17;
  1186. c1 *= b9;
  1187. c0 -= c7 * b56;
  1188. c0 -= c6 * b48;
  1189. c0 -= c5 * b40;
  1190. c0 -= c4 * b32;
  1191. c0 -= c3 * b24;
  1192. c0 -= c2 * b16;
  1193. c0 -= c1 * b8;
  1194. c0 *= b0;
  1195. *(a + 0) = c0;
  1196. *(a + 1) = c1;
  1197. *(a + 2) = c2;
  1198. *(a + 3) = c3;
  1199. *(a + 4) = c4;
  1200. *(a + 5) = c5;
  1201. *(a + 6) = c6;
  1202. *(a + 7) = c7;
  1203. *(c + 0) = c0;
  1204. *(c + 1 * ldc) = c1;
  1205. *(c + 2 * ldc) = c2;
  1206. *(c + 3 * ldc) = c3;
  1207. *(c + 4 * ldc) = c4;
  1208. *(c + 5 * ldc) = c5;
  1209. *(c + 6 * ldc) = c6;
  1210. *(c + 7 * ldc) = c7;
  1211. }
  1212. static void ssolve_1x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  1213. {
  1214. BLASLONG k;
  1215. FLOAT *aa = a, *bb = b;
  1216. FLOAT b0, b4, b5, b8, b9, b10, b12, b13, b14, b15;
  1217. FLOAT c0, c1, c2, c3;
  1218. c0 = *(c + 0);
  1219. c1 = *(c + 1 * ldc);
  1220. c2 = *(c + 2 * ldc);
  1221. c3 = *(c + 3 * ldc);
  1222. for (k = 0; k < bk; k++)
  1223. {
  1224. c0 -= aa[0] * bb[0];
  1225. c1 -= aa[0] * bb[1];
  1226. c2 -= aa[0] * bb[2];
  1227. c3 -= aa[0] * bb[3];
  1228. aa += 1;
  1229. bb += 4;
  1230. }
  1231. a -= 4;
  1232. b -= 16;
  1233. b0 = *b;
  1234. b4 = *(b + 4);
  1235. b5 = *(b + 5);
  1236. b8 = *(b + 8);
  1237. b9 = *(b + 9);
  1238. b10 = *(b + 10);
  1239. b12 = *(b + 12);
  1240. b13 = *(b + 13);
  1241. b14 = *(b + 14);
  1242. b15 = *(b + 15);
  1243. c3 *= b15;
  1244. c2 = (c2 - c3 * b14) * b10;
  1245. c1 = ((c1 - c3 * b13) - c2 * b9) * b5;
  1246. c0 = (((c0 - c3 * b12) - c2 * b8) - c1 * b4) * b0;
  1247. *(a + 0) = c0;
  1248. *(a + 1) = c1;
  1249. *(a + 2) = c2;
  1250. *(a + 3) = c3;
  1251. *(c) = c0;
  1252. *(c + 1 * ldc) = c1;
  1253. *(c + 2 * ldc) = c2;
  1254. *(c + 3 * ldc) = c3;
  1255. }
  1256. static void ssolve_1x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
  1257. {
  1258. BLASLONG k;
  1259. FLOAT *aa = a, *bb = b;
  1260. FLOAT b0, b2, b3, c0, c1;
  1261. c0 = *(c + 0);
  1262. c1 = *(c + ldc);
  1263. for (k = 0; k < bk; k++)
  1264. {
  1265. c0 -= aa[0] * bb[0];
  1266. c1 -= aa[0] * bb[1];
  1267. aa += 1;
  1268. bb += 2;
  1269. }
  1270. a -= 2;
  1271. b -= 4;
  1272. b3 = *(b + 3);
  1273. b2 = *(b + 2);
  1274. b0 = *b;
  1275. c1 *= b3;
  1276. c0 -= c1 * b2;
  1277. c0 *= b0;
  1278. *(a + 0) = c0;
  1279. *(a + 1) = c1;
  1280. *(c + 0) = c0;
  1281. *(c + ldc) = c1;
  1282. }
  1283. static void ssolve_1x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
  1284. {
  1285. BLASLONG k;
  1286. for (k = 0; k < bk; k++)
  1287. {
  1288. *c -= a[k] * b[k];
  1289. }
  1290. *c *= *(a - 1);
  1291. *(b - 1) = *c;
  1292. }
  1293. int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
  1294. FLOAT *c, BLASLONG ldc, BLASLONG offset)
  1295. {
  1296. FLOAT *aa, *cc;
  1297. BLASLONG i, j, kk;
  1298. kk = n - offset;
  1299. c += n * ldc;
  1300. b += n * k;
  1301. if (n & 7)
  1302. {
  1303. if (n & 1)
  1304. {
  1305. aa = a;
  1306. b -= k;
  1307. c -= ldc;
  1308. cc = c;
  1309. for (i = (m >> 3); i--;)
  1310. {
  1311. ssolve_8x1_rt_msa(aa + 8 * kk, b + kk, cc, (k - kk));
  1312. aa += 8 * k;
  1313. cc += 8;
  1314. }
  1315. if (m & 7)
  1316. {
  1317. if (m & 4)
  1318. {
  1319. ssolve_4x1_rt_msa(aa + 4 * kk, b + kk, cc, (k - kk));
  1320. aa += 4 * k;
  1321. cc += 4;
  1322. }
  1323. if (m & 2)
  1324. {
  1325. ssolve_2x1_rt_msa(aa + 2 * kk, b + kk, cc, (k - kk));
  1326. aa += 2 * k;
  1327. cc += 2;
  1328. }
  1329. if (m & 1)
  1330. {
  1331. ssolve_1x1_rt_msa(b + kk, aa + kk, cc, (k - kk));
  1332. aa += k;
  1333. cc += 1;
  1334. }
  1335. }
  1336. kk -= 1;
  1337. }
  1338. if (n & 2)
  1339. {
  1340. aa = a;
  1341. b -= 2 * k;
  1342. c -= 2 * ldc;
  1343. cc = c;
  1344. for (i = (m >> 3); i--;)
  1345. {
  1346. ssolve_8x2_rt_msa(aa + 8 * kk, b + 2 * kk, cc, ldc, (k - kk));
  1347. aa += 8 * k;
  1348. cc += 8;
  1349. }
  1350. if (m & 7)
  1351. {
  1352. if (m & 4)
  1353. {
  1354. ssolve_4x2_rt_msa(aa + 4 * kk, b + 2 * kk, cc, ldc, (k - kk));
  1355. aa += 4 * k;
  1356. cc += 4;
  1357. }
  1358. if (m & 2)
  1359. {
  1360. ssolve_2x2_rt_msa(aa + 2 * kk, b + 2 * kk, cc, ldc, (k - kk));
  1361. aa += 2 * k;
  1362. cc += 2;
  1363. }
  1364. if (m & 1)
  1365. {
  1366. ssolve_1x2_rt_msa(aa + kk, b + 2 * kk, cc, ldc, (k - kk));
  1367. aa += k;
  1368. cc += 1;
  1369. }
  1370. }
  1371. kk -= 2;
  1372. }
  1373. if (n & 4)
  1374. {
  1375. aa = a;
  1376. b -= 4 * k;
  1377. c -= 4 * ldc;
  1378. cc = c;
  1379. for (i = (m >> 3); i--;)
  1380. {
  1381. ssolve_8x4_rt_msa(aa + 8 * kk, b + 4 * kk, cc, ldc, (k - kk));
  1382. aa += 8 * k;
  1383. cc += 8;
  1384. }
  1385. if (m & 7)
  1386. {
  1387. if (m & 4)
  1388. {
  1389. ssolve_4x4_rt_msa(aa + 4 * kk, b + 4 * kk, cc, ldc, (k - kk));
  1390. aa += 4 * k;
  1391. cc += 4;
  1392. }
  1393. if (m & 2)
  1394. {
  1395. ssolve_2x4_rt_msa(aa + 2 * kk, b + 4 * kk, cc, ldc, (k - kk));
  1396. aa += 2 * k;
  1397. cc += 2;
  1398. }
  1399. if (m & 1)
  1400. {
  1401. ssolve_1x4_rt_msa(aa + kk, b + 4 * kk, cc, ldc, (k - kk));
  1402. aa += k;
  1403. cc += 1;
  1404. }
  1405. }
  1406. kk -= 4;
  1407. }
  1408. }
  1409. for (j = (n >> 3); j--;)
  1410. {
  1411. aa = a;
  1412. b -= 8 * k;
  1413. c -= 8 * ldc;
  1414. cc = c;
  1415. for (i = (m >> 3); i--;)
  1416. {
  1417. ssolve_8x8_rt_msa(aa + 8 * kk, b + 8 * kk, cc, ldc, (k - kk));
  1418. aa += 8 * k;
  1419. cc += 8;
  1420. }
  1421. if (m & 7)
  1422. {
  1423. if (m & 4)
  1424. {
  1425. ssolve_4x8_rt_msa(aa + 4 * kk, b + 8 * kk, cc, ldc, (k - kk));
  1426. aa += 4 * k;
  1427. cc += 4;
  1428. }
  1429. if (m & 2)
  1430. {
  1431. ssolve_2x8_rt_msa(aa + 2 * kk, b + 8 * kk, cc, ldc, (k - kk));
  1432. aa += 2 * k;
  1433. cc += 2;
  1434. }
  1435. if (m & 1)
  1436. {
  1437. ssolve_1x8_rt_msa(aa + kk, b + 8 * kk, cc, ldc, (k - kk));
  1438. aa += k;
  1439. cc += 1;
  1440. }
  1441. }
  1442. kk -= 8;
  1443. }
  1444. return 0;
  1445. }