
trsm_kernel_RT_power10.c

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED.  IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT      */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
#include "common.h"

#include <altivec.h>

static FLOAT dm1 = -1.;

#ifdef CONJ
#define GEMM_KERNEL   GEMM_KERNEL_R
#else
#define GEMM_KERNEL   GEMM_KERNEL_N
#endif
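
/* Translate the compile-time unroll factors into shift amounts so the
 * blocking loops below can count full blocks with a shift
 * (m >> GEMM_UNROLL_M_SHIFT) instead of a division.  The
 * non-power-of-two case 6 rounds down to a shift of 2. */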
#if GEMM_DEFAULT_UNROLL_M == 1
#define GEMM_UNROLL_M_SHIFT 0
#endif

#if GEMM_DEFAULT_UNROLL_M == 2
#define GEMM_UNROLL_M_SHIFT 1
#endif

#if GEMM_DEFAULT_UNROLL_M == 4
#define GEMM_UNROLL_M_SHIFT 2
#endif

#if GEMM_DEFAULT_UNROLL_M == 6
#define GEMM_UNROLL_M_SHIFT 2
#endif

#if GEMM_DEFAULT_UNROLL_M == 8
#define GEMM_UNROLL_M_SHIFT 3
#endif

#if GEMM_DEFAULT_UNROLL_M == 16
#define GEMM_UNROLL_M_SHIFT 4
#endif

#if GEMM_DEFAULT_UNROLL_N == 1
#define GEMM_UNROLL_N_SHIFT 0
#endif

#if GEMM_DEFAULT_UNROLL_N == 2
#define GEMM_UNROLL_N_SHIFT 1
#endif

#if GEMM_DEFAULT_UNROLL_N == 4
#define GEMM_UNROLL_N_SHIFT 2
#endif

#if GEMM_DEFAULT_UNROLL_N == 8
#define GEMM_UNROLL_N_SHIFT 3
#endif

#if GEMM_DEFAULT_UNROLL_N == 16
#define GEMM_UNROLL_N_SHIFT 4
#endif

#ifndef COMPLEX
#ifdef DOUBLE
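
/* Double precision: backward substitution of one 8x8 triangular block.
 * Column 7 of the C tile is finished first by scaling with b[63] (the
 * diagonal of b arrives pre-inverted, hence the multiply rather than a
 * divide), written back into the packed A buffer so later GEMM updates
 * reuse the solved values, and then eliminated from columns 0..6 with
 * vec_nmsub on 2-wide double vectors; the same step repeats down to
 * column 0. */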
static inline __attribute__ ((always_inline)) void solve8x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
  FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7;

  c0 = &c[0*ldc];
  c1 = &c[1*ldc];
  c2 = &c[2*ldc];
  c3 = &c[3*ldc];
  c4 = &c[4*ldc];
  c5 = &c[5*ldc];
  c6 = &c[6*ldc];
  c7 = &c[7*ldc];

  vector FLOAT *Vb  = (vector FLOAT *) b;
  vector FLOAT *Vc0 = (vector FLOAT *) c0;
  vector FLOAT *Vc1 = (vector FLOAT *) c1;
  vector FLOAT *Vc2 = (vector FLOAT *) c2;
  vector FLOAT *Vc3 = (vector FLOAT *) c3;
  vector FLOAT *Vc4 = (vector FLOAT *) c4;
  vector FLOAT *Vc5 = (vector FLOAT *) c5;
  vector FLOAT *Vc6 = (vector FLOAT *) c6;
  vector FLOAT *Vc7 = (vector FLOAT *) c7;
  vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6;

  a[56] = (c7[0] *= b[63]);
  a[57] = (c7[1] *= b[63]);
  a[58] = (c7[2] *= b[63]);
  a[59] = (c7[3] *= b[63]);
  a[60] = (c7[4] *= b[63]);
  a[61] = (c7[5] *= b[63]);
  a[62] = (c7[6] *= b[63]);
  a[63] = (c7[7] *= b[63]);
  VbS0 = vec_splat(Vb[28], 0);
  VbS1 = vec_splat(Vb[28], 1);
  VbS2 = vec_splat(Vb[29], 0);
  VbS3 = vec_splat(Vb[29], 1);
  VbS4 = vec_splat(Vb[30], 0);
  VbS5 = vec_splat(Vb[30], 1);
  VbS6 = vec_splat(Vb[31], 0);
  Vc0[0] = vec_nmsub(Vc7[0], VbS0, Vc0[0]);
  Vc0[1] = vec_nmsub(Vc7[1], VbS0, Vc0[1]);
  Vc0[2] = vec_nmsub(Vc7[2], VbS0, Vc0[2]);
  Vc0[3] = vec_nmsub(Vc7[3], VbS0, Vc0[3]);
  Vc1[0] = vec_nmsub(Vc7[0], VbS1, Vc1[0]);
  Vc1[1] = vec_nmsub(Vc7[1], VbS1, Vc1[1]);
  Vc1[2] = vec_nmsub(Vc7[2], VbS1, Vc1[2]);
  Vc1[3] = vec_nmsub(Vc7[3], VbS1, Vc1[3]);
  Vc2[0] = vec_nmsub(Vc7[0], VbS2, Vc2[0]);
  Vc2[1] = vec_nmsub(Vc7[1], VbS2, Vc2[1]);
  Vc2[2] = vec_nmsub(Vc7[2], VbS2, Vc2[2]);
  Vc2[3] = vec_nmsub(Vc7[3], VbS2, Vc2[3]);
  Vc3[0] = vec_nmsub(Vc7[0], VbS3, Vc3[0]);
  Vc3[1] = vec_nmsub(Vc7[1], VbS3, Vc3[1]);
  Vc3[2] = vec_nmsub(Vc7[2], VbS3, Vc3[2]);
  Vc3[3] = vec_nmsub(Vc7[3], VbS3, Vc3[3]);
  Vc4[0] = vec_nmsub(Vc7[0], VbS4, Vc4[0]);
  Vc4[1] = vec_nmsub(Vc7[1], VbS4, Vc4[1]);
  Vc4[2] = vec_nmsub(Vc7[2], VbS4, Vc4[2]);
  Vc4[3] = vec_nmsub(Vc7[3], VbS4, Vc4[3]);
  Vc5[0] = vec_nmsub(Vc7[0], VbS5, Vc5[0]);
  Vc5[1] = vec_nmsub(Vc7[1], VbS5, Vc5[1]);
  Vc5[2] = vec_nmsub(Vc7[2], VbS5, Vc5[2]);
  Vc5[3] = vec_nmsub(Vc7[3], VbS5, Vc5[3]);
  Vc6[0] = vec_nmsub(Vc7[0], VbS6, Vc6[0]);
  Vc6[1] = vec_nmsub(Vc7[1], VbS6, Vc6[1]);
  Vc6[2] = vec_nmsub(Vc7[2], VbS6, Vc6[2]);
  Vc6[3] = vec_nmsub(Vc7[3], VbS6, Vc6[3]);

  a[48] = (c6[0] *= b[54]);
  a[49] = (c6[1] *= b[54]);
  a[50] = (c6[2] *= b[54]);
  a[51] = (c6[3] *= b[54]);
  a[52] = (c6[4] *= b[54]);
  a[53] = (c6[5] *= b[54]);
  a[54] = (c6[6] *= b[54]);
  a[55] = (c6[7] *= b[54]);
  VbS0 = vec_splat(Vb[24], 0);
  VbS1 = vec_splat(Vb[24], 1);
  VbS2 = vec_splat(Vb[25], 0);
  VbS3 = vec_splat(Vb[25], 1);
  VbS4 = vec_splat(Vb[26], 0);
  VbS5 = vec_splat(Vb[26], 1);
  Vc0[0] = vec_nmsub(Vc6[0], VbS0, Vc0[0]);
  Vc0[1] = vec_nmsub(Vc6[1], VbS0, Vc0[1]);
  Vc0[2] = vec_nmsub(Vc6[2], VbS0, Vc0[2]);
  Vc0[3] = vec_nmsub(Vc6[3], VbS0, Vc0[3]);
  Vc1[0] = vec_nmsub(Vc6[0], VbS1, Vc1[0]);
  Vc1[1] = vec_nmsub(Vc6[1], VbS1, Vc1[1]);
  Vc1[2] = vec_nmsub(Vc6[2], VbS1, Vc1[2]);
  Vc1[3] = vec_nmsub(Vc6[3], VbS1, Vc1[3]);
  Vc2[0] = vec_nmsub(Vc6[0], VbS2, Vc2[0]);
  Vc2[1] = vec_nmsub(Vc6[1], VbS2, Vc2[1]);
  Vc2[2] = vec_nmsub(Vc6[2], VbS2, Vc2[2]);
  Vc2[3] = vec_nmsub(Vc6[3], VbS2, Vc2[3]);
  Vc3[0] = vec_nmsub(Vc6[0], VbS3, Vc3[0]);
  Vc3[1] = vec_nmsub(Vc6[1], VbS3, Vc3[1]);
  Vc3[2] = vec_nmsub(Vc6[2], VbS3, Vc3[2]);
  Vc3[3] = vec_nmsub(Vc6[3], VbS3, Vc3[3]);
  Vc4[0] = vec_nmsub(Vc6[0], VbS4, Vc4[0]);
  Vc4[1] = vec_nmsub(Vc6[1], VbS4, Vc4[1]);
  Vc4[2] = vec_nmsub(Vc6[2], VbS4, Vc4[2]);
  Vc4[3] = vec_nmsub(Vc6[3], VbS4, Vc4[3]);
  Vc5[0] = vec_nmsub(Vc6[0], VbS5, Vc5[0]);
  Vc5[1] = vec_nmsub(Vc6[1], VbS5, Vc5[1]);
  Vc5[2] = vec_nmsub(Vc6[2], VbS5, Vc5[2]);
  Vc5[3] = vec_nmsub(Vc6[3], VbS5, Vc5[3]);

  a[40] = (c5[0] *= b[45]);
  a[41] = (c5[1] *= b[45]);
  a[42] = (c5[2] *= b[45]);
  a[43] = (c5[3] *= b[45]);
  a[44] = (c5[4] *= b[45]);
  a[45] = (c5[5] *= b[45]);
  a[46] = (c5[6] *= b[45]);
  a[47] = (c5[7] *= b[45]);
  VbS0 = vec_splat(Vb[20], 0);
  VbS1 = vec_splat(Vb[20], 1);
  VbS2 = vec_splat(Vb[21], 0);
  VbS3 = vec_splat(Vb[21], 1);
  VbS4 = vec_splat(Vb[22], 0);
  Vc0[0] = vec_nmsub(Vc5[0], VbS0, Vc0[0]);
  Vc0[1] = vec_nmsub(Vc5[1], VbS0, Vc0[1]);
  Vc0[2] = vec_nmsub(Vc5[2], VbS0, Vc0[2]);
  Vc0[3] = vec_nmsub(Vc5[3], VbS0, Vc0[3]);
  Vc1[0] = vec_nmsub(Vc5[0], VbS1, Vc1[0]);
  Vc1[1] = vec_nmsub(Vc5[1], VbS1, Vc1[1]);
  Vc1[2] = vec_nmsub(Vc5[2], VbS1, Vc1[2]);
  Vc1[3] = vec_nmsub(Vc5[3], VbS1, Vc1[3]);
  Vc2[0] = vec_nmsub(Vc5[0], VbS2, Vc2[0]);
  Vc2[1] = vec_nmsub(Vc5[1], VbS2, Vc2[1]);
  Vc2[2] = vec_nmsub(Vc5[2], VbS2, Vc2[2]);
  Vc2[3] = vec_nmsub(Vc5[3], VbS2, Vc2[3]);
  Vc3[0] = vec_nmsub(Vc5[0], VbS3, Vc3[0]);
  Vc3[1] = vec_nmsub(Vc5[1], VbS3, Vc3[1]);
  Vc3[2] = vec_nmsub(Vc5[2], VbS3, Vc3[2]);
  Vc3[3] = vec_nmsub(Vc5[3], VbS3, Vc3[3]);
  Vc4[0] = vec_nmsub(Vc5[0], VbS4, Vc4[0]);
  Vc4[1] = vec_nmsub(Vc5[1], VbS4, Vc4[1]);
  Vc4[2] = vec_nmsub(Vc5[2], VbS4, Vc4[2]);
  Vc4[3] = vec_nmsub(Vc5[3], VbS4, Vc4[3]);

  a[32] = (c4[0] *= b[36]);
  a[33] = (c4[1] *= b[36]);
  a[34] = (c4[2] *= b[36]);
  a[35] = (c4[3] *= b[36]);
  a[36] = (c4[4] *= b[36]);
  a[37] = (c4[5] *= b[36]);
  a[38] = (c4[6] *= b[36]);
  a[39] = (c4[7] *= b[36]);
  VbS0 = vec_splat(Vb[16], 0);
  VbS1 = vec_splat(Vb[16], 1);
  VbS2 = vec_splat(Vb[17], 0);
  VbS3 = vec_splat(Vb[17], 1);
  Vc0[0] = vec_nmsub(Vc4[0], VbS0, Vc0[0]);
  Vc0[1] = vec_nmsub(Vc4[1], VbS0, Vc0[1]);
  Vc0[2] = vec_nmsub(Vc4[2], VbS0, Vc0[2]);
  Vc0[3] = vec_nmsub(Vc4[3], VbS0, Vc0[3]);
  Vc1[0] = vec_nmsub(Vc4[0], VbS1, Vc1[0]);
  Vc1[1] = vec_nmsub(Vc4[1], VbS1, Vc1[1]);
  Vc1[2] = vec_nmsub(Vc4[2], VbS1, Vc1[2]);
  Vc1[3] = vec_nmsub(Vc4[3], VbS1, Vc1[3]);
  Vc2[0] = vec_nmsub(Vc4[0], VbS2, Vc2[0]);
  Vc2[1] = vec_nmsub(Vc4[1], VbS2, Vc2[1]);
  Vc2[2] = vec_nmsub(Vc4[2], VbS2, Vc2[2]);
  Vc2[3] = vec_nmsub(Vc4[3], VbS2, Vc2[3]);
  Vc3[0] = vec_nmsub(Vc4[0], VbS3, Vc3[0]);
  Vc3[1] = vec_nmsub(Vc4[1], VbS3, Vc3[1]);
  Vc3[2] = vec_nmsub(Vc4[2], VbS3, Vc3[2]);
  Vc3[3] = vec_nmsub(Vc4[3], VbS3, Vc3[3]);

  a[24] = (c3[0] *= b[27]);
  a[25] = (c3[1] *= b[27]);
  a[26] = (c3[2] *= b[27]);
  a[27] = (c3[3] *= b[27]);
  a[28] = (c3[4] *= b[27]);
  a[29] = (c3[5] *= b[27]);
  a[30] = (c3[6] *= b[27]);
  a[31] = (c3[7] *= b[27]);
  VbS0 = vec_splat(Vb[12], 0);
  VbS1 = vec_splat(Vb[12], 1);
  VbS2 = vec_splat(Vb[13], 0);
  Vc0[0] = vec_nmsub(Vc3[0], VbS0, Vc0[0]);
  Vc0[1] = vec_nmsub(Vc3[1], VbS0, Vc0[1]);
  Vc0[2] = vec_nmsub(Vc3[2], VbS0, Vc0[2]);
  Vc0[3] = vec_nmsub(Vc3[3], VbS0, Vc0[3]);
  Vc1[0] = vec_nmsub(Vc3[0], VbS1, Vc1[0]);
  Vc1[1] = vec_nmsub(Vc3[1], VbS1, Vc1[1]);
  Vc1[2] = vec_nmsub(Vc3[2], VbS1, Vc1[2]);
  Vc1[3] = vec_nmsub(Vc3[3], VbS1, Vc1[3]);
  Vc2[0] = vec_nmsub(Vc3[0], VbS2, Vc2[0]);
  Vc2[1] = vec_nmsub(Vc3[1], VbS2, Vc2[1]);
  Vc2[2] = vec_nmsub(Vc3[2], VbS2, Vc2[2]);
  Vc2[3] = vec_nmsub(Vc3[3], VbS2, Vc2[3]);

  a[16] = (c2[0] *= b[18]);
  a[17] = (c2[1] *= b[18]);
  a[18] = (c2[2] *= b[18]);
  a[19] = (c2[3] *= b[18]);
  a[20] = (c2[4] *= b[18]);
  a[21] = (c2[5] *= b[18]);
  a[22] = (c2[6] *= b[18]);
  a[23] = (c2[7] *= b[18]);
  VbS0 = vec_splat(Vb[8], 0);
  VbS1 = vec_splat(Vb[8], 1);
  Vc0[0] = vec_nmsub(Vc2[0], VbS0, Vc0[0]);
  Vc0[1] = vec_nmsub(Vc2[1], VbS0, Vc0[1]);
  Vc0[2] = vec_nmsub(Vc2[2], VbS0, Vc0[2]);
  Vc0[3] = vec_nmsub(Vc2[3], VbS0, Vc0[3]);
  Vc1[0] = vec_nmsub(Vc2[0], VbS1, Vc1[0]);
  Vc1[1] = vec_nmsub(Vc2[1], VbS1, Vc1[1]);
  Vc1[2] = vec_nmsub(Vc2[2], VbS1, Vc1[2]);
  Vc1[3] = vec_nmsub(Vc2[3], VbS1, Vc1[3]);

  a[ 8] = (c1[0] *= b[9]);
  a[ 9] = (c1[1] *= b[9]);
  a[10] = (c1[2] *= b[9]);
  a[11] = (c1[3] *= b[9]);
  a[12] = (c1[4] *= b[9]);
  a[13] = (c1[5] *= b[9]);
  a[14] = (c1[6] *= b[9]);
  a[15] = (c1[7] *= b[9]);
  VbS0 = vec_splat(Vb[4], 0);
  Vc0[0] = vec_nmsub(Vc1[0], VbS0, Vc0[0]);
  Vc0[1] = vec_nmsub(Vc1[1], VbS0, Vc0[1]);
  Vc0[2] = vec_nmsub(Vc1[2], VbS0, Vc0[2]);
  Vc0[3] = vec_nmsub(Vc1[3], VbS0, Vc0[3]);

  a[0] = (c0[0] *= b[0]);
  a[1] = (c0[1] *= b[0]);
  a[2] = (c0[2] *= b[0]);
  a[3] = (c0[3] *= b[0]);
  a[4] = (c0[4] *= b[0]);
  a[5] = (c0[5] *= b[0]);
  a[6] = (c0[6] *= b[0]);
  a[7] = (c0[7] *= b[0]);
}

#else
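
/* Single precision: the same backward substitution for a 16x8 block.
 * Four 4-wide float vectors cover the 16 rows of each column; each
 * finished column is stored into the packed A buffer (Va), and those
 * stores are reused directly as the multiplicand of the vec_nmsub
 * updates on the remaining columns. */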
static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
  FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7;

  c0 = &c[0*ldc];
  c1 = &c[1*ldc];
  c2 = &c[2*ldc];
  c3 = &c[3*ldc];
  c4 = &c[4*ldc];
  c5 = &c[5*ldc];
  c6 = &c[6*ldc];
  c7 = &c[7*ldc];

  vector FLOAT *Va  = (vector FLOAT *) a;
  vector FLOAT *Vb  = (vector FLOAT *) b;
  vector FLOAT *Vc0 = (vector FLOAT *) c0;
  vector FLOAT *Vc1 = (vector FLOAT *) c1;
  vector FLOAT *Vc2 = (vector FLOAT *) c2;
  vector FLOAT *Vc3 = (vector FLOAT *) c3;
  vector FLOAT *Vc4 = (vector FLOAT *) c4;
  vector FLOAT *Vc5 = (vector FLOAT *) c5;
  vector FLOAT *Vc6 = (vector FLOAT *) c6;
  vector FLOAT *Vc7 = (vector FLOAT *) c7;
  vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7;

  VbS0 = vec_splat(Vb[14], 0);
  VbS1 = vec_splat(Vb[14], 1);
  VbS2 = vec_splat(Vb[14], 2);
  VbS3 = vec_splat(Vb[14], 3);
  VbS4 = vec_splat(Vb[15], 0);
  VbS5 = vec_splat(Vb[15], 1);
  VbS6 = vec_splat(Vb[15], 2);
  VbS7 = vec_splat(Vb[15], 3);
  Vc7[0] = vec_mul(VbS7, Vc7[0]);
  Vc7[1] = vec_mul(VbS7, Vc7[1]);
  Vc7[2] = vec_mul(VbS7, Vc7[2]);
  Vc7[3] = vec_mul(VbS7, Vc7[3]);
  Va[28] = Vc7[0];
  Va[29] = Vc7[1];
  Va[30] = Vc7[2];
  Va[31] = Vc7[3];
  Vc0[0] = vec_nmsub(VbS0, Va[28], Vc0[0]);
  Vc0[1] = vec_nmsub(VbS0, Va[29], Vc0[1]);
  Vc0[2] = vec_nmsub(VbS0, Va[30], Vc0[2]);
  Vc0[3] = vec_nmsub(VbS0, Va[31], Vc0[3]);
  Vc1[0] = vec_nmsub(VbS1, Va[28], Vc1[0]);
  Vc1[1] = vec_nmsub(VbS1, Va[29], Vc1[1]);
  Vc1[2] = vec_nmsub(VbS1, Va[30], Vc1[2]);
  Vc1[3] = vec_nmsub(VbS1, Va[31], Vc1[3]);
  Vc2[0] = vec_nmsub(VbS2, Va[28], Vc2[0]);
  Vc2[1] = vec_nmsub(VbS2, Va[29], Vc2[1]);
  Vc2[2] = vec_nmsub(VbS2, Va[30], Vc2[2]);
  Vc2[3] = vec_nmsub(VbS2, Va[31], Vc2[3]);
  Vc3[0] = vec_nmsub(VbS3, Va[28], Vc3[0]);
  Vc3[1] = vec_nmsub(VbS3, Va[29], Vc3[1]);
  Vc3[2] = vec_nmsub(VbS3, Va[30], Vc3[2]);
  Vc3[3] = vec_nmsub(VbS3, Va[31], Vc3[3]);
  Vc4[0] = vec_nmsub(VbS4, Va[28], Vc4[0]);
  Vc4[1] = vec_nmsub(VbS4, Va[29], Vc4[1]);
  Vc4[2] = vec_nmsub(VbS4, Va[30], Vc4[2]);
  Vc4[3] = vec_nmsub(VbS4, Va[31], Vc4[3]);
  Vc5[0] = vec_nmsub(VbS5, Va[28], Vc5[0]);
  Vc5[1] = vec_nmsub(VbS5, Va[29], Vc5[1]);
  Vc5[2] = vec_nmsub(VbS5, Va[30], Vc5[2]);
  Vc5[3] = vec_nmsub(VbS5, Va[31], Vc5[3]);
  Vc6[0] = vec_nmsub(VbS6, Va[28], Vc6[0]);
  Vc6[1] = vec_nmsub(VbS6, Va[29], Vc6[1]);
  Vc6[2] = vec_nmsub(VbS6, Va[30], Vc6[2]);
  Vc6[3] = vec_nmsub(VbS6, Va[31], Vc6[3]);

  VbS0 = vec_splat(Vb[12], 0);
  VbS1 = vec_splat(Vb[12], 1);
  VbS2 = vec_splat(Vb[12], 2);
  VbS3 = vec_splat(Vb[12], 3);
  VbS4 = vec_splat(Vb[13], 0);
  VbS5 = vec_splat(Vb[13], 1);
  VbS6 = vec_splat(Vb[13], 2);
  Vc6[0] = vec_mul(VbS6, Vc6[0]);
  Vc6[1] = vec_mul(VbS6, Vc6[1]);
  Vc6[2] = vec_mul(VbS6, Vc6[2]);
  Vc6[3] = vec_mul(VbS6, Vc6[3]);
  Va[24] = Vc6[0];
  Va[25] = Vc6[1];
  Va[26] = Vc6[2];
  Va[27] = Vc6[3];
  Vc0[0] = vec_nmsub(VbS0, Va[24], Vc0[0]);
  Vc0[1] = vec_nmsub(VbS0, Va[25], Vc0[1]);
  Vc0[2] = vec_nmsub(VbS0, Va[26], Vc0[2]);
  Vc0[3] = vec_nmsub(VbS0, Va[27], Vc0[3]);
  Vc1[0] = vec_nmsub(VbS1, Va[24], Vc1[0]);
  Vc1[1] = vec_nmsub(VbS1, Va[25], Vc1[1]);
  Vc1[2] = vec_nmsub(VbS1, Va[26], Vc1[2]);
  Vc1[3] = vec_nmsub(VbS1, Va[27], Vc1[3]);
  Vc2[0] = vec_nmsub(VbS2, Va[24], Vc2[0]);
  Vc2[1] = vec_nmsub(VbS2, Va[25], Vc2[1]);
  Vc2[2] = vec_nmsub(VbS2, Va[26], Vc2[2]);
  Vc2[3] = vec_nmsub(VbS2, Va[27], Vc2[3]);
  Vc3[0] = vec_nmsub(VbS3, Va[24], Vc3[0]);
  Vc3[1] = vec_nmsub(VbS3, Va[25], Vc3[1]);
  Vc3[2] = vec_nmsub(VbS3, Va[26], Vc3[2]);
  Vc3[3] = vec_nmsub(VbS3, Va[27], Vc3[3]);
  Vc4[0] = vec_nmsub(VbS4, Va[24], Vc4[0]);
  Vc4[1] = vec_nmsub(VbS4, Va[25], Vc4[1]);
  Vc4[2] = vec_nmsub(VbS4, Va[26], Vc4[2]);
  Vc4[3] = vec_nmsub(VbS4, Va[27], Vc4[3]);
  Vc5[0] = vec_nmsub(VbS5, Va[24], Vc5[0]);
  Vc5[1] = vec_nmsub(VbS5, Va[25], Vc5[1]);
  Vc5[2] = vec_nmsub(VbS5, Va[26], Vc5[2]);
  Vc5[3] = vec_nmsub(VbS5, Va[27], Vc5[3]);

  VbS0 = vec_splat(Vb[10], 0);
  VbS1 = vec_splat(Vb[10], 1);
  VbS2 = vec_splat(Vb[10], 2);
  VbS3 = vec_splat(Vb[10], 3);
  VbS4 = vec_splat(Vb[11], 0);
  VbS5 = vec_splat(Vb[11], 1);
  Vc5[0] = vec_mul(VbS5, Vc5[0]);
  Vc5[1] = vec_mul(VbS5, Vc5[1]);
  Vc5[2] = vec_mul(VbS5, Vc5[2]);
  Vc5[3] = vec_mul(VbS5, Vc5[3]);
  Va[20] = Vc5[0];
  Va[21] = Vc5[1];
  Va[22] = Vc5[2];
  Va[23] = Vc5[3];
  Vc0[0] = vec_nmsub(VbS0, Va[20], Vc0[0]);
  Vc0[1] = vec_nmsub(VbS0, Va[21], Vc0[1]);
  Vc0[2] = vec_nmsub(VbS0, Va[22], Vc0[2]);
  Vc0[3] = vec_nmsub(VbS0, Va[23], Vc0[3]);
  Vc1[0] = vec_nmsub(VbS1, Va[20], Vc1[0]);
  Vc1[1] = vec_nmsub(VbS1, Va[21], Vc1[1]);
  Vc1[2] = vec_nmsub(VbS1, Va[22], Vc1[2]);
  Vc1[3] = vec_nmsub(VbS1, Va[23], Vc1[3]);
  Vc2[0] = vec_nmsub(VbS2, Va[20], Vc2[0]);
  Vc2[1] = vec_nmsub(VbS2, Va[21], Vc2[1]);
  Vc2[2] = vec_nmsub(VbS2, Va[22], Vc2[2]);
  Vc2[3] = vec_nmsub(VbS2, Va[23], Vc2[3]);
  Vc3[0] = vec_nmsub(VbS3, Va[20], Vc3[0]);
  Vc3[1] = vec_nmsub(VbS3, Va[21], Vc3[1]);
  Vc3[2] = vec_nmsub(VbS3, Va[22], Vc3[2]);
  Vc3[3] = vec_nmsub(VbS3, Va[23], Vc3[3]);
  Vc4[0] = vec_nmsub(VbS4, Va[20], Vc4[0]);
  Vc4[1] = vec_nmsub(VbS4, Va[21], Vc4[1]);
  Vc4[2] = vec_nmsub(VbS4, Va[22], Vc4[2]);
  Vc4[3] = vec_nmsub(VbS4, Va[23], Vc4[3]);

  VbS0 = vec_splat(Vb[8], 0);
  VbS1 = vec_splat(Vb[8], 1);
  VbS2 = vec_splat(Vb[8], 2);
  VbS3 = vec_splat(Vb[8], 3);
  VbS4 = vec_splat(Vb[9], 0);
  Vc4[0] = vec_mul(VbS4, Vc4[0]);
  Vc4[1] = vec_mul(VbS4, Vc4[1]);
  Vc4[2] = vec_mul(VbS4, Vc4[2]);
  Vc4[3] = vec_mul(VbS4, Vc4[3]);
  Va[16] = Vc4[0];
  Va[17] = Vc4[1];
  Va[18] = Vc4[2];
  Va[19] = Vc4[3];
  Vc0[0] = vec_nmsub(VbS0, Va[16], Vc0[0]);
  Vc0[1] = vec_nmsub(VbS0, Va[17], Vc0[1]);
  Vc0[2] = vec_nmsub(VbS0, Va[18], Vc0[2]);
  Vc0[3] = vec_nmsub(VbS0, Va[19], Vc0[3]);
  Vc1[0] = vec_nmsub(VbS1, Va[16], Vc1[0]);
  Vc1[1] = vec_nmsub(VbS1, Va[17], Vc1[1]);
  Vc1[2] = vec_nmsub(VbS1, Va[18], Vc1[2]);
  Vc1[3] = vec_nmsub(VbS1, Va[19], Vc1[3]);
  Vc2[0] = vec_nmsub(VbS2, Va[16], Vc2[0]);
  Vc2[1] = vec_nmsub(VbS2, Va[17], Vc2[1]);
  Vc2[2] = vec_nmsub(VbS2, Va[18], Vc2[2]);
  Vc2[3] = vec_nmsub(VbS2, Va[19], Vc2[3]);
  Vc3[0] = vec_nmsub(VbS3, Va[16], Vc3[0]);
  Vc3[1] = vec_nmsub(VbS3, Va[17], Vc3[1]);
  Vc3[2] = vec_nmsub(VbS3, Va[18], Vc3[2]);
  Vc3[3] = vec_nmsub(VbS3, Va[19], Vc3[3]);

  VbS0 = vec_splat(Vb[6], 0);
  VbS1 = vec_splat(Vb[6], 1);
  VbS2 = vec_splat(Vb[6], 2);
  VbS3 = vec_splat(Vb[6], 3);
  Vc3[0] = vec_mul(VbS3, Vc3[0]);
  Vc3[1] = vec_mul(VbS3, Vc3[1]);
  Vc3[2] = vec_mul(VbS3, Vc3[2]);
  Vc3[3] = vec_mul(VbS3, Vc3[3]);
  Va[12] = Vc3[0];
  Va[13] = Vc3[1];
  Va[14] = Vc3[2];
  Va[15] = Vc3[3];
  Vc0[0] = vec_nmsub(VbS0, Va[12], Vc0[0]);
  Vc0[1] = vec_nmsub(VbS0, Va[13], Vc0[1]);
  Vc0[2] = vec_nmsub(VbS0, Va[14], Vc0[2]);
  Vc0[3] = vec_nmsub(VbS0, Va[15], Vc0[3]);
  Vc1[0] = vec_nmsub(VbS1, Va[12], Vc1[0]);
  Vc1[1] = vec_nmsub(VbS1, Va[13], Vc1[1]);
  Vc1[2] = vec_nmsub(VbS1, Va[14], Vc1[2]);
  Vc1[3] = vec_nmsub(VbS1, Va[15], Vc1[3]);
  Vc2[0] = vec_nmsub(VbS2, Va[12], Vc2[0]);
  Vc2[1] = vec_nmsub(VbS2, Va[13], Vc2[1]);
  Vc2[2] = vec_nmsub(VbS2, Va[14], Vc2[2]);
  Vc2[3] = vec_nmsub(VbS2, Va[15], Vc2[3]);

  VbS0 = vec_splat(Vb[4], 0);
  VbS1 = vec_splat(Vb[4], 1);
  VbS2 = vec_splat(Vb[4], 2);
  Vc2[0] = vec_mul(VbS2, Vc2[0]);
  Vc2[1] = vec_mul(VbS2, Vc2[1]);
  Vc2[2] = vec_mul(VbS2, Vc2[2]);
  Vc2[3] = vec_mul(VbS2, Vc2[3]);
  Va[ 8] = Vc2[0];
  Va[ 9] = Vc2[1];
  Va[10] = Vc2[2];
  Va[11] = Vc2[3];
  Vc0[0] = vec_nmsub(VbS0, Va[ 8], Vc0[0]);
  Vc0[1] = vec_nmsub(VbS0, Va[ 9], Vc0[1]);
  Vc0[2] = vec_nmsub(VbS0, Va[10], Vc0[2]);
  Vc0[3] = vec_nmsub(VbS0, Va[11], Vc0[3]);
  Vc1[0] = vec_nmsub(VbS1, Va[ 8], Vc1[0]);
  Vc1[1] = vec_nmsub(VbS1, Va[ 9], Vc1[1]);
  Vc1[2] = vec_nmsub(VbS1, Va[10], Vc1[2]);
  Vc1[3] = vec_nmsub(VbS1, Va[11], Vc1[3]);

  VbS0 = vec_splat(Vb[2], 0);
  VbS1 = vec_splat(Vb[2], 1);
  Vc1[0] = vec_mul(VbS1, Vc1[0]);
  Vc1[1] = vec_mul(VbS1, Vc1[1]);
  Vc1[2] = vec_mul(VbS1, Vc1[2]);
  Vc1[3] = vec_mul(VbS1, Vc1[3]);
  Va[4] = Vc1[0];
  Va[5] = Vc1[1];
  Va[6] = Vc1[2];
  Va[7] = Vc1[3];
  Vc0[0] = vec_nmsub(VbS0, Va[4], Vc0[0]);
  Vc0[1] = vec_nmsub(VbS0, Va[5], Vc0[1]);
  Vc0[2] = vec_nmsub(VbS0, Va[6], Vc0[2]);
  Vc0[3] = vec_nmsub(VbS0, Va[7], Vc0[3]);

  VbS0 = vec_splat(Vb[0], 0);
  Vc0[0] = vec_mul(VbS0, Vc0[0]);
  Vc0[1] = vec_mul(VbS0, Vc0[1]);
  Vc0[2] = vec_mul(VbS0, Vc0[2]);
  Vc0[3] = vec_mul(VbS0, Vc0[3]);
  Va[0] = Vc0[0];
  Va[1] = Vc0[1];
  Va[2] = Vc0[2];
  Va[3] = Vc0[3];
}

#endif
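
/* Scalar fallback: backward substitution of an n x n triangular b
 * (diagonal pre-inverted, so the kernel multiplies rather than divides)
 * against an m x n tile of C, used for blocks that do not match the
 * vector kernels above. */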
static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
  FLOAT aa, bb;
  int i, j, k;

  a += (n - 1) * m;
  b += (n - 1) * n;

  for (i = n - 1; i >= 0; i--) {

    bb = *(b + i);

    for (j = 0; j < m; j ++) {
      aa = *(c + j + i * ldc);
      aa *= bb;
      *a  = aa;
      *(c + j + i * ldc) = aa;
      a ++;

      for (k = 0; k < i; k ++){
        *(c + j + k * ldc) -= aa * *(b + k);
      }

    }
    b -= n;
    a -= 2 * m;
  }
}

#else
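
/* Complex variant of the scalar fallback: the complex multiplies are
 * expanded into real arithmetic, with CONJ selecting the conjugated
 * form of the update. */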
static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
  FLOAT aa1, aa2;
  FLOAT bb1, bb2;
  FLOAT cc1, cc2;
  int i, j, k;

  ldc *= 2;

  a += (n - 1) * m * 2;
  b += (n - 1) * n * 2;

  for (i = n - 1; i >= 0; i--) {

    bb1 = *(b + i * 2 + 0);
    bb2 = *(b + i * 2 + 1);

    for (j = 0; j < m; j ++) {
      aa1 = *(c + j * 2 + 0 + i * ldc);
      aa2 = *(c + j * 2 + 1 + i * ldc);

#ifndef CONJ
      cc1 = aa1 * bb1 - aa2 * bb2;
      cc2 = aa1 * bb2 + aa2 * bb1;
#else
      cc1 =  aa1 * bb1 + aa2 * bb2;
      cc2 = -aa1 * bb2 + aa2 * bb1;
#endif

      *(a + 0) = cc1;
      *(a + 1) = cc2;
      *(c + j * 2 + 0 + i * ldc) = cc1;
      *(c + j * 2 + 1 + i * ldc) = cc2;
      a += 2;

      for (k = 0; k < i; k ++){
#ifndef CONJ
        *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1);
        *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0);
#else
        *(c + j * 2 + 0 + k * ldc) -=  cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1);
        *(c + j * 2 + 1 + k * ldc) -= -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0);
#endif
      }
    }
    b -= n * 2;
    a -= 4 * m;
  }
}

#endif
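
/* Kernel entry point for the right-transposed (RT) case.  Panels of B
 * are consumed from right to left, so the N remainder
 * (n & (GEMM_UNROLL_N - 1)) is handled first, then the full
 * GEMM_UNROLL_N panels.  For each panel, a GEMM update with alpha = -1
 * (dm1) subtracts the contribution of the already-solved part, after
 * which the diagonal block is solved.  The vectorized solve8x8 /
 * solve16x8 paths are taken only when the unroll sizes match the
 * POWER10 defaults and a is 8-byte aligned. */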
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
#ifdef COMPLEX
          FLOAT dummy2,
#endif
          FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){

  BLASLONG i, j;
  FLOAT *aa, *cc;
  BLASLONG kk;

#if 0
  fprintf(stderr, "TRSM RT KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n",
          m, n, k, offset);
#endif

#ifdef DOUBLE
  int well_aligned = (GEMM_UNROLL_M==8) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0);
#else
  int well_aligned = (GEMM_UNROLL_M==16) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0);
#endif

  kk = n - offset;
  c += n * ldc * COMPSIZE;
  b += n * k   * COMPSIZE;

  if (n & (GEMM_UNROLL_N - 1)) {

    j = 1;
    while (j < GEMM_UNROLL_N) {
      if (n & j) {

        aa = a;
        b -= j * k   * COMPSIZE;
        c -= j * ldc * COMPSIZE;
        cc = c;

        i = (m >> GEMM_UNROLL_M_SHIFT);
        if (i > 0) {
          do {
            if (k - kk > 0) {
              GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1,
#ifdef COMPLEX
                          ZERO,
#endif
                          aa + GEMM_UNROLL_M * kk * COMPSIZE,
                          b  + j             * kk * COMPSIZE,
                          cc,
                          ldc);
            }

            solve(GEMM_UNROLL_M, j,
                  aa + (kk - j) * GEMM_UNROLL_M * COMPSIZE,
                  b  + (kk - j) * j             * COMPSIZE,
                  cc, ldc);

            aa += GEMM_UNROLL_M * k * COMPSIZE;
            cc += GEMM_UNROLL_M     * COMPSIZE;
            i --;
          } while (i > 0);
        }

        if (m & (GEMM_UNROLL_M - 1)) {
          i = (GEMM_UNROLL_M >> 1);
          do {
            if (m & i) {
              if (k - kk > 0) {
                GEMM_KERNEL(i, j, k - kk, dm1,
#ifdef COMPLEX
                            ZERO,
#endif
                            aa + i * kk * COMPSIZE,
                            b  + j * kk * COMPSIZE,
                            cc, ldc);
              }

              solve(i, j,
                    aa + (kk - j) * i * COMPSIZE,
                    b  + (kk - j) * j * COMPSIZE,
                    cc, ldc);

              aa += i * k * COMPSIZE;
              cc += i     * COMPSIZE;
            }
            i >>= 1;
          } while (i > 0);
        }

        kk -= j;
      }
      j <<= 1;
    }
  }

  j = (n >> GEMM_UNROLL_N_SHIFT);

  if (j > 0) {
    do {
      aa = a;
      b -= GEMM_UNROLL_N * k   * COMPSIZE;
      c -= GEMM_UNROLL_N * ldc * COMPSIZE;
      cc = c;

      i = (m >> GEMM_UNROLL_M_SHIFT);
      if (i > 0) {
        do {
          if (k - kk > 0) {
            GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1,
#ifdef COMPLEX
                        ZERO,
#endif
                        aa + GEMM_UNROLL_M * kk * COMPSIZE,
                        b  + GEMM_UNROLL_N * kk * COMPSIZE,
                        cc,
                        ldc);
          }

          if (well_aligned) {
#ifdef DOUBLE
            solve8x8(aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE,
                     b  + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, cc, ldc);
#else
            solve16x8(aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE,
                      b  + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, cc, ldc);
#endif
          }
          else {
            solve(GEMM_UNROLL_M, GEMM_UNROLL_N,
                  aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE,
                  b  + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE,
                  cc, ldc);
          }

          aa += GEMM_UNROLL_M * k * COMPSIZE;
          cc += GEMM_UNROLL_M     * COMPSIZE;
          i --;
        } while (i > 0);
      }

      if (m & (GEMM_UNROLL_M - 1)) {
        i = (GEMM_UNROLL_M >> 1);
        do {
          if (m & i) {
            if (k - kk > 0) {
              GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1,
#ifdef COMPLEX
                          ZERO,
#endif
                          aa + i             * kk * COMPSIZE,
                          b  + GEMM_UNROLL_N * kk * COMPSIZE,
                          cc,
                          ldc);
            }

            solve(i, GEMM_UNROLL_N,
                  aa + (kk - GEMM_UNROLL_N) * i             * COMPSIZE,
                  b  + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE,
                  cc, ldc);

            aa += i * k * COMPSIZE;
            cc += i     * COMPSIZE;
          }
          i >>= 1;
        } while (i > 0);
      }

      kk -= GEMM_UNROLL_N;
      j --;
    } while (j > 0);
  }

  return 0;
}