
trsm_kernel_RN_power10.c (26 kB)

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
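
/* TRSM kernel for the right-side, non-transposed (RN) case on POWER10.
   For full tiles with the default unroll sizes, the triangular solve is
   vectorized with Altivec/VSX intrinsics (solve8x8 for real double,
   solve16x8 for real single); edge tiles, unaligned tiles, and the
   complex case fall back to a scalar solve.  GEMM_KERNEL applies the
   accumulated rank-k update with alpha = -1 before each solve. */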
#include "common.h"
#include <altivec.h>

static FLOAT dm1 = -1.;

#ifdef CONJ
#define GEMM_KERNEL GEMM_KERNEL_R
#else
#define GEMM_KERNEL GEMM_KERNEL_N
#endif
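
/* Map the configured unroll factors to shift amounts so tile counts can be
   computed as m >> GEMM_UNROLL_M_SHIFT and n >> GEMM_UNROLL_N_SHIFT.  Note
   the non-power-of-two case GEMM_DEFAULT_UNROLL_M == 6 also maps to a
   shift of 2. */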
#if GEMM_DEFAULT_UNROLL_M == 1
#define GEMM_UNROLL_M_SHIFT 0
#endif

#if GEMM_DEFAULT_UNROLL_M == 2
#define GEMM_UNROLL_M_SHIFT 1
#endif

#if GEMM_DEFAULT_UNROLL_M == 4
#define GEMM_UNROLL_M_SHIFT 2
#endif

#if GEMM_DEFAULT_UNROLL_M == 6
#define GEMM_UNROLL_M_SHIFT 2
#endif

#if GEMM_DEFAULT_UNROLL_M == 8
#define GEMM_UNROLL_M_SHIFT 3
#endif

#if GEMM_DEFAULT_UNROLL_M == 16
#define GEMM_UNROLL_M_SHIFT 4
#endif

#if GEMM_DEFAULT_UNROLL_N == 1
#define GEMM_UNROLL_N_SHIFT 0
#endif

#if GEMM_DEFAULT_UNROLL_N == 2
#define GEMM_UNROLL_N_SHIFT 1
#endif

#if GEMM_DEFAULT_UNROLL_N == 4
#define GEMM_UNROLL_N_SHIFT 2
#endif

#if GEMM_DEFAULT_UNROLL_N == 8
#define GEMM_UNROLL_N_SHIFT 3
#endif

#if GEMM_DEFAULT_UNROLL_N == 16
#define GEMM_UNROLL_N_SHIFT 4
#endif

#ifndef COMPLEX
#ifdef DOUBLE
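
/* Vectorized forward substitution for one aligned 8x8 real double tile.
   b points to the packed 8x8 triangular factor; as is conventional for
   these TRSM kernels, its diagonal entries are assumed to be stored
   pre-inverted by the packing routine, so each diagonal "division" is a
   multiply.  The solved tile is written back to c and also into the
   packed buffer a. */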
static inline __attribute__ ((always_inline)) void solve8x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
  FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7;

  c0 = &c[0*ldc];
  c1 = &c[1*ldc];
  c2 = &c[2*ldc];
  c3 = &c[3*ldc];
  c4 = &c[4*ldc];
  c5 = &c[5*ldc];
  c6 = &c[6*ldc];
  c7 = &c[7*ldc];

  vector FLOAT *Vb  = (vector FLOAT *) b;
  vector FLOAT *Vc0 = (vector FLOAT *) c0;
  vector FLOAT *Vc1 = (vector FLOAT *) c1;
  vector FLOAT *Vc2 = (vector FLOAT *) c2;
  vector FLOAT *Vc3 = (vector FLOAT *) c3;
  vector FLOAT *Vc4 = (vector FLOAT *) c4;
  vector FLOAT *Vc5 = (vector FLOAT *) c5;
  vector FLOAT *Vc6 = (vector FLOAT *) c6;
  vector FLOAT *Vc7 = (vector FLOAT *) c7;
  vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6;

  /* Column 0: scale by the diagonal entry b[0], store into the packed buffer. */
  a[0] = (c0[0] *= b[0]);
  a[1] = (c0[1] *= b[0]);
  a[2] = (c0[2] *= b[0]);
  a[3] = (c0[3] *= b[0]);
  a[4] = (c0[4] *= b[0]);
  a[5] = (c0[5] *= b[0]);
  a[6] = (c0[6] *= b[0]);
  a[7] = (c0[7] *= b[0]);

  /* Eliminate column 0 from columns 1..7. */
  VbS0 = vec_splat(Vb[0], 1);
  VbS1 = vec_splat(Vb[1], 0);
  VbS2 = vec_splat(Vb[1], 1);
  VbS3 = vec_splat(Vb[2], 0);
  VbS4 = vec_splat(Vb[2], 1);
  VbS5 = vec_splat(Vb[3], 0);
  VbS6 = vec_splat(Vb[3], 1);

  Vc1[0] = vec_nmsub(Vc0[0], VbS0, Vc1[0]);
  Vc1[1] = vec_nmsub(Vc0[1], VbS0, Vc1[1]);
  Vc1[2] = vec_nmsub(Vc0[2], VbS0, Vc1[2]);
  Vc1[3] = vec_nmsub(Vc0[3], VbS0, Vc1[3]);
  Vc2[0] = vec_nmsub(Vc0[0], VbS1, Vc2[0]);
  Vc2[1] = vec_nmsub(Vc0[1], VbS1, Vc2[1]);
  Vc2[2] = vec_nmsub(Vc0[2], VbS1, Vc2[2]);
  Vc2[3] = vec_nmsub(Vc0[3], VbS1, Vc2[3]);
  Vc3[0] = vec_nmsub(Vc0[0], VbS2, Vc3[0]);
  Vc3[1] = vec_nmsub(Vc0[1], VbS2, Vc3[1]);
  Vc3[2] = vec_nmsub(Vc0[2], VbS2, Vc3[2]);
  Vc3[3] = vec_nmsub(Vc0[3], VbS2, Vc3[3]);
  Vc4[0] = vec_nmsub(Vc0[0], VbS3, Vc4[0]);
  Vc4[1] = vec_nmsub(Vc0[1], VbS3, Vc4[1]);
  Vc4[2] = vec_nmsub(Vc0[2], VbS3, Vc4[2]);
  Vc4[3] = vec_nmsub(Vc0[3], VbS3, Vc4[3]);
  Vc5[0] = vec_nmsub(Vc0[0], VbS4, Vc5[0]);
  Vc5[1] = vec_nmsub(Vc0[1], VbS4, Vc5[1]);
  Vc5[2] = vec_nmsub(Vc0[2], VbS4, Vc5[2]);
  Vc5[3] = vec_nmsub(Vc0[3], VbS4, Vc5[3]);
  Vc6[0] = vec_nmsub(Vc0[0], VbS5, Vc6[0]);
  Vc6[1] = vec_nmsub(Vc0[1], VbS5, Vc6[1]);
  Vc6[2] = vec_nmsub(Vc0[2], VbS5, Vc6[2]);
  Vc6[3] = vec_nmsub(Vc0[3], VbS5, Vc6[3]);
  Vc7[0] = vec_nmsub(Vc0[0], VbS6, Vc7[0]);
  Vc7[1] = vec_nmsub(Vc0[1], VbS6, Vc7[1]);
  Vc7[2] = vec_nmsub(Vc0[2], VbS6, Vc7[2]);
  Vc7[3] = vec_nmsub(Vc0[3], VbS6, Vc7[3]);

  /* Column 1: scale by the diagonal entry b[9], then update columns 2..7. */
  a[ 8] = (c1[0] *= b[9]);
  a[ 9] = (c1[1] *= b[9]);
  a[10] = (c1[2] *= b[9]);
  a[11] = (c1[3] *= b[9]);
  a[12] = (c1[4] *= b[9]);
  a[13] = (c1[5] *= b[9]);
  a[14] = (c1[6] *= b[9]);
  a[15] = (c1[7] *= b[9]);

  VbS0 = vec_splat(Vb[5], 0);
  VbS1 = vec_splat(Vb[5], 1);
  VbS2 = vec_splat(Vb[6], 0);
  VbS3 = vec_splat(Vb[6], 1);
  VbS4 = vec_splat(Vb[7], 0);
  VbS5 = vec_splat(Vb[7], 1);

  Vc2[0] = vec_nmsub(Vc1[0], VbS0, Vc2[0]);
  Vc2[1] = vec_nmsub(Vc1[1], VbS0, Vc2[1]);
  Vc2[2] = vec_nmsub(Vc1[2], VbS0, Vc2[2]);
  Vc2[3] = vec_nmsub(Vc1[3], VbS0, Vc2[3]);
  Vc3[0] = vec_nmsub(Vc1[0], VbS1, Vc3[0]);
  Vc3[1] = vec_nmsub(Vc1[1], VbS1, Vc3[1]);
  Vc3[2] = vec_nmsub(Vc1[2], VbS1, Vc3[2]);
  Vc3[3] = vec_nmsub(Vc1[3], VbS1, Vc3[3]);
  Vc4[0] = vec_nmsub(Vc1[0], VbS2, Vc4[0]);
  Vc4[1] = vec_nmsub(Vc1[1], VbS2, Vc4[1]);
  Vc4[2] = vec_nmsub(Vc1[2], VbS2, Vc4[2]);
  Vc4[3] = vec_nmsub(Vc1[3], VbS2, Vc4[3]);
  Vc5[0] = vec_nmsub(Vc1[0], VbS3, Vc5[0]);
  Vc5[1] = vec_nmsub(Vc1[1], VbS3, Vc5[1]);
  Vc5[2] = vec_nmsub(Vc1[2], VbS3, Vc5[2]);
  Vc5[3] = vec_nmsub(Vc1[3], VbS3, Vc5[3]);
  Vc6[0] = vec_nmsub(Vc1[0], VbS4, Vc6[0]);
  Vc6[1] = vec_nmsub(Vc1[1], VbS4, Vc6[1]);
  Vc6[2] = vec_nmsub(Vc1[2], VbS4, Vc6[2]);
  Vc6[3] = vec_nmsub(Vc1[3], VbS4, Vc6[3]);
  Vc7[0] = vec_nmsub(Vc1[0], VbS5, Vc7[0]);
  Vc7[1] = vec_nmsub(Vc1[1], VbS5, Vc7[1]);
  Vc7[2] = vec_nmsub(Vc1[2], VbS5, Vc7[2]);
  Vc7[3] = vec_nmsub(Vc1[3], VbS5, Vc7[3]);

  /* Column 2: scale by b[18], update columns 3..7. */
  a[16] = (c2[0] *= b[18]);
  a[17] = (c2[1] *= b[18]);
  a[18] = (c2[2] *= b[18]);
  a[19] = (c2[3] *= b[18]);
  a[20] = (c2[4] *= b[18]);
  a[21] = (c2[5] *= b[18]);
  a[22] = (c2[6] *= b[18]);
  a[23] = (c2[7] *= b[18]);

  VbS0 = vec_splat(Vb[ 9], 1);
  VbS1 = vec_splat(Vb[10], 0);
  VbS2 = vec_splat(Vb[10], 1);
  VbS3 = vec_splat(Vb[11], 0);
  VbS4 = vec_splat(Vb[11], 1);

  Vc3[0] = vec_nmsub(Vc2[0], VbS0, Vc3[0]);
  Vc3[1] = vec_nmsub(Vc2[1], VbS0, Vc3[1]);
  Vc3[2] = vec_nmsub(Vc2[2], VbS0, Vc3[2]);
  Vc3[3] = vec_nmsub(Vc2[3], VbS0, Vc3[3]);
  Vc4[0] = vec_nmsub(Vc2[0], VbS1, Vc4[0]);
  Vc4[1] = vec_nmsub(Vc2[1], VbS1, Vc4[1]);
  Vc4[2] = vec_nmsub(Vc2[2], VbS1, Vc4[2]);
  Vc4[3] = vec_nmsub(Vc2[3], VbS1, Vc4[3]);
  Vc5[0] = vec_nmsub(Vc2[0], VbS2, Vc5[0]);
  Vc5[1] = vec_nmsub(Vc2[1], VbS2, Vc5[1]);
  Vc5[2] = vec_nmsub(Vc2[2], VbS2, Vc5[2]);
  Vc5[3] = vec_nmsub(Vc2[3], VbS2, Vc5[3]);
  Vc6[0] = vec_nmsub(Vc2[0], VbS3, Vc6[0]);
  Vc6[1] = vec_nmsub(Vc2[1], VbS3, Vc6[1]);
  Vc6[2] = vec_nmsub(Vc2[2], VbS3, Vc6[2]);
  Vc6[3] = vec_nmsub(Vc2[3], VbS3, Vc6[3]);
  Vc7[0] = vec_nmsub(Vc2[0], VbS4, Vc7[0]);
  Vc7[1] = vec_nmsub(Vc2[1], VbS4, Vc7[1]);
  Vc7[2] = vec_nmsub(Vc2[2], VbS4, Vc7[2]);
  Vc7[3] = vec_nmsub(Vc2[3], VbS4, Vc7[3]);

  /* Column 3: scale by b[27], update columns 4..7. */
  a[24] = (c3[0] *= b[27]);
  a[25] = (c3[1] *= b[27]);
  a[26] = (c3[2] *= b[27]);
  a[27] = (c3[3] *= b[27]);
  a[28] = (c3[4] *= b[27]);
  a[29] = (c3[5] *= b[27]);
  a[30] = (c3[6] *= b[27]);
  a[31] = (c3[7] *= b[27]);

  VbS0 = vec_splat(Vb[14], 0);
  VbS1 = vec_splat(Vb[14], 1);
  VbS2 = vec_splat(Vb[15], 0);
  VbS3 = vec_splat(Vb[15], 1);

  Vc4[0] = vec_nmsub(Vc3[0], VbS0, Vc4[0]);
  Vc4[1] = vec_nmsub(Vc3[1], VbS0, Vc4[1]);
  Vc4[2] = vec_nmsub(Vc3[2], VbS0, Vc4[2]);
  Vc4[3] = vec_nmsub(Vc3[3], VbS0, Vc4[3]);
  Vc5[0] = vec_nmsub(Vc3[0], VbS1, Vc5[0]);
  Vc5[1] = vec_nmsub(Vc3[1], VbS1, Vc5[1]);
  Vc5[2] = vec_nmsub(Vc3[2], VbS1, Vc5[2]);
  Vc5[3] = vec_nmsub(Vc3[3], VbS1, Vc5[3]);
  Vc6[0] = vec_nmsub(Vc3[0], VbS2, Vc6[0]);
  Vc6[1] = vec_nmsub(Vc3[1], VbS2, Vc6[1]);
  Vc6[2] = vec_nmsub(Vc3[2], VbS2, Vc6[2]);
  Vc6[3] = vec_nmsub(Vc3[3], VbS2, Vc6[3]);
  Vc7[0] = vec_nmsub(Vc3[0], VbS3, Vc7[0]);
  Vc7[1] = vec_nmsub(Vc3[1], VbS3, Vc7[1]);
  Vc7[2] = vec_nmsub(Vc3[2], VbS3, Vc7[2]);
  Vc7[3] = vec_nmsub(Vc3[3], VbS3, Vc7[3]);

  /* Column 4: scale by b[36], update columns 5..7. */
  a[32] = (c4[0] *= b[36]);
  a[33] = (c4[1] *= b[36]);
  a[34] = (c4[2] *= b[36]);
  a[35] = (c4[3] *= b[36]);
  a[36] = (c4[4] *= b[36]);
  a[37] = (c4[5] *= b[36]);
  a[38] = (c4[6] *= b[36]);
  a[39] = (c4[7] *= b[36]);

  VbS0 = vec_splat(Vb[18], 1);
  VbS1 = vec_splat(Vb[19], 0);
  VbS2 = vec_splat(Vb[19], 1);

  Vc5[0] = vec_nmsub(Vc4[0], VbS0, Vc5[0]);
  Vc5[1] = vec_nmsub(Vc4[1], VbS0, Vc5[1]);
  Vc5[2] = vec_nmsub(Vc4[2], VbS0, Vc5[2]);
  Vc5[3] = vec_nmsub(Vc4[3], VbS0, Vc5[3]);
  Vc6[0] = vec_nmsub(Vc4[0], VbS1, Vc6[0]);
  Vc6[1] = vec_nmsub(Vc4[1], VbS1, Vc6[1]);
  Vc6[2] = vec_nmsub(Vc4[2], VbS1, Vc6[2]);
  Vc6[3] = vec_nmsub(Vc4[3], VbS1, Vc6[3]);
  Vc7[0] = vec_nmsub(Vc4[0], VbS2, Vc7[0]);
  Vc7[1] = vec_nmsub(Vc4[1], VbS2, Vc7[1]);
  Vc7[2] = vec_nmsub(Vc4[2], VbS2, Vc7[2]);
  Vc7[3] = vec_nmsub(Vc4[3], VbS2, Vc7[3]);

  /* Column 5: scale by b[45], update columns 6..7. */
  a[40] = (c5[0] *= b[45]);
  a[41] = (c5[1] *= b[45]);
  a[42] = (c5[2] *= b[45]);
  a[43] = (c5[3] *= b[45]);
  a[44] = (c5[4] *= b[45]);
  a[45] = (c5[5] *= b[45]);
  a[46] = (c5[6] *= b[45]);
  a[47] = (c5[7] *= b[45]);

  VbS0 = vec_splat(Vb[23], 0);
  VbS1 = vec_splat(Vb[23], 1);

  Vc6[0] = vec_nmsub(Vc5[0], VbS0, Vc6[0]);
  Vc6[1] = vec_nmsub(Vc5[1], VbS0, Vc6[1]);
  Vc6[2] = vec_nmsub(Vc5[2], VbS0, Vc6[2]);
  Vc6[3] = vec_nmsub(Vc5[3], VbS0, Vc6[3]);
  Vc7[0] = vec_nmsub(Vc5[0], VbS1, Vc7[0]);
  Vc7[1] = vec_nmsub(Vc5[1], VbS1, Vc7[1]);
  Vc7[2] = vec_nmsub(Vc5[2], VbS1, Vc7[2]);
  Vc7[3] = vec_nmsub(Vc5[3], VbS1, Vc7[3]);

  /* Column 6: scale by b[54], update column 7. */
  a[48] = (c6[0] *= b[54]);
  a[49] = (c6[1] *= b[54]);
  a[50] = (c6[2] *= b[54]);
  a[51] = (c6[3] *= b[54]);
  a[52] = (c6[4] *= b[54]);
  a[53] = (c6[5] *= b[54]);
  a[54] = (c6[6] *= b[54]);
  a[55] = (c6[7] *= b[54]);

  VbS0 = vec_splat(Vb[27], 1);

  Vc7[0] = vec_nmsub(Vc6[0], VbS0, Vc7[0]);
  Vc7[1] = vec_nmsub(Vc6[1], VbS0, Vc7[1]);
  Vc7[2] = vec_nmsub(Vc6[2], VbS0, Vc7[2]);
  Vc7[3] = vec_nmsub(Vc6[3], VbS0, Vc7[3]);

  /* Column 7: scale by the last diagonal entry b[63]. */
  a[56] = (c7[0] *= b[63]);
  a[57] = (c7[1] *= b[63]);
  a[58] = (c7[2] *= b[63]);
  a[59] = (c7[3] *= b[63]);
  a[60] = (c7[4] *= b[63]);
  a[61] = (c7[5] *= b[63]);
  a[62] = (c7[6] *= b[63]);
  a[63] = (c7[7] *= b[63]);
}

#else
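
/* Single-precision counterpart: one aligned 16x8 real tile.  Each column of
   c spans four 4-float vectors; the diagonal scaling uses vec_mul with a
   splatted (pre-inverted) diagonal element, and the solved column is copied
   into the packed buffer through the vector alias Va. */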
static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
  FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7;

  c0 = &c[0*ldc];
  c1 = &c[1*ldc];
  c2 = &c[2*ldc];
  c3 = &c[3*ldc];
  c4 = &c[4*ldc];
  c5 = &c[5*ldc];
  c6 = &c[6*ldc];
  c7 = &c[7*ldc];

  vector FLOAT *Va  = (vector FLOAT *) a;
  vector FLOAT *Vb  = (vector FLOAT *) b;
  vector FLOAT *Vc0 = (vector FLOAT *) c0;
  vector FLOAT *Vc1 = (vector FLOAT *) c1;
  vector FLOAT *Vc2 = (vector FLOAT *) c2;
  vector FLOAT *Vc3 = (vector FLOAT *) c3;
  vector FLOAT *Vc4 = (vector FLOAT *) c4;
  vector FLOAT *Vc5 = (vector FLOAT *) c5;
  vector FLOAT *Vc6 = (vector FLOAT *) c6;
  vector FLOAT *Vc7 = (vector FLOAT *) c7;
  vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7;

  /* Column 0: splat row 0 of b, scale column 0 by the diagonal (VbS0),
     copy it into the packed buffer, then eliminate it from columns 1..7. */
  VbS0 = vec_splat(Vb[0], 0);
  VbS1 = vec_splat(Vb[0], 1);
  VbS2 = vec_splat(Vb[0], 2);
  VbS3 = vec_splat(Vb[0], 3);
  VbS4 = vec_splat(Vb[1], 0);
  VbS5 = vec_splat(Vb[1], 1);
  VbS6 = vec_splat(Vb[1], 2);
  VbS7 = vec_splat(Vb[1], 3);

  Vc0[0] = vec_mul(VbS0, Vc0[0]);
  Vc0[1] = vec_mul(VbS0, Vc0[1]);
  Vc0[2] = vec_mul(VbS0, Vc0[2]);
  Vc0[3] = vec_mul(VbS0, Vc0[3]);
  Va[0] = Vc0[0];
  Va[1] = Vc0[1];
  Va[2] = Vc0[2];
  Va[3] = Vc0[3];
  Vc1[0] = vec_nmsub(VbS1, Va[0], Vc1[0]);
  Vc1[1] = vec_nmsub(VbS1, Va[1], Vc1[1]);
  Vc1[2] = vec_nmsub(VbS1, Va[2], Vc1[2]);
  Vc1[3] = vec_nmsub(VbS1, Va[3], Vc1[3]);
  Vc2[0] = vec_nmsub(VbS2, Va[0], Vc2[0]);
  Vc2[1] = vec_nmsub(VbS2, Va[1], Vc2[1]);
  Vc2[2] = vec_nmsub(VbS2, Va[2], Vc2[2]);
  Vc2[3] = vec_nmsub(VbS2, Va[3], Vc2[3]);
  Vc3[0] = vec_nmsub(VbS3, Va[0], Vc3[0]);
  Vc3[1] = vec_nmsub(VbS3, Va[1], Vc3[1]);
  Vc3[2] = vec_nmsub(VbS3, Va[2], Vc3[2]);
  Vc3[3] = vec_nmsub(VbS3, Va[3], Vc3[3]);
  Vc4[0] = vec_nmsub(VbS4, Va[0], Vc4[0]);
  Vc4[1] = vec_nmsub(VbS4, Va[1], Vc4[1]);
  Vc4[2] = vec_nmsub(VbS4, Va[2], Vc4[2]);
  Vc4[3] = vec_nmsub(VbS4, Va[3], Vc4[3]);
  Vc5[0] = vec_nmsub(VbS5, Va[0], Vc5[0]);
  Vc5[1] = vec_nmsub(VbS5, Va[1], Vc5[1]);
  Vc5[2] = vec_nmsub(VbS5, Va[2], Vc5[2]);
  Vc5[3] = vec_nmsub(VbS5, Va[3], Vc5[3]);
  Vc6[0] = vec_nmsub(VbS6, Va[0], Vc6[0]);
  Vc6[1] = vec_nmsub(VbS6, Va[1], Vc6[1]);
  Vc6[2] = vec_nmsub(VbS6, Va[2], Vc6[2]);
  Vc6[3] = vec_nmsub(VbS6, Va[3], Vc6[3]);
  Vc7[0] = vec_nmsub(VbS7, Va[0], Vc7[0]);
  Vc7[1] = vec_nmsub(VbS7, Va[1], Vc7[1]);
  Vc7[2] = vec_nmsub(VbS7, Va[2], Vc7[2]);
  Vc7[3] = vec_nmsub(VbS7, Va[3], Vc7[3]);

  /* Column 1: scale by the diagonal (row 1 of b), update columns 2..7. */
  VbS0 = vec_splat(Vb[2], 1);
  VbS1 = vec_splat(Vb[2], 2);
  VbS2 = vec_splat(Vb[2], 3);
  VbS3 = vec_splat(Vb[3], 0);
  VbS4 = vec_splat(Vb[3], 1);
  VbS5 = vec_splat(Vb[3], 2);
  VbS6 = vec_splat(Vb[3], 3);

  Vc1[0] = vec_mul(VbS0, Vc1[0]);
  Vc1[1] = vec_mul(VbS0, Vc1[1]);
  Vc1[2] = vec_mul(VbS0, Vc1[2]);
  Vc1[3] = vec_mul(VbS0, Vc1[3]);
  Va[4] = Vc1[0];
  Va[5] = Vc1[1];
  Va[6] = Vc1[2];
  Va[7] = Vc1[3];
  Vc2[0] = vec_nmsub(VbS1, Va[4], Vc2[0]);
  Vc2[1] = vec_nmsub(VbS1, Va[5], Vc2[1]);
  Vc2[2] = vec_nmsub(VbS1, Va[6], Vc2[2]);
  Vc2[3] = vec_nmsub(VbS1, Va[7], Vc2[3]);
  Vc3[0] = vec_nmsub(VbS2, Va[4], Vc3[0]);
  Vc3[1] = vec_nmsub(VbS2, Va[5], Vc3[1]);
  Vc3[2] = vec_nmsub(VbS2, Va[6], Vc3[2]);
  Vc3[3] = vec_nmsub(VbS2, Va[7], Vc3[3]);
  Vc4[0] = vec_nmsub(VbS3, Va[4], Vc4[0]);
  Vc4[1] = vec_nmsub(VbS3, Va[5], Vc4[1]);
  Vc4[2] = vec_nmsub(VbS3, Va[6], Vc4[2]);
  Vc4[3] = vec_nmsub(VbS3, Va[7], Vc4[3]);
  Vc5[0] = vec_nmsub(VbS4, Va[4], Vc5[0]);
  Vc5[1] = vec_nmsub(VbS4, Va[5], Vc5[1]);
  Vc5[2] = vec_nmsub(VbS4, Va[6], Vc5[2]);
  Vc5[3] = vec_nmsub(VbS4, Va[7], Vc5[3]);
  Vc6[0] = vec_nmsub(VbS5, Va[4], Vc6[0]);
  Vc6[1] = vec_nmsub(VbS5, Va[5], Vc6[1]);
  Vc6[2] = vec_nmsub(VbS5, Va[6], Vc6[2]);
  Vc6[3] = vec_nmsub(VbS5, Va[7], Vc6[3]);
  Vc7[0] = vec_nmsub(VbS6, Va[4], Vc7[0]);
  Vc7[1] = vec_nmsub(VbS6, Va[5], Vc7[1]);
  Vc7[2] = vec_nmsub(VbS6, Va[6], Vc7[2]);
  Vc7[3] = vec_nmsub(VbS6, Va[7], Vc7[3]);

  /* Column 2: scale by the diagonal (row 2 of b), update columns 3..7. */
  VbS0 = vec_splat(Vb[4], 2);
  VbS1 = vec_splat(Vb[4], 3);
  VbS2 = vec_splat(Vb[5], 0);
  VbS3 = vec_splat(Vb[5], 1);
  VbS4 = vec_splat(Vb[5], 2);
  VbS5 = vec_splat(Vb[5], 3);

  Vc2[0] = vec_mul(VbS0, Vc2[0]);
  Vc2[1] = vec_mul(VbS0, Vc2[1]);
  Vc2[2] = vec_mul(VbS0, Vc2[2]);
  Vc2[3] = vec_mul(VbS0, Vc2[3]);
  Va[ 8] = Vc2[0];
  Va[ 9] = Vc2[1];
  Va[10] = Vc2[2];
  Va[11] = Vc2[3];
  Vc3[0] = vec_nmsub(VbS1, Va[ 8], Vc3[0]);
  Vc3[1] = vec_nmsub(VbS1, Va[ 9], Vc3[1]);
  Vc3[2] = vec_nmsub(VbS1, Va[10], Vc3[2]);
  Vc3[3] = vec_nmsub(VbS1, Va[11], Vc3[3]);
  Vc4[0] = vec_nmsub(VbS2, Va[ 8], Vc4[0]);
  Vc4[1] = vec_nmsub(VbS2, Va[ 9], Vc4[1]);
  Vc4[2] = vec_nmsub(VbS2, Va[10], Vc4[2]);
  Vc4[3] = vec_nmsub(VbS2, Va[11], Vc4[3]);
  Vc5[0] = vec_nmsub(VbS3, Va[ 8], Vc5[0]);
  Vc5[1] = vec_nmsub(VbS3, Va[ 9], Vc5[1]);
  Vc5[2] = vec_nmsub(VbS3, Va[10], Vc5[2]);
  Vc5[3] = vec_nmsub(VbS3, Va[11], Vc5[3]);
  Vc6[0] = vec_nmsub(VbS4, Va[ 8], Vc6[0]);
  Vc6[1] = vec_nmsub(VbS4, Va[ 9], Vc6[1]);
  Vc6[2] = vec_nmsub(VbS4, Va[10], Vc6[2]);
  Vc6[3] = vec_nmsub(VbS4, Va[11], Vc6[3]);
  Vc7[0] = vec_nmsub(VbS5, Va[ 8], Vc7[0]);
  Vc7[1] = vec_nmsub(VbS5, Va[ 9], Vc7[1]);
  Vc7[2] = vec_nmsub(VbS5, Va[10], Vc7[2]);
  Vc7[3] = vec_nmsub(VbS5, Va[11], Vc7[3]);

  /* Column 3: scale by the diagonal (row 3 of b), update columns 4..7. */
  VbS0 = vec_splat(Vb[6], 3);
  VbS1 = vec_splat(Vb[7], 0);
  VbS2 = vec_splat(Vb[7], 1);
  VbS3 = vec_splat(Vb[7], 2);
  VbS4 = vec_splat(Vb[7], 3);

  Vc3[0] = vec_mul(VbS0, Vc3[0]);
  Vc3[1] = vec_mul(VbS0, Vc3[1]);
  Vc3[2] = vec_mul(VbS0, Vc3[2]);
  Vc3[3] = vec_mul(VbS0, Vc3[3]);
  Va[12] = Vc3[0];
  Va[13] = Vc3[1];
  Va[14] = Vc3[2];
  Va[15] = Vc3[3];
  Vc4[0] = vec_nmsub(VbS1, Va[12], Vc4[0]);
  Vc4[1] = vec_nmsub(VbS1, Va[13], Vc4[1]);
  Vc4[2] = vec_nmsub(VbS1, Va[14], Vc4[2]);
  Vc4[3] = vec_nmsub(VbS1, Va[15], Vc4[3]);
  Vc5[0] = vec_nmsub(VbS2, Va[12], Vc5[0]);
  Vc5[1] = vec_nmsub(VbS2, Va[13], Vc5[1]);
  Vc5[2] = vec_nmsub(VbS2, Va[14], Vc5[2]);
  Vc5[3] = vec_nmsub(VbS2, Va[15], Vc5[3]);
  Vc6[0] = vec_nmsub(VbS3, Va[12], Vc6[0]);
  Vc6[1] = vec_nmsub(VbS3, Va[13], Vc6[1]);
  Vc6[2] = vec_nmsub(VbS3, Va[14], Vc6[2]);
  Vc6[3] = vec_nmsub(VbS3, Va[15], Vc6[3]);
  Vc7[0] = vec_nmsub(VbS4, Va[12], Vc7[0]);
  Vc7[1] = vec_nmsub(VbS4, Va[13], Vc7[1]);
  Vc7[2] = vec_nmsub(VbS4, Va[14], Vc7[2]);
  Vc7[3] = vec_nmsub(VbS4, Va[15], Vc7[3]);

  /* Column 4: scale by the diagonal (row 4 of b), update columns 5..7. */
  VbS0 = vec_splat(Vb[9], 0);
  VbS1 = vec_splat(Vb[9], 1);
  VbS2 = vec_splat(Vb[9], 2);
  VbS3 = vec_splat(Vb[9], 3);

  Vc4[0] = vec_mul(VbS0, Vc4[0]);
  Vc4[1] = vec_mul(VbS0, Vc4[1]);
  Vc4[2] = vec_mul(VbS0, Vc4[2]);
  Vc4[3] = vec_mul(VbS0, Vc4[3]);
  Va[16] = Vc4[0];
  Va[17] = Vc4[1];
  Va[18] = Vc4[2];
  Va[19] = Vc4[3];
  Vc5[0] = vec_nmsub(VbS1, Va[16], Vc5[0]);
  Vc5[1] = vec_nmsub(VbS1, Va[17], Vc5[1]);
  Vc5[2] = vec_nmsub(VbS1, Va[18], Vc5[2]);
  Vc5[3] = vec_nmsub(VbS1, Va[19], Vc5[3]);
  Vc6[0] = vec_nmsub(VbS2, Va[16], Vc6[0]);
  Vc6[1] = vec_nmsub(VbS2, Va[17], Vc6[1]);
  Vc6[2] = vec_nmsub(VbS2, Va[18], Vc6[2]);
  Vc6[3] = vec_nmsub(VbS2, Va[19], Vc6[3]);
  Vc7[0] = vec_nmsub(VbS3, Va[16], Vc7[0]);
  Vc7[1] = vec_nmsub(VbS3, Va[17], Vc7[1]);
  Vc7[2] = vec_nmsub(VbS3, Va[18], Vc7[2]);
  Vc7[3] = vec_nmsub(VbS3, Va[19], Vc7[3]);

  /* Column 5: scale by the diagonal (row 5 of b), update columns 6..7. */
  VbS0 = vec_splat(Vb[11], 1);
  VbS1 = vec_splat(Vb[11], 2);
  VbS2 = vec_splat(Vb[11], 3);

  Vc5[0] = vec_mul(VbS0, Vc5[0]);
  Vc5[1] = vec_mul(VbS0, Vc5[1]);
  Vc5[2] = vec_mul(VbS0, Vc5[2]);
  Vc5[3] = vec_mul(VbS0, Vc5[3]);
  Va[20] = Vc5[0];
  Va[21] = Vc5[1];
  Va[22] = Vc5[2];
  Va[23] = Vc5[3];
  Vc6[0] = vec_nmsub(VbS1, Va[20], Vc6[0]);
  Vc6[1] = vec_nmsub(VbS1, Va[21], Vc6[1]);
  Vc6[2] = vec_nmsub(VbS1, Va[22], Vc6[2]);
  Vc6[3] = vec_nmsub(VbS1, Va[23], Vc6[3]);
  Vc7[0] = vec_nmsub(VbS2, Va[20], Vc7[0]);
  Vc7[1] = vec_nmsub(VbS2, Va[21], Vc7[1]);
  Vc7[2] = vec_nmsub(VbS2, Va[22], Vc7[2]);
  Vc7[3] = vec_nmsub(VbS2, Va[23], Vc7[3]);

  /* Column 6: scale by the diagonal (row 6 of b), update column 7. */
  VbS0 = vec_splat(Vb[13], 2);
  VbS1 = vec_splat(Vb[13], 3);

  Vc6[0] = vec_mul(VbS0, Vc6[0]);
  Vc6[1] = vec_mul(VbS0, Vc6[1]);
  Vc6[2] = vec_mul(VbS0, Vc6[2]);
  Vc6[3] = vec_mul(VbS0, Vc6[3]);
  Va[24] = Vc6[0];
  Va[25] = Vc6[1];
  Va[26] = Vc6[2];
  Va[27] = Vc6[3];
  Vc7[0] = vec_nmsub(VbS1, Va[24], Vc7[0]);
  Vc7[1] = vec_nmsub(VbS1, Va[25], Vc7[1]);
  Vc7[2] = vec_nmsub(VbS1, Va[26], Vc7[2]);
  Vc7[3] = vec_nmsub(VbS1, Va[27], Vc7[3]);

  /* Column 7: scale by the last diagonal entry. */
  VbS0 = vec_splat(Vb[15], 3);

  Vc7[0] = vec_mul(VbS0, Vc7[0]);
  Vc7[1] = vec_mul(VbS0, Vc7[1]);
  Vc7[2] = vec_mul(VbS0, Vc7[2]);
  Vc7[3] = vec_mul(VbS0, Vc7[3]);
  Va[28] = Vc7[0];
  Va[29] = Vc7[1];
  Va[30] = Vc7[2];
  Va[31] = Vc7[3];
}

#endif
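
/* Scalar fallback for real data: forward substitution on an arbitrary
   m x n tile, used for edge tiles and whenever the vector kernels'
   unroll-size or alignment requirements are not met. */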
static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {

  FLOAT aa, bb;
  int i, j, k;

  for (i = 0; i < n; i++) {

    bb = *(b + i);

    for (j = 0; j < m; j ++) {
      aa = *(c + j + i * ldc);
      aa *= bb;
      *a = aa;
      *(c + j + i * ldc) = aa;
      a ++;

      for (k = i + 1; k < n; k ++) {
        *(c + j + k * ldc) -= aa * *(b + k);
      }
    }
    b += n;
  }
}

#else
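
/* Scalar solve for complex data: elements are interleaved (real, imag)
   pairs, hence the doubled strides; CONJ selects the conjugated variant
   of the multiply and of the trailing updates. */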
static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {

  FLOAT aa1, aa2;
  FLOAT bb1, bb2;
  FLOAT cc1, cc2;
  int i, j, k;

  ldc *= 2;

  for (i = 0; i < n; i++) {

    bb1 = *(b + i * 2 + 0);
    bb2 = *(b + i * 2 + 1);

    for (j = 0; j < m; j ++) {
      aa1 = *(c + j * 2 + 0 + i * ldc);
      aa2 = *(c + j * 2 + 1 + i * ldc);

#ifndef CONJ
      cc1 = aa1 * bb1 - aa2 * bb2;
      cc2 = aa1 * bb2 + aa2 * bb1;
#else
      cc1 =  aa1 * bb1 + aa2 * bb2;
      cc2 = -aa1 * bb2 + aa2 * bb1;
#endif

      *(a + 0) = cc1;
      *(a + 1) = cc2;
      *(c + j * 2 + 0 + i * ldc) = cc1;
      *(c + j * 2 + 1 + i * ldc) = cc2;
      a += 2;

      for (k = i + 1; k < n; k ++) {
#ifndef CONJ
        *(c + j * 2 + 0 + k * ldc) -=  cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1);
        *(c + j * 2 + 1 + k * ldc) -=  cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0);
#else
        *(c + j * 2 + 0 + k * ldc) -=  cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1);
        *(c + j * 2 + 1 + k * ldc) -= -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0);
#endif
      }
    }
    b += n * 2;
  }
}

#endif
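
/* Kernel entry point.  The n dimension is walked in GEMM_UNROLL_N-wide
   panels and, within each panel, the m dimension in GEMM_UNROLL_M-wide
   tiles.  For each tile the rank-kk GEMM update (alpha = dm1 = -1) is
   applied first, then the triangular solve; kk starts at -offset and
   grows by the panel width after every column panel. */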
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
#ifdef COMPLEX
          FLOAT dummy2,
#endif
          FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset) {

  FLOAT *aa, *cc;
  BLASLONG kk;
  BLASLONG i, j, jj;

#if 0
  fprintf(stderr, "TRSM RN KERNEL m = %3ld  n = %3ld  k = %3ld offset = %3ld\n",
          m, n, k, offset);
#endif

  jj = 0;
  j = (n >> GEMM_UNROLL_N_SHIFT);
  kk = -offset;

  /* The vector solves require the default unroll sizes and 8-byte alignment of a. */
#ifdef DOUBLE
  int well_aligned = (GEMM_UNROLL_M == 8) && (GEMM_UNROLL_N == 8) && ((((unsigned long) a) & 0x7) == 0);
#else
  int well_aligned = (GEMM_UNROLL_M == 16) && (GEMM_UNROLL_N == 8) && ((((unsigned long) a) & 0x7) == 0);
#endif

  while (j > 0) {

    aa = a;
    cc = c;

    i = (m >> GEMM_UNROLL_M_SHIFT);

    if (i > 0) {
      do {
        if (kk > 0) {
          GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1,
#ifdef COMPLEX
                      ZERO,
#endif
                      aa, b, cc, ldc);
        }

        if (well_aligned) {
#ifdef DOUBLE
          solve8x8(aa + kk * GEMM_UNROLL_M * COMPSIZE,
                   b  + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc);
#else
          solve16x8(aa + kk * GEMM_UNROLL_M * COMPSIZE,
                    b  + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc);
#endif
        }
        else {
          solve(GEMM_UNROLL_M, GEMM_UNROLL_N,
                aa + kk * GEMM_UNROLL_M * COMPSIZE,
                b  + kk * GEMM_UNROLL_N * COMPSIZE,
                cc, ldc);
        }

        aa += GEMM_UNROLL_M * k * COMPSIZE;
        cc += GEMM_UNROLL_M * COMPSIZE;
        i --;
      } while (i > 0);
    }

    /* Leftover rows: m not a multiple of GEMM_UNROLL_M. */
    if (m & (GEMM_UNROLL_M - 1)) {
      i = (GEMM_UNROLL_M >> 1);
      while (i > 0) {
        if (m & i) {
          if (kk > 0) {
            GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1,
#ifdef COMPLEX
                        ZERO,
#endif
                        aa, b, cc, ldc);
          }
          solve(i, GEMM_UNROLL_N,
                aa + kk * i * COMPSIZE,
                b  + kk * GEMM_UNROLL_N * COMPSIZE,
                cc, ldc);

          aa += i * k * COMPSIZE;
          cc += i * COMPSIZE;
        }
        i >>= 1;
      }
    }

    kk += GEMM_UNROLL_N;

    b += GEMM_UNROLL_N * k   * COMPSIZE;
    c += GEMM_UNROLL_N * ldc * COMPSIZE;
    j --;
    jj += GEMM_UNROLL_M;
  }

  /* Leftover columns: n not a multiple of GEMM_UNROLL_N. */
  if (n & (GEMM_UNROLL_N - 1)) {
    j = (GEMM_UNROLL_N >> 1);
    while (j > 0) {
      if (n & j) {

        aa = a;
        cc = c;

        i = (m >> GEMM_UNROLL_M_SHIFT);

        while (i > 0) {
          if (kk > 0) {
            GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1,
#ifdef COMPLEX
                        ZERO,
#endif
                        aa, b, cc, ldc);
          }

          solve(GEMM_UNROLL_M, j,
                aa + kk * GEMM_UNROLL_M * COMPSIZE,
                b  + kk * j * COMPSIZE, cc, ldc);

          aa += GEMM_UNROLL_M * k * COMPSIZE;
          cc += GEMM_UNROLL_M * COMPSIZE;
          i --;
        }

        if (m & (GEMM_UNROLL_M - 1)) {
          i = (GEMM_UNROLL_M >> 1);
          while (i > 0) {
            if (m & i) {
              if (kk > 0) {
                GEMM_KERNEL(i, j, kk, dm1,
#ifdef COMPLEX
                            ZERO,
#endif
                            aa, b, cc, ldc);
              }
              solve(i, j,
                    aa + kk * i * COMPSIZE,
                    b  + kk * j * COMPSIZE, cc, ldc);

              aa += i * k * COMPSIZE;
              cc += i * COMPSIZE;
            }
            i >>= 1;
          }
        }

        b += j * k   * COMPSIZE;
        c += j * ldc * COMPSIZE;
        kk += j;
      }
      j >>= 1;
    }
  }

  return 0;
}