You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

sgemm_kernel_8x8_msa.c 63 kB


  1. /*******************************************************************************
  2. Copyright (c) 2016, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #include "common.h"
  28. #include "macros_msa.h"
  29. int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
  30. FLOAT *C, BLASLONG ldc
  31. #ifdef TRMMKERNEL
  32. , BLASLONG offset
  33. #endif
  34. )
  35. {
  36. BLASLONG i, j, l, temp;
  37. #if defined(TRMMKERNEL)
  38. BLASLONG off;
  39. #endif
  40. FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7;
  41. FLOAT *pa0, *pb0;
  42. FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  43. FLOAT tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
  44. FLOAT a0, a1, b0, b1, b2, b3, b4, b5, b6, b7;
  45. v4f32 v_alpha = {alpha, alpha, alpha, alpha};
  46. v4f32 src_a0, src_a1, src_b, src_b0, src_b1;
  47. v4f32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  48. v4f32 res0, res1, res2, res3, res4, res5, res6, res7;
  49. v4f32 res8, res9, res10, res11, res12, res13, res14, res15;
  50. #if defined(TRMMKERNEL) && !defined(LEFT)
  51. off = -offset;
  52. #endif
  53. for (j = (n >> 3); j--;)
  54. {
  55. pc0 = C;
  56. pc1 = pc0 + ldc;
  57. pc2 = pc1 + ldc;
  58. pc3 = pc2 + ldc;
  59. pc4 = pc3 + ldc;
  60. pc5 = pc4 + ldc;
  61. pc6 = pc5 + ldc;
  62. pc7 = pc6 + ldc;
  63. #if defined(TRMMKERNEL) && defined(LEFT)
  64. off = offset;
  65. #endif
  66. pa0 = A;
  67. for (i = (m >> 3); i--;)
  68. {
  69. #if defined(TRMMKERNEL)
  70. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  71. pb0 = B;
  72. #else
  73. pa0 += off * 8;
  74. pb0 = B + off * 8;
  75. #endif
  76. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  77. temp = k - off;
  78. #elif defined(LEFT)
  79. temp = off + 8; // number of values in A
  80. #else
  81. temp = off + 8; // number of values in B
  82. #endif
  83. #else
  84. pb0 = B;
  85. temp = k;
  86. #endif
  87. #ifdef ENABLE_PREFETCH
  88. __asm__ __volatile__(
  89. "pref 0, 32(%[pa0]) \n\t"
  90. "pref 0, 32(%[pb0]) \n\t"
  91. :
  92. : [pa0] "r" (pa0), [pb0] "r" (pb0)
  93. );
  94. #endif
  95. LD_SP2_INC(pa0, 4, src_a0, src_a1);
  96. LD_SP2_INC(pb0, 4, src_b0, src_b1);
  97. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  98. res0 = src_a0 * src_b;
  99. res1 = src_a1 * src_b;
  100. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  101. res2 = src_a0 * src_b;
  102. res3 = src_a1 * src_b;
  103. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
  104. res4 = src_a0 * src_b;
  105. res5 = src_a1 * src_b;
  106. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
  107. res6 = src_a0 * src_b;
  108. res7 = src_a1 * src_b;
  109. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
  110. res8 = src_a0 * src_b;
  111. res9 = src_a1 * src_b;
  112. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
  113. res10 = src_a0 * src_b;
  114. res11 = src_a1 * src_b;
  115. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
  116. res12 = src_a0 * src_b;
  117. res13 = src_a1 * src_b;
  118. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
  119. res14 = src_a0 * src_b;
  120. res15 = src_a1 * src_b;
  121. for (l = ((temp - 1) >> 1); l--;)
  122. {
  123. #ifdef ENABLE_PREFETCH
  124. __asm__ __volatile__(
  125. "pref 0, 64(%[pa0]) \n\t"
  126. "pref 0, 96(%[pa0]) \n\t"
  127. "pref 0, 64(%[pb0]) \n\t"
  128. "pref 0, 96(%[pb0]) \n\t"
  129. :
  130. : [pa0] "r" (pa0), [pb0] "r" (pb0)
  131. );
  132. #endif
  133. LD_SP2_INC(pa0, 4, src_a0, src_a1);
  134. LD_SP2_INC(pb0, 4, src_b0, src_b1);
  135. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  136. res0 += src_a0 * src_b;
  137. res1 += src_a1 * src_b;
  138. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  139. res2 += src_a0 * src_b;
  140. res3 += src_a1 * src_b;
  141. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
  142. res4 += src_a0 * src_b;
  143. res5 += src_a1 * src_b;
  144. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
  145. res6 += src_a0 * src_b;
  146. res7 += src_a1 * src_b;
  147. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
  148. res8 += src_a0 * src_b;
  149. res9 += src_a1 * src_b;
  150. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
  151. res10 += src_a0 * src_b;
  152. res11 += src_a1 * src_b;
  153. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
  154. res12 += src_a0 * src_b;
  155. res13 += src_a1 * src_b;
  156. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
  157. res14 += src_a0 * src_b;
  158. res15 += src_a1 * src_b;
  159. LD_SP2_INC(pa0, 4, src_a0, src_a1);
  160. LD_SP2_INC(pb0, 4, src_b0, src_b1);
  161. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  162. res0 += src_a0 * src_b;
  163. res1 += src_a1 * src_b;
  164. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  165. res2 += src_a0 * src_b;
  166. res3 += src_a1 * src_b;
  167. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
  168. res4 += src_a0 * src_b;
  169. res5 += src_a1 * src_b;
  170. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
  171. res6 += src_a0 * src_b;
  172. res7 += src_a1 * src_b;
  173. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
  174. res8 += src_a0 * src_b;
  175. res9 += src_a1 * src_b;
  176. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
  177. res10 += src_a0 * src_b;
  178. res11 += src_a1 * src_b;
  179. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
  180. res12 += src_a0 * src_b;
  181. res13 += src_a1 * src_b;
  182. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
  183. res14 += src_a0 * src_b;
  184. res15 += src_a1 * src_b;
  185. }
  186. if ((temp - 1) & 1)
  187. {
  188. LD_SP2_INC(pa0, 4, src_a0, src_a1);
  189. LD_SP2_INC(pb0, 4, src_b0, src_b1);
  190. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  191. res0 += src_a0 * src_b;
  192. res1 += src_a1 * src_b;
  193. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  194. res2 += src_a0 * src_b;
  195. res3 += src_a1 * src_b;
  196. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
  197. res4 += src_a0 * src_b;
  198. res5 += src_a1 * src_b;
  199. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
  200. res6 += src_a0 * src_b;
  201. res7 += src_a1 * src_b;
  202. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
  203. res8 += src_a0 * src_b;
  204. res9 += src_a1 * src_b;
  205. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
  206. res10 += src_a0 * src_b;
  207. res11 += src_a1 * src_b;
  208. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
  209. res12 += src_a0 * src_b;
  210. res13 += src_a1 * src_b;
  211. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
  212. res14 += src_a0 * src_b;
  213. res15 += src_a1 * src_b;
  214. }
  215. #if defined(TRMMKERNEL)
  216. dst0 = res0 * v_alpha;
  217. dst1 = res1 * v_alpha;
  218. dst2 = res2 * v_alpha;
  219. dst3 = res3 * v_alpha;
  220. dst4 = res4 * v_alpha;
  221. dst5 = res5 * v_alpha;
  222. dst6 = res6 * v_alpha;
  223. dst7 = res7 * v_alpha;
  224. #else
  225. LD_SP2(pc0, 4, dst0, dst1);
  226. LD_SP2(pc1, 4, dst2, dst3);
  227. LD_SP2(pc2, 4, dst4, dst5);
  228. LD_SP2(pc3, 4, dst6, dst7);
  229. dst0 += res0 * v_alpha;
  230. dst1 += res1 * v_alpha;
  231. dst2 += res2 * v_alpha;
  232. dst3 += res3 * v_alpha;
  233. dst4 += res4 * v_alpha;
  234. dst5 += res5 * v_alpha;
  235. dst6 += res6 * v_alpha;
  236. dst7 += res7 * v_alpha;
  237. #endif
  238. ST_SP2_INC(dst0, dst1, pc0, 4);
  239. ST_SP2_INC(dst2, dst3, pc1, 4);
  240. ST_SP2_INC(dst4, dst5, pc2, 4);
  241. ST_SP2_INC(dst6, dst7, pc3, 4);
  242. #if defined(TRMMKERNEL)
  243. dst0 = res8 * v_alpha;
  244. dst1 = res9 * v_alpha;
  245. dst2 = res10 * v_alpha;
  246. dst3 = res11 * v_alpha;
  247. dst4 = res12 * v_alpha;
  248. dst5 = res13 * v_alpha;
  249. dst6 = res14 * v_alpha;
  250. dst7 = res15 * v_alpha;
  251. #else
  252. LD_SP2(pc4, 4, dst0, dst1);
  253. LD_SP2(pc5, 4, dst2, dst3);
  254. LD_SP2(pc6, 4, dst4, dst5);
  255. LD_SP2(pc7, 4, dst6, dst7);
  256. dst0 += res8 * v_alpha;
  257. dst1 += res9 * v_alpha;
  258. dst2 += res10 * v_alpha;
  259. dst3 += res11 * v_alpha;
  260. dst4 += res12 * v_alpha;
  261. dst5 += res13 * v_alpha;
  262. dst6 += res14 * v_alpha;
  263. dst7 += res15 * v_alpha;
  264. #endif
  265. ST_SP2_INC(dst0, dst1, pc4, 4);
  266. ST_SP2_INC(dst2, dst3, pc5, 4);
  267. ST_SP2_INC(dst4, dst5, pc6, 4);
  268. ST_SP2_INC(dst6, dst7, pc7, 4);
  269. #if defined(TRMMKERNEL)
  270. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  271. temp = k - off;
  272. #ifdef LEFT
  273. temp -= 8; // number of values in A
  274. #else
  275. temp -= 8; // number of values in B
  276. #endif
  277. pa0 += temp * 8;
  278. pb0 += temp * 8;
  279. #endif
  280. #ifdef LEFT
  281. off += 8; // number of values in A
  282. #endif
  283. #endif
  284. }
  285. if (m & 4)
  286. {
  287. #if defined(TRMMKERNEL)
  288. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  289. pb0 = B;
  290. #else
  291. pa0 += off * 4;
  292. pb0 = B + off * 8;
  293. #endif
  294. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  295. temp = k - off;
  296. #elif defined(LEFT)
  297. temp = off + 4; // number of values in A
  298. #else
  299. temp = off + 8; // number of values in B
  300. #endif
  301. #else
  302. pb0 = B;
  303. temp = k;
  304. #endif
  305. src_a0 = LD_SP(pa0);
  306. LD_SP2_INC(pb0, 4, src_b0, src_b1);
  307. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  308. res0 = src_a0 * src_b;
  309. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  310. res1 = src_a0 * src_b;
  311. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
  312. res2 = src_a0 * src_b;
  313. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
  314. res3 = src_a0 * src_b;
  315. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
  316. res4 = src_a0 * src_b;
  317. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
  318. res5 = src_a0 * src_b;
  319. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
  320. res6 = src_a0 * src_b;
  321. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
  322. res7 = src_a0 * src_b;
  323. pa0 += 4;
  324. for (l = ((temp - 1) >> 1); l--;)
  325. {
  326. src_a0 = LD_SP(pa0);
  327. LD_SP2_INC(pb0, 4, src_b0, src_b1);
  328. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  329. res0 += src_a0 * src_b;
  330. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  331. res1 += src_a0 * src_b;
  332. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
  333. res2 += src_a0 * src_b;
  334. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
  335. res3 += src_a0 * src_b;
  336. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
  337. res4 += src_a0 * src_b;
  338. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
  339. res5 += src_a0 * src_b;
  340. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
  341. res6 += src_a0 * src_b;
  342. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
  343. res7 += src_a0 * src_b;
  344. pa0 += 4;
  345. src_a0 = LD_SP(pa0);
  346. LD_SP2_INC(pb0, 4, src_b0, src_b1);
  347. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  348. res0 += src_a0 * src_b;
  349. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  350. res1 += src_a0 * src_b;
  351. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
  352. res2 += src_a0 * src_b;
  353. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
  354. res3 += src_a0 * src_b;
  355. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
  356. res4 += src_a0 * src_b;
  357. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
  358. res5 += src_a0 * src_b;
  359. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
  360. res6 += src_a0 * src_b;
  361. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
  362. res7 += src_a0 * src_b;
  363. pa0 += 4;
  364. }
  365. if ((temp - 1) & 1)
  366. {
  367. src_a0 = LD_SP(pa0);
  368. LD_SP2_INC(pb0, 4, src_b0, src_b1);
  369. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  370. res0 += src_a0 * src_b;
  371. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  372. res1 += src_a0 * src_b;
  373. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
  374. res2 += src_a0 * src_b;
  375. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
  376. res3 += src_a0 * src_b;
  377. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
  378. res4 += src_a0 * src_b;
  379. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
  380. res5 += src_a0 * src_b;
  381. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
  382. res6 += src_a0 * src_b;
  383. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
  384. res7 += src_a0 * src_b;
  385. pa0 += 4;
  386. }
  387. #if defined(TRMMKERNEL)
  388. dst0 = res0 * v_alpha;
  389. dst1 = res1 * v_alpha;
  390. dst2 = res2 * v_alpha;
  391. dst3 = res3 * v_alpha;
  392. #else
  393. dst0 = LD_SP(pc0);
  394. dst1 = LD_SP(pc1);
  395. dst2 = LD_SP(pc2);
  396. dst3 = LD_SP(pc3);
  397. dst0 += res0 * v_alpha;
  398. dst1 += res1 * v_alpha;
  399. dst2 += res2 * v_alpha;
  400. dst3 += res3 * v_alpha;
  401. #endif
  402. ST_SP(dst0, pc0);
  403. ST_SP(dst1, pc1);
  404. ST_SP(dst2, pc2);
  405. ST_SP(dst3, pc3);
  406. #if defined(TRMMKERNEL)
  407. dst0 = res4 * v_alpha;
  408. dst1 = res5 * v_alpha;
  409. dst2 = res6 * v_alpha;
  410. dst3 = res7 * v_alpha;
  411. #else
  412. dst0 = LD_SP(pc4);
  413. dst1 = LD_SP(pc5);
  414. dst2 = LD_SP(pc6);
  415. dst3 = LD_SP(pc7);
  416. dst0 += res4 * v_alpha;
  417. dst1 += res5 * v_alpha;
  418. dst2 += res6 * v_alpha;
  419. dst3 += res7 * v_alpha;
  420. #endif
  421. ST_SP(dst0, pc4);
  422. ST_SP(dst1, pc5);
  423. ST_SP(dst2, pc6);
  424. ST_SP(dst3, pc7);
  425. pc0 += 4;
  426. pc1 += 4;
  427. pc2 += 4;
  428. pc3 += 4;
  429. pc4 += 4;
  430. pc5 += 4;
  431. pc6 += 4;
  432. pc7 += 4;
  433. #if defined(TRMMKERNEL)
  434. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  435. temp = k - off;
  436. #ifdef LEFT
  437. temp -= 4; // number of values in A
  438. #else
  439. temp -= 8; // number of values in B
  440. #endif
  441. pa0 += temp * 4;
  442. pb0 += temp * 8;
  443. #endif
  444. #ifdef LEFT
  445. off += 4; // number of values in A
  446. #endif
  447. #endif
  448. }
  449. if (m & 2)
  450. {
  451. #if defined(TRMMKERNEL)
  452. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  453. pb0 = B;
  454. #else
  455. pa0 += off * 2;
  456. pb0 = B + off * 8;
  457. #endif
  458. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  459. temp = k - off;
  460. #elif defined(LEFT)
  461. temp = off + 2; // number of values in A
  462. #else
  463. temp = off + 8; // number of values in B
  464. #endif
  465. #else
  466. pb0 = B;
  467. temp = k;
  468. #endif
  469. a0 = pa0[0];
  470. b0 = pb0[0];
  471. tmp0 = a0 * b0;
  472. a1 = pa0[1];
  473. tmp1 = a1 * b0;
  474. b1 = pb0[1];
  475. tmp2 = a0 * b1;
  476. tmp3 = a1 * b1;
  477. b2 = pb0[2];
  478. tmp4 = a0 * b2;
  479. tmp5 = a1 * b2;
  480. b3 = pb0[3];
  481. tmp6 = a0 * b3;
  482. tmp7 = a1 * b3;
  483. b4 = pb0[4];
  484. tmp8 = a0 * b4;
  485. tmp9 = a1 * b4;
  486. b5 = pb0[5];
  487. tmp10 = a0 * b5;
  488. tmp11 = a1 * b5;
  489. b6 = pb0[6];
  490. tmp12 = a0 * b6;
  491. tmp13 = a1 * b6;
  492. b7 = pb0[7];
  493. tmp14 = a0 * b7;
  494. tmp15 = a1 * b7;
  495. pa0 += 2;
  496. pb0 += 8;
  497. for (l = ((temp - 1) >> 1); l--;)
  498. {
  499. a0 = pa0[0];
  500. b0 = pb0[0];
  501. tmp0 += a0 * b0;
  502. a1 = pa0[1];
  503. tmp1 += a1 * b0;
  504. b1 = pb0[1];
  505. tmp2 += a0 * b1;
  506. tmp3 += a1 * b1;
  507. b2 = pb0[2];
  508. tmp4 += a0 * b2;
  509. tmp5 += a1 * b2;
  510. b3 = pb0[3];
  511. tmp6 += a0 * b3;
  512. tmp7 += a1 * b3;
  513. b4 = pb0[4];
  514. tmp8 += a0 * b4;
  515. tmp9 += a1 * b4;
  516. b5 = pb0[5];
  517. tmp10 += a0 * b5;
  518. tmp11 += a1 * b5;
  519. b6 = pb0[6];
  520. tmp12 += a0 * b6;
  521. tmp13 += a1 * b6;
  522. b7 = pb0[7];
  523. tmp14 += a0 * b7;
  524. tmp15 += a1 * b7;
  525. pa0 += 2;
  526. pb0 += 8;
  527. a0 = pa0[0];
  528. b0 = pb0[0];
  529. tmp0 += a0 * b0;
  530. a1 = pa0[1];
  531. tmp1 += a1 * b0;
  532. b1 = pb0[1];
  533. tmp2 += a0 * b1;
  534. tmp3 += a1 * b1;
  535. b2 = pb0[2];
  536. tmp4 += a0 * b2;
  537. tmp5 += a1 * b2;
  538. b3 = pb0[3];
  539. tmp6 += a0 * b3;
  540. tmp7 += a1 * b3;
  541. b4 = pb0[4];
  542. tmp8 += a0 * b4;
  543. tmp9 += a1 * b4;
  544. b5 = pb0[5];
  545. tmp10 += a0 * b5;
  546. tmp11 += a1 * b5;
  547. b6 = pb0[6];
  548. tmp12 += a0 * b6;
  549. tmp13 += a1 * b6;
  550. b7 = pb0[7];
  551. tmp14 += a0 * b7;
  552. tmp15 += a1 * b7;
  553. pa0 += 2;
  554. pb0 += 8;
  555. }
  556. if ((temp - 1) & 1)
  557. {
  558. a0 = pa0[0];
  559. b0 = pb0[0];
  560. tmp0 += a0 * b0;
  561. a1 = pa0[1];
  562. tmp1 += a1 * b0;
  563. b1 = pb0[1];
  564. tmp2 += a0 * b1;
  565. tmp3 += a1 * b1;
  566. b2 = pb0[2];
  567. tmp4 += a0 * b2;
  568. tmp5 += a1 * b2;
  569. b3 = pb0[3];
  570. tmp6 += a0 * b3;
  571. tmp7 += a1 * b3;
  572. b4 = pb0[4];
  573. tmp8 += a0 * b4;
  574. tmp9 += a1 * b4;
  575. b5 = pb0[5];
  576. tmp10 += a0 * b5;
  577. tmp11 += a1 * b5;
  578. b6 = pb0[6];
  579. tmp12 += a0 * b6;
  580. tmp13 += a1 * b6;
  581. b7 = pb0[7];
  582. tmp14 += a0 * b7;
  583. tmp15 += a1 * b7;
  584. pa0 += 2;
  585. pb0 += 8;
  586. }
  587. tmp0 = alpha * tmp0;
  588. tmp2 = alpha * tmp2;
  589. tmp4 = alpha * tmp4;
  590. tmp6 = alpha * tmp6;
  591. tmp8 = alpha * tmp8;
  592. tmp10 = alpha * tmp10;
  593. tmp12 = alpha * tmp12;
  594. tmp14 = alpha * tmp14;
  595. #if defined(TRMMKERNEL)
  596. pc0[0] = tmp0;
  597. pc1[0] = tmp2;
  598. pc2[0] = tmp4;
  599. pc3[0] = tmp6;
  600. pc4[0] = tmp8;
  601. pc5[0] = tmp10;
  602. pc6[0] = tmp12;
  603. pc7[0] = tmp14;
  604. #else
  605. pc0[0] += tmp0;
  606. pc1[0] += tmp2;
  607. pc2[0] += tmp4;
  608. pc3[0] += tmp6;
  609. pc4[0] += tmp8;
  610. pc5[0] += tmp10;
  611. pc6[0] += tmp12;
  612. pc7[0] += tmp14;
  613. #endif
  614. tmp1 = alpha * tmp1;
  615. tmp3 = alpha * tmp3;
  616. tmp5 = alpha * tmp5;
  617. tmp7 = alpha * tmp7;
  618. tmp9 = alpha * tmp9;
  619. tmp11 = alpha * tmp11;
  620. tmp13 = alpha * tmp13;
  621. tmp15 = alpha * tmp15;
  622. #if defined(TRMMKERNEL)
  623. pc0[1] = tmp1;
  624. pc1[1] = tmp3;
  625. pc2[1] = tmp5;
  626. pc3[1] = tmp7;
  627. pc4[1] = tmp9;
  628. pc5[1] = tmp11;
  629. pc6[1] = tmp13;
  630. pc7[1] = tmp15;
  631. #else
  632. pc0[1] += tmp1;
  633. pc1[1] += tmp3;
  634. pc2[1] += tmp5;
  635. pc3[1] += tmp7;
  636. pc4[1] += tmp9;
  637. pc5[1] += tmp11;
  638. pc6[1] += tmp13;
  639. pc7[1] += tmp15;
  640. #endif
  641. pc0 += 2;
  642. pc1 += 2;
  643. pc2 += 2;
  644. pc3 += 2;
  645. pc4 += 2;
  646. pc5 += 2;
  647. pc6 += 2;
  648. pc7 += 2;
  649. #if defined(TRMMKERNEL)
  650. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  651. temp = k - off;
  652. #ifdef LEFT
  653. temp -= 2; // number of values in A
  654. #else
  655. temp -= 8; // number of values in B
  656. #endif
  657. pa0 += temp * 2;
  658. pb0 += temp * 8;
  659. #endif
  660. #ifdef LEFT
  661. off += 2; // number of values in A
  662. #endif
  663. #endif
  664. }
  665. if (m & 1)
  666. {
  667. #if defined(TRMMKERNEL)
  668. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  669. pb0 = B;
  670. #else
  671. pa0 += off * 1;
  672. pb0 = B + off * 8;
  673. #endif
  674. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  675. temp = k - off;
  676. #elif defined(LEFT)
  677. temp = off + 1; // number of values in A
  678. #else
  679. temp = off + 8; // number of values in B
  680. #endif
  681. #else
  682. pb0 = B;
  683. temp = k;
  684. #endif
  685. a0 = pa0[0];
  686. b0 = pb0[0];
  687. tmp0 = a0 * b0;
  688. b1 = pb0[1];
  689. tmp1 = a0 * b1;
  690. b2 = pb0[2];
  691. tmp2 = a0 * b2;
  692. b3 = pb0[3];
  693. tmp3 = a0 * b3;
  694. b4 = pb0[4];
  695. tmp4 = a0 * b4;
  696. b5 = pb0[5];
  697. tmp5 = a0 * b5;
  698. b6 = pb0[6];
  699. tmp6 = a0 * b6;
  700. b7 = pb0[7];
  701. tmp7 = a0 * b7;
  702. pa0 += 1;
  703. pb0 += 8;
  704. for (l = ((temp - 1) >> 1); l--;)
  705. {
  706. a0 = pa0[0];
  707. b0 = pb0[0];
  708. tmp0 += a0 * b0;
  709. b1 = pb0[1];
  710. tmp1 += a0 * b1;
  711. b2 = pb0[2];
  712. tmp2 += a0 * b2;
  713. b3 = pb0[3];
  714. tmp3 += a0 * b3;
  715. b4 = pb0[4];
  716. tmp4 += a0 * b4;
  717. b5 = pb0[5];
  718. tmp5 += a0 * b5;
  719. b6 = pb0[6];
  720. tmp6 += a0 * b6;
  721. b7 = pb0[7];
  722. tmp7 += a0 * b7;
  723. pa0 += 1;
  724. pb0 += 8;
  725. a0 = pa0[0];
  726. b0 = pb0[0];
  727. tmp0 += a0 * b0;
  728. b1 = pb0[1];
  729. tmp1 += a0 * b1;
  730. b2 = pb0[2];
  731. tmp2 += a0 * b2;
  732. b3 = pb0[3];
  733. tmp3 += a0 * b3;
  734. b4 = pb0[4];
  735. tmp4 += a0 * b4;
  736. b5 = pb0[5];
  737. tmp5 += a0 * b5;
  738. b6 = pb0[6];
  739. tmp6 += a0 * b6;
  740. b7 = pb0[7];
  741. tmp7 += a0 * b7;
  742. pa0 += 1;
  743. pb0 += 8;
  744. }
  745. if ((temp - 1) & 1)
  746. {
  747. a0 = pa0[0];
  748. b0 = pb0[0];
  749. tmp0 += a0 * b0;
  750. b1 = pb0[1];
  751. tmp1 += a0 * b1;
  752. b2 = pb0[2];
  753. tmp2 += a0 * b2;
  754. b3 = pb0[3];
  755. tmp3 += a0 * b3;
  756. b4 = pb0[4];
  757. tmp4 += a0 * b4;
  758. b5 = pb0[5];
  759. tmp5 += a0 * b5;
  760. b6 = pb0[6];
  761. tmp6 += a0 * b6;
  762. b7 = pb0[7];
  763. tmp7 += a0 * b7;
  764. pa0 += 1;
  765. pb0 += 8;
  766. }
  767. tmp0 = alpha * tmp0;
  768. tmp1 = alpha * tmp1;
  769. tmp2 = alpha * tmp2;
  770. tmp3 = alpha * tmp3;
  771. tmp4 = alpha * tmp4;
  772. tmp5 = alpha * tmp5;
  773. tmp6 = alpha * tmp6;
  774. tmp7 = alpha * tmp7;
  775. #if defined(TRMMKERNEL)
  776. pc0[0] = tmp0;
  777. pc1[0] = tmp1;
  778. pc2[0] = tmp2;
  779. pc3[0] = tmp3;
  780. pc4[0] = tmp4;
  781. pc5[0] = tmp5;
  782. pc6[0] = tmp6;
  783. pc7[0] = tmp7;
  784. #else
  785. pc0[0] += tmp0;
  786. pc1[0] += tmp1;
  787. pc2[0] += tmp2;
  788. pc3[0] += tmp3;
  789. pc4[0] += tmp4;
  790. pc5[0] += tmp5;
  791. pc6[0] += tmp6;
  792. pc7[0] += tmp7;
  793. #endif
  794. pc0 += 1;
  795. pc1 += 1;
  796. pc2 += 1;
  797. pc3 += 1;
  798. pc4 += 1;
  799. pc5 += 1;
  800. pc6 += 1;
  801. pc7 += 1;
  802. #if defined(TRMMKERNEL)
  803. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  804. temp = k - off;
  805. #ifdef LEFT
  806. temp -= 1; // number of values in A
  807. #else
  808. temp -= 8; // number of values in B
  809. #endif
  810. pa0 += temp * 1;
  811. pb0 += temp * 8;
  812. #endif
  813. #ifdef LEFT
  814. off += 1; // number of values in A
  815. #endif
  816. #endif
  817. }
  818. #if defined(TRMMKERNEL) && !defined(LEFT)
  819. off += 8; // number of values in A
  820. #endif
  821. B += (k << 3);
  822. C += (ldc << 3);
  823. }
  824. if (n & 4)
  825. {
  826. pc0 = C;
  827. pc1 = pc0 + ldc;
  828. pc2 = pc1 + ldc;
  829. pc3 = pc2 + ldc;
  830. #if defined(TRMMKERNEL) && defined(LEFT)
  831. off = offset;
  832. #endif
  833. pa0 = A;
  834. for (i = (m >> 3); i--;)
  835. {
  836. #if defined(TRMMKERNEL)
  837. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  838. pb0 = B;
  839. #else
  840. pa0 += off * 8;
  841. pb0 = B + off * 4;
  842. #endif
  843. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  844. temp = k - off;
  845. #elif defined(LEFT)
  846. temp = off + 8; // number of values in A
  847. #else
  848. temp = off + 4; // number of values in B
  849. #endif
  850. #else
  851. pb0 = B;
  852. temp = k;
  853. #endif
  854. LD_SP2_INC(pa0, 4, src_a0, src_a1);
  855. src_b0 = LD_SP(pb0);
  856. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  857. res0 = src_a0 * src_b;
  858. res1 = src_a1 * src_b;
  859. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  860. res2 = src_a0 * src_b;
  861. res3 = src_a1 * src_b;
  862. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
  863. res4 = src_a0 * src_b;
  864. res5 = src_a1 * src_b;
  865. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
  866. res6 = src_a0 * src_b;
  867. res7 = src_a1 * src_b;
  868. pb0 += 4;
  869. for (l = ((temp - 1) >> 1); l--;)
  870. {
  871. LD_SP2_INC(pa0, 4, src_a0, src_a1);
  872. src_b0 = LD_SP(pb0);
  873. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  874. res0 += src_a0 * src_b;
  875. res1 += src_a1 * src_b;
  876. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  877. res2 += src_a0 * src_b;
  878. res3 += src_a1 * src_b;
  879. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
  880. res4 += src_a0 * src_b;
  881. res5 += src_a1 * src_b;
  882. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
  883. res6 += src_a0 * src_b;
  884. res7 += src_a1 * src_b;
  885. pb0 += 4;
  886. LD_SP2_INC(pa0, 4, src_a0, src_a1);
  887. src_b0 = LD_SP(pb0);
  888. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  889. res0 += src_a0 * src_b;
  890. res1 += src_a1 * src_b;
  891. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  892. res2 += src_a0 * src_b;
  893. res3 += src_a1 * src_b;
  894. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
  895. res4 += src_a0 * src_b;
  896. res5 += src_a1 * src_b;
  897. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
  898. res6 += src_a0 * src_b;
  899. res7 += src_a1 * src_b;
  900. pb0 += 4;
  901. }
  902. if ((temp - 1) & 1)
  903. {
  904. LD_SP2_INC(pa0, 4, src_a0, src_a1);
  905. src_b0 = LD_SP(pb0);
  906. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  907. res0 += src_a0 * src_b;
  908. res1 += src_a1 * src_b;
  909. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  910. res2 += src_a0 * src_b;
  911. res3 += src_a1 * src_b;
  912. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
  913. res4 += src_a0 * src_b;
  914. res5 += src_a1 * src_b;
  915. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
  916. res6 += src_a0 * src_b;
  917. res7 += src_a1 * src_b;
  918. pb0 += 4;
  919. }
  920. #if defined(TRMMKERNEL)
  921. dst0 = res0 * v_alpha;
  922. dst1 = res1 * v_alpha;
  923. dst2 = res2 * v_alpha;
  924. dst3 = res3 * v_alpha;
  925. dst4 = res4 * v_alpha;
  926. dst5 = res5 * v_alpha;
  927. dst6 = res6 * v_alpha;
  928. dst7 = res7 * v_alpha;
  929. #else
  930. LD_SP2(pc0, 4, dst0, dst1);
  931. LD_SP2(pc1, 4, dst2, dst3);
  932. LD_SP2(pc2, 4, dst4, dst5);
  933. LD_SP2(pc3, 4, dst6, dst7);
  934. dst0 += res0 * v_alpha;
  935. dst1 += res1 * v_alpha;
  936. dst2 += res2 * v_alpha;
  937. dst3 += res3 * v_alpha;
  938. dst4 += res4 * v_alpha;
  939. dst5 += res5 * v_alpha;
  940. dst6 += res6 * v_alpha;
  941. dst7 += res7 * v_alpha;
  942. #endif
  943. ST_SP2_INC(dst0, dst1, pc0, 4);
  944. ST_SP2_INC(dst2, dst3, pc1, 4);
  945. ST_SP2_INC(dst4, dst5, pc2, 4);
  946. ST_SP2_INC(dst6, dst7, pc3, 4);
  947. #if defined(TRMMKERNEL)
  948. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  949. temp = k - off;
  950. #ifdef LEFT
  951. temp -= 8; // number of values in A
  952. #else
  953. temp -= 4; // number of values in B
  954. #endif
  955. pa0 += temp * 8;
  956. pb0 += temp * 4;
  957. #endif
  958. #ifdef LEFT
  959. off += 8; // number of values in A
  960. #endif
  961. #endif
  962. }
  963. if (m & 4)
  964. {
  965. #if defined(TRMMKERNEL)
  966. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  967. pb0 = B;
  968. #else
  969. pa0 += off * 4;
  970. pb0 = B + off * 4;
  971. #endif
  972. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  973. temp = k - off;
  974. #elif defined(LEFT)
  975. temp = off + 4; // number of values in A
  976. #else
  977. temp = off + 4; // number of values in B
  978. #endif
  979. #else
  980. pb0 = B;
  981. temp = k;
  982. #endif
  983. src_a0 = LD_SP(pa0);
  984. src_b0 = LD_SP(pb0);
  985. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  986. res0 = src_a0 * src_b;
  987. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  988. res1 = src_a0 * src_b;
  989. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
  990. res2 = src_a0 * src_b;
  991. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
  992. res3 = src_a0 * src_b;
  993. pa0 += 4;
  994. pb0 += 4;
  995. for (l = ((temp - 1) >> 1); l--;)
  996. {
  997. src_a0 = LD_SP(pa0);
  998. src_b0 = LD_SP(pb0);
  999. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  1000. res0 += src_a0 * src_b;
  1001. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  1002. res1 += src_a0 * src_b;
  1003. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
  1004. res2 += src_a0 * src_b;
  1005. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
  1006. res3 += src_a0 * src_b;
  1007. pa0 += 4;
  1008. pb0 += 4;
  1009. src_a0 = LD_SP(pa0);
  1010. src_b0 = LD_SP(pb0);
  1011. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  1012. res0 += src_a0 * src_b;
  1013. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  1014. res1 += src_a0 * src_b;
  1015. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
  1016. res2 += src_a0 * src_b;
  1017. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
  1018. res3 += src_a0 * src_b;
  1019. pa0 += 4;
  1020. pb0 += 4;
  1021. }
  1022. if ((temp - 1) & 1)
  1023. {
  1024. src_a0 = LD_SP(pa0);
  1025. src_b0 = LD_SP(pb0);
  1026. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  1027. res0 += src_a0 * src_b;
  1028. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  1029. res1 += src_a0 * src_b;
  1030. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
  1031. res2 += src_a0 * src_b;
  1032. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
  1033. res3 += src_a0 * src_b;
  1034. pa0 += 4;
  1035. pb0 += 4;
  1036. }
  1037. #if defined(TRMMKERNEL)
  1038. dst0 = res0 * v_alpha;
  1039. dst1 = res1 * v_alpha;
  1040. dst2 = res2 * v_alpha;
  1041. dst3 = res3 * v_alpha;
  1042. #else
  1043. dst0 = LD_SP(pc0);
  1044. dst1 = LD_SP(pc1);
  1045. dst2 = LD_SP(pc2);
  1046. dst3 = LD_SP(pc3);
  1047. dst0 += res0 * v_alpha;
  1048. dst1 += res1 * v_alpha;
  1049. dst2 += res2 * v_alpha;
  1050. dst3 += res3 * v_alpha;
  1051. #endif
  1052. ST_SP(dst0, pc0);
  1053. ST_SP(dst1, pc1);
  1054. ST_SP(dst2, pc2);
  1055. ST_SP(dst3, pc3);
  1056. pc0 += 4;
  1057. pc1 += 4;
  1058. pc2 += 4;
  1059. pc3 += 4;
  1060. #if defined(TRMMKERNEL)
  1061. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1062. temp = k - off;
  1063. #ifdef LEFT
  1064. temp -= 4; // number of values in A
  1065. #else
  1066. temp -= 4; // number of values in B
  1067. #endif
  1068. pa0 += temp * 4;
  1069. pb0 += temp * 4;
  1070. #endif
  1071. #ifdef LEFT
  1072. off += 4; // number of values in A
  1073. #endif
  1074. #endif
  1075. }
  1076. if (m & 2)
  1077. {
  1078. #if defined(TRMMKERNEL)
  1079. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1080. pb0 = B;
  1081. #else
  1082. pa0 += off * 2;
  1083. pb0 = B + off * 4;
  1084. #endif
  1085. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1086. temp = k - off;
  1087. #elif defined(LEFT)
  1088. temp = off + 2; // number of values in A
  1089. #else
  1090. temp = off + 4; // number of values in B
  1091. #endif
  1092. #else
  1093. pb0 = B;
  1094. temp = k;
  1095. #endif
  1096. a0 = pa0[0];
  1097. b0 = pb0[0];
  1098. tmp0 = a0 * b0;
  1099. a1 = pa0[1];
  1100. tmp1 = a1 * b0;
  1101. b1 = pb0[1];
  1102. tmp2 = a0 * b1;
  1103. tmp3 = a1 * b1;
  1104. b2 = pb0[2];
  1105. tmp4 = a0 * b2;
  1106. tmp5 = a1 * b2;
  1107. b3 = pb0[3];
  1108. tmp6 = a0 * b3;
  1109. tmp7 = a1 * b3;
  1110. pa0 += 2;
  1111. pb0 += 4;
  1112. for (l = ((temp - 1) >> 1); l--;)
  1113. {
  1114. a0 = pa0[0];
  1115. b0 = pb0[0];
  1116. tmp0 += a0 * b0;
  1117. a1 = pa0[1];
  1118. tmp1 += a1 * b0;
  1119. b1 = pb0[1];
  1120. tmp2 += a0 * b1;
  1121. tmp3 += a1 * b1;
  1122. b2 = pb0[2];
  1123. tmp4 += a0 * b2;
  1124. tmp5 += a1 * b2;
  1125. b3 = pb0[3];
  1126. tmp6 += a0 * b3;
  1127. tmp7 += a1 * b3;
  1128. pa0 += 2;
  1129. pb0 += 4;
  1130. a0 = pa0[0];
  1131. b0 = pb0[0];
  1132. tmp0 += a0 * b0;
  1133. a1 = pa0[1];
  1134. tmp1 += a1 * b0;
  1135. b1 = pb0[1];
  1136. tmp2 += a0 * b1;
  1137. tmp3 += a1 * b1;
  1138. b2 = pb0[2];
  1139. tmp4 += a0 * b2;
  1140. tmp5 += a1 * b2;
  1141. b3 = pb0[3];
  1142. tmp6 += a0 * b3;
  1143. tmp7 += a1 * b3;
  1144. pa0 += 2;
  1145. pb0 += 4;
  1146. }
  1147. if ((temp - 1) & 1)
  1148. {
  1149. a0 = pa0[0];
  1150. b0 = pb0[0];
  1151. tmp0 += a0 * b0;
  1152. a1 = pa0[1];
  1153. tmp1 += a1 * b0;
  1154. b1 = pb0[1];
  1155. tmp2 += a0 * b1;
  1156. tmp3 += a1 * b1;
  1157. b2 = pb0[2];
  1158. tmp4 += a0 * b2;
  1159. tmp5 += a1 * b2;
  1160. b3 = pb0[3];
  1161. tmp6 += a0 * b3;
  1162. tmp7 += a1 * b3;
  1163. pa0 += 2;
  1164. pb0 += 4;
  1165. }
  1166. tmp0 = alpha * tmp0;
  1167. tmp2 = alpha * tmp2;
  1168. tmp4 = alpha * tmp4;
  1169. tmp6 = alpha * tmp6;
  1170. #if defined(TRMMKERNEL)
  1171. pc0[0] = tmp0;
  1172. pc1[0] = tmp2;
  1173. pc2[0] = tmp4;
  1174. pc3[0] = tmp6;
  1175. #else
  1176. pc0[0] += tmp0;
  1177. pc1[0] += tmp2;
  1178. pc2[0] += tmp4;
  1179. pc3[0] += tmp6;
  1180. #endif
  1181. tmp1 = alpha * tmp1;
  1182. tmp3 = alpha * tmp3;
  1183. tmp5 = alpha * tmp5;
  1184. tmp7 = alpha * tmp7;
  1185. #if defined(TRMMKERNEL)
  1186. pc0[1] = tmp1;
  1187. pc1[1] = tmp3;
  1188. pc2[1] = tmp5;
  1189. pc3[1] = tmp7;
  1190. #else
  1191. pc0[1] += tmp1;
  1192. pc1[1] += tmp3;
  1193. pc2[1] += tmp5;
  1194. pc3[1] += tmp7;
  1195. #endif
  1196. pc0 += 2;
  1197. pc1 += 2;
  1198. pc2 += 2;
  1199. pc3 += 2;
  1200. #if defined(TRMMKERNEL)
  1201. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1202. temp = k - off;
  1203. #ifdef LEFT
  1204. temp -= 2; // number of values in A
  1205. #else
  1206. temp -= 4; // number of values in B
  1207. #endif
  1208. pa0 += temp * 2;
  1209. pb0 += temp * 4;
  1210. #endif
  1211. #ifdef LEFT
  1212. off += 2; // number of values in A
  1213. #endif
  1214. #endif
  1215. }
  1216. if (m & 1)
  1217. {
  1218. #if defined(TRMMKERNEL)
  1219. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1220. pb0 = B;
  1221. #else
  1222. pa0 += off * 1;
  1223. pb0 = B + off * 4;
  1224. #endif
  1225. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1226. temp = k - off;
  1227. #elif defined(LEFT)
  1228. temp = off + 1; // number of values in A
  1229. #else
  1230. temp = off + 4; // number of values in B
  1231. #endif
  1232. #else
  1233. pb0 = B;
  1234. temp = k;
  1235. #endif
  1236. a0 = pa0[0];
  1237. b0 = pb0[0];
  1238. tmp0 = a0 * b0;
  1239. b1 = pb0[1];
  1240. tmp1 = a0 * b1;
  1241. b2 = pb0[2];
  1242. tmp2 = a0 * b2;
  1243. b3 = pb0[3];
  1244. tmp3 = a0 * b3;
  1245. pa0 += 1;
  1246. pb0 += 4;
  1247. for (l = ((temp - 1) >> 1); l--;)
  1248. {
  1249. a0 = pa0[0];
  1250. b0 = pb0[0];
  1251. tmp0 += a0 * b0;
  1252. b1 = pb0[1];
  1253. tmp1 += a0 * b1;
  1254. b2 = pb0[2];
  1255. tmp2 += a0 * b2;
  1256. b3 = pb0[3];
  1257. tmp3 += a0 * b3;
  1258. pa0 += 1;
  1259. pb0 += 4;
  1260. a0 = pa0[0];
  1261. b0 = pb0[0];
  1262. tmp0 += a0 * b0;
  1263. b1 = pb0[1];
  1264. tmp1 += a0 * b1;
  1265. b2 = pb0[2];
  1266. tmp2 += a0 * b2;
  1267. b3 = pb0[3];
  1268. tmp3 += a0 * b3;
  1269. pa0 += 1;
  1270. pb0 += 4;
  1271. }
  1272. if ((temp - 1) & 1)
  1273. {
  1274. a0 = pa0[0];
  1275. b0 = pb0[0];
  1276. tmp0 += a0 * b0;
  1277. b1 = pb0[1];
  1278. tmp1 += a0 * b1;
  1279. b2 = pb0[2];
  1280. tmp2 += a0 * b2;
  1281. b3 = pb0[3];
  1282. tmp3 += a0 * b3;
  1283. pa0 += 1;
  1284. pb0 += 4;
  1285. }
  1286. tmp0 = alpha * tmp0;
  1287. tmp1 = alpha * tmp1;
  1288. tmp2 = alpha * tmp2;
  1289. tmp3 = alpha * tmp3;
  1290. #if defined(TRMMKERNEL)
  1291. pc0[0] = tmp0;
  1292. pc1[0] = tmp1;
  1293. pc2[0] = tmp2;
  1294. pc3[0] = tmp3;
  1295. #else
  1296. pc0[0] += tmp0;
  1297. pc1[0] += tmp1;
  1298. pc2[0] += tmp2;
  1299. pc3[0] += tmp3;
  1300. #endif
  1301. pc0 += 1;
  1302. pc1 += 1;
  1303. pc2 += 1;
  1304. pc3 += 1;
  1305. #if defined(TRMMKERNEL)
  1306. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1307. temp = k - off;
  1308. #ifdef LEFT
  1309. temp -= 1; // number of values in A
  1310. #else
  1311. temp -= 4; // number of values in B
  1312. #endif
  1313. pa0 += temp * 1;
  1314. pb0 += temp * 4;
  1315. #endif
  1316. #ifdef LEFT
  1317. off += 1; // number of values in A
  1318. #endif
  1319. #endif
  1320. }
  1321. #if defined(TRMMKERNEL) && !defined(LEFT)
  1322. off += 4; // number of values in B
  1323. #endif
  1324. B += (k << 2);
  1325. C += (ldc << 2);
  1326. }
  1327. if (n & 2)
  1328. {
  1329. pc0 = C;
  1330. pc1 = pc0 + ldc;
  1331. #if defined(TRMMKERNEL) && defined(LEFT)
  1332. off = offset;
  1333. #endif
  1334. pa0 = A;
  1335. for (i = (m >> 3); i--;)
  1336. {
  1337. #if defined(TRMMKERNEL)
  1338. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1339. pb0 = B;
  1340. #else
  1341. pa0 += off * 8;
  1342. pb0 = B + off * 2;
  1343. #endif
  1344. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1345. temp = k - off;
  1346. #elif defined(LEFT)
  1347. temp = off + 8; // number of values in A
  1348. #else
  1349. temp = off + 2; // number of values in B
  1350. #endif
  1351. #else
  1352. pb0 = B;
  1353. temp = k;
  1354. #endif
  1355. LD_SP2_INC(pa0, 4, src_a0, src_a1);
  1356. src_b0[0] = pb0[0];
  1357. src_b0[1] = pb0[1];
  1358. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  1359. res0 = src_a0 * src_b;
  1360. res1 = src_a1 * src_b;
  1361. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  1362. res2 = src_a0 * src_b;
  1363. res3 = src_a1 * src_b;
  1364. pb0 += 2;
  1365. for (l = ((temp - 1) >> 1); l--;)
  1366. {
  1367. LD_SP2_INC(pa0, 4, src_a0, src_a1);
  1368. src_b0[0] = pb0[0];
  1369. src_b0[1] = pb0[1];
  1370. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  1371. res0 += src_a0 * src_b;
  1372. res1 += src_a1 * src_b;
  1373. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  1374. res2 += src_a0 * src_b;
  1375. res3 += src_a1 * src_b;
  1376. pb0 += 2;
  1377. LD_SP2_INC(pa0, 4, src_a0, src_a1);
  1378. src_b0[0] = pb0[0];
  1379. src_b0[1] = pb0[1];
  1380. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  1381. res0 += src_a0 * src_b;
  1382. res1 += src_a1 * src_b;
  1383. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  1384. res2 += src_a0 * src_b;
  1385. res3 += src_a1 * src_b;
  1386. pb0 += 2;
  1387. }
  1388. if ((temp - 1) & 1)
  1389. {
  1390. LD_SP2_INC(pa0, 4, src_a0, src_a1);
  1391. src_b0[0] = pb0[0];
  1392. src_b0[1] = pb0[1];
  1393. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  1394. res0 += src_a0 * src_b;
  1395. res1 += src_a1 * src_b;
  1396. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  1397. res2 += src_a0 * src_b;
  1398. res3 += src_a1 * src_b;
  1399. pb0 += 2;
  1400. }
  1401. #if defined(TRMMKERNEL)
  1402. dst0 = res0 * v_alpha;
  1403. dst1 = res1 * v_alpha;
  1404. dst2 = res2 * v_alpha;
  1405. dst3 = res3 * v_alpha;
  1406. #else
  1407. LD_SP2(pc0, 4, dst0, dst1);
  1408. LD_SP2(pc1, 4, dst2, dst3);
  1409. dst0 += res0 * v_alpha;
  1410. dst1 += res1 * v_alpha;
  1411. dst2 += res2 * v_alpha;
  1412. dst3 += res3 * v_alpha;
  1413. #endif
  1414. ST_SP2_INC(dst0, dst1, pc0, 4);
  1415. ST_SP2_INC(dst2, dst3, pc1, 4);
  1416. #if defined(TRMMKERNEL)
  1417. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1418. temp = k - off;
  1419. #ifdef LEFT
  1420. temp -= 8; // number of values in A
  1421. #else
  1422. temp -= 2; // number of values in B
  1423. #endif
  1424. pa0 += temp * 8;
  1425. pb0 += temp * 2;
  1426. #endif
  1427. #ifdef LEFT
  1428. off += 8; // number of values in A
  1429. #endif
  1430. #endif
  1431. }
  1432. if (m & 4)
  1433. {
  1434. #if defined(TRMMKERNEL)
  1435. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1436. pb0 = B;
  1437. #else
  1438. pa0 += off * 4;
  1439. pb0 = B + off * 2;
  1440. #endif
  1441. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1442. temp = k - off;
  1443. #elif defined(LEFT)
  1444. temp = off + 4; // number of values in A
  1445. #else
  1446. temp = off + 2; // number of values in B
  1447. #endif
  1448. #else
  1449. pb0 = B;
  1450. temp = k;
  1451. #endif
  1452. src_a0 = LD_SP(pa0);
  1453. src_b0[0] = pb0[0];
  1454. src_b0[1] = pb0[1];
  1455. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  1456. res0 = src_a0 * src_b;
  1457. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  1458. res1 = src_a0 * src_b;
  1459. pa0 += 4;
  1460. pb0 += 2;
  1461. for (l = ((temp - 1) >> 1); l--;)
  1462. {
  1463. src_a0 = LD_SP(pa0);
  1464. src_b0[0] = pb0[0];
  1465. src_b0[1] = pb0[1];
  1466. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  1467. res0 += src_a0 * src_b;
  1468. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  1469. res1 += src_a0 * src_b;
  1470. pa0 += 4;
  1471. pb0 += 2;
  1472. src_a0 = LD_SP(pa0);
  1473. src_b0[0] = pb0[0];
  1474. src_b0[1] = pb0[1];
  1475. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  1476. res0 += src_a0 * src_b;
  1477. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  1478. res1 += src_a0 * src_b;
  1479. pa0 += 4;
  1480. pb0 += 2;
  1481. }
  1482. if ((temp - 1) & 1)
  1483. {
  1484. src_a0 = LD_SP(pa0);
  1485. src_b0[0] = pb0[0];
  1486. src_b0[1] = pb0[1];
  1487. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  1488. res0 += src_a0 * src_b;
  1489. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  1490. res1 += src_a0 * src_b;
  1491. pa0 += 4;
  1492. pb0 += 2;
  1493. }
  1494. #if defined(TRMMKERNEL)
  1495. dst0 = res0 * v_alpha;
  1496. dst1 = res1 * v_alpha;
  1497. #else
  1498. dst0 = LD_SP(pc0);
  1499. dst1 = LD_SP(pc1);
  1500. dst0 += res0 * v_alpha;
  1501. dst1 += res1 * v_alpha;
  1502. #endif
  1503. ST_SP(dst0, pc0);
  1504. ST_SP(dst1, pc1);
  1505. pc0 += 4;
  1506. pc1 += 4;
  1507. #if defined(TRMMKERNEL)
  1508. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1509. temp = k - off;
  1510. #ifdef LEFT
  1511. temp -= 4; // number of values in A
  1512. #else
  1513. temp -= 2; // number of values in B
  1514. #endif
  1515. pa0 += temp * 4;
  1516. pb0 += temp * 2;
  1517. #endif
  1518. #ifdef LEFT
  1519. off += 4; // number of values in A
  1520. #endif
  1521. #endif
  1522. }
  1523. if (m & 2)
  1524. {
  1525. #if defined(TRMMKERNEL)
  1526. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1527. pb0 = B;
  1528. #else
  1529. pa0 += off * 2;
  1530. pb0 = B + off * 2;
  1531. #endif
  1532. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1533. temp = k - off;
  1534. #elif defined(LEFT)
  1535. temp = off + 2; // number of values in A
  1536. #else
  1537. temp = off + 2; // number of values in B
  1538. #endif
  1539. #else
  1540. pb0 = B;
  1541. temp = k;
  1542. #endif
  1543. a0 = pa0[0];
  1544. b0 = pb0[0];
  1545. tmp0 = a0 * b0;
  1546. a1 = pa0[1];
  1547. tmp1 = a1 * b0;
  1548. b1 = pb0[1];
  1549. tmp2 = a0 * b1;
  1550. tmp3 = a1 * b1;
  1551. pa0 += 2;
  1552. pb0 += 2;
  1553. for (l = ((temp - 1) >> 1); l--;)
  1554. {
  1555. a0 = pa0[0];
  1556. b0 = pb0[0];
  1557. tmp0 += a0 * b0;
  1558. a1 = pa0[1];
  1559. tmp1 += a1 * b0;
  1560. b1 = pb0[1];
  1561. tmp2 += a0 * b1;
  1562. tmp3 += a1 * b1;
  1563. pa0 += 2;
  1564. pb0 += 2;
  1565. a0 = pa0[0];
  1566. b0 = pb0[0];
  1567. tmp0 += a0 * b0;
  1568. a1 = pa0[1];
  1569. tmp1 += a1 * b0;
  1570. b1 = pb0[1];
  1571. tmp2 += a0 * b1;
  1572. tmp3 += a1 * b1;
  1573. pa0 += 2;
  1574. pb0 += 2;
  1575. }
  1576. if ((temp - 1) & 1)
  1577. {
  1578. a0 = pa0[0];
  1579. b0 = pb0[0];
  1580. tmp0 += a0 * b0;
  1581. a1 = pa0[1];
  1582. tmp1 += a1 * b0;
  1583. b1 = pb0[1];
  1584. tmp2 += a0 * b1;
  1585. tmp3 += a1 * b1;
  1586. pa0 += 2;
  1587. pb0 += 2;
  1588. }
  1589. tmp0 = alpha * tmp0;
  1590. tmp1 = alpha * tmp1;
  1591. tmp2 = alpha * tmp2;
  1592. tmp3 = alpha * tmp3;
  1593. #if defined(TRMMKERNEL)
  1594. pc0[0] = tmp0;
  1595. pc1[0] = tmp2;
  1596. pc0[1] = tmp1;
  1597. pc1[1] = tmp3;
  1598. #else
  1599. pc0[0] += tmp0;
  1600. pc1[0] += tmp2;
  1601. pc0[1] += tmp1;
  1602. pc1[1] += tmp3;
  1603. #endif
  1604. pc0 += 2;
  1605. pc1 += 2;
  1606. #if defined(TRMMKERNEL)
  1607. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1608. temp = k - off;
  1609. #ifdef LEFT
  1610. temp -= 2; // number of values in A
  1611. #else
  1612. temp -= 2; // number of values in B
  1613. #endif
  1614. pa0 += temp * 2;
  1615. pb0 += temp * 2;
  1616. #endif
  1617. #ifdef LEFT
  1618. off += 2; // number of values in A
  1619. #endif
  1620. #endif
  1621. }
  1622. if (m & 1)
  1623. {
  1624. #if defined(TRMMKERNEL)
  1625. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1626. pb0 = B;
  1627. #else
  1628. pa0 += off * 1;
  1629. pb0 = B + off * 2;
  1630. #endif
  1631. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1632. temp = k - off;
  1633. #elif defined(LEFT)
  1634. temp = off + 1; // number of values in A
  1635. #else
  1636. temp = off + 2; // number of values in B
  1637. #endif
  1638. #else
  1639. pb0 = B;
  1640. temp = k;
  1641. #endif
  1642. a0 = pa0[0];
  1643. b0 = pb0[0];
  1644. tmp0 = a0 * b0;
  1645. b1 = pb0[1];
  1646. tmp1 = a0 * b1;
  1647. pa0 += 1;
  1648. pb0 += 2;
  1649. for (l = ((temp - 1) >> 1); l--;)
  1650. {
  1651. a0 = pa0[0];
  1652. b0 = pb0[0];
  1653. tmp0 += a0 * b0;
  1654. b1 = pb0[1];
  1655. tmp1 += a0 * b1;
  1656. pa0 += 1;
  1657. pb0 += 2;
  1658. a0 = pa0[0];
  1659. b0 = pb0[0];
  1660. tmp0 += a0 * b0;
  1661. b1 = pb0[1];
  1662. tmp1 += a0 * b1;
  1663. pa0 += 1;
  1664. pb0 += 2;
  1665. }
  1666. if ((temp - 1) & 1)
  1667. {
  1668. a0 = pa0[0];
  1669. b0 = pb0[0];
  1670. tmp0 += a0 * b0;
  1671. b1 = pb0[1];
  1672. tmp1 += a0 * b1;
  1673. pa0 += 1;
  1674. pb0 += 2;
  1675. }
  1676. tmp0 = alpha * tmp0;
  1677. tmp1 = alpha * tmp1;
  1678. #if defined(TRMMKERNEL)
  1679. pc0[0] = tmp0;
  1680. pc1[0] = tmp1;
  1681. #else
  1682. pc0[0] += tmp0;
  1683. pc1[0] += tmp1;
  1684. #endif
  1685. pc0 += 1;
  1686. pc1 += 1;
  1687. #if defined(TRMMKERNEL)
  1688. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1689. temp = k - off;
  1690. #ifdef LEFT
  1691. temp -= 1; // number of values in A
  1692. #else
  1693. temp -= 2; // number of values in B
  1694. #endif
  1695. pa0 += temp * 1;
  1696. pb0 += temp * 2;
  1697. #endif
  1698. #ifdef LEFT
  1699. off += 1; // number of values in A
  1700. #endif
  1701. #endif
  1702. }
  1703. #if defined(TRMMKERNEL) && !defined(LEFT)
  1704. off += 2; // number of values in B
  1705. #endif
  1706. B += (k << 1);
  1707. C += (ldc << 1);
  1708. }
  1709. if (n & 1)
  1710. {
  1711. pc0 = C;
  1712. #if defined(TRMMKERNEL) && defined(LEFT)
  1713. off = offset;
  1714. #endif
  1715. pa0 = A;
  1716. for (i = (m >> 3); i--;)
  1717. {
  1718. #if defined(TRMMKERNEL)
  1719. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1720. pb0 = B;
  1721. #else
  1722. pa0 += off * 8;
  1723. pb0 = B + off * 1;
  1724. #endif
  1725. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1726. temp = k - off;
  1727. #elif defined(LEFT)
  1728. temp = off + 8; // number of values in A
  1729. #else
  1730. temp = off + 1; // number of values in B
  1731. #endif
  1732. #else
  1733. pb0 = B;
  1734. temp = k;
  1735. #endif
  1736. LD_SP2_INC(pa0, 4, src_a0, src_a1);
  1737. src_b0[0] = pb0[0];
  1738. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  1739. res0 = src_a0 * src_b;
  1740. res1 = src_a1 * src_b;
  1741. pb0 += 1;
  1742. for (l = ((temp - 1) >> 1); l--;)
  1743. {
  1744. LD_SP2_INC(pa0, 4, src_a0, src_a1);
  1745. src_b0[0] = pb0[0];
  1746. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  1747. res0 += src_a0 * src_b;
  1748. res1 += src_a1 * src_b;
  1749. pb0 += 1;
  1750. LD_SP2_INC(pa0, 4, src_a0, src_a1);
  1751. src_b0[0] = pb0[0];
  1752. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  1753. res0 += src_a0 * src_b;
  1754. res1 += src_a1 * src_b;
  1755. pb0 += 1;
  1756. }
  1757. if ((temp - 1) & 1)
  1758. {
  1759. LD_SP2_INC(pa0, 4, src_a0, src_a1);
  1760. src_b0[0] = pb0[0];
  1761. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  1762. res0 += src_a0 * src_b;
  1763. res1 += src_a1 * src_b;
  1764. pb0 += 1;
  1765. }
  1766. #if defined(TRMMKERNEL)
  1767. dst0 = res0 * v_alpha;
  1768. dst1 = res1 * v_alpha;
  1769. #else
  1770. LD_SP2(pc0, 4, dst0, dst1);
  1771. dst0 += res0 * v_alpha;
  1772. dst1 += res1 * v_alpha;
  1773. #endif
  1774. ST_SP2_INC(dst0, dst1, pc0, 4);
  1775. #if defined(TRMMKERNEL)
  1776. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1777. temp = k - off;
  1778. #ifdef LEFT
  1779. temp -= 8; // number of values in A
  1780. #else
  1781. temp -= 1; // number of values in B
  1782. #endif
  1783. pa0 += temp * 8;
  1784. pb0 += temp * 1;
  1785. #endif
  1786. #ifdef LEFT
  1787. off += 8; // number of values in A
  1788. #endif
  1789. #endif
  1790. }
  1791. if (m & 4)
  1792. {
  1793. #if defined(TRMMKERNEL)
  1794. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1795. pb0 = B;
  1796. #else
  1797. pa0 += off * 4;
  1798. pb0 = B + off * 1;
  1799. #endif
  1800. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1801. temp = k - off;
  1802. #elif defined(LEFT)
  1803. temp = off + 4; // number of values in A
  1804. #else
  1805. temp = off + 1; // number of values in B
  1806. #endif
  1807. #else
  1808. pb0 = B;
  1809. temp = k;
  1810. #endif
  1811. src_a0 = LD_SP(pa0);
  1812. src_b0[0] = pb0[0];
  1813. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  1814. res0 = src_a0 * src_b;
  1815. pa0 += 4;
  1816. pb0 += 1;
  1817. for (l = ((temp - 1) >> 1); l--;)
  1818. {
  1819. src_a0 = LD_SP(pa0);
  1820. src_b0[0] = pb0[0];
  1821. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  1822. res0 += src_a0 * src_b;
  1823. pa0 += 4;
  1824. pb0 += 1;
  1825. src_a0 = LD_SP(pa0);
  1826. src_b0[0] = pb0[0];
  1827. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  1828. res0 += src_a0 * src_b;
  1829. pa0 += 4;
  1830. pb0 += 1;
  1831. }
  1832. if ((temp - 1) & 1)
  1833. {
  1834. src_a0 = LD_SP(pa0);
  1835. src_b0[0] = pb0[0];
  1836. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  1837. res0 += src_a0 * src_b;
  1838. pa0 += 4;
  1839. pb0 += 1;
  1840. }
  1841. #if defined(TRMMKERNEL)
  1842. dst0 = res0 * v_alpha;
  1843. #else
  1844. dst0 = LD_SP(pc0);
  1845. dst0 += res0 * v_alpha;
  1846. #endif
  1847. ST_SP(dst0, pc0);
  1848. pc0 += 4;
  1849. #if defined(TRMMKERNEL)
  1850. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1851. temp = k - off;
  1852. #ifdef LEFT
  1853. temp -= 4; // number of values in A
  1854. #else
  1855. temp -= 1; // number of values in B
  1856. #endif
  1857. pa0 += temp * 4;
  1858. pb0 += temp * 1;
  1859. #endif
  1860. #ifdef LEFT
  1861. off += 4; // number of values in A
  1862. #endif
  1863. #endif
  1864. }
  1865. if (m & 2)
  1866. {
  1867. #if defined(TRMMKERNEL)
  1868. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1869. pb0 = B;
  1870. #else
  1871. pa0 += off * 2;
  1872. pb0 = B + off * 1;
  1873. #endif
  1874. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1875. temp = k - off;
  1876. #elif defined(LEFT)
  1877. temp = off + 2; // number of values in A
  1878. #else
  1879. temp = off + 1; // number of values in B
  1880. #endif
  1881. #else
  1882. pb0 = B;
  1883. temp = k;
  1884. #endif
  1885. a0 = pa0[0];
  1886. b0 = pb0[0];
  1887. tmp0 = a0 * b0;
  1888. a1 = pa0[1];
  1889. tmp1 = a1 * b0;
  1890. pa0 += 2;
  1891. pb0 += 1;
  1892. for (l = ((temp - 1) >> 1); l--;)
  1893. {
  1894. a0 = pa0[0];
  1895. b0 = pb0[0];
  1896. tmp0 += a0 * b0;
  1897. a1 = pa0[1];
  1898. tmp1 += a1 * b0;
  1899. pa0 += 2;
  1900. pb0 += 1;
  1901. a0 = pa0[0];
  1902. b0 = pb0[0];
  1903. tmp0 += a0 * b0;
  1904. a1 = pa0[1];
  1905. tmp1 += a1 * b0;
  1906. pa0 += 2;
  1907. pb0 += 1;
  1908. }
  1909. if ((temp - 1) & 1)
  1910. {
  1911. a0 = pa0[0];
  1912. b0 = pb0[0];
  1913. tmp0 += a0 * b0;
  1914. a1 = pa0[1];
  1915. tmp1 += a1 * b0;
  1916. pa0 += 2;
  1917. pb0 += 1;
  1918. }
  1919. tmp0 = alpha * tmp0;
  1920. tmp1 = alpha * tmp1;
  1921. #if defined(TRMMKERNEL)
  1922. pc0[0] = tmp0;
  1923. pc0[1] = tmp1;
  1924. #else
  1925. pc0[0] += tmp0;
  1926. pc0[1] += tmp1;
  1927. #endif
  1928. pc0 += 2;
  1929. #if defined(TRMMKERNEL)
  1930. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1931. temp = k - off;
  1932. #ifdef LEFT
  1933. temp -= 2; // number of values in A
  1934. #else
  1935. temp -= 1; // number of values in B
  1936. #endif
  1937. pa0 += temp * 2;
  1938. pb0 += temp * 1;
  1939. #endif
  1940. #ifdef LEFT
  1941. off += 2; // number of values in A
  1942. #endif
  1943. #endif
  1944. }
  1945. if (m & 1)
  1946. {
  1947. #if defined(TRMMKERNEL)
  1948. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1949. pb0 = B;
  1950. #else
  1951. pa0 += off * 1;
  1952. pb0 = B + off * 1;
  1953. #endif
  1954. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1955. temp = k - off;
  1956. #elif defined(LEFT)
  1957. temp = off + 1; // number of values in A
  1958. #else
  1959. temp = off + 1; // number of values in B
  1960. #endif
  1961. #else
  1962. pb0 = B;
  1963. temp = k;
  1964. #endif
  1965. a0 = pa0[0];
  1966. b0 = pb0[0];
  1967. tmp0 = a0 * b0;
  1968. pa0 += 1;
  1969. pb0 += 1;
  1970. for (l = ((temp - 1) >> 1); l--;)
  1971. {
  1972. a0 = pa0[0];
  1973. b0 = pb0[0];
  1974. tmp0 += a0 * b0;
  1975. pa0 += 1;
  1976. pb0 += 1;
  1977. a0 = pa0[0];
  1978. b0 = pb0[0];
  1979. tmp0 += a0 * b0;
  1980. pa0 += 1;
  1981. pb0 += 1;
  1982. }
  1983. if ((temp - 1) & 1)
  1984. {
  1985. a0 = pa0[0];
  1986. b0 = pb0[0];
  1987. tmp0 += a0 * b0;
  1988. pa0 += 1;
  1989. pb0 += 1;
  1990. }
  1991. #if defined(TRMMKERNEL)
  1992. pc0[0] = alpha * tmp0;
  1993. #else
  1994. pc0[0] += alpha * tmp0;
  1995. #endif
  1996. }
  1997. }
  1998. return 0;
  1999. }