You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

sgemm_kernel_8x8_msa.c 46 kB


  1. /*******************************************************************************
  2. Copyright (c) 2016, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #include "common.h"
  28. #include "macros_msa.h"
  29. int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
  30. FLOAT *C, BLASLONG ldc
  31. #ifdef TRMMKERNEL
  32. , BLASLONG offset
  33. #endif
  34. )
  35. {
  36. BLASLONG i, j, l;
  37. FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7;
  38. FLOAT *pa0, *pb0;
  39. FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  40. FLOAT tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
  41. FLOAT a0, a1;
  42. FLOAT b0, b1, b2, b3, b4, b5, b6, b7;
  43. v4f32 v_alpha = {alpha, alpha, alpha, alpha};
  44. v4f32 src_a0, src_a1, src_b, src_b0, src_b1;
  45. v4f32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  46. v4f32 res0, res1, res2, res3, res4, res5, res6, res7;
  47. v4f32 res8, res9, res10, res11, res12, res13, res14, res15;
  48. for (j = (n / 8); j--;)
  49. {
  50. pc0 = C;
  51. pc1 = pc0 + ldc;
  52. pc2 = pc1 + ldc;
  53. pc3 = pc2 + ldc;
  54. pc4 = pc3 + ldc;
  55. pc5 = pc4 + ldc;
  56. pc6 = pc5 + ldc;
  57. pc7 = pc6 + ldc;
  58. pa0 = A;
  59. for (i = (m / 8); i--;)
  60. {
  61. pb0 = B;
  62. LD_SP2(pa0, 4, src_a0, src_a1);
  63. LD_SP2(pb0, 4, src_b0, src_b1);
  64. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  65. res0 = src_a0 * src_b;
  66. res1 = src_a1 * src_b;
  67. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  68. res2 = src_a0 * src_b;
  69. res3 = src_a1 * src_b;
  70. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
  71. res4 = src_a0 * src_b;
  72. res5 = src_a1 * src_b;
  73. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
  74. res6 = src_a0 * src_b;
  75. res7 = src_a1 * src_b;
  76. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
  77. res8 = src_a0 * src_b;
  78. res9 = src_a1 * src_b;
  79. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
  80. res10 = src_a0 * src_b;
  81. res11 = src_a1 * src_b;
  82. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
  83. res12 = src_a0 * src_b;
  84. res13 = src_a1 * src_b;
  85. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
  86. res14 = src_a0 * src_b;
  87. res15 = src_a1 * src_b;
  88. pa0 += 8;
  89. pb0 += 8;
  90. for (l = ((k - 1) / 2); l--;)
  91. {
  92. LD_SP2(pa0, 4, src_a0, src_a1);
  93. LD_SP2(pb0, 4, src_b0, src_b1);
  94. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  95. res0 += src_a0 * src_b;
  96. res1 += src_a1 * src_b;
  97. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  98. res2 += src_a0 * src_b;
  99. res3 += src_a1 * src_b;
  100. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
  101. res4 += src_a0 * src_b;
  102. res5 += src_a1 * src_b;
  103. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
  104. res6 += src_a0 * src_b;
  105. res7 += src_a1 * src_b;
  106. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
  107. res8 += src_a0 * src_b;
  108. res9 += src_a1 * src_b;
  109. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
  110. res10 += src_a0 * src_b;
  111. res11 += src_a1 * src_b;
  112. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
  113. res12 += src_a0 * src_b;
  114. res13 += src_a1 * src_b;
  115. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
  116. res14 += src_a0 * src_b;
  117. res15 += src_a1 * src_b;
  118. pa0 += 8;
  119. pb0 += 8;
  120. LD_SP2(pa0, 4, src_a0, src_a1);
  121. LD_SP2(pb0, 4, src_b0, src_b1);
  122. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  123. res0 += src_a0 * src_b;
  124. res1 += src_a1 * src_b;
  125. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  126. res2 += src_a0 * src_b;
  127. res3 += src_a1 * src_b;
  128. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
  129. res4 += src_a0 * src_b;
  130. res5 += src_a1 * src_b;
  131. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
  132. res6 += src_a0 * src_b;
  133. res7 += src_a1 * src_b;
  134. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
  135. res8 += src_a0 * src_b;
  136. res9 += src_a1 * src_b;
  137. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
  138. res10 += src_a0 * src_b;
  139. res11 += src_a1 * src_b;
  140. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
  141. res12 += src_a0 * src_b;
  142. res13 += src_a1 * src_b;
  143. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
  144. res14 += src_a0 * src_b;
  145. res15 += src_a1 * src_b;
  146. pa0 += 8;
  147. pb0 += 8;
  148. }
  149. if ((k - 1) & 1)
  150. {
  151. LD_SP2(pa0, 4, src_a0, src_a1);
  152. LD_SP2(pb0, 4, src_b0, src_b1);
  153. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  154. res0 += src_a0 * src_b;
  155. res1 += src_a1 * src_b;
  156. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  157. res2 += src_a0 * src_b;
  158. res3 += src_a1 * src_b;
  159. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
  160. res4 += src_a0 * src_b;
  161. res5 += src_a1 * src_b;
  162. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
  163. res6 += src_a0 * src_b;
  164. res7 += src_a1 * src_b;
  165. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
  166. res8 += src_a0 * src_b;
  167. res9 += src_a1 * src_b;
  168. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
  169. res10 += src_a0 * src_b;
  170. res11 += src_a1 * src_b;
  171. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
  172. res12 += src_a0 * src_b;
  173. res13 += src_a1 * src_b;
  174. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
  175. res14 += src_a0 * src_b;
  176. res15 += src_a1 * src_b;
  177. pa0 += 8;
  178. pb0 += 8;
  179. }
  180. LD_SP2(pc0, 4, dst0, dst1);
  181. LD_SP2(pc1, 4, dst2, dst3);
  182. LD_SP2(pc2, 4, dst4, dst5);
  183. LD_SP2(pc3, 4, dst6, dst7);
  184. dst0 += res0 * v_alpha;
  185. dst1 += res1 * v_alpha;
  186. dst2 += res2 * v_alpha;
  187. dst3 += res3 * v_alpha;
  188. dst4 += res4 * v_alpha;
  189. dst5 += res5 * v_alpha;
  190. dst6 += res6 * v_alpha;
  191. dst7 += res7 * v_alpha;
  192. ST_SP2(dst0, dst1, pc0, 4);
  193. ST_SP2(dst2, dst3, pc1, 4);
  194. ST_SP2(dst4, dst5, pc2, 4);
  195. ST_SP2(dst6, dst7, pc3, 4);
  196. LD_SP2(pc4, 4, dst0, dst1);
  197. LD_SP2(pc5, 4, dst2, dst3);
  198. LD_SP2(pc6, 4, dst4, dst5);
  199. LD_SP2(pc7, 4, dst6, dst7);
  200. dst0 += res8 * v_alpha;
  201. dst1 += res9 * v_alpha;
  202. dst2 += res10 * v_alpha;
  203. dst3 += res11 * v_alpha;
  204. dst4 += res12 * v_alpha;
  205. dst5 += res13 * v_alpha;
  206. dst6 += res14 * v_alpha;
  207. dst7 += res15 * v_alpha;
  208. ST_SP2(dst0, dst1, pc4, 4);
  209. ST_SP2(dst2, dst3, pc5, 4);
  210. ST_SP2(dst4, dst5, pc6, 4);
  211. ST_SP2(dst6, dst7, pc7, 4);
  212. pc0 += 8;
  213. pc1 += 8;
  214. pc2 += 8;
  215. pc3 += 8;
  216. pc4 += 8;
  217. pc5 += 8;
  218. pc6 += 8;
  219. pc7 += 8;
  220. }
  221. for (i = ((m & 4) / 4); i--;)
  222. {
  223. pb0 = B;
  224. src_a0 = LD_SP(pa0);
  225. LD_SP2(pb0, 4, src_b0, src_b1);
  226. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  227. res0 = src_a0 * src_b;
  228. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  229. res1 = src_a0 * src_b;
  230. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
  231. res2 = src_a0 * src_b;
  232. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
  233. res3 = src_a0 * src_b;
  234. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
  235. res4 = src_a0 * src_b;
  236. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
  237. res5 = src_a0 * src_b;
  238. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
  239. res6 = src_a0 * src_b;
  240. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
  241. res7 = src_a0 * src_b;
  242. pa0 += 4;
  243. pb0 += 8;
  244. for (l = ((k - 1) / 2); l--;)
  245. {
  246. src_a0 = LD_SP(pa0);
  247. LD_SP2(pb0, 4, src_b0, src_b1);
  248. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  249. res0 += src_a0 * src_b;
  250. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  251. res1 += src_a0 * src_b;
  252. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
  253. res2 += src_a0 * src_b;
  254. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
  255. res3 += src_a0 * src_b;
  256. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
  257. res4 += src_a0 * src_b;
  258. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
  259. res5 += src_a0 * src_b;
  260. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
  261. res6 += src_a0 * src_b;
  262. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
  263. res7 += src_a0 * src_b;
  264. pa0 += 4;
  265. pb0 += 8;
  266. src_a0 = LD_SP(pa0);
  267. LD_SP2(pb0, 4, src_b0, src_b1);
  268. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  269. res0 += src_a0 * src_b;
  270. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  271. res1 += src_a0 * src_b;
  272. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
  273. res2 += src_a0 * src_b;
  274. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
  275. res3 += src_a0 * src_b;
  276. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
  277. res4 += src_a0 * src_b;
  278. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
  279. res5 += src_a0 * src_b;
  280. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
  281. res6 += src_a0 * src_b;
  282. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
  283. res7 += src_a0 * src_b;
  284. pa0 += 4;
  285. pb0 += 8;
  286. }
  287. if ((k - 1) & 1)
  288. {
  289. src_a0 = LD_SP(pa0);
  290. LD_SP2(pb0, 4, src_b0, src_b1);
  291. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  292. res0 += src_a0 * src_b;
  293. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  294. res1 += src_a0 * src_b;
  295. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
  296. res2 += src_a0 * src_b;
  297. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
  298. res3 += src_a0 * src_b;
  299. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0);
  300. res4 += src_a0 * src_b;
  301. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0x55);
  302. res5 += src_a0 * src_b;
  303. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xAA);
  304. res6 += src_a0 * src_b;
  305. src_b = (v4f32) __msa_shf_w((v4i32) src_b1, 0xFF);
  306. res7 += src_a0 * src_b;
  307. pa0 += 4;
  308. pb0 += 8;
  309. }
  310. dst0 = LD_SP(pc0);
  311. dst1 = LD_SP(pc1);
  312. dst2 = LD_SP(pc2);
  313. dst3 = LD_SP(pc3);
  314. dst0 += res0 * v_alpha;
  315. dst1 += res1 * v_alpha;
  316. dst2 += res2 * v_alpha;
  317. dst3 += res3 * v_alpha;
  318. ST_SP(dst0, pc0);
  319. ST_SP(dst1, pc1);
  320. ST_SP(dst2, pc2);
  321. ST_SP(dst3, pc3);
  322. dst0 = LD_SP(pc4);
  323. dst1 = LD_SP(pc5);
  324. dst2 = LD_SP(pc6);
  325. dst3 = LD_SP(pc7);
  326. dst0 += res4 * v_alpha;
  327. dst1 += res5 * v_alpha;
  328. dst2 += res6 * v_alpha;
  329. dst3 += res7 * v_alpha;
  330. ST_SP(dst0, pc4);
  331. ST_SP(dst1, pc5);
  332. ST_SP(dst2, pc6);
  333. ST_SP(dst3, pc7);
  334. pc0 += 4;
  335. pc1 += 4;
  336. pc2 += 4;
  337. pc3 += 4;
  338. pc4 += 4;
  339. pc5 += 4;
  340. pc6 += 4;
  341. pc7 += 4;
  342. }
  343. for (i = ((m & 2) / 2); i--;)
  344. {
  345. pb0 = B;
  346. a0 = pa0[0];
  347. b0 = pb0[0];
  348. tmp0 = a0 * b0;
  349. a1 = pa0[1];
  350. tmp1 = a1 * b0;
  351. b1 = pb0[1];
  352. tmp2 = a0 * b1;
  353. tmp3 = a1 * b1;
  354. b2 = pb0[2];
  355. tmp4 = a0 * b2;
  356. tmp5 = a1 * b2;
  357. b3 = pb0[3];
  358. tmp6 = a0 * b3;
  359. tmp7 = a1 * b3;
  360. b4 = pb0[4];
  361. tmp8 = a0 * b4;
  362. tmp9 = a1 * b4;
  363. b5 = pb0[5];
  364. tmp10 = a0 * b5;
  365. tmp11 = a1 * b5;
  366. b6 = pb0[6];
  367. tmp12 = a0 * b6;
  368. tmp13 = a1 * b6;
  369. b7 = pb0[7];
  370. tmp14 = a0 * b7;
  371. tmp15 = a1 * b7;
  372. pa0 += 2;
  373. pb0 += 8;
  374. for (l = ((k - 1) / 2); l--;)
  375. {
  376. a0 = pa0[0];
  377. b0 = pb0[0];
  378. tmp0 += a0 * b0;
  379. a1 = pa0[1];
  380. tmp1 += a1 * b0;
  381. b1 = pb0[1];
  382. tmp2 += a0 * b1;
  383. tmp3 += a1 * b1;
  384. b2 = pb0[2];
  385. tmp4 += a0 * b2;
  386. tmp5 += a1 * b2;
  387. b3 = pb0[3];
  388. tmp6 += a0 * b3;
  389. tmp7 += a1 * b3;
  390. b4 = pb0[4];
  391. tmp8 += a0 * b4;
  392. tmp9 += a1 * b4;
  393. b5 = pb0[5];
  394. tmp10 += a0 * b5;
  395. tmp11 += a1 * b5;
  396. b6 = pb0[6];
  397. tmp12 += a0 * b6;
  398. tmp13 += a1 * b6;
  399. b7 = pb0[7];
  400. tmp14 += a0 * b7;
  401. tmp15 += a1 * b7;
  402. pa0 += 2;
  403. pb0 += 8;
  404. a0 = pa0[0];
  405. b0 = pb0[0];
  406. tmp0 += a0 * b0;
  407. a1 = pa0[1];
  408. tmp1 += a1 * b0;
  409. b1 = pb0[1];
  410. tmp2 += a0 * b1;
  411. tmp3 += a1 * b1;
  412. b2 = pb0[2];
  413. tmp4 += a0 * b2;
  414. tmp5 += a1 * b2;
  415. b3 = pb0[3];
  416. tmp6 += a0 * b3;
  417. tmp7 += a1 * b3;
  418. b4 = pb0[4];
  419. tmp8 += a0 * b4;
  420. tmp9 += a1 * b4;
  421. b5 = pb0[5];
  422. tmp10 += a0 * b5;
  423. tmp11 += a1 * b5;
  424. b6 = pb0[6];
  425. tmp12 += a0 * b6;
  426. tmp13 += a1 * b6;
  427. b7 = pb0[7];
  428. tmp14 += a0 * b7;
  429. tmp15 += a1 * b7;
  430. pa0 += 2;
  431. pb0 += 8;
  432. }
  433. if ((k - 1) & 1)
  434. {
  435. a0 = pa0[0];
  436. b0 = pb0[0];
  437. tmp0 += a0 * b0;
  438. a1 = pa0[1];
  439. tmp1 += a1 * b0;
  440. b1 = pb0[1];
  441. tmp2 += a0 * b1;
  442. tmp3 += a1 * b1;
  443. b2 = pb0[2];
  444. tmp4 += a0 * b2;
  445. tmp5 += a1 * b2;
  446. b3 = pb0[3];
  447. tmp6 += a0 * b3;
  448. tmp7 += a1 * b3;
  449. b4 = pb0[4];
  450. tmp8 += a0 * b4;
  451. tmp9 += a1 * b4;
  452. b5 = pb0[5];
  453. tmp10 += a0 * b5;
  454. tmp11 += a1 * b5;
  455. b6 = pb0[6];
  456. tmp12 += a0 * b6;
  457. tmp13 += a1 * b6;
  458. b7 = pb0[7];
  459. tmp14 += a0 * b7;
  460. tmp15 += a1 * b7;
  461. pa0 += 2;
  462. pb0 += 8;
  463. }
  464. tmp0 = alpha * tmp0;
  465. tmp2 = alpha * tmp2;
  466. tmp4 = alpha * tmp4;
  467. tmp6 = alpha * tmp6;
  468. tmp8 = alpha * tmp8;
  469. tmp10 = alpha * tmp10;
  470. tmp12 = alpha * tmp12;
  471. tmp14 = alpha * tmp14;
  472. pc0[0] += tmp0;
  473. pc1[0] += tmp2;
  474. pc2[0] += tmp4;
  475. pc3[0] += tmp6;
  476. pc4[0] += tmp8;
  477. pc5[0] += tmp10;
  478. pc6[0] += tmp12;
  479. pc7[0] += tmp14;
  480. tmp1 = alpha * tmp1;
  481. tmp3 = alpha * tmp3;
  482. tmp5 = alpha * tmp5;
  483. tmp7 = alpha * tmp7;
  484. tmp9 = alpha * tmp9;
  485. tmp11 = alpha * tmp11;
  486. tmp13 = alpha * tmp13;
  487. tmp15 = alpha * tmp15;
  488. pc0[1] += tmp1;
  489. pc1[1] += tmp3;
  490. pc2[1] += tmp5;
  491. pc3[1] += tmp7;
  492. pc4[1] += tmp9;
  493. pc5[1] += tmp11;
  494. pc6[1] += tmp13;
  495. pc7[1] += tmp15;
  496. pc0 += 2;
  497. pc1 += 2;
  498. pc2 += 2;
  499. pc3 += 2;
  500. pc4 += 2;
  501. pc5 += 2;
  502. pc6 += 2;
  503. pc7 += 2;
  504. }
  505. for (i = (m & 1); i--;)
  506. {
  507. pb0 = B;
  508. a0 = pa0[0];
  509. b0 = pb0[0];
  510. tmp0 = a0 * b0;
  511. b1 = pb0[1];
  512. tmp1 = a0 * b1;
  513. b2 = pb0[2];
  514. tmp2 = a0 * b2;
  515. b3 = pb0[3];
  516. tmp3 = a0 * b3;
  517. b4 = pb0[4];
  518. tmp4 = a0 * b4;
  519. b5 = pb0[5];
  520. tmp5 = a0 * b5;
  521. b6 = pb0[6];
  522. tmp6 = a0 * b6;
  523. b7 = pb0[7];
  524. tmp7 = a0 * b7;
  525. pa0 += 1;
  526. pb0 += 8;
  527. for (l = ((k - 1) / 2); l--;)
  528. {
  529. a0 = pa0[0];
  530. b0 = pb0[0];
  531. tmp0 += a0 * b0;
  532. b1 = pb0[1];
  533. tmp1 += a0 * b1;
  534. b2 = pb0[2];
  535. tmp2 += a0 * b2;
  536. b3 = pb0[3];
  537. tmp3 += a0 * b3;
  538. b4 = pb0[4];
  539. tmp4 += a0 * b4;
  540. b5 = pb0[5];
  541. tmp5 += a0 * b5;
  542. b6 = pb0[6];
  543. tmp6 += a0 * b6;
  544. b7 = pb0[7];
  545. tmp7 += a0 * b7;
  546. pa0 += 1;
  547. pb0 += 8;
  548. a0 = pa0[0];
  549. b0 = pb0[0];
  550. tmp0 += a0 * b0;
  551. b1 = pb0[1];
  552. tmp1 += a0 * b1;
  553. b2 = pb0[2];
  554. tmp2 += a0 * b2;
  555. b3 = pb0[3];
  556. tmp3 += a0 * b3;
  557. b4 = pb0[4];
  558. tmp4 += a0 * b4;
  559. b5 = pb0[5];
  560. tmp5 += a0 * b5;
  561. b6 = pb0[6];
  562. tmp6 += a0 * b6;
  563. b7 = pb0[7];
  564. tmp7 += a0 * b7;
  565. pa0 += 1;
  566. pb0 += 8;
  567. }
  568. if ((k - 1) & 1)
  569. {
  570. a0 = pa0[0];
  571. b0 = pb0[0];
  572. tmp0 += a0 * b0;
  573. b1 = pb0[1];
  574. tmp1 += a0 * b1;
  575. b2 = pb0[2];
  576. tmp2 += a0 * b2;
  577. b3 = pb0[3];
  578. tmp3 += a0 * b3;
  579. b4 = pb0[4];
  580. tmp4 += a0 * b4;
  581. b5 = pb0[5];
  582. tmp5 += a0 * b5;
  583. b6 = pb0[6];
  584. tmp6 += a0 * b6;
  585. b7 = pb0[7];
  586. tmp7 += a0 * b7;
  587. pa0 += 1;
  588. pb0 += 8;
  589. }
  590. tmp0 = alpha * tmp0;
  591. tmp1 = alpha * tmp1;
  592. tmp2 = alpha * tmp2;
  593. tmp3 = alpha * tmp3;
  594. tmp4 = alpha * tmp4;
  595. tmp5 = alpha * tmp5;
  596. tmp6 = alpha * tmp6;
  597. tmp7 = alpha * tmp7;
  598. pc0[0] += tmp0;
  599. pc1[0] += tmp1;
  600. pc2[0] += tmp2;
  601. pc3[0] += tmp3;
  602. pc4[0] += tmp4;
  603. pc5[0] += tmp5;
  604. pc6[0] += tmp6;
  605. pc7[0] += tmp7;
  606. pc0 += 1;
  607. pc1 += 1;
  608. pc2 += 1;
  609. pc3 += 1;
  610. pc4 += 1;
  611. pc5 += 1;
  612. pc6 += 1;
  613. pc7 += 1;
  614. }
  615. l = (k << 3);
  616. B = B + l;
  617. i = (ldc << 3);
  618. C = C + i;
  619. }
  620. for (j = ((n & 4) / 4); j--;)
  621. {
  622. pc0 = C;
  623. pc1 = pc0 + ldc;
  624. pc2 = pc1 + ldc;
  625. pc3 = pc2 + ldc;
  626. pa0 = A;
  627. for (i = (m / 8); i--;)
  628. {
  629. pb0 = B;
  630. LD_SP2(pa0, 4, src_a0, src_a1);
  631. src_b0 = LD_SP(pb0);
  632. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  633. res0 = src_a0 * src_b;
  634. res1 = src_a1 * src_b;
  635. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  636. res2 = src_a0 * src_b;
  637. res3 = src_a1 * src_b;
  638. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
  639. res4 = src_a0 * src_b;
  640. res5 = src_a1 * src_b;
  641. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
  642. res6 = src_a0 * src_b;
  643. res7 = src_a1 * src_b;
  644. pa0 += 8;
  645. pb0 += 4;
  646. for (l = ((k - 1) / 2); l--;)
  647. {
  648. LD_SP2(pa0, 4, src_a0, src_a1);
  649. src_b0 = LD_SP(pb0);
  650. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  651. res0 += src_a0 * src_b;
  652. res1 += src_a1 * src_b;
  653. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  654. res2 += src_a0 * src_b;
  655. res3 += src_a1 * src_b;
  656. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
  657. res4 += src_a0 * src_b;
  658. res5 += src_a1 * src_b;
  659. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
  660. res6 += src_a0 * src_b;
  661. res7 += src_a1 * src_b;
  662. pa0 += 8;
  663. pb0 += 4;
  664. LD_SP2(pa0, 4, src_a0, src_a1);
  665. src_b0 = LD_SP(pb0);
  666. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  667. res0 += src_a0 * src_b;
  668. res1 += src_a1 * src_b;
  669. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  670. res2 += src_a0 * src_b;
  671. res3 += src_a1 * src_b;
  672. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
  673. res4 += src_a0 * src_b;
  674. res5 += src_a1 * src_b;
  675. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
  676. res6 += src_a0 * src_b;
  677. res7 += src_a1 * src_b;
  678. pa0 += 8;
  679. pb0 += 4;
  680. }
  681. if ((k - 1) & 1)
  682. {
  683. LD_SP2(pa0, 4, src_a0, src_a1);
  684. src_b0 = LD_SP(pb0);
  685. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  686. res0 += src_a0 * src_b;
  687. res1 += src_a1 * src_b;
  688. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  689. res2 += src_a0 * src_b;
  690. res3 += src_a1 * src_b;
  691. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
  692. res4 += src_a0 * src_b;
  693. res5 += src_a1 * src_b;
  694. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
  695. res6 += src_a0 * src_b;
  696. res7 += src_a1 * src_b;
  697. pa0 += 8;
  698. pb0 += 4;
  699. }
  700. LD_SP2(pc0, 4, dst0, dst1);
  701. LD_SP2(pc1, 4, dst2, dst3);
  702. LD_SP2(pc2, 4, dst4, dst5);
  703. LD_SP2(pc3, 4, dst6, dst7);
  704. dst0 += res0 * v_alpha;
  705. dst1 += res1 * v_alpha;
  706. dst2 += res2 * v_alpha;
  707. dst3 += res3 * v_alpha;
  708. dst4 += res4 * v_alpha;
  709. dst5 += res5 * v_alpha;
  710. dst6 += res6 * v_alpha;
  711. dst7 += res7 * v_alpha;
  712. ST_SP2(dst0, dst1, pc0, 4);
  713. ST_SP2(dst2, dst3, pc1, 4);
  714. ST_SP2(dst4, dst5, pc2, 4);
  715. ST_SP2(dst6, dst7, pc3, 4);
  716. pc0 += 8;
  717. pc1 += 8;
  718. pc2 += 8;
  719. pc3 += 8;
  720. }
  721. for (i = ((m & 4) / 4); i--;)
  722. {
  723. pb0 = B;
  724. src_a0 = LD_SP(pa0);
  725. src_b0 = LD_SP(pb0);
  726. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  727. res0 = src_a0 * src_b;
  728. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  729. res1 = src_a0 * src_b;
  730. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
  731. res2 = src_a0 * src_b;
  732. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
  733. res3 = src_a0 * src_b;
  734. pa0 += 4;
  735. pb0 += 4;
  736. for (l = ((k - 1) / 2); l--;)
  737. {
  738. src_a0 = LD_SP(pa0);
  739. src_b0 = LD_SP(pb0);
  740. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  741. res0 += src_a0 * src_b;
  742. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  743. res1 += src_a0 * src_b;
  744. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
  745. res2 += src_a0 * src_b;
  746. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
  747. res3 += src_a0 * src_b;
  748. pa0 += 4;
  749. pb0 += 4;
  750. src_a0 = LD_SP(pa0);
  751. src_b0 = LD_SP(pb0);
  752. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  753. res0 += src_a0 * src_b;
  754. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  755. res1 += src_a0 * src_b;
  756. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
  757. res2 += src_a0 * src_b;
  758. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
  759. res3 += src_a0 * src_b;
  760. pa0 += 4;
  761. pb0 += 4;
  762. }
  763. if ((k - 1) & 1)
  764. {
  765. src_a0 = LD_SP(pa0);
  766. src_b0 = LD_SP(pb0);
  767. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  768. res0 += src_a0 * src_b;
  769. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  770. res1 += src_a0 * src_b;
  771. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xAA);
  772. res2 += src_a0 * src_b;
  773. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0xFF);
  774. res3 += src_a0 * src_b;
  775. pa0 += 4;
  776. pb0 += 4;
  777. }
  778. dst0 = LD_SP(pc0);
  779. dst1 = LD_SP(pc1);
  780. dst2 = LD_SP(pc2);
  781. dst3 = LD_SP(pc3);
  782. dst0 += res0 * v_alpha;
  783. dst1 += res1 * v_alpha;
  784. dst2 += res2 * v_alpha;
  785. dst3 += res3 * v_alpha;
  786. ST_SP(dst0, pc0);
  787. ST_SP(dst1, pc1);
  788. ST_SP(dst2, pc2);
  789. ST_SP(dst3, pc3);
  790. pc0 += 4;
  791. pc1 += 4;
  792. pc2 += 4;
  793. pc3 += 4;
  794. }
  795. for (i = ((m & 2) / 2); i--;)
  796. {
  797. pb0 = B;
  798. a0 = pa0[0];
  799. b0 = pb0[0];
  800. tmp0 = a0 * b0;
  801. a1 = pa0[1];
  802. tmp1 = a1 * b0;
  803. b1 = pb0[1];
  804. tmp2 = a0 * b1;
  805. tmp3 = a1 * b1;
  806. b2 = pb0[2];
  807. tmp4 = a0 * b2;
  808. tmp5 = a1 * b2;
  809. b3 = pb0[3];
  810. tmp6 = a0 * b3;
  811. tmp7 = a1 * b3;
  812. pa0 += 2;
  813. pb0 += 4;
  814. for (l = ((k - 1) / 2); l--;)
  815. {
  816. a0 = pa0[0];
  817. b0 = pb0[0];
  818. tmp0 += a0 * b0;
  819. a1 = pa0[1];
  820. tmp1 += a1 * b0;
  821. b1 = pb0[1];
  822. tmp2 += a0 * b1;
  823. tmp3 += a1 * b1;
  824. b2 = pb0[2];
  825. tmp4 += a0 * b2;
  826. tmp5 += a1 * b2;
  827. b3 = pb0[3];
  828. tmp6 += a0 * b3;
  829. tmp7 += a1 * b3;
  830. pa0 += 2;
  831. pb0 += 4;
  832. a0 = pa0[0];
  833. b0 = pb0[0];
  834. tmp0 += a0 * b0;
  835. a1 = pa0[1];
  836. tmp1 += a1 * b0;
  837. b1 = pb0[1];
  838. tmp2 += a0 * b1;
  839. tmp3 += a1 * b1;
  840. b2 = pb0[2];
  841. tmp4 += a0 * b2;
  842. tmp5 += a1 * b2;
  843. b3 = pb0[3];
  844. tmp6 += a0 * b3;
  845. tmp7 += a1 * b3;
  846. pa0 += 2;
  847. pb0 += 4;
  848. }
  849. if ((k - 1) & 1)
  850. {
  851. a0 = pa0[0];
  852. b0 = pb0[0];
  853. tmp0 += a0 * b0;
  854. a1 = pa0[1];
  855. tmp1 += a1 * b0;
  856. b1 = pb0[1];
  857. tmp2 += a0 * b1;
  858. tmp3 += a1 * b1;
  859. b2 = pb0[2];
  860. tmp4 += a0 * b2;
  861. tmp5 += a1 * b2;
  862. b3 = pb0[3];
  863. tmp6 += a0 * b3;
  864. tmp7 += a1 * b3;
  865. pa0 += 2;
  866. pb0 += 4;
  867. }
  868. tmp0 = alpha * tmp0;
  869. tmp2 = alpha * tmp2;
  870. tmp4 = alpha * tmp4;
  871. tmp6 = alpha * tmp6;
  872. pc0[0] += tmp0;
  873. pc1[0] += tmp2;
  874. pc2[0] += tmp4;
  875. pc3[0] += tmp6;
  876. tmp1 = alpha * tmp1;
  877. tmp3 = alpha * tmp3;
  878. tmp5 = alpha * tmp5;
  879. tmp7 = alpha * tmp7;
  880. pc0[1] += tmp1;
  881. pc1[1] += tmp3;
  882. pc2[1] += tmp5;
  883. pc3[1] += tmp7;
  884. pc0 += 2;
  885. pc1 += 2;
  886. pc2 += 2;
  887. pc3 += 2;
  888. }
  889. for (i = (m & 1); i--;)
  890. {
  891. pb0 = B;
  892. a0 = pa0[0];
  893. b0 = pb0[0];
  894. tmp0 = a0 * b0;
  895. b1 = pb0[1];
  896. tmp1 = a0 * b1;
  897. b2 = pb0[2];
  898. tmp2 = a0 * b2;
  899. b3 = pb0[3];
  900. tmp3 = a0 * b3;
  901. pa0 += 1;
  902. pb0 += 4;
  903. for (l = ((k - 1) / 2); l--;)
  904. {
  905. a0 = pa0[0];
  906. b0 = pb0[0];
  907. tmp0 += a0 * b0;
  908. b1 = pb0[1];
  909. tmp1 += a0 * b1;
  910. b2 = pb0[2];
  911. tmp2 += a0 * b2;
  912. b3 = pb0[3];
  913. tmp3 += a0 * b3;
  914. pa0 += 1;
  915. pb0 += 4;
  916. a0 = pa0[0];
  917. b0 = pb0[0];
  918. tmp0 += a0 * b0;
  919. b1 = pb0[1];
  920. tmp1 += a0 * b1;
  921. b2 = pb0[2];
  922. tmp2 += a0 * b2;
  923. b3 = pb0[3];
  924. tmp3 += a0 * b3;
  925. pa0 += 1;
  926. pb0 += 4;
  927. }
  928. if ((k - 1) & 1)
  929. {
  930. a0 = pa0[0];
  931. b0 = pb0[0];
  932. tmp0 += a0 * b0;
  933. b1 = pb0[1];
  934. tmp1 += a0 * b1;
  935. b2 = pb0[2];
  936. tmp2 += a0 * b2;
  937. b3 = pb0[3];
  938. tmp3 += a0 * b3;
  939. pa0 += 1;
  940. pb0 += 4;
  941. }
  942. tmp0 = alpha * tmp0;
  943. tmp1 = alpha * tmp1;
  944. tmp2 = alpha * tmp2;
  945. tmp3 = alpha * tmp3;
  946. pc0[0] += tmp0;
  947. pc1[0] += tmp1;
  948. pc2[0] += tmp2;
  949. pc3[0] += tmp3;
  950. pc0 += 1;
  951. pc1 += 1;
  952. pc2 += 1;
  953. pc3 += 1;
  954. }
  955. l = (k << 2);
  956. B = B + l;
  957. i = (ldc << 2);
  958. C = C + i;
  959. }
  960. for (j = ((n & 2) / 2); j--;)
  961. {
  962. pc0 = C;
  963. pc1 = pc0 + ldc;
  964. pa0 = A;
  965. for (i = (m / 8); i--;)
  966. {
  967. pb0 = B;
  968. LD_SP2(pa0, 4, src_a0, src_a1);
  969. src_b0[0] = pb0[0];
  970. src_b0[1] = pb0[1];
  971. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  972. res0 = src_a0 * src_b;
  973. res1 = src_a1 * src_b;
  974. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  975. res2 = src_a0 * src_b;
  976. res3 = src_a1 * src_b;
  977. pa0 += 8;
  978. pb0 += 2;
  979. for (l = ((k - 1) / 2); l--;)
  980. {
  981. LD_SP2(pa0, 4, src_a0, src_a1);
  982. src_b0[0] = pb0[0];
  983. src_b0[1] = pb0[1];
  984. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  985. res0 += src_a0 * src_b;
  986. res1 += src_a1 * src_b;
  987. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  988. res2 += src_a0 * src_b;
  989. res3 += src_a1 * src_b;
  990. pa0 += 8;
  991. pb0 += 2;
  992. LD_SP2(pa0, 4, src_a0, src_a1);
  993. src_b0[0] = pb0[0];
  994. src_b0[1] = pb0[1];
  995. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  996. res0 += src_a0 * src_b;
  997. res1 += src_a1 * src_b;
  998. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  999. res2 += src_a0 * src_b;
  1000. res3 += src_a1 * src_b;
  1001. pa0 += 8;
  1002. pb0 += 2;
  1003. }
  1004. if ((k - 1) & 1)
  1005. {
  1006. LD_SP2(pa0, 4, src_a0, src_a1);
  1007. src_b0[0] = pb0[0];
  1008. src_b0[1] = pb0[1];
  1009. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  1010. res0 += src_a0 * src_b;
  1011. res1 += src_a1 * src_b;
  1012. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  1013. res2 += src_a0 * src_b;
  1014. res3 += src_a1 * src_b;
  1015. pa0 += 8;
  1016. pb0 += 2;
  1017. }
  1018. LD_SP2(pc0, 4, dst0, dst1);
  1019. LD_SP2(pc1, 4, dst2, dst3);
  1020. dst0 += res0 * v_alpha;
  1021. dst1 += res1 * v_alpha;
  1022. dst2 += res2 * v_alpha;
  1023. dst3 += res3 * v_alpha;
  1024. ST_SP2(dst0, dst1, pc0, 4);
  1025. ST_SP2(dst2, dst3, pc1, 4);
  1026. pc0 += 8;
  1027. pc1 += 8;
  1028. }
  1029. for (i = ((m & 4) / 4); i--;)
  1030. {
  1031. pb0 = B;
  1032. src_a0 = LD_SP(pa0);
  1033. src_b0[0] = pb0[0];
  1034. src_b0[1] = pb0[1];
  1035. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  1036. res0 = src_a0 * src_b;
  1037. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  1038. res1 = src_a0 * src_b;
  1039. pa0 += 4;
  1040. pb0 += 2;
  1041. for (l = ((k - 1) / 2); l--;)
  1042. {
  1043. src_a0 = LD_SP(pa0);
  1044. src_b0[0] = pb0[0];
  1045. src_b0[1] = pb0[1];
  1046. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  1047. res0 += src_a0 * src_b;
  1048. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  1049. res1 += src_a0 * src_b;
  1050. pa0 += 4;
  1051. pb0 += 2;
  1052. src_a0 = LD_SP(pa0);
  1053. src_b0[0] = pb0[0];
  1054. src_b0[1] = pb0[1];
  1055. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  1056. res0 += src_a0 * src_b;
  1057. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  1058. res1 += src_a0 * src_b;
  1059. pa0 += 4;
  1060. pb0 += 2;
  1061. }
  1062. if ((k - 1) & 1)
  1063. {
  1064. src_a0 = LD_SP(pa0);
  1065. src_b0[0] = pb0[0];
  1066. src_b0[1] = pb0[1];
  1067. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  1068. res0 += src_a0 * src_b;
  1069. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0x55);
  1070. res1 += src_a0 * src_b;
  1071. pa0 += 4;
  1072. pb0 += 2;
  1073. }
  1074. dst0 = LD_SP(pc0);
  1075. dst1 = LD_SP(pc1);
  1076. dst0 += res0 * v_alpha;
  1077. dst1 += res1 * v_alpha;
  1078. ST_SP(dst0, pc0);
  1079. ST_SP(dst1, pc1);
  1080. pc0 += 4;
  1081. pc1 += 4;
  1082. }
  1083. for (i = ((m & 2) / 2); i--;)
  1084. {
  1085. pb0 = B;
  1086. a0 = pa0[0];
  1087. b0 = pb0[0];
  1088. tmp0 = a0 * b0;
  1089. a1 = pa0[1];
  1090. tmp1 = a1 * b0;
  1091. b1 = pb0[1];
  1092. tmp2 = a0 * b1;
  1093. tmp3 = a1 * b1;
  1094. pa0 += 2;
  1095. pb0 += 2;
  1096. for (l = ((k - 1) / 2); l--;)
  1097. {
  1098. a0 = pa0[0];
  1099. b0 = pb0[0];
  1100. tmp0 += a0 * b0;
  1101. a1 = pa0[1];
  1102. tmp1 += a1 * b0;
  1103. b1 = pb0[1];
  1104. tmp2 += a0 * b1;
  1105. tmp3 += a1 * b1;
  1106. pa0 += 2;
  1107. pb0 += 2;
  1108. a0 = pa0[0];
  1109. b0 = pb0[0];
  1110. tmp0 += a0 * b0;
  1111. a1 = pa0[1];
  1112. tmp1 += a1 * b0;
  1113. b1 = pb0[1];
  1114. tmp2 += a0 * b1;
  1115. tmp3 += a1 * b1;
  1116. pa0 += 2;
  1117. pb0 += 2;
  1118. }
  1119. if ((k - 1) & 1)
  1120. {
  1121. a0 = pa0[0];
  1122. b0 = pb0[0];
  1123. tmp0 += a0 * b0;
  1124. a1 = pa0[1];
  1125. tmp1 += a1 * b0;
  1126. b1 = pb0[1];
  1127. tmp2 += a0 * b1;
  1128. tmp3 += a1 * b1;
  1129. pa0 += 2;
  1130. pb0 += 2;
  1131. }
  1132. tmp0 = alpha * tmp0;
  1133. tmp2 = alpha * tmp2;
  1134. pc0[0] += tmp0;
  1135. pc1[0] += tmp2;
  1136. tmp1 = alpha * tmp1;
  1137. tmp3 = alpha * tmp3;
  1138. pc0[1] += tmp1;
  1139. pc1[1] += tmp3;
  1140. pc0 += 2;
  1141. pc1 += 2;
  1142. }
  1143. for (i = (m & 1); i--;)
  1144. {
  1145. pb0 = B;
  1146. a0 = pa0[0];
  1147. b0 = pb0[0];
  1148. tmp0 = a0 * b0;
  1149. b1 = pb0[1];
  1150. tmp1 = a0 * b1;
  1151. pa0 += 1;
  1152. pb0 += 2;
  1153. for (l = ((k - 1) / 2); l--;)
  1154. {
  1155. a0 = pa0[0];
  1156. b0 = pb0[0];
  1157. tmp0 += a0 * b0;
  1158. b1 = pb0[1];
  1159. tmp1 += a0 * b1;
  1160. pa0 += 1;
  1161. pb0 += 2;
  1162. a0 = pa0[0];
  1163. b0 = pb0[0];
  1164. tmp0 += a0 * b0;
  1165. b1 = pb0[1];
  1166. tmp1 += a0 * b1;
  1167. pa0 += 1;
  1168. pb0 += 2;
  1169. }
  1170. if ((k - 1) & 1)
  1171. {
  1172. a0 = pa0[0];
  1173. b0 = pb0[0];
  1174. tmp0 += a0 * b0;
  1175. b1 = pb0[1];
  1176. tmp1 += a0 * b1;
  1177. pa0 += 1;
  1178. pb0 += 2;
  1179. }
  1180. tmp0 = alpha * tmp0;
  1181. tmp1 = alpha * tmp1;
  1182. pc0[0] += tmp0;
  1183. pc1[0] += tmp1;
  1184. pc0 += 1;
  1185. pc1 += 1;
  1186. }
  1187. l = (k << 1);
  1188. B = B + l;
  1189. i = (ldc << 1);
  1190. C = C + i;
  1191. }
  1192. for (j = (n & 1); j--;)
  1193. {
  1194. pc0 = C;
  1195. pa0 = A;
  1196. for (i = (m / 8); i--;)
  1197. {
  1198. pb0 = B;
  1199. LD_SP2(pa0, 4, src_a0, src_a1);
  1200. src_b0[0] = pb0[0];
  1201. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  1202. res0 = src_a0 * src_b;
  1203. res1 = src_a1 * src_b;
  1204. pa0 += 8;
  1205. pb0 += 1;
  1206. for (l = ((k - 1) / 2); l--;)
  1207. {
  1208. LD_SP2(pa0, 4, src_a0, src_a1);
  1209. src_b0[0] = pb0[0];
  1210. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  1211. res0 += src_a0 * src_b;
  1212. res1 += src_a1 * src_b;
  1213. pa0 += 8;
  1214. pb0 += 1;
  1215. LD_SP2(pa0, 4, src_a0, src_a1);
  1216. src_b0[0] = pb0[0];
  1217. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  1218. res0 += src_a0 * src_b;
  1219. res1 += src_a1 * src_b;
  1220. pa0 += 8;
  1221. pb0 += 1;
  1222. }
  1223. if ((k - 1) & 1)
  1224. {
  1225. LD_SP2(pa0, 4, src_a0, src_a1);
  1226. src_b0[0] = pb0[0];
  1227. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  1228. res0 += src_a0 * src_b;
  1229. res1 += src_a1 * src_b;
  1230. pa0 += 8;
  1231. pb0 += 1;
  1232. }
  1233. LD_SP2(pc0, 4, dst0, dst1);
  1234. dst0 += res0 * v_alpha;
  1235. dst1 += res1 * v_alpha;
  1236. ST_SP2(dst0, dst1, pc0, 4);
  1237. pc0 += 8;
  1238. }
  1239. for (i = ((m & 4) / 4); i--;)
  1240. {
  1241. pb0 = B;
  1242. src_a0 = LD_SP(pa0);
  1243. src_b0[0] = pb0[0];
  1244. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  1245. res0 = src_a0 * src_b;
  1246. pa0 += 4;
  1247. pb0 += 1;
  1248. for (l = ((k - 1) / 2); l--;)
  1249. {
  1250. src_a0 = LD_SP(pa0);
  1251. src_b0[0] = pb0[0];
  1252. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  1253. res0 += src_a0 * src_b;
  1254. pa0 += 4;
  1255. pb0 += 1;
  1256. src_a0 = LD_SP(pa0);
  1257. src_b0[0] = pb0[0];
  1258. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  1259. res0 += src_a0 * src_b;
  1260. pa0 += 4;
  1261. pb0 += 1;
  1262. }
  1263. if ((k - 1) & 1)
  1264. {
  1265. src_a0 = LD_SP(pa0);
  1266. src_b0[0] = pb0[0];
  1267. src_b = (v4f32) __msa_shf_w((v4i32) src_b0, 0);
  1268. res0 += src_a0 * src_b;
  1269. pa0 += 4;
  1270. pb0 += 1;
  1271. }
  1272. dst0 = LD_SP(pc0);
  1273. dst0 += res0 * v_alpha;
  1274. ST_SP(dst0, pc0);
  1275. pc0 += 4;
  1276. }
  1277. for (i = (m & 2) / 2; i--;)
  1278. {
  1279. pb0 = B;
  1280. a0 = pa0[0];
  1281. b0 = pb0[0];
  1282. tmp0 = a0 * b0;
  1283. a1 = pa0[1];
  1284. tmp1 = a1 * b0;
  1285. pa0 += 2;
  1286. pb0 += 1;
  1287. for (l = ((k - 1) / 2); l--;)
  1288. {
  1289. a0 = pa0[0];
  1290. b0 = pb0[0];
  1291. tmp0 += a0 * b0;
  1292. a1 = pa0[1];
  1293. tmp1 += a1 * b0;
  1294. pa0 += 2;
  1295. pb0 += 1;
  1296. a0 = pa0[0];
  1297. b0 = pb0[0];
  1298. tmp0 += a0 * b0;
  1299. a1 = pa0[1];
  1300. tmp1 += a1 * b0;
  1301. pa0 += 2;
  1302. pb0 += 1;
  1303. }
  1304. if ((k - 1) & 1)
  1305. {
  1306. a0 = pa0[0];
  1307. b0 = pb0[0];
  1308. tmp0 += a0 * b0;
  1309. a1 = pa0[1];
  1310. tmp1 += a1 * b0;
  1311. pa0 += 2;
  1312. pb0 += 1;
  1313. }
  1314. tmp0 = alpha * tmp0;
  1315. pc0[0] += tmp0;
  1316. tmp1 = alpha * tmp1;
  1317. pc0[1] += tmp1;
  1318. pc0 += 2;
  1319. }
  1320. for (i = (m & 1); i--;)
  1321. {
  1322. pb0 = B;
  1323. a0 = pa0[0];
  1324. b0 = pb0[0];
  1325. tmp0 = a0 * b0;
  1326. pa0 += 1;
  1327. pb0 += 1;
  1328. for (l = ((k - 1) / 2); l--;)
  1329. {
  1330. a0 = pa0[0];
  1331. b0 = pb0[0];
  1332. tmp0 += a0 * b0;
  1333. pa0 += 1;
  1334. pb0 += 1;
  1335. a0 = pa0[0];
  1336. b0 = pb0[0];
  1337. tmp0 += a0 * b0;
  1338. pa0 += 1;
  1339. pb0 += 1;
  1340. }
  1341. if ((k - 1) & 1)
  1342. {
  1343. a0 = pa0[0];
  1344. b0 = pb0[0];
  1345. tmp0 += a0 * b0;
  1346. pa0 += 1;
  1347. pb0 += 1;
  1348. }
  1349. pc0[0] += alpha * tmp0;
  1350. pc0 += 1;
  1351. }
  1352. l = (k << 0);
  1353. B = B + l;
  1354. i = (ldc << 0);
  1355. C = C + i;
  1356. }
  1357. return 0;
  1358. }