You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

strmm_macros_16x8_power8.S 95 kB


  1. /***************************************************************************
  2. Copyright (c) 2013-2016, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2016/04/02 Werner Saar (wernsaar@googlemail.com)
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. * LAPACK-TEST : OK
  33. **************************************************************************************/
  34. /**********************************************************************************************
  35. * Macros for N=8 and M=16
  36. **********************************************************************************************/
  /*
   * LOAD8x16_1: fetch one set of operands for the 16x8 tile, no arithmetic.
   *   vs0-vs3   <- 64 bytes at AO (four lxvw4x loads of four 32-bit words each)
   *   vs8-vs15  <- the eight 32-bit words at BO, each splatted across a register
   * AO is advanced by 64 bytes and BO by 32 bytes. vs28/vs29 are scratch.
   * vs0-vs3 and vs8-vs15 are the inputs multiplied by KERNEL8x16_I1 and
   * KERNEL8x16_1.
   * AO, BO and o0/o16/o32/o48 are defined outside this view; presumably the oN
   * names are index registers holding byte offsets 0/16/32/48 - TODO confirm.
   * Defined via m4-style define() when _AIX is set, via .macro otherwise.
   */
  37. #if defined(_AIX)
  38. define(`LOAD8x16_1', `
  39. #else
  40. .macro LOAD8x16_1
  41. #endif
  /* A operands: 64 bytes at AO -> vs0-vs3 */
  42. lxvw4x vs0, o0, AO
  43. lxvw4x vs1, o16, AO
  44. lxvw4x vs2, o32, AO
  45. lxvw4x vs3, o48, AO
  46. addi AO, AO, 64
  /* B operands: words 0-3 of vs28 -> vs8-vs11, words 0-3 of vs29 -> vs12-vs15 */
  47. lxvw4x vs28, o0, BO
  48. xxspltw vs8, vs28, 0
  49. xxspltw vs9, vs28, 1
  50. xxspltw vs10, vs28, 2
  51. xxspltw vs11, vs28, 3
  52. lxvw4x vs29, o16, BO
  53. xxspltw vs12, vs29, 0
  54. xxspltw vs13, vs29, 1
  55. xxspltw vs14, vs29, 2
  56. xxspltw vs15, vs29, 3
  57. addi BO, BO, 32
  58. #if defined(_AIX)
  59. ')
  60. #else
  61. .endm
  62. #endif
  /*
   * KERNEL8x16_I1: initializing step of the 16x8 kernel.
   *   Loads the NEXT operand set: vs4-vs7 <- 64 bytes at AO,
   *   vs16-vs23 <- the eight words at BO, each splatted (vs28/vs29 scratch).
   *   Computes with the CURRENT set: vs(32 + 4*j + i) = vs(i) * vs(8 + j)
   *   for i = 0..3, j = 0..7, using xvmulsp, which overwrites the target.
   *   Because the accumulators vs32-vs63 are overwritten, they need no
   *   prior clearing.
   * Expects vs0-vs3 and vs8-vs15 to be loaded already (LOAD8x16_1 fills
   * exactly those registers). Advances AO by 64 bytes and BO by 32 bytes.
   * Defined via m4-style define() when _AIX is set, via .macro otherwise.
   */
  63. #if defined(_AIX)
  64. define(`KERNEL8x16_I1', `
  65. #else
  66. .macro KERNEL8x16_I1
  67. #endif
  /* next A operands -> vs4-vs7 */
  68. lxvw4x vs4, o0, AO
  69. lxvw4x vs5, o16, AO
  70. lxvw4x vs6, o32, AO
  71. lxvw4x vs7, o48, AO
  72. addi AO, AO, 64
  /* next B operands, one splat per word -> vs16-vs23 */
  73. lxvw4x vs28, o0, BO
  74. xxspltw vs16, vs28, 0
  75. xxspltw vs17, vs28, 1
  76. xxspltw vs18, vs28, 2
  77. xxspltw vs19, vs28, 3
  78. lxvw4x vs29, o16, BO
  79. xxspltw vs20, vs29, 0
  80. xxspltw vs21, vs29, 1
  81. xxspltw vs22, vs29, 2
  82. xxspltw vs23, vs29, 3
  83. addi BO, BO, 32
  /* vs32-vs63 = vs0-vs3 * vs8-vs15 (plain multiply, overwrites accumulators) */
  84. xvmulsp vs32, vs0, vs8
  85. xvmulsp vs33, vs1, vs8
  86. xvmulsp vs34, vs2, vs8
  87. xvmulsp vs35, vs3, vs8
  88. xvmulsp vs36, vs0, vs9
  89. xvmulsp vs37, vs1, vs9
  90. xvmulsp vs38, vs2, vs9
  91. xvmulsp vs39, vs3, vs9
  92. xvmulsp vs40, vs0, vs10
  93. xvmulsp vs41, vs1, vs10
  94. xvmulsp vs42, vs2, vs10
  95. xvmulsp vs43, vs3, vs10
  96. xvmulsp vs44, vs0, vs11
  97. xvmulsp vs45, vs1, vs11
  98. xvmulsp vs46, vs2, vs11
  99. xvmulsp vs47, vs3, vs11
  100. xvmulsp vs48, vs0, vs12
  101. xvmulsp vs49, vs1, vs12
  102. xvmulsp vs50, vs2, vs12
  103. xvmulsp vs51, vs3, vs12
  104. xvmulsp vs52, vs0, vs13
  105. xvmulsp vs53, vs1, vs13
  106. xvmulsp vs54, vs2, vs13
  107. xvmulsp vs55, vs3, vs13
  108. xvmulsp vs56, vs0, vs14
  109. xvmulsp vs57, vs1, vs14
  110. xvmulsp vs58, vs2, vs14
  111. xvmulsp vs59, vs3, vs14
  112. xvmulsp vs60, vs0, vs15
  113. xvmulsp vs61, vs1, vs15
  114. xvmulsp vs62, vs2, vs15
  115. xvmulsp vs63, vs3, vs15
  116. #if defined(_AIX)
  117. ')
  118. #else
  119. .endm
  120. #endif
  /*
   * KERNEL8x16_1: accumulating step of the 16x8 kernel, first register set.
   *   Loads the NEXT operand set: vs4-vs7 <- 64 bytes at AO,
   *   vs16-vs23 <- the eight words at BO, each splatted (vs28/vs29 scratch).
   *   Accumulates the CURRENT set: vs(32 + 4*j + i) += vs(i) * vs(8 + j)
   *   for i = 0..3, j = 0..7 (xvmaddasp adds the product to its target).
   * It reads vs0-vs3 / vs8-vs15 and fills vs4-vs7 / vs16-vs23, which are the
   * registers KERNEL8x16_2 reads; KERNEL8x16_2 in turn refills the set read here.
   * Advances AO by 64 bytes and BO by 32 bytes.
   * Defined via m4-style define() when _AIX is set, via .macro otherwise.
   */
  121. #if defined(_AIX)
  122. define(`KERNEL8x16_1', `
  123. #else
  124. .macro KERNEL8x16_1
  125. #endif
  /* next A operands -> vs4-vs7 */
  126. lxvw4x vs4, o0, AO
  127. lxvw4x vs5, o16, AO
  128. lxvw4x vs6, o32, AO
  129. lxvw4x vs7, o48, AO
  130. addi AO, AO, 64
  /* next B operands, one splat per word -> vs16-vs23 */
  131. lxvw4x vs28, o0, BO
  132. xxspltw vs16, vs28, 0
  133. xxspltw vs17, vs28, 1
  134. xxspltw vs18, vs28, 2
  135. xxspltw vs19, vs28, 3
  136. lxvw4x vs29, o16, BO
  137. xxspltw vs20, vs29, 0
  138. xxspltw vs21, vs29, 1
  139. xxspltw vs22, vs29, 2
  140. xxspltw vs23, vs29, 3
  141. addi BO, BO, 32
  /* vs32-vs63 += vs0-vs3 * vs8-vs15 */
  142. xvmaddasp vs32, vs0, vs8
  143. xvmaddasp vs33, vs1, vs8
  144. xvmaddasp vs34, vs2, vs8
  145. xvmaddasp vs35, vs3, vs8
  146. xvmaddasp vs36, vs0, vs9
  147. xvmaddasp vs37, vs1, vs9
  148. xvmaddasp vs38, vs2, vs9
  149. xvmaddasp vs39, vs3, vs9
  150. xvmaddasp vs40, vs0, vs10
  151. xvmaddasp vs41, vs1, vs10
  152. xvmaddasp vs42, vs2, vs10
  153. xvmaddasp vs43, vs3, vs10
  154. xvmaddasp vs44, vs0, vs11
  155. xvmaddasp vs45, vs1, vs11
  156. xvmaddasp vs46, vs2, vs11
  157. xvmaddasp vs47, vs3, vs11
  158. xvmaddasp vs48, vs0, vs12
  159. xvmaddasp vs49, vs1, vs12
  160. xvmaddasp vs50, vs2, vs12
  161. xvmaddasp vs51, vs3, vs12
  162. xvmaddasp vs52, vs0, vs13
  163. xvmaddasp vs53, vs1, vs13
  164. xvmaddasp vs54, vs2, vs13
  165. xvmaddasp vs55, vs3, vs13
  166. xvmaddasp vs56, vs0, vs14
  167. xvmaddasp vs57, vs1, vs14
  168. xvmaddasp vs58, vs2, vs14
  169. xvmaddasp vs59, vs3, vs14
  170. xvmaddasp vs60, vs0, vs15
  171. xvmaddasp vs61, vs1, vs15
  172. xvmaddasp vs62, vs2, vs15
  173. xvmaddasp vs63, vs3, vs15
  174. #if defined(_AIX)
  175. ')
  176. #else
  177. .endm
  178. #endif
  /*
   * KERNEL8x16_2: accumulating step of the 16x8 kernel, second register set.
   *   Loads the NEXT operand set: vs0-vs3 <- 64 bytes at AO,
   *   vs8-vs15 <- the eight words at BO, each splatted (vs28/vs29 scratch).
   *   Accumulates the CURRENT set: vs(32 + 4*j + i) += vs(4 + i) * vs(16 + j)
   *   for i = 0..3, j = 0..7.
   * Mirror image of KERNEL8x16_1: it reads vs4-vs7 / vs16-vs23 and refills
   * vs0-vs3 / vs8-vs15.
   * Advances AO by 64 bytes and BO by 32 bytes.
   * Defined via m4-style define() when _AIX is set, via .macro otherwise.
   */
  179. #if defined(_AIX)
  180. define(`KERNEL8x16_2', `
  181. #else
  182. .macro KERNEL8x16_2
  183. #endif
  /* next A operands -> vs0-vs3 */
  184. lxvw4x vs0, o0, AO
  185. lxvw4x vs1, o16, AO
  186. lxvw4x vs2, o32, AO
  187. lxvw4x vs3, o48, AO
  188. addi AO, AO, 64
  /* next B operands, one splat per word -> vs8-vs15 */
  189. lxvw4x vs28, o0, BO
  190. xxspltw vs8, vs28, 0
  191. xxspltw vs9, vs28, 1
  192. xxspltw vs10, vs28, 2
  193. xxspltw vs11, vs28, 3
  194. lxvw4x vs29, o16, BO
  195. xxspltw vs12, vs29, 0
  196. xxspltw vs13, vs29, 1
  197. xxspltw vs14, vs29, 2
  198. xxspltw vs15, vs29, 3
  199. addi BO, BO, 32
  /* vs32-vs63 += vs4-vs7 * vs16-vs23 */
  200. xvmaddasp vs32, vs4, vs16
  201. xvmaddasp vs33, vs5, vs16
  202. xvmaddasp vs34, vs6, vs16
  203. xvmaddasp vs35, vs7, vs16
  204. xvmaddasp vs36, vs4, vs17
  205. xvmaddasp vs37, vs5, vs17
  206. xvmaddasp vs38, vs6, vs17
  207. xvmaddasp vs39, vs7, vs17
  208. xvmaddasp vs40, vs4, vs18
  209. xvmaddasp vs41, vs5, vs18
  210. xvmaddasp vs42, vs6, vs18
  211. xvmaddasp vs43, vs7, vs18
  212. xvmaddasp vs44, vs4, vs19
  213. xvmaddasp vs45, vs5, vs19
  214. xvmaddasp vs46, vs6, vs19
  215. xvmaddasp vs47, vs7, vs19
  216. xvmaddasp vs48, vs4, vs20
  217. xvmaddasp vs49, vs5, vs20
  218. xvmaddasp vs50, vs6, vs20
  219. xvmaddasp vs51, vs7, vs20
  220. xvmaddasp vs52, vs4, vs21
  221. xvmaddasp vs53, vs5, vs21
  222. xvmaddasp vs54, vs6, vs21
  223. xvmaddasp vs55, vs7, vs21
  224. xvmaddasp vs56, vs4, vs22
  225. xvmaddasp vs57, vs5, vs22
  226. xvmaddasp vs58, vs6, vs22
  227. xvmaddasp vs59, vs7, vs22
  228. xvmaddasp vs60, vs4, vs23
  229. xvmaddasp vs61, vs5, vs23
  230. xvmaddasp vs62, vs6, vs23
  231. xvmaddasp vs63, vs7, vs23
  232. #if defined(_AIX)
  233. ')
  234. #else
  235. .endm
  236. #endif
  /*
   * KERNEL8x16_E2: closing step of the 16x8 kernel.
   *   Performs the same accumulation as KERNEL8x16_2,
   *   vs(32 + 4*j + i) += vs(4 + i) * vs(16 + j) for i = 0..3, j = 0..7,
   *   but issues no loads and leaves AO and BO untouched.
   * It therefore consumes an operand set already sitting in vs4-vs7 /
   * vs16-vs23 without reading past the current AO/BO positions.
   * Defined via m4-style define() when _AIX is set, via .macro otherwise.
   */
  237. #if defined(_AIX)
  238. define(`KERNEL8x16_E2', `
  239. #else
  240. .macro KERNEL8x16_E2
  241. #endif
  /* vs32-vs63 += vs4-vs7 * vs16-vs23 */
  242. xvmaddasp vs32, vs4, vs16
  243. xvmaddasp vs33, vs5, vs16
  244. xvmaddasp vs34, vs6, vs16
  245. xvmaddasp vs35, vs7, vs16
  246. xvmaddasp vs36, vs4, vs17
  247. xvmaddasp vs37, vs5, vs17
  248. xvmaddasp vs38, vs6, vs17
  249. xvmaddasp vs39, vs7, vs17
  250. xvmaddasp vs40, vs4, vs18
  251. xvmaddasp vs41, vs5, vs18
  252. xvmaddasp vs42, vs6, vs18
  253. xvmaddasp vs43, vs7, vs18
  254. xvmaddasp vs44, vs4, vs19
  255. xvmaddasp vs45, vs5, vs19
  256. xvmaddasp vs46, vs6, vs19
  257. xvmaddasp vs47, vs7, vs19
  258. xvmaddasp vs48, vs4, vs20
  259. xvmaddasp vs49, vs5, vs20
  260. xvmaddasp vs50, vs6, vs20
  261. xvmaddasp vs51, vs7, vs20
  262. xvmaddasp vs52, vs4, vs21
  263. xvmaddasp vs53, vs5, vs21
  264. xvmaddasp vs54, vs6, vs21
  265. xvmaddasp vs55, vs7, vs21
  266. xvmaddasp vs56, vs4, vs22
  267. xvmaddasp vs57, vs5, vs22
  268. xvmaddasp vs58, vs6, vs22
  269. xvmaddasp vs59, vs7, vs22
  270. xvmaddasp vs60, vs4, vs23
  271. xvmaddasp vs61, vs5, vs23
  272. xvmaddasp vs62, vs6, vs23
  273. xvmaddasp vs63, vs7, vs23
  274. #if defined(_AIX)
  275. ')
  276. #else
  277. .endm
  278. #endif
  /*
   * KERNEL8x16_SUBI1: self-contained initializing step of the 16x8 kernel.
   *   Loads vs0-vs3 <- 64 bytes at AO and vs8-vs15 <- the eight words at BO
   *   (each splatted; vs28/vs29 scratch), then multiplies them right away:
   *   vs(32 + 4*j + i) = vs(i) * vs(8 + j) for i = 0..3, j = 0..7.
   * xvmulsp overwrites the accumulators vs32-vs63, so no prior clearing is
   * needed. Unlike KERNEL8x16_I1 it uses the operands it has just loaded and
   * does not touch vs4-vs7 / vs16-vs23.
   * Advances AO by 64 bytes and BO by 32 bytes.
   * Defined via m4-style define() when _AIX is set, via .macro otherwise.
   */
  279. #if defined(_AIX)
  280. define(`KERNEL8x16_SUBI1', `
  281. #else
  282. .macro KERNEL8x16_SUBI1
  283. #endif
  /* A operands -> vs0-vs3 */
  284. lxvw4x vs0, o0, AO
  285. lxvw4x vs1, o16, AO
  286. lxvw4x vs2, o32, AO
  287. lxvw4x vs3, o48, AO
  288. addi AO, AO, 64
  /* B operands, one splat per word -> vs8-vs15 */
  289. lxvw4x vs28, o0, BO
  290. xxspltw vs8, vs28, 0
  291. xxspltw vs9, vs28, 1
  292. xxspltw vs10, vs28, 2
  293. xxspltw vs11, vs28, 3
  294. lxvw4x vs29, o16, BO
  295. xxspltw vs12, vs29, 0
  296. xxspltw vs13, vs29, 1
  297. xxspltw vs14, vs29, 2
  298. xxspltw vs15, vs29, 3
  299. addi BO, BO, 32
  /* vs32-vs63 = vs0-vs3 * vs8-vs15 (plain multiply, overwrites accumulators) */
  300. xvmulsp vs32, vs0, vs8
  301. xvmulsp vs33, vs1, vs8
  302. xvmulsp vs34, vs2, vs8
  303. xvmulsp vs35, vs3, vs8
  304. xvmulsp vs36, vs0, vs9
  305. xvmulsp vs37, vs1, vs9
  306. xvmulsp vs38, vs2, vs9
  307. xvmulsp vs39, vs3, vs9
  308. xvmulsp vs40, vs0, vs10
  309. xvmulsp vs41, vs1, vs10
  310. xvmulsp vs42, vs2, vs10
  311. xvmulsp vs43, vs3, vs10
  312. xvmulsp vs44, vs0, vs11
  313. xvmulsp vs45, vs1, vs11
  314. xvmulsp vs46, vs2, vs11
  315. xvmulsp vs47, vs3, vs11
  316. xvmulsp vs48, vs0, vs12
  317. xvmulsp vs49, vs1, vs12
  318. xvmulsp vs50, vs2, vs12
  319. xvmulsp vs51, vs3, vs12
  320. xvmulsp vs52, vs0, vs13
  321. xvmulsp vs53, vs1, vs13
  322. xvmulsp vs54, vs2, vs13
  323. xvmulsp vs55, vs3, vs13
  324. xvmulsp vs56, vs0, vs14
  325. xvmulsp vs57, vs1, vs14
  326. xvmulsp vs58, vs2, vs14
  327. xvmulsp vs59, vs3, vs14
  328. xvmulsp vs60, vs0, vs15
  329. xvmulsp vs61, vs1, vs15
  330. xvmulsp vs62, vs2, vs15
  331. xvmulsp vs63, vs3, vs15
  332. #if defined(_AIX)
  333. ')
  334. #else
  335. .endm
  336. #endif
  /*
   * KERNEL8x16_SUB1: self-contained accumulating step of the 16x8 kernel.
   *   Loads vs0-vs3 <- 64 bytes at AO and vs8-vs15 <- the eight words at BO
   *   (each splatted; vs28/vs29 scratch), then accumulates them right away:
   *   vs(32 + 4*j + i) += vs(i) * vs(8 + j) for i = 0..3, j = 0..7.
   * Same loads as KERNEL8x16_SUBI1, but xvmaddasp adds to the existing
   * accumulator contents instead of overwriting them.
   * Advances AO by 64 bytes and BO by 32 bytes.
   * Defined via m4-style define() when _AIX is set, via .macro otherwise.
   */
  337. #if defined(_AIX)
  338. define(`KERNEL8x16_SUB1', `
  339. #else
  340. .macro KERNEL8x16_SUB1
  341. #endif
  /* A operands -> vs0-vs3 */
  342. lxvw4x vs0, o0, AO
  343. lxvw4x vs1, o16, AO
  344. lxvw4x vs2, o32, AO
  345. lxvw4x vs3, o48, AO
  346. addi AO, AO, 64
  /* B operands, one splat per word -> vs8-vs15 */
  347. lxvw4x vs28, o0, BO
  348. xxspltw vs8, vs28, 0
  349. xxspltw vs9, vs28, 1
  350. xxspltw vs10, vs28, 2
  351. xxspltw vs11, vs28, 3
  352. lxvw4x vs29, o16, BO
  353. xxspltw vs12, vs29, 0
  354. xxspltw vs13, vs29, 1
  355. xxspltw vs14, vs29, 2
  356. xxspltw vs15, vs29, 3
  357. addi BO, BO, 32
  /* vs32-vs63 += vs0-vs3 * vs8-vs15 */
  358. xvmaddasp vs32, vs0, vs8
  359. xvmaddasp vs33, vs1, vs8
  360. xvmaddasp vs34, vs2, vs8
  361. xvmaddasp vs35, vs3, vs8
  362. xvmaddasp vs36, vs0, vs9
  363. xvmaddasp vs37, vs1, vs9
  364. xvmaddasp vs38, vs2, vs9
  365. xvmaddasp vs39, vs3, vs9
  366. xvmaddasp vs40, vs0, vs10
  367. xvmaddasp vs41, vs1, vs10
  368. xvmaddasp vs42, vs2, vs10
  369. xvmaddasp vs43, vs3, vs10
  370. xvmaddasp vs44, vs0, vs11
  371. xvmaddasp vs45, vs1, vs11
  372. xvmaddasp vs46, vs2, vs11
  373. xvmaddasp vs47, vs3, vs11
  374. xvmaddasp vs48, vs0, vs12
  375. xvmaddasp vs49, vs1, vs12
  376. xvmaddasp vs50, vs2, vs12
  377. xvmaddasp vs51, vs3, vs12
  378. xvmaddasp vs52, vs0, vs13
  379. xvmaddasp vs53, vs1, vs13
  380. xvmaddasp vs54, vs2, vs13
  381. xvmaddasp vs55, vs3, vs13
  382. xvmaddasp vs56, vs0, vs14
  383. xvmaddasp vs57, vs1, vs14
  384. xvmaddasp vs58, vs2, vs14
  385. xvmaddasp vs59, vs3, vs14
  386. xvmaddasp vs60, vs0, vs15
  387. xvmaddasp vs61, vs1, vs15
  388. xvmaddasp vs62, vs2, vs15
  389. xvmaddasp vs63, vs3, vs15
  390. #if defined(_AIX)
  391. ')
  392. #else
  393. .endm
  394. #endif
  /*
   * SAVE8x16: write the 16x8 accumulator tile vs32-vs63 out through CO.
   * T1 starts at CO and is advanced by LDC after each of the eight groups.
   * For group j (accumulators vs(32 + 4*j) .. vs(35 + 4*j)):
   *   - TRMMKERNEL not defined: vs0-vs3 <- 64 bytes at T1, then
   *     vs0-vs3 += accumulator * alpha_vr (xvmaddasp).
   *   - TRMMKERNEL defined: vs0-vs3 = accumulator * alpha_vr (xvmulsp);
   *     the destination memory is not read.
   *   - vs0-vs3 are stored back to the 64 bytes at T1.
   * Finally CO is advanced by 64 bytes.
   * Clobbers vs0-vs3 and T1; the accumulators themselves are only read.
   * CO, T1, LDC, alpha_vr and o0-o48 are defined outside this view; LDC is added
   * to the address unscaled, so presumably it is already a byte stride and
   * alpha_vr holds alpha in every word - TODO confirm against the caller.
   * Defined via m4-style define() when _AIX is set, via .macro otherwise.
   */
  395. #if defined(_AIX)
  396. define(`SAVE8x16', `
  397. #else
  398. .macro SAVE8x16
  399. #endif
  400. mr T1, CO
  /* group 0: vs32-vs35 (products with splat vs8 / vs16) */
  401. #ifndef TRMMKERNEL
  402. lxvw4x vs0, o0, T1
  403. lxvw4x vs1, o16, T1
  404. lxvw4x vs2, o32, T1
  405. lxvw4x vs3, o48, T1
  406. #endif
  407. #ifdef TRMMKERNEL
  408. xvmulsp vs0, vs32, alpha_vr
  409. xvmulsp vs1, vs33, alpha_vr
  410. xvmulsp vs2, vs34, alpha_vr
  411. xvmulsp vs3, vs35, alpha_vr
  412. #else
  413. xvmaddasp vs0, vs32, alpha_vr
  414. xvmaddasp vs1, vs33, alpha_vr
  415. xvmaddasp vs2, vs34, alpha_vr
  416. xvmaddasp vs3, vs35, alpha_vr
  417. #endif
  418. stxvw4x vs0, o0, T1
  419. stxvw4x vs1, o16, T1
  420. stxvw4x vs2, o32, T1
  421. stxvw4x vs3, o48, T1
  422. add T1, T1, LDC
  /* group 1: vs36-vs39 (products with splat vs9 / vs17) */
  423. #ifndef TRMMKERNEL
  424. lxvw4x vs0, o0, T1
  425. lxvw4x vs1, o16, T1
  426. lxvw4x vs2, o32, T1
  427. lxvw4x vs3, o48, T1
  428. #endif
  429. #ifdef TRMMKERNEL
  430. xvmulsp vs0, vs36, alpha_vr
  431. xvmulsp vs1, vs37, alpha_vr
  432. xvmulsp vs2, vs38, alpha_vr
  433. xvmulsp vs3, vs39, alpha_vr
  434. #else
  435. xvmaddasp vs0, vs36, alpha_vr
  436. xvmaddasp vs1, vs37, alpha_vr
  437. xvmaddasp vs2, vs38, alpha_vr
  438. xvmaddasp vs3, vs39, alpha_vr
  439. #endif
  440. stxvw4x vs0, o0, T1
  441. stxvw4x vs1, o16, T1
  442. stxvw4x vs2, o32, T1
  443. stxvw4x vs3, o48, T1
  444. add T1, T1, LDC
  /* group 2: vs40-vs43 (products with splat vs10 / vs18) */
  445. #ifndef TRMMKERNEL
  446. lxvw4x vs0, o0, T1
  447. lxvw4x vs1, o16, T1
  448. lxvw4x vs2, o32, T1
  449. lxvw4x vs3, o48, T1
  450. #endif
  451. #ifdef TRMMKERNEL
  452. xvmulsp vs0, vs40, alpha_vr
  453. xvmulsp vs1, vs41, alpha_vr
  454. xvmulsp vs2, vs42, alpha_vr
  455. xvmulsp vs3, vs43, alpha_vr
  456. #else
  457. xvmaddasp vs0, vs40, alpha_vr
  458. xvmaddasp vs1, vs41, alpha_vr
  459. xvmaddasp vs2, vs42, alpha_vr
  460. xvmaddasp vs3, vs43, alpha_vr
  461. #endif
  462. stxvw4x vs0, o0, T1
  463. stxvw4x vs1, o16, T1
  464. stxvw4x vs2, o32, T1
  465. stxvw4x vs3, o48, T1
  466. add T1, T1, LDC
  /* group 3: vs44-vs47 (products with splat vs11 / vs19) */
  467. #ifndef TRMMKERNEL
  468. lxvw4x vs0, o0, T1
  469. lxvw4x vs1, o16, T1
  470. lxvw4x vs2, o32, T1
  471. lxvw4x vs3, o48, T1
  472. #endif
  473. #ifdef TRMMKERNEL
  474. xvmulsp vs0, vs44, alpha_vr
  475. xvmulsp vs1, vs45, alpha_vr
  476. xvmulsp vs2, vs46, alpha_vr
  477. xvmulsp vs3, vs47, alpha_vr
  478. #else
  479. xvmaddasp vs0, vs44, alpha_vr
  480. xvmaddasp vs1, vs45, alpha_vr
  481. xvmaddasp vs2, vs46, alpha_vr
  482. xvmaddasp vs3, vs47, alpha_vr
  483. #endif
  484. stxvw4x vs0, o0, T1
  485. stxvw4x vs1, o16, T1
  486. stxvw4x vs2, o32, T1
  487. stxvw4x vs3, o48, T1
  488. add T1, T1, LDC
  /* group 4: vs48-vs51 (products with splat vs12 / vs20) */
  489. #ifndef TRMMKERNEL
  490. lxvw4x vs0, o0, T1
  491. lxvw4x vs1, o16, T1
  492. lxvw4x vs2, o32, T1
  493. lxvw4x vs3, o48, T1
  494. #endif
  495. #ifdef TRMMKERNEL
  496. xvmulsp vs0, vs48, alpha_vr
  497. xvmulsp vs1, vs49, alpha_vr
  498. xvmulsp vs2, vs50, alpha_vr
  499. xvmulsp vs3, vs51, alpha_vr
  500. #else
  501. xvmaddasp vs0, vs48, alpha_vr
  502. xvmaddasp vs1, vs49, alpha_vr
  503. xvmaddasp vs2, vs50, alpha_vr
  504. xvmaddasp vs3, vs51, alpha_vr
  505. #endif
  506. stxvw4x vs0, o0, T1
  507. stxvw4x vs1, o16, T1
  508. stxvw4x vs2, o32, T1
  509. stxvw4x vs3, o48, T1
  510. add T1, T1, LDC
  /* group 5: vs52-vs55 (products with splat vs13 / vs21) */
  511. #ifndef TRMMKERNEL
  512. lxvw4x vs0, o0, T1
  513. lxvw4x vs1, o16, T1
  514. lxvw4x vs2, o32, T1
  515. lxvw4x vs3, o48, T1
  516. #endif
  517. #ifdef TRMMKERNEL
  518. xvmulsp vs0, vs52, alpha_vr
  519. xvmulsp vs1, vs53, alpha_vr
  520. xvmulsp vs2, vs54, alpha_vr
  521. xvmulsp vs3, vs55, alpha_vr
  522. #else
  523. xvmaddasp vs0, vs52, alpha_vr
  524. xvmaddasp vs1, vs53, alpha_vr
  525. xvmaddasp vs2, vs54, alpha_vr
  526. xvmaddasp vs3, vs55, alpha_vr
  527. #endif
  528. stxvw4x vs0, o0, T1
  529. stxvw4x vs1, o16, T1
  530. stxvw4x vs2, o32, T1
  531. stxvw4x vs3, o48, T1
  532. add T1, T1, LDC
  /* group 6: vs56-vs59 (products with splat vs14 / vs22) */
  533. #ifndef TRMMKERNEL
  534. lxvw4x vs0, o0, T1
  535. lxvw4x vs1, o16, T1
  536. lxvw4x vs2, o32, T1
  537. lxvw4x vs3, o48, T1
  538. #endif
  539. #ifdef TRMMKERNEL
  540. xvmulsp vs0, vs56, alpha_vr
  541. xvmulsp vs1, vs57, alpha_vr
  542. xvmulsp vs2, vs58, alpha_vr
  543. xvmulsp vs3, vs59, alpha_vr
  544. #else
  545. xvmaddasp vs0, vs56, alpha_vr
  546. xvmaddasp vs1, vs57, alpha_vr
  547. xvmaddasp vs2, vs58, alpha_vr
  548. xvmaddasp vs3, vs59, alpha_vr
  549. #endif
  550. stxvw4x vs0, o0, T1
  551. stxvw4x vs1, o16, T1
  552. stxvw4x vs2, o32, T1
  553. stxvw4x vs3, o48, T1
  554. add T1, T1, LDC
  /* group 7: vs60-vs63 (products with splat vs15 / vs23) */
  555. #ifndef TRMMKERNEL
  556. lxvw4x vs0, o0, T1
  557. lxvw4x vs1, o16, T1
  558. lxvw4x vs2, o32, T1
  559. lxvw4x vs3, o48, T1
  560. #endif
  561. #ifdef TRMMKERNEL
  562. xvmulsp vs0, vs60, alpha_vr
  563. xvmulsp vs1, vs61, alpha_vr
  564. xvmulsp vs2, vs62, alpha_vr
  565. xvmulsp vs3, vs63, alpha_vr
  566. #else
  567. xvmaddasp vs0, vs60, alpha_vr
  568. xvmaddasp vs1, vs61, alpha_vr
  569. xvmaddasp vs2, vs62, alpha_vr
  570. xvmaddasp vs3, vs63, alpha_vr
  571. #endif
  572. stxvw4x vs0, o0, T1
  573. stxvw4x vs1, o16, T1
  574. stxvw4x vs2, o32, T1
  575. stxvw4x vs3, o48, T1
  /* T1 is stepped once more here but is not read again inside this macro */
  576. add T1, T1, LDC
  /* move CO past the 64 bytes just written in each group */
  577. addi CO, CO, 64
  578. #if defined(_AIX)
  579. ')
  580. #else
  581. .endm
  582. #endif
  583. /**********************************************************************************************
  584. * Macros for N=8 and M=8
  585. **********************************************************************************************/
  /*
   * LOAD8x8_1: fetch one set of operands for the 8x8 tile, no arithmetic.
   *   vs0-vs1   <- 32 bytes at AO (two lxvw4x loads of four 32-bit words each)
   *   vs8-vs15  <- the eight 32-bit words at BO, each splatted across a register
   * AO and BO are each advanced by 32 bytes. vs28/vs29 are scratch.
   * vs0-vs1 and vs8-vs15 are the inputs multiplied by KERNEL8x8_I1 and
   * KERNEL8x8_1.
   * AO, BO and o0/o16 are defined outside this view; presumably the oN names
   * are index registers holding byte offsets 0/16 - TODO confirm.
   * Defined via m4-style define() when _AIX is set, via .macro otherwise.
   */
  586. #if defined(_AIX)
  587. define(`LOAD8x8_1', `
  588. #else
  589. .macro LOAD8x8_1
  590. #endif
  /* A operands: 32 bytes at AO -> vs0-vs1 */
  591. lxvw4x vs0, o0, AO
  592. lxvw4x vs1, o16, AO
  593. addi AO, AO, 32
  /* B operands: words 0-3 of vs28 -> vs8-vs11, words 0-3 of vs29 -> vs12-vs15 */
  594. lxvw4x vs28, o0, BO
  595. xxspltw vs8, vs28, 0
  596. xxspltw vs9, vs28, 1
  597. xxspltw vs10, vs28, 2
  598. xxspltw vs11, vs28, 3
  599. lxvw4x vs29, o16, BO
  600. xxspltw vs12, vs29, 0
  601. xxspltw vs13, vs29, 1
  602. xxspltw vs14, vs29, 2
  603. xxspltw vs15, vs29, 3
  604. addi BO, BO, 32
  605. #if defined(_AIX)
  606. ')
  607. #else
  608. .endm
  609. #endif
  /*
   * KERNEL8x8_I1: initializing step of the 8x8 kernel.
   *   Loads the NEXT operand set: vs4-vs5 <- 32 bytes at AO,
   *   vs16-vs23 <- the eight words at BO, each splatted (vs28/vs29 scratch).
   *   Computes with the CURRENT set: vs(32 + 2*j + i) = vs(i) * vs(8 + j)
   *   for i = 0..1, j = 0..7, using xvmulsp, which overwrites the target.
   *   Because the accumulators vs32-vs47 are overwritten, they need no
   *   prior clearing.
   * Expects vs0-vs1 and vs8-vs15 to be loaded already (LOAD8x8_1 fills exactly
   * those registers). Advances AO and BO by 32 bytes each.
   * Defined via m4-style define() when _AIX is set, via .macro otherwise.
   */
  610. #if defined(_AIX)
  611. define(`KERNEL8x8_I1', `
  612. #else
  613. .macro KERNEL8x8_I1
  614. #endif
  /* next A operands -> vs4-vs5 */
  615. lxvw4x vs4, o0, AO
  616. lxvw4x vs5, o16, AO
  617. addi AO, AO, 32
  /* next B operands, one splat per word -> vs16-vs23 */
  618. lxvw4x vs28, o0, BO
  619. xxspltw vs16, vs28, 0
  620. xxspltw vs17, vs28, 1
  621. xxspltw vs18, vs28, 2
  622. xxspltw vs19, vs28, 3
  623. lxvw4x vs29, o16, BO
  624. xxspltw vs20, vs29, 0
  625. xxspltw vs21, vs29, 1
  626. xxspltw vs22, vs29, 2
  627. xxspltw vs23, vs29, 3
  628. addi BO, BO, 32
  /* vs32-vs47 = vs0-vs1 * vs8-vs15 (plain multiply, overwrites accumulators) */
  629. xvmulsp vs32, vs0, vs8
  630. xvmulsp vs33, vs1, vs8
  631. xvmulsp vs34, vs0, vs9
  632. xvmulsp vs35, vs1, vs9
  633. xvmulsp vs36, vs0, vs10
  634. xvmulsp vs37, vs1, vs10
  635. xvmulsp vs38, vs0, vs11
  636. xvmulsp vs39, vs1, vs11
  637. xvmulsp vs40, vs0, vs12
  638. xvmulsp vs41, vs1, vs12
  639. xvmulsp vs42, vs0, vs13
  640. xvmulsp vs43, vs1, vs13
  641. xvmulsp vs44, vs0, vs14
  642. xvmulsp vs45, vs1, vs14
  643. xvmulsp vs46, vs0, vs15
  644. xvmulsp vs47, vs1, vs15
  645. #if defined(_AIX)
  646. ')
  647. #else
  648. .endm
  649. #endif
  /*
   * KERNEL8x8_1: accumulating step of the 8x8 kernel, first register set.
   *   Loads the NEXT operand set: vs4-vs5 <- 32 bytes at AO,
   *   vs16-vs23 <- the eight words at BO, each splatted (vs28/vs29 scratch).
   *   Accumulates the CURRENT set: vs(32 + 2*j + i) += vs(i) * vs(8 + j)
   *   for i = 0..1, j = 0..7 (xvmaddasp adds the product to its target).
   * It reads vs0-vs1 / vs8-vs15 and fills vs4-vs5 / vs16-vs23, which are the
   * registers KERNEL8x8_2 reads; KERNEL8x8_2 in turn refills the set read here.
   * Advances AO and BO by 32 bytes each.
   * Defined via m4-style define() when _AIX is set, via .macro otherwise.
   */
  650. #if defined(_AIX)
  651. define(`KERNEL8x8_1', `
  652. #else
  653. .macro KERNEL8x8_1
  654. #endif
  /* next A operands -> vs4-vs5 */
  655. lxvw4x vs4, o0, AO
  656. lxvw4x vs5, o16, AO
  657. addi AO, AO, 32
  /* next B operands, one splat per word -> vs16-vs23 */
  658. lxvw4x vs28, o0, BO
  659. xxspltw vs16, vs28, 0
  660. xxspltw vs17, vs28, 1
  661. xxspltw vs18, vs28, 2
  662. xxspltw vs19, vs28, 3
  663. lxvw4x vs29, o16, BO
  664. xxspltw vs20, vs29, 0
  665. xxspltw vs21, vs29, 1
  666. xxspltw vs22, vs29, 2
  667. xxspltw vs23, vs29, 3
  668. addi BO, BO, 32
  /* vs32-vs47 += vs0-vs1 * vs8-vs15 */
  669. xvmaddasp vs32, vs0, vs8
  670. xvmaddasp vs33, vs1, vs8
  671. xvmaddasp vs34, vs0, vs9
  672. xvmaddasp vs35, vs1, vs9
  673. xvmaddasp vs36, vs0, vs10
  674. xvmaddasp vs37, vs1, vs10
  675. xvmaddasp vs38, vs0, vs11
  676. xvmaddasp vs39, vs1, vs11
  677. xvmaddasp vs40, vs0, vs12
  678. xvmaddasp vs41, vs1, vs12
  679. xvmaddasp vs42, vs0, vs13
  680. xvmaddasp vs43, vs1, vs13
  681. xvmaddasp vs44, vs0, vs14
  682. xvmaddasp vs45, vs1, vs14
  683. xvmaddasp vs46, vs0, vs15
  684. xvmaddasp vs47, vs1, vs15
  685. #if defined(_AIX)
  686. ')
  687. #else
  688. .endm
  689. #endif
  /*
   * KERNEL8x8_2: accumulating step of the 8x8 kernel, second register set.
   *   Loads the NEXT operand set: vs0-vs1 <- 32 bytes at AO,
   *   vs8-vs15 <- the eight words at BO, each splatted (vs28/vs29 scratch).
   *   Accumulates the CURRENT set: vs(32 + 2*j + i) += vs(4 + i) * vs(16 + j)
   *   for i = 0..1, j = 0..7.
   * Mirror image of KERNEL8x8_1: it reads vs4-vs5 / vs16-vs23 and refills
   * vs0-vs1 / vs8-vs15.
   * Advances AO and BO by 32 bytes each.
   * Defined via m4-style define() when _AIX is set, via .macro otherwise.
   */
  690. #if defined(_AIX)
  691. define(`KERNEL8x8_2', `
  692. #else
  693. .macro KERNEL8x8_2
  694. #endif
  /* next A operands -> vs0-vs1 */
  695. lxvw4x vs0, o0, AO
  696. lxvw4x vs1, o16, AO
  697. addi AO, AO, 32
  /* next B operands, one splat per word -> vs8-vs15 */
  698. lxvw4x vs28, o0, BO
  699. xxspltw vs8, vs28, 0
  700. xxspltw vs9, vs28, 1
  701. xxspltw vs10, vs28, 2
  702. xxspltw vs11, vs28, 3
  703. lxvw4x vs29, o16, BO
  704. xxspltw vs12, vs29, 0
  705. xxspltw vs13, vs29, 1
  706. xxspltw vs14, vs29, 2
  707. xxspltw vs15, vs29, 3
  708. addi BO, BO, 32
  /* vs32-vs47 += vs4-vs5 * vs16-vs23 */
  709. xvmaddasp vs32, vs4, vs16
  710. xvmaddasp vs33, vs5, vs16
  711. xvmaddasp vs34, vs4, vs17
  712. xvmaddasp vs35, vs5, vs17
  713. xvmaddasp vs36, vs4, vs18
  714. xvmaddasp vs37, vs5, vs18
  715. xvmaddasp vs38, vs4, vs19
  716. xvmaddasp vs39, vs5, vs19
  717. xvmaddasp vs40, vs4, vs20
  718. xvmaddasp vs41, vs5, vs20
  719. xvmaddasp vs42, vs4, vs21
  720. xvmaddasp vs43, vs5, vs21
  721. xvmaddasp vs44, vs4, vs22
  722. xvmaddasp vs45, vs5, vs22
  723. xvmaddasp vs46, vs4, vs23
  724. xvmaddasp vs47, vs5, vs23
  725. #if defined(_AIX)
  726. ')
  727. #else
  728. .endm
  729. #endif
  /*
   * KERNEL8x8_E2: closing step of the 8x8 kernel.
   *   Performs the same accumulation as KERNEL8x8_2,
   *   vs(32 + 2*j + i) += vs(4 + i) * vs(16 + j) for i = 0..1, j = 0..7,
   *   but issues no loads and leaves AO and BO untouched.
   * It therefore consumes an operand set already sitting in vs4-vs5 /
   * vs16-vs23 without reading past the current AO/BO positions.
   * Defined via m4-style define() when _AIX is set, via .macro otherwise.
   */
  730. #if defined(_AIX)
  731. define(`KERNEL8x8_E2', `
  732. #else
  733. .macro KERNEL8x8_E2
  734. #endif
  /* vs32-vs47 += vs4-vs5 * vs16-vs23 */
  735. xvmaddasp vs32, vs4, vs16
  736. xvmaddasp vs33, vs5, vs16
  737. xvmaddasp vs34, vs4, vs17
  738. xvmaddasp vs35, vs5, vs17
  739. xvmaddasp vs36, vs4, vs18
  740. xvmaddasp vs37, vs5, vs18
  741. xvmaddasp vs38, vs4, vs19
  742. xvmaddasp vs39, vs5, vs19
  743. xvmaddasp vs40, vs4, vs20
  744. xvmaddasp vs41, vs5, vs20
  745. xvmaddasp vs42, vs4, vs21
  746. xvmaddasp vs43, vs5, vs21
  747. xvmaddasp vs44, vs4, vs22
  748. xvmaddasp vs45, vs5, vs22
  749. xvmaddasp vs46, vs4, vs23
  750. xvmaddasp vs47, vs5, vs23
  751. #if defined(_AIX)
  752. ')
  753. #else
  754. .endm
  755. #endif
  /*
   * KERNEL8x8_SUBI1: self-contained initializing step of the 8x8 kernel.
   *   Loads vs0-vs1 <- 32 bytes at AO and vs8-vs15 <- the eight words at BO
   *   (each splatted; vs28/vs29 scratch), then multiplies them right away:
   *   vs(32 + 2*j + i) = vs(i) * vs(8 + j) for i = 0..1, j = 0..7.
   * xvmulsp overwrites the accumulators vs32-vs47, so no prior clearing is
   * needed. Unlike KERNEL8x8_I1 it uses the operands it has just loaded and
   * does not touch vs4-vs5 / vs16-vs23.
   * Advances AO and BO by 32 bytes each.
   * Defined via m4-style define() when _AIX is set, via .macro otherwise.
   */
  756. #if defined(_AIX)
  757. define(`KERNEL8x8_SUBI1', `
  758. #else
  759. .macro KERNEL8x8_SUBI1
  760. #endif
  /* A operands -> vs0-vs1 */
  761. lxvw4x vs0, o0, AO
  762. lxvw4x vs1, o16, AO
  763. addi AO, AO, 32
  /* B operands, one splat per word -> vs8-vs15 */
  764. lxvw4x vs28, o0, BO
  765. xxspltw vs8, vs28, 0
  766. xxspltw vs9, vs28, 1
  767. xxspltw vs10, vs28, 2
  768. xxspltw vs11, vs28, 3
  769. lxvw4x vs29, o16, BO
  770. xxspltw vs12, vs29, 0
  771. xxspltw vs13, vs29, 1
  772. xxspltw vs14, vs29, 2
  773. xxspltw vs15, vs29, 3
  774. addi BO, BO, 32
  /* vs32-vs47 = vs0-vs1 * vs8-vs15 (plain multiply, overwrites accumulators) */
  775. xvmulsp vs32, vs0, vs8
  776. xvmulsp vs33, vs1, vs8
  777. xvmulsp vs34, vs0, vs9
  778. xvmulsp vs35, vs1, vs9
  779. xvmulsp vs36, vs0, vs10
  780. xvmulsp vs37, vs1, vs10
  781. xvmulsp vs38, vs0, vs11
  782. xvmulsp vs39, vs1, vs11
  783. xvmulsp vs40, vs0, vs12
  784. xvmulsp vs41, vs1, vs12
  785. xvmulsp vs42, vs0, vs13
  786. xvmulsp vs43, vs1, vs13
  787. xvmulsp vs44, vs0, vs14
  788. xvmulsp vs45, vs1, vs14
  789. xvmulsp vs46, vs0, vs15
  790. xvmulsp vs47, vs1, vs15
  791. #if defined(_AIX)
  792. ')
  793. #else
  794. .endm
  795. #endif
  /*
   * KERNEL8x8_SUB1: self-contained accumulating step of the 8x8 kernel.
   *   Loads vs0-vs1 <- 32 bytes at AO and vs8-vs15 <- the eight words at BO
   *   (each splatted; vs28/vs29 scratch), then accumulates them right away:
   *   vs(32 + 2*j + i) += vs(i) * vs(8 + j) for i = 0..1, j = 0..7.
   * Same loads as KERNEL8x8_SUBI1, but xvmaddasp adds to the existing
   * accumulator contents instead of overwriting them.
   * Advances AO and BO by 32 bytes each.
   * Defined via m4-style define() when _AIX is set, via .macro otherwise.
   */
  796. #if defined(_AIX)
  797. define(`KERNEL8x8_SUB1', `
  798. #else
  799. .macro KERNEL8x8_SUB1
  800. #endif
  /* A operands -> vs0-vs1 */
  801. lxvw4x vs0, o0, AO
  802. lxvw4x vs1, o16, AO
  803. addi AO, AO, 32
  /* B operands, one splat per word -> vs8-vs15 */
  804. lxvw4x vs28, o0, BO
  805. xxspltw vs8, vs28, 0
  806. xxspltw vs9, vs28, 1
  807. xxspltw vs10, vs28, 2
  808. xxspltw vs11, vs28, 3
  809. lxvw4x vs29, o16, BO
  810. xxspltw vs12, vs29, 0
  811. xxspltw vs13, vs29, 1
  812. xxspltw vs14, vs29, 2
  813. xxspltw vs15, vs29, 3
  814. addi BO, BO, 32
  /* vs32-vs47 += vs0-vs1 * vs8-vs15 */
  815. xvmaddasp vs32, vs0, vs8
  816. xvmaddasp vs33, vs1, vs8
  817. xvmaddasp vs34, vs0, vs9
  818. xvmaddasp vs35, vs1, vs9
  819. xvmaddasp vs36, vs0, vs10
  820. xvmaddasp vs37, vs1, vs10
  821. xvmaddasp vs38, vs0, vs11
  822. xvmaddasp vs39, vs1, vs11
  823. xvmaddasp vs40, vs0, vs12
  824. xvmaddasp vs41, vs1, vs12
  825. xvmaddasp vs42, vs0, vs13
  826. xvmaddasp vs43, vs1, vs13
  827. xvmaddasp vs44, vs0, vs14
  828. xvmaddasp vs45, vs1, vs14
  829. xvmaddasp vs46, vs0, vs15
  830. xvmaddasp vs47, vs1, vs15
  831. #if defined(_AIX)
  832. ')
  833. #else
  834. .endm
  835. #endif
  /*
   * SAVE8x8: write the 8x8 accumulator tile vs32-vs47 out through CO.
   * T1 starts at CO and is advanced by LDC after each of the eight groups.
   * For group j (accumulators vs(32 + 2*j) and vs(33 + 2*j)):
   *   - TRMMKERNEL not defined: vs0-vs1 <- 32 bytes at T1, then
   *     vs0-vs1 += accumulator * alpha_vr (xvmaddasp).
   *   - TRMMKERNEL defined: vs0-vs1 = accumulator * alpha_vr (xvmulsp);
   *     the destination memory is not read.
   *   - vs0-vs1 are stored back to the 32 bytes at T1.
   * Finally CO is advanced by 32 bytes.
   * Clobbers vs0-vs1 and T1; the accumulators themselves are only read.
   * CO, T1, LDC, alpha_vr and o0/o16 are defined outside this view; LDC is added
   * to the address unscaled, so presumably it is already a byte stride and
   * alpha_vr holds alpha in every word - TODO confirm against the caller.
   * Defined via m4-style define() when _AIX is set, via .macro otherwise.
   */
  836. #if defined(_AIX)
  837. define(`SAVE8x8', `
  838. #else
  839. .macro SAVE8x8
  840. #endif
  841. mr T1, CO
  /* group 0: vs32-vs33 */
  842. #ifndef TRMMKERNEL
  843. lxvw4x vs0, o0, T1
  844. lxvw4x vs1, o16, T1
  845. #endif
  846. #ifdef TRMMKERNEL
  847. xvmulsp vs0, vs32, alpha_vr
  848. xvmulsp vs1, vs33, alpha_vr
  849. #else
  850. xvmaddasp vs0, vs32, alpha_vr
  851. xvmaddasp vs1, vs33, alpha_vr
  852. #endif
  853. stxvw4x vs0, o0, T1
  854. stxvw4x vs1, o16, T1
  855. add T1, T1, LDC
  /* group 1: vs34-vs35 */
  856. #ifndef TRMMKERNEL
  857. lxvw4x vs0, o0, T1
  858. lxvw4x vs1, o16, T1
  859. #endif
  860. #ifdef TRMMKERNEL
  861. xvmulsp vs0, vs34, alpha_vr
  862. xvmulsp vs1, vs35, alpha_vr
  863. #else
  864. xvmaddasp vs0, vs34, alpha_vr
  865. xvmaddasp vs1, vs35, alpha_vr
  866. #endif
  867. stxvw4x vs0, o0, T1
  868. stxvw4x vs1, o16, T1
  869. add T1, T1, LDC
  /* group 2: vs36-vs37 */
  870. #ifndef TRMMKERNEL
  871. lxvw4x vs0, o0, T1
  872. lxvw4x vs1, o16, T1
  873. #endif
  874. #ifdef TRMMKERNEL
  875. xvmulsp vs0, vs36, alpha_vr
  876. xvmulsp vs1, vs37, alpha_vr
  877. #else
  878. xvmaddasp vs0, vs36, alpha_vr
  879. xvmaddasp vs1, vs37, alpha_vr
  880. #endif
  881. stxvw4x vs0, o0, T1
  882. stxvw4x vs1, o16, T1
  883. add T1, T1, LDC
  /* group 3: vs38-vs39 */
  884. #ifndef TRMMKERNEL
  885. lxvw4x vs0, o0, T1
  886. lxvw4x vs1, o16, T1
  887. #endif
  888. #ifdef TRMMKERNEL
  889. xvmulsp vs0, vs38, alpha_vr
  890. xvmulsp vs1, vs39, alpha_vr
  891. #else
  892. xvmaddasp vs0, vs38, alpha_vr
  893. xvmaddasp vs1, vs39, alpha_vr
  894. #endif
  895. stxvw4x vs0, o0, T1
  896. stxvw4x vs1, o16, T1
  897. add T1, T1, LDC
  /* group 4: vs40-vs41 */
  898. #ifndef TRMMKERNEL
  899. lxvw4x vs0, o0, T1
  900. lxvw4x vs1, o16, T1
  901. #endif
  902. #ifdef TRMMKERNEL
  903. xvmulsp vs0, vs40, alpha_vr
  904. xvmulsp vs1, vs41, alpha_vr
  905. #else
  906. xvmaddasp vs0, vs40, alpha_vr
  907. xvmaddasp vs1, vs41, alpha_vr
  908. #endif
  909. stxvw4x vs0, o0, T1
  910. stxvw4x vs1, o16, T1
  911. add T1, T1, LDC
  /* group 5: vs42-vs43 */
  912. #ifndef TRMMKERNEL
  913. lxvw4x vs0, o0, T1
  914. lxvw4x vs1, o16, T1
  915. #endif
  916. #ifdef TRMMKERNEL
  917. xvmulsp vs0, vs42, alpha_vr
  918. xvmulsp vs1, vs43, alpha_vr
  919. #else
  920. xvmaddasp vs0, vs42, alpha_vr
  921. xvmaddasp vs1, vs43, alpha_vr
  922. #endif
  923. stxvw4x vs0, o0, T1
  924. stxvw4x vs1, o16, T1
  925. add T1, T1, LDC
  /* group 6: vs44-vs45 */
  926. #ifndef TRMMKERNEL
  927. lxvw4x vs0, o0, T1
  928. lxvw4x vs1, o16, T1
  929. #endif
  930. #ifdef TRMMKERNEL
  931. xvmulsp vs0, vs44, alpha_vr
  932. xvmulsp vs1, vs45, alpha_vr
  933. #else
  934. xvmaddasp vs0, vs44, alpha_vr
  935. xvmaddasp vs1, vs45, alpha_vr
  936. #endif
  937. stxvw4x vs0, o0, T1
  938. stxvw4x vs1, o16, T1
  939. add T1, T1, LDC
  /* group 7: vs46-vs47 */
  940. #ifndef TRMMKERNEL
  941. lxvw4x vs0, o0, T1
  942. lxvw4x vs1, o16, T1
  943. #endif
  944. #ifdef TRMMKERNEL
  945. xvmulsp vs0, vs46, alpha_vr
  946. xvmulsp vs1, vs47, alpha_vr
  947. #else
  948. xvmaddasp vs0, vs46, alpha_vr
  949. xvmaddasp vs1, vs47, alpha_vr
  950. #endif
  951. stxvw4x vs0, o0, T1
  952. stxvw4x vs1, o16, T1
  /* T1 is stepped once more here but is not read again inside this macro */
  953. add T1, T1, LDC
  /* move CO past the 32 bytes just written in each group */
  954. addi CO, CO, 32
  955. #if defined(_AIX)
  956. ')
  957. #else
  958. .endm
  959. #endif
  960. /**********************************************************************************************
  961. * Macros for N=8 and M=4
  962. **********************************************************************************************/
  /*
   * LOAD8x4_1: fetch one set of operands for the 4x8 tile, no arithmetic.
   *   vs0       <- 16 bytes at AO (one lxvw4x load of four 32-bit words)
   *   vs8-vs15  <- the eight 32-bit words at BO, each splatted across a register
   * AO is advanced by 16 bytes and BO by 32 bytes. vs28/vs29 are scratch.
   * vs0 and vs8-vs15 are the inputs multiplied by KERNEL8x4_I1 and KERNEL8x4_1.
   * AO, BO and o0/o16 are defined outside this view; presumably the oN names
   * are index registers holding byte offsets 0/16 - TODO confirm.
   * Defined via m4-style define() when _AIX is set, via .macro otherwise.
   */
  963. #if defined(_AIX)
  964. define(`LOAD8x4_1', `
  965. #else
  966. .macro LOAD8x4_1
  967. #endif
  /* A operand: 16 bytes at AO -> vs0 */
  968. lxvw4x vs0, o0, AO
  969. addi AO, AO, 16
  /* B operands: words 0-3 of vs28 -> vs8-vs11, words 0-3 of vs29 -> vs12-vs15 */
  970. lxvw4x vs28, o0, BO
  971. xxspltw vs8, vs28, 0
  972. xxspltw vs9, vs28, 1
  973. xxspltw vs10, vs28, 2
  974. xxspltw vs11, vs28, 3
  975. lxvw4x vs29, o16, BO
  976. xxspltw vs12, vs29, 0
  977. xxspltw vs13, vs29, 1
  978. xxspltw vs14, vs29, 2
  979. xxspltw vs15, vs29, 3
  980. addi BO, BO, 32
  981. #if defined(_AIX)
  982. ')
  983. #else
  984. .endm
  985. #endif
  /*
   * KERNEL8x4_I1: initializing step of the 4x8 kernel.
   *   Loads the NEXT operand set: vs4 <- 16 bytes at AO,
   *   vs16-vs23 <- the eight words at BO, each splatted (vs28/vs29 scratch).
   *   Computes with the CURRENT set: vs(32 + j) = vs0 * vs(8 + j), j = 0..7,
   *   using xvmulsp, which overwrites the target, so the accumulators
   *   vs32-vs39 need no prior clearing.
   * Expects vs0 and vs8-vs15 to be loaded already (LOAD8x4_1 fills exactly
   * those registers). Advances AO by 16 bytes and BO by 32 bytes.
   * Defined via m4-style define() when _AIX is set, via .macro otherwise.
   */
  986. #if defined(_AIX)
  987. define(`KERNEL8x4_I1', `
  988. #else
  989. .macro KERNEL8x4_I1
  990. #endif
  /* next A operand -> vs4 */
  991. lxvw4x vs4, o0, AO
  992. addi AO, AO, 16
  /* next B operands, one splat per word -> vs16-vs23 */
  993. lxvw4x vs28, o0, BO
  994. xxspltw vs16, vs28, 0
  995. xxspltw vs17, vs28, 1
  996. xxspltw vs18, vs28, 2
  997. xxspltw vs19, vs28, 3
  998. lxvw4x vs29, o16, BO
  999. xxspltw vs20, vs29, 0
  1000. xxspltw vs21, vs29, 1
  1001. xxspltw vs22, vs29, 2
  1002. xxspltw vs23, vs29, 3
  1003. addi BO, BO, 32
  /* vs32-vs39 = vs0 * vs8-vs15 (plain multiply, overwrites accumulators) */
  1004. xvmulsp vs32, vs0, vs8
  1005. xvmulsp vs33, vs0, vs9
  1006. xvmulsp vs34, vs0, vs10
  1007. xvmulsp vs35, vs0, vs11
  1008. xvmulsp vs36, vs0, vs12
  1009. xvmulsp vs37, vs0, vs13
  1010. xvmulsp vs38, vs0, vs14
  1011. xvmulsp vs39, vs0, vs15
  1012. #if defined(_AIX)
  1013. ')
  1014. #else
  1015. .endm
  1016. #endif
  /*
   * KERNEL8x4_1: accumulating step of the 4x8 kernel, first register set.
   *   Loads the NEXT operand set: vs4 <- 16 bytes at AO,
   *   vs16-vs23 <- the eight words at BO, each splatted (vs28/vs29 scratch).
   *   Accumulates the CURRENT set: vs(32 + j) += vs0 * vs(8 + j), j = 0..7
   *   (xvmaddasp adds the product to its target).
   * It reads vs0 / vs8-vs15 and fills vs4 / vs16-vs23, which are the registers
   * KERNEL8x4_2 reads; KERNEL8x4_2 in turn refills the set read here.
   * Advances AO by 16 bytes and BO by 32 bytes.
   * Defined via m4-style define() when _AIX is set, via .macro otherwise.
   */
  1017. #if defined(_AIX)
  1018. define(`KERNEL8x4_1', `
  1019. #else
  1020. .macro KERNEL8x4_1
  1021. #endif
  /* next A operand -> vs4 */
  1022. lxvw4x vs4, o0, AO
  1023. addi AO, AO, 16
  /* next B operands, one splat per word -> vs16-vs23 */
  1024. lxvw4x vs28, o0, BO
  1025. xxspltw vs16, vs28, 0
  1026. xxspltw vs17, vs28, 1
  1027. xxspltw vs18, vs28, 2
  1028. xxspltw vs19, vs28, 3
  1029. lxvw4x vs29, o16, BO
  1030. xxspltw vs20, vs29, 0
  1031. xxspltw vs21, vs29, 1
  1032. xxspltw vs22, vs29, 2
  1033. xxspltw vs23, vs29, 3
  1034. addi BO, BO, 32
  /* vs32-vs39 += vs0 * vs8-vs15 */
  1035. xvmaddasp vs32, vs0, vs8
  1036. xvmaddasp vs33, vs0, vs9
  1037. xvmaddasp vs34, vs0, vs10
  1038. xvmaddasp vs35, vs0, vs11
  1039. xvmaddasp vs36, vs0, vs12
  1040. xvmaddasp vs37, vs0, vs13
  1041. xvmaddasp vs38, vs0, vs14
  1042. xvmaddasp vs39, vs0, vs15
  1043. #if defined(_AIX)
  1044. ')
  1045. #else
  1046. .endm
  1047. #endif
  /*
   * KERNEL8x4_2: accumulating step of the 4x8 kernel, second register set.
   *   Loads the NEXT operand set: vs0 <- 16 bytes at AO,
   *   vs8-vs15 <- the eight words at BO, each splatted (vs28/vs29 scratch).
   *   Accumulates the CURRENT set: vs(32 + j) += vs4 * vs(16 + j), j = 0..7.
   * Mirror image of KERNEL8x4_1: it reads vs4 / vs16-vs23 and refills
   * vs0 / vs8-vs15.
   * Advances AO by 16 bytes and BO by 32 bytes.
   * Defined via m4-style define() when _AIX is set, via .macro otherwise.
   */
  1048. #if defined(_AIX)
  1049. define(`KERNEL8x4_2', `
  1050. #else
  1051. .macro KERNEL8x4_2
  1052. #endif
  /* next A operand -> vs0 */
  1053. lxvw4x vs0, o0, AO
  1054. addi AO, AO, 16
  /* next B operands, one splat per word -> vs8-vs15 */
  1055. lxvw4x vs28, o0, BO
  1056. xxspltw vs8, vs28, 0
  1057. xxspltw vs9, vs28, 1
  1058. xxspltw vs10, vs28, 2
  1059. xxspltw vs11, vs28, 3
  1060. lxvw4x vs29, o16, BO
  1061. xxspltw vs12, vs29, 0
  1062. xxspltw vs13, vs29, 1
  1063. xxspltw vs14, vs29, 2
  1064. xxspltw vs15, vs29, 3
  1065. addi BO, BO, 32
  /* vs32-vs39 += vs4 * vs16-vs23 */
  1066. xvmaddasp vs32, vs4, vs16
  1067. xvmaddasp vs33, vs4, vs17
  1068. xvmaddasp vs34, vs4, vs18
  1069. xvmaddasp vs35, vs4, vs19
  1070. xvmaddasp vs36, vs4, vs20
  1071. xvmaddasp vs37, vs4, vs21
  1072. xvmaddasp vs38, vs4, vs22
  1073. xvmaddasp vs39, vs4, vs23
  1074. #if defined(_AIX)
  1075. ')
  1076. #else
  1077. .endm
  1078. #endif
  /*
   * KERNEL8x4_E2: closing step of the 4x8 kernel.
   *   Performs the same accumulation as KERNEL8x4_2,
   *   vs(32 + j) += vs4 * vs(16 + j) for j = 0..7,
   *   but issues no loads and leaves AO and BO untouched.
   * It therefore consumes an operand set already sitting in vs4 / vs16-vs23
   * without reading past the current AO/BO positions.
   * Defined via m4-style define() when _AIX is set, via .macro otherwise.
   */
  1079. #if defined(_AIX)
  1080. define(`KERNEL8x4_E2', `
  1081. #else
  1082. .macro KERNEL8x4_E2
  1083. #endif
  /* vs32-vs39 += vs4 * vs16-vs23 */
  1084. xvmaddasp vs32, vs4, vs16
  1085. xvmaddasp vs33, vs4, vs17
  1086. xvmaddasp vs34, vs4, vs18
  1087. xvmaddasp vs35, vs4, vs19
  1088. xvmaddasp vs36, vs4, vs20
  1089. xvmaddasp vs37, vs4, vs21
  1090. xvmaddasp vs38, vs4, vs22
  1091. xvmaddasp vs39, vs4, vs23
  1092. #if defined(_AIX)
  1093. ')
  1094. #else
  1095. .endm
  1096. #endif
  /*
   * KERNEL8x4_SUBI1: self-contained initializing step of the 4x8 kernel.
   *   Loads vs0 <- 16 bytes at AO and vs8-vs15 <- the eight words at BO
   *   (each splatted; vs28/vs29 scratch), then multiplies them right away:
   *   vs(32 + j) = vs0 * vs(8 + j) for j = 0..7.
   * xvmulsp overwrites the accumulators vs32-vs39, so no prior clearing is
   * needed. Unlike KERNEL8x4_I1 it uses the operands it has just loaded and
   * does not touch vs4 / vs16-vs23.
   * Advances AO by 16 bytes and BO by 32 bytes.
   * Defined via m4-style define() when _AIX is set, via .macro otherwise.
   */
  1097. #if defined(_AIX)
  1098. define(`KERNEL8x4_SUBI1', `
  1099. #else
  1100. .macro KERNEL8x4_SUBI1
  1101. #endif
  /* A operand -> vs0 */
  1102. lxvw4x vs0, o0, AO
  1103. addi AO, AO, 16
  /* B operands, one splat per word -> vs8-vs15 */
  1104. lxvw4x vs28, o0, BO
  1105. xxspltw vs8, vs28, 0
  1106. xxspltw vs9, vs28, 1
  1107. xxspltw vs10, vs28, 2
  1108. xxspltw vs11, vs28, 3
  1109. lxvw4x vs29, o16, BO
  1110. xxspltw vs12, vs29, 0
  1111. xxspltw vs13, vs29, 1
  1112. xxspltw vs14, vs29, 2
  1113. xxspltw vs15, vs29, 3
  1114. addi BO, BO, 32
  /* vs32-vs39 = vs0 * vs8-vs15 (plain multiply, overwrites accumulators) */
  1115. xvmulsp vs32, vs0, vs8
  1116. xvmulsp vs33, vs0, vs9
  1117. xvmulsp vs34, vs0, vs10
  1118. xvmulsp vs35, vs0, vs11
  1119. xvmulsp vs36, vs0, vs12
  1120. xvmulsp vs37, vs0, vs13
  1121. xvmulsp vs38, vs0, vs14
  1122. xvmulsp vs39, vs0, vs15
  1123. #if defined(_AIX)
  1124. ')
  1125. #else
  1126. .endm
  1127. #endif
  /*
   * KERNEL8x4_SUB1 (N=8, M=4): self-contained accumulate step.
   * Same loads and pointer updates as KERNEL8x4_SUBI1 (vs0 from AO, AO += 16;
   * vs8..vs15 splatted from 32 bytes at BO, BO += 32), but uses xvmaddasp so
   * that vs32..vs39 += vs0 * vs8..vs15 instead of being overwritten.
   */
  1128. #if defined(_AIX)
  1129. define(`KERNEL8x4_SUB1', `
  1130. #else
  1131. .macro KERNEL8x4_SUB1
  1132. #endif
  1133. lxvw4x vs0, o0, AO
  1134. addi AO, AO, 16
  1135. lxvw4x vs28, o0, BO
  1136. xxspltw vs8, vs28, 0
  1137. xxspltw vs9, vs28, 1
  1138. xxspltw vs10, vs28, 2
  1139. xxspltw vs11, vs28, 3
  1140. lxvw4x vs29, o16, BO
  1141. xxspltw vs12, vs29, 0
  1142. xxspltw vs13, vs29, 1
  1143. xxspltw vs14, vs29, 2
  1144. xxspltw vs15, vs29, 3
  1145. addi BO, BO, 32
  1146. xvmaddasp vs32, vs0, vs8
  1147. xvmaddasp vs33, vs0, vs9
  1148. xvmaddasp vs34, vs0, vs10
  1149. xvmaddasp vs35, vs0, vs11
  1150. xvmaddasp vs36, vs0, vs12
  1151. xvmaddasp vs37, vs0, vs13
  1152. xvmaddasp vs38, vs0, vs14
  1153. xvmaddasp vs39, vs0, vs15
  1154. #if defined(_AIX)
  1155. ')
  1156. #else
  1157. .endm
  1158. #endif
  /*
   * SAVE8x4 (N=8, M=4): scale the accumulators vs32..vs39 by alpha_vr and
   * write them to the output tile addressed by CO.
   * T1 starts as a copy of CO and is advanced by LDC after each of the eight
   * 16-byte stores (LDC is presumably the byte stride between consecutive
   * N indices of C - TODO confirm).
   *   - Without TRMMKERNEL: the existing 16 bytes at T1 are loaded into vs0
   *     and vs0 += acc * alpha_vr before being stored back.
   *   - With TRMMKERNEL: nothing is loaded; vs0 = acc * alpha_vr is stored.
   * On exit CO has been advanced by 16 bytes; vs0 and T1 are clobbered.
   * alpha_vr is defined outside this section.
   */
  1159. #if defined(_AIX)
  1160. define(`SAVE8x4', `
  1161. #else
  1162. .macro SAVE8x4
  1163. #endif
  1164. mr T1, CO
  /* vs32 -> first vector of the tile */
  1165. #ifndef TRMMKERNEL
  1166. lxvw4x vs0, o0, T1
  1167. #endif
  1168. #ifdef TRMMKERNEL
  1169. xvmulsp vs0, vs32, alpha_vr
  1170. #else
  1171. xvmaddasp vs0, vs32, alpha_vr
  1172. #endif
  1173. stxvw4x vs0, o0, T1
  1174. add T1, T1, LDC
  /* vs33 */
  1175. #ifndef TRMMKERNEL
  1176. lxvw4x vs0, o0, T1
  1177. #endif
  1178. #ifdef TRMMKERNEL
  1179. xvmulsp vs0, vs33, alpha_vr
  1180. #else
  1181. xvmaddasp vs0, vs33, alpha_vr
  1182. #endif
  1183. stxvw4x vs0, o0, T1
  1184. add T1, T1, LDC
  /* vs34 */
  1185. #ifndef TRMMKERNEL
  1186. lxvw4x vs0, o0, T1
  1187. #endif
  1188. #ifdef TRMMKERNEL
  1189. xvmulsp vs0, vs34, alpha_vr
  1190. #else
  1191. xvmaddasp vs0, vs34, alpha_vr
  1192. #endif
  1193. stxvw4x vs0, o0, T1
  1194. add T1, T1, LDC
  /* vs35 */
  1195. #ifndef TRMMKERNEL
  1196. lxvw4x vs0, o0, T1
  1197. #endif
  1198. #ifdef TRMMKERNEL
  1199. xvmulsp vs0, vs35, alpha_vr
  1200. #else
  1201. xvmaddasp vs0, vs35, alpha_vr
  1202. #endif
  1203. stxvw4x vs0, o0, T1
  1204. add T1, T1, LDC
  /* vs36 */
  1205. #ifndef TRMMKERNEL
  1206. lxvw4x vs0, o0, T1
  1207. #endif
  1208. #ifdef TRMMKERNEL
  1209. xvmulsp vs0, vs36, alpha_vr
  1210. #else
  1211. xvmaddasp vs0, vs36, alpha_vr
  1212. #endif
  1213. stxvw4x vs0, o0, T1
  1214. add T1, T1, LDC
  /* vs37 */
  1215. #ifndef TRMMKERNEL
  1216. lxvw4x vs0, o0, T1
  1217. #endif
  1218. #ifdef TRMMKERNEL
  1219. xvmulsp vs0, vs37, alpha_vr
  1220. #else
  1221. xvmaddasp vs0, vs37, alpha_vr
  1222. #endif
  1223. stxvw4x vs0, o0, T1
  1224. add T1, T1, LDC
  /* vs38 */
  1225. #ifndef TRMMKERNEL
  1226. lxvw4x vs0, o0, T1
  1227. #endif
  1228. #ifdef TRMMKERNEL
  1229. xvmulsp vs0, vs38, alpha_vr
  1230. #else
  1231. xvmaddasp vs0, vs38, alpha_vr
  1232. #endif
  1233. stxvw4x vs0, o0, T1
  1234. add T1, T1, LDC
  /* vs39 -> last vector of the tile */
  1235. #ifndef TRMMKERNEL
  1236. lxvw4x vs0, o0, T1
  1237. #endif
  1238. #ifdef TRMMKERNEL
  1239. xvmulsp vs0, vs39, alpha_vr
  1240. #else
  1241. xvmaddasp vs0, vs39, alpha_vr
  1242. #endif
  1243. stxvw4x vs0, o0, T1
  /* T1 is not read again inside this macro after the next add */
  1244. add T1, T1, LDC
  /* step CO past the 4 floats just written */
  1245. addi CO, CO, 16
  1246. #if defined(_AIX)
  1247. ')
  1248. #else
  1249. .endm
  1250. #endif
  1251. /**********************************************************************************************
  1252. * Macros for N=8 and M=2
  1253. **********************************************************************************************/
  /*
   * LOAD8x2_1 (N=8, M=2): load-only step that fills the first register set.
   *   - Loads two scalars from AO (index registers o0 and o4) into vs0/vs1,
   *     then advances AO by 8.
   *   - Copies BO to T1 and loads eight scalars into vs8..vs15: four at
   *     T1 + o0/o4/o8/o12, then four more after T1 += 16.
   *   - Advances BO by 32. T1 is clobbered.
   * lxsspx loads a single-precision word and keeps it in double-precision
   * register format, which is why the 8x2 kernels use the scalar
   * double-precision arithmetic forms.
   */
  1254. #if defined(_AIX)
  1255. define(`LOAD8x2_1', `
  1256. #else
  1257. .macro LOAD8x2_1
  1258. #endif
  1259. lxsspx vs0, o0, AO
  1260. lxsspx vs1, o4, AO
  1261. addi AO, AO, 8
  1262. mr T1, BO
  1263. lxsspx vs8, o0, T1
  1264. lxsspx vs9, o4, T1
  1265. lxsspx vs10, o8, T1
  1266. lxsspx vs11, o12, T1
  1267. addi T1, T1, 16
  1268. lxsspx vs12, o0, T1
  1269. lxsspx vs13, o4, T1
  1270. lxsspx vs14, o8, T1
  1271. lxsspx vs15, o12, T1
  1272. addi BO, BO, 32
  1273. #if defined(_AIX)
  1274. ')
  1275. #else
  1276. .endm
  1277. #endif
  /*
   * KERNEL8x2_I1 (N=8, M=2): first pipelined step.
   *   - Loads the second register set: vs4/vs5 from AO (AO += 8) and
   *     vs16..vs23 from 32 bytes at BO via T1 (BO += 32).
   *   - Initialises the sixteen accumulators from the first register set with
   *     xsmuldp (overwrite, no add): for j = 0..7,
   *       vs(32+2j) = vs0 * vs(8+j)   and   vs(33+2j) = vs1 * vs(8+j).
   * vs0, vs1 and vs8..vs15 are not loaded here; within this section
   * LOAD8x2_1 is the macro that fills exactly those registers.
   * T1 is clobbered.
   */
  1278. #if defined(_AIX)
  1279. define(`KERNEL8x2_I1', `
  1280. #else
  1281. .macro KERNEL8x2_I1
  1282. #endif
  1283. lxsspx vs4, o0, AO
  1284. lxsspx vs5, o4, AO
  1285. addi AO, AO, 8
  1286. mr T1, BO
  1287. lxsspx vs16, o0, T1
  1288. lxsspx vs17, o4, T1
  1289. lxsspx vs18, o8, T1
  1290. lxsspx vs19, o12, T1
  1291. addi T1, T1, 16
  1292. lxsspx vs20, o0, T1
  1293. lxsspx vs21, o4, T1
  1294. lxsspx vs22, o8, T1
  1295. lxsspx vs23, o12, T1
  1296. addi BO, BO, 32
  1297. xsmuldp vs32, vs0, vs8
  1298. xsmuldp vs33, vs1, vs8
  1299. xsmuldp vs34, vs0, vs9
  1300. xsmuldp vs35, vs1, vs9
  1301. xsmuldp vs36, vs0, vs10
  1302. xsmuldp vs37, vs1, vs10
  1303. xsmuldp vs38, vs0, vs11
  1304. xsmuldp vs39, vs1, vs11
  1305. xsmuldp vs40, vs0, vs12
  1306. xsmuldp vs41, vs1, vs12
  1307. xsmuldp vs42, vs0, vs13
  1308. xsmuldp vs43, vs1, vs13
  1309. xsmuldp vs44, vs0, vs14
  1310. xsmuldp vs45, vs1, vs14
  1311. xsmuldp vs46, vs0, vs15
  1312. xsmuldp vs47, vs1, vs15
  1313. #if defined(_AIX)
  1314. ')
  1315. #else
  1316. .endm
  1317. #endif
  /*
   * KERNEL8x2_1 (N=8, M=2): pipelined step that computes with the first
   * register set while refilling the second.
   *   - Loads vs4/vs5 from AO (AO += 8) and vs16..vs23 from 32 bytes at BO
   *     via T1 (BO += 32).
   *   - Accumulates with xsmaddadp: for j = 0..7,
   *       vs(32+2j) += vs0 * vs(8+j)   and   vs(33+2j) += vs1 * vs(8+j).
   * Same loads as KERNEL8x2_I1; the difference is add-into instead of
   * overwrite. T1 is clobbered.
   */
  1318. #if defined(_AIX)
  1319. define(`KERNEL8x2_1', `
  1320. #else
  1321. .macro KERNEL8x2_1
  1322. #endif
  1323. lxsspx vs4, o0, AO
  1324. lxsspx vs5, o4, AO
  1325. addi AO, AO, 8
  1326. mr T1, BO
  1327. lxsspx vs16, o0, T1
  1328. lxsspx vs17, o4, T1
  1329. lxsspx vs18, o8, T1
  1330. lxsspx vs19, o12, T1
  1331. addi T1, T1, 16
  1332. lxsspx vs20, o0, T1
  1333. lxsspx vs21, o4, T1
  1334. lxsspx vs22, o8, T1
  1335. lxsspx vs23, o12, T1
  1336. addi BO, BO, 32
  1337. xsmaddadp vs32, vs0, vs8
  1338. xsmaddadp vs33, vs1, vs8
  1339. xsmaddadp vs34, vs0, vs9
  1340. xsmaddadp vs35, vs1, vs9
  1341. xsmaddadp vs36, vs0, vs10
  1342. xsmaddadp vs37, vs1, vs10
  1343. xsmaddadp vs38, vs0, vs11
  1344. xsmaddadp vs39, vs1, vs11
  1345. xsmaddadp vs40, vs0, vs12
  1346. xsmaddadp vs41, vs1, vs12
  1347. xsmaddadp vs42, vs0, vs13
  1348. xsmaddadp vs43, vs1, vs13
  1349. xsmaddadp vs44, vs0, vs14
  1350. xsmaddadp vs45, vs1, vs14
  1351. xsmaddadp vs46, vs0, vs15
  1352. xsmaddadp vs47, vs1, vs15
  1353. #if defined(_AIX)
  1354. ')
  1355. #else
  1356. .endm
  1357. #endif
  /*
   * KERNEL8x2_2 (N=8, M=2): pipelined step that computes with the second
   * register set while refilling the first (mirror of KERNEL8x2_1).
   *   - Loads vs0/vs1 from AO (AO += 8) and vs8..vs15 from 32 bytes at BO
   *     via T1 (BO += 32).
   *   - Accumulates with xsmaddadp: for j = 0..7,
   *       vs(32+2j) += vs4 * vs(16+j)   and   vs(33+2j) += vs5 * vs(16+j).
   * T1 is clobbered.
   */
  1358. #if defined(_AIX)
  1359. define(`KERNEL8x2_2', `
  1360. #else
  1361. .macro KERNEL8x2_2
  1362. #endif
  1363. lxsspx vs0, o0, AO
  1364. lxsspx vs1, o4, AO
  1365. addi AO, AO, 8
  1366. mr T1, BO
  1367. lxsspx vs8, o0, T1
  1368. lxsspx vs9, o4, T1
  1369. lxsspx vs10, o8, T1
  1370. lxsspx vs11, o12, T1
  1371. addi T1, T1, 16
  1372. lxsspx vs12, o0, T1
  1373. lxsspx vs13, o4, T1
  1374. lxsspx vs14, o8, T1
  1375. lxsspx vs15, o12, T1
  1376. addi BO, BO, 32
  1377. xsmaddadp vs32, vs4, vs16
  1378. xsmaddadp vs33, vs5, vs16
  1379. xsmaddadp vs34, vs4, vs17
  1380. xsmaddadp vs35, vs5, vs17
  1381. xsmaddadp vs36, vs4, vs18
  1382. xsmaddadp vs37, vs5, vs18
  1383. xsmaddadp vs38, vs4, vs19
  1384. xsmaddadp vs39, vs5, vs19
  1385. xsmaddadp vs40, vs4, vs20
  1386. xsmaddadp vs41, vs5, vs20
  1387. xsmaddadp vs42, vs4, vs21
  1388. xsmaddadp vs43, vs5, vs21
  1389. xsmaddadp vs44, vs4, vs22
  1390. xsmaddadp vs45, vs5, vs22
  1391. xsmaddadp vs46, vs4, vs23
  1392. xsmaddadp vs47, vs5, vs23
  1393. #if defined(_AIX)
  1394. ')
  1395. #else
  1396. .endm
  1397. #endif
  /*
   * KERNEL8x2_E2 (N=8, M=2): compute-only step.
   * Performs the same sixteen xsmaddadp accumulations as KERNEL8x2_2
   * (vs32..vs47 += vs4/vs5 * vs16..vs23) but issues no loads and leaves
   * AO, BO and T1 untouched.
   */
  1398. #if defined(_AIX)
  1399. define(`KERNEL8x2_E2', `
  1400. #else
  1401. .macro KERNEL8x2_E2
  1402. #endif
  1403. xsmaddadp vs32, vs4, vs16
  1404. xsmaddadp vs33, vs5, vs16
  1405. xsmaddadp vs34, vs4, vs17
  1406. xsmaddadp vs35, vs5, vs17
  1407. xsmaddadp vs36, vs4, vs18
  1408. xsmaddadp vs37, vs5, vs18
  1409. xsmaddadp vs38, vs4, vs19
  1410. xsmaddadp vs39, vs5, vs19
  1411. xsmaddadp vs40, vs4, vs20
  1412. xsmaddadp vs41, vs5, vs20
  1413. xsmaddadp vs42, vs4, vs21
  1414. xsmaddadp vs43, vs5, vs21
  1415. xsmaddadp vs44, vs4, vs22
  1416. xsmaddadp vs45, vs5, vs22
  1417. xsmaddadp vs46, vs4, vs23
  1418. xsmaddadp vs47, vs5, vs23
  1419. #if defined(_AIX)
  1420. ')
  1421. #else
  1422. .endm
  1423. #endif
  /*
   * KERNEL8x2_SUBI1 (N=8, M=2): self-contained initial step.
   *   - Loads vs0/vs1 from AO (AO += 8) and vs8..vs15 from 32 bytes at BO
   *     via T1 (BO += 32).
   *   - Overwrites the accumulators with xsmuldp: for j = 0..7,
   *       vs(32+2j) = vs0 * vs(8+j)   and   vs(33+2j) = vs1 * vs(8+j).
   * Because the products are written rather than added, vs32..vs47 need no
   * prior clearing. T1 is clobbered.
   */
  1424. #if defined(_AIX)
  1425. define(`KERNEL8x2_SUBI1', `
  1426. #else
  1427. .macro KERNEL8x2_SUBI1
  1428. #endif
  1429. lxsspx vs0, o0, AO
  1430. lxsspx vs1, o4, AO
  1431. addi AO, AO, 8
  1432. mr T1, BO
  1433. lxsspx vs8, o0, T1
  1434. lxsspx vs9, o4, T1
  1435. lxsspx vs10, o8, T1
  1436. lxsspx vs11, o12, T1
  1437. addi T1, T1, 16
  1438. lxsspx vs12, o0, T1
  1439. lxsspx vs13, o4, T1
  1440. lxsspx vs14, o8, T1
  1441. lxsspx vs15, o12, T1
  1442. addi BO, BO, 32
  1443. xsmuldp vs32, vs0, vs8
  1444. xsmuldp vs33, vs1, vs8
  1445. xsmuldp vs34, vs0, vs9
  1446. xsmuldp vs35, vs1, vs9
  1447. xsmuldp vs36, vs0, vs10
  1448. xsmuldp vs37, vs1, vs10
  1449. xsmuldp vs38, vs0, vs11
  1450. xsmuldp vs39, vs1, vs11
  1451. xsmuldp vs40, vs0, vs12
  1452. xsmuldp vs41, vs1, vs12
  1453. xsmuldp vs42, vs0, vs13
  1454. xsmuldp vs43, vs1, vs13
  1455. xsmuldp vs44, vs0, vs14
  1456. xsmuldp vs45, vs1, vs14
  1457. xsmuldp vs46, vs0, vs15
  1458. xsmuldp vs47, vs1, vs15
  1459. #if defined(_AIX)
  1460. ')
  1461. #else
  1462. .endm
  1463. #endif
  /*
   * KERNEL8x2_SUB1 (N=8, M=2): self-contained accumulate step.
   * Same loads and pointer updates as KERNEL8x2_SUBI1 (vs0/vs1 from AO,
   * AO += 8; vs8..vs15 from BO via T1, BO += 32), but uses xsmaddadp so
   * that for j = 0..7
   *   vs(32+2j) += vs0 * vs(8+j)   and   vs(33+2j) += vs1 * vs(8+j).
   * T1 is clobbered.
   */
  1464. #if defined(_AIX)
  1465. define(`KERNEL8x2_SUB1', `
  1466. #else
  1467. .macro KERNEL8x2_SUB1
  1468. #endif
  1469. lxsspx vs0, o0, AO
  1470. lxsspx vs1, o4, AO
  1471. addi AO, AO, 8
  1472. mr T1, BO
  1473. lxsspx vs8, o0, T1
  1474. lxsspx vs9, o4, T1
  1475. lxsspx vs10, o8, T1
  1476. lxsspx vs11, o12, T1
  1477. addi T1, T1, 16
  1478. lxsspx vs12, o0, T1
  1479. lxsspx vs13, o4, T1
  1480. lxsspx vs14, o8, T1
  1481. lxsspx vs15, o12, T1
  1482. addi BO, BO, 32
  1483. xsmaddadp vs32, vs0, vs8
  1484. xsmaddadp vs33, vs1, vs8
  1485. xsmaddadp vs34, vs0, vs9
  1486. xsmaddadp vs35, vs1, vs9
  1487. xsmaddadp vs36, vs0, vs10
  1488. xsmaddadp vs37, vs1, vs10
  1489. xsmaddadp vs38, vs0, vs11
  1490. xsmaddadp vs39, vs1, vs11
  1491. xsmaddadp vs40, vs0, vs12
  1492. xsmaddadp vs41, vs1, vs12
  1493. xsmaddadp vs42, vs0, vs13
  1494. xsmaddadp vs43, vs1, vs13
  1495. xsmaddadp vs44, vs0, vs14
  1496. xsmaddadp vs45, vs1, vs14
  1497. xsmaddadp vs46, vs0, vs15
  1498. xsmaddadp vs47, vs1, vs15
  1499. #if defined(_AIX)
  1500. ')
  1501. #else
  1502. .endm
  1503. #endif
  /*
   * SAVE8x2 (N=8, M=2): scale the sixteen scalar accumulators vs32..vs47 by
   * alpha_r and write them to the output tile addressed by CO.
   * T1 starts as a copy of CO. Each of the eight groups handles one pair
   * vs(32+2j)/vs(33+2j), stored at T1 + o0 and T1 + o4, after which T1 is
   * advanced by LDC (presumably the byte stride between consecutive
   * N indices of C - TODO confirm).
   *   - Without TRMMKERNEL: the two existing values are loaded into vs0/vs1
   *     and updated as vsX += acc * alpha_r before the store.
   *   - With TRMMKERNEL: nothing is loaded; vsX = acc * alpha_r is stored.
   * stxsspx stores in single precision. On exit CO has been advanced by
   * 8 bytes; vs0, vs1 and T1 are clobbered. alpha_r is defined outside
   * this section.
   */
  1504. #if defined(_AIX)
  1505. define(`SAVE8x2', `
  1506. #else
  1507. .macro SAVE8x2
  1508. #endif
  1509. mr T1, CO
  /* pair vs32/vs33 */
  1510. #ifndef TRMMKERNEL
  1511. lxsspx vs0, o0, T1
  1512. lxsspx vs1, o4, T1
  1513. #endif
  1514. #ifdef TRMMKERNEL
  1515. xsmuldp vs0, vs32, alpha_r
  1516. xsmuldp vs1, vs33, alpha_r
  1517. #else
  1518. xsmaddadp vs0, vs32, alpha_r
  1519. xsmaddadp vs1, vs33, alpha_r
  1520. #endif
  1521. stxsspx vs0, o0, T1
  1522. stxsspx vs1, o4, T1
  1523. add T1, T1, LDC
  /* pair vs34/vs35 */
  1524. #ifndef TRMMKERNEL
  1525. lxsspx vs0, o0, T1
  1526. lxsspx vs1, o4, T1
  1527. #endif
  1528. #ifdef TRMMKERNEL
  1529. xsmuldp vs0, vs34, alpha_r
  1530. xsmuldp vs1, vs35, alpha_r
  1531. #else
  1532. xsmaddadp vs0, vs34, alpha_r
  1533. xsmaddadp vs1, vs35, alpha_r
  1534. #endif
  1535. stxsspx vs0, o0, T1
  1536. stxsspx vs1, o4, T1
  1537. add T1, T1, LDC
  /* pair vs36/vs37 */
  1538. #ifndef TRMMKERNEL
  1539. lxsspx vs0, o0, T1
  1540. lxsspx vs1, o4, T1
  1541. #endif
  1542. #ifdef TRMMKERNEL
  1543. xsmuldp vs0, vs36, alpha_r
  1544. xsmuldp vs1, vs37, alpha_r
  1545. #else
  1546. xsmaddadp vs0, vs36, alpha_r
  1547. xsmaddadp vs1, vs37, alpha_r
  1548. #endif
  1549. stxsspx vs0, o0, T1
  1550. stxsspx vs1, o4, T1
  1551. add T1, T1, LDC
  /* pair vs38/vs39 */
  1552. #ifndef TRMMKERNEL
  1553. lxsspx vs0, o0, T1
  1554. lxsspx vs1, o4, T1
  1555. #endif
  1556. #ifdef TRMMKERNEL
  1557. xsmuldp vs0, vs38, alpha_r
  1558. xsmuldp vs1, vs39, alpha_r
  1559. #else
  1560. xsmaddadp vs0, vs38, alpha_r
  1561. xsmaddadp vs1, vs39, alpha_r
  1562. #endif
  1563. stxsspx vs0, o0, T1
  1564. stxsspx vs1, o4, T1
  1565. add T1, T1, LDC
  /* pair vs40/vs41 */
  1566. #ifndef TRMMKERNEL
  1567. lxsspx vs0, o0, T1
  1568. lxsspx vs1, o4, T1
  1569. #endif
  1570. #ifdef TRMMKERNEL
  1571. xsmuldp vs0, vs40, alpha_r
  1572. xsmuldp vs1, vs41, alpha_r
  1573. #else
  1574. xsmaddadp vs0, vs40, alpha_r
  1575. xsmaddadp vs1, vs41, alpha_r
  1576. #endif
  1577. stxsspx vs0, o0, T1
  1578. stxsspx vs1, o4, T1
  1579. add T1, T1, LDC
  /* pair vs42/vs43 */
  1580. #ifndef TRMMKERNEL
  1581. lxsspx vs0, o0, T1
  1582. lxsspx vs1, o4, T1
  1583. #endif
  1584. #ifdef TRMMKERNEL
  1585. xsmuldp vs0, vs42, alpha_r
  1586. xsmuldp vs1, vs43, alpha_r
  1587. #else
  1588. xsmaddadp vs0, vs42, alpha_r
  1589. xsmaddadp vs1, vs43, alpha_r
  1590. #endif
  1591. stxsspx vs0, o0, T1
  1592. stxsspx vs1, o4, T1
  1593. add T1, T1, LDC
  /* pair vs44/vs45 */
  1594. #ifndef TRMMKERNEL
  1595. lxsspx vs0, o0, T1
  1596. lxsspx vs1, o4, T1
  1597. #endif
  1598. #ifdef TRMMKERNEL
  1599. xsmuldp vs0, vs44, alpha_r
  1600. xsmuldp vs1, vs45, alpha_r
  1601. #else
  1602. xsmaddadp vs0, vs44, alpha_r
  1603. xsmaddadp vs1, vs45, alpha_r
  1604. #endif
  1605. stxsspx vs0, o0, T1
  1606. stxsspx vs1, o4, T1
  1607. add T1, T1, LDC
  /* pair vs46/vs47 */
  1608. #ifndef TRMMKERNEL
  1609. lxsspx vs0, o0, T1
  1610. lxsspx vs1, o4, T1
  1611. #endif
  1612. #ifdef TRMMKERNEL
  1613. xsmuldp vs0, vs46, alpha_r
  1614. xsmuldp vs1, vs47, alpha_r
  1615. #else
  1616. xsmaddadp vs0, vs46, alpha_r
  1617. xsmaddadp vs1, vs47, alpha_r
  1618. #endif
  1619. stxsspx vs0, o0, T1
  1620. stxsspx vs1, o4, T1
  /* T1 is not read again inside this macro after the next add */
  1621. add T1, T1, LDC
  /* step CO past the 2 floats just written */
  1622. addi CO, CO, 8
  1623. #if defined(_AIX)
  1624. ')
  1625. #else
  1626. .endm
  1627. #endif
  1628. /**********************************************************************************************
  1629. * Macros for N=8 and M=1
  1630. **********************************************************************************************/
  /*
   * LOAD8x1_1 (N=8, M=1): load-only step that fills the first register set.
   *   - Loads one scalar from AO into vs0, then advances AO by 4.
   *   - Copies BO to T1 and loads eight scalars into vs8..vs15: four at
   *     T1 + o0/o4/o8/o12, then four more after T1 += 16.
   *   - Advances BO by 32. T1 is clobbered.
   */
  1631. #if defined(_AIX)
  1632. define(`LOAD8x1_1', `
  1633. #else
  1634. .macro LOAD8x1_1
  1635. #endif
  1636. lxsspx vs0, o0, AO
  1637. addi AO, AO, 4
  1638. mr T1, BO
  1639. lxsspx vs8, o0, T1
  1640. lxsspx vs9, o4, T1
  1641. lxsspx vs10, o8, T1
  1642. lxsspx vs11, o12, T1
  1643. addi T1, T1, 16
  1644. lxsspx vs12, o0, T1
  1645. lxsspx vs13, o4, T1
  1646. lxsspx vs14, o8, T1
  1647. lxsspx vs15, o12, T1
  1648. addi BO, BO, 32
  1649. #if defined(_AIX)
  1650. ')
  1651. #else
  1652. .endm
  1653. #endif
  /*
   * KERNEL8x1_I1 (N=8, M=1): first pipelined step.
   *   - Loads the second register set: vs4 from AO (AO += 4) and vs16..vs23
   *     from 32 bytes at BO via T1 (BO += 32).
   *   - Initialises the accumulators from the first register set with
   *     xsmuldp (overwrite): vs(32+j) = vs0 * vs(8+j) for j = 0..7.
   * vs0 and vs8..vs15 are not loaded here; within this section LOAD8x1_1 is
   * the macro that fills exactly those registers. T1 is clobbered.
   */
  1654. #if defined(_AIX)
  1655. define(`KERNEL8x1_I1', `
  1656. #else
  1657. .macro KERNEL8x1_I1
  1658. #endif
  1659. lxsspx vs4, o0, AO
  1660. addi AO, AO, 4
  1661. mr T1, BO
  1662. lxsspx vs16, o0, T1
  1663. lxsspx vs17, o4, T1
  1664. lxsspx vs18, o8, T1
  1665. lxsspx vs19, o12, T1
  1666. addi T1, T1, 16
  1667. lxsspx vs20, o0, T1
  1668. lxsspx vs21, o4, T1
  1669. lxsspx vs22, o8, T1
  1670. lxsspx vs23, o12, T1
  1671. addi BO, BO, 32
  1672. xsmuldp vs32, vs0, vs8
  1673. xsmuldp vs33, vs0, vs9
  1674. xsmuldp vs34, vs0, vs10
  1675. xsmuldp vs35, vs0, vs11
  1676. xsmuldp vs36, vs0, vs12
  1677. xsmuldp vs37, vs0, vs13
  1678. xsmuldp vs38, vs0, vs14
  1679. xsmuldp vs39, vs0, vs15
  1680. #if defined(_AIX)
  1681. ')
  1682. #else
  1683. .endm
  1684. #endif
  /*
   * KERNEL8x1_1 (N=8, M=1): pipelined step that computes with the first
   * register set while refilling the second.
   *   - Loads vs4 from AO (AO += 4) and vs16..vs23 from 32 bytes at BO via
   *     T1 (BO += 32).
   *   - Accumulates vs(32+j) += vs0 * vs(8+j) for j = 0..7 (xsmaddadp).
   * T1 is clobbered.
   */
  1685. #if defined(_AIX)
  1686. define(`KERNEL8x1_1', `
  1687. #else
  1688. .macro KERNEL8x1_1
  1689. #endif
  1690. lxsspx vs4, o0, AO
  1691. addi AO, AO, 4
  1692. mr T1, BO
  1693. lxsspx vs16, o0, T1
  1694. lxsspx vs17, o4, T1
  1695. lxsspx vs18, o8, T1
  1696. lxsspx vs19, o12, T1
  1697. addi T1, T1, 16
  1698. lxsspx vs20, o0, T1
  1699. lxsspx vs21, o4, T1
  1700. lxsspx vs22, o8, T1
  1701. lxsspx vs23, o12, T1
  1702. addi BO, BO, 32
  1703. xsmaddadp vs32, vs0, vs8
  1704. xsmaddadp vs33, vs0, vs9
  1705. xsmaddadp vs34, vs0, vs10
  1706. xsmaddadp vs35, vs0, vs11
  1707. xsmaddadp vs36, vs0, vs12
  1708. xsmaddadp vs37, vs0, vs13
  1709. xsmaddadp vs38, vs0, vs14
  1710. xsmaddadp vs39, vs0, vs15
  1711. #if defined(_AIX)
  1712. ')
  1713. #else
  1714. .endm
  1715. #endif
  /*
   * KERNEL8x1_2 (N=8, M=1): pipelined step that computes with the second
   * register set while refilling the first (mirror of KERNEL8x1_1).
   *   - Loads vs0 from AO (AO += 4) and vs8..vs15 from 32 bytes at BO via
   *     T1 (BO += 32).
   *   - Accumulates vs(32+j) += vs4 * vs(16+j) for j = 0..7 (xsmaddadp).
   * T1 is clobbered.
   */
  1716. #if defined(_AIX)
  1717. define(`KERNEL8x1_2', `
  1718. #else
  1719. .macro KERNEL8x1_2
  1720. #endif
  1721. lxsspx vs0, o0, AO
  1722. addi AO, AO, 4
  1723. mr T1, BO
  1724. lxsspx vs8, o0, T1
  1725. lxsspx vs9, o4, T1
  1726. lxsspx vs10, o8, T1
  1727. lxsspx vs11, o12, T1
  1728. addi T1, T1, 16
  1729. lxsspx vs12, o0, T1
  1730. lxsspx vs13, o4, T1
  1731. lxsspx vs14, o8, T1
  1732. lxsspx vs15, o12, T1
  1733. addi BO, BO, 32
  1734. xsmaddadp vs32, vs4, vs16
  1735. xsmaddadp vs33, vs4, vs17
  1736. xsmaddadp vs34, vs4, vs18
  1737. xsmaddadp vs35, vs4, vs19
  1738. xsmaddadp vs36, vs4, vs20
  1739. xsmaddadp vs37, vs4, vs21
  1740. xsmaddadp vs38, vs4, vs22
  1741. xsmaddadp vs39, vs4, vs23
  1742. #if defined(_AIX)
  1743. ')
  1744. #else
  1745. .endm
  1746. #endif
  /*
   * KERNEL8x1_E2 (N=8, M=1): compute-only step.
   * Accumulates vs(32+j) += vs4 * vs(16+j) for j = 0..7 and performs no
   * loads, leaving AO, BO and T1 untouched.
   */
  1747. #if defined(_AIX)
  1748. define(`KERNEL8x1_E2', `
  1749. #else
  1750. .macro KERNEL8x1_E2
  1751. #endif
  1752. xsmaddadp vs32, vs4, vs16
  1753. xsmaddadp vs33, vs4, vs17
  1754. xsmaddadp vs34, vs4, vs18
  1755. xsmaddadp vs35, vs4, vs19
  1756. xsmaddadp vs36, vs4, vs20
  1757. xsmaddadp vs37, vs4, vs21
  1758. xsmaddadp vs38, vs4, vs22
  1759. xsmaddadp vs39, vs4, vs23
  1760. #if defined(_AIX)
  1761. ')
  1762. #else
  1763. .endm
  1764. #endif
  /*
   * KERNEL8x1_SUBI1 (N=8, M=1): self-contained initial step.
   *   - Loads vs0 from AO (AO += 4) and vs8..vs15 from 32 bytes at BO via
   *     T1 (BO += 32).
   *   - Overwrites vs(32+j) = vs0 * vs(8+j) for j = 0..7 (xsmuldp), so the
   *     accumulators need no prior clearing.
   * T1 is clobbered.
   */
  1765. #if defined(_AIX)
  1766. define(`KERNEL8x1_SUBI1', `
  1767. #else
  1768. .macro KERNEL8x1_SUBI1
  1769. #endif
  1770. lxsspx vs0, o0, AO
  1771. addi AO, AO, 4
  1772. mr T1, BO
  1773. lxsspx vs8, o0, T1
  1774. lxsspx vs9, o4, T1
  1775. lxsspx vs10, o8, T1
  1776. lxsspx vs11, o12, T1
  1777. addi T1, T1, 16
  1778. lxsspx vs12, o0, T1
  1779. lxsspx vs13, o4, T1
  1780. lxsspx vs14, o8, T1
  1781. lxsspx vs15, o12, T1
  1782. addi BO, BO, 32
  1783. xsmuldp vs32, vs0, vs8
  1784. xsmuldp vs33, vs0, vs9
  1785. xsmuldp vs34, vs0, vs10
  1786. xsmuldp vs35, vs0, vs11
  1787. xsmuldp vs36, vs0, vs12
  1788. xsmuldp vs37, vs0, vs13
  1789. xsmuldp vs38, vs0, vs14
  1790. xsmuldp vs39, vs0, vs15
  1791. #if defined(_AIX)
  1792. ')
  1793. #else
  1794. .endm
  1795. #endif
  /*
   * KERNEL8x1_SUB1 (N=8, M=1): self-contained accumulate step.
   * Same loads and pointer updates as KERNEL8x1_SUBI1 (vs0 from AO, AO += 4;
   * vs8..vs15 from BO via T1, BO += 32), but uses xsmaddadp so that
   * vs(32+j) += vs0 * vs(8+j) for j = 0..7. T1 is clobbered.
   */
  1796. #if defined(_AIX)
  1797. define(`KERNEL8x1_SUB1', `
  1798. #else
  1799. .macro KERNEL8x1_SUB1
  1800. #endif
  1801. lxsspx vs0, o0, AO
  1802. addi AO, AO, 4
  1803. mr T1, BO
  1804. lxsspx vs8, o0, T1
  1805. lxsspx vs9, o4, T1
  1806. lxsspx vs10, o8, T1
  1807. lxsspx vs11, o12, T1
  1808. addi T1, T1, 16
  1809. lxsspx vs12, o0, T1
  1810. lxsspx vs13, o4, T1
  1811. lxsspx vs14, o8, T1
  1812. lxsspx vs15, o12, T1
  1813. addi BO, BO, 32
  1814. xsmaddadp vs32, vs0, vs8
  1815. xsmaddadp vs33, vs0, vs9
  1816. xsmaddadp vs34, vs0, vs10
  1817. xsmaddadp vs35, vs0, vs11
  1818. xsmaddadp vs36, vs0, vs12
  1819. xsmaddadp vs37, vs0, vs13
  1820. xsmaddadp vs38, vs0, vs14
  1821. xsmaddadp vs39, vs0, vs15
  1822. #if defined(_AIX)
  1823. ')
  1824. #else
  1825. .endm
  1826. #endif
  /*
   * SAVE8x1 (N=8, M=1): scale the eight scalar accumulators vs32..vs39 by
   * alpha_r and write them to the output tile addressed by CO.
   * T1 starts as a copy of CO and is advanced by LDC after each store
   * (LDC is presumably the byte stride between consecutive N indices of C -
   * TODO confirm).
   *   - Without TRMMKERNEL: the existing value at T1 is loaded into vs0 and
   *     vs0 += acc * alpha_r before the store.
   *   - With TRMMKERNEL: nothing is loaded; vs0 = acc * alpha_r is stored.
   * On exit CO has been advanced by 4 bytes; vs0 and T1 are clobbered.
   * alpha_r is defined outside this section.
   */
  1827. #if defined(_AIX)
  1828. define(`SAVE8x1', `
  1829. #else
  1830. .macro SAVE8x1
  1831. #endif
  1832. mr T1, CO
  /* vs32 */
  1833. #ifndef TRMMKERNEL
  1834. lxsspx vs0, o0, T1
  1835. #endif
  1836. #ifdef TRMMKERNEL
  1837. xsmuldp vs0, vs32, alpha_r
  1838. #else
  1839. xsmaddadp vs0, vs32, alpha_r
  1840. #endif
  1841. stxsspx vs0, o0, T1
  1842. add T1, T1, LDC
  /* vs33 */
  1843. #ifndef TRMMKERNEL
  1844. lxsspx vs0, o0, T1
  1845. #endif
  1846. #ifdef TRMMKERNEL
  1847. xsmuldp vs0, vs33, alpha_r
  1848. #else
  1849. xsmaddadp vs0, vs33, alpha_r
  1850. #endif
  1851. stxsspx vs0, o0, T1
  1852. add T1, T1, LDC
  /* vs34 */
  1853. #ifndef TRMMKERNEL
  1854. lxsspx vs0, o0, T1
  1855. #endif
  1856. #ifdef TRMMKERNEL
  1857. xsmuldp vs0, vs34, alpha_r
  1858. #else
  1859. xsmaddadp vs0, vs34, alpha_r
  1860. #endif
  1861. stxsspx vs0, o0, T1
  1862. add T1, T1, LDC
  /* vs35 */
  1863. #ifndef TRMMKERNEL
  1864. lxsspx vs0, o0, T1
  1865. #endif
  1866. #ifdef TRMMKERNEL
  1867. xsmuldp vs0, vs35, alpha_r
  1868. #else
  1869. xsmaddadp vs0, vs35, alpha_r
  1870. #endif
  1871. stxsspx vs0, o0, T1
  1872. add T1, T1, LDC
  /* vs36 */
  1873. #ifndef TRMMKERNEL
  1874. lxsspx vs0, o0, T1
  1875. #endif
  1876. #ifdef TRMMKERNEL
  1877. xsmuldp vs0, vs36, alpha_r
  1878. #else
  1879. xsmaddadp vs0, vs36, alpha_r
  1880. #endif
  1881. stxsspx vs0, o0, T1
  1882. add T1, T1, LDC
  /* vs37 */
  1883. #ifndef TRMMKERNEL
  1884. lxsspx vs0, o0, T1
  1885. #endif
  1886. #ifdef TRMMKERNEL
  1887. xsmuldp vs0, vs37, alpha_r
  1888. #else
  1889. xsmaddadp vs0, vs37, alpha_r
  1890. #endif
  1891. stxsspx vs0, o0, T1
  1892. add T1, T1, LDC
  /* vs38 */
  1893. #ifndef TRMMKERNEL
  1894. lxsspx vs0, o0, T1
  1895. #endif
  1896. #ifdef TRMMKERNEL
  1897. xsmuldp vs0, vs38, alpha_r
  1898. #else
  1899. xsmaddadp vs0, vs38, alpha_r
  1900. #endif
  1901. stxsspx vs0, o0, T1
  1902. add T1, T1, LDC
  /* vs39 */
  1903. #ifndef TRMMKERNEL
  1904. lxsspx vs0, o0, T1
  1905. #endif
  1906. #ifdef TRMMKERNEL
  1907. xsmuldp vs0, vs39, alpha_r
  1908. #else
  1909. xsmaddadp vs0, vs39, alpha_r
  1910. #endif
  1911. stxsspx vs0, o0, T1
  /* T1 is not read again inside this macro after the next add */
  1912. add T1, T1, LDC
  /* step CO past the single float just written */
  1913. addi CO, CO, 4
  1914. #if defined(_AIX)
  1915. ')
  1916. #else
  1917. .endm
  1918. #endif
  1919. /**********************************************************************************************
  1920. * Macros for N=4 and M=16
  1921. **********************************************************************************************/
  /*
   * LOAD4x16_1 (N=4, M=16): load-only step that fills the first register set.
   *   - Loads four 16-byte vectors from AO (index registers o0, o16, o32,
   *     o48) into vs0..vs3, then advances AO by 64.
   *   - Loads one 16-byte vector from BO into vs28 and splats its four words
   *     into vs8..vs11, then advances BO by 16.
   * No arithmetic is performed.
   */
  1922. #if defined(_AIX)
  1923. define(`LOAD4x16_1', `
  1924. #else
  1925. .macro LOAD4x16_1
  1926. #endif
  1927. lxvw4x vs0, o0, AO
  1928. lxvw4x vs1, o16, AO
  1929. lxvw4x vs2, o32, AO
  1930. lxvw4x vs3, o48, AO
  1931. addi AO, AO, 64
  1932. lxvw4x vs28, o0, BO
  1933. xxspltw vs8, vs28, 0
  1934. xxspltw vs9, vs28, 1
  1935. xxspltw vs10, vs28, 2
  1936. xxspltw vs11, vs28, 3
  1937. addi BO, BO, 16
  1938. #if defined(_AIX)
  1939. ')
  1940. #else
  1941. .endm
  1942. #endif
  /*
   * KERNEL4x16_I1 (N=4, M=16): first pipelined step.
   *   - Loads the second register set: vs4..vs7 from 64 bytes at AO
   *     (AO += 64) and vs16..vs19 splatted from 16 bytes at BO (BO += 16).
   *   - Initialises the sixteen accumulators from the first register set with
   *     xvmulsp (overwrite): for j = 0..3 and i = 0..3,
   *       vs(32 + 4j + i) = vs(i) * vs(8+j).
   * vs0..vs3 and vs8..vs11 are not loaded here; within this section
   * LOAD4x16_1 is the macro that fills exactly those registers.
   */
  1943. #if defined(_AIX)
  1944. define(`KERNEL4x16_I1', `
  1945. #else
  1946. .macro KERNEL4x16_I1
  1947. #endif
  1948. lxvw4x vs4, o0, AO
  1949. lxvw4x vs5, o16, AO
  1950. lxvw4x vs6, o32, AO
  1951. lxvw4x vs7, o48, AO
  1952. addi AO, AO, 64
  1953. lxvw4x vs28, o0, BO
  1954. xxspltw vs16, vs28, 0
  1955. xxspltw vs17, vs28, 1
  1956. xxspltw vs18, vs28, 2
  1957. xxspltw vs19, vs28, 3
  1958. addi BO, BO, 16
  1959. xvmulsp vs32, vs0, vs8
  1960. xvmulsp vs33, vs1, vs8
  1961. xvmulsp vs34, vs2, vs8
  1962. xvmulsp vs35, vs3, vs8
  1963. xvmulsp vs36, vs0, vs9
  1964. xvmulsp vs37, vs1, vs9
  1965. xvmulsp vs38, vs2, vs9
  1966. xvmulsp vs39, vs3, vs9
  1967. xvmulsp vs40, vs0, vs10
  1968. xvmulsp vs41, vs1, vs10
  1969. xvmulsp vs42, vs2, vs10
  1970. xvmulsp vs43, vs3, vs10
  1971. xvmulsp vs44, vs0, vs11
  1972. xvmulsp vs45, vs1, vs11
  1973. xvmulsp vs46, vs2, vs11
  1974. xvmulsp vs47, vs3, vs11
  1975. #if defined(_AIX)
  1976. ')
  1977. #else
  1978. .endm
  1979. #endif
  /*
   * KERNEL4x16_1 (N=4, M=16): pipelined step that computes with the first
   * register set while refilling the second.
   *   - Loads vs4..vs7 from 64 bytes at AO (AO += 64) and vs16..vs19
   *     splatted from 16 bytes at BO (BO += 16).
   *   - Accumulates with xvmaddasp: for j = 0..3 and i = 0..3,
   *       vs(32 + 4j + i) += vs(i) * vs(8+j).
   */
  1980. #if defined(_AIX)
  1981. define(`KERNEL4x16_1', `
  1982. #else
  1983. .macro KERNEL4x16_1
  1984. #endif
  1985. lxvw4x vs4, o0, AO
  1986. lxvw4x vs5, o16, AO
  1987. lxvw4x vs6, o32, AO
  1988. lxvw4x vs7, o48, AO
  1989. addi AO, AO, 64
  1990. lxvw4x vs28, o0, BO
  1991. xxspltw vs16, vs28, 0
  1992. xxspltw vs17, vs28, 1
  1993. xxspltw vs18, vs28, 2
  1994. xxspltw vs19, vs28, 3
  1995. addi BO, BO, 16
  1996. xvmaddasp vs32, vs0, vs8
  1997. xvmaddasp vs33, vs1, vs8
  1998. xvmaddasp vs34, vs2, vs8
  1999. xvmaddasp vs35, vs3, vs8
  2000. xvmaddasp vs36, vs0, vs9
  2001. xvmaddasp vs37, vs1, vs9
  2002. xvmaddasp vs38, vs2, vs9
  2003. xvmaddasp vs39, vs3, vs9
  2004. xvmaddasp vs40, vs0, vs10
  2005. xvmaddasp vs41, vs1, vs10
  2006. xvmaddasp vs42, vs2, vs10
  2007. xvmaddasp vs43, vs3, vs10
  2008. xvmaddasp vs44, vs0, vs11
  2009. xvmaddasp vs45, vs1, vs11
  2010. xvmaddasp vs46, vs2, vs11
  2011. xvmaddasp vs47, vs3, vs11
  2012. #if defined(_AIX)
  2013. ')
  2014. #else
  2015. .endm
  2016. #endif
  /*
   * KERNEL4x16_2 (N=4, M=16): pipelined step that computes with the second
   * register set while refilling the first (mirror of KERNEL4x16_1).
   *   - Loads vs0..vs3 from 64 bytes at AO (AO += 64) and vs8..vs11
   *     splatted from 16 bytes at BO (BO += 16).
   *   - Accumulates with xvmaddasp: for j = 0..3 and i = 0..3,
   *       vs(32 + 4j + i) += vs(4+i) * vs(16+j).
   */
  2017. #if defined(_AIX)
  2018. define(`KERNEL4x16_2', `
  2019. #else
  2020. .macro KERNEL4x16_2
  2021. #endif
  2022. lxvw4x vs0, o0, AO
  2023. lxvw4x vs1, o16, AO
  2024. lxvw4x vs2, o32, AO
  2025. lxvw4x vs3, o48, AO
  2026. addi AO, AO, 64
  2027. lxvw4x vs28, o0, BO
  2028. xxspltw vs8, vs28, 0
  2029. xxspltw vs9, vs28, 1
  2030. xxspltw vs10, vs28, 2
  2031. xxspltw vs11, vs28, 3
  2032. addi BO, BO, 16
  2033. xvmaddasp vs32, vs4, vs16
  2034. xvmaddasp vs33, vs5, vs16
  2035. xvmaddasp vs34, vs6, vs16
  2036. xvmaddasp vs35, vs7, vs16
  2037. xvmaddasp vs36, vs4, vs17
  2038. xvmaddasp vs37, vs5, vs17
  2039. xvmaddasp vs38, vs6, vs17
  2040. xvmaddasp vs39, vs7, vs17
  2041. xvmaddasp vs40, vs4, vs18
  2042. xvmaddasp vs41, vs5, vs18
  2043. xvmaddasp vs42, vs6, vs18
  2044. xvmaddasp vs43, vs7, vs18
  2045. xvmaddasp vs44, vs4, vs19
  2046. xvmaddasp vs45, vs5, vs19
  2047. xvmaddasp vs46, vs6, vs19
  2048. xvmaddasp vs47, vs7, vs19
  2049. #if defined(_AIX)
  2050. ')
  2051. #else
  2052. .endm
  2053. #endif
  /*
   * KERNEL4x16_E2 (N=4, M=16): compute-only step.
   * Performs the same sixteen xvmaddasp accumulations as KERNEL4x16_2
   * (vs(32 + 4j + i) += vs(4+i) * vs(16+j) for j, i = 0..3) but issues no
   * loads and leaves AO and BO untouched.
   */
  2054. #if defined(_AIX)
  2055. define(`KERNEL4x16_E2', `
  2056. #else
  2057. .macro KERNEL4x16_E2
  2058. #endif
  2059. xvmaddasp vs32, vs4, vs16
  2060. xvmaddasp vs33, vs5, vs16
  2061. xvmaddasp vs34, vs6, vs16
  2062. xvmaddasp vs35, vs7, vs16
  2063. xvmaddasp vs36, vs4, vs17
  2064. xvmaddasp vs37, vs5, vs17
  2065. xvmaddasp vs38, vs6, vs17
  2066. xvmaddasp vs39, vs7, vs17
  2067. xvmaddasp vs40, vs4, vs18
  2068. xvmaddasp vs41, vs5, vs18
  2069. xvmaddasp vs42, vs6, vs18
  2070. xvmaddasp vs43, vs7, vs18
  2071. xvmaddasp vs44, vs4, vs19
  2072. xvmaddasp vs45, vs5, vs19
  2073. xvmaddasp vs46, vs6, vs19
  2074. xvmaddasp vs47, vs7, vs19
  2075. #if defined(_AIX)
  2076. ')
  2077. #else
  2078. .endm
  2079. #endif
  /*
   * KERNEL4x16_SUBI1 (N=4, M=16): self-contained initial step.
   *   - Loads vs0..vs3 from 64 bytes at AO (AO += 64) and vs8..vs11
   *     splatted from 16 bytes at BO (BO += 16).
   *   - Overwrites the accumulators with xvmulsp: for j = 0..3 and i = 0..3,
   *       vs(32 + 4j + i) = vs(i) * vs(8+j).
   * Because the products are written rather than added, vs32..vs47 need no
   * prior clearing.
   */
  2080. #if defined(_AIX)
  2081. define(`KERNEL4x16_SUBI1', `
  2082. #else
  2083. .macro KERNEL4x16_SUBI1
  2084. #endif
  2085. lxvw4x vs0, o0, AO
  2086. lxvw4x vs1, o16, AO
  2087. lxvw4x vs2, o32, AO
  2088. lxvw4x vs3, o48, AO
  2089. addi AO, AO, 64
  2090. lxvw4x vs28, o0, BO
  2091. xxspltw vs8, vs28, 0
  2092. xxspltw vs9, vs28, 1
  2093. xxspltw vs10, vs28, 2
  2094. xxspltw vs11, vs28, 3
  2095. addi BO, BO, 16
  2096. xvmulsp vs32, vs0, vs8
  2097. xvmulsp vs33, vs1, vs8
  2098. xvmulsp vs34, vs2, vs8
  2099. xvmulsp vs35, vs3, vs8
  2100. xvmulsp vs36, vs0, vs9
  2101. xvmulsp vs37, vs1, vs9
  2102. xvmulsp vs38, vs2, vs9
  2103. xvmulsp vs39, vs3, vs9
  2104. xvmulsp vs40, vs0, vs10
  2105. xvmulsp vs41, vs1, vs10
  2106. xvmulsp vs42, vs2, vs10
  2107. xvmulsp vs43, vs3, vs10
  2108. xvmulsp vs44, vs0, vs11
  2109. xvmulsp vs45, vs1, vs11
  2110. xvmulsp vs46, vs2, vs11
  2111. xvmulsp vs47, vs3, vs11
  2112. #if defined(_AIX)
  2113. ')
  2114. #else
  2115. .endm
  2116. #endif
  /*
   * KERNEL4x16_SUB1 (N=4, M=16): self-contained accumulate step.
   * Same loads and pointer updates as KERNEL4x16_SUBI1 (vs0..vs3 from AO,
   * AO += 64; vs8..vs11 splatted from BO, BO += 16), but uses xvmaddasp so
   * that for j = 0..3 and i = 0..3
   *   vs(32 + 4j + i) += vs(i) * vs(8+j).
   */
  2117. #if defined(_AIX)
  2118. define(`KERNEL4x16_SUB1', `
  2119. #else
  2120. .macro KERNEL4x16_SUB1
  2121. #endif
  2122. lxvw4x vs0, o0, AO
  2123. lxvw4x vs1, o16, AO
  2124. lxvw4x vs2, o32, AO
  2125. lxvw4x vs3, o48, AO
  2126. addi AO, AO, 64
  2127. lxvw4x vs28, o0, BO
  2128. xxspltw vs8, vs28, 0
  2129. xxspltw vs9, vs28, 1
  2130. xxspltw vs10, vs28, 2
  2131. xxspltw vs11, vs28, 3
  2132. addi BO, BO, 16
  2133. xvmaddasp vs32, vs0, vs8
  2134. xvmaddasp vs33, vs1, vs8
  2135. xvmaddasp vs34, vs2, vs8
  2136. xvmaddasp vs35, vs3, vs8
  2137. xvmaddasp vs36, vs0, vs9
  2138. xvmaddasp vs37, vs1, vs9
  2139. xvmaddasp vs38, vs2, vs9
  2140. xvmaddasp vs39, vs3, vs9
  2141. xvmaddasp vs40, vs0, vs10
  2142. xvmaddasp vs41, vs1, vs10
  2143. xvmaddasp vs42, vs2, vs10
  2144. xvmaddasp vs43, vs3, vs10
  2145. xvmaddasp vs44, vs0, vs11
  2146. xvmaddasp vs45, vs1, vs11
  2147. xvmaddasp vs46, vs2, vs11
  2148. xvmaddasp vs47, vs3, vs11
  2149. #if defined(_AIX)
  2150. ')
  2151. #else
  2152. .endm
  2153. #endif
  /*
   * SAVE4x16 (N=4, M=16): scale the sixteen vector accumulators vs32..vs47
   * by alpha_vr and write them to the output tile addressed by CO.
   * T1 starts as a copy of CO. Each of the four groups handles
   * vs(32+4j)..vs(35+4j), stored at T1 + o0/o16/o32/o48 (64 bytes), after
   * which T1 is advanced by LDC (presumably the byte stride between
   * consecutive N indices of C - TODO confirm).
   *   - Without TRMMKERNEL: the existing 64 bytes are loaded into vs0..vs3
   *     and updated as vsX += acc * alpha_vr before the store.
   *   - With TRMMKERNEL: nothing is loaded; vsX = acc * alpha_vr is stored.
   * On exit CO has been advanced by 64 bytes; vs0..vs3 and T1 are
   * clobbered. alpha_vr is defined outside this section.
   */
  2154. #if defined(_AIX)
  2155. define(`SAVE4x16', `
  2156. #else
  2157. .macro SAVE4x16
  2158. #endif
  2159. mr T1, CO
  /* group vs32..vs35 */
  2160. #ifndef TRMMKERNEL
  2161. lxvw4x vs0, o0, T1
  2162. lxvw4x vs1, o16, T1
  2163. lxvw4x vs2, o32, T1
  2164. lxvw4x vs3, o48, T1
  2165. #endif
  2166. #ifdef TRMMKERNEL
  2167. xvmulsp vs0, vs32, alpha_vr
  2168. xvmulsp vs1, vs33, alpha_vr
  2169. xvmulsp vs2, vs34, alpha_vr
  2170. xvmulsp vs3, vs35, alpha_vr
  2171. #else
  2172. xvmaddasp vs0, vs32, alpha_vr
  2173. xvmaddasp vs1, vs33, alpha_vr
  2174. xvmaddasp vs2, vs34, alpha_vr
  2175. xvmaddasp vs3, vs35, alpha_vr
  2176. #endif
  2177. stxvw4x vs0, o0, T1
  2178. stxvw4x vs1, o16, T1
  2179. stxvw4x vs2, o32, T1
  2180. stxvw4x vs3, o48, T1
  2181. add T1, T1, LDC
  /* group vs36..vs39 */
  2182. #ifndef TRMMKERNEL
  2183. lxvw4x vs0, o0, T1
  2184. lxvw4x vs1, o16, T1
  2185. lxvw4x vs2, o32, T1
  2186. lxvw4x vs3, o48, T1
  2187. #endif
  2188. #ifdef TRMMKERNEL
  2189. xvmulsp vs0, vs36, alpha_vr
  2190. xvmulsp vs1, vs37, alpha_vr
  2191. xvmulsp vs2, vs38, alpha_vr
  2192. xvmulsp vs3, vs39, alpha_vr
  2193. #else
  2194. xvmaddasp vs0, vs36, alpha_vr
  2195. xvmaddasp vs1, vs37, alpha_vr
  2196. xvmaddasp vs2, vs38, alpha_vr
  2197. xvmaddasp vs3, vs39, alpha_vr
  2198. #endif
  2199. stxvw4x vs0, o0, T1
  2200. stxvw4x vs1, o16, T1
  2201. stxvw4x vs2, o32, T1
  2202. stxvw4x vs3, o48, T1
  2203. add T1, T1, LDC
  /* group vs40..vs43 */
  2204. #ifndef TRMMKERNEL
  2205. lxvw4x vs0, o0, T1
  2206. lxvw4x vs1, o16, T1
  2207. lxvw4x vs2, o32, T1
  2208. lxvw4x vs3, o48, T1
  2209. #endif
  2210. #ifdef TRMMKERNEL
  2211. xvmulsp vs0, vs40, alpha_vr
  2212. xvmulsp vs1, vs41, alpha_vr
  2213. xvmulsp vs2, vs42, alpha_vr
  2214. xvmulsp vs3, vs43, alpha_vr
  2215. #else
  2216. xvmaddasp vs0, vs40, alpha_vr
  2217. xvmaddasp vs1, vs41, alpha_vr
  2218. xvmaddasp vs2, vs42, alpha_vr
  2219. xvmaddasp vs3, vs43, alpha_vr
  2220. #endif
  2221. stxvw4x vs0, o0, T1
  2222. stxvw4x vs1, o16, T1
  2223. stxvw4x vs2, o32, T1
  2224. stxvw4x vs3, o48, T1
  2225. add T1, T1, LDC
  /* group vs44..vs47 */
  2226. #ifndef TRMMKERNEL
  2227. lxvw4x vs0, o0, T1
  2228. lxvw4x vs1, o16, T1
  2229. lxvw4x vs2, o32, T1
  2230. lxvw4x vs3, o48, T1
  2231. #endif
  2232. #ifdef TRMMKERNEL
  2233. xvmulsp vs0, vs44, alpha_vr
  2234. xvmulsp vs1, vs45, alpha_vr
  2235. xvmulsp vs2, vs46, alpha_vr
  2236. xvmulsp vs3, vs47, alpha_vr
  2237. #else
  2238. xvmaddasp vs0, vs44, alpha_vr
  2239. xvmaddasp vs1, vs45, alpha_vr
  2240. xvmaddasp vs2, vs46, alpha_vr
  2241. xvmaddasp vs3, vs47, alpha_vr
  2242. #endif
  2243. stxvw4x vs0, o0, T1
  2244. stxvw4x vs1, o16, T1
  2245. stxvw4x vs2, o32, T1
  2246. stxvw4x vs3, o48, T1
  /* T1 is not read again inside this macro after the next add */
  2247. add T1, T1, LDC
  /* step CO past the 16 floats just written */
  2248. addi CO, CO, 64
  2249. #if defined(_AIX)
  2250. ')
  2251. #else
  2252. .endm
  2253. #endif
  2254. /**********************************************************************************************
  2255. * Macros for N=4 and M=8
  2256. **********************************************************************************************/
  /*
   * LOAD4x8_1 (N=4, M=8): load-only step that fills the first register set.
   *   - Loads two 16-byte vectors from AO (index registers o0 and o16) into
   *     vs0/vs1, then advances AO by 32.
   *   - Loads one 16-byte vector from BO into vs28 and splats its four words
   *     into vs8..vs11, then advances BO by 16.
   * No arithmetic is performed.
   */
  2257. #if defined(_AIX)
  2258. define(`LOAD4x8_1', `
  2259. #else
  2260. .macro LOAD4x8_1
  2261. #endif
  2262. lxvw4x vs0, o0, AO
  2263. lxvw4x vs1, o16, AO
  2264. addi AO, AO, 32
  2265. lxvw4x vs28, o0, BO
  2266. xxspltw vs8, vs28, 0
  2267. xxspltw vs9, vs28, 1
  2268. xxspltw vs10, vs28, 2
  2269. xxspltw vs11, vs28, 3
  2270. addi BO, BO, 16
  2271. #if defined(_AIX)
  2272. ')
  2273. #else
  2274. .endm
  2275. #endif
  /*
   * KERNEL4x8_I1 (N=4, M=8): first pipelined step.
   *   - Loads the second register set: vs4/vs5 from 32 bytes at AO
   *     (AO += 32) and vs16..vs19 splatted from 16 bytes at BO (BO += 16).
   *   - Initialises the eight accumulators from the first register set with
   *     xvmulsp (overwrite): for j = 0..3,
   *       vs(32+2j) = vs0 * vs(8+j)   and   vs(33+2j) = vs1 * vs(8+j).
   * vs0, vs1 and vs8..vs11 are not loaded here; within this section
   * LOAD4x8_1 is the macro that fills exactly those registers.
   */
  2276. #if defined(_AIX)
  2277. define(`KERNEL4x8_I1', `
  2278. #else
  2279. .macro KERNEL4x8_I1
  2280. #endif
  2281. lxvw4x vs4, o0, AO
  2282. lxvw4x vs5, o16, AO
  2283. addi AO, AO, 32
  2284. lxvw4x vs28, o0, BO
  2285. xxspltw vs16, vs28, 0
  2286. xxspltw vs17, vs28, 1
  2287. xxspltw vs18, vs28, 2
  2288. xxspltw vs19, vs28, 3
  2289. addi BO, BO, 16
  2290. xvmulsp vs32, vs0, vs8
  2291. xvmulsp vs33, vs1, vs8
  2292. xvmulsp vs34, vs0, vs9
  2293. xvmulsp vs35, vs1, vs9
  2294. xvmulsp vs36, vs0, vs10
  2295. xvmulsp vs37, vs1, vs10
  2296. xvmulsp vs38, vs0, vs11
  2297. xvmulsp vs39, vs1, vs11
  2298. #if defined(_AIX)
  2299. ')
  2300. #else
  2301. .endm
  2302. #endif
  /*
   * KERNEL4x8_1 (N=4, M=8): pipelined step that computes with the first
   * register set while refilling the second.
   *   - Loads vs4/vs5 from 32 bytes at AO (AO += 32) and vs16..vs19
   *     splatted from 16 bytes at BO (BO += 16).
   *   - Accumulates with xvmaddasp: for j = 0..3,
   *       vs(32+2j) += vs0 * vs(8+j)   and   vs(33+2j) += vs1 * vs(8+j).
   */
  2303. #if defined(_AIX)
  2304. define(`KERNEL4x8_1', `
  2305. #else
  2306. .macro KERNEL4x8_1
  2307. #endif
  2308. lxvw4x vs4, o0, AO
  2309. lxvw4x vs5, o16, AO
  2310. addi AO, AO, 32
  2311. lxvw4x vs28, o0, BO
  2312. xxspltw vs16, vs28, 0
  2313. xxspltw vs17, vs28, 1
  2314. xxspltw vs18, vs28, 2
  2315. xxspltw vs19, vs28, 3
  2316. addi BO, BO, 16
  2317. xvmaddasp vs32, vs0, vs8
  2318. xvmaddasp vs33, vs1, vs8
  2319. xvmaddasp vs34, vs0, vs9
  2320. xvmaddasp vs35, vs1, vs9
  2321. xvmaddasp vs36, vs0, vs10
  2322. xvmaddasp vs37, vs1, vs10
  2323. xvmaddasp vs38, vs0, vs11
  2324. xvmaddasp vs39, vs1, vs11
  2325. #if defined(_AIX)
  2326. ')
  2327. #else
  2328. .endm
  2329. #endif
  /*
   * KERNEL4x8_2: accumulating step, second half of the register ping-pong.
   * Loads the next A/B operands back into vs0-vs1 / vs8-vs11 and adds the
   * products of vs4-vs5 with vs16-vs19 into vs32-vs39.
   * Advances AO by 32 and BO by 16.
   */
  2330. #if defined(_AIX)
  2331. define(`KERNEL4x8_2', `
  2332. #else
  2333. .macro KERNEL4x8_2
  2334. #endif
  2335. lxvw4x vs0, o0, AO
  2336. lxvw4x vs1, o16, AO
  2337. addi AO, AO, 32
  2338. lxvw4x vs28, o0, BO
  2339. xxspltw vs8, vs28, 0
  2340. xxspltw vs9, vs28, 1
  2341. xxspltw vs10, vs28, 2
  2342. xxspltw vs11, vs28, 3
  2343. addi BO, BO, 16
  2344. xvmaddasp vs32, vs4, vs16
  2345. xvmaddasp vs33, vs5, vs16
  2346. xvmaddasp vs34, vs4, vs17
  2347. xvmaddasp vs35, vs5, vs17
  2348. xvmaddasp vs36, vs4, vs18
  2349. xvmaddasp vs37, vs5, vs18
  2350. xvmaddasp vs38, vs4, vs19
  2351. xvmaddasp vs39, vs5, vs19
  2352. #if defined(_AIX)
  2353. ')
  2354. #else
  2355. .endm
  2356. #endif
  /*
   * KERNEL4x8_E2: draining step for N=4, M=8.
   * Adds the products of the operands already held in vs4-vs5 / vs16-vs19
   * into vs32-vs39. Performs no loads and leaves AO and BO unchanged.
   */
  2357. #if defined(_AIX)
  2358. define(`KERNEL4x8_E2', `
  2359. #else
  2360. .macro KERNEL4x8_E2
  2361. #endif
  2362. xvmaddasp vs32, vs4, vs16
  2363. xvmaddasp vs33, vs5, vs16
  2364. xvmaddasp vs34, vs4, vs17
  2365. xvmaddasp vs35, vs5, vs17
  2366. xvmaddasp vs36, vs4, vs18
  2367. xvmaddasp vs37, vs5, vs18
  2368. xvmaddasp vs38, vs4, vs19
  2369. xvmaddasp vs39, vs5, vs19
  2370. #if defined(_AIX)
  2371. ')
  2372. #else
  2373. .endm
  2374. #endif
  /*
   * KERNEL4x8_SUBI1: self-contained initialising step for N=4, M=8.
   * Loads one A/B operand set into vs0-vs1 / vs8-vs11 and immediately
   * writes its products into vs32-vs39 (xvmulsp overwrites the target).
   * Does not touch the alternate set vs4-vs5 / vs16-vs19.
   * Advances AO by 32 and BO by 16.
   */
  2375. #if defined(_AIX)
  2376. define(`KERNEL4x8_SUBI1', `
  2377. #else
  2378. .macro KERNEL4x8_SUBI1
  2379. #endif
  2380. lxvw4x vs0, o0, AO
  2381. lxvw4x vs1, o16, AO
  2382. addi AO, AO, 32
  2383. lxvw4x vs28, o0, BO
  2384. xxspltw vs8, vs28, 0
  2385. xxspltw vs9, vs28, 1
  2386. xxspltw vs10, vs28, 2
  2387. xxspltw vs11, vs28, 3
  2388. addi BO, BO, 16
  2389. xvmulsp vs32, vs0, vs8
  2390. xvmulsp vs33, vs1, vs8
  2391. xvmulsp vs34, vs0, vs9
  2392. xvmulsp vs35, vs1, vs9
  2393. xvmulsp vs36, vs0, vs10
  2394. xvmulsp vs37, vs1, vs10
  2395. xvmulsp vs38, vs0, vs11
  2396. xvmulsp vs39, vs1, vs11
  2397. #if defined(_AIX)
  2398. ')
  2399. #else
  2400. .endm
  2401. #endif
  /*
   * KERNEL4x8_SUB1: self-contained accumulating step for N=4, M=8.
   * Loads one A/B operand set into vs0-vs1 / vs8-vs11 and immediately
   * adds its products into vs32-vs39 (xvmaddasp: target += a * b).
   * Advances AO by 32 and BO by 16.
   */
  2402. #if defined(_AIX)
  2403. define(`KERNEL4x8_SUB1', `
  2404. #else
  2405. .macro KERNEL4x8_SUB1
  2406. #endif
  2407. lxvw4x vs0, o0, AO
  2408. lxvw4x vs1, o16, AO
  2409. addi AO, AO, 32
  2410. lxvw4x vs28, o0, BO
  2411. xxspltw vs8, vs28, 0
  2412. xxspltw vs9, vs28, 1
  2413. xxspltw vs10, vs28, 2
  2414. xxspltw vs11, vs28, 3
  2415. addi BO, BO, 16
  2416. xvmaddasp vs32, vs0, vs8
  2417. xvmaddasp vs33, vs1, vs8
  2418. xvmaddasp vs34, vs0, vs9
  2419. xvmaddasp vs35, vs1, vs9
  2420. xvmaddasp vs36, vs0, vs10
  2421. xvmaddasp vs37, vs1, vs10
  2422. xvmaddasp vs38, vs0, vs11
  2423. xvmaddasp vs39, vs1, vs11
  2424. #if defined(_AIX)
  2425. ')
  2426. #else
  2427. .endm
  2428. #endif
  /*
   * SAVE4x8: write the 4x8 accumulator tile (vs32-vs39) to C.
   * T1 starts at CO and is stepped by LDC after each of the four
   * 32-byte lines. With TRMMKERNEL defined each line is stored as
   * acc * alpha_vr; otherwise the line is first loaded from C and
   * updated as C += acc * alpha_vr. Finally CO is advanced by 32 bytes.
   * Clobbers vs0, vs1 and T1.
   * NOTE(review): alpha_vr and LDC are defined outside this chunk;
   * presumably alpha splatted across a vector and the byte stride of C.
   */
  2429. #if defined(_AIX)
  2430. define(`SAVE4x8', `
  2431. #else
  2432. .macro SAVE4x8
  2433. #endif
  2434. mr T1, CO
  /* line 0: vs32, vs33 */
  2435. #ifndef TRMMKERNEL
  2436. lxvw4x vs0, o0, T1
  2437. lxvw4x vs1, o16, T1
  2438. #endif
  2439. #ifdef TRMMKERNEL
  2440. xvmulsp vs0, vs32, alpha_vr
  2441. xvmulsp vs1, vs33, alpha_vr
  2442. #else
  2443. xvmaddasp vs0, vs32, alpha_vr
  2444. xvmaddasp vs1, vs33, alpha_vr
  2445. #endif
  2446. stxvw4x vs0, o0, T1
  2447. stxvw4x vs1, o16, T1
  2448. add T1, T1, LDC
  /* line 1: vs34, vs35 */
  2449. #ifndef TRMMKERNEL
  2450. lxvw4x vs0, o0, T1
  2451. lxvw4x vs1, o16, T1
  2452. #endif
  2453. #ifdef TRMMKERNEL
  2454. xvmulsp vs0, vs34, alpha_vr
  2455. xvmulsp vs1, vs35, alpha_vr
  2456. #else
  2457. xvmaddasp vs0, vs34, alpha_vr
  2458. xvmaddasp vs1, vs35, alpha_vr
  2459. #endif
  2460. stxvw4x vs0, o0, T1
  2461. stxvw4x vs1, o16, T1
  2462. add T1, T1, LDC
  /* line 2: vs36, vs37 */
  2463. #ifndef TRMMKERNEL
  2464. lxvw4x vs0, o0, T1
  2465. lxvw4x vs1, o16, T1
  2466. #endif
  2467. #ifdef TRMMKERNEL
  2468. xvmulsp vs0, vs36, alpha_vr
  2469. xvmulsp vs1, vs37, alpha_vr
  2470. #else
  2471. xvmaddasp vs0, vs36, alpha_vr
  2472. xvmaddasp vs1, vs37, alpha_vr
  2473. #endif
  2474. stxvw4x vs0, o0, T1
  2475. stxvw4x vs1, o16, T1
  2476. add T1, T1, LDC
  /* line 3: vs38, vs39 */
  2477. #ifndef TRMMKERNEL
  2478. lxvw4x vs0, o0, T1
  2479. lxvw4x vs1, o16, T1
  2480. #endif
  2481. #ifdef TRMMKERNEL
  2482. xvmulsp vs0, vs38, alpha_vr
  2483. xvmulsp vs1, vs39, alpha_vr
  2484. #else
  2485. xvmaddasp vs0, vs38, alpha_vr
  2486. xvmaddasp vs1, vs39, alpha_vr
  2487. #endif
  2488. stxvw4x vs0, o0, T1
  2489. stxvw4x vs1, o16, T1
  /* T1 is stepped once more; the result is not read again in this macro */
  2490. add T1, T1, LDC
  2491. addi CO, CO, 32
  2492. #if defined(_AIX)
  2493. ')
  2494. #else
  2495. .endm
  2496. #endif
  2497. /**********************************************************************************************
  2498. * Macros for N=4 and M=4
  2499. **********************************************************************************************/
  /*
   * LOAD4x4_1: operand preload for the N=4, M=4 kernel; no arithmetic.
   * Loads 16 bytes (4 floats) at AO into vs0 and 16 bytes at BO into
   * vs28, then splats words 0-3 of vs28 into vs8-vs11.
   * Advances AO and BO by 16 each.
   * NOTE(review): o0 is defined outside this chunk; presumably an index
   * register holding byte offset 0.
   */
  2500. #if defined(_AIX)
  2501. define(`LOAD4x4_1', `
  2502. #else
  2503. .macro LOAD4x4_1
  2504. #endif
  2505. lxvw4x vs0, o0, AO
  2506. addi AO, AO, 16
  2507. lxvw4x vs28, o0, BO
  2508. xxspltw vs8, vs28, 0
  2509. xxspltw vs9, vs28, 1
  2510. xxspltw vs10, vs28, 2
  2511. xxspltw vs11, vs28, 3
  2512. addi BO, BO, 16
  2513. #if defined(_AIX)
  2514. ')
  2515. #else
  2516. .endm
  2517. #endif
  /*
   * KERNEL4x4_I1: initialising step for N=4, M=4.
   * Loads the next A/B operands into the alternate set (vs4, vs16-vs19)
   * and writes the products of vs0 with vs8-vs11 into vs32-vs35
   * (vs32 = A*b0 ... vs35 = A*b3). xvmulsp overwrites its target.
   * Advances AO and BO by 16 each.
   */
  2518. #if defined(_AIX)
  2519. define(`KERNEL4x4_I1', `
  2520. #else
  2521. .macro KERNEL4x4_I1
  2522. #endif
  2523. lxvw4x vs4, o0, AO
  2524. addi AO, AO, 16
  2525. lxvw4x vs28, o0, BO
  2526. xxspltw vs16, vs28, 0
  2527. xxspltw vs17, vs28, 1
  2528. xxspltw vs18, vs28, 2
  2529. xxspltw vs19, vs28, 3
  2530. addi BO, BO, 16
  2531. xvmulsp vs32, vs0, vs8
  2532. xvmulsp vs33, vs0, vs9
  2533. xvmulsp vs34, vs0, vs10
  2534. xvmulsp vs35, vs0, vs11
  2535. #if defined(_AIX)
  2536. ')
  2537. #else
  2538. .endm
  2539. #endif
  /*
   * KERNEL4x4_1: accumulating step, first half of the register ping-pong.
   * Loads the next A/B operands into vs4 / vs16-vs19 and adds the
   * products of vs0 with vs8-vs11 into vs32-vs35.
   * Advances AO and BO by 16 each.
   */
  2540. #if defined(_AIX)
  2541. define(`KERNEL4x4_1', `
  2542. #else
  2543. .macro KERNEL4x4_1
  2544. #endif
  2545. lxvw4x vs4, o0, AO
  2546. addi AO, AO, 16
  2547. lxvw4x vs28, o0, BO
  2548. xxspltw vs16, vs28, 0
  2549. xxspltw vs17, vs28, 1
  2550. xxspltw vs18, vs28, 2
  2551. xxspltw vs19, vs28, 3
  2552. addi BO, BO, 16
  2553. xvmaddasp vs32, vs0, vs8
  2554. xvmaddasp vs33, vs0, vs9
  2555. xvmaddasp vs34, vs0, vs10
  2556. xvmaddasp vs35, vs0, vs11
  2557. #if defined(_AIX)
  2558. ')
  2559. #else
  2560. .endm
  2561. #endif
  /*
   * KERNEL4x4_2: accumulating step, second half of the register ping-pong.
   * Loads the next A/B operands back into vs0 / vs8-vs11 and adds the
   * products of vs4 with vs16-vs19 into vs32-vs35.
   * Advances AO and BO by 16 each.
   */
  2562. #if defined(_AIX)
  2563. define(`KERNEL4x4_2', `
  2564. #else
  2565. .macro KERNEL4x4_2
  2566. #endif
  2567. lxvw4x vs0, o0, AO
  2568. addi AO, AO, 16
  2569. lxvw4x vs28, o0, BO
  2570. xxspltw vs8, vs28, 0
  2571. xxspltw vs9, vs28, 1
  2572. xxspltw vs10, vs28, 2
  2573. xxspltw vs11, vs28, 3
  2574. addi BO, BO, 16
  2575. xvmaddasp vs32, vs4, vs16
  2576. xvmaddasp vs33, vs4, vs17
  2577. xvmaddasp vs34, vs4, vs18
  2578. xvmaddasp vs35, vs4, vs19
  2579. #if defined(_AIX)
  2580. ')
  2581. #else
  2582. .endm
  2583. #endif
  /*
   * KERNEL4x4_E2: draining step for N=4, M=4.
   * Adds the products of the operands already held in vs4 / vs16-vs19
   * into vs32-vs35. Performs no loads and leaves AO and BO unchanged.
   */
  2584. #if defined(_AIX)
  2585. define(`KERNEL4x4_E2', `
  2586. #else
  2587. .macro KERNEL4x4_E2
  2588. #endif
  2589. xvmaddasp vs32, vs4, vs16
  2590. xvmaddasp vs33, vs4, vs17
  2591. xvmaddasp vs34, vs4, vs18
  2592. xvmaddasp vs35, vs4, vs19
  2593. #if defined(_AIX)
  2594. ')
  2595. #else
  2596. .endm
  2597. #endif
  /*
   * KERNEL4x4_SUBI1: self-contained initialising step for N=4, M=4.
   * Loads one A/B operand set into vs0 / vs8-vs11 and immediately writes
   * its products into vs32-vs35 (xvmulsp overwrites the target).
   * Advances AO and BO by 16 each.
   */
  2598. #if defined(_AIX)
  2599. define(`KERNEL4x4_SUBI1', `
  2600. #else
  2601. .macro KERNEL4x4_SUBI1
  2602. #endif
  2603. lxvw4x vs0, o0, AO
  2604. addi AO, AO, 16
  2605. lxvw4x vs28, o0, BO
  2606. xxspltw vs8, vs28, 0
  2607. xxspltw vs9, vs28, 1
  2608. xxspltw vs10, vs28, 2
  2609. xxspltw vs11, vs28, 3
  2610. addi BO, BO, 16
  2611. xvmulsp vs32, vs0, vs8
  2612. xvmulsp vs33, vs0, vs9
  2613. xvmulsp vs34, vs0, vs10
  2614. xvmulsp vs35, vs0, vs11
  2615. #if defined(_AIX)
  2616. ')
  2617. #else
  2618. .endm
  2619. #endif
  /*
   * KERNEL4x4_SUB1: self-contained accumulating step for N=4, M=4.
   * Loads one A/B operand set into vs0 / vs8-vs11 and immediately adds
   * its products into vs32-vs35. Advances AO and BO by 16 each.
   */
  2620. #if defined(_AIX)
  2621. define(`KERNEL4x4_SUB1', `
  2622. #else
  2623. .macro KERNEL4x4_SUB1
  2624. #endif
  2625. lxvw4x vs0, o0, AO
  2626. addi AO, AO, 16
  2627. lxvw4x vs28, o0, BO
  2628. xxspltw vs8, vs28, 0
  2629. xxspltw vs9, vs28, 1
  2630. xxspltw vs10, vs28, 2
  2631. xxspltw vs11, vs28, 3
  2632. addi BO, BO, 16
  2633. xvmaddasp vs32, vs0, vs8
  2634. xvmaddasp vs33, vs0, vs9
  2635. xvmaddasp vs34, vs0, vs10
  2636. xvmaddasp vs35, vs0, vs11
  2637. #if defined(_AIX)
  2638. ')
  2639. #else
  2640. .endm
  2641. #endif
  /*
   * SAVE4x4: write the 4x4 accumulator tile (vs32-vs35) to C.
   * T1 starts at CO and is stepped by LDC after each of the four
   * 16-byte lines. With TRMMKERNEL defined each line is stored as
   * acc * alpha_vr; otherwise the line is first loaded from C and
   * updated as C += acc * alpha_vr. Finally CO is advanced by 16 bytes.
   * Clobbers vs0 and T1.
   * NOTE(review): alpha_vr and LDC are defined outside this chunk;
   * presumably alpha splatted across a vector and the byte stride of C.
   */
  2642. #if defined(_AIX)
  2643. define(`SAVE4x4', `
  2644. #else
  2645. .macro SAVE4x4
  2646. #endif
  2647. mr T1, CO
  /* line 0: vs32 */
  2648. #ifndef TRMMKERNEL
  2649. lxvw4x vs0, o0, T1
  2650. #endif
  2651. #ifdef TRMMKERNEL
  2652. xvmulsp vs0, vs32, alpha_vr
  2653. #else
  2654. xvmaddasp vs0, vs32, alpha_vr
  2655. #endif
  2656. stxvw4x vs0, o0, T1
  2657. add T1, T1, LDC
  /* line 1: vs33 */
  2658. #ifndef TRMMKERNEL
  2659. lxvw4x vs0, o0, T1
  2660. #endif
  2661. #ifdef TRMMKERNEL
  2662. xvmulsp vs0, vs33, alpha_vr
  2663. #else
  2664. xvmaddasp vs0, vs33, alpha_vr
  2665. #endif
  2666. stxvw4x vs0, o0, T1
  2667. add T1, T1, LDC
  /* line 2: vs34 */
  2668. #ifndef TRMMKERNEL
  2669. lxvw4x vs0, o0, T1
  2670. #endif
  2671. #ifdef TRMMKERNEL
  2672. xvmulsp vs0, vs34, alpha_vr
  2673. #else
  2674. xvmaddasp vs0, vs34, alpha_vr
  2675. #endif
  2676. stxvw4x vs0, o0, T1
  2677. add T1, T1, LDC
  /* line 3: vs35 */
  2678. #ifndef TRMMKERNEL
  2679. lxvw4x vs0, o0, T1
  2680. #endif
  2681. #ifdef TRMMKERNEL
  2682. xvmulsp vs0, vs35, alpha_vr
  2683. #else
  2684. xvmaddasp vs0, vs35, alpha_vr
  2685. #endif
  2686. stxvw4x vs0, o0, T1
  /* T1 is stepped once more; the result is not read again in this macro */
  2687. add T1, T1, LDC
  2688. addi CO, CO, 16
  2689. #if defined(_AIX)
  2690. ')
  2691. #else
  2692. .endm
  2693. #endif
  2694. /**********************************************************************************************
  2695. * Macros for N=4 and M=2
  2696. **********************************************************************************************/
  /*
   * LOAD4x2_1: scalar operand preload for the N=4, M=2 kernel; no
   * arithmetic. Loads two single-precision values at AO into vs0-vs1
   * and four at BO (addressed through T1) into vs8-vs11.
   * Advances AO by 8 and BO by 16. Clobbers T1.
   * Per the Power ISA, lxsspx keeps the loaded value in double-precision
   * format in the register.
   * NOTE(review): o0/o4/o8/o12 are defined outside this chunk; presumably
   * index registers holding byte offsets 0, 4, 8 and 12.
   */
  2697. #if defined(_AIX)
  2698. define(`LOAD4x2_1', `
  2699. #else
  2700. .macro LOAD4x2_1
  2701. #endif
  2702. lxsspx vs0, o0, AO
  2703. lxsspx vs1, o4, AO
  2704. addi AO, AO, 8
  2705. mr T1, BO
  2706. lxsspx vs8, o0, T1
  2707. lxsspx vs9, o4, T1
  2708. lxsspx vs10, o8, T1
  2709. lxsspx vs11, o12, T1
  2710. addi BO, BO, 16
  2711. #if defined(_AIX)
  2712. ')
  2713. #else
  2714. .endm
  2715. #endif
  /*
   * KERNEL4x2_I1: initialising scalar step for N=4, M=2.
   * Loads the next A/B scalars into the alternate set (vs4-vs5,
   * vs16-vs19) and writes the products of vs0-vs1 with vs8-vs11 into
   * vs32-vs39 (xsmuldp overwrites its target).
   * Layout: vs32,vs33 = a0*b0, a1*b0; vs34,vs35 = a0*b1, a1*b1;
   * vs36,vs37 = a0*b2, a1*b2; vs38,vs39 = a0*b3, a1*b3.
   * Advances AO by 8 and BO by 16. Clobbers T1.
   */
  2716. #if defined(_AIX)
  2717. define(`KERNEL4x2_I1', `
  2718. #else
  2719. .macro KERNEL4x2_I1
  2720. #endif
  2721. lxsspx vs4, o0, AO
  2722. lxsspx vs5, o4, AO
  2723. addi AO, AO, 8
  2724. mr T1, BO
  2725. lxsspx vs16, o0, T1
  2726. lxsspx vs17, o4, T1
  2727. lxsspx vs18, o8, T1
  2728. lxsspx vs19, o12, T1
  2729. addi BO, BO, 16
  2730. xsmuldp vs32, vs0, vs8
  2731. xsmuldp vs33, vs1, vs8
  2732. xsmuldp vs34, vs0, vs9
  2733. xsmuldp vs35, vs1, vs9
  2734. xsmuldp vs36, vs0, vs10
  2735. xsmuldp vs37, vs1, vs10
  2736. xsmuldp vs38, vs0, vs11
  2737. xsmuldp vs39, vs1, vs11
  2738. #if defined(_AIX)
  2739. ')
  2740. #else
  2741. .endm
  2742. #endif
  /*
   * KERNEL4x2_1: accumulating scalar step, first half of the register
   * ping-pong. Loads the next A/B scalars into vs4-vs5 / vs16-vs19 and
   * adds the products of vs0-vs1 with vs8-vs11 into vs32-vs39
   * (xsmaddadp: target += a * b).
   * Advances AO by 8 and BO by 16. Clobbers T1.
   */
  2743. #if defined(_AIX)
  2744. define(`KERNEL4x2_1', `
  2745. #else
  2746. .macro KERNEL4x2_1
  2747. #endif
  2748. lxsspx vs4, o0, AO
  2749. lxsspx vs5, o4, AO
  2750. addi AO, AO, 8
  2751. mr T1, BO
  2752. lxsspx vs16, o0, T1
  2753. lxsspx vs17, o4, T1
  2754. lxsspx vs18, o8, T1
  2755. lxsspx vs19, o12, T1
  2756. addi BO, BO, 16
  2757. xsmaddadp vs32, vs0, vs8
  2758. xsmaddadp vs33, vs1, vs8
  2759. xsmaddadp vs34, vs0, vs9
  2760. xsmaddadp vs35, vs1, vs9
  2761. xsmaddadp vs36, vs0, vs10
  2762. xsmaddadp vs37, vs1, vs10
  2763. xsmaddadp vs38, vs0, vs11
  2764. xsmaddadp vs39, vs1, vs11
  2765. #if defined(_AIX)
  2766. ')
  2767. #else
  2768. .endm
  2769. #endif
  /*
   * KERNEL4x2_2: accumulating scalar step, second half of the register
   * ping-pong. Loads the next A/B scalars back into vs0-vs1 / vs8-vs11
   * and adds the products of vs4-vs5 with vs16-vs19 into vs32-vs39.
   * Advances AO by 8 and BO by 16. Clobbers T1.
   */
  2770. #if defined(_AIX)
  2771. define(`KERNEL4x2_2', `
  2772. #else
  2773. .macro KERNEL4x2_2
  2774. #endif
  2775. lxsspx vs0, o0, AO
  2776. lxsspx vs1, o4, AO
  2777. addi AO, AO, 8
  2778. mr T1, BO
  2779. lxsspx vs8, o0, T1
  2780. lxsspx vs9, o4, T1
  2781. lxsspx vs10, o8, T1
  2782. lxsspx vs11, o12, T1
  2783. addi BO, BO, 16
  2784. xsmaddadp vs32, vs4, vs16
  2785. xsmaddadp vs33, vs5, vs16
  2786. xsmaddadp vs34, vs4, vs17
  2787. xsmaddadp vs35, vs5, vs17
  2788. xsmaddadp vs36, vs4, vs18
  2789. xsmaddadp vs37, vs5, vs18
  2790. xsmaddadp vs38, vs4, vs19
  2791. xsmaddadp vs39, vs5, vs19
  2792. #if defined(_AIX)
  2793. ')
  2794. #else
  2795. .endm
  2796. #endif
  /*
   * KERNEL4x2_E2: draining scalar step for N=4, M=2.
   * Adds the products of the scalars already held in vs4-vs5 / vs16-vs19
   * into vs32-vs39. Performs no loads and leaves AO, BO and T1 unchanged.
   */
  2797. #if defined(_AIX)
  2798. define(`KERNEL4x2_E2', `
  2799. #else
  2800. .macro KERNEL4x2_E2
  2801. #endif
  2802. xsmaddadp vs32, vs4, vs16
  2803. xsmaddadp vs33, vs5, vs16
  2804. xsmaddadp vs34, vs4, vs17
  2805. xsmaddadp vs35, vs5, vs17
  2806. xsmaddadp vs36, vs4, vs18
  2807. xsmaddadp vs37, vs5, vs18
  2808. xsmaddadp vs38, vs4, vs19
  2809. xsmaddadp vs39, vs5, vs19
  2810. #if defined(_AIX)
  2811. ')
  2812. #else
  2813. .endm
  2814. #endif
  /*
   * KERNEL4x2_SUBI1: self-contained initialising scalar step for N=4, M=2.
   * Loads one A/B scalar set into vs0-vs1 / vs8-vs11 and immediately
   * writes its products into vs32-vs39 (xsmuldp overwrites the target).
   * Advances AO by 8 and BO by 16. Clobbers T1.
   */
  2815. #if defined(_AIX)
  2816. define(`KERNEL4x2_SUBI1', `
  2817. #else
  2818. .macro KERNEL4x2_SUBI1
  2819. #endif
  2820. lxsspx vs0, o0, AO
  2821. lxsspx vs1, o4, AO
  2822. addi AO, AO, 8
  2823. mr T1, BO
  2824. lxsspx vs8, o0, T1
  2825. lxsspx vs9, o4, T1
  2826. lxsspx vs10, o8, T1
  2827. lxsspx vs11, o12, T1
  2828. addi BO, BO, 16
  2829. xsmuldp vs32, vs0, vs8
  2830. xsmuldp vs33, vs1, vs8
  2831. xsmuldp vs34, vs0, vs9
  2832. xsmuldp vs35, vs1, vs9
  2833. xsmuldp vs36, vs0, vs10
  2834. xsmuldp vs37, vs1, vs10
  2835. xsmuldp vs38, vs0, vs11
  2836. xsmuldp vs39, vs1, vs11
  2837. #if defined(_AIX)
  2838. ')
  2839. #else
  2840. .endm
  2841. #endif
  /*
   * KERNEL4x2_SUB1: self-contained accumulating scalar step for N=4, M=2.
   * Loads one A/B scalar set into vs0-vs1 / vs8-vs11 and immediately adds
   * its products into vs32-vs39.
   * Advances AO by 8 and BO by 16. Clobbers T1.
   */
  2842. #if defined(_AIX)
  2843. define(`KERNEL4x2_SUB1', `
  2844. #else
  2845. .macro KERNEL4x2_SUB1
  2846. #endif
  2847. lxsspx vs0, o0, AO
  2848. lxsspx vs1, o4, AO
  2849. addi AO, AO, 8
  2850. mr T1, BO
  2851. lxsspx vs8, o0, T1
  2852. lxsspx vs9, o4, T1
  2853. lxsspx vs10, o8, T1
  2854. lxsspx vs11, o12, T1
  2855. addi BO, BO, 16
  2856. xsmaddadp vs32, vs0, vs8
  2857. xsmaddadp vs33, vs1, vs8
  2858. xsmaddadp vs34, vs0, vs9
  2859. xsmaddadp vs35, vs1, vs9
  2860. xsmaddadp vs36, vs0, vs10
  2861. xsmaddadp vs37, vs1, vs10
  2862. xsmaddadp vs38, vs0, vs11
  2863. xsmaddadp vs39, vs1, vs11
  2864. #if defined(_AIX)
  2865. ')
  2866. #else
  2867. .endm
  2868. #endif
  /*
   * SAVE4x2: write the 4x2 scalar accumulator tile (vs32-vs39) to C.
   * T1 starts at CO and is stepped by LDC after each of the four
   * 8-byte lines (two single-precision values each). With TRMMKERNEL
   * defined each value is stored as acc * alpha_r; otherwise it is first
   * loaded from C and updated as C += acc * alpha_r. stxsspx stores the
   * result as single precision. Finally CO is advanced by 8 bytes.
   * Clobbers vs0, vs1 and T1.
   * NOTE(review): alpha_r and LDC are defined outside this chunk;
   * presumably the scalar alpha and the byte stride of C.
   */
  2869. #if defined(_AIX)
  2870. define(`SAVE4x2', `
  2871. #else
  2872. .macro SAVE4x2
  2873. #endif
  2874. mr T1, CO
  /* line 0: vs32, vs33 */
  2875. #ifndef TRMMKERNEL
  2876. lxsspx vs0, o0, T1
  2877. lxsspx vs1, o4, T1
  2878. #endif
  2879. #ifdef TRMMKERNEL
  2880. xsmuldp vs0, vs32, alpha_r
  2881. xsmuldp vs1, vs33, alpha_r
  2882. #else
  2883. xsmaddadp vs0, vs32, alpha_r
  2884. xsmaddadp vs1, vs33, alpha_r
  2885. #endif
  2886. stxsspx vs0, o0, T1
  2887. stxsspx vs1, o4, T1
  2888. add T1, T1, LDC
  /* line 1: vs34, vs35 */
  2889. #ifndef TRMMKERNEL
  2890. lxsspx vs0, o0, T1
  2891. lxsspx vs1, o4, T1
  2892. #endif
  2893. #ifdef TRMMKERNEL
  2894. xsmuldp vs0, vs34, alpha_r
  2895. xsmuldp vs1, vs35, alpha_r
  2896. #else
  2897. xsmaddadp vs0, vs34, alpha_r
  2898. xsmaddadp vs1, vs35, alpha_r
  2899. #endif
  2900. stxsspx vs0, o0, T1
  2901. stxsspx vs1, o4, T1
  2902. add T1, T1, LDC
  /* line 2: vs36, vs37 */
  2903. #ifndef TRMMKERNEL
  2904. lxsspx vs0, o0, T1
  2905. lxsspx vs1, o4, T1
  2906. #endif
  2907. #ifdef TRMMKERNEL
  2908. xsmuldp vs0, vs36, alpha_r
  2909. xsmuldp vs1, vs37, alpha_r
  2910. #else
  2911. xsmaddadp vs0, vs36, alpha_r
  2912. xsmaddadp vs1, vs37, alpha_r
  2913. #endif
  2914. stxsspx vs0, o0, T1
  2915. stxsspx vs1, o4, T1
  2916. add T1, T1, LDC
  /* line 3: vs38, vs39 */
  2917. #ifndef TRMMKERNEL
  2918. lxsspx vs0, o0, T1
  2919. lxsspx vs1, o4, T1
  2920. #endif
  2921. #ifdef TRMMKERNEL
  2922. xsmuldp vs0, vs38, alpha_r
  2923. xsmuldp vs1, vs39, alpha_r
  2924. #else
  2925. xsmaddadp vs0, vs38, alpha_r
  2926. xsmaddadp vs1, vs39, alpha_r
  2927. #endif
  2928. stxsspx vs0, o0, T1
  2929. stxsspx vs1, o4, T1
  /* T1 is stepped once more; the result is not read again in this macro */
  2930. add T1, T1, LDC
  2931. addi CO, CO, 8
  2932. #if defined(_AIX)
  2933. ')
  2934. #else
  2935. .endm
  2936. #endif
  2937. /**********************************************************************************************
  2938. * Macros for N=4 and M=1
  2939. **********************************************************************************************/
  /*
   * LOAD4x1_1: scalar operand preload for the N=4, M=1 kernel; no
   * arithmetic. Loads one single-precision value at AO into vs0 and four
   * at BO (addressed through T1) into vs8-vs11.
   * Advances AO by 4 and BO by 16. Clobbers T1.
   * NOTE(review): o0/o4/o8/o12 are defined outside this chunk; presumably
   * index registers holding byte offsets 0, 4, 8 and 12.
   */
  2940. #if defined(_AIX)
  2941. define(`LOAD4x1_1', `
  2942. #else
  2943. .macro LOAD4x1_1
  2944. #endif
  2945. lxsspx vs0, o0, AO
  2946. addi AO, AO, 4
  2947. mr T1, BO
  2948. lxsspx vs8, o0, T1
  2949. lxsspx vs9, o4, T1
  2950. lxsspx vs10, o8, T1
  2951. lxsspx vs11, o12, T1
  2952. addi BO, BO, 16
  2953. #if defined(_AIX)
  2954. ')
  2955. #else
  2956. .endm
  2957. #endif
  /*
   * KERNEL4x1_I1: initialising scalar step for N=4, M=1.
   * Loads the next A/B scalars into the alternate set (vs4, vs16-vs19)
   * and writes the products of vs0 with vs8-vs11 into vs32-vs35
   * (vs32 = a*b0 ... vs35 = a*b3). xsmuldp overwrites its target.
   * Advances AO by 4 and BO by 16. Clobbers T1.
   */
  2958. #if defined(_AIX)
  2959. define(`KERNEL4x1_I1', `
  2960. #else
  2961. .macro KERNEL4x1_I1
  2962. #endif
  2963. lxsspx vs4, o0, AO
  2964. addi AO, AO, 4
  2965. mr T1, BO
  2966. lxsspx vs16, o0, T1
  2967. lxsspx vs17, o4, T1
  2968. lxsspx vs18, o8, T1
  2969. lxsspx vs19, o12, T1
  2970. addi BO, BO, 16
  2971. xsmuldp vs32, vs0, vs8
  2972. xsmuldp vs33, vs0, vs9
  2973. xsmuldp vs34, vs0, vs10
  2974. xsmuldp vs35, vs0, vs11
  2975. #if defined(_AIX)
  2976. ')
  2977. #else
  2978. .endm
  2979. #endif
  /*
   * KERNEL4x1_1: accumulating scalar step, first half of the register
   * ping-pong. Loads the next A/B scalars into vs4 / vs16-vs19 and adds
   * the products of vs0 with vs8-vs11 into vs32-vs35.
   * Advances AO by 4 and BO by 16. Clobbers T1.
   */
  2980. #if defined(_AIX)
  2981. define(`KERNEL4x1_1', `
  2982. #else
  2983. .macro KERNEL4x1_1
  2984. #endif
  2985. lxsspx vs4, o0, AO
  2986. addi AO, AO, 4
  2987. mr T1, BO
  2988. lxsspx vs16, o0, T1
  2989. lxsspx vs17, o4, T1
  2990. lxsspx vs18, o8, T1
  2991. lxsspx vs19, o12, T1
  2992. addi BO, BO, 16
  2993. xsmaddadp vs32, vs0, vs8
  2994. xsmaddadp vs33, vs0, vs9
  2995. xsmaddadp vs34, vs0, vs10
  2996. xsmaddadp vs35, vs0, vs11
  2997. #if defined(_AIX)
  2998. ')
  2999. #else
  3000. .endm
  3001. #endif
  /*
   * KERNEL4x1_2: accumulating scalar step, second half of the register
   * ping-pong. Loads the next A/B scalars back into vs0 / vs8-vs11 and
   * adds the products of vs4 with vs16-vs19 into vs32-vs35.
   * Advances AO by 4 and BO by 16. Clobbers T1.
   */
  3002. #if defined(_AIX)
  3003. define(`KERNEL4x1_2', `
  3004. #else
  3005. .macro KERNEL4x1_2
  3006. #endif
  3007. lxsspx vs0, o0, AO
  3008. addi AO, AO, 4
  3009. mr T1, BO
  3010. lxsspx vs8, o0, T1
  3011. lxsspx vs9, o4, T1
  3012. lxsspx vs10, o8, T1
  3013. lxsspx vs11, o12, T1
  3014. addi BO, BO, 16
  3015. xsmaddadp vs32, vs4, vs16
  3016. xsmaddadp vs33, vs4, vs17
  3017. xsmaddadp vs34, vs4, vs18
  3018. xsmaddadp vs35, vs4, vs19
  3019. #if defined(_AIX)
  3020. ')
  3021. #else
  3022. .endm
  3023. #endif
  /*
   * KERNEL4x1_E2: draining scalar step for N=4, M=1.
   * Adds the products of the scalars already held in vs4 / vs16-vs19
   * into vs32-vs35. Performs no loads and leaves AO, BO and T1 unchanged.
   */
  3024. #if defined(_AIX)
  3025. define(`KERNEL4x1_E2', `
  3026. #else
  3027. .macro KERNEL4x1_E2
  3028. #endif
  3029. xsmaddadp vs32, vs4, vs16
  3030. xsmaddadp vs33, vs4, vs17
  3031. xsmaddadp vs34, vs4, vs18
  3032. xsmaddadp vs35, vs4, vs19
  3033. #if defined(_AIX)
  3034. ')
  3035. #else
  3036. .endm
  3037. #endif
  /*
   * KERNEL4x1_SUBI1: self-contained initialising scalar step for N=4, M=1.
   * Loads one A/B scalar set into vs0 / vs8-vs11 and immediately writes
   * its products into vs32-vs35 (xsmuldp overwrites the target).
   * Advances AO by 4 and BO by 16. Clobbers T1.
   */
  3038. #if defined(_AIX)
  3039. define(`KERNEL4x1_SUBI1', `
  3040. #else
  3041. .macro KERNEL4x1_SUBI1
  3042. #endif
  3043. lxsspx vs0, o0, AO
  3044. addi AO, AO, 4
  3045. mr T1, BO
  3046. lxsspx vs8, o0, T1
  3047. lxsspx vs9, o4, T1
  3048. lxsspx vs10, o8, T1
  3049. lxsspx vs11, o12, T1
  3050. addi BO, BO, 16
  3051. xsmuldp vs32, vs0, vs8
  3052. xsmuldp vs33, vs0, vs9
  3053. xsmuldp vs34, vs0, vs10
  3054. xsmuldp vs35, vs0, vs11
  3055. #if defined(_AIX)
  3056. ')
  3057. #else
  3058. .endm
  3059. #endif
  /*
   * KERNEL4x1_SUB1: self-contained accumulating scalar step for N=4, M=1.
   * Loads one A/B scalar set into vs0 / vs8-vs11 and immediately adds its
   * products into vs32-vs35.
   * Advances AO by 4 and BO by 16. Clobbers T1.
   */
  3060. #if defined(_AIX)
  3061. define(`KERNEL4x1_SUB1', `
  3062. #else
  3063. .macro KERNEL4x1_SUB1
  3064. #endif
  3065. lxsspx vs0, o0, AO
  3066. addi AO, AO, 4
  3067. mr T1, BO
  3068. lxsspx vs8, o0, T1
  3069. lxsspx vs9, o4, T1
  3070. lxsspx vs10, o8, T1
  3071. lxsspx vs11, o12, T1
  3072. addi BO, BO, 16
  3073. xsmaddadp vs32, vs0, vs8
  3074. xsmaddadp vs33, vs0, vs9
  3075. xsmaddadp vs34, vs0, vs10
  3076. xsmaddadp vs35, vs0, vs11
  3077. #if defined(_AIX)
  3078. ')
  3079. #else
  3080. .endm
  3081. #endif
  /*
   * SAVE4x1: write the 4x1 scalar accumulator tile (vs32-vs35) to C.
   * T1 starts at CO and is stepped by LDC after each of the four
   * single-precision values. With TRMMKERNEL defined each value is stored
   * as acc * alpha_r; otherwise it is first loaded from C and updated as
   * C += acc * alpha_r. Finally CO is advanced by 4 bytes.
   * Clobbers vs0 and T1.
   * NOTE(review): alpha_r and LDC are defined outside this chunk;
   * presumably the scalar alpha and the byte stride of C.
   */
  3082. #if defined(_AIX)
  3083. define(`SAVE4x1', `
  3084. #else
  3085. .macro SAVE4x1
  3086. #endif
  3087. mr T1, CO
  /* line 0: vs32 */
  3088. #ifndef TRMMKERNEL
  3089. lxsspx vs0, o0, T1
  3090. #endif
  3091. #ifdef TRMMKERNEL
  3092. xsmuldp vs0, vs32, alpha_r
  3093. #else
  3094. xsmaddadp vs0, vs32, alpha_r
  3095. #endif
  3096. stxsspx vs0, o0, T1
  3097. add T1, T1, LDC
  /* line 1: vs33 */
  3098. #ifndef TRMMKERNEL
  3099. lxsspx vs0, o0, T1
  3100. #endif
  3101. #ifdef TRMMKERNEL
  3102. xsmuldp vs0, vs33, alpha_r
  3103. #else
  3104. xsmaddadp vs0, vs33, alpha_r
  3105. #endif
  3106. stxsspx vs0, o0, T1
  3107. add T1, T1, LDC
  /* line 2: vs34 */
  3108. #ifndef TRMMKERNEL
  3109. lxsspx vs0, o0, T1
  3110. #endif
  3111. #ifdef TRMMKERNEL
  3112. xsmuldp vs0, vs34, alpha_r
  3113. #else
  3114. xsmaddadp vs0, vs34, alpha_r
  3115. #endif
  3116. stxsspx vs0, o0, T1
  3117. add T1, T1, LDC
  /* line 3: vs35 */
  3118. #ifndef TRMMKERNEL
  3119. lxsspx vs0, o0, T1
  3120. #endif
  3121. #ifdef TRMMKERNEL
  3122. xsmuldp vs0, vs35, alpha_r
  3123. #else
  3124. xsmaddadp vs0, vs35, alpha_r
  3125. #endif
  3126. stxsspx vs0, o0, T1
  /* T1 is stepped once more; the result is not read again in this macro */
  3127. add T1, T1, LDC
  3128. addi CO, CO, 4
  3129. #if defined(_AIX)
  3130. ')
  3131. #else
  3132. .endm
  3133. #endif
  3134. /**********************************************************************************************
  3135. * Macros for N=2 and M=16
  3136. **********************************************************************************************/
  /*
   * LOAD2x16_1: operand preload for the N=2, M=16 kernel; no arithmetic.
   * Loads 64 bytes (16 floats) at AO into vs0-vs3 and 16 bytes at BO into
   * vs28, then splats words 0 and 1 of vs28 into vs8 and vs9.
   * Advances AO by 64 and BO by 8.
   * NOTE(review): the BO load reads 16 bytes although only words 0-1 are
   * used and BO advances by 8; confirm the B buffer tolerates reading 8
   * bytes past the last consumed pair.
   */
  3137. #if defined(_AIX)
  3138. define(`LOAD2x16_1', `
  3139. #else
  3140. .macro LOAD2x16_1
  3141. #endif
  3142. lxvw4x vs0, o0, AO
  3143. lxvw4x vs1, o16, AO
  3144. lxvw4x vs2, o32, AO
  3145. lxvw4x vs3, o48, AO
  3146. addi AO, AO, 64
  3147. lxvw4x vs28, o0, BO
  3148. xxspltw vs8, vs28, 0
  3149. xxspltw vs9, vs28, 1
  3150. addi BO, BO, 8
  3151. #if defined(_AIX)
  3152. ')
  3153. #else
  3154. .endm
  3155. #endif
  /*
   * KERNEL2x16_I1: initialising step for N=2, M=16.
   * Loads the next A/B operands into the alternate set (vs4-vs7,
   * vs16-vs17) and writes the products of vs0-vs3 with vs8/vs9 into
   * vs32-vs39 (xvmulsp overwrites its target).
   * Layout: vs32-vs35 = A*b0; vs36-vs39 = A*b1.
   * Advances AO by 64 and BO by 8.
   * NOTE(review): the BO load reads 16 bytes but only words 0-1 are used.
   */
  3156. #if defined(_AIX)
  3157. define(`KERNEL2x16_I1', `
  3158. #else
  3159. .macro KERNEL2x16_I1
  3160. #endif
  3161. lxvw4x vs4, o0, AO
  3162. lxvw4x vs5, o16, AO
  3163. lxvw4x vs6, o32, AO
  3164. lxvw4x vs7, o48, AO
  3165. addi AO, AO, 64
  3166. lxvw4x vs28, o0, BO
  3167. xxspltw vs16, vs28, 0
  3168. xxspltw vs17, vs28, 1
  3169. addi BO, BO, 8
  3170. xvmulsp vs32, vs0, vs8
  3171. xvmulsp vs33, vs1, vs8
  3172. xvmulsp vs34, vs2, vs8
  3173. xvmulsp vs35, vs3, vs8
  3174. xvmulsp vs36, vs0, vs9
  3175. xvmulsp vs37, vs1, vs9
  3176. xvmulsp vs38, vs2, vs9
  3177. xvmulsp vs39, vs3, vs9
  3178. #if defined(_AIX)
  3179. ')
  3180. #else
  3181. .endm
  3182. #endif
  /*
   * KERNEL2x16_1: accumulating step, first half of the register ping-pong.
   * Loads the next A/B operands into vs4-vs7 / vs16-vs17 and adds the
   * products of vs0-vs3 with vs8/vs9 into vs32-vs39.
   * Advances AO by 64 and BO by 8.
   * NOTE(review): the BO load reads 16 bytes but only words 0-1 are used.
   */
  3183. #if defined(_AIX)
  3184. define(`KERNEL2x16_1', `
  3185. #else
  3186. .macro KERNEL2x16_1
  3187. #endif
  3188. lxvw4x vs4, o0, AO
  3189. lxvw4x vs5, o16, AO
  3190. lxvw4x vs6, o32, AO
  3191. lxvw4x vs7, o48, AO
  3192. addi AO, AO, 64
  3193. lxvw4x vs28, o0, BO
  3194. xxspltw vs16, vs28, 0
  3195. xxspltw vs17, vs28, 1
  3196. addi BO, BO, 8
  3197. xvmaddasp vs32, vs0, vs8
  3198. xvmaddasp vs33, vs1, vs8
  3199. xvmaddasp vs34, vs2, vs8
  3200. xvmaddasp vs35, vs3, vs8
  3201. xvmaddasp vs36, vs0, vs9
  3202. xvmaddasp vs37, vs1, vs9
  3203. xvmaddasp vs38, vs2, vs9
  3204. xvmaddasp vs39, vs3, vs9
  3205. #if defined(_AIX)
  3206. ')
  3207. #else
  3208. .endm
  3209. #endif
  /*
   * KERNEL2x16_2: accumulating step, second half of the register ping-pong.
   * Loads the next A/B operands back into vs0-vs3 / vs8-vs9 and adds the
   * products of vs4-vs7 with vs16/vs17 into vs32-vs39.
   * Advances AO by 64 and BO by 8.
   * NOTE(review): the BO load reads 16 bytes but only words 0-1 are used.
   */
  3210. #if defined(_AIX)
  3211. define(`KERNEL2x16_2', `
  3212. #else
  3213. .macro KERNEL2x16_2
  3214. #endif
  3215. lxvw4x vs0, o0, AO
  3216. lxvw4x vs1, o16, AO
  3217. lxvw4x vs2, o32, AO
  3218. lxvw4x vs3, o48, AO
  3219. addi AO, AO, 64
  3220. lxvw4x vs28, o0, BO
  3221. xxspltw vs8, vs28, 0
  3222. xxspltw vs9, vs28, 1
  3223. addi BO, BO, 8
  3224. xvmaddasp vs32, vs4, vs16
  3225. xvmaddasp vs33, vs5, vs16
  3226. xvmaddasp vs34, vs6, vs16
  3227. xvmaddasp vs35, vs7, vs16
  3228. xvmaddasp vs36, vs4, vs17
  3229. xvmaddasp vs37, vs5, vs17
  3230. xvmaddasp vs38, vs6, vs17
  3231. xvmaddasp vs39, vs7, vs17
  3232. #if defined(_AIX)
  3233. ')
  3234. #else
  3235. .endm
  3236. #endif
  /*
   * KERNEL2x16_E2: draining step for N=2, M=16.
   * Adds the products of the operands already held in vs4-vs7 / vs16-vs17
   * into vs32-vs39. Performs no loads and leaves AO and BO unchanged.
   */
  3237. #if defined(_AIX)
  3238. define(`KERNEL2x16_E2', `
  3239. #else
  3240. .macro KERNEL2x16_E2
  3241. #endif
  3242. xvmaddasp vs32, vs4, vs16
  3243. xvmaddasp vs33, vs5, vs16
  3244. xvmaddasp vs34, vs6, vs16
  3245. xvmaddasp vs35, vs7, vs16
  3246. xvmaddasp vs36, vs4, vs17
  3247. xvmaddasp vs37, vs5, vs17
  3248. xvmaddasp vs38, vs6, vs17
  3249. xvmaddasp vs39, vs7, vs17
  3250. #if defined(_AIX)
  3251. ')
  3252. #else
  3253. .endm
  3254. #endif
  /*
   * KERNEL2x16_SUBI1: self-contained initialising step for N=2, M=16.
   * Loads one A/B operand set into vs0-vs3 / vs8-vs9 and immediately
   * writes its products into vs32-vs39 (xvmulsp overwrites the target).
   * Advances AO by 64 and BO by 8.
   * NOTE(review): the BO load reads 16 bytes but only words 0-1 are used.
   */
  3255. #if defined(_AIX)
  3256. define(`KERNEL2x16_SUBI1', `
  3257. #else
  3258. .macro KERNEL2x16_SUBI1
  3259. #endif
  3260. lxvw4x vs0, o0, AO
  3261. lxvw4x vs1, o16, AO
  3262. lxvw4x vs2, o32, AO
  3263. lxvw4x vs3, o48, AO
  3264. addi AO, AO, 64
  3265. lxvw4x vs28, o0, BO
  3266. xxspltw vs8, vs28, 0
  3267. xxspltw vs9, vs28, 1
  3268. addi BO, BO, 8
  3269. xvmulsp vs32, vs0, vs8
  3270. xvmulsp vs33, vs1, vs8
  3271. xvmulsp vs34, vs2, vs8
  3272. xvmulsp vs35, vs3, vs8
  3273. xvmulsp vs36, vs0, vs9
  3274. xvmulsp vs37, vs1, vs9
  3275. xvmulsp vs38, vs2, vs9
  3276. xvmulsp vs39, vs3, vs9
  3277. #if defined(_AIX)
  3278. ')
  3279. #else
  3280. .endm
  3281. #endif
  /*
   * KERNEL2x16_SUB1: self-contained accumulating step for N=2, M=16.
   * Loads one A/B operand set into vs0-vs3 / vs8-vs9 and immediately adds
   * its products into vs32-vs39.
   * Advances AO by 64 and BO by 8.
   * NOTE(review): the BO load reads 16 bytes but only words 0-1 are used.
   */
  3282. #if defined(_AIX)
  3283. define(`KERNEL2x16_SUB1', `
  3284. #else
  3285. .macro KERNEL2x16_SUB1
  3286. #endif
  3287. lxvw4x vs0, o0, AO
  3288. lxvw4x vs1, o16, AO
  3289. lxvw4x vs2, o32, AO
  3290. lxvw4x vs3, o48, AO
  3291. addi AO, AO, 64
  3292. lxvw4x vs28, o0, BO
  3293. xxspltw vs8, vs28, 0
  3294. xxspltw vs9, vs28, 1
  3295. addi BO, BO, 8
  3296. xvmaddasp vs32, vs0, vs8
  3297. xvmaddasp vs33, vs1, vs8
  3298. xvmaddasp vs34, vs2, vs8
  3299. xvmaddasp vs35, vs3, vs8
  3300. xvmaddasp vs36, vs0, vs9
  3301. xvmaddasp vs37, vs1, vs9
  3302. xvmaddasp vs38, vs2, vs9
  3303. xvmaddasp vs39, vs3, vs9
  3304. #if defined(_AIX)
  3305. ')
  3306. #else
  3307. .endm
  3308. #endif
  /*
   * SAVE2x16: write the 2x16 accumulator tile (vs32-vs39) to C.
   * T1 starts at CO and is stepped by LDC after each of the two
   * 64-byte lines. With TRMMKERNEL defined each line is stored as
   * acc * alpha_vr; otherwise the line is first loaded from C and
   * updated as C += acc * alpha_vr. Finally CO is advanced by 64 bytes.
   * Clobbers vs0-vs3 and T1.
   * NOTE(review): alpha_vr and LDC are defined outside this chunk;
   * presumably alpha splatted across a vector and the byte stride of C.
   */
  3309. #if defined(_AIX)
  3310. define(`SAVE2x16', `
  3311. #else
  3312. .macro SAVE2x16
  3313. #endif
  3314. mr T1, CO
  /* line 0: vs32-vs35 */
  3315. #ifndef TRMMKERNEL
  3316. lxvw4x vs0, o0, T1
  3317. lxvw4x vs1, o16, T1
  3318. lxvw4x vs2, o32, T1
  3319. lxvw4x vs3, o48, T1
  3320. #endif
  3321. #ifdef TRMMKERNEL
  3322. xvmulsp vs0, vs32, alpha_vr
  3323. xvmulsp vs1, vs33, alpha_vr
  3324. xvmulsp vs2, vs34, alpha_vr
  3325. xvmulsp vs3, vs35, alpha_vr
  3326. #else
  3327. xvmaddasp vs0, vs32, alpha_vr
  3328. xvmaddasp vs1, vs33, alpha_vr
  3329. xvmaddasp vs2, vs34, alpha_vr
  3330. xvmaddasp vs3, vs35, alpha_vr
  3331. #endif
  3332. stxvw4x vs0, o0, T1
  3333. stxvw4x vs1, o16, T1
  3334. stxvw4x vs2, o32, T1
  3335. stxvw4x vs3, o48, T1
  3336. add T1, T1, LDC
  /* line 1: vs36-vs39 */
  3337. #ifndef TRMMKERNEL
  3338. lxvw4x vs0, o0, T1
  3339. lxvw4x vs1, o16, T1
  3340. lxvw4x vs2, o32, T1
  3341. lxvw4x vs3, o48, T1
  3342. #endif
  3343. #ifdef TRMMKERNEL
  3344. xvmulsp vs0, vs36, alpha_vr
  3345. xvmulsp vs1, vs37, alpha_vr
  3346. xvmulsp vs2, vs38, alpha_vr
  3347. xvmulsp vs3, vs39, alpha_vr
  3348. #else
  3349. xvmaddasp vs0, vs36, alpha_vr
  3350. xvmaddasp vs1, vs37, alpha_vr
  3351. xvmaddasp vs2, vs38, alpha_vr
  3352. xvmaddasp vs3, vs39, alpha_vr
  3353. #endif
  3354. stxvw4x vs0, o0, T1
  3355. stxvw4x vs1, o16, T1
  3356. stxvw4x vs2, o32, T1
  3357. stxvw4x vs3, o48, T1
  /* T1 is stepped once more; the result is not read again in this macro */
  3358. add T1, T1, LDC
  3359. addi CO, CO, 64
  3360. #if defined(_AIX)
  3361. ')
  3362. #else
  3363. .endm
  3364. #endif
  3365. /**********************************************************************************************
  3366. * Macros for N=2 and M=8
  3367. **********************************************************************************************/
  /*
   * LOAD2x8_1: operand preload for the N=2, M=8 kernel; no arithmetic.
   * Loads 32 bytes (8 floats) at AO into vs0-vs1 and 16 bytes at BO into
   * vs28, then splats words 0 and 1 of vs28 into vs8 and vs9.
   * Advances AO by 32 and BO by 8.
   * NOTE(review): the BO load reads 16 bytes although only words 0-1 are
   * used and BO advances by 8; confirm the B buffer tolerates reading 8
   * bytes past the last consumed pair.
   */
  3368. #if defined(_AIX)
  3369. define(`LOAD2x8_1', `
  3370. #else
  3371. .macro LOAD2x8_1
  3372. #endif
  3373. lxvw4x vs0, o0, AO
  3374. lxvw4x vs1, o16, AO
  3375. addi AO, AO, 32
  3376. lxvw4x vs28, o0, BO
  3377. xxspltw vs8, vs28, 0
  3378. xxspltw vs9, vs28, 1
  3379. addi BO, BO, 8
  3380. #if defined(_AIX)
  3381. ')
  3382. #else
  3383. .endm
  3384. #endif
  /*
   * KERNEL2x8_I1: initialising step for N=2, M=8.
   * Loads the next A/B operands into the alternate set (vs4-vs5,
   * vs16-vs17) and writes the products of vs0-vs1 with vs8/vs9 into
   * vs32-vs35 (xvmulsp overwrites its target).
   * Layout: vs32,vs33 = A*b0; vs34,vs35 = A*b1.
   * Advances AO by 32 and BO by 8.
   * NOTE(review): the BO load reads 16 bytes but only words 0-1 are used.
   */
  3385. #if defined(_AIX)
  3386. define(`KERNEL2x8_I1', `
  3387. #else
  3388. .macro KERNEL2x8_I1
  3389. #endif
  3390. lxvw4x vs4, o0, AO
  3391. lxvw4x vs5, o16, AO
  3392. addi AO, AO, 32
  3393. lxvw4x vs28, o0, BO
  3394. xxspltw vs16, vs28, 0
  3395. xxspltw vs17, vs28, 1
  3396. addi BO, BO, 8
  3397. xvmulsp vs32, vs0, vs8
  3398. xvmulsp vs33, vs1, vs8
  3399. xvmulsp vs34, vs0, vs9
  3400. xvmulsp vs35, vs1, vs9
  3401. #if defined(_AIX)
  3402. ')
  3403. #else
  3404. .endm
  3405. #endif
  /*
   * KERNEL2x8_1: accumulating step, first half of the register ping-pong.
   * Loads the next A/B operands into vs4-vs5 / vs16-vs17 and adds the
   * products of vs0-vs1 with vs8/vs9 into vs32-vs35.
   * Advances AO by 32 and BO by 8.
   * NOTE(review): the BO load reads 16 bytes but only words 0-1 are used.
   */
  3406. #if defined(_AIX)
  3407. define(`KERNEL2x8_1', `
  3408. #else
  3409. .macro KERNEL2x8_1
  3410. #endif
  3411. lxvw4x vs4, o0, AO
  3412. lxvw4x vs5, o16, AO
  3413. addi AO, AO, 32
  3414. lxvw4x vs28, o0, BO
  3415. xxspltw vs16, vs28, 0
  3416. xxspltw vs17, vs28, 1
  3417. addi BO, BO, 8
  3418. xvmaddasp vs32, vs0, vs8
  3419. xvmaddasp vs33, vs1, vs8
  3420. xvmaddasp vs34, vs0, vs9
  3421. xvmaddasp vs35, vs1, vs9
  3422. #if defined(_AIX)
  3423. ')
  3424. #else
  3425. .endm
  3426. #endif
  /*
   * KERNEL2x8_2: accumulating step, second half of the register ping-pong.
   * Loads the next A/B operands back into vs0-vs1 / vs8-vs9 and adds the
   * products of vs4-vs5 with vs16/vs17 into vs32-vs35.
   * Advances AO by 32 and BO by 8.
   * NOTE(review): the BO load reads 16 bytes but only words 0-1 are used.
   */
  3427. #if defined(_AIX)
  3428. define(`KERNEL2x8_2', `
  3429. #else
  3430. .macro KERNEL2x8_2
  3431. #endif
  3432. lxvw4x vs0, o0, AO
  3433. lxvw4x vs1, o16, AO
  3434. addi AO, AO, 32
  3435. lxvw4x vs28, o0, BO
  3436. xxspltw vs8, vs28, 0
  3437. xxspltw vs9, vs28, 1
  3438. addi BO, BO, 8
  3439. xvmaddasp vs32, vs4, vs16
  3440. xvmaddasp vs33, vs5, vs16
  3441. xvmaddasp vs34, vs4, vs17
  3442. xvmaddasp vs35, vs5, vs17
  3443. #if defined(_AIX)
  3444. ')
  3445. #else
  3446. .endm
  3447. #endif
  /*
   * KERNEL2x8_E2: draining step for N=2, M=8.
   * Adds the products of the operands already held in vs4-vs5 / vs16-vs17
   * into vs32-vs35. Performs no loads and leaves AO and BO unchanged.
   */
  3448. #if defined(_AIX)
  3449. define(`KERNEL2x8_E2', `
  3450. #else
  3451. .macro KERNEL2x8_E2
  3452. #endif
  3453. xvmaddasp vs32, vs4, vs16
  3454. xvmaddasp vs33, vs5, vs16
  3455. xvmaddasp vs34, vs4, vs17
  3456. xvmaddasp vs35, vs5, vs17
  3457. #if defined(_AIX)
  3458. ')
  3459. #else
  3460. .endm
  3461. #endif
  /*
   * KERNEL2x8_SUBI1: self-contained initialising step for N=2, M=8.
   * Loads one A/B operand set into vs0-vs1 / vs8-vs9 and immediately
   * writes its products into vs32-vs35 (xvmulsp overwrites the target).
   * Advances AO by 32 and BO by 8.
   * NOTE(review): the BO load reads 16 bytes but only words 0-1 are used.
   */
  3462. #if defined(_AIX)
  3463. define(`KERNEL2x8_SUBI1', `
  3464. #else
  3465. .macro KERNEL2x8_SUBI1
  3466. #endif
  3467. lxvw4x vs0, o0, AO
  3468. lxvw4x vs1, o16, AO
  3469. addi AO, AO, 32
  3470. lxvw4x vs28, o0, BO
  3471. xxspltw vs8, vs28, 0
  3472. xxspltw vs9, vs28, 1
  3473. addi BO, BO, 8
  3474. xvmulsp vs32, vs0, vs8
  3475. xvmulsp vs33, vs1, vs8
  3476. xvmulsp vs34, vs0, vs9
  3477. xvmulsp vs35, vs1, vs9
  3478. #if defined(_AIX)
  3479. ')
  3480. #else
  3481. .endm
  3482. #endif
  /*
   * KERNEL2x8_SUB1: self-contained accumulating step for N=2, M=8.
   * Loads one A/B operand set into vs0-vs1 / vs8-vs9 and immediately adds
   * its products into vs32-vs35.
   * Advances AO by 32 and BO by 8.
   * NOTE(review): the BO load reads 16 bytes but only words 0-1 are used.
   */
  3483. #if defined(_AIX)
  3484. define(`KERNEL2x8_SUB1', `
  3485. #else
  3486. .macro KERNEL2x8_SUB1
  3487. #endif
  3488. lxvw4x vs0, o0, AO
  3489. lxvw4x vs1, o16, AO
  3490. addi AO, AO, 32
  3491. lxvw4x vs28, o0, BO
  3492. xxspltw vs8, vs28, 0
  3493. xxspltw vs9, vs28, 1
  3494. addi BO, BO, 8
  3495. xvmaddasp vs32, vs0, vs8
  3496. xvmaddasp vs33, vs1, vs8
  3497. xvmaddasp vs34, vs0, vs9
  3498. xvmaddasp vs35, vs1, vs9
  3499. #if defined(_AIX)
  3500. ')
  3501. #else
  3502. .endm
  3503. #endif
  /*
   * SAVE2x8: write the 2x8 accumulator tile (vs32-vs35) to C.
   * T1 starts at CO and is stepped by LDC after each of the two
   * 32-byte lines. With TRMMKERNEL defined each line is stored as
   * acc * alpha_vr; otherwise the line is first loaded from C and
   * updated as C += acc * alpha_vr. Finally CO is advanced by 32 bytes.
   * Clobbers vs0, vs1 and T1.
   * NOTE(review): alpha_vr and LDC are defined outside this chunk;
   * presumably alpha splatted across a vector and the byte stride of C.
   */
  3504. #if defined(_AIX)
  3505. define(`SAVE2x8', `
  3506. #else
  3507. .macro SAVE2x8
  3508. #endif
  3509. mr T1, CO
  /* line 0: vs32, vs33 */
  3510. #ifndef TRMMKERNEL
  3511. lxvw4x vs0, o0, T1
  3512. lxvw4x vs1, o16, T1
  3513. #endif
  3514. #ifdef TRMMKERNEL
  3515. xvmulsp vs0, vs32, alpha_vr
  3516. xvmulsp vs1, vs33, alpha_vr
  3517. #else
  3518. xvmaddasp vs0, vs32, alpha_vr
  3519. xvmaddasp vs1, vs33, alpha_vr
  3520. #endif
  3521. stxvw4x vs0, o0, T1
  3522. stxvw4x vs1, o16, T1
  3523. add T1, T1, LDC
  /* line 1: vs34, vs35 */
  3524. #ifndef TRMMKERNEL
  3525. lxvw4x vs0, o0, T1
  3526. lxvw4x vs1, o16, T1
  3527. #endif
  3528. #ifdef TRMMKERNEL
  3529. xvmulsp vs0, vs34, alpha_vr
  3530. xvmulsp vs1, vs35, alpha_vr
  3531. #else
  3532. xvmaddasp vs0, vs34, alpha_vr
  3533. xvmaddasp vs1, vs35, alpha_vr
  3534. #endif
  3535. stxvw4x vs0, o0, T1
  3536. stxvw4x vs1, o16, T1
  /* T1 is stepped once more; the result is not read again in this macro */
  3537. add T1, T1, LDC
  3538. addi CO, CO, 32
  3539. #if defined(_AIX)
  3540. ')
  3541. #else
  3542. .endm
  3543. #endif
  3544. /**********************************************************************************************
  3545. * Macros for N=2 and M=4
  3546. **********************************************************************************************/
  /*
   * LOAD2x4_1: preloads the first register set of the 2x4 tile without doing
   * any arithmetic: one 4-word vector through AO into vs0, and word elements
   * 0 and 1 of the BO vector splatted into vs8 and vs9.
   * AO advances by 16 bytes, BO by 8 bytes.
   */
  3547. #if defined(_AIX)
  3548. define(`LOAD2x4_1', `
  3549. #else
  3550. .macro LOAD2x4_1
  3551. #endif
  3552. lxvw4x vs0, o0, AO
  3553. addi AO, AO, 16
  /* 16-byte load through BO; only word elements 0 and 1 are used, and BO moves by 8. */
  3554. lxvw4x vs28, o0, BO
  3555. xxspltw vs8, vs28, 0
  3556. xxspltw vs9, vs28, 1
  3557. addi BO, BO, 8
  3558. #if defined(_AIX)
  3559. ')
  3560. #else
  3561. .endm
  3562. #endif
  /*
   * KERNEL2x4_I1: initial step of the 2x4 tile. Loads the second register set
   * (vs4, vs16, vs17) and writes plain products of the first set (vs0, vs8, vs9)
   * into vs32/vs33, so the accumulators are overwritten rather than added to.
   * Reads vs0, vs8 and vs9 without loading them, so they must already be set.
   * AO advances by 16 bytes, BO by 8 bytes.
   */
  3563. #if defined(_AIX)
  3564. define(`KERNEL2x4_I1', `
  3565. #else
  3566. .macro KERNEL2x4_I1
  3567. #endif
  3568. lxvw4x vs4, o0, AO
  3569. addi AO, AO, 16
  /* 16-byte load through BO; only word elements 0 and 1 are used, and BO moves by 8. */
  3570. lxvw4x vs28, o0, BO
  3571. xxspltw vs16, vs28, 0
  3572. xxspltw vs17, vs28, 1
  3573. addi BO, BO, 8
  /* multiply only: starts the accumulators from the first-set products */
  3574. xvmulsp vs32, vs0, vs8
  3575. xvmulsp vs33, vs0, vs9
  3576. #if defined(_AIX)
  3577. ')
  3578. #else
  3579. .endm
  3580. #endif
  /*
   * KERNEL2x4_1: loads the second register set (vs4, vs16, vs17) while
   * accumulating the products of the first set (vs0, vs8, vs9) into vs32/vs33.
   * AO advances by 16 bytes, BO by 8 bytes.
   */
  3581. #if defined(_AIX)
  3582. define(`KERNEL2x4_1', `
  3583. #else
  3584. .macro KERNEL2x4_1
  3585. #endif
  3586. lxvw4x vs4, o0, AO
  3587. addi AO, AO, 16
  /* 16-byte load through BO; only word elements 0 and 1 are used, and BO moves by 8. */
  3588. lxvw4x vs28, o0, BO
  3589. xxspltw vs16, vs28, 0
  3590. xxspltw vs17, vs28, 1
  3591. addi BO, BO, 8
  3592. xvmaddasp vs32, vs0, vs8
  3593. xvmaddasp vs33, vs0, vs9
  3594. #if defined(_AIX)
  3595. ')
  3596. #else
  3597. .endm
  3598. #endif
  /*
   * KERNEL2x4_2: mirror of the _1 step. Loads the first register set
   * (vs0, vs8, vs9) while accumulating the products of the second set
   * (vs4, vs16, vs17) into vs32/vs33. AO advances by 16 bytes, BO by 8 bytes.
   */
  3599. #if defined(_AIX)
  3600. define(`KERNEL2x4_2', `
  3601. #else
  3602. .macro KERNEL2x4_2
  3603. #endif
  3604. lxvw4x vs0, o0, AO
  3605. addi AO, AO, 16
  /* 16-byte load through BO; only word elements 0 and 1 are used, and BO moves by 8. */
  3606. lxvw4x vs28, o0, BO
  3607. xxspltw vs8, vs28, 0
  3608. xxspltw vs9, vs28, 1
  3609. addi BO, BO, 8
  3610. xvmaddasp vs32, vs4, vs16
  3611. xvmaddasp vs33, vs4, vs17
  3612. #if defined(_AIX)
  3613. ')
  3614. #else
  3615. .endm
  3616. #endif
  /*
   * KERNEL2x4_E2: final step of the alternating sequence. Accumulates the
   * products of the second register set (vs4, vs16, vs17) into vs32/vs33
   * and performs no loads and no pointer updates.
   */
  3617. #if defined(_AIX)
  3618. define(`KERNEL2x4_E2', `
  3619. #else
  3620. .macro KERNEL2x4_E2
  3621. #endif
  3622. xvmaddasp vs32, vs4, vs16
  3623. xvmaddasp vs33, vs4, vs17
  3624. #if defined(_AIX)
  3625. ')
  3626. #else
  3627. .endm
  3628. #endif
  /*
   * KERNEL2x4_SUBI1: standalone initial step of the 2x4 tile. Loads one
   * register set (vs0, vs8, vs9) and writes its products into vs32/vs33,
   * overwriting whatever the accumulators held.
   * AO advances by 16 bytes, BO by 8 bytes.
   */
  3629. #if defined(_AIX)
  3630. define(`KERNEL2x4_SUBI1', `
  3631. #else
  3632. .macro KERNEL2x4_SUBI1
  3633. #endif
  3634. lxvw4x vs0, o0, AO
  3635. addi AO, AO, 16
  /* 16-byte load through BO; only word elements 0 and 1 are used, and BO moves by 8. */
  3636. lxvw4x vs28, o0, BO
  3637. xxspltw vs8, vs28, 0
  3638. xxspltw vs9, vs28, 1
  3639. addi BO, BO, 8
  3640. xvmulsp vs32, vs0, vs8
  3641. xvmulsp vs33, vs0, vs9
  3642. #if defined(_AIX)
  3643. ')
  3644. #else
  3645. .endm
  3646. #endif
  /*
   * KERNEL2x4_SUB1: standalone multiply-accumulate step of the 2x4 tile.
   * Loads one register set (vs0, vs8, vs9) and adds its products to vs32/vs33.
   * AO advances by 16 bytes, BO by 8 bytes.
   */
  3647. #if defined(_AIX)
  3648. define(`KERNEL2x4_SUB1', `
  3649. #else
  3650. .macro KERNEL2x4_SUB1
  3651. #endif
  3652. lxvw4x vs0, o0, AO
  3653. addi AO, AO, 16
  /* 16-byte load through BO; only word elements 0 and 1 are used, and BO moves by 8. */
  3654. lxvw4x vs28, o0, BO
  3655. xxspltw vs8, vs28, 0
  3656. xxspltw vs9, vs28, 1
  3657. addi BO, BO, 8
  3658. xvmaddasp vs32, vs0, vs8
  3659. xvmaddasp vs33, vs0, vs9
  3660. #if defined(_AIX)
  3661. ')
  3662. #else
  3663. .endm
  3664. #endif
  /*
   * SAVE2x4: scales the 2x4 accumulators by alpha_vr and writes them out through CO.
   * Without TRMMKERNEL the existing memory vector is loaded and the scaled
   * accumulator is added to it; with TRMMKERNEL memory is not read and the scaled
   * accumulator is stored directly. vs32 goes to the line at CO, vs33 to the
   * line at CO + LDC. CO advances by 16 bytes. Overwrites T1 and vs0.
   * NOTE(review): LDC is presumably the byte stride between output lines - confirm.
   */
  3665. #if defined(_AIX)
  3666. define(`SAVE2x4', `
  3667. #else
  3668. .macro SAVE2x4
  3669. #endif
  /* T1 is the working copy of the output pointer. */
  3670. mr T1, CO
  3671. #ifndef TRMMKERNEL
  3672. lxvw4x vs0, o0, T1
  3673. #endif
  3674. #ifdef TRMMKERNEL
  3675. xvmulsp vs0, vs32, alpha_vr
  3676. #else
  3677. xvmaddasp vs0, vs32, alpha_vr
  3678. #endif
  3679. stxvw4x vs0, o0, T1
  /* step to the second output line */
  3680. add T1, T1, LDC
  3681. #ifndef TRMMKERNEL
  3682. lxvw4x vs0, o0, T1
  3683. #endif
  3684. #ifdef TRMMKERNEL
  3685. xvmulsp vs0, vs33, alpha_vr
  3686. #else
  3687. xvmaddasp vs0, vs33, alpha_vr
  3688. #endif
  3689. stxvw4x vs0, o0, T1
  /* T1 is not read again inside this macro after this update. */
  3690. add T1, T1, LDC
  3691. addi CO, CO, 16
  3692. #if defined(_AIX)
  3693. ')
  3694. #else
  3695. .endm
  3696. #endif
  3697. /**********************************************************************************************
  3698. * Macros for N=2 and M=2
  3699. **********************************************************************************************/
  /*
   * LOAD2x2_1: preloads the first register set of the scalar 2x2 tile without
   * doing any arithmetic. lxsspx fetches one single-precision word per load:
   * two through AO into vs0/vs1 and two through BO (via the copy in T1) into
   * vs8/vs9. AO and BO each advance by 8 bytes. Overwrites T1.
   * o4 is defined outside this chunk; presumably offset 4 - confirm.
   */
  3700. #if defined(_AIX)
  3701. define(`LOAD2x2_1', `
  3702. #else
  3703. .macro LOAD2x2_1
  3704. #endif
  3705. lxsspx vs0, o0, AO
  3706. lxsspx vs1, o4, AO
  3707. addi AO, AO, 8
  3708. mr T1, BO
  3709. lxsspx vs8, o0, T1
  3710. lxsspx vs9, o4, T1
  3711. addi BO, BO, 8
  3712. #if defined(_AIX)
  3713. ')
  3714. #else
  3715. .endm
  3716. #endif
  /*
   * KERNEL2x2_I1: initial step of the scalar 2x2 tile. Loads the second register
   * set (vs4/vs5 through AO, vs16/vs17 through BO) and writes plain products of
   * the first set (vs0/vs1, vs8/vs9) into vs32-vs35, overwriting the accumulators.
   * Reads the first set without loading it, so it must already be set.
   * Arithmetic uses the scalar double-precision instruction xsmuldp on values
   * loaded with lxsspx. AO and BO each advance by 8 bytes. Overwrites T1.
   */
  3717. #if defined(_AIX)
  3718. define(`KERNEL2x2_I1', `
  3719. #else
  3720. .macro KERNEL2x2_I1
  3721. #endif
  3722. lxsspx vs4, o0, AO
  3723. lxsspx vs5, o4, AO
  3724. addi AO, AO, 8
  3725. mr T1, BO
  3726. lxsspx vs16, o0, T1
  3727. lxsspx vs17, o4, T1
  3728. addi BO, BO, 8
  /* vs32 = a0*b0, vs33 = a1*b0, vs34 = a0*b1, vs35 = a1*b1 */
  3729. xsmuldp vs32, vs0, vs8
  3730. xsmuldp vs33, vs1, vs8
  3731. xsmuldp vs34, vs0, vs9
  3732. xsmuldp vs35, vs1, vs9
  3733. #if defined(_AIX)
  3734. ')
  3735. #else
  3736. .endm
  3737. #endif
  /*
   * KERNEL2x2_1: loads the second register set (vs4/vs5, vs16/vs17) while
   * accumulating the products of the first set (vs0/vs1, vs8/vs9) into
   * vs32-vs35 with xsmaddadp. AO and BO each advance by 8 bytes. Overwrites T1.
   */
  3738. #if defined(_AIX)
  3739. define(`KERNEL2x2_1', `
  3740. #else
  3741. .macro KERNEL2x2_1
  3742. #endif
  3743. lxsspx vs4, o0, AO
  3744. lxsspx vs5, o4, AO
  3745. addi AO, AO, 8
  3746. mr T1, BO
  3747. lxsspx vs16, o0, T1
  3748. lxsspx vs17, o4, T1
  3749. addi BO, BO, 8
  /* vs32 += a0*b0, vs33 += a1*b0, vs34 += a0*b1, vs35 += a1*b1 */
  3750. xsmaddadp vs32, vs0, vs8
  3751. xsmaddadp vs33, vs1, vs8
  3752. xsmaddadp vs34, vs0, vs9
  3753. xsmaddadp vs35, vs1, vs9
  3754. #if defined(_AIX)
  3755. ')
  3756. #else
  3757. .endm
  3758. #endif
  /*
   * KERNEL2x2_2: mirror of the _1 step. Loads the first register set
   * (vs0/vs1, vs8/vs9) while accumulating the products of the second set
   * (vs4/vs5, vs16/vs17) into vs32-vs35.
   * AO and BO each advance by 8 bytes. Overwrites T1.
   */
  3759. #if defined(_AIX)
  3760. define(`KERNEL2x2_2', `
  3761. #else
  3762. .macro KERNEL2x2_2
  3763. #endif
  3764. lxsspx vs0, o0, AO
  3765. lxsspx vs1, o4, AO
  3766. addi AO, AO, 8
  3767. mr T1, BO
  3768. lxsspx vs8, o0, T1
  3769. lxsspx vs9, o4, T1
  3770. addi BO, BO, 8
  3771. xsmaddadp vs32, vs4, vs16
  3772. xsmaddadp vs33, vs5, vs16
  3773. xsmaddadp vs34, vs4, vs17
  3774. xsmaddadp vs35, vs5, vs17
  3775. #if defined(_AIX)
  3776. ')
  3777. #else
  3778. .endm
  3779. #endif
  /*
   * KERNEL2x2_E2: final step of the alternating sequence. Accumulates the
   * products of the second register set (vs4/vs5, vs16/vs17) into vs32-vs35
   * and performs no loads and no pointer updates.
   */
  3780. #if defined(_AIX)
  3781. define(`KERNEL2x2_E2', `
  3782. #else
  3783. .macro KERNEL2x2_E2
  3784. #endif
  3785. xsmaddadp vs32, vs4, vs16
  3786. xsmaddadp vs33, vs5, vs16
  3787. xsmaddadp vs34, vs4, vs17
  3788. xsmaddadp vs35, vs5, vs17
  3789. #if defined(_AIX)
  3790. ')
  3791. #else
  3792. .endm
  3793. #endif
  /*
   * KERNEL2x2_SUBI1: standalone initial step of the scalar 2x2 tile. Loads one
   * register set (vs0/vs1, vs8/vs9) and writes its four products into
   * vs32-vs35, overwriting whatever the accumulators held.
   * AO and BO each advance by 8 bytes. Overwrites T1.
   */
  3794. #if defined(_AIX)
  3795. define(`KERNEL2x2_SUBI1', `
  3796. #else
  3797. .macro KERNEL2x2_SUBI1
  3798. #endif
  3799. lxsspx vs0, o0, AO
  3800. lxsspx vs1, o4, AO
  3801. addi AO, AO, 8
  3802. mr T1, BO
  3803. lxsspx vs8, o0, T1
  3804. lxsspx vs9, o4, T1
  3805. addi BO, BO, 8
  3806. xsmuldp vs32, vs0, vs8
  3807. xsmuldp vs33, vs1, vs8
  3808. xsmuldp vs34, vs0, vs9
  3809. xsmuldp vs35, vs1, vs9
  3810. #if defined(_AIX)
  3811. ')
  3812. #else
  3813. .endm
  3814. #endif
  /*
   * KERNEL2x2_SUB1: standalone multiply-accumulate step of the scalar 2x2 tile.
   * Loads one register set (vs0/vs1, vs8/vs9) and adds its four products to
   * vs32-vs35. AO and BO each advance by 8 bytes. Overwrites T1.
   */
  3815. #if defined(_AIX)
  3816. define(`KERNEL2x2_SUB1', `
  3817. #else
  3818. .macro KERNEL2x2_SUB1
  3819. #endif
  3820. lxsspx vs0, o0, AO
  3821. lxsspx vs1, o4, AO
  3822. addi AO, AO, 8
  3823. mr T1, BO
  3824. lxsspx vs8, o0, T1
  3825. lxsspx vs9, o4, T1
  3826. addi BO, BO, 8
  3827. xsmaddadp vs32, vs0, vs8
  3828. xsmaddadp vs33, vs1, vs8
  3829. xsmaddadp vs34, vs0, vs9
  3830. xsmaddadp vs35, vs1, vs9
  3831. #if defined(_AIX)
  3832. ')
  3833. #else
  3834. .endm
  3835. #endif
  /*
   * SAVE2x2: scales the four scalar accumulators by alpha_r and writes them out
   * through CO as single-precision words (stxsspx). Without TRMMKERNEL the
   * existing memory values are loaded and the scaled accumulators are added to
   * them; with TRMMKERNEL memory is not read and the scaled accumulators are
   * stored directly. vs32/vs33 go to the line at CO, vs34/vs35 to the line at
   * CO + LDC. CO advances by 8 bytes. Overwrites T1, vs0 and vs1.
   * NOTE(review): LDC is presumably the byte stride between output lines - confirm.
   */
  3836. #if defined(_AIX)
  3837. define(`SAVE2x2', `
  3838. #else
  3839. .macro SAVE2x2
  3840. #endif
  /* T1 is the working copy of the output pointer. */
  3841. mr T1, CO
  3842. #ifndef TRMMKERNEL
  3843. lxsspx vs0, o0, T1
  3844. lxsspx vs1, o4, T1
  3845. #endif
  3846. #ifdef TRMMKERNEL
  3847. xsmuldp vs0, vs32, alpha_r
  3848. xsmuldp vs1, vs33, alpha_r
  3849. #else
  3850. xsmaddadp vs0, vs32, alpha_r
  3851. xsmaddadp vs1, vs33, alpha_r
  3852. #endif
  3853. stxsspx vs0, o0, T1
  3854. stxsspx vs1, o4, T1
  /* step to the second output line */
  3855. add T1, T1, LDC
  3856. #ifndef TRMMKERNEL
  3857. lxsspx vs0, o0, T1
  3858. lxsspx vs1, o4, T1
  3859. #endif
  3860. #ifdef TRMMKERNEL
  3861. xsmuldp vs0, vs34, alpha_r
  3862. xsmuldp vs1, vs35, alpha_r
  3863. #else
  3864. xsmaddadp vs0, vs34, alpha_r
  3865. xsmaddadp vs1, vs35, alpha_r
  3866. #endif
  3867. stxsspx vs0, o0, T1
  3868. stxsspx vs1, o4, T1
  /* T1 is not read again inside this macro after this update. */
  3869. add T1, T1, LDC
  3870. addi CO, CO, 8
  3871. #if defined(_AIX)
  3872. ')
  3873. #else
  3874. .endm
  3875. #endif
  3876. /**********************************************************************************************
  3877. * Macros for N=2 and M=1
  3878. **********************************************************************************************/
  /*
   * LOAD2x1_1: preloads the first register set of the scalar 2x1 tile without
   * doing any arithmetic: one single-precision word through AO into vs0 and two
   * through BO (via the copy in T1) into vs8/vs9.
   * AO advances by 4 bytes, BO by 8 bytes. Overwrites T1.
   */
  3879. #if defined(_AIX)
  3880. define(`LOAD2x1_1', `
  3881. #else
  3882. .macro LOAD2x1_1
  3883. #endif
  3884. lxsspx vs0, o0, AO
  3885. addi AO, AO, 4
  3886. mr T1, BO
  3887. lxsspx vs8, o0, T1
  3888. lxsspx vs9, o4, T1
  3889. addi BO, BO, 8
  3890. #if defined(_AIX)
  3891. ')
  3892. #else
  3893. .endm
  3894. #endif
  /*
   * KERNEL2x1_I1: initial step of the scalar 2x1 tile. Loads the second register
   * set (vs4, vs16/vs17) and writes plain products of the first set (vs0, vs8/vs9)
   * into vs32/vs33, overwriting the accumulators. Reads the first set without
   * loading it, so it must already be set.
   * AO advances by 4 bytes, BO by 8 bytes. Overwrites T1.
   */
  3895. #if defined(_AIX)
  3896. define(`KERNEL2x1_I1', `
  3897. #else
  3898. .macro KERNEL2x1_I1
  3899. #endif
  3900. lxsspx vs4, o0, AO
  3901. addi AO, AO, 4
  3902. mr T1, BO
  3903. lxsspx vs16, o0, T1
  3904. lxsspx vs17, o4, T1
  3905. addi BO, BO, 8
  3906. xsmuldp vs32, vs0, vs8
  3907. xsmuldp vs33, vs0, vs9
  3908. #if defined(_AIX)
  3909. ')
  3910. #else
  3911. .endm
  3912. #endif
  /*
   * KERNEL2x1_1: loads the second register set (vs4, vs16/vs17) while
   * accumulating the products of the first set (vs0, vs8/vs9) into vs32/vs33.
   * AO advances by 4 bytes, BO by 8 bytes. Overwrites T1.
   */
  3913. #if defined(_AIX)
  3914. define(`KERNEL2x1_1', `
  3915. #else
  3916. .macro KERNEL2x1_1
  3917. #endif
  3918. lxsspx vs4, o0, AO
  3919. addi AO, AO, 4
  3920. mr T1, BO
  3921. lxsspx vs16, o0, T1
  3922. lxsspx vs17, o4, T1
  3923. addi BO, BO, 8
  3924. xsmaddadp vs32, vs0, vs8
  3925. xsmaddadp vs33, vs0, vs9
  3926. #if defined(_AIX)
  3927. ')
  3928. #else
  3929. .endm
  3930. #endif
  /*
   * KERNEL2x1_2: mirror of the _1 step. Loads the first register set
   * (vs0, vs8/vs9) while accumulating the products of the second set
   * (vs4, vs16/vs17) into vs32/vs33.
   * AO advances by 4 bytes, BO by 8 bytes. Overwrites T1.
   */
  3931. #if defined(_AIX)
  3932. define(`KERNEL2x1_2', `
  3933. #else
  3934. .macro KERNEL2x1_2
  3935. #endif
  3936. lxsspx vs0, o0, AO
  3937. addi AO, AO, 4
  3938. mr T1, BO
  3939. lxsspx vs8, o0, T1
  3940. lxsspx vs9, o4, T1
  3941. addi BO, BO, 8
  3942. xsmaddadp vs32, vs4, vs16
  3943. xsmaddadp vs33, vs4, vs17
  3944. #if defined(_AIX)
  3945. ')
  3946. #else
  3947. .endm
  3948. #endif
  /*
   * KERNEL2x1_E2: final step of the alternating sequence. Accumulates the
   * products of the second register set (vs4, vs16/vs17) into vs32/vs33
   * and performs no loads and no pointer updates.
   */
  3949. #if defined(_AIX)
  3950. define(`KERNEL2x1_E2', `
  3951. #else
  3952. .macro KERNEL2x1_E2
  3953. #endif
  3954. xsmaddadp vs32, vs4, vs16
  3955. xsmaddadp vs33, vs4, vs17
  3956. #if defined(_AIX)
  3957. ')
  3958. #else
  3959. .endm
  3960. #endif
  /*
   * KERNEL2x1_SUBI1: standalone initial step of the scalar 2x1 tile. Loads one
   * register set (vs0, vs8/vs9) and writes its two products into vs32/vs33,
   * overwriting whatever the accumulators held.
   * AO advances by 4 bytes, BO by 8 bytes. Overwrites T1.
   */
  3961. #if defined(_AIX)
  3962. define(`KERNEL2x1_SUBI1', `
  3963. #else
  3964. .macro KERNEL2x1_SUBI1
  3965. #endif
  3966. lxsspx vs0, o0, AO
  3967. addi AO, AO, 4
  3968. mr T1, BO
  3969. lxsspx vs8, o0, T1
  3970. lxsspx vs9, o4, T1
  3971. addi BO, BO, 8
  3972. xsmuldp vs32, vs0, vs8
  3973. xsmuldp vs33, vs0, vs9
  3974. #if defined(_AIX)
  3975. ')
  3976. #else
  3977. .endm
  3978. #endif
  /*
   * KERNEL2x1_SUB1: standalone multiply-accumulate step of the scalar 2x1 tile.
   * Loads one register set (vs0, vs8/vs9) and adds its two products to vs32/vs33.
   * AO advances by 4 bytes, BO by 8 bytes. Overwrites T1.
   */
  3979. #if defined(_AIX)
  3980. define(`KERNEL2x1_SUB1', `
  3981. #else
  3982. .macro KERNEL2x1_SUB1
  3983. #endif
  3984. lxsspx vs0, o0, AO
  3985. addi AO, AO, 4
  3986. mr T1, BO
  3987. lxsspx vs8, o0, T1
  3988. lxsspx vs9, o4, T1
  3989. addi BO, BO, 8
  3990. xsmaddadp vs32, vs0, vs8
  3991. xsmaddadp vs33, vs0, vs9
  3992. #if defined(_AIX)
  3993. ')
  3994. #else
  3995. .endm
  3996. #endif
  /*
   * SAVE2x1: scales the two scalar accumulators by alpha_r and writes them out
   * through CO as single-precision words. Without TRMMKERNEL the existing memory
   * value is loaded and the scaled accumulator is added to it; with TRMMKERNEL
   * memory is not read and the scaled accumulator is stored directly.
   * vs32 goes to the word at CO, vs33 to the word at CO + LDC.
   * CO advances by 4 bytes. Overwrites T1 and vs0.
   * NOTE(review): LDC is presumably the byte stride between output lines - confirm.
   */
  3997. #if defined(_AIX)
  3998. define(`SAVE2x1', `
  3999. #else
  4000. .macro SAVE2x1
  4001. #endif
  /* T1 is the working copy of the output pointer. */
  4002. mr T1, CO
  4003. #ifndef TRMMKERNEL
  4004. lxsspx vs0, o0, T1
  4005. #endif
  4006. #ifdef TRMMKERNEL
  4007. xsmuldp vs0, vs32, alpha_r
  4008. #else
  4009. xsmaddadp vs0, vs32, alpha_r
  4010. #endif
  4011. stxsspx vs0, o0, T1
  /* step to the second output line */
  4012. add T1, T1, LDC
  4013. #ifndef TRMMKERNEL
  4014. lxsspx vs0, o0, T1
  4015. #endif
  4016. #ifdef TRMMKERNEL
  4017. xsmuldp vs0, vs33, alpha_r
  4018. #else
  4019. xsmaddadp vs0, vs33, alpha_r
  4020. #endif
  4021. stxsspx vs0, o0, T1
  /* T1 is not read again inside this macro after this update. */
  4022. add T1, T1, LDC
  4023. addi CO, CO, 4
  4024. #if defined(_AIX)
  4025. ')
  4026. #else
  4027. .endm
  4028. #endif
  4029. /**********************************************************************************************
  4030. * Macros for N=1 and M=16
  4031. **********************************************************************************************/
  /*
   * LOAD1x16_1: preloads the first register set of the 1x16 tile without doing
   * any arithmetic: four 4-word vectors through AO into vs0-vs3, and word
   * element 0 of the BO vector splatted into vs8.
   * AO advances by 64 bytes, BO by 4 bytes.
   * o0/o16/o32/o48 are defined outside this chunk; presumably offsets 0/16/32/48 - confirm.
   */
  4032. #if defined(_AIX)
  4033. define(`LOAD1x16_1', `
  4034. #else
  4035. .macro LOAD1x16_1
  4036. #endif
  4037. lxvw4x vs0, o0, AO
  4038. lxvw4x vs1, o16, AO
  4039. lxvw4x vs2, o32, AO
  4040. lxvw4x vs3, o48, AO
  4041. addi AO, AO, 64
  /* 16-byte load through BO; only word element 0 is used, and BO moves by 4. */
  /* NOTE(review): presumably the bytes past the consumed 4 are always readable - confirm. */
  4042. lxvw4x vs28, o0, BO
  4043. xxspltw vs8, vs28, 0
  4044. addi BO, BO, 4
  4045. #if defined(_AIX)
  4046. ')
  4047. #else
  4048. .endm
  4049. #endif
  /*
   * KERNEL1x16_I1: initial step of the 1x16 tile. Loads the second register set
   * (vs4-vs7, vs16) and writes plain products of the first set (vs0-vs3, vs8)
   * into vs32-vs35, overwriting the accumulators. Reads the first set without
   * loading it, so it must already be set.
   * AO advances by 64 bytes, BO by 4 bytes.
   */
  4050. #if defined(_AIX)
  4051. define(`KERNEL1x16_I1', `
  4052. #else
  4053. .macro KERNEL1x16_I1
  4054. #endif
  4055. lxvw4x vs4, o0, AO
  4056. lxvw4x vs5, o16, AO
  4057. lxvw4x vs6, o32, AO
  4058. lxvw4x vs7, o48, AO
  4059. addi AO, AO, 64
  /* 16-byte load through BO; only word element 0 is used, and BO moves by 4. */
  4060. lxvw4x vs28, o0, BO
  4061. xxspltw vs16, vs28, 0
  4062. addi BO, BO, 4
  /* multiply only: starts the accumulators from the first-set products */
  4063. xvmulsp vs32, vs0, vs8
  4064. xvmulsp vs33, vs1, vs8
  4065. xvmulsp vs34, vs2, vs8
  4066. xvmulsp vs35, vs3, vs8
  4067. #if defined(_AIX)
  4068. ')
  4069. #else
  4070. .endm
  4071. #endif
  /*
   * KERNEL1x16_1: loads the second register set (vs4-vs7, vs16) while
   * accumulating the products of the first set (vs0-vs3, vs8) into vs32-vs35.
   * AO advances by 64 bytes, BO by 4 bytes.
   */
  4072. #if defined(_AIX)
  4073. define(`KERNEL1x16_1', `
  4074. #else
  4075. .macro KERNEL1x16_1
  4076. #endif
  4077. lxvw4x vs4, o0, AO
  4078. lxvw4x vs5, o16, AO
  4079. lxvw4x vs6, o32, AO
  4080. lxvw4x vs7, o48, AO
  4081. addi AO, AO, 64
  /* 16-byte load through BO; only word element 0 is used, and BO moves by 4. */
  4082. lxvw4x vs28, o0, BO
  4083. xxspltw vs16, vs28, 0
  4084. addi BO, BO, 4
  4085. xvmaddasp vs32, vs0, vs8
  4086. xvmaddasp vs33, vs1, vs8
  4087. xvmaddasp vs34, vs2, vs8
  4088. xvmaddasp vs35, vs3, vs8
  4089. #if defined(_AIX)
  4090. ')
  4091. #else
  4092. .endm
  4093. #endif
  /*
   * KERNEL1x16_2: mirror of the _1 step. Loads the first register set
   * (vs0-vs3, vs8) while accumulating the products of the second set
   * (vs4-vs7, vs16) into vs32-vs35. AO advances by 64 bytes, BO by 4 bytes.
   */
  4094. #if defined(_AIX)
  4095. define(`KERNEL1x16_2', `
  4096. #else
  4097. .macro KERNEL1x16_2
  4098. #endif
  4099. lxvw4x vs0, o0, AO
  4100. lxvw4x vs1, o16, AO
  4101. lxvw4x vs2, o32, AO
  4102. lxvw4x vs3, o48, AO
  4103. addi AO, AO, 64
  /* 16-byte load through BO; only word element 0 is used, and BO moves by 4. */
  4104. lxvw4x vs28, o0, BO
  4105. xxspltw vs8, vs28, 0
  4106. addi BO, BO, 4
  4107. xvmaddasp vs32, vs4, vs16
  4108. xvmaddasp vs33, vs5, vs16
  4109. xvmaddasp vs34, vs6, vs16
  4110. xvmaddasp vs35, vs7, vs16
  4111. #if defined(_AIX)
  4112. ')
  4113. #else
  4114. .endm
  4115. #endif
  /*
   * KERNEL1x16_E2: final step of the alternating sequence. Accumulates the
   * products of the second register set (vs4-vs7, vs16) into vs32-vs35
   * and performs no loads and no pointer updates.
   */
  4116. #if defined(_AIX)
  4117. define(`KERNEL1x16_E2', `
  4118. #else
  4119. .macro KERNEL1x16_E2
  4120. #endif
  4121. xvmaddasp vs32, vs4, vs16
  4122. xvmaddasp vs33, vs5, vs16
  4123. xvmaddasp vs34, vs6, vs16
  4124. xvmaddasp vs35, vs7, vs16
  4125. #if defined(_AIX)
  4126. ')
  4127. #else
  4128. .endm
  4129. #endif
  /*
   * KERNEL1x16_SUBI1: standalone initial step of the 1x16 tile. Loads one
   * register set (vs0-vs3, vs8) and writes its products into vs32-vs35,
   * overwriting whatever the accumulators held.
   * AO advances by 64 bytes, BO by 4 bytes.
   */
  4130. #if defined(_AIX)
  4131. define(`KERNEL1x16_SUBI1', `
  4132. #else
  4133. .macro KERNEL1x16_SUBI1
  4134. #endif
  4135. lxvw4x vs0, o0, AO
  4136. lxvw4x vs1, o16, AO
  4137. lxvw4x vs2, o32, AO
  4138. lxvw4x vs3, o48, AO
  4139. addi AO, AO, 64
  /* 16-byte load through BO; only word element 0 is used, and BO moves by 4. */
  4140. lxvw4x vs28, o0, BO
  4141. xxspltw vs8, vs28, 0
  4142. addi BO, BO, 4
  4143. xvmulsp vs32, vs0, vs8
  4144. xvmulsp vs33, vs1, vs8
  4145. xvmulsp vs34, vs2, vs8
  4146. xvmulsp vs35, vs3, vs8
  4147. #if defined(_AIX)
  4148. ')
  4149. #else
  4150. .endm
  4151. #endif
  /*
   * KERNEL1x16_SUB1: standalone multiply-accumulate step of the 1x16 tile.
   * Loads one register set (vs0-vs3, vs8) and adds its products to vs32-vs35.
   * AO advances by 64 bytes, BO by 4 bytes.
   */
  4152. #if defined(_AIX)
  4153. define(`KERNEL1x16_SUB1', `
  4154. #else
  4155. .macro KERNEL1x16_SUB1
  4156. #endif
  4157. lxvw4x vs0, o0, AO
  4158. lxvw4x vs1, o16, AO
  4159. lxvw4x vs2, o32, AO
  4160. lxvw4x vs3, o48, AO
  4161. addi AO, AO, 64
  /* 16-byte load through BO; only word element 0 is used, and BO moves by 4. */
  4162. lxvw4x vs28, o0, BO
  4163. xxspltw vs8, vs28, 0
  4164. addi BO, BO, 4
  4165. xvmaddasp vs32, vs0, vs8
  4166. xvmaddasp vs33, vs1, vs8
  4167. xvmaddasp vs34, vs2, vs8
  4168. xvmaddasp vs35, vs3, vs8
  4169. #if defined(_AIX)
  4170. ')
  4171. #else
  4172. .endm
  4173. #endif
  /*
   * SAVE1x16: scales the four 1x16 accumulators (vs32-vs35) by alpha_vr and
   * writes them to the 64 bytes at CO. Without TRMMKERNEL the existing memory
   * vectors are loaded and the scaled accumulators are added to them; with
   * TRMMKERNEL memory is not read and the scaled accumulators are stored
   * directly. CO advances by 64 bytes. Overwrites T1 and vs0-vs3.
   */
  4174. #if defined(_AIX)
  4175. define(`SAVE1x16', `
  4176. #else
  4177. .macro SAVE1x16
  4178. #endif
  /* T1 is the working copy of the output pointer. */
  4179. mr T1, CO
  4180. #ifndef TRMMKERNEL
  4181. lxvw4x vs0, o0, T1
  4182. lxvw4x vs1, o16, T1
  4183. lxvw4x vs2, o32, T1
  4184. lxvw4x vs3, o48, T1
  4185. #endif
  4186. #ifdef TRMMKERNEL
  4187. xvmulsp vs0, vs32, alpha_vr
  4188. xvmulsp vs1, vs33, alpha_vr
  4189. xvmulsp vs2, vs34, alpha_vr
  4190. xvmulsp vs3, vs35, alpha_vr
  4191. #else
  4192. xvmaddasp vs0, vs32, alpha_vr
  4193. xvmaddasp vs1, vs33, alpha_vr
  4194. xvmaddasp vs2, vs34, alpha_vr
  4195. xvmaddasp vs3, vs35, alpha_vr
  4196. #endif
  4197. stxvw4x vs0, o0, T1
  4198. stxvw4x vs1, o16, T1
  4199. stxvw4x vs2, o32, T1
  4200. stxvw4x vs3, o48, T1
  /* T1 is not read again inside this macro after this update. */
  4201. add T1, T1, LDC
  4202. addi CO, CO, 64
  4203. #if defined(_AIX)
  4204. ')
  4205. #else
  4206. .endm
  4207. #endif
  4208. /**********************************************************************************************
  4209. * Macros for N=1 and M=8
  4210. **********************************************************************************************/
  /*
   * LOAD1x8_1: preloads the first register set of the 1x8 tile without doing
   * any arithmetic: two 4-word vectors through AO into vs0/vs1, and word
   * element 0 of the BO vector splatted into vs8.
   * AO advances by 32 bytes, BO by 4 bytes.
   */
  4211. #if defined(_AIX)
  4212. define(`LOAD1x8_1', `
  4213. #else
  4214. .macro LOAD1x8_1
  4215. #endif
  4216. lxvw4x vs0, o0, AO
  4217. lxvw4x vs1, o16, AO
  4218. addi AO, AO, 32
  /* 16-byte load through BO; only word element 0 is used, and BO moves by 4. */
  4219. lxvw4x vs28, o0, BO
  4220. xxspltw vs8, vs28, 0
  4221. addi BO, BO, 4
  4222. #if defined(_AIX)
  4223. ')
  4224. #else
  4225. .endm
  4226. #endif
  /*
   * KERNEL1x8_I1: initial step of the 1x8 tile. Loads the second register set
   * (vs4/vs5, vs16) and writes plain products of the first set (vs0/vs1, vs8)
   * into vs32/vs33, overwriting the accumulators. Reads the first set without
   * loading it, so it must already be set.
   * AO advances by 32 bytes, BO by 4 bytes.
   */
  4227. #if defined(_AIX)
  4228. define(`KERNEL1x8_I1', `
  4229. #else
  4230. .macro KERNEL1x8_I1
  4231. #endif
  4232. lxvw4x vs4, o0, AO
  4233. lxvw4x vs5, o16, AO
  4234. addi AO, AO, 32
  /* 16-byte load through BO; only word element 0 is used, and BO moves by 4. */
  4235. lxvw4x vs28, o0, BO
  4236. xxspltw vs16, vs28, 0
  4237. addi BO, BO, 4
  4238. xvmulsp vs32, vs0, vs8
  4239. xvmulsp vs33, vs1, vs8
  4240. #if defined(_AIX)
  4241. ')
  4242. #else
  4243. .endm
  4244. #endif
  /*
   * KERNEL1x8_1: loads the second register set (vs4/vs5, vs16) while
   * accumulating the products of the first set (vs0/vs1, vs8) into vs32/vs33.
   * AO advances by 32 bytes, BO by 4 bytes.
   */
  4245. #if defined(_AIX)
  4246. define(`KERNEL1x8_1', `
  4247. #else
  4248. .macro KERNEL1x8_1
  4249. #endif
  4250. lxvw4x vs4, o0, AO
  4251. lxvw4x vs5, o16, AO
  4252. addi AO, AO, 32
  /* 16-byte load through BO; only word element 0 is used, and BO moves by 4. */
  4253. lxvw4x vs28, o0, BO
  4254. xxspltw vs16, vs28, 0
  4255. addi BO, BO, 4
  4256. xvmaddasp vs32, vs0, vs8
  4257. xvmaddasp vs33, vs1, vs8
  4258. #if defined(_AIX)
  4259. ')
  4260. #else
  4261. .endm
  4262. #endif
  /*
   * KERNEL1x8_2: mirror of the _1 step. Loads the first register set
   * (vs0/vs1, vs8) while accumulating the products of the second set
   * (vs4/vs5, vs16) into vs32/vs33. AO advances by 32 bytes, BO by 4 bytes.
   */
  4263. #if defined(_AIX)
  4264. define(`KERNEL1x8_2', `
  4265. #else
  4266. .macro KERNEL1x8_2
  4267. #endif
  4268. lxvw4x vs0, o0, AO
  4269. lxvw4x vs1, o16, AO
  4270. addi AO, AO, 32
  /* 16-byte load through BO; only word element 0 is used, and BO moves by 4. */
  4271. lxvw4x vs28, o0, BO
  4272. xxspltw vs8, vs28, 0
  4273. addi BO, BO, 4
  4274. xvmaddasp vs32, vs4, vs16
  4275. xvmaddasp vs33, vs5, vs16
  4276. #if defined(_AIX)
  4277. ')
  4278. #else
  4279. .endm
  4280. #endif
  /*
   * KERNEL1x8_E2: final step of the alternating sequence. Accumulates the
   * products of the second register set (vs4/vs5, vs16) into vs32/vs33
   * and performs no loads and no pointer updates.
   */
  4281. #if defined(_AIX)
  4282. define(`KERNEL1x8_E2', `
  4283. #else
  4284. .macro KERNEL1x8_E2
  4285. #endif
  4286. xvmaddasp vs32, vs4, vs16
  4287. xvmaddasp vs33, vs5, vs16
  4288. #if defined(_AIX)
  4289. ')
  4290. #else
  4291. .endm
  4292. #endif
  /*
   * KERNEL1x8_SUBI1: standalone initial step of the 1x8 tile. Loads one
   * register set (vs0/vs1, vs8) and writes its products into vs32/vs33,
   * overwriting whatever the accumulators held.
   * AO advances by 32 bytes, BO by 4 bytes.
   */
  4293. #if defined(_AIX)
  4294. define(`KERNEL1x8_SUBI1', `
  4295. #else
  4296. .macro KERNEL1x8_SUBI1
  4297. #endif
  4298. lxvw4x vs0, o0, AO
  4299. lxvw4x vs1, o16, AO
  4300. addi AO, AO, 32
  /* 16-byte load through BO; only word element 0 is used, and BO moves by 4. */
  4301. lxvw4x vs28, o0, BO
  4302. xxspltw vs8, vs28, 0
  4303. addi BO, BO, 4
  4304. xvmulsp vs32, vs0, vs8
  4305. xvmulsp vs33, vs1, vs8
  4306. #if defined(_AIX)
  4307. ')
  4308. #else
  4309. .endm
  4310. #endif
  /*
   * KERNEL1x8_SUB1: standalone multiply-accumulate step of the 1x8 tile.
   * Loads one register set (vs0/vs1, vs8) and adds its products to vs32/vs33.
   * AO advances by 32 bytes, BO by 4 bytes.
   */
  4311. #if defined(_AIX)
  4312. define(`KERNEL1x8_SUB1', `
  4313. #else
  4314. .macro KERNEL1x8_SUB1
  4315. #endif
  4316. lxvw4x vs0, o0, AO
  4317. lxvw4x vs1, o16, AO
  4318. addi AO, AO, 32
  /* 16-byte load through BO; only word element 0 is used, and BO moves by 4. */
  4319. lxvw4x vs28, o0, BO
  4320. xxspltw vs8, vs28, 0
  4321. addi BO, BO, 4
  4322. xvmaddasp vs32, vs0, vs8
  4323. xvmaddasp vs33, vs1, vs8
  4324. #if defined(_AIX)
  4325. ')
  4326. #else
  4327. .endm
  4328. #endif
  /*
   * SAVE1x8: scales the two 1x8 accumulators (vs32/vs33) by alpha_vr and writes
   * them to the 32 bytes at CO. Without TRMMKERNEL the existing memory vectors
   * are loaded and the scaled accumulators are added to them; with TRMMKERNEL
   * memory is not read and the scaled accumulators are stored directly.
   * CO advances by 32 bytes. Overwrites T1, vs0 and vs1.
   */
  4329. #if defined(_AIX)
  4330. define(`SAVE1x8', `
  4331. #else
  4332. .macro SAVE1x8
  4333. #endif
  /* T1 is the working copy of the output pointer. */
  4334. mr T1, CO
  4335. #ifndef TRMMKERNEL
  4336. lxvw4x vs0, o0, T1
  4337. lxvw4x vs1, o16, T1
  4338. #endif
  4339. #ifdef TRMMKERNEL
  4340. xvmulsp vs0, vs32, alpha_vr
  4341. xvmulsp vs1, vs33, alpha_vr
  4342. #else
  4343. xvmaddasp vs0, vs32, alpha_vr
  4344. xvmaddasp vs1, vs33, alpha_vr
  4345. #endif
  4346. stxvw4x vs0, o0, T1
  4347. stxvw4x vs1, o16, T1
  /* T1 is not read again inside this macro after this update. */
  4348. add T1, T1, LDC
  4349. addi CO, CO, 32
  4350. #if defined(_AIX)
  4351. ')
  4352. #else
  4353. .endm
  4354. #endif
  4355. /**********************************************************************************************
  4356. * Macros for N=1 and M=4
  4357. **********************************************************************************************/
  /*
   * LOAD1x4_1: preloads the first register set of the 1x4 tile without doing
   * any arithmetic: one 4-word vector through AO into vs0, and word element 0
   * of the BO vector splatted into vs8.
   * AO advances by 16 bytes, BO by 4 bytes.
   */
  4358. #if defined(_AIX)
  4359. define(`LOAD1x4_1', `
  4360. #else
  4361. .macro LOAD1x4_1
  4362. #endif
  4363. lxvw4x vs0, o0, AO
  4364. addi AO, AO, 16
  /* 16-byte load through BO; only word element 0 is used, and BO moves by 4. */
  4365. lxvw4x vs28, o0, BO
  4366. xxspltw vs8, vs28, 0
  4367. addi BO, BO, 4
  4368. #if defined(_AIX)
  4369. ')
  4370. #else
  4371. .endm
  4372. #endif
  /*
   * KERNEL1x4_I1: initial step of the 1x4 tile. Loads the second register set
   * (vs4, vs16) and writes the plain product of the first set (vs0, vs8) into
   * vs32, overwriting the accumulator. Reads the first set without loading it,
   * so it must already be set. AO advances by 16 bytes, BO by 4 bytes.
   */
  4373. #if defined(_AIX)
  4374. define(`KERNEL1x4_I1', `
  4375. #else
  4376. .macro KERNEL1x4_I1
  4377. #endif
  4378. lxvw4x vs4, o0, AO
  4379. addi AO, AO, 16
  /* 16-byte load through BO; only word element 0 is used, and BO moves by 4. */
  4380. lxvw4x vs28, o0, BO
  4381. xxspltw vs16, vs28, 0
  4382. addi BO, BO, 4
  4383. xvmulsp vs32, vs0, vs8
  4384. #if defined(_AIX)
  4385. ')
  4386. #else
  4387. .endm
  4388. #endif
  /*
   * KERNEL1x4_1: loads the second register set (vs4, vs16) while accumulating
   * the product of the first set (vs0, vs8) into vs32.
   * AO advances by 16 bytes, BO by 4 bytes.
   */
  4389. #if defined(_AIX)
  4390. define(`KERNEL1x4_1', `
  4391. #else
  4392. .macro KERNEL1x4_1
  4393. #endif
  4394. lxvw4x vs4, o0, AO
  4395. addi AO, AO, 16
  /* 16-byte load through BO; only word element 0 is used, and BO moves by 4. */
  4396. lxvw4x vs28, o0, BO
  4397. xxspltw vs16, vs28, 0
  4398. addi BO, BO, 4
  4399. xvmaddasp vs32, vs0, vs8
  4400. #if defined(_AIX)
  4401. ')
  4402. #else
  4403. .endm
  4404. #endif
  /*
   * KERNEL1x4_2: mirror of the _1 step. Loads the first register set (vs0, vs8)
   * while accumulating the product of the second set (vs4, vs16) into vs32.
   * AO advances by 16 bytes, BO by 4 bytes.
   */
  4405. #if defined(_AIX)
  4406. define(`KERNEL1x4_2', `
  4407. #else
  4408. .macro KERNEL1x4_2
  4409. #endif
  4410. lxvw4x vs0, o0, AO
  4411. addi AO, AO, 16
  /* 16-byte load through BO; only word element 0 is used, and BO moves by 4. */
  4412. lxvw4x vs28, o0, BO
  4413. xxspltw vs8, vs28, 0
  4414. addi BO, BO, 4
  4415. xvmaddasp vs32, vs4, vs16
  4416. #if defined(_AIX)
  4417. ')
  4418. #else
  4419. .endm
  4420. #endif
  /*
   * KERNEL1x4_E2: final step of the alternating sequence. Accumulates the
   * product of the second register set (vs4, vs16) into vs32 and performs
   * no loads and no pointer updates.
   */
  4421. #if defined(_AIX)
  4422. define(`KERNEL1x4_E2', `
  4423. #else
  4424. .macro KERNEL1x4_E2
  4425. #endif
  4426. xvmaddasp vs32, vs4, vs16
  4427. #if defined(_AIX)
  4428. ')
  4429. #else
  4430. .endm
  4431. #endif
  /*
   * KERNEL1x4_SUBI1: standalone initial step of the 1x4 tile. Loads one register
   * set (vs0, vs8) and writes its product into vs32, overwriting whatever the
   * accumulator held. AO advances by 16 bytes, BO by 4 bytes.
   */
  4432. #if defined(_AIX)
  4433. define(`KERNEL1x4_SUBI1', `
  4434. #else
  4435. .macro KERNEL1x4_SUBI1
  4436. #endif
  4437. lxvw4x vs0, o0, AO
  4438. addi AO, AO, 16
  /* 16-byte load through BO; only word element 0 is used, and BO moves by 4. */
  4439. lxvw4x vs28, o0, BO
  4440. xxspltw vs8, vs28, 0
  4441. addi BO, BO, 4
  4442. xvmulsp vs32, vs0, vs8
  4443. #if defined(_AIX)
  4444. ')
  4445. #else
  4446. .endm
  4447. #endif
  /*
   * KERNEL1x4_SUB1: standalone multiply-accumulate step of the 1x4 tile.
   * Loads one register set (vs0, vs8) and adds its product to vs32.
   * AO advances by 16 bytes, BO by 4 bytes.
   */
  4448. #if defined(_AIX)
  4449. define(`KERNEL1x4_SUB1', `
  4450. #else
  4451. .macro KERNEL1x4_SUB1
  4452. #endif
  4453. lxvw4x vs0, o0, AO
  4454. addi AO, AO, 16
  /* 16-byte load through BO; only word element 0 is used, and BO moves by 4. */
  4455. lxvw4x vs28, o0, BO
  4456. xxspltw vs8, vs28, 0
  4457. addi BO, BO, 4
  4458. xvmaddasp vs32, vs0, vs8
  4459. #if defined(_AIX)
  4460. ')
  4461. #else
  4462. .endm
  4463. #endif
  /*
   * SAVE1x4: scales the 1x4 accumulator vs32 by alpha_vr and writes it to the
   * 16 bytes at CO. Without TRMMKERNEL the existing memory vector is loaded and
   * the scaled accumulator is added to it; with TRMMKERNEL memory is not read
   * and the scaled accumulator is stored directly.
   * CO advances by 16 bytes. Overwrites T1 and vs0.
   */
  4464. #if defined(_AIX)
  4465. define(`SAVE1x4', `
  4466. #else
  4467. .macro SAVE1x4
  4468. #endif
  /* T1 is the working copy of the output pointer. */
  4469. mr T1, CO
  4470. #ifndef TRMMKERNEL
  4471. lxvw4x vs0, o0, T1
  4472. #endif
  4473. #ifdef TRMMKERNEL
  4474. xvmulsp vs0, vs32, alpha_vr
  4475. #else
  4476. xvmaddasp vs0, vs32, alpha_vr
  4477. #endif
  4478. stxvw4x vs0, o0, T1
  /* T1 is not read again inside this macro after this update. */
  4479. add T1, T1, LDC
  4480. addi CO, CO, 16
  4481. #if defined(_AIX)
  4482. ')
  4483. #else
  4484. .endm
  4485. #endif
  4486. /**********************************************************************************************
  4487. * Macros for N=1 and M=2
  4488. **********************************************************************************************/
  /*
   * LOAD1x2_1: preloads the first register set of the scalar 1x2 tile without
   * doing any arithmetic: two single-precision words through AO into vs0/vs1
   * and one through BO (via the copy in T1) into vs8.
   * AO advances by 8 bytes, BO by 4 bytes. Overwrites T1.
   * o4 is defined outside this chunk; presumably offset 4 - confirm.
   */
  4489. #if defined(_AIX)
  4490. define(`LOAD1x2_1', `
  4491. #else
  4492. .macro LOAD1x2_1
  4493. #endif
  4494. lxsspx vs0, o0, AO
  4495. lxsspx vs1, o4, AO
  4496. addi AO, AO, 8
  4497. mr T1, BO
  4498. lxsspx vs8, o0, T1
  4499. addi BO, BO, 4
  4500. #if defined(_AIX)
  4501. ')
  4502. #else
  4503. .endm
  4504. #endif
  /*
   * KERNEL1x2_I1: initial step of the scalar 1x2 tile. Loads the second register
   * set (vs4/vs5, vs16) and writes plain products of the first set (vs0/vs1, vs8)
   * into vs32/vs33, overwriting the accumulators. Reads the first set without
   * loading it, so it must already be set.
   * AO advances by 8 bytes, BO by 4 bytes. Overwrites T1.
   */
  4505. #if defined(_AIX)
  4506. define(`KERNEL1x2_I1', `
  4507. #else
  4508. .macro KERNEL1x2_I1
  4509. #endif
  4510. lxsspx vs4, o0, AO
  4511. lxsspx vs5, o4, AO
  4512. addi AO, AO, 8
  4513. mr T1, BO
  4514. lxsspx vs16, o0, T1
  4515. addi BO, BO, 4
  4516. xsmuldp vs32, vs0, vs8
  4517. xsmuldp vs33, vs1, vs8
  4518. #if defined(_AIX)
  4519. ')
  4520. #else
  4521. .endm
  4522. #endif
  /*
   * KERNEL1x2_1: loads the second register set (vs4/vs5, vs16) while
   * accumulating the products of the first set (vs0/vs1, vs8) into vs32/vs33.
   * AO advances by 8 bytes, BO by 4 bytes. Overwrites T1.
   */
  4523. #if defined(_AIX)
  4524. define(`KERNEL1x2_1', `
  4525. #else
  4526. .macro KERNEL1x2_1
  4527. #endif
  4528. lxsspx vs4, o0, AO
  4529. lxsspx vs5, o4, AO
  4530. addi AO, AO, 8
  4531. mr T1, BO
  4532. lxsspx vs16, o0, T1
  4533. addi BO, BO, 4
  4534. xsmaddadp vs32, vs0, vs8
  4535. xsmaddadp vs33, vs1, vs8
  4536. #if defined(_AIX)
  4537. ')
  4538. #else
  4539. .endm
  4540. #endif
  /*
   * KERNEL1x2_2: mirror of the _1 step. Loads the first register set
   * (vs0/vs1, vs8) while accumulating the products of the second set
   * (vs4/vs5, vs16) into vs32/vs33.
   * AO advances by 8 bytes, BO by 4 bytes. Overwrites T1.
   */
  4541. #if defined(_AIX)
  4542. define(`KERNEL1x2_2', `
  4543. #else
  4544. .macro KERNEL1x2_2
  4545. #endif
  4546. lxsspx vs0, o0, AO
  4547. lxsspx vs1, o4, AO
  4548. addi AO, AO, 8
  4549. mr T1, BO
  4550. lxsspx vs8, o0, T1
  4551. addi BO, BO, 4
  4552. xsmaddadp vs32, vs4, vs16
  4553. xsmaddadp vs33, vs5, vs16
  4554. #if defined(_AIX)
  4555. ')
  4556. #else
  4557. .endm
  4558. #endif
  /*
   * KERNEL1x2_E2: final step of the alternating sequence. Accumulates the
   * products of the second register set (vs4/vs5, vs16) into vs32/vs33
   * and performs no loads and no pointer updates.
   */
  4559. #if defined(_AIX)
  4560. define(`KERNEL1x2_E2', `
  4561. #else
  4562. .macro KERNEL1x2_E2
  4563. #endif
  4564. xsmaddadp vs32, vs4, vs16
  4565. xsmaddadp vs33, vs5, vs16
  4566. #if defined(_AIX)
  4567. ')
  4568. #else
  4569. .endm
  4570. #endif
  /*
   * KERNEL1x2_SUBI1: standalone initial step of the scalar 1x2 tile. Loads one
   * register set (vs0/vs1, vs8) and writes its two products into vs32/vs33,
   * overwriting whatever the accumulators held.
   * AO advances by 8 bytes, BO by 4 bytes. Overwrites T1.
   */
  4571. #if defined(_AIX)
  4572. define(`KERNEL1x2_SUBI1', `
  4573. #else
  4574. .macro KERNEL1x2_SUBI1
  4575. #endif
  4576. lxsspx vs0, o0, AO
  4577. lxsspx vs1, o4, AO
  4578. addi AO, AO, 8
  4579. mr T1, BO
  4580. lxsspx vs8, o0, T1
  4581. addi BO, BO, 4
  4582. xsmuldp vs32, vs0, vs8
  4583. xsmuldp vs33, vs1, vs8
  4584. #if defined(_AIX)
  4585. ')
  4586. #else
  4587. .endm
  4588. #endif
  /*
   * KERNEL1x2_SUB1: standalone multiply-accumulate step of the scalar 1x2 tile.
   * Loads one register set (vs0/vs1, vs8) and adds its two products to vs32/vs33.
   * AO advances by 8 bytes, BO by 4 bytes. Overwrites T1.
   */
  4589. #if defined(_AIX)
  4590. define(`KERNEL1x2_SUB1', `
  4591. #else
  4592. .macro KERNEL1x2_SUB1
  4593. #endif
  4594. lxsspx vs0, o0, AO
  4595. lxsspx vs1, o4, AO
  4596. addi AO, AO, 8
  4597. mr T1, BO
  4598. lxsspx vs8, o0, T1
  4599. addi BO, BO, 4
  4600. xsmaddadp vs32, vs0, vs8
  4601. xsmaddadp vs33, vs1, vs8
  4602. #if defined(_AIX)
  4603. ')
  4604. #else
  4605. .endm
  4606. #endif
  /*
   * SAVE1x2: scales the two scalar accumulators (vs32/vs33) by alpha_r and
   * writes them as two single-precision words at CO. Without TRMMKERNEL the
   * existing memory values are loaded and the scaled accumulators are added to
   * them; with TRMMKERNEL memory is not read and the scaled accumulators are
   * stored directly. CO advances by 8 bytes. Overwrites T1, vs0 and vs1.
   */
  4607. #if defined(_AIX)
  4608. define(`SAVE1x2', `
  4609. #else
  4610. .macro SAVE1x2
  4611. #endif
  /* T1 is the working copy of the output pointer. */
  4612. mr T1, CO
  4613. #ifndef TRMMKERNEL
  4614. lxsspx vs0, o0, T1
  4615. lxsspx vs1, o4, T1
  4616. #endif
  4617. #ifdef TRMMKERNEL
  4618. xsmuldp vs0, vs32, alpha_r
  4619. xsmuldp vs1, vs33, alpha_r
  4620. #else
  4621. xsmaddadp vs0, vs32, alpha_r
  4622. xsmaddadp vs1, vs33, alpha_r
  4623. #endif
  4624. stxsspx vs0, o0, T1
  4625. stxsspx vs1, o4, T1
  /* T1 is not read again inside this macro after this update. */
  4626. add T1, T1, LDC
  4627. addi CO, CO, 8
  4628. #if defined(_AIX)
  4629. ')
  4630. #else
  4631. .endm
  4632. #endif
  4633. /**********************************************************************************************
  4634. * Macros for N=1 and M=1
  4635. **********************************************************************************************/
  /*
   * LOAD1x1_1: preloads the first register set of the scalar 1x1 tile without
   * doing any arithmetic: one single-precision word through AO into vs0 and
   * one through BO (via the copy in T1) into vs8.
   * AO and BO each advance by 4 bytes. Overwrites T1.
   */
  4636. #if defined(_AIX)
  4637. define(`LOAD1x1_1', `
  4638. #else
  4639. .macro LOAD1x1_1
  4640. #endif
  4641. lxsspx vs0, o0, AO
  4642. addi AO, AO, 4
  4643. mr T1, BO
  4644. lxsspx vs8, o0, T1
  4645. addi BO, BO, 4
  4646. #if defined(_AIX)
  4647. ')
  4648. #else
  4649. .endm
  4650. #endif
  /*
   * KERNEL1x1_I1: initial step of the scalar 1x1 tile. Loads the second register
   * set (vs4, vs16) and writes the plain product of the first set (vs0, vs8)
   * into vs32, overwriting the accumulator. Reads the first set without loading
   * it, so it must already be set.
   * AO and BO each advance by 4 bytes. Overwrites T1.
   */
  4651. #if defined(_AIX)
  4652. define(`KERNEL1x1_I1', `
  4653. #else
  4654. .macro KERNEL1x1_I1
  4655. #endif
  4656. lxsspx vs4, o0, AO
  4657. addi AO, AO, 4
  4658. mr T1, BO
  4659. lxsspx vs16, o0, T1
  4660. addi BO, BO, 4
  4661. xsmuldp vs32, vs0, vs8
  4662. #if defined(_AIX)
  4663. ')
  4664. #else
  4665. .endm
  4666. #endif
  /*
   * KERNEL1x1_1: loads the second register set (vs4, vs16) while accumulating
   * the product of the first set (vs0, vs8) into vs32.
   * AO and BO each advance by 4 bytes. Overwrites T1.
   */
  4667. #if defined(_AIX)
  4668. define(`KERNEL1x1_1', `
  4669. #else
  4670. .macro KERNEL1x1_1
  4671. #endif
  4672. lxsspx vs4, o0, AO
  4673. addi AO, AO, 4
  4674. mr T1, BO
  4675. lxsspx vs16, o0, T1
  4676. addi BO, BO, 4
  4677. xsmaddadp vs32, vs0, vs8
  4678. #if defined(_AIX)
  4679. ')
  4680. #else
  4681. .endm
  4682. #endif
  /*
   * KERNEL1x1_2: mirror of the _1 step. Loads the first register set (vs0, vs8)
   * while accumulating the product of the second set (vs4, vs16) into vs32.
   * AO and BO each advance by 4 bytes. Overwrites T1.
   */
  4683. #if defined(_AIX)
  4684. define(`KERNEL1x1_2', `
  4685. #else
  4686. .macro KERNEL1x1_2
  4687. #endif
  4688. lxsspx vs0, o0, AO
  4689. addi AO, AO, 4
  4690. mr T1, BO
  4691. lxsspx vs8, o0, T1
  4692. addi BO, BO, 4
  4693. xsmaddadp vs32, vs4, vs16
  4694. #if defined(_AIX)
  4695. ')
  4696. #else
  4697. .endm
  4698. #endif
  /*
   * KERNEL1x1_E2: final step of the alternating sequence. Accumulates the
   * product of the second register set (vs4, vs16) into vs32 and performs
   * no loads and no pointer updates.
   */
  4699. #if defined(_AIX)
  4700. define(`KERNEL1x1_E2', `
  4701. #else
  4702. .macro KERNEL1x1_E2
  4703. #endif
  4704. xsmaddadp vs32, vs4, vs16
  4705. #if defined(_AIX)
  4706. ')
  4707. #else
  4708. .endm
  4709. #endif
  /*
   * KERNEL1x1_SUBI1: standalone initial step of the scalar 1x1 tile. Loads one
   * register set (vs0, vs8) and writes its product into vs32, overwriting
   * whatever the accumulator held.
   * AO and BO each advance by 4 bytes. Overwrites T1.
   */
  4710. #if defined(_AIX)
  4711. define(`KERNEL1x1_SUBI1', `
  4712. #else
  4713. .macro KERNEL1x1_SUBI1
  4714. #endif
  4715. lxsspx vs0, o0, AO
  4716. addi AO, AO, 4
  4717. mr T1, BO
  4718. lxsspx vs8, o0, T1
  4719. addi BO, BO, 4
  4720. xsmuldp vs32, vs0, vs8
  4721. #if defined(_AIX)
  4722. ')
  4723. #else
  4724. .endm
  4725. #endif
  /*
   * KERNEL1x1_SUB1: standalone multiply-accumulate step of the scalar 1x1 tile.
   * Loads one register set (vs0, vs8) and adds its product to vs32.
   * AO and BO each advance by 4 bytes. Overwrites T1.
   */
  4726. #if defined(_AIX)
  4727. define(`KERNEL1x1_SUB1', `
  4728. #else
  4729. .macro KERNEL1x1_SUB1
  4730. #endif
  4731. lxsspx vs0, o0, AO
  4732. addi AO, AO, 4
  4733. mr T1, BO
  4734. lxsspx vs8, o0, T1
  4735. addi BO, BO, 4
  4736. xsmaddadp vs32, vs0, vs8
  4737. #if defined(_AIX)
  4738. ')
  4739. #else
  4740. .endm
  4741. #endif
  /*
   * SAVE1x1: scales the scalar accumulator vs32 by alpha_r and writes it as one
   * single-precision word at CO. Without TRMMKERNEL the existing memory value
   * is loaded and the scaled accumulator is added to it; with TRMMKERNEL memory
   * is not read and the scaled accumulator is stored directly.
   * CO advances by 4 bytes. Overwrites T1 and vs0.
   */
  4742. #if defined(_AIX)
  4743. define(`SAVE1x1', `
  4744. #else
  4745. .macro SAVE1x1
  4746. #endif
  /* T1 is the working copy of the output pointer. */
  4747. mr T1, CO
  4748. #ifndef TRMMKERNEL
  4749. lxsspx vs0, o0, T1
  4750. #endif
  4751. #ifdef TRMMKERNEL
  4752. xsmuldp vs0, vs32, alpha_r
  4753. #else
  4754. xsmaddadp vs0, vs32, alpha_r
  4755. #endif
  4756. stxsspx vs0, o0, T1
  /* T1 is not read again inside this macro after this update. */
  4757. add T1, T1, LDC
  4758. addi CO, CO, 4
  4759. #if defined(_AIX)
  4760. ')
  4761. #else
  4762. .endm
  4763. #endif