
zgemm_macros_power10.S

/***************************************************************************
Copyright (c) 2013-2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define unit_size 16
#define DISP32(ind,disp) (ind*unit_size*32+disp)
#define DISP16(ind,disp) (ind*unit_size*16+disp)
#define DISP8(ind,disp) (ind*unit_size*8+disp)
#define DISP4(ind,disp) (ind*unit_size*4+disp)
#define DISP2(ind,disp) (ind*unit_size*2+disp)
#define DISP1(ind,disp) (ind*unit_size+disp)
#define DISPX(disp) (disp)
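/* Note: the DISPn(ind,disp) helpers above compute byte displacements for
   unrolled loops, with unit_size = 16 bytes per {real,imag} double pair.
   As a rough C reference (a sketch, not used by the kernels):
   DISP8(ind,disp) == ind*8*sizeof(double _Complex) + disp
   i.e. "ind" unrolled iterations over 8 complex elements plus a fixed
   offset within the current iteration. */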
/* HELPERS FOR SAVE */
/* {r0,i0} and {r1,i1} into {r0,r1} {i0,i1} */
.macro LOAD_COUPLE_AS_RR_II VS_OUT1,VS_OUT2,VS_TEMP1,VS_TEMP2,REG,LOFFSET
#ifndef TRMMKERNEL
lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG)
lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG)
xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2
xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2
#endif
.endm
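/* Rough C reference for the macro above (a sketch; names are illustrative):
   tmp1 = {r0,i0}; tmp2 = {r1,i1};          // two complex values of C
   OUT1 = {r0,r1}; OUT2 = {i0,i1};          // split into real/imag pairs
   The split lets the SAVE helpers update both real parts (and both
   imaginary parts) of C with a single vector operation. */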
/* from 2 results {a0r*br,a0i*bi} and {a1r*br,a1i*bi} pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi} */
.macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2
xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results */
xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results */
.endm
/* from 2 results {a0r*bi,a0i*br} and {a1r*bi,a1i*br} pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br} */
.macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2
xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */
xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real */
.endm
/* {a0r*br op a0i*bi, a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br, a1r*bi op a1i*br} ~ {i0,i1} */
.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
xvsubdp \VSINR_OUT1,\VSINR_OUT1,\VSINR
xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI
#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR
xvsubdp \VSINI_OUT2,\VSINI_OUT2,\VSINI
#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR
xvsubdp \VSINI_OUT2,\VSINI,\VSINI_OUT2
#else // CC || CR || RC || RR
/* we will assume {-alpha_r,-alpha_i} for this case */
/* i1*i2 - r1*r2, so we negate alpha real instead to fix the sign */
xvsubdp \VSINR_OUT1,\VSINR,\VSINR_OUT1
/* we negate alpha imaginary instead to fix the sign */
xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI
#endif
.endm
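/* The sign choices above implement the four conjugation variants.  With
   rr={a0r*br,a1r*br}, ii={a0i*bi,a1i*bi}, ri={a0r*bi,a1r*bi},
   ir={a0i*br,a1i*br}, a rough reference (sketch) is:
   no conjugation:  re = rr - ii;  im = ri + ir
   conj(A):         re = rr + ii;  im = ri - ir
   conj(B):         re = rr + ii;  im = ir - ri
   conj(A)*conj(B): re = ii - rr;  im = ri + ir   (both negated; the
   caller compensates by negating alpha, as noted above). */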
/* {i0,i1} * {alpha_i,alpha_i} - VSOUT1; VSOUT2 + {r0,r1} * {alpha_i,alpha_i} */
.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2
#ifndef TRMMKERNEL
xvmsubadp \VSOUT1,\VSINII, alpha_i
xvmaddadp \VSOUT2,\VSINRR, alpha_i
#else
xvmuldp \VSOUT1,\VSINII, alpha_i
xvmuldp \VSOUT2,\VSINRR, alpha_i
#endif
.endm
/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1; VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */
.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2
xvmsubadp \VSOUT1,\VSINRR, alpha_r
xvmaddadp \VSOUT2,\VSINII, alpha_r
.endm
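/* PART1 followed by PART2 apply the complex alpha scaling (and, unless
   TRMMKERNEL is defined, the accumulation into C).  Rough reference
   (sketch), with {res_r,res_i} from AGGREGATE_REALS_IMAGES and {c_r,c_i}
   loaded by LOAD_COUPLE_AS_RR_II:
   VSOUT1 = alpha_r*res_r - alpha_i*res_i + c_r
   VSOUT2 = alpha_r*res_i + alpha_i*res_r + c_i */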
/* unpack to store 2 {r,r} {i,i} into {r,i} {r,i} (big endian because of stxv) */
.macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2
xxmrghd \VSOUT1,\VSIN2,\VSIN1
xxmrgld \VSOUT2,\VSIN2,\VSIN1
.endm
.macro STORE_COUPLE REG,LOFFSET,VSIN1,VSIN2
stxv \VSIN1, DISPX(\LOFFSET)(\REG)
stxv \VSIN2, DISPX(\LOFFSET+16)(\REG)
.endm
.macro SAVE8 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,VSRes9,VSRes10,VSRes11,VSRes12,VSRes13,VSRes14,VSRes15,VSRes16,BASE_REG,LOFFSET
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs34,vs35
LOAD_COUPLE_AS_RR_II vs46,vs47,vs50,vs51,\BASE_REG,\LOFFSET
RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs36,vs37
LOAD_COUPLE_AS_RR_II vs48,vs49,vs52,vs53,\BASE_REG,(\LOFFSET+32)
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs38,vs39
LOAD_COUPLE_AS_RR_II vs56,vs57,vs50,vs51,\BASE_REG,(\LOFFSET+64)
RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs40,vs41
LOAD_COUPLE_AS_RR_II vs58,vs59,vs52,vs53,\BASE_REG,(\LOFFSET+96)
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes9,\VSRes11,vs42,vs43
AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37
RESULT_INTO_REALIMAG_IMAGREAL \VSRes10,\VSRes12,vs44,vs45
AGGREGATE_REALS_IMAGES vs38,vs39,vs40,vs41
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes13,\VSRes15,\VSRes1,\VSRes2
MULT_APLHA_PART1 vs34,vs36,vs46,vs47
RESULT_INTO_REALIMAG_IMAGREAL \VSRes14,\VSRes16,\VSRes3,\VSRes4
MULT_APLHA_PART1 vs38,vs40,vs48,vs49
MULT_APLHA_PART2 vs34,vs36,vs46,vs47
AGGREGATE_REALS_IMAGES vs42,vs43,vs44,vs45
MULT_APLHA_PART2 vs38,vs40,vs48,vs49
AGGREGATE_REALS_IMAGES \VSRes1,\VSRes2,\VSRes3,\VSRes4
UNPACK_FOR_STORE vs46,vs47,vs39,vs41
MULT_APLHA_PART1 vs42,vs44,vs56,vs57
UNPACK_FOR_STORE vs48,vs49,vs35,vs37
MULT_APLHA_PART1 \VSRes1,\VSRes3,vs58,vs59
STORE_COUPLE \BASE_REG,\LOFFSET,vs39,vs41
MULT_APLHA_PART2 vs42,vs44,vs56,vs57
STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs35,vs37
MULT_APLHA_PART2 \VSRes1,\VSRes3,vs58,vs59
UNPACK_FOR_STORE vs56,vs57,vs42,vs44
UNPACK_FOR_STORE vs58,vs59,\VSRes1,\VSRes3
STORE_COUPLE \BASE_REG,(\LOFFSET+64),vs42,vs44
STORE_COUPLE \BASE_REG,(\LOFFSET+96),\VSRes1,\VSRes3
.endm
.macro SAVE4 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,BASE_REG,LOFFSET
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs34,vs35
LOAD_COUPLE_AS_RR_II vs46,vs47,vs50,vs51,\BASE_REG,\LOFFSET
RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs36,vs37
LOAD_COUPLE_AS_RR_II vs48,vs49,vs52,vs53,\BASE_REG,(\LOFFSET+32)
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs38,vs39
RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs40,vs41
AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37
AGGREGATE_REALS_IMAGES vs38,vs39,vs40,vs41
MULT_APLHA_PART1 vs34,vs36,vs46,vs47
MULT_APLHA_PART1 vs38,vs40,vs48,vs49
MULT_APLHA_PART2 vs34,vs36,vs46,vs47
MULT_APLHA_PART2 vs38,vs40,vs48,vs49
UNPACK_FOR_STORE vs46,vs47,vs39,vs41
UNPACK_FOR_STORE vs48,vs49,vs35,vs37
STORE_COUPLE \BASE_REG,\LOFFSET,vs39,vs41
STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs35,vs37
.endm
.macro SAVE2 VSRes1,VSRes2,VSRes3,VSRes4,BASE_REG,LOFFSET
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs34,vs35
LOAD_COUPLE_AS_RR_II vs46,vs47,vs50,vs51,\BASE_REG,\LOFFSET
RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs36,vs37
AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37
MULT_APLHA_PART1 vs34,vs36,vs46,vs47
MULT_APLHA_PART2 vs34,vs36,vs46,vs47
UNPACK_FOR_STORE vs46,vs47,vs39,vs41
STORE_COUPLE \BASE_REG,\LOFFSET,vs39,vs41
.endm
.macro SAVE1 VSRes1,VSRes2,BASE_REG,LOFFSET
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs34,vs35
#ifndef TRMMKERNEL
lxv vs50, (\LOFFSET)(\BASE_REG)
xxmrgld vs46,vs50,vs50
xxmrghd vs47,vs50,vs50
#endif
RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs36,vs37
AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37
MULT_APLHA_PART1 vs34,vs36,vs46,vs47
MULT_APLHA_PART2 vs34,vs36,vs46,vs47
UNPACK_FOR_STORE vs46,vs47,vs39,vs41
xxmrghd vs39,vs47,vs46
stxv vs39, (\LOFFSET)(\BASE_REG)
.endm
/**********************************************************************************************
* Macros for N=2 and M=8
**********************************************************************************************/
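/* These kernels use the POWER10 MMA facility.  xxsetaccz zeroes and
   primes one 512-bit accumulator; each xvf64gerpp then accumulates a
   4x2 outer product of doubles, where the first operand is an even/odd
   VSR pair (4 doubles of A) and the second a single VSR (2 doubles of B);
   xxmfacc finally moves accumulator i back into vs(4*i)..vs(4*i+3) for
   the SAVE step.  Note that lxvp loads a VSR pair, so e.g. vs48/vs49
   below hold the two halves of the 32 bytes loaded from BO. */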
.macro KERNEL2x8_ZERO_AND_PRIME_MMA
/* zero out and prime the MMA accumulators */
xxsetaccz 0
xxsetaccz 1
xxsetaccz 2
xxsetaccz 3
xxsetaccz 4
xxsetaccz 5
xxsetaccz 6
xxsetaccz 7
.endm
.macro KERNEL2x8_PRELOAD
lxvp vs32, 0(AO) // load real,imag from A
lxvp vs34, 32(AO) // load real,imag from A
lxvp vs36, 64(AO) // load real,imag from A
lxvp vs38, 96(AO) // load real,imag from A
lxvp vs48, 0(BO) // load real,imag from B
.endm
.macro KERNEL2x8_2 Index, IsLast
lxvp vs40, DISP16(\Index,128)(AO) // load real,imag from A
lxvp vs42, DISP16(\Index,160)(AO) // load real,imag from A
lxvp vs44, DISP16(\Index,192)(AO) // load real,imag from A
lxvp vs46, DISP16(\Index,224)(AO) // load real,imag from A
lxvp vs50, DISP4(\Index,32)(BO) // load real,imag from B
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 2, vs36, vs49
xvf64gerpp 3, vs38, vs49
xvf64gerpp 4, vs32, vs48
xvf64gerpp 5, vs34, vs48
xvf64gerpp 6, vs36, vs48
xvf64gerpp 7, vs38, vs48
lxvp vs32, DISP16(\Index,256)(AO) // load real,imag from A
lxvp vs34, DISP16(\Index,288)(AO) // load real,imag from A
lxvp vs36, DISP16(\Index,320)(AO) // load real,imag from A
lxvp vs38, DISP16(\Index,352)(AO) // load real,imag from A
lxvp vs48, DISP4(\Index,64)(BO) // load real,imag from B
xvf64gerpp 0, vs40, vs51
xvf64gerpp 1, vs42, vs51
xvf64gerpp 2, vs44, vs51
xvf64gerpp 3, vs46, vs51
xvf64gerpp 4, vs40, vs50
xvf64gerpp 5, vs42, vs50
xvf64gerpp 6, vs44, vs50
xvf64gerpp 7, vs46, vs50
.if \IsLast==1
addi AO, AO, DISP16(\Index,256)
addi BO, BO, DISP4(\Index,64)
.endif
.endm
.macro LOAD_END_2x8 OffsetA,OffsetB
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 2, vs36, vs49
xvf64gerpp 3, vs38, vs49
xvf64gerpp 4, vs32, vs48
xvf64gerpp 5, vs34, vs48
xvf64gerpp 6, vs36, vs48
xvf64gerpp 7, vs38, vs48
addi BO, BO, \OffsetB
addi AO, AO, \OffsetA
.endm
.macro KERNEL2x8_UNPRIME_MMA
/* "unprime" MMA accumulators */
xxmfacc 0
xxmfacc 1
xxmfacc 2
xxmfacc 3
xxmfacc 4
xxmfacc 5
xxmfacc 6
xxmfacc 7
.endm
.macro SAVE2x8
add T1, CO, LDC
xxpermdi vs32, vs0, vs1, 0b01
xxpermdi vs33, vs0, vs1, 0b10
xxpermdi vs34, vs2, vs3, 0b01
xxpermdi vs35, vs2, vs3, 0b10
xxpermdi vs36, vs4, vs5, 0b01
xxpermdi vs37, vs4, vs5, 0b10
xxpermdi vs38, vs6, vs7, 0b01
xxpermdi vs39, vs6, vs7, 0b10
xxpermdi vs40, vs8, vs9, 0b01
xxpermdi vs41, vs8, vs9, 0b10
xxpermdi vs42, vs10, vs11, 0b01
xxpermdi vs43, vs10, vs11, 0b10
xxpermdi vs44, vs12, vs13, 0b01
xxpermdi vs45, vs12, vs13, 0b10
xxpermdi vs46, vs14, vs15, 0b01
xxpermdi vs47, vs14, vs15, 0b10
xxlor vs2, vs32, vs32
xxlor vs3, vs33, vs33
xxlor vs0, vs34, vs34
xxlor vs1, vs35, vs35
xxlor vs6, vs36, vs36
xxlor vs7, vs37, vs37
xxlor vs4, vs38, vs38
xxlor vs5, vs39, vs39
xxlor vs10, vs40, vs40
xxlor vs11, vs41, vs41
xxlor vs8, vs42, vs42
xxlor vs9, vs43, vs43
xxlor vs14, vs44, vs44
xxlor vs15, vs45, vs45
xxlor vs12, vs46, vs46
xxlor vs13, vs47, vs47
xxpermdi vs32, vs16, vs17, 0b01
xxpermdi vs33, vs16, vs17, 0b10
xxpermdi vs34, vs18, vs19, 0b01
xxpermdi vs35, vs18, vs19, 0b10
xxpermdi vs36, vs20, vs21, 0b01
xxpermdi vs37, vs20, vs21, 0b10
xxpermdi vs38, vs22, vs23, 0b01
xxpermdi vs39, vs22, vs23, 0b10
xxpermdi vs40, vs24, vs25, 0b01
xxpermdi vs41, vs24, vs25, 0b10
xxpermdi vs42, vs26, vs27, 0b01
xxpermdi vs43, vs26, vs27, 0b10
xxpermdi vs44, vs28, vs29, 0b01
xxpermdi vs45, vs28, vs29, 0b10
xxpermdi vs46, vs30, vs31, 0b01
xxpermdi vs47, vs30, vs31, 0b10
xxlor vs18, vs32, vs32
xxlor vs19, vs33, vs33
xxlor vs16, vs34, vs34
xxlor vs17, vs35, vs35
xxlor vs22, vs36, vs36
xxlor vs23, vs37, vs37
xxlor vs20, vs38, vs38
xxlor vs21, vs39, vs39
xxlor vs26, vs40, vs40
xxlor vs27, vs41, vs41
xxlor vs24, vs42, vs42
xxlor vs25, vs43, vs43
xxlor vs30, vs44, vs44
xxlor vs31, vs45, vs45
xxlor vs28, vs46, vs46
xxlor vs29, vs47, vs47
SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0
SAVE8 vs16,vs17,vs18,vs19,vs20,vs21,vs22,vs23,vs24,vs25,vs26,vs27,vs28,vs29,vs30,vs31,T1,0
addi CO, CO, 128
.endm
/**********************************************************************************************
* Macros for N=2 and M=4
**********************************************************************************************/
.macro KERNEL2x4_ZERO_AND_PRIME_MMA
/* zero out and prime the MMA accumulators */
xxsetaccz 0
xxsetaccz 1
xxsetaccz 2
xxsetaccz 3
.endm
.macro KERNEL2x4_PRELOAD
lxvp vs32, 0(AO) // load real,imag from A
lxvp vs34, 32(AO) // load real,imag from A
lxvp vs48, 0(BO) // load real,imag from B
.endm
.macro KERNEL2x4_2 Index, IsLast
lxvp vs40, DISP8(\Index,64)(AO) // load real,imag from A
lxvp vs42, DISP8(\Index,96)(AO) // load real,imag from A
lxvp vs50, DISP4(\Index,32)(BO) // load real,imag from B
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 2, vs32, vs48
xvf64gerpp 3, vs34, vs48
lxvp vs32, DISP8(\Index,128)(AO) // load real,imag from A
lxvp vs34, DISP8(\Index,160)(AO) // load real,imag from A
lxvp vs48, DISP4(\Index,64)(BO) // load real,imag from B
xvf64gerpp 0, vs40, vs51
xvf64gerpp 1, vs42, vs51
xvf64gerpp 2, vs40, vs50
xvf64gerpp 3, vs42, vs50
.if \IsLast==1
addi AO, AO, DISP8(\Index,128)
addi BO, BO, DISP4(\Index,64)
.endif
.endm
.macro LOAD_END_2x4 OffsetA, OffsetB
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 2, vs32, vs48
xvf64gerpp 3, vs34, vs48
addi BO, BO, \OffsetB
addi AO, AO, \OffsetA
.endm
.macro KERNEL2x4_UNPRIME_MMA
/* "unprime" MMA accumulators */
xxmfacc 0
xxmfacc 1
xxmfacc 2
xxmfacc 3
.endm
.macro SAVE2x4
add T1, CO, LDC
xxpermdi vs32, vs0, vs1, 0b01
xxpermdi vs33, vs0, vs1, 0b10
xxpermdi vs34, vs2, vs3, 0b01
xxpermdi vs35, vs2, vs3, 0b10
xxpermdi vs36, vs4, vs5, 0b01
xxpermdi vs37, vs4, vs5, 0b10
xxpermdi vs38, vs6, vs7, 0b01
xxpermdi vs39, vs6, vs7, 0b10
xxpermdi vs40, vs8, vs9, 0b01
xxpermdi vs41, vs8, vs9, 0b10
xxpermdi vs42, vs10, vs11, 0b01
xxpermdi vs43, vs10, vs11, 0b10
xxpermdi vs44, vs12, vs13, 0b01
xxpermdi vs45, vs12, vs13, 0b10
xxpermdi vs46, vs14, vs15, 0b01
xxpermdi vs47, vs14, vs15, 0b10
xxlor vs2, vs32, vs32
xxlor vs3, vs33, vs33
xxlor vs0, vs34, vs34
xxlor vs1, vs35, vs35
xxlor vs6, vs36, vs36
xxlor vs7, vs37, vs37
xxlor vs4, vs38, vs38
xxlor vs5, vs39, vs39
xxlor vs10, vs40, vs40
xxlor vs11, vs41, vs41
xxlor vs8, vs42, vs42
xxlor vs9, vs43, vs43
xxlor vs14, vs44, vs44
xxlor vs15, vs45, vs45
xxlor vs12, vs46, vs46
xxlor vs13, vs47, vs47
SAVE4 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0
SAVE4 vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,T1,0
addi CO, CO, 64
.endm
/**********************************************************************************************
* Macros for N=2 and M=2
**********************************************************************************************/
.macro KERNEL2x2_ZERO_AND_PRIME_MMA
/* zero out and prime the MMA accumulators */
xxsetaccz 0
xxsetaccz 1
.endm
.macro KERNEL2x2_PRELOAD
lxvp vs32, 0(AO) // load real,imag from A
lxvp vs48, 0(BO) // load real,imag from B
.endm
.macro KERNEL2x2_2 Index, IsLast
lxvp vs40, DISP4(\Index,32)(AO) // load real,imag from A
lxvp vs50, DISP4(\Index,32)(BO) // load real,imag from B
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs32, vs48
lxvp vs32, DISP4(\Index,64)(AO) // load real,imag from A
lxvp vs48, DISP4(\Index,64)(BO) // load real,imag from B
xvf64gerpp 0, vs40, vs51
xvf64gerpp 1, vs40, vs50
.if \IsLast==1
addi AO, AO, DISP4(\Index,64)
addi BO, BO, DISP4(\Index,64)
.endif
.endm
.macro LOAD_END_2x2 OffsetA,OffsetB
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs32, vs48
addi BO, BO, \OffsetB
addi AO, AO, \OffsetA
.endm
.macro KERNEL2x2_UNPRIME_MMA
/* "unprime" MMA accumulators */
xxmfacc 0
xxmfacc 1
.endm
.macro SAVE2x2
add T1, CO, LDC
xxpermdi vs32, vs0, vs1, 0b01
xxpermdi vs33, vs0, vs1, 0b10
xxpermdi vs34, vs2, vs3, 0b01
xxpermdi vs35, vs2, vs3, 0b10
xxpermdi vs36, vs4, vs5, 0b01
xxpermdi vs37, vs4, vs5, 0b10
xxpermdi vs38, vs6, vs7, 0b01
xxpermdi vs39, vs6, vs7, 0b10
xxlor vs2, vs32, vs32
xxlor vs3, vs33, vs33
xxlor vs0, vs34, vs34
xxlor vs1, vs35, vs35
xxlor vs6, vs36, vs36
xxlor vs7, vs37, vs37
xxlor vs4, vs38, vs38
xxlor vs5, vs39, vs39
SAVE2 vs0,vs1,vs2,vs3,CO,0
SAVE2 vs4,vs5,vs6,vs7,T1,0
addi CO, CO, 32
.endm
/**********************************************************************************************
* Macros for N=2 and M=1
**********************************************************************************************/
.macro ZERO2x1
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
xxlxor vs2, vs2, vs2
xxlxor vs3, vs3, vs3
.endm
.macro LOAD2x1
LOAD2x1O 0,0
.endm
.macro LOAD2x1O OffsetA,OffsetB
lxv vs48, (\OffsetB+0)(BO) // load real,imag from B
lxv vs50, (\OffsetB+16)(BO) // load real,imag from B
xxswapd vs49, vs48
xxswapd vs51, vs50
lxv vs32, (0+\OffsetA)(AO) // load real,imag from A
.endm
.macro END2x1_WITHOUT_ADD
END2x1 AO,BO,0,0
.endm
.macro END2x1 AREG, BREG, OffsetA, OffsetB
.if \OffsetB != 0
addi \BREG, \BREG, \OffsetB
.endif
.if \OffsetA != 0
addi \AREG, \AREG, \OffsetA
.endif
xvmaddadp vs0, vs32, vs48
xvmaddadp vs2, vs32, vs50
xvmaddadp vs1, vs32, vs49
xvmaddadp vs3, vs32, vs51
.endm
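/* Rough reference for the non-MMA update above (sketch): with
   vs32 = {a_r,a_i}, vs48 = {b0_r,b0_i}, vs50 = {b1_r,b1_i} and
   vs49/vs51 their xxswapd copies {b_i,b_r}:
   vs0 += {a_r*b0_r, a_i*b0_i};  vs1 += {a_r*b0_i, a_i*b0_r};
   vs2 += {a_r*b1_r, a_i*b1_i};  vs3 += {a_r*b1_i, a_i*b1_r};
   i.e. exactly the {rr,ii} and {ri,ir} pairs the SAVE helpers expect. */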
.macro LOAD2x1_2
LOAD2x1_2O 0,0
.endm
.macro LOAD2x1_2O OffsetA,OffsetB
lxv vs48, (\OffsetB+0)(BO) // load real,imag from B
lxv vs50, (\OffsetB+16)(BO) // load real,imag from B
lxv vs52, (\OffsetB+32)(BO) // load real,imag from B
lxv vs54, (\OffsetB+48)(BO) // load real,imag from B
xxswapd vs49, vs48
xxswapd vs51, vs50
lxv vs32, (0+\OffsetA)(AO) // load real,imag from A
lxv vs40, (16+\OffsetA)(AO) // load real,imag from A
.endm
.macro END2x1_2
/* for load2 the offsets will be 32 and 64 */
KERNEL2x1_2 AO,BO, 32,64, 0,1,1
.endm
.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast
KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast,1
.endm
.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast
KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast,0
.endm
.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast,Complete
xxswapd vs53, vs52
xxswapd vs55, vs54
xvmaddadp vs0, vs32, vs48
xvmaddadp vs2, vs32, vs50
xvmaddadp vs1, vs32, vs49
xvmaddadp vs3, vs32, vs51
.if \Complete==0
lxv vs32, DISP2(\Index, 0+\OffsetA)(\AREG) // load real,imag from A
.endif
.if \Complete==0
lxv vs48, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B
lxv vs50, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
.endif
.if \Complete==0
xxswapd vs49, vs48
xxswapd vs51, vs50
.endif
xvmaddadp vs0, vs40, vs52
xvmaddadp vs2, vs40, vs54
xvmaddadp vs1, vs40, vs53
xvmaddadp vs3, vs40, vs55
.if \Complete==0
lxv vs40, DISP2(\Index, 16+\OffsetA)(\AREG) // load real,imag from A
.endif
.if \Complete==0
lxv vs52, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B
lxv vs54, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B
.endif
.if \IsLast==1
.if \Complete==1
addi \AREG, \AREG, DISP2(\Index,\OffsetA)
addi \BREG, \BREG, DISP4(\Index,\OffsetB)
.else
addi \AREG, \AREG, DISP2(\Index,32)
addi \BREG, \BREG, DISP4(\Index,64)
.endif
.endif
.endm
.macro KERNEL2x1
LOAD2x1
END2x1 AO, BO, 16,32
.endm
.macro SAVE2x1
add T1, CO, LDC
SAVE1 vs0,vs1,CO,0
SAVE1 vs2,vs3,T1,0
addi CO, CO, 16
.endm
/**********************************************************************************************
* Macros for N=1 and M=8
**********************************************************************************************/
.macro KERNEL1x8_ZERO_AND_PRIME_MMA
/* zero out and prime the MMA accumulators */
xxsetaccz 0
xxsetaccz 1
xxsetaccz 2
xxsetaccz 3
.endm
.macro KERNEL1x8_2 Index,IsLast
lxvp vs32, DISP16(\Index,0)(AO) // load real,imag from A
lxvp vs34, DISP16(\Index,32)(AO) // load real,imag from A
lxvp vs36, DISP16(\Index,64)(AO) // load real,imag from A
lxvp vs38, DISP16(\Index,96)(AO) // load real,imag from A
lxvp vs40, DISP16(\Index,128)(AO) // load real,imag from A
lxvp vs42, DISP16(\Index,160)(AO) // load real,imag from A
lxvp vs44, DISP16(\Index,192)(AO) // load real,imag from A
lxvp vs46, DISP16(\Index,224)(AO) // load real,imag from A
lxvp vs48, DISP2(\Index,0)(BO) // load real,imag from B
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 2, vs36, vs49
xvf64gerpp 3, vs38, vs49
xvf64gerpp 0, vs40, vs48
xvf64gerpp 1, vs42, vs48
xvf64gerpp 2, vs44, vs48
xvf64gerpp 3, vs46, vs48
.if \IsLast==1
addi AO, AO, DISP16(\Index,256)
addi BO, BO, DISP2(\Index,32)
.endif
.endm
.macro LOAD_END_1x8 OffsetA,OffsetB
lxvp vs32, 0(AO) // load real,imag from A
lxvp vs34, 32(AO) // load real,imag from A
lxvp vs36, 64(AO) // load real,imag from A
lxvp vs38, 96(AO) // load real,imag from A
lxv vs48, 0(BO) // load real,imag from B
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs34, vs48
xvf64gerpp 2, vs36, vs48
xvf64gerpp 3, vs38, vs48
addi BO, BO, \OffsetB
addi AO, AO, \OffsetA
.endm
.macro KERNEL1x8_UNPRIME_MMA
/* "unprime" MMA accumulators */
xxmfacc 0
xxmfacc 1
xxmfacc 2
xxmfacc 3
.endm
.macro SAVE1x8
xxpermdi vs32, vs0, vs1, 0b01
xxpermdi vs33, vs0, vs1, 0b10
xxpermdi vs34, vs2, vs3, 0b01
xxpermdi vs35, vs2, vs3, 0b10
xxpermdi vs36, vs4, vs5, 0b01
xxpermdi vs37, vs4, vs5, 0b10
xxpermdi vs38, vs6, vs7, 0b01
xxpermdi vs39, vs6, vs7, 0b10
xxpermdi vs40, vs8, vs9, 0b01
xxpermdi vs41, vs8, vs9, 0b10
xxpermdi vs42, vs10, vs11, 0b01
xxpermdi vs43, vs10, vs11, 0b10
xxpermdi vs44, vs12, vs13, 0b01
xxpermdi vs45, vs12, vs13, 0b10
xxpermdi vs46, vs14, vs15, 0b01
xxpermdi vs47, vs14, vs15, 0b10
xxlor vs2, vs32, vs32
xxlor vs3, vs33, vs33
xxlor vs0, vs34, vs34
xxlor vs1, vs35, vs35
xxlor vs6, vs36, vs36
xxlor vs7, vs37, vs37
xxlor vs4, vs38, vs38
xxlor vs5, vs39, vs39
xxlor vs10, vs40, vs40
xxlor vs11, vs41, vs41
xxlor vs8, vs42, vs42
xxlor vs9, vs43, vs43
xxlor vs14, vs44, vs44
xxlor vs15, vs45, vs45
xxlor vs12, vs46, vs46
xxlor vs13, vs47, vs47
SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0
addi CO, CO, 128
.endm
/**********************************************************************************************
* Macros for N=1 and M=4
**********************************************************************************************/
.macro KERNEL1x4_ZERO_AND_PRIME_MMA
/* zero out and prime the MMA accumulators */
xxsetaccz 0
xxsetaccz 1
.endm
.macro KERNEL1x4_2 Index,IsLast
lxvp vs32, DISP8(\Index,0)(AO) // load real,imag from A
lxvp vs34, DISP8(\Index,32)(AO) // load real,imag from A
lxvp vs40, DISP8(\Index,64)(AO) // load real,imag from A
lxvp vs42, DISP8(\Index,96)(AO) // load real,imag from A
lxvp vs48, DISP2(\Index,0)(BO) // load real,imag from B
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 0, vs40, vs48
xvf64gerpp 1, vs42, vs48
.if \IsLast==1
addi AO, AO, DISP8(\Index,128)
addi BO, BO, DISP2(\Index,32)
.endif
.endm
.macro LOAD_END_1x4 OffsetA,OffsetB
lxvp vs32, 0(AO) // load real,imag from A
lxvp vs34, 32(AO) // load real,imag from A
lxv vs48, 0(BO) // load real,imag from B
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs34, vs48
addi BO, BO, \OffsetB
addi AO, AO, \OffsetA
.endm
.macro KERNEL1x4_UNPRIME_MMA
/* "unprime" MMA accumulators */
xxmfacc 0
xxmfacc 1
.endm
.macro SAVE1x4
xxpermdi vs32, vs0, vs1, 0b01
xxpermdi vs33, vs0, vs1, 0b10
xxpermdi vs34, vs2, vs3, 0b01
xxpermdi vs35, vs2, vs3, 0b10
xxpermdi vs36, vs4, vs5, 0b01
xxpermdi vs37, vs4, vs5, 0b10
xxpermdi vs38, vs6, vs7, 0b01
xxpermdi vs39, vs6, vs7, 0b10
xxlor vs2, vs32, vs32
xxlor vs3, vs33, vs33
xxlor vs0, vs34, vs34
xxlor vs1, vs35, vs35
xxlor vs6, vs36, vs36
xxlor vs7, vs37, vs37
xxlor vs4, vs38, vs38
xxlor vs5, vs39, vs39
SAVE4 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0
addi CO, CO, 64
.endm
/**********************************************************************************************
* Macros for N=1 and M=2
**********************************************************************************************/
.macro KERNEL1x2_ZERO_AND_PRIME_MMA
/* zero out and prime the MMA accumulators */
xxsetaccz 0
.endm
.macro KERNEL1x2_2 Index,IsLast
lxvp vs32, DISP4(\Index,0)(AO) // load real,imag from A
lxvp vs40, DISP4(\Index,32)(AO) // load real,imag from A
lxvp vs48, DISP2(\Index,0)(BO) // load real,imag from B
xvf64gerpp 0, vs32, vs49
xvf64gerpp 0, vs40, vs48
.if \IsLast==1
addi AO, AO, DISP4(\Index,64)
addi BO, BO, DISP2(\Index,32)
.endif
.endm
.macro LOAD_END_1x2 OffsetA,OffsetB
lxvp vs32, 0(AO) // load real,imag from A
lxv vs48, 0(BO) // load real,imag from B
xvf64gerpp 0, vs32, vs48
addi BO, BO, \OffsetB
addi AO, AO, \OffsetA
.endm
.macro KERNEL1x2_UNPRIME_MMA
/* "unprime" MMA accumulators */
xxmfacc 0
.endm
.macro SAVE1x2
xxpermdi vs32, vs0, vs1, 0b01
xxpermdi vs33, vs0, vs1, 0b10
xxpermdi vs34, vs2, vs3, 0b01
xxpermdi vs35, vs2, vs3, 0b10
xxlor vs2, vs32, vs32
xxlor vs3, vs33, vs33
xxlor vs0, vs34, vs34
xxlor vs1, vs35, vs35
SAVE2 vs0,vs1,vs2,vs3,CO,0
addi CO, CO, 32
.endm
/**********************************************************************************************
* Macros for N=1 and M=1
**********************************************************************************************/
.macro ZERO1x1
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
.endm
.macro LOAD1x1
LOAD1x1O 0,0
.endm
.macro LOAD1x1O OffsetA,OffsetB
lxv vs48, (\OffsetB+0)(BO) // load real,imag from B
lxv vs32, (0+\OffsetA)(AO) // load real,imag from A
xxswapd vs49, vs48
.endm
.macro END1x1_WITHOUT_ADD
END1x1 AO,BO,0,0
.endm
.macro END1x1 AREG, BREG, OffsetA, OffsetB
.if \OffsetB != 0
addi \BREG, \BREG, \OffsetB
.endif
.if \OffsetA != 0
addi \AREG, \AREG, \OffsetA
.endif
xvmaddadp vs0, vs32, vs48
xvmaddadp vs1, vs32, vs49
.endm
.macro LOAD1x1_2
LOAD1x1_2O 0,0
.endm
.macro LOAD1x1_2O OffsetA,OffsetB
lxv vs48, (\OffsetB+0)(BO) // load real,imag from B
lxv vs52, (\OffsetB+16)(BO) // load real,imag from B
xxswapd vs49, vs48
lxv vs32, (0+\OffsetA)(AO) // load real,imag from A
lxv vs40, (16+\OffsetA)(AO) // load real,imag from A
.endm
.macro END1x1_2
/* for load2 the offsets will be 32 and 32 */
KERNEL1x1_2 AO,BO, 32,32, 0,1,1
.endm
.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast
KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast,1
.endm
.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast
KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast,0
.endm
.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast,Complete
xxswapd vs53, vs52
xvmaddadp vs0, vs32, vs48
xvmaddadp vs1, vs32, vs49
.if \Complete==0
lxv vs32, DISP2(\Index, 0+\OffsetA)(\AREG) // load real,imag from A
.endif
.if \Complete==0
lxv vs48, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B
.endif
.if \Complete==0
xxswapd vs49, vs48
.endif
xvmaddadp vs0, vs40, vs52
xvmaddadp vs1, vs40, vs53
.if \Complete==0
lxv vs40, DISP2(\Index, 16+\OffsetA)(\AREG) // load real,imag from A
.endif
.if \Complete==0
lxv vs52, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
.endif
.if \IsLast==1
.if \Complete==1
addi \AREG, \AREG, DISP2(\Index,\OffsetA)
addi \BREG, \BREG, DISP2(\Index,\OffsetB)
.else
addi \AREG, \AREG, DISP2(\Index,32)
addi \BREG, \BREG, DISP2(\Index,32)
.endif
.endif
.endm
.macro KERNEL1x1
LOAD1x1
END1x1 AO, BO, 16,16
.endm
.macro SAVE1x1
SAVE1 vs0,vs1,CO,0
addi CO, CO, 16
.endm
/**************************** TRMM POINTER REFRESH MACROS *****************************/
.macro SHIFT_REG REG1,REG2,SHIFT_VAL
.if \SHIFT_VAL==16
slwi \REG1, \REG2, 8
.elseif \SHIFT_VAL==8
slwi \REG1, \REG2, 7
.elseif \SHIFT_VAL==4
slwi \REG1, \REG2, 6
.elseif \SHIFT_VAL==2
slwi \REG1, \REG2, 5
.elseif \SHIFT_VAL==1
slwi \REG1, \REG2, 4
.endif
.endm
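/* SHIFT_VAL is a count of complex double elements, so the byte offset is
   count*unit_size = count*16; the slwi shift amounts are log2(count*16).
   E.g. SHIFT_VAL==8 gives REG1 = REG2 << 7, i.e. REG2*128 bytes. */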
/*
//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
// ptrbb = bb;
// #else
// ptrba += off*16;
// ptrbb = bb + off*2;
// #endif
*/
.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/* ptrbb = bb; */
mr \PTR_B,\B_VAL /* refresh BPOINT */
#else
/*
// ptrba = ptrba + off*C_A;
// ptrbb = bb + off*C_B;
*/
SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */
SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */
add \PTR_B, \B_VAL, T4 /* Add values to BO */
add \PTR_A, \PTR_A, T2 /* Add values to AO */
#endif
.endm
/*
// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
// temp = bk-off;
// #elif defined(LEFT)
// temp = off+16; // number of values in A
// #else
// temp = off+2; // number of values in B
// #endif
*/
.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
/* temp = bk-off; */
sub \TEMP_BK,\BK_VAL,\OFF_VAL
#elif defined(LEFT)
/* temp = off+INCR_A; // number of values in A */
addi \TEMP_BK, \OFF_VAL, \INCR_A
#else
/* temp = off+INCR_B; // number of values in B */
addi \TEMP_BK,\OFF_VAL, \INCR_B
#endif
.endm
/*
// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
// temp = bk - off;
// #ifdef LEFT
// temp -= 16; // number of values in A
// #else
// temp -= 2; // number of values in B
// #endif
// ptrba += temp*16;
// ptrbb += temp*2;
// #endif
// #ifdef LEFT
// off += 16; // number of values in A
// #endif
*/
.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/* temp = bk - off; */
sub \TEMP_BK,\BK_VAL,\OFF_VAL
#ifdef LEFT
/* temp -= C_A; // number of values in A */
addi \TEMP_BK,\TEMP_BK,-\C_A
#else
/* temp -= C_B; // number of values in B */
addi \TEMP_BK,\TEMP_BK,-\C_B
#endif
/* ptrba += temp*C_A;
   ptrbb += temp*C_B; */
SHIFT_REG T4,\TEMP_BK,\C_A
SHIFT_REG T2,\TEMP_BK,\C_B
add \PTR_A, \PTR_A, T4 /* ptrba += temp*C_A */
add \PTR_B, \PTR_B, T2 /* ptrbb += temp*C_B */
#endif
#ifdef LEFT
/* off += C_A; // number of values in A */
addi \OFF_VAL,\OFF_VAL,\C_A
#endif
.endm