/* zgemm_macros_power9.S */
/***************************************************************************
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define unit_size 16
#define DISP32(ind,disp) (ind*unit_size*32+disp)
#define DISP16(ind,disp) (ind*unit_size*16+disp)
#define DISP8(ind,disp) (ind*unit_size*8+disp)
#define DISP4(ind,disp) (ind*unit_size*4+disp)
#define DISP2(ind,disp) (ind*unit_size*2+disp)
#define DISP1(ind,disp) (ind*unit_size+disp)
#define DISPX(disp) (disp)
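/*
Note (added illustration, not part of the upstream source): unit_size is the
byte size of one double-complex element, and DISPn(ind,disp) computes the byte
displacement of the ind-th n-element block plus a local offset. A C sketch of
DISP16, assuming 16-byte elements packed contiguously:

    long disp16(long ind, long disp) {
        return ind * 16 * 16 + disp;  // ind blocks of 16 elements, 16 bytes each
    }
*/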
/* HELPERS FOR SAVE */
/* pack {r0,i0} and {r1,i1} into {r0,r1} and {i0,i1} */
.macro LOAD_COUPLE_AS_RR_II VS_OUT1,VS_OUT2,VS_TEMP1,VS_TEMP2,REG,LOFFSET
#ifndef TRMMKERNEL
lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG)
lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG)
xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2
xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2
#endif
.endm
/* from 2 results {a0r*br,a0i*bi} and {a1r*br,a1i*bi}, pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi} */
.macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2
xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results */
xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results */
.endm
/* from 2 results {a0r*bi,a0i*br} and {a1r*bi,a1i*br}, pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br} */
.macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2
xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */
xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real */
.endm
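/*
Sketch (an added illustration; lane order follows the little-endian POWER9
view used by lxv): each VSX register holds one complex double as double[2] =
{real, imag}. xxmrgld/xxmrghd gather the matching halves of two registers so
the sign combination below can work on two results per instruction:

    void pack_rr_ii(const double v1[2], const double v2[2],
                    double rr[2], double ii[2]) {
        rr[0] = v1[0]; rr[1] = v2[0];  // like xxmrgld: {r0,r1}
        ii[0] = v1[1]; ii[1] = v2[1];  // like xxmrghd: {i0,i1}
    }
*/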
/* {a0r*br op a0i*bi, a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br, a1r*bi op a1i*br} ~ {i0,i1} */
.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
xvsubdp \VSINR_OUT1,\VSINR_OUT1,\VSINR
xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI
#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR
xvsubdp \VSINI_OUT2,\VSINI_OUT2,\VSINI
#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR
xvsubdp \VSINI_OUT2,\VSINI,\VSINI_OUT2
#else // CC || CR || RC || RR
/* we will assume {-alpha_r,-alpha_i} for this case */
/* this computes i1*i2 - r1*r2, so we negate alpha's real part instead to fix the sign */
xvsubdp \VSINR_OUT1,\VSINR,\VSINR_OUT1
/* we negate alpha's imaginary part instead to fix the sign */
xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI
#endif
.endm
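/*
Background (an added, hedged restatement of the #if branches above): with
rr = a_r*b_r, ii = a_i*b_i, ri = a_r*b_i, ir = a_i*b_r, a complex multiply is
(rr - ii) + i*(ri + ir). The conjugation variants only flip signs:

    re = rr - ii; im = ri + ir;  // NN/NT/TN/TT: no conjugation
    re = rr + ii; im = ri - ir;  // CN/CT/RN/RT: conj(A)
    re = rr + ii; im = ir - ri;  // NC/TC/NR/TR: conj(B)
    re = ii - rr; im = ri + ir;  // CC/CR/RC/RR: both; fixed up by negated alpha
*/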
/* {i0,i1} * {alpha_i,alpha_i} - VSOUT1; VSOUT2 + {r0,r1} * {alpha_i,alpha_i} */
.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2
#ifndef TRMMKERNEL
xvmsubadp \VSOUT1,\VSINII, alpha_i
xvmaddadp \VSOUT2,\VSINRR, alpha_i
#else
xvmuldp \VSOUT1,\VSINII, alpha_i
xvmuldp \VSOUT2,\VSINRR, alpha_i
#endif
.endm
/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1; VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */
.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2
xvmsubadp \VSOUT1,\VSINRR, alpha_r
xvmaddadp \VSOUT2,\VSINII, alpha_r
.endm
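/*
Added sketch of the two-part alpha scaling (PART1 then PART2), assuming acc =
{acc_r, acc_i} is the aggregated product and c the C entry loaded by
LOAD_COUPLE_AS_RR_II; the xvmsubadp/xvmaddadp pairs amount to:

    double out_r = acc_r * alpha_r - (acc_i * alpha_i - c_r); // = c_r + acc_r*a_r - acc_i*a_i
    double out_i = acc_i * alpha_r + (acc_r * alpha_i + c_i); // = c_i + acc_r*a_i + acc_i*a_r

i.e. the usual complex C += alpha * acc, done in two FMA passes per lane pair.
*/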
/* unpack 2x{r,r},{i,i} back into {r,i},{r,i} pairs for stxv (big-endian element order) */
.macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2
xxmrghd \VSOUT1,\VSIN2,\VSIN1
xxmrgld \VSOUT2,\VSIN2,\VSIN1
.endm
.macro STORE_COUPLE REG,LOFFSET,VSIN1,VSIN2
stxv \VSIN1, DISPX(\LOFFSET)(\REG)
stxv \VSIN2, DISPX(\LOFFSET+16)(\REG)
.endm
.macro SAVE8 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,VSRes9,VSRes10,VSRes11,VSRes12,VSRes13,VSRes14,VSRes15,VSRes16,BASE_REG,LOFFSET
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3
LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET
RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5
LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32)
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7
LOAD_COUPLE_AS_RR_II vs24,vs25,vs18,vs19,\BASE_REG,(\LOFFSET+64)
RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9
LOAD_COUPLE_AS_RR_II vs26,vs27,vs20,vs21,\BASE_REG,(\LOFFSET+96)
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes9,\VSRes11,vs10,vs11
AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5
RESULT_INTO_REALIMAG_IMAGREAL \VSRes10,\VSRes12,vs12,vs13
AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes13,\VSRes15,\VSRes1,\VSRes2
MULT_APLHA_PART1 vs2,vs4,vs14,vs15
RESULT_INTO_REALIMAG_IMAGREAL \VSRes14,\VSRes16,\VSRes3,\VSRes4
MULT_APLHA_PART1 vs6,vs8,vs16,vs17
MULT_APLHA_PART2 vs2,vs4,vs14,vs15
AGGREGATE_REALS_IMAGES vs10,vs11,vs12,vs13
MULT_APLHA_PART2 vs6,vs8,vs16,vs17
AGGREGATE_REALS_IMAGES \VSRes1,\VSRes2,\VSRes3,\VSRes4
UNPACK_FOR_STORE vs14,vs15,vs7,vs9
MULT_APLHA_PART1 vs10,vs12,vs24,vs25
UNPACK_FOR_STORE vs16,vs17,vs3,vs5
MULT_APLHA_PART1 \VSRes1,\VSRes3,vs26,vs27
STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9
MULT_APLHA_PART2 vs10,vs12,vs24,vs25
STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5
MULT_APLHA_PART2 \VSRes1,\VSRes3,vs26,vs27
UNPACK_FOR_STORE vs24,vs25,vs10,vs12
UNPACK_FOR_STORE vs26,vs27,\VSRes1,\VSRes3
STORE_COUPLE \BASE_REG,(\LOFFSET+64),vs10,vs12
STORE_COUPLE \BASE_REG,(\LOFFSET+96),\VSRes1,\VSRes3
.endm
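/*
Added note: SAVE8 interleaves the C loads, merges, aggregations and alpha
multiplies so independent operations can overlap in the pipeline. Per complex
element the net effect is a standard complex AXPY-style update; a C99 sketch
of the whole macro's result (names are illustrative):

    #include <complex.h>
    void save8(double complex *c, const double complex *acc, double complex alpha) {
        for (int j = 0; j < 8; ++j)
            c[j] = alpha * acc[j] + c[j];
    }
*/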
.macro SAVE4 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,BASE_REG,LOFFSET
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3
LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET
RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5
LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32)
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7
RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9
AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5
AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9
MULT_APLHA_PART1 vs2,vs4,vs14,vs15
MULT_APLHA_PART1 vs6,vs8,vs16,vs17
MULT_APLHA_PART2 vs2,vs4,vs14,vs15
MULT_APLHA_PART2 vs6,vs8,vs16,vs17
UNPACK_FOR_STORE vs14,vs15,vs7,vs9
UNPACK_FOR_STORE vs16,vs17,vs3,vs5
STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9
STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5
.endm
.macro SAVE2 VSRes1,VSRes2,VSRes3,VSRes4,BASE_REG,LOFFSET
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3
LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET
RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5
AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5
MULT_APLHA_PART1 vs2,vs4,vs14,vs15
MULT_APLHA_PART2 vs2,vs4,vs14,vs15
UNPACK_FOR_STORE vs14,vs15,vs7,vs9
STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9
.endm
.macro SAVE1 VSRes1,VSRes2,BASE_REG,LOFFSET
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs2,vs3
#ifndef TRMMKERNEL
lxv vs18, (\LOFFSET)(\BASE_REG)
xxmrgld vs14,vs18,vs18
xxmrghd vs15,vs18,vs18
#endif
RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs4,vs5
AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5
MULT_APLHA_PART1 vs2,vs4,vs14,vs15
MULT_APLHA_PART2 vs2,vs4,vs14,vs15
UNPACK_FOR_STORE vs14,vs15,vs7,vs9
xxmrghd vs7,vs15,vs14
stxv vs7, (\LOFFSET)(\BASE_REG)
.endm
/**********************************************************************************************
* Macros for N=2 and M=8
**********************************************************************************************/
.macro Zero2x8
xxlxor vs32, vs32, vs32
xxlxor vs33, vs33, vs33
xxlxor vs34, vs34, vs34
xxlxor vs35, vs35, vs35
xxlxor vs36, vs36, vs36
xxlxor vs37, vs37, vs37
xxlxor vs38, vs38, vs38
xxlxor vs39, vs39, vs39
xxlxor vs40, vs40, vs40
xxlxor vs41, vs41, vs41
xxlxor vs42, vs42, vs42
xxlxor vs43, vs43, vs43
xxlxor vs44, vs44, vs44
xxlxor vs45, vs45, vs45
xxlxor vs46, vs46, vs46
xxlxor vs47, vs47, vs47
xxlxor vs48, vs48, vs48
xxlxor vs49, vs49, vs49
xxlxor vs50, vs50, vs50
xxlxor vs51, vs51, vs51
xxlxor vs52, vs52, vs52
xxlxor vs53, vs53, vs53
xxlxor vs54, vs54, vs54
xxlxor vs55, vs55, vs55
xxlxor vs56, vs56, vs56
xxlxor vs57, vs57, vs57
xxlxor vs58, vs58, vs58
xxlxor vs59, vs59, vs59
xxlxor vs60, vs60, vs60
xxlxor vs61, vs61, vs61
xxlxor vs62, vs62, vs62
xxlxor vs63, vs63, vs63
.endm
.macro LOAD2x8
LOAD2x8O 0,0
.endm
.macro LOAD2x8O OffsetA,OffsetB
lxv vs16, (\OffsetB+0)(BO) // load real,imag from B
lxv vs18, (\OffsetB+16)(BO) // load real,imag from B
xxswapd vs17, vs16
xxswapd vs19, vs18
lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
lxv vs2, (32+\OffsetA)(AO) // load real,imag from A
lxv vs3, (48+\OffsetA)(AO) // load real,imag from A
lxv vs4, (64+\OffsetA)(AO) // load real,imag from A
lxv vs5, (80+\OffsetA)(AO) // load real,imag from A
lxv vs6, (96+\OffsetA)(AO) // load real,imag from A
lxv vs7, (112+\OffsetA)(AO) // load real,imag from A
.endm
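/*
Added sketch: xxswapd builds the cross-term multiplier. If vs16 = {b_r, b_i},
then vs17 = {b_i, b_r}, so FMAs against one A element a = {a_r, a_i} produce
all four scalar products a complex multiply needs, two per accumulator:

    void accumulate(const double a[2], const double b[2],
                    double even[2], double odd[2]) {
        even[0] += a[0] * b[0]; even[1] += a[1] * b[1]; // a * b        -> {rr, ii}
        odd[0]  += a[0] * b[1]; odd[1]  += a[1] * b[0]; // a * swapd(b) -> {ri, ir}
    }
*/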
.macro END2x8_NORMAL
END2x8 AO,BO,128,32
.endm
.macro END2x8_WITHOUT_ADD
END2x8 AO,BO,0,0
.endm
.macro END2x8 AREG, BREG, OffsetA, OffsetB
.if \OffsetB != 0
addi \BREG, \BREG, \OffsetB
.endif
.if \OffsetA != 0
addi \AREG, \AREG, \OffsetA
.endif
xvmaddadp vs32, vs0, vs16
xvmaddadp vs48, vs0, vs18
xvmaddadp vs33, vs0, vs17
xvmaddadp vs49, vs0, vs19
xvmaddadp vs34, vs1, vs16
xvmaddadp vs50, vs1, vs18
xvmaddadp vs35, vs1, vs17
xvmaddadp vs51, vs1, vs19
xvmaddadp vs36, vs2, vs16
xvmaddadp vs52, vs2, vs18
xvmaddadp vs37, vs2, vs17
xvmaddadp vs53, vs2, vs19
xvmaddadp vs38, vs3, vs16
xvmaddadp vs54, vs3, vs18
xvmaddadp vs39, vs3, vs17
xvmaddadp vs55, vs3, vs19
xvmaddadp vs40, vs4, vs16
xvmaddadp vs56, vs4, vs18
xvmaddadp vs41, vs4, vs17
xvmaddadp vs57, vs4, vs19
xvmaddadp vs42, vs5, vs16
xvmaddadp vs58, vs5, vs18
xvmaddadp vs43, vs5, vs17
xvmaddadp vs59, vs5, vs19
xvmaddadp vs44, vs6, vs16
xvmaddadp vs60, vs6, vs18
xvmaddadp vs45, vs6, vs17
xvmaddadp vs61, vs6, vs19
xvmaddadp vs46, vs7, vs16
xvmaddadp vs62, vs7, vs18
xvmaddadp vs47, vs7, vs17
xvmaddadp vs63, vs7, vs19
.endm
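/*
Added note on the accumulator layout, read off the FMA pattern above: for the
2x8 tile, A vector k (vs0..vs7) feeds vs(32+2k)/vs(33+2k) with B column 0
(vs16 and its swap vs17) and vs(48+2k)/vs(49+2k) with B column 1 (vs18/vs19).
A C-style restatement:

    for (int k = 0; k < 8; ++k) {
        acc0_even[k] += a[k] * b0; acc0_odd[k] += a[k] * b0_swapped;
        acc1_even[k] += a[k] * b1; acc1_odd[k] += a[k] * b1_swapped;
    }
*/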
.macro LOAD2x8_2
LOAD2x8_2O 0,0
.endm
.macro LOAD2x8_2O OffsetA,OffsetB
lxv vs16, (\OffsetB+0)(BO) // load real,imag from B
lxv vs18, (\OffsetB+16)(BO) // load real,imag from B
lxv vs20, (\OffsetB+32)(BO) // load real,imag from B
lxv vs22, (\OffsetB+48)(BO) // load real,imag from B
xxswapd vs17, vs16
xxswapd vs19, vs18
lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
lxv vs2, (32+\OffsetA)(AO) // load real,imag from A
lxv vs3, (48+\OffsetA)(AO) // load real,imag from A
lxv vs4, (64+\OffsetA)(AO) // load real,imag from A
lxv vs5, (80+\OffsetA)(AO) // load real,imag from A
lxv vs6, (96+\OffsetA)(AO) // load real,imag from A
lxv vs7, (112+\OffsetA)(AO) // load real,imag from A
lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A
lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A
lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A
lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A
lxv vs12, (128+64+\OffsetA)(AO) // load real,imag from A
lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A
lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A
lxv vs15, (128+112+\OffsetA)(AO) // load real,imag from A
.endm
.macro END2x8_2
/* for load2 the offsets will be 256 and 64 */
KERNEL2x8_2 AO,BO, 256,64, 0,1,1
.endm
.macro KERNEL2x8_E2 OffsetA,OffsetB, Index,IsLast
KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast,1
.endm
.macro KERNEL2x8_L2 OffsetA,OffsetB, Index,IsLast
KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast,0
.endm
.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast,Complete
xvmaddadp vs32, vs0, vs16
xvmaddadp vs48, vs0, vs18
xvmaddadp vs33, vs0, vs17
xvmaddadp vs49, vs0, vs19
xxswapd vs21, vs20
xxswapd vs23, vs22
xvmaddadp vs34, vs1, vs16
xvmaddadp vs50, vs1, vs18
xvmaddadp vs35, vs1, vs17
xvmaddadp vs51, vs1, vs19
.if \Complete==0
lxv vs0, DISP16(\Index, 0+\OffsetA)(\AREG) // load real,imag from A
lxv vs1, DISP16(\Index, 16+\OffsetA)(\AREG) // load real,imag from A
.endif
xvmaddadp vs36, vs2, vs16
xvmaddadp vs52, vs2, vs18
xvmaddadp vs37, vs2, vs17
xvmaddadp vs53, vs2, vs19
xvmaddadp vs38, vs3, vs16
xvmaddadp vs54, vs3, vs18
xvmaddadp vs39, vs3, vs17
xvmaddadp vs55, vs3, vs19
.if \Complete==0
lxv vs2, DISP16(\Index, 32+\OffsetA)(\AREG) // load real,imag from A
lxv vs3, DISP16(\Index, 48+\OffsetA)(\AREG) // load real,imag from A
.endif
xvmaddadp vs40, vs4, vs16
xvmaddadp vs56, vs4, vs18
xvmaddadp vs41, vs4, vs17
xvmaddadp vs57, vs4, vs19
xvmaddadp vs42, vs5, vs16
xvmaddadp vs58, vs5, vs18
xvmaddadp vs43, vs5, vs17
xvmaddadp vs59, vs5, vs19
.if \Complete==0
lxv vs4, DISP16(\Index, 64+\OffsetA)(\AREG) // load real,imag from A
lxv vs5, DISP16(\Index, 64+16+\OffsetA)(\AREG) // load real,imag from A
.endif
xvmaddadp vs44, vs6, vs16
xvmaddadp vs60, vs6, vs18
xvmaddadp vs45, vs6, vs17
xvmaddadp vs61, vs6, vs19
xvmaddadp vs46, vs7, vs16
xvmaddadp vs62, vs7, vs18
xvmaddadp vs47, vs7, vs17
xvmaddadp vs63, vs7, vs19
.if \Complete==0
lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B
lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
.endif
xvmaddadp vs32, vs8, vs20
xvmaddadp vs48, vs8, vs22
.if \Complete==0
lxv vs6, DISP16(\Index, 64+32+\OffsetA)(\AREG) // load real,imag from A
lxv vs7, DISP16(\Index, 64+48+\OffsetA)(\AREG) // load real,imag from A
.endif
xvmaddadp vs33, vs8, vs21
xvmaddadp vs49, vs8, vs23
.if \Complete==0
xxswapd vs17, vs16
xxswapd vs19, vs18
.endif
xvmaddadp vs34, vs9, vs20
xvmaddadp vs50, vs9, vs22
xvmaddadp vs35, vs9, vs21
xvmaddadp vs51, vs9, vs23
.if \Complete==0
lxv vs8, DISP16(\Index, 128+\OffsetA)(\AREG) // load real,imag from A
lxv vs9, DISP16(\Index, 128+16+\OffsetA)(\AREG) // load real,imag from A
.endif
xvmaddadp vs36, vs10, vs20
xvmaddadp vs52, vs10, vs22
xvmaddadp vs37, vs10, vs21
xvmaddadp vs53, vs10, vs23
xvmaddadp vs38, vs11, vs20
xvmaddadp vs54, vs11, vs22
xvmaddadp vs39, vs11, vs21
xvmaddadp vs55, vs11, vs23
.if \Complete==0
lxv vs10, DISP16(\Index, 128+32+\OffsetA)(\AREG) // load real,imag from A
lxv vs11, DISP16(\Index, 128+48+\OffsetA)(\AREG) // load real,imag from A
.endif
xvmaddadp vs40, vs12, vs20
xvmaddadp vs56, vs12, vs22
xvmaddadp vs41, vs12, vs21
xvmaddadp vs57, vs12, vs23
xvmaddadp vs42, vs13, vs20
xvmaddadp vs58, vs13, vs22
xvmaddadp vs43, vs13, vs21
xvmaddadp vs59, vs13, vs23
.if \Complete==0
lxv vs12, DISP16(\Index, 192+\OffsetA)(\AREG) // load real,imag from A
lxv vs13, DISP16(\Index, 192+16+\OffsetA)(\AREG) // load real,imag from A
.endif
xvmaddadp vs44, vs14, vs20
xvmaddadp vs60, vs14, vs22
xvmaddadp vs45, vs14, vs21
xvmaddadp vs61, vs14, vs23
xvmaddadp vs46, vs15, vs20
xvmaddadp vs62, vs15, vs22
xvmaddadp vs47, vs15, vs21
xvmaddadp vs63, vs15, vs23
.if \Complete==0
lxv vs14, DISP16(\Index, 192+32+\OffsetA)(\AREG) // load real,imag from A
lxv vs15, DISP16(\Index, 192+48+\OffsetA)(\AREG) // load real,imag from A
lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B
lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B
.endif
.if \IsLast==1
.if \Complete==1
addi \AREG, \AREG, DISP16(\Index,\OffsetA)
addi \BREG, \BREG, DISP4(\Index,\OffsetB)
.else
addi \AREG, \AREG, DISP16(\Index,256)
addi \BREG, \BREG, DISP4(\Index,64)
.endif
.endif
.endm
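/*
Added note (an assumption based on the macro parameters, not a verbatim copy
of the outer loop): KERNEL2x8_2 handles two k-iterations at once and, unless
Complete==1, keeps loading the next pair's A/B data between the FMAs. The
calling pattern it supports looks like:

    // for (i = 0; i < halves - 1; i++) KERNEL2x8_L2 OffsetA,OffsetB, i, 0
    // KERNEL2x8_E2 OffsetA,OffsetB, i, 1   // final pair: no loads past the buffers

IsLast==1 folds the pointer advance into the last call via the DISP macros.
*/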
.macro KERNEL2x8
LOAD2x8
END2x8 AO, BO, 128,32
.endm
.macro SAVE2x8
add T1, CO, LDC
SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0
SAVE8 vs48,vs49,vs50,vs51,vs52,vs53,vs54,vs55,vs56,vs57,vs58,vs59,vs60,vs61,vs62,vs63,T1,0
addi CO, CO, 128
.endm
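/*
Added sketch of the store addressing: CO points at the current 8x2 tile of C
and LDC is the column stride in bytes, so T1 = CO + LDC is the second column;
each SAVE8 writes 8 complex doubles (128 bytes), hence the final bump:

    // col0 = CO; col1 = CO + LDC;   (byte arithmetic)
    // save8(col0, acc_col0, alpha); save8(col1, acc_col1, alpha); CO += 128;
*/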
/**********************************************************************************************
* Macros for N=2 and M=4
**********************************************************************************************/
.macro Zero2x4
xxlxor vs32, vs32, vs32
xxlxor vs33, vs33, vs33
xxlxor vs34, vs34, vs34
xxlxor vs35, vs35, vs35
xxlxor vs36, vs36, vs36
xxlxor vs37, vs37, vs37
xxlxor vs38, vs38, vs38
xxlxor vs39, vs39, vs39
xxlxor vs40, vs40, vs40
xxlxor vs41, vs41, vs41
xxlxor vs42, vs42, vs42
xxlxor vs43, vs43, vs43
xxlxor vs44, vs44, vs44
xxlxor vs45, vs45, vs45
xxlxor vs46, vs46, vs46
xxlxor vs47, vs47, vs47
.endm
.macro LOAD2x4
LOAD2x4O 0,0
.endm
.macro LOAD2x4O OffsetA,OffsetB
lxv vs16, (\OffsetB+0)(BO) // load real,imag from B
lxv vs18, (\OffsetB+16)(BO) // load real,imag from B
xxswapd vs17, vs16
xxswapd vs19, vs18
lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
lxv vs2, (32+\OffsetA)(AO) // load real,imag from A
lxv vs3, (48+\OffsetA)(AO) // load real,imag from A
.endm
.macro END2x4_NORMAL
END2x4 AO,BO,64,32
.endm
.macro END2x4_WITHOUT_ADD
END2x4 AO,BO,0,0
.endm
.macro END2x4 AREG, BREG, OffsetA, OffsetB
.if \OffsetB != 0
addi \BREG, \BREG, \OffsetB
.endif
.if \OffsetA != 0
addi \AREG, \AREG, \OffsetA
.endif
xvmaddadp vs32, vs0, vs16
xvmaddadp vs40, vs0, vs18
xvmaddadp vs33, vs0, vs17
xvmaddadp vs41, vs0, vs19
xvmaddadp vs34, vs1, vs16
xvmaddadp vs42, vs1, vs18
xvmaddadp vs35, vs1, vs17
xvmaddadp vs43, vs1, vs19
xvmaddadp vs36, vs2, vs16
xvmaddadp vs44, vs2, vs18
xvmaddadp vs37, vs2, vs17
xvmaddadp vs45, vs2, vs19
xvmaddadp vs38, vs3, vs16
xvmaddadp vs46, vs3, vs18
xvmaddadp vs39, vs3, vs17
xvmaddadp vs47, vs3, vs19
.endm
.macro LOAD2x4_2
LOAD2x4_2O 0,0
.endm
.macro LOAD2x4_2O OffsetA,OffsetB
lxv vs16, (\OffsetB+0)(BO) // load real,imag from B
lxv vs18, (\OffsetB+16)(BO) // load real,imag from B
lxv vs20, (\OffsetB+32)(BO) // load real,imag from B
lxv vs22, (\OffsetB+48)(BO) // load real,imag from B
xxswapd vs17, vs16
xxswapd vs19, vs18
lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
lxv vs2, (32+\OffsetA)(AO) // load real,imag from A
lxv vs3, (48+\OffsetA)(AO) // load real,imag from A
lxv vs8, (64+\OffsetA)(AO) // load real,imag from A
lxv vs9, (80+\OffsetA)(AO) // load real,imag from A
lxv vs10, (96+\OffsetA)(AO) // load real,imag from A
lxv vs11, (112+\OffsetA)(AO) // load real,imag from A
.endm
.macro END2x4_2
/* for load2 the offsets will be 128 and 64 */
KERNEL2x4_2 AO,BO, 128,64, 0,1,1
.endm
.macro KERNEL2x4_E2 OffsetA,OffsetB, Index,IsLast
KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast,1
.endm
.macro KERNEL2x4_L2 OffsetA,OffsetB, Index,IsLast
KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast,0
.endm
.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast,Complete
xvmaddadp vs32, vs0, vs16
xvmaddadp vs40, vs0, vs18
xvmaddadp vs33, vs0, vs17
xvmaddadp vs41, vs0, vs19
xxswapd vs21, vs20
xxswapd vs23, vs22
xvmaddadp vs34, vs1, vs16
xvmaddadp vs42, vs1, vs18
xvmaddadp vs35, vs1, vs17
xvmaddadp vs43, vs1, vs19
.if \Complete==0
lxv vs0, DISP8(\Index, 0+\OffsetA)(\AREG) // load real,imag from A
lxv vs1, DISP8(\Index, 16+\OffsetA)(\AREG) // load real,imag from A
.endif
xvmaddadp vs36, vs2, vs16
xvmaddadp vs44, vs2, vs18
xvmaddadp vs37, vs2, vs17
xvmaddadp vs45, vs2, vs19
xvmaddadp vs38, vs3, vs16
xvmaddadp vs46, vs3, vs18
xvmaddadp vs39, vs3, vs17
xvmaddadp vs47, vs3, vs19
.if \Complete==0
lxv vs2, DISP8(\Index, 32+\OffsetA)(\AREG) // load real,imag from A
lxv vs3, DISP8(\Index, 48+\OffsetA)(\AREG) // load real,imag from A
.endif
.if \Complete==0
lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B
lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
.endif
xvmaddadp vs32, vs8, vs20
xvmaddadp vs40, vs8, vs22
xvmaddadp vs33, vs8, vs21
xvmaddadp vs41, vs8, vs23
.if \Complete==0
xxswapd vs17, vs16
xxswapd vs19, vs18
.endif
xvmaddadp vs34, vs9, vs20
xvmaddadp vs42, vs9, vs22
xvmaddadp vs35, vs9, vs21
xvmaddadp vs43, vs9, vs23
.if \Complete==0
lxv vs8, DISP8(\Index, 64+0+\OffsetA)(\AREG) // load real,imag from A
lxv vs9, DISP8(\Index, 64+16+\OffsetA)(\AREG) // load real,imag from A
.endif
xvmaddadp vs36, vs10, vs20
xvmaddadp vs44, vs10, vs22
xvmaddadp vs37, vs10, vs21
xvmaddadp vs45, vs10, vs23
xvmaddadp vs38, vs11, vs20
xvmaddadp vs46, vs11, vs22
xvmaddadp vs39, vs11, vs21
xvmaddadp vs47, vs11, vs23
.if \Complete==0
lxv vs10, DISP8(\Index, 64+32+\OffsetA)(\AREG) // load real,imag from A
lxv vs11, DISP8(\Index, 64+48+\OffsetA)(\AREG) // load real,imag from A
.endif
.if \Complete==0
lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B
lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B
.endif
.if \IsLast==1
.if \Complete==1
addi \AREG, \AREG, DISP8(\Index,\OffsetA)
addi \BREG, \BREG, DISP4(\Index,\OffsetB)
.else
addi \AREG, \AREG, DISP8(\Index,128)
addi \BREG, \BREG, DISP4(\Index,64)
.endif
.endif
.endm
.macro KERNEL2x4
LOAD2x4
END2x4 AO, BO, 64,32
.endm
.macro SAVE2x4
add T1, CO, LDC
SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0
SAVE4 vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,T1,0
addi CO, CO, 64
.endm
/**********************************************************************************************
* Macros for N=2 and M=2
**********************************************************************************************/
.macro Zero2x2
xxlxor vs32, vs32, vs32
xxlxor vs33, vs33, vs33
xxlxor vs34, vs34, vs34
xxlxor vs35, vs35, vs35
xxlxor vs36, vs36, vs36
xxlxor vs37, vs37, vs37
xxlxor vs38, vs38, vs38
xxlxor vs39, vs39, vs39
.endm
.macro LOAD2x2
LOAD2x2O 0,0
.endm
.macro LOAD2x2O OffsetA,OffsetB
lxv vs16, (\OffsetB+0)(BO) // load real,imag from B
lxv vs18, (\OffsetB+16)(BO) // load real,imag from B
xxswapd vs17, vs16
xxswapd vs19, vs18
lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
.endm
.macro END2x2_NORMAL
END2x2 AO,BO,32,32
.endm
.macro END2x2_WITHOUT_ADD
END2x2 AO,BO,0,0
.endm
.macro END2x2 AREG, BREG, OffsetA, OffsetB
.if \OffsetB != 0
addi \BREG, \BREG, \OffsetB
.endif
.if \OffsetA != 0
addi \AREG, \AREG, \OffsetA
.endif
xvmaddadp vs32, vs0, vs16
xvmaddadp vs36, vs0, vs18
xvmaddadp vs33, vs0, vs17
xvmaddadp vs37, vs0, vs19
xvmaddadp vs34, vs1, vs16
xvmaddadp vs38, vs1, vs18
xvmaddadp vs35, vs1, vs17
xvmaddadp vs39, vs1, vs19
.endm
.macro LOAD2x2_2
LOAD2x2_2O 0,0
.endm
.macro LOAD2x2_2O OffsetA,OffsetB
lxv vs16, (\OffsetB+0)(BO) // load real,imag from B
lxv vs18, (\OffsetB+16)(BO) // load real,imag from B
lxv vs20, (\OffsetB+32)(BO) // load real,imag from B
lxv vs22, (\OffsetB+48)(BO) // load real,imag from B
xxswapd vs17, vs16
xxswapd vs19, vs18
lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
lxv vs8, (32+\OffsetA)(AO) // load real,imag from A
lxv vs9, (48+\OffsetA)(AO) // load real,imag from A
.endm
.macro END2x2_2
/* for load2 the offsets will be 64 and 64 */
KERNEL2x2_2 AO,BO, 64,64, 0,1,1
.endm
.macro KERNEL2x2_E2 OffsetA,OffsetB, Index,IsLast
KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast,1
.endm
.macro KERNEL2x2_L2 OffsetA,OffsetB, Index,IsLast
KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast,0
.endm
.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast,Complete
xvmaddadp vs32, vs0, vs16
xvmaddadp vs36, vs0, vs18
xvmaddadp vs33, vs0, vs17
xvmaddadp vs37, vs0, vs19
xxswapd vs21, vs20
xxswapd vs23, vs22
xvmaddadp vs34, vs1, vs16
xvmaddadp vs38, vs1, vs18
xvmaddadp vs35, vs1, vs17
xvmaddadp vs39, vs1, vs19
.if \Complete==0
lxv vs0, DISP4(\Index, 0+\OffsetA)(\AREG) // load real,imag from A
lxv vs1, DISP4(\Index, 16+\OffsetA)(\AREG) // load real,imag from A
.endif
.if \Complete==0
lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B
lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
.endif
xvmaddadp vs32, vs8, vs20
xvmaddadp vs36, vs8, vs22
xvmaddadp vs33, vs8, vs21
xvmaddadp vs37, vs8, vs23
.if \Complete==0
xxswapd vs17, vs16
xxswapd vs19, vs18
.endif
xvmaddadp vs34, vs9, vs20
xvmaddadp vs38, vs9, vs22
xvmaddadp vs35, vs9, vs21
xvmaddadp vs39, vs9, vs23
.if \Complete==0
lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B
lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B
.endif
.if \Complete==0
lxv vs8, DISP4(\Index, 32+0+\OffsetA)(\AREG) // load real,imag from A
lxv vs9, DISP4(\Index, 32+16+\OffsetA)(\AREG) // load real,imag from A
.endif
.if \IsLast==1
.if \Complete==1
addi \AREG, \AREG, DISP4(\Index,\OffsetA)
addi \BREG, \BREG, DISP4(\Index,\OffsetB)
.else
addi \AREG, \AREG, DISP4(\Index,64)
addi \BREG, \BREG, DISP4(\Index,64)
.endif
.endif
.endm
.macro KERNEL2x2
LOAD2x2
END2x2 AO, BO, 32,32
.endm
.macro SAVE2x2
add T1, CO, LDC
SAVE2 vs32,vs33,vs34,vs35,CO,0
SAVE2 vs36,vs37,vs38,vs39,T1,0
addi CO, CO, 32
.endm
/**********************************************************************************************
* Macros for N=2 and M=1
**********************************************************************************************/
.macro Zero2x1
xxlxor vs32, vs32, vs32
xxlxor vs33, vs33, vs33
xxlxor vs34, vs34, vs34
xxlxor vs35, vs35, vs35
.endm
.macro LOAD2x1
LOAD2x1O 0,0
.endm
.macro LOAD2x1O OffsetA,OffsetB
lxv vs16, (\OffsetB+0)(BO) // load real,imag from B
lxv vs18, (\OffsetB+16)(BO) // load real,imag from B
xxswapd vs17, vs16
xxswapd vs19, vs18
lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
.endm
.macro END2x1_NORMAL
END2x1 AO,BO,16,32
.endm
.macro END2x1_WITHOUT_ADD
END2x1 AO,BO,0,0
.endm
.macro END2x1 AREG, BREG, OffsetA, OffsetB
.if \OffsetB != 0
addi \BREG, \BREG, \OffsetB
.endif
.if \OffsetA != 0
addi \AREG, \AREG, \OffsetA
.endif
xvmaddadp vs32, vs0, vs16
xvmaddadp vs34, vs0, vs18
xvmaddadp vs33, vs0, vs17
xvmaddadp vs35, vs0, vs19
.endm
.macro LOAD2x1_2
LOAD2x1_2O 0,0
.endm
.macro LOAD2x1_2O OffsetA,OffsetB
lxv vs16, (\OffsetB+0)(BO) // load real,imag from B
lxv vs18, (\OffsetB+16)(BO) // load real,imag from B
lxv vs20, (\OffsetB+32)(BO) // load real,imag from B
lxv vs22, (\OffsetB+48)(BO) // load real,imag from B
xxswapd vs17, vs16
xxswapd vs19, vs18
lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
lxv vs8, (16+\OffsetA)(AO) // load real,imag from A
.endm
.macro END2x1_2
/* for load2 the offsets will be 32 and 64 */
KERNEL2x1_2 AO,BO, 32,64, 0,1,1
.endm
.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast
KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast,1
.endm
.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast
KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast,0
.endm
.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast,Complete
xxswapd vs21, vs20
xxswapd vs23, vs22
xvmaddadp vs32, vs0, vs16
xvmaddadp vs34, vs0, vs18
xvmaddadp vs33, vs0, vs17
xvmaddadp vs35, vs0, vs19
.if \Complete==0
lxv vs0, DISP2(\Index, 0+\OffsetA)(\AREG) // load real,imag from A
.endif
.if \Complete==0
lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B
lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
.endif
.if \Complete==0
xxswapd vs17, vs16
xxswapd vs19, vs18
.endif
xvmaddadp vs32, vs8, vs20
xvmaddadp vs34, vs8, vs22
xvmaddadp vs33, vs8, vs21
xvmaddadp vs35, vs8, vs23
.if \Complete==0
lxv vs8, DISP2(\Index, 16+0+\OffsetA)(\AREG) // load real,imag from A
.endif
.if \Complete==0
lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B
lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B
.endif
.if \IsLast==1
.if \Complete==1
addi \AREG, \AREG, DISP2(\Index,\OffsetA)
addi \BREG, \BREG, DISP4(\Index,\OffsetB)
.else
addi \AREG, \AREG, DISP2(\Index,32)
addi \BREG, \BREG, DISP4(\Index,64)
.endif
.endif
.endm
.macro KERNEL2x1
LOAD2x1
END2x1 AO, BO, 16,32
.endm
.macro SAVE2x1
add T1, CO, LDC
SAVE1 vs32,vs33,CO,0
SAVE1 vs34,vs35,T1,0
addi CO, CO, 16
.endm
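/*
Added note: from here on the N=1 kernels repeat the same scheme with a single
column of B, so only one {b_r,b_i} vector (plus its swap, and a second pair
for the unrolled-by-2 variants) and two accumulators per A vector:

    for (int k = 0; k < m; ++k) { even[k] += a[k] * b; odd[k] += a[k] * b_swapped; }
*/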
/**********************************************************************************************
* Macros for N=1 and M=8
**********************************************************************************************/
.macro Zero1x8
xxlxor vs32, vs32, vs32
xxlxor vs33, vs33, vs33
xxlxor vs34, vs34, vs34
xxlxor vs35, vs35, vs35
xxlxor vs36, vs36, vs36
xxlxor vs37, vs37, vs37
xxlxor vs38, vs38, vs38
xxlxor vs39, vs39, vs39
xxlxor vs40, vs40, vs40
xxlxor vs41, vs41, vs41
xxlxor vs42, vs42, vs42
xxlxor vs43, vs43, vs43
xxlxor vs44, vs44, vs44
xxlxor vs45, vs45, vs45
xxlxor vs46, vs46, vs46
xxlxor vs47, vs47, vs47
xxlxor vs48, vs48, vs48
.endm
.macro LOAD1x8
LOAD1x8O 0,0
.endm
.macro LOAD1x8O OffsetA,OffsetB
lxv vs16, (\OffsetB+0)(BO) // load real,imag from B
xxswapd vs17, vs16
lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
lxv vs2, (32+\OffsetA)(AO) // load real,imag from A
lxv vs3, (48+\OffsetA)(AO) // load real,imag from A
lxv vs4, (64+\OffsetA)(AO) // load real,imag from A
lxv vs5, (80+\OffsetA)(AO) // load real,imag from A
lxv vs6, (96+\OffsetA)(AO) // load real,imag from A
lxv vs7, (112+\OffsetA)(AO) // load real,imag from A
.endm
.macro END1x8_NORMAL
END1x8 AO,BO,128,16
.endm
.macro END1x8_WITHOUT_ADD
END1x8 AO,BO,0,0
.endm
.macro END1x8 AREG, BREG, OffsetA, OffsetB
.if \OffsetB != 0
addi \BREG, \BREG, \OffsetB
.endif
.if \OffsetA != 0
addi \AREG, \AREG, \OffsetA
.endif
xvmaddadp vs32, vs0, vs16
xvmaddadp vs33, vs0, vs17
xvmaddadp vs34, vs1, vs16
xvmaddadp vs35, vs1, vs17
xvmaddadp vs36, vs2, vs16
xvmaddadp vs37, vs2, vs17
xvmaddadp vs38, vs3, vs16
xvmaddadp vs39, vs3, vs17
xvmaddadp vs40, vs4, vs16
xvmaddadp vs41, vs4, vs17
xvmaddadp vs42, vs5, vs16
xvmaddadp vs43, vs5, vs17
xvmaddadp vs44, vs6, vs16
xvmaddadp vs45, vs6, vs17
xvmaddadp vs46, vs7, vs16
xvmaddadp vs47, vs7, vs17
.endm
.macro LOAD1x8_2
LOAD1x8_2O 0,0
.endm
.macro LOAD1x8_2O OffsetA,OffsetB
lxv vs16, (\OffsetB+0)(BO) // load real,imag from B
lxv vs20, (\OffsetB+16)(BO) // load real,imag from B
xxswapd vs17, vs16
lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
lxv vs2, (32+\OffsetA)(AO) // load real,imag from A
lxv vs3, (48+\OffsetA)(AO) // load real,imag from A
lxv vs4, (64+\OffsetA)(AO) // load real,imag from A
lxv vs5, (80+\OffsetA)(AO) // load real,imag from A
lxv vs6, (96+\OffsetA)(AO) // load real,imag from A
lxv vs7, (112+\OffsetA)(AO) // load real,imag from A
lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A
lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A
lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A
lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A
lxv vs12, (128+64+\OffsetA)(AO) // load real,imag from A
lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A
lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A
lxv vs15, (128+112+\OffsetA)(AO) // load real,imag from A
.endm
.macro END1x8_2
/* for load2 the offsets will be 256 and 32 */
KERNEL1x8_2 AO,BO, 256,32, 0,1,1
.endm
.macro KERNEL1x8_E2 OffsetA,OffsetB, Index,IsLast
KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast,1
.endm
.macro KERNEL1x8_L2 OffsetA,OffsetB, Index,IsLast
KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast,0
.endm
.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast,Complete
xvmaddadp vs32, vs0, vs16
xvmaddadp vs33, vs0, vs17
xxswapd vs21, vs20
xvmaddadp vs34, vs1, vs16
xvmaddadp vs35, vs1, vs17
.if \Complete==0
lxv vs0, DISP16(\Index, 0+\OffsetA)(\AREG) // load real,imag from A
lxv vs1, DISP16(\Index, 16+\OffsetA)(\AREG) // load real,imag from A
.endif
xvmaddadp vs36, vs2, vs16
xvmaddadp vs37, vs2, vs17
xvmaddadp vs38, vs3, vs16
xvmaddadp vs39, vs3, vs17
.if \Complete==0
lxv vs2, DISP16(\Index, 32+\OffsetA)(\AREG) // load real,imag from A
lxv vs3, DISP16(\Index, 48+\OffsetA)(\AREG) // load real,imag from A
.endif
xvmaddadp vs40, vs4, vs16
xvmaddadp vs41, vs4, vs17
xvmaddadp vs42, vs5, vs16
xvmaddadp vs43, vs5, vs17
.if \Complete==0
lxv vs4, DISP16(\Index, 64+\OffsetA)(\AREG) // load real,imag from A
lxv vs5, DISP16(\Index, 64+16+\OffsetA)(\AREG) // load real,imag from A
.endif
xvmaddadp vs44, vs6, vs16
xvmaddadp vs45, vs6, vs17
xvmaddadp vs46, vs7, vs16
xvmaddadp vs47, vs7, vs17
.if \Complete==0
lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B
.endif
.if \Complete==0
xxswapd vs17, vs16
.endif
xvmaddadp vs32, vs8, vs20
xvmaddadp vs33, vs8, vs21
.if \Complete==0
lxv vs6, DISP16(\Index, 64+32+\OffsetA)(\AREG) // load real,imag from A
lxv vs7, DISP16(\Index, 64+48+\OffsetA)(\AREG) // load real,imag from A
.endif
xvmaddadp vs34, vs9, vs20
xvmaddadp vs35, vs9, vs21
.if \Complete==0
lxv vs8, DISP16(\Index, 128+\OffsetA)(\AREG) // load real,imag from A
lxv vs9, DISP16(\Index, 128+16+\OffsetA)(\AREG) // load real,imag from A
.endif
xvmaddadp vs36, vs10, vs20
xvmaddadp vs37, vs10, vs21
xvmaddadp vs38, vs11, vs20
xvmaddadp vs39, vs11, vs21
.if \Complete==0
lxv vs10, DISP16(\Index, 128+32+\OffsetA)(\AREG) // load real,imag from A
lxv vs11, DISP16(\Index, 128+48+\OffsetA)(\AREG) // load real,imag from A
.endif
xvmaddadp vs40, vs12, vs20
xvmaddadp vs41, vs12, vs21
xvmaddadp vs42, vs13, vs20
xvmaddadp vs43, vs13, vs21
.if \Complete==0
lxv vs12, DISP16(\Index, 192+\OffsetA)(\AREG) // load real,imag from A
lxv vs13, DISP16(\Index, 192+16+\OffsetA)(\AREG) // load real,imag from A
.endif
xvmaddadp vs44, vs14, vs20
xvmaddadp vs45, vs14, vs21
xvmaddadp vs46, vs15, vs20
xvmaddadp vs47, vs15, vs21
.if \Complete==0
lxv vs14, DISP16(\Index, 192+32+\OffsetA)(\AREG) // load real,imag from A
lxv vs15, DISP16(\Index, 192+48+\OffsetA)(\AREG) // load real,imag from A
lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
.endif
.if \IsLast==1
.if \Complete==1
addi \AREG, \AREG, DISP16(\Index,\OffsetA)
addi \BREG, \BREG, DISP2(\Index,\OffsetB)
.else
addi \AREG, \AREG, DISP16(\Index,256)
addi \BREG, \BREG, DISP2(\Index,32)
.endif
.endif
.endm
.macro KERNEL1x8
LOAD1x8
END1x8 AO, BO, 128,16
.endm
.macro SAVE1x8
SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0
addi CO, CO, 128
.endm
/**********************************************************************************************
* Macros for N=1 and M=4
**********************************************************************************************/
.macro Zero1x4
xxlxor vs32, vs32, vs32
xxlxor vs33, vs33, vs33
xxlxor vs34, vs34, vs34
xxlxor vs35, vs35, vs35
xxlxor vs36, vs36, vs36
xxlxor vs37, vs37, vs37
xxlxor vs38, vs38, vs38
xxlxor vs39, vs39, vs39
.endm
.macro LOAD1x4
LOAD1x4O 0,0
.endm
.macro LOAD1x4O OffsetA,OffsetB
lxv vs16, (\OffsetB+0)(BO) // load real,imag from B
xxswapd vs17, vs16
lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
lxv vs2, (32+\OffsetA)(AO) // load real,imag from A
lxv vs3, (48+\OffsetA)(AO) // load real,imag from A
.endm
.macro END1x4_NORMAL
END1x4 AO,BO,64,16
.endm
.macro END1x4_WITHOUT_ADD
END1x4 AO,BO,0,0
.endm
.macro END1x4 AREG, BREG, OffsetA, OffsetB
.if \OffsetB != 0
addi \BREG, \BREG, \OffsetB
.endif
.if \OffsetA != 0
addi \AREG, \AREG, \OffsetA
.endif
xvmaddadp vs32, vs0, vs16
xvmaddadp vs33, vs0, vs17
xvmaddadp vs34, vs1, vs16
xvmaddadp vs35, vs1, vs17
xvmaddadp vs36, vs2, vs16
xvmaddadp vs37, vs2, vs17
xvmaddadp vs38, vs3, vs16
xvmaddadp vs39, vs3, vs17
.endm
.macro LOAD1x4_2
LOAD1x4_2O 0,0
.endm
.macro LOAD1x4_2O OffsetA,OffsetB
lxv vs16, (\OffsetB+0)(BO) // load real,imag from B
lxv vs20, (\OffsetB+16)(BO) // load real,imag from B
xxswapd vs17, vs16
lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
lxv vs2, (32+\OffsetA)(AO) // load real,imag from A
lxv vs3, (48+\OffsetA)(AO) // load real,imag from A
lxv vs8, (64+\OffsetA)(AO) // load real,imag from A
lxv vs9, (80+\OffsetA)(AO) // load real,imag from A
lxv vs10, (96+\OffsetA)(AO) // load real,imag from A
lxv vs11, (112+\OffsetA)(AO) // load real,imag from A
.endm
.macro END1x4_2
/* for load2 the offsets will be 128 and 32 */
KERNEL1x4_2 AO,BO, 128,32, 0,1,1
.endm
.macro KERNEL1x4_E2 OffsetA,OffsetB, Index,IsLast
KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast,1
.endm
.macro KERNEL1x4_L2 OffsetA,OffsetB, Index,IsLast
KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast,0
.endm
.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast,Complete
xvmaddadp vs32, vs0, vs16
xvmaddadp vs33, vs0, vs17
xxswapd vs21, vs20
xvmaddadp vs34, vs1, vs16
xvmaddadp vs35, vs1, vs17
.if \Complete==0
lxv vs0, DISP8(\Index, 0+\OffsetA)(\AREG) // load real,imag from A
lxv vs1, DISP8(\Index, 16+\OffsetA)(\AREG) // load real,imag from A
.endif
xvmaddadp vs36, vs2, vs16
xvmaddadp vs37, vs2, vs17
xvmaddadp vs38, vs3, vs16
xvmaddadp vs39, vs3, vs17
.if \Complete==0
lxv vs2, DISP8(\Index, 32+\OffsetA)(\AREG) // load real,imag from A
lxv vs3, DISP8(\Index, 48+\OffsetA)(\AREG) // load real,imag from A
.endif
.if \Complete==0
lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B
.endif
xvmaddadp vs32, vs8, vs20
xvmaddadp vs33, vs8, vs21
.if \Complete==0
xxswapd vs17, vs16
.endif
xvmaddadp vs34, vs9, vs20
xvmaddadp vs35, vs9, vs21
.if \Complete==0
lxv vs8, DISP8(\Index, 64+0+\OffsetA)(\AREG) // load real,imag from A
lxv vs9, DISP8(\Index, 64+16+\OffsetA)(\AREG) // load real,imag from A
.endif
xvmaddadp vs36, vs10, vs20
xvmaddadp vs37, vs10, vs21
xvmaddadp vs38, vs11, vs20
xvmaddadp vs39, vs11, vs21
.if \Complete==0
lxv vs10, DISP8(\Index, 64+32+\OffsetA)(\AREG) // load real,imag from A
lxv vs11, DISP8(\Index, 64+48+\OffsetA)(\AREG) // load real,imag from A
.endif
.if \Complete==0
lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
.endif
.if \IsLast==1
.if \Complete==1
addi \AREG, \AREG, DISP8(\Index,\OffsetA)
addi \BREG, \BREG, DISP2(\Index,\OffsetB)
.else
addi \AREG, \AREG, DISP8(\Index,128)
addi \BREG, \BREG, DISP2(\Index,32)
.endif
.endif
.endm
.macro KERNEL1x4
LOAD1x4
END1x4 AO, BO, 64,16
.endm
.macro SAVE1x4
SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0
addi CO, CO, 64
.endm
/**********************************************************************************************
* Macros for N=1 and M=2
**********************************************************************************************/
.macro Zero1x2
xxlxor vs32, vs32, vs32
xxlxor vs33, vs33, vs33
xxlxor vs34, vs34, vs34
xxlxor vs35, vs35, vs35
.endm
.macro LOAD1x2
LOAD1x2O 0,0
.endm
.macro LOAD1x2O OffsetA,OffsetB
lxv vs16, (\OffsetB+0)(BO) // load real,imag from B
xxswapd vs17, vs16
lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
.endm
.macro END1x2_NORMAL
END1x2 AO,BO,32,16
.endm
.macro END1x2_WITHOUT_ADD
END1x2 AO,BO,0,0
.endm
.macro END1x2 AREG, BREG, OffsetA, OffsetB
.if \OffsetB != 0
addi \BREG, \BREG, \OffsetB
.endif
.if \OffsetA != 0
addi \AREG, \AREG, \OffsetA
.endif
xvmaddadp vs32, vs0, vs16
xvmaddadp vs33, vs0, vs17
xvmaddadp vs34, vs1, vs16
xvmaddadp vs35, vs1, vs17
.endm
.macro LOAD1x2_2
LOAD1x2_2O 0,0
.endm
.macro LOAD1x2_2O OffsetA,OffsetB
lxv vs16, (\OffsetB+0)(BO) // load real,imag from B
lxv vs20, (\OffsetB+16)(BO) // load real,imag from B
xxswapd vs17, vs16
lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
lxv vs1, (16+\OffsetA)(AO) // load real,imag from A
lxv vs8, (32+\OffsetA)(AO) // load real,imag from A
lxv vs9, (48+\OffsetA)(AO) // load real,imag from A
.endm
.macro END1x2_2
/* for load2 the offsets will be 64 and 32 */
KERNEL1x2_2 AO,BO, 64,32, 0,1,1
.endm
.macro KERNEL1x2_E2 OffsetA,OffsetB, Index,IsLast
KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast,1
.endm
.macro KERNEL1x2_L2 OffsetA,OffsetB, Index,IsLast
KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast,0
.endm
.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast,Complete
xvmaddadp vs32, vs0, vs16
xvmaddadp vs33, vs0, vs17
xxswapd vs21, vs20
xvmaddadp vs34, vs1, vs16
xvmaddadp vs35, vs1, vs17
.if \Complete==0
lxv vs0, DISP4(\Index, 0+\OffsetA)(\AREG) // load real,imag from A
lxv vs1, DISP4(\Index, 16+\OffsetA)(\AREG) // load real,imag from A
.endif
.if \Complete==0
lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B
.endif
xvmaddadp vs32, vs8, vs20
xvmaddadp vs33, vs8, vs21
.if \Complete==0
xxswapd vs17, vs16
.endif
xvmaddadp vs34, vs9, vs20
xvmaddadp vs35, vs9, vs21
.if \Complete==0
lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
.endif
.if \Complete==0
lxv vs8, DISP4(\Index, 32+0+\OffsetA)(\AREG) // load real,imag from A
lxv vs9, DISP4(\Index, 32+16+\OffsetA)(\AREG) // load real,imag from A
.endif
.if \IsLast==1
.if \Complete==1
addi \AREG, \AREG, DISP4(\Index,\OffsetA)
addi \BREG, \BREG, DISP2(\Index,\OffsetB)
.else
addi \AREG, \AREG, DISP4(\Index,64)
addi \BREG, \BREG, DISP2(\Index,32)
.endif
.endif
.endm
.macro KERNEL1x2
LOAD1x2
END1x2 AO, BO, 32,16
.endm
.macro SAVE1x2
SAVE2 vs32,vs33,vs34,vs35,CO,0
addi CO, CO, 32
.endm
/**********************************************************************************************
* Macros for N=1 and M=1
**********************************************************************************************/
.macro Zero1x1
xxlxor vs32, vs32, vs32
xxlxor vs33, vs33, vs33
.endm
.macro LOAD1x1
LOAD1x1O 0,0
.endm
.macro LOAD1x1O OffsetA,OffsetB
lxv vs16, (\OffsetB+0)(BO) // load real,imag from B
lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
xxswapd vs17, vs16
.endm
.macro END1x1_NORMAL
END1x1 AO,BO,16,16
.endm
.macro END1x1_WITHOUT_ADD
END1x1 AO,BO,0,0
.endm
.macro END1x1 AREG, BREG, OffsetA, OffsetB
.if \OffsetB != 0
addi \BREG, \BREG, \OffsetB
.endif
.if \OffsetA != 0
addi \AREG, \AREG, \OffsetA
.endif
xvmaddadp vs32, vs0, vs16
xvmaddadp vs33, vs0, vs17
.endm
.macro LOAD1x1_2
LOAD1x1_2O 0,0
.endm
.macro LOAD1x1_2O OffsetA,OffsetB
lxv vs16, (\OffsetB+0)(BO) // load real,imag from B
lxv vs20, (\OffsetB+16)(BO) // load real,imag from B
xxswapd vs17, vs16
lxv vs0, (0+\OffsetA)(AO) // load real,imag from A
lxv vs8, (16+\OffsetA)(AO) // load real,imag from A
.endm
.macro END1x1_2
/* for load2 the offsets will be 32 and 32 */
KERNEL1x1_2 AO,BO, 32,32, 0,1,1
.endm
.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast
KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast,1
.endm
.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast
KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast,0
.endm
.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast,Complete
xxswapd vs21, vs20
xvmaddadp vs32, vs0, vs16
xvmaddadp vs33, vs0, vs17
.if \Complete==0
lxv vs0, DISP2(\Index, 0+\OffsetA)(\AREG) // load real,imag from A
.endif
.if \Complete==0
lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B
.endif
.if \Complete==0
xxswapd vs17, vs16
.endif
xvmaddadp vs32, vs8, vs20
xvmaddadp vs33, vs8, vs21
.if \Complete==0
lxv vs8, DISP2(\Index, 16+0+\OffsetA)(\AREG) // load real,imag from A
.endif
.if \Complete==0
lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
.endif
.if \IsLast==1
.if \Complete==1
addi \AREG, \AREG, DISP2(\Index,\OffsetA)
addi \BREG, \BREG, DISP2(\Index,\OffsetB)
.else
addi \AREG, \AREG, DISP2(\Index,32)
addi \BREG, \BREG, DISP2(\Index,32)
.endif
.endif
.endm
.macro KERNEL1x1
LOAD1x1
END1x1 AO, BO, 16,16
.endm
.macro SAVE1x1
SAVE1 vs32,vs33,CO,0
addi CO, CO, 16
.endm
/**************************** TRMM POINTER REFRESH MACROS *************************/
.macro SHIFT_REG REG1,REG2,SHIFT_VAL
.if \SHIFT_VAL==16
slwi \REG1, \REG2, 8
.elseif \SHIFT_VAL==8
slwi \REG1, \REG2, 7
.elseif \SHIFT_VAL==4
slwi \REG1, \REG2, 6
.elseif \SHIFT_VAL==2
slwi \REG1, \REG2, 5
.elseif \SHIFT_VAL==1
slwi \REG1, \REG2, 4
.endif
.endm
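/*
Added note: SHIFT_REG converts an element count into a byte offset with one
shift, exploiting unit_size == 16 and power-of-two SHIFT_VAL; the table above
encodes off * SHIFT_VAL * 16 as off << (4 + log2(SHIFT_VAL)):

    long shift_reg(long off, int shift_val) {   // shift_val in {1,2,4,8,16}
        return off << (4 + __builtin_ctz(shift_val));   // GCC builtin for log2 here
    }
*/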
/*
// #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
// ptrbb = bb;
// #else
// ptrba += off*16;
// ptrbb = bb + off*2;
// #endif
*/
.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/* ptrbb = bb; */
mr \PTR_B,\B_VAL /* refresh BPOINT */
#else
/*
// ptrba = ptrba + off*C_A;
// ptrbb = bb + off*C_B;
*/
SHIFT_REG T4,\OFF_VAL,\C_B /* number of values in B shifted */
SHIFT_REG T2,\OFF_VAL,\C_A /* number of values in A shifted */
add \PTR_B, \B_VAL, T4 /* add values to BO */
add \PTR_A, \PTR_A, T2 /* add values to AO */
#endif
.endm
/*
// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
// temp = bk-off;
// #elif defined(LEFT)
// temp = off+16; // number of values in A
// #else
// temp = off+2; // number of values in B
// #endif
*/
.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
/* temp = bk-off; */
sub \TEMP_BK,\BK_VAL,\OFF_VAL
#elif defined(LEFT)
/* temp = off+INCR_A; // number of values in A */
addi \TEMP_BK, \OFF_VAL, \INCR_A
#else
/* temp = off+INCR_B; // number of values in B */
addi \TEMP_BK,\OFF_VAL, \INCR_B
#endif
.endm
/*
// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
// temp = bk - off;
// #ifdef LEFT
// temp -= 16; // number of values in A
// #else
// temp -= 2; // number of values in B
// #endif
// ptrba += temp*16;
// ptrbb += temp*2;
// #endif
// #ifdef LEFT
// off += 16; // number of values in A
// #endif
*/
.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/* temp = bk - off; */
sub \TEMP_BK,\BK_VAL,\OFF_VAL
#ifdef LEFT
/* temp -= C_A; // number of values in A */
addi \TEMP_BK,\TEMP_BK,-\C_A
#else
/* temp -= C_B; // number of values in B */
addi \TEMP_BK,\TEMP_BK,-\C_B
#endif
/* ptrba += temp*C_A;
ptrbb += temp*C_B; */
SHIFT_REG T4,\TEMP_BK,\C_A
SHIFT_REG T2,\TEMP_BK,\C_B
add \PTR_A, \PTR_A,T4 /* ptrba + temp*C_A */
add \PTR_B, \PTR_B,T2
#endif
#ifdef LEFT
/* off += C_A; // number of values in A */
addi \OFF_VAL,\OFF_VAL,\C_A
#endif
.endm
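/*
Added worked example (illustrative, for the M=8/N=2 kernel, i.e. C_A=8 and
C_B=2): REFRESH_POINTERS leaves BO = B + off*2 elements (off << 5 bytes) and
AO advanced by off*8 elements (off << 7 bytes); REFRESH_TEMP_BK picks the
inner trip count; after the tile is stored, REFRESH_AFTER_SAVE skips the
remaining bk-off-8 (resp. bk-off-2) values and, with LEFT, bumps off by 8:

    // ptrba += off * 8 * 16;  ptrbb = bb + off * 2 * 16;   (byte arithmetic)
*/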