
cgemm_macros_power10.S

  1. /***************************************************************************
  2. Copyright (c) 2013-2020, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. #define unit_size 8
  28. #define DISP32(ind, disp) (ind*unit_size*32+disp)
  29. #define DISP16(ind, disp) (ind*unit_size*16+disp)
  30. #define DISP8(ind, disp) (ind*unit_size*8+disp)
  31. #define DISP4(ind, disp) (ind*unit_size*4+disp)
  32. #define DISP2(ind, disp) (ind*unit_size*2+disp)
  33. #define DISP1(ind, disp) (ind*unit_size+disp)
  34. #define DISPX(disp) (disp)
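/* Note (inferred from the defines above, not stated in the source): with
   unit_size = 8 bytes (one single-precision complex element), DISPn(ind, disp)
   is the byte displacement of unroll step 'ind' when n elements are consumed
   per step, e.g. DISP16(2, 32) = 2*8*16 + 32 = 288. These displacements feed
   the lxvp/addi addressing in the unrolled kernels below. */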
  35. .macro AGGREGATE_REALS_IMAGES VSINR_OUT1, VSINR, VSINI_OUT2, VSINI
  36. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  37. xvsubsp \VSINR_OUT1, \VSINR_OUT1, \VSINR
  38. xvaddsp \VSINI_OUT2, \VSINI_OUT2, \VSINI
  39. #elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
  40. xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR
  41. xvsubsp \VSINI_OUT2, \VSINI_OUT2, \VSINI
  42. #elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
  43. xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR
  44. xvsubsp \VSINI_OUT2, \VSINI, \VSINI_OUT2
  45. #else // CC || CR || RC || RR
  46. /* we will assume {-alpha_r,-alpha_i} for this case */
  47. /* i1*i2 - r1*r2, so we negate alpha_r instead to fix the sign */
  48. xvsubsp \VSINR_OUT1, \VSINR, \VSINR_OUT1
  49. /* we negate alpha_i (the imaginary part) instead to fix the sign */
  50. xvaddsp \VSINI_OUT2, \VSINI_OUT2, \VSINI
  51. #endif
  52. .endm
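/* Background (the register roles here are inferred, not stated in the source):
   for x = (r1 + i*i1) and y = (r2 + i*i2), x*y has real part r1*r2 - i1*i2 and
   imaginary part r1*i2 + i1*r2; conjugating x and/or y only flips the signs of
   these terms. Assuming VSINR_OUT1/VSINR hold the two real-product partial sums
   and VSINI_OUT2/VSINI the two imaginary-product partial sums, the
   NN/CN/NC/CC-style #ifdefs above simply select the add/sub combination that
   matches which of A and B is conjugated. */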
  53. .macro AGGREGATE_REALS_IMAGES_A_PERMUTE VSINR_OUT1, VSINR, VSINI_OUT2, VSINI
  54. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  55. xvsubsp \VSINR_OUT1, \VSINR_OUT1, \VSINR
  56. xvaddsp \VSINI_OUT2, \VSINI_OUT2, \VSINI
  57. #elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
  58. xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR
  59. xvsubsp \VSINI_OUT2, \VSINI, \VSINI_OUT2
  60. #elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
  61. xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR
  62. xvsubsp \VSINI_OUT2, \VSINI_OUT2, \VSINI
  63. #else // CC || CR || RC || RR
  64. /* we will assume {-alpha_r,-alpha_i} for this case */
  65. /* i1*i2 - r1*r2, so we negate alpha_r instead to fix the sign */
  66. xvsubsp \VSINR_OUT1, \VSINR, \VSINR_OUT1
  67. /* we negate alpha_i (the imaginary part) instead to fix the sign */
  68. xvaddsp \VSINI_OUT2, \VSINI_OUT2, \VSINI
  69. #endif
  70. .endm
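/* This variant differs from AGGREGATE_REALS_IMAGES only in which conjugation
   cases reverse the operand order of the imaginary-part xvsubsp, presumably
   because its callers pass inputs that have already been permuted (hence the
   _A_PERMUTE suffix); this reading is inferred from the code, not documented. */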
  71. /* part 1: VSOUT1 = {i0,i1}*{alpha_i,alpha_i}, VSOUT2 = {r0,r1}*{alpha_i,alpha_i}; part 2 below folds these into the final subtract/add */
  72. .macro MULT_APLHA_PART1 VSINRR, VSINII, VSOUT1, VSOUT2
  73. xvmulsp \VSOUT1, \VSINII, alpha_i
  74. xvmulsp \VSOUT2, \VSINRR, alpha_i
  75. .endm
  76. /* part 2: VSOUT1 = {r0,r1}*{alpha_r,alpha_r} - VSOUT1; VSOUT2 = VSOUT2 + {i0,i1}*{alpha_r,alpha_r} */
  77. .macro MULT_APLHA_PART2 VSINRR, VSINII, VSOUT1, VSOUT2
  78. xvmsubasp \VSOUT1, \VSINRR, alpha_r
  79. xvmaddasp \VSOUT2, \VSINII, alpha_r
  80. .endm
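/* Net effect of PART1 followed by PART2, assuming VSINRR = {r0,r1} and
   VSINII = {i0,i1} (an inference from the surrounding code):
     VSOUT1 = r*alpha_r - i*alpha_i   (real part of alpha*x)
     VSOUT2 = i*alpha_r + r*alpha_i   (imaginary part of alpha*x)
   xvmsubasp computes XT = XA*XB - XT and xvmaddasp computes XT = XA*XB + XT,
   which is why the alpha_i products are formed first in PART1. */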
  81. .macro PERMUTE1 OUT, R1, R2, R3, R4
  82. xxsel vs62, \R1, \R2, vs57
  83. xxsel \OUT, \R3, \R4, vs57
  84. xxpermdi \OUT, \OUT, vs62, 1
  85. .endm
  86. .macro PERMUTE2 OUT, R1, R2, R3, R4
  87. xxsel vs62, \R2, \R1, vs57
  88. xxsel \OUT, \R4, \R3, vs57
  89. xxpermdi \OUT, vs62, \OUT, 1
  90. xxperm \OUT, \OUT, permute_mask
  91. .endm
  92. .macro PERMUTE3 OUT, R1, R2, R3, R4
  93. xxsel vs62, \R1, \R2, vs57
  94. xxsel \OUT, \R3, \R4, vs57
  95. xxpermdi \OUT, vs62, \OUT, 2
  96. .endm
  97. .macro PERMUTE4 OUT, R1, R2, R3, R4
  98. xxsel vs62, \R2, \R1, vs57
  99. xxsel \OUT, \R4, \R3, vs57
  100. xxpermdi \OUT, \OUT, vs62, 2
  101. xxperm \OUT, \OUT, permute_mask
  102. .endm
  103. .macro GROUP1
  104. xxperm vs0, vs32, permute_mask
  105. xxperm vs4, vs40, permute_mask
  106. xxperm vs1, vs33, permute_mask
  107. xxperm vs5, vs41, permute_mask
  108. xxperm vs8, vs36, permute_mask
  109. xxperm vs12, vs44, permute_mask
  110. xxperm vs9, vs37, permute_mask
  111. xxperm vs13, vs45, permute_mask
  112. .endm
  113. .macro AGG_GROUP1
  114. AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4
  115. AGGREGATE_REALS_IMAGES vs33, vs1, vs41, vs5
  116. AGGREGATE_REALS_IMAGES vs36, vs8, vs44, vs12
  117. AGGREGATE_REALS_IMAGES vs37, vs9, vs45, vs13
  118. .endm
  119. .macro GROUP2
  120. xxperm vs0, vs34, permute_mask
  121. xxperm vs4, vs42, permute_mask
  122. xxperm vs1, vs35, permute_mask
  123. xxperm vs5, vs43, permute_mask
  124. xxperm vs8, vs38, permute_mask
  125. xxperm vs12, vs46, permute_mask
  126. xxperm vs9, vs39, permute_mask
  127. xxperm vs13, vs47, permute_mask
  128. .endm
  129. .macro AGG_GROUP2
  130. AGGREGATE_REALS_IMAGES vs34, vs0, vs42, vs4
  131. AGGREGATE_REALS_IMAGES vs35, vs1, vs43, vs5
  132. AGGREGATE_REALS_IMAGES vs38, vs8, vs46, vs12
  133. AGGREGATE_REALS_IMAGES vs39, vs9, vs47, vs13
  134. .endm
  135. .macro MULTIPLY_GROUP1
  136. MULT_APLHA_PART1 vs32, vs40, vs0, vs1
  137. MULT_APLHA_PART1 vs33, vs41, vs2, vs3
  138. MULT_APLHA_PART1 vs36, vs44, vs8, vs9
  139. MULT_APLHA_PART1 vs37, vs45, vs10, vs11
  140. MULT_APLHA_PART2 vs32, vs40, vs0, vs1
  141. MULT_APLHA_PART2 vs33, vs41, vs2, vs3
  142. MULT_APLHA_PART2 vs36, vs44, vs8, vs9
  143. MULT_APLHA_PART2 vs37, vs45, vs10, vs11
  144. .endm
  145. .macro MULTIPLY_GROUP2
  146. MULT_APLHA_PART1 vs34, vs42, vs4, vs5
  147. MULT_APLHA_PART1 vs35, vs43, vs6, vs7
  148. MULT_APLHA_PART1 vs38, vs46, vs12, vs13
  149. MULT_APLHA_PART1 vs39, vs47, vs14, vs15
  150. MULT_APLHA_PART2 vs34, vs42, vs4, vs5
  151. MULT_APLHA_PART2 vs35, vs43, vs6, vs7
  152. MULT_APLHA_PART2 vs38, vs46, vs12, vs13
  153. MULT_APLHA_PART2 vs39, vs47, vs14, vs15
  154. .endm
  155. /* reconstruct r, i pairs*/
  156. .macro RECONSTRUCT_PAIR1
  157. xxperm vs0, vs1, save_permute_1
  158. xxperm vs2, vs3, save_permute_1
  159. xxperm vs8, vs9, save_permute_1
  160. xxperm vs10, vs11, save_permute_1
  161. .endm
  162. .macro RECONSTRUCT_PAIR2
  163. xxperm vs4, vs5, save_permute_1
  164. xxperm vs6, vs7, save_permute_1
  165. xxperm vs12, vs13, save_permute_1
  166. xxperm vs14, vs15, save_permute_1
  167. .endm
  168. .macro SHUFFLE_ACC ACC, R0, R1, R2, R3, O1, O2, O3, O4
  169. xxmfacc \ACC
  170. PERMUTE1 \O1, \R3, \R2, \R1, \R0
  171. PERMUTE2 \O2, \R1, \R0, \R3, \R2
  172. PERMUTE3 \O3, \R1, \R0, \R3, \R2
  173. PERMUTE4 \O4, \R3, \R2, \R1, \R0
  174. .endm
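/* xxmfacc copies the 512-bit MMA accumulator \ACC back into its four backing
   VSRs (\R0-\R3); the PERMUTE1-PERMUTE4 calls then reshuffle those rows into
   \O1-\O4 using vs57 and permute_mask, which the caller is expected to have
   set up beforehand. The exact lane layout produced here is inferred from the
   permute macros above rather than documented in the source. */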
  175. /* macros for N=4 and M=8
  176. **********************************************************************************************/
  177. .macro ZERO4x8
  178. xxsetaccz 0
  179. xxsetaccz 1
  180. xxsetaccz 2
  181. xxsetaccz 3
  182. xxsetaccz 4
  183. xxsetaccz 5
  184. xxsetaccz 6
  185. xxsetaccz 7
  186. .endm
  187. .macro LOAD4x8
  188. LOAD4x8O 0, 0
  189. .endm
  190. .macro LOAD4x8O OffsetA, OffsetB
  191. lxvp vs34, (\OffsetB+0)(BO)
  192. lxvp vs32, (\OffsetA+0)(AO)
  193. lxvp vs36, (\OffsetA+32)(AO)
  194. .endm
  195. .macro END4x8_NORMAL
  196. END4x8 AO, BO, 64, 32
  197. .endm
  198. .macro END4x8_WITHOUT_ADD
  199. END4x8 AO, BO, 0, 0
  200. .endm
  201. .macro END4x8 AREG, BREG, OffsetA, OffsetB
  202. .if \OffsetB != 0
  203. addi \BREG, \BREG, \OffsetB
  204. .endif
  205. .if \OffsetA != 0
  206. addi \AREG, \AREG, \OffsetA
  207. .endif
  208. #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  209. xvf32gerpp 3, 36, 34
  210. xvf32gerpp 2, 37, 34
  211. xvf32gerpp 1, 32, 34
  212. xvf32gerpp 0, 33, 34
  213. xvf32gerpp 7, 36, 35
  214. xvf32gerpp 6, 37, 35
  215. xvf32gerpp 5, 32, 35
  216. xvf32gerpp 4, 33, 35
  217. #else
  218. xvf32gerpp 3, 36, 35
  219. xvf32gerpp 2, 37, 35
  220. xvf32gerpp 1, 32, 35
  221. xvf32gerpp 0, 33, 35
  222. xvf32gerpp 7, 36, 34
  223. xvf32gerpp 6, 37, 34
  224. xvf32gerpp 5, 32, 34
  225. xvf32gerpp 4, 33, 34
  226. #endif
  227. .endm
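/* Each xvf32gerpp AT, XA, XB accumulates a 4x4 rank-1 fp32 outer product of XA
   and XB into accumulator AT. The __BYTE_ORDER__ #if swaps which half of each
   lxvp register pair (e.g. vs34 vs. vs35) feeds which accumulator, presumably
   because lxvp fills the two registers of the pair in an endian-dependent
   order; the arithmetic itself is identical on both endiannesses. */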
  228. .macro LOAD4x8_2
  229. LOAD4x8_2O 0, 0
  230. .endm
  231. .macro LOAD4x8_2O OffsetA, OffsetB
  232. lxvp vs34, (\OffsetB)(BO)
  233. lxvp vs38, (32+\OffsetB)(BO)
  234. lxvp vs32, (0+\OffsetA)(AO)
  235. lxvp vs36, (32+\OffsetA)(AO)
  236. lxvp vs40, (64+\OffsetA)(AO)
  237. lxvp vs42, (64+32+\OffsetA)(AO)
  238. .endm
  239. .macro END4x8_2
  241. /* for load2 the offsets are 128 (A) and 64 (B) */
  241. KERNEL4x8_2 AO, BO, 128, 64, 0, 1, 1
  242. .endm
  243. .macro KERNEL4x8_E2 OffsetA, OffsetB, Index, IsLast
  244. KERNEL4x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
  245. .endm
  246. .macro KERNEL4x8_L2 OffsetA, OffsetB, Index, IsLast
  247. KERNEL4x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
  248. .endm
  249. .macro KERNEL4x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
  250. #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  251. xvf32gerpp 3, 36, 34
  252. xvf32gerpp 2, 37, 34
  253. xvf32gerpp 1, 32, 34
  254. xvf32gerpp 0, 33, 34
  255. xvf32gerpp 7, 36, 35
  256. xvf32gerpp 6, 37, 35
  257. xvf32gerpp 5, 32, 35
  258. xvf32gerpp 4, 33, 35
  259. #else
  260. xvf32gerpp 3, 36, 35
  261. xvf32gerpp 2, 37, 35
  262. xvf32gerpp 1, 32, 35
  263. xvf32gerpp 0, 33, 35
  264. xvf32gerpp 7, 36, 34
  265. xvf32gerpp 6, 37, 34
  266. xvf32gerpp 5, 32, 34
  267. xvf32gerpp 4, 33, 34
  268. #endif
  269. .if \Complete==0
  270. lxvp vs34, DISP8(\Index, \OffsetB)(\BREG)
  271. lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG)
  272. lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG)
  273. .endif
  274. #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  275. xvf32gerpp 3, 42, 38
  276. xvf32gerpp 2, 43, 38
  277. xvf32gerpp 1, 40, 38
  278. xvf32gerpp 0, 41, 38
  279. xvf32gerpp 7, 42, 39
  280. xvf32gerpp 6, 43, 39
  281. xvf32gerpp 5, 40, 39
  282. xvf32gerpp 4, 41, 39
  283. #else
  284. xvf32gerpp 3, 42, 39
  285. xvf32gerpp 2, 43, 39
  286. xvf32gerpp 1, 40, 39
  287. xvf32gerpp 0, 41, 39
  288. xvf32gerpp 7, 42, 38
  289. xvf32gerpp 6, 43, 38
  290. xvf32gerpp 5, 40, 38
  291. xvf32gerpp 4, 41, 38
  292. #endif
  293. .if \Complete==0
  294. lxvp vs40, DISP16(\Index, 64+\OffsetA)(\AREG)
  295. lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG)
  296. lxvp vs42, DISP16(\Index, 64+32+\OffsetA)(\AREG)
  297. .endif
  298. .if \IsLast==1
  299. .if \Complete==1
  300. addi \BREG, \BREG, DISP8(\Index, \OffsetB)
  301. addi \AREG, \AREG, DISP16(\Index, \OffsetA)
  302. .else
  303. addi \BREG, \BREG, DISP8(\Index, 64)
  304. addi \AREG, \AREG, DISP16(\Index, 128)
  305. .endif
  306. .endif
  307. .endm
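/* KERNEL4x8_2 is the two-iteration unrolled body: it issues the GER updates
   for the data loaded previously while (when Complete == 0) loading the A/B
   tiles for the next pass, and only bumps AO/BO once per unrolled step when
   IsLast == 1. The Index/Offset arguments are folded into the DISP8/DISP16
   displacements defined at the top of the file. */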
  308. .macro KERNEL4x8
  309. LOAD4x8
  310. END4x8 AO, BO, 64, 32
  311. .endm
  312. .macro SAVE4x8
  313. SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
  314. SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45
  315. SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46
  316. SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47
  317. SHUFFLE_ACC 4, vs16, vs17, vs18, vs19, vs48, vs56, vs52, vs60
  318. SHUFFLE_ACC 5, vs20, vs21, vs22, vs23, vs49, vs16, vs53, vs61
  319. SHUFFLE_ACC 7, vs28, vs29, vs30, vs31, vs17, vs19, vs18, vs20
  320. SHUFFLE_ACC 6, vs24, vs25, vs26, vs27, vs50, vs58, vs54, vs21
  321. add T4, LDC, LDC
  322. add T1, CO, LDC
  323. #ifndef TRMMKERNEL
  324. lxvp vs24, 0(CO)
  325. #endif
  326. #ifndef TRMMKERNEL
  327. lxvp vs26, 32(CO)
  328. #endif
  329. #ifndef TRMMKERNEL
  330. lxvp vs28, 0(T1)
  331. #endif
  332. xxperm vs2, vs34, permute_mask
  333. xxperm vs6, vs42, permute_mask
  334. #ifndef TRMMKERNEL
  335. lxvp vs30, 32(T1)
  336. #endif
  337. xxperm vs3, vs35, permute_mask
  338. xxperm vs7, vs43, permute_mask
  339. add T2, CO, T4
  340. add T3, T1, T4
  341. GROUP1
  342. AGG_GROUP1
  343. AGGREGATE_REALS_IMAGES vs34, vs2, vs42, vs6
  344. xxperm vs10, vs38, permute_mask
  345. xxperm vs14, vs46, permute_mask
  346. AGGREGATE_REALS_IMAGES vs35, vs3, vs43, vs7
  347. xxperm vs11, vs39, permute_mask
  348. xxperm vs15, vs47, permute_mask
  349. xxperm vs0, vs48, permute_mask
  350. xxperm vs4, vs56, permute_mask
  351. xxperm vs1, vs49, permute_mask
  352. xxperm vs5, vs16, permute_mask
  353. AGGREGATE_REALS_IMAGES vs38, vs10, vs46, vs14
  354. xxperm vs2, vs50, permute_mask
  355. xxperm vs6, vs58, permute_mask
  356. AGGREGATE_REALS_IMAGES vs39, vs11, vs47, vs15
  357. xxperm vs3, vs17, permute_mask
  358. xxperm vs7, vs19, permute_mask
  359. AGGREGATE_REALS_IMAGES vs48, vs0, vs56, vs4
  360. xxperm vs8, vs52, permute_mask
  361. xxperm vs12, vs60, permute_mask
  362. AGGREGATE_REALS_IMAGES vs49, vs1, vs16, vs5
  363. xxperm vs9, vs53, permute_mask
  364. xxperm vs13, vs61, permute_mask
  365. AGGREGATE_REALS_IMAGES vs50, vs2, vs58, vs6
  366. xxperm vs10, vs54, permute_mask
  367. xxperm vs14, vs21, permute_mask
  368. AGGREGATE_REALS_IMAGES vs17, vs3, vs19, vs7
  369. xxperm vs11, vs18, permute_mask
  370. xxperm vs15, vs20, permute_mask
  371. AGGREGATE_REALS_IMAGES vs52, vs8, vs60, vs12
  372. AGGREGATE_REALS_IMAGES vs53, vs9, vs61, vs13
  373. /*VSINRR, VSINII, VSOUT1, VSOUT2*/
  374. MULT_APLHA_PART1 vs32, vs40, vs0, vs1
  375. AGGREGATE_REALS_IMAGES vs54, vs10, vs21, vs14
  376. MULT_APLHA_PART1 vs33, vs41, vs2, vs3
  377. AGGREGATE_REALS_IMAGES vs18, vs11, vs20, vs15
  378. MULT_APLHA_PART1 vs34, vs42, vs4, vs5
  379. MULT_APLHA_PART1 vs35, vs43, vs6, vs7
  380. MULT_APLHA_PART2 vs32, vs40, vs0, vs1
  381. MULT_APLHA_PART2 vs33, vs41, vs2, vs3
  382. MULT_APLHA_PART2 vs34, vs42, vs4, vs5
  383. MULT_APLHA_PART2 vs35, vs43, vs6, vs7
  384. #ifndef TRMMKERNEL
  385. lxvp vs32, 0(T2)
  386. #endif
  387. MULT_APLHA_PART1 vs36, vs44, vs8, vs9
  388. MULT_APLHA_PART1 vs37, vs45, vs10, vs11
  389. #ifndef TRMMKERNEL
  390. lxvp vs40, 32(T2)
  391. #endif
  392. MULT_APLHA_PART1 vs38, vs46, vs12, vs13
  393. MULT_APLHA_PART1 vs39, vs47, vs14, vs15
  394. #ifndef TRMMKERNEL
  395. lxvp vs34, 0(T3)
  396. #endif
  397. MULT_APLHA_PART2 vs36, vs44, vs8, vs9
  398. MULT_APLHA_PART2 vs37, vs45, vs10, vs11
  399. #ifndef TRMMKERNEL
  400. lxvp vs42, 32(T3)
  401. #endif
  402. MULT_APLHA_PART2 vs38, vs46, vs12, vs13
  403. MULT_APLHA_PART2 vs39, vs47, vs14, vs15
  404. RECONSTRUCT_PAIR1
  405. RECONSTRUCT_PAIR2
  406. #ifndef TRMMKERNEL
  407. /* add */
  408. #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  409. xxpermdi vs1, vs0, vs8, 1
  410. xxpermdi vs3, vs2, vs10, 1
  411. xxpermdi vs5, vs4, vs12, 1
  412. xxpermdi vs7, vs6, vs14, 1
  413. xxpermdi vs9, vs8, vs0, 1
  414. xxpermdi vs11, vs10, vs2, 1
  415. #else
  416. xxpermdi vs1, vs8, vs0, 2
  417. xxpermdi vs3, vs10, vs2, 2
  418. xxpermdi vs5, vs12, vs4, 2
  419. xxpermdi vs7, vs14, vs6, 2
  420. xxpermdi vs9, vs0, vs8, 2
  421. xxpermdi vs11, vs2, vs10, 2
  422. #endif
  423. xvaddsp vs24, vs24, vs3
  424. xvaddsp vs25, vs25, vs1
  425. #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  426. xxpermdi vs13, vs12, vs4, 1
  427. xxpermdi vs15, vs14, vs6, 1
  428. #else
  429. xxpermdi vs13, vs4, vs12, 2
  430. xxpermdi vs15, vs6, vs14, 2
  431. #endif
  432. xvaddsp vs26, vs26, vs7
  433. xvaddsp vs27, vs27, vs5
  434. xvaddsp vs28, vs28, vs11
  435. xvaddsp vs29, vs29, vs9
  436. xvaddsp vs30, vs30, vs15
  437. xvaddsp vs31, vs31, vs13
  438. #else
  439. #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  440. xxpermdi vs25, vs0, vs8, 1
  441. xxpermdi vs24, vs2, vs10, 1
  442. xxpermdi vs27, vs4, vs12, 1
  443. xxpermdi vs26, vs6, vs14, 1
  444. xxpermdi vs29, vs8, vs0, 1
  445. xxpermdi vs28, vs10, vs2, 1
  446. xxpermdi vs31, vs12, vs4, 1
  447. xxpermdi vs30, vs14, vs6, 1
  448. #else
  449. xxpermdi vs25, vs8, vs0, 2
  450. xxpermdi vs24, vs10, vs2, 2
  451. xxpermdi vs27, vs12, vs4, 2
  452. xxpermdi vs26, vs14, vs6, 2
  453. xxpermdi vs29, vs0, vs8, 2
  454. xxpermdi vs28, vs2, vs10, 2
  455. xxpermdi vs31, vs4, vs12, 2
  456. xxpermdi vs30, vs6, vs14, 2
  457. #endif
  458. #endif
  459. stxvp vs24, 0(CO)
  460. MULT_APLHA_PART1 vs48, vs56, vs0, vs1
  461. MULT_APLHA_PART1 vs49, vs16, vs2, vs3
  462. stxvp vs26, 32(CO)
  463. MULT_APLHA_PART1 vs50, vs58, vs4, vs5
  464. MULT_APLHA_PART1 vs17, vs19, vs6, vs7
  465. stxvp vs28, 0(T1)
  466. MULT_APLHA_PART2 vs48, vs56, vs0, vs1
  467. MULT_APLHA_PART2 vs49, vs16, vs2, vs3
  468. stxvp vs30, 32(T1)
  469. MULT_APLHA_PART2 vs50, vs58, vs4, vs5
  470. MULT_APLHA_PART2 vs17, vs19, vs6, vs7
  471. MULT_APLHA_PART1 vs52, vs60, vs8, vs9
  472. MULT_APLHA_PART1 vs53, vs61, vs10, vs11
  473. MULT_APLHA_PART1 vs54, vs21, vs12, vs13
  474. MULT_APLHA_PART1 vs18, vs20, vs14, vs15
  475. MULT_APLHA_PART2 vs52, vs60, vs8, vs9
  476. MULT_APLHA_PART2 vs53, vs61, vs10, vs11
  477. MULT_APLHA_PART2 vs54, vs21, vs12, vs13
  478. MULT_APLHA_PART2 vs18, vs20, vs14, vs15
  479. RECONSTRUCT_PAIR1
  480. RECONSTRUCT_PAIR2
  481. #ifndef TRMMKERNEL
  482. /* add */
  483. #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  484. xxpermdi vs1, vs0, vs8, 1
  485. xxpermdi vs3, vs2, vs10, 1
  486. xxpermdi vs5, vs4, vs12, 1
  487. xxpermdi vs7, vs6, vs14, 1
  488. xxpermdi vs9, vs8, vs0, 1
  489. xxpermdi vs11, vs10, vs2, 1
  490. #else
  491. xxpermdi vs1, vs8, vs0, 2
  492. xxpermdi vs3, vs10, vs2, 2
  493. xxpermdi vs5, vs12, vs4, 2
  494. xxpermdi vs7, vs14, vs6, 2
  495. xxpermdi vs9, vs0, vs8, 2
  496. xxpermdi vs11, vs2, vs10, 2
  497. #endif
  498. xvaddsp vs32, vs32, vs3
  499. xvaddsp vs33, vs33, vs1
  500. #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  501. xxpermdi vs13, vs12, vs4, 1
  502. xxpermdi vs15, vs14, vs6, 1
  503. #else
  504. xxpermdi vs13, vs4, vs12, 2
  505. xxpermdi vs15, vs6, vs14, 2
  506. #endif
  507. xvaddsp vs40, vs40, vs7
  508. xvaddsp vs41, vs41, vs5
  509. xvaddsp vs34, vs34, vs11
  510. xvaddsp vs35, vs35, vs9
  511. xvaddsp vs42, vs42, vs15
  512. xvaddsp vs43, vs43, vs13
  513. #else
  514. #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  515. xxpermdi vs33, vs0, vs8, 1
  516. xxpermdi vs32, vs2, vs10, 1
  517. xxpermdi vs41, vs4, vs12, 1
  518. xxpermdi vs40, vs6, vs14, 1
  519. xxpermdi vs35, vs8, vs0, 1
  520. xxpermdi vs34, vs10, vs2, 1
  521. xxpermdi vs43, vs12, vs4, 1
  522. xxpermdi vs42, vs14, vs6, 1
  523. #else
  524. xxpermdi vs33, vs8, vs0, 2
  525. xxpermdi vs32, vs10, vs2, 2
  526. xxpermdi vs41, vs12, vs4, 2
  527. xxpermdi vs40, vs14, vs6, 2
  528. xxpermdi vs35, vs0, vs8, 2
  529. xxpermdi vs34, vs2, vs10, 2
  530. xxpermdi vs43, vs4, vs12, 2
  531. xxpermdi vs42, vs6, vs14, 2
  532. #endif
  533. #endif
  534. stxvp vs32, 0(T2)
  535. stxvp vs40, 32(T2)
  536. stxvp vs34, 0(T3)
  537. stxvp vs42, 32(T3)
  538. addi CO, CO, 64
  539. .endm
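/* SAVE4x8 drains the eight accumulators (SHUFFLE_ACC), recombines the real and
   imaginary partial sums (AGGREGATE_REALS_IMAGES), scales by alpha
   (MULT_APLHA_PART1/2), and then either adds the result into the existing C
   tile (the non-TRMMKERNEL path, which loads C via CO/T1/T2/T3 first) or
   stores it directly (the TRMMKERNEL path). The interleaving of loads,
   permutes and stores is presumably ordered to hide latency, not required for
   correctness. */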
  540. /* macros for N=4 and M=4
  541. **********************************************************************************************/
  542. .macro ZERO4x4
  543. xxsetaccz 0
  544. xxsetaccz 1
  545. xxsetaccz 2
  546. xxsetaccz 3
  547. .endm
  548. .macro LOAD4x4
  549. LOAD4x4O 0, 0
  550. .endm
  551. .macro LOAD4x4O OffsetA, OffsetB
  552. lxvp vs34, (\OffsetB+0)(BO)
  553. lxvp vs32, (\OffsetA+0)(AO)
  554. .endm
  555. .macro END4x4_NORMAL
  556. END4x4 AO, BO, 32, 32
  557. .endm
  558. .macro END4x4_WITHOUT_ADD
  559. END4x4 AO, BO, 0, 0
  560. .endm
  561. .macro END4x4 AREG, BREG, OffsetA, OffsetB
  562. .if \OffsetB != 0
  563. addi \BREG, \BREG, \OffsetB
  564. .endif
  565. .if \OffsetA != 0
  566. addi \AREG, \AREG, \OffsetA
  567. .endif
  568. #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  569. xvf32gerpp 3, 32, 35
  570. xvf32gerpp 2, 33, 35
  571. xvf32gerpp 1, 32, 34
  572. xvf32gerpp 0, 33, 34
  573. #else
  574. xvf32gerpp 3, 32, 34
  575. xvf32gerpp 2, 33, 34
  576. xvf32gerpp 1, 32, 35
  577. xvf32gerpp 0, 33, 35
  578. #endif
  579. .endm
  580. .macro LOAD4x4_2
  581. LOAD4x4_2O 0, 0
  582. .endm
  583. .macro LOAD4x4_2O OffsetA, OffsetB
  584. lxvp vs34, (\OffsetB)(BO)
  585. lxvp vs38, (32+\OffsetB)(BO)
  586. lxvp vs32, (0+\OffsetA)(AO)
  587. lxvp vs36, (32+\OffsetA)(AO)
  588. .endm
  589. .macro END4x4_2
  590. /* for load2 the offsets are 64 (A) and 64 (B) */
  591. KERNEL4x4_2 AO, BO, 64, 64, 0, 1, 1
  592. .endm
  593. .macro KERNEL4x4_E2 OffsetA, OffsetB, Index, IsLast
  594. KERNEL4x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
  595. .endm
  596. .macro KERNEL4x4_L2 OffsetA, OffsetB, Index, IsLast
  597. KERNEL4x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
  598. .endm
  599. .macro KERNEL4x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
  600. #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  601. xvf32gerpp 3, 32, 35
  602. xvf32gerpp 2, 33, 35
  603. xvf32gerpp 1, 32, 34
  604. xvf32gerpp 0, 33, 34
  605. #else
  606. xvf32gerpp 3, 32, 34
  607. xvf32gerpp 2, 33, 34
  608. xvf32gerpp 1, 32, 35
  609. xvf32gerpp 0, 33, 35
  610. #endif
  611. .if \Complete==0
  612. lxvp vs34, DISP8(\Index, \OffsetB)(\BREG)
  613. lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG)
  614. .endif
  615. #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  616. xvf32gerpp 3, 36, 39
  617. xvf32gerpp 2, 37, 39
  618. xvf32gerpp 1, 36, 38
  619. xvf32gerpp 0, 37, 38
  620. #else
  621. xvf32gerpp 3, 36, 38
  622. xvf32gerpp 2, 37, 38
  623. xvf32gerpp 1, 36, 39
  624. xvf32gerpp 0, 37, 39
  625. #endif
  626. .if \Complete==0
  627. lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG)
  628. lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG)
  629. .endif
  630. .if \IsLast==1
  631. .if \Complete==1
  632. addi \BREG, \BREG, DISP8(\Index, \OffsetB)
  633. addi \AREG, \AREG, DISP8(\Index, \OffsetA)
  634. .else
  635. addi \BREG, \BREG, DISP8(\Index, 64)
  636. addi \AREG, \AREG, DISP8(\Index, 64)
  637. .endif
  638. .endif
  639. .endm
  640. .macro KERNEL4x4
  641. LOAD4x4
  642. END4x4 AO, BO, 32, 32
  643. .endm
  644. .macro SAVE4x4
  645. SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
  646. SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45
  647. SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46
  648. SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47
  649. add T4, LDC, LDC
  650. add T1, CO, LDC
  651. #ifndef TRMMKERNEL
  652. lxvp vs24, 0(CO)
  653. #endif
  654. add T2, CO, T4
  655. add T3, T1, T4
  656. #ifndef TRMMKERNEL
  657. lxvp vs26, 0(T1)
  658. #endif
  659. #ifndef TRMMKERNEL
  660. lxvp vs28, 0(T2)
  661. #endif
  662. #ifndef TRMMKERNEL
  663. lxvp vs30, 0(T3)
  664. #endif
  665. GROUP1
  666. AGG_GROUP1
  667. GROUP2
  668. AGG_GROUP2
  669. /*VSINRR, VSINII, VSOUT1, VSOUT2*/
  670. MULTIPLY_GROUP1
  671. MULTIPLY_GROUP2
  672. /* reconstruct r, i pairs*/
  673. RECONSTRUCT_PAIR1
  674. RECONSTRUCT_PAIR2
  675. #ifndef TRMMKERNEL
  676. /* add */
  677. #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  678. xxpermdi vs1, vs0, vs8, 1
  679. xxpermdi vs3, vs2, vs10, 1
  680. xxpermdi vs9, vs8, vs0, 1
  681. xxpermdi vs11, vs10, vs2, 1
  682. xxpermdi vs5, vs4, vs12, 1
  683. xxpermdi vs7, vs6, vs14, 1
  684. xxpermdi vs13, vs12, vs4, 1
  685. xxpermdi vs15, vs14, vs6, 1
  686. #else
  687. xxpermdi vs1, vs8, vs0, 2
  688. xxpermdi vs3, vs10, vs2, 2
  689. xxpermdi vs9, vs0, vs8, 2
  690. xxpermdi vs11, vs2, vs10, 2
  691. xxpermdi vs5, vs12, vs4, 2
  692. xxpermdi vs7, vs14, vs6, 2
  693. xxpermdi vs13, vs4, vs12, 2
  694. xxpermdi vs15, vs6, vs14, 2
  695. #endif
  696. xvaddsp vs24, vs24, vs3
  697. xvaddsp vs25, vs25, vs1
  698. xvaddsp vs26, vs26, vs11
  699. xvaddsp vs27, vs27, vs9
  700. xvaddsp vs28, vs28, vs7
  701. xvaddsp vs29, vs29, vs5
  702. xvaddsp vs30, vs30, vs15
  703. xvaddsp vs31, vs31, vs13
  704. #else
  705. #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  706. xxpermdi vs25, vs0, vs8, 1
  707. xxpermdi vs24, vs2, vs10, 1
  708. xxpermdi vs27, vs8, vs0, 1
  709. xxpermdi vs26, vs10, vs2, 1
  710. xxpermdi vs29, vs4, vs12, 1
  711. xxpermdi vs28, vs6, vs14, 1
  712. xxpermdi vs31, vs12, vs4, 1
  713. xxpermdi vs30, vs14, vs6, 1
  714. #else
  715. xxpermdi vs25, vs8, vs0, 2
  716. xxpermdi vs24, vs10, vs2, 2
  717. xxpermdi vs27, vs0, vs8, 2
  718. xxpermdi vs26, vs2, vs10, 2
  719. xxpermdi vs29, vs12, vs4, 2
  720. xxpermdi vs28, vs14, vs6, 2
  721. xxpermdi vs31, vs4, vs12, 2
  722. xxpermdi vs30, vs6, vs14, 2
  723. #endif
  724. #endif
  725. stxvp vs24, 0(CO)
  726. stxvp vs26, 0(T1)
  727. stxvp vs28, 0(T2)
  728. stxvp vs30, 0(T3)
  729. addi CO, CO, 32
  730. .endm
  731. /* macros for N=4 and M=2
  732. **********************************************************************************************/
  733. .macro ZERO4x2
  734. xxsetaccz 0
  735. xxsetaccz 1
  736. .endm
  737. .macro LOAD4x2
  738. LOAD4x2O 0, 0
  739. .endm
  740. .macro LOAD4x2O OffsetA, OffsetB
  741. lxv vs32, (\OffsetA+0)(AO)
  742. lxvp vs34, (\OffsetB+0)(BO)
  743. .endm
  744. .macro END4x2_NORMAL
  745. END4x2 AO, BO, 16, 32
  746. .endm
  747. .macro END4x2_WITHOUT_ADD
  748. END4x2 AO, BO, 0, 0
  749. .endm
  750. .macro END4x2 AREG, BREG, OffsetA, OffsetB
  751. .if \OffsetB != 0
  752. addi \BREG, \BREG, \OffsetB
  753. .endif
  754. .if \OffsetA != 0
  755. addi \AREG, \AREG, \OffsetA
  756. .endif
  757. #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  758. xvf32gerpp 1, 35, 32
  759. xvf32gerpp 0, 34, 32
  760. #else
  761. xvf32gerpp 1, 34, 32
  762. xvf32gerpp 0, 35, 32
  763. #endif
  764. .endm
  765. .macro LOAD4x2_2
  766. LOAD4x2_2O 0, 0
  767. .endm
  768. .macro LOAD4x2_2O OffsetA, OffsetB
  769. lxvp vs32, (\OffsetA)(AO)
  770. lxvp vs34, (0+\OffsetB)(BO)
  771. lxvp vs36, (32+\OffsetB)(BO)
  772. .endm
  773. .macro END4x2_2
  774. /* for load2 the offsets are 32 (A) and 64 (B) */
  775. KERNEL4x2_2 AO, BO, 32, 64, 0, 1, 1
  776. .endm
  777. .macro KERNEL4x2_E2 OffsetA, OffsetB, Index, IsLast
  778. KERNEL4x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
  779. .endm
  780. .macro KERNEL4x2_L2 OffsetA, OffsetB, Index, IsLast
  781. KERNEL4x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
  782. .endm
  783. .macro KERNEL4x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
  784. #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  785. xvf32gerpp 1, 35, 32
  786. xvf32gerpp 0, 34, 32
  787. #else
  788. xvf32gerpp 1, 34, 33
  789. xvf32gerpp 0, 35, 33
  790. #endif
  791. .if \Complete==0
  792. lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG)
  793. .endif
  794. #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  795. xvf32gerpp 1, 37, 33
  796. xvf32gerpp 0, 36, 33
  797. #else
  798. xvf32gerpp 1, 36, 32
  799. xvf32gerpp 0, 37, 32
  800. #endif
  801. .if \Complete==0
  802. lxvp vs32, DISP4(\Index, \OffsetA)(\AREG)
  803. lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG)
  804. .endif
  805. .if \IsLast==1
  806. .if \Complete==1
  807. addi \AREG, \AREG, DISP4(\Index, \OffsetA)
  808. addi \BREG, \BREG, DISP8(\Index, \OffsetB)
  809. .else
  810. addi \AREG, \AREG, DISP4(\Index, 32)
  811. addi \BREG, \BREG, DISP8(\Index, 64)
  812. .endif
  813. .endif
  814. .endm
  815. .macro KERNEL4x2
  816. LOAD4x2
  817. END4x2 AO, BO, 16, 32
  818. .endm
  819. .macro SAVE4x2
  820. SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
  821. SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45
  822. add T4, LDC, LDC
  823. add T1, CO, LDC
  824. add T2, CO, T4
  825. add T3, T1, T4
  826. #ifndef TRMMKERNEL
  827. lxv vs24, 0(CO)
  828. #endif
  829. #ifndef TRMMKERNEL
  830. lxv vs25, 0(T1)
  831. #endif
  832. #ifndef TRMMKERNEL
  833. lxv vs26, 0(T2)
  834. #endif
  835. #ifndef TRMMKERNEL
  836. lxv vs27, 0(T3)
  837. #endif
  838. GROUP1
  839. AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4
  840. AGGREGATE_REALS_IMAGES_A_PERMUTE vs33, vs1, vs41, vs5
  841. AGGREGATE_REALS_IMAGES_A_PERMUTE vs36, vs8, vs44, vs12
  842. AGGREGATE_REALS_IMAGES_A_PERMUTE vs37, vs9, vs45, vs13
  843. /*VSINRR, VSINII, VSOUT1, VSOUT2*/
  844. MULTIPLY_GROUP1
  845. /* reconstruct r, i pairs*/
  846. RECONSTRUCT_PAIR1
  847. #ifndef TRMMKERNEL
  848. /* add */
  849. #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  850. xxpermdi vs1, vs0, vs8, 0
  851. xxpermdi vs9, vs2, vs10, 0
  852. xxpermdi vs3, vs8, vs0, 3
  853. xxpermdi vs11, vs10, vs2, 3
  854. #else
  855. xxpermdi vs1, vs8, vs0, 0
  856. xxpermdi vs9, vs10, vs2, 0
  857. xxpermdi vs3, vs0, vs8, 3
  858. xxpermdi vs11, vs2, vs10, 3
  859. #endif
  860. xvaddsp vs24, vs24, vs1
  861. xvaddsp vs26, vs26, vs9
  862. xvaddsp vs25, vs25, vs3
  863. xvaddsp vs27, vs27, vs11
  864. #else
  865. #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  866. xxpermdi vs24, vs0, vs8, 0
  867. xxpermdi vs26, vs2, vs10, 0
  868. xxpermdi vs25, vs8, vs0, 3
  869. xxpermdi vs27, vs10, vs2, 3
  870. #else
  871. xxpermdi vs24, vs8, vs0, 0
  872. xxpermdi vs26, vs10, vs2, 0
  873. xxpermdi vs25, vs0, vs8, 3
  874. xxpermdi vs27, vs2, vs10, 3
  875. #endif
  876. #endif
  877. stxv vs24, 0(CO)
  878. stxv vs25, 0(T1)
  879. stxv vs26, 0(T2)
  880. stxv vs27, 0(T3)
  881. addi CO, CO, 16
  882. .endm
  883. /* macros for N=4 and M=1
  884. **********************************************************************************************/
  885. .macro ZERO4x1
  886. xxsetaccz 0
  887. xxsetaccz 1
  888. .endm
  889. .macro LOAD4x1
  890. LOAD4x1O 0, 0
  891. .endm
  892. .macro LOAD4x1O OffsetA, OffsetB
  893. lxsd v0, (\OffsetA+0)(AO)
  894. lxvp vs34, (\OffsetB+0)(BO)
  895. .endm
  896. .macro END4x1_NORMAL
  897. END4x1 AO, BO, 8, 32
  898. .endm
  899. .macro END4x1_WITHOUT_ADD
  900. END4x1 AO, BO, 0, 0
  901. .endm
  902. .macro END4x1 AREG, BREG, OffsetA, OffsetB
  903. .if \OffsetB != 0
  904. addi \BREG, \BREG, \OffsetB
  905. .endif
  906. .if \OffsetA != 0
  907. addi \AREG, \AREG, \OffsetA
  908. .endif
  909. #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  910. xvf32gerpp 0, 34, 32
  911. xvf32gerpp 1, 35, 32
  912. #else
  913. xvf32gerpp 0, 35, 32
  914. xvf32gerpp 1, 34, 32
  915. #endif
  916. .endm
  917. .macro LOAD4x1_2
  918. LOAD4x1_2O 0, 0
  919. .endm
  920. .macro LOAD4x1_2O OffsetA, OffsetB
  921. lxv vs32, (\OffsetA)(AO)
  922. vspltisb v6, 0
  923. #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  924. xxpermdi vs33, vs32, vs38, 2
  925. xxpermdi vs32, vs32, vs38, 0
  926. #else
  927. xxpermdi vs33, vs32, vs38, 0
  928. xxpermdi vs32, vs32, vs38, 2
  929. #endif
  930. lxvp vs34, (0+\OffsetB)(BO)
  931. lxvp vs36, (32+\OffsetB)(BO)
  932. .endm
  933. .macro END4x1_2
  934. /* for load2 the offsets are 16 (A) and 64 (B) */
  935. KERNEL4x1_2 AO, BO, 16, 64, 0, 1, 1
  936. .endm
  937. .macro KERNEL4x1_E2 OffsetA, OffsetB, Index, IsLast
  938. KERNEL4x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
  939. .endm
  940. .macro KERNEL4x1_L2 OffsetA, OffsetB, Index, IsLast
  941. KERNEL4x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
  942. .endm
  943. .macro KERNEL4x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
  944. #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  945. xvf32gerpp 0, 34, 32
  946. xvf32gerpp 1, 35, 32
  947. #else
  948. xvf32gerpp 0, 35, 32
  949. xvf32gerpp 1, 34, 32
  950. #endif
  951. .if \Complete==0
  952. lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG)
  953. .endif
  954. #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  955. xvf32gerpp 0, 36, 33
  956. xvf32gerpp 1, 37, 33
  957. #else
  958. xvf32gerpp 0, 37, 33
  959. xvf32gerpp 1, 36, 33
  960. #endif
  961. .if \Complete==0
  962. lxv vs32, DISP2(\Index, \OffsetA)(\AREG)
  963. lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG)
  964. #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  965. xxpermdi vs33, vs32, vs38, 2
  966. xxpermdi vs32, vs32, vs38, 0
  967. #else
  968. xxpermdi vs33, vs32, vs38, 0
  969. xxpermdi vs32, vs32, vs38, 2
  970. #endif
  971. .endif
  972. .if \IsLast==1
  973. .if \Complete==1
  974. addi \AREG, \AREG, DISP2(\Index, \OffsetA)
  975. addi \BREG, \BREG, DISP8(\Index, \OffsetB)
  976. .else
  977. addi \AREG, \AREG, DISP2(\Index, 16)
  978. addi \BREG, \BREG, DISP8(\Index, 64)
  979. .endif
  980. .endif
  981. .endm
  982. .macro KERNEL4x1
  983. LOAD4x1
  984. END4x1 AO, BO, 8, 32
  985. .endm
  986. .macro SAVE4x1
  987. SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
  988. SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45
  989. xxpermdi vs32, vs32, vs36, 1
  990. xxpermdi vs40, vs40, vs44, 1
  991. xxpermdi vs33, vs33, vs37, 1
  992. xxpermdi vs41, vs41, vs45, 1
  993. add T4, LDC, LDC
  994. add T1, CO, LDC
  995. add T2, CO, T4
  996. add T3, T1, T4
  997. #ifndef TRMMKERNEL
  998. lxsd v4, 0(CO)
  999. #endif
  1000. #ifndef TRMMKERNEL
  1001. lxsd v5, 0(T1)
  1002. #endif
  1003. #ifndef TRMMKERNEL
  1004. lxsd v6, 0(T2)
  1005. #endif
  1006. #ifndef TRMMKERNEL
  1007. lxsd v7, 0(T3)
  1008. #endif
  1009. xxperm vs0, vs32, permute_mask
  1010. xxperm vs4, vs40, permute_mask
  1011. xxperm vs1, vs33, permute_mask
  1012. xxperm vs5, vs41, permute_mask
  1013. AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4
  1014. AGGREGATE_REALS_IMAGES_A_PERMUTE vs33, vs1, vs41, vs5
  1015. /*VSINRR, VSINII, VSOUT1, VSOUT2*/
  1016. MULT_APLHA_PART1 vs32, vs40, vs0, vs1
  1017. MULT_APLHA_PART1 vs33, vs41, vs2, vs3
  1018. MULT_APLHA_PART2 vs32, vs40, vs0, vs1
  1019. MULT_APLHA_PART2 vs33, vs41, vs2, vs3
  1020. /* reconstruct r, i pairs*/
  1021. xxperm vs0, vs1, save_permute_1
  1022. xxperm vs2, vs3, save_permute_1
  1023. #ifndef TRMMKERNEL
  1024. /* add */
  1025. xxspltd vs1, vs0, 0
  1026. xxspltd vs3, vs0, 1
  1027. xxspltd vs9, vs2, 0
  1028. xxspltd vs11, vs2, 1
  1029. /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/
  1030. xvaddsp vs36, vs36, vs1
  1031. xvaddsp vs37, vs37, vs3
  1032. xvaddsp vs38, vs38, vs9
  1033. xvaddsp vs39, vs39, vs11
  1034. #else
  1035. /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/
  1036. xxspltd vs36, vs0, 0
  1037. xxspltd vs37, vs0, 1
  1038. xxspltd vs38, vs2, 0
  1039. xxspltd vs39, vs2, 1
  1040. #endif
  1041. stxsd v4, 0(CO)
  1042. stxsd v5, 0(T1)
  1043. stxsd v6, 0(T2)
  1044. stxsd v7, 0(T3)
  1045. addi CO, CO, 8
  1046. .endm
  1047. /* macros for N=2 and M=8
  1048. **********************************************************************************************/
  1049. .macro ZERO2x8
  1050. xxsetaccz 0
  1051. xxsetaccz 1
  1052. xxsetaccz 2
  1053. xxsetaccz 3
  1054. .endm
  1055. .macro LOAD2x8
  1056. LOAD2x8O 0, 0
  1057. .endm
  1058. .macro LOAD2x8O OffsetA, OffsetB
  1059. lxv vs34, (\OffsetB+0)(BO)
  1060. lxvp vs32, (\OffsetA+0)(AO)
  1061. lxvp vs36, (\OffsetA+32)(AO)
  1062. .endm
  1063. .macro END2x8_NORMAL
  1064. END2x8 AO, BO, 64, 16
  1065. .endm
  1066. .macro END2x8_WITHOUT_ADD
  1067. END2x8 AO, BO, 0, 0
  1068. .endm
  1069. .macro END2x8 AREG, BREG, OffsetA, OffsetB
  1070. .if \OffsetB != 0
  1071. addi \BREG, \BREG, \OffsetB
  1072. .endif
  1073. .if \OffsetA != 0
  1074. addi \AREG, \AREG, \OffsetA
  1075. .endif
  1076. xvf32gerpp 2, 37, 34
  1077. xvf32gerpp 3, 36, 34
  1078. xvf32gerpp 0, 33, 34
  1079. xvf32gerpp 1, 32, 34
  1080. .endm
  1081. .macro LOAD2x8_2
  1082. LOAD2x8_2O 0, 0
  1083. .endm
  1084. .macro LOAD2x8_2O OffsetA, OffsetB
  1085. lxvp vs34, (\OffsetB)(BO)
  1086. lxvp vs32, (0+\OffsetA)(AO)
  1087. lxvp vs36, (32+\OffsetA)(AO)
  1088. lxvp vs38, (64+\OffsetA)(AO)
  1089. lxvp vs40, (64+32+\OffsetA)(AO)
  1090. .endm
  1091. .macro END2x8_2
  1092. /* for load2 the offsets are 128 (A) and 32 (B) */
  1093. KERNEL2x8_2 AO, BO, 128, 32, 0, 1, 1
  1094. .endm
  1095. .macro KERNEL2x8_E2 OffsetA, OffsetB, Index, IsLast
  1096. KERNEL2x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
  1097. .endm
  1098. .macro KERNEL2x8_L2 OffsetA, OffsetB, Index, IsLast
  1099. KERNEL2x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
  1100. .endm
  1101. .macro KERNEL2x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
  1102. #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  1103. xvf32gerpp 2, 37, 34
  1104. xvf32gerpp 3, 36, 34
  1105. xvf32gerpp 0, 33, 34
  1106. xvf32gerpp 1, 32, 34
  1107. #else
  1108. xvf32gerpp 2, 37, 35
  1109. xvf32gerpp 3, 36, 35
  1110. xvf32gerpp 0, 33, 35
  1111. xvf32gerpp 1, 32, 35
  1112. #endif
  1113. .if \Complete==0
  1114. lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG)
  1115. lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG)
  1116. .endif
  1117. #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  1118. xvf32gerpp 2, 41, 35
  1119. xvf32gerpp 3, 40, 35
  1120. xvf32gerpp 0, 39, 35
  1121. xvf32gerpp 1, 38, 35
  1122. #else
  1123. xvf32gerpp 2, 41, 34
  1124. xvf32gerpp 3, 40, 34
  1125. xvf32gerpp 0, 39, 34
  1126. xvf32gerpp 1, 38, 34
  1127. #endif
  1128. .if \Complete==0
  1129. lxvp vs34, DISP4(\Index, \OffsetB)(\BREG)
  1130. lxvp vs38, DISP16(\Index, 64+\OffsetA)(\AREG)
  1131. lxvp vs40, DISP16(\Index, 64+32+\OffsetA)(\AREG)
  1132. .endif
  1133. .if \IsLast==1
  1134. .if \Complete==1
  1135. addi \BREG, \BREG, DISP4(\Index, \OffsetB)
  1136. addi \AREG, \AREG, DISP16(\Index, \OffsetA)
  1137. .else
  1138. addi \BREG, \BREG, DISP4(\Index, 32)
  1139. addi \AREG, \AREG, DISP16(\Index, 128)
  1140. .endif
  1141. .endif
  1142. .endm
  1143. .macro KERNEL2x8
  1144. LOAD2x8
  1145. END2x8 AO, BO, 64, 16
  1146. .endm
  1147. .macro SAVE2x8
  1148. SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
  1149. SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45
  1150. SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46
  1151. SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47
  1152. add T1, CO, LDC
  1153. #ifndef TRMMKERNEL
  1154. lxvp vs24, 0(CO)
  1155. #endif
  1156. #ifndef TRMMKERNEL
  1157. lxvp vs26, 32(CO)
  1158. #endif
  1159. #ifndef TRMMKERNEL
  1160. lxvp vs28, 0(T1)
  1161. #endif
  1162. #ifndef TRMMKERNEL
  1163. lxvp vs30, 32(T1)
  1164. #endif
  1165. add T2, CO, T4
  1166. add T3, T1, T4
  1167. GROUP1
  1168. AGG_GROUP1
  1169. GROUP2
  1170. AGG_GROUP2
  1171. /*VSINRR, VSINII, VSOUT1, VSOUT2*/
  1172. MULTIPLY_GROUP1
  1173. MULTIPLY_GROUP2
  1174. /* reconstruct r, i pairs*/
  1175. RECONSTRUCT_PAIR1
  1176. RECONSTRUCT_PAIR2
  1177. #ifndef TRMMKERNEL
  1178. /* add */
  1179. #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  1180. xxpermdi vs1, vs0, vs8, 1
  1181. xxpermdi vs3, vs2, vs10, 1
  1182. xxpermdi vs5, vs4, vs12, 1
  1183. xxpermdi vs7, vs6, vs14, 1
  1184. xxpermdi vs9, vs8, vs0, 1
  1185. xxpermdi vs11, vs10, vs2, 1
  1186. #else
  1187. xxpermdi vs1, vs8, vs0, 2
  1188. xxpermdi vs3, vs10, vs2, 2
  1189. xxpermdi vs5, vs12, vs4, 2
  1190. xxpermdi vs7, vs14, vs6, 2
  1191. xxpermdi vs9, vs0, vs8, 2
  1192. xxpermdi vs11, vs2, vs10, 2
  1193. #endif
  1194. xvaddsp vs24, vs24, vs3
  1195. xvaddsp vs25, vs25, vs1
  1196. #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  1197. xxpermdi vs13, vs12, vs4, 1
  1198. xxpermdi vs15, vs14, vs6, 1
  1199. #else
  1200. xxpermdi vs13, vs4, vs12, 2
  1201. xxpermdi vs15, vs6, vs14, 2
  1202. #endif
  1203. xvaddsp vs26, vs26, vs7
  1204. xvaddsp vs27, vs27, vs5
  1205. xvaddsp vs28, vs28, vs11
  1206. xvaddsp vs29, vs29, vs9
  1207. xvaddsp vs30, vs30, vs15
  1208. xvaddsp vs31, vs31, vs13
  1209. #else
  1210. #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  1211. xxpermdi vs25, vs0, vs8, 1
  1212. xxpermdi vs24, vs2, vs10, 1
  1213. xxpermdi vs27, vs4, vs12, 1
  1214. xxpermdi vs26, vs6, vs14, 1
  1215. xxpermdi vs29, vs8, vs0, 1
  1216. xxpermdi vs28, vs10, vs2, 1
  1217. xxpermdi vs31, vs12, vs4, 1
  1218. xxpermdi vs30, vs14, vs6, 1
  1219. #else
  1220. xxpermdi vs25, vs8, vs0, 2
  1221. xxpermdi vs24, vs10, vs2, 2
  1222. xxpermdi vs27, vs12, vs4, 2
  1223. xxpermdi vs26, vs14, vs6, 2
  1224. xxpermdi vs29, vs0, vs8, 2
  1225. xxpermdi vs28, vs2, vs10, 2
  1226. xxpermdi vs31, vs4, vs12, 2
  1227. xxpermdi vs30, vs6, vs14, 2
  1228. #endif
  1229. #endif
  1230. stxvp vs24, 0(CO)
  1231. stxvp vs26, 32(CO)
  1232. stxvp vs28, 0(T1)
  1233. stxvp vs30, 32(T1)
  1234. addi CO, CO, 64
  1235. .endm
  1236. /* macros for N=2 and M=4
  1237. **********************************************************************************************/
  1238. .macro ZERO2x4
  1239. xxsetaccz 0
  1240. xxsetaccz 1
  1241. .endm
  1242. .macro LOAD2x4
  1243. LOAD2x4O 0, 0
  1244. .endm
  1245. .macro LOAD2x4O OffsetA, OffsetB
  1246. lxv vs34, (\OffsetB+0)(BO)
  1247. lxvp vs32, (\OffsetA+0)(AO)
  1248. .endm
  1249. .macro END2x4_NORMAL
  1250. END2x4 AO, BO, 32, 16
  1251. .endm
  1252. .macro END2x4_WITHOUT_ADD
  1253. END2x4 AO, BO, 0, 0
  1254. .endm
  1255. .macro END2x4 AREG, BREG, OffsetA, OffsetB
  1256. .if \OffsetB != 0
  1257. addi \BREG, \BREG, \OffsetB
  1258. .endif
  1259. .if \OffsetA != 0
  1260. addi \AREG, \AREG, \OffsetA
  1261. .endif
  1262. xvf32gerpp 0, 33, 34
  1263. xvf32gerpp 1, 32, 34
  1264. .endm
  1265. .macro LOAD2x4_2
  1266. LOAD2x4_2O 0, 0
  1267. .endm
  1268. .macro LOAD2x4_2O OffsetA, OffsetB
  1269. lxvp vs34, (\OffsetB)(BO)
  1270. lxvp vs32, (0+\OffsetA)(AO)
  1271. lxvp vs36, (32+\OffsetA)(AO)
  1272. .endm
  1273. .macro END2x4_2
  1274. /* for load2 the offsets are 64 (A) and 32 (B) */
  1275. KERNEL2x4_2 AO, BO, 64, 32, 0, 1, 1
  1276. .endm
  1277. .macro KERNEL2x4_E2 OffsetA, OffsetB, Index, IsLast
  1278. KERNEL2x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
  1279. .endm
  1280. .macro KERNEL2x4_L2 OffsetA, OffsetB, Index, IsLast
  1281. KERNEL2x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
  1282. .endm
  1283. .macro KERNEL2x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
  1284. #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  1285. xvf32gerpp 0, 33, 34
  1286. xvf32gerpp 1, 32, 34
  1287. #else
  1288. xvf32gerpp 0, 33, 35
  1289. xvf32gerpp 1, 32, 35
  1290. #endif
  1291. .if \Complete==0
  1292. lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG)
  1293. .endif
  1294. #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  1295. xvf32gerpp 0, 37, 35
  1296. xvf32gerpp 1, 36, 35
  1297. #else
  1298. xvf32gerpp 0, 37, 34
  1299. xvf32gerpp 1, 36, 34
  1300. #endif
  1301. .if \Complete==0
  1302. lxvp vs34, DISP4(\Index, \OffsetB)(\BREG)
  1303. lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG)
  1304. .endif
  1305. .if \IsLast==1
  1306. .if \Complete==1
  1307. addi \BREG, \BREG, DISP4(\Index, \OffsetB)
  1308. addi \AREG, \AREG, DISP8(\Index, \OffsetA)
  1309. .else
  1310. addi \BREG, \BREG, DISP4(\Index, 32)
  1311. addi \AREG, \AREG, DISP8(\Index, 64)
  1312. .endif
  1313. .endif
  1314. .endm
  1315. .macro KERNEL2x4
  1316. LOAD2x4
  1317. END2x4 AO, BO, 32, 16
  1318. .endm
  1319. .macro SAVE2x4
  1320. SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
  1321. SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45
  1322. add T1, CO, LDC
  1323. #ifndef TRMMKERNEL
  1324. lxvp vs24, 0(CO)
  1325. #endif
  1326. #ifndef TRMMKERNEL
  1327. lxvp vs26, 0(T1)
  1328. #endif
  1329. GROUP1
  1330. AGG_GROUP1
  1331. /*VSINRR, VSINII, VSOUT1, VSOUT2*/
  1332. MULTIPLY_GROUP1
  1333. /* reconstruct r, i pairs*/
  1334. RECONSTRUCT_PAIR1
  1335. #ifndef TRMMKERNEL
  1336. /* add */
  1337. #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  1338. xxpermdi vs1, vs0, vs8, 1
  1339. xxpermdi vs3, vs2, vs10, 1
  1340. xxpermdi vs9, vs8, vs0, 1
  1341. xxpermdi vs11, vs10, vs2, 1
  1342. #else
  1343. xxpermdi vs1, vs8, vs0, 2
  1344. xxpermdi vs3, vs10, vs2, 2
  1345. xxpermdi vs9, vs0, vs8, 2
  1346. xxpermdi vs11, vs2, vs10, 2
  1347. #endif
  1348. xvaddsp vs24, vs24, vs3
  1349. xvaddsp vs25, vs25, vs1
  1350. xvaddsp vs26, vs26, vs11
  1351. xvaddsp vs27, vs27, vs9
  1352. #else
  1353. #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  1354. xxpermdi vs25, vs0, vs8, 1
  1355. xxpermdi vs24, vs2, vs10, 1
  1356. xxpermdi vs27, vs8, vs0, 1
  1357. xxpermdi vs26, vs10, vs2, 1
  1358. #else
  1359. xxpermdi vs25, vs8, vs0, 2
  1360. xxpermdi vs24, vs10, vs2, 2
  1361. xxpermdi vs27, vs0, vs8, 2
  1362. xxpermdi vs26, vs2, vs10, 2
  1363. #endif
  1364. #endif
  1365. stxvp vs24, 0(CO)
  1366. stxvp vs26, 0(T1)
  1367. addi CO, CO, 32
  1368. .endm
  1369. /* macros for N=2 and M=2
  1370. **********************************************************************************************/
  1371. .macro ZERO2x2
  1372. xxsetaccz 0
  1373. .endm
  1374. .macro LOAD2x2
  1375. LOAD2x2O 0, 0
  1376. .endm
  1377. .macro LOAD2x2O OffsetA, OffsetB
  1378. lxv vs32, (\OffsetA+0)(AO)
  1379. lxv vs34, (\OffsetB+0)(BO)
  1380. .endm
  1381. .macro END2x2_NORMAL
  1382. END2x2 AO, BO, 16, 16
  1383. .endm
  1384. .macro END2x2_WITHOUT_ADD
  1385. END2x2 AO, BO, 0, 0
  1386. .endm
  1387. .macro END2x2 AREG, BREG, OffsetA, OffsetB
  1388. .if \OffsetB != 0
  1389. addi \BREG, \BREG, \OffsetB
  1390. .endif
  1391. .if \OffsetA != 0
  1392. addi \AREG, \AREG, \OffsetA
  1393. .endif
  1394. xvf32gerpp 0, 34, 32
  1395. .endm
  1396. .macro LOAD2x2_2
  1397. LOAD2x2_2O 0, 0
  1398. .endm
  1399. .macro LOAD2x2_2O OffsetA, OffsetB
  1400. lxvp vs32, (\OffsetA)(AO)
  1401. lxvp vs34, (0+\OffsetB)(BO)
  1402. .endm
  1403. .macro END2x2_2
  1404. /* for load2 the offsets are 32 (A) and 32 (B) */
  1405. KERNEL2x2_2 AO, BO, 32, 32, 0, 1, 1
  1406. .endm
  1407. .macro KERNEL2x2_E2 OffsetA, OffsetB, Index, IsLast
  1408. KERNEL2x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
  1409. .endm
  1410. .macro KERNEL2x2_L2 OffsetA, OffsetB, Index, IsLast
  1411. KERNEL2x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
  1412. .endm
  1413. .macro KERNEL2x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
  1414. xvf32gerpp 0, 34, 32
  1415. xvf32gerpp 0, 35, 33
  1416. .if \Complete==0
  1417. lxvp vs32, DISP4(\Index, \OffsetA)(\AREG)
  1418. lxvp vs34, DISP4(\Index, \OffsetA)(\BREG)
  1419. .endif
  1420. .if \IsLast==1
  1421. .if \Complete==1
  1422. addi \AREG, \AREG, DISP4(\Index, \OffsetA)
  1423. addi \BREG, \BREG, DISP4(\Index, \OffsetB)
  1424. .else
  1425. addi \AREG, \AREG, DISP4(\Index, 32)
  1426. addi \BREG, \BREG, DISP4(\Index, 32)
  1427. .endif
  1428. .endif
  1429. .endm
.macro KERNEL2x2
LOAD2x2
END2x2 AO, BO, 16, 16
.endm
.macro SAVE2x2
SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
add T1, CO, LDC
#ifndef TRMMKERNEL
lxv vs24, 0(CO)
#endif
#ifndef TRMMKERNEL
lxv vs26, 0(T1)
#endif
xxperm vs0, vs32, permute_mask
xxperm vs4, vs40, permute_mask
xxperm vs8, vs36, permute_mask
xxperm vs12, vs44, permute_mask
AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4
AGGREGATE_REALS_IMAGES_A_PERMUTE vs36, vs8, vs44, vs12
/*VSINRR, VSINII, VSOUT1, VSOUT2*/
MULT_APLHA_PART1 vs32, vs40, vs0, vs1
MULT_APLHA_PART1 vs36, vs44, vs8, vs9
MULT_APLHA_PART2 vs32, vs40, vs0, vs1
MULT_APLHA_PART2 vs36, vs44, vs8, vs9
/* reconstruct r, i pairs*/
xxperm vs0, vs1, save_permute_1
xxperm vs8, vs9, save_permute_1
#ifndef TRMMKERNEL
/* add */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs1, vs0, vs8, 0
xxpermdi vs9, vs8, vs0, 3
#else
xxpermdi vs1, vs8, vs0, 0
xxpermdi vs9, vs0, vs8, 3
#endif
xvaddsp vs24, vs24, vs1
xvaddsp vs26, vs26, vs9
#else
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs24, vs0, vs8, 0
xxpermdi vs26, vs8, vs0, 3
#else
xxpermdi vs24, vs8, vs0, 0
xxpermdi vs26, vs0, vs8, 3
#endif
#endif
stxv vs24, 0(CO)
stxv vs26, 0(T1)
addi CO, CO, 16
.endm
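/* Rough C sketch of what a SAVE macro computes per output element, assuming
   MULT_APLHA_PART1/PART2 (defined earlier in this file) implement the usual
   complex alpha scaling and ignoring the conjugate variants handled by the
   AGGREGATE_* macros; names below are illustrative only:
//   res_r = acc_r * alpha_r - acc_i * alpha_i;
//   res_i = acc_r * alpha_i + acc_i * alpha_r;
//   #ifndef TRMMKERNEL
//     C[j][i] += res;   // non-TRMM: accumulate into the loaded C tile
//   #else
//     C[j][i]  = res;   // TRMM: overwrite, nothing was loaded
//   #endif
   The xxperm/xxpermdi pairs around the store only reorder the (r, i) lanes
   to match the memory layout for the current endianness. */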
/* macros for N=2 and M=1
**********************************************************************************************/
.macro ZERO2x1
xxlxor vs32, vs32, vs32
xxlxor vs40, vs40, vs40
.endm
.macro LOAD2x1
LOAD2x1O 0, 0
.endm
.macro LOAD2x1O OffsetA, OffsetB
lxsd v4, (\OffsetA+0)(AO)
lxv vs0, (\OffsetB+0)(BO)
xxspltd vs24, vs36, 0
xxperm vs26, vs24, permute_mask
.endm
.macro END2x1_NORMAL
END2x1 AO, BO, 8, 16
.endm
.macro END2x1_WITHOUT_ADD
END2x1 AO, BO, 0, 0
.endm
.macro END2x1 AREG, BREG, OffsetA, OffsetB
.if \OffsetB != 0
addi \BREG, \BREG, \OffsetB
.endif
.if \OffsetA != 0
addi \AREG, \AREG, \OffsetA
.endif
xvmaddasp vs32, vs0, vs24
xvmaddasp vs40, vs0, vs26
.endm
.macro LOAD2x1_2
LOAD2x1_2O 0, 0
.endm
.macro LOAD2x1_2O OffsetA, OffsetB
lxv vs27, (\OffsetA)(AO)
lxvp vs4, (0+\OffsetB)(BO)
xxspltd vs8, vs27, 1
xxspltd vs24, vs27, 0
xxperm vs10, vs8, permute_mask
xxperm vs26, vs24, permute_mask
.endm
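/* Non-MMA path for M=1: the lxv pulls in two k steps of the single A column,
   xxspltd copies each complex A value across a full vector, and the xxperm
   with permute_mask appears to build its (imag, real)-swapped companion, so
   each xvmaddasp in the kernel below covers both B columns and one of the
   real/imaginary product terms at once; AGGREGATE_REALS_IMAGES_A_PERMUTE in
   the SAVE path later combines the two partial sums. */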
.macro END2x1_2
/*for load2 offset will be 16 and 32*/
KERNEL2x1_2 AO, BO, 16, 32, 0, 1, 1
.endm
.macro KERNEL2x1_E2 OffsetA, OffsetB, Index, IsLast
KERNEL2x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
.endm
.macro KERNEL2x1_L2 OffsetA, OffsetB, Index, IsLast
KERNEL2x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
.endm
.macro KERNEL2x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
xvmaddasp vs32, vs5, vs8
xvmaddasp vs40, vs5, vs10
.if \Complete==0
lxv vs27, DISP2(\Index, \OffsetA)(\AREG)
xxspltd vs8, vs27, 1
.endif
.if \Complete==0
xxperm vs10, vs8, permute_mask
.endif
xvmaddasp vs32, vs4, vs24
xvmaddasp vs40, vs4, vs26
.if \Complete==0
xxspltd vs24, vs27, 0
xxperm vs26, vs24, permute_mask
.endif
.if \Complete==0
lxvp vs4, DISP4(\Index, 0+\OffsetB)(\BREG)
.endif
.if \IsLast==1
.if \Complete==1
addi \AREG, \AREG, DISP2(\Index, \OffsetA)
addi \BREG, \BREG, DISP4(\Index, \OffsetB)
.else
addi \AREG, \AREG, DISP2(\Index, 16)
addi \BREG, \BREG, DISP4(\Index, 32)
.endif
.endif
.endm
.macro KERNEL2x1
LOAD2x1
END2x1 AO, BO, 8, 16
.endm
.macro SAVE2x1
add T1, CO, LDC
#ifndef TRMMKERNEL
lxsd v4, 0(CO)
#endif
#ifndef TRMMKERNEL
lxsd v5, 0(T1)
#endif
xxperm vs0, vs32, permute_mask
xxperm vs4, vs40, permute_mask
AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4
AGGREGATE_REALS_IMAGES_A_PERMUTE vs33, vs1, vs41, vs5
/*VSINRR, VSINII, VSOUT1, VSOUT2*/
MULT_APLHA_PART1 vs32, vs40, vs0, vs1
MULT_APLHA_PART2 vs32, vs40, vs0, vs1
/* reconstruct r, i pairs*/
xxperm vs0, vs1, save_permute_1
#ifndef TRMMKERNEL
/* add */
xxspltd vs1, vs0, 0
xxspltd vs3, vs0, 1
/*--v4==vs36 v5==vs37---*/
xvaddsp vs36, vs36, vs1
xvaddsp vs37, vs37, vs3
#else
/*--v4==vs36 v5==vs37---*/
xxspltd vs36, vs0, 0
xxspltd vs37, vs0, 1
#endif
stxsd v4, 0(CO)
stxsd v5, 0(T1)
addi CO, CO, 8
.endm
/* macros for N=1 and M=8
**********************************************************************************************/
.macro ZERO1x8
xxsetaccz 0
xxsetaccz 1
xxsetaccz 2
xxsetaccz 3
.endm
.macro LOAD1x8
LOAD1x8O 0, 0
.endm
.macro LOAD1x8O OffsetA, OffsetB
lxsd v2, (\OffsetB+0)(BO)
lxvp vs32, (\OffsetA+0)(AO)
lxvp vs36, (\OffsetA+32)(AO)
.endm
.macro END1x8_NORMAL
END1x8 AO, BO, 64, 8
.endm
.macro END1x8_WITHOUT_ADD
END1x8 AO, BO, 0, 0
.endm
.macro END1x8 AREG, BREG, OffsetA, OffsetB
.if \OffsetB != 0
addi \BREG, \BREG, \OffsetB
.endif
.if \OffsetA != 0
addi \AREG, \AREG, \OffsetA
.endif
xvf32gerpp 0, 34, 33
xvf32gerpp 1, 34, 32
xvf32gerpp 2, 34, 37
xvf32gerpp 3, 34, 36
.endm
.macro LOAD1x8_2
LOAD1x8_2O 0, 0
.endm
.macro LOAD1x8_2O OffsetA, OffsetB
lxv vs34, (\OffsetB)(BO)
lxvp vs32, (0+\OffsetA)(AO)
lxvp vs36, (32+\OffsetA)(AO)
vspltisb v10, 0
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs35, vs34, vs42, 2
xxpermdi vs34, vs34, vs42, 0
#else
xxpermdi vs35, vs34, vs42, 0
xxpermdi vs34, vs34, vs42, 2
#endif
lxvp vs38, (64+\OffsetA)(AO)
lxvp vs40, (64+32+\OffsetA)(AO)
.endm
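/* For the unrolled N=1 kernels the 16-byte lxv on BO brings in B values for
   two k iterations at once.  The vspltisb of zero plus the two xxpermdi
   above appear to split that into vs34/vs35, each holding one k step of B in
   one doubleword and zeros in the other, so the 4x4 xvf32gerpp rank-1
   updates only contribute to the lanes that belong to the single output
   column. */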
.macro END1x8_2
/*for load2 offset will be 128 and 16*/
KERNEL1x8_2 AO, BO, 128, 16, 0, 1, 1
.endm
.macro KERNEL1x8_E2 OffsetA, OffsetB, Index, IsLast
KERNEL1x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
.endm
.macro KERNEL1x8_L2 OffsetA, OffsetB, Index, IsLast
KERNEL1x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
.endm
.macro KERNEL1x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
xvf32gerpp 0, 34, 33
xvf32gerpp 1, 34, 32
.if \Complete==0
lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG)
.endif
xvf32gerpp 2, 34, 37
xvf32gerpp 3, 34, 36
.if \Complete==0
lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG)
.endif
xvf32gerpp 0, 35, 39
xvf32gerpp 1, 35, 38
.if \Complete==0
lxvp vs38, DISP16(\Index, 64+\OffsetA)(\AREG)
.endif
xvf32gerpp 2, 35, 41
xvf32gerpp 3, 35, 40
.if \Complete==0
lxv vs34, DISP2(\Index, \OffsetB)(\BREG)
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs35, vs34, vs42, 2
xxpermdi vs34, vs34, vs42, 0
#else
xxpermdi vs35, vs34, vs42, 0
xxpermdi vs34, vs34, vs42, 2
#endif
lxvp vs40, DISP16(\Index, 64+32+\OffsetA)(\AREG)
.endif
.if \IsLast==1
.if \Complete==1
addi \BREG, \BREG, DISP2(\Index, \OffsetB)
addi \AREG, \AREG, DISP16(\Index, \OffsetA)
.else
addi \BREG, \BREG, DISP2(\Index, 16)
addi \AREG, \AREG, DISP16(\Index, 128)
.endif
.endif
.endm
.macro KERNEL1x8
LOAD1x8
END1x8 AO, BO, 64, 8
.endm
.macro SAVE1x8
SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45
SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46
SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47
xxpermdi vs32, vs32, vs36, 0
xxpermdi vs33, vs33, vs37, 0
xxpermdi vs34, vs34, vs38, 0
xxpermdi vs35, vs35, vs39, 0
xxpermdi vs40, vs40, vs44, 0
xxperm vs40, vs40, permute_mask
xxpermdi vs41, vs41, vs45, 0
xxperm vs41, vs41, permute_mask
xxpermdi vs42, vs42, vs46, 0
xxperm vs42, vs42, permute_mask
xxpermdi vs43, vs43, vs47, 0
xxperm vs43, vs43, permute_mask
#ifndef TRMMKERNEL
lxvp vs24, 0(CO)
#endif
xxperm vs0, vs32, permute_mask
xxperm vs4, vs40, permute_mask
#ifndef TRMMKERNEL
lxvp vs26, 32(CO)
#endif
xxperm vs1, vs33, permute_mask
xxperm vs5, vs41, permute_mask
xxperm vs2, vs34, permute_mask
xxperm vs6, vs42, permute_mask
xxperm vs3, vs35, permute_mask
xxperm vs7, vs43, permute_mask
AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4
AGGREGATE_REALS_IMAGES vs33, vs1, vs41, vs5
AGGREGATE_REALS_IMAGES vs34, vs2, vs42, vs6
AGGREGATE_REALS_IMAGES vs35, vs3, vs43, vs7
/*inner reverse save_permute and store vs28 */
xxpermdi vs28, save_permute_1, save_permute_1, 2
/*VSINRR, VSINII, VSOUT1, VSOUT2*/
MULT_APLHA_PART1 vs32, vs40, vs0, vs1
MULT_APLHA_PART1 vs33, vs41, vs2, vs3
MULT_APLHA_PART1 vs34, vs42, vs4, vs5
MULT_APLHA_PART1 vs35, vs43, vs6, vs7
MULT_APLHA_PART2 vs32, vs40, vs0, vs1
MULT_APLHA_PART2 vs33, vs41, vs2, vs3
MULT_APLHA_PART2 vs34, vs42, vs4, vs5
MULT_APLHA_PART2 vs35, vs43, vs6, vs7
/* reconstruct r, i pairs*/
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxperm vs0, vs1, save_permute_1
xxperm vs2, vs3, save_permute_1
xxperm vs4, vs5, save_permute_1
xxperm vs6, vs7, save_permute_1
#else
xxperm vs0, vs1, vs28
xxperm vs2, vs3, vs28
xxperm vs4, vs5, vs28
xxperm vs6, vs7, vs28
#endif
#ifndef TRMMKERNEL
/* add */
xvaddsp vs24, vs24, vs2
xvaddsp vs25, vs25, vs0
xvaddsp vs26, vs26, vs6
xvaddsp vs27, vs27, vs4
stxvp vs24, 0(CO)
stxvp vs26, 32(CO)
#else
/* reconstruct r, i pairs*/
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
stxv vs2, 0(CO)
stxv vs0, 16(CO)
stxv vs6, 32(CO)
stxv vs4, 48(CO)
#else
stxv vs0, 0(CO)
stxv vs2, 16(CO)
stxv vs4, 32(CO)
stxv vs6, 48(CO)
#endif
#endif
addi CO, CO, 64
.endm
/* macros for N=1 and M=4
**********************************************************************************************/
.macro ZERO1x4
xxsetaccz 0
xxsetaccz 1
.endm
.macro LOAD1x4
LOAD1x4O 0, 0
.endm
.macro LOAD1x4O OffsetA, OffsetB
lxsd v2, (\OffsetB+0)(BO)
lxvp vs32, (\OffsetA+0)(AO)
.endm
.macro END1x4_NORMAL
END1x4 AO, BO, 32, 8
.endm
.macro END1x4_WITHOUT_ADD
END1x4 AO, BO, 0, 0
.endm
.macro END1x4 AREG, BREG, OffsetA, OffsetB
.if \OffsetB != 0
addi \BREG, \BREG, \OffsetB
.endif
.if \OffsetA != 0
addi \AREG, \AREG, \OffsetA
.endif
xvf32gerpp 0, 34, 33
xvf32gerpp 1, 34, 32
.endm
.macro LOAD1x4_2
LOAD1x4_2O 0, 0
.endm
.macro LOAD1x4_2O OffsetA, OffsetB
lxv vs34, (\OffsetB)(BO)
lxvp vs32, (0+\OffsetA)(AO)
vspltisb v6, 0
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs35, vs34, vs38, 2
xxpermdi vs34, vs34, vs38, 0
#else
xxpermdi vs35, vs34, vs38, 0
xxpermdi vs34, vs34, vs38, 2
#endif
lxvp vs36, (32+\OffsetA)(AO)
.endm
.macro END1x4_2
/*for load2 offset will be 64 and 16*/
KERNEL1x4_2 AO, BO, 64, 16, 0, 1, 1
.endm
.macro KERNEL1x4_E2 OffsetA, OffsetB, Index, IsLast
KERNEL1x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
.endm
.macro KERNEL1x4_L2 OffsetA, OffsetB, Index, IsLast
KERNEL1x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
.endm
.macro KERNEL1x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
xvf32gerpp 0, 34, 33
xvf32gerpp 1, 34, 32
.if \Complete==0
lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG)
.endif
xvf32gerpp 0, 35, 37
xvf32gerpp 1, 35, 36
.if \Complete==0
lxv vs34, DISP2(\Index, \OffsetB)(\BREG)
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxpermdi vs35, vs34, vs38, 2
xxpermdi vs34, vs34, vs38, 0
#else
xxpermdi vs35, vs34, vs38, 0
xxpermdi vs34, vs34, vs38, 2
#endif
lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG)
.endif
.if \IsLast==1
.if \Complete==1
addi \BREG, \BREG, DISP2(\Index, \OffsetB)
addi \AREG, \AREG, DISP8(\Index, \OffsetA)
.else
addi \BREG, \BREG, DISP2(\Index, 16)
addi \AREG, \AREG, DISP8(\Index, 64)
.endif
.endif
.endm
.macro KERNEL1x4
LOAD1x4
END1x4 AO, BO, 32, 8
.endm
.macro SAVE1x4
SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44
SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45
xxpermdi vs32, vs32, vs36, 0
xxpermdi vs40, vs40, vs44, 0
xxpermdi vs33, vs33, vs37, 0
xxpermdi vs41, vs41, vs45, 0
xxperm vs40, vs40, permute_mask
xxperm vs41, vs41, permute_mask
#ifndef TRMMKERNEL
lxvp vs24, 0(CO)
#endif
xxperm vs0, vs32, permute_mask
xxperm vs4, vs40, permute_mask
xxperm vs1, vs33, permute_mask
xxperm vs5, vs41, permute_mask
AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4
AGGREGATE_REALS_IMAGES vs33, vs1, vs41, vs5
/*inner reverse save_permute and store vs28 */
xxpermdi vs28, save_permute_1, save_permute_1, 2
/*VSINRR, VSINII, VSOUT1, VSOUT2*/
MULT_APLHA_PART1 vs32, vs40, vs0, vs1
MULT_APLHA_PART1 vs33, vs41, vs2, vs3
MULT_APLHA_PART2 vs32, vs40, vs0, vs1
MULT_APLHA_PART2 vs33, vs41, vs2, vs3
/* reconstruct r, i pairs*/
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxperm vs0, vs1, save_permute_1
xxperm vs2, vs3, save_permute_1
#else
xxperm vs0, vs1, vs28
xxperm vs2, vs3, vs28
#endif
#ifndef TRMMKERNEL
/* add */
xvaddsp vs24, vs24, vs2
xvaddsp vs25, vs25, vs0
stxvp vs24, 0(CO)
#else
/* reconstruct r, i pairs*/
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
stxv vs2, 0(CO)
stxv vs0, 16(CO)
#else
stxv vs0, 0(CO)
stxv vs2, 16(CO)
#endif
#endif
addi CO, CO, 32
.endm
/* macros for N=1 and M=2
**********************************************************************************************/
.macro ZERO1x2
xxlxor vs32, vs32, vs32
xxlxor vs40, vs40, vs40
.endm
.macro LOAD1x2
LOAD1x2O 0, 0
.endm
.macro LOAD1x2O OffsetA, OffsetB
lxsd v4, (\OffsetB+0)(BO)
lxv vs0, (\OffsetA+0)(AO)
xxspltd vs24, vs36, 0
xxperm vs26, vs24, permute_mask
.endm
.macro END1x2_NORMAL
END1x2 AO, BO, 16, 8
.endm
.macro END1x2_WITHOUT_ADD
END1x2 AO, BO, 0, 0
.endm
.macro END1x2 AREG, BREG, OffsetA, OffsetB
.if \OffsetB != 0
addi \BREG, \BREG, \OffsetB
.endif
.if \OffsetA != 0
addi \AREG, \AREG, \OffsetA
.endif
xvmaddasp vs32, vs0, vs24
xvmaddasp vs40, vs0, vs26
.endm
.macro LOAD1x2_2
LOAD1x2_2O 0, 0
.endm
.macro LOAD1x2_2O OffsetA, OffsetB
lxv vs27, (\OffsetB)(BO)
lxvp vs4, (0+\OffsetA)(AO)
xxspltd vs8, vs27, 1
xxspltd vs24, vs27, 0
xxperm vs10, vs8, permute_mask
xxperm vs26, vs24, permute_mask
.endm
.macro END1x2_2
/*for load2 offset will be 32 and 16*/
KERNEL1x2_2 AO, BO, 32, 16, 0, 1, 1
.endm
.macro KERNEL1x2_E2 OffsetA, OffsetB, Index, IsLast
KERNEL1x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
.endm
.macro KERNEL1x2_L2 OffsetA, OffsetB, Index, IsLast
KERNEL1x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
.endm
.macro KERNEL1x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
.if \Complete==0
lxv vs27, DISP2(\Index, \OffsetB)(\BREG)
.endif
xvmaddasp vs32, vs5, vs8
xvmaddasp vs40, vs5, vs10
.if \Complete==0
xxspltd vs8, vs27, 1
xxperm vs10, vs8, permute_mask
.endif
xvmaddasp vs32, vs4, vs24
xvmaddasp vs40, vs4, vs26
.if \Complete==0
lxvp vs4, DISP4(\Index, 0+\OffsetA)(\AREG)
.endif
.if \Complete==0
xxspltd vs24, vs27, 0
xxperm vs26, vs24, permute_mask
.endif
.if \IsLast==1
.if \Complete==1
addi \BREG, \BREG, DISP2(\Index, \OffsetB)
addi \AREG, \AREG, DISP4(\Index, \OffsetA)
.else
addi \BREG, \BREG, DISP2(\Index, 16)
addi \AREG, \AREG, DISP4(\Index, 32)
.endif
.endif
.endm
.macro KERNEL1x2
LOAD1x2
END1x2 AO, BO, 16, 8
.endm
.macro SAVE1x2
#ifndef TRMMKERNEL
lxv vs24, 0(CO)
#endif
xxperm vs0, vs32, permute_mask
xxperm vs4, vs40, permute_mask
AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4
/*inner reverse save_permute and store vs28 */
xxpermdi vs28, save_permute_1, save_permute_1, 2
/*VSINRR, VSINII, VSOUT1, VSOUT2*/
MULT_APLHA_PART1 vs32, vs40, vs0, vs1
MULT_APLHA_PART2 vs32, vs40, vs0, vs1
/* reconstruct r, i pairs*/
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxperm vs0, vs1, save_permute_1
#else
xxperm vs0, vs1, vs28
#endif
#ifndef TRMMKERNEL
/* add */
xvaddsp vs24, vs24, vs0
stxv vs24, 0(CO)
#else
/* reconstruct r, i pairs*/
stxv vs0, 0(CO)
#endif
addi CO, CO, 16
.endm
/* macros for N=1 and M=1
**********************************************************************************************/
.macro ZERO1x1
xxlxor vs32, vs32, vs32
xxlxor vs40, vs40, vs40
.endm
.macro LOAD1x1
LOAD1x1O 0, 0
.endm
.macro LOAD1x1O OffsetA, OffsetB
lxsd v4, (\OffsetB+0)(BO)
lxsd v5, (\OffsetA+0)(AO)
xxperm vs38, vs36, permute_mask
.endm
.macro END1x1_NORMAL
END1x1 AO, BO, 8, 8
.endm
.macro END1x1_WITHOUT_ADD
END1x1 AO, BO, 0, 0
.endm
.macro END1x1 AREG, BREG, OffsetA, OffsetB
.if \OffsetB != 0
addi \BREG, \BREG, \OffsetB
.endif
.if \OffsetA != 0
addi \AREG, \AREG, \OffsetA
.endif
xvmaddasp vs32, vs37, vs36
xvmaddasp vs40, vs37, vs38
.endm
.macro LOAD1x1_2
LOAD1x1_2O 0, 0
.endm
.macro LOAD1x1_2O OffsetA, OffsetB
lxv vs8, (\OffsetB)(BO)
lxv vs4, (0+\OffsetA)(AO)
xxperm vs10, vs8, permute_mask
.endm
.macro END1x1_2
/*for load2 offset will be 16 and 16*/
KERNEL1x1_2 AO, BO, 16, 16, 0, 1, 1
.endm
.macro KERNEL1x1_E2 OffsetA, OffsetB, Index, IsLast
KERNEL1x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
.endm
.macro KERNEL1x1_L2 OffsetA, OffsetB, Index, IsLast
KERNEL1x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
.endm
.macro KERNEL1x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete
xvmaddasp vs32, vs4, vs8
xvmaddasp vs40, vs4, vs10
.if \Complete==0
lxv vs8, DISP2(\Index, \OffsetB)(\BREG)
lxv vs4, DISP2(\Index, \OffsetA)(\AREG)
xxperm vs10, vs8, permute_mask
.endif
.if \IsLast==1
.if \Complete==1
addi \BREG, \BREG, DISP2(\Index, \OffsetB)
addi \AREG, \AREG, DISP2(\Index, \OffsetA)
.else
addi \BREG, \BREG, DISP2(\Index, 16)
addi \AREG, \AREG, DISP2(\Index, 16)
.endif
.endif
.endm
.macro KERNEL1x1
LOAD1x1
END1x1 AO, BO, 8, 8
.endm
.macro SAVE1x1
#ifndef TRMMKERNEL
lxsd v4, 0(CO)
#endif
/*aggregate x2*/
xxpermdi vs33, vs32, vs32, 2
xxpermdi vs41, vs40, vs40, 2
xvaddsp vs32, vs32, vs33
xvaddsp vs40, vs40, vs41
xxperm vs0, vs32, permute_mask
xxperm vs4, vs40, permute_mask
AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4
/*inner reverse save_permute and store vs28 */
xxpermdi vs28, save_permute_1, save_permute_1, 2
/*VSINRR, VSINII, VSOUT1, VSOUT2*/
MULT_APLHA_PART1 vs32, vs40, vs37, vs1
MULT_APLHA_PART2 vs32, vs40, vs37, vs1
/* reconstruct r, i pairs*/
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxperm vs37, vs1, save_permute_1
#else
xxperm vs37, vs1, vs28
#endif
#ifndef TRMMKERNEL
/* add */
xvaddsp vs36, vs36, vs37
stxsd v4, 0(CO)
#else
/* vs37 is v5 */
stxsd v5, 0(CO)
#endif
addi CO, CO, 8
.endm
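/* The "aggregate x2" step above exists because the unrolled 1x1 kernels
   accumulate two k iterations side by side in the two halves of vs32/vs40
   (each 16-byte lxv of A and B covers two k steps).  The xxpermdi-by-2 swaps
   the doublewords so the following xvaddsp folds the second half onto the
   first before the usual alpha/permute sequence runs. */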
/****************************TRMM POINTER REFRESH MACROS*************************/
.macro SHIFT_REG REG1, REG2, SHIFT_VAL
.if \SHIFT_VAL==16
slwi \REG1, \REG2, 7
.elseif \SHIFT_VAL==8
slwi \REG1, \REG2, 6
.elseif \SHIFT_VAL==4
slwi \REG1, \REG2, 5
.elseif \SHIFT_VAL==2
slwi \REG1, \REG2, 4
.elseif \SHIFT_VAL==1
slwi \REG1, \REG2, 3
.endif
.endm
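/* SHIFT_REG converts a count of values into a byte offset: each "value" in
   this file is one single-precision complex number (2 * 4 = 8 bytes), so
   REG1 = REG2 * SHIFT_VAL * 8, implemented as a left shift by
   log2(SHIFT_VAL) + 3 (16 values -> <<7, ..., 1 value -> <<3).
   Equivalent C sketch (names illustrative):
//   offset_bytes = count * shift_val * sizeof(float _Complex);
*/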
/*
//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
// ptrbb = bb;
// #else
// ptrba += off*8;
// ptrbb = bb + off*4;
// #endif
*/
.macro REFRESH_POINTERS PTR_A, PTR_B, OFF_VAL, B_VAL, C_A, C_B
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/* ptrbb = bb;*/
mr \PTR_B, \B_VAL /* refresh BPOINT */
#else
/*
// ptrba = ptrba + off*C_A;
// ptrbb = bb + off*C_B;
*/
SHIFT_REG T4, \OFF_VAL, \C_B /* Number of values in B shifted */
SHIFT_REG T2, \OFF_VAL, \C_A /* Number of values in A shifted */
add \PTR_B, \B_VAL, T4 /* Add values to BO */
add \PTR_A, \PTR_A, T2 /* Add values to AO */
#endif
.endm
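/* Example (register/parameter names illustrative only): for the 2x2 TRMM
   tile the caller would issue something like
//   REFRESH_POINTERS AO, BO, OFF, B, 2, 2
   which, when the pointers need adjusting, amounts to
//   AO += off * 2 * 8;      // skip "off" packed A rows, 8 bytes per value
//   BO  = B + off * 2 * 8;
   i.e. the commented-out C above with C_A = C_B = 2. */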
/*
// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
// temp = bk-off;
// #elif defined(LEFT)
// temp = off+8; // number of values in A
// #else
// temp = off+4; // number of values in B
// #endif
*/
.macro REFRESH_TEMP_BK TEMP_BK, BK_VAL, OFF_VAL, INCR_A, INCR_B
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
/* temp = bk-off;*/
sub \TEMP_BK, \BK_VAL, \OFF_VAL
#elif defined(LEFT)
/* temp = off+INCR_A; // number of values in A */
addi \TEMP_BK, \OFF_VAL, \INCR_A
#else
/* temp = off+INCR_B; // number of values in B */
addi \TEMP_BK, \OFF_VAL, \INCR_B
#endif
.endm
/*
// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
// temp = bk - off;
// #ifdef LEFT
// temp -= 8; // number of values in A
// #else
// temp -= 4; // number of values in B
// #endif
// ptrba += temp*8;
// ptrbb += temp*4;
// #endif
// #ifdef LEFT
// off += 8; // number of values in A
// #endif
*/
.macro REFRESH_AFTER_SAVE TEMP_BK, BK_VAL, OFF_VAL, PTR_B, PTR_A, C_A, C_B
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/*temp = bk - off;*/
sub \TEMP_BK, \BK_VAL, \OFF_VAL
#ifdef LEFT
/*temp -= 8; // number of values in A*/
addi \TEMP_BK, \TEMP_BK, -\C_A
#else
/*temp -= 4; // number of values in B*/
addi \TEMP_BK, \TEMP_BK, -\C_B
#endif
/*ptrba += temp*C_A;
ptrbb += temp*C_B;*/
SHIFT_REG T4, \TEMP_BK, \C_A
SHIFT_REG T2, \TEMP_BK, \C_B
add \PTR_A, \PTR_A, T4 /*ptrba + temp*C_A*/
add \PTR_B, \PTR_B, T2
#endif
#ifdef LEFT
/*off += 8; // number of values in A*/
addi \OFF_VAL, \OFF_VAL, \C_A
#endif
.endm
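/* Worked instance (illustrative, for a hypothetical tile with C_A = 8 and
   C_B = 2; 8 bytes per single-precision complex value):
//   temp = bk - off;
//   #ifdef LEFT
//     temp -= 8;
//   #else
//     temp -= 2;
//   #endif
//   ptrba += temp * 8 * 8;
//   ptrbb += temp * 2 * 8;
//   #ifdef LEFT
//     off += 8;
//   #endif
   which is what the SHIFT_REG/add/addi sequence above computes. */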