
zgemm_macros_power10.S

/***************************************************************************
Copyright (c) 2013-2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define unit_size 16
#define DISP32(ind,disp) (ind*unit_size*32+disp)
#define DISP16(ind,disp) (ind*unit_size*16+disp)
#define DISP8(ind,disp) (ind*unit_size*8+disp)
#define DISP4(ind,disp) (ind*unit_size*4+disp)
#define DISP2(ind,disp) (ind*unit_size*2+disp)
#define DISP1(ind,disp) (ind*unit_size+disp)
#define DISPX(disp) (disp)
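/* unit_size is the byte size of one double-complex element (2 x 8 bytes).
   The DISPn helpers turn an unroll index into a byte displacement:
   DISPn(ind,disp) = ind*n*unit_size + disp, i.e. the start of the ind-th
   block of n elements plus a constant byte offset. */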
/* HELPERS FOR SAVE */
/* {r0,i0} and {r1,i1} repacked into {r0,r1} and {i0,i1} */
.macro LOAD_COUPLE_AS_RR_II VS_OUT1,VS_OUT2,VS_TEMP1,VS_TEMP2,REG,LOFFSET
#ifndef TRMMKERNEL
lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG)
lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG)
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxmrghd \VS_OUT1,\VS_TEMP1,\VS_TEMP2
xxmrgld \VS_OUT2,\VS_TEMP1,\VS_TEMP2
#else
xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2
xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2
#endif
#endif
.endm
/* pack two results {a0r*br,a0i*bi} and {a1r*br,a1i*bi} into {a0r*br,a1r*br} and {a0i*bi,a1i*bi} */
.macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxmrghd \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/
xxmrgld \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/
#else
xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/
xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/
#endif
.endm
/* pack two results {a0r*bi,a0i*br} and {a1r*bi,a1i*br} into {a0r*bi,a1r*bi} and {a0i*br,a1i*br} */
.macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxmrghd \VSOUT1, \VSIN1,\VSIN2 /* real*imag */
xxmrgld \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/
#else
xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */
xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/
#endif
.endm
/* {a0r*br op a0i*bi, a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br, a1r*bi op a1i*br} ~ {i0,i1} */
.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
xvsubdp \VSINR_OUT1,\VSINR_OUT1,\VSINR
xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI
#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR
xvsubdp \VSINI_OUT2,\VSINI_OUT2,\VSINI
#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR
xvsubdp \VSINI_OUT2,\VSINI,\VSINI_OUT2
#else // CC || CR || RC || RR
/* we assume {-alpha_r,-alpha_i} for this case */
/* i1*i2 - r1*r2, so negate alpha_r instead to fix the sign */
xvsubdp \VSINR_OUT1,\VSINR,\VSINR_OUT1
/* negate alpha_i instead to fix the sign */
xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI
#endif
.endm
/* {i0,i1} * {alpha_i,alpha_i} - VSOUT1 ; VSOUT2 + {r0,r1} * {alpha_i,alpha_i} */
.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2
#ifndef TRMMKERNEL
xvmsubadp \VSOUT1,\VSINII, alpha_i
xvmaddadp \VSOUT2,\VSINRR, alpha_i
#else
xvmuldp \VSOUT1,\VSINII, alpha_i
xvmuldp \VSOUT2,\VSINRR, alpha_i
#endif
.endm
/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ; VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */
.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2
xvmsubadp \VSOUT1,\VSINRR, alpha_r
xvmaddadp \VSOUT2,\VSINII, alpha_r
.endm
/* unpack to store 2{r,r} {i,i} into {r,i} {r,i} (big endian because of stxv) */
.macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxmrghd \VSOUT1,\VSIN1,\VSIN2
xxmrgld \VSOUT2,\VSIN1,\VSIN2
#else
xxmrghd \VSOUT1,\VSIN2,\VSIN1
xxmrgld \VSOUT2,\VSIN2,\VSIN1
#endif
.endm
.macro STORE_COUPLE REG,LOFFSET,VSIN1,VSIN2
stxv \VSIN1, DISPX(\LOFFSET)(\REG)
stxv \VSIN2, DISPX(\LOFFSET+16)(\REG)
.endm
.macro SAVE8 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,VSRes9,VSRes10,VSRes11,VSRes12,VSRes13,VSRes14,VSRes15,VSRes16,BASE_REG,LOFFSET
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs34,vs35
LOAD_COUPLE_AS_RR_II vs46,vs47,vs50,vs51,\BASE_REG,\LOFFSET
RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs36,vs37
LOAD_COUPLE_AS_RR_II vs48,vs49,vs52,vs53,\BASE_REG,(\LOFFSET+32)
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs38,vs39
LOAD_COUPLE_AS_RR_II vs56,vs57,vs50,vs51,\BASE_REG,(\LOFFSET+64)
RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs40,vs41
LOAD_COUPLE_AS_RR_II vs58,vs59,vs52,vs53,\BASE_REG,(\LOFFSET+96)
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes9,\VSRes11,vs42,vs43
AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37
RESULT_INTO_REALIMAG_IMAGREAL \VSRes10,\VSRes12,vs44,vs45
AGGREGATE_REALS_IMAGES vs38,vs39,vs40,vs41
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes13,\VSRes15,\VSRes1,\VSRes2
MULT_APLHA_PART1 vs34,vs36, vs46,vs47
RESULT_INTO_REALIMAG_IMAGREAL \VSRes14,\VSRes16,\VSRes3,\VSRes4
MULT_APLHA_PART1 vs38,vs40,vs48,vs49
MULT_APLHA_PART2 vs34,vs36,vs46,vs47
AGGREGATE_REALS_IMAGES vs42,vs43,vs44,vs45
MULT_APLHA_PART2 vs38,vs40,vs48,vs49
AGGREGATE_REALS_IMAGES \VSRes1,\VSRes2,\VSRes3,\VSRes4
UNPACK_FOR_STORE vs46,vs47,vs39,vs41
MULT_APLHA_PART1 vs42,vs44, vs56,vs57
UNPACK_FOR_STORE vs48,vs49,vs35,vs37
MULT_APLHA_PART1 \VSRes1,\VSRes3, vs58,vs59
STORE_COUPLE \BASE_REG,\LOFFSET,vs39,vs41
MULT_APLHA_PART2 vs42,vs44,vs56,vs57
STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs35,vs37
MULT_APLHA_PART2 \VSRes1,\VSRes3, vs58,vs59
UNPACK_FOR_STORE vs56,vs57,vs42,vs44
UNPACK_FOR_STORE vs58,vs59,\VSRes1,\VSRes3
STORE_COUPLE \BASE_REG,(\LOFFSET+64),vs42,vs44
STORE_COUPLE \BASE_REG,(\LOFFSET+96),\VSRes1,\VSRes3
.endm
.macro SAVE4 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,BASE_REG,LOFFSET
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs34,vs35
LOAD_COUPLE_AS_RR_II vs46,vs47,vs50,vs51,\BASE_REG,\LOFFSET
RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs36,vs37
LOAD_COUPLE_AS_RR_II vs48,vs49,vs52,vs53,\BASE_REG,(\LOFFSET+32)
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs38,vs39
RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs40,vs41
AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37
AGGREGATE_REALS_IMAGES vs38,vs39,vs40,vs41
MULT_APLHA_PART1 vs34,vs36, vs46,vs47
MULT_APLHA_PART1 vs38,vs40, vs48,vs49
MULT_APLHA_PART2 vs34,vs36, vs46,vs47
MULT_APLHA_PART2 vs38,vs40,vs48,vs49
UNPACK_FOR_STORE vs46,vs47,vs39,vs41
UNPACK_FOR_STORE vs48,vs49,vs35,vs37
STORE_COUPLE \BASE_REG,\LOFFSET,vs39,vs41
STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs35,vs37
.endm
.macro SAVE2 VSRes1,VSRes2,VSRes3,VSRes4,BASE_REG,LOFFSET
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs34,vs35
LOAD_COUPLE_AS_RR_II vs46,vs47,vs50,vs51,\BASE_REG,\LOFFSET
RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs36,vs37
AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37
MULT_APLHA_PART1 vs34,vs36, vs46,vs47
MULT_APLHA_PART2 vs34,vs36, vs46,vs47
UNPACK_FOR_STORE vs46,vs47,vs39,vs41
STORE_COUPLE \BASE_REG,\LOFFSET,vs39,vs41
.endm
.macro SAVE1 VSRes1,VSRes2,BASE_REG,LOFFSET
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs34,vs35
#ifndef TRMMKERNEL
lxv vs50, (\LOFFSET)(\BASE_REG)
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxmrghd vs46,vs50,vs50
xxmrgld vs47,vs50,vs50
#else
xxmrgld vs46,vs50,vs50
xxmrghd vs47,vs50,vs50
#endif
#endif
RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs36,vs37
AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37
MULT_APLHA_PART1 vs34,vs36, vs46,vs47
MULT_APLHA_PART2 vs34,vs36, vs46,vs47
UNPACK_FOR_STORE vs46,vs47,vs39,vs41
#if (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
xxmrghd vs39,vs47,vs46
#endif
stxv vs39, (\LOFFSET)(\BASE_REG)
.endm
/**********************************************************************************************
* Macros for N=2 and M=8
**********************************************************************************************/
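/* The MMA kernels below work on 4x2 blocks of doubles held in accumulators:
   xxsetaccz zeroes (primes) accumulator n, xvf64gerpp n, XAp, XB accumulates
   the outer product of the 4 doubles in the even/odd VSX pair XAp (from A)
   with the 2 doubles in XB (from B), and xxmfacc copies accumulator n back
   to its backing registers once the K loop is done. */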
.macro KERNEL2x8_ZERO_AND_PRIME_MMA
/* zero out and prime the MMA accumulators */
xxsetaccz 0
xxsetaccz 1
xxsetaccz 2
xxsetaccz 3
xxsetaccz 4
xxsetaccz 5
xxsetaccz 6
xxsetaccz 7
.endm
.macro KERNEL2x8_PRELOAD
lxvp vs32, 0(AO) // load real,imag from A
lxvp vs34, 32(AO) // load real,imag from A
lxvp vs36, 64(AO) // load real,imag from A
lxvp vs38, 96(AO) // load real,imag from A
lxvp vs48, 0(BO) // load real imag from B
.endm
.macro KERNEL2x8_2 Index, IsLast
lxvp vs40, DISP16(\Index,128)(AO) // load real,imag from A
lxvp vs42, DISP16(\Index,160)(AO) // load real,imag from A
lxvp vs44, DISP16(\Index,192)(AO) // load real,imag from A
lxvp vs46, DISP16(\Index,224)(AO) // load real,imag from A
lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs34, vs48
xvf64gerpp 2, vs36, vs48
xvf64gerpp 3, vs38, vs48
xvf64gerpp 4, vs32, vs49
xvf64gerpp 5, vs34, vs49
xvf64gerpp 6, vs36, vs49
xvf64gerpp 7, vs38, vs49
#else
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 2, vs36, vs49
xvf64gerpp 3, vs38, vs49
xvf64gerpp 4, vs32, vs48
xvf64gerpp 5, vs34, vs48
xvf64gerpp 6, vs36, vs48
xvf64gerpp 7, vs38, vs48
#endif
lxvp vs32, DISP16(\Index, 256)(AO) // load real,imag from A
lxvp vs34, DISP16(\Index, 288)(AO) // load real,imag from A
lxvp vs36, DISP16(\Index, 320)(AO) // load real,imag from A
lxvp vs38, DISP16(\Index, 352)(AO) // load real,imag from A
lxvp vs48, DISP4(\Index, 64)(BO) // load real imag from B
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs40, vs50
xvf64gerpp 1, vs42, vs50
xvf64gerpp 2, vs44, vs50
xvf64gerpp 3, vs46, vs50
xvf64gerpp 4, vs40, vs51
xvf64gerpp 5, vs42, vs51
xvf64gerpp 6, vs44, vs51
xvf64gerpp 7, vs46, vs51
#else
xvf64gerpp 0, vs40, vs51
xvf64gerpp 1, vs42, vs51
xvf64gerpp 2, vs44, vs51
xvf64gerpp 3, vs46, vs51
xvf64gerpp 4, vs40, vs50
xvf64gerpp 5, vs42, vs50
xvf64gerpp 6, vs44, vs50
xvf64gerpp 7, vs46, vs50
#endif
.if \IsLast==1
addi AO, AO, DISP16(\Index,256)
addi BO, BO, DISP4(\Index,64)
.endif
.endm
.macro LOAD_END_2x8 OffsetA,OffsetB
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs34, vs48
xvf64gerpp 2, vs36, vs48
xvf64gerpp 3, vs38, vs48
xvf64gerpp 4, vs32, vs49
xvf64gerpp 5, vs34, vs49
xvf64gerpp 6, vs36, vs49
xvf64gerpp 7, vs38, vs49
#else
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 2, vs36, vs49
xvf64gerpp 3, vs38, vs49
xvf64gerpp 4, vs32, vs48
xvf64gerpp 5, vs34, vs48
xvf64gerpp 6, vs36, vs48
xvf64gerpp 7, vs38, vs48
#endif
addi BO, BO, \OffsetB
addi AO, AO, \OffsetA
.endm
.macro KERNEL2x8_UNPRIME_MMA
/* "unprime" MMA accumulators */
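/* each accumulator n overlays VSX registers vs(4n)..vs(4n+3), so after
   xxmfacc the partial sums are available in vs0-vs31 for the save code */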
xxmfacc 0
xxmfacc 1
xxmfacc 2
xxmfacc 3
xxmfacc 4
xxmfacc 5
xxmfacc 6
xxmfacc 7
.endm
.macro SAVE2x8
add T1, CO ,LDC
xxpermdi vs32, vs0, vs1, 0b01
xxpermdi vs33, vs0, vs1, 0b10
xxpermdi vs34, vs2, vs3, 0b01
xxpermdi vs35, vs2, vs3, 0b10
xxpermdi vs36, vs4, vs5, 0b01
xxpermdi vs37, vs4, vs5, 0b10
xxpermdi vs38, vs6, vs7, 0b01
xxpermdi vs39, vs6, vs7, 0b10
xxpermdi vs40, vs8, vs9, 0b01
xxpermdi vs41, vs8, vs9, 0b10
xxpermdi vs42, vs10, vs11, 0b01
xxpermdi vs43, vs10, vs11, 0b10
xxpermdi vs44, vs12, vs13, 0b01
xxpermdi vs45, vs12, vs13, 0b10
xxpermdi vs46, vs14, vs15, 0b01
xxpermdi vs47, vs14, vs15, 0b10
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxlor vs0, vs32, vs32
xxlor vs1, vs33, vs33
xxlor vs2, vs34, vs34
xxlor vs3, vs35, vs35
xxlor vs4, vs36, vs36
xxlor vs5, vs37, vs37
xxlor vs6, vs38, vs38
xxlor vs7, vs39, vs39
xxlor vs8, vs40, vs40
xxlor vs9, vs41, vs41
xxlor vs10, vs42, vs42
xxlor vs11, vs43, vs43
xxlor vs12, vs44, vs44
xxlor vs13, vs45, vs45
xxlor vs14, vs46, vs46
xxlor vs15, vs47, vs47
#else
xxlor vs2, vs32, vs32
xxlor vs3, vs33, vs33
xxlor vs0, vs34, vs34
xxlor vs1, vs35, vs35
xxlor vs6, vs36, vs36
xxlor vs7, vs37, vs37
xxlor vs4, vs38, vs38
xxlor vs5, vs39, vs39
xxlor vs10, vs40, vs40
xxlor vs11, vs41, vs41
xxlor vs8, vs42, vs42
xxlor vs9, vs43, vs43
xxlor vs14, vs44, vs44
xxlor vs15, vs45, vs45
xxlor vs12, vs46, vs46
xxlor vs13, vs47, vs47
#endif
xxpermdi vs32, vs16, vs17, 0b01
xxpermdi vs33, vs16, vs17, 0b10
xxpermdi vs34, vs18, vs19, 0b01
xxpermdi vs35, vs18, vs19, 0b10
xxpermdi vs36, vs20, vs21, 0b01
xxpermdi vs37, vs20, vs21, 0b10
xxpermdi vs38, vs22, vs23, 0b01
xxpermdi vs39, vs22, vs23, 0b10
xxpermdi vs40, vs24, vs25, 0b01
xxpermdi vs41, vs24, vs25, 0b10
xxpermdi vs42, vs26, vs27, 0b01
xxpermdi vs43, vs26, vs27, 0b10
xxpermdi vs44, vs28, vs29, 0b01
xxpermdi vs45, vs28, vs29, 0b10
xxpermdi vs46, vs30, vs31, 0b01
xxpermdi vs47, vs30, vs31, 0b10
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxlor vs16, vs32, vs32
xxlor vs17, vs33, vs33
xxlor vs18, vs34, vs34
xxlor vs19, vs35, vs35
xxlor vs20, vs36, vs36
xxlor vs21, vs37, vs37
xxlor vs22, vs38, vs38
xxlor vs23, vs39, vs39
xxlor vs24, vs40, vs40
xxlor vs25, vs41, vs41
xxlor vs26, vs42, vs42
xxlor vs27, vs43, vs43
xxlor vs28, vs44, vs44
xxlor vs29, vs45, vs45
xxlor vs30, vs46, vs46
xxlor vs31, vs47, vs47
#else
xxlor vs18, vs32, vs32
xxlor vs19, vs33, vs33
xxlor vs16, vs34, vs34
xxlor vs17, vs35, vs35
xxlor vs22, vs36, vs36
xxlor vs23, vs37, vs37
xxlor vs20, vs38, vs38
xxlor vs21, vs39, vs39
xxlor vs26, vs40, vs40
xxlor vs27, vs41, vs41
xxlor vs24, vs42, vs42
xxlor vs25, vs43, vs43
xxlor vs30, vs44, vs44
xxlor vs31, vs45, vs45
xxlor vs28, vs46, vs46
xxlor vs29, vs47, vs47
#endif
SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0
SAVE8 vs16,vs17,vs18,vs19,vs20,vs21,vs22,vs23,vs24,vs25,vs26,vs27,vs28,vs29,vs30,vs31,T1,0
addi CO, CO, 128
.endm
/**********************************************************************************************
* Macros for N=2 and M=4
**********************************************************************************************/
.macro KERNEL2x4_ZERO_AND_PRIME_MMA
/* zero out and prime the MMA accumulators */
xxsetaccz 0
xxsetaccz 1
xxsetaccz 2
xxsetaccz 3
.endm
.macro KERNEL2x4_PRELOAD
lxvp vs32, 0(AO) // load real,imag from A
lxvp vs34, 32(AO) // load real,imag from A
lxvp vs48, 0(BO) // load real imag from B
.endm
.macro KERNEL2x4_2 Index, IsLast
lxvp vs40, DISP8(\Index, 64)(AO) // load real,imag from A
lxvp vs42, DISP8(\Index, 96)(AO) // load real,imag from A
lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs34, vs48
xvf64gerpp 2, vs32, vs49
xvf64gerpp 3, vs34, vs49
#else
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 2, vs32, vs48
xvf64gerpp 3, vs34, vs48
#endif
lxvp vs32, DISP8(\Index, 128)(AO) // load real,imag from A
lxvp vs34, DISP8(\Index, 160)(AO) // load real,imag from A
lxvp vs48, DISP4(\Index, 64)(BO) // load real,imag from B
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs40, vs50
xvf64gerpp 1, vs42, vs50
xvf64gerpp 2, vs40, vs51
xvf64gerpp 3, vs42, vs51
#else
xvf64gerpp 0, vs40, vs51
xvf64gerpp 1, vs42, vs51
xvf64gerpp 2, vs40, vs50
xvf64gerpp 3, vs42, vs50
#endif
.if \IsLast==1
addi AO, AO, DISP8(\Index,128)
addi BO, BO, DISP4(\Index,64)
.endif
.endm
.macro LOAD_END_2x4 OffsetA, OffsetB
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs34, vs48
xvf64gerpp 2, vs32, vs49
xvf64gerpp 3, vs34, vs49
#else
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 2, vs32, vs48
xvf64gerpp 3, vs34, vs48
#endif
addi BO, BO, \OffsetB
addi AO, AO, \OffsetA
.endm
.macro KERNEL2x4_UNPRIME_MMA
/* "unprime" MMA accumulators */
xxmfacc 0
xxmfacc 1
xxmfacc 2
xxmfacc 3
.endm
.macro SAVE2x4
add T1, CO ,LDC
xxpermdi vs32, vs0, vs1, 0b01
xxpermdi vs33, vs0, vs1, 0b10
xxpermdi vs34, vs2, vs3, 0b01
xxpermdi vs35, vs2, vs3, 0b10
xxpermdi vs36, vs4, vs5, 0b01
xxpermdi vs37, vs4, vs5, 0b10
xxpermdi vs38, vs6, vs7, 0b01
xxpermdi vs39, vs6, vs7, 0b10
xxpermdi vs40, vs8, vs9, 0b01
xxpermdi vs41, vs8, vs9, 0b10
xxpermdi vs42, vs10, vs11, 0b01
xxpermdi vs43, vs10, vs11, 0b10
xxpermdi vs44, vs12, vs13, 0b01
xxpermdi vs45, vs12, vs13, 0b10
xxpermdi vs46, vs14, vs15, 0b01
xxpermdi vs47, vs14, vs15, 0b10
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxlor vs0, vs32, vs32
xxlor vs1, vs33, vs33
xxlor vs2, vs34, vs34
xxlor vs3, vs35, vs35
xxlor vs4, vs36, vs36
xxlor vs5, vs37, vs37
xxlor vs6, vs38, vs38
xxlor vs7, vs39, vs39
xxlor vs8, vs40, vs40
xxlor vs9, vs41, vs41
xxlor vs10, vs42, vs42
xxlor vs11, vs43, vs43
xxlor vs12, vs44, vs44
xxlor vs13, vs45, vs45
xxlor vs14, vs46, vs46
xxlor vs15, vs47, vs47
#else
xxlor vs2, vs32, vs32
xxlor vs3, vs33, vs33
xxlor vs0, vs34, vs34
xxlor vs1, vs35, vs35
xxlor vs6, vs36, vs36
xxlor vs7, vs37, vs37
xxlor vs4, vs38, vs38
xxlor vs5, vs39, vs39
xxlor vs10, vs40, vs40
xxlor vs11, vs41, vs41
xxlor vs8, vs42, vs42
xxlor vs9, vs43, vs43
xxlor vs14, vs44, vs44
xxlor vs15, vs45, vs45
xxlor vs12, vs46, vs46
xxlor vs13, vs47, vs47
#endif
SAVE4 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0
SAVE4 vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,T1,0
addi CO, CO, 64
.endm
/**********************************************************************************************
* Macros for N=2 and M=2
**********************************************************************************************/
.macro KERNEL2x2_ZERO_AND_PRIME_MMA
/* zero out and prime the MMA accumulators */
xxsetaccz 0
xxsetaccz 1
.endm
.macro KERNEL2x2_PRELOAD
lxvp vs32, 0(AO) // load real,imag from A
lxvp vs48, 0(BO) // load real imag from B
.endm
.macro KERNEL2x2_2 Index, IsLast
lxvp vs40, DISP4(\Index, 32)(AO) // load real,imag from A
lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs32, vs49
#else
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs32, vs48
#endif
lxvp vs32, DISP4(\Index, 64)(AO) // load real,imag from A
lxvp vs48, DISP4(\Index, 64)(BO) // load real imag from B
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs40, vs50
xvf64gerpp 1, vs40, vs51
#else
xvf64gerpp 0, vs40, vs51
xvf64gerpp 1, vs40, vs50
#endif
.if \IsLast==1
addi AO, AO, DISP4(\Index,64)
addi BO, BO, DISP4(\Index,64)
.endif
.endm
.macro LOAD_END_2x2 OffsetA,OffsetB
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs32, vs49
#else
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs32, vs48
#endif
addi BO, BO, \OffsetB
addi AO, AO, \OffsetA
.endm
.macro KERNEL2x2_UNPRIME_MMA
/* "unprime" MMA accumulators */
xxmfacc 0
xxmfacc 1
.endm
.macro SAVE2x2
add T1, CO ,LDC
xxpermdi vs32, vs0, vs1, 0b01
xxpermdi vs33, vs0, vs1, 0b10
xxpermdi vs34, vs2, vs3, 0b01
xxpermdi vs35, vs2, vs3, 0b10
xxpermdi vs36, vs4, vs5, 0b01
xxpermdi vs37, vs4, vs5, 0b10
xxpermdi vs38, vs6, vs7, 0b01
xxpermdi vs39, vs6, vs7, 0b10
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxlor vs0, vs32, vs32
xxlor vs1, vs33, vs33
xxlor vs2, vs34, vs34
xxlor vs3, vs35, vs35
xxlor vs4, vs36, vs36
xxlor vs5, vs37, vs37
xxlor vs6, vs38, vs38
xxlor vs7, vs39, vs39
#else
xxlor vs2, vs32, vs32
xxlor vs3, vs33, vs33
xxlor vs0, vs34, vs34
xxlor vs1, vs35, vs35
xxlor vs6, vs36, vs36
xxlor vs7, vs37, vs37
xxlor vs4, vs38, vs38
xxlor vs5, vs39, vs39
#endif
SAVE2 vs0,vs1,vs2,vs3,CO,0
SAVE2 vs4,vs5,vs6,vs7,T1,0
addi CO, CO, 32
.endm
/**********************************************************************************************
* Macros for N=2 and M=1
**********************************************************************************************/
.macro ZERO2x1
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
xxlxor vs2, vs2, vs2
xxlxor vs3, vs3, vs3
.endm
.macro LOAD2x1
LOAD2x1O 0,0
.endm
.macro LOAD2x1O OffsetA,OffsetB
lxv vs48,(\OffsetB+ 0)(BO) // load real,imag from B
lxv vs50, (\OffsetB+16)(BO) // load real,imag from B
xxswapd vs49, vs48
xxswapd vs51, vs50
lxv vs32, (0+\OffsetA)(AO) // load real,imag from A
.endm
.macro END2x1_WITHOUT_ADD
END2x1 AO,BO,0,0
.endm
.macro END2x1 AREG, BREG, OffsetA, OffsetB
.if \OffsetB != 0
addi \BREG, \BREG, \OffsetB
.endif
.if \OffsetA != 0
addi \AREG, \AREG, \OffsetA
.endif
xvmaddadp vs0, vs32, vs48
xvmaddadp vs2, vs32, vs50
xvmaddadp vs1, vs32, vs49
xvmaddadp vs3, vs32, vs51
.endm
.macro LOAD2x1_2
LOAD2x1_2O 0,0
.endm
.macro LOAD2x1_2O OffsetA,OffsetB
lxv vs48,(\OffsetB+ 0)(BO) // load real,imag from B
lxv vs50, (\OffsetB+16)(BO) // load real,imag from B
lxv vs52, (\OffsetB+32)(BO) // load real,imag from B
lxv vs54, (\OffsetB+48)(BO) // load real,imag from B
xxswapd vs49, vs48
xxswapd vs51, vs50
lxv vs32, (0+\OffsetA)(AO) // load real,imag from A
lxv vs40, (16+\OffsetA)(AO) // load real,imag from A
.endm
.macro END2x1_2
/* for the unrolled-by-2 loop the offsets are 32 (A) and 64 (B) */
KERNEL2x1_2 AO,BO, 32,64,0 ,1,1
.endm
.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast
KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
.endm
.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast
KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
.endm
.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
xxswapd vs53, vs52
xxswapd vs55, vs54
xvmaddadp vs0, vs32, vs48
xvmaddadp vs2, vs32, vs50
xvmaddadp vs1, vs32, vs49
xvmaddadp vs3, vs32, vs51
.if \Complete==0
lxv vs32, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
.endif
.if \Complete==0
lxv vs48, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B
lxv vs50, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
.endif
.if \Complete==0
xxswapd vs49, vs48
xxswapd vs51, vs50
.endif
xvmaddadp vs0, vs40, vs52
xvmaddadp vs2, vs40, vs54
xvmaddadp vs1, vs40, vs53
xvmaddadp vs3, vs40, vs55
.if \Complete==0
lxv vs40, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A
.endif
.if \Complete==0
lxv vs52, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B
lxv vs54, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B
.endif
.if \IsLast==1
.if \Complete==1
addi \AREG, \AREG, DISP2(\Index,\OffsetA)
addi \BREG, \BREG, DISP4(\Index,\OffsetB)
.else
addi \AREG, \AREG, DISP2(\Index,32)
addi \BREG, \BREG, DISP4(\Index,64)
.endif
.endif
.endm
.macro KERNEL2x1
LOAD2x1
END2x1 AO, BO, 16,32
.endm
.macro SAVE2x1
add T1, CO ,LDC
SAVE1 vs0,vs1,CO,0
SAVE1 vs2,vs3,T1,0
addi CO, CO, 16
.endm
/**********************************************************************************************
* Macros for N=1 and M=8
**********************************************************************************************/
.macro KERNEL1x8_ZERO_AND_PRIME_MMA
/* zero out and prime the MMA accumulators */
xxsetaccz 0
xxsetaccz 1
xxsetaccz 2
xxsetaccz 3
.endm
.macro KERNEL1x8_2 Index,IsLast
lxvp vs32, DISP16(\Index, 0)(AO) // load real,imag from A
lxvp vs34, DISP16(\Index, 32)(AO) // load real,imag from A
lxvp vs36, DISP16(\Index, 64)(AO) // load real,imag from A
lxvp vs38, DISP16(\Index, 96)(AO) // load real,imag from A
lxvp vs40, DISP16(\Index, 128)(AO) // load real,imag from A
lxvp vs42, DISP16(\Index, 160)(AO) // load real,imag from A
lxvp vs44, DISP16(\Index, 192)(AO) // load real,imag from A
lxvp vs46, DISP16(\Index, 224)(AO) // load real,imag from A
lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs34, vs48
xvf64gerpp 2, vs36, vs48
xvf64gerpp 3, vs38, vs48
xvf64gerpp 0, vs40, vs49
xvf64gerpp 1, vs42, vs49
xvf64gerpp 2, vs44, vs49
xvf64gerpp 3, vs46, vs49
#else
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 2, vs36, vs49
xvf64gerpp 3, vs38, vs49
xvf64gerpp 0, vs40, vs48
xvf64gerpp 1, vs42, vs48
xvf64gerpp 2, vs44, vs48
xvf64gerpp 3, vs46, vs48
#endif
.if \IsLast==1
addi AO, AO, DISP16(\Index,256)
addi BO, BO, DISP2(\Index,32)
.endif
.endm
.macro LOAD_END_1x8 OffsetA,OffsetB
lxvp vs32, 0(AO) // load real,imag from A
lxvp vs34, 32(AO) // load real,imag from A
lxvp vs36, 64(AO) // load real,imag from A
lxvp vs38, 96(AO) // load real,imag from A
lxv vs48, 0(BO) // load real imag from B
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs34, vs48
xvf64gerpp 2, vs36, vs48
xvf64gerpp 3, vs38, vs48
addi BO, BO, \OffsetB
addi AO, AO, \OffsetA
.endm
.macro KERNEL1x8_UNPRIME_MMA
/* "unprime" MMA accumulators */
xxmfacc 0
xxmfacc 1
xxmfacc 2
xxmfacc 3
.endm
.macro SAVE1x8
xxpermdi vs32, vs0, vs1, 0b01
xxpermdi vs33, vs0, vs1, 0b10
xxpermdi vs34, vs2, vs3, 0b01
xxpermdi vs35, vs2, vs3, 0b10
xxpermdi vs36, vs4, vs5, 0b01
xxpermdi vs37, vs4, vs5, 0b10
xxpermdi vs38, vs6, vs7, 0b01
xxpermdi vs39, vs6, vs7, 0b10
xxpermdi vs40, vs8, vs9, 0b01
xxpermdi vs41, vs8, vs9, 0b10
xxpermdi vs42, vs10, vs11, 0b01
xxpermdi vs43, vs10, vs11, 0b10
xxpermdi vs44, vs12, vs13, 0b01
xxpermdi vs45, vs12, vs13, 0b10
xxpermdi vs46, vs14, vs15, 0b01
xxpermdi vs47, vs14, vs15, 0b10
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxlor vs0, vs32, vs32
xxlor vs1, vs33, vs33
xxlor vs2, vs34, vs34
xxlor vs3, vs35, vs35
xxlor vs4, vs36, vs36
xxlor vs5, vs37, vs37
xxlor vs6, vs38, vs38
xxlor vs7, vs39, vs39
xxlor vs8, vs40, vs40
xxlor vs9, vs41, vs41
xxlor vs10, vs42, vs42
xxlor vs11, vs43, vs43
xxlor vs12, vs44, vs44
xxlor vs13, vs45, vs45
xxlor vs14, vs46, vs46
xxlor vs15, vs47, vs47
#else
xxlor vs2, vs32, vs32
xxlor vs3, vs33, vs33
xxlor vs0, vs34, vs34
xxlor vs1, vs35, vs35
xxlor vs6, vs36, vs36
xxlor vs7, vs37, vs37
xxlor vs4, vs38, vs38
xxlor vs5, vs39, vs39
xxlor vs10, vs40, vs40
xxlor vs11, vs41, vs41
xxlor vs8, vs42, vs42
xxlor vs9, vs43, vs43
xxlor vs14, vs44, vs44
xxlor vs15, vs45, vs45
xxlor vs12, vs46, vs46
xxlor vs13, vs47, vs47
#endif
SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0
addi CO, CO, 128
.endm
/**********************************************************************************************
* Macros for N=1 and M=4
**********************************************************************************************/
.macro KERNEL1x4_ZERO_AND_PRIME_MMA
/* zero out and prime the MMA accumulators */
xxsetaccz 0
xxsetaccz 1
.endm
.macro KERNEL1x4_2 Index,IsLast
lxvp vs32, DISP8(\Index, 0)(AO) // load real,imag from A
lxvp vs34, DISP8(\Index, 32)(AO) // load real,imag from A
lxvp vs40, DISP8(\Index, 64)(AO) // load real,imag from A
lxvp vs42, DISP8(\Index, 96)(AO) // load real,imag from A
lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs34, vs48
xvf64gerpp 0, vs40, vs49
xvf64gerpp 1, vs42, vs49
#else
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 0, vs40, vs48
xvf64gerpp 1, vs42, vs48
#endif
.if \IsLast==1
addi AO, AO, DISP8(\Index,128)
addi BO, BO, DISP2(\Index,32)
.endif
.endm
.macro LOAD_END_1x4 OffsetA,OffsetB
lxvp vs32, 0(AO) // load real,imag from A
lxvp vs34, 32(AO) // load real,imag from A
lxv vs48, 0(BO) // load real imag from B
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs34, vs48
addi BO, BO, \OffsetB
addi AO, AO, \OffsetA
.endm
.macro KERNEL1x4_UNPRIME_MMA
/* "unprime" MMA accumulators */
xxmfacc 0
xxmfacc 1
.endm
.macro SAVE1x4
xxpermdi vs32, vs0, vs1, 0b01
xxpermdi vs33, vs0, vs1, 0b10
xxpermdi vs34, vs2, vs3, 0b01
xxpermdi vs35, vs2, vs3, 0b10
xxpermdi vs36, vs4, vs5, 0b01
xxpermdi vs37, vs4, vs5, 0b10
xxpermdi vs38, vs6, vs7, 0b01
xxpermdi vs39, vs6, vs7, 0b10
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxlor vs0, vs32, vs32
xxlor vs1, vs33, vs33
xxlor vs2, vs34, vs34
xxlor vs3, vs35, vs35
xxlor vs4, vs36, vs36
xxlor vs5, vs37, vs37
xxlor vs6, vs38, vs38
xxlor vs7, vs39, vs39
#else
xxlor vs2, vs32, vs32
xxlor vs3, vs33, vs33
xxlor vs0, vs34, vs34
xxlor vs1, vs35, vs35
xxlor vs6, vs36, vs36
xxlor vs7, vs37, vs37
xxlor vs4, vs38, vs38
xxlor vs5, vs39, vs39
#endif
SAVE4 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0
addi CO, CO, 64
.endm
/**********************************************************************************************
* Macros for N=1 and M=2
**********************************************************************************************/
.macro KERNEL1x2_ZERO_AND_PRIME_MMA
/* zero out and prime the MMA accumulators */
xxsetaccz 0
.endm
.macro KERNEL1x2_2 Index,IsLast
lxvp vs32, DISP4(\Index, 0)(AO) // load real,imag from A
lxvp vs40, DISP4(\Index, 32)(AO) // load real,imag from A
lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs32, vs48
xvf64gerpp 0, vs40, vs49
#else
xvf64gerpp 0, vs32, vs49
xvf64gerpp 0, vs40, vs48
#endif
.if \IsLast==1
addi AO, AO, DISP4(\Index,64)
addi BO, BO, DISP2(\Index,32)
.endif
.endm
.macro LOAD_END_1x2 OffsetA,OffsetB
lxvp vs32, 0(AO) // load real,imag from A
lxv vs48, 0(BO) // load real imag from B
xvf64gerpp 0, vs32, vs48
addi BO, BO, \OffsetB
addi AO, AO, \OffsetA
.endm
.macro KERNEL1x2_UNPRIME_MMA
/* "unprime" MMA accumulators */
xxmfacc 0
.endm
.macro SAVE1x2
xxpermdi vs32, vs0, vs1, 0b01
xxpermdi vs33, vs0, vs1, 0b10
xxpermdi vs34, vs2, vs3, 0b01
xxpermdi vs35, vs2, vs3, 0b10
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxlor vs0, vs32, vs32
xxlor vs1, vs33, vs33
xxlor vs2, vs34, vs34
xxlor vs3, vs35, vs35
#else
xxlor vs2, vs32, vs32
xxlor vs3, vs33, vs33
xxlor vs0, vs34, vs34
xxlor vs1, vs35, vs35
#endif
SAVE2 vs0,vs1,vs2,vs3,CO,0
addi CO, CO, 32
.endm
/**********************************************************************************************
* Macros for N=1 and M=1
**********************************************************************************************/
.macro ZERO1x1
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
.endm
.macro LOAD1x1
LOAD1x1O 0,0
.endm
.macro LOAD1x1O OffsetA,OffsetB
lxv vs48,(\OffsetB+ 0)(BO) // load real,imag from B
lxv vs32, (0+\OffsetA)(AO) // load real,imag from A
xxswapd vs49, vs48
.endm
.macro END1x1_WITHOUT_ADD
END1x1 AO,BO,0,0
.endm
.macro END1x1 AREG, BREG, OffsetA, OffsetB
.if \OffsetB != 0
addi \BREG, \BREG, \OffsetB
.endif
.if \OffsetA != 0
addi \AREG, \AREG, \OffsetA
.endif
xvmaddadp vs0, vs32, vs48
xvmaddadp vs1, vs32, vs49
.endm
.macro LOAD1x1_2
LOAD1x1_2O 0,0
.endm
.macro LOAD1x1_2O OffsetA,OffsetB
lxv vs48,(\OffsetB+ 0)(BO) // load real,imag from B
lxv vs52, (\OffsetB+16)(BO) // load real,imag from B
xxswapd vs49, vs48
lxv vs32, (0+\OffsetA)(AO) // load real,imag from A
lxv vs40, (16+\OffsetA)(AO) // load real,imag from A
.endm
.macro END1x1_2
/* for the unrolled-by-2 loop the offsets are 32 (A) and 32 (B) */
KERNEL1x1_2 AO,BO, 32,32,0 ,1,1
.endm
.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast
KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
.endm
.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast
KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
.endm
.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
xxswapd vs53, vs52
xvmaddadp vs0, vs32, vs48
xvmaddadp vs1, vs32, vs49
.if \Complete==0
lxv vs32, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
.endif
.if \Complete==0
lxv vs48, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B
.endif
.if \Complete==0
xxswapd vs49, vs48
.endif
xvmaddadp vs0, vs40, vs52
xvmaddadp vs1, vs40, vs53
.if \Complete==0
lxv vs40, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A
.endif
.if \Complete==0
lxv vs52, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
.endif
.if \IsLast==1
.if \Complete==1
addi \AREG, \AREG, DISP2(\Index,\OffsetA)
addi \BREG, \BREG, DISP2(\Index,\OffsetB)
.else
addi \AREG, \AREG, DISP2(\Index,32)
addi \BREG, \BREG, DISP2(\Index,32)
.endif
.endif
.endm
.macro KERNEL1x1
LOAD1x1
END1x1 AO, BO, 16,16
.endm
.macro SAVE1x1
SAVE1 vs0,vs1,CO,0
addi CO, CO, 16
.endm
/**************************** TRMM POINTER REFRESH MACROS *************************/
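/* SHIFT_REG: REG1 = REG2 * SHIFT_VAL * unit_size, computed as a left shift
   (e.g. SHIFT_VAL==8 -> slwi by 7, i.e. multiply by 8 elements of 16 bytes) */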
.macro SHIFT_REG REG1,REG2,SHIFT_VAL
.if \SHIFT_VAL==16
slwi \REG1, \REG2, 8
.elseif \SHIFT_VAL==8
slwi \REG1, \REG2, 7
.elseif \SHIFT_VAL==4
slwi \REG1, \REG2, 6
.elseif \SHIFT_VAL==2
slwi \REG1, \REG2, 5
.elseif \SHIFT_VAL==1
slwi \REG1, \REG2, 4
.endif
.endm
/*
//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
// ptrbb = bb;
// #else
// ptrba += off*16;
// ptrbb = bb + off*2;
// #endif
*/
.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/* ptrbb = bb;*/
mr \PTR_B,\B_VAL /* refresh BPOINT */
#else
/*
// ptrba =ptrba+ off*C_A;
// ptrbb = bb + off*C_B;
*/
SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */
SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */
add \PTR_B, \B_VAL , T4 /* Add values to BO */
add \PTR_A, \PTR_A, T2 /* Add values to AO */
#endif
.endm
/*
// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
// temp = bk-off;
// #elif defined(LEFT)
// temp = off+16; // number of values in A
// #else
// temp = off+2; // number of values in B
// #endif
*/
.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
/* temp = bk-off;*/
sub \TEMP_BK,\BK_VAL,\OFF_VAL
#elif defined(LEFT)
/* temp = off+INCR_A; // number of values in A */
addi \TEMP_BK, \OFF_VAL, \INCR_A
#else
/* temp = off+INCR_B // number of values in B*/
addi \TEMP_BK,\OFF_VAL, \INCR_B
#endif
.endm
/*
// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
// temp = bk - off;
// #ifdef LEFT
// temp -= 16; // number of values in A
// #else
// temp -= 2; // number of values in B
// #endif
// ptrba += temp*16;
// ptrbb += temp*2;
// #endif
// #ifdef LEFT
// off += 16; // number of values in A
// #endif
*/
.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/*temp = bk - off;*/
sub \TEMP_BK,\BK_VAL,\OFF_VAL
#ifdef LEFT
/*temp -= 8; // number of values in A*/
addi \TEMP_BK,\TEMP_BK,-\C_A
#else
/*temp -= 4; // number of values in B*/
addi \TEMP_BK,\TEMP_BK,-\C_B
#endif
/*ptrba += temp*C_A;
ptrbb += temp*C_B;*/
SHIFT_REG T4,\TEMP_BK,\C_A
SHIFT_REG T2,\TEMP_BK,\C_B
add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/
add \PTR_B, \PTR_B,T2
#endif
#ifdef LEFT
/*off += 8; // number of values in A*/
addi \OFF_VAL,\OFF_VAL,\C_A
#endif
.endm