/****************************************Implementation**Details**********************************************/
/*                                                                                                           */
/* Let (a,a1*i) denote the complex number a+a1*i.                                                            */
/* Complex multiplication: (a,a1*i)*(b,b1*i).                                                                */
/* Since i*i=-1, the product is:                                                                             */
/* (a+a1*i)(b+b1*i)=a*b+a1*i*b1*i+a1*i*b+a*b1*i=a*b-a1*b1+(a1*b+a*b1)*i, i.e. (a*b-a1*b1, a1*b+a*b1).        */
/* So let c=a*b-a1*b1 and ci=a1*b+a*b1. Accumulating into existing (c,ci):                                   */
/* c=c+a*b-a1*b1 => c=a*b-(a1*b1-c) => c=a1*b1-c then c=a*b-c  (two multiply-and-subtract instructions)      */
/* ci=ci+a1*b+a*b1 => ci=a1*b+ci then ci=a*b1+ci               (two multiply-and-add instructions)           */
/* For SIMD, real and imaginary parts are grouped together,                                                  */
/* e.g. (realA,realK) and (imagA,imagK):                                                                     */
/* Simd(0,1)=(a*b,k*b)-((ai*bi,ki*bi)-Simd(0,1))                                                             */
/* SimdI(0,1)=SimdI(0,1)+(a*bi,k*bi)+(ai*b,ki*b)                                                             */
/*                                                                                                           */
/* For defined(NR) || defined(NC) || defined(TR) || defined(TC):                                             */
/* (a+a1*I)(b-b1*I)=a*b+a1*b1+I*(a1*b-a*b1)                                                                  */
/*                                                                                                           */
/* c=c+a*b+a1*b1 => c=a1*b1+c; c=a*b+c                                                                       */
/* ci=ci+a1*b-a*b1 => ci=a1*b-(a*b1-ci) => ci=a*b1-ci; ci=a1*b-ci                                            */
/*                                                                                                           */
/* For defined(RN) || defined(RT) || defined(CN) || defined(CT):                                             */
/* (a-a1*I)(b+b1*I)=a*b+a1*b1+I*(-a1*b+a*b1)                                                                 */
/*                                                                                                           */
/* c=c+a*b+a1*b1 => c=a1*b1+c; c=a*b+c                                                                       */
/* ci=ci-a1*b+a*b1 => ci=a*b1-(a1*b-ci) => ci=a1*b-ci; ci=a*b1-ci                                            */
/*                                                                                                           */
/* For defined(RR) || defined(RC) || defined(CR) || defined(CC):                                             */
/* (a-a1*I)(b-b1*I)=a*b-a1*b1+I*(-a1*b-a*b1)                                                                 */
/*                                                                                                           */
/* c=a1*b1-c then c=a*b-c                                                                                    */
/* ci=ci-a1*b-a*b1                                                                                           */
/* As IBM z13 only has fused x*y-m and x*y+m instructions, the implementation changes slightly here:         */
/* assuming ci=0, accumulating cix=cix+a1*b+a*b1 and then ci=ci-cix works:                                   */
/* cix=a*b1+cix; cix=a1*b+cix (two madb); ci=ci-cix (a plain sign change when ci=0)                          */
/* As c=0:                                                                                                   */
/* c=a*b-c then c=a1*b1-c => c=a1*b1-(a*b-c), which is -1*(a*b-(a1*b1-c))                                    */
/* The computed values are therefore (-c) and (-ci).                                                         */
/* To correct the sign, the result is multiplied by the negated alpha, -1*(alpha+alpha_i*I).                 */
/* This is done once:                                                                                        */
/* lcdbr ALPHA_I,ALPHA_I                                                                                     */
/* lcdbr ALPHA,ALPHA                                                                                         */
/*************************************************************************************************************/
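/* Illustrated in C for the NN case (a sketch; variable names follow the comment above, not the macros):     */
/*   double c = 0.0, ci = 0.0;                                                                               */
/*   c  = a1*b1 - c;   c  = a*b  - c;    // c  += a*b  - a1*b1, two fused multiply-and-subtract ops          */
/*   ci = a*b1  + ci;  ci = a1*b + ci;   // ci += a1*b + a*b1,  two fused multiply-and-add ops               */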
/*************************Zero vectors***************************************/
/*zero vectors for 4x4 */
.macro ZERO_ZCVEC_4x4
vzero %v16
vzero %v17
vzero %v18
vzero %v19
vzero %v20
vzero %v21
vzero %v22
vzero %v23
vzero %v24
vzero %v25
vzero %v26
vzero %v27
vzero %v28
vzero %v29
vzero %v30
vzero %v31
.endm
/*zero vectors for 2x4 */
.macro ZERO_ZCVEC_2x4
vzero %v16
vzero %v17
vzero %v18
vzero %v19
vzero %v20
vzero %v21
vzero %v22
vzero %v23
.endm
/*zero vectors for 1x4 */
.macro ZERO_ZCVEC_1x4
vzero %v16
vzero %v17
vzero %v18
vzero %v19
.endm
/*zero vectors for 4x2 and 4x1 */
.macro ZERO_ZCVEC_4x2
ZERO_ZCVEC_2x4
.endm
.macro ZERO_ZCVEC_4x1
ZERO_ZCVEC_1x4
.endm
/*zero vectors for 2x2 */
.macro ZERO_ZCVEC_2x2
vzero %v16
vzero %v17
vzero %v20
vzero %v21
.endm
/*zero vectors for 1x2 */
.macro ZERO_ZCVEC_1x2
vzero %v16
vzero %v17
.endm
/*zero vectors for 2x1 */
.macro ZERO_ZCVEC_2x1
vzero %v16
vzero %v17
.endm
/*zero vectors for 1x1*/
.macro ZERO_ZCVEC_1x1
lzer %f6
lzer %f7
.endm
/*
Calculate for 4x2 inner
*/
.macro CalcComplex_4x2 vResR1, vResI1, vResR2, vResI2, vResR3, vResI3, vResR4, vResI4, vr1, vi1, vr2, vi2, vrB, viB, vrB2, viB2
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
vfmsdb \vResR1, \vi1, \viB, \vResR1
vfmadb \vResI1, \vr1, \viB, \vResI1
vfmsdb \vResR2, \vi2, \viB, \vResR2
vfmadb \vResI2, \vr2, \viB, \vResI2
vfmsdb \vResR3, \vi1, \viB2, \vResR3
vfmadb \vResI3, \vr1, \viB2, \vResI3
vfmsdb \vResR4, \vi2, \viB2, \vResR4
vfmadb \vResI4, \vr2, \viB2, \vResI4
vfmsdb \vResR1, \vr1, \vrB, \vResR1
vfmadb \vResI1, \vi1, \vrB, \vResI1
vfmsdb \vResR2, \vr2, \vrB, \vResR2
vfmadb \vResI2, \vi2, \vrB, \vResI2
vfmsdb \vResR3, \vr1, \vrB2, \vResR3
vfmadb \vResI3, \vi1, \vrB2, \vResI3
vfmsdb \vResR4, \vr2, \vrB2, \vResR4
vfmadb \vResI4, \vi2, \vrB2, \vResI4
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
vfmadb \vResR1, \vi1, \viB, \vResR1
vfmsdb \vResI1, \vr1, \viB, \vResI1
vfmadb \vResR2, \vi2, \viB, \vResR2
vfmsdb \vResI2, \vr2, \viB, \vResI2
vfmadb \vResR3, \vi1, \viB2, \vResR3
vfmsdb \vResI3, \vr1, \viB2, \vResI3
vfmadb \vResR4, \vi2, \viB2, \vResR4
vfmsdb \vResI4, \vr2, \viB2, \vResI4
vfmadb \vResR1, \vr1, \vrB, \vResR1
vfmsdb \vResI1, \vi1, \vrB, \vResI1
vfmadb \vResR2, \vr2, \vrB, \vResR2
vfmsdb \vResI2, \vi2, \vrB, \vResI2
vfmadb \vResR3, \vr1, \vrB2, \vResR3
vfmsdb \vResI3, \vi1, \vrB2, \vResI3
vfmadb \vResR4, \vr2, \vrB2, \vResR4
vfmsdb \vResI4, \vi2, \vrB2, \vResI4
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
vfmadb \vResR1, \vi1, \viB, \vResR1
vfmsdb \vResI1, \vi1, \vrB, \vResI1
vfmadb \vResR2, \vi2, \viB, \vResR2
vfmsdb \vResI2, \vi2, \vrB, \vResI2
vfmadb \vResR3, \vi1, \viB2, \vResR3
vfmsdb \vResI3, \vi1, \vrB2, \vResI3
vfmadb \vResR4, \vi2, \viB2, \vResR4
vfmsdb \vResI4, \vi2, \vrB2, \vResI4
vfmadb \vResR1, \vr1, \vrB, \vResR1
vfmsdb \vResI1, \vr1, \viB, \vResI1
vfmadb \vResR2, \vr2, \vrB, \vResR2
vfmsdb \vResI2, \vr2, \viB, \vResI2
vfmadb \vResR3, \vr1, \vrB2, \vResR3
vfmsdb \vResI3, \vr1, \viB2, \vResI3
vfmadb \vResR4, \vr2, \vrB2, \vResR4
vfmsdb \vResI4, \vr2, \viB2, \vResI4
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
vfmsdb \vResR1, \vr1, \vrB, \vResR1
vfmadb \vResI1, \vi1, \vrB, \vResI1
vfmsdb \vResR2, \vr2, \vrB, \vResR2
vfmadb \vResI2, \vi2, \vrB, \vResI2
vfmsdb \vResR3, \vr1, \vrB2, \vResR3
vfmadb \vResI3, \vi1, \vrB2, \vResI3
vfmsdb \vResR4, \vr2, \vrB2, \vResR4
vfmadb \vResI4, \vi2, \vrB2, \vResI4
vfmsdb \vResR1, \vi1, \viB, \vResR1
vfmadb \vResI1, \vr1, \viB, \vResI1
vfmsdb \vResR2, \vi2, \viB, \vResR2
vfmadb \vResI2, \vr2, \viB, \vResI2
vfmsdb \vResR3, \vi1, \viB2, \vResR3
vfmadb \vResI3, \vr1, \viB2, \vResI3
vfmsdb \vResR4, \vi2, \viB2, \vResR4
vfmadb \vResI4, \vr2, \viB2, \vResI4
#endif
.endm
/*
Calculate for 2x4 inner
*/
.macro CalcComplex_2x4 vResR1, vResI1, vResR2, vResI2, vResR3, vResI3, vResR4, vResI4, vr1, vi1, vr2, vi2, vrB, viB, vrB2, viB2
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
vfmsdb \vResR1, \vi1, \viB, \vResR1
vfmadb \vResI1, \vr1, \viB, \vResI1
vfmsdb \vResR2, \vi2, \viB, \vResR2
vfmadb \vResI2, \vr2, \viB, \vResI2
vfmsdb \vResR3, \vi1, \viB2, \vResR3
vfmadb \vResI3, \vr1, \viB2, \vResI3
vfmsdb \vResR4, \vi2, \viB2, \vResR4
vfmadb \vResI4, \vr2, \viB2, \vResI4
vfmsdb \vResR1, \vr1, \vrB, \vResR1
vfmadb \vResI1, \vi1, \vrB, \vResI1
vfmsdb \vResR2, \vr2, \vrB, \vResR2
vfmadb \vResI2, \vi2, \vrB, \vResI2
vfmsdb \vResR3, \vr1, \vrB2, \vResR3
vfmadb \vResI3, \vi1, \vrB2, \vResI3
vfmsdb \vResR4, \vr2, \vrB2, \vResR4
vfmadb \vResI4, \vi2, \vrB2, \vResI4
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
vfmadb \vResR1, \vi1, \viB, \vResR1
vfmsdb \vResI1, \vr1, \viB, \vResI1
vfmadb \vResR2, \vi2, \viB, \vResR2
vfmsdb \vResI2, \vr2, \viB, \vResI2
vfmadb \vResR3, \vi1, \viB2, \vResR3
vfmsdb \vResI3, \vr1, \viB2, \vResI3
vfmadb \vResR4, \vi2, \viB2, \vResR4
vfmsdb \vResI4, \vr2, \viB2, \vResI4
vfmadb \vResR1, \vr1, \vrB, \vResR1
vfmsdb \vResI1, \vi1, \vrB, \vResI1
vfmadb \vResR2, \vr2, \vrB, \vResR2
vfmsdb \vResI2, \vi2, \vrB, \vResI2
vfmadb \vResR3, \vr1, \vrB2, \vResR3
vfmsdb \vResI3, \vi1, \vrB2, \vResI3
vfmadb \vResR4, \vr2, \vrB2, \vResR4
vfmsdb \vResI4, \vi2, \vrB2, \vResI4
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
vfmadb \vResR1, \vi1, \viB, \vResR1
vfmsdb \vResI1, \vi1, \vrB, \vResI1
vfmadb \vResR2, \vi2, \viB, \vResR2
vfmsdb \vResI2, \vi2, \vrB, \vResI2
vfmadb \vResR3, \vi1, \viB2, \vResR3
vfmsdb \vResI3, \vi1, \vrB2, \vResI3
vfmadb \vResR4, \vi2, \viB2, \vResR4
vfmsdb \vResI4, \vi2, \vrB2, \vResI4
vfmadb \vResR1, \vr1, \vrB, \vResR1
vfmsdb \vResI1, \vr1, \viB, \vResI1
vfmadb \vResR2, \vr2, \vrB, \vResR2
vfmsdb \vResI2, \vr2, \viB, \vResI2
vfmadb \vResR3, \vr1, \vrB2, \vResR3
vfmsdb \vResI3, \vr1, \viB2, \vResI3
vfmadb \vResR4, \vr2, \vrB2, \vResR4
vfmsdb \vResI4, \vr2, \viB2, \vResI4
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
vfmsdb \vResR1, \vr1, \vrB, \vResR1
vfmadb \vResI1, \vi1, \vrB, \vResI1
vfmsdb \vResR2, \vr2, \vrB, \vResR2
vfmadb \vResI2, \vi2, \vrB, \vResI2
vfmsdb \vResR3, \vr1, \vrB2, \vResR3
vfmadb \vResI3, \vi1, \vrB2, \vResI3
vfmsdb \vResR4, \vr2, \vrB2, \vResR4
vfmadb \vResI4, \vi2, \vrB2, \vResI4
vfmsdb \vResR1, \vi1, \viB, \vResR1
vfmadb \vResI1, \vr1, \viB, \vResI1
vfmsdb \vResR2, \vi2, \viB, \vResR2
vfmadb \vResI2, \vr2, \viB, \vResI2
vfmsdb \vResR3, \vi1, \viB2, \vResR3
vfmadb \vResI3, \vr1, \viB2, \vResI3
vfmsdb \vResR4, \vi2, \viB2, \vResR4
vfmadb \vResI4, \vr2, \viB2, \vResI4
#endif
.endm
/*
Calculate for 2x2 inner
*/
.macro CalcComplex_2x2 vResR1, vResI1, vResR2, vResI2, vR1, vI1, vRB, vIB, vRB2, vIB2
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
vfmsdb \vResR1, \vI1, \vIB, \vResR1
vfmadb \vResI1, \vR1, \vIB, \vResI1
vfmsdb \vResR2, \vI1, \vIB2, \vResR2
vfmadb \vResI2, \vR1, \vIB2, \vResI2
vfmsdb \vResR1, \vR1, \vRB, \vResR1
vfmadb \vResI1, \vI1, \vRB, \vResI1
vfmsdb \vResR2, \vR1, \vRB2, \vResR2
vfmadb \vResI2, \vI1, \vRB2, \vResI2
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
vfmadb \vResR1, \vI1, \vIB, \vResR1
vfmsdb \vResI1, \vR1, \vIB, \vResI1
vfmadb \vResR2, \vI1, \vIB2, \vResR2
vfmsdb \vResI2, \vR1, \vIB2, \vResI2
vfmadb \vResR1, \vR1, \vRB, \vResR1
vfmsdb \vResI1, \vI1, \vRB, \vResI1
vfmadb \vResR2, \vR1, \vRB2, \vResR2
vfmsdb \vResI2, \vI1, \vRB2, \vResI2
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
vfmadb \vResR1, \vI1, \vIB, \vResR1
vfmsdb \vResI1, \vI1, \vRB, \vResI1
vfmadb \vResR2, \vI1, \vIB2, \vResR2
vfmsdb \vResI2, \vI1, \vRB2, \vResI2
vfmadb \vResR1, \vR1, \vRB, \vResR1
vfmsdb \vResI1, \vR1, \vIB, \vResI1
vfmadb \vResR2, \vR1, \vRB2, \vResR2
vfmsdb \vResI2, \vR1, \vIB2, \vResI2
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
vfmsdb \vResR1, \vR1, \vRB, \vResR1
vfmadb \vResI1, \vI1, \vRB, \vResI1
vfmsdb \vResR2, \vR1, \vRB2, \vResR2
vfmadb \vResI2, \vI1, \vRB2, \vResI2
vfmsdb \vResR1, \vI1, \vIB, \vResR1
vfmadb \vResI1, \vR1, \vIB, \vResI1
vfmsdb \vResR2, \vI1, \vIB2, \vResR2
vfmadb \vResI2, \vR1, \vIB2, \vResI2
#endif
.endm
/*
Calculate for 2x1 inner
*/
.macro CalcComplex_2x1 vRealResult1, vImageResult1, vReal1, vImage1, vecRealB, vecImageB
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
vfmsdb \vRealResult1, \vImage1, \vecImageB, \vRealResult1
vfmadb \vImageResult1, \vReal1, \vecImageB, \vImageResult1
vfmsdb \vRealResult1, \vReal1, \vecRealB, \vRealResult1
vfmadb \vImageResult1, \vImage1, \vecRealB, \vImageResult1
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
vfmadb \vRealResult1, \vImage1, \vecImageB, \vRealResult1
vfmsdb \vImageResult1, \vReal1, \vecImageB, \vImageResult1
vfmadb \vRealResult1, \vReal1, \vecRealB, \vRealResult1
vfmsdb \vImageResult1, \vImage1, \vecRealB, \vImageResult1
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
vfmadb \vRealResult1, \vImage1, \vecImageB, \vRealResult1
vfmsdb \vImageResult1, \vImage1, \vecRealB, \vImageResult1
vfmadb \vRealResult1, \vReal1, \vecRealB, \vRealResult1
vfmsdb \vImageResult1, \vReal1, \vecImageB, \vImageResult1
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
vfmsdb \vRealResult1, \vReal1, \vecRealB, \vRealResult1
vfmadb \vImageResult1, \vImage1, \vecRealB, \vImageResult1
vfmsdb \vRealResult1, \vImage1, \vecImageB, \vRealResult1
vfmadb \vImageResult1, \vReal1, \vecImageB, \vImageResult1
#endif
.endm
/*
Calculate for 1x2 inner
*/
.macro CalcComplex_1x2 vRealResult1, vImageResult1, vReal1, vImage1, vecRealB, vecImageB
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
vfmsdb \vRealResult1, \vImage1, \vecImageB, \vRealResult1
vfmadb \vImageResult1, \vReal1, \vecImageB, \vImageResult1
vfmsdb \vRealResult1, \vReal1, \vecRealB, \vRealResult1
vfmadb \vImageResult1, \vImage1, \vecRealB, \vImageResult1
#endif
#if defined(RN) || defined(CN) || defined(RT) || defined(CT)
vfmadb \vRealResult1, \vImage1, \vecImageB, \vRealResult1
vfmsdb \vImageResult1, \vReal1, \vecImageB, \vImageResult1
vfmadb \vRealResult1, \vReal1, \vecRealB, \vRealResult1
vfmsdb \vImageResult1, \vImage1, \vecRealB, \vImageResult1
#endif
#if defined(NR) || defined(TR) || defined(NC) || defined(TC)
vfmadb \vRealResult1, \vImage1, \vecImageB, \vRealResult1
vfmsdb \vImageResult1, \vImage1, \vecRealB, \vImageResult1
vfmadb \vRealResult1, \vReal1, \vecRealB, \vRealResult1
vfmsdb \vImageResult1, \vReal1, \vecImageB, \vImageResult1
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
vfmsdb \vRealResult1, \vReal1, \vecRealB, \vRealResult1
vfmadb \vImageResult1, \vImage1, \vecRealB, \vImageResult1
vfmsdb \vRealResult1, \vImage1, \vecImageB, \vRealResult1
vfmadb \vImageResult1, \vReal1, \vecImageB, \vImageResult1
#endif
.endm
/*
Calculate for 4x1 inner
*/
.macro CalcComplex_4x1 vRealResult1, vImageResult1, vRealResult2, vImageResult2, vReal1, vImage1, vReal2, vImage2, vecRealB, vecImageB
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
vfmsdb \vRealResult1, \vImage1, \vecImageB, \vRealResult1
vfmadb \vImageResult1, \vReal1, \vecImageB, \vImageResult1
vfmsdb \vRealResult2, \vImage2, \vecImageB, \vRealResult2
vfmadb \vImageResult2, \vReal2, \vecImageB, \vImageResult2
vfmsdb \vRealResult1, \vReal1, \vecRealB, \vRealResult1
vfmadb \vImageResult1, \vImage1, \vecRealB, \vImageResult1
vfmsdb \vRealResult2, \vReal2, \vecRealB, \vRealResult2
vfmadb \vImageResult2, \vImage2, \vecRealB, \vImageResult2
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
vfmadb \vRealResult1, \vImage1, \vecImageB, \vRealResult1
vfmsdb \vImageResult1, \vReal1, \vecImageB, \vImageResult1
vfmadb \vRealResult2, \vImage2, \vecImageB, \vRealResult2
vfmsdb \vImageResult2, \vReal2, \vecImageB, \vImageResult2
vfmadb \vRealResult1, \vReal1, \vecRealB, \vRealResult1
vfmsdb \vImageResult1, \vImage1, \vecRealB, \vImageResult1
vfmadb \vRealResult2, \vReal2, \vecRealB, \vRealResult2
vfmsdb \vImageResult2, \vImage2, \vecRealB, \vImageResult2
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
vfmadb \vRealResult1, \vImage1, \vecImageB, \vRealResult1
vfmsdb \vImageResult1, \vImage1, \vecRealB, \vImageResult1
vfmadb \vRealResult2, \vImage2, \vecImageB, \vRealResult2
vfmsdb \vImageResult2, \vImage2, \vecRealB, \vImageResult2
vfmadb \vRealResult1, \vReal1, \vecRealB, \vRealResult1
vfmsdb \vImageResult1, \vReal1, \vecImageB, \vImageResult1
vfmadb \vRealResult2, \vReal2, \vecRealB, \vRealResult2
vfmsdb \vImageResult2, \vReal2, \vecImageB, \vImageResult2
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
vfmsdb \vRealResult1, \vReal1, \vecRealB, \vRealResult1
vfmadb \vImageResult1, \vImage1, \vecRealB, \vImageResult1
vfmsdb \vRealResult2, \vReal2, \vecRealB, \vRealResult2
vfmadb \vImageResult2, \vImage2, \vecRealB, \vImageResult2
vfmsdb \vRealResult1, \vImage1, \vecImageB, \vRealResult1
vfmadb \vImageResult1, \vReal1, \vecImageB, \vImageResult1
vfmsdb \vRealResult2, \vImage2, \vecImageB, \vRealResult2
vfmadb \vImageResult2, \vReal2, \vecImageB, \vImageResult2
#endif
.endm
/*
Calculate for 1x4 inner
*/
.macro CalcComplex_1x4 vRealResult1, vImageResult1, vRealResult2, vImageResult2, vReal1, vImage1, vReal2, vImage2, vecRealB, vecImageB
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
vfmsdb \vRealResult1, \vImage1, \vecImageB, \vRealResult1
vfmadb \vImageResult1, \vReal1, \vecImageB, \vImageResult1
vfmsdb \vRealResult2, \vImage2, \vecImageB, \vRealResult2
vfmadb \vImageResult2, \vReal2, \vecImageB, \vImageResult2
vfmsdb \vRealResult1, \vReal1, \vecRealB, \vRealResult1
vfmadb \vImageResult1, \vImage1, \vecRealB, \vImageResult1
vfmsdb \vRealResult2, \vReal2, \vecRealB, \vRealResult2
vfmadb \vImageResult2, \vImage2, \vecRealB, \vImageResult2
#endif
#if defined(RN) || defined(CN) || defined(RT) || defined(CT)
vfmadb \vRealResult1, \vImage1, \vecImageB, \vRealResult1
vfmsdb \vImageResult1, \vReal1, \vecImageB, \vImageResult1
vfmadb \vRealResult2, \vImage2, \vecImageB, \vRealResult2
vfmsdb \vImageResult2, \vReal2, \vecImageB, \vImageResult2
vfmadb \vRealResult1, \vReal1, \vecRealB, \vRealResult1
vfmsdb \vImageResult1, \vImage1, \vecRealB, \vImageResult1
vfmadb \vRealResult2, \vReal2, \vecRealB, \vRealResult2
vfmsdb \vImageResult2, \vImage2, \vecRealB, \vImageResult2
#endif
#if defined(NR) || defined(TR) || defined(NC) || defined(TC)
vfmadb \vRealResult1, \vImage1, \vecImageB, \vRealResult1
vfmsdb \vImageResult1, \vImage1, \vecRealB, \vImageResult1
vfmadb \vRealResult2, \vImage2, \vecImageB, \vRealResult2
vfmsdb \vImageResult2, \vImage2, \vecRealB, \vImageResult2
vfmadb \vRealResult1, \vReal1, \vecRealB, \vRealResult1
vfmsdb \vImageResult1, \vReal1, \vecImageB, \vImageResult1
vfmadb \vRealResult2, \vReal2, \vecRealB, \vRealResult2
vfmsdb \vImageResult2, \vReal2, \vecImageB, \vImageResult2
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
vfmsdb \vRealResult1, \vReal1, \vecRealB, \vRealResult1
vfmadb \vImageResult1, \vImage1, \vecRealB, \vImageResult1
vfmsdb \vRealResult2, \vReal2, \vecRealB, \vRealResult2
vfmadb \vImageResult2, \vImage2, \vecRealB, \vImageResult2
vfmsdb \vRealResult1, \vImage1, \vecImageB, \vRealResult1
vfmadb \vImageResult1, \vReal1, \vecImageB, \vImageResult1
vfmsdb \vRealResult2, \vImage2, \vecImageB, \vRealResult2
vfmadb \vImageResult2, \vReal2, \vecImageB, \vImageResult2
#endif
.endm
.macro CalcComplex_1x1 RealResult1, ImageResult1, Real1, Image1, RealB, ImageB
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
msebr \RealResult1, \Image1, \ImageB
maebr \ImageResult1, \Real1, \ImageB
msebr \RealResult1, \Real1, \RealB
maebr \ImageResult1, \Image1, \RealB
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
maebr \RealResult1, \Image1, \ImageB
msebr \ImageResult1, \Real1, \ImageB
maebr \RealResult1, \Real1, \RealB
msebr \ImageResult1, \Image1, \RealB
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
maebr \RealResult1, \Image1, \ImageB
msebr \ImageResult1, \Image1, \RealB
maebr \RealResult1, \Real1, \RealB
msebr \ImageResult1, \Real1, \ImageB
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
msebr \RealResult1, \Real1, \RealB
maebr \ImageResult1, \Image1, \RealB
msebr \RealResult1, \Image1, \ImageB
maebr \ImageResult1, \Real1, \ImageB
#endif
.endm
#define DISP(ind,stride,disp) (ind*stride+disp)
#define DISP64(ind,disp) (ind*32+disp)
#define DISP32(ind,disp) (ind*16+disp)
#define DISP16(ind,disp) (ind*8+disp)
#define unit_size 8
#define DISP8(ind,disp) (ind*unit_size*8+disp)
#define DISP4(ind,disp) (ind*unit_size*4+disp)
#define DISP2(ind,disp) (ind*unit_size*2+disp)
#define DISP1(ind,disp) (ind*unit_size+disp)
#define N8 (8*unit_size)
#define N4 (4*unit_size)
#define N2 (2*unit_size)
#define N1 (1*unit_size)
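/* Example: unit_size is 8 bytes (one single-precision complex = 2*4 bytes), so              */
/* DISP2(ind,disp)=ind*16+disp steps over 2 complex elements per Index step and              */
/* DISP4(ind,disp)=ind*32+disp over 4; e.g. DISP4(1,8)=40 is the byte offset of the          */
/* second complex element of the second unrolled group.                                      */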
.macro ZCALC_4x4_I PTR_A_REG,PTR_B_REG,Index,IsLast
vlef %v1, DISP4(\Index,0)(\PTR_A_REG),0
vlef %v5, DISP4(\Index,4)(\PTR_A_REG),0
vlef %v1, DISP4(\Index,8)(\PTR_A_REG),2
vlef %v5, DISP4(\Index,12)(\PTR_A_REG),2
vlef %v3, DISP4(\Index,16)(\PTR_A_REG),0
vlef %v7, DISP4(\Index,20)(\PTR_A_REG),0
vlef %v3, DISP4(\Index,24)(\PTR_A_REG),2
vlef %v7, DISP4(\Index,28)(\PTR_A_REG),2
vlrepf %v9, DISP4(\Index,0)(\PTR_B_REG)
vlrepf %v10, DISP4(\Index,4)(\PTR_B_REG)
vlrepf %v11, DISP4(\Index,8)(\PTR_B_REG)
vlrepf %v12, DISP4(\Index,12)(\PTR_B_REG)
vldeb %v1,%v1
vldeb %v5,%v5
vldeb %v3,%v3
vldeb %v7,%v7
vldeb %v9,%v9
vldeb %v10,%v10
vldeb %v11,%v11
vldeb %v12,%v12
CalcComplex_4x2 %v16,%v17,%v18,%v19,%v20,%v21,%v22,%v23,%v1,%v5,%v3,%v7,%v9,%v10,%v11,%v12
vlrepf %v9, DISP4(\Index,16)(\PTR_B_REG)
vlrepf %v10, DISP4(\Index,20)(\PTR_B_REG)
vlrepf %v11, DISP4(\Index,24)(\PTR_B_REG)
vlrepf %v12, DISP4(\Index,28)(\PTR_B_REG)
vldeb %v9,%v9
vldeb %v10,%v10
vldeb %v11,%v11
vldeb %v12,%v12
.if \IsLast==1
la \PTR_A_REG, DISP4(\Index,32)(\PTR_A_REG)
.endif
CalcComplex_4x2 %v24,%v25,%v26,%v27,%v28,%v29,%v30,%v31,%v1,%v5,%v3,%v7,%v9,%v10,%v11,%v12
.if \IsLast==1
la \PTR_B_REG, DISP4(\Index,32)(\PTR_B_REG)
.endif
.endm
.macro ZCALC_4x2_I PTR_A_REG,PTR_B_REG,Index,IsLast
vlef %v1, DISP4(\Index,0)(\PTR_A_REG),0
vlef %v5, DISP4(\Index,4)(\PTR_A_REG),0
vlef %v1, DISP4(\Index,8)(\PTR_A_REG),2
vlef %v5, DISP4(\Index,12)(\PTR_A_REG),2
vlef %v3, DISP4(\Index,16)(\PTR_A_REG),0
vlef %v7, DISP4(\Index,20)(\PTR_A_REG),0
vlef %v3, DISP4(\Index,24)(\PTR_A_REG),2
vlef %v7, DISP4(\Index,28)(\PTR_A_REG),2
vlrepf %v9, DISP2(\Index,0)(\PTR_B_REG)
vlrepf %v10, DISP2(\Index,4)(\PTR_B_REG)
vlrepf %v11, DISP2(\Index,8)(\PTR_B_REG)
vlrepf %v12, DISP2(\Index,12)(\PTR_B_REG)
vldeb %v1,%v1
vldeb %v5,%v5
vldeb %v3,%v3
vldeb %v7,%v7
vldeb %v9,%v9
vldeb %v10,%v10
vldeb %v11,%v11
vldeb %v12,%v12
.if \IsLast==1
la \PTR_A_REG, DISP4(\Index,32)(\PTR_A_REG)
.endif
CalcComplex_4x2 %v16,%v17,%v18,%v19,%v20,%v21,%v22,%v23,%v1,%v5,%v3,%v7,%v9,%v10,%v11,%v12
.if \IsLast==1
la \PTR_B_REG, DISP2(\Index,16)(\PTR_B_REG)
.endif
.endm
.macro ZCALC_2x4_I PTR_A_REG,PTR_B_REG,Index,IsLast
vlef %v1, DISP4(\Index,0)(\PTR_B_REG),0
vlef %v5, DISP4(\Index,4)(\PTR_B_REG),0
vlef %v1, DISP4(\Index,8)(\PTR_B_REG),2
vlef %v5, DISP4(\Index,12)(\PTR_B_REG),2
vlef %v3, DISP4(\Index,16)(\PTR_B_REG),0
vlef %v7, DISP4(\Index,20)(\PTR_B_REG),0
vlef %v3, DISP4(\Index,24)(\PTR_B_REG),2
vlef %v7, DISP4(\Index,28)(\PTR_B_REG),2
vlrepf %v9, DISP2(\Index,0)(\PTR_A_REG)
vlrepf %v10, DISP2(\Index,4)(\PTR_A_REG)
vlrepf %v11, DISP2(\Index,8)(\PTR_A_REG)
vlrepf %v12, DISP2(\Index,12)(\PTR_A_REG)
vldeb %v1,%v1
vldeb %v5,%v5
vldeb %v3,%v3
vldeb %v7,%v7
vldeb %v9,%v9
vldeb %v10,%v10
vldeb %v11,%v11
vldeb %v12,%v12
.if \IsLast==1
la \PTR_B_REG, DISP4(\Index,32)(\PTR_B_REG)
.endif
CalcComplex_2x4 %v16,%v17,%v18,%v19,%v20,%v21,%v22,%v23,%v1,%v5,%v3,%v7,%v9,%v10,%v11,%v12
.if \IsLast==1
la \PTR_A_REG, DISP2(\Index,16)(\PTR_A_REG)
.endif
.endm
.macro ZCALC_4x1_I PTR_A_REG,PTR_B_REG,Index,IsLast
vlef %v1, DISP4(\Index,0)(\PTR_A_REG),0
vlef %v5, DISP4(\Index,4)(\PTR_A_REG),0
vlef %v1, DISP4(\Index,8)(\PTR_A_REG),2
vlef %v5, DISP4(\Index,12)(\PTR_A_REG),2
vlef %v3, DISP4(\Index,16)(\PTR_A_REG),0
vlef %v7, DISP4(\Index,20)(\PTR_A_REG),0
vlef %v3, DISP4(\Index,24)(\PTR_A_REG),2
vlef %v7, DISP4(\Index,28)(\PTR_A_REG),2
vlrepf %v9, DISP1(\Index,0)(\PTR_B_REG)
vlrepf %v10, DISP1(\Index,4)(\PTR_B_REG)
vldeb %v1,%v1
vldeb %v5,%v5
vldeb %v3,%v3
vldeb %v7,%v7
vldeb %v9,%v9
vldeb %v10,%v10
.if \IsLast==1
la \PTR_A_REG, DISP4(\Index,32)(\PTR_A_REG)
.endif
CalcComplex_4x1 %v16,%v17,%v18,%v19,%v1,%v5,%v3,%v7,%v9,%v10
.if \IsLast==1
la \PTR_B_REG, DISP1(\Index,8)(\PTR_B_REG)
.endif
.endm
.macro ZCALC_1x4_I PTR_A_REG,PTR_B_REG,Index,IsLast
vlef %v1, DISP4(\Index,0)(\PTR_B_REG),0
vlef %v5, DISP4(\Index,4)(\PTR_B_REG),0
vlef %v1, DISP4(\Index,8)(\PTR_B_REG),2
vlef %v5, DISP4(\Index,12)(\PTR_B_REG),2
vlef %v3, DISP4(\Index,16)(\PTR_B_REG),0
vlef %v7, DISP4(\Index,20)(\PTR_B_REG),0
vlef %v3, DISP4(\Index,24)(\PTR_B_REG),2
vlef %v7, DISP4(\Index,28)(\PTR_B_REG),2
vlrepf %v9, DISP1(\Index,0)(\PTR_A_REG)
vlrepf %v10, DISP1(\Index,4)(\PTR_A_REG)
vldeb %v1,%v1
vldeb %v5,%v5
vldeb %v3,%v3
vldeb %v7,%v7
vldeb %v9,%v9
vldeb %v10,%v10
.if \IsLast==1
la \PTR_B_REG, DISP4(\Index,32)(\PTR_B_REG)
.endif
CalcComplex_1x4 %v16,%v17,%v18,%v19,%v1,%v5,%v3,%v7,%v9,%v10
.if \IsLast==1
la \PTR_A_REG, DISP1(\Index,8)(\PTR_A_REG)
.endif
.endm
.macro ZCALC_2x2_I PTR_A_REG,PTR_B_REG,Index,IsLast
vlef %v1, DISP2(\Index,0)(\PTR_A_REG),0
vlef %v5, DISP2(\Index,4)(\PTR_A_REG),0
vlef %v1, DISP2(\Index,8)(\PTR_A_REG),2
vlef %v5, DISP2(\Index,12)(\PTR_A_REG),2
vlrepf %v9, DISP2(\Index,0)(\PTR_B_REG)
vlrepf %v10, DISP2(\Index,4)(\PTR_B_REG)
vlrepf %v11, DISP2(\Index,8)(\PTR_B_REG)
vlrepf %v12, DISP2(\Index,12)(\PTR_B_REG)
vldeb %v1,%v1
vldeb %v5,%v5
vldeb %v9,%v9
vldeb %v10,%v10
vldeb %v11,%v11
vldeb %v12,%v12
.if \IsLast==1
la \PTR_A_REG, DISP2(\Index,16)(\PTR_A_REG)
.endif
CalcComplex_2x2 %v16,%v17,%v20,%v21,%v1,%v5,%v9,%v10,%v11,%v12
.if \IsLast==1
la \PTR_B_REG, DISP2(\Index,16)(\PTR_B_REG)
.endif
.endm
.macro ZCALC_2x1_I PTR_A_REG,PTR_B_REG,Index,IsLast
vlef %v1, DISP2(\Index,0)(\PTR_A_REG),0
vlef %v5, DISP2(\Index,4)(\PTR_A_REG),0
vlef %v1, DISP2(\Index,8)(\PTR_A_REG),2
vlef %v5, DISP2(\Index,12)(\PTR_A_REG),2
vlrepf %v9, DISP1(\Index,0)(\PTR_B_REG)
vlrepf %v10, DISP1(\Index,4)(\PTR_B_REG)
vldeb %v1,%v1
vldeb %v5,%v5
vldeb %v9,%v9
vldeb %v10,%v10
.if \IsLast==1
la \PTR_A_REG, DISP2(\Index,16)(\PTR_A_REG)
.endif
CalcComplex_2x1 %v16,%v17,%v1,%v5,%v9,%v10
.if \IsLast==1
la \PTR_B_REG, DISP1(\Index,8)(\PTR_B_REG)
.endif
.endm
.macro ZCALC_1x2_I PTR_A_REG,PTR_B_REG,Index,IsLast
vlef %v1, DISP2(\Index,0)(\PTR_B_REG),0
vlef %v5, DISP2(\Index,4)(\PTR_B_REG),0
vlef %v1, DISP2(\Index,8)(\PTR_B_REG),2
vlef %v5, DISP2(\Index,12)(\PTR_B_REG),2
vlrepf %v9, DISP1(\Index,0)(\PTR_A_REG)
vlrepf %v10, DISP1(\Index,4)(\PTR_A_REG)
vldeb %v1,%v1
vldeb %v5,%v5
vldeb %v9,%v9
vldeb %v10,%v10
.if \IsLast==1
la \PTR_B_REG, DISP2(\Index,16)(\PTR_B_REG)
.endif
CalcComplex_1x2 %v16,%v17,%v1,%v5,%v9,%v10
.if \IsLast==1
la \PTR_A_REG, DISP1(\Index,8)(\PTR_A_REG)
.endif
.endm
.macro ZCALC_1x1_I PTR_A_REG,PTR_B_REG,Index,IsLast
le %f1, DISP1(\Index,0)(\PTR_A_REG)
le %f3, DISP1(\Index,4)(\PTR_A_REG)
le %f4, DISP1(\Index,0)(\PTR_B_REG)
le %f5, DISP1(\Index,4)(\PTR_B_REG)
.if \IsLast==1
la \PTR_A_REG, DISP1(\Index,8)(\PTR_A_REG)
.endif
CalcComplex_1x1 %f6,%f7,%f1,%f3,%f4,%f5
.if \IsLast==1
la \PTR_B_REG, DISP1(\Index,8)(\PTR_B_REG)
.endif
.endm
.macro ZCALC_4x4 PTR_A_REG,PTR_B_REG
ZCALC_4x4_I \PTR_A_REG,\PTR_B_REG,0,1
.endm
.macro ZCALC_4x2 PTR_A_REG,PTR_B_REG
ZCALC_4x2_I \PTR_A_REG,\PTR_B_REG,0,1
.endm
.macro ZCALC_4x1 PTR_A_REG,PTR_B_REG
ZCALC_4x1_I \PTR_A_REG,\PTR_B_REG,0,1
.endm
.macro ZCALC_4x4_4 PTR_A_REG,PTR_B_REG
ZCALC_4x4_I \PTR_A_REG,\PTR_B_REG,0,0
ZCALC_4x4_I \PTR_A_REG,\PTR_B_REG,1,0
ZCALC_4x4_I \PTR_A_REG,\PTR_B_REG,2,0
ZCALC_4x4_I \PTR_A_REG,\PTR_B_REG,3,1
.endm
.macro ZCALC_4x2_4 PTR_A_REG,PTR_B_REG
ZCALC_4x2_I \PTR_A_REG,\PTR_B_REG,0,0
ZCALC_4x2_I \PTR_A_REG,\PTR_B_REG,1,0
ZCALC_4x2_I \PTR_A_REG,\PTR_B_REG,2,0
ZCALC_4x2_I \PTR_A_REG,\PTR_B_REG,3,1
.endm
.macro ZCALC_4x1_4 PTR_A_REG,PTR_B_REG
ZCALC_4x1_I \PTR_A_REG,\PTR_B_REG,0,0
ZCALC_4x1_I \PTR_A_REG,\PTR_B_REG,1,0
ZCALC_4x1_I \PTR_A_REG,\PTR_B_REG,2,0
ZCALC_4x1_I \PTR_A_REG,\PTR_B_REG,3,1
.endm
.macro ZCALC_2x4_4 PTR_A_REG,PTR_B_REG
ZCALC_2x4_I \PTR_A_REG,\PTR_B_REG,0,0
ZCALC_2x4_I \PTR_A_REG,\PTR_B_REG,1,0
ZCALC_2x4_I \PTR_A_REG,\PTR_B_REG,2,0
ZCALC_2x4_I \PTR_A_REG,\PTR_B_REG,3,1
.endm
.macro ZCALC_2x4 PTR_A_REG,PTR_B_REG
ZCALC_2x4_I \PTR_A_REG,\PTR_B_REG,0,1
.endm
.macro ZCALC_1x4_4 PTR_A_REG,PTR_B_REG
ZCALC_1x4_I \PTR_A_REG,\PTR_B_REG,0,0
ZCALC_1x4_I \PTR_A_REG,\PTR_B_REG,1,0
ZCALC_1x4_I \PTR_A_REG,\PTR_B_REG,2,0
ZCALC_1x4_I \PTR_A_REG,\PTR_B_REG,3,1
.endm
.macro ZCALC_1x4 PTR_A_REG,PTR_B_REG
ZCALC_1x4_I \PTR_A_REG,\PTR_B_REG,0,1
.endm
.macro ZCALC_2x2 PTR_A_REG,PTR_B_REG
ZCALC_2x2_I \PTR_A_REG,\PTR_B_REG,0,1
.endm
.macro ZCALC_2x2_4 PTR_A_REG,PTR_B_REG
ZCALC_2x2_I \PTR_A_REG,\PTR_B_REG,0,0
ZCALC_2x2_I \PTR_A_REG,\PTR_B_REG,1,0
ZCALC_2x2_I \PTR_A_REG,\PTR_B_REG,2,0
ZCALC_2x2_I \PTR_A_REG,\PTR_B_REG,3,1
.endm
.macro ZCALC_2x1 PTR_A_REG,PTR_B_REG
ZCALC_2x1_I \PTR_A_REG,\PTR_B_REG,0,1
.endm
.macro ZCALC_2x1_4 PTR_A_REG,PTR_B_REG
ZCALC_2x1_I \PTR_A_REG,\PTR_B_REG,0,0
ZCALC_2x1_I \PTR_A_REG,\PTR_B_REG,1,0
ZCALC_2x1_I \PTR_A_REG,\PTR_B_REG,2,0
ZCALC_2x1_I \PTR_A_REG,\PTR_B_REG,3,1
.endm
.macro ZCALC_1x2_4 PTR_A_REG,PTR_B_REG
ZCALC_1x2_I \PTR_A_REG,\PTR_B_REG,0,0
ZCALC_1x2_I \PTR_A_REG,\PTR_B_REG,1,0
ZCALC_1x2_I \PTR_A_REG,\PTR_B_REG,2,0
ZCALC_1x2_I \PTR_A_REG,\PTR_B_REG,3,1
.endm
.macro ZCALC_1x2 PTR_A_REG,PTR_B_REG
ZCALC_1x2_I \PTR_A_REG,\PTR_B_REG,0,1
.endm
.macro ZCALC_1x1_4 PTR_A_REG,PTR_B_REG
ZCALC_1x1_I \PTR_A_REG,\PTR_B_REG,0,0
ZCALC_1x1_I \PTR_A_REG,\PTR_B_REG,1,0
ZCALC_1x1_I \PTR_A_REG,\PTR_B_REG,2,0
ZCALC_1x1_I \PTR_A_REG,\PTR_B_REG,3,1
.endm
.macro ZCALC_1x1 PTR_A_REG,PTR_B_REG
ZCALC_1x1_I \PTR_A_REG,\PTR_B_REG,0,1
.endm
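/* The *_4 wrappers unroll four k-iterations, passing Index as a compile-time offset so all   */
/* displacements stay fixed; only the step with IsLast==1 advances the A/B pointers past all  */
/* four groups. Roughly, in C:                                                                */
/*   for (k = 0; k < 4; k++) acc += a[k]*b[k];   // Index = k, fixed displacements            */
/*   a += 4; b += 4;                             // done once, on the IsLast step             */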
/*****************************STORE RESULTS************************************/
.macro CalcMultAlpha_4x1 vRealResult1, vImageResult1, vRealResult2, vImageResult2, vReal1, vImage1, vReal2, vImage2, vecRealB, vecImageB
#if defined (TRMMKERNEL)
vfmdb \vRealResult1, \vImage1, \vecImageB
vfmdb \vImageResult1, \vReal1, \vecImageB
vfmdb \vRealResult2, \vImage2, \vecImageB
vfmdb \vImageResult2, \vReal2, \vecImageB
#else
vfmsdb \vRealResult1, \vImage1, \vecImageB, \vRealResult1
vfmadb \vImageResult1, \vReal1, \vecImageB, \vImageResult1
vfmsdb \vRealResult2, \vImage2, \vecImageB, \vRealResult2
vfmadb \vImageResult2, \vReal2, \vecImageB, \vImageResult2
#endif
vfmsdb \vRealResult1, \vReal1, \vecRealB, \vRealResult1
vfmadb \vImageResult1, \vImage1, \vecRealB, \vImageResult1
vfmsdb \vRealResult2, \vReal2, \vecRealB, \vRealResult2
vfmadb \vImageResult2, \vImage2, \vecRealB, \vImageResult2
.endm
.macro CalcMultAlpha_2x1 vRealResult1, vImageResult1, vReal1, vImage1, vecRealB, vecImageB
#if defined (TRMMKERNEL)
vfmdb \vRealResult1, \vImage1, \vecImageB
vfmdb \vImageResult1, \vReal1, \vecImageB
#else
vfmsdb \vRealResult1, \vImage1, \vecImageB, \vRealResult1
vfmadb \vImageResult1, \vReal1, \vecImageB, \vImageResult1
#endif
vfmsdb \vRealResult1, \vReal1, \vecRealB, \vRealResult1
vfmadb \vImageResult1, \vImage1, \vecRealB, \vImageResult1
.endm
.macro CalcMultAlpha_1x1 RealResult1, ImageResult1, Real1, Image1, RealB, ImageB
msebr \RealResult1, \Image1, \ImageB
maebr \ImageResult1, \Real1, \ImageB
msebr \RealResult1, \Real1, \RealB
maebr \ImageResult1, \Image1, \RealB
.endm
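/* Per element these macros compute C = alpha*result [+ C], with result (r,i) and alpha=(aR,aI). */
/* In C terms (a sketch of the non-TRMM path; TRMMKERNEL overwrites C instead of accumulating):  */
/*   cr = i*aI - cr;  cr = r*aR - cr;   // cr_new = r*aR - i*aI + cr_old                         */
/*   ci = r*aI + ci;  ci = i*aR + ci;   // ci_new = i*aR + r*aI + ci_old                         */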
.macro ZSTORE_4x4 ALPHA_VECREG,ALPHA_VECI,CIJ_REG,LDC_BYTE_ORIGINAL,LC1,LC2
#if !defined(TRMMKERNEL)
vlef %v3, 0(\CIJ_REG),0
vlef %v4, 4(\CIJ_REG),0
vlef %v3, 8(\CIJ_REG),2
vlef %v4, 12(\CIJ_REG),2
vlef %v5, 16(\CIJ_REG),0
vlef %v6, 20(\CIJ_REG),0
vlef %v5, 24(\CIJ_REG),2
vlef %v6, 28(\CIJ_REG),2
vldeb %v3,%v3
vldeb %v4,%v4
vldeb %v5,%v5
vldeb %v6,%v6
#endif
la \LC1,0(\LDC_BYTE_ORIGINAL,\LDC_BYTE_ORIGINAL)
CalcMultAlpha_4x1 %v3,%v4,%v5,%v6,%v16,%v17,%v18,%v19,\ALPHA_VECREG,\ALPHA_VECI
vledb %v3, %v3,0,0
vledb %v4, %v4,0,0
vledb %v5, %v5,0,0
vledb %v6, %v6,0,0
vstef %v3, 0(\CIJ_REG),0
vstef %v4, 4(\CIJ_REG),0
vstef %v3, 8(\CIJ_REG),2
vstef %v4, 12(\CIJ_REG),2
vstef %v5, 16(\CIJ_REG),0
vstef %v6, 20(\CIJ_REG),0
vstef %v5, 24(\CIJ_REG),2
vstef %v6, 28(\CIJ_REG),2
la \LC2,0(\LC1,\LDC_BYTE_ORIGINAL)
#if !defined(TRMMKERNEL)
vlef %v16, 0(\CIJ_REG,\LDC_BYTE_ORIGINAL),0
vlef %v17, 4(\CIJ_REG,\LDC_BYTE_ORIGINAL),0
vlef %v16, 8(\CIJ_REG,\LDC_BYTE_ORIGINAL),2
vlef %v17, 12(\CIJ_REG,\LDC_BYTE_ORIGINAL),2
vlef %v18, 16(\CIJ_REG,\LDC_BYTE_ORIGINAL),0
vlef %v19, 20(\CIJ_REG,\LDC_BYTE_ORIGINAL),0
vlef %v18, 24(\CIJ_REG,\LDC_BYTE_ORIGINAL),2
vlef %v19, 28(\CIJ_REG,\LDC_BYTE_ORIGINAL),2
vldeb %v16,%v16
vldeb %v17,%v17
vldeb %v18,%v18
vldeb %v19,%v19
#endif
CalcMultAlpha_4x1 %v16,%v17,%v18,%v19,%v20,%v21,%v22,%v23,\ALPHA_VECREG,\ALPHA_VECI
vledb %v16, %v16,0,0
vledb %v17, %v17,0,0
vledb %v18, %v18,0,0
vledb %v19, %v19,0,0
vstef %v16, 0(\CIJ_REG,\LDC_BYTE_ORIGINAL),0
vstef %v17, 4(\CIJ_REG,\LDC_BYTE_ORIGINAL),0
vstef %v16, 8(\CIJ_REG,\LDC_BYTE_ORIGINAL),2
vstef %v17, 12(\CIJ_REG,\LDC_BYTE_ORIGINAL),2
vstef %v18, 16(\CIJ_REG,\LDC_BYTE_ORIGINAL),0
vstef %v19, 20(\CIJ_REG,\LDC_BYTE_ORIGINAL),0
vstef %v18, 24(\CIJ_REG,\LDC_BYTE_ORIGINAL),2
vstef %v19, 28(\CIJ_REG,\LDC_BYTE_ORIGINAL),2
#if !defined(TRMMKERNEL)
vlef %v3, 0(\CIJ_REG,\LC1),0
vlef %v4, 4(\CIJ_REG,\LC1),0
vlef %v3, 8(\CIJ_REG,\LC1),2
vlef %v4, 12(\CIJ_REG,\LC1),2
vlef %v5, 16(\CIJ_REG,\LC1),0
vlef %v6, 20(\CIJ_REG,\LC1),0
vlef %v5, 24(\CIJ_REG,\LC1),2
vlef %v6, 28(\CIJ_REG,\LC1),2
vldeb %v3,%v3
vldeb %v4,%v4
vldeb %v5,%v5
vldeb %v6,%v6
#endif
CalcMultAlpha_4x1 %v3,%v4,%v5,%v6,%v24,%v25,%v26,%v27,\ALPHA_VECREG,\ALPHA_VECI
vledb %v3, %v3,0,0
vledb %v4, %v4,0,0
vledb %v5, %v5,0,0
vledb %v6, %v6,0,0
vstef %v3, 0(\CIJ_REG,\LC1),0
vstef %v4, 4(\CIJ_REG,\LC1),0
vstef %v3, 8(\CIJ_REG,\LC1),2
vstef %v4, 12(\CIJ_REG,\LC1),2
vstef %v5, 16(\CIJ_REG,\LC1),0
vstef %v6, 20(\CIJ_REG,\LC1),0
vstef %v5, 24(\CIJ_REG,\LC1),2
vstef %v6, 28(\CIJ_REG,\LC1),2
#if !defined(TRMMKERNEL)
vlef %v16, 0(\CIJ_REG,\LC2),0
vlef %v17, 4(\CIJ_REG,\LC2),0
vlef %v16, 8(\CIJ_REG,\LC2),2
vlef %v17, 12(\CIJ_REG,\LC2),2
vlef %v18, 16(\CIJ_REG,\LC2),0
vlef %v19, 20(\CIJ_REG,\LC2),0
vlef %v18, 24(\CIJ_REG,\LC2),2
vlef %v19, 28(\CIJ_REG,\LC2),2
vldeb %v16,%v16
vldeb %v17,%v17
vldeb %v18,%v18
vldeb %v19,%v19
#endif
CalcMultAlpha_4x1 %v16,%v17,%v18,%v19,%v28,%v29,%v30,%v31,\ALPHA_VECREG,\ALPHA_VECI
vledb %v16, %v16,0,0
vledb %v17, %v17,0,0
vledb %v18, %v18,0,0
vledb %v19, %v19,0,0
vstef %v16, 0(\CIJ_REG,\LC2),0
vstef %v17, 4(\CIJ_REG,\LC2),0
vstef %v16, 8(\CIJ_REG,\LC2),2
vstef %v17, 12(\CIJ_REG,\LC2),2
vstef %v18, 16(\CIJ_REG,\LC2),0
vstef %v19, 20(\CIJ_REG,\LC2),0
vstef %v18, 24(\CIJ_REG,\LC2),2
vstef %v19, 28(\CIJ_REG,\LC2),2
la \CIJ_REG,32(\CIJ_REG)
.endm
.macro ZSTORE_4x2 ALPHA_VECREG,ALPHA_VECI,CIJ_REG,LDC_BYTE_ORIGINAL
#if !defined(TRMMKERNEL)
vlef %v3, 0(\CIJ_REG),0
vlef %v4, 4(\CIJ_REG),0
vlef %v3, 8(\CIJ_REG),2
vlef %v4, 12(\CIJ_REG),2
vlef %v5, 16(\CIJ_REG),0
vlef %v6, 20(\CIJ_REG),0
vlef %v5, 24(\CIJ_REG),2
vlef %v6, 28(\CIJ_REG),2
vldeb %v3,%v3
vldeb %v4,%v4
vldeb %v5,%v5
vldeb %v6,%v6
#endif
CalcMultAlpha_4x1 %v3,%v4,%v5,%v6,%v16,%v17,%v18,%v19,\ALPHA_VECREG,\ALPHA_VECI
vledb %v3, %v3,0,0
vledb %v4, %v4,0,0
vledb %v5, %v5,0,0
vledb %v6, %v6,0,0
vstef %v3, 0(\CIJ_REG),0
vstef %v4, 4(\CIJ_REG),0
vstef %v3, 8(\CIJ_REG),2
vstef %v4, 12(\CIJ_REG),2
vstef %v5, 16(\CIJ_REG),0
vstef %v6, 20(\CIJ_REG),0
vstef %v5, 24(\CIJ_REG),2
vstef %v6, 28(\CIJ_REG),2
#if !defined(TRMMKERNEL)
vlef %v16, 0(\CIJ_REG,\LDC_BYTE_ORIGINAL),0
vlef %v17, 4(\CIJ_REG,\LDC_BYTE_ORIGINAL),0
vlef %v16, 8(\CIJ_REG,\LDC_BYTE_ORIGINAL),2
vlef %v17, 12(\CIJ_REG,\LDC_BYTE_ORIGINAL),2
vlef %v18, 16(\CIJ_REG,\LDC_BYTE_ORIGINAL),0
vlef %v19, 20(\CIJ_REG,\LDC_BYTE_ORIGINAL),0
vlef %v18, 24(\CIJ_REG,\LDC_BYTE_ORIGINAL),2
vlef %v19, 28(\CIJ_REG,\LDC_BYTE_ORIGINAL),2
vldeb %v16,%v16
vldeb %v17,%v17
vldeb %v18,%v18
vldeb %v19,%v19
#endif
CalcMultAlpha_4x1 %v16,%v17,%v18,%v19,%v20,%v21,%v22,%v23,\ALPHA_VECREG,\ALPHA_VECI
vledb %v16, %v16,0,0
vledb %v17, %v17,0,0
vledb %v18, %v18,0,0
vledb %v19, %v19,0,0
vstef %v16, 0(\CIJ_REG,\LDC_BYTE_ORIGINAL),0
vstef %v17, 4(\CIJ_REG,\LDC_BYTE_ORIGINAL),0
vstef %v16, 8(\CIJ_REG,\LDC_BYTE_ORIGINAL),2
vstef %v17, 12(\CIJ_REG,\LDC_BYTE_ORIGINAL),2
vstef %v18, 16(\CIJ_REG,\LDC_BYTE_ORIGINAL),0
vstef %v19, 20(\CIJ_REG,\LDC_BYTE_ORIGINAL),0
vstef %v18, 24(\CIJ_REG,\LDC_BYTE_ORIGINAL),2
vstef %v19, 28(\CIJ_REG,\LDC_BYTE_ORIGINAL),2
la \CIJ_REG,32(\CIJ_REG)
.endm
.macro ZSTORE_4x1 ALPHA_VECREG,ALPHA_VECI,CIJ_REG,LDC_BYTE_ORIGINAL
#if !defined(TRMMKERNEL)
vlef %v3, 0(\CIJ_REG),0
vlef %v4, 4(\CIJ_REG),0
vlef %v3, 8(\CIJ_REG),2
vlef %v4, 12(\CIJ_REG),2
vlef %v5, 16(\CIJ_REG),0
vlef %v6, 20(\CIJ_REG),0
vlef %v5, 24(\CIJ_REG),2
vlef %v6, 28(\CIJ_REG),2
vldeb %v3,%v3
vldeb %v4,%v4
vldeb %v5,%v5
vldeb %v6,%v6
#endif
CalcMultAlpha_4x1 %v3,%v4,%v5,%v6,%v16,%v17,%v18,%v19,\ALPHA_VECREG,\ALPHA_VECI
vledb %v3, %v3,0,0
vledb %v4, %v4,0,0
vledb %v5, %v5,0,0
vledb %v6, %v6,0,0
vstef %v3, 0(\CIJ_REG),0
vstef %v4, 4(\CIJ_REG),0
vstef %v3, 8(\CIJ_REG),2
vstef %v4, 12(\CIJ_REG),2
vstef %v5, 16(\CIJ_REG),0
vstef %v6, 20(\CIJ_REG),0
vstef %v5, 24(\CIJ_REG),2
vstef %v6, 28(\CIJ_REG),2
la \CIJ_REG,32(\CIJ_REG)
.endm
.macro ZSTORE_1x4 ALPHA_VECREG,ALPHA_VECI,CIJ_REG,LDC_BYTE_ORIGINAL,LC1,LC2
#if !defined(TRMMKERNEL)
la \LC1,0(\LDC_BYTE_ORIGINAL,\LDC_BYTE_ORIGINAL)
vlef %v3, 0(\CIJ_REG),0
vlef %v4, 4(\CIJ_REG),0
vlef %v3, 0(\CIJ_REG,\LDC_BYTE_ORIGINAL),2
vlef %v4, 4(\CIJ_REG,\LDC_BYTE_ORIGINAL),2
la \LC2,0(\LC1,\LDC_BYTE_ORIGINAL)
vlef %v5, 0(\CIJ_REG,\LC1),0
vlef %v6, 4(\CIJ_REG,\LC1),0
vlef %v5, 0(\CIJ_REG,\LC2),2
vlef %v6, 4(\CIJ_REG,\LC2),2
vldeb %v3,%v3
vldeb %v4,%v4
vldeb %v5,%v5
vldeb %v6,%v6
#else
la \LC1,0(\LDC_BYTE_ORIGINAL,\LDC_BYTE_ORIGINAL)
#endif
CalcMultAlpha_4x1 %v3,%v4,%v5,%v6,%v16,%v17,%v18,%v19,\ALPHA_VECREG,\ALPHA_VECI
#if defined(TRMMKERNEL)
la \LC2,0(\LC1,\LDC_BYTE_ORIGINAL)
#endif
vledb %v3, %v3,0,0
vledb %v4, %v4,0,0
vledb %v5, %v5,0,0
vledb %v6, %v6,0,0
vstef %v3, 0(\CIJ_REG),0
vstef %v4, 4(\CIJ_REG),0
vstef %v3, 0(\CIJ_REG,\LDC_BYTE_ORIGINAL),2
vstef %v4, 4(\CIJ_REG,\LDC_BYTE_ORIGINAL),2
vstef %v5, 0(\CIJ_REG,\LC1),0
vstef %v6, 4(\CIJ_REG,\LC1),0
vstef %v5, 0(\CIJ_REG,\LC2),2
vstef %v6, 4(\CIJ_REG,\LC2),2
la \CIJ_REG,8(\CIJ_REG)
.endm
.macro ZSTORE_2x4 ALPHA_VECREG,ALPHA_VECI,CIJ_REG,LDC_BYTE_ORIGINAL,LC1,LC2
#if !defined(TRMMKERNEL)
la \LC1,0(\LDC_BYTE_ORIGINAL,\LDC_BYTE_ORIGINAL)
vlef %v3, 0(\CIJ_REG),0
vlef %v4, 4(\CIJ_REG),0
vlef %v24, 8(\CIJ_REG),0
vlef %v25, 12(\CIJ_REG),0
vlef %v3, 0(\CIJ_REG,\LDC_BYTE_ORIGINAL),2
vlef %v4, 4(\CIJ_REG,\LDC_BYTE_ORIGINAL),2
vlef %v24, 8(\CIJ_REG,\LDC_BYTE_ORIGINAL),2
vlef %v25, 12(\CIJ_REG,\LDC_BYTE_ORIGINAL),2
la \LC2,0(\LC1,\LDC_BYTE_ORIGINAL)
vlef %v5, 0(\CIJ_REG,\LC1),0
vlef %v6, 4(\CIJ_REG,\LC1),0
vlef %v26, 8(\CIJ_REG,\LC1),0
vlef %v27, 12(\CIJ_REG,\LC1),0
vlef %v5, 0(\CIJ_REG,\LC2),2
vlef %v6, 4(\CIJ_REG,\LC2),2
vlef %v26, 8(\CIJ_REG,\LC2),2
vlef %v27, 12(\CIJ_REG,\LC2),2
vldeb %v3,%v3
vldeb %v4,%v4
vldeb %v5,%v5
vldeb %v6,%v6
vldeb %v24,%v24
vldeb %v25,%v25
vldeb %v26,%v26
vldeb %v27,%v27
#else
la \LC1,0(\LDC_BYTE_ORIGINAL,\LDC_BYTE_ORIGINAL)
#endif
CalcMultAlpha_4x1 %v3,%v4,%v5,%v6,%v16,%v17,%v18,%v19,\ALPHA_VECREG,\ALPHA_VECI
CalcMultAlpha_4x1 %v24,%v25,%v26,%v27,%v20,%v21,%v22,%v23,\ALPHA_VECREG,\ALPHA_VECI
#if defined(TRMMKERNEL)
la \LC2,0(\LC1,\LDC_BYTE_ORIGINAL)
#endif
vledb %v3, %v3,0,0
vledb %v4, %v4,0,0
vledb %v5, %v5,0,0
vledb %v6, %v6,0,0
vledb %v24, %v24,0,0
vledb %v25, %v25,0,0
vledb %v26, %v26,0,0
vledb %v27, %v27,0,0
vstef %v3, 0(\CIJ_REG),0
vstef %v4, 4(\CIJ_REG),0
vstef %v24, 8(\CIJ_REG),0
vstef %v25, 12(\CIJ_REG),0
vstef %v3, 0(\CIJ_REG,\LDC_BYTE_ORIGINAL),2
vstef %v4, 4(\CIJ_REG,\LDC_BYTE_ORIGINAL),2
vstef %v24, 8(\CIJ_REG,\LDC_BYTE_ORIGINAL),2
vstef %v25, 12(\CIJ_REG,\LDC_BYTE_ORIGINAL),2
vstef %v5, 0(\CIJ_REG,\LC1),0
vstef %v6, 4(\CIJ_REG,\LC1),0
vstef %v26, 8(\CIJ_REG,\LC1),0
vstef %v27, 12(\CIJ_REG,\LC1),0
vstef %v5, 0(\CIJ_REG,\LC2),2
vstef %v6, 4(\CIJ_REG,\LC2),2
vstef %v26, 8(\CIJ_REG,\LC2),2
vstef %v27, 12(\CIJ_REG,\LC2),2
la \CIJ_REG,16(\CIJ_REG)
.endm
.macro ZSTORE_2x2 ALPHA_VECREG,ALPHA_VECI,CIJ_REG,LDC_BYTE_ORIGINAL
#if !defined(TRMMKERNEL)
vlef %v3, 0(\CIJ_REG),0
vlef %v4, 4(\CIJ_REG),0
vlef %v3, 8(\CIJ_REG),2
vlef %v4, 12(\CIJ_REG),2
vlef %v5, 0(\CIJ_REG,\LDC_BYTE_ORIGINAL),0
vlef %v6, 4(\CIJ_REG,\LDC_BYTE_ORIGINAL),0
vlef %v5, 8(\CIJ_REG,\LDC_BYTE_ORIGINAL),2
vlef %v6, 12(\CIJ_REG,\LDC_BYTE_ORIGINAL),2
vldeb %v3,%v3
vldeb %v4,%v4
vldeb %v5,%v5
vldeb %v6,%v6
#endif
CalcMultAlpha_2x1 %v3,%v4,%v16,%v17,\ALPHA_VECREG,\ALPHA_VECI
CalcMultAlpha_2x1 %v5,%v6,%v20,%v21,\ALPHA_VECREG,\ALPHA_VECI
vledb %v3, %v3,0,0
vledb %v4, %v4,0,0
vledb %v5, %v5,0,0
vledb %v6, %v6,0,0
vstef %v3, 0(\CIJ_REG),0
vstef %v4, 4(\CIJ_REG),0
vstef %v3, 8(\CIJ_REG),2
vstef %v4, 12(\CIJ_REG),2
vstef %v5, 0(\CIJ_REG,\LDC_BYTE_ORIGINAL),0
vstef %v6, 4(\CIJ_REG,\LDC_BYTE_ORIGINAL),0
vstef %v5, 8(\CIJ_REG,\LDC_BYTE_ORIGINAL),2
vstef %v6, 12(\CIJ_REG,\LDC_BYTE_ORIGINAL),2
la \CIJ_REG,16(\CIJ_REG)
.endm
.macro ZSTORE_2x1 ALPHA_VECREG,ALPHA_VECI,CIJ_REG,LDC_BYTE_ORIGINAL
#if !defined(TRMMKERNEL)
vlef %v3, 0(\CIJ_REG),0
vlef %v4, 4(\CIJ_REG),0
vlef %v3, 8(\CIJ_REG),2
vlef %v4, 12(\CIJ_REG),2
vldeb %v3,%v3
vldeb %v4,%v4
#endif
CalcMultAlpha_2x1 %v3,%v4,%v16,%v17,\ALPHA_VECREG,\ALPHA_VECI
vledb %v3, %v3,0,0
vledb %v4, %v4,0,0
vstef %v3, 0(\CIJ_REG),0
vstef %v4, 4(\CIJ_REG),0
vstef %v3, 8(\CIJ_REG),2
vstef %v4, 12(\CIJ_REG),2
la \CIJ_REG,16(\CIJ_REG)
.endm
.macro ZSTORE_1x2 ALPHA_VECREG,ALPHA_VECI,CIJ_REG,LDC_BYTE_ORIGINAL
#if !defined(TRMMKERNEL)
vlef %v3, 0(\CIJ_REG),0
vlef %v4, 4(\CIJ_REG),0
vlef %v3, 0(\CIJ_REG,\LDC_BYTE_ORIGINAL),2
vlef %v4, 4(\CIJ_REG,\LDC_BYTE_ORIGINAL),2
vldeb %v3,%v3
vldeb %v4,%v4
#endif
CalcMultAlpha_2x1 %v3,%v4,%v16,%v17,\ALPHA_VECREG,\ALPHA_VECI
vledb %v3, %v3,0,0
vledb %v4, %v4,0,0
vstef %v3, 0(\CIJ_REG),0
vstef %v4, 4(\CIJ_REG),0
vstef %v3, 0(\CIJ_REG,\LDC_BYTE_ORIGINAL),2
vstef %v4, 4(\CIJ_REG,\LDC_BYTE_ORIGINAL),2
la \CIJ_REG,8(\CIJ_REG)
.endm
.macro ZSTORE_1x1 ALPHA_RR,ALPHA_RI,CIJ_REG
#if defined (TRMMKERNEL)
lzer %f1
lzer %f3
#else
le %f1, 0(\CIJ_REG)
le %f3, 4(\CIJ_REG)
#endif
ledbr %f4,\ALPHA_RR
ledbr %f5,\ALPHA_RI
CalcMultAlpha_1x1 %f1,%f3,%f6,%f7,%f4,%f5
ste %f1,0(\CIJ_REG)
ste %f3,4(\CIJ_REG)
la \CIJ_REG,8(\CIJ_REG)
.endm
/****************************TRMM POINTER REFRESH MACROS*************************/
.macro RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/* ptrbb = bb;*/
lgr \PTR_B,\B_VAL /*refresh BPOINT*/
#else
/* ptrba = ptrba + off*C_A;
ptrbb = bb + off*C_B;*/
.if \C_B==4
.if \C_A==4
sllg \PTR_B, \OFF_VAL,5
agr \PTR_A,\PTR_B /*ptrba+off*4*/
la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/
.elseif \C_A==2
sllg \PTR_B, \OFF_VAL,4
la \PTR_A,0(\PTR_A,\PTR_B) /*ptrba+off*2*/
agr \PTR_B, \PTR_B
la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/
.elseif \C_A==1
sllg \PTR_B, \OFF_VAL,3
agr \PTR_A,\PTR_B /*ptrba+off*1*/
sllg \PTR_B, \OFF_VAL,5
la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/
.endif
.elseif \C_B==2
.if \C_A==4
sllg \PTR_B, \OFF_VAL,4
la \PTR_A,0(\PTR_A,\PTR_B) /*ptrba+off*2*/
agr \PTR_A,\PTR_B /*ptrba+off*4 in total*/
la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/
.elseif \C_A==2
sllg \PTR_B, \OFF_VAL,4
agr \PTR_A,\PTR_B /*ptrba+off*2*/
la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/
.elseif \C_A==1
sllg \PTR_B, \OFF_VAL,3
la \PTR_A,0(\PTR_A,\PTR_B) /*ptrba+off*1*/
agr \PTR_B,\PTR_B /*off*2*/
la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/
.endif
.elseif \C_B==1
.if \C_A==4
sllg \PTR_B, \OFF_VAL,5
agr \PTR_A,\PTR_B /*ptrba+off*4*/
sllg \PTR_B, \OFF_VAL,3
la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/
.elseif \C_A==2
sllg \PTR_B, \OFF_VAL,3
la \PTR_A,0(\PTR_A,\PTR_B) /*ptrba+off*1*/
agr \PTR_A,\PTR_B /*ptrba+off*2 in total*/
la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/
.elseif \C_A==1
sllg \PTR_B, \OFF_VAL,3
agr \PTR_A,\PTR_B /*ptrba+off*1*/
la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/
.endif
.endif
#endif
.endm
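/* Equivalent C logic for the non-"bb" path above (a sketch; unit_size=8 bytes per complex,  */
/* the shift amounts 3/4/5 encode C_A or C_B = 1/2/4 elements):                              */
/*   ptrba += off * C_A * unit_size;                                                         */
/*   ptrbb  = bb + off * C_B * unit_size;                                                    */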
/**/
.macro RefreshTempBk TEMP_VAL,BK_VAL,OFF_VAL,INCR_A,INCR_B
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
/* temp = bk-off;*/
sgrk \TEMP_VAL,\BK_VAL,\OFF_VAL
#elif defined(LEFT)
/* temp = off+INCR_A; // number of values in A */
la \TEMP_VAL,\INCR_A(\OFF_VAL)
#else
/* temp = off+INCR_B; // number of values in B */
la \TEMP_VAL,\INCR_B(\OFF_VAL)
#endif
.endm
.macro RefreshPointersAndOFF TEMP_VAL,BK_VAL,OFF_VAL,PTR_A,C_A,C_B
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/*temp = bk - off;*/
sgrk \TEMP_VAL,\BK_VAL,\OFF_VAL
#ifdef LEFT
/*temp -= C_A; // number of values in A*/
lay \TEMP_VAL,-\C_A(\TEMP_VAL)
#else
/*temp -= C_B; // number of values in B*/
lay \TEMP_VAL,-\C_B(\TEMP_VAL)
#endif
/*ptrba += temp*C_A;
ptrbb += temp*C_B;*/
.if \C_A==4
sllg \TEMP_VAL, \TEMP_VAL,5 /*temp*4*/
.elseif \C_A==2
sllg \TEMP_VAL, \TEMP_VAL,4 /*temp*2*/
.elseif \C_A==1
sllg \TEMP_VAL, \TEMP_VAL,3 /*temp*1*/
.endif
la \PTR_A,0(\PTR_A,\TEMP_VAL) /*ptrba+temp*C_A*/
#endif
#ifdef LEFT
/*off += C_A; // number of values in A*/
aghi \OFF_VAL,\C_A
#endif
.endm