/* zkernelMacrosV.S */
/****************************************Implementation**Details**********************************************/
/* */
/* Let's denote by (a,a1i) the complex number which is mathematically a+a1*i */
/* Complex number multiplication: (a,a1i)*(b,b1i) */
/* As i*i=-1, the multiplication result will be: */
/* (a+a1*i)(b+b1*i)=a*b+a1*i*b1*i+a1*i*b+a*b1*i=a*b-a1*b1+(a1*b+a*b1)*i which is (ab-a1b1,a1b+ab1) */
/* so let c=ab-a1b1, ci=a1b+ab1; then */
/* c=c+a*b-a1*b1 => c=a*b-(a1*b1-c) => first c=a1*b1-c, then c=a*b-c (two msdb) */
/* ci=ci+a1*b+a*b1 => ci=a1*b+ci, then ci=a*b1+ci (two madb) */
/* For SIMD, the real and imaginary parts are grouped together, */
/* e.g. (realA,realK) and (imageA,imageK) */
/* Simd(0,1)=(a*b,k*b)-((ai*bi,ki*bi)-Simd(0,1)) */
/* SimdI(0,1)=SimdI(0,1)+(a*bi,k*bi)+(ai*b,ki*b) */
/* */
/* */
/* For defined(NR) || defined(NC) || defined(TR) || defined(TC): */
/* (a+a1*I)(b-b1*I)=ab+a1*b1+I*(a1b-ab1) */
/* */
/* c=c+ab+a1b1 => c=a1b1+c; c=ab+c */
/* ci=ci+a1b-ab1 => ci=a1*b-(ab1-ci) => ci=ab1-ci; ci=a1*b-ci */
/* */
/* */
/* For defined(RN) || defined(RT) || defined(CN) || defined(CT): */
/* (a-a1*I)(b+b1*I)=ab+a1*b1+I*(-a1b+ab1) */
/* */
/* c=c+ab+a1b1 => c=a1b1+c; c=ab+c */
/* ci=ci-a1b+ab1 => ci=a*b1-(a1b-ci) => ci=a1b-ci; ci=a*b1-ci */
/* */
/* */
/* For defined(RR) || defined(RC) || defined(CR) || defined(CC): */
/* (a-a1*I)(b-b1*I)=ab-a1*b1+I*(-a1b-ab1) */
/* */
/* c=a1*b1-c, then c=a*b-c */
/* ci=ci-a1*b-a*b1 */
/* As IBM z13 only has fused x*y-m and x*y+m instructions, the implementation is changed a bit: */
/* assuming ci=0, computing cix=cix+a1b+ab1 and then ci=ci-cix will work: */
/* cix=a*b1+cix; cix=a1*b+cix (two madb); ci=ci-cix (a plain sign change when ci=0) */
/* As c=0 as well: */
/* c=a*b-c, then c=a1*b1-c => c=a1*b1-(a*b-c), which is -1*(a*b-(a1*b1-c)) */
/* */
/* The accumulated values will thus equal (-c) and (-ci). */
/* To change the sign, the result is multiplied by -1*(alpha+alpha_i). */
/* This is done once, by complementing alpha up front: */
/* lcdbr ALPHA_I,ALPHA_I */
/* lcdbr ALPHA ,ALPHA */
/*************************************************************************************************************/
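/*
 A minimal C reference sketch (illustrative, not part of the kernel) of the
 decomposition above for the NN case, assuming fma() from <math.h> models
 the z13 fused multiply-add/subtract (madb/msdb, x*y+m and x*y-m):

 #include <math.h>
 // c += a*b - a1*b1 ; ci += a1*b + a*b1, using only x*y-m and x*y+m:
 static void zfma_nn(double a, double a1, double b, double b1,
                     double *c, double *ci) {
     *c  = fma(a1, b1, -*c);  // c  = a1*b1 - c              (msdb)
     *c  = fma(a,  b,  -*c);  // c  = a*b - c = c0+ab-a1b1   (msdb)
     *ci = fma(a,  b1, *ci);  // ci = a*b1 + ci              (madb)
     *ci = fma(a1, b,  *ci);  // ci = a1*b + ci              (madb)
 }
 // For the RR/RC/CR/CC cases the same trick leaves (-c,-ci) in the
 // accumulators, which is compensated by complementing alpha (lcdbr).
*/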
/*************************Zero vectors***************************************/
/*zero vectors for 4x4 */
.macro ZERO_ZCVEC_4x4
vzero %v16
vzero %v17
vzero %v18
vzero %v19
vzero %v20
vzero %v21
vzero %v22
vzero %v23
vzero %v24
vzero %v25
vzero %v26
vzero %v27
vzero %v28
vzero %v29
vzero %v30
vzero %v31
.endm
/*zero vectors for 2x4 */
.macro ZERO_ZCVEC_2x4
vzero %v16
vzero %v17
vzero %v18
vzero %v19
vzero %v20
vzero %v21
vzero %v22
vzero %v23
.endm
/*zero vectors for 1x4 */
.macro ZERO_ZCVEC_1x4
vzero %v16
vzero %v17
vzero %v18
vzero %v19
.endm
/*zero vectors for 4x2 and 4x1 */
.macro ZERO_ZCVEC_4x2
ZERO_ZCVEC_2x4
.endm
.macro ZERO_ZCVEC_4x1
ZERO_ZCVEC_1x4
.endm
/*zero vectors for 2x2 */
.macro ZERO_ZCVEC_2x2
vzero %v16
vzero %v17
vzero %v20
vzero %v21
.endm
/*zero vectors for 1x2 */
.macro ZERO_ZCVEC_1x2
vzero %v16
vzero %v17
.endm
/*zero vectors for 2x1 */
.macro ZERO_ZCVEC_2x1
vzero %v16
vzero %v17
.endm
/*zero vectors for 1x1*/
.macro ZERO_ZCVEC_1x1
lzdr %f6
lzdr %f7
.endm
/*
Calculate for 4x2 inner
*/
.macro CalcComplex_4x2 vResR1, vResI1, vResR2, vResI2, vResR3, vResI3, vResR4, vResI4, vr1, vi1, vr2, vi2, vrB, viB, vrB2, viB2
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
vfmsdb \vResR1, \vi1, \viB, \vResR1
vfmadb \vResI1, \vr1, \viB, \vResI1
vfmsdb \vResR2, \vi2, \viB, \vResR2
vfmadb \vResI2, \vr2, \viB, \vResI2
vfmsdb \vResR3, \vi1, \viB2, \vResR3
vfmadb \vResI3, \vr1, \viB2, \vResI3
vfmsdb \vResR4, \vi2, \viB2, \vResR4
vfmadb \vResI4, \vr2, \viB2, \vResI4
vfmsdb \vResR1, \vr1, \vrB, \vResR1
vfmadb \vResI1, \vi1, \vrB, \vResI1
vfmsdb \vResR2, \vr2, \vrB, \vResR2
vfmadb \vResI2, \vi2, \vrB, \vResI2
vfmsdb \vResR3, \vr1, \vrB2, \vResR3
vfmadb \vResI3, \vi1, \vrB2, \vResI3
vfmsdb \vResR4, \vr2, \vrB2, \vResR4
vfmadb \vResI4, \vi2, \vrB2, \vResI4
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
vfmadb \vResR1, \vi1, \viB, \vResR1
vfmsdb \vResI1, \vr1, \viB, \vResI1
vfmadb \vResR2, \vi2, \viB, \vResR2
vfmsdb \vResI2, \vr2, \viB, \vResI2
vfmadb \vResR3, \vi1, \viB2, \vResR3
vfmsdb \vResI3, \vr1, \viB2, \vResI3
vfmadb \vResR4, \vi2, \viB2, \vResR4
vfmsdb \vResI4, \vr2, \viB2, \vResI4
vfmadb \vResR1, \vr1, \vrB, \vResR1
vfmsdb \vResI1, \vi1, \vrB, \vResI1
vfmadb \vResR2, \vr2, \vrB, \vResR2
vfmsdb \vResI2, \vi2, \vrB, \vResI2
vfmadb \vResR3, \vr1, \vrB2, \vResR3
vfmsdb \vResI3, \vi1, \vrB2, \vResI3
vfmadb \vResR4, \vr2, \vrB2, \vResR4
vfmsdb \vResI4, \vi2, \vrB2, \vResI4
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
vfmadb \vResR1, \vi1, \viB, \vResR1
vfmsdb \vResI1, \vi1, \vrB, \vResI1
vfmadb \vResR2, \vi2, \viB, \vResR2
vfmsdb \vResI2, \vi2, \vrB, \vResI2
vfmadb \vResR3, \vi1, \viB2, \vResR3
vfmsdb \vResI3, \vi1, \vrB2, \vResI3
vfmadb \vResR4, \vi2, \viB2, \vResR4
vfmsdb \vResI4, \vi2, \vrB2, \vResI4
vfmadb \vResR1, \vr1, \vrB, \vResR1
vfmsdb \vResI1, \vr1, \viB, \vResI1
vfmadb \vResR2, \vr2, \vrB, \vResR2
vfmsdb \vResI2, \vr2, \viB, \vResI2
vfmadb \vResR3, \vr1, \vrB2, \vResR3
vfmsdb \vResI3, \vr1, \viB2, \vResI3
vfmadb \vResR4, \vr2, \vrB2, \vResR4
vfmsdb \vResI4, \vr2, \viB2, \vResI4
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
vfmsdb \vResR1, \vr1, \vrB, \vResR1
vfmadb \vResI1, \vi1, \vrB, \vResI1
vfmsdb \vResR2, \vr2, \vrB, \vResR2
vfmadb \vResI2, \vi2, \vrB, \vResI2
vfmsdb \vResR3, \vr1, \vrB2, \vResR3
vfmadb \vResI3, \vi1, \vrB2, \vResI3
vfmsdb \vResR4, \vr2, \vrB2, \vResR4
vfmadb \vResI4, \vi2, \vrB2, \vResI4
vfmsdb \vResR1, \vi1, \viB, \vResR1
vfmadb \vResI1, \vr1, \viB, \vResI1
vfmsdb \vResR2, \vi2, \viB, \vResR2
vfmadb \vResI2, \vr2, \viB, \vResI2
vfmsdb \vResR3, \vi1, \viB2, \vResR3
vfmadb \vResI3, \vr1, \viB2, \vResI3
vfmsdb \vResR4, \vi2, \viB2, \vResR4
vfmadb \vResI4, \vr2, \viB2, \vResI4
#endif
.endm
/*
Calculate for 2x4 inner
*/
.macro CalcComplex_2x4 vResR1, vResI1, vResR2, vResI2, vResR3, vResI3, vResR4, vResI4, vr1, vi1, vr2, vi2, vrB, viB, vrB2, viB2
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
vfmsdb \vResR1, \vi1, \viB, \vResR1
vfmadb \vResI1, \vr1, \viB, \vResI1
vfmsdb \vResR2, \vi2, \viB, \vResR2
vfmadb \vResI2, \vr2, \viB, \vResI2
vfmsdb \vResR3, \vi1, \viB2, \vResR3
vfmadb \vResI3, \vr1, \viB2, \vResI3
vfmsdb \vResR4, \vi2, \viB2, \vResR4
vfmadb \vResI4, \vr2, \viB2, \vResI4
vfmsdb \vResR1, \vr1, \vrB, \vResR1
vfmadb \vResI1, \vi1, \vrB, \vResI1
vfmsdb \vResR2, \vr2, \vrB, \vResR2
vfmadb \vResI2, \vi2, \vrB, \vResI2
vfmsdb \vResR3, \vr1, \vrB2, \vResR3
vfmadb \vResI3, \vi1, \vrB2, \vResI3
vfmsdb \vResR4, \vr2, \vrB2, \vResR4
vfmadb \vResI4, \vi2, \vrB2, \vResI4
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
vfmadb \vResR1, \vi1, \viB, \vResR1
vfmsdb \vResI1, \vr1, \viB, \vResI1
vfmadb \vResR2, \vi2, \viB, \vResR2
vfmsdb \vResI2, \vr2, \viB, \vResI2
vfmadb \vResR3, \vi1, \viB2, \vResR3
vfmsdb \vResI3, \vr1, \viB2, \vResI3
vfmadb \vResR4, \vi2, \viB2, \vResR4
vfmsdb \vResI4, \vr2, \viB2, \vResI4
vfmadb \vResR1, \vr1, \vrB, \vResR1
vfmsdb \vResI1, \vi1, \vrB, \vResI1
vfmadb \vResR2, \vr2, \vrB, \vResR2
vfmsdb \vResI2, \vi2, \vrB, \vResI2
vfmadb \vResR3, \vr1, \vrB2, \vResR3
vfmsdb \vResI3, \vi1, \vrB2, \vResI3
vfmadb \vResR4, \vr2, \vrB2, \vResR4
vfmsdb \vResI4, \vi2, \vrB2, \vResI4
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
vfmadb \vResR1, \vi1, \viB, \vResR1
vfmsdb \vResI1, \vi1, \vrB, \vResI1
vfmadb \vResR2, \vi2, \viB, \vResR2
vfmsdb \vResI2, \vi2, \vrB, \vResI2
vfmadb \vResR3, \vi1, \viB2, \vResR3
vfmsdb \vResI3, \vi1, \vrB2, \vResI3
vfmadb \vResR4, \vi2, \viB2, \vResR4
vfmsdb \vResI4, \vi2, \vrB2, \vResI4
vfmadb \vResR1, \vr1, \vrB, \vResR1
vfmsdb \vResI1, \vr1, \viB, \vResI1
vfmadb \vResR2, \vr2, \vrB, \vResR2
vfmsdb \vResI2, \vr2, \viB, \vResI2
vfmadb \vResR3, \vr1, \vrB2, \vResR3
vfmsdb \vResI3, \vr1, \viB2, \vResI3
vfmadb \vResR4, \vr2, \vrB2, \vResR4
vfmsdb \vResI4, \vr2, \viB2, \vResI4
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
vfmsdb \vResR1, \vr1, \vrB, \vResR1
vfmadb \vResI1, \vi1, \vrB, \vResI1
vfmsdb \vResR2, \vr2, \vrB, \vResR2
vfmadb \vResI2, \vi2, \vrB, \vResI2
vfmsdb \vResR3, \vr1, \vrB2, \vResR3
vfmadb \vResI3, \vi1, \vrB2, \vResI3
vfmsdb \vResR4, \vr2, \vrB2, \vResR4
vfmadb \vResI4, \vi2, \vrB2, \vResI4
vfmsdb \vResR1, \vi1, \viB, \vResR1
vfmadb \vResI1, \vr1, \viB, \vResI1
vfmsdb \vResR2, \vi2, \viB, \vResR2
vfmadb \vResI2, \vr2, \viB, \vResI2
vfmsdb \vResR3, \vi1, \viB2, \vResR3
vfmadb \vResI3, \vr1, \viB2, \vResI3
vfmsdb \vResR4, \vi2, \viB2, \vResR4
vfmadb \vResI4, \vr2, \viB2, \vResI4
#endif
.endm
/*
Calculate for 2x2 inner
*/
.macro CalcComplex_2x2 vResR1, vResI1, vResR2, vResI2, vR1, vI1, vRB, vIB, vRB2, vIB2
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
vfmsdb \vResR1, \vI1, \vIB, \vResR1
vfmadb \vResI1, \vR1, \vIB, \vResI1
vfmsdb \vResR2, \vI1, \vIB2, \vResR2
vfmadb \vResI2, \vR1, \vIB2, \vResI2
vfmsdb \vResR1, \vR1, \vRB, \vResR1
vfmadb \vResI1, \vI1, \vRB, \vResI1
vfmsdb \vResR2, \vR1, \vRB2, \vResR2
vfmadb \vResI2, \vI1, \vRB2, \vResI2
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
vfmadb \vResR1, \vI1, \vIB, \vResR1
vfmsdb \vResI1, \vR1, \vIB, \vResI1
vfmadb \vResR2, \vI1, \vIB2, \vResR2
vfmsdb \vResI2, \vR1, \vIB2, \vResI2
vfmadb \vResR1, \vR1, \vRB, \vResR1
vfmsdb \vResI1, \vI1, \vRB, \vResI1
vfmadb \vResR2, \vR1, \vRB2, \vResR2
vfmsdb \vResI2, \vI1, \vRB2, \vResI2
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
vfmadb \vResR1, \vI1, \vIB, \vResR1
vfmsdb \vResI1, \vI1, \vRB, \vResI1
vfmadb \vResR2, \vI1, \vIB2, \vResR2
vfmsdb \vResI2, \vI1, \vRB2, \vResI2
vfmadb \vResR1, \vR1, \vRB, \vResR1
vfmsdb \vResI1, \vR1, \vIB, \vResI1
vfmadb \vResR2, \vR1, \vRB2, \vResR2
vfmsdb \vResI2, \vR1, \vIB2, \vResI2
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
vfmsdb \vResR1, \vR1, \vRB, \vResR1
vfmadb \vResI1, \vI1, \vRB, \vResI1
vfmsdb \vResR2, \vR1, \vRB2, \vResR2
vfmadb \vResI2, \vI1, \vRB2, \vResI2
vfmsdb \vResR1, \vI1, \vIB, \vResR1
vfmadb \vResI1, \vR1, \vIB, \vResI1
vfmsdb \vResR2, \vI1, \vIB2, \vResR2
vfmadb \vResI2, \vR1, \vIB2, \vResI2
#endif
.endm
/*
Calculate for 2x1 inner
*/
.macro CalcComplex_2x1 vRealResult1, vImageResult1, vReal1, vImage1, vecRealB, vecImageB
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
vfmsdb \vRealResult1, \vImage1, \vecImageB, \vRealResult1
vfmadb \vImageResult1, \vReal1, \vecImageB, \vImageResult1
vfmsdb \vRealResult1, \vReal1, \vecRealB, \vRealResult1
vfmadb \vImageResult1, \vImage1, \vecRealB, \vImageResult1
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
vfmadb \vRealResult1, \vImage1, \vecImageB, \vRealResult1
vfmsdb \vImageResult1, \vReal1, \vecImageB, \vImageResult1
vfmadb \vRealResult1, \vReal1, \vecRealB, \vRealResult1
vfmsdb \vImageResult1, \vImage1, \vecRealB, \vImageResult1
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
vfmadb \vRealResult1, \vImage1, \vecImageB, \vRealResult1
vfmsdb \vImageResult1, \vImage1, \vecRealB, \vImageResult1
vfmadb \vRealResult1, \vReal1, \vecRealB, \vRealResult1
vfmsdb \vImageResult1, \vReal1, \vecImageB, \vImageResult1
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
vfmsdb \vRealResult1, \vReal1, \vecRealB, \vRealResult1
vfmadb \vImageResult1, \vImage1, \vecRealB, \vImageResult1
vfmsdb \vRealResult1, \vImage1, \vecImageB, \vRealResult1
vfmadb \vImageResult1, \vReal1, \vecImageB, \vImageResult1
#endif
.endm
/*
Calculate for 1x2 inner
*/
.macro CalcComplex_1x2 vRealResult1, vImageResult1, vReal1, vImage1, vecRealB, vecImageB
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
vfmsdb \vRealResult1, \vImage1, \vecImageB, \vRealResult1
vfmadb \vImageResult1, \vReal1, \vecImageB, \vImageResult1
vfmsdb \vRealResult1, \vReal1, \vecRealB, \vRealResult1
vfmadb \vImageResult1, \vImage1, \vecRealB, \vImageResult1
#endif
#if defined(RN) || defined(CN) || defined(RT) || defined(CT)
vfmadb \vRealResult1, \vImage1, \vecImageB, \vRealResult1
vfmsdb \vImageResult1, \vReal1, \vecImageB, \vImageResult1
vfmadb \vRealResult1, \vReal1, \vecRealB, \vRealResult1
vfmsdb \vImageResult1, \vImage1, \vecRealB, \vImageResult1
#endif
#if defined(NR) || defined(TR) || defined(NC) || defined(TC)
vfmadb \vRealResult1, \vImage1, \vecImageB, \vRealResult1
vfmsdb \vImageResult1, \vImage1, \vecRealB, \vImageResult1
vfmadb \vRealResult1, \vReal1, \vecRealB, \vRealResult1
vfmsdb \vImageResult1, \vReal1, \vecImageB, \vImageResult1
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
vfmsdb \vRealResult1, \vReal1, \vecRealB, \vRealResult1
vfmadb \vImageResult1, \vImage1, \vecRealB, \vImageResult1
vfmsdb \vRealResult1, \vImage1, \vecImageB, \vRealResult1
vfmadb \vImageResult1, \vReal1, \vecImageB, \vImageResult1
#endif
.endm
/*
Calculate for 4x1 inner
*/
.macro CalcComplex_4x1 vRealResult1, vImageResult1, vRealResult2, vImageResult2, vReal1, vImage1, vReal2, vImage2, vecRealB, vecImageB
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
vfmsdb \vRealResult1, \vImage1, \vecImageB, \vRealResult1
vfmadb \vImageResult1, \vReal1, \vecImageB, \vImageResult1
vfmsdb \vRealResult2, \vImage2, \vecImageB, \vRealResult2
vfmadb \vImageResult2, \vReal2, \vecImageB, \vImageResult2
vfmsdb \vRealResult1, \vReal1, \vecRealB, \vRealResult1
vfmadb \vImageResult1, \vImage1, \vecRealB, \vImageResult1
vfmsdb \vRealResult2, \vReal2, \vecRealB, \vRealResult2
vfmadb \vImageResult2, \vImage2, \vecRealB, \vImageResult2
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
vfmadb \vRealResult1, \vImage1, \vecImageB, \vRealResult1
vfmsdb \vImageResult1, \vReal1, \vecImageB, \vImageResult1
vfmadb \vRealResult2, \vImage2, \vecImageB, \vRealResult2
vfmsdb \vImageResult2, \vReal2, \vecImageB, \vImageResult2
vfmadb \vRealResult1, \vReal1, \vecRealB, \vRealResult1
vfmsdb \vImageResult1, \vImage1, \vecRealB, \vImageResult1
vfmadb \vRealResult2, \vReal2, \vecRealB, \vRealResult2
vfmsdb \vImageResult2, \vImage2, \vecRealB, \vImageResult2
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
vfmadb \vRealResult1, \vImage1, \vecImageB, \vRealResult1
vfmsdb \vImageResult1, \vImage1, \vecRealB, \vImageResult1
vfmadb \vRealResult2, \vImage2, \vecImageB, \vRealResult2
vfmsdb \vImageResult2, \vImage2, \vecRealB, \vImageResult2
vfmadb \vRealResult1, \vReal1, \vecRealB, \vRealResult1
vfmsdb \vImageResult1, \vReal1, \vecImageB, \vImageResult1
vfmadb \vRealResult2, \vReal2, \vecRealB, \vRealResult2
vfmsdb \vImageResult2, \vReal2, \vecImageB, \vImageResult2
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
vfmsdb \vRealResult1, \vReal1, \vecRealB, \vRealResult1
vfmadb \vImageResult1, \vImage1, \vecRealB, \vImageResult1
vfmsdb \vRealResult2, \vReal2, \vecRealB, \vRealResult2
vfmadb \vImageResult2, \vImage2, \vecRealB, \vImageResult2
vfmsdb \vRealResult1, \vImage1, \vecImageB, \vRealResult1
vfmadb \vImageResult1, \vReal1, \vecImageB, \vImageResult1
vfmsdb \vRealResult2, \vImage2, \vecImageB, \vRealResult2
vfmadb \vImageResult2, \vReal2, \vecImageB, \vImageResult2
#endif
.endm
/*
Calculate for 1x4 inner
*/
.macro CalcComplex_1x4 vRealResult1, vImageResult1, vRealResult2, vImageResult2, vReal1, vImage1, vReal2, vImage2, vecRealB, vecImageB
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
vfmsdb \vRealResult1, \vImage1, \vecImageB, \vRealResult1
vfmadb \vImageResult1, \vReal1, \vecImageB, \vImageResult1
vfmsdb \vRealResult2, \vImage2, \vecImageB, \vRealResult2
vfmadb \vImageResult2, \vReal2, \vecImageB, \vImageResult2
vfmsdb \vRealResult1, \vReal1, \vecRealB, \vRealResult1
vfmadb \vImageResult1, \vImage1, \vecRealB, \vImageResult1
vfmsdb \vRealResult2, \vReal2, \vecRealB, \vRealResult2
vfmadb \vImageResult2, \vImage2, \vecRealB, \vImageResult2
#endif
#if defined(RN) || defined(CN) || defined(RT) || defined(CT)
vfmadb \vRealResult1, \vImage1, \vecImageB, \vRealResult1
vfmsdb \vImageResult1, \vReal1, \vecImageB, \vImageResult1
vfmadb \vRealResult2, \vImage2, \vecImageB, \vRealResult2
vfmsdb \vImageResult2, \vReal2, \vecImageB, \vImageResult2
vfmadb \vRealResult1, \vReal1, \vecRealB, \vRealResult1
vfmsdb \vImageResult1, \vImage1, \vecRealB, \vImageResult1
vfmadb \vRealResult2, \vReal2, \vecRealB, \vRealResult2
vfmsdb \vImageResult2, \vImage2, \vecRealB, \vImageResult2
#endif
#if defined(NR) || defined(TR) || defined(NC) || defined(TC)
vfmadb \vRealResult1, \vImage1, \vecImageB, \vRealResult1
vfmsdb \vImageResult1, \vImage1, \vecRealB, \vImageResult1
vfmadb \vRealResult2, \vImage2, \vecImageB, \vRealResult2
vfmsdb \vImageResult2, \vImage2, \vecRealB, \vImageResult2
vfmadb \vRealResult1, \vReal1, \vecRealB, \vRealResult1
vfmsdb \vImageResult1, \vReal1, \vecImageB, \vImageResult1
vfmadb \vRealResult2, \vReal2, \vecRealB, \vRealResult2
vfmsdb \vImageResult2, \vReal2, \vecImageB, \vImageResult2
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
vfmsdb \vRealResult1, \vReal1, \vecRealB, \vRealResult1
vfmadb \vImageResult1, \vImage1, \vecRealB, \vImageResult1
vfmsdb \vRealResult2, \vReal2, \vecRealB, \vRealResult2
vfmadb \vImageResult2, \vImage2, \vecRealB, \vImageResult2
vfmsdb \vRealResult1, \vImage1, \vecImageB, \vRealResult1
vfmadb \vImageResult1, \vReal1, \vecImageB, \vImageResult1
vfmsdb \vRealResult2, \vImage2, \vecImageB, \vRealResult2
vfmadb \vImageResult2, \vReal2, \vecImageB, \vImageResult2
#endif
.endm
.macro CalcComplex_1x1 RealResult1, ImageResult1, Real1, Image1, RealB, ImageB
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
msdbr \RealResult1, \Image1, \ImageB
madbr \ImageResult1, \Real1, \ImageB
msdbr \RealResult1, \Real1, \RealB
madbr \ImageResult1, \Image1, \RealB
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
madbr \RealResult1, \Image1, \ImageB
msdbr \ImageResult1, \Real1, \ImageB
madbr \RealResult1, \Real1, \RealB
msdbr \ImageResult1, \Image1, \RealB
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
madbr \RealResult1, \Image1, \ImageB
msdbr \ImageResult1, \Image1, \RealB
madbr \RealResult1, \Real1, \RealB
msdbr \ImageResult1, \Real1, \ImageB
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
msdbr \RealResult1, \Real1, \RealB
madbr \ImageResult1, \Image1, \RealB
msdbr \RealResult1, \Image1, \ImageB
madbr \ImageResult1, \Real1, \ImageB
#endif
.endm
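/*
 A minimal C reference sketch (illustrative, not part of the kernel) of
 CalcComplex_1x1 for the NR/NC/TR/TC case (B conjugated), with fma() from
 <math.h> standing in for madbr/msdbr:

 #include <math.h>
 // c += a*b + a1*b1 ; ci += a1*b - a*b1 (two madbr, then two msdbr):
 static void calc_complex_1x1_nr(double *c, double *ci,
                                 double a, double a1,
                                 double b, double b1) {
     *c  = fma(a1, b1, *c);   // c  = a1*b1 + c   (madbr)
     *c  = fma(a,  b,  *c);   // c  = a*b + c     (madbr)
     *ci = fma(a,  b1, -*ci); // ci = a*b1 - ci   (msdbr)
     *ci = fma(a1, b,  -*ci); // ci = a1*b - ci = ci0 + a1*b - a*b1
 }
*/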
#define DISP(ind,stride,disp) (ind*stride+disp)
#define DISP64(ind,disp) (ind*64+disp)
#define DISP32(ind,disp) (ind*32+disp)
#define DISP16(ind,disp) (ind*16+disp)
#define USE_VLM 1
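/*
 Illustrative example of the DISP macros (not part of the kernel): with
 \Index=2, "vl %v4, DISP64(2,16)(%r2)" expands to "vl %v4, (2*64+16)(%r2)",
 i.e. a load from byte offset 144. Each unrolled step thus advances by
 64 bytes = four double-complex elements.
*/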
.macro ZCALC_4x4_I PTR_A_REG,PTR_B_REG,Index,IsLast
#if defined(USE_VLM)
vlm %v4,%v7, DISP64(\Index ,0) (\PTR_A_REG)
#else
vl %v4 , DISP64(\Index ,0) (\PTR_A_REG)
vl %v5 , DISP64(\Index ,16)(\PTR_A_REG)
vl %v6 , DISP64(\Index ,32)(\PTR_A_REG)
vl %v7 , DISP64(\Index ,48)(\PTR_A_REG)
#endif
vlrepg %v9, DISP64(\Index ,0)(\PTR_B_REG)
vlrepg %v10 , DISP64(\Index ,8)(\PTR_B_REG)
vlrepg %v11, DISP64(\Index ,16)(\PTR_B_REG)
vlrepg %v12 , DISP64(\Index ,24)(\PTR_B_REG)
vpdi %v1,%v4,%v5,0
vpdi %v5,%v4,%v5,0b101
vpdi %v3,%v6,%v7,0
vpdi %v7,%v6,%v7,0b101
CalcComplex_4x2 %v16,%v17,%v18,%v19,%v20,%v21,%v22,%v23,%v1,%v5,%v3,%v7,%v9,%v10,%v11,%v12
vlrepg %v9, DISP64(\Index ,32)(\PTR_B_REG)
vlrepg %v10 , DISP64(\Index ,40)(\PTR_B_REG)
vlrepg %v11, DISP64(\Index ,48)(\PTR_B_REG)
vlrepg %v12 , DISP64(\Index ,56)(\PTR_B_REG)
.if \IsLast==1
la \PTR_A_REG, DISP64(\Index ,64)(\PTR_A_REG)
.endif
CalcComplex_4x2 %v24,%v25,%v26,%v27,%v28,%v29,%v30,%v31,%v1,%v5,%v3,%v7,%v9,%v10,%v11,%v12
.if \IsLast==1
la \PTR_B_REG, DISP64(\Index ,64)(\PTR_B_REG)
.endif
.endm
.macro ZCALC_4x2_I PTR_A_REG,PTR_B_REG,Index,IsLast
#if defined(USE_VLM)
vlm %v4,%v7, DISP64(\Index ,0) (\PTR_A_REG)
#else
vl %v4 , DISP64(\Index ,0) (\PTR_A_REG)
vl %v5 , DISP64(\Index ,16)(\PTR_A_REG)
vl %v6 , DISP64(\Index ,32)(\PTR_A_REG)
vl %v7 , DISP64(\Index ,48)(\PTR_A_REG)
#endif
vlrepg %v9, DISP32(\Index ,0)(\PTR_B_REG)
vlrepg %v10 , DISP32(\Index ,8)(\PTR_B_REG)
vlrepg %v11, DISP32(\Index ,16)(\PTR_B_REG)
vlrepg %v12 , DISP32(\Index ,24)(\PTR_B_REG)
vpdi %v1,%v4,%v5,0
vpdi %v5,%v4,%v5,0b101
vpdi %v3,%v6,%v7,0
vpdi %v7,%v6,%v7,0b101
.if \IsLast==1
la \PTR_A_REG, DISP64(\Index ,64)(\PTR_A_REG)
.endif
CalcComplex_4x2 %v16,%v17,%v18,%v19,%v20,%v21,%v22,%v23,%v1,%v5,%v3,%v7,%v9,%v10,%v11,%v12
.if \IsLast==1
la \PTR_B_REG, DISP32(\Index ,32)(\PTR_B_REG)
.endif
.endm
.macro ZCALC_2x4_I PTR_A_REG,PTR_B_REG,Index,IsLast
#if defined(USE_VLM)
vlm %v4,%v7, DISP64(\Index ,0) (\PTR_B_REG)
#else
vl %v4 , DISP64(\Index ,0) (\PTR_B_REG)
vl %v5 , DISP64(\Index ,16)(\PTR_B_REG)
vl %v6 , DISP64(\Index ,32)(\PTR_B_REG)
vl %v7 , DISP64(\Index ,48)(\PTR_B_REG)
#endif
vlrepg %v9, DISP32(\Index ,0)(\PTR_A_REG)
vlrepg %v10 , DISP32(\Index ,8)(\PTR_A_REG)
vlrepg %v11, DISP32(\Index ,16)(\PTR_A_REG)
vlrepg %v12 , DISP32(\Index ,24)(\PTR_A_REG)
vpdi %v1,%v4,%v5,0
vpdi %v5,%v4,%v5,0b101
vpdi %v3,%v6,%v7,0
vpdi %v7,%v6,%v7,0b101
.if \IsLast==1
la \PTR_B_REG, DISP64(\Index ,64)(\PTR_B_REG)
.endif
CalcComplex_2x4 %v16,%v17,%v18,%v19,%v20,%v21,%v22,%v23,%v1,%v5,%v3,%v7,%v9,%v10,%v11,%v12
.if \IsLast==1
la \PTR_A_REG, DISP32(\Index ,32)(\PTR_A_REG)
.endif
.endm
.macro ZCALC_4x1_I PTR_A_REG,PTR_B_REG,Index,IsLast
#if defined(USE_VLM)
vlm %v4,%v7, DISP64(\Index ,0) (\PTR_A_REG)
#else
vl %v4 , DISP64(\Index ,0) (\PTR_A_REG)
vl %v5 , DISP64(\Index ,16)(\PTR_A_REG)
vl %v6 , DISP64(\Index ,32)(\PTR_A_REG)
vl %v7 , DISP64(\Index ,48)(\PTR_A_REG)
#endif
vlrepg %v9, DISP16(\Index ,0)(\PTR_B_REG)
vlrepg %v10 , DISP16(\Index ,8)(\PTR_B_REG)
vpdi %v1,%v4,%v5,0
vpdi %v11,%v4,%v5,0b101
vpdi %v3,%v6,%v7,0
vpdi %v12,%v6,%v7,0b101
.if \IsLast==1
la \PTR_A_REG, DISP64(\Index ,64)(\PTR_A_REG)
.endif
CalcComplex_4x1 %v16,%v17,%v18,%v19,%v1,%v11,%v3,%v12,%v9,%v10
.if \IsLast==1
la \PTR_B_REG, DISP16(\Index ,16)(\PTR_B_REG)
.endif
.endm
.macro ZCALC_1x4_I PTR_A_REG,PTR_B_REG,Index,IsLast
#if defined(USE_VLM)
vlm %v4,%v7, DISP64(\Index ,0) (\PTR_B_REG)
#else
vl %v4 , DISP64(\Index ,0) (\PTR_B_REG)
vl %v5 , DISP64(\Index ,16)(\PTR_B_REG)
vl %v6 , DISP64(\Index ,32)(\PTR_B_REG)
vl %v7 , DISP64(\Index ,48)(\PTR_B_REG)
#endif
vlrepg %v9, DISP16(\Index ,0)(\PTR_A_REG)
vlrepg %v10 , DISP16(\Index ,8)(\PTR_A_REG)
vpdi %v1,%v4,%v5,0
vpdi %v11,%v4,%v5,0b101
vpdi %v3,%v6,%v7,0
vpdi %v12,%v6,%v7,0b101
.if \IsLast==1
la \PTR_B_REG, DISP64(\Index ,64)(\PTR_B_REG)
.endif
CalcComplex_1x4 %v16,%v17,%v18,%v19,%v1,%v11,%v3,%v12,%v9,%v10
.if \IsLast==1
la \PTR_A_REG, DISP16(\Index ,16)(\PTR_A_REG)
.endif
.endm
.macro ZCALC_2x2_I PTR_A_REG,PTR_B_REG,Index,IsLast
vl %v1 , DISP32(\Index ,0)(\PTR_A_REG)
vl %v3 , DISP32(\Index ,16)(\PTR_A_REG)
vlrepg %v9, DISP32(\Index ,0)(\PTR_B_REG)
vlrepg %v10 , DISP32(\Index ,8)(\PTR_B_REG)
vlrepg %v11, DISP32(\Index ,16)(\PTR_B_REG)
vlrepg %v12 , DISP32(\Index ,24)(\PTR_B_REG)
vpdi %v5,%v1,%v3,0
vpdi %v6,%v1,%v3,0b101
.if \IsLast==1
la \PTR_A_REG, DISP32(\Index ,32)(\PTR_A_REG)
.endif
CalcComplex_2x2 %v16,%v17,%v20,%v21,%v5,%v6,%v9,%v10,%v11,%v12
.if \IsLast==1
la \PTR_B_REG, DISP32(\Index ,32)(\PTR_B_REG)
.endif
.endm
.macro ZCALC_2x1_I PTR_A_REG,PTR_B_REG,Index,IsLast
vl %v1 , DISP32(\Index ,0)(\PTR_A_REG)
vl %v3 , DISP32(\Index ,16)(\PTR_A_REG)
vlrepg %v6, DISP16(\Index ,0)(\PTR_B_REG)
vlrepg %v7 , DISP16(\Index ,8)(\PTR_B_REG)
vpdi %v4,%v1,%v3,0
vpdi %v5,%v1,%v3,0b101
.if \IsLast==1
la \PTR_A_REG, DISP32(\Index ,32)(\PTR_A_REG)
.endif
CalcComplex_2x1 %v16,%v17,%v4,%v5,%v6,%v7
.if \IsLast==1
la \PTR_B_REG, DISP16(\Index ,16)(\PTR_B_REG)
.endif
.endm
.macro ZCALC_1x2_I PTR_A_REG,PTR_B_REG,Index,IsLast
vl %v1 , DISP32(\Index ,0)(\PTR_B_REG)
vl %v3 , DISP32(\Index ,16)(\PTR_B_REG)
vlrepg %v6, DISP16(\Index ,0)(\PTR_A_REG)
vlrepg %v7 , DISP16(\Index ,8)(\PTR_A_REG)
vpdi %v4,%v1,%v3,0
vpdi %v5,%v1,%v3,0b101
.if \IsLast==1
la \PTR_B_REG, DISP32(\Index ,32)(\PTR_B_REG)
.endif
CalcComplex_1x2 %v16,%v17,%v4,%v5,%v6,%v7
.if \IsLast==1
la \PTR_A_REG, DISP16(\Index ,16)(\PTR_A_REG)
.endif
.endm
.macro ZCALC_1x1_I PTR_A_REG,PTR_B_REG,Index,IsLast
ld %f1 , DISP16(\Index ,0)(\PTR_A_REG)
ld %f3 , DISP16(\Index ,8)(\PTR_A_REG)
ld %f4 , DISP16(\Index ,0)(\PTR_B_REG)
ld %f5 , DISP16(\Index ,8)(\PTR_B_REG)
.if \IsLast==1
la \PTR_A_REG, DISP16(\Index ,16)(\PTR_A_REG)
.endif
CalcComplex_1x1 %f6,%f7,%f1,%f3,%f4,%f5
.if \IsLast==1
la \PTR_B_REG, DISP16(\Index ,16)(\PTR_B_REG)
.endif
.endm
.macro ZCALC_4x4 PTR_A_REG,PTR_B_REG
ZCALC_4x4_I \PTR_A_REG,\PTR_B_REG,0,1
.endm
.macro ZCALC_4x2 PTR_A_REG,PTR_B_REG
ZCALC_4x2_I \PTR_A_REG,\PTR_B_REG,0,1
.endm
.macro ZCALC_4x1 PTR_A_REG,PTR_B_REG
ZCALC_4x1_I \PTR_A_REG,\PTR_B_REG,0,1
.endm
.macro ZCALC_4x4_4 PTR_A_REG,PTR_B_REG
ZCALC_4x4_I \PTR_A_REG,\PTR_B_REG,0,0
ZCALC_4x4_I \PTR_A_REG,\PTR_B_REG,1,0
ZCALC_4x4_I \PTR_A_REG,\PTR_B_REG,2,0
ZCALC_4x4_I \PTR_A_REG,\PTR_B_REG,3,1
.endm
.macro ZCALC_4x2_4 PTR_A_REG,PTR_B_REG
ZCALC_4x2_I \PTR_A_REG,\PTR_B_REG,0,0
ZCALC_4x2_I \PTR_A_REG,\PTR_B_REG,1,0
ZCALC_4x2_I \PTR_A_REG,\PTR_B_REG,2,0
ZCALC_4x2_I \PTR_A_REG,\PTR_B_REG,3,1
.endm
.macro ZCALC_4x1_4 PTR_A_REG,PTR_B_REG
ZCALC_4x1_I \PTR_A_REG,\PTR_B_REG,0,0
ZCALC_4x1_I \PTR_A_REG,\PTR_B_REG,1,0
ZCALC_4x1_I \PTR_A_REG,\PTR_B_REG,2,0
ZCALC_4x1_I \PTR_A_REG,\PTR_B_REG,3,1
.endm
.macro ZCALC_2x4_4 PTR_A_REG,PTR_B_REG
ZCALC_2x4_I \PTR_A_REG,\PTR_B_REG,0,0
ZCALC_2x4_I \PTR_A_REG,\PTR_B_REG,1,0
ZCALC_2x4_I \PTR_A_REG,\PTR_B_REG,2,0
ZCALC_2x4_I \PTR_A_REG,\PTR_B_REG,3,1
.endm
.macro ZCALC_2x4 PTR_A_REG,PTR_B_REG
ZCALC_2x4_I \PTR_A_REG,\PTR_B_REG,0,1
.endm
.macro ZCALC_1x4_4 PTR_A_REG,PTR_B_REG
ZCALC_1x4_I \PTR_A_REG,\PTR_B_REG,0,0
ZCALC_1x4_I \PTR_A_REG,\PTR_B_REG,1,0
ZCALC_1x4_I \PTR_A_REG,\PTR_B_REG,2,0
ZCALC_1x4_I \PTR_A_REG,\PTR_B_REG,3,1
.endm
.macro ZCALC_1x4 PTR_A_REG,PTR_B_REG
ZCALC_1x4_I \PTR_A_REG,\PTR_B_REG,0,1
.endm
.macro ZCALC_2x2 PTR_A_REG,PTR_B_REG
ZCALC_2x2_I \PTR_A_REG,\PTR_B_REG,0,1
.endm
.macro ZCALC_2x2_4 PTR_A_REG,PTR_B_REG
ZCALC_2x2_I \PTR_A_REG,\PTR_B_REG,0,0
ZCALC_2x2_I \PTR_A_REG,\PTR_B_REG,1,0
ZCALC_2x2_I \PTR_A_REG,\PTR_B_REG,2,0
ZCALC_2x2_I \PTR_A_REG,\PTR_B_REG,3,1
.endm
.macro ZCALC_2x1 PTR_A_REG,PTR_B_REG
ZCALC_2x1_I \PTR_A_REG,\PTR_B_REG,0,1
.endm
.macro ZCALC_2x1_4 PTR_A_REG,PTR_B_REG
ZCALC_2x1_I \PTR_A_REG,\PTR_B_REG,0,0
ZCALC_2x1_I \PTR_A_REG,\PTR_B_REG,1,0
ZCALC_2x1_I \PTR_A_REG,\PTR_B_REG,2,0
ZCALC_2x1_I \PTR_A_REG,\PTR_B_REG,3,1
.endm
.macro ZCALC_1x2_4 PTR_A_REG,PTR_B_REG
ZCALC_1x2_I \PTR_A_REG,\PTR_B_REG,0,0
ZCALC_1x2_I \PTR_A_REG,\PTR_B_REG,1,0
ZCALC_1x2_I \PTR_A_REG,\PTR_B_REG,2,0
ZCALC_1x2_I \PTR_A_REG,\PTR_B_REG,3,1
.endm
.macro ZCALC_1x2 PTR_A_REG,PTR_B_REG
ZCALC_1x2_I \PTR_A_REG,\PTR_B_REG,0,1
.endm
.macro ZCALC_1x1_4 PTR_A_REG,PTR_B_REG
ZCALC_1x1_I \PTR_A_REG,\PTR_B_REG,0,0
ZCALC_1x1_I \PTR_A_REG,\PTR_B_REG,1,0
ZCALC_1x1_I \PTR_A_REG,\PTR_B_REG,2,0
ZCALC_1x1_I \PTR_A_REG,\PTR_B_REG,3,1
.endm
.macro ZCALC_1x1 PTR_A_REG,PTR_B_REG
ZCALC_1x1_I \PTR_A_REG,\PTR_B_REG,0,1
.endm
/*****************************STORE RESULTS************************************/
.macro CalcMultAlpha_4x1 vRealResult1, vImageResult1, vRealResult2, vImageResult2, vReal1, vImage1, vReal2, vImage2, vecRealB, vecImageB
#if defined (TRMMKERNEL)
vfmdb \vRealResult1, \vImage1, \vecImageB
vfmdb \vImageResult1, \vReal1, \vecImageB
vfmdb \vRealResult2, \vImage2, \vecImageB
vfmdb \vImageResult2, \vReal2, \vecImageB
#else
vfmsdb \vRealResult1, \vImage1, \vecImageB, \vRealResult1
vfmadb \vImageResult1, \vReal1, \vecImageB, \vImageResult1
vfmsdb \vRealResult2, \vImage2, \vecImageB, \vRealResult2
vfmadb \vImageResult2, \vReal2, \vecImageB, \vImageResult2
#endif
vfmsdb \vRealResult1, \vReal1, \vecRealB, \vRealResult1
vfmadb \vImageResult1, \vImage1, \vecRealB, \vImageResult1
vfmsdb \vRealResult2, \vReal2, \vecRealB, \vRealResult2
vfmadb \vImageResult2, \vImage2, \vecRealB, \vImageResult2
.endm
.macro CalcMultAlpha_2x1 vRealResult1, vImageResult1, vReal1, vImage1, vecRealB, vecImageB
#if defined (TRMMKERNEL)
vfmdb \vRealResult1, \vImage1, \vecImageB
vfmdb \vImageResult1, \vReal1, \vecImageB
#else
vfmsdb \vRealResult1, \vImage1, \vecImageB, \vRealResult1
vfmadb \vImageResult1, \vReal1, \vecImageB, \vImageResult1
#endif
vfmsdb \vRealResult1, \vReal1, \vecRealB, \vRealResult1
vfmadb \vImageResult1, \vImage1, \vecRealB, \vImageResult1
.endm
.macro CalcMultAlpha_1x1 RealResult1, ImageResult1, Real1, Image1, RealB, ImageB
msdbr \RealResult1, \Image1, \ImageB
madbr \ImageResult1, \Real1, \ImageB
msdbr \RealResult1, \Real1, \RealB
madbr \ImageResult1, \Image1, \RealB
.endm
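/*
 A minimal C reference sketch (illustrative, not part of the kernel) of what
 CalcMultAlpha_1x1 computes on the non-TRMM path, i.e. C += alpha*acc, with
 fma() from <math.h> standing in for madbr/msdbr:

 #include <math.h>
 // (cr,ci): value loaded from C; (ar,ai): accumulated result;
 // (alphar,alphai): alpha, already sign-complemented for the RR-type cases.
 static void calc_mult_alpha_1x1(double *cr, double *ci,
                                 double ar, double ai,
                                 double alphar, double alphai) {
     *cr = fma(ai, alphai, -*cr); // cr = ai*alphai - cr           (msdbr)
     *ci = fma(ar, alphai, *ci);  // ci = ar*alphai + ci           (madbr)
     *cr = fma(ar, alphar, -*cr); // cr = cr0 + ar*alphar - ai*alphai
     *ci = fma(ai, alphar, *ci);  // ci = ci0 + ai*alphar + ar*alphai
 }
 // The TRMM path starts from C = 0 by using the plain multiply forms
 // (vfmdb in the vector macros) instead of loading C first.
*/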
.macro ZSTORE_4x4 ALPHA_VECREG,ALPHA_VECI,CIJ_REG,LDC_BYTE_ORIGINAL,LC1,LC2
#if !defined(TRMMKERNEL)
vl %v1 , 0(\CIJ_REG)
vl %v4 , 16(\CIJ_REG)
vpdi %v3,%v1,%v4,0
vl %v7 , 32(\CIJ_REG)
vpdi %v4,%v1,%v4,0b101
vl %v6 , 48(\CIJ_REG)
vpdi %v1,%v7,%v6,0
vpdi %v6,%v7,%v6,0b101
#endif
la \LC1,0(\LDC_BYTE_ORIGINAL, \LDC_BYTE_ORIGINAL)
CalcMultAlpha_4x1 %v3,%v4,%v1,%v6,%v16,%v17,%v18,%v19,\ALPHA_VECREG,\ALPHA_VECI
vpdi %v16, %v3 ,%v4,0
la \LC2,0(\LC1,\LDC_BYTE_ORIGINAL)
vpdi %v17, %v3,%v4,0b0101
vst %v16,0(\CIJ_REG)
vpdi %v18, %v1 ,%v6,0
vst %v17,16(\CIJ_REG)
vpdi %v19, %v1 ,%v6,0b0101
vst %v18,32(\CIJ_REG)
vst %v19,48(\CIJ_REG)
#if !defined(TRMMKERNEL)
vl %v1 , 0(\CIJ_REG,\LDC_BYTE_ORIGINAL)
vl %v4 , 16(\CIJ_REG,\LDC_BYTE_ORIGINAL)
vpdi %v3,%v1,%v4,0
vl %v7 , 32(\CIJ_REG,\LDC_BYTE_ORIGINAL)
vpdi %v4,%v1,%v4,0b101
vl %v6 , 48(\CIJ_REG,\LDC_BYTE_ORIGINAL)
vpdi %v1,%v7,%v6,0
vpdi %v6,%v7,%v6,0b101
#endif
CalcMultAlpha_4x1 %v3,%v4,%v1,%v6,%v20,%v21,%v22,%v23,\ALPHA_VECREG,\ALPHA_VECI
vpdi %v16, %v3 ,%v4,0
vpdi %v17, %v3 ,%v4,0b0101
vst %v16,0(\CIJ_REG,\LDC_BYTE_ORIGINAL)
vpdi %v18, %v1 ,%v6,0
vst %v17,16(\CIJ_REG,\LDC_BYTE_ORIGINAL)
vpdi %v19, %v1 ,%v6,0b0101
vst %v18,32(\CIJ_REG,\LDC_BYTE_ORIGINAL)
vst %v19,48(\CIJ_REG,\LDC_BYTE_ORIGINAL)
#if !defined(TRMMKERNEL)
vl %v1 , 0(\CIJ_REG,\LC1)
vl %v4 , 16(\CIJ_REG,\LC1)
vpdi %v3,%v1,%v4,0
vl %v7 , 32(\CIJ_REG,\LC1)
vpdi %v4,%v1,%v4,0b101
vl %v6 , 48(\CIJ_REG,\LC1)
vpdi %v1,%v7,%v6,0
vpdi %v6,%v7,%v6,0b101
#endif
CalcMultAlpha_4x1 %v3,%v4,%v1,%v6,%v24,%v25,%v26,%v27,\ALPHA_VECREG,\ALPHA_VECI
vpdi %v16, %v3 ,%v4,0
vpdi %v17, %v3 ,%v4,0b0101
vst %v16,0(\CIJ_REG,\LC1)
vpdi %v18, %v1 ,%v6,0
vst %v17,16(\CIJ_REG,\LC1)
vpdi %v19, %v1 ,%v6,0b0101
vst %v18,32(\CIJ_REG,\LC1)
vst %v19,48(\CIJ_REG,\LC1)
#if !defined(TRMMKERNEL)
vl %v1 , 0(\CIJ_REG,\LC2)
vl %v4 , 16(\CIJ_REG,\LC2)
vpdi %v3,%v1,%v4,0
vl %v7 , 32(\CIJ_REG,\LC2)
vpdi %v4,%v1,%v4,0b101
vl %v6 , 48(\CIJ_REG,\LC2)
vpdi %v1,%v7,%v6,0
vpdi %v6,%v7,%v6,0b101
#endif
CalcMultAlpha_4x1 %v3,%v4,%v1,%v6,%v28,%v29,%v30,%v31,\ALPHA_VECREG,\ALPHA_VECI
vpdi %v16, %v3 ,%v4,0
vpdi %v17, %v3 ,%v4,0b0101
vst %v16,0(\CIJ_REG,\LC2)
vpdi %v18, %v1 ,%v6,0
vst %v17,16(\CIJ_REG,\LC2)
vpdi %v19, %v1 ,%v6,0b0101
vst %v18,32(\CIJ_REG,\LC2)
vst %v19,48(\CIJ_REG,\LC2)
la \CIJ_REG,64(\CIJ_REG)
.endm
.macro ZSTORE_4x2 ALPHA_VECREG,ALPHA_VECI,CIJ_REG,LDC_BYTE_ORIGINAL
#if !defined(TRMMKERNEL)
vl %v1 , 0(\CIJ_REG)
vl %v4 , 16(\CIJ_REG)
vpdi %v3,%v1,%v4,0
vl %v7 , 32(\CIJ_REG)
vpdi %v4,%v1,%v4,0b101
vl %v6 , 48(\CIJ_REG)
vpdi %v1,%v7,%v6,0
vpdi %v6,%v7,%v6,0b101
#endif
CalcMultAlpha_4x1 %v3,%v4,%v1,%v6,%v16,%v17,%v18,%v19,\ALPHA_VECREG,\ALPHA_VECI
vpdi %v16, %v3 ,%v4,0
vpdi %v17, %v3,%v4,0b0101
vst %v16,0(\CIJ_REG)
vpdi %v18, %v1 ,%v6,0
vst %v17,16(\CIJ_REG)
vpdi %v19, %v1 ,%v6,0b0101
vst %v18,32(\CIJ_REG)
vst %v19,48(\CIJ_REG)
#if !defined(TRMMKERNEL)
vl %v1 , 0(\CIJ_REG,\LDC_BYTE_ORIGINAL)
vl %v4 , 16(\CIJ_REG,\LDC_BYTE_ORIGINAL)
vpdi %v3,%v1,%v4,0
vl %v7 , 32(\CIJ_REG,\LDC_BYTE_ORIGINAL)
vpdi %v4,%v1,%v4,0b101
vl %v6 , 48(\CIJ_REG,\LDC_BYTE_ORIGINAL)
vpdi %v1,%v7,%v6,0
vpdi %v6,%v7,%v6,0b101
#endif
CalcMultAlpha_4x1 %v3,%v4,%v1,%v6,%v20,%v21,%v22,%v23,\ALPHA_VECREG,\ALPHA_VECI
vpdi %v20, %v3 ,%v4,0
vpdi %v21, %v3 ,%v4,0b0101
vst %v20,0(\CIJ_REG,\LDC_BYTE_ORIGINAL)
vpdi %v22, %v1 ,%v6,0
vst %v21,16(\CIJ_REG,\LDC_BYTE_ORIGINAL)
vpdi %v23, %v1 ,%v6,0b0101
vst %v22,32(\CIJ_REG,\LDC_BYTE_ORIGINAL)
vst %v23,48(\CIJ_REG,\LDC_BYTE_ORIGINAL)
la \CIJ_REG,64(\CIJ_REG)
.endm
.macro ZSTORE_4x1 ALPHA_VECREG,ALPHA_VECI,CIJ_REG,LDC_BYTE_ORIGINAL
#if !defined(TRMMKERNEL)
vl %v1 , 0(\CIJ_REG)
vl %v4 , 16(\CIJ_REG)
vpdi %v3,%v1,%v4,0
vl %v7 , 32(\CIJ_REG)
vpdi %v4,%v1,%v4,0b101
vl %v6 , 48(\CIJ_REG)
vpdi %v1,%v7,%v6,0
vpdi %v6,%v7,%v6,0b101
#endif
CalcMultAlpha_4x1 %v3,%v4,%v1,%v6,%v16,%v17,%v18,%v19,\ALPHA_VECREG,\ALPHA_VECI
vpdi %v16, %v3 ,%v4,0
vpdi %v17, %v3,%v4,0b0101
vst %v16,0(\CIJ_REG)
vpdi %v18, %v1 ,%v6,0
vst %v17,16(\CIJ_REG)
vpdi %v19, %v1 ,%v6,0b0101
vst %v18,32(\CIJ_REG)
vst %v19,48(\CIJ_REG)
la \CIJ_REG,64(\CIJ_REG)
.endm
.macro ZSTORE_1x4 ALPHA_VECREG,ALPHA_VECI,CIJ_REG,LDC_BYTE_ORIGINAL,LC1,LC2
#if !defined(TRMMKERNEL)
vl %v1 , 0(\CIJ_REG)
la \LC1,0(\LDC_BYTE_ORIGINAL, \LDC_BYTE_ORIGINAL)
vl %v4 , 0(\CIJ_REG, \LDC_BYTE_ORIGINAL)
vpdi %v3,%v1,%v4,0
la \LC2,0(\LC1,\LDC_BYTE_ORIGINAL)
vl %v7 , 0(\CIJ_REG, \LC1)
vpdi %v4,%v1,%v4,0b101
vl %v6 , 0(\CIJ_REG,\LC2)
vpdi %v1,%v7,%v6,0
vpdi %v6,%v7,%v6,0b101
#else
la \LC1,0(\LDC_BYTE_ORIGINAL, \LDC_BYTE_ORIGINAL)
#endif
CalcMultAlpha_4x1 %v3,%v4,%v1,%v6,%v16,%v17,%v18,%v19,\ALPHA_VECREG,\ALPHA_VECI
#if defined(TRMMKERNEL)
la \LC2,0(\LC1,\LDC_BYTE_ORIGINAL)
#endif
vpdi %v16, %v3 ,%v4,0
vpdi %v17, %v3,%v4,0b0101
vst %v16,0(\CIJ_REG)
vpdi %v18, %v1 ,%v6,0
vst %v17,0(\CIJ_REG, \LDC_BYTE_ORIGINAL)
vpdi %v19, %v1 ,%v6,0b0101
vst %v18,0(\CIJ_REG, \LC1)
vst %v19,0(\CIJ_REG,\LC2)
la \CIJ_REG,16(\CIJ_REG)
.endm
.macro ZSTORE_2x4 ALPHA_VECREG,ALPHA_VECI,CIJ_REG,LDC_BYTE_ORIGINAL,LC1,LC2
#if !defined(TRMMKERNEL)
vl %v1 , 0(\CIJ_REG)
vl %v26 , 16(\CIJ_REG)
la \LC1,0(\LDC_BYTE_ORIGINAL, \LDC_BYTE_ORIGINAL)
vl %v4 , 0(\CIJ_REG, \LDC_BYTE_ORIGINAL)
vl %v25 , 16(\CIJ_REG, \LDC_BYTE_ORIGINAL)
vpdi %v3,%v1,%v4,0
vpdi %v24,%v26,%v25,0
la \LC2,0(\LC1,\LDC_BYTE_ORIGINAL)
vl %v7 , 0(\CIJ_REG, \LC1)
vl %v28 , 16(\CIJ_REG, \LC1)
vpdi %v4,%v1,%v4,0b101
vpdi %v25,%v26,%v25,0b101
vl %v6 , 0(\CIJ_REG,\LC2)
vl %v27 , 16(\CIJ_REG,\LC2)
vpdi %v1,%v7,%v6,0
vpdi %v6,%v7,%v6,0b101
vpdi %v26,%v28,%v27,0
vpdi %v27,%v28,%v27,0b101
#else
la \LC1,0(\LDC_BYTE_ORIGINAL, \LDC_BYTE_ORIGINAL)
#endif
CalcMultAlpha_4x1 %v3,%v4,%v1,%v6,%v16,%v17,%v18,%v19,\ALPHA_VECREG,\ALPHA_VECI
CalcMultAlpha_4x1 %v24,%v25,%v26,%v27,%v20,%v21,%v22,%v23,\ALPHA_VECREG,\ALPHA_VECI
#if defined(TRMMKERNEL)
la \LC2,0(\LC1,\LDC_BYTE_ORIGINAL)
#endif
vpdi %v16, %v3 ,%v4,0
vpdi %v17, %v3,%v4,0b0101
vpdi %v20, %v24 ,%v25,0
vpdi %v21, %v24,%v25,0b0101
vpdi %v22, %v26 ,%v27,0
vpdi %v23, %v26 ,%v27,0b0101
vst %v16,0(\CIJ_REG)
vst %v20,16(\CIJ_REG)
vpdi %v18, %v1 ,%v6,0
vst %v17,0(\CIJ_REG, \LDC_BYTE_ORIGINAL)
vst %v21,16(\CIJ_REG, \LDC_BYTE_ORIGINAL)
vpdi %v19, %v1 ,%v6,0b0101
vst %v18,0(\CIJ_REG, \LC1)
vst %v22,16(\CIJ_REG, \LC1)
vst %v19,0(\CIJ_REG,\LC2)
vst %v23,16(\CIJ_REG,\LC2)
la \CIJ_REG,32(\CIJ_REG)
.endm
.macro ZSTORE_2x2 ALPHA_VECREG,ALPHA_VECI,CIJ_REG,LDC_BYTE_ORIGINAL
#if !defined(TRMMKERNEL)
vl %v1 , 0(\CIJ_REG)
vl %v4 , 16(\CIJ_REG)
vpdi %v3,%v1,%v4,0
vpdi %v4,%v1,%v4,0b101
vl %v5 , 0(\CIJ_REG,\LDC_BYTE_ORIGINAL)
vl %v7 , 16(\CIJ_REG,\LDC_BYTE_ORIGINAL)
vpdi %v6,%v5,%v7,0
vpdi %v7,%v5,%v7,0b101
#endif
CalcMultAlpha_2x1 %v3,%v4, %v16,%v17,\ALPHA_VECREG,\ALPHA_VECI
CalcMultAlpha_2x1 %v6,%v7, %v20,%v21,\ALPHA_VECREG,\ALPHA_VECI
vpdi %v16, %v3 ,%v4,0
vpdi %v17, %v3,%v4,0b0101
vst %v16,0(\CIJ_REG)
vst %v17,16(\CIJ_REG)
vpdi %v20, %v6 ,%v7,0
vpdi %v21, %v6 ,%v7,0b0101
vst %v20,0(\CIJ_REG,\LDC_BYTE_ORIGINAL)
vst %v21,16(\CIJ_REG,\LDC_BYTE_ORIGINAL)
la \CIJ_REG,32(\CIJ_REG)
.endm
.macro ZSTORE_2x1 ALPHA_VECREG,ALPHA_VECI,CIJ_REG,LDC_BYTE_ORIGINAL
#if !defined(TRMMKERNEL)
vl %v1 , 0(\CIJ_REG)
vl %v4 , 16(\CIJ_REG)
vpdi %v3,%v1,%v4,0
vpdi %v4,%v1,%v4,0b101
#endif
CalcMultAlpha_2x1 %v3,%v4, %v16,%v17,\ALPHA_VECREG,\ALPHA_VECI
vpdi %v16, %v3 ,%v4,0
vpdi %v17, %v3,%v4,0b0101
vst %v16,0(\CIJ_REG)
vst %v17,16(\CIJ_REG)
la \CIJ_REG,32(\CIJ_REG)
.endm
.macro ZSTORE_1x2 ALPHA_VECREG,ALPHA_VECI,CIJ_REG,LDC_BYTE_ORIGINAL
#if !defined(TRMMKERNEL)
vl %v1 , 0(\CIJ_REG)
vl %v4 , 0(\CIJ_REG,\LDC_BYTE_ORIGINAL)
vpdi %v3,%v1,%v4,0
vpdi %v4,%v1,%v4,0b101
#endif
CalcMultAlpha_2x1 %v3,%v4, %v16,%v17,\ALPHA_VECREG,\ALPHA_VECI
vpdi %v16, %v3 ,%v4,0
vpdi %v17, %v3,%v4,0b0101
vst %v16,0(\CIJ_REG)
vst %v17,0(\CIJ_REG,\LDC_BYTE_ORIGINAL)
la \CIJ_REG,16(\CIJ_REG)
.endm
.macro ZSTORE_1x1 ALPHA_RR,ALPHA_RI,CIJ_REG
#if defined (TRMMKERNEL)
lzdr %f1
lzdr %f4
#else
ld %f1 , 0(\CIJ_REG)
ld %f4 , 8(\CIJ_REG)
#endif
CalcMultAlpha_1x1 %f1,%f4, %f6,%f7,\ALPHA_RR,\ALPHA_RI
std %f1,0(\CIJ_REG)
std %f4,8(\CIJ_REG)
la \CIJ_REG,16(\CIJ_REG)
.endm
/****************************TRMM POINTER REFRESH MACROS*************************/
.macro RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/* ptrbb = bb;*/
lgr \PTR_B,\B_VAL /*refresh BPOINT*/
#else
/* ptrba = ptrba + off*C_A;
ptrbb = bb + off*C_B;*/
.if \C_B==4
.if \C_A==4
sllg \PTR_B, \OFF_VAL,6
agr \PTR_A,\PTR_B /*ptrba+off*4*/
la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/
.elseif \C_A==2
sllg \PTR_B, \OFF_VAL,5
la \PTR_A,0(\PTR_A,\PTR_B) /*ptrba+off*2*/
agr \PTR_B, \PTR_B
la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/
.elseif \C_A==1
sllg \PTR_B, \OFF_VAL,4
agr \PTR_A,\PTR_B /*ptrba+off*1*/
sllg \PTR_B, \OFF_VAL,6
la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/
.endif
.elseif \C_B==2
.if \C_A==4
sllg \PTR_B, \OFF_VAL,5
la \PTR_A,0(\PTR_A,\PTR_B) /*ptrba+off*2*/
agr \PTR_A,\PTR_B /*ptrba+off*2 again, off*4 in total*/
la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/
.elseif \C_A==2
sllg \PTR_B, \OFF_VAL,5
agr \PTR_A,\PTR_B /*ptrba+off*2*/
la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/
.elseif \C_A==1
sllg \PTR_B, \OFF_VAL,4
la \PTR_A,0(\PTR_A,\PTR_B) /*ptrba+off*1*/
agr \PTR_B,\PTR_B /* off+off*/
la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/
.endif
.elseif \C_B==1
.if \C_A==4
sllg \PTR_B, \OFF_VAL,6
agr \PTR_A,\PTR_B /*ptrba+off*4*/
sllg \PTR_B, \OFF_VAL,4
la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/
.elseif \C_A==2
sllg \PTR_B, \OFF_VAL,4
la \PTR_A,0(\PTR_A,\PTR_B) /*ptrba+off*1*/
agr \PTR_A,\PTR_B /*ptrba+off*1 again, off*2 in total*/
la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/
.elseif \C_A==1
sllg \PTR_B, \OFF_VAL,4
agr \PTR_A,\PTR_B /*ptrba+off*1*/
la \PTR_B,0(\B_VAL,\PTR_B) /*refresh BPOINT*/
.endif
.endif
#endif
.endm
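/*
 A minimal C reference sketch (illustrative, not part of the kernel) of the
 logic RefreshPointers implements; C_A and C_B count double-complex
 elements, i.e. 16 bytes (2 doubles) each:

 static void refresh_pointers(double **ptrba, double **ptrbb, double *bb,
                              long off, long c_a, long c_b,
                              int b_from_start) {
     if (b_from_start) {               // (LEFT && TRANSA) || (!LEFT && !TRANSA)
         *ptrbb = bb;                  // ptrbb = bb
     } else {
         *ptrba += off * c_a * 2;      // ptrba += off*C_A
         *ptrbb  = bb + off * c_b * 2; // ptrbb  = bb + off*C_B
     }
 }
*/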
/**/
.macro RefreshTempBk TEMP_VAL,BK_VAL,OFF_VAL,INCR_A,INCR_B
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
/* temp = bk-off;*/
sgrk \TEMP_VAL,\BK_VAL,\OFF_VAL
#elif defined(LEFT)
/* temp = off+INCR_A; // number of values in A */
la \TEMP_VAL,\INCR_A(\OFF_VAL)
#else
/* temp = off+INCR_B; // number of values in B */
la \TEMP_VAL,\INCR_B(\OFF_VAL)
#endif
.endm
.macro RefreshPointersAndOFF TEMP_VAL,BK_VAL,OFF_VAL,PTR_A,C_A,C_B
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/*temp = bk - off;*/
sgrk \TEMP_VAL,\BK_VAL,\OFF_VAL
#ifdef LEFT
/*temp -= C_A; // number of values in A*/
lay \TEMP_VAL,-\C_A(\TEMP_VAL)
#else
/*temp -= C_B; // number of values in B*/
lay \TEMP_VAL,-\C_B(\TEMP_VAL)
#endif
/*ptrba += temp*C_A;
ptrbb += temp*C_B;*/
.if \C_A==4
sllg \TEMP_VAL, \TEMP_VAL,6 /*temp*4*/
.elseif \C_A==2
sllg \TEMP_VAL, \TEMP_VAL,5 /*temp*2*/
.elseif \C_A==1
sllg \TEMP_VAL, \TEMP_VAL,4 /*temp*1*/
.endif
la \PTR_A,0(\PTR_A,\TEMP_VAL) /*ptrba+temp*C_A*/
#endif
#ifdef LEFT
/*off += C_A; // number of values in A*/
aghi \OFF_VAL,\C_A
#endif
.endm
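/*
 A minimal C reference sketch (illustrative, not part of the kernel) of the
 remaining TRMM helpers; C_A and C_B count double-complex elements. Note the
 assembly macro RefreshPointersAndOFF only advances ptrba; ptrbb is handled
 by the caller.

 static long refresh_temp_bk(long bk, long off, long incr_a, long incr_b,
                             int left_xor_transa, int left) {
     if (left_xor_transa)     // (LEFT && !TRANSA) || (!LEFT && TRANSA)
         return bk - off;     // temp = bk - off
     return left ? off + incr_a   // number of values in A
                 : off + incr_b;  // number of values in B
 }
 static void refresh_pointers_and_off(long bk, long *off, double **ptrba,
                                      long c_a, long c_b, int left,
                                      int left_eq_transa) {
     if (left_eq_transa) {          // (LEFT && TRANSA) || (!LEFT && !TRANSA)
         long temp = bk - *off;
         temp -= left ? c_a : c_b;  // values remaining in A resp. B
         *ptrba += temp * c_a * 2;  // ptrba += temp*C_A (2 doubles/element)
     }
     if (left)
         *off += c_a;               // off += C_A
 }
*/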