You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

ctrmm4x4V.S 19 kB

8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
8 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733
  1. /***************************************************************************
  2. Copyright (c) 2013-2017, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2017/03/12 AbdelRauf (quickwritereader@gmail.com)
  29. * BLASTEST : passed
  30. * CTEST : passed
  31. * TEST : passed
  32. **************************************************************************************/
  33. /*********************************************************************/
  34. /* Copyright 2009, 2010 The University of Texas at Austin. */
  35. /* All rights reserved. */
  36. /* */
  37. /* Redistribution and use in source and binary forms, with or */
  38. /* without modification, are permitted provided that the following */
  39. /* conditions are met: */
  40. /* */
  41. /* 1. Redistributions of source code must retain the above */
  42. /* copyright notice, this list of conditions and the following */
  43. /* disclaimer. */
  44. /* */
  45. /* 2. Redistributions in binary form must reproduce the above */
  46. /* copyright notice, this list of conditions and the following */
  47. /* disclaimer in the documentation and/or other materials */
  48. /* provided with the distribution. */
  49. /* */
  50. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  51. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  52. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  53. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  54. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  55. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  56. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  57. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  58. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  59. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  60. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  61. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  62. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  63. /* POSSIBILITY OF SUCH DAMAGE. */
  64. /* */
  65. /* The views and conclusions contained in the software and */
  66. /* documentation are those of the authors and should not be */
  67. /* interpreted as representing official policies, either expressed */
  68. /* or implied, of The University of Texas at Austin. */
  69. /*********************************************************************/
  70. #define ASSEMBLER
  71. #include "common.h"
  72. /*
  73. Kernel arguments: BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alphar, FLOAT alphai, FLOAT* ba, FLOAT* bb,
  74. FLOAT* C, BLASLONG ldc, BLASLONG offset
  75. Argument locations: bm=r2, bn=r3, bk=r4, alphar=f0, alphai=f2, ba=r5, bb=r6, C=stack[160], ldc=stack[168],
  76. offset=stack[176] (offset is only loaded when TRMMKERNEL is defined)
  77. **********************************************************************************************/
  78. /* Note: r0 cannot be used as a base or index register in an address, so it is only used as a counter (BM_CUR) */
  79. #define BM %r2 /* bm argument */
  80. #define BM_CUR %r0 /* countdown of 4-row tiles (bm >> 2) */
  81. #define BN %r3 /* bn argument */
  82. #define BN_CUR %r10 /* countdown of 4-column panels (bn >> 2) */
  83. #define BK %r4 /* bk argument */
  84. #define LDC_BYTE %r8 /* ldc converted to bytes (ldc << 3) */
  85. #define ALPHA %f0 /* alpha, real part */
  86. #define ALPHA_I %f2 /* alpha, imaginary part */
  87. #define ALPHA_VECT %v0 /* vector-register view of ALPHA */
  88. #define ALPHA_VECT_I %v2 /* vector-register view of ALPHA_I */
  89. #define LOCAL_VAR1 %r9 /* scratch; mostly the k-loop counter */
  90. #define LOCAL_VAR2 %r1 /* scratch; mostly the running B pointer */
  91. #define LOCAL_VAR3 %r11 /* running A pointer */
  92. #define A %r5 /* ba argument */
  93. #define B %r6 /* bb argument, advanced after each column panel */
  94. #define CIJ %r7 /* C pointer of the current column panel */
  95. #define CIJ_LOCAL %r12 /* C pointer passed to the ZSTORE macros */
  96. #define OFF %r13 /* working copy of the TRMM offset */
  97. #define OFFSET %f8 /* offset argument parked in a floating-point register */
  98. #define ALIGN_4 .align 32
  99. #define ALIGN_2 .align 16
  100. #define PREFETCH_INS 1 /* defined: the pfd prefetch instructions are assembled */
  101. /**************************Include kernel helper macros**********************************/
  102. #include "ckernelMacrosV.S"
  103. /***********************************CGEMM**4x4*******************************************************/
  104. PROLOGUE
  105. #if defined(TRMMKERNEL)
  106. std OFFSET ,40(%r15) /* save f8: it is reused below to hold the offset argument */
  107. stmg %r6,%r13,48(%r15) /* save r6-r13 (r13 is used as OFF) */
  108. #else
  109. stmg %r6,%r12,48(%r15) /* save r6-r12 */
  110. #endif
  111. std %f9, 128(%r15) /* save f9-f12; NOTE(review): presumably clobbered by the macros in ckernelMacrosV.S - confirm */
  112. std %f10,136(%r15)
  113. std %f11,144(%r15)
  114. std %f12,152(%r15)
  115. lg CIJ, 160(%r15) /* C pointer (stack argument) */
  116. lg LOCAL_VAR1, 168(%r15) /* ldc (stack argument) */
  117. #if defined(TRMMKERNEL)
  118. lg OFF,176(%r15) /* offset (stack argument) */
  119. ldgr OFFSET ,OFF /* keep a copy of offset in f8 so OFF can be reset later */
  120. #endif
  121. srlg BN_CUR,BN,2 /* BN_CUR = bn >> 2: number of 4-column panels */
  122. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  123. lcdbr ALPHA_I,ALPHA_I /* these variants use -alpha: flip the sign of both parts */
  124. lcdbr ALPHA ,ALPHA
  125. #endif
  126. vrepg ALPHA_VECT,ALPHA_VECT,0 /* replicate the real part of alpha (passed in f0) into both doublewords */
  127. sllg LDC_BYTE, LOCAL_VAR1,3 /* LDC_BYTE = ldc * 8 bytes per complex element, i.e. ldc << 3 */
  128. vrepg ALPHA_VECT_I,ALPHA_VECT_I,0 /* replicate the imaginary part of alpha (passed in f2) into both doublewords */
  129. vldeb ALPHA_VECT,ALPHA_VECT /* lengthen alpha from short to long floating-point format */
  130. vldeb ALPHA_VECT_I,ALPHA_VECT_I
  131. #if defined(TRMMKERNEL) && !defined(LEFT)
  132. /*off = -offset;*/
  133. lgdr LOCAL_VAR1,OFFSET
  134. lcgr OFF,LOCAL_VAR1
  135. #endif
  136. cijle BN_CUR,0,.LX2 /* no 4-column panels: continue with the 2-column section */
  137. ALIGN_4
  138. .LX4_BN: /* outer loop: one pass per 4-column panel, BN_CUR passes */
  139. #if defined(PREFETCH_INS)
  140. pfd 1, 0(A)
  141. pfd 1, 0(B)
  142. #endif
  143. #if defined(TRMMKERNEL) && defined(LEFT)
  144. /* off = offset; reset from the copy kept in OFFSET for every panel */
  145. lgdr OFF,OFFSET
  146. #endif
  147. srlg BM_CUR,BM,2 /* BM_CUR = bm >> 2: number of 4-row tiles */
  148. lgr LOCAL_VAR3,A /* running A pointer restarts at A for each panel */
  149. lgr CIJ_LOCAL,CIJ /* C pointer handed to the store macros */
  150. cijle BM_CUR,0,.L2x4 /* no 4-row tiles: go to the 2-row remainder */
  151. ALIGN_4
  152. .L4x4_BM: /* 4x4 tile loop, BM_CUR passes */
  153. #if defined(TRMMKERNEL)
  154. /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
  155. RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,4,4
  156. RefreshTempBk LOCAL_VAR1,BK,OFF,4,4
  157. srl LOCAL_VAR1,2 /* unrolled-loop count = RefreshTempBk result >> 2 (32-bit shift) */
  158. #else
  159. srlg LOCAL_VAR1,BK,2 /* unrolled-loop count = BK >> 2 */
  160. lgr LOCAL_VAR2,B /* running B pointer restarts at the panel start */
  161. #endif
  162. ZERO_ZCVEC_4x4
  163. cijle LOCAL_VAR1,0,.L4x4_mod /* count is zero: only the remainder loop can run */
  164. ALIGN_4
  165. .L4x4_4_BK: /* unrolled k loop, LOCAL_VAR1 passes (ZCALC_4x4_4 presumably covers 4 k-steps - see ckernelMacrosV.S) */
  166. ZCALC_4x4_4 LOCAL_VAR3,LOCAL_VAR2
  167. #if defined(PREFETCH_INS)
  168. pfd 1, 128(LOCAL_VAR3) /*256-128*/
  169. pfd 1, 128(LOCAL_VAR2 )
  170. #endif
  171. brctg LOCAL_VAR1,.L4x4_4_BK
  172. ALIGN_4
  173. .L4x4_mod:
  174. #if defined(TRMMKERNEL)
  175. RefreshTempBk LOCAL_VAR1,BK,OFF,4,4
  176. nill LOCAL_VAR1,3 /* remainder count = value & 3; NOTE(review): nill only masks the low 16 bits, so this relies on the value being < 65536 - confirm */
  177. #else
  178. lghi LOCAL_VAR1,3
  179. NGR LOCAL_VAR1,BK /* remainder count = BK & 3 */
  180. #endif
  181. jz .L4x4_BK_Store /* no remainder: go straight to the store */
  182. ALIGN_4
  183. .L4x4_BK: /* remainder k loop, LOCAL_VAR1 passes */
  184. ZCALC_4x4 LOCAL_VAR3,LOCAL_VAR2
  185. brctg LOCAL_VAR1,.L4x4_BK
  186. ALIGN_4
  187. .L4x4_BK_Store:
  188. /* store the tile to C: CIJ_LOCAL is the destination, LDC_BYTE is ldc in bytes; LOCAL_VAR1/LOCAL_VAR2 are presumably scratch for the macro */
  189. ZSTORE_4x4 ALPHA_VECT,ALPHA_VECT_I ,CIJ_LOCAL, LDC_BYTE,LOCAL_VAR1,LOCAL_VAR2
  190. #if defined(TRMMKERNEL)
  191. RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR3,4,4
  192. #endif
  193. brctg BM_CUR,.L4x4_BM
  194. ALIGN_2
  195. .L2x4:
  196. tmll BM,2 /* test bit value 2 of bm: is there a 2-row remainder? */
  197. jz .L1x4 /* bit clear: skip the 2x4 tile */
  198. ALIGN_4
  199. .L2x4_BM: /* single 2x4 tile */
  200. #if defined(TRMMKERNEL)
  201. /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
  202. RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,2,4
  203. RefreshTempBk LOCAL_VAR1,BK,OFF,2,4
  204. srl LOCAL_VAR1,2 /* unrolled-loop count = RefreshTempBk result >> 2 (32-bit shift) */
  205. #else
  206. srlg LOCAL_VAR1,BK,2 /* unrolled-loop count = BK >> 2 */
  207. lgr LOCAL_VAR2,B /* running B pointer restarts at the panel start */
  208. #endif
  209. ZERO_ZCVEC_2x4
  210. cijle LOCAL_VAR1,0,.L2x4_mod /* count is zero: only the remainder loop can run */
  211. ALIGN_4
  212. .L2x4_4_BK: /* unrolled k loop, LOCAL_VAR1 passes */
  213. ZCALC_2x4_4 LOCAL_VAR3,LOCAL_VAR2
  214. #if defined(PREFETCH_INS)
  215. pfd 1, 128(LOCAL_VAR2)
  216. #endif
  217. brctg LOCAL_VAR1,.L2x4_4_BK
  218. ALIGN_4
  219. .L2x4_mod:
  220. #if defined(TRMMKERNEL)
  221. RefreshTempBk LOCAL_VAR1,BK,OFF,2,4
  222. nill LOCAL_VAR1,3 /* remainder count = value & 3; NOTE(review): nill only masks the low 16 bits - assumes value < 65536 */
  223. #else
  224. lghi LOCAL_VAR1,3
  225. NGR LOCAL_VAR1,BK /* remainder count = BK & 3 */
  226. #endif
  227. jz .L2x4_BK_Store /* no remainder: go straight to the store */
  228. ALIGN_4
  229. .L2x4_BK: /* remainder k loop, LOCAL_VAR1 passes */
  230. ZCALC_2x4 LOCAL_VAR3,LOCAL_VAR2
  231. brctg LOCAL_VAR1,.L2x4_BK
  232. ALIGN_4
  233. .L2x4_BK_Store:
  234. /* store the tile to C: CIJ_LOCAL is the destination, LDC_BYTE is ldc in bytes */
  235. ZSTORE_2x4 ALPHA_VECT,ALPHA_VECT_I ,CIJ_LOCAL, LDC_BYTE ,LOCAL_VAR1,LOCAL_VAR2
  236. #if defined(TRMMKERNEL)
  237. RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR3,2,4
  238. #endif
  239. ALIGN_4
  240. .L1x4:
  241. tmll BM,1 /* test bit value 1 of bm: is there a single leftover row? */
  242. jz .Lx4_INNER_END /* bit clear: this panel is finished */
  243. ALIGN_4
  244. .L1x4_BM: /* single 1x4 tile */
  245. #if defined(TRMMKERNEL)
  246. /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
  247. RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,1,4
  248. RefreshTempBk LOCAL_VAR1,BK,OFF,1,4
  249. srl LOCAL_VAR1,2 /* unrolled-loop count = RefreshTempBk result >> 2 (32-bit shift) */
  250. #else
  251. srlg LOCAL_VAR1,BK,2 /* unrolled-loop count = BK >> 2 */
  252. lgr LOCAL_VAR2,B /* running B pointer restarts at the panel start */
  253. #endif
  254. ZERO_ZCVEC_1x4
  255. cijle LOCAL_VAR1,0,.L1x4_mod /* count is zero: only the remainder loop can run */
  256. ALIGN_4
  257. .L1x4_4_BK: /* unrolled k loop, LOCAL_VAR1 passes */
  258. ZCALC_1x4_4 LOCAL_VAR3,LOCAL_VAR2
  259. brctg LOCAL_VAR1,.L1x4_4_BK
  260. ALIGN_4
  261. .L1x4_mod:
  262. #if defined(TRMMKERNEL)
  263. RefreshTempBk LOCAL_VAR1,BK,OFF,1,4
  264. nill LOCAL_VAR1,3 /* remainder count = value & 3; NOTE(review): nill only masks the low 16 bits - assumes value < 65536 */
  265. #else
  266. lghi LOCAL_VAR1,3
  267. NGR LOCAL_VAR1,BK /* remainder count = BK & 3 */
  268. #endif
  269. jz .L1x4_BK_Store /* no remainder: go straight to the store */
  270. ALIGN_4
  271. .L1x4_BK: /* remainder k loop, LOCAL_VAR1 passes */
  272. ZCALC_1x4 LOCAL_VAR3,LOCAL_VAR2
  273. brctg LOCAL_VAR1,.L1x4_BK
  274. ALIGN_4
  275. .L1x4_BK_Store:
  276. /* store the tile to C: CIJ_LOCAL is the destination, LDC_BYTE is ldc in bytes */
  277. ZSTORE_1x4 ALPHA_VECT,ALPHA_VECT_I ,CIJ_LOCAL, LDC_BYTE,LOCAL_VAR1,LOCAL_VAR2
  278. #if defined(TRMMKERNEL)
  279. RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR3,1,4
  280. #endif
  281. ALIGN_2
  282. .Lx4_INNER_END:
  283. /* advance C and B to the next 4-column panel, then loop */
  284. sllg LOCAL_VAR1,LDC_BYTE,2 /* LDC_BYTE * 4 */
  285. #if defined(TRMMKERNEL) && !defined(LEFT)
  286. aghi OFF,4 /* off += 4 (one per column of the panel just finished) */
  287. #endif
  288. sllg LOCAL_VAR2,BK,5 /* BK * 4 * sizeof(complex) = BK * 4 * 8 = BK << 5 */
  289. la CIJ,0(CIJ,LOCAL_VAR1) /* CIJ = CIJ + LDC_BYTE*4 */
  290. la B,0(B,LOCAL_VAR2) /* B = B + BK*4*sizeof(complex) */
  291. brctg BN_CUR,.LX4_BN
  292. /*********************************X2 SECTION************************************************/
  293. ALIGN_4
  294. .LX2:
  295. tmll BN,2 /* test bit value 2 of bn: is there a 2-column panel left? */
  296. jz .Lx1 /* bit clear: continue with the 1-column section */
  297. ALIGN_4
  298. .Lx2_BN: /* single 2-column panel */
  299. #if defined(TRMMKERNEL) && defined(LEFT)
  300. /* off = offset; reset from the copy kept in OFFSET */
  301. lgdr OFF,OFFSET
  302. #endif
  303. srlg BM_CUR,BM,2 /* BM_CUR = bm >> 2: number of 4-row tiles */
  304. lgr LOCAL_VAR3,A /* running A pointer restarts at A */
  305. lgr CIJ_LOCAL,CIJ /* C pointer handed to the store macros */
  306. cijle BM_CUR,0,.L2x2 /* no 4-row tiles: go to the 2-row remainder */
  307. ALIGN_4
  308. .L4x2_BM: /* 4x2 tile loop, BM_CUR passes */
  309. #if defined(TRMMKERNEL)
  310. /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
  311. RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,4,2
  312. RefreshTempBk LOCAL_VAR1,BK,OFF,4,2
  313. srl LOCAL_VAR1,2 /* unrolled-loop count = RefreshTempBk result >> 2 (32-bit shift) */
  314. #else
  315. srlg LOCAL_VAR1,BK,2 /* unrolled-loop count = BK >> 2 */
  316. lgr LOCAL_VAR2,B /* running B pointer restarts at the panel start */
  317. #endif
  318. ZERO_ZCVEC_4x2
  319. cijle LOCAL_VAR1,0,.L4x2_mod /* count is zero: only the remainder loop can run */
  320. ALIGN_4
  321. .L4x2_4_BK: /* unrolled k loop, LOCAL_VAR1 passes */
  322. ZCALC_4x2_4 LOCAL_VAR3,LOCAL_VAR2
  323. #if defined(PREFETCH_INS)
  324. pfd 1, 128(LOCAL_VAR3)
  325. #endif
  326. brctg LOCAL_VAR1,.L4x2_4_BK
  327. ALIGN_4
  328. .L4x2_mod:
  329. #if defined(TRMMKERNEL)
  330. RefreshTempBk LOCAL_VAR1,BK,OFF,4,2
  331. nill LOCAL_VAR1,3 /* remainder count = value & 3; NOTE(review): nill only masks the low 16 bits - assumes value < 65536 */
  332. #else
  333. lghi LOCAL_VAR1,3
  334. NGR LOCAL_VAR1,BK /* remainder count = BK & 3 */
  335. #endif
  336. jz .L4x2_BK_Store /* no remainder: go straight to the store */
  337. ALIGN_4
  338. .L4x2_BK: /* remainder k loop, LOCAL_VAR1 passes */
  339. ZCALC_4x2 LOCAL_VAR3,LOCAL_VAR2
  340. brctg LOCAL_VAR1,.L4x2_BK
  341. ALIGN_4
  342. .L4x2_BK_Store:
  343. /* store the tile to C: CIJ_LOCAL is the destination, LDC_BYTE is ldc in bytes */
  344. ZSTORE_4x2 ALPHA_VECT,ALPHA_VECT_I ,CIJ_LOCAL, LDC_BYTE
  345. #if defined(TRMMKERNEL)
  346. RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR3,4,2
  347. #endif
  348. ALIGN_4 /* NOTE(review): aligns the loop-back branch itself, so padding sits inside the loop; the 4x4 tile has no such directive - possibly unintended */
  349. brctg BM_CUR,.L4x2_BM
  350. ALIGN_2
  351. .L2x2:
  352. tmll BM,2 /* test bit value 2 of bm: is there a 2-row remainder? */
  353. jz .L1x2 /* bit clear: skip the 2x2 tile */
  354. ALIGN_4
  355. .L2x2_BM: /* single 2x2 tile */
  356. #if defined(TRMMKERNEL)
  357. /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
  358. RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,2,2
  359. RefreshTempBk LOCAL_VAR1,BK,OFF,2,2
  360. srl LOCAL_VAR1,2 /* unrolled-loop count = RefreshTempBk result >> 2 (32-bit shift) */
  361. #else
  362. srlg LOCAL_VAR1,BK,2 /* unrolled-loop count = BK >> 2 */
  363. lgr LOCAL_VAR2,B /* running B pointer restarts at the panel start */
  364. #endif
  365. ZERO_ZCVEC_2x2
  366. cijle LOCAL_VAR1,0,.L2x2_mod /* count is zero: only the remainder loop can run */
  367. ALIGN_4
  368. .L2x2_4_BK: /* unrolled k loop, LOCAL_VAR1 passes */
  369. ZCALC_2x2_4 LOCAL_VAR3,LOCAL_VAR2
  370. #if defined(PREFETCH_INS)
  371. pfd 1, 256(LOCAL_VAR3)
  372. pfd 1, 256(LOCAL_VAR2)
  373. #endif
  374. brctg LOCAL_VAR1,.L2x2_4_BK
  375. ALIGN_4
  376. .L2x2_mod:
  377. #if defined(TRMMKERNEL)
  378. RefreshTempBk LOCAL_VAR1,BK,OFF,2,2
  379. nill LOCAL_VAR1,3 /* remainder count = value & 3; NOTE(review): nill only masks the low 16 bits - assumes value < 65536 */
  380. #else
  381. lghi LOCAL_VAR1,3
  382. NGR LOCAL_VAR1,BK /* remainder count = BK & 3 */
  383. #endif
  384. jz .L2x2_BK_Store /* no remainder: go straight to the store */
  385. ALIGN_4
  386. .L2x2_BK: /* remainder k loop, LOCAL_VAR1 passes */
  387. ZCALC_2x2 LOCAL_VAR3,LOCAL_VAR2
  388. brctg LOCAL_VAR1,.L2x2_BK
  389. ALIGN_4
  390. .L2x2_BK_Store:
  391. /* store the tile to C: CIJ_LOCAL is the destination, LDC_BYTE is ldc in bytes */
  392. ZSTORE_2x2 ALPHA_VECT,ALPHA_VECT_I ,CIJ_LOCAL, LDC_BYTE
  393. #if defined(TRMMKERNEL)
  394. RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR3,2,2
  395. #endif
  396. ALIGN_2
  397. .L1x2:
  398. tmll BM,1 /* test bit value 1 of bm: is there a single leftover row? */
  399. jz .Lx2_INNER_END /* bit clear: this panel is finished */
  400. ALIGN_4
  401. .L1x2_BM: /* single 1x2 tile */
  402. #if defined(TRMMKERNEL)
  403. /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
  404. RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,1,2
  405. RefreshTempBk LOCAL_VAR1,BK,OFF,1,2
  406. srl LOCAL_VAR1,2 /* unrolled-loop count = RefreshTempBk result >> 2 (32-bit shift) */
  407. #else
  408. srlg LOCAL_VAR1,BK,2 /* unrolled-loop count = BK >> 2 */
  409. lgr LOCAL_VAR2,B /* running B pointer restarts at the panel start */
  410. #endif
  411. ZERO_ZCVEC_1x2
  412. cijle LOCAL_VAR1,0,.L1x2_mod /* count is zero: only the remainder loop can run */
  413. ALIGN_4
  414. .L1x2_4_BK: /* unrolled k loop, LOCAL_VAR1 passes */
  415. ZCALC_1x2_4 LOCAL_VAR3,LOCAL_VAR2
  416. brctg LOCAL_VAR1,.L1x2_4_BK
  417. ALIGN_4
  418. .L1x2_mod:
  419. #if defined(TRMMKERNEL)
  420. RefreshTempBk LOCAL_VAR1,BK,OFF,1,2
  421. nill LOCAL_VAR1,3 /* remainder count = value & 3; NOTE(review): nill only masks the low 16 bits - assumes value < 65536 */
  422. #else
  423. lghi LOCAL_VAR1,3
  424. NGR LOCAL_VAR1,BK /* remainder count = BK & 3 */
  425. #endif
  426. jz .L1x2_BK_Store /* no remainder: go straight to the store */
  427. ALIGN_4
  428. .L1x2_BK: /* remainder k loop, LOCAL_VAR1 passes */
  429. ZCALC_1x2 LOCAL_VAR3,LOCAL_VAR2
  430. brctg LOCAL_VAR1,.L1x2_BK
  431. ALIGN_4
  432. .L1x2_BK_Store:
  433. /* store the tile to C: CIJ_LOCAL is the destination, LDC_BYTE is ldc in bytes */
  434. ZSTORE_1x2 ALPHA_VECT,ALPHA_VECT_I ,CIJ_LOCAL, LDC_BYTE
  435. #if defined(TRMMKERNEL)
  436. RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR3,1,2
  437. #endif
  438. ALIGN_2
  439. .Lx2_INNER_END:
  440. /* advance C and B past the 2-column panel */
  441. la LOCAL_VAR1,0(LDC_BYTE,LDC_BYTE) /* LDC_BYTE * 2 */
  442. sllg LOCAL_VAR2,BK,4 /* BK * 2 * sizeof(complex) = BK * 2 * 8 = BK << 4 */
  443. la CIJ,0(CIJ,LOCAL_VAR1) /* CIJ = CIJ + LDC_BYTE*2 */
  444. #if defined(TRMMKERNEL) && !defined(LEFT)
  445. aghi OFF,2 /* off += 2 (one per column of the panel just finished) */
  446. #endif
  447. la B,0(B,LOCAL_VAR2) /* B = B + BK*2*sizeof(complex) */
  448. /*********************************X1 SECTION************************************************/
  449. ALIGN_2
  450. .Lx1:
  451. tmll BN,1 /* test bit value 1 of bn: is there a single leftover column? */
  452. jz .L_FUNC_END /* bit clear: nothing left to do */
  453. ALIGN_4
  454. .Lx1_BN: /* single 1-column panel */
  455. #if defined(TRMMKERNEL) && defined(LEFT)
  456. /* off = offset; reset from the copy kept in OFFSET */
  457. lgdr OFF,OFFSET
  458. #endif
  459. srlg BM_CUR,BM,2 /* BM_CUR = bm >> 2: number of 4-row tiles */
  460. lgr LOCAL_VAR3,A /* running A pointer restarts at A */
  461. lgr CIJ_LOCAL,CIJ /* C pointer handed to the store macros */
  462. cijle BM_CUR,0,.L2x1 /* no 4-row tiles: go to the 2-row remainder */
  463. ALIGN_4
  464. .L4x1_BM: /* 4x1 tile loop, BM_CUR passes */
  465. #if defined(TRMMKERNEL)
  466. /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
  467. RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,4,1
  468. RefreshTempBk LOCAL_VAR1,BK,OFF,4,1
  469. srl LOCAL_VAR1,2 /* unrolled-loop count = RefreshTempBk result >> 2 (32-bit shift) */
  470. #else
  471. srlg LOCAL_VAR1,BK,2 /* unrolled-loop count = BK >> 2 */
  472. lgr LOCAL_VAR2,B /* running B pointer restarts at the panel start */
  473. #endif
  474. ZERO_ZCVEC_4x1
  475. cijle LOCAL_VAR1,0,.L4x1_mod /* count is zero: only the remainder loop can run */
  476. ALIGN_4
  477. .L4x1_4_BK: /* unrolled k loop, LOCAL_VAR1 passes */
  478. ZCALC_4x1_4 LOCAL_VAR3,LOCAL_VAR2
  479. brctg LOCAL_VAR1,.L4x1_4_BK
  480. ALIGN_4
  481. .L4x1_mod:
  482. #if defined(TRMMKERNEL)
  483. RefreshTempBk LOCAL_VAR1,BK,OFF,4,1
  484. nill LOCAL_VAR1,3 /* remainder count = value & 3; NOTE(review): nill only masks the low 16 bits - assumes value < 65536 */
  485. #else
  486. lghi LOCAL_VAR1,3
  487. NGR LOCAL_VAR1,BK /* remainder count = BK & 3 */
  488. #endif
  489. jz .L4x1_BK_Store /* no remainder: go straight to the store */
  490. ALIGN_4
  491. .L4x1_BK: /* remainder k loop, LOCAL_VAR1 passes */
  492. ZCALC_4x1 LOCAL_VAR3,LOCAL_VAR2
  493. brctg LOCAL_VAR1,.L4x1_BK
  494. ALIGN_4
  495. .L4x1_BK_Store:
  496. /* store the tile to C: CIJ_LOCAL is the destination, LDC_BYTE is ldc in bytes */
  497. ZSTORE_4x1 ALPHA_VECT,ALPHA_VECT_I ,CIJ_LOCAL, LDC_BYTE
  498. #if defined(TRMMKERNEL)
  499. RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR3,4,1
  500. #endif
  501. ALIGN_4 /* NOTE(review): aligns the loop-back branch itself, so padding sits inside the loop; the 4x4 tile has no such directive - possibly unintended */
  502. brctg BM_CUR , .L4x1_BM
  503. ALIGN_2
  504. .L2x1:
  505. tmll BM,2 /* test bit value 2 of bm: is there a 2-row remainder? */
  506. jz .L1x1 /* bit clear: skip the 2x1 tile */
  507. ALIGN_4
  508. .L2x1_BM: /* single 2x1 tile */
  509. #if defined(TRMMKERNEL)
  510. /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
  511. RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,2,1
  512. RefreshTempBk LOCAL_VAR1,BK,OFF,2,1
  513. srl LOCAL_VAR1,2 /* unrolled-loop count = RefreshTempBk result >> 2 (32-bit shift) */
  514. #else
  515. srlg LOCAL_VAR1,BK,2 /* unrolled-loop count = BK >> 2 */
  516. lgr LOCAL_VAR2,B /* running B pointer restarts at the panel start */
  517. #endif
  518. ZERO_ZCVEC_2x1
  519. cijle LOCAL_VAR1,0,.L2x1_mod /* count is zero: only the remainder loop can run */
  520. ALIGN_4
  521. .L2x1_4_BK: /* unrolled k loop, LOCAL_VAR1 passes */
  522. ZCALC_2x1_4 LOCAL_VAR3,LOCAL_VAR2
  523. brctg LOCAL_VAR1,.L2x1_4_BK
  524. ALIGN_4
  525. .L2x1_mod:
  526. #if defined(TRMMKERNEL)
  527. RefreshTempBk LOCAL_VAR1,BK,OFF,2,1
  528. nill LOCAL_VAR1,3 /* remainder count = value & 3; NOTE(review): nill only masks the low 16 bits - assumes value < 65536 */
  529. #else
  530. lghi LOCAL_VAR1,3
  531. NGR LOCAL_VAR1,BK /* remainder count = BK & 3 */
  532. #endif
  533. jz .L2x1_BK_Store /* no remainder: go straight to the store */
  534. ALIGN_4
  535. .L2x1_BK: /* remainder k loop, LOCAL_VAR1 passes */
  536. ZCALC_2x1 LOCAL_VAR3,LOCAL_VAR2
  537. brctg LOCAL_VAR1,.L2x1_BK
  538. ALIGN_4
  539. .L2x1_BK_Store:
  540. /* store the tile to C: CIJ_LOCAL is the destination, LDC_BYTE is ldc in bytes */
  541. ZSTORE_2x1 ALPHA_VECT,ALPHA_VECT_I ,CIJ_LOCAL, LDC_BYTE
  542. #if defined(TRMMKERNEL)
  543. RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR3,2,1
  544. #endif
  545. ALIGN_2
  546. .L1x1:
  547. tmll BM, 1 /* test bit value 1 of bm: is there a single leftover row? */
  548. jz .Lx1_INNER_END /* bit clear: this panel is finished */
  549. ALIGN_4
  550. .L1x1_BM: /* single 1x1 tile */
  551. #if defined(TRMMKERNEL)
  552. /* RefreshPointers PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B */
  553. RefreshPointers LOCAL_VAR3,LOCAL_VAR2,OFF,B,1,1
  554. RefreshTempBk LOCAL_VAR1,BK,OFF,1,1
  555. srl LOCAL_VAR1,2 /* unrolled-loop count = RefreshTempBk result >> 2 (32-bit shift) */
  556. #else
  557. srlg LOCAL_VAR1,BK,2 /* unrolled-loop count = BK >> 2 */
  558. lgr LOCAL_VAR2,B /* running B pointer restarts at the panel start */
  559. #endif
  560. ZERO_ZCVEC_1x1
  561. cijle LOCAL_VAR1,0,.L1x1_mod /* count is zero: only the remainder loop can run */
  562. ALIGN_4
  563. .L1x1_4_BK: /* unrolled k loop, LOCAL_VAR1 passes */
  564. ZCALC_1x1_4 LOCAL_VAR3,LOCAL_VAR2
  565. brctg LOCAL_VAR1,.L1x1_4_BK
  566. ALIGN_4
  567. .L1x1_mod:
  568. #if defined(TRMMKERNEL)
  569. RefreshTempBk LOCAL_VAR1,BK,OFF,1,1
  570. nill LOCAL_VAR1,3 /* remainder count = value & 3; NOTE(review): nill only masks the low 16 bits - assumes value < 65536 */
  571. #else
  572. lghi LOCAL_VAR1,3
  573. NGR LOCAL_VAR1,BK /* remainder count = BK & 3 */
  574. #endif
  575. jz .L1x1_BK_Store /* no remainder: go straight to the store */
  576. ALIGN_4
  577. .L1x1_BK: /* remainder k loop, LOCAL_VAR1 passes */
  578. ZCALC_1x1 LOCAL_VAR3,LOCAL_VAR2
  579. brctg LOCAL_VAR1,.L1x1_BK
  580. ALIGN_4
  581. .L1x1_BK_Store:
  582. /* store the single element to C through CIJ_LOCAL; this macro takes the scalar ALPHA/ALPHA_I aliases and no stride */
  583. ZSTORE_1x1 ALPHA,ALPHA_I ,CIJ_LOCAL
  584. #if defined(TRMMKERNEL)
  585. RefreshPointersAndOFF LOCAL_VAR1,BK,OFF,LOCAL_VAR3,1,1
  586. #endif
  587. ALIGN_2
  588. .Lx1_INNER_END:
  589. /* advance C and B past the 1-column panel (execution falls through to .L_FUNC_END right after) */
  590. sllg LOCAL_VAR2,BK,3 /* BK * 1 * sizeof(complex) = BK * 8 = BK << 3 */
  591. la CIJ,0(CIJ,LDC_BYTE) /* CIJ = CIJ + LDC_BYTE */
  592. #if defined(TRMMKERNEL) && !defined(LEFT)
  593. aghi OFF,1 /* off += 1 */
  594. #endif
  595. la B,0(B,LOCAL_VAR2) /* B = B + BK*1*sizeof(complex) */
  597. .L_FUNC_END:
  598. /*end*/
  599. #if defined(TRMMKERNEL)
  600. ld OFFSET,40(%r15)
  601. lmg %r6,%r13,48(%r15)
  602. #else
  603. lmg %r6,%r12,48(%r15)
  604. #endif
  605. ld %f9, 128(%r15)
  606. ld %f10,136(%r15)
  607. ld %f11,144(%r15)
  608. ld %f12,152(%r15)
  609. br %r14
  610. .end