
zgemm_kernel_1x2_3dnow.S

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/*    The views and conclusions contained in the software and        */
/*    documentation are those of the authors and should not be       */
/*    interpreted as representing official policies, either expressed */
/*    or implied, of The University of Texas at Austin.              */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"

#define STACK 16
#define ARGS 0

#define OLD_M 4 + STACK + ARGS(%esi)
#define OLD_N 8 + STACK + ARGS(%esi)
#define OLD_K 12 + STACK + ARGS(%esi)
#define OLD_ALPHA_R 16 + STACK + ARGS(%esi)
#define OLD_ALPHA_I 20 + STACK + ARGS(%esi)
#define OLD_A 24 + STACK + ARGS(%esi)
#define OLD_B 28 + STACK + ARGS(%esi)
#define OLD_C 32 + STACK + ARGS(%esi)
#define OLD_LDC 36 + STACK + ARGS(%esi)
#define OLD_OFFSET 40 + STACK + ARGS(%esi)

#define GAMMA_R 0(%esp)
#define GAMMA_I 8(%esp)
#define ALPHA 16(%esp)
#define K 24(%esp)
#define N 28(%esp)
#define M 32(%esp)
#define A 36(%esp)
#define C 40(%esp)
#define J 44(%esp)
#define OLD_STACK 48(%esp)
#define OFFSET 52(%esp)
#define KK 56(%esp)
#define KKK 60(%esp)
#define BUFFER 128(%esp)

#define AA %edx
#define BB %ecx

#define PREFETCHSIZE (16 * 2 + 6)

#define AOFFSET -32
#define BOFFSET 128
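
/*
   AOFFSET/BOFFSET bias the A and B pointers: A is advanced by -AOFFSET
   elements in the prologue and BB is formed as BUFFER - BOFFSET elements,
   so every inner-loop operand is addressed as (n + AOFFSET) * SIZE(AA) or
   (n + BOFFSET) * SIZE(BB).  Combined with the PADDING markers this looks
   like a displacement-size / decode-alignment trick for the target core;
   it does not change the arithmetic.
*/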
/*
   Scheduling hints were taken from the following thread:
   https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=flat&viewmonth=200309&viewday=11
*/
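
/*
   This is the 1x2 3DNow! complex GEMM kernel: each pass over the M loop
   produces one element of C for two adjacent columns, and the trailing
   section starting at .L20 handles a leftover single column when N is odd.
   As a rough reference sketch of what one (i, j) step computes, assuming
   packed, unit-stride panels and complex values stored as (re, im) pairs:

       acc = 0;
       for (k = 0; k < K; k++)
           acc += a[i][k] * b[k][j];      // complex multiply-add
       c[i][j] += alpha * acc;            // alpha = ALPHA_R + i*ALPHA_I
                                          // (plain store for the TRMM variant)

   The conjugation/transpose variants (NN, NR, RN, CC, ...) are folded in
   through the GAMMA_R/GAMMA_I sign constants set up below rather than
   through separate code paths.
*/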
        PROLOGUE

        pushl %ebp
        pushl %edi
        pushl %esi
        pushl %ebx

        PROFCODE

        movl %esp, %esi          # save old stack
        subl $128 + LOCAL_BUFFER_SIZE, %esp
        movl OLD_M, %ebx
        andl $-1024, %esp        # align stack

        STACK_TOUCHING

        movl OLD_N, %eax
        movl OLD_K, %ecx
        movl OLD_A, %edx
        movl %ebx, M
        movl %eax, N
        movl %ecx, K
        subl $AOFFSET * SIZE, %edx
        movl %edx, A
        movl %esi, OLD_STACK

        testl %ebx, %ebx
        jle .L999

        movl OLD_B, %edi
        movl OLD_C, %ebx

        EMMS

        movd OLD_ALPHA_R, %mm0
        movd OLD_ALPHA_I, %mm1
        movd %mm0, 0 + ALPHA
        movd %mm1, 4 + ALPHA
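
/*
   GAMMA_R/GAMMA_I hold pairs of single-precision +1.0f (0x3f800000) and
   -1.0f (0xbf800000).  They are multiplied into the partial sums in the
   write-back code at .L18 and .L38, which selects the signs of the
   real/imaginary cross terms for the different conjugation variants.
*/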
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
        movl $0x3f800000, 0 + GAMMA_R
        movl $0x3f800000, 4 + GAMMA_R
        movl $0xbf800000, 0 + GAMMA_I
        movl $0x3f800000, 4 + GAMMA_I
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
        movl $0x3f800000, 0 + GAMMA_R
        movl $0x3f800000, 4 + GAMMA_R
        movl $0x3f800000, 0 + GAMMA_I
        movl $0xbf800000, 4 + GAMMA_I
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
        movl $0x3f800000, 0 + GAMMA_R
        movl $0xbf800000, 4 + GAMMA_R
        movl $0x3f800000, 0 + GAMMA_I
        movl $0x3f800000, 4 + GAMMA_I
#else
        movl $0x3f800000, 0 + GAMMA_R
        movl $0xbf800000, 4 + GAMMA_R
        movl $0xbf800000, 0 + GAMMA_I
        movl $0xbf800000, 4 + GAMMA_I
#endif
        movl %ebx, C
        movl OLD_LDC, %ebp
        leal (, %ebp, SIZE * 2), %ebp

#ifdef TRMMKERNEL
        movl OLD_OFFSET, %eax
        movl %eax, OFFSET
#ifndef LEFT
        negl %eax
        movl %eax, KK
#endif
#endif

        movl N, %eax
        sarl $1, %eax
        movl %eax, J             # j = (n >> 1)
        jle .L20
        ALIGN_4

.L01:
/* Copying to Sub Buffer */
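/*
   Each B value is loaded as a scalar with movd and duplicated into both
   halves of an MMX quadword with punpckldq, so that one movq in the inner
   loop lines the same B value up against both the real and the imaginary
   part of an A element.
*/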
        leal BUFFER, BB

#if defined(TRMMKERNEL) && defined(LEFT)
        movl OFFSET, %eax
        movl %eax, KK
#endif

        movl K, %eax
        sarl $2, %eax
        jle .L03
        ALIGN_4

.L02:
        movd 0 * SIZE(%edi), %mm0
        movd 1 * SIZE(%edi), %mm1
        movd 2 * SIZE(%edi), %mm2
        movd 3 * SIZE(%edi), %mm3
        movd 4 * SIZE(%edi), %mm4
        movd 5 * SIZE(%edi), %mm5
        movd 6 * SIZE(%edi), %mm6
        movd 7 * SIZE(%edi), %mm7

        prefetchnta 72 * SIZE(%edi)

        punpckldq %mm0, %mm0
        punpckldq %mm1, %mm1
        punpckldq %mm2, %mm2
        punpckldq %mm3, %mm3
        punpckldq %mm4, %mm4
        punpckldq %mm5, %mm5
        punpckldq %mm6, %mm6
        punpckldq %mm7, %mm7

        movq %mm0, 0 * SIZE(BB)
        movq %mm1, 2 * SIZE(BB)
        movq %mm2, 4 * SIZE(BB)
        movq %mm3, 6 * SIZE(BB)
        movq %mm4, 8 * SIZE(BB)
        movq %mm5, 10 * SIZE(BB)
        movq %mm6, 12 * SIZE(BB)
        movq %mm7, 14 * SIZE(BB)

        movd 8 * SIZE(%edi), %mm0
        movd 9 * SIZE(%edi), %mm1
        movd 10 * SIZE(%edi), %mm2
        movd 11 * SIZE(%edi), %mm3
        movd 12 * SIZE(%edi), %mm4
        movd 13 * SIZE(%edi), %mm5
        movd 14 * SIZE(%edi), %mm6
        movd 15 * SIZE(%edi), %mm7

        punpckldq %mm0, %mm0
        punpckldq %mm1, %mm1
        punpckldq %mm2, %mm2
        punpckldq %mm3, %mm3
        punpckldq %mm4, %mm4
        punpckldq %mm5, %mm5
        punpckldq %mm6, %mm6
        punpckldq %mm7, %mm7

        movq %mm0, 16 * SIZE(BB)
        movq %mm1, 18 * SIZE(BB)
        movq %mm2, 20 * SIZE(BB)
        movq %mm3, 22 * SIZE(BB)
        movq %mm4, 24 * SIZE(BB)
        movq %mm5, 26 * SIZE(BB)
        movq %mm6, 28 * SIZE(BB)
        movq %mm7, 30 * SIZE(BB)

        addl $16 * SIZE, %edi
        addl $32 * SIZE, BB
        decl %eax
        jne .L02
        ALIGN_4

.L03:
        movl K, %eax
        andl $3, %eax
        BRANCH
        jle .L10
        ALIGN_4

.L04:
        movd 0 * SIZE(%edi), %mm0
        movd 1 * SIZE(%edi), %mm1
        movd 2 * SIZE(%edi), %mm2
        movd 3 * SIZE(%edi), %mm3

        punpckldq %mm0, %mm0
        punpckldq %mm1, %mm1
        punpckldq %mm2, %mm2
        punpckldq %mm3, %mm3

        movq %mm0, 0 * SIZE(BB)
        movq %mm1, 2 * SIZE(BB)
        movq %mm2, 4 * SIZE(BB)
        movq %mm3, 6 * SIZE(BB)

        addl $4 * SIZE, %edi
        addl $8 * SIZE, BB
        decl %eax
        jne .L04
        ALIGN_4

.L10:
        movl C, %esi             # coffset = c
        movl A, AA               # aoffset = a
        movl M, %ebx
        ALIGN_4

.L11:
        leal - BOFFSET * SIZE + BUFFER, BB

#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
#else
        movl KK, %eax
        leal (, %eax, SIZE), %eax
        leal (AA, %eax, 2), AA
        leal (BB, %eax, 8), BB
#endif

        movq ( 0 + AOFFSET) * SIZE(AA), %mm0
        pxor %mm4, %mm4
        movq ( 16 + AOFFSET) * SIZE(AA), %mm1
        pxor %mm5, %mm5
        PADDING movq ( 0 + BOFFSET) * SIZE(BB), %mm2
        pxor %mm6, %mm6
        PADDING movq ( 16 + BOFFSET) * SIZE(BB), %mm3
        pxor %mm7, %mm7

        prefetchw 2 * SIZE(%esi)
        prefetchw 2 * SIZE(%esi, %ebp)

#ifndef TRMMKERNEL
        movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movl K, %eax
        subl KK, %eax
        movl %eax, KKK
#else
        movl KK, %eax
#ifdef LEFT
        addl $1, %eax
#else
        addl $2, %eax
#endif
        movl %eax, KKK
#endif
        sarl $4, %eax
        je .L15
        ALIGN_4
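
/*
   Main inner loop for the two-column case, unrolled to 16 values of k per
   pass (AA advances by 32 * SIZE, BB by 128 * SIZE).  mm0/mm1 carry packed
   (re, im) pairs of A; mm4/mm5 accumulate A times the duplicated real and
   imaginary parts of the first B column, and mm6/mm7 do the same for the
   second column.  The accumulators are folded into complex results at .L18.
*/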
.L12:
        pfmul %mm0, %mm2
        pfadd %mm2, %mm4
        PADDING movq ( 2 + BOFFSET) * SIZE(BB), %mm2
        pfmul %mm0, %mm2
        pfadd %mm2, %mm5
        PADDING movq ( 4 + BOFFSET) * SIZE(BB), %mm2
        pfmul %mm0, %mm2
        pfadd %mm2, %mm6
        PADDING prefetch (PREFETCHSIZE + 0) * SIZE(AA)
        PADDING movq ( 8 + BOFFSET) * SIZE(BB), %mm2
        pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0
        pfadd %mm0, %mm7
        movq ( 2 + AOFFSET) * SIZE(AA), %mm0

        pfmul %mm0, %mm2
        pfadd %mm2, %mm4
        PADDING movq ( 10 + BOFFSET) * SIZE(BB), %mm2
        pfmul %mm0, %mm2
        pfadd %mm2, %mm5
        PADDING movq ( 12 + BOFFSET) * SIZE(BB), %mm2
        pfmul %mm0, %mm2
        pfadd %mm2, %mm6
        PADDING movq ( 32 + BOFFSET) * SIZE(BB), %mm2
        pfmul ( 14 + BOFFSET) * SIZE(BB), %mm0
        pfadd %mm0, %mm7
        movq ( 4 + AOFFSET) * SIZE(AA), %mm0

        pfmul %mm0, %mm3
        pfadd %mm3, %mm4
        PADDING movq ( 18 + BOFFSET) * SIZE(BB), %mm3
        pfmul %mm0, %mm3
        pfadd %mm3, %mm5
        PADDING movq ( 20 + BOFFSET) * SIZE(BB), %mm3
        pfmul %mm0, %mm3
        pfadd %mm3, %mm6
        PADDING movq ( 24 + BOFFSET) * SIZE(BB), %mm3
        pfmul ( 22 + BOFFSET) * SIZE(BB), %mm0
        pfadd %mm0, %mm7
        movq ( 6 + AOFFSET) * SIZE(AA), %mm0

        pfmul %mm0, %mm3
        pfadd %mm3, %mm4
        PADDING movq ( 26 + BOFFSET) * SIZE(BB), %mm3
        pfmul %mm0, %mm3
        pfadd %mm3, %mm5
        PADDING movq ( 28 + BOFFSET) * SIZE(BB), %mm3
        pfmul %mm0, %mm3
        pfadd %mm3, %mm6
        PADDING movq ( 48 + BOFFSET) * SIZE(BB), %mm3
        pfmul ( 30 + BOFFSET) * SIZE(BB), %mm0
        pfadd %mm0, %mm7
        movq ( 8 + AOFFSET) * SIZE(AA), %mm0

        pfmul %mm0, %mm2
        pfadd %mm2, %mm4
        PADDING movq ( 34 + BOFFSET) * SIZE(BB), %mm2
        pfmul %mm0, %mm2
        pfadd %mm2, %mm5
        PADDING movq ( 36 + BOFFSET) * SIZE(BB), %mm2
        pfmul %mm0, %mm2
        pfadd %mm2, %mm6
        PADDING movq ( 40 + BOFFSET) * SIZE(BB), %mm2
        pfmul ( 38 + BOFFSET) * SIZE(BB), %mm0
        pfadd %mm0, %mm7
        movq ( 10 + AOFFSET) * SIZE(AA), %mm0

        pfmul %mm0, %mm2
        pfadd %mm2, %mm4
        PADDING movq ( 42 + BOFFSET) * SIZE(BB), %mm2
        pfmul %mm0, %mm2
        pfadd %mm2, %mm5
        PADDING movq ( 44 + BOFFSET) * SIZE(BB), %mm2
        pfmul %mm0, %mm2
        pfadd %mm2, %mm6
        PADDING movq ( 64 + BOFFSET) * SIZE(BB), %mm2
        pfmul ( 46 + BOFFSET) * SIZE(BB), %mm0
        pfadd %mm0, %mm7
        movq ( 12 + AOFFSET) * SIZE(AA), %mm0

        pfmul %mm0, %mm3
        pfadd %mm3, %mm4
        PADDING movq ( 50 + BOFFSET) * SIZE(BB), %mm3
        pfmul %mm0, %mm3
        pfadd %mm3, %mm5
        PADDING movq ( 52 + BOFFSET) * SIZE(BB), %mm3
        pfmul %mm0, %mm3
        pfadd %mm3, %mm6
        PADDING movq ( 56 + BOFFSET) * SIZE(BB), %mm3
        pfmul ( 54 + BOFFSET) * SIZE(BB), %mm0
        pfadd %mm0, %mm7
        movq ( 14 + AOFFSET) * SIZE(AA), %mm0

        pfmul %mm0, %mm3
        pfadd %mm3, %mm4
        PADDING movq ( 58 + BOFFSET) * SIZE(BB), %mm3
        pfmul %mm0, %mm3
        pfadd %mm3, %mm5
        PADDING movq ( 60 + BOFFSET) * SIZE(BB), %mm3
        pfmul %mm0, %mm3
        pfadd %mm3, %mm6
        PADDING movq ( 80 + BOFFSET) * SIZE(BB), %mm3
        pfmul ( 62 + BOFFSET) * SIZE(BB), %mm0
        pfadd %mm0, %mm7
        movq ( 32 + AOFFSET) * SIZE(AA), %mm0
        pfmul %mm1, %mm2
        pfadd %mm2, %mm4
        PADDING movq ( 66 + BOFFSET) * SIZE(BB), %mm2
        pfmul %mm1, %mm2
        pfadd %mm2, %mm5
        PADDING movq ( 68 + BOFFSET) * SIZE(BB), %mm2
        pfmul %mm1, %mm2
        pfadd %mm2, %mm6
        PADDING movq ( 72 + BOFFSET) * SIZE(BB), %mm2
        pfmul ( 70 + BOFFSET) * SIZE(BB), %mm1
        pfadd %mm1, %mm7
        movq ( 18 + AOFFSET) * SIZE(AA), %mm1

        pfmul %mm1, %mm2
        pfadd %mm2, %mm4
        PADDING movq ( 74 + BOFFSET) * SIZE(BB), %mm2
        pfmul %mm1, %mm2
        pfadd %mm2, %mm5
        PADDING movq ( 76 + BOFFSET) * SIZE(BB), %mm2
        pfmul %mm1, %mm2
        pfadd %mm2, %mm6
        PADDING movq ( 96 + BOFFSET) * SIZE(BB), %mm2
        pfmul ( 78 + BOFFSET) * SIZE(BB), %mm1
        pfadd %mm1, %mm7
        movq ( 20 + AOFFSET) * SIZE(AA), %mm1

        pfmul %mm1, %mm3
        pfadd %mm3, %mm4
        PADDING movq ( 82 + BOFFSET) * SIZE(BB), %mm3
        pfmul %mm1, %mm3
        pfadd %mm3, %mm5
        PADDING movq ( 84 + BOFFSET) * SIZE(BB), %mm3
        pfmul %mm1, %mm3
        pfadd %mm3, %mm6
        PADDING movq ( 88 + BOFFSET) * SIZE(BB), %mm3
        pfmul ( 86 + BOFFSET) * SIZE(BB), %mm1
        pfadd %mm1, %mm7
        movq ( 22 + AOFFSET) * SIZE(AA), %mm1

        pfmul %mm1, %mm3
        pfadd %mm3, %mm4
        PADDING movq ( 90 + BOFFSET) * SIZE(BB), %mm3
        pfmul %mm1, %mm3
        pfadd %mm3, %mm5
        PADDING movq ( 92 + BOFFSET) * SIZE(BB), %mm3
        pfmul %mm1, %mm3
        pfadd %mm3, %mm6
        PADDING movq (112 + BOFFSET) * SIZE(BB), %mm3
        pfmul ( 94 + BOFFSET) * SIZE(BB), %mm1
        pfadd %mm1, %mm7
        movq ( 24 + AOFFSET) * SIZE(AA), %mm1

        pfmul %mm1, %mm2
        pfadd %mm2, %mm4
        PADDING movq ( 98 + BOFFSET) * SIZE(BB), %mm2
        pfmul %mm1, %mm2
        pfadd %mm2, %mm5
        PADDING movq (100 + BOFFSET) * SIZE(BB), %mm2
        pfmul %mm1, %mm2
        pfadd %mm2, %mm6
        PADDING movq (104 + BOFFSET) * SIZE(BB), %mm2
        pfmul (102 + BOFFSET) * SIZE(BB), %mm1
        pfadd %mm1, %mm7
        movq ( 26 + AOFFSET) * SIZE(AA), %mm1

        pfmul %mm1, %mm2
        pfadd %mm2, %mm4
        PADDING movq (106 + BOFFSET) * SIZE(BB), %mm2
        pfmul %mm1, %mm2
        pfadd %mm2, %mm5
        PADDING movq (108 + BOFFSET) * SIZE(BB), %mm2
        pfmul %mm1, %mm2
        pfadd %mm2, %mm6
        PADDING movq (128 + BOFFSET) * SIZE(BB), %mm2
        pfmul (110 + BOFFSET) * SIZE(BB), %mm1
        pfadd %mm1, %mm7
        movq ( 28 + AOFFSET) * SIZE(AA), %mm1

        pfmul %mm1, %mm3
        pfadd %mm3, %mm4
        PADDING movq (114 + BOFFSET) * SIZE(BB), %mm3
        pfmul %mm1, %mm3
        pfadd %mm3, %mm5
        PADDING movq (116 + BOFFSET) * SIZE(BB), %mm3
        pfmul %mm1, %mm3
        pfadd %mm3, %mm6
        PADDING movq (120 + BOFFSET) * SIZE(BB), %mm3
        pfmul (118 + BOFFSET) * SIZE(BB), %mm1
        pfadd %mm1, %mm7
        movq ( 30 + AOFFSET) * SIZE(AA), %mm1

        pfmul %mm1, %mm3
        pfadd %mm3, %mm4
        PADDING movq (122 + BOFFSET) * SIZE(BB), %mm3
        pfmul %mm1, %mm3
        pfadd %mm3, %mm5
        PADDING movq (124 + BOFFSET) * SIZE(BB), %mm3
        pfmul %mm1, %mm3
        pfadd %mm3, %mm6
        PADDING movq (144 + BOFFSET) * SIZE(BB), %mm3
        pfmul (126 + BOFFSET) * SIZE(BB), %mm1
        pfadd %mm1, %mm7
        movq ( 48 + AOFFSET) * SIZE(AA), %mm1

        subl $-32 * SIZE, AA
        addl $128 * SIZE, BB
        decl %eax
        jne .L12
        ALIGN_3
.L15:
#ifndef TRMMKERNEL
        movl K, %eax
#else
        movl KKK, %eax
#endif
        andl $15, %eax           # remaining k iterations (k & 15)
        BRANCH
        je .L18
        ALIGN_3

.L16:
        pfmul %mm0, %mm2
        pfadd %mm2, %mm4
        PADDING movq ( 2 + BOFFSET) * SIZE(BB), %mm2
        pfmul %mm0, %mm2
        pfadd %mm2, %mm5
        PADDING movq ( 4 + BOFFSET) * SIZE(BB), %mm2
        pfmul %mm0, %mm2
        pfadd %mm2, %mm6
        PADDING movq ( 8 + BOFFSET) * SIZE(BB), %mm2
        pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0
        pfadd %mm0, %mm7
        movq ( 2 + AOFFSET) * SIZE(AA), %mm0

        addl $2 * SIZE, AA
        addl $8 * SIZE, BB
        decl %eax
        jg .L16
        ALIGN_4
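
/*
   Combine the four accumulators into the two complex results.  pswapd swaps
   the (re, im) halves of a quadword; multiplying by the GAMMA_R/GAMMA_I sign
   pairs and then by ALPHA, and folding with pfpnacc (the destination's low
   half becomes the difference of its halves, the high half becomes the sum
   of the source's halves), yields alpha * sum_k a(k) * b(k, j) for each of
   the two columns, which is then added to C unless this is the TRMM kernel.
*/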
.L18:
        movq GAMMA_R, %mm0
        movq GAMMA_I, %mm1
        movq ALPHA, %mm2

        pswapd %mm5, %mm5
        pswapd %mm7, %mm7

        pfmul %mm0, %mm4
        pfmul %mm1, %mm5
        pfmul %mm0, %mm6
        pfmul %mm1, %mm7

        pfadd %mm5, %mm4
        pfadd %mm7, %mm6

        pswapd %mm4, %mm5
        pswapd %mm6, %mm7

        pfmul %mm2, %mm4
        pfmul %mm2, %mm6
        pfmul %mm2, %mm5
        pfmul %mm2, %mm7

        pfpnacc %mm5, %mm4
        pfpnacc %mm7, %mm6

#ifndef TRMMKERNEL
        pfadd (%esi), %mm4
        pfadd (%esi, %ebp), %mm6
#endif
        movq %mm4, (%esi)
        movq %mm6, (%esi, %ebp)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movl K, %eax
        subl KKK, %eax
        leal (,%eax, SIZE), %eax
        leal (AA, %eax, 2), AA
        leal (BB, %eax, 8), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
        addl $1, KK
#endif

        addl $2 * SIZE, %esi
        decl %ebx
        jg .L11
        ALIGN_4

.L19:
#if defined(TRMMKERNEL) && !defined(LEFT)
        addl $2, KK
#endif
        leal (, %ebp, 2), %eax
        addl %eax, C             # c += 2 * ldc
        decl J                   # j --
        jg .L01
        ALIGN_4

.L20:
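/*
   N is odd: handle the remaining single column of B.  The structure mirrors
   the two-column path above, but only one C column is written and the packed
   B panel holds four floats per k step instead of eight.
*/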
        movl N, %eax
        andl $1, %eax
        jle .L999
        ALIGN_4

.L21:
/* Copying to Sub Buffer */
        movl K, %eax
        leal BUFFER, BB
        sarl $2, %eax
        jle .L25
        ALIGN_4

.L22:
        movd 0 * SIZE(%edi), %mm0
        movd 1 * SIZE(%edi), %mm1
        movd 2 * SIZE(%edi), %mm2
        movd 3 * SIZE(%edi), %mm3
        movd 4 * SIZE(%edi), %mm4
        movd 5 * SIZE(%edi), %mm5
        movd 6 * SIZE(%edi), %mm6
        movd 7 * SIZE(%edi), %mm7

        prefetchnta 72 * SIZE(%edi)

        punpckldq %mm0, %mm0
        punpckldq %mm1, %mm1
        punpckldq %mm2, %mm2
        punpckldq %mm3, %mm3
        punpckldq %mm4, %mm4
        punpckldq %mm5, %mm5
        punpckldq %mm6, %mm6
        punpckldq %mm7, %mm7

        movq %mm0, 0 * SIZE(BB)
        movq %mm1, 2 * SIZE(BB)
        movq %mm2, 4 * SIZE(BB)
        movq %mm3, 6 * SIZE(BB)
        movq %mm4, 8 * SIZE(BB)
        movq %mm5, 10 * SIZE(BB)
        movq %mm6, 12 * SIZE(BB)
        movq %mm7, 14 * SIZE(BB)

        addl $ 8 * SIZE, %edi
        addl $16 * SIZE, BB
        decl %eax
        jne .L22
        ALIGN_4

.L25:
        movl K, %eax
        andl $3, %eax
        BRANCH
        jle .L30
        ALIGN_4

.L26:
        movd 0 * SIZE(%edi), %mm0
        movd 1 * SIZE(%edi), %mm1

        movd %mm0, 0 * SIZE(BB)
        movd %mm0, 1 * SIZE(BB)
        movd %mm1, 2 * SIZE(BB)
        movd %mm1, 3 * SIZE(BB)

        addl $2 * SIZE, %edi
        addl $4 * SIZE, BB
        decl %eax
        jne .L26
        ALIGN_4

.L30:
#if defined(TRMMKERNEL) && defined(LEFT)
        movl OFFSET, %eax
        movl %eax, KK
#endif
        movl C, %esi             # coffset = c
        movl A, AA               # aoffset = a
        movl M, %ebx
        ALIGN_3

.L31:
        leal - BOFFSET * SIZE + BUFFER, BB

#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
#else
        movl KK, %eax
        leal (, %eax, SIZE), %eax
        leal (AA, %eax, 2), AA
        leal (BB, %eax, 4), BB
#endif

        movq ( 0 + AOFFSET) * SIZE(AA), %mm0
        pxor %mm4, %mm4
        movq ( 16 + AOFFSET) * SIZE(AA), %mm1
        pxor %mm5, %mm5
        PADDING movq ( 0 + BOFFSET) * SIZE(BB), %mm2
        pxor %mm6, %mm6
        PADDING movq ( 16 + BOFFSET) * SIZE(BB), %mm3
        pxor %mm7, %mm7

        prefetchw 2 * SIZE(%esi)

#ifndef TRMMKERNEL
        movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movl K, %eax
        subl KK, %eax
        movl %eax, KKK
#else
        movl KK, %eax
#ifdef LEFT
        addl $1, %eax
#else
        addl $1, %eax
#endif
        movl %eax, KKK
#endif
        sarl $4, %eax
        je .L35
        ALIGN_4
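
/*
   Single-column inner loop, also unrolled to 16 values of k per pass (AA
   advances by 32 * SIZE, BB by 64 * SIZE).  Two accumulator pairs are used,
   mm4/mm5 and mm6/mm7, which keeps two independent dependency chains in
   flight; they are summed together at .L38 before the final complex combine.
*/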
.L32:
        pfmul %mm0, %mm2
        pfadd %mm2, %mm4
        PADDING movq ( 4 + BOFFSET) * SIZE(BB), %mm2
        pfmul ( 2 + BOFFSET) * SIZE(BB), %mm0
        pfadd %mm0, %mm5
        movq ( 2 + AOFFSET) * SIZE(AA), %mm0

        PADDING prefetch (PREFETCHSIZE + 0) * SIZE(AA)

        pfmul %mm0, %mm2
        pfadd %mm2, %mm6
        PADDING movq ( 8 + BOFFSET) * SIZE(BB), %mm2
        pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0
        pfadd %mm0, %mm7
        movq ( 4 + AOFFSET) * SIZE(AA), %mm0

        pfmul %mm0, %mm2
        pfadd %mm2, %mm4
        PADDING movq ( 12 + BOFFSET) * SIZE(BB), %mm2
        pfmul ( 10 + BOFFSET) * SIZE(BB), %mm0
        pfadd %mm0, %mm5
        movq ( 6 + AOFFSET) * SIZE(AA), %mm0

        pfmul %mm0, %mm2
        pfadd %mm2, %mm6
        PADDING movq ( 32 + BOFFSET) * SIZE(BB), %mm2
        pfmul ( 14 + BOFFSET) * SIZE(BB), %mm0
        pfadd %mm0, %mm7
        movq ( 8 + AOFFSET) * SIZE(AA), %mm0

        pfmul %mm0, %mm3
        pfadd %mm3, %mm4
        PADDING movq ( 20 + BOFFSET) * SIZE(BB), %mm3
        pfmul ( 18 + BOFFSET) * SIZE(BB), %mm0
        pfadd %mm0, %mm5
        movq ( 10 + AOFFSET) * SIZE(AA), %mm0

        pfmul %mm0, %mm3
        pfadd %mm3, %mm6
        PADDING movq ( 24 + BOFFSET) * SIZE(BB), %mm3
        pfmul ( 22 + BOFFSET) * SIZE(BB), %mm0
        pfadd %mm0, %mm7
        movq ( 12 + AOFFSET) * SIZE(AA), %mm0

        pfmul %mm0, %mm3
        pfadd %mm3, %mm4
        PADDING movq ( 28 + BOFFSET) * SIZE(BB), %mm3
        pfmul ( 26 + BOFFSET) * SIZE(BB), %mm0
        pfadd %mm0, %mm5
        movq ( 14 + AOFFSET) * SIZE(AA), %mm0

        pfmul %mm0, %mm3
        pfadd %mm3, %mm6
        PADDING movq ( 48 + BOFFSET) * SIZE(BB), %mm3
        pfmul ( 30 + BOFFSET) * SIZE(BB), %mm0
        pfadd %mm0, %mm7
        movq ( 32 + AOFFSET) * SIZE(AA), %mm0

        pfmul %mm1, %mm2
        pfadd %mm2, %mm4
        PADDING movq ( 36 + BOFFSET) * SIZE(BB), %mm2
        pfmul ( 34 + BOFFSET) * SIZE(BB), %mm1
        pfadd %mm1, %mm5
        movq ( 18 + AOFFSET) * SIZE(AA), %mm1

        pfmul %mm1, %mm2
        pfadd %mm2, %mm6
        PADDING movq ( 40 + BOFFSET) * SIZE(BB), %mm2
        pfmul ( 38 + BOFFSET) * SIZE(BB), %mm1
        pfadd %mm1, %mm7
        movq ( 20 + AOFFSET) * SIZE(AA), %mm1

        pfmul %mm1, %mm2
        pfadd %mm2, %mm4
        PADDING movq ( 44 + BOFFSET) * SIZE(BB), %mm2
        pfmul ( 42 + BOFFSET) * SIZE(BB), %mm1
        pfadd %mm1, %mm5
        movq ( 22 + AOFFSET) * SIZE(AA), %mm1

        pfmul %mm1, %mm2
        pfadd %mm2, %mm6
        PADDING movq ( 64 + BOFFSET) * SIZE(BB), %mm2
        pfmul ( 46 + BOFFSET) * SIZE(BB), %mm1
        pfadd %mm1, %mm7
        movq ( 24 + AOFFSET) * SIZE(AA), %mm1

        pfmul %mm1, %mm3
        pfadd %mm3, %mm4
        PADDING movq ( 52 + BOFFSET) * SIZE(BB), %mm3
        pfmul ( 50 + BOFFSET) * SIZE(BB), %mm1
        pfadd %mm1, %mm5
        movq ( 26 + AOFFSET) * SIZE(AA), %mm1

        pfmul %mm1, %mm3
        pfadd %mm3, %mm6
        PADDING movq ( 56 + BOFFSET) * SIZE(BB), %mm3
        pfmul ( 54 + BOFFSET) * SIZE(BB), %mm1
        pfadd %mm1, %mm7
        movq ( 28 + AOFFSET) * SIZE(AA), %mm1

        pfmul %mm1, %mm3
        pfadd %mm3, %mm4
        PADDING movq ( 60 + BOFFSET) * SIZE(BB), %mm3
        pfmul ( 58 + BOFFSET) * SIZE(BB), %mm1
        pfadd %mm1, %mm5
        movq ( 30 + AOFFSET) * SIZE(AA), %mm1

        pfmul %mm1, %mm3
        pfadd %mm3, %mm6
        PADDING movq ( 80 + BOFFSET) * SIZE(BB), %mm3
        pfmul ( 62 + BOFFSET) * SIZE(BB), %mm1
        pfadd %mm1, %mm7
        movq ( 48 + AOFFSET) * SIZE(AA), %mm1

        subl $-32 * SIZE, AA
        addl $ 64 * SIZE, BB
        decl %eax
        jne .L32
        ALIGN_3

.L35:
#ifndef TRMMKERNEL
        movl K, %eax
#else
        movl KKK, %eax
#endif
        andl $15, %eax           # remaining k iterations (k & 15)
        BRANCH
        je .L38
        ALIGN_3

.L36:
        pfmul %mm0, %mm2
        pfadd %mm2, %mm4
        PADDING movq ( 4 + BOFFSET) * SIZE(BB), %mm2
        pfmul ( 2 + BOFFSET) * SIZE(BB), %mm0
        pfadd %mm0, %mm5
        movq ( 2 + AOFFSET) * SIZE(AA), %mm0

        addl $2 * SIZE, AA
        addl $4 * SIZE, BB
        decl %eax
        jg .L36
        ALIGN_4
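
/*
   Fold the two accumulator pairs together and form the complex result for
   the single column, using the same GAMMA / pswapd / ALPHA / pfpnacc
   sequence as the two-column path.
*/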
.L38:
        pfadd %mm6, %mm4
        pfadd %mm7, %mm5

        movq ALPHA, %mm2

        pswapd %mm5, %mm5

        pfmul GAMMA_R, %mm4
        pfmul GAMMA_I, %mm5
        pfadd %mm5, %mm4

        pswapd %mm4, %mm5

        pfmul %mm2, %mm4
        pfmul %mm2, %mm5

        pfpnacc %mm5, %mm4

#ifndef TRMMKERNEL
        pfadd 0 * SIZE(%esi), %mm4
#endif
        movq %mm4, 0 * SIZE(%esi)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movl K, %eax
        subl KKK, %eax
        leal (,%eax, SIZE), %eax
        leal (AA, %eax, 2), AA
        leal (BB, %eax, 4), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
        addl $1, KK
#endif

        addl $2 * SIZE, %esi     # coffset += 2 * SIZE
        decl %ebx                # i --
        jg .L31
        ALIGN_4

.L999:
        EMMS

        movl OLD_STACK, %esp
        popl %ebx
        popl %esi
        popl %edi
        popl %ebp
        ret
        EPILOGUE