
zgemm_kernel_1x2_sse2.S 20 kB

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT          */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"

#define STACK   16
#define ARGS     0

#define STACK_M        4 + STACK + ARGS(%esi)
#define STACK_N        8 + STACK + ARGS(%esi)
#define STACK_K       12 + STACK + ARGS(%esi)
#define STACK_ALPHA_R 16 + STACK + ARGS(%esi)
#define STACK_ALPHA_I 24 + STACK + ARGS(%esi)
#define STACK_A       32 + STACK + ARGS(%esi)
#define STACK_B       36 + STACK + ARGS(%esi)
#define STACK_C       40 + STACK + ARGS(%esi)
#define STACK_LDC     44 + STACK + ARGS(%esi)
#define STACK_OFFT    48 + STACK + ARGS(%esi)

#define POSINV     0(%esp)
#define ALPHA_R   16(%esp)
#define ALPHA_I   32(%esp)
#define K         48(%esp)
#define N         52(%esp)
#define M         56(%esp)
#define A         60(%esp)
#define C         64(%esp)
#define J         68(%esp)
#define OLD_STACK 72(%esp)
#define OFFSET    76(%esp)
#define KK        80(%esp)
#define KKK       84(%esp)
#define BUFFER   128(%esp)

#define STACK_ALIGN  4096
#define STACK_OFFSET 1024

#if defined(OPTERON) || defined(BARCELONA)
#define PREFETCH prefetch
#endif

#define PREFETCHSIZE (8 * 10 + 4)

#define AA  %edx
#define BB  %ecx
#define LDC %ebp
#define B   %edi
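
/* Register roles for the rest of the file: */
/*   AA  (%edx) - current position in the A panel */
/*   BB  (%ecx) - current position in the packed copy of B (BUFFER) */
/*   B   (%edi) - source pointer into the original B during packing */
/*   LDC (%ebp) - leading dimension of C, scaled to bytes below */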
#define KERNEL1(address) \
        mulpd   %xmm0, %xmm2; \
        addpd   %xmm2, %xmm4; \
        PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \
        movapd   2 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        mulpd   %xmm0, %xmm2; \
        addpd   %xmm2, %xmm5; \
        movapd   4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        mulpd   %xmm0, %xmm2; \
        mulpd    6 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
        addpd   %xmm2, %xmm6; \
        movapd  16 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        addpd   %xmm0, %xmm7; \
        movapd   2 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL2(address) \
        mulpd   %xmm0, %xmm3; \
        addpd   %xmm3, %xmm4; \
        movapd  10 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        mulpd   %xmm0, %xmm3; \
        addpd   %xmm3, %xmm5; \
        movapd  12 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        mulpd   %xmm0, %xmm3; \
        mulpd   14 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
        addpd   %xmm3, %xmm6; \
        movapd  24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        addpd   %xmm0, %xmm7; \
        movapd   4 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL3(address) \
        mulpd   %xmm0, %xmm2; \
        addpd   %xmm2, %xmm4; \
        movapd  18 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        mulpd   %xmm0, %xmm2; \
        addpd   %xmm2, %xmm5; \
        movapd  20 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        mulpd   %xmm0, %xmm2; \
        mulpd   22 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
        addpd   %xmm2, %xmm6; \
        movapd  32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        addpd   %xmm0, %xmm7; \
        movapd   6 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL4(address) \
        mulpd   %xmm0, %xmm3; \
        addpd   %xmm3, %xmm4; \
        movapd  26 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        mulpd   %xmm0, %xmm3; \
        addpd   %xmm3, %xmm5; \
        movapd  28 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        mulpd   %xmm0, %xmm3; \
        mulpd   30 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
        addpd   %xmm3, %xmm6; \
        movapd  40 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        addpd   %xmm0, %xmm7; \
        movapd  16 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL5(address) \
        PREFETCH (PREFETCHSIZE + 8) * SIZE + (address) * 1 * SIZE(AA); \
        mulpd   %xmm1, %xmm2; \
        addpd   %xmm2, %xmm4; \
        movapd  34 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        mulpd   %xmm1, %xmm2; \
        addpd   %xmm2, %xmm5; \
        movapd  36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        mulpd   %xmm1, %xmm2; \
        mulpd   38 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
        addpd   %xmm2, %xmm6; \
        movapd  48 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        addpd   %xmm1, %xmm7; \
        movapd  10 * SIZE + (address) * 1 * SIZE(AA), %xmm1

#define KERNEL6(address) \
        mulpd   %xmm1, %xmm3; \
        addpd   %xmm3, %xmm4; \
        movapd  42 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        mulpd   %xmm1, %xmm3; \
        addpd   %xmm3, %xmm5; \
        movapd  44 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        mulpd   %xmm1, %xmm3; \
        mulpd   46 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
        addpd   %xmm3, %xmm6; \
        movapd  56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        addpd   %xmm1, %xmm7; \
        movapd  12 * SIZE + (address) * 1 * SIZE(AA), %xmm1

#define KERNEL7(address) \
        mulpd   %xmm1, %xmm2; \
        addpd   %xmm2, %xmm4; \
        movapd  50 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        mulpd   %xmm1, %xmm2; \
        addpd   %xmm2, %xmm5; \
        movapd  52 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        mulpd   %xmm1, %xmm2; \
        mulpd   54 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
        addpd   %xmm2, %xmm6; \
        movapd  64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        addpd   %xmm1, %xmm7; \
        movapd  14 * SIZE + (address) * 1 * SIZE(AA), %xmm1

#define KERNEL8(address) \
        mulpd   %xmm1, %xmm3; \
        addpd   %xmm3, %xmm4; \
        movapd  58 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        mulpd   %xmm1, %xmm3; \
        addpd   %xmm3, %xmm5; \
        movapd  60 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        mulpd   %xmm1, %xmm3; \
        mulpd   62 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
        addpd   %xmm3, %xmm6; \
        movapd  72 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        addpd   %xmm1, %xmm7; \
        movapd  24 * SIZE + (address) * 1 * SIZE(AA), %xmm1
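
/* KERNEL1..KERNEL8 together form one 8x-unrolled pass over k. */
/* Steps 1-4 keep the A operand in %xmm0, steps 5-8 in %xmm1; the */
/* B operand alternates between %xmm2 and %xmm3 so the load feeding */
/* the next step overlaps the multiplies of the current one. */
/* Partial sums accumulate in %xmm4..%xmm7. */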
        PROLOGUE

        pushl   %ebp
        pushl   %edi
        pushl   %esi
        pushl   %ebx

        PROFCODE

        EMMS

        movl    %esp, %esi      # save old stack
        subl    $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp
        andl    $-STACK_ALIGN, %esp     # align stack
        addl    $STACK_OFFSET, %esp

        STACK_TOUCHING

        movl    STACK_M, %ebx
        movl    STACK_N, %eax
        movl    STACK_K, %ecx
        movl    STACK_A, %edx

        movl    %ebx, M
        movl    %eax, N
        movl    %ecx, K
        movl    %edx, A
        movl    %esi, OLD_STACK

        movl    STACK_B, B
        movl    STACK_C, %ebx

#ifdef TRMMKERNEL
        movss   STACK_OFFT, %xmm4
#endif
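
/* Build the constants used by the complex write-back below: */
/*   ALPHA_R = (alpha_r, alpha_r) */
/*   ALPHA_I = (-alpha_i, alpha_i) */
/*   POSINV  = (+0.0, -0.0), an xor mask that flips one lane's sign */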
        movlpd  STACK_ALPHA_R, %xmm0
        movlpd  STACK_ALPHA_I, %xmm1

        pcmpeqb %xmm7, %xmm7
        psllq   $63, %xmm7      # Generate mask
        pxor    %xmm2, %xmm2

        movlpd  %xmm0, 0 + ALPHA_R
        movlpd  %xmm0, 8 + ALPHA_R

        movlpd  %xmm1, 8 + ALPHA_I
        xorpd   %xmm7, %xmm1
        movlpd  %xmm1, 0 + ALPHA_I

        movlpd  %xmm2, 0 + POSINV
        movlpd  %xmm7, 8 + POSINV

        movl    %ebx, C
        movl    STACK_LDC, LDC

#ifdef TRMMKERNEL
        movss   %xmm4, OFFSET
        movss   %xmm4, KK
#ifndef LEFT
        negl    KK
#endif
#endif

        sall    $ZBASE_SHIFT, LDC

        sarl    $1, %eax
        movl    %eax, J         # j = n / 2
        jle     .L100
        ALIGN_4

.L01:
#if defined(TRMMKERNEL) && defined(LEFT)
        movl    OFFSET, %eax
        movl    %eax, KK
#endif

        leal    BUFFER, %ecx
        movapd  POSINV, %xmm7

        movl    K, %eax
        sarl    $1, %eax
        jle     .L03
        ALIGN_4
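
/* Pack the next strip of B into BUFFER: every double is stored */
/* twice, so a single movapd later yields the duplicated [x, x] */
/* pair that the complex multiply-add in the inner loop expects. */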
.L02:
        prefetchnta 56 * SIZE(B)

        movlpd   0 * SIZE(B), %xmm0
        movlpd   1 * SIZE(B), %xmm1
        movlpd   2 * SIZE(B), %xmm2
        movlpd   3 * SIZE(B), %xmm3
        movlpd   4 * SIZE(B), %xmm4
        movlpd   5 * SIZE(B), %xmm5
        movlpd   6 * SIZE(B), %xmm6
        movlpd   7 * SIZE(B), %xmm7

        movlpd  %xmm0,  0 * SIZE(BB)
        movlpd  %xmm0,  1 * SIZE(BB)
        movlpd  %xmm1,  2 * SIZE(BB)
        movlpd  %xmm1,  3 * SIZE(BB)
        movlpd  %xmm2,  4 * SIZE(BB)
        movlpd  %xmm2,  5 * SIZE(BB)
        movlpd  %xmm3,  6 * SIZE(BB)
        movlpd  %xmm3,  7 * SIZE(BB)
        movlpd  %xmm4,  8 * SIZE(BB)
        movlpd  %xmm4,  9 * SIZE(BB)
        movlpd  %xmm5, 10 * SIZE(BB)
        movlpd  %xmm5, 11 * SIZE(BB)
        movlpd  %xmm6, 12 * SIZE(BB)
        movlpd  %xmm6, 13 * SIZE(BB)
        movlpd  %xmm7, 14 * SIZE(BB)
        movlpd  %xmm7, 15 * SIZE(BB)

        addl    $  8 * SIZE, B
        subl    $-16 * SIZE, BB
        decl    %eax
        jne     .L02
        ALIGN_4
.L03:
        movl    K, %eax
        andl    $1, %eax
        BRANCH
        jle     .L05

        movlpd   0 * SIZE(B), %xmm0
        movlpd   1 * SIZE(B), %xmm1
        movlpd   2 * SIZE(B), %xmm2
        movlpd   3 * SIZE(B), %xmm3

        movlpd  %xmm0,  0 * SIZE(BB)
        movlpd  %xmm0,  1 * SIZE(BB)
        movlpd  %xmm1,  2 * SIZE(BB)
        movlpd  %xmm1,  3 * SIZE(BB)
        movlpd  %xmm2,  4 * SIZE(BB)
        movlpd  %xmm2,  5 * SIZE(BB)
        movlpd  %xmm3,  6 * SIZE(BB)
        movlpd  %xmm3,  7 * SIZE(BB)

        addl    $4 * SIZE, B
        ALIGN_4

.L05:
        movl    C, %esi         # coffset = c
        movl    A, AA           # aoffset = a
        movl    M, %ebx
        testl   %ebx, %ebx
        jle     .L100
        ALIGN_4

.L10:
#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

        leal    BUFFER, BB      # boffset1 = boffset
#else
        leal    BUFFER, BB      # boffset1 = boffset
        movl    KK, %eax
        leal    (, %eax, SIZE), %eax
        leal    (AA, %eax, 2), AA
        leal    (BB, %eax, 8), BB
#endif

        movapd   0 * SIZE(AA), %xmm0
        pxor    %xmm4, %xmm4
        movapd   8 * SIZE(AA), %xmm1
        pxor    %xmm5, %xmm5
        movapd   0 * SIZE(BB), %xmm2
        pxor    %xmm6, %xmm6
        movapd   8 * SIZE(BB), %xmm3
        pxor    %xmm7, %xmm7

        prefetchw 2 * SIZE(%esi)
        prefetchw 2 * SIZE(%esi, LDC)

#ifndef TRMMKERNEL
        movl    K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movl    K, %eax
        subl    KK, %eax
        movl    %eax, KKK
#else
        movl    KK, %eax
#ifdef LEFT
        addl    $1, %eax
#else
        addl    $2, %eax
#endif
        movl    %eax, KKK
#endif

#if 1
        andl    $-8, %eax
        sall    $4, %eax
        je      .L15
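
/* %eax now holds (k & ~7) * 2 * SIZE, the bytes of A consumed by */
/* the unrolled iterations. One pass over the ladder below runs up */
/* to 64 k-steps (8 groups of KERNEL1..8) before updating AA/BB; */
/* .L12 advances the pointers by the remaining count when a cmpl */
/* exits the pass early. */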
.L1X:
        KERNEL1(16 * 0)
        KERNEL2(16 * 0)
        KERNEL3(16 * 0)
        KERNEL4(16 * 0)
        KERNEL5(16 * 0)
        KERNEL6(16 * 0)
        KERNEL7(16 * 0)
        KERNEL8(16 * 0)
        cmpl    $128 * 1, %eax
        jle     .L12
        KERNEL1(16 * 1)
        KERNEL2(16 * 1)
        KERNEL3(16 * 1)
        KERNEL4(16 * 1)
        KERNEL5(16 * 1)
        KERNEL6(16 * 1)
        KERNEL7(16 * 1)
        KERNEL8(16 * 1)
        cmpl    $128 * 2, %eax
        jle     .L12
        KERNEL1(16 * 2)
        KERNEL2(16 * 2)
        KERNEL3(16 * 2)
        KERNEL4(16 * 2)
        KERNEL5(16 * 2)
        KERNEL6(16 * 2)
        KERNEL7(16 * 2)
        KERNEL8(16 * 2)
        cmpl    $128 * 3, %eax
        jle     .L12
        KERNEL1(16 * 3)
        KERNEL2(16 * 3)
        KERNEL3(16 * 3)
        KERNEL4(16 * 3)
        KERNEL5(16 * 3)
        KERNEL6(16 * 3)
        KERNEL7(16 * 3)
        KERNEL8(16 * 3)
        cmpl    $128 * 4, %eax
        jle     .L12
        KERNEL1(16 * 4)
        KERNEL2(16 * 4)
        KERNEL3(16 * 4)
        KERNEL4(16 * 4)
        KERNEL5(16 * 4)
        KERNEL6(16 * 4)
        KERNEL7(16 * 4)
        KERNEL8(16 * 4)
        cmpl    $128 * 5, %eax
        jle     .L12
        KERNEL1(16 * 5)
        KERNEL2(16 * 5)
        KERNEL3(16 * 5)
        KERNEL4(16 * 5)
        KERNEL5(16 * 5)
        KERNEL6(16 * 5)
        KERNEL7(16 * 5)
        KERNEL8(16 * 5)
        cmpl    $128 * 6, %eax
        jle     .L12
        KERNEL1(16 * 6)
        KERNEL2(16 * 6)
        KERNEL3(16 * 6)
        KERNEL4(16 * 6)
        KERNEL5(16 * 6)
        KERNEL6(16 * 6)
        KERNEL7(16 * 6)
        KERNEL8(16 * 6)
        cmpl    $128 * 7, %eax
        jle     .L12
        KERNEL1(16 * 7)
        KERNEL2(16 * 7)
        KERNEL3(16 * 7)
        KERNEL4(16 * 7)
        KERNEL5(16 * 7)
        KERNEL6(16 * 7)
        KERNEL7(16 * 7)
        KERNEL8(16 * 7)

        addl    $128 * 4 * SIZE, BB
        addl    $128 * 1 * SIZE, AA
        subl    $128 * 8, %eax
        jg      .L1X
        jmp     .L15
.L12:
        leal    (AA, %eax, 1), AA
        leal    (BB, %eax, 4), BB
        ALIGN_4
#else
        sarl    $3, %eax
        je      .L15
        ALIGN_4
.L11:
        KERNEL1(16 * 0)
        KERNEL2(16 * 0)
        KERNEL3(16 * 0)
        KERNEL4(16 * 0)
        KERNEL5(16 * 0)
        KERNEL6(16 * 0)
        KERNEL7(16 * 0)
        KERNEL8(16 * 0)

        addl    $64 * SIZE, BB
        addl    $16 * SIZE, AA
        decl    %eax
        jne     .L11
        ALIGN_4
#endif

.L15:
#ifndef TRMMKERNEL
        movl    K, %eax
#else
        movl    KKK, %eax
#endif
        andl    $7, %eax        # if (k & 7)
        BRANCH
        je      .L14
        ALIGN_4

.L13:
        mulpd   %xmm0, %xmm2
        addpd   %xmm2, %xmm4
        movapd   2 * SIZE(BB), %xmm2
        mulpd   %xmm0, %xmm2
        addpd   %xmm2, %xmm5
        movapd   4 * SIZE(BB), %xmm2
        mulpd   %xmm0, %xmm2
        mulpd    6 * SIZE(BB), %xmm0
        addpd   %xmm2, %xmm6
        movapd   8 * SIZE(BB), %xmm2
        addpd   %xmm0, %xmm7
        movapd   2 * SIZE(AA), %xmm0

        addl    $2 * SIZE, AA
        addl    $8 * SIZE, BB
        decl    %eax
        jg      .L13
        ALIGN_4

.L14:
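/* 1x2 write-back: POSINV flips the sign of one lane for the */
/* conjugation variants, SHUFPD_1 swaps the real/imaginary halves */
/* of the cross terms, and the pshufd/mulpd/addpd sequence applies */
/* alpha before the result is (optionally) accumulated into C. */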
        movapd  POSINV, %xmm1
        movapd  ALPHA_R, %xmm2
        movapd  ALPHA_I, %xmm3

        SHUFPD_1 %xmm5, %xmm5
        SHUFPD_1 %xmm7, %xmm7

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)
        xorpd   %xmm1, %xmm5
        xorpd   %xmm1, %xmm7
#else
        xorpd   %xmm1, %xmm4
        xorpd   %xmm1, %xmm6
#endif

#ifndef TRMMKERNEL
        movlpd  0 * SIZE(%esi), %xmm0
        movhpd  1 * SIZE(%esi), %xmm0
        movlpd  0 * SIZE(%esi, LDC), %xmm1
        movhpd  1 * SIZE(%esi, LDC), %xmm1
#endif

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
        subpd   %xmm5, %xmm4
        subpd   %xmm7, %xmm6
#else
        addpd   %xmm5, %xmm4
        addpd   %xmm7, %xmm6
#endif

        pshufd  $0x4e, %xmm4, %xmm5
        pshufd  $0x4e, %xmm6, %xmm7

        mulpd   %xmm2, %xmm4
        mulpd   %xmm3, %xmm5
        mulpd   %xmm2, %xmm6
        mulpd   %xmm3, %xmm7

        addpd   %xmm5, %xmm4
        addpd   %xmm7, %xmm6

#ifndef TRMMKERNEL
        addpd   %xmm0, %xmm4
        addpd   %xmm1, %xmm6
#endif

        movlpd  %xmm4, 0 * SIZE(%esi)
        movhpd  %xmm4, 1 * SIZE(%esi)
        movlpd  %xmm6, 0 * SIZE(%esi, LDC)
        movhpd  %xmm6, 1 * SIZE(%esi, LDC)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movl    K, %eax
        subl    KKK, %eax
        leal    (, %eax, SIZE), %eax
        leal    (AA, %eax, 2), AA
        leal    (BB, %eax, 8), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
        addl    $1, KK
#endif
        addl    $2 * SIZE, %esi # coffset += 2 * SIZE
        decl    %ebx            # i --
        jg      .L10
        ALIGN_4

.L99:
#if defined(TRMMKERNEL) && !defined(LEFT)
        addl    $2, KK
#endif

        leal    (, LDC, 2), %eax
        addl    %eax, C         # c += 2 * ldc
        decl    J               # j --
        jg      .L01
        ALIGN_4

.L100:
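/* N is odd: one column of C remains. It is computed with a 1x1 */
/* micro-kernel against a B strip repacked at four doubles per */
/* k-step (real and imaginary part, each duplicated). */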
        movl    N, %eax
        andl    $1, %eax
        jle     .L500
        ALIGN_4

.L101:
#if defined(TRMMKERNEL) && defined(LEFT)
        movl    OFFSET, %eax
        movl    %eax, KK
#endif

        leal    BUFFER, %ecx
        movapd  POSINV, %xmm7

        movl    K, %eax
        sarl    $2, %eax
        jle     .L103
        ALIGN_4

.L102:
        prefetchnta 56 * SIZE(B)

        movlpd   0 * SIZE(B), %xmm0
        movlpd   1 * SIZE(B), %xmm1
        movlpd   2 * SIZE(B), %xmm2
        movlpd   3 * SIZE(B), %xmm3
        movlpd   4 * SIZE(B), %xmm4
        movlpd   5 * SIZE(B), %xmm5
        movlpd   6 * SIZE(B), %xmm6
        movlpd   7 * SIZE(B), %xmm7

        movlpd  %xmm0,  0 * SIZE(BB)
        movlpd  %xmm0,  1 * SIZE(BB)
        movlpd  %xmm1,  2 * SIZE(BB)
        movlpd  %xmm1,  3 * SIZE(BB)
        movlpd  %xmm2,  4 * SIZE(BB)
        movlpd  %xmm2,  5 * SIZE(BB)
        movlpd  %xmm3,  6 * SIZE(BB)
        movlpd  %xmm3,  7 * SIZE(BB)
        movlpd  %xmm4,  8 * SIZE(BB)
        movlpd  %xmm4,  9 * SIZE(BB)
        movlpd  %xmm5, 10 * SIZE(BB)
        movlpd  %xmm5, 11 * SIZE(BB)
        movlpd  %xmm6, 12 * SIZE(BB)
        movlpd  %xmm6, 13 * SIZE(BB)
        movlpd  %xmm7, 14 * SIZE(BB)
        movlpd  %xmm7, 15 * SIZE(BB)

        addl    $  8 * SIZE, B
        subl    $-16 * SIZE, %ecx
        decl    %eax
        jne     .L102
        ALIGN_4

.L103:
        movl    K, %eax
        andl    $3, %eax
        BRANCH
        jle     .L105
        ALIGN_4

.L104:
        movlpd   0 * SIZE(B), %xmm0
        movlpd   1 * SIZE(B), %xmm1

        movlpd  %xmm0,  0 * SIZE(BB)
        movlpd  %xmm0,  1 * SIZE(BB)
        movlpd  %xmm1,  2 * SIZE(BB)
        movlpd  %xmm1,  3 * SIZE(BB)

        addl    $2 * SIZE, B
        addl    $4 * SIZE, %ecx
        decl    %eax
        jne     .L104
        ALIGN_4

.L105:
        movl    C, %esi         # coffset = c
        movl    A, AA           # aoffset = a
        movl    M, %ebx
        testl   %ebx, %ebx
        jle     .L500
        ALIGN_4

.L110:
#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

        leal    BUFFER, BB
#else
        leal    BUFFER, BB
        movl    KK, %eax
        leal    (, %eax, SIZE), %eax
        leal    (AA, %eax, 2), AA
        leal    (BB, %eax, 4), BB
#endif

        pxor    %xmm4, %xmm4
        pxor    %xmm5, %xmm5
        pxor    %xmm6, %xmm6
        pxor    %xmm7, %xmm7

        movapd   0 * SIZE(AA), %xmm0
        movapd   8 * SIZE(AA), %xmm1
        movapd   0 * SIZE(BB), %xmm2
        movapd   8 * SIZE(BB), %xmm3

#ifndef TRMMKERNEL
        movl    K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movl    K, %eax
        subl    KK, %eax
        movl    %eax, KKK
#else
        movl    KK, %eax
#ifdef LEFT
        addl    $1, %eax
#else
        addl    $1, %eax
#endif
        movl    %eax, KKK
#endif
        sarl    $3, %eax
        je      .L112
        ALIGN_4

.L111:
        PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)

        mulpd   %xmm0, %xmm2
        mulpd    2 * SIZE(BB), %xmm0
        addpd   %xmm2, %xmm4
        movapd   4 * SIZE(BB), %xmm2
        addpd   %xmm0, %xmm5
        movapd   2 * SIZE(AA), %xmm0

        mulpd   %xmm0, %xmm2
        mulpd    6 * SIZE(BB), %xmm0
        addpd   %xmm2, %xmm6
        movapd  16 * SIZE(BB), %xmm2
        addpd   %xmm0, %xmm7
        movapd   4 * SIZE(AA), %xmm0

        mulpd   %xmm0, %xmm3
        mulpd   10 * SIZE(BB), %xmm0
        addpd   %xmm3, %xmm4
        movapd  12 * SIZE(BB), %xmm3
        addpd   %xmm0, %xmm5
        movapd   6 * SIZE(AA), %xmm0

        mulpd   %xmm0, %xmm3
        mulpd   14 * SIZE(BB), %xmm0
        addpd   %xmm3, %xmm6
        movapd  24 * SIZE(BB), %xmm3
        addpd   %xmm0, %xmm7
        movapd  16 * SIZE(AA), %xmm0

        mulpd   %xmm1, %xmm2
        mulpd   18 * SIZE(BB), %xmm1
        addpd   %xmm2, %xmm4
        movapd  20 * SIZE(BB), %xmm2
        addpd   %xmm1, %xmm5
        movapd  10 * SIZE(AA), %xmm1

        mulpd   %xmm1, %xmm2
        mulpd   22 * SIZE(BB), %xmm1
        addpd   %xmm2, %xmm6
        movapd  32 * SIZE(BB), %xmm2
        addpd   %xmm1, %xmm7
        movapd  12 * SIZE(AA), %xmm1

        mulpd   %xmm1, %xmm3
        mulpd   26 * SIZE(BB), %xmm1
        addpd   %xmm3, %xmm4
        movapd  28 * SIZE(BB), %xmm3
        addpd   %xmm1, %xmm5
        movapd  14 * SIZE(AA), %xmm1

        mulpd   %xmm1, %xmm3
        mulpd   30 * SIZE(BB), %xmm1
        addpd   %xmm3, %xmm6
        movapd  40 * SIZE(BB), %xmm3
        addpd   %xmm1, %xmm7
        movapd  24 * SIZE(AA), %xmm1

        addl    $16 * SIZE, AA
        addl    $32 * SIZE, BB
        decl    %eax
        jne     .L111
        ALIGN_4

.L112:
#ifndef TRMMKERNEL
        movl    K, %eax
#else
        movl    KKK, %eax
#endif
        andl    $7, %eax        # if (k & 7)
        BRANCH
        je      .L114
        ALIGN_4

.L113:
        mulpd   %xmm0, %xmm2
        mulpd    2 * SIZE(BB), %xmm0
        addpd   %xmm2, %xmm4
        movapd   4 * SIZE(BB), %xmm2
        addpd   %xmm0, %xmm5
        movapd   2 * SIZE(AA), %xmm0

        addl    $2 * SIZE, AA
        addl    $4 * SIZE, BB
        decl    %eax
        jg      .L113
        ALIGN_4

.L114:
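/* Same conjugation/alpha write-back as the 1x2 case, after folding */
/* the two partial accumulator pairs into %xmm4/%xmm5. */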
        movapd  POSINV, %xmm1
        movapd  ALPHA_R, %xmm2
        movapd  ALPHA_I, %xmm3

        addpd   %xmm6, %xmm4
        addpd   %xmm7, %xmm5

        SHUFPD_1 %xmm5, %xmm5

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)
        xorpd   %xmm1, %xmm5
#else
        xorpd   %xmm1, %xmm4
#endif

#ifndef TRMMKERNEL
        movlpd  0 * SIZE(%esi), %xmm0
        movhpd  1 * SIZE(%esi), %xmm0
#endif

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
        subpd   %xmm5, %xmm4
#else
        addpd   %xmm5, %xmm4
#endif

        pshufd  $0x4e, %xmm4, %xmm5

        mulpd   %xmm2, %xmm4
        mulpd   %xmm3, %xmm5

        addpd   %xmm5, %xmm4

#ifndef TRMMKERNEL
        addpd   %xmm0, %xmm4
#endif

        movlpd  %xmm4, 0 * SIZE(%esi)
        movhpd  %xmm4, 1 * SIZE(%esi)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movl    K, %eax
        subl    KKK, %eax
        leal    (, %eax, SIZE), %eax
        leal    (AA, %eax, 2), AA
        leal    (BB, %eax, 4), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
        addl    $1, KK
#endif
        addl    $2 * SIZE, %esi # coffset += 2 * SIZE
        decl    %ebx            # i --
        jg      .L110
        ALIGN_4

.L500:
        movl    OLD_STACK, %esp

        EMMS

        popl    %ebx
        popl    %esi
        popl    %edi
        popl    %ebp
        ret

        EPILOGUE