
zgemm_kernel_2x2_penryn.S

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"

#define STACK	16
#define ARGS	16

#define M	 4 + STACK + ARGS(%esp)
#define N	 8 + STACK + ARGS(%esp)
#define K	12 + STACK + ARGS(%esp)
#define ALPHA_R	16 + STACK + ARGS(%esp)
#define ALPHA_I	20 + STACK + ARGS(%esp)
#define A	24 + STACK + ARGS(%esp)
#define ARG_B	28 + STACK + ARGS(%esp)
#define C	32 + STACK + ARGS(%esp)
#define ARG_LDC	36 + STACK + ARGS(%esp)
#define OFFSET	40 + STACK + ARGS(%esp)

#define J	 0 + STACK(%esp)
#define BX	 4 + STACK(%esp)
#define KK	 8 + STACK(%esp)
#define KKK	12 + STACK(%esp)

#ifdef NANO
#define PREFETCHSIZE (16 * 3 + 8)
#define PREFETCHW prefetcht0
#define PREFETCHB prefetcht0
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCHSIZE (16 * 1 + 8)
#define PREFETCHW prefetcht0
#define PREFETCHB prefetcht0
#endif

#ifndef PREFETCH
#define PREFETCH prefetcht0
#endif

#ifndef PREFETCHW
#define PREFETCHW prefetcht0
#endif

#ifndef PREFETCHB
#define PREFETCHB prefetcht0
#endif

#ifndef PREFETCHSIZE
#define PREFETCHSIZE (16 * 13 + 8)
#endif

#define AA %edx
#define BB %ecx
#define LDC %ebp
#define B  %edi
#define C1 %esi
#define I  %ebx

#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define ADD1 addps
#define ADD2 addps
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define ADD1 addps
#define ADD2 addps
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define ADD1 addps
#define ADD2 addps
#else
#define ADD1 addps
#define ADD2 subps
#endif
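
/* ADD1/ADD2 select whether the cross terms of the complex multiply are
   accumulated with an add or a subtract, according to the conjugation
   variant (N/T vs. R/C) this kernel is compiled for. */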
	PROLOGUE

	subl	$ARGS, %esp	# Generate Stack Frame

	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	ARG_B,   B
	movl	ARG_LDC, LDC

#ifdef TRMMKERNEL
	movl	OFFSET, %eax
#ifndef LEFT
	negl	%eax
#endif
	movl	%eax, KK
#endif

	subl	$-32 * SIZE, A
	subl	$-32 * SIZE, B

	sall	$ZBASE_SHIFT, LDC

	movl	N,  %eax
	sarl	$1, %eax
	movl	%eax, J
	jle	.L30
	ALIGN_4
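
/* Outer loop: J = N/2 iterations, each pass producing two columns of C. */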
.L01:
#if defined(TRMMKERNEL) && defined(LEFT)
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	movl	B, BX

	movl	C, C1
	movl	A, AA

	movl	M,  %ebx
	sarl	$1, %ebx
	jle	.L20
	ALIGN_4
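
/* 2x2 micro-kernel: I = M/2 iterations; %xmm4-%xmm7 accumulate the
   partial products for a 2x2 block of complex results. */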
.L10:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movl	B, BB
#else
	movl	B, BB
	movl	KK, %eax
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 4), AA
	leal	(BB, %eax, 4), BB
#endif

	movl	BX, %eax
	PREFETCHB  -32 * SIZE(%eax)
	subl	$-16 * SIZE, %eax
	movl	%eax, BX

	movaps	-32 * SIZE(AA), %xmm0
	pxor	%xmm2, %xmm2
	movaps	-32 * SIZE(BB), %xmm1
	pxor	%xmm3, %xmm3

	xorps	%xmm4, %xmm4
	PREFETCHW	3 * SIZE(C1)
	xorps	%xmm5, %xmm5
	PREFETCHW	7 * SIZE(C1, LDC)
	xorps	%xmm6, %xmm6
	xorps	%xmm7, %xmm7

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$2, %eax
#else
	addl	$2, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L15
	ALIGN_4
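
/* Main K loop, unrolled 8x: each pass consumes 32 elements (8 K-steps)
   from both A and B. pshufd reorders the B operand so plain mulps forms
   the real and imaginary partial products; ADD1/ADD2 accumulate them
   with the signs required by the conjugation variant. */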
.L12:
	PREFETCH  (PREFETCHSIZE + 0) * SIZE(AA)

	ADD2	%xmm2, %xmm7
	pshufd	$0xb1, %xmm1, %xmm2
	mulps	%xmm0, %xmm1
	ADD1	%xmm3, %xmm6
	pshufd	$0x1b, %xmm2, %xmm3
	mulps	%xmm0, %xmm2
	ADD2	%xmm2, %xmm5
	pshufd	$0xb1, %xmm3, %xmm2
	mulps	%xmm0, %xmm3
	ADD1	%xmm1, %xmm4
	movaps	-28 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm2
	movaps	-28 * SIZE(AA), %xmm0

	ADD2	%xmm2, %xmm7
	pshufd	$0xb1, %xmm1, %xmm2
	mulps	%xmm0, %xmm1
	ADD1	%xmm3, %xmm6
	pshufd	$0x1b, %xmm2, %xmm3
	mulps	%xmm0, %xmm2
	ADD2	%xmm2, %xmm5
	pshufd	$0xb1, %xmm3, %xmm2
	mulps	%xmm0, %xmm3
	ADD1	%xmm1, %xmm4
	movaps	-24 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm2
	movaps	-24 * SIZE(AA), %xmm0

	ADD2	%xmm2, %xmm7
	pshufd	$0xb1, %xmm1, %xmm2
	mulps	%xmm0, %xmm1
	ADD1	%xmm3, %xmm6
	pshufd	$0x1b, %xmm2, %xmm3
	mulps	%xmm0, %xmm2
	ADD2	%xmm2, %xmm5
	pshufd	$0xb1, %xmm3, %xmm2
	mulps	%xmm0, %xmm3
	ADD1	%xmm1, %xmm4
	movaps	-20 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm2
	movaps	-20 * SIZE(AA), %xmm0

	ADD2	%xmm2, %xmm7
	pshufd	$0xb1, %xmm1, %xmm2
	mulps	%xmm0, %xmm1
	ADD1	%xmm3, %xmm6
	pshufd	$0x1b, %xmm2, %xmm3
	mulps	%xmm0, %xmm2
	ADD2	%xmm2, %xmm5
	pshufd	$0xb1, %xmm3, %xmm2
	mulps	%xmm0, %xmm3
	ADD1	%xmm1, %xmm4
	movaps	-16 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm2
	movaps	-16 * SIZE(AA), %xmm0

	ADD2	%xmm2, %xmm7
	PREFETCH  (PREFETCHSIZE + 16) * SIZE(AA)
	pshufd	$0xb1, %xmm1, %xmm2
	mulps	%xmm0, %xmm1
	ADD1	%xmm3, %xmm6
	pshufd	$0x1b, %xmm2, %xmm3
	mulps	%xmm0, %xmm2
	ADD2	%xmm2, %xmm5
	pshufd	$0xb1, %xmm3, %xmm2
	mulps	%xmm0, %xmm3
	ADD1	%xmm1, %xmm4
	movaps	-12 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm2
	movaps	-12 * SIZE(AA), %xmm0

	ADD2	%xmm2, %xmm7
	pshufd	$0xb1, %xmm1, %xmm2
	mulps	%xmm0, %xmm1
	ADD1	%xmm3, %xmm6
	pshufd	$0x1b, %xmm2, %xmm3
	mulps	%xmm0, %xmm2
	ADD2	%xmm2, %xmm5
	pshufd	$0xb1, %xmm3, %xmm2
	mulps	%xmm0, %xmm3
	ADD1	%xmm1, %xmm4
	movaps	-8 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm2
	movaps	-8 * SIZE(AA), %xmm0

	ADD2	%xmm2, %xmm7
	pshufd	$0xb1, %xmm1, %xmm2
	mulps	%xmm0, %xmm1
	ADD1	%xmm3, %xmm6
	pshufd	$0x1b, %xmm2, %xmm3
	mulps	%xmm0, %xmm2
	ADD2	%xmm2, %xmm5
	pshufd	$0xb1, %xmm3, %xmm2
	mulps	%xmm0, %xmm3
	ADD1	%xmm1, %xmm4
	movaps	-4 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm2
	movaps	-4 * SIZE(AA), %xmm0

	ADD2	%xmm2, %xmm7
	subl	$-32 * SIZE, BB
	pshufd	$0xb1, %xmm1, %xmm2
	mulps	%xmm0, %xmm1
	ADD1	%xmm3, %xmm6
	pshufd	$0x1b, %xmm2, %xmm3
	mulps	%xmm0, %xmm2
	ADD2	%xmm2, %xmm5
	subl	$-32 * SIZE, AA
	pshufd	$0xb1, %xmm3, %xmm2
	mulps	%xmm0, %xmm3
	ADD1	%xmm1, %xmm4
	movaps	-32 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm2
	movaps	-32 * SIZE(AA), %xmm0

	decl	%eax
	jne	.L12
	ALIGN_4
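
/* K remainder: up to 7 leftover iterations, one K-step per pass. */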
.L15:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	andl	$7, %eax
	BRANCH
	je	.L18
	ALIGN_4

.L16:
	ADD2	%xmm2, %xmm7
	pshufd	$0xb1, %xmm1, %xmm2
	mulps	%xmm0, %xmm1
	ADD1	%xmm3, %xmm6
	pshufd	$0x1b, %xmm2, %xmm3
	mulps	%xmm0, %xmm2
	ADD2	%xmm2, %xmm5
	pshufd	$0xb1, %xmm3, %xmm2
	mulps	%xmm0, %xmm3
	ADD1	%xmm1, %xmm4
	movaps	-28 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm2
	movaps	-28 * SIZE(AA), %xmm0

	addl	$4 * SIZE, AA
	addl	$4 * SIZE, BB
	decl	%eax
	jg	.L16
	ALIGN_4
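
/* Writeback for the 2x2 block: flip signs for the conjugation variant,
   fold the partial sums with haddps, scale by alpha using the
   addsubps complex-multiply idiom, optionally add the old contents of
   C, and store the four complex results. */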
.L18:
	ADD2	%xmm2, %xmm7
	pcmpeqb	%xmm0, %xmm0
	ADD1	%xmm3, %xmm6
	psllq	$63, %xmm0

	movsd	ALPHA_R, %xmm3

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
	pxor	%xmm0, %xmm4
	pxor	%xmm0, %xmm6
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
	pshufd	$0xb1, %xmm0, %xmm0

	pxor	%xmm0, %xmm5
	pxor	%xmm0, %xmm7
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
	pxor	%xmm0, %xmm5
	pxor	%xmm0, %xmm7
#endif

	haddps	%xmm5, %xmm4
	haddps	%xmm7, %xmm6

	shufps	$0xd8, %xmm4, %xmm4
	shufps	$0xd8, %xmm6, %xmm6

	movaps	%xmm4, %xmm5
	shufps	$0xe4, %xmm6, %xmm4
	shufps	$0xe4, %xmm5, %xmm6

	pshufd	$0x00, %xmm3, %xmm2
	pshufd	$0x55, %xmm3, %xmm3

	pshufd	$0xb1, %xmm4, %xmm5
	pshufd	$0xb1, %xmm6, %xmm7

	mulps	%xmm2, %xmm4
	mulps	%xmm3, %xmm5
	mulps	%xmm2, %xmm6
	mulps	%xmm3, %xmm7

	addsubps	%xmm5, %xmm4
	addsubps	%xmm7, %xmm6

#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	movsd	0 * SIZE(C1), %xmm2
	movhps	2 * SIZE(C1), %xmm2
	movsd	0 * SIZE(C1, LDC), %xmm3
	movhps	2 * SIZE(C1, LDC), %xmm3

	addps	%xmm2, %xmm4
	addps	%xmm3, %xmm6
#endif

	movsd	%xmm4, 0 * SIZE(C1)
	movhps	%xmm4, 2 * SIZE(C1)
	movsd	%xmm6, 0 * SIZE(C1, LDC)
	movhps	%xmm6, 2 * SIZE(C1, LDC)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 4), AA
	leal	(BB, %eax, 4), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$2, KK
#endif

	addl	$4 * SIZE, C1
	decl	%ebx
	jg	.L10
	ALIGN_4
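
/* M remainder: when M is odd, compute the last 1x2 block of C. */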
.L20:
	movl	M,  %ebx
	testl	$1, %ebx
	jle	.L29

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movl	B, BB
#else
	movl	B, BB
	movl	KK, %eax
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 4), BB
#endif

	movsd	-32 * SIZE(AA), %xmm0
	pxor	%xmm2, %xmm2
	movaps	-32 * SIZE(BB), %xmm1
	pxor	%xmm3, %xmm3

	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$1, %eax
#else
	addl	$2, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L25
	ALIGN_4
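
/* K loop for the 1x2 block, unrolled 8x: pshufd broadcasts each scalar
   of B across a register before the multiply. */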
.L22:
	addps	%xmm2, %xmm6
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm7
	pshufd	$0x55, %xmm1, %xmm3
	mulps	%xmm0, %xmm3

	PREFETCH  (PREFETCHSIZE + 0) * SIZE(AA)

	addps	%xmm2, %xmm4
	pshufd	$0xaa, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0xff, %xmm1, %xmm3
	movaps	-28 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-30 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm6
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm7
	pshufd	$0x55, %xmm1, %xmm3
	mulps	%xmm0, %xmm3
	addps	%xmm2, %xmm4
	pshufd	$0xaa, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0xff, %xmm1, %xmm3
	movaps	-24 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-28 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm6
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm7
	pshufd	$0x55, %xmm1, %xmm3
	mulps	%xmm0, %xmm3
	addps	%xmm2, %xmm4
	pshufd	$0xaa, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0xff, %xmm1, %xmm3
	movaps	-20 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-26 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm6
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm7
	pshufd	$0x55, %xmm1, %xmm3
	mulps	%xmm0, %xmm3
	addps	%xmm2, %xmm4
	pshufd	$0xaa, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0xff, %xmm1, %xmm3
	movaps	-16 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-24 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm6
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm7
	pshufd	$0x55, %xmm1, %xmm3
	mulps	%xmm0, %xmm3
	addps	%xmm2, %xmm4
	pshufd	$0xaa, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0xff, %xmm1, %xmm3
	movaps	-12 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-22 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm6
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm7
	pshufd	$0x55, %xmm1, %xmm3
	mulps	%xmm0, %xmm3
	addps	%xmm2, %xmm4
	pshufd	$0xaa, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0xff, %xmm1, %xmm3
	movaps	-8 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-20 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm6
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm7
	pshufd	$0x55, %xmm1, %xmm3
	mulps	%xmm0, %xmm3
	addps	%xmm2, %xmm4
	pshufd	$0xaa, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0xff, %xmm1, %xmm3
	movaps	-4 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-18 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm6
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm7
	pshufd	$0x55, %xmm1, %xmm3
	mulps	%xmm0, %xmm3
	addps	%xmm2, %xmm4
	pshufd	$0xaa, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0xff, %xmm1, %xmm3
	movaps	 0 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-16 * SIZE(AA), %xmm0

	subl	$-16 * SIZE, AA
	subl	$-32 * SIZE, BB

	decl	%eax
	jne	.L22
	ALIGN_4

.L25:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	andl	$7, %eax
	BRANCH
	je	.L28
	ALIGN_4
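
/* K remainder for the 1x2 block. */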
.L26:
	addps	%xmm2, %xmm6
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm7
	pshufd	$0x55, %xmm1, %xmm3
	mulps	%xmm0, %xmm3
	addps	%xmm2, %xmm4
	pshufd	$0xaa, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0xff, %xmm1, %xmm3
	movaps	-28 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-30 * SIZE(AA), %xmm0

	addl	$2 * SIZE, AA
	addl	$4 * SIZE, BB
	decl	%eax
	jg	.L26
	ALIGN_4
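
/* Writeback for the 1x2 block: same sign fixes and alpha scaling as
   above, but only one complex element per column of C. */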
.L28:
	addps	%xmm2, %xmm6
	addps	%xmm3, %xmm7

	movsd	ALPHA_R, %xmm3

	pshufd	$0xb1, %xmm5, %xmm5
	pcmpeqb	%xmm0, %xmm0
	pshufd	$0xb1, %xmm7, %xmm7
	psllq	$63, %xmm0

#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
	pxor	%xmm0, %xmm5
	pxor	%xmm0, %xmm7

	subps	%xmm5, %xmm4
	subps	%xmm7, %xmm6
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
	pxor	%xmm0, %xmm5
	pxor	%xmm0, %xmm7

	addps	%xmm5, %xmm4
	addps	%xmm7, %xmm6
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
	pxor	%xmm0, %xmm4
	pxor	%xmm0, %xmm6

	addps	%xmm5, %xmm4
	addps	%xmm7, %xmm6
#else
	pxor	%xmm0, %xmm4
	pxor	%xmm0, %xmm6

	subps	%xmm5, %xmm4
	subps	%xmm7, %xmm6
#endif

	pshufd	$0x00, %xmm3, %xmm2
	pshufd	$0x55, %xmm3, %xmm3

	pshufd	$0xb1, %xmm4, %xmm5
	pshufd	$0xb1, %xmm6, %xmm7

	mulps	%xmm2, %xmm4
	mulps	%xmm3, %xmm5
	mulps	%xmm2, %xmm6
	mulps	%xmm3, %xmm7

	pxor	%xmm0, %xmm5
	pxor	%xmm0, %xmm7

	subps	%xmm5, %xmm4
	subps	%xmm7, %xmm6

#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	movsd	0 * SIZE(C1), %xmm2
	movsd	0 * SIZE(C1, LDC), %xmm3

	addps	%xmm2, %xmm4
	addps	%xmm3, %xmm6
#endif

	movsd	%xmm4, 0 * SIZE(C1)
	movsd	%xmm6, 0 * SIZE(C1, LDC)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 4), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$1, KK
#endif

	addl	$2 * SIZE, C1
	ALIGN_2

.L29:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addl	$2, KK
#endif

	movl	BB, B

	leal	(, LDC, 2), %eax
	addl	%eax, C
	decl	J
	jg	.L01
	ALIGN_4
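
/* N remainder: when N is odd, process the last single column of C. */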
.L30:
	movl	N, %eax
	testl	$1, %eax
	jle	.L999

#if defined(TRMMKERNEL) && defined(LEFT)
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	movl	C, C1
	movl	A, AA

	movl	M,  %ebx
	sarl	$1, %ebx
	jle	.L40
	ALIGN_4
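
/* 2x1 blocks: two rows of C against the remaining column of B. */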
.L31:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movl	B, BB
#else
	movl	B, BB
	movl	KK, %eax
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 4), AA
	leal	(BB, %eax, 2), BB
#endif

	movaps	-32 * SIZE(AA), %xmm0
	pxor	%xmm2, %xmm2
	movaps	-32 * SIZE(BB), %xmm1
	pxor	%xmm3, %xmm3

	pxor	%xmm4, %xmm4
	prefetcht0	3 * SIZE(C1)
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$2, %eax
#else
	addl	$1, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L35
	ALIGN_4
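
/* K loop for the 2x1 block, unrolled 8x. */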
.L32:
	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2

	PREFETCH  (PREFETCHSIZE + 0) * SIZE(AA)

	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	mulps	%xmm0, %xmm3
	movaps	-28 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0xaa, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0xff, %xmm1, %xmm3
	movaps	-28 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movaps	-24 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	mulps	%xmm0, %xmm3
	movaps	-20 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0xaa, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0xff, %xmm1, %xmm3
	movaps	-24 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movaps	-16 * SIZE(AA), %xmm0

	PREFETCH  (PREFETCHSIZE + 16) * SIZE(AA)

	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	mulps	%xmm0, %xmm3
	movaps	-12 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0xaa, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0xff, %xmm1, %xmm3
	movaps	-20 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movaps	-8 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	mulps	%xmm0, %xmm3
	movaps	-4 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0xaa, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0xff, %xmm1, %xmm3
	movaps	-16 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movaps	 0 * SIZE(AA), %xmm0

	subl	$-32 * SIZE, AA
	subl	$-16 * SIZE, BB

	decl	%eax
	jne	.L32
	ALIGN_4

.L35:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	movsd	-32 * SIZE(BB), %xmm1

	andl	$7, %eax
	BRANCH
	je	.L38
	ALIGN_4
.L36:
	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	movsd	-30 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movaps	-28 * SIZE(AA), %xmm0

	addl	$4 * SIZE, AA
	addl	$2 * SIZE, BB
	decl	%eax
	jg	.L36
	ALIGN_4

.L38:
	addps	%xmm2, %xmm4
	addps	%xmm3, %xmm5

	movsd	ALPHA_R, %xmm3

	pshufd	$0xb1, %xmm5, %xmm5
	pcmpeqb	%xmm0, %xmm0
	psllq	$63, %xmm0

#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
	pxor	%xmm0, %xmm5

	subps	%xmm5, %xmm4
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
	pxor	%xmm0, %xmm5

	addps	%xmm5, %xmm4
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
	pxor	%xmm0, %xmm4

	addps	%xmm5, %xmm4
#else
	pxor	%xmm0, %xmm4

	subps	%xmm5, %xmm4
#endif

	pshufd	$0x00, %xmm3, %xmm2
	pshufd	$0x55, %xmm3, %xmm3

	pshufd	$0xb1, %xmm4, %xmm5

	mulps	%xmm2, %xmm4
	mulps	%xmm3, %xmm5

	pxor	%xmm0, %xmm5

	subps	%xmm5, %xmm4

#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	movsd	0 * SIZE(C1), %xmm2
	movhps	2 * SIZE(C1), %xmm2

	addps	%xmm2, %xmm4
#endif

	movsd	%xmm4, 0 * SIZE(C1)
	movhps	%xmm4, 2 * SIZE(C1)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 4), AA
	leal	(BB, %eax, 2), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$2, KK
#endif

	addl	$4 * SIZE, C1
	decl	%ebx
	jg	.L31
	ALIGN_4
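
/* Final 1x1 block, reached when both M and N are odd. */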
.L40:
	movl	M,  %ebx
	testl	$1, %ebx
	jle	.L999

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movl	B, BB
#else
	movl	B, BB
	movl	KK, %eax
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 2), BB
#endif

	movsd	-32 * SIZE(AA), %xmm0
	pxor	%xmm2, %xmm2
	movsd	-32 * SIZE(BB), %xmm1
	pxor	%xmm3, %xmm3

	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$1, %eax
#else
	addl	$1, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L45
	ALIGN_4
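
/* K loop for the 1x1 block, unrolled 8x. */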
.L42:
	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	movsd	-30 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-30 * SIZE(AA), %xmm0

	PREFETCH  (PREFETCHSIZE + 0) * SIZE(AA)

	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	movsd	-28 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-28 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	movsd	-26 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-26 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	movsd	-24 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-24 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	movsd	-22 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-22 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	movsd	-20 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-20 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	movsd	-18 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-18 * SIZE(AA), %xmm0

	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	movsd	-16 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-16 * SIZE(AA), %xmm0

	subl	$-16 * SIZE, AA
	subl	$-16 * SIZE, BB

	decl	%eax
	jne	.L42
	ALIGN_4

.L45:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	andl	$7, %eax
	BRANCH
	je	.L48
	ALIGN_4
.L46:
	addps	%xmm2, %xmm4
	pshufd	$0x00, %xmm1, %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm3, %xmm5
	pshufd	$0x55, %xmm1, %xmm3
	movsd	-30 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm3
	movsd	-30 * SIZE(AA), %xmm0

	addl	$2 * SIZE, AA
	addl	$2 * SIZE, BB
	decl	%eax
	jg	.L46
	ALIGN_4

.L48:
	addps	%xmm2, %xmm4
	addps	%xmm3, %xmm5

	movsd	ALPHA_R, %xmm3

	pshufd	$0xb1, %xmm5, %xmm5
	pcmpeqb	%xmm0, %xmm0
	psllq	$63, %xmm0

#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
	pxor	%xmm0, %xmm5

	subps	%xmm5, %xmm4
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
	pxor	%xmm0, %xmm5

	addps	%xmm5, %xmm4
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
	pxor	%xmm0, %xmm4

	addps	%xmm5, %xmm4
#else
	pxor	%xmm0, %xmm4

	subps	%xmm5, %xmm4
#endif

	pshufd	$0x00, %xmm3, %xmm2
	pshufd	$0x55, %xmm3, %xmm3

	pshufd	$0xb1, %xmm4, %xmm5

	mulps	%xmm2, %xmm4
	mulps	%xmm3, %xmm5

	pxor	%xmm0, %xmm5

	subps	%xmm5, %xmm4

#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	movsd	0 * SIZE(C1), %xmm2

	addps	%xmm2, %xmm4
#endif

	movsd	%xmm4, 0 * SIZE(C1)
	ALIGN_4

.L999:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp

	addl	$ARGS, %esp
	ret

	EPILOGUE