You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

zgemm3m_kernel_2x2_atom.S 14 kB

(line-number gutter 1–734 from the web file view; not part of the source)
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  # --- ZGEMM3M 2x2 kernel, x86-32 (file name says Intel Atom tuning) ----
  # Scalar SSE2 double-precision micro-kernel.  Arguments arrive on the
  # stack (cdecl); the offsets below are relative to %esp after the
  # prologue has reserved ARGS bytes and pushed four registers (= STACK).
  38. #define ASSEMBLER
  39. #include "common.h"
  # STACK = 4 saved registers * 4 bytes; ARGS = local scratch area.
  40. #define STACK 16
  41. #define ARGS 16
  # Incoming arguments (offset + STACK + ARGS from %esp).
  # ALPHA_R/ALPHA_I are 8 bytes apart, i.e. two doubles.
  42. #define M 4 + STACK + ARGS(%esp)
  43. #define N 8 + STACK + ARGS(%esp)
  44. #define K 12 + STACK + ARGS(%esp)
  45. #define ALPHA_R 16 + STACK + ARGS(%esp)
  46. #define ALPHA_I 24 + STACK + ARGS(%esp)
  47. #define A 32 + STACK + ARGS(%esp)
  48. #define ARG_B 36 + STACK + ARGS(%esp)
  49. #define C 40 + STACK + ARGS(%esp)
  50. #define ARG_LDC 44 + STACK + ARGS(%esp)
  # Locals kept in the ARGS scratch area:
  #   J   = outer loop counter (pairs of C columns)
  #   BX  = pointer used only to prefetch the next B panel
  #   KK / KKK = TRMM offset bookkeeping (TRMMKERNEL builds only)
  51. #define J 0 + STACK(%esp)
  52. #define BX 4 + STACK(%esp)
  53. #define KK 8 + STACK(%esp)
  54. #define KKK 12 + STACK(%esp)
  # Software-prefetch instruction and distance (in elements) ahead of AA;
  # presumably tuned for Atom per the file name — confirm on other cores.
  55. #define PREFETCH prefetcht0
  56. #define PREFETCHSIZE 84
  # Register roles for the whole kernel:
  #   AA = current A panel, BB = current B panel, CO1 = current C column,
  #   LDC = row stride of C in bytes (scaled in the prologue), B = B base.
  57. #define AA %edx
  58. #define BB %ecx
  59. #define CO1 %esi
  60. #define LDC %ebp
  61. #define B %edi
  # Function entry: reserve locals, save the four callee-saved registers
  # (matching the pops at .L999), then load B and LDC into their home regs.
  62. PROLOGUE
  63. subl $ARGS, %esp
  64. pushl %ebp
  65. pushl %edi
  66. pushl %esi
  67. pushl %ebx
  68. PROFCODE
  69. movl ARG_B, B
  70. movl ARG_LDC, LDC
  # TRMM builds: KK starts at OFFSET (negated for the right-side case).
  71. #ifdef TRMMKERNEL
  72. movl OFFSET, %eax
  73. #ifndef LEFT
  74. negl %eax
  75. #endif
  76. movl %eax, KK
  77. #endif
  # Scale LDC from elements to bytes (ZBASE_SHIFT: complex element size
  # from common.h — confirm).
  78. sall $ZBASE_SHIFT, LDC
  # J = N >> 1 : iterate over pairs of C columns; odd column handled at .L30.
  79. movl N, %eax
  80. sarl $1, %eax
  81. movl %eax, J
  82. jle .L30
  83. ALIGN_2
  # ======================================================================
  # .L10: outer loop — one iteration per pair of C columns (J counts N>>1).
  # ======================================================================
  84. .L10:
  85. #if defined(TRMMKERNEL) && defined(LEFT)
  86. movl OFFSET, %eax
  87. movl %eax, KK
  88. #endif
  # BX = end of the current 2-column B panel (K * 2 elements past B);
  # used below purely as a prefetch cursor for the next panel.
  89. movl K, %eax
  90. sall $BASE_SHIFT + 1, %eax
  91. leal (B, %eax), %eax
  92. movl %eax, BX
  93. movl C, CO1 # coffset = c
  # Advance C by two columns (2 * LDC bytes).
  94. leal (, LDC, 2), %eax
  95. addl %eax, C
  96. movl A, AA # aoffset = a
  97. movl M, %ebx
  98. sarl $1, %ebx # i = (m >> 1)
  99. jle .L20
  100. ALIGN_4
  # --------------------------------------------------------------------
  # .L11: 2x2 micro-tile.  Accumulators (all scalar doubles):
  #   xmm4 = a0*b0   xmm5 = a0*b1   xmm6 = a1*b0   xmm7 = a1*b1
  # xmm2/xmm3 hold the previous iteration's a1 products and are folded
  # into xmm6/xmm7 at the top of each iteration (software pipelining),
  # hence the final addsd pair at .L18.
  # --------------------------------------------------------------------
  101. .L11:
  # TRMM: skip the triangular part by advancing AA/BB by KK elements each.
  102. #if !defined(TRMMKERNEL) || \
  103. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  104. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  105. movl B, BB
  106. #else
  107. movl KK, %eax
  108. leal (, %eax, SIZE), %eax
  109. leal (AA, %eax, 2), AA
  110. leal (B, %eax, 2), BB
  111. #endif
  # Prefetch ahead into the next B panel, stepping BX by 8 elements/tile.
  112. movl BX, %eax
  113. prefetcht0 0 * SIZE(%eax)
  114. subl $-8 * SIZE, BX
  # Preload a0 and zero all accumulators; prefetch both C target rows.
  115. movsd 0 * SIZE(AA), %xmm0
  116. xorps %xmm2, %xmm2
  117. xorps %xmm3, %xmm3
  118. xorps %xmm4, %xmm4
  119. prefetcht0 3 * SIZE(CO1)
  120. xorps %xmm5, %xmm5
  121. prefetcht0 3 * SIZE(CO1, LDC)
  122. xorps %xmm6, %xmm6
  123. xorps %xmm7, %xmm7
  # Trip count: plain GEMM uses K; TRMM uses KKK = K-KK or KK+2
  # depending on side/transpose (2 = unroll in both M and N here).
  124. #ifndef TRMMKERNEL
  125. movl K, %eax
  126. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  127. movl K, %eax
  128. subl KK, %eax
  129. movl %eax, KKK
  130. #else
  131. movl KK, %eax
  132. #ifdef LEFT
  133. addl $2, %eax
  134. #else
  135. addl $2, %eax
  136. #endif
  137. movl %eax, KKK
  138. #endif
  # Main k-loop runs k>>2 times, 4 rank-1 updates per pass.
  139. sarl $2, %eax
  140. je .L15
  141. ALIGN_4
  # .L12: k-loop unrolled 4x — consumes 8 doubles from AA and 8 from BB
  # per pass.  Each 4-instruction group: fold previous a1 product, load
  # next operand, duplicate a into xmm1/xmm3, multiply by b0/b1.
  142. .L12:
  143. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  144. addsd %xmm2, %xmm6
  145. movsd 1 * SIZE(AA), %xmm2
  146. movaps %xmm0, %xmm1
  147. mulsd 0 * SIZE(BB), %xmm0
  148. addsd %xmm3, %xmm7
  149. mulsd 1 * SIZE(BB), %xmm1
  150. addsd %xmm0, %xmm4
  151. movsd 2 * SIZE(AA), %xmm0
  152. movaps %xmm2, %xmm3
  153. mulsd 0 * SIZE(BB), %xmm2
  154. addsd %xmm1, %xmm5
  155. mulsd 1 * SIZE(BB), %xmm3
  156. addsd %xmm2, %xmm6
  157. movsd 3 * SIZE(AA), %xmm2
  158. movaps %xmm0, %xmm1
  159. mulsd 2 * SIZE(BB), %xmm0
  160. addsd %xmm3, %xmm7
  161. mulsd 3 * SIZE(BB), %xmm1
  162. addsd %xmm0, %xmm4
  163. movsd 4 * SIZE(AA), %xmm0
  164. movaps %xmm2, %xmm3
  165. mulsd 2 * SIZE(BB), %xmm2
  166. addsd %xmm1, %xmm5
  167. mulsd 3 * SIZE(BB), %xmm3
  168. addsd %xmm2, %xmm6
  169. movsd 5 * SIZE(AA), %xmm2
  170. movaps %xmm0, %xmm1
  171. mulsd 4 * SIZE(BB), %xmm0
  172. addsd %xmm3, %xmm7
  173. mulsd 5 * SIZE(BB), %xmm1
  174. addsd %xmm0, %xmm4
  175. movsd 6 * SIZE(AA), %xmm0
  176. movaps %xmm2, %xmm3
  177. mulsd 4 * SIZE(BB), %xmm2
  178. addsd %xmm1, %xmm5
  179. mulsd 5 * SIZE(BB), %xmm3
  180. addsd %xmm2, %xmm6
  181. movsd 7 * SIZE(AA), %xmm2
  182. movaps %xmm0, %xmm1
  183. mulsd 6 * SIZE(BB), %xmm0
  184. addsd %xmm3, %xmm7
  185. mulsd 7 * SIZE(BB), %xmm1
  186. addsd %xmm0, %xmm4
  # Note: this preloads a0 for the NEXT pass (8*SIZE before the pointer
  # bump below), keeping the pipeline primed across iterations.
  187. movsd 8 * SIZE(AA), %xmm0
  188. movaps %xmm2, %xmm3
  189. mulsd 6 * SIZE(BB), %xmm2
  190. addsd %xmm1, %xmm5
  191. mulsd 7 * SIZE(BB), %xmm3
  192. addl $8 * SIZE, BB
  193. addl $8 * SIZE, AA
  194. decl %eax
  195. jne .L12
  196. ALIGN_4
  # .L15/.L16: handle the k & 3 leftover iterations, one rank-1 update each.
  197. .L15:
  198. #ifndef TRMMKERNEL
  199. movl K, %eax
  200. #else
  201. movl KKK, %eax
  202. #endif
  203. andl $3, %eax # k remainder (k & 3)
  204. BRANCH
  205. je .L18
  206. ALIGN_3
  207. .L16:
  208. addsd %xmm2, %xmm6
  209. movsd 1 * SIZE(AA), %xmm2
  210. movaps %xmm0, %xmm1
  211. mulsd 0 * SIZE(BB), %xmm0
  212. addsd %xmm3, %xmm7
  213. mulsd 1 * SIZE(BB), %xmm1
  214. addsd %xmm0, %xmm4
  215. movsd 2 * SIZE(AA), %xmm0
  216. movaps %xmm2, %xmm3
  217. mulsd 0 * SIZE(BB), %xmm2
  218. addsd %xmm1, %xmm5
  219. mulsd 1 * SIZE(BB), %xmm3
  220. addl $2 * SIZE, AA
  221. addl $2 * SIZE, BB
  222. decl %eax
  223. jg .L16
  224. ALIGN_4
  # .L18: drain the pipeline (fold last xmm2/xmm3), then update C.
  # For each accumulated real value v:  C[2i] += v*alpha_r and
  # C[2i+1] += v*alpha_i — i.e. the even/odd (real/imag) slots of the
  # interleaved complex C, consistent with the 3M scheme the file name
  # implies (movlps stores the low 64 bits, one double).
  225. .L18:
  226. movsd ALPHA_R, %xmm0
  227. movsd ALPHA_I, %xmm1
  228. addsd %xmm2, %xmm6
  229. addsd %xmm3, %xmm7
  # Column 0 of the tile (rows a0, a1 -> CO1[0..3]).
  230. movaps %xmm4, %xmm2
  231. mulsd %xmm0, %xmm4
  232. mulsd %xmm1, %xmm2
  233. movaps %xmm6, %xmm3
  234. mulsd %xmm0, %xmm6
  235. mulsd %xmm1, %xmm3
  236. addsd 0 * SIZE(CO1), %xmm4
  237. addsd 1 * SIZE(CO1), %xmm2
  238. addsd 2 * SIZE(CO1), %xmm6
  239. addsd 3 * SIZE(CO1), %xmm3
  240. movlps %xmm4, 0 * SIZE(CO1)
  241. movlps %xmm2, 1 * SIZE(CO1)
  242. movlps %xmm6, 2 * SIZE(CO1)
  243. movlps %xmm3, 3 * SIZE(CO1)
  # Column 1 of the tile (CO1 + LDC).
  244. movaps %xmm5, %xmm2
  245. mulsd %xmm0, %xmm5
  246. mulsd %xmm1, %xmm2
  247. movaps %xmm7, %xmm3
  248. mulsd %xmm0, %xmm7
  249. mulsd %xmm1, %xmm3
  250. addsd 0 * SIZE(CO1, LDC), %xmm5
  251. addsd 1 * SIZE(CO1, LDC), %xmm2
  252. addsd 2 * SIZE(CO1, LDC), %xmm7
  253. addsd 3 * SIZE(CO1, LDC), %xmm3
  254. movlps %xmm5, 0 * SIZE(CO1, LDC)
  255. movlps %xmm2, 1 * SIZE(CO1, LDC)
  256. movlps %xmm7, 2 * SIZE(CO1, LDC)
  257. movlps %xmm3, 3 * SIZE(CO1, LDC)
  # Next 2-row tile: advance CO1 by 4 doubles (2 complex elements).
  258. addl $4 * SIZE, CO1
  259. decl %ebx
  260. jg .L11
  261. ALIGN_4
  # ======================================================================
  # .L20: M-remainder — one 1x2 tile when M is odd.
  # Accumulators: xmm4 = a*b0, xmm5 = a*b1; xmm2/xmm3 carry the pipelined
  # products folded at the top of each pass and drained at .L28.
  # ======================================================================
  262. .L20:
  263. movl M, %ebx
  264. testl $1, %ebx
  265. jle .L29
  # TRMM: offset AA by KK (1 row) and BB by 2*KK (2 columns).
  266. #if !defined(TRMMKERNEL) || \
  267. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  268. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  269. movl B, BB
  270. #else
  271. movl KK, %eax
  272. leal (, %eax, SIZE), %eax
  273. leal (AA, %eax, 1), AA
  274. leal (B, %eax, 2), BB
  275. #endif
  276. movsd 0 * SIZE(AA), %xmm0
  277. xorps %xmm2, %xmm2
  278. xorps %xmm3, %xmm3
  279. xorps %xmm4, %xmm4
  280. xorps %xmm5, %xmm5
  # Trip count: K, or TRMM-adjusted KKK (unroll: 1 in M, 2 in N).
  281. #ifndef TRMMKERNEL
  282. movl K, %eax
  283. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  284. movl K, %eax
  285. subl KK, %eax
  286. movl %eax, KKK
  287. #else
  288. movl KK, %eax
  289. #ifdef LEFT
  290. addl $1, %eax
  291. #else
  292. addl $2, %eax
  293. #endif
  294. movl %eax, KKK
  295. #endif
  296. sarl $2, %eax
  297. je .L25
  298. ALIGN_4
  # .L22: k-loop unrolled 4x — 4 doubles from AA, 8 from BB per pass.
  299. .L22:
  300. addsd %xmm2, %xmm4
  301. movsd 0 * SIZE(BB), %xmm2
  302. addsd %xmm3, %xmm5
  303. movsd 1 * SIZE(BB), %xmm3
  304. mulsd %xmm0, %xmm2
  305. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  306. mulsd %xmm0, %xmm3
  307. movsd 1 * SIZE(AA), %xmm0
  308. addsd %xmm2, %xmm4
  309. movsd 2 * SIZE(BB), %xmm2
  310. addsd %xmm3, %xmm5
  311. movsd 3 * SIZE(BB), %xmm3
  312. mulsd %xmm0, %xmm2
  313. mulsd %xmm0, %xmm3
  314. movsd 2 * SIZE(AA), %xmm0
  315. addsd %xmm2, %xmm4
  316. movsd 4 * SIZE(BB), %xmm2
  317. addsd %xmm3, %xmm5
  318. movsd 5 * SIZE(BB), %xmm3
  319. mulsd %xmm0, %xmm2
  320. mulsd %xmm0, %xmm3
  321. movsd 3 * SIZE(AA), %xmm0
  322. addsd %xmm2, %xmm4
  323. movsd 6 * SIZE(BB), %xmm2
  324. addsd %xmm3, %xmm5
  325. movsd 7 * SIZE(BB), %xmm3
  326. mulsd %xmm0, %xmm2
  327. mulsd %xmm0, %xmm3
  328. movsd 4 * SIZE(AA), %xmm0
  329. addl $4 * SIZE, AA
  330. addl $8 * SIZE, BB
  331. decl %eax
  332. jne .L22
  333. ALIGN_4
  # .L25/.L26: k & 3 leftover iterations.
  334. .L25:
  335. #ifndef TRMMKERNEL
  336. movl K, %eax
  337. #else
  338. movl KKK, %eax
  339. #endif
  340. andl $3, %eax # k remainder (k & 3)
  341. BRANCH
  342. je .L28
  343. ALIGN_3
  344. .L26:
  345. addsd %xmm2, %xmm4
  346. movsd 0 * SIZE(BB), %xmm2
  347. addsd %xmm3, %xmm5
  348. movsd 1 * SIZE(BB), %xmm3
  349. mulsd %xmm0, %xmm2
  350. mulsd %xmm0, %xmm3
  351. movsd 1 * SIZE(AA), %xmm0
  352. addl $1 * SIZE, AA
  353. addl $2 * SIZE, BB
  354. decl %eax
  355. jg .L26
  356. ALIGN_4
  # .L28: drain pipeline, scale by alpha_r/alpha_i, and update one
  # complex element in each of the two C columns (real slot, imag slot).
  357. .L28:
  358. movsd ALPHA_R, %xmm0
  359. movsd ALPHA_I, %xmm1
  360. addsd %xmm2, %xmm4
  361. addsd %xmm3, %xmm5
  362. movaps %xmm4, %xmm2
  363. mulsd %xmm0, %xmm4
  364. mulsd %xmm1, %xmm2
  365. movaps %xmm5, %xmm3
  366. mulsd %xmm0, %xmm5
  367. mulsd %xmm1, %xmm3
  368. addsd 0 * SIZE(CO1), %xmm4
  369. addsd 1 * SIZE(CO1), %xmm2
  370. addsd 0 * SIZE(CO1, LDC), %xmm5
  371. addsd 1 * SIZE(CO1, LDC), %xmm3
  372. movlps %xmm4, 0 * SIZE(CO1)
  373. movlps %xmm2, 1 * SIZE(CO1)
  374. movlps %xmm5, 0 * SIZE(CO1, LDC)
  375. movlps %xmm3, 1 * SIZE(CO1, LDC)
  376. ALIGN_4
  # .L29: end of one column pair — B advances to the consumed panel end
  # (BB points past it after the loops above), then loop on J.
  377. .L29:
  378. #if defined(TRMMKERNEL) && !defined(LEFT)
  379. addl $2, KK
  380. #endif
  381. movl BB, B
  382. decl J
  383. jg .L10
  384. ALIGN_4
  # ======================================================================
  # .L30: N-remainder — last single C column when N is odd.
  # First the 2x1 tiles (.L31), then a 1x1 tile at .L40.
  # ======================================================================
  385. .L30:
  386. testl $1, N
  387. je .L999
  388. #if defined(TRMMKERNEL) && defined(LEFT)
  389. movl OFFSET, %eax
  390. movl %eax, KK
  391. #endif
  392. movl C, CO1
  393. addl LDC, C
  394. movl A, AA
  395. movl M, %ebx
  396. sarl $1, %ebx
  397. jle .L40
  398. ALIGN_4
  # .L31: 2x1 tile.  b is kept live in xmm1 across the loop; xmm4/xmm6
  # accumulate a0*b and a1*b, with xmm0/xmm2 carrying pipelined products
  # drained at .L38.
  399. .L31:
  # TRMM: offset AA by 2*KK (2 rows) and BB by KK (1 column).
  400. #if !defined(TRMMKERNEL) || \
  401. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  402. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  403. movl B, BB
  404. #else
  405. movl KK, %eax
  406. leal (, %eax, SIZE), %eax
  407. leal (AA, %eax, 2), AA
  408. leal (B, %eax, 1), BB
  409. #endif
  410. movsd 0 * SIZE(BB), %xmm1
  411. xorps %xmm0, %xmm0
  412. prefetcht0 3 * SIZE(CO1)
  413. xorps %xmm2, %xmm2
  414. xorps %xmm4, %xmm4
  415. xorps %xmm6, %xmm6
  # Trip count: K, or TRMM-adjusted KKK (unroll: 2 in M, 1 in N).
  416. #ifndef TRMMKERNEL
  417. movl K, %eax
  418. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  419. movl K, %eax
  420. subl KK, %eax
  421. movl %eax, KKK
  422. #else
  423. movl KK, %eax
  424. #ifdef LEFT
  425. addl $2, %eax
  426. #else
  427. addl $1, %eax
  428. #endif
  429. movl %eax, KKK
  430. #endif
  431. sarl $2, %eax
  432. je .L35
  433. ALIGN_4
  # .L32: k-loop unrolled 4x — 8 doubles from AA, 4 from BB per pass.
  434. .L32:
  435. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  436. addsd %xmm0, %xmm4
  437. movsd 0 * SIZE(AA), %xmm0
  438. addsd %xmm2, %xmm6
  439. movsd 1 * SIZE(AA), %xmm2
  440. mulsd %xmm1, %xmm0
  441. mulsd %xmm1, %xmm2
  442. movsd 1 * SIZE(BB), %xmm1
  443. addsd %xmm0, %xmm4
  444. movsd 2 * SIZE(AA), %xmm0
  445. addsd %xmm2, %xmm6
  446. movsd 3 * SIZE(AA), %xmm2
  447. mulsd %xmm1, %xmm0
  448. mulsd %xmm1, %xmm2
  449. movsd 2 * SIZE(BB), %xmm1
  450. addsd %xmm0, %xmm4
  451. movsd 4 * SIZE(AA), %xmm0
  452. addsd %xmm2, %xmm6
  453. movsd 5 * SIZE(AA), %xmm2
  454. mulsd %xmm1, %xmm0
  455. mulsd %xmm1, %xmm2
  456. movsd 3 * SIZE(BB), %xmm1
  457. addsd %xmm0, %xmm4
  458. movsd 6 * SIZE(AA), %xmm0
  459. addsd %xmm2, %xmm6
  460. movsd 7 * SIZE(AA), %xmm2
  461. mulsd %xmm1, %xmm0
  462. mulsd %xmm1, %xmm2
  463. movsd 4 * SIZE(BB), %xmm1
  464. addl $8 * SIZE, AA
  465. addl $4 * SIZE, BB
  466. decl %eax
  467. jne .L32
  468. ALIGN_4
  # .L35/.L36: k & 3 leftover iterations.
  469. .L35:
  470. #ifndef TRMMKERNEL
  471. movl K, %eax
  472. #else
  473. movl KKK, %eax
  474. #endif
  475. andl $3, %eax # k remainder (k & 3)
  476. BRANCH
  477. je .L38
  478. ALIGN_3
  479. .L36:
  480. addsd %xmm0, %xmm4
  481. movsd 0 * SIZE(AA), %xmm0
  482. addsd %xmm2, %xmm6
  483. movsd 1 * SIZE(AA), %xmm2
  484. mulsd %xmm1, %xmm0
  485. mulsd %xmm1, %xmm2
  486. movsd 1 * SIZE(BB), %xmm1
  487. addl $2 * SIZE, AA
  488. addl $1 * SIZE, BB
  489. decl %eax
  490. jg .L36
  491. ALIGN_4
  # .L38: drain pipeline, scale by alpha_r/alpha_i, update two complex
  # elements (real/imag slots) of the single C column.
  492. .L38:
  493. addsd %xmm0, %xmm4
  494. addsd %xmm2, %xmm6
  495. movsd ALPHA_R, %xmm0
  496. movsd ALPHA_I, %xmm1
  497. movaps %xmm4, %xmm2
  498. mulsd %xmm0, %xmm4
  499. mulsd %xmm1, %xmm2
  500. movaps %xmm6, %xmm3
  501. mulsd %xmm0, %xmm6
  502. mulsd %xmm1, %xmm3
  503. addsd 0 * SIZE(CO1), %xmm4
  504. addsd 1 * SIZE(CO1), %xmm2
  505. addsd 2 * SIZE(CO1), %xmm6
  506. addsd 3 * SIZE(CO1), %xmm3
  507. movlps %xmm4, 0 * SIZE(CO1)
  508. movlps %xmm2, 1 * SIZE(CO1)
  509. movlps %xmm6, 2 * SIZE(CO1)
  510. movlps %xmm3, 3 * SIZE(CO1)
  511. addl $4 * SIZE, CO1
  512. decl %ebx
  513. jg .L31
  514. ALIGN_4
  # ======================================================================
  # .L40: final 1x1 tile (M odd and N odd).  Two accumulators xmm4/xmm5
  # alternate to break the dependency chain; summed at .L48.
  # ======================================================================
  515. .L40:
  516. movl M, %ebx
  517. testl $1, %ebx
  518. jle .L999
  # TRMM: offset both AA and BB by KK elements.
  519. #if !defined(TRMMKERNEL) || \
  520. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  521. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  522. movl B, BB
  523. #else
  524. movl KK, %eax
  525. leal (, %eax, SIZE), %eax
  526. leal (AA, %eax, 1), AA
  527. leal (B, %eax, 1), BB
  528. #endif
  529. movsd 0 * SIZE(AA), %xmm0
  530. xorps %xmm4, %xmm4
  531. movsd 0 * SIZE(BB), %xmm2
  532. xorps %xmm5, %xmm5
  # Trip count: K, or TRMM-adjusted KKK (unroll 1 in both dimensions,
  # hence +1 on both #ifdef branches).
  533. #ifndef TRMMKERNEL
  534. movl K, %eax
  535. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  536. movl K, %eax
  537. subl KK, %eax
  538. movl %eax, KKK
  539. #else
  540. movl KK, %eax
  541. #ifdef LEFT
  542. addl $1, %eax
  543. #else
  544. addl $1, %eax
  545. #endif
  546. movl %eax, KKK
  547. #endif
  548. sarl $2, %eax
  549. je .L45
  550. ALIGN_4
  # .L42: k-loop unrolled 4x — products alternate into xmm4 and xmm5.
  551. .L42:
  552. mulsd %xmm0, %xmm2
  553. movsd 1 * SIZE(AA), %xmm0
  554. addsd %xmm2, %xmm4
  555. movsd 1 * SIZE(BB), %xmm2
  556. mulsd %xmm0, %xmm2
  557. movsd 2 * SIZE(AA), %xmm0
  558. addsd %xmm2, %xmm5
  559. movsd 2 * SIZE(BB), %xmm2
  560. mulsd %xmm0, %xmm2
  561. movsd 3 * SIZE(AA), %xmm0
  562. addsd %xmm2, %xmm4
  563. movsd 3 * SIZE(BB), %xmm2
  564. mulsd %xmm0, %xmm2
  565. movsd 4 * SIZE(AA), %xmm0
  566. addsd %xmm2, %xmm5
  567. movsd 4 * SIZE(BB), %xmm2
  568. addl $4 * SIZE, AA
  569. addl $4 * SIZE, BB
  570. decl %eax
  571. jne .L42
  572. ALIGN_4
  # .L45/.L46: k & 3 leftover iterations.
  573. .L45:
  574. #ifndef TRMMKERNEL
  575. movl K, %eax
  576. #else
  577. movl KKK, %eax
  578. #endif
  579. andl $3, %eax # k remainder (k & 3)
  580. BRANCH
  581. je .L48
  582. ALIGN_3
  583. .L46:
  584. mulsd %xmm0, %xmm2
  585. movsd 1 * SIZE(AA), %xmm0
  586. addsd %xmm2, %xmm4
  587. movsd 1 * SIZE(BB), %xmm2
  588. addl $1 * SIZE, AA
  589. addl $1 * SIZE, BB
  590. decl %eax
  591. jg .L46
  592. ALIGN_4
  # .L48: combine the two accumulators, scale by alpha_r/alpha_i, update
  # the last complex element (real slot, imag slot) of C.
  593. .L48:
  594. addsd %xmm5, %xmm4
  595. movsd ALPHA_R, %xmm0
  596. movsd ALPHA_I, %xmm1
  597. movaps %xmm4, %xmm2
  598. mulsd %xmm0, %xmm4
  599. mulsd %xmm1, %xmm2
  600. addsd 0 * SIZE(CO1), %xmm4
  601. addsd 1 * SIZE(CO1), %xmm2
  602. movlps %xmm4, 0 * SIZE(CO1)
  603. movlps %xmm2, 1 * SIZE(CO1)
  604. ALIGN_4
  # .L999: restore callee-saved registers (reverse of the prologue pushes)
  # and release the locals.
  605. .L999:
  606. popl %ebx
  607. popl %esi
  608. popl %edi
  609. popl %ebp
  610. addl $ARGS, %esp
  611. ret
  612. EPILOGUE