
gemm_kernel_2x2_atom.S
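
This file is a hand-written 32-bit x86 (scalar SSE2) 2x2 double-precision GEMM micro-kernel in the GotoBLAS2/OpenBLAS style, tuned for the Intel Atom; when TRMMKERNEL is defined the same source is built as the corresponding TRMM kernel. As a reading aid, the plain-C sketch below shows the computation the plain GEMM path performs, assuming the packed-panel layout implied by the address arithmetic (two values of A and two of B consumed per k step), a column-major C with leading dimension ldc, and even m and n. The odd-row, odd-column and TRMM paths handled by the assembly are omitted, and every name in the sketch is illustrative rather than taken from the source.

#include <stddef.h>

/* Reference semantics only: each 2x2 tile of C gets += alpha * A_panel * B_panel. */
static void gemm_kernel_2x2_ref(int m, int n, int k, double alpha,
                                const double *a, const double *b,
                                double *c, int ldc)
{
    for (int j = 0; j < n; j += 2) {              /* two columns of C at a time */
        const double *bp = b + (size_t)j * k;     /* packed 2-column panel of B */
        for (int i = 0; i < m; i += 2) {          /* two rows of C at a time    */
            const double *ap = a + (size_t)i * k; /* packed 2-row panel of A    */
            double c00 = 0.0, c10 = 0.0, c01 = 0.0, c11 = 0.0;
            for (int l = 0; l < k; l++) {         /* one rank-1 update per step */
                double a0 = ap[2 * l], a1 = ap[2 * l + 1];
                double b0 = bp[2 * l], b1 = bp[2 * l + 1];
                c00 += a0 * b0;  c10 += a1 * b0;
                c01 += a0 * b1;  c11 += a1 * b1;
            }
            c[(i + 0) + (size_t)(j + 0) * ldc] += alpha * c00;
            c[(i + 1) + (size_t)(j + 0) * ldc] += alpha * c10;
            c[(i + 0) + (size_t)(j + 1) * ldc] += alpha * c01;
            c[(i + 1) + (size_t)(j + 1) * ldc] += alpha * c11;
        }
    }
}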

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#define STACK 16
#define ARGS 16

#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
#define ALPHA 16 + STACK + ARGS(%esp)
#define A 24 + STACK + ARGS(%esp)
#define ARG_B 28 + STACK + ARGS(%esp)
#define C 32 + STACK + ARGS(%esp)
#define ARG_LDC 36 + STACK + ARGS(%esp)
#define OFFSET 40 + STACK + ARGS(%esp)

#define J 0 + STACK(%esp)
#define BX 4 + STACK(%esp)
#define KK 8 + STACK(%esp)
#define KKK 12 + STACK(%esp)

#define PREFETCH prefetcht0
#define PREFETCHSIZE 84

#define AA %edx
#define BB %ecx
#define CO1 %esi
#define LDC %ebp
#define B %edi
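
/* Register roles for the loops below: AA and BB walk the packed A and B
   panels, CO1 walks the current column(s) of C, B keeps the base of the
   packed B panel for the current group of columns, and LDC holds the
   column stride of C (converted from elements to bytes right after the
   prologue). */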
	PROLOGUE

	subl $ARGS, %esp

	pushl %ebp
	pushl %edi
	pushl %esi
	pushl %ebx

	PROFCODE

	movl ARG_B, B
	movl ARG_LDC, LDC

#ifdef TRMMKERNEL
	movl OFFSET, %eax
#ifndef LEFT
	negl %eax
#endif
	movl %eax, KK
#endif

	leal (, LDC, SIZE), LDC

	movl N, %eax
	sarl $1, %eax
	movl %eax, J
	jle .L30
	ALIGN_2
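
/* Outer loop: one pass per pair of columns of C (J = n >> 1 iterations).
   BX is pointed at the packed B data that follows the current two-column
   panel and is advanced inside .L11 purely for software prefetching. */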
.L10:
#if defined(TRMMKERNEL) && defined(LEFT)
	movl OFFSET, %eax
	movl %eax, KK
#endif

	movl K, %eax
	sall $BASE_SHIFT + 1, %eax
	leal (B, %eax), %eax
	movl %eax, BX

	movl C, CO1 # coffset = c
	leal (, LDC, 2), %eax
	addl %eax, C

	movl A, AA # aoffset = a

	movl M, %ebx
	sarl $1, %ebx # i = (m >> 1)
	jle .L20
	ALIGN_4
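
/* 2x2 micro-kernel for one tile of C: xmm4/xmm6 accumulate rows 0/1 of the
   first column and xmm5/xmm7 rows 0/1 of the second column, while
   xmm0-xmm3 stage elements of A and the in-flight products against B. */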
.L11:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl B, BB
#else
	movl KK, %eax
	leal (, %eax, SIZE), %eax
	leal (AA, %eax, 2), AA
	leal (B, %eax, 2), BB
#endif

	movl BX, %eax
	prefetcht0 0 * SIZE(%eax)
	subl $-8 * SIZE, BX

	movsd 0 * SIZE(AA), %xmm0
	xorps %xmm2, %xmm2
	xorps %xmm3, %xmm3
	xorps %xmm4, %xmm4
	prefetcht0 3 * SIZE(CO1)
	xorps %xmm5, %xmm5
	prefetcht0 3 * SIZE(CO1, LDC)
	xorps %xmm6, %xmm6
	xorps %xmm7, %xmm7

#ifndef TRMMKERNEL
	movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl K, %eax
	subl KK, %eax
	movl %eax, KKK
#else
	movl KK, %eax
#ifdef LEFT
	addl $2, %eax
#else
	addl $2, %eax
#endif
	movl %eax, KKK
#endif
	sarl $2, %eax
	je .L15
	ALIGN_4
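
/* Main k loop, unrolled four times; each step multiplies one pair of A
   values by one pair of B values and adds into the four accumulators.
   Everything stays in scalar SSE2 (movsd/mulsd/addsd), which the _atom
   suffix suggests is the faster choice for double precision on the
   original Atom core. */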
.L12:
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
	addsd %xmm2, %xmm6
	movsd 1 * SIZE(AA), %xmm2
	movaps %xmm0, %xmm1
	mulsd 0 * SIZE(BB), %xmm0
	addsd %xmm3, %xmm7
	mulsd 1 * SIZE(BB), %xmm1
	addsd %xmm0, %xmm4
	movsd 2 * SIZE(AA), %xmm0
	movaps %xmm2, %xmm3
	mulsd 0 * SIZE(BB), %xmm2
	addsd %xmm1, %xmm5
	mulsd 1 * SIZE(BB), %xmm3
	addsd %xmm2, %xmm6
	movsd 3 * SIZE(AA), %xmm2
	movaps %xmm0, %xmm1
	mulsd 2 * SIZE(BB), %xmm0
	addsd %xmm3, %xmm7
	mulsd 3 * SIZE(BB), %xmm1
	addsd %xmm0, %xmm4
	movsd 4 * SIZE(AA), %xmm0
	movaps %xmm2, %xmm3
	mulsd 2 * SIZE(BB), %xmm2
	addsd %xmm1, %xmm5
	mulsd 3 * SIZE(BB), %xmm3
	addsd %xmm2, %xmm6
	movsd 5 * SIZE(AA), %xmm2
	movaps %xmm0, %xmm1
	mulsd 4 * SIZE(BB), %xmm0
	addsd %xmm3, %xmm7
	mulsd 5 * SIZE(BB), %xmm1
	addsd %xmm0, %xmm4
	movsd 6 * SIZE(AA), %xmm0
	movaps %xmm2, %xmm3
	mulsd 4 * SIZE(BB), %xmm2
	addsd %xmm1, %xmm5
	mulsd 5 * SIZE(BB), %xmm3
	addsd %xmm2, %xmm6
	movsd 7 * SIZE(AA), %xmm2
	movaps %xmm0, %xmm1
	mulsd 6 * SIZE(BB), %xmm0
	addsd %xmm3, %xmm7
	mulsd 7 * SIZE(BB), %xmm1
	addsd %xmm0, %xmm4
	movsd 8 * SIZE(AA), %xmm0
	movaps %xmm2, %xmm3
	mulsd 6 * SIZE(BB), %xmm2
	addsd %xmm1, %xmm5
	mulsd 7 * SIZE(BB), %xmm3
	addl $8 * SIZE, BB
	addl $8 * SIZE, AA
	decl %eax
	jne .L12
	ALIGN_4

.L15:
#ifndef TRMMKERNEL
	movl K, %eax
#else
	movl KKK, %eax
#endif
	andl $3, %eax # if (k & 3)
	BRANCH
	je .L18
	ALIGN_3
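
/* Remainder loop for the k & 3 iterations left over from the unrolled
   loop above. */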
.L16:
	addsd %xmm2, %xmm6
	movsd 1 * SIZE(AA), %xmm2
	movaps %xmm0, %xmm1
	mulsd 0 * SIZE(BB), %xmm0
	addsd %xmm3, %xmm7
	mulsd 1 * SIZE(BB), %xmm1
	addsd %xmm0, %xmm4
	movsd 2 * SIZE(AA), %xmm0
	movaps %xmm2, %xmm3
	mulsd 0 * SIZE(BB), %xmm2
	addsd %xmm1, %xmm5
	mulsd 1 * SIZE(BB), %xmm3
	addl $2 * SIZE, AA
	addl $2 * SIZE, BB
	decl %eax
	jg .L16
	ALIGN_4
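
/* Tile write-back: fold in the last two pending products, scale the 2x2
   tile by alpha, add the existing C entries (plain GEMM only) and store.
   For the TRMM build the C entries are not added, and AA/BB and KK are
   adjusted for the next tile instead. */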
.L18:
	movsd ALPHA, %xmm0

	addsd %xmm2, %xmm6
	addsd %xmm3, %xmm7

	mulsd %xmm0, %xmm4
	mulsd %xmm0, %xmm5
	mulsd %xmm0, %xmm6
	mulsd %xmm0, %xmm7

#ifndef TRMMKERNEL
	addsd 0 * SIZE(CO1), %xmm4
	addsd 1 * SIZE(CO1), %xmm6
	addsd 0 * SIZE(CO1, LDC), %xmm5
	addsd 1 * SIZE(CO1, LDC), %xmm7
#endif

	movsd %xmm4, 0 * SIZE(CO1)
	movsd %xmm6, 1 * SIZE(CO1)
	movsd %xmm5, 0 * SIZE(CO1, LDC)
	movsd %xmm7, 1 * SIZE(CO1, LDC)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl K, %eax
	subl KKK, %eax
	leal (,%eax, SIZE), %eax
	leal (AA, %eax, 2), AA
	leal (BB, %eax, 2), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl $2, KK
#endif

	addl $2 * SIZE, CO1
	decl %ebx
	jg .L11
	ALIGN_4
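
/* Leftover row when M is odd: a 1x2 kernel that computes one element in
   each of the two current columns of C (accumulators xmm4 and xmm5). */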
.L20:
	movl M, %ebx
	testl $1, %ebx
	jle .L29

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl B, BB
#else
	movl KK, %eax
	leal (, %eax, SIZE), %eax
	leal (AA, %eax, 1), AA
	leal (B, %eax, 2), BB
#endif

	movsd 0 * SIZE(AA), %xmm0
	xorps %xmm2, %xmm2
	xorps %xmm3, %xmm3
	xorps %xmm4, %xmm4
	xorps %xmm5, %xmm5

#ifndef TRMMKERNEL
	movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl K, %eax
	subl KK, %eax
	movl %eax, KKK
#else
	movl KK, %eax
#ifdef LEFT
	addl $1, %eax
#else
	addl $2, %eax
#endif
	movl %eax, KKK
#endif
	sarl $2, %eax
	je .L25
	ALIGN_4

.L22:
	addsd %xmm2, %xmm4
	movsd 0 * SIZE(BB), %xmm2
	addsd %xmm3, %xmm5
	movsd 1 * SIZE(BB), %xmm3
	mulsd %xmm0, %xmm2
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
	mulsd %xmm0, %xmm3
	movsd 1 * SIZE(AA), %xmm0
	addsd %xmm2, %xmm4
	movsd 2 * SIZE(BB), %xmm2
	addsd %xmm3, %xmm5
	movsd 3 * SIZE(BB), %xmm3
	mulsd %xmm0, %xmm2
	mulsd %xmm0, %xmm3
	movsd 2 * SIZE(AA), %xmm0
	addsd %xmm2, %xmm4
	movsd 4 * SIZE(BB), %xmm2
	addsd %xmm3, %xmm5
	movsd 5 * SIZE(BB), %xmm3
	mulsd %xmm0, %xmm2
	mulsd %xmm0, %xmm3
	movsd 3 * SIZE(AA), %xmm0
	addsd %xmm2, %xmm4
	movsd 6 * SIZE(BB), %xmm2
	addsd %xmm3, %xmm5
	movsd 7 * SIZE(BB), %xmm3
	mulsd %xmm0, %xmm2
	mulsd %xmm0, %xmm3
	movsd 4 * SIZE(AA), %xmm0
	addl $4 * SIZE, AA
	addl $8 * SIZE, BB
	decl %eax
	jne .L22
	ALIGN_4

.L25:
#ifndef TRMMKERNEL
	movl K, %eax
#else
	movl KKK, %eax
#endif
	andl $3, %eax # if (k & 3)
	BRANCH
	je .L28
	ALIGN_3

.L26:
	addsd %xmm2, %xmm4
	movsd 0 * SIZE(BB), %xmm2
	addsd %xmm3, %xmm5
	movsd 1 * SIZE(BB), %xmm3
	mulsd %xmm0, %xmm2
	mulsd %xmm0, %xmm3
	movsd 1 * SIZE(AA), %xmm0
	addl $1 * SIZE, AA
	addl $2 * SIZE, BB
	decl %eax
	jg .L26
	ALIGN_4

.L28:
	movsd ALPHA, %xmm0

	addsd %xmm2, %xmm4
	addsd %xmm3, %xmm5

	mulsd %xmm0, %xmm4
	mulsd %xmm0, %xmm5

#ifndef TRMMKERNEL
	addsd 0 * SIZE(CO1), %xmm4
	addsd 0 * SIZE(CO1, LDC), %xmm5
#endif

	movsd %xmm4, 0 * SIZE(CO1)
	movsd %xmm5, 0 * SIZE(CO1, LDC)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl K, %eax
	subl KKK, %eax
	leal (,%eax, SIZE), %eax
	leal (AA, %eax, 1), AA
	leal (BB, %eax, 2), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl $1, KK
#endif

	addl $1 * SIZE, CO1
	ALIGN_4

.L29:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addl $2, KK
#endif

	movl BB, B
	decl J
	jg .L10
	ALIGN_4
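
/* Leftover column when N is odd: the same structure repeated for a single
   column of C, first a 2x1 kernel over pairs of rows (.L31), then a 1x1
   kernel for a final odd row (.L40). */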
.L30:
	testl $1, N
	je .L999

#if defined(TRMMKERNEL) && defined(LEFT)
	movl OFFSET, %eax
	movl %eax, KK
#endif

	movl C, CO1
	addl LDC, C

	movl A, AA

	movl M, %ebx
	sarl $1, %ebx
	jle .L40
	ALIGN_4

.L31:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl B, BB
#else
	movl KK, %eax
	leal (, %eax, SIZE), %eax
	leal (AA, %eax, 2), AA
	leal (B, %eax, 1), BB
#endif

	movsd 0 * SIZE(BB), %xmm1
	xorps %xmm0, %xmm0
	prefetcht0 3 * SIZE(CO1)
	xorps %xmm2, %xmm2
	xorps %xmm4, %xmm4
	xorps %xmm6, %xmm6

#ifndef TRMMKERNEL
	movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl K, %eax
	subl KK, %eax
	movl %eax, KKK
#else
	movl KK, %eax
#ifdef LEFT
	addl $2, %eax
#else
	addl $1, %eax
#endif
	movl %eax, KKK
#endif
	sarl $2, %eax
	je .L35
	ALIGN_4

.L32:
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
	addsd %xmm0, %xmm4
	movsd 0 * SIZE(AA), %xmm0
	addsd %xmm2, %xmm6
	movsd 1 * SIZE(AA), %xmm2
	mulsd %xmm1, %xmm0
	mulsd %xmm1, %xmm2
	movsd 1 * SIZE(BB), %xmm1
	addsd %xmm0, %xmm4
	movsd 2 * SIZE(AA), %xmm0
	addsd %xmm2, %xmm6
	movsd 3 * SIZE(AA), %xmm2
	mulsd %xmm1, %xmm0
	mulsd %xmm1, %xmm2
	movsd 2 * SIZE(BB), %xmm1
	addsd %xmm0, %xmm4
	movsd 4 * SIZE(AA), %xmm0
	addsd %xmm2, %xmm6
	movsd 5 * SIZE(AA), %xmm2
	mulsd %xmm1, %xmm0
	mulsd %xmm1, %xmm2
	movsd 3 * SIZE(BB), %xmm1
	addsd %xmm0, %xmm4
	movsd 6 * SIZE(AA), %xmm0
	addsd %xmm2, %xmm6
	movsd 7 * SIZE(AA), %xmm2
	mulsd %xmm1, %xmm0
	mulsd %xmm1, %xmm2
	movsd 4 * SIZE(BB), %xmm1
	addl $8 * SIZE, AA
	addl $4 * SIZE, BB
	decl %eax
	jne .L32
	ALIGN_4

.L35:
#ifndef TRMMKERNEL
	movl K, %eax
#else
	movl KKK, %eax
#endif
	andl $3, %eax # if (k & 3)
	BRANCH
	je .L38
	ALIGN_3

.L36:
	addsd %xmm0, %xmm4
	movsd 0 * SIZE(AA), %xmm0
	addsd %xmm2, %xmm6
	movsd 1 * SIZE(AA), %xmm2
	mulsd %xmm1, %xmm0
	mulsd %xmm1, %xmm2
	movsd 1 * SIZE(BB), %xmm1
	addl $2 * SIZE, AA
	addl $1 * SIZE, BB
	decl %eax
	jg .L36
	ALIGN_4

.L38:
	movsd ALPHA, %xmm3

	addsd %xmm0, %xmm4
	addsd %xmm2, %xmm6

	mulsd %xmm3, %xmm4
	mulsd %xmm3, %xmm6

#ifndef TRMMKERNEL
	addsd 0 * SIZE(CO1), %xmm4
	addsd 1 * SIZE(CO1), %xmm6
#endif

	movsd %xmm4, 0 * SIZE(CO1)
	movsd %xmm6, 1 * SIZE(CO1)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl K, %eax
	subl KKK, %eax
	leal (,%eax, SIZE), %eax
	leal (AA, %eax, 2), AA
	leal (BB, %eax, 1), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl $2, KK
#endif

	addl $2 * SIZE, CO1
	decl %ebx
	jg .L31
	ALIGN_4
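
/* Final 1x1 kernel when both M and N are odd: the dot product is split
   across xmm4 and xmm5 and combined before scaling by alpha. */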
.L40:
	movl M, %ebx
	testl $1, %ebx
	jle .L999

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl B, BB
#else
	movl KK, %eax
	leal (, %eax, SIZE), %eax
	leal (AA, %eax, 1), AA
	leal (B, %eax, 1), BB
#endif

	movsd 0 * SIZE(AA), %xmm0
	xorps %xmm4, %xmm4
	movsd 0 * SIZE(BB), %xmm2
	xorps %xmm5, %xmm5

#ifndef TRMMKERNEL
	movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl K, %eax
	subl KK, %eax
	movl %eax, KKK
#else
	movl KK, %eax
#ifdef LEFT
	addl $1, %eax
#else
	addl $1, %eax
#endif
	movl %eax, KKK
#endif
	sarl $2, %eax
	je .L45
	ALIGN_4

.L42:
	mulsd %xmm0, %xmm2
	movsd 1 * SIZE(AA), %xmm0
	addsd %xmm2, %xmm4
	movsd 1 * SIZE(BB), %xmm2
	mulsd %xmm0, %xmm2
	movsd 2 * SIZE(AA), %xmm0
	addsd %xmm2, %xmm5
	movsd 2 * SIZE(BB), %xmm2
	mulsd %xmm0, %xmm2
	movsd 3 * SIZE(AA), %xmm0
	addsd %xmm2, %xmm4
	movsd 3 * SIZE(BB), %xmm2
	mulsd %xmm0, %xmm2
	movsd 4 * SIZE(AA), %xmm0
	addsd %xmm2, %xmm5
	movsd 4 * SIZE(BB), %xmm2
	addl $4 * SIZE, AA
	addl $4 * SIZE, BB
	decl %eax
	jne .L42
	ALIGN_4

.L45:
#ifndef TRMMKERNEL
	movl K, %eax
#else
	movl KKK, %eax
#endif
	andl $3, %eax # if (k & 3)
	BRANCH
	je .L48
	ALIGN_3

.L46:
	mulsd %xmm0, %xmm2
	movsd 1 * SIZE(AA), %xmm0
	addsd %xmm2, %xmm4
	movsd 1 * SIZE(BB), %xmm2
	addl $1 * SIZE, AA
	addl $1 * SIZE, BB
	decl %eax
	jg .L46
	ALIGN_4

.L48:
	movsd ALPHA, %xmm0

	addsd %xmm5, %xmm4
	mulsd %xmm0, %xmm4

#ifndef TRMMKERNEL
	addsd 0 * SIZE(CO1), %xmm4
#endif
	movsd %xmm4, 0 * SIZE(CO1)
	ALIGN_4
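
/* Common exit: restore the callee-saved registers, release the local
   stack area reserved in the prologue and return. */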
.L999:
	popl %ebx
	popl %esi
	popl %edi
	popl %ebp
	addl $ARGS, %esp
	ret
	EPILOGUE