You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gemm_kernel_2x2.S 14 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /*
   * Stack-frame layout used by the kernel (32-bit x86, all offsets from %esp
   * after the prologue has run):
   *
   *   STACK  bytes taken by the four registers the prologue pushes
   *          (%ebp, %edi, %esi, %ebx).
   *   ARGS   bytes of local scratch the prologue reserves with
   *          `subl $ARGS, %esp`; it holds J, BX, KK and KKK.
   *
   * The return address therefore sits at STACK + ARGS(%esp) and the caller's
   * arguments start 4 bytes above it.
   */
#define STACK	16
#define ARGS	16

/* Local scratch slots. */
#define J	 0 + STACK(%esp)	/* remaining iterations of the n >> 1 loop */
#define BX	 4 + STACK(%esp)	/* cursor into B used only for prefetching */
#define KK	 8 + STACK(%esp)	/* TRMM builds only: running k offset */
#define KKK	12 + STACK(%esp)	/* TRMM builds only: k count of the current tile */

/* Incoming arguments. */
#define M	 4 + STACK + ARGS(%esp)
#define N	 8 + STACK + ARGS(%esp)
#define K	12 + STACK + ARGS(%esp)
#define ALPHA	16 + STACK + ARGS(%esp)
/* ALPHA occupies 8 bytes when DOUBLE is defined, 4 bytes otherwise, which
   shifts every argument that follows it. */
#ifdef DOUBLE
#define A	24 + STACK + ARGS(%esp)
#define B	28 + STACK + ARGS(%esp)
#define C	32 + STACK + ARGS(%esp)
#define LDC	36 + STACK + ARGS(%esp)
#define OFFSET	40 + STACK + ARGS(%esp)
#else
#define A	20 + STACK + ARGS(%esp)
#define B	24 + STACK + ARGS(%esp)
#define C	28 + STACK + ARGS(%esp)
#define LDC	32 + STACK + ARGS(%esp)
#define OFFSET	36 + STACK + ARGS(%esp)
#endif

/* Prefetch distance, in elements (it is multiplied by SIZE where it is used). */
#define PREFETCH_OFFSET 48

/* Both arms of the former PENTIUM3/PENTIUMM conditional expanded to the same
   text, so a single definition is equivalent.  The kernel body spells `rep`
   literally and does not reference REP; the macro is kept for compatibility. */
#define REP	rep
  /*
   * GEMM / TRMM inner kernel for the x87 FPU, 2x2 tile (32-bit x86, AT&T syntax).
   *
   * Arguments are read from the stack through the M, N, K, ALPHA, A, B, C,
   * LDC and OFFSET macros.  For every tile the kernel accumulates the
   * products of the A and B streams over k, scales the sums by ALPHA and
   * then
   *   - adds them to C              (TRMMKERNEL not defined), or
   *   - overwrites C with them      (TRMMKERNEL defined).
   *
   * Tiling:
   *   .L34 loop    : pairs of B vectors (n >> 1 iterations)
   *     .MainHead  :   2x2 tiles        (m >> 1 iterations)
   *     .L12       :   1x2 tile         (if m is odd)
   *   .L8          : last single B vector (if n is odd)
   *     .L46       :   2x1 tiles        (m >> 1 iterations)
   *     .L36       :   1x1 tile         (if m is odd)
   *
   * Register roles:
   *   %edx  running A pointer          %ecx  running B pointer
   *   %ebx  start of the current B panel
   *   %edi  running C pointer          %ebp  LDC * SIZE (stride added for the
   *                                          second C vector of a tile)
   *   %esi  m >> 1 loop counter        %eax  scratch / k loop counter
   *
   * FLD, FST, FADD, FMUL, SIZE, PROLOGUE, PROFCODE, EPILOGUE and ALIGN_n come
   * from common.h.  NOTE(review): the x87 stack only balances if FST is the
   * popping store (fstp) -- confirm against common.h.
   *
   * The `rep` prefixes placed in front of non-string instructions are kept
   * exactly as found; presumably they serve as padding -- TODO confirm.
   */
	PROLOGUE

	subl	$ARGS, %esp		# reserve the J / BX / KK / KKK slots

	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

#if defined(TRMMKERNEL) && !defined(LEFT)
	movl	OFFSET, %eax
	negl	%eax
	movl	%eax, KK		# KK = -offset
#endif

	movl	N,   %eax
	movl	LDC, %ebp
	movl	B,   %ebx

	sarl	$1,  %eax		# j = n >> 1
	leal	(, %ebp, SIZE), %ebp	# %ebp = ldc * SIZE
	leal	0(%ecx), %ecx		# NOP
	movl	%eax, J
	testl	%eax, %eax
	je	.L8			# no pair of B vectors to process
	ALIGN_4

/* ---- loop over pairs of B vectors ------------------------------------ */
.L34:
#if defined(TRMMKERNEL) && defined(LEFT)
	movl	OFFSET, %eax
	movl	%eax, KK		# restart KK for every B panel
#endif

	movl	%ebx, BX		# prefetch cursor starts at this panel

	movl	M,   %esi
	movl	A,   %edx		# A is re-read from its start for each panel
	movl	C,   %edi
	sarl	$1,  %esi		# i = m >> 1 (flags feed the je below)
	je	.L12
	ALIGN_4

/* ---- 2x2 tile --------------------------------------------------------- */
.MainHead:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	%ebx, %ecx		# B pointer = start of panel
#else
	/* Skip the first KK k-steps: two elements per step in both A and B. */
	movl	KK,   %eax
	leal	(, %eax, SIZE), %eax
	leal	(%edx, %eax, 2), %edx
	leal	(%ebx, %eax, 2), %ecx
#endif

#ifdef HAVE_SSE
	/* Prefetch B through the BX cursor; how far it advances per tile
	   depends on L2_SIZE. */
	movl	BX, %eax

	prefetcht2  0 * SIZE(%eax)
	prefetcht2  4 * SIZE(%eax)

#if   L2_SIZE > 262144

	subl	$-8 * SIZE, BX

#elif L2_SIZE > 131072

	prefetcht2  8 * SIZE(%eax)
	prefetcht2 12 * SIZE(%eax)

	subl	$-16 * SIZE, BX
#else
	prefetcht2 16 * SIZE(%eax)
	prefetcht2 20 * SIZE(%eax)
	prefetcht2 24 * SIZE(%eax)
	prefetcht2 28 * SIZE(%eax)

	subl	$-32 * SIZE, BX
#endif
#endif

	fldz				# accumulators 4 and 3
	fldz

	/* %eax = number of k-steps for this tile. */
#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$2, %eax
#else
	addl	$2, %eax
#endif
	movl	%eax, KKK
#endif

	fldz				# accumulators 2 and 1
	fldz

	/* Preload so that on loop entry the x87 stack is
	     st0 = a[0]  st1 = b[0]  st2 = a[4]  st3 = b[4]
	     st4..st7 = sums of a0*b0, a0*b1, a1*b0, a1*b1
	   (a0/a1 and b0/b1 being the two A and B values of one k-step). */
	FLD	 4 * SIZE(%ecx)		# b5
	FLD	 4 * SIZE(%edx)		# a5
	FLD	 0 * SIZE(%ecx)		# b1
	FLD	 0 * SIZE(%edx)		# a1

#if defined(HAVE_3DNOW)
	prefetchw	2 * SIZE(%edi)
	prefetchw	2 * SIZE(%edi, %ebp, 1)
#elif defined(HAVE_SSE)
	prefetchnta	2 * SIZE(%edi)
	prefetchnta	2 * SIZE(%edi, %ebp, 1)
#endif
	sarl	$2, %eax		# four k-steps per .MainLoop iteration
	je	.L16
	ALIGN_4

/* Four k-steps (8 elements of A and of B) per iteration.  All eight x87
   registers are in use, so each product is formed in place and folded
   into its accumulator with fxch/faddp before the next operand is loaded. */
.MainLoop:
#if defined(HAVE_3DNOW)
	prefetch	(PREFETCH_OFFSET) * SIZE(%ecx)
	nop
#elif defined(HAVE_SSE)
	prefetchnta	(PREFETCH_OFFSET) * SIZE(%ecx)
#ifdef CORE_KATMAI
	prefetcht0	(PREFETCH_OFFSET) * SIZE(%edx)
#endif
#endif

	/* k-step 0 */
	fmul	%st, %st(1)
	FMUL	 1 * SIZE(%ecx)
	fxch	%st(1)
	faddp	%st, %st(4)
	FLD	 0 * SIZE(%ecx)
	fxch	%st(1)
	faddp	%st, %st(5)
	FLD	 1 * SIZE(%edx)
	fmul	%st, %st(1)
	FMUL	 1 * SIZE(%ecx)
	fxch	%st(1)
	faddp	%st, %st(6)
	FLD	 2 * SIZE(%ecx)
	fxch	%st(1)
	faddp	%st, %st(7)
	FLD	 2 * SIZE(%edx)

	/* k-step 1 */
	fmul	%st, %st(1)
	FMUL	 3 * SIZE(%ecx)
	fxch	%st(1)
	faddp	%st, %st(4)
	FLD	 2 * SIZE(%ecx)
	fxch	%st(1)
	faddp	%st, %st(5)
	FLD	 3 * SIZE(%edx)
	fmul	%st, %st(1)
	FMUL	 3 * SIZE(%ecx)
	fxch	%st(1)
	faddp	%st, %st(6)
	FLD	 8 * SIZE(%ecx)		# preload for the next iteration
	fxch	%st(1)
	faddp	%st, %st(7)
	FLD	 8 * SIZE(%edx)		# preload for the next iteration
	fxch	%st(2)			# bring the preloaded a[4] to st0

#if !defined(HAVE_3DNOW) && defined(HAVE_SSE) && defined(DOUBLE)
	prefetchnta	(PREFETCH_OFFSET + 4) * SIZE(%ecx)
#ifdef CORE_KATMAI
	prefetcht0	(PREFETCH_OFFSET + 4) * SIZE(%edx)
#endif
#endif

	/* k-step 2 (operands live in st0 / st3 here) */
	fmul	%st, %st(3)
	FMUL	 5 * SIZE(%ecx)
	fxch	%st(3)
	faddp	%st, %st(4)
	FLD	 4 * SIZE(%ecx)
	fxch	%st(3)
	faddp	%st, %st(5)
	FLD	 5 * SIZE(%edx)
	fmul	%st, %st(3)
	FMUL	 5 * SIZE(%ecx)
	fxch	%st(3)
	faddp	%st, %st(6)
	FLD	 6 * SIZE(%ecx)
	fxch	%st(3)
	faddp	%st, %st(7)
	FLD	 6 * SIZE(%edx)

	/* k-step 3 */
	fmul	%st, %st(3)
	FMUL	 7 * SIZE(%ecx)
	fxch	%st(3)
	faddp	%st, %st(4)
	FLD	 6 * SIZE(%ecx)
	fxch	%st(3)
	faddp	%st, %st(5)
	FLD	 7 * SIZE(%edx)
	fmul	%st, %st(3)
	FMUL	 7 * SIZE(%ecx)
	fxch	%st(3)
	faddp	%st, %st(6)
	FLD	12 * SIZE(%ecx)		# becomes b[4] after the pointer bump
	fxch	%st(3)
	faddp	%st, %st(7)
	FLD	12 * SIZE(%edx)		# becomes a[4] after the pointer bump
	fxch	%st(2)			# restore the loop-entry stack layout

	subl	$-8 * SIZE, %ecx	# pointers += 8 elements
	subl	$-8 * SIZE, %edx
	decl	%eax			# l --
	jne	.MainLoop
	ALIGN_4

.L16:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	andl	$3,  %eax		# leftover k-steps (count & 3)
	je	.L21
	ALIGN_4

/* One k-step per iteration; same stack layout as .MainLoop. */
.SubLoop:
	fmul	%st, %st(1)
	FMUL	 1 * SIZE(%ecx)
	fxch	%st(1)
	faddp	%st, %st(4)
	FLD	 0 * SIZE(%ecx)
	fxch	%st(1)
	faddp	%st, %st(5)
	FLD	 1 * SIZE(%edx)
	fmul	%st, %st(1)
	FMUL	 1 * SIZE(%ecx)
	fxch	%st(1)
	faddp	%st, %st(6)
	FLD	 2 * SIZE(%ecx)
	fxch	%st(1)
	faddp	%st, %st(7)
	FLD	 2 * SIZE(%edx)

	addl	$2 * SIZE,%ecx
	addl	$2 * SIZE,%edx
	decl	%eax
	jne	 .SubLoop
	ALIGN_4

.L21:
	/* Drop the four preloaded operands; only the accumulators remain. */
	ffreep	%st(0)
	ffreep	%st(0)
	ffreep	%st(0)
	ffreep	%st(0)

	/* Scale all four sums by alpha; the final fmulp also pops alpha. */
	FLD	ALPHA
	fmul	%st, %st(4)
	fmul	%st, %st(1)
	fmul	%st, %st(2)
	fmulp	%st, %st(3)

#ifndef TRMMKERNEL
	FADD	0 * SIZE(%edi)
	FST	0 * SIZE(%edi)
	FADD	0 * SIZE(%edi,%ebp)
	FST	0 * SIZE(%edi,%ebp)
	FADD	1 * SIZE(%edi)
	FST	1 * SIZE(%edi)
	FADD	1 * SIZE(%edi,%ebp)
	FST	1 * SIZE(%edi,%ebp)
#else
	FST	0 * SIZE(%edi)
	FST	0 * SIZE(%edi,%ebp)
	FST	1 * SIZE(%edi)
	FST	1 * SIZE(%edi,%ebp)
#endif

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	/* Step over the K - KKK k-steps this tile did not consume. */
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(%edx, %eax, 2), %edx
	leal	(%ecx, %eax, 2), %ecx
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$2, KK
#endif

	addl	$2 * SIZE, %edi		# next pair of C elements
	rep
	decl	%esi			# i --
	rep
	jne	.MainHead
	ALIGN_4

/* ---- 1x2 tile (m odd) ------------------------------------------------- */
.L12:
	movl	 M, %eax
	andl	$1, %eax
	je	.L27

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	%ebx, %ecx
#else
	/* Skip KK k-steps: one A element and two B elements per step. */
	movl	KK,   %eax
	leal	(, %eax, SIZE), %eax
	leal	(%edx, %eax, 1), %edx
	leal	(%ebx, %eax, 2), %ecx
#endif

	fldz				# two accumulators
	fldz

	FLD	0 * SIZE(%edx)		# preload a[0]; stack: a, acc0, acc1

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$1, %eax
#else
	addl	$2, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$1,%eax			# two k-steps per .L55 iteration
	je	.L54
	ALIGN_4

.L55:
	FLD	0 * SIZE(%ecx)		# b[0]
	rep
	fmul	%st(1), %st		# a * b[0]
	faddp	%st, %st(2)		# acc0 += a * b[0]

	FMUL	1 * SIZE(%ecx)		# st0 = a * b[1]
	faddp	%st, %st(2)		# acc1 += a * b[1]
	FLD	1 * SIZE(%edx)		# next a

	FLD	2 * SIZE(%ecx)		# b[2]
	rep
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FMUL	3 * SIZE(%ecx)		# st0 = a * b[3]
	faddp	%st, %st(2)
	FLD	2 * SIZE(%edx)		# preload a for the next iteration

	addl	$2 * SIZE, %edx
	addl	$4 * SIZE, %ecx
	decl	%eax
	jne	.L55
	ALIGN_4

.L54:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	andl	$1,%eax			# one odd k-step left?
	je	.L33
	ALIGN_4

	FLD	0 * SIZE(%ecx)		# b[0]
	rep
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FMUL	1 * SIZE(%ecx)		# st0 = a * b[1]
	faddp	%st, %st(2)
	FLD	1 * SIZE(%edx)		# keeps the stack depth .L33 expects

	addl	$1 * SIZE, %edx
	addl	$2 * SIZE, %ecx
	ALIGN_4

.L33:
	ffreep	%st(0)			# drop the preloaded a

	FLD	ALPHA
	fmul	%st, %st(2)
	fmulp	%st, %st(1)		# scales the second sum and pops alpha

#ifndef TRMMKERNEL
	FADD	(%edi)
	FST	(%edi)
	FADD	(%edi,%ebp)
	FST	(%edi,%ebp)
#else
	FST	(%edi)
	FST	(%edi,%ebp)
#endif

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(%edx, %eax, 1), %edx
	leal	(%ecx, %eax, 2), %ecx
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$1, KK
#endif
	ALIGN_4

/* ---- advance to the next pair of B vectors ----------------------------- */
.L27:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addl	$2, KK
#endif

	leal	(, %ebp, 2), %eax
	addl	%eax, C			# C += 2 * ldc (updates the stack slot)
	movl	%ecx, %ebx		# next B panel starts where this one ended
	decl	J			# j --
	jne	.L34
	ALIGN_4

/* ---- last single B vector (n odd) -------------------------------------- */
.L8:
	movl	N, %eax
	andl	$1, %eax
	je	.End

#if defined(TRMMKERNEL) && defined(LEFT)
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	movl	C, %edi
	movl	A, %edx

	movl	M,  %esi
	sarl	$1, %esi		# i = m >> 1
	je	.L36
	ALIGN_4

/* ---- 2x1 tile ---------------------------------------------------------- */
.L46:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	%ebx, %ecx
#else
	/* Skip KK k-steps: two A elements and one B element per step. */
	movl	KK,   %eax
	leal	(, %eax, SIZE), %eax
	leal	(%edx, %eax, 2), %edx
	leal	(%ebx, %eax, 1), %ecx
#endif

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$2, %eax
#else
	addl	$1, %eax
#endif
	movl	%eax, KKK
#endif

	/* The x87 loads below leave EFLAGS alone, so the je still tests the
	   result of the sarl. */
	fldz
	sarl	$1, %eax		# two k-steps per .L57 iteration
	fldz
	FLD	0 * SIZE(%ecx)		# preload b[0]; stack: b, acc0, acc1
	je	.L56
	ALIGN_4

.L57:
	FLD	0 * SIZE(%edx)		# a[0]
	fmul	%st(1), %st
	faddp	%st, %st(2)		# acc0 += a[0] * b

	FMUL	1 * SIZE(%edx)		# st0 = b * a[1]
	faddp	%st, %st(2)		# acc1 += a[1] * b
	FLD	1 * SIZE(%ecx)		# next b

	FLD	2 * SIZE(%edx)		# a[2]
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FMUL	3 * SIZE(%edx)		# st0 = b * a[3]
	faddp	%st, %st(2)
	FLD	2 * SIZE(%ecx)		# preload b for the next iteration

	addl	$4 * SIZE,%edx
	addl	$2 * SIZE,%ecx
	decl	%eax
	jne	.L57
	ALIGN_4

.L56:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	andl	$1, %eax		# one odd k-step left?
	je	.L45
	ALIGN_4

	FLD	0 * SIZE(%edx)		# a[0]
	fmul	%st(1), %st
	faddp	%st, %st(2)

	FMUL	1 * SIZE(%edx)		# st0 = b * a[1]
	faddp	%st, %st(2)
	/* This value is discarded at .L45; it is loaded only to keep the
	   stack depth uniform.  Read the element right after the one just
	   consumed (as the 1x2 tail does for A) so B is not over-read by
	   more than one element. */
	FLD	1 * SIZE(%ecx)

	addl	$2 * SIZE,%edx
	addl	$1 * SIZE,%ecx
	ALIGN_4

.L45:
	ffreep	%st(0)			# drop the preloaded b

	FLD	ALPHA
	fmul	%st, %st(1)
	fmulp	%st, %st(2)		# scales the second sum and pops alpha

#ifndef TRMMKERNEL
	FADD	0 * SIZE(%edi)
	FST	0 * SIZE(%edi)
	FADD	1 * SIZE(%edi)
	FST	1 * SIZE(%edi)
#else
	FST	0 * SIZE(%edi)
	FST	1 * SIZE(%edi)
#endif

	addl	$2 * SIZE, %edi

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(%edx, %eax, 2), %edx
	leal	(%ecx, %eax, 1), %ecx
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$2, KK
#endif

	decl	%esi			# i --
	jne	.L46
	ALIGN_4

/* ---- 1x1 tile (m odd, n odd) ------------------------------------------- */
.L36:
	movl	M,  %eax
	andl	$1, %eax		# m & 1
	je	.End

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	%ebx, %ecx
#else
	movl	KK,   %eax
	leal	(, %eax, SIZE), %eax
	leal	(%edx, %eax, 1), %edx
	leal	(%ebx, %eax, 1), %ecx
#endif

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$1, %eax
#else
	addl	$1, %eax
#endif
	movl	%eax, KKK
#endif

	fldz				# single accumulator
	/* .L51 tests its counter only at the bottom, so a count of zero
	   would wrap around and run ~2^32 iterations.  Skip the loop when
	   there is nothing to accumulate, as every other tile does. */
	testl	%eax, %eax
	jle	.L52
	ALIGN_3

.L51:
	FLD	(%edx)
	FMUL	(%ecx)
	addl	$1 * SIZE,%edx
	addl	$1 * SIZE,%ecx
	faddp	%st,%st(1)		# acc += a * b
	decl	%eax
	jne	.L51

.L52:
	FMUL	ALPHA
#ifndef TRMMKERNEL
	FADD	(%edi)
	FST	(%edi)
#else
	FST	(%edi)
#endif
	ALIGN_4

.End:
	/* Restore the registers in reverse push order and release the
	   scratch area. */
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	addl	$ARGS, %esp
	ret

	EPILOGUE