You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

gemm_kernel_1x4.S 19 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /*
   * Stack frame, as offsets from %esp once the prologue has run:
   *   [0, STACK)             the four registers saved with pushl
   *   [STACK, STACK + ARGS)  scratch slots J, I, KK, KKK
   *   STACK + ARGS + 4 ...   the stack-passed arguments; the extra 4 presumably
   *                          skips the return address (assumes PROLOGUE pushes
   *                          nothing itself - confirm in common.h)
   */
  40. #define STACK 16
  41. #define ARGS 16
  /* J: number of 4-column blocks still to do; I: number of rows still to do. */
  42. #define J 0 + STACK(%esp)
  43. #define I 4 + STACK(%esp)
  /* KK and KKK are only read or written in TRMMKERNEL builds: KK is the running
     offset derived from OFFSET, KKK the k count used for the current tile. */
  44. #define KK 8 + STACK(%esp)
  45. #define KKK 12 + STACK(%esp)
  /* Incoming arguments.  ALPHA sits at offset 16 in both layouts; with DOUBLE
     defined every argument after it is 4 bytes higher (STACK_A at 24 rather
     than 20). */
  46. #define M 4 + STACK + ARGS(%esp)
  47. #define N 8 + STACK + ARGS(%esp)
  48. #define K 12 + STACK + ARGS(%esp)
  49. #define ALPHA 16 + STACK + ARGS(%esp)
  50. #ifdef DOUBLE
  51. #define STACK_A 24 + STACK + ARGS(%esp)
  52. #define STACK_B 28 + STACK + ARGS(%esp)
  53. #define C 32 + STACK + ARGS(%esp)
  54. #define STACK_LDC 36 + STACK + ARGS(%esp)
  55. #define OFFSET 40 + STACK + ARGS(%esp)
  56. #else
  57. #define STACK_A 20 + STACK + ARGS(%esp)
  58. #define STACK_B 24 + STACK + ARGS(%esp)
  59. #define C 28 + STACK + ARGS(%esp)
  60. #define STACK_LDC 32 + STACK + ARGS(%esp)
  61. #define OFFSET 36 + STACK + ARGS(%esp)
  62. #endif
  /* Register roles: A and B are the working pointers into the two input panels,
     BB is B + 32 * SIZE (second half of the unrolled KERNEL), LDC is the
     column stride of C after scaling by SIZE, BX is a prefetch cursor that
     runs ahead in B. */
  63. #define A %edx
  64. #define B %ecx
  65. #define BB %ebx
  66. #define LDC %ebp
  67. #define BX %esi
  /* Distance, in elements of A, between the current position and the prefetch
     issued by KERNEL. */
  68. #define PREFETCHSIZE (8 * 5 + 4)
  /* Byte offsets folded into every A / B displacement.  The function entry
     subtracts (AOFFSET - 16 * SIZE) and (BOFFSET - 16 * SIZE) from the stored
     pointers, so "-16 * SIZE + AOFFSET(A)" addresses the first element.
     NOTE(review): the reason for the odd byte bias is not visible in this
     file. */
  69. #define AOFFSET 1
  70. #define BOFFSET -7
  /* PREFETCH expands to the 3DNow! "prefetch" when HAVE_3DNOW is set and to
     prefetcht0 otherwise. */
  71. #ifdef HAVE_3DNOW
  72. #define PREFETCH prefetch
  73. #else
  74. #define PREFETCH prefetcht0
  75. #endif
  /*
   * KERNEL: 16 consecutive k steps of the 1x4 update, fully unrolled.
   *
   * Addressing: %eax is a byte index shared by both panels; A is read at
   * (A, %eax, 1) and B at (B, %eax, 4) / (BB, %eax, 4), i.e. B moves four
   * elements for every element of A.  Steps 0-7 read B through B, steps 8-15
   * through BB.
   *
   * x87 stack expected on entry (and re-established on exit for the next
   * expansion):
   *   st0       next B element   (-16 * SIZE + BOFFSET relative to B + 4 * %eax)
   *   st1..st3  A elements at -16, -8 and 0 * SIZE + AOFFSET (loaded ahead)
   *   st4..st7  the four running sums, one per column of the tile
   * The two fxch instructions rotate the look-ahead A values into place.
   *
   * Each step multiplies one A element by four B elements and adds the
   * products into st4..st7 (indices shift as values are pushed and popped).
   *
   * The final "subl $-16 * SIZE, %eax" advances the index by 16 elements and
   * leaves the flags for the conditional jump that follows each expansion.
   * NOTE(review): sub with a negative immediate is presumably used because
   * -128 fits a sign-extended 8-bit immediate while +128 does not - confirm.
   */
  76. #define KERNEL \
  77. PREFETCH PREFETCHSIZE * SIZE + AOFFSET(A, %eax, 1);\
  78. fmul %st(1), %st;\
  79. faddp %st, %st(4);\
  80. FLD -15 * SIZE + BOFFSET(B, %eax, 4);\
  81. fmul %st(1), %st;\
  82. faddp %st, %st(5);\
  83. FLD -14 * SIZE + BOFFSET(B, %eax, 4);\
  84. fmul %st(1), %st;\
  85. faddp %st, %st(6);\
  86. FMUL -13 * SIZE + BOFFSET(B, %eax, 4);\
  87. faddp %st, %st(6);\
  88. FLD -15 * SIZE + AOFFSET(A, %eax, 1);\
  89. FLD -12 * SIZE + BOFFSET(B, %eax, 4);\
  90. fmul %st(1), %st;\
  91. faddp %st, %st(4);\
  92. FLD -11 * SIZE + BOFFSET(B, %eax, 4);\
  93. fmul %st(1), %st;\
  94. faddp %st, %st(5);\
  95. FLD -10 * SIZE + BOFFSET(B, %eax, 4);\
  96. fmul %st(1), %st;\
  97. faddp %st, %st(6);\
  98. FMUL -9 * SIZE + BOFFSET(B, %eax, 4);\
  99. faddp %st, %st(6);\
  100. FLD -14 * SIZE + AOFFSET(A, %eax, 1);\
  101. FLD -8 * SIZE + BOFFSET(B, %eax, 4);\
  102. fmul %st(1), %st;\
  103. faddp %st, %st(4);\
  104. FLD -7 * SIZE + BOFFSET(B, %eax, 4);\
  105. fmul %st(1), %st;\
  106. faddp %st, %st(5);\
  107. FLD -6 * SIZE + BOFFSET(B, %eax, 4);\
  108. fmul %st(1), %st;\
  109. faddp %st, %st(6);\
  110. FMUL -5 * SIZE + BOFFSET(B, %eax, 4);\
  111. faddp %st, %st(6);\
  112. FLD -13 * SIZE + AOFFSET(A, %eax, 1);\
  113. FLD -4 * SIZE + BOFFSET(B, %eax, 4);\
  114. fmul %st(1), %st;\
  115. faddp %st, %st(4);\
  116. FLD -3 * SIZE + BOFFSET(B, %eax, 4);\
  117. fmul %st(1), %st;\
  118. faddp %st, %st(5);\
  119. FLD -2 * SIZE + BOFFSET(B, %eax, 4);\
  120. fmul %st(1), %st;\
  121. faddp %st, %st(6);\
  122. FMUL -1 * SIZE + BOFFSET(B, %eax, 4);\
  123. faddp %st, %st(6);\
  124. FLD -12 * SIZE + AOFFSET(A, %eax, 1);\
  125. FLD 0 * SIZE + BOFFSET(B, %eax, 4);\
  126. fmul %st(1), %st;\
  127. faddp %st, %st(4);\
  128. FLD 1 * SIZE + BOFFSET(B, %eax, 4);\
  129. fmul %st(1), %st;\
  130. faddp %st, %st(5);\
  131. FLD 2 * SIZE + BOFFSET(B, %eax, 4);\
  132. fmul %st(1), %st;\
  133. faddp %st, %st(6);\
  134. FMUL 3 * SIZE + BOFFSET(B, %eax, 4);\
  135. faddp %st, %st(6);\
  136. FLD -11 * SIZE + AOFFSET(A, %eax, 1);\
  137. FLD 4 * SIZE + BOFFSET(B, %eax, 4);\
  138. fmul %st(1), %st;\
  139. faddp %st, %st(4);\
  140. FLD 5 * SIZE + BOFFSET(B, %eax, 4);\
  141. fmul %st(1), %st;\
  142. faddp %st, %st(5);\
  143. FLD 6 * SIZE + BOFFSET(B, %eax, 4);\
  144. fmul %st(1), %st;\
  145. faddp %st, %st(6);\
  146. FMUL 7 * SIZE + BOFFSET(B, %eax, 4);\
  147. faddp %st, %st(6);\
  148. FLD -10 * SIZE + AOFFSET(A, %eax, 1);\
  149. FLD 8 * SIZE + BOFFSET(B, %eax, 4);\
  150. fmul %st(1), %st;\
  151. faddp %st, %st(4);\
  152. FLD 9 * SIZE + BOFFSET(B, %eax, 4);\
  153. fmul %st(1), %st;\
  154. faddp %st, %st(5);\
  155. FLD 10 * SIZE + BOFFSET(B, %eax, 4);\
  156. fmul %st(1), %st;\
  157. faddp %st, %st(6);\
  158. FMUL 11 * SIZE + BOFFSET(B, %eax, 4);\
  159. faddp %st, %st(6);\
  160. FLD -9 * SIZE + AOFFSET(A, %eax, 1);\
  161. FLD 12 * SIZE + BOFFSET(B, %eax, 4);\
  162. fmul %st(1), %st;\
  163. faddp %st, %st(4);\
  164. FLD 13 * SIZE + BOFFSET(B, %eax, 4);\
  165. fmul %st(1), %st;\
  166. faddp %st, %st(5);\
  167. FLD 14 * SIZE + BOFFSET(B, %eax, 4);\
  168. fmul %st(1), %st;\
  169. faddp %st, %st(6);\
  170. FMUL 15 * SIZE + BOFFSET(B, %eax, 4);\
  171. faddp %st, %st(6);\
  172. FLD 8 * SIZE + AOFFSET(A, %eax, 1);\
  173. fxch %st(1);\
  174. FLD 16 * SIZE + BOFFSET(B, %eax, 4);\
  175. fmul %st(1), %st;\
  176. faddp %st, %st(4);\
  177. FLD -15 * SIZE + BOFFSET(BB, %eax, 4);\
  178. fmul %st(1), %st;\
  179. PREFETCH (PREFETCHSIZE + 8) * SIZE + AOFFSET(A, %eax, 1);\
  180. faddp %st, %st(5);\
  181. FLD -14 * SIZE + BOFFSET(BB, %eax, 4);\
  182. fmul %st(1), %st;\
  183. faddp %st, %st(6);\
  184. FMUL -13 * SIZE + BOFFSET(BB, %eax, 4);\
  185. faddp %st, %st(6);\
  186. FLD -7 * SIZE + AOFFSET(A, %eax, 1);\
  187. FLD -12 * SIZE + BOFFSET(BB, %eax, 4);\
  188. fmul %st(1), %st;\
  189. faddp %st, %st(4);\
  190. FLD -11 * SIZE + BOFFSET(BB, %eax, 4);\
  191. fmul %st(1), %st;\
  192. faddp %st, %st(5);\
  193. FLD -10 * SIZE + BOFFSET(BB, %eax, 4);\
  194. fmul %st(1), %st;\
  195. faddp %st, %st(6);\
  196. FMUL -9 * SIZE + BOFFSET(BB, %eax, 4);\
  197. faddp %st, %st(6);\
  198. FLD -6 * SIZE + AOFFSET(A, %eax, 1);\
  199. FLD -8 * SIZE + BOFFSET(BB, %eax, 4);\
  200. fmul %st(1), %st;\
  201. faddp %st, %st(4);\
  202. FLD -7 * SIZE + BOFFSET(BB, %eax, 4);\
  203. fmul %st(1), %st;\
  204. faddp %st, %st(5);\
  205. FLD -6 * SIZE + BOFFSET(BB, %eax, 4);\
  206. fmul %st(1), %st;\
  207. faddp %st, %st(6);\
  208. FMUL -5 * SIZE + BOFFSET(BB, %eax, 4);\
  209. faddp %st, %st(6);\
  210. FLD -5 * SIZE + AOFFSET(A, %eax, 1);\
  211. FLD -4 * SIZE + BOFFSET(BB, %eax, 4);\
  212. fmul %st(1), %st;\
  213. faddp %st, %st(4);\
  214. FLD -3 * SIZE + BOFFSET(BB, %eax, 4);\
  215. fmul %st(1), %st;\
  216. faddp %st, %st(5);\
  217. FLD -2 * SIZE + BOFFSET(BB, %eax, 4);\
  218. fmul %st(1), %st;\
  219. faddp %st, %st(6);\
  220. FMUL -1 * SIZE + BOFFSET(BB, %eax, 4);\
  221. faddp %st, %st(6);\
  222. FLD -4 * SIZE + AOFFSET(A, %eax, 1);\
  223. FLD 0 * SIZE + BOFFSET(BB, %eax, 4);\
  224. fmul %st(1), %st;\
  225. faddp %st, %st(4);\
  226. FLD 1 * SIZE + BOFFSET(BB, %eax, 4);\
  227. fmul %st(1), %st;\
  228. faddp %st, %st(5);\
  229. FLD 2 * SIZE + BOFFSET(BB, %eax, 4);\
  230. fmul %st(1), %st;\
  231. faddp %st, %st(6);\
  232. FMUL 3 * SIZE + BOFFSET(BB, %eax, 4);\
  233. faddp %st, %st(6);\
  234. FLD -3 * SIZE + AOFFSET(A, %eax, 1);\
  235. FLD 4 * SIZE + BOFFSET(BB, %eax, 4);\
  236. fmul %st(1), %st;\
  237. faddp %st, %st(4);\
  238. FLD 5 * SIZE + BOFFSET(BB, %eax, 4);\
  239. fmul %st(1), %st;\
  240. faddp %st, %st(5);\
  241. FLD 6 * SIZE + BOFFSET(BB, %eax, 4);\
  242. fmul %st(1), %st;\
  243. faddp %st, %st(6);\
  244. FMUL 7 * SIZE + BOFFSET(BB, %eax, 4);\
  245. faddp %st, %st(6);\
  246. FLD -2 * SIZE + AOFFSET(A, %eax, 1);\
  247. FLD 8 * SIZE + BOFFSET(BB, %eax, 4);\
  248. fmul %st(1), %st;\
  249. faddp %st, %st(4);\
  250. FLD 9 * SIZE + BOFFSET(BB, %eax, 4);\
  251. fmul %st(1), %st;\
  252. faddp %st, %st(5);\
  253. FLD 10 * SIZE + BOFFSET(BB, %eax, 4);\
  254. fmul %st(1), %st;\
  255. faddp %st, %st(6);\
  256. FMUL 11 * SIZE + BOFFSET(BB, %eax, 4);\
  257. faddp %st, %st(6);\
  258. FLD -1 * SIZE + AOFFSET(A, %eax, 1);\
  259. FLD 12 * SIZE + BOFFSET(BB, %eax, 4);\
  260. fmul %st(1), %st;\
  261. faddp %st, %st(4);\
  262. FLD 13 * SIZE + BOFFSET(BB, %eax, 4);\
  263. fmul %st(1), %st;\
  264. faddp %st, %st(5);\
  265. FLD 14 * SIZE + BOFFSET(BB, %eax, 4);\
  266. fmul %st(1), %st;\
  267. faddp %st, %st(6);\
  268. FMUL 15 * SIZE + BOFFSET(BB, %eax, 4);\
  269. faddp %st, %st(6);\
  270. FLD 16 * SIZE + AOFFSET(A, %eax, 1);\
  271. fxch %st(2);\
  272. FLD 16 * SIZE + BOFFSET(BB, %eax, 4);\
  273. subl $-16 * SIZE, %eax
  274. /*
  275. The scheduling hint was taken from the following URL:
  276. http://www.netlib.org/atlas/atlas-comm/msg00260.html
  277. */
  /* Function entry: build the frame, prepare the pointers and strides, reject
     empty problems, then dispatch on the number of 4-column blocks. */
  278. PROLOGUE
  279. subl $ARGS, %esp # Generate Stack Frame
  /* Preserve the registers later used as LDC (%ebp), the C pointer (%edi),
     BX (%esi) and BB (%ebx); together they make up the STACK bytes. */
  280. pushl %ebp
  281. pushl %edi
  282. pushl %esi
  283. pushl %ebx
  284. PROFCODE
  /* TRMM, !LEFT variant: KK starts out as -OFFSET. */
  285. #if defined(TRMMKERNEL) && !defined(LEFT)
  286. movl OFFSET, %eax
  287. negl %eax
  288. movl %eax, KK
  289. #endif
  /* LDC := ldc * SIZE (a byte stride, assuming SIZE is the element size
     defined by common.h). */
  290. movl STACK_LDC, LDC
  291. leal (, LDC, SIZE), LDC
  /* Bias the stored A and B pointers in place, so that the displacements
     "-16 * SIZE + AOFFSET" and "-16 * SIZE + BOFFSET" reach element 0. */
  292. subl $(AOFFSET - 16 * SIZE), STACK_A
  293. subl $(BOFFSET - 16 * SIZE), STACK_B
  /* Leave immediately unless M, N and K are all positive. */
  294. movl M, %eax
  295. testl %eax, %eax
  296. jle .L999
  297. movl N, %eax
  298. testl %eax, %eax
  299. jle .L999
  300. movl K, %eax
  301. testl %eax, %eax
  302. jle .L999
  /* J = N >> 2.  The je below consumes the zero flag produced by sarl; the
     movl in between does not modify flags. */
  303. movl N, %eax
  304. sarl $2, %eax
  305. movl %eax, J
  306. je .L20
  /*
   * Four-column blocks: .L11 runs J times, once per block of four columns.
   * Inside it, .L14 runs once per row (I counts down from M) and produces one
   * element in each of the four columns: (%edi), (%edi, LDC), and the same two
   * addresses based at %edi + 2 * LDC.
   */
  307. ALIGN_3
  308. .L11:
  309. #if defined(TRMMKERNEL) && defined(LEFT)
  310. movl OFFSET, %eax
  311. movl %eax, KK
  312. #endif
  313. movl STACK_A, A
  314. movl STACK_B, B
  315. movl C, %edi
  /* BX = B + (K << (BASE_SHIFT + 2)): a cursor used only by the prefetchnta
     at the top of .L14 (assuming BASE_SHIFT is log2(SIZE), this is the end of
     the current four-column panel of B). */
  316. movl K, BX
  317. sall $BASE_SHIFT + 2, BX
  318. addl B, BX
  319. movl M, %eax
  320. movl %eax, I
  321. ALIGN_3
  322. .L14:
  323. prefetchnta -16 * SIZE + BOFFSET(BX)
  324. subl $-8 * SIZE, BX
  /* B is rewound to the start of the panel for every row; A is not reloaded
     and keeps advancing from one row to the next. */
  325. movl STACK_B, B
  326. #if !defined(TRMMKERNEL) || \
  327. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  328. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  329. #else
  /* Remaining TRMM variants: skip the first KK steps of k in both panels. */
  330. movl KK, %eax
  331. leal (, %eax, SIZE), %eax
  332. leal (A, %eax, 1), A
  333. leal (B, %eax, 4), B
  334. #endif
  /* %eax = address of the third column; only used by the prefetches below. */
  335. leal (%edi, LDC, 2), %eax
  /* Build the x87 stack KERNEL expects: four zeroed sums, then three A
     values loaded ahead (0, -8 and -16 * SIZE + AOFFSET), then the first B
     value on top. */
  336. fldz
  337. fldz
  338. fldz
  339. fldz
  340. FLD 0 * SIZE + AOFFSET(A)
  341. FLD -8 * SIZE + AOFFSET(A)
  342. FLD -16 * SIZE + AOFFSET(A)
  343. FLD -16 * SIZE + BOFFSET(B)
  /* Prefetch the four destination elements of C, using whichever prefetch
     instruction the build configuration provides. */
  344. #ifdef HAVE_3DNOW
  345. prefetchw 1 * SIZE(%edi)
  346. prefetchw 2 * SIZE(%edi, LDC)
  347. prefetchw 1 * SIZE(%eax)
  348. prefetchw 2 * SIZE(%eax, LDC)
  349. #elif defined(HAVE_SSE)
  350. prefetcht0 1 * SIZE(%edi)
  351. prefetcht0 2 * SIZE(%edi, LDC)
  352. prefetcht0 1 * SIZE(%eax)
  353. prefetcht0 2 * SIZE(%eax, LDC)
  354. #endif
  /* %eax = number of k steps for this tile: K for plain GEMM, otherwise
     K - KK or KK + 1 / KK + 4 depending on the TRMM variant (also saved in
     KKK). */
  355. #ifndef TRMMKERNEL
  356. movl K, %eax
  357. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  358. movl K, %eax
  359. subl KK, %eax
  360. movl %eax, KKK
  361. #else
  362. movl KK, %eax
  363. #ifdef LEFT
  364. addl $1, %eax
  365. #else
  366. addl $4, %eax
  367. #endif
  368. movl %eax, KKK
  369. #endif
  /* Round the count down to a multiple of 16 and scale it by SIZE.  A and B
     are moved past that part and %eax is negated, so KERNEL indexes backwards
     from the end while %eax climbs to zero.  BB = B + 32 * SIZE.  negl leaves
     ZF set when there is no full chunk of 16, which the je tests. */
  370. andl $-16, %eax
  371. leal (, %eax, SIZE), %eax
  372. leal (A, %eax, 1), A
  373. leal 32 * SIZE(B, %eax, 4), BB
  374. leal (B, %eax, 4), B
  375. negl %eax
  376. NOBRANCH
  377. je .L16
  378. ALIGN_4
  /* Main loop: each KERNEL covers 16 k steps and ends by advancing %eax; the
     jump after it leaves the loop as soon as %eax is no longer negative. */
  379. .L15:
  380. KERNEL
  381. jge .L16
  382. KERNEL
  383. jge .L16
  384. KERNEL
  385. jge .L16
  386. KERNEL
  387. jl .L15
  388. ALIGN_4
  /* Tail: the remaining (count & 15) k steps, one per pass of .L17. */
  389. .L16:
  390. #ifndef TRMMKERNEL
  391. movl K, %eax
  392. #else
  393. movl KKK, %eax
  394. #endif
  395. and $15, %eax
  396. je .L19
  397. ALIGN_4
  398. .L17:
  399. fmul %st(1), %st
  400. faddp %st, %st(4)
  401. FLD -15 * SIZE + BOFFSET(B)
  402. fmul %st(1), %st
  403. faddp %st, %st(5)
  404. FLD -14 * SIZE + BOFFSET(B)
  405. fmul %st(1), %st
  406. faddp %st, %st(6)
  407. FMUL -13 * SIZE + BOFFSET(B)
  408. faddp %st, %st(6)
  409. FLD -15 * SIZE + AOFFSET(A)
  410. FLD -12 * SIZE + BOFFSET(B)
  411. addl $1 * SIZE,A
  412. addl $4 * SIZE,B
  413. decl %eax
  414. jne .L17
  415. ALIGN_4
  /* Drop the four look-ahead values (one B, three A) so that only the four
     sums remain, then scale all of them by ALPHA. */
  416. .L19:
  417. ffreep %st(0)
  418. ffreep %st(0)
  419. ffreep %st(0)
  420. ffreep %st(0)
  421. FLD ALPHA
  422. fmul %st, %st(1)
  423. fmul %st, %st(2)
  424. fmul %st, %st(3)
  425. fmulp %st, %st(4)
  426. leal (%edi, LDC, 2), %eax
  /* GEMM adds the existing C value before storing; TRMM stores the product
     directly.  NOTE(review): this sequence relies on FST (from common.h)
     popping the x87 stack after each store - confirm. */
  427. #ifndef TRMMKERNEL
  428. FADD (%edi)
  429. FST (%edi)
  430. FADD (%edi,LDC)
  431. FST (%edi,LDC)
  432. FADD (%eax)
  433. FST (%eax)
  434. FADD (%eax,LDC)
  435. FST (%eax,LDC)
  436. #else
  437. FST (%edi)
  438. FST (%edi,LDC)
  439. FST (%eax)
  440. FST (%eax,LDC)
  441. #endif
  /* These TRMM variants stopped after KKK steps: advance A and B over the
     remaining K - KKK steps. */
  442. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  443. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  444. movl K, %eax
  445. subl KKK, %eax
  446. leal (,%eax, SIZE), %eax
  447. leal (A, %eax, 1), A
  448. leal (B, %eax, 4), B
  449. #endif
  450. #if defined(TRMMKERNEL) && defined(LEFT)
  451. addl $1, KK
  452. #endif
  /* Next row: move the C pointer down by one element. */
  453. addl $1 * SIZE, %edi
  454. decl I
  455. jne .L14
  456. #if defined(TRMMKERNEL) && !defined(LEFT)
  457. addl $4, KK
  458. #endif
  /* Next column block: C moves by four columns and the stored B pointer is
     replaced by the current B. */
  459. leal (, LDC, 4), %eax
  460. addl %eax, C
  461. movl B, STACK_B
  462. decl J
  463. jne .L11
  /*
   * Two-column remainder, executed when bit 1 of N is set.
   *
   * For each of the M rows, .L24 accumulates the products of that row of A
   * with two columns of B and writes the results to (%edi) and (%edi, LDC).
   * The k loop is unrolled by 8 in .L25, with a one-step tail in .L27.
   *
   * x87 stack inside the loops:
   *   st0       next B element
   *   st1       next A element
   *   st2..st5  four partial sums; even k steps feed the first pair, odd k
   *             steps the second pair, and .L29 folds them into two results.
   *
   * NOTE(review): the stores rely on FST (from common.h) popping the x87
   * stack after each store - confirm.
   */
  ALIGN_4
  .L20:
          movl    N, %eax
          andl    $2, %eax
          je      .L30
  ALIGN_3
  .L21:
  #if defined(TRMMKERNEL) && defined(LEFT)
          movl    OFFSET, %eax
          movl    %eax, KK
  #endif
          movl    STACK_A, A
          movl    STACK_B, B
          movl    C, %edi
          movl    M, %eax
          movl    %eax, I                 /* I = rows still to do */
  ALIGN_3
  .L24:
          /* B is rewound for every row; A keeps advancing from row to row. */
          movl    STACK_B, B
  #if !defined(TRMMKERNEL) || \
      (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
      (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  #else
          /* Remaining TRMM variants: skip the first KK steps of k. */
          movl    KK, %eax
          leal    (, %eax, SIZE), %eax
          leal    (A, %eax, 1), A
          leal    (B, %eax, 2), B
  #endif
          /* Four zeroed partial sums, then the first A and B elements. */
          fldz
          fldz
          fldz
          fldz
          FLD     -16 * SIZE + AOFFSET(A)
          FLD     -16 * SIZE + BOFFSET(B)
          /*
           * Prefetch the two destination elements of C.  prefetchw belongs to
           * the 3DNow! family, so it is emitted only when HAVE_3DNOW is set;
           * SSE-only builds use prefetcht0 and other builds emit nothing.
           * This is the same selection the four-column path (.L14) makes.
           */
  #ifdef HAVE_3DNOW
          prefetchw       1 * SIZE(%edi)
          prefetchw       1 * SIZE(%edi, LDC)
  #elif defined(HAVE_SSE)
          prefetcht0      1 * SIZE(%edi)
          prefetcht0      1 * SIZE(%edi, LDC)
  #endif
          /* %eax = number of k steps for this tile (saved in KKK for TRMM). */
  #ifndef TRMMKERNEL
          movl    K, %eax
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
          movl    K, %eax
          subl    KK, %eax
          movl    %eax, KKK
  #else
          movl    KK, %eax
  #ifdef LEFT
          addl    $1, %eax
  #else
          addl    $2, %eax
  #endif
          movl    %eax, KKK
  #endif
          sarl    $3, %eax                /* number of 8-step chunks */
          je      .L26
  ALIGN_3
  .L25:
          /* k step 0 */
          fmul    %st(1), %st
          faddp   %st, %st(2)
          FMUL    -15 * SIZE + BOFFSET(B)
          faddp   %st, %st(2)
          FLD     -15 * SIZE + AOFFSET(A)
          FLD     -14 * SIZE + BOFFSET(B)
          /* k step 1 */
          fmul    %st(1), %st
          faddp   %st, %st(4)
          FMUL    -13 * SIZE + BOFFSET(B)
          faddp   %st, %st(4)
          FLD     -14 * SIZE + AOFFSET(A)
          FLD     -12 * SIZE + BOFFSET(B)
          /* k step 2 */
          fmul    %st(1), %st
          faddp   %st, %st(2)
          FMUL    -11 * SIZE + BOFFSET(B)
          faddp   %st, %st(2)
          FLD     -13 * SIZE + AOFFSET(A)
          FLD     -10 * SIZE + BOFFSET(B)
          /* k step 3 */
          fmul    %st(1), %st
          faddp   %st, %st(4)
          FMUL    -9 * SIZE + BOFFSET(B)
          faddp   %st, %st(4)
          FLD     -12 * SIZE + AOFFSET(A)
          FLD     -8 * SIZE + BOFFSET(B)
          /* k step 4 */
          fmul    %st(1), %st
          faddp   %st, %st(2)
          FMUL    -7 * SIZE + BOFFSET(B)
          faddp   %st, %st(2)
          FLD     -11 * SIZE + AOFFSET(A)
          FLD     -6 * SIZE + BOFFSET(B)
          /* k step 5 */
          fmul    %st(1), %st
          faddp   %st, %st(4)
          FMUL    -5 * SIZE + BOFFSET(B)
          faddp   %st, %st(4)
          FLD     -10 * SIZE + AOFFSET(A)
          FLD     -4 * SIZE + BOFFSET(B)
          /* k step 6 */
          fmul    %st(1), %st
          faddp   %st, %st(2)
          FMUL    -3 * SIZE + BOFFSET(B)
          faddp   %st, %st(2)
          FLD     -9 * SIZE + AOFFSET(A)
          FLD     -2 * SIZE + BOFFSET(B)
          /* k step 7, then preload the operands of the next chunk */
          fmul    %st(1), %st
          faddp   %st, %st(4)
          FMUL    -1 * SIZE + BOFFSET(B)
          faddp   %st, %st(4)
          FLD     -8 * SIZE + AOFFSET(A)
          FLD     0 * SIZE + BOFFSET(B)
          addl    $ 8 * SIZE, A
          subl    $-16 * SIZE, B
          decl    %eax
          jne     .L25
  ALIGN_4
  .L26:
          /* Tail: the remaining (count & 7) k steps. */
  #ifndef TRMMKERNEL
          movl    K, %eax
  #else
          movl    KKK, %eax
  #endif
          and     $7, %eax
          je      .L29
  ALIGN_4
  .L27:
          fmul    %st(1), %st
          faddp   %st, %st(2)
          FMUL    -15 * SIZE + BOFFSET(B)
          faddp   %st, %st(2)
          FLD     -15 * SIZE + AOFFSET(A)
          FLD     -14 * SIZE + BOFFSET(B)
          addl    $1 * SIZE,A
          addl    $2 * SIZE,B
          decl    %eax
          jne     .L27
  ALIGN_4
  .L29:
          /* Discard the preloaded B and A values, fold the four partial sums
             into two and scale both by ALPHA. */
          ffreep  %st(0)
          ffreep  %st(0)
          faddp   %st, %st(2)
          faddp   %st, %st(2)
          FLD     ALPHA
          fmul    %st, %st(1)
          fmulp   %st, %st(2)
          /* GEMM adds the existing C value before storing; TRMM overwrites. */
  #ifndef TRMMKERNEL
          FADD    (%edi)
          FST     (%edi)
          FADD    (%edi,LDC)
          FST     (%edi,LDC)
  #else
          FST     (%edi)
          FST     (%edi,LDC)
  #endif
  #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
      (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
          /* Advance A and B over the K - KKK steps that were not computed. */
          movl    K, %eax
          subl    KKK, %eax
          leal    (,%eax, SIZE), %eax
          leal    (A, %eax, 1), A
          leal    (B, %eax, 2), B
  #endif
  #if defined(TRMMKERNEL) && defined(LEFT)
          addl    $1, KK
  #endif
          addl    $1 * SIZE, %edi         /* next row of C */
          decl    I
          jne     .L24
  #if defined(TRMMKERNEL) && !defined(LEFT)
          addl    $2, KK
  #endif
          /* C moves on by two columns; the stored B pointer becomes the
             current B. */
          leal    (, LDC, 2), %eax
          addl    %eax, C
          movl    B, STACK_B
  /*
   * One-column remainder, executed when bit 0 of N is set.
   *
   * For each of the M rows, .L34 accumulates the products of that row of A
   * with the last column of B and writes the result to (%edi).  The k loop is
   * unrolled by 8 in .L35, with a one-step tail in .L37.
   *
   * x87 stack inside the loops: st0..st3 are four partial sums.  .L35 feeds
   * them in rotation, .L37 feeds only the first, and .L39 adds all four
   * together before scaling by ALPHA.
   *
   * NOTE(review): the store relies on FST (from common.h) popping the x87
   * stack - confirm.
   */
  ALIGN_4
  .L30:
          movl    N, %eax
          andl    $1, %eax
          je      .L999
  ALIGN_3
  .L31:
  #if defined(TRMMKERNEL) && defined(LEFT)
          movl    OFFSET, %eax
          movl    %eax, KK
  #endif
          movl    STACK_A, A
          movl    STACK_B, B
          movl    C, %edi
          movl    M, %eax
          movl    %eax, I                 /* I = rows still to do */
  ALIGN_3
  .L34:
          /* B is rewound for every row; A keeps advancing from row to row. */
          movl    STACK_B, B
  #if !defined(TRMMKERNEL) || \
      (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
      (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  #else
          /* Remaining TRMM variants: skip the first KK steps of k. */
          movl    KK, %eax
          leal    (, %eax, SIZE), %eax
          leal    (A, %eax, 1), A
          leal    (B, %eax, 1), B
  #endif
          /* Four zeroed partial sums. */
          fldz
          fldz
          fldz
          fldz
          /*
           * Prefetch the destination element of C.  prefetchw belongs to the
           * 3DNow! family, so it is emitted only when HAVE_3DNOW is set;
           * SSE-only builds use prefetcht0 and other builds emit nothing.
           * This is the same selection the four-column path (.L14) makes.
           */
  #ifdef HAVE_3DNOW
          prefetchw       1 * SIZE(%edi)
  #elif defined(HAVE_SSE)
          prefetcht0      1 * SIZE(%edi)
  #endif
          /* %eax = number of k steps for this row (saved in KKK for TRMM). */
  #ifndef TRMMKERNEL
          movl    K, %eax
  #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
          movl    K, %eax
          subl    KK, %eax
          movl    %eax, KKK
  #else
          movl    KK, %eax
          /* Both the LEFT and the !LEFT case add 1 here. */
          addl    $1, %eax
          movl    %eax, KKK
  #endif
          sarl    $3, %eax                /* number of 8-step chunks */
          je      .L36
  ALIGN_3
  .L35:
          FLD     -16 * SIZE + AOFFSET(A)
          FMUL    -16 * SIZE + BOFFSET(B)
          faddp   %st, %st(1)
          FLD     -15 * SIZE + AOFFSET(A)
          FMUL    -15 * SIZE + BOFFSET(B)
          faddp   %st, %st(2)
          FLD     -14 * SIZE + AOFFSET(A)
          FMUL    -14 * SIZE + BOFFSET(B)
          faddp   %st, %st(3)
          FLD     -13 * SIZE + AOFFSET(A)
          FMUL    -13 * SIZE + BOFFSET(B)
          faddp   %st, %st(4)
          FLD     -12 * SIZE + AOFFSET(A)
          FMUL    -12 * SIZE + BOFFSET(B)
          faddp   %st, %st(1)
          FLD     -11 * SIZE + AOFFSET(A)
          FMUL    -11 * SIZE + BOFFSET(B)
          faddp   %st, %st(2)
          FLD     -10 * SIZE + AOFFSET(A)
          FMUL    -10 * SIZE + BOFFSET(B)
          faddp   %st, %st(3)
          FLD     -9 * SIZE + AOFFSET(A)
          FMUL    -9 * SIZE + BOFFSET(B)
          faddp   %st, %st(4)
          addl    $8 * SIZE, A
          addl    $8 * SIZE, B
          decl    %eax
          jne     .L35
  ALIGN_4
  .L36:
          /* Tail: the remaining (count & 7) k steps. */
  #ifndef TRMMKERNEL
          movl    K, %eax
  #else
          movl    KKK, %eax
  #endif
          and     $7, %eax
          je      .L39
  ALIGN_4
  .L37:
          FLD     -16 * SIZE + AOFFSET(A)
          FMUL    -16 * SIZE + BOFFSET(B)
          faddp   %st, %st(1)
          addl    $1 * SIZE,A
          addl    $1 * SIZE,B
          decl    %eax
          jne     .L37
  ALIGN_4
  .L39:
          /* Add the four partial sums together, then scale by ALPHA. */
          faddp   %st, %st(2)
          faddp   %st, %st(2)
          faddp   %st, %st(1)
          FMUL    ALPHA
          /* GEMM adds the existing C value before storing; TRMM overwrites. */
  #ifndef TRMMKERNEL
          FADD    (%edi)
          FST     (%edi)
  #else
          FST     (%edi)
  #endif
  #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
      (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
          /* Advance A and B over the K - KKK steps that were not computed. */
          movl    K, %eax
          subl    KKK, %eax
          leal    (,%eax, SIZE), %eax
          leal    (A, %eax, 1), A
          leal    (B, %eax, 1), B
  #endif
  #if defined(TRMMKERNEL) && defined(LEFT)
          addl    $1, KK
  #endif
          addl    $1 * SIZE, %edi         /* next row of C */
          decl    I
          jne     .L34
  #if defined(TRMMKERNEL) && !defined(LEFT)
          addl    $1, KK
  #endif
          /* C moves on by one column; the stored B pointer becomes the
             current B. */
          addl    LDC, C
          movl    B, STACK_B
  /* Common exit, also reached directly when M, N or K is not positive:
     restore the saved registers in the reverse of the push order, release the
     ARGS scratch area and return. */
  759. ALIGN_4
  760. .L999:
  761. popl %ebx
  762. popl %esi
  763. popl %edi
  764. popl %ebp
  765. addl $ARGS, %esp
  766. ret
  767. EPILOGUE