You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

trsm_kernel_LN_2x2.S 19 kB

(Web viewer's line-number gutter — the digits 1 through 1127 run together — removed as extraction noise.)
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define STACK 16
  41. #define ARGS 16
  42. #define J 0 + STACK(%esp)
  43. #define KK 4 + STACK(%esp)
  44. #define KKK 8 + STACK(%esp)
  45. #define AORIG 12 + STACK(%esp)
  46. #define M 4 + STACK + ARGS(%esp)
  47. #define N 8 + STACK + ARGS(%esp)
  48. #define K 12 + STACK + ARGS(%esp)
  49. #define ALPHA 16 + STACK + ARGS(%esp)
  50. #ifdef DOUBLE
  51. #define A 24 + STACK + ARGS(%esp)
  52. #define B 28 + STACK + ARGS(%esp)
  53. #define C 32 + STACK + ARGS(%esp)
  54. #define LDC 36 + STACK + ARGS(%esp)
  55. #define OFFSET 40 + STACK + ARGS(%esp)
  56. #else
  57. #define A 20 + STACK + ARGS(%esp)
  58. #define B 24 + STACK + ARGS(%esp)
  59. #define C 28 + STACK + ARGS(%esp)
  60. #define LDC 32 + STACK + ARGS(%esp)
  61. #define OFFSET 36 + STACK + ARGS(%esp)
  62. #endif
  63. #define PREFETCH_OFFSET 48
  64. #if defined(PENTIUM3) || defined(PENTIUMM)
  65. #define REP rep
  66. #else
  67. #define REP rep
  68. #endif
  69. #define AA %edx
  70. #define BB %ecx
  71. PROLOGUE
  72. subl $ARGS, %esp # Generate Stack Frame
  73. pushl %ebp
  74. pushl %edi
  75. pushl %esi
  76. pushl %ebx
  77. PROFCODE
  78. movl LDC, %ebp # ldc # MEMORY
  79. movl B, %ebx
  80. leal (, %ebp, SIZE), %ebp
  81. #ifdef LN
  82. movl M, %eax
  83. leal (, %eax, SIZE), %eax
  84. addl %eax, C
  85. imull K, %eax
  86. addl %eax, A
  87. #endif
  88. #ifdef RT
  89. movl N, %eax
  90. leal (, %eax, SIZE), %eax
  91. imull K, %eax
  92. addl %eax, %ebx
  93. movl N, %eax
  94. imull %ebp, %eax
  95. addl %eax, C
  96. #endif
  97. #ifdef RN
  98. negl KK
  99. #endif
  100. #ifdef RT
  101. movl N, %eax
  102. subl OFFSET, %eax
  103. movl %eax, KK
  104. #endif
  105. movl N, %eax # j = (n >> 1) # MEMORY
  106. sarl $1, %eax
  107. movl %eax, J # j = (n >> 1) # MEMORY
  108. je .L8
  109. ALIGN_4
  110. .L34:
  111. #if defined(LT) || defined(RN)
  112. movl A, AA
  113. #else
  114. movl A, %eax
  115. movl %eax, AORIG
  116. #endif
  117. #ifdef RT
  118. movl K, %eax
  119. sall $1 + BASE_SHIFT, %eax
  120. subl %eax, %ebx
  121. #endif
  122. lea (, %ebp, 2), %eax
  123. #ifdef RT
  124. subl %eax, C
  125. #endif
  126. movl C, %edi
  127. #ifndef RT
  128. addl %eax, C
  129. #endif
  130. #ifdef LN
  131. movl OFFSET, %eax
  132. addl M, %eax
  133. movl %eax, KK
  134. #endif
  135. #ifdef LT
  136. movl OFFSET, %eax
  137. movl %eax, KK
  138. #endif
  139. movl M, %eax # m # MEMORY
  140. andl $1, %eax
  141. je .L12
  142. #ifdef LN
  143. movl K, %eax
  144. sall $0 + BASE_SHIFT, %eax
  145. subl %eax, AORIG
  146. #endif
  147. #if defined(LN) || defined(RT)
  148. movl KK, %eax
  149. leal (, %eax, SIZE), %eax
  150. movl AORIG, AA
  151. leal (AA, %eax, 1), AA
  152. leal (%ebx, %eax, 2), BB
  153. #else
  154. movl %ebx, BB
  155. #endif
  156. fldz
  157. fldz
  158. FLD 0 * SIZE(AA) # temp1 = *(aoffset + 0)
  159. #if defined(LT) || defined(RN)
  160. movl KK, %eax
  161. #else
  162. movl K, %eax
  163. subl KK, %eax
  164. #endif
  165. sarl $1,%eax # k >> 1 # MEMORY
  166. je .L54
  167. ALIGN_4
  168. .L55:
  169. FLD 0 * SIZE(BB) # temp2 = *(boffset + 0)
  170. rep
  171. fmul %st(1), %st
  172. faddp %st, %st(2)
  173. FMUL 1 * SIZE(BB) # temp2 = *(boffset + 0)
  174. faddp %st, %st(2)
  175. FLD 1 * SIZE(AA) # temp1 = *(aoffset + 0)
  176. FLD 2 * SIZE(BB) # temp2 = *(boffset + 0)
  177. rep
  178. fmul %st(1), %st
  179. faddp %st, %st(2)
  180. FMUL 3 * SIZE(BB) # temp2 = *(boffset + 0)
  181. faddp %st, %st(2)
  182. FLD 2 * SIZE(AA) # temp1 = *(aoffset + 0)
  183. addl $2 * SIZE, AA
  184. addl $4 * SIZE, BB
  185. decl %eax
  186. jne .L55
  187. ALIGN_4
  188. .L54:
  189. #if defined(LT) || defined(RN)
  190. movl KK, %eax
  191. #else
  192. movl K, %eax
  193. subl KK, %eax
  194. #endif
  195. andl $1,%eax # k & 1
  196. je .L33
  197. ALIGN_4
  198. FLD 0 * SIZE(BB) # temp2 = *(boffset + 0)
  199. rep
  200. fmul %st(1), %st
  201. faddp %st, %st(2)
  202. FMUL 1 * SIZE(BB) # temp2 = *(boffset + 0)
  203. faddp %st, %st(2)
  204. FLD 1 * SIZE(AA) # temp1 = *(aoffset + 0)
  205. addl $1 * SIZE, AA
  206. addl $2 * SIZE, BB
  207. ALIGN_4
  208. .L33:
  209. ffreep %st(0)
  210. #if defined(LN) || defined(RT)
  211. movl KK, %eax
  212. #ifdef LN
  213. subl $1, %eax
  214. #else
  215. subl $2, %eax
  216. #endif
  217. leal (, %eax, SIZE), %eax
  218. movl AORIG, AA
  219. leal (AA, %eax, 1), AA
  220. leal (%ebx, %eax, 2), BB
  221. #endif
  222. #if defined(LN) || defined(LT)
  223. FLD 0 * SIZE(BB)
  224. fsubp %st, %st(1)
  225. FLD 1 * SIZE(BB)
  226. fsubp %st, %st(2)
  227. #else
  228. FLD 0 * SIZE(AA)
  229. fsubp %st, %st(1)
  230. FLD 1 * SIZE(AA)
  231. fsubp %st, %st(2)
  232. #endif
  233. #if defined(LN) || defined(LT)
  234. FLD 0 * SIZE(AA)
  235. fmul %st, %st(1)
  236. fmulp %st, %st(2)
  237. #endif
  238. #ifdef RN
  239. FLD 0 * SIZE(BB)
  240. fmulp %st, %st(1)
  241. FLD 1 * SIZE(BB)
  242. fmul %st(1), %st
  243. fsubrp %st, %st(2)
  244. FLD 3 * SIZE(BB)
  245. fmulp %st, %st(2)
  246. #endif
  247. #ifdef RT
  248. FLD 3 * SIZE(BB)
  249. fmulp %st, %st(2)
  250. FLD 2 * SIZE(BB)
  251. fmul %st(2), %st
  252. fsubrp %st, %st(1)
  253. FLD 0 * SIZE(BB)
  254. fmulp %st, %st(1)
  255. #endif
  256. #ifdef LN
  257. subl $1 * SIZE, %edi
  258. #endif
  259. #if defined(LN) || defined(LT)
  260. FSTU 0 * SIZE(BB)
  261. fxch %st(1)
  262. FSTU 1 * SIZE(BB)
  263. #else
  264. FSTU 0 * SIZE(AA)
  265. fxch %st(1)
  266. FSTU 1 * SIZE(AA)
  267. #endif
  268. FST 0 * SIZE(%edi,%ebp)
  269. FST 0 * SIZE(%edi)
  270. #ifndef LN
  271. addl $1 * SIZE, %edi
  272. #endif
  273. #if defined(LT) || defined(RN)
  274. movl K, %eax
  275. subl KK, %eax
  276. leal (,%eax, SIZE), %eax
  277. leal (AA, %eax, 1), AA
  278. leal (BB, %eax, 2), BB
  279. #endif
  280. #ifdef LN
  281. subl $1, KK
  282. #endif
  283. #ifdef LT
  284. addl $1, KK
  285. #endif
  286. #ifdef RT
  287. movl K, %eax
  288. sall $0 + BASE_SHIFT, %eax
  289. addl %eax, AORIG
  290. #endif
  291. ALIGN_4
  292. .L12:
  293. movl M, %esi
  294. sarl $1, %esi
  295. je .L27
  296. ALIGN_4
  297. .MainHead:
  298. #ifdef LN
  299. movl K, %eax
  300. sall $1 + BASE_SHIFT, %eax
  301. subl %eax, AORIG
  302. #endif
  303. #if defined(LN) || defined(RT)
  304. movl KK, %eax
  305. leal (, %eax, SIZE), %eax
  306. movl AORIG, AA
  307. leal (AA, %eax, 2), AA
  308. leal (%ebx, %eax, 2), BB
  309. #else
  310. movl %ebx, BB
  311. #endif
  312. fldz
  313. fldz
  314. fldz
  315. fldz
  316. FLD 4 * SIZE(BB) # b5
  317. FLD 4 * SIZE(AA) # a5
  318. FLD 0 * SIZE(BB) # b1
  319. FLD 0 * SIZE(AA) # a1
  320. #ifdef LN
  321. #if defined(HAVE_3DNOW)
  322. prefetchw -2 * SIZE(%edi)
  323. prefetchw -2 * SIZE(%edi, %ebp, 1)
  324. #elif defined(HAVE_SSE)
  325. prefetchnta -2 * SIZE(%edi)
  326. prefetchnta -2 * SIZE(%edi, %ebp, 1)
  327. #endif
  328. #else
  329. #if defined(HAVE_3DNOW)
  330. prefetchw 2 * SIZE(%edi)
  331. prefetchw 2 * SIZE(%edi, %ebp, 1)
  332. #elif defined(HAVE_SSE)
  333. prefetchnta 2 * SIZE(%edi)
  334. prefetchnta 2 * SIZE(%edi, %ebp, 1)
  335. #endif
  336. #endif
  337. #if defined(LT) || defined(RN)
  338. movl KK, %eax
  339. #else
  340. movl K, %eax
  341. subl KK, %eax
  342. #endif
  343. sarl $2, %eax
  344. je .L16
  345. ALIGN_4
  346. .MainLoop:
  347. #if defined(HAVE_3DNOW)
  348. prefetch (PREFETCH_OFFSET) * SIZE(BB)
  349. nop
  350. #elif defined(HAVE_SSE)
  351. prefetchnta (PREFETCH_OFFSET) * SIZE(BB)
  352. #if (L2_SIZE == 524288)
  353. prefetcht0 (PREFETCH_OFFSET) * SIZE(AA)
  354. #endif
  355. #endif
  356. fmul %st, %st(1)
  357. FMUL 1 * SIZE(BB)
  358. fxch %st(1)
  359. faddp %st, %st(4)
  360. FLD 0 * SIZE(BB)
  361. fxch %st(1)
  362. faddp %st, %st(5)
  363. FLD 1 * SIZE(AA)
  364. fmul %st, %st(1)
  365. FMUL 1 * SIZE(BB)
  366. fxch %st(1)
  367. faddp %st, %st(6)
  368. FLD 2 * SIZE(BB)
  369. fxch %st(1)
  370. faddp %st, %st(7)
  371. FLD 2 * SIZE(AA)
  372. fmul %st, %st(1)
  373. FMUL 3 * SIZE(BB)
  374. fxch %st(1)
  375. faddp %st, %st(4)
  376. FLD 2 * SIZE(BB)
  377. fxch %st(1)
  378. faddp %st, %st(5)
  379. FLD 3 * SIZE(AA)
  380. fmul %st, %st(1)
  381. FMUL 3 * SIZE(BB)
  382. fxch %st(1)
  383. faddp %st, %st(6)
  384. FLD 8 * SIZE(BB)
  385. fxch %st(1)
  386. faddp %st, %st(7)
  387. FLD 8 * SIZE(AA)
  388. fxch %st(2)
  389. #if !defined(HAVE_3DNOW) && defined(HAVE_SSE) && defined(DOUBLE)
  390. prefetchnta (PREFETCH_OFFSET + 4) * SIZE(BB)
  391. #if (L2_SIZE == 524288)
  392. prefetcht0 (PREFETCH_OFFSET + 4) * SIZE(AA)
  393. #endif
  394. #endif
  395. fmul %st, %st(3)
  396. FMUL 5 * SIZE(BB)
  397. fxch %st(3)
  398. faddp %st, %st(4)
  399. FLD 4 * SIZE(BB)
  400. fxch %st(3)
  401. faddp %st, %st(5)
  402. FLD 5 * SIZE(AA)
  403. fmul %st, %st(3)
  404. FMUL 5 * SIZE(BB)
  405. fxch %st(3)
  406. faddp %st, %st(6)
  407. FLD 6 * SIZE(BB)
  408. fxch %st(3)
  409. faddp %st, %st(7)
  410. FLD 6 * SIZE(AA)
  411. fmul %st, %st(3)
  412. FMUL 7 * SIZE(BB)
  413. fxch %st(3)
  414. faddp %st, %st(4)
  415. FLD 6 * SIZE(BB)
  416. fxch %st(3)
  417. faddp %st, %st(5)
  418. FLD 7 * SIZE(AA)
  419. fmul %st, %st(3)
  420. FMUL 7 * SIZE(BB)
  421. fxch %st(3)
  422. faddp %st, %st(6)
  423. FLD 12 * SIZE(BB)
  424. fxch %st(3)
  425. faddp %st, %st(7)
  426. FLD 12 * SIZE(AA)
  427. fxch %st(2)
  428. subl $-8 * SIZE, BB
  429. subl $-8 * SIZE, AA
  430. decl %eax # l --
  431. jne .MainLoop
  432. ALIGN_4
  433. .L16:
  434. #if defined(LT) || defined(RN)
  435. movl KK, %eax
  436. #else
  437. movl K, %eax
  438. subl KK, %eax
  439. #endif
  440. and $3, %eax
  441. je .L21
  442. ALIGN_4
  443. .SubLoop:
  444. fmul %st, %st(1)
  445. FMUL 1 * SIZE(BB)
  446. fxch %st(1)
  447. faddp %st, %st(4)
  448. FLD 0 * SIZE(BB)
  449. fxch %st(1)
  450. faddp %st, %st(5)
  451. FLD 1 * SIZE(AA)
  452. fmul %st, %st(1)
  453. FMUL 1 * SIZE(BB)
  454. fxch %st(1)
  455. faddp %st, %st(6)
  456. FLD 2 * SIZE(BB)
  457. fxch %st(1)
  458. faddp %st, %st(7)
  459. FLD 2 * SIZE(AA)
  460. addl $2 * SIZE,BB
  461. addl $2 * SIZE,AA
  462. decl %eax
  463. jne .SubLoop
  464. ALIGN_4
  465. .L21:
  466. ffreep %st(0)
  467. ffreep %st(0)
  468. ffreep %st(0)
  469. ffreep %st(0)
  470. #if defined(LN) || defined(RT)
  471. movl KK, %eax
  472. #ifdef LN
  473. subl $2, %eax
  474. #else
  475. subl $2, %eax
  476. #endif
  477. leal (, %eax, SIZE), %eax
  478. movl AORIG, AA
  479. leal (AA, %eax, 2), AA
  480. leal (%ebx, %eax, 2), BB
  481. #endif
  482. #if defined(LN) || defined(LT)
  483. FLD 0 * SIZE(BB)
  484. fsubp %st, %st(1)
  485. FLD 1 * SIZE(BB)
  486. fsubp %st, %st(2)
  487. FLD 2 * SIZE(BB)
  488. fsubp %st, %st(3)
  489. FLD 3 * SIZE(BB)
  490. fsubp %st, %st(4)
  491. #else
  492. FLD 0 * SIZE(AA)
  493. fsubp %st, %st(1)
  494. FLD 1 * SIZE(AA)
  495. fsubp %st, %st(3)
  496. FLD 2 * SIZE(AA)
  497. fsubp %st, %st(2)
  498. FLD 3 * SIZE(AA)
  499. fsubp %st, %st(4)
  500. #endif
  501. #ifdef LN
  502. FLD 3 * SIZE(AA)
  503. fmul %st, %st(3)
  504. fmulp %st, %st(4)
  505. FLD 2 * SIZE(AA)
  506. fmul %st(3), %st
  507. FLD 2 * SIZE(AA)
  508. fmul %st(5), %st
  509. fsubrp %st, %st(3)
  510. fsubrp %st, %st(1)
  511. FLD 0 * SIZE(AA)
  512. fmul %st, %st(1)
  513. fmulp %st, %st(2)
  514. #endif
  515. #ifdef LT
  516. FLD 0 * SIZE(AA)
  517. fmul %st, %st(1)
  518. fmulp %st, %st(2)
  519. FLD 1 * SIZE(AA)
  520. fmul %st(1), %st
  521. FLD 1 * SIZE(AA)
  522. fmul %st(3), %st
  523. fsubrp %st, %st(5)
  524. fsubrp %st, %st(3)
  525. FLD 3 * SIZE(AA)
  526. fmul %st, %st(3)
  527. fmulp %st, %st(4)
  528. #endif
  529. #ifdef RN
  530. FLD 0 * SIZE(BB)
  531. fmul %st, %st(1)
  532. fmulp %st, %st(3)
  533. FLD 1 * SIZE(BB)
  534. fmul %st(1), %st
  535. FLD 1 * SIZE(BB)
  536. fmul %st(4), %st
  537. fsubrp %st, %st(5)
  538. fsubrp %st, %st(2)
  539. FLD 3 * SIZE(BB)
  540. fmul %st, %st(2)
  541. fmulp %st, %st(4)
  542. #endif
  543. #ifdef RT
  544. FLD 3 * SIZE(BB)
  545. fmul %st, %st(2)
  546. fmulp %st, %st(4)
  547. FLD 2 * SIZE(BB)
  548. fmul %st(2), %st
  549. FLD 2 * SIZE(BB)
  550. fmul %st(5), %st
  551. fsubrp %st, %st(4)
  552. fsubrp %st, %st(1)
  553. FLD 0 * SIZE(BB)
  554. fmul %st, %st(1)
  555. fmulp %st, %st(3)
  556. #endif
  557. #ifdef LN
  558. subl $2 * SIZE, %edi
  559. #endif
  560. #if defined(LN) || defined(LT)
  561. FSTU 0 * SIZE(BB)
  562. fxch %st(1)
  563. FSTU 1 * SIZE(BB)
  564. fxch %st(2)
  565. FSTU 2 * SIZE(BB)
  566. fxch %st(3)
  567. FSTU 3 * SIZE(BB)
  568. FST 1 * SIZE(%edi,%ebp)
  569. FST 0 * SIZE(%edi)
  570. FST 0 * SIZE(%edi,%ebp)
  571. FST 1 * SIZE(%edi)
  572. #else
  573. FSTU 0 * SIZE(AA)
  574. fxch %st(2)
  575. FSTU 1 * SIZE(AA)
  576. fxch %st(1)
  577. FSTU 2 * SIZE(AA)
  578. fxch %st(3)
  579. FSTU 3 * SIZE(AA)
  580. FST 1 * SIZE(%edi,%ebp)
  581. FST 1 * SIZE(%edi)
  582. FST 0 * SIZE(%edi)
  583. FST 0 * SIZE(%edi,%ebp)
  584. #endif
  585. #ifndef LN
  586. addl $2 * SIZE, %edi
  587. #endif
  588. #if defined(LT) || defined(RN)
  589. movl K, %eax
  590. subl KK, %eax
  591. leal (,%eax, SIZE), %eax
  592. leal (AA, %eax, 2), AA
  593. leal (BB, %eax, 2), BB
  594. #endif
  595. #ifdef LN
  596. subl $2, KK
  597. #endif
  598. #ifdef LT
  599. addl $2, KK
  600. #endif
  601. #ifdef RT
  602. movl K, %eax
  603. sall $1 + BASE_SHIFT, %eax
  604. addl %eax, AORIG
  605. #endif
  606. decl %esi # i --
  607. jne .MainHead
  608. ALIGN_4
  609. .L27:
  610. #ifdef LN
  611. movl K, %eax
  612. leal ( , %eax, SIZE), %eax
  613. leal (%ebx, %eax, 2), %ebx
  614. #endif
  615. #if defined(LT) || defined(RN)
  616. movl BB, %ebx
  617. #endif
  618. #ifdef RN
  619. addl $2, KK
  620. #endif
  621. #ifdef RT
  622. subl $2, KK
  623. #endif
  624. decl J # j-- # MEMORY
  625. jne .L34
  626. ALIGN_4
  627. .L8:
  628. movl N, %eax # n # MEMORY
  629. andl $1, %eax
  630. je .End
  631. #if defined(LT) || defined(RN)
  632. movl A, AA
  633. #else
  634. movl A, %eax
  635. movl %eax, AORIG
  636. #endif
  637. #ifdef RT
  638. movl K, %eax
  639. sall $0 + BASE_SHIFT, %eax
  640. subl %eax, %ebx
  641. #endif
  642. #ifdef RT
  643. subl %ebp, C
  644. #endif
  645. movl C, %edi # c # MEMORY
  646. #ifndef RT
  647. addl %ebp, C
  648. #endif
  649. #ifdef LN
  650. movl OFFSET, %eax
  651. addl M, %eax
  652. movl %eax, KK
  653. #endif
  654. #ifdef LT
  655. movl OFFSET, %eax
  656. movl %eax, KK
  657. #endif
  658. movl M, %eax # m # MEMORY
  659. andl $1, %eax # m & 1
  660. je .L36
  661. #ifdef LN
  662. movl K, %eax
  663. sall $0 + BASE_SHIFT, %eax
  664. subl %eax, AORIG
  665. #endif
  666. #if defined(LN) || defined(RT)
  667. movl KK, %eax
  668. leal (, %eax, SIZE), %eax
  669. movl AORIG, AA
  670. leal (AA, %eax, 1), AA
  671. leal (%ebx, %eax, 1), BB
  672. #else
  673. movl %ebx, BB
  674. #endif
  675. fldz
  676. #ifdef LN
  677. #if defined(HAVE_3DNOW)
  678. prefetchw -2 * SIZE(%edi)
  679. #elif defined(HAVE_SSE)
  680. prefetchnta -2 * SIZE(%edi)
  681. #endif
  682. #else
  683. #if defined(HAVE_3DNOW)
  684. prefetchw 2 * SIZE(%edi)
  685. #elif defined(HAVE_SSE)
  686. prefetchnta 2 * SIZE(%edi)
  687. #endif
  688. #endif
  689. #if defined(LT) || defined(RN)
  690. movl KK, %eax
  691. #else
  692. movl K, %eax
  693. subl KK, %eax
  694. #endif
  695. test %eax, %eax
  696. jle .L52
  697. ALIGN_3
  698. .L51:
  699. FLD (AA)
  700. FMUL (BB)
  701. addl $1 * SIZE,AA
  702. addl $1 * SIZE,BB
  703. faddp %st,%st(1)
  704. decl %eax
  705. jne .L51
  706. ALIGN_4
  707. .L52:
  708. #if defined(LN) || defined(RT)
  709. movl KK, %eax
  710. #ifdef LN
  711. subl $1, %eax
  712. #else
  713. subl $1, %eax
  714. #endif
  715. leal (, %eax, SIZE), %eax
  716. movl AORIG, AA
  717. leal (AA, %eax, 1), AA
  718. leal (%ebx, %eax, 1), BB
  719. #endif
  720. #if defined(LN) || defined(LT)
  721. FLD 0 * SIZE(BB)
  722. fsubp %st, %st(1)
  723. #else
  724. FLD 0 * SIZE(AA)
  725. fsubp %st, %st(1)
  726. #endif
  727. #if defined(LN) || defined(LT)
  728. FMUL 0 * SIZE(AA)
  729. #else
  730. FMUL 0 * SIZE(BB)
  731. #endif
  732. #ifdef LN
  733. subl $1 * SIZE, %edi
  734. #endif
  735. #if defined(LN) || defined(LT)
  736. FSTU 0 * SIZE(BB)
  737. #else
  738. FSTU 0 * SIZE(AA)
  739. #endif
  740. FST 0 * SIZE(%edi)
  741. #ifndef LN
  742. addl $1 * SIZE, %edi
  743. #endif
  744. #if defined(LT) || defined(RN)
  745. movl K, %eax
  746. subl KK, %eax
  747. leal (,%eax, SIZE), %eax
  748. leal (AA, %eax, 1), AA
  749. leal (BB, %eax, 1), BB
  750. #endif
  751. #ifdef LN
  752. subl $1, KK
  753. #endif
  754. #ifdef LT
  755. addl $1, KK
  756. #endif
  757. #ifdef RT
  758. movl K, %eax
  759. sall $0 + BASE_SHIFT, %eax
  760. addl %eax, AORIG
  761. #endif
  762. ALIGN_4
  763. .L36:
  764. movl M, %esi # m # MEMORY
  765. sarl $1, %esi # m >> 1
  766. je .L99
  767. ALIGN_4
  768. .L46:
  769. #ifdef LN
  770. movl K, %eax
  771. sall $1 + BASE_SHIFT, %eax
  772. subl %eax, AORIG
  773. #endif
  774. #if defined(LN) || defined(RT)
  775. movl KK, %eax
  776. leal (, %eax, SIZE), %eax
  777. movl AORIG, AA
  778. leal (AA, %eax, 2), AA
  779. leal (%ebx, %eax, 1), BB
  780. #else
  781. movl %ebx, BB
  782. #endif
  783. fldz
  784. fldz
  785. FLD 0 * SIZE(BB) # temp1 = *(boffset + 0)
  786. #if defined(LT) || defined(RN)
  787. movl KK, %eax
  788. #else
  789. movl K, %eax
  790. subl KK, %eax
  791. #endif
  792. sarl $1, %eax
  793. je .L56
  794. ALIGN_4
  795. .L57:
  796. FLD 0 * SIZE(AA) # temp2 = *(aoffset + 0)
  797. fmul %st(1), %st
  798. faddp %st, %st(2)
  799. FMUL 1 * SIZE(AA) # temp2 = *(aoffset + 0)
  800. faddp %st, %st(2)
  801. FLD 1 * SIZE(BB) # temp1 = *(boffset + 0)
  802. FLD 2 * SIZE(AA) # temp2 = *(aoffset + 0)
  803. fmul %st(1), %st
  804. faddp %st, %st(2)
  805. FMUL 3 * SIZE(AA) # temp2 = *(aoffset + 0)
  806. faddp %st, %st(2)
  807. FLD 2 * SIZE(BB) # temp1 = *(boffset + 0)
  808. addl $4 * SIZE,AA
  809. addl $2 * SIZE,BB
  810. dec %eax
  811. jne .L57
  812. ALIGN_4
  813. .L56:
  814. #if defined(LT) || defined(RN)
  815. movl KK, %eax
  816. #else
  817. movl K, %eax
  818. subl KK, %eax
  819. #endif
  820. andl $1, %eax
  821. je .L45
  822. ALIGN_4
  823. FLD 0 * SIZE(AA) # temp2 = *(aoffset + 0)
  824. fmul %st(1), %st
  825. faddp %st, %st(2)
  826. FMUL 1 * SIZE(AA) # temp2 = *(aoffset + 0)
  827. faddp %st, %st(2)
  828. FLD 3 * SIZE(BB) # temp1 = *(boffset + 0)
  829. addl $2 * SIZE,AA
  830. addl $1 * SIZE,BB
  831. ALIGN_4
  832. .L45:
  833. ffreep %st(0)
  834. #if defined(LN) || defined(RT)
  835. movl KK, %eax
  836. #ifdef LN
  837. subl $2, %eax
  838. #else
  839. subl $1, %eax
  840. #endif
  841. leal (, %eax, SIZE), %eax
  842. movl AORIG, AA
  843. leal (AA, %eax, 2), AA
  844. leal (%ebx, %eax, 1), BB
  845. #endif
  846. #if defined(LN) || defined(LT)
  847. FLD 0 * SIZE(BB)
  848. fsubp %st, %st(1)
  849. FLD 1 * SIZE(BB)
  850. fsubp %st, %st(2)
  851. #else
  852. FLD 0 * SIZE(AA)
  853. fsubp %st, %st(1)
  854. FLD 1 * SIZE(AA)
  855. fsubp %st, %st(2)
  856. #endif
  857. #ifdef LN
  858. FLD 3 * SIZE(AA)
  859. fmulp %st, %st(2)
  860. FLD 2 * SIZE(AA)
  861. fmul %st(2), %st
  862. fsubrp %st, %st(1)
  863. FLD 0 * SIZE(AA)
  864. fmulp %st, %st(1)
  865. #endif
  866. #ifdef LT
  867. FLD 0 * SIZE(AA)
  868. fmulp %st, %st(1)
  869. FLD 1 * SIZE(AA)
  870. fmul %st(1), %st
  871. fsubrp %st, %st(2)
  872. FLD 3 * SIZE(AA)
  873. fmulp %st, %st(2)
  874. #endif
  875. #ifdef RN
  876. FLD 0 * SIZE(BB)
  877. fmul %st, %st(1)
  878. fmulp %st, %st(2)
  879. #endif
  880. #ifdef RT
  881. FLD 0 * SIZE(BB)
  882. fmul %st, %st(1)
  883. fmulp %st, %st(2)
  884. #endif
  885. #ifdef LN
  886. subl $2 * SIZE, %edi
  887. #endif
  888. #if defined(LN) || defined(LT)
  889. FSTU 0 * SIZE(BB)
  890. fxch %st(1)
  891. FSTU 1 * SIZE(BB)
  892. #else
  893. FSTU 0 * SIZE(AA)
  894. fxch %st(1)
  895. FSTU 1 * SIZE(AA)
  896. #endif
  897. FST 1 * SIZE(%edi)
  898. FST 0 * SIZE(%edi)
  899. #ifndef LN
  900. addl $2 * SIZE, %edi
  901. #endif
  902. #if defined(LT) || defined(RN)
  903. movl K, %eax
  904. subl KK, %eax
  905. leal (,%eax, SIZE), %eax
  906. leal (AA, %eax, 2), AA
  907. leal (BB, %eax, 1), BB
  908. #endif
  909. #ifdef LN
  910. subl $2, KK
  911. #endif
  912. #ifdef LT
  913. addl $2, KK
  914. #endif
  915. #ifdef RT
  916. movl K, %eax
  917. sall $1 + BASE_SHIFT, %eax
  918. addl %eax, AORIG
  919. #endif
  920. decl %esi # i --
  921. jne .L46
  922. ALIGN_4
  923. .L99:
  924. #ifdef LN
  925. movl K, %eax
  926. leal (%ebx, %eax, SIZE), %ebx
  927. #endif
  928. #if defined(LT) || defined(RN)
  929. movl BB, %ebx
  930. #endif
  931. #ifdef RN
  932. addl $1, KK
  933. #endif
  934. #ifdef RT
  935. subl $1, KK
  936. #endif
  937. ALIGN_4
  938. .End:
  939. popl %ebx
  940. popl %esi
  941. popl %edi
  942. popl %ebp
  943. addl $ARGS, %esp
  944. ret
  945. EPILOGUE