You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

trsm_kernel_LT_1x4.S 20 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
#define ASSEMBLER
#include "common.h"

/* Local scratch area reserved below the four saved registers. */
#define STACK	16
#define ARGS	32

/* Scratch slots inside the local area (addressed off %esp). */
#define J	 0 + STACK(%esp)	/* remaining 4-column panels of B/C   */
#define I	 4 + STACK(%esp)	/* remaining rows of A in this panel  */
#define KK	 8 + STACK(%esp)	/* running K-offset for the solves    */
#define KKK	12 + STACK(%esp)	/* NOTE(review): unreferenced in this chunk */
#define AORIG	16 + STACK(%esp)	/* saved start-of-row pointer for A   */

/* Incoming cdecl arguments, above the saved regs + scratch area.
   ALPHA is 8 bytes when DOUBLE is defined, hence the two layouts. */
#define M	 4 + STACK + ARGS(%esp)
#define N	 8 + STACK + ARGS(%esp)
#define K	12 + STACK + ARGS(%esp)
#define ALPHA	16 + STACK + ARGS(%esp)
#ifdef DOUBLE
#define STACK_A		24 + STACK + ARGS(%esp)
#define STACK_B		28 + STACK + ARGS(%esp)
#define C		32 + STACK + ARGS(%esp)
#define STACK_LDC	36 + STACK + ARGS(%esp)
#define OFFSET		40 + STACK + ARGS(%esp)
#else
#define STACK_A		20 + STACK + ARGS(%esp)
#define STACK_B		24 + STACK + ARGS(%esp)
#define C		28 + STACK + ARGS(%esp)
#define STACK_LDC	32 + STACK + ARGS(%esp)
#define OFFSET		36 + STACK + ARGS(%esp)
#endif

/* Register aliases used throughout the kernel. */
#define A	%edx			/* current position in A             */
#define B	%ecx			/* current position in B             */
#define B_ORIG	%ebx			/* start of the current B panel      */
#define LDC	%ebp			/* leading dimension of C, in bytes  */

/* Software-prefetch distance for the A stream, in elements. */
#define PREFETCHSIZE (5 + 8 * 10)
	PROLOGUE

	subl	$ARGS, %esp		# Generate Stack Frame

	pushl	%ebp			# save all callee-saved registers
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	STACK_B, B_ORIG
	movl	STACK_LDC, LDC
	leal	(, LDC, SIZE), LDC	# scale LDC from elements to bytes

#ifdef LN
	/* LN: walk A and C from the high end, so pre-advance both. */
	movl	M, %eax
	leal	(, %eax, SIZE), %eax
	addl	%eax, C			# C += M * SIZE
	imull	K, %eax
	addl	%eax, STACK_A		# A += M * K * SIZE
#endif

#ifdef RT
	/* RT: B and C are traversed right-to-left; start past the end. */
	movl	N, %eax
	leal	(, %eax, SIZE), %eax
	imull	K, %eax
	addl	%eax, B_ORIG		# B += N * K * SIZE
	movl	N, %eax
	imull	LDC, %eax
	addl	%eax, C			# C += N * LDC
#endif

#ifdef RN
	movl	OFFSET, %eax
	negl	%eax
	movl	%eax, KK		# KK = -OFFSET
#endif

#ifdef RT
	movl	N, %eax
	subl	OFFSET, %eax
	movl	%eax, KK		# KK = N - OFFSET
#endif

	/* Bias both panel pointers by +16 elements so the inner loops can
	   address operands with small displacements in -16..+15 elements. */
	subl	$-16 * SIZE, B_ORIG
	subl	$-16 * SIZE, STACK_A

	/* Nothing to do if any problem dimension is <= 0. */
	movl	M, %eax
	testl	%eax, %eax
	jle	.L999
	movl	N, %eax
	testl	%eax, %eax
	jle	.L999
	movl	K, %eax
	testl	%eax, %eax
	jle	.L999

	movl	N, %eax
	sarl	$2, %eax		# J = N / 4 four-column panels
	movl	%eax, J
	je	.L20			# none: go handle N & 2, N & 1
	ALIGN_3
/* ==================================================================== */
/* J loop (.L11): one pass per 4-column panel of B/C.                   */
/* ==================================================================== */
.L11:
#if defined(LT) || defined(RN)
	movl	STACK_A, A
#else
	movl	STACK_A, %eax
	movl	%eax, AORIG		# A is re-derived from AORIG per row
#endif

#ifdef RT
	movl	K, %eax
	sall	$2 + BASE_SHIFT, %eax
	subl	%eax, B_ORIG		# step B back one panel: K*4 elements
#endif

	leal	(, LDC, 4), %eax	# byte span of four C columns
#ifdef RT
	subl	%eax, C
#endif
	movl	C, %edi			# %edi = first C column of this panel
#ifndef RT
	addl	%eax, C
#endif

#ifdef LN
	movl	OFFSET, %eax
	addl	M, %eax
	movl	%eax, KK		# KK = M + OFFSET
#endif

#ifdef LT
	movl	OFFSET, %eax
	movl	%eax, KK		# KK = OFFSET
#endif

	movl	B_ORIG, B

	/* Touch one word per cache line of the B panel to pre-warm it;
	   the loads are discarded (%esi is a pure scratch sink here). */
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$4, %eax		# 16 k-steps = 64 elements per pass
	jle	.L13
	ALIGN_4

.L12:
	movl	-16 * SIZE(B), %esi
	movl	 -8 * SIZE(B), %esi
	movl	  0 * SIZE(B), %esi
	movl	  8 * SIZE(B), %esi
	movl	 16 * SIZE(B), %esi
	movl	 24 * SIZE(B), %esi
	movl	 32 * SIZE(B), %esi
	movl	 40 * SIZE(B), %esi
	subl	$-64 * SIZE, B
	decl	%eax
	jne	.L12
	ALIGN_3

.L13:
	movl	M, %esi
	movl	%esi, I			# I = rows of A still to process
	ALIGN_3

/* I loop (.L14): one row of A against the 4-wide B panel. */
.L14:
#ifdef LN
	movl	K, %eax
	sall	$BASE_SHIFT, %eax
	subl	%eax, AORIG		# step A back one row (K elements)
#endif

#if defined(LN) || defined(RT)
	/* Skip the first KK k-steps: they are handled by the solve. */
	movl	KK, %eax
	leal	(, %eax, SIZE), %eax
	movl	AORIG, A
	leal	(A, %eax, 1), A
	leal	(B_ORIG, %eax, 4), B
#else
	movl	B_ORIG, B
#endif

	leal	(%edi, LDC, 2), %eax	# %eax = column 2 of C

	fldz				# four accumulators, one per column,
	fldz				# kept deep in the x87 stack
	fldz
	fldz

	FLD	 -8 * SIZE(A)		# preload a future A element,
	FLD	-16 * SIZE(A)		# the first A element,
	FLD	-16 * SIZE(B)		# and the first B element

	movl	$32 * SIZE, %esi

	prefetchw 1 * SIZE(%edi)	# C is written below: prefetch for write
	prefetchw 1 * SIZE(%edi, LDC)
	prefetchw 1 * SIZE(%eax)
	prefetchw 1 * SIZE(%eax, LDC)

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax		# main kernel does 8 k-steps per pass
	je	.L16
	ALIGN_3

/* 8-way unrolled 1x4 kernel: each k-step multiplies one A element into
   four consecutive B elements and accumulates into the four totals.
   The A element for step k+1 is loaded while step k finishes; PADDING
   inserts alignment filler for the decoder (defined in common.h). */
.L15:
	fmul	%st(1), %st		# ---- k+0 ----
	faddp	%st, %st(3)
	PADDING
	FLD	-15 * SIZE(B)
	fmul	%st(1), %st
	faddp	%st, %st(4)
	PADDING
	FLD	-14 * SIZE(B)
	fmul	%st(1), %st
	faddp	%st, %st(5)
	PADDING
	FMUL	-13 * SIZE(B)
	faddp	%st, %st(5)
	FLD	-15 * SIZE(A)
	FLD	-12 * SIZE(B)

	fmul	%st(1), %st		# ---- k+1 ----
	faddp	%st, %st(3)
	PADDING
	FLD	-11 * SIZE(B)
	fmul	%st(1), %st
	faddp	%st, %st(4)
	PADDING
	FLD	-10 * SIZE(B)
	fmul	%st(1), %st
	faddp	%st, %st(5)
	PADDING
	FMUL	-9 * SIZE(B)
	faddp	%st, %st(5)
	FLD	-14 * SIZE(A)
	FLD	-8 * SIZE(B)

	fmul	%st(1), %st		# ---- k+2 ----
	faddp	%st, %st(3)
	PADDING
	FLD	-7 * SIZE(B)
	fmul	%st(1), %st
	faddp	%st, %st(4)
	PADDING
	FLD	-6 * SIZE(B)
	fmul	%st(1), %st
	faddp	%st, %st(5)
	PADDING
	FMUL	-5 * SIZE(B)
	faddp	%st, %st(5)
	FLD	-13 * SIZE(A)
	FLD	-4 * SIZE(B)

	fmul	%st(1), %st		# ---- k+3 ----
	faddp	%st, %st(3)
	PADDING
	FLD	-3 * SIZE(B)
	fmul	%st(1), %st
	faddp	%st, %st(4)
	PADDING
	FLD	-2 * SIZE(B)
	fmul	%st(1), %st
	faddp	%st, %st(5)
	PADDING
	FMUL	-1 * SIZE(B)
	faddp	%st, %st(5)
	FLD	-12 * SIZE(A)
	FLD	 0 * SIZE(B)

	fmul	%st(1), %st		# ---- k+4 ----
	faddp	%st, %st(3)
	PADDING
	FLD	 1 * SIZE(B)
	fmul	%st(1), %st
	faddp	%st, %st(4)
	PADDING
	FLD	 2 * SIZE(B)
	fmul	%st(1), %st
	faddp	%st, %st(5)
	PADDING
	FMUL	 3 * SIZE(B)
	faddp	%st, %st(5)
	FLD	-11 * SIZE(A)
	FLD	 4 * SIZE(B)

	fmul	%st(1), %st		# ---- k+5 ----
	faddp	%st, %st(3)
	PADDING
	FLD	 5 * SIZE(B)
	fmul	%st(1), %st
	faddp	%st, %st(4)
	PADDING
	FLD	 6 * SIZE(B)
	fmul	%st(1), %st
	faddp	%st, %st(5)
	PADDING
	FMUL	 7 * SIZE(B)
	faddp	%st, %st(5)
	FLD	-10 * SIZE(A)
	FLD	 8 * SIZE(B)

	fmul	%st(1), %st		# ---- k+6 ----
	faddp	%st, %st(3)
	PADDING
	FLD	 9 * SIZE(B)
	fmul	%st(1), %st
	faddp	%st, %st(4)
	PADDING
	FLD	10 * SIZE(B)
	fmul	%st(1), %st
	faddp	%st, %st(5)
	PADDING
	FMUL	11 * SIZE(B)
	faddp	%st, %st(5)
	FLD	-9 * SIZE(A)
	FLD	12 * SIZE(B)

	fmul	%st(1), %st		# ---- k+7 ----
	faddp	%st, %st(3)
	PADDING
	FLD	13 * SIZE(B)
	fmul	%st(1), %st
	faddp	%st, %st(4)
	PADDING
	FLD	14 * SIZE(B)
	fmul	%st(1), %st
	faddp	%st, %st(5)
	PADDING
	FMUL	15 * SIZE(B)
	faddp	%st, %st(5)

	FLD	 0 * SIZE(A)		# preload for the next pass
	PADDING prefetch PREFETCHSIZE * SIZE(A)
	addl	$8 * SIZE, A		# advance one unrolled block
	fxch	%st(1)
	addl	$32 * SIZE, B
	FLD	-16 * SIZE(B)

	decl	%eax
	jne	.L15
	ALIGN_4

/* Tail: the remaining (count mod 8) k-steps, one at a time. */
.L16:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	and	$7, %eax
	je	.L19
	ALIGN_4

.L17:
	fmul	%st(1), %st
	faddp	%st, %st(3)
	FLD	-15 * SIZE(B)
	fmul	%st(1), %st
	faddp	%st, %st(4)
	FLD	-14 * SIZE(B)
	fmul	%st(1), %st
	faddp	%st, %st(5)
	FMUL	-13 * SIZE(B)
	faddp	%st, %st(5)
	FLD	-15 * SIZE(A)
	FLD	-12 * SIZE(B)
	addl	$1 * SIZE, A
	addl	$4 * SIZE, B
	decl	%eax
	jne	.L17
	ALIGN_4

/* Solve phase: the x87 stack now holds 3 pipeline leftovers on top of
   the four column accumulators.  Drop the leftovers, then perform the
   1x4 triangular solve for whichever side/transpose case was built. */
.L19:
	ffreep	%st(0)
	ffreep	%st(0)
	ffreep	%st(0)

#if defined(LN) || defined(RT)
	/* Re-point A and B at the diagonal block for this row. */
	movl	KK, %eax
#ifdef LN
	subl	$1, %eax		# 1 row of A
#else
	subl	$4, %eax		# 4 columns of B
#endif
	leal	(, %eax, SIZE), %eax
	movl	AORIG, A
	leal	(A, %eax, 1), A
	leal	(B_ORIG, %eax, 4), B
#endif

	/* rhs -= accumulated product; the staged right-hand side lives in
	   the B panel for left-sided cases, in the A panel otherwise. */
#if defined(LN) || defined(LT)
	FLD	0 * SIZE - 16 * SIZE(B)
	fsubp	%st, %st(1)
	FLD	1 * SIZE - 16 * SIZE(B)
	fsubp	%st, %st(2)
	FLD	2 * SIZE - 16 * SIZE(B)
	fsubp	%st, %st(3)
	FLD	3 * SIZE - 16 * SIZE(B)
	fsubp	%st, %st(4)
#else
	FLD	0 * SIZE - 16 * SIZE(A)
	fsubp	%st, %st(1)
	FLD	1 * SIZE - 16 * SIZE(A)
	fsubp	%st, %st(2)
	FLD	2 * SIZE - 16 * SIZE(A)
	fsubp	%st, %st(3)
	FLD	3 * SIZE - 16 * SIZE(A)
	fsubp	%st, %st(4)
#endif

#ifdef LN
	/* 1x1 diagonal: scale all four results by the (inverted) pivot. */
	FLD	0 * SIZE - 16 * SIZE(A)
	fmul	%st, %st(1)
	fmul	%st, %st(2)
	fmul	%st, %st(3)
	fmulp	%st, %st(4)
#endif

#ifdef LT
	FLD	0 * SIZE - 16 * SIZE(A)
	fmul	%st, %st(1)
	fmul	%st, %st(2)
	fmul	%st, %st(3)
	fmulp	%st, %st(4)
#endif

#ifdef RN
	/* Forward substitution against the 4x4 upper-triangular block of B
	   (diagonal entries used multiplicatively, i.e. pre-inverted). */
	FMUL	0 * SIZE - 16 * SIZE(B)
	FLD	1 * SIZE - 16 * SIZE(B)
	fmul	%st(1), %st
	fsubrp	%st, %st(2)
	FLD	2 * SIZE - 16 * SIZE(B)
	fmul	%st(1), %st
	fsubrp	%st, %st(3)
	FLD	3 * SIZE - 16 * SIZE(B)
	fmul	%st(1), %st
	fsubrp	%st, %st(4)
	FLD	5 * SIZE - 16 * SIZE(B)
	fmulp	%st, %st(2)
	FLD	6 * SIZE - 16 * SIZE(B)
	fmul	%st(2), %st
	fsubrp	%st, %st(3)
	FLD	7 * SIZE - 16 * SIZE(B)
	fmul	%st(2), %st
	fsubrp	%st, %st(4)
	FLD	10 * SIZE - 16 * SIZE(B)
	fmulp	%st, %st(3)
	FLD	11 * SIZE - 16 * SIZE(B)
	fmul	%st(3), %st
	fsubrp	%st, %st(4)
	FLD	15 * SIZE - 16 * SIZE(B)
	fmulp	%st, %st(4)
#endif

#ifdef RT
	/* Backward substitution: same block walked from the last pivot. */
	FLD	15 * SIZE - 16 * SIZE(B)
	fmulp	%st, %st(4)
	FLD	14 * SIZE - 16 * SIZE(B)
	fmul	%st(4), %st
	fsubrp	%st, %st(3)
	FLD	13 * SIZE - 16 * SIZE(B)
	fmul	%st(4), %st
	fsubrp	%st, %st(2)
	FLD	12 * SIZE - 16 * SIZE(B)
	fmul	%st(4), %st
	fsubrp	%st, %st(1)
	FLD	10 * SIZE - 16 * SIZE(B)
	fmulp	%st, %st(3)
	FLD	9 * SIZE - 16 * SIZE(B)
	fmul	%st(3), %st
	fsubrp	%st, %st(2)
	FLD	8 * SIZE - 16 * SIZE(B)
	fmul	%st(3), %st
	fsubrp	%st, %st(1)
	FLD	5 * SIZE - 16 * SIZE(B)
	fmulp	%st, %st(2)
	FLD	4 * SIZE - 16 * SIZE(B)
	fmul	%st(2), %st
	fsubrp	%st, %st(1)
	FLD	0 * SIZE - 16 * SIZE(B)
	fmulp	%st, %st(1)
#endif

#ifdef LN
	subl	$1 * SIZE, %edi		# LN writes C right-to-left
#endif

	/* Write the solved values back into the staged panel (FSTU stores
	   without popping) and then pop them out into the four C columns. */
#if defined(LN) || defined(LT)
	FSTU	0 * SIZE - 16 * SIZE(B)
	fxch	%st(1)
	FSTU	1 * SIZE - 16 * SIZE(B)
	fxch	%st(2)
	FSTU	2 * SIZE - 16 * SIZE(B)
	fxch	%st(3)
	FSTU	3 * SIZE - 16 * SIZE(B)
#else
	FSTU	0 * SIZE - 16 * SIZE(A)
	fxch	%st(1)
	FSTU	1 * SIZE - 16 * SIZE(A)
	fxch	%st(2)
	FSTU	2 * SIZE - 16 * SIZE(A)
	fxch	%st(3)
	FSTU	3 * SIZE - 16 * SIZE(A)
#endif

	leal	(%edi, LDC, 2), %eax
	FST	0 * SIZE(%eax, LDC)	# store order follows the fxch shuffle
	FST	0 * SIZE(%edi)
	FST	0 * SIZE(%edi, LDC)
	FST	0 * SIZE(%eax)

#ifndef LN
	addl	$1 * SIZE, %edi		# next C row
#endif

#if defined(LT) || defined(RN)
	/* Advance A and B past the part of K not yet consumed. */
	movl	K, %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(A, %eax, 1), A
	leal	(B, %eax, 4), B
#endif

#ifdef LN
	subl	$1, KK
#endif
#ifdef LT
	addl	$1, KK
#endif

#ifdef RT
	movl	K, %eax
	sall	$BASE_SHIFT, %eax
	addl	%eax, AORIG		# undo the per-row A step
#endif

	decl	I
	jne	.L14

/* End of panel: advance B_ORIG and KK for the next four columns. */
#ifdef LN
	movl	K, %eax
	leal	(, %eax, SIZE), %eax
	leal	(B_ORIG, %eax, 4), B_ORIG
#endif
#if defined(LT) || defined(RN)
	movl	B, B_ORIG
#endif
#ifdef RN
	addl	$4, KK
#endif
#ifdef RT
	subl	$4, KK
#endif

	decl	J
	jne	.L11
	ALIGN_4
/* ==================================================================== */
/* .L20: one 2-column panel when N has bit 1 set.  Same structure as    */
/* the 4-column path, but with a 1x2 kernel and a 2x2 solve.            */
/* ==================================================================== */
.L20:
	movl	N, %eax
	andl	$2, %eax
	je	.L30

#if defined(LT) || defined(RN)
	movl	STACK_A, A
#else
	movl	STACK_A, %eax
	movl	%eax, AORIG		# A re-derived from AORIG per row
#endif

#ifdef RT
	movl	K, %eax
	sall	$1 + BASE_SHIFT, %eax
	subl	%eax, B_ORIG		# step B back K*2 elements
#endif

	leal	(, LDC, 2), %eax	# byte span of two C columns
#ifdef RT
	subl	%eax, C
#endif
	movl	C, %edi
#ifndef RT
	addl	%eax, C
#endif

#ifdef LN
	movl	OFFSET, %eax
	addl	M, %eax
	movl	%eax, KK		# KK = M + OFFSET
#endif

#ifdef LT
	movl	OFFSET, %eax
	movl	%eax, KK		# KK = OFFSET
#endif

	movl	B_ORIG, B

	/* Pre-warm the 2-column B panel (dummy loads, one per line). */
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$4, %eax		# 16 k-steps = 32 elements per pass
	jle	.L23
	ALIGN_4

.L22:
	movl	-16 * SIZE(B), %esi
	movl	 -8 * SIZE(B), %esi
	movl	  0 * SIZE(B), %esi
	movl	  8 * SIZE(B), %esi
	subl	$-32 * SIZE, B
	decl	%eax
	jne	.L22
	ALIGN_3

.L23:
	movl	M, %esi
	movl	%esi, I			# I = rows of A still to process
	ALIGN_3

/* I loop: one row of A against the 2-wide B panel. */
.L24:
#ifdef LN
	movl	K, %eax
	sall	$BASE_SHIFT, %eax
	subl	%eax, AORIG		# step A back one row
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	leal	(, %eax, SIZE), %eax
	movl	AORIG, A
	leal	(A, %eax, 1), A		# skip the first KK k-steps
	leal	(B_ORIG, %eax, 2), B
#else
	movl	B_ORIG, B
#endif

	fldz				# two pairs of accumulators (merged
	fldz				# pairwise at .L29)
	fldz
	fldz

	FLD	-16 * SIZE(A)
	FLD	-16 * SIZE(B)

	prefetchw 1 * SIZE(%edi)
	prefetchw 1 * SIZE(%edi, LDC)

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax		# 8 k-steps per kernel pass
	je	.L26
	ALIGN_3

/* 8-way unrolled 1x2 kernel; even/odd k-steps use alternate
   accumulator pairs to break the dependency chains. */
.L25:
	fmul	%st(1), %st		# ---- k+0 ----
	faddp	%st, %st(2)
	FMUL	-15 * SIZE(B)
	faddp	%st, %st(2)
	FLD	-15 * SIZE(A)
	FLD	-14 * SIZE(B)
	fmul	%st(1), %st		# ---- k+1 ----
	faddp	%st, %st(4)
	FMUL	-13 * SIZE(B)
	faddp	%st, %st(4)
	FLD	-14 * SIZE(A)
	FLD	-12 * SIZE(B)
	fmul	%st(1), %st		# ---- k+2 ----
	faddp	%st, %st(2)
	FMUL	-11 * SIZE(B)
	faddp	%st, %st(2)
	FLD	-13 * SIZE(A)
	FLD	-10 * SIZE(B)
	fmul	%st(1), %st		# ---- k+3 ----
	faddp	%st, %st(4)
	FMUL	-9 * SIZE(B)
	faddp	%st, %st(4)
	FLD	-12 * SIZE(A)
	FLD	-8 * SIZE(B)
	fmul	%st(1), %st		# ---- k+4 ----
	faddp	%st, %st(2)
	FMUL	-7 * SIZE(B)
	faddp	%st, %st(2)
	FLD	-11 * SIZE(A)
	FLD	-6 * SIZE(B)
	fmul	%st(1), %st		# ---- k+5 ----
	faddp	%st, %st(4)
	FMUL	-5 * SIZE(B)
	faddp	%st, %st(4)
	FLD	-10 * SIZE(A)
	FLD	-4 * SIZE(B)
	fmul	%st(1), %st		# ---- k+6 ----
	faddp	%st, %st(2)
	FMUL	-3 * SIZE(B)
	faddp	%st, %st(2)
	FLD	-9 * SIZE(A)
	FLD	-2 * SIZE(B)
	fmul	%st(1), %st		# ---- k+7 ----
	faddp	%st, %st(4)
	FMUL	-1 * SIZE(B)
	faddp	%st, %st(4)
	FLD	-8 * SIZE(A)
	FLD	 0 * SIZE(B)

	addl	$ 8 * SIZE, A
	subl	$-16 * SIZE, B
	decl	%eax
	jne	.L25
	ALIGN_4

/* Tail: remaining (count mod 8) k-steps. */
.L26:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	and	$7, %eax
	je	.L29
	ALIGN_4

.L27:
	fmul	%st(1), %st
	faddp	%st, %st(2)
	FMUL	-15 * SIZE(B)
	faddp	%st, %st(2)
	FLD	-15 * SIZE(A)
	FLD	-14 * SIZE(B)
	addl	$1 * SIZE, A
	addl	$2 * SIZE, B
	decl	%eax
	jne	.L27
	ALIGN_4

/* Drop pipeline leftovers and fold the two accumulator pairs. */
.L29:
	ffreep	%st(0)
	ffreep	%st(0)
	faddp	%st, %st(2)
	faddp	%st, %st(2)

#if defined(LN) || defined(RT)
	/* Re-point A and B at the diagonal block for this row. */
	movl	KK, %eax
#ifdef LN
	subl	$1, %eax
#else
	subl	$2, %eax
#endif
	leal	(, %eax, SIZE), %eax
	movl	AORIG, A
	leal	(A, %eax, 1), A
	leal	(B_ORIG, %eax, 2), B
#endif

	/* rhs -= accumulated product. */
#if defined(LN) || defined(LT)
	FLD	0 * SIZE - 16 * SIZE(B)
	fsubp	%st, %st(1)
	FLD	1 * SIZE - 16 * SIZE(B)
	fsubp	%st, %st(2)
#else
	FLD	0 * SIZE - 16 * SIZE(A)
	fsubp	%st, %st(1)
	FLD	1 * SIZE - 16 * SIZE(A)
	fsubp	%st, %st(2)
#endif

#ifdef LN
	FLD	0 * SIZE - 16 * SIZE(A)	# 1x1 pivot scales both results
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef LT
	FLD	0 * SIZE - 16 * SIZE(A)
	fmul	%st, %st(1)
	fmulp	%st, %st(2)
#endif

#ifdef RN
	/* Forward substitution with the 2x2 upper-triangular block. */
	FMUL	0 * SIZE - 16 * SIZE(B)
	FLD	1 * SIZE - 16 * SIZE(B)
	fmul	%st(1), %st
	fsubrp	%st, %st(2)
	FLD	3 * SIZE - 16 * SIZE(B)
	fmulp	%st, %st(2)
#endif

#ifdef RT
	/* Backward substitution, starting from the last pivot. */
	FLD	3 * SIZE - 16 * SIZE(B)
	fmulp	%st, %st(2)
	FLD	2 * SIZE - 16 * SIZE(B)
	fmul	%st(2), %st
	fsubrp	%st, %st(1)
	FLD	0 * SIZE - 16 * SIZE(B)
	fmulp	%st, %st(1)
#endif

#ifdef LN
	subl	$1 * SIZE, %edi		# LN writes C right-to-left
#endif

	/* Write back into the staged panel, then pop into both C columns. */
#if defined(LN) || defined(LT)
	FSTU	0 * SIZE - 16 * SIZE(B)
	fxch	%st(1)
	FSTU	1 * SIZE - 16 * SIZE(B)
#else
	FSTU	0 * SIZE - 16 * SIZE(A)
	fxch	%st(1)
	FSTU	1 * SIZE - 16 * SIZE(A)
#endif

	FST	0 * SIZE(%edi, LDC)
	FST	0 * SIZE(%edi)

#ifndef LN
	addl	$1 * SIZE, %edi
#endif

#if defined(LT) || defined(RN)
	movl	K, %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(A, %eax, 1), A
	leal	(B, %eax, 2), B
#endif

#ifdef LN
	subl	$1, KK
#endif
#ifdef LT
	addl	$1, KK
#endif

#ifdef RT
	movl	K, %eax
	sall	$BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif

	decl	I
	jne	.L24

/* End of 2-column panel: advance B_ORIG and KK. */
#ifdef LN
	movl	K, %eax
	leal	(, %eax, SIZE), %eax
	leal	(B_ORIG, %eax, 2), B_ORIG
#endif
#if defined(LT) || defined(RN)
	movl	B, B_ORIG
#endif
#ifdef RN
	addl	$2, KK
#endif
#ifdef RT
	subl	$2, KK
#endif
	ALIGN_4
/* ==================================================================== */
/* .L30: the final single column when N is odd.  1x1 kernel with four   */
/* round-robin accumulators, then a trivial 1x1 solve.                  */
/* ==================================================================== */
.L30:
	movl	N, %eax
	andl	$1, %eax
	je	.L999
	ALIGN_3

.L31:
#if defined(LT) || defined(RN)
	movl	STACK_A, A
#else
	movl	STACK_A, %eax
	movl	%eax, AORIG		# A re-derived from AORIG per row
#endif

#ifdef RT
	movl	K, %eax
	sall	$0 + BASE_SHIFT, %eax
	subl	%eax, B_ORIG		# step B back K elements
#endif

#ifdef RT
	subl	LDC, C
#endif
	movl	C, %edi
#ifndef RT
	addl	LDC, C
#endif

#ifdef LN
	movl	OFFSET, %eax
	addl	M, %eax
	movl	%eax, KK		# KK = M + OFFSET
#endif

#ifdef LT
	movl	OFFSET, %eax
	movl	%eax, KK		# KK = OFFSET
#endif

	movl	B_ORIG, B

	/* Pre-warm the single-column B panel. */
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$5, %eax		# 32 k-steps = 32 elements per pass
	jle	.L33
	ALIGN_4

.L32:
	movl	-16 * SIZE(B), %esi
	movl	 -8 * SIZE(B), %esi
	movl	  0 * SIZE(B), %esi
	movl	  8 * SIZE(B), %esi
	subl	$-32 * SIZE, B
	decl	%eax
	jne	.L32
	ALIGN_3

.L33:
	movl	M, %esi
	movl	%esi, I			# I = rows of A still to process
	ALIGN_3

/* I loop: dot product of one A row with the single B column. */
.L34:
#ifdef LN
	movl	K, %eax
	sall	$BASE_SHIFT, %eax
	subl	%eax, AORIG		# step A back one row
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	leal	(, %eax, SIZE), %eax
	movl	AORIG, A
	leal	(A, %eax, 1), A		# skip the first KK k-steps
	leal	(B_ORIG, %eax, 1), B
#else
	movl	B_ORIG, B
#endif

	fldz				# four partial sums, filled round-robin
	fldz
	fldz
	fldz

	prefetchw 1 * SIZE(%edi)

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax		# 8 k-steps per kernel pass
	je	.L36
	ALIGN_3

/* 8-way unrolled multiply-accumulate into the four partial sums. */
.L35:
	FLD	-16 * SIZE(A)
	FMUL	-16 * SIZE(B)
	faddp	%st, %st(1)
	FLD	-15 * SIZE(A)
	FMUL	-15 * SIZE(B)
	faddp	%st, %st(2)
	FLD	-14 * SIZE(A)
	FMUL	-14 * SIZE(B)
	faddp	%st, %st(3)
	FLD	-13 * SIZE(A)
	FMUL	-13 * SIZE(B)
	faddp	%st, %st(4)
	FLD	-12 * SIZE(A)
	FMUL	-12 * SIZE(B)
	faddp	%st, %st(1)
	FLD	-11 * SIZE(A)
	FMUL	-11 * SIZE(B)
	faddp	%st, %st(2)
	FLD	-10 * SIZE(A)
	FMUL	-10 * SIZE(B)
	faddp	%st, %st(3)
	FLD	-9 * SIZE(A)
	FMUL	-9 * SIZE(B)
	faddp	%st, %st(4)
	addl	$8 * SIZE, A
	addl	$8 * SIZE, B
	decl	%eax
	jne	.L35
	ALIGN_4

/* Tail: remaining (count mod 8) k-steps. */
.L36:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	and	$7, %eax
	je	.L39
	ALIGN_4

.L37:
	FLD	-16 * SIZE(A)
	FMUL	-16 * SIZE(B)
	faddp	%st, %st(1)
	addl	$1 * SIZE, A
	addl	$1 * SIZE, B
	decl	%eax
	jne	.L37
	ALIGN_4

/* Collapse the four partial sums into one, then solve and store. */
.L39:
	faddp	%st, %st(2)
	faddp	%st, %st(2)
	faddp	%st, %st(1)

#if defined(LN) || defined(RT)
	movl	KK, %eax
	subl	$1, %eax		# diagonal element for this row/column
	movl	AORIG, A
	leal	(A, %eax, SIZE), A
	leal	(B_ORIG, %eax, SIZE), B
#endif

	/* rhs -= accumulated product. */
#if defined(LN) || defined(LT)
	FLD	0 * SIZE - 16 * SIZE(B)
	fsubp	%st, %st(1)
#else
	FLD	0 * SIZE - 16 * SIZE(A)
	fsubp	%st, %st(1)
#endif

	/* Scale by the (pre-inverted) 1x1 pivot. */
#if defined(LN) || defined(LT)
	FLD	0 * SIZE - 16 * SIZE(A)
	fmulp	%st, %st(1)
#endif

#if defined(RN) || defined(RT)
	FMUL	0 * SIZE - 16 * SIZE(B)
#endif

#ifdef LN
	subl	$1 * SIZE, %edi		# LN writes C right-to-left
#endif

	/* Write back to the staged panel, then to C. */
#if defined(LN) || defined(LT)
	FSTU	0 * SIZE - 16 * SIZE(B)
#else
	FSTU	0 * SIZE - 16 * SIZE(A)
#endif

	FST	0 * SIZE(%edi)

#ifndef LN
	addl	$1 * SIZE, %edi
#endif

#if defined(LT) || defined(RN)
	movl	K, %eax
	subl	KK, %eax
	leal	(A, %eax, SIZE), A
	leal	(B, %eax, SIZE), B
#endif

#ifdef LN
	subl	$1, KK
#endif
#ifdef LT
	addl	$1, KK
#endif

#ifdef RT
	movl	K, %eax
	sall	$BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif

	decl	I
	jne	.L34

/* End of the single-column panel. */
#ifdef LN
	movl	K, %eax
	leal	(, %eax, SIZE), %eax
	leal	(B_ORIG, %eax, 1), B_ORIG
#endif
#if defined(LT) || defined(RN)
	movl	B, B_ORIG
#endif
#ifdef RN
	addl	$1, KK
#endif
#ifdef RT
	subl	$1, KK
#endif
	ALIGN_4
/* Common exit: restore callee-saved registers (reverse push order),
   release the local frame, and return. */
.L999:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	addl	$ARGS, %esp
	ret
	EPILOGUE