You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

qtrsm_kernel_RT_2x2.S 19 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Operand names. ARG1..ARG6 are not defined in this file; presumably they
     are supplied by common.h. */
  40. #define M ARG1
  41. #define N ARG2
  42. #define K ARG3
  43. #define A ARG4
  44. #define B ARG5
  45. #define C ARG6
  /* Fixed register roles used throughout the kernel: LDC is the C column
     stride, I and J are loop counters, AO/BO/CO are the working pointers
     into A, B and C, and KK is the running offset along the K dimension. */
  46. #define LDC %r10
  47. #define I %r12
  48. #define J %r13
  49. #define AO %r14
  50. #define BO %r15
  51. #define CO %rbp
  52. #define KK %r11
  /* AORIG is a scratch slot inside the 64-byte local frame (the register
     saves occupy offsets 0..40). ALPHA and OFFSET are addressed beyond
     STACKSIZE, i.e. above the local frame. ALPHA is defined here but is not
     referenced by the visible code. */
  53. #define AORIG 48(%rsp)
  54. #define STACKSIZE 64
  55. #define ALPHA 8 + STACKSIZE(%rsp)
  56. #define OFFSET 32 + STACKSIZE(%rsp)
  /* Prefetch mnemonic selection: OPTERON builds use prefetch/prefetchw, all
     other builds use prefetcht0 for both names. Only PREFETCH is used by
     the visible code; the C prefetches are spelled out literally instead of
     going through PREFETCHW. */
  57. #ifdef OPTERON
  58. #define PREFETCH prefetch
  59. #define PREFETCHW prefetchw
  60. #else
  61. #define PREFETCH prefetcht0
  62. #define PREFETCHW prefetcht0
  63. #endif
  /* Prefetch distance for AO; it is multiplied by SIZE at each use. */
  64. #define PREFETCHSIZE (5 + 4 * 10)
  /* Kernel entry: build the local frame, save six registers, then bias the
     A/B/C pointers and seed KK for whichever solve variant (LN, LT, RN or
     RT) was selected at preprocessing time. PROLOGUE and PROFCODE are not
     defined in this file. */
  65. PROLOGUE
  66. PROFCODE
  /* The 2x2 tile loop below needs all eight x87 registers at its peak, so
     the x87 stack must be empty on entry; presumably that is why emms is
     issued here on WINDOWS_ABI builds - TODO confirm. */
  67. #ifdef WINDOWS_ABI
  68. emms
  69. #endif
  /* Reserve STACKSIZE bytes and save rbx, rbp and r12-r15 at offsets 0..40;
     offset 48 is left for AORIG. */
  70. subq $STACKSIZE, %rsp
  71. movq %rbx, 0(%rsp)
  72. movq %rbp, 8(%rsp)
  73. movq %r12, 16(%rsp)
  74. movq %r13, 24(%rsp)
  75. movq %r14, 32(%rsp)
  76. movq %r15, 40(%rsp)
  /* LDC is read from above the local frame (a stack-passed value). */
  77. movq 24 + STACKSIZE(%rsp), LDC
  /* TRMM-only path: KK = -OFFSET. */
  78. #if defined(TRMMKERNEL) && !defined(LEFT)
  79. movq OFFSET, %rax
  80. negq %rax
  81. movq %rax, KK
  82. #endif
  /* Bias A and B forward by 8 elements; every load and store through AO/BO
     below uses displacements from -8 * SIZE upward, which cancels the bias. */
  83. addq $8 * SIZE, A
  84. addq $8 * SIZE, B
  /* Scale LDC by 2^BASE_SHIFT; presumably this converts an element count
     into a byte stride (BASE_SHIFT is not defined in this file). */
  85. salq $BASE_SHIFT, LDC
  /* LN: advance C by M scaled elements and A by M * K scaled elements. */
  86. #ifdef LN
  87. movq M, %rax
  88. salq $BASE_SHIFT, %rax
  89. addq %rax, C
  90. imulq K, %rax
  91. addq %rax, A
  92. #endif
  /* RT: advance B by N * K scaled elements and C by N * LDC. The column
     loops then step both pointers backwards before each pass. */
  93. #ifdef RT
  94. movq N, %rax
  95. salq $BASE_SHIFT, %rax
  96. imulq K, %rax
  97. addq %rax, B
  98. movq N, %rax
  99. imulq LDC, %rax
  100. addq %rax, C
  101. #endif
  /* RN: KK starts at -OFFSET. */
  102. #ifdef RN
  103. movq OFFSET, %rax
  104. negq %rax
  105. movq %rax, KK
  106. #endif
  /* RT: KK starts at N - OFFSET. */
  107. #ifdef RT
  108. movq N, %rax
  109. subq OFFSET, %rax
  110. movq %rax, KK
  111. #endif
  /* When N is even there is no leftover column: skip straight to the
     column-pair loop at .L30. */
  112. movq N, %rax
  113. testq $1, %rax
  114. je .L30
  /* Set-up for the single leftover column, reached only when N is odd. */
  /* LT/RN walk A forwards from its start; the other variants remember A in
     AORIG and derive AO per tile. */
  115. #if defined(LT) || defined(RN)
  116. movq A, AO
  117. #else
  118. movq A, %rax
  119. movq %rax, AORIG
  120. #endif
  /* RT: step B back by K scaled elements (shift of 0 + BASE_SHIFT, i.e. one
     column's worth). */
  121. #ifdef RT
  122. movq K, %rax
  123. salq $0 + BASE_SHIFT, %rax
  124. subq %rax, B
  125. #endif
  /* RT moves C back by one column stride before taking CO; every other
     variant takes CO first and then advances C by one column stride. */
  126. #ifdef RT
  127. subq LDC, C
  128. #endif
  129. movq C, CO
  130. #ifndef RT
  131. addq LDC, C
  132. #endif
  /* Left-side variants restart KK for each column pass:
     LN uses OFFSET + M, LT uses OFFSET. */
  133. #ifdef LN
  134. movq OFFSET, %rax
  135. addq M, %rax
  136. movq %rax, KK
  137. #endif
  138. #ifdef LT
  139. movq OFFSET, %rax
  140. movq %rax, KK
  141. #endif
  /* I = M / 2 is the number of two-row tiles; sarq leaves ZF set when the
     result is zero, in which case only the odd-row tail at .L40 remains. */
  142. movq M, I
  143. sarq $1, I
  144. je .L40
  145. ALIGN_4
  /* .L31: one iteration per pair of rows of the single leftover column
     (a 2-row x 1-column tile). Each pass accumulates a dot product, solves
     against the packed triangular entries, writes the result to the packed
     buffer and to C, then advances the pointers and KK. */
  146. .L31:
  /* LN: step AORIG back by 2 * K scaled elements before using it. */
  147. #ifdef LN
  148. movq K, %rax
  149. salq $1 + BASE_SHIFT, %rax
  150. subq %rax, AORIG
  151. #endif
  /* LN/RT: start KK steps in (AO = AORIG + 2 * KK, BO = B + KK, scaled);
     the other variants start BO at B and keep the running AO. */
  152. #if defined(LN) || defined(RT)
  153. movq KK, %rax
  154. salq $BASE_SHIFT, %rax
  155. movq AORIG, AO
  156. leaq (AO, %rax, 2), AO
  157. leaq (B, %rax, 1), BO
  158. #else
  159. movq B, BO
  160. #endif
  /* Two zeroed accumulators on the x87 stack, one per row of the tile. */
  161. fldz
  162. fldz
  163. #if defined(HAVE_3DNOW)
  164. prefetchw 2 * SIZE(CO)
  165. #elif defined(HAVE_SSE)
  166. prefetchnta 2 * SIZE(CO)
  167. #endif
  /* Trip count: KK for LT/RN, K - KK for LN/RT. The main loop is unrolled
     four times, hence the shift by 2. */
  168. #if defined(LT) || defined(RN)
  169. movq KK, %rax
  170. #else
  171. movq K, %rax
  172. subq KK, %rax
  173. #endif
  174. sarq $2, %rax
  175. je .L35
  176. ALIGN_4
  /* .L32: four k-steps per iteration. Each step loads one B value and two
     A values and adds the two products into the two accumulators. */
  177. .L32:
  178. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  179. FLD -8 * SIZE(BO)
  180. FLD -8 * SIZE(AO)
  181. fmul %st(1), %st
  182. faddp %st, %st(2)
  183. FLD -7 * SIZE(AO)
  184. fmulp %st, %st(1)
  185. faddp %st, %st(2)
  186. FLD -7 * SIZE(BO)
  187. FLD -6 * SIZE(AO)
  188. fmul %st(1), %st
  189. faddp %st, %st(2)
  190. FLD -5 * SIZE(AO)
  191. fmulp %st, %st(1)
  192. faddp %st, %st(2)
  193. FLD -6 * SIZE(BO)
  194. FLD -4 * SIZE(AO)
  195. fmul %st(1), %st
  196. faddp %st, %st(2)
  197. FLD -3 * SIZE(AO)
  198. fmulp %st, %st(1)
  199. faddp %st, %st(2)
  200. FLD -5 * SIZE(BO)
  201. FLD -2 * SIZE(AO)
  202. fmul %st(1), %st
  203. faddp %st, %st(2)
  204. FLD -1 * SIZE(AO)
  205. fmulp %st, %st(1)
  206. faddp %st, %st(2)
  207. addq $8 * SIZE,AO
  208. addq $4 * SIZE,BO
  209. decq %rax
  210. jne .L32
  211. ALIGN_4
  /* .L35: recompute the trip count and handle the remaining (count & 3)
     k-steps one at a time in .L36. */
  212. .L35:
  213. #if defined(LT) || defined(RN)
  214. movq KK, %rax
  215. #else
  216. movq K, %rax
  217. subq KK, %rax
  218. #endif
  219. and $3, %rax
  220. je .L38
  221. ALIGN_4
  222. .L36:
  223. FLD -8 * SIZE(BO)
  224. FLD -8 * SIZE(AO)
  225. fmul %st(1), %st
  226. faddp %st, %st(2)
  227. FLD -7 * SIZE(AO)
  228. fmulp %st, %st(1)
  229. faddp %st, %st(2)
  230. addq $2 * SIZE,AO
  231. addq $1 * SIZE,BO
  232. decq %rax
  233. jne .L36
  234. ALIGN_4
  /* .L38: solve step. LN/RT first re-point AO/BO at the tile being solved:
     KK - 2 steps in for LN (two rows), KK - 1 for RT (one column). */
  235. .L38:
  236. #if defined(LN) || defined(RT)
  237. movq KK, %rax
  238. #ifdef LN
  239. subq $2, %rax
  240. #else
  241. subq $1, %rax
  242. #endif
  243. salq $BASE_SHIFT, %rax
  244. movq AORIG, AO
  245. leaq (AO, %rax, 2), AO
  246. leaq (B, %rax, 1), BO
  247. #endif
  /* Combine the two packed values (from BO for LN/LT, from AO for RN/RT)
     with the accumulators.
     NOTE(review): the AT&T spelling of the popping fsub/fsubr register forms
     is a known GNU as quirk; the intended result is presumably
     (packed value - accumulated sum) - confirm against a disassembly. */
  248. #if defined(LN) || defined(LT)
  249. FLD -8 * SIZE(BO)
  250. fsubp %st, %st(1)
  251. FLD -7 * SIZE(BO)
  252. fsubp %st, %st(2)
  253. #else
  254. FLD -8 * SIZE(AO)
  255. fsubp %st, %st(1)
  256. FLD -7 * SIZE(AO)
  257. fsubp %st, %st(2)
  258. #endif
  /* LN: second row first. Scale it by A[-5], eliminate it from the first
     row using A[-6], then scale the first row by A[-8]. The diagonal entries
     are applied by multiplication, so presumably they are stored
     pre-inverted - TODO confirm with the packing routine. */
  259. #ifdef LN
  260. FLD -5 * SIZE(AO)
  261. fmulp %st, %st(2)
  262. FLD -6 * SIZE(AO)
  263. fmul %st(2), %st
  264. fsubrp %st, %st(1)
  265. FLD -8 * SIZE(AO)
  266. fmulp %st, %st(1)
  267. #endif
  /* LT: first row first. Scale by A[-8], eliminate from the second row
     using A[-7], then scale the second row by A[-5]. */
  268. #ifdef LT
  269. FLD -8 * SIZE(AO)
  270. fmulp %st, %st(1)
  271. FLD -7 * SIZE(AO)
  272. fmul %st(1), %st
  273. fsubrp %st, %st(2)
  274. FLD -5 * SIZE(AO)
  275. fmulp %st, %st(2)
  276. #endif
  /* RN/RT with a single column: both values are scaled by the one B entry. */
  277. #ifdef RN
  278. FLD -8 * SIZE(BO)
  279. fmul %st, %st(1)
  280. fmulp %st, %st(2)
  281. #endif
  282. #ifdef RT
  283. FLD -8 * SIZE(BO)
  284. fmul %st, %st(1)
  285. fmulp %st, %st(2)
  286. #endif
  /* LN walks C backwards, so CO is moved before the store. */
  287. #ifdef LN
  288. subq $2 * SIZE, CO
  289. #endif
  /* Write the two results back into the packed buffer (BO for LN/LT, AO for
     RN/RT) and then into C. FST is not defined in this file; the stack
     bookkeeping only balances if it stores and pops - TODO confirm in
     common.h. Each fld %st duplicates the value so one copy survives for the
     store to C. */
  290. #if defined(LN) || defined(LT)
  291. fld %st
  292. FST -8 * SIZE(BO)
  293. fxch %st(1)
  294. fld %st
  295. FST -7 * SIZE(BO)
  296. #else
  297. fld %st
  298. FST -8 * SIZE(AO)
  299. fxch %st(1)
  300. fld %st
  301. FST -7 * SIZE(AO)
  302. #endif
  303. FST 1 * SIZE(CO)
  304. FST 0 * SIZE(CO)
  305. #ifndef LN
  306. addq $2 * SIZE, CO
  307. #endif
  /* LT/RN: skip the K - KK steps that were not traversed, so AO/BO end up
     past this tile's full K extent. */
  308. #if defined(LT) || defined(RN)
  309. movq K, %rax
  310. subq KK, %rax
  311. salq $BASE_SHIFT, %rax
  312. leaq (AO, %rax, 2), AO
  313. leaq (BO, %rax, 1), BO
  314. #endif
  /* Left-side variants move KK by the two rows just solved. */
  315. #ifdef LN
  316. subq $2, KK
  317. #endif
  318. #ifdef LT
  319. addq $2, KK
  320. #endif
  /* RT: advance AORIG by 2 * K scaled elements to the next row pair. */
  321. #ifdef RT
  322. movq K, %rax
  323. salq $1 + BASE_SHIFT, %rax
  324. addq %rax, AORIG
  325. #endif
  326. decq I
  327. jne .L31
  328. ALIGN_4
  /* .L40: odd-row tail of the single-column pass (a 1x1 tile). Skipped when
     M is even. */
  329. .L40:
  330. movq M, %rax
  331. andq $1, %rax
  332. je .L49
  333. ALIGN_4
  334. .L41:
  /* LN: step AORIG back by K scaled elements (one row). */
  335. #ifdef LN
  336. movq K, %rax
  337. salq $0 + BASE_SHIFT, %rax
  338. subq %rax, AORIG
  339. #endif
  /* LN/RT: start KK steps in on both operands; otherwise BO restarts at B. */
  340. #if defined(LN) || defined(RT)
  341. movq KK, %rax
  342. salq $BASE_SHIFT, %rax
  343. movq AORIG, AO
  344. leaq (AO, %rax, 1), AO
  345. leaq (B, %rax, 1), BO
  346. #else
  347. movq B, BO
  348. #endif
  /* Single zeroed accumulator. */
  349. fldz
  /* Trip count: KK for LT/RN, K - KK for LN/RT; unrolled four times. */
  350. #if defined(LT) || defined(RN)
  351. movq KK, %rax
  352. #else
  353. movq K, %rax
  354. subq KK, %rax
  355. #endif
  356. sarq $2, %rax
  357. je .L45
  358. ALIGN_4
  /* .L42: four k-steps per iteration, each adding one A * B product into
     the accumulator. */
  359. .L42:
  360. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  361. FLD -8 * SIZE(AO)
  362. FLD -8 * SIZE(BO)
  363. fmulp %st, %st(1)
  364. faddp %st, %st(1)
  365. FLD -7 * SIZE(AO)
  366. FLD -7 * SIZE(BO)
  367. fmulp %st, %st(1)
  368. faddp %st, %st(1)
  369. FLD -6 * SIZE(AO)
  370. FLD -6 * SIZE(BO)
  371. fmulp %st, %st(1)
  372. faddp %st, %st(1)
  373. FLD -5 * SIZE(AO)
  374. FLD -5 * SIZE(BO)
  375. fmulp %st, %st(1)
  376. faddp %st, %st(1)
  377. addq $4 * SIZE,AO
  378. addq $4 * SIZE,BO
  379. decq %rax
  380. jne .L42
  381. ALIGN_4
  /* .L45: remaining (count & 3) k-steps, one per iteration of .L46. */
  382. .L45:
  383. #if defined(LT) || defined(RN)
  384. movq KK, %rax
  385. #else
  386. movq K, %rax
  387. subq KK, %rax
  388. #endif
  389. and $3, %rax
  390. je .L48
  391. ALIGN_4
  392. .L46:
  393. FLD -8 * SIZE(AO)
  394. FLD -8 * SIZE(BO)
  395. fmulp %st, %st(1)
  396. faddp %st, %st(1)
  397. addq $1 * SIZE,AO
  398. addq $1 * SIZE,BO
  399. decq %rax
  400. jne .L46
  401. ALIGN_4
  /* .L48: solve step. For a 1x1 tile both the LN and the RT branch step
     back by exactly one, so the two arms of the #ifdef are identical. */
  402. .L48:
  403. #if defined(LN) || defined(RT)
  404. movq KK, %rax
  405. #ifdef LN
  406. subq $1, %rax
  407. #else
  408. subq $1, %rax
  409. #endif
  410. salq $BASE_SHIFT, %rax
  411. movq AORIG, AO
  412. leaq (AO, %rax, 1), AO
  413. leaq (B, %rax, 1), BO
  414. #endif
  /* Combine the packed value with the accumulator.
     NOTE(review): presumably (packed value - accumulated sum); see the GNU
     as fsub/fsubr AT&T quirk - confirm against a disassembly. */
  415. #if defined(LN) || defined(LT)
  416. FLD -8 * SIZE(BO)
  417. fsubp %st, %st(1)
  418. #else
  419. FLD -8 * SIZE(AO)
  420. fsubp %st, %st(1)
  421. #endif
  /* Scale by the single triangular entry: taken from AO for the left-side
     variants and from BO for the right-side variants. */
  422. #ifdef LN
  423. FLD -8 * SIZE(AO)
  424. fmulp %st, %st(1)
  425. #endif
  426. #ifdef LT
  427. FLD -8 * SIZE(AO)
  428. fmulp %st, %st(1)
  429. #endif
  430. #ifdef RN
  431. FLD -8 * SIZE(BO)
  432. fmulp %st, %st(1)
  433. #endif
  434. #ifdef RT
  435. FLD -8 * SIZE(BO)
  436. fmulp %st, %st(1)
  437. #endif
  /* LN walks C backwards, so CO is moved before the store. */
  438. #ifdef LN
  439. subq $1 * SIZE, CO
  440. #endif
  /* Store the result into the packed buffer and into C. FST is assumed to
     store and pop - TODO confirm in common.h. */
  441. #if defined(LN) || defined(LT)
  442. fld %st
  443. FST -8 * SIZE(BO)
  444. #else
  445. fld %st
  446. FST -8 * SIZE(AO)
  447. #endif
  448. FST 0 * SIZE(CO)
  449. #ifndef LN
  450. addq $1 * SIZE, CO
  451. #endif
  /* LT/RN: skip the K - KK steps that were not traversed. */
  452. #if defined(LT) || defined(RN)
  453. movq K, %rax
  454. subq KK, %rax
  455. salq $BASE_SHIFT, %rax
  456. leaq (AO, %rax, 1), AO
  457. leaq (BO, %rax, 1), BO
  458. #endif
  /* Left-side variants move KK by the one row just solved. */
  459. #ifdef LN
  460. subq $1, KK
  461. #endif
  462. #ifdef LT
  463. addq $1, KK
  464. #endif
  /* RT: advance AORIG by K scaled elements. */
  465. #ifdef RT
  466. movq K, %rax
  467. salq $0 + BASE_SHIFT, %rax
  468. addq %rax, AORIG
  469. #endif
  470. ALIGN_4
  /* .L49: end of the single-column pass. Move B past the column just
     processed and step KK by one for the right-side variants. */
  471. .L49:
  /* LN: advance B by K scaled elements. */
  472. #ifdef LN
  473. movq K, %rax
  474. salq $BASE_SHIFT, %rax
  475. leaq (B, %rax, 1), B
  476. #endif
  /* LT/RN: B continues from wherever BO ended up after the last tile. */
  477. #if defined(LT) || defined(RN)
  478. movq BO, B
  479. #endif
  480. #ifdef RN
  481. addq $1, KK
  482. #endif
  483. #ifdef RT
  484. subq $1, KK
  485. #endif
  486. ALIGN_4
  /* .L30: column-pair loop. J = N / 2 is the number of two-column passes.
     The movq between sarq and je does not modify flags, so the branch still
     tests the result of the shift. */
  487. .L30:
  488. movq N, %rax
  489. sarq $1, %rax
  490. movq %rax, J
  491. je .L999
  492. ALIGN_4
  /* .L01: per-pass set-up, mirroring the single-column set-up but with two
     columns of B and C. */
  493. .L01:
  494. #if defined(LT) || defined(RN)
  495. movq A, AO
  496. #else
  497. movq A, %rax
  498. movq %rax, AORIG
  499. #endif
  /* RT: step B back by 2 * K scaled elements. */
  500. #ifdef RT
  501. movq K, %rax
  502. salq $1 + BASE_SHIFT, %rax
  503. subq %rax, B
  504. #endif
  /* rax = 2 * LDC. RT moves C back by that amount before taking CO; the
     other variants take CO first and then advance C. */
  505. lea (, LDC, 2), %rax
  506. #ifdef RT
  507. subq %rax, C
  508. #endif
  509. movq C, CO
  510. #ifndef RT
  511. addq %rax, C
  512. #endif
  /* Left-side variants restart KK for each pass:
     LN uses OFFSET + M, LT uses OFFSET. */
  513. #ifdef LN
  514. movq OFFSET, %rax
  515. addq M, %rax
  516. movq %rax, KK
  517. #endif
  518. #ifdef LT
  519. movq OFFSET, %rax
  520. movq %rax, KK
  521. #endif
  /* I = M / 2 two-row tiles; with none, go straight to the odd-row tail. */
  522. movq M, I
  523. sarq $1, I
  524. je .L20
  525. ALIGN_4
  /* .L11: one iteration per 2-row x 2-column tile. Accumulates four dot
     products, solves the 2x2 system against the packed triangular entries,
     writes the results to the packed buffer and to two columns of C, then
     advances the pointers and KK. */
  526. .L11:
  /* LN: step AORIG back by 2 * K scaled elements before using it. */
  527. #ifdef LN
  528. movq K, %rax
  529. salq $1 + BASE_SHIFT, %rax
  530. subq %rax, AORIG
  531. #endif
  /* LN/RT: start KK steps in (two values per step on both operands);
     otherwise BO restarts at B. */
  532. #if defined(LN) || defined(RT)
  533. movq KK, %rax
  534. salq $BASE_SHIFT, %rax
  535. movq AORIG, AO
  536. leaq (AO, %rax, 2), AO
  537. leaq (B, %rax, 2), BO
  538. #else
  539. movq B, BO
  540. #endif
  /* Four zeroed accumulators, one per element of the 2x2 tile. */
  541. fldz
  542. fldz
  543. fldz
  544. fldz
  /* Prefetch the two destination columns of C. */
  545. #if defined(HAVE_3DNOW)
  546. prefetchw 2 * SIZE(CO)
  547. prefetchw 2 * SIZE(CO, LDC, 1)
  548. #elif defined(HAVE_SSE)
  549. prefetchnta 2 * SIZE(CO)
  550. prefetchnta 2 * SIZE(CO, LDC, 1)
  551. #endif
  /* Trip count: KK for LT/RN, K - KK for LN/RT; unrolled four times. */
  552. #if defined(LT) || defined(RN)
  553. movq KK, %rax
  554. #else
  555. movq K, %rax
  556. subq KK, %rax
  557. #endif
  558. sarq $2, %rax
  559. je .L15
  560. ALIGN_4
  /* .L12: four k-steps per iteration. Each step loads two A and two B
     values and adds the four pairwise products into the four accumulators.
     With the four accumulators resident, the step peaks at eight live x87
     registers (after the second A load), so nothing else may be left on
     the x87 stack while this loop runs. */
  561. .L12:
  562. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  563. FLD -8 * SIZE(AO)
  564. FLD -8 * SIZE(BO)
  565. fld %st(1)
  566. fmul %st(1), %st
  567. faddp %st, %st(3)
  568. FLD -7 * SIZE(BO)
  569. fmul %st, %st(2)
  570. FLD -7 * SIZE(AO)
  571. fmul %st, %st(2)
  572. fmulp %st, %st(1)
  573. faddp %st, %st(6)
  574. faddp %st, %st(4)
  575. faddp %st, %st(2)
  576. FLD -6 * SIZE(AO)
  577. FLD -6 * SIZE(BO)
  578. fld %st(1)
  579. fmul %st(1), %st
  580. faddp %st, %st(3)
  581. FLD -5 * SIZE(BO)
  582. fmul %st, %st(2)
  583. FLD -5 * SIZE(AO)
  584. fmul %st, %st(2)
  585. fmulp %st, %st(1)
  586. faddp %st, %st(6)
  587. faddp %st, %st(4)
  588. faddp %st, %st(2)
  589. PREFETCH (PREFETCHSIZE + 4) * SIZE(AO)
  590. FLD -4 * SIZE(AO)
  591. FLD -4 * SIZE(BO)
  592. fld %st(1)
  593. fmul %st(1), %st
  594. faddp %st, %st(3)
  595. FLD -3 * SIZE(BO)
  596. fmul %st, %st(2)
  597. FLD -3 * SIZE(AO)
  598. fmul %st, %st(2)
  599. fmulp %st, %st(1)
  600. faddp %st, %st(6)
  601. faddp %st, %st(4)
  602. faddp %st, %st(2)
  603. FLD -2 * SIZE(AO)
  604. FLD -2 * SIZE(BO)
  605. fld %st(1)
  606. fmul %st(1), %st
  607. faddp %st, %st(3)
  608. FLD -1 * SIZE(BO)
  609. fmul %st, %st(2)
  610. FLD -1 * SIZE(AO)
  611. fmul %st, %st(2)
  612. fmulp %st, %st(1)
  613. faddp %st, %st(6)
  614. faddp %st, %st(4)
  615. faddp %st, %st(2)
  616. addq $8 * SIZE,AO
  617. addq $8 * SIZE,BO
  618. decq %rax
  619. jne .L12
  620. ALIGN_4
  /* .L15: remaining (count & 3) k-steps, one per iteration of .L16, using
     the same instruction pattern as a single step of .L12. */
  621. .L15:
  622. #if defined(LT) || defined(RN)
  623. movq KK, %rax
  624. #else
  625. movq K, %rax
  626. subq KK, %rax
  627. #endif
  628. and $3, %rax
  629. je .L18
  630. ALIGN_4
  631. .L16:
  632. FLD -8 * SIZE(AO)
  633. FLD -8 * SIZE(BO)
  634. fld %st(1)
  635. fmul %st(1), %st
  636. faddp %st, %st(3)
  637. FLD -7 * SIZE(BO)
  638. fmul %st, %st(2)
  639. FLD -7 * SIZE(AO)
  640. fmul %st, %st(2)
  641. fmulp %st, %st(1)
  642. faddp %st, %st(6)
  643. faddp %st, %st(4)
  644. faddp %st, %st(2)
  645. addq $2 * SIZE,AO
  646. addq $2 * SIZE,BO
  647. decq %rax
  648. jne .L16
  649. ALIGN_4
  /* .L18: solve step. For a 2x2 tile both the LN and the RT branch step
     back by two, so the two arms of the #ifdef are identical. */
  650. .L18:
  651. #if defined(LN) || defined(RT)
  652. movq KK, %rax
  653. #ifdef LN
  654. subq $2, %rax
  655. #else
  656. subq $2, %rax
  657. #endif
  658. salq $BASE_SHIFT, %rax
  659. movq AORIG, AO
  660. leaq (AO, %rax, 2), AO
  661. leaq (B, %rax, 2), BO
  662. #endif
  /* Combine the four packed values (from BO for LN/LT, from AO for RN/RT)
     with the accumulators. The two branches pair the second and third
     loads with different stack slots, matching the different packing order
     of the two buffers.
     NOTE(review): presumably (packed value - accumulated sum); see the GNU
     as fsub/fsubr AT&T quirk - confirm against a disassembly. */
  663. #if defined(LN) || defined(LT)
  664. FLD -8 * SIZE(BO)
  665. fsubp %st, %st(1)
  666. FLD -7 * SIZE(BO)
  667. fsubp %st, %st(2)
  668. FLD -6 * SIZE(BO)
  669. fsubp %st, %st(3)
  670. FLD -5 * SIZE(BO)
  671. fsubp %st, %st(4)
  672. #else
  673. FLD -8 * SIZE(AO)
  674. fsubp %st, %st(1)
  675. FLD -7 * SIZE(AO)
  676. fsubp %st, %st(3)
  677. FLD -6 * SIZE(AO)
  678. fsubp %st, %st(2)
  679. FLD -5 * SIZE(AO)
  680. fsubp %st, %st(4)
  681. #endif
  /* LN: scale one pair by A[-5], eliminate it from the other pair using
     A[-6], then scale that pair by A[-8]. Diagonal entries are applied by
     multiplication, so presumably they are stored pre-inverted - TODO
     confirm with the packing routine. */
  682. #ifdef LN
  683. FLD -5 * SIZE(AO)
  684. fmul %st, %st(3)
  685. fmulp %st, %st(4)
  686. FLD -6 * SIZE(AO)
  687. fmul %st(3), %st
  688. FLD -6 * SIZE(AO)
  689. fmul %st(5), %st
  690. fsubrp %st, %st(3)
  691. fsubrp %st, %st(1)
  692. FLD -8 * SIZE(AO)
  693. fmul %st, %st(1)
  694. fmulp %st, %st(2)
  695. #endif
  /* LT: the mirror order - scale by A[-8], eliminate using A[-7], then
     scale by A[-5]. */
  696. #ifdef LT
  697. FLD -8 * SIZE(AO)
  698. fmul %st, %st(1)
  699. fmulp %st, %st(2)
  700. FLD -7 * SIZE(AO)
  701. fmul %st(1), %st
  702. FLD -7 * SIZE(AO)
  703. fmul %st(3), %st
  704. fsubrp %st, %st(5)
  705. fsubrp %st, %st(3)
  706. FLD -5 * SIZE(AO)
  707. fmul %st, %st(3)
  708. fmulp %st, %st(4)
  709. #endif
  /* RN: same shape as LT but with the triangular entries read from BO:
     B[-8], then B[-7], then B[-5]. */
  710. #ifdef RN
  711. FLD -8 * SIZE(BO)
  712. fmul %st, %st(1)
  713. fmulp %st, %st(3)
  714. FLD -7 * SIZE(BO)
  715. fmul %st(1), %st
  716. FLD -7 * SIZE(BO)
  717. fmul %st(4), %st
  718. fsubrp %st, %st(5)
  719. fsubrp %st, %st(2)
  720. FLD -5 * SIZE(BO)
  721. fmul %st, %st(2)
  722. fmulp %st, %st(4)
  723. #endif
  /* RT: same shape as LN but with the triangular entries read from BO:
     B[-5], then B[-6], then B[-8]. */
  724. #ifdef RT
  725. FLD -5 * SIZE(BO)
  726. fmul %st, %st(2)
  727. fmulp %st, %st(4)
  728. FLD -6 * SIZE(BO)
  729. fmul %st(2), %st
  730. FLD -6 * SIZE(BO)
  731. fmul %st(5), %st
  732. fsubrp %st, %st(4)
  733. fsubrp %st, %st(1)
  734. FLD -8 * SIZE(BO)
  735. fmul %st, %st(1)
  736. fmulp %st, %st(3)
  737. #endif
  /* LN walks C backwards, so CO is moved before the store. */
  738. #ifdef LN
  739. subq $2 * SIZE, CO
  740. #endif
  /* Write the four results into the packed buffer and then into C at CO
     and CO + LDC. Each fld %st duplicates a value so one copy survives for
     the store to C; the fxch sequence rotates the next value to the top.
     The order of the four C stores differs between the two branches because
     the fxch sequences leave the values in a different stack order. FST is
     assumed to store and pop - TODO confirm in common.h. */
  741. #if defined(LN) || defined(LT)
  742. fld %st
  743. FST -8 * SIZE(BO)
  744. fxch %st(1)
  745. fld %st
  746. FST -7 * SIZE(BO)
  747. fxch %st(2)
  748. fld %st
  749. FST -6 * SIZE(BO)
  750. fxch %st(3)
  751. fld %st
  752. FST -5 * SIZE(BO)
  753. FST 1 * SIZE(CO, LDC)
  754. FST 0 * SIZE(CO)
  755. FST 0 * SIZE(CO, LDC)
  756. FST 1 * SIZE(CO)
  757. #else
  758. fld %st
  759. FST -8 * SIZE(AO)
  760. fxch %st(2)
  761. fld %st
  762. FST -7 * SIZE(AO)
  763. fxch %st(1)
  764. fld %st
  765. FST -6 * SIZE(AO)
  766. fxch %st(3)
  767. fld %st
  768. FST -5 * SIZE(AO)
  769. FST 1 * SIZE(CO, LDC)
  770. FST 1 * SIZE(CO)
  771. FST 0 * SIZE(CO)
  772. FST 0 * SIZE(CO, LDC)
  773. #endif
  774. #ifndef LN
  775. addq $2 * SIZE, CO
  776. #endif
  /* LT/RN: skip the K - KK steps that were not traversed (two values per
     step on both operands). */
  777. #if defined(LT) || defined(RN)
  778. movq K, %rax
  779. subq KK, %rax
  780. salq $BASE_SHIFT, %rax
  781. leaq (AO, %rax, 2), AO
  782. leaq (BO, %rax, 2), BO
  783. #endif
  /* Left-side variants move KK by the two rows just solved. */
  784. #ifdef LN
  785. subq $2, KK
  786. #endif
  787. #ifdef LT
  788. addq $2, KK
  789. #endif
  /* RT: advance AORIG by 2 * K scaled elements to the next row pair. */
  790. #ifdef RT
  791. movq K, %rax
  792. salq $1 + BASE_SHIFT, %rax
  793. addq %rax, AORIG
  794. #endif
  795. decq I
  796. jne .L11
  797. ALIGN_4
  /* .L20: odd-row tail of a column-pair pass (a 1-row x 2-column tile).
     Skipped when M is even. */
  798. .L20:
  799. movq M, %rax
  800. andq $1, %rax
  801. je .L29
  802. ALIGN_4
  803. .L21:
  /* LN: step AORIG back by K scaled elements (one row). */
  804. #ifdef LN
  805. movq K, %rax
  806. salq $0 + BASE_SHIFT, %rax
  807. subq %rax, AORIG
  808. #endif
  /* LN/RT: start KK steps in (one A value and two B values per step);
     otherwise BO restarts at B. */
  809. #if defined(LN) || defined(RT)
  810. movq KK, %rax
  811. salq $BASE_SHIFT, %rax
  812. movq AORIG, AO
  813. leaq (AO, %rax, 1), AO
  814. leaq (B, %rax, 2), BO
  815. #else
  816. movq B, BO
  817. #endif
  /* Two zeroed accumulators, one per column of the tile. */
  818. fldz
  819. fldz
  /* Trip count: KK for LT/RN, K - KK for LN/RT; unrolled four times. */
  820. #if defined(LT) || defined(RN)
  821. movq KK, %rax
  822. #else
  823. movq K, %rax
  824. subq KK, %rax
  825. #endif
  826. sarq $2, %rax
  827. je .L25
  828. ALIGN_4
  /* .L22: four k-steps per iteration. Each step loads one A value and two
     B values and adds the two products into the two accumulators. */
  829. .L22:
  830. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  831. FLD -8 * SIZE(AO)
  832. FLD -8 * SIZE(BO)
  833. fmul %st(1), %st
  834. faddp %st, %st(2)
  835. FLD -7 * SIZE(BO)
  836. fmulp %st, %st(1)
  837. faddp %st, %st(2)
  838. FLD -7 * SIZE(AO)
  839. FLD -6 * SIZE(BO)
  840. fmul %st(1), %st
  841. faddp %st, %st(2)
  842. FLD -5 * SIZE(BO)
  843. fmulp %st, %st(1)
  844. faddp %st, %st(2)
  845. FLD -6 * SIZE(AO)
  846. FLD -4 * SIZE(BO)
  847. fmul %st(1), %st
  848. faddp %st, %st(2)
  849. FLD -3 * SIZE(BO)
  850. fmulp %st, %st(1)
  851. faddp %st, %st(2)
  852. FLD -5 * SIZE(AO)
  853. FLD -2 * SIZE(BO)
  854. fmul %st(1), %st
  855. faddp %st, %st(2)
  856. FLD -1 * SIZE(BO)
  857. fmulp %st, %st(1)
  858. faddp %st, %st(2)
  859. addq $4 * SIZE,AO
  860. addq $8 * SIZE,BO
  861. decq %rax
  862. jne .L22
  863. ALIGN_4
  /* .L25: remaining (count & 3) k-steps, one per iteration of .L26. */
  864. .L25:
  865. #if defined(LT) || defined(RN)
  866. movq KK, %rax
  867. #else
  868. movq K, %rax
  869. subq KK, %rax
  870. #endif
  871. and $3, %rax
  872. je .L28
  873. ALIGN_4
  874. .L26:
  875. FLD -8 * SIZE(AO)
  876. FLD -8 * SIZE(BO)
  877. fmul %st(1), %st
  878. faddp %st, %st(2)
  879. FLD -7 * SIZE(BO)
  880. fmulp %st, %st(1)
  881. faddp %st, %st(2)
  882. addq $1 * SIZE,AO
  883. addq $2 * SIZE,BO
  884. decq %rax
  885. jne .L26
  886. ALIGN_4
  /* .L28: solve step. LN steps back by one (one row); RT steps back by two
     (two columns). */
  887. .L28:
  888. #if defined(LN) || defined(RT)
  889. movq KK, %rax
  890. #ifdef LN
  891. subq $1, %rax
  892. #else
  893. subq $2, %rax
  894. #endif
  895. salq $BASE_SHIFT, %rax
  896. movq AORIG, AO
  897. leaq (AO, %rax, 1), AO
  898. leaq (B, %rax, 2), BO
  899. #endif
  /* Combine the two packed values with the accumulators.
     NOTE(review): presumably (packed value - accumulated sum); see the GNU
     as fsub/fsubr AT&T quirk - confirm against a disassembly. */
  900. #if defined(LN) || defined(LT)
  901. FLD -8 * SIZE(BO)
  902. fsubp %st, %st(1)
  903. FLD -7 * SIZE(BO)
  904. fsubp %st, %st(2)
  905. #else
  906. FLD -8 * SIZE(AO)
  907. fsubp %st, %st(1)
  908. FLD -7 * SIZE(AO)
  909. fsubp %st, %st(2)
  910. #endif
  /* Left-side variants with a single row: both values are scaled by the one
     A entry. */
  911. #if defined(LN) || defined(LT)
  912. FLD -8 * SIZE(AO)
  913. fmul %st, %st(1)
  914. fmulp %st, %st(2)
  915. #endif
  /* RN: scale the first value by B[-8], eliminate it from the second using
     B[-7], then scale the second by B[-5]. */
  916. #ifdef RN
  917. FLD -8 * SIZE(BO)
  918. fmulp %st, %st(1)
  919. FLD -7 * SIZE(BO)
  920. fmul %st(1), %st
  921. fsubrp %st, %st(2)
  922. FLD -5 * SIZE(BO)
  923. fmulp %st, %st(2)
  924. #endif
  /* RT: the mirror order - scale the second value by B[-5], eliminate it
     from the first using B[-6], then scale the first by B[-8]. */
  925. #ifdef RT
  926. FLD -5 * SIZE(BO)
  927. fmulp %st, %st(2)
  928. FLD -6 * SIZE(BO)
  929. fmul %st(2), %st
  930. fsubrp %st, %st(1)
  931. FLD -8 * SIZE(BO)
  932. fmulp %st, %st(1)
  933. #endif
  /* LN walks C backwards, so CO is moved before the store. */
  934. #ifdef LN
  935. subq $1 * SIZE, CO
  936. #endif
  /* Write the two results into the packed buffer and then into C, one per
     column (CO + LDC and CO). FST is assumed to store and pop - TODO confirm
     in common.h. */
  937. #if defined(LN) || defined(LT)
  938. fld %st
  939. FST -8 * SIZE(BO)
  940. fxch %st(1)
  941. fld %st
  942. FST -7 * SIZE(BO)
  943. #else
  944. fld %st
  945. FST -8 * SIZE(AO)
  946. fxch %st(1)
  947. fld %st
  948. FST -7 * SIZE(AO)
  949. #endif
  950. FST 0 * SIZE(CO, LDC)
  951. FST 0 * SIZE(CO)
  952. #ifndef LN
  953. addq $1 * SIZE, CO
  954. #endif
  /* LT/RN: skip the K - KK steps that were not traversed. */
  955. #if defined(LT) || defined(RN)
  956. movq K, %rax
  957. subq KK, %rax
  958. salq $BASE_SHIFT, %rax
  959. leaq (AO, %rax, 1), AO
  960. leaq (BO, %rax, 2), BO
  961. #endif
  /* Left-side variants move KK by the one row just solved. */
  962. #ifdef LN
  963. subq $1, KK
  964. #endif
  965. #ifdef LT
  966. addq $1, KK
  967. #endif
  /* RT: advance AORIG by K scaled elements. */
  968. #ifdef RT
  969. movq K, %rax
  970. salq $0 + BASE_SHIFT, %rax
  971. addq %rax, AORIG
  972. #endif
  973. ALIGN_4
  /* .L29: end of one column-pair pass. Move B past the two columns just
     processed, step KK by two for the right-side variants, and loop while
     J is non-zero. */
  974. .L29:
  /* LN: advance B by 2 * K scaled elements. */
  975. #ifdef LN
  976. movq K, %rax
  977. salq $BASE_SHIFT, %rax
  978. leaq (B, %rax, 2), B
  979. #endif
  /* LT/RN: B continues from wherever BO ended up after the last tile. */
  980. #if defined(LT) || defined(RN)
  981. movq BO, B
  982. #endif
  983. #ifdef RN
  984. addq $2, KK
  985. #endif
  986. #ifdef RT
  987. subq $2, KK
  988. #endif
  989. decq J
  990. jne .L01
  991. ALIGN_4
  /* .L999: common exit. Reload the six registers saved on entry from the
     same frame offsets, release the STACKSIZE-byte frame and return.
     EPILOGUE is not defined in this file. */
  992. .L999:
  993. movq 0(%rsp), %rbx
  994. movq 8(%rsp), %rbp
  995. movq 16(%rsp), %r12
  996. movq 24(%rsp), %r13
  997. movq 32(%rsp), %r14
  998. movq 40(%rsp), %r15
  999. addq $STACKSIZE, %rsp
  1000. ret
  1001. EPILOGUE