You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

trsm_kernel_RT_2x2_atom.S 20 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* ASSEMBLER is defined before common.h is included. */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* STACK = 16 bytes for the four 32-bit registers pushed in the prologue;
     ARGS = 16 bytes of scratch reserved with "subl $ARGS, %esp". */
  40. #define STACK 16
  41. #define ARGS 16
  /* Incoming arguments, addressed relative to %esp after the prologue.
     ALPHA spans 8 bytes (the next argument starts at offset 24); it is not
     referenced in the code visible in this file. */
  42. #define M 4 + STACK + ARGS(%esp)
  43. #define N 8 + STACK + ARGS(%esp)
  44. #define K 12 + STACK + ARGS(%esp)
  45. #define ALPHA 16 + STACK + ARGS(%esp)
  46. #define A 24 + STACK + ARGS(%esp)
  47. #define ARG_B 28 + STACK + ARGS(%esp)
  48. #define C 32 + STACK + ARGS(%esp)
  49. #define ARG_LDC 36 + STACK + ARGS(%esp)
  50. #define OFFSET 40 + STACK + ARGS(%esp)
  /* Local variables living in the ARGS scratch area.
     J = column-pair loop counter, KK = running offset counter,
     AORIG = saved base pointer of A.  KKK is defined but not referenced in
     the code visible in this file. */
  51. #define J 0 + STACK(%esp)
  52. #define KK 4 + STACK(%esp)
  53. #define KKK 8 + STACK(%esp)
  54. #define AORIG 12 + STACK(%esp)
  /* Prefetch instruction and look-ahead distance (in elements) used on AA. */
  55. #define PREFETCH prefetcht0
  56. #define PREFETCHSIZE 84
  /* Register roles: AA / BB are the walking pointers into the A and B data,
     LDC is the leading dimension of C (scaled to bytes in the prologue),
     B is the base pointer of the current B data, CO1 is the C write pointer. */
  57. #define AA %edx
  58. #define BB %ecx
  59. #define LDC %ebp
  60. #define B %edi
  61. #define CO1 %esi
  /* Entry: reserve ARGS bytes of scratch, save %ebp/%edi/%esi/%ebx, then load
     the B pointer, ldc and the initial KK value (OFFSET, negated for RN). */
  62. PROLOGUE
  63. subl $ARGS, %esp
  64. pushl %ebp
  65. pushl %edi
  66. pushl %esi
  67. pushl %ebx
  68. PROFCODE
  69. movl ARG_B, B
  70. movl ARG_LDC, LDC
  71. movl OFFSET, %eax
  72. #ifdef RN
  73. negl %eax
  74. #endif
  75. movl %eax, KK
  /* LDC = ldc * SIZE, i.e. the column stride of C in bytes. */
  76. leal (, LDC, SIZE), LDC
  /* LN: move the C argument forward by M*SIZE bytes and the A argument by
     M*K*SIZE bytes (both stack slots are updated in place). */
  77. #ifdef LN
  78. movl M, %eax
  79. leal (, %eax, SIZE), %eax
  80. addl %eax, C
  81. imull K, %eax
  82. addl %eax, A
  83. #endif
  /* RT: move B forward by N*K*SIZE bytes and C forward by N*LDC bytes. */
  84. #ifdef RT
  85. movl N, %eax
  86. leal (, %eax, SIZE), %eax
  87. imull K, %eax
  88. addl %eax, B
  89. movl N, %eax
  90. imull LDC, %eax
  91. addl %eax, C
  92. #endif
  /* RT: KK = N - OFFSET (replaces the value stored above). */
  93. #ifdef RT
  94. movl N, %eax
  95. subl OFFSET, %eax
  96. movl %eax, KK
  97. #endif
  /* If N is even, skip the single-column pass and go straight to .L30. */
  98. testl $1, N
  99. je .L30
  /* Single-column pass (N odd): set up the A pointer, B, C and KK. */
  /* LT/RN walk A directly through AA; LN/RT keep the base in AORIG. */
  100. #if defined(LT) || defined(RN)
  101. movl A, AA
  102. #else
  103. movl A, %eax
  104. movl %eax, AORIG
  105. #endif
  /* RT: step B back by K << BASE_SHIFT bytes. */
  106. #ifdef RT
  107. movl K, %eax
  108. sall $BASE_SHIFT, %eax
  109. subl %eax, B
  110. #endif
  /* RT steps C back one column before use; the other variants use C as is
     and advance it by one column afterwards.  CO1 = column being written. */
  111. #ifdef RT
  112. subl LDC, C
  113. #endif
  114. movl C, CO1
  115. #ifndef RT
  116. addl LDC, C
  117. #endif
  /* LN: KK = OFFSET + M;  LT: KK = OFFSET. */
  118. #ifdef LN
  119. movl OFFSET, %eax
  120. addl M, %eax
  121. movl %eax, KK
  122. #endif
  123. #ifdef LT
  124. movl OFFSET, %eax
  125. movl %eax, KK
  126. #endif
  /* ebx = M >> 1 = number of row pairs; skip to .L40 when there are none. */
  127. movl M, %ebx
  128. sarl $1, %ebx
  129. jle .L40
  130. ALIGN_4
  /* 2-row x 1-column tile; ebx counts the remaining row pairs. */
  131. .L31:
  /* LN: step AORIG back by K << (1 + BASE_SHIFT) bytes. */
  132. #ifdef LN
  133. movl K, %eax
  134. sall $1 + BASE_SHIFT, %eax
  135. subl %eax, AORIG
  136. #endif
  /* LN/RT: AA = AORIG + 2*KK*SIZE bytes. */
  137. #if defined(LN) || defined(RT)
  138. movl KK, %eax
  139. movl AORIG, AA
  140. leal (, %eax, SIZE), %eax
  141. leal (AA, %eax, 2), AA
  142. #endif
  /* BB = B, plus KK << BASE_SHIFT bytes for LN/RT. */
  143. movl B, BB
  144. #if defined(LN) || defined(RT)
  145. movl KK, %eax
  146. sall $BASE_SHIFT, %eax
  147. addl %eax, BB
  148. #endif
  /* Preload the first B value and clear the product registers (xmm0, xmm2)
     and the accumulators (xmm4, xmm6). */
  149. movsd 0 * SIZE(BB), %xmm1
  150. xorps %xmm0, %xmm0
  151. prefetcht0 3 * SIZE(CO1)
  152. xorps %xmm2, %xmm2
  153. xorps %xmm4, %xmm4
  154. xorps %xmm6, %xmm6
  /* Trip count: KK for LT/RN, K - KK for LN/RT; the main loop runs count/4 times. */
  155. #if defined(LT) || defined(RN)
  156. movl KK, %eax
  157. #else
  158. movl K, %eax
  159. subl KK, %eax
  160. #endif
  161. sarl $2, %eax
  162. je .L35
  163. ALIGN_4
  /* Main loop, four steps per pass.  Each step forms AA[0]*b and AA[1]*b;
     a product is added to xmm4 / xmm6 at the start of the following step. */
  164. .L32:
  165. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  166. addsd %xmm0, %xmm4
  167. movsd 0 * SIZE(AA), %xmm0
  168. addsd %xmm2, %xmm6
  169. movsd 1 * SIZE(AA), %xmm2
  170. mulsd %xmm1, %xmm0
  171. mulsd %xmm1, %xmm2
  172. movsd 1 * SIZE(BB), %xmm1
  173. addsd %xmm0, %xmm4
  174. movsd 2 * SIZE(AA), %xmm0
  175. addsd %xmm2, %xmm6
  176. movsd 3 * SIZE(AA), %xmm2
  177. mulsd %xmm1, %xmm0
  178. mulsd %xmm1, %xmm2
  179. movsd 2 * SIZE(BB), %xmm1
  180. addsd %xmm0, %xmm4
  181. movsd 4 * SIZE(AA), %xmm0
  182. addsd %xmm2, %xmm6
  183. movsd 5 * SIZE(AA), %xmm2
  184. mulsd %xmm1, %xmm0
  185. mulsd %xmm1, %xmm2
  186. movsd 3 * SIZE(BB), %xmm1
  187. addsd %xmm0, %xmm4
  188. movsd 6 * SIZE(AA), %xmm0
  189. addsd %xmm2, %xmm6
  190. movsd 7 * SIZE(AA), %xmm2
  191. mulsd %xmm1, %xmm0
  192. mulsd %xmm1, %xmm2
  193. movsd 4 * SIZE(BB), %xmm1
  194. addl $8 * SIZE, AA
  195. addl $4 * SIZE, BB
  196. decl %eax
  197. jne .L32
  198. ALIGN_4
  /* Remainder: recompute the trip count and keep its low two bits. */
  199. .L35:
  200. #if defined(LT) || defined(RN)
  201. movl KK, %eax
  202. #else
  203. movl K, %eax
  204. subl KK, %eax
  205. #endif
  206. andl $3, %eax # eax = trip count & 3 (leftover steps)
  207. BRANCH
  208. je .L38
  209. ALIGN_3
  /* One step per pass for the leftover 1..3 steps. */
  210. .L36:
  211. addsd %xmm0, %xmm4
  212. movsd 0 * SIZE(AA), %xmm0
  213. addsd %xmm2, %xmm6
  214. movsd 1 * SIZE(AA), %xmm2
  215. mulsd %xmm1, %xmm0
  216. mulsd %xmm1, %xmm2
  217. movsd 1 * SIZE(BB), %xmm1
  218. addl $2 * SIZE, AA
  219. addl $1 * SIZE, BB
  220. decl %eax
  221. jg .L36
  222. ALIGN_4
  /* Add the last pending products into the accumulators. */
  223. .L38:
  224. addsd %xmm0, %xmm4
  225. addsd %xmm2, %xmm6
  /* LN/RT: re-point AA and BB at the tile being solved, using
     eax = KK - 2 (LN) or KK - 1 (RT). */
  226. #if defined(LN) || defined(RT)
  227. movl KK, %eax
  228. #ifdef LN
  229. subl $2, %eax
  230. #else
  231. subl $1, %eax
  232. #endif
  233. movl AORIG, AA
  234. leal (, %eax, SIZE), %eax
  235. leal (AA, %eax, 2), AA
  236. leal (B, %eax, 1), BB
  237. #endif
  /* Load the two right-hand-side values (from BB for LN/LT, from AA for
     RN/RT) and subtract the accumulated products. */
  238. #if defined(LN) || defined(LT)
  239. movsd 0 * SIZE(BB), %xmm0
  240. movsd 1 * SIZE(BB), %xmm2
  241. subsd %xmm4, %xmm0
  242. subsd %xmm6, %xmm2
  243. #else
  244. movsd 0 * SIZE(AA), %xmm0
  245. movsd 1 * SIZE(AA), %xmm2
  246. subsd %xmm4, %xmm0
  247. subsd %xmm6, %xmm2
  248. #endif
  /* NOTE(review): the diagonal entries below are applied with mulsd, not
     divsd; presumably they are stored pre-inverted by the packing code --
     confirm against the caller. */
  /* LN: xmm2 *= AA[3]; xmm0 = (xmm0 - AA[2]*xmm2) * AA[0]. */
  249. #ifdef LN
  250. movsd 3 * SIZE(AA), %xmm4
  251. mulsd %xmm4, %xmm2
  252. movsd 2 * SIZE(AA), %xmm5
  253. mulsd %xmm2, %xmm5
  254. movsd 0 * SIZE(AA), %xmm7
  255. subsd %xmm5, %xmm0
  256. mulsd %xmm7, %xmm0
  257. #endif
  /* LT: xmm0 *= AA[0]; xmm2 = (xmm2 - AA[1]*xmm0) * AA[3]. */
  258. #ifdef LT
  259. movsd 0 * SIZE(AA), %xmm4
  260. mulsd %xmm4, %xmm0
  261. movsd 1 * SIZE(AA), %xmm5
  262. mulsd %xmm0, %xmm5
  263. movsd 3 * SIZE(AA), %xmm7
  264. subsd %xmm5, %xmm2
  265. mulsd %xmm7, %xmm2
  266. #endif
  /* RN/RT: scale both values by BB[0]. */
  267. #if defined(RN) || defined(RT)
  268. movsd 0 * SIZE(BB), %xmm4
  269. mulsd %xmm4, %xmm0
  270. mulsd %xmm4, %xmm2
  271. #endif
  /* Write the solved values back to the buffer they were loaded from. */
  272. #if defined(LN) || defined(LT)
  273. movsd %xmm0, 0 * SIZE(BB)
  274. movsd %xmm2, 1 * SIZE(BB)
  275. #else
  276. movsd %xmm0, 0 * SIZE(AA)
  277. movsd %xmm2, 1 * SIZE(AA)
  278. #endif
  /* Store to C.  LN moves CO1 back two elements before the store; the other
     variants advance it by two after the store. */
  279. #ifdef LN
  280. subl $2 * SIZE, CO1
  281. #endif
  282. movsd %xmm0, 0 * SIZE(CO1)
  283. movsd %xmm2, 1 * SIZE(CO1)
  284. #ifndef LN
  285. addl $2 * SIZE, CO1
  286. #endif
  /* LT/RN: skip the remaining K - KK steps: AA += 2*(K-KK)*SIZE, BB += (K-KK)*SIZE. */
  287. #if defined(LT) || defined(RN)
  288. movl K, %eax
  289. subl KK, %eax
  290. leal (,%eax, SIZE), %eax
  291. leal (AA, %eax, 2), AA
  292. addl %eax, BB
  293. #endif
  /* Per-tile bookkeeping: KK -= 2 (LN), KK += 2 (LT), AORIG forward by
     K << (1 + BASE_SHIFT) bytes (RT). */
  294. #ifdef LN
  295. subl $2, KK
  296. #endif
  297. #ifdef LT
  298. addl $2, KK
  299. #endif
  300. #ifdef RT
  301. movl K, %eax
  302. sall $1 + BASE_SHIFT, %eax
  303. addl %eax, AORIG
  304. #endif
  305. decl %ebx # i --
  306. jg .L31
  307. ALIGN_4
  /* 1-row x 1-column tile, executed only when M is odd. */
  308. .L40:
  309. movl M, %ebx
  310. testl $1, %ebx # is M odd?  (zero result -> no leftover row)
  311. jle .L49
  /* LN: step AORIG back by K << BASE_SHIFT bytes. */
  312. #ifdef LN
  313. movl K, %eax
  314. sall $BASE_SHIFT, %eax
  315. subl %eax, AORIG
  316. #endif
  /* LN/RT: AA = AORIG + KK*SIZE bytes. */
  317. #if defined(LN) || defined(RT)
  318. movl KK, %eax
  319. movl AORIG, AA
  320. leal (AA, %eax, SIZE), AA
  321. #endif
  /* BB = B, plus KK << BASE_SHIFT bytes for LN/RT. */
  322. movl B, BB
  323. #if defined(LN) || defined(RT)
  324. movl KK, %eax
  325. sall $BASE_SHIFT, %eax
  326. addl %eax, BB
  327. #endif
  /* Preload AA[0] and BB[0]; clear accumulators xmm4 and xmm5.  xmm2 is
     zeroed and then immediately overwritten by the load from BB. */
  328. movsd 0 * SIZE(AA), %xmm0
  329. xorps %xmm2, %xmm2
  330. movsd 0 * SIZE(BB), %xmm2
  331. xorps %xmm3, %xmm3
  332. xorps %xmm4, %xmm4
  333. xorps %xmm5, %xmm5
  /* Trip count: KK for LT/RN, K - KK for LN/RT; the main loop runs count/4 times. */
  334. #if defined(LT) || defined(RN)
  335. movl KK, %eax
  336. #else
  337. movl K, %eax
  338. subl KK, %eax
  339. #endif
  340. sarl $2, %eax
  341. je .L45
  342. ALIGN_4
  /* Main loop, four steps per pass; products alternate between xmm4 and xmm5. */
  343. .L42:
  344. mulsd %xmm0, %xmm2
  345. movsd 1 * SIZE(AA), %xmm0
  346. addsd %xmm2, %xmm4
  347. movsd 1 * SIZE(BB), %xmm2
  348. mulsd %xmm0, %xmm2
  349. movsd 2 * SIZE(AA), %xmm0
  350. addsd %xmm2, %xmm5
  351. movsd 2 * SIZE(BB), %xmm2
  352. mulsd %xmm0, %xmm2
  353. movsd 3 * SIZE(AA), %xmm0
  354. addsd %xmm2, %xmm4
  355. movsd 3 * SIZE(BB), %xmm2
  356. mulsd %xmm0, %xmm2
  357. movsd 4 * SIZE(AA), %xmm0
  358. addsd %xmm2, %xmm5
  359. movsd 4 * SIZE(BB), %xmm2
  360. addl $4 * SIZE, AA
  361. addl $4 * SIZE, BB
  362. decl %eax
  363. jne .L42
  364. ALIGN_4
  /* Remainder: recompute the trip count and keep its low two bits. */
  365. .L45:
  366. #if defined(LT) || defined(RN)
  367. movl KK, %eax
  368. #else
  369. movl K, %eax
  370. subl KK, %eax
  371. #endif
  372. andl $3, %eax # eax = trip count & 3 (leftover steps)
  373. BRANCH
  374. je .L48
  375. ALIGN_3
  /* One step per pass for the leftover 1..3 steps, accumulating into xmm4. */
  376. .L46:
  377. mulsd %xmm0, %xmm2
  378. movsd 1 * SIZE(AA), %xmm0
  379. addsd %xmm2, %xmm4
  380. movsd 1 * SIZE(BB), %xmm2
  381. addl $1 * SIZE, AA
  382. addl $1 * SIZE, BB
  383. decl %eax
  384. jg .L46
  385. ALIGN_4
  /* Combine the two partial sums into xmm4. */
  386. .L48:
  387. addsd %xmm5, %xmm4
  /* LN/RT: re-point AA and BB at the element being solved (eax = KK - 1 in
     both branches). */
  388. #if defined(LN) || defined(RT)
  389. movl KK, %eax
  390. #ifdef LN
  391. subl $1, %eax
  392. #else
  393. subl $1, %eax
  394. #endif
  395. movl AORIG, AA
  396. leal (, %eax, SIZE), %eax
  397. addl %eax, AA
  398. leal (B, %eax, 1), BB
  399. #endif
  /* Load the right-hand-side value (from BB for LN/LT, from AA for RN/RT)
     and subtract the accumulated sum. */
  400. #if defined(LN) || defined(LT)
  401. movsd 0 * SIZE(BB), %xmm0
  402. subsd %xmm4, %xmm0
  403. #else
  404. movsd 0 * SIZE(AA), %xmm0
  405. subsd %xmm4, %xmm0
  406. #endif
  /* Scale by the single triangular entry: AA[0] for LN/LT, BB[0] for RN/RT.
     NOTE(review): multiplied rather than divided; presumably the entry is
     stored pre-inverted -- confirm against the packing code. */
  407. #if defined(LN) || defined(LT)
  408. mulsd 0 * SIZE(AA), %xmm0
  409. #endif
  410. #if defined(RN) || defined(RT)
  411. mulsd 0 * SIZE(BB), %xmm0
  412. #endif
  /* Write the solved value back to the buffer it was loaded from. */
  413. #if defined(LN) || defined(LT)
  414. movsd %xmm0, 0 * SIZE(BB)
  415. #else
  416. movsd %xmm0, 0 * SIZE(AA)
  417. #endif
  /* Store to C; LN pre-decrements CO1, the other variants post-increment it. */
  418. #ifdef LN
  419. subl $1 * SIZE, CO1
  420. #endif
  421. movsd %xmm0, 0 * SIZE(CO1)
  422. #ifndef LN
  423. addl $1 * SIZE, CO1
  424. #endif
  /* LT/RN: skip the remaining K - KK steps in both AA and BB. */
  425. #if defined(LT) || defined(RN)
  426. movl K, %eax
  427. subl KK, %eax
  428. leal (,%eax, SIZE), %eax
  429. addl %eax, AA
  430. addl %eax, BB
  431. #endif
  /* Per-tile bookkeeping: KK -= 1 (LN), KK += 1 (LT), AORIG forward by
     K << BASE_SHIFT bytes (RT). */
  432. #ifdef LN
  433. subl $1, KK
  434. #endif
  435. #ifdef LT
  436. addl $1, KK
  437. #endif
  438. #ifdef RT
  439. movl K, %eax
  440. sall $BASE_SHIFT, %eax
  441. addl %eax, AORIG
  442. #endif
  443. ALIGN_4
  /* End of the single-column pass: advance B (LN: by K*SIZE bytes; LT/RN: to
     where BB stopped) and step KK by one column (RN: +1, RT: -1). */
  444. .L49:
  445. #ifdef LN
  446. movl K, %eax
  447. leal (B, %eax, SIZE), B
  448. #endif
  449. #if defined(LT) || defined(RN)
  450. movl BB, B
  451. #endif
  452. #ifdef RN
  453. addl $1, KK
  454. #endif
  455. #ifdef RT
  456. subl $1, KK
  457. #endif
  458. ALIGN_4
  /* Column-pair pass: J = N >> 1.  The jle tests the flags left by sarl
     (movl does not modify flags), so the routine exits when J <= 0. */
  459. .L30:
  460. movl N, %eax
  461. sarl $1, %eax
  462. movl %eax, J
  463. jle .L999
  464. ALIGN_2
  /* Top of the loop over column pairs. */
  465. .L10:
  /* LT/RN walk A directly through AA; LN/RT keep the base in AORIG. */
  466. #if defined(LT) || defined(RN)
  467. movl A, AA
  468. #else
  469. movl A, %eax
  470. movl %eax, AORIG
  471. #endif
  /* RT: step B back by K << (1 + BASE_SHIFT) bytes. */
  472. #ifdef RT
  473. movl K, %eax
  474. sall $1 + BASE_SHIFT, %eax
  475. subl %eax, B
  476. #endif
  /* eax = 2 * LDC.  RT steps C back by that amount before use; the other
     variants use C as is and advance it afterwards.  CO1 = first of the two
     columns being written. */
  477. leal (, LDC, 2), %eax
  478. #ifdef RT
  479. subl %eax, C
  480. #endif
  481. movl C, CO1
  482. #ifndef RT
  483. addl %eax, C
  484. #endif
  /* LN: KK = OFFSET + M;  LT: KK = OFFSET. */
  485. #ifdef LN
  486. movl OFFSET, %eax
  487. addl M, %eax
  488. movl %eax, KK
  489. #endif
  490. #ifdef LT
  491. movl OFFSET, %eax
  492. movl %eax, KK
  493. #endif
  /* ebx = M >> 1 = number of row pairs; skip to .L20 when there are none. */
  494. movl M, %ebx
  495. sarl $1, %ebx
  496. jle .L20
  497. ALIGN_4
  /* 2-row x 2-column tile; ebx counts the remaining row pairs. */
  498. .L11:
  /* LN: step AORIG back by K << (1 + BASE_SHIFT) bytes. */
  499. #ifdef LN
  500. movl K, %eax
  501. sall $1 + BASE_SHIFT, %eax
  502. subl %eax, AORIG
  503. #endif
  /* LN/RT: AA = AORIG + 2*KK*SIZE bytes. */
  504. #if defined(LN) || defined(RT)
  505. movl KK, %eax
  506. movl AORIG, AA
  507. leal (, %eax, SIZE), %eax
  508. leal (AA, %eax, 2), AA
  509. #endif
  /* BB = B, plus KK << (1 + BASE_SHIFT) bytes for LN/RT. */
  510. movl B, BB
  511. #if defined(LN) || defined(RT)
  512. movl KK, %eax
  513. sall $1 + BASE_SHIFT, %eax
  514. addl %eax, BB
  515. #endif
  /* Preload AA[0]; clear the pending products (xmm2, xmm3) and the four
     accumulators: xmm4 = a0*b0, xmm5 = a0*b1, xmm6 = a1*b0, xmm7 = a1*b1.
     Both output columns are prefetched. */
  516. movsd 0 * SIZE(AA), %xmm0
  517. xorps %xmm2, %xmm2
  518. xorps %xmm3, %xmm3
  519. xorps %xmm4, %xmm4
  520. prefetcht0 3 * SIZE(CO1)
  521. xorps %xmm5, %xmm5
  522. prefetcht0 3 * SIZE(CO1, LDC)
  523. xorps %xmm6, %xmm6
  524. xorps %xmm7, %xmm7
  /* Trip count: KK for LT/RN, K - KK for LN/RT; the main loop runs count/4 times. */
  525. #if defined(LT) || defined(RN)
  526. movl KK, %eax
  527. #else
  528. movl K, %eax
  529. subl KK, %eax
  530. #endif
  531. sarl $2, %eax
  532. je .L15
  533. ALIGN_4
  /* Main loop, four steps per pass.  The a1 products held in xmm2 / xmm3 are
     added to xmm6 / xmm7 during the following step. */
  534. .L12:
  535. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  536. addsd %xmm2, %xmm6
  537. movsd 1 * SIZE(AA), %xmm2
  538. movaps %xmm0, %xmm1
  539. mulsd 0 * SIZE(BB), %xmm0
  540. addsd %xmm3, %xmm7
  541. mulsd 1 * SIZE(BB), %xmm1
  542. addsd %xmm0, %xmm4
  543. movsd 2 * SIZE(AA), %xmm0
  544. movaps %xmm2, %xmm3
  545. mulsd 0 * SIZE(BB), %xmm2
  546. addsd %xmm1, %xmm5
  547. mulsd 1 * SIZE(BB), %xmm3
  548. addsd %xmm2, %xmm6
  549. movsd 3 * SIZE(AA), %xmm2
  550. movaps %xmm0, %xmm1
  551. mulsd 2 * SIZE(BB), %xmm0
  552. addsd %xmm3, %xmm7
  553. mulsd 3 * SIZE(BB), %xmm1
  554. addsd %xmm0, %xmm4
  555. movsd 4 * SIZE(AA), %xmm0
  556. movaps %xmm2, %xmm3
  557. mulsd 2 * SIZE(BB), %xmm2
  558. addsd %xmm1, %xmm5
  559. mulsd 3 * SIZE(BB), %xmm3
  560. addsd %xmm2, %xmm6
  561. movsd 5 * SIZE(AA), %xmm2
  562. movaps %xmm0, %xmm1
  563. mulsd 4 * SIZE(BB), %xmm0
  564. addsd %xmm3, %xmm7
  565. mulsd 5 * SIZE(BB), %xmm1
  566. addsd %xmm0, %xmm4
  567. movsd 6 * SIZE(AA), %xmm0
  568. movaps %xmm2, %xmm3
  569. mulsd 4 * SIZE(BB), %xmm2
  570. addsd %xmm1, %xmm5
  571. mulsd 5 * SIZE(BB), %xmm3
  572. addsd %xmm2, %xmm6
  573. movsd 7 * SIZE(AA), %xmm2
  574. movaps %xmm0, %xmm1
  575. mulsd 6 * SIZE(BB), %xmm0
  576. addsd %xmm3, %xmm7
  577. mulsd 7 * SIZE(BB), %xmm1
  578. addsd %xmm0, %xmm4
  579. movsd 8 * SIZE(AA), %xmm0
  580. movaps %xmm2, %xmm3
  581. mulsd 6 * SIZE(BB), %xmm2
  582. addsd %xmm1, %xmm5
  583. mulsd 7 * SIZE(BB), %xmm3
  584. addl $8 * SIZE, BB
  585. addl $8 * SIZE, AA
  586. decl %eax
  587. jne .L12
  588. ALIGN_4
  /* Remainder: recompute the trip count and keep its low two bits. */
  589. .L15:
  590. #if defined(LT) || defined(RN)
  591. movl KK, %eax
  592. #else
  593. movl K, %eax
  594. subl KK, %eax
  595. #endif
  596. andl $3, %eax # eax = trip count & 3 (leftover steps)
  597. BRANCH
  598. je .L18
  599. ALIGN_3
  /* One step per pass for the leftover 1..3 steps. */
  600. .L16:
  601. addsd %xmm2, %xmm6
  602. movsd 1 * SIZE(AA), %xmm2
  603. movaps %xmm0, %xmm1
  604. mulsd 0 * SIZE(BB), %xmm0
  605. addsd %xmm3, %xmm7
  606. mulsd 1 * SIZE(BB), %xmm1
  607. addsd %xmm0, %xmm4
  608. movsd 2 * SIZE(AA), %xmm0
  609. movaps %xmm2, %xmm3
  610. mulsd 0 * SIZE(BB), %xmm2
  611. addsd %xmm1, %xmm5
  612. mulsd 1 * SIZE(BB), %xmm3
  613. addl $2 * SIZE, AA
  614. addl $2 * SIZE, BB
  615. decl %eax
  616. jg .L16
  617. ALIGN_4
  /* Add the last pending a1 products into xmm6 / xmm7. */
  618. .L18:
  619. addsd %xmm2, %xmm6
  620. addsd %xmm3, %xmm7
  /* LN/RT: re-point AA and BB at the tile being solved (eax = KK - 2 in both
     branches). */
  621. #if defined(LN) || defined(RT)
  622. movl KK, %eax
  623. #ifdef LN
  624. subl $2, %eax
  625. #else
  626. subl $2, %eax
  627. #endif
  628. movl AORIG, AA
  629. leal (, %eax, SIZE), %eax
  630. leal (AA, %eax, 2), AA
  631. leal (B, %eax, 2), BB
  632. #endif
  /* Load the four right-hand-side values and subtract the accumulators.
     LN/LT read BB in the order xmm0, xmm1, xmm2, xmm3; RN/RT read AA in the
     order xmm0, xmm2, xmm1, xmm3.  Either way xmm0/xmm2 end up in the first
     C column and xmm1/xmm3 in the second (see the stores below). */
  633. #if defined(LN) || defined(LT)
  634. movsd 0 * SIZE(BB), %xmm0
  635. movsd 1 * SIZE(BB), %xmm1
  636. movsd 2 * SIZE(BB), %xmm2
  637. movsd 3 * SIZE(BB), %xmm3
  638. subsd %xmm4, %xmm0
  639. subsd %xmm5, %xmm1
  640. subsd %xmm6, %xmm2
  641. subsd %xmm7, %xmm3
  642. #else
  643. movsd 0 * SIZE(AA), %xmm0
  644. movsd 1 * SIZE(AA), %xmm2
  645. movsd 2 * SIZE(AA), %xmm1
  646. movsd 3 * SIZE(AA), %xmm3
  647. subsd %xmm4, %xmm0
  648. subsd %xmm6, %xmm2
  649. subsd %xmm5, %xmm1
  650. subsd %xmm7, %xmm3
  651. #endif
  /* NOTE(review): the diagonal entries below are applied with mulsd, not
     divsd; presumably they are stored pre-inverted by the packing code --
     confirm against the caller. */
  /* LN: xmm2,xmm3 *= AA[3]; xmm0 = (xmm0 - AA[2]*xmm2) * AA[0];
         xmm1 = (xmm1 - AA[2]*xmm3) * AA[0]. */
  652. #ifdef LN
  653. movsd 3 * SIZE(AA), %xmm4
  654. mulsd %xmm4, %xmm2
  655. movsd 2 * SIZE(AA), %xmm5
  656. mulsd %xmm4, %xmm3
  657. movsd 0 * SIZE(AA), %xmm7
  658. movaps %xmm5, %xmm6
  659. mulsd %xmm2, %xmm5
  660. mulsd %xmm3, %xmm6
  661. subsd %xmm5, %xmm0
  662. subsd %xmm6, %xmm1
  663. mulsd %xmm7, %xmm0
  664. mulsd %xmm7, %xmm1
  665. #endif
  /* LT: xmm0,xmm1 *= AA[0]; xmm2 = (xmm2 - AA[1]*xmm0) * AA[3];
         xmm3 = (xmm3 - AA[1]*xmm1) * AA[3]. */
  666. #ifdef LT
  667. movsd 0 * SIZE(AA), %xmm4
  668. mulsd %xmm4, %xmm0
  669. movsd 1 * SIZE(AA), %xmm5
  670. mulsd %xmm4, %xmm1
  671. movsd 3 * SIZE(AA), %xmm7
  672. movaps %xmm5, %xmm6
  673. mulsd %xmm0, %xmm5
  674. mulsd %xmm1, %xmm6
  675. subsd %xmm5, %xmm2
  676. subsd %xmm6, %xmm3
  677. mulsd %xmm7, %xmm2
  678. mulsd %xmm7, %xmm3
  679. #endif
  /* RN: xmm0,xmm2 *= BB[0]; xmm1 = (xmm1 - BB[1]*xmm0) * BB[3];
         xmm3 = (xmm3 - BB[1]*xmm2) * BB[3]. */
  680. #ifdef RN
  681. movsd 0 * SIZE(BB), %xmm4
  682. mulsd %xmm4, %xmm0
  683. movsd 1 * SIZE(BB), %xmm5
  684. mulsd %xmm4, %xmm2
  685. movsd 3 * SIZE(BB), %xmm7
  686. movaps %xmm5, %xmm6
  687. mulsd %xmm0, %xmm5
  688. mulsd %xmm2, %xmm6
  689. subsd %xmm5, %xmm1
  690. subsd %xmm6, %xmm3
  691. mulsd %xmm7, %xmm1
  692. mulsd %xmm7, %xmm3
  693. #endif
  /* RT: xmm1,xmm3 *= BB[3]; xmm0 = (xmm0 - BB[2]*xmm1) * BB[0];
         xmm2 = (xmm2 - BB[2]*xmm3) * BB[0]. */
  694. #ifdef RT
  695. movsd 3 * SIZE(BB), %xmm4
  696. mulsd %xmm4, %xmm1
  697. movsd 2 * SIZE(BB), %xmm5
  698. mulsd %xmm4, %xmm3
  699. movsd 0 * SIZE(BB), %xmm7
  700. movaps %xmm5, %xmm6
  701. mulsd %xmm1, %xmm5
  702. mulsd %xmm3, %xmm6
  703. subsd %xmm5, %xmm0
  704. subsd %xmm6, %xmm2
  705. mulsd %xmm7, %xmm0
  706. mulsd %xmm7, %xmm2
  707. #endif
  /* Write the solved values back to the buffer they were loaded from, in the
     same register order as the loads above. */
  708. #if defined(LN) || defined(LT)
  709. movsd %xmm0, 0 * SIZE(BB)
  710. movsd %xmm1, 1 * SIZE(BB)
  711. movsd %xmm2, 2 * SIZE(BB)
  712. movsd %xmm3, 3 * SIZE(BB)
  713. #else
  714. movsd %xmm0, 0 * SIZE(AA)
  715. movsd %xmm2, 1 * SIZE(AA)
  716. movsd %xmm1, 2 * SIZE(AA)
  717. movsd %xmm3, 3 * SIZE(AA)
  718. #endif
  /* Store the 2x2 result to C: xmm0/xmm2 into the column at CO1, xmm1/xmm3
     into the column at CO1 + LDC.  LN pre-decrements CO1 by two elements;
     the other variants post-increment it. */
  719. #ifdef LN
  720. subl $2 * SIZE, CO1
  721. #endif
  722. movsd %xmm0, 0 * SIZE(CO1)
  723. movsd %xmm2, 1 * SIZE(CO1)
  724. movsd %xmm1, 0 * SIZE(CO1, LDC)
  725. movsd %xmm3, 1 * SIZE(CO1, LDC)
  726. #ifndef LN
  727. addl $2 * SIZE, CO1
  728. #endif
  /* LT/RN: skip the remaining K - KK steps: both AA and BB advance by
     2*(K-KK)*SIZE bytes. */
  729. #if defined(LT) || defined(RN)
  730. movl K, %eax
  731. subl KK, %eax
  732. leal (,%eax, SIZE), %eax
  733. leal (AA, %eax, 2), AA
  734. leal (BB, %eax, 2), BB
  735. #endif
  /* Per-tile bookkeeping: KK -= 2 (LN), KK += 2 (LT), AORIG forward by
     K << (1 + BASE_SHIFT) bytes (RT). */
  736. #ifdef LN
  737. subl $2, KK
  738. #endif
  739. #ifdef LT
  740. addl $2, KK
  741. #endif
  742. #ifdef RT
  743. movl K, %eax
  744. sall $1 + BASE_SHIFT, %eax
  745. addl %eax, AORIG
  746. #endif
  747. decl %ebx # i --
  748. jg .L11
  749. ALIGN_4
  /* 1-row x 2-column tile, executed only when M is odd. */
  750. .L20:
  751. movl M, %ebx
  752. testl $1, %ebx # is M odd?  (zero result -> no leftover row)
  753. jle .L29
  /* LN: step AORIG back by K << BASE_SHIFT bytes. */
  754. #ifdef LN
  755. movl K, %eax
  756. sall $BASE_SHIFT, %eax
  757. subl %eax, AORIG
  758. #endif
  /* LN/RT: AA = AORIG + KK*SIZE bytes. */
  759. #if defined(LN) || defined(RT)
  760. movl KK, %eax
  761. movl AORIG, AA
  762. leal (AA, %eax, SIZE), AA
  763. #endif
  /* BB = B, plus KK << (1 + BASE_SHIFT) bytes for LN/RT. */
  764. movl B, BB
  765. #if defined(LN) || defined(RT)
  766. movl KK, %eax
  767. sall $1 + BASE_SHIFT, %eax
  768. addl %eax, BB
  769. #endif
  /* Preload AA[0]; clear the pending products (xmm2, xmm3) and the
     accumulators xmm4 = a*b0 and xmm5 = a*b1. */
  770. movsd 0 * SIZE(AA), %xmm0
  771. xorps %xmm2, %xmm2
  772. xorps %xmm3, %xmm3
  773. xorps %xmm4, %xmm4
  774. xorps %xmm5, %xmm5
  /* Trip count: KK for LT/RN, K - KK for LN/RT; the main loop runs count/4 times. */
  775. #if defined(LT) || defined(RN)
  776. movl KK, %eax
  777. #else
  778. movl K, %eax
  779. subl KK, %eax
  780. #endif
  781. sarl $2, %eax
  782. je .L25
  783. ALIGN_4
  /* Main loop, four steps per pass; each product is added to its accumulator
     at the start of the following step. */
  784. .L22:
  785. addsd %xmm2, %xmm4
  786. movsd 0 * SIZE(BB), %xmm2
  787. addsd %xmm3, %xmm5
  788. movsd 1 * SIZE(BB), %xmm3
  789. mulsd %xmm0, %xmm2
  790. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  791. mulsd %xmm0, %xmm3
  792. movsd 1 * SIZE(AA), %xmm0
  793. addsd %xmm2, %xmm4
  794. movsd 2 * SIZE(BB), %xmm2
  795. addsd %xmm3, %xmm5
  796. movsd 3 * SIZE(BB), %xmm3
  797. mulsd %xmm0, %xmm2
  798. mulsd %xmm0, %xmm3
  799. movsd 2 * SIZE(AA), %xmm0
  800. addsd %xmm2, %xmm4
  801. movsd 4 * SIZE(BB), %xmm2
  802. addsd %xmm3, %xmm5
  803. movsd 5 * SIZE(BB), %xmm3
  804. mulsd %xmm0, %xmm2
  805. mulsd %xmm0, %xmm3
  806. movsd 3 * SIZE(AA), %xmm0
  807. addsd %xmm2, %xmm4
  808. movsd 6 * SIZE(BB), %xmm2
  809. addsd %xmm3, %xmm5
  810. movsd 7 * SIZE(BB), %xmm3
  811. mulsd %xmm0, %xmm2
  812. mulsd %xmm0, %xmm3
  813. movsd 4 * SIZE(AA), %xmm0
  814. addl $4 * SIZE, AA
  815. addl $8 * SIZE, BB
  816. decl %eax
  817. jne .L22
  818. ALIGN_4
  /* Remainder: recompute the trip count and keep its low two bits. */
  819. .L25:
  820. #if defined(LT) || defined(RN)
  821. movl KK, %eax
  822. #else
  823. movl K, %eax
  824. subl KK, %eax
  825. #endif
  826. andl $3, %eax # eax = trip count & 3 (leftover steps)
  827. BRANCH
  828. je .L28
  829. ALIGN_3
  /* One step per pass for the leftover 1..3 steps. */
  830. .L26:
  831. addsd %xmm2, %xmm4
  832. movsd 0 * SIZE(BB), %xmm2
  833. addsd %xmm3, %xmm5
  834. movsd 1 * SIZE(BB), %xmm3
  835. mulsd %xmm0, %xmm2
  836. mulsd %xmm0, %xmm3
  837. movsd 1 * SIZE(AA), %xmm0
  838. addl $1 * SIZE, AA
  839. addl $2 * SIZE, BB
  840. decl %eax
  841. jg .L26
  842. ALIGN_4
  /* Add the last pending products into the accumulators. */
  843. .L28:
  844. addsd %xmm2, %xmm4
  845. addsd %xmm3, %xmm5
  /* LN/RT: re-point AA and BB at the tile being solved, using
     eax = KK - 1 (LN) or KK - 2 (RT). */
  846. #if defined(LN) || defined(RT)
  847. movl KK, %eax
  848. #ifdef LN
  849. subl $1, %eax
  850. #else
  851. subl $2, %eax
  852. #endif
  853. movl AORIG, AA
  854. leal (, %eax, SIZE), %eax
  855. leal (AA, %eax, 1), AA
  856. leal (B, %eax, 2), BB
  857. #endif
  /* Load the two right-hand-side values (from BB for LN/LT, from AA for
     RN/RT) and subtract the accumulated products. */
  858. #if defined(LN) || defined(LT)
  859. movsd 0 * SIZE(BB), %xmm0
  860. movsd 1 * SIZE(BB), %xmm1
  861. subsd %xmm4, %xmm0
  862. subsd %xmm5, %xmm1
  863. #else
  864. movsd 0 * SIZE(AA), %xmm0
  865. movsd 1 * SIZE(AA), %xmm1
  866. subsd %xmm4, %xmm0
  867. subsd %xmm5, %xmm1
  868. #endif
  /* NOTE(review): the diagonal entries below are applied with mulsd, not
     divsd; presumably they are stored pre-inverted by the packing code --
     confirm against the caller. */
  /* LN/LT: scale both values by AA[0]. */
  869. #if defined(LN) || defined(LT)
  870. movsd 0 * SIZE(AA), %xmm7
  871. mulsd %xmm7, %xmm0
  872. mulsd %xmm7, %xmm1
  873. #endif
  /* RN: xmm0 *= BB[0]; xmm1 = (xmm1 - BB[1]*xmm0) * BB[3].
     The copy into xmm6 is not read again in this tile. */
  874. #ifdef RN
  875. movsd 0 * SIZE(BB), %xmm4
  876. mulsd %xmm4, %xmm0
  877. movsd 1 * SIZE(BB), %xmm5
  878. movaps %xmm5, %xmm6
  879. movsd 3 * SIZE(BB), %xmm7
  880. mulsd %xmm0, %xmm5
  881. subsd %xmm5, %xmm1
  882. mulsd %xmm7, %xmm1
  883. #endif
  /* RT: xmm1 *= BB[3]; xmm0 = (xmm0 - BB[2]*xmm1) * BB[0].
     The copy into xmm6 is not read again in this tile. */
  884. #ifdef RT
  885. movsd 3 * SIZE(BB), %xmm4
  886. mulsd %xmm4, %xmm1
  887. movsd 2 * SIZE(BB), %xmm5
  888. movaps %xmm5, %xmm6
  889. movsd 0 * SIZE(BB), %xmm7
  890. mulsd %xmm1, %xmm5
  891. subsd %xmm5, %xmm0
  892. mulsd %xmm7, %xmm0
  893. #endif
  /* Write the solved values back to the buffer they were loaded from. */
  894. #if defined(LN) || defined(LT)
  895. movsd %xmm0, 0 * SIZE(BB)
  896. movsd %xmm1, 1 * SIZE(BB)
  897. #else
  898. movsd %xmm0, 0 * SIZE(AA)
  899. movsd %xmm1, 1 * SIZE(AA)
  900. #endif
  /* Store to C: xmm0 into the column at CO1, xmm1 into the column at
     CO1 + LDC.  LN pre-decrements CO1; the other variants post-increment it. */
  901. #ifdef LN
  902. subl $1 * SIZE, CO1
  903. #endif
  904. movsd %xmm0, 0 * SIZE(CO1)
  905. movsd %xmm1, 0 * SIZE(CO1, LDC)
  906. #ifndef LN
  907. addl $1 * SIZE, CO1
  908. #endif
  /* LT/RN: skip the remaining K - KK steps: AA += (K-KK)*SIZE,
     BB += 2*(K-KK)*SIZE. */
  909. #if defined(LT) || defined(RN)
  910. movl K, %eax
  911. subl KK, %eax
  912. leal (,%eax, SIZE), %eax
  913. leal (AA, %eax, 1), AA
  914. leal (BB, %eax, 2), BB
  915. #endif
  /* Per-tile bookkeeping: KK -= 1 (LN), KK += 1 (LT), AORIG forward by
     K << BASE_SHIFT bytes (RT). */
  916. #ifdef LN
  917. subl $1, KK
  918. #endif
  919. #ifdef LT
  920. addl $1, KK
  921. #endif
  922. #ifdef RT
  923. movl K, %eax
  924. sall $BASE_SHIFT, %eax
  925. addl %eax, AORIG
  926. #endif
  927. ALIGN_4
  /* End of one column-pair iteration: advance B (LN: by 2*K*SIZE bytes;
     LT/RN: to where BB stopped), step KK by two columns (RN: +2, RT: -2),
     then loop while the J counter stays positive. */
  928. .L29:
  929. #ifdef LN
  930. movl K, %eax
  931. leal (, %eax, SIZE), %eax
  932. leal (B, %eax, 2), B
  933. #endif
  934. #if defined(LT) || defined(RN)
  935. movl BB, B
  936. #endif
  937. #ifdef RN
  938. addl $2, KK
  939. #endif
  940. #ifdef RT
  941. subl $2, KK
  942. #endif
  943. decl J # j --
  944. jg .L10
  945. ALIGN_4
  /* Exit: restore the saved registers in reverse push order, release the
     ARGS scratch area and return. */
  946. .L999:
  947. popl %ebx
  948. popl %esi
  949. popl %edi
  950. popl %ebp
  951. addl $ARGS, %esp
  952. ret
  953. EPILOGUE