You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

trsm_kernel_LN_2x2_atom.S 20 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Stack frame, relative to %esp once the prologue has run: STACK is the */
  /* 16 bytes taken by the four pushl register saves and ARGS the 16 bytes */
  /* reserved by "subl $ARGS, %esp" for locals. Caller arguments lie above */
  /* both, starting 4 bytes up (offset 0 there is the return address). */
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define STACK 16
  41. #define ARGS 16
  /* Incoming arguments. ALPHA is given 8 bytes (the next argument starts */
  /* at offset 24) and is not referenced anywhere in this routine. */
  42. #define M 4 + STACK + ARGS(%esp)
  43. #define N 8 + STACK + ARGS(%esp)
  44. #define K 12 + STACK + ARGS(%esp)
  45. #define ALPHA 16 + STACK + ARGS(%esp)
  46. #define A 24 + STACK + ARGS(%esp)
  47. #define ARG_B 28 + STACK + ARGS(%esp)
  48. #define C 32 + STACK + ARGS(%esp)
  49. #define ARG_LDC 36 + STACK + ARGS(%esp)
  50. #define OFFSET 40 + STACK + ARGS(%esp)
  /* Locals stored in the ARGS area: J counts the two-column passes, KK is */
  /* the running offset used to size the inner loops, AORIG holds the base */
  /* of the current A panel. KKK is defined but not used in this routine. */
  51. #define J 0 + STACK(%esp)
  52. #define KK 4 + STACK(%esp)
  53. #define KKK 8 + STACK(%esp)
  54. #define AORIG 12 + STACK(%esp)
  /* Prefetch instruction and look-ahead distance; the distance is scaled */
  /* by SIZE where it is used, i.e. it is expressed in elements. */
  55. #define PREFETCH prefetcht0
  56. #define PREFETCHSIZE 84
  /* Register roles: AA and BB are the pointers walked by the inner loops, */
  /* LDC is the (scaled) column stride of C, B the base of the current B */
  /* panel and CO1 the output pointer into C. */
  57. #define AA %edx
  58. #define BB %ecx
  59. #define LDC %ebp
  60. #define B %edi
  61. #define CO1 %esi
  /* Entry: reserve ARGS bytes for locals, then save %ebp, %edi, %esi and */
  /* %ebx (all four are restored at .L999). */
  /* NOTE(review): LN / LT / RN / RT are build-time symbols that select one */
  /* of four variants of this kernel; presumably left/right side and */
  /* non-transposed/transposed - confirm against the build rules. */
  62. PROLOGUE
  63. subl $ARGS, %esp
  64. pushl %ebp
  65. pushl %edi
  66. pushl %esi
  67. pushl %ebx
  68. PROFCODE
  /* Keep the B pointer and ldc in registers for the whole routine. */
  69. movl ARG_B, B
  70. movl ARG_LDC, LDC
  /* KK = OFFSET, or -OFFSET when RN is defined. */
  71. movl OFFSET, %eax
  72. #ifdef RN
  73. negl %eax
  74. #endif
  75. movl %eax, KK
  /* LDC = ldc * SIZE so it can be used directly in address arithmetic */
  /* (SIZE comes from common.h; presumably the element size in bytes). */
  76. leal (, LDC, SIZE), LDC
  /* LN: advance C by M * SIZE and A by M * SIZE * K. */
  77. #ifdef LN
  78. movl M, %eax
  79. leal (, %eax, SIZE), %eax
  80. addl %eax, C
  81. imull K, %eax
  82. addl %eax, A
  83. #endif
  /* RT: advance B by N * SIZE * K and C by N * LDC. */
  84. #ifdef RT
  85. movl N, %eax
  86. leal (, %eax, SIZE), %eax
  87. imull K, %eax
  88. addl %eax, B
  89. movl N, %eax
  90. imull LDC, %eax
  91. addl %eax, C
  92. #endif
  /* RT: KK = N - OFFSET, replacing the value stored above. */
  93. #ifdef RT
  94. movl N, %eax
  95. subl OFFSET, %eax
  96. movl %eax, KK
  97. #endif
  /* J = N >> 1, the number of two-column passes. The movl does not touch */
  /* the flags, so jle still sees the result of sarl and skips to .L30 */
  /* when there is no full pair of columns. */
  98. movl N, %eax
  99. sarl $1, %eax
  100. movl %eax, J
  101. jle .L30
  102. ALIGN_2
  /* Top of the two-column pass; executed J times (see "decl J" at .L29). */
  103. .L10:
  /* LT/RN start walking A from its stored pointer; the other variants */
  /* remember that pointer in AORIG and derive AA from it per tile. */
  104. #if defined(LT) || defined(RN)
  105. movl A, AA
  106. #else
  107. movl A, %eax
  108. movl %eax, AORIG
  109. #endif
  /* RT: step B back by K << (1 + BASE_SHIFT), i.e. one two-column panel */
  /* (BASE_SHIFT comes from common.h; presumably log2 of SIZE). */
  110. #ifdef RT
  111. movl K, %eax
  112. sall $1 + BASE_SHIFT, %eax
  113. subl %eax, B
  114. #endif
  /* %eax = 2 * LDC. RT steps C back by that amount before taking CO1; */
  /* every other variant takes CO1 first and then steps C forward. */
  115. leal (, LDC, 2), %eax
  116. #ifdef RT
  117. subl %eax, C
  118. #endif
  119. movl C, CO1
  120. #ifndef RT
  121. addl %eax, C
  122. #endif
  /* Reset KK for this pass: OFFSET + M for LN, OFFSET for LT. */
  123. #ifdef LN
  124. movl OFFSET, %eax
  125. addl M, %eax
  126. movl %eax, KK
  127. #endif
  128. #ifdef LT
  129. movl OFFSET, %eax
  130. movl %eax, KK
  131. #endif
  /* testl with mask 1 leaves SF and OF clear, so jle is taken exactly */
  /* when ZF is set: M is even and there is no single leftover row. */
  132. movl M, %ebx
  133. testl $1, %ebx # tests bit 0 of m (odd row count)
  134. jle .L20
  /* One leftover row (M odd) against the current two columns. */
  /* LN: step AORIG back by K << BASE_SHIFT (one single-row panel). */
  135. #ifdef LN
  136. movl K, %eax
  137. sall $BASE_SHIFT, %eax
  138. subl %eax, AORIG
  139. #endif
  /* LN/RT: AA = AORIG + KK * SIZE. */
  140. #if defined(LN) || defined(RT)
  141. movl KK, %eax
  142. movl AORIG, AA
  143. leal (AA, %eax, SIZE), AA
  144. #endif
  /* BB = B, plus KK << (1 + BASE_SHIFT) for LN/RT. */
  145. movl B, BB
  146. #if defined(LN) || defined(RT)
  147. movl KK, %eax
  148. sall $1 + BASE_SHIFT, %eax
  149. addl %eax, BB
  150. #endif
  /* Preload the first A element and clear the pending products (xmm2, */
  /* xmm3) and the two accumulators (xmm4, xmm5). */
  151. movsd 0 * SIZE(AA), %xmm0
  152. xorps %xmm2, %xmm2
  153. xorps %xmm3, %xmm3
  154. xorps %xmm4, %xmm4
  155. xorps %xmm5, %xmm5
  /* Inner-loop length: KK for LT/RN, K - KK otherwise. The main loop */
  /* handles it four steps at a time. */
  156. #if defined(LT) || defined(RN)
  157. movl KK, %eax
  158. #else
  159. movl K, %eax
  160. subl KK, %eax
  161. #endif
  162. sarl $2, %eax
  163. je .L25
  164. ALIGN_4
  /* Main loop, unrolled four times. Each step multiplies one A element */
  /* (xmm0) by two B elements (xmm2, xmm3); the matching addsd into xmm4 / */
  /* xmm5 is issued at the start of the following step, so the last pair */
  /* of products is still pending in xmm2 / xmm3 when the loop exits. */
  165. .L22:
  166. addsd %xmm2, %xmm4
  167. movsd 0 * SIZE(BB), %xmm2
  168. addsd %xmm3, %xmm5
  169. movsd 1 * SIZE(BB), %xmm3
  170. mulsd %xmm0, %xmm2
  171. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  172. mulsd %xmm0, %xmm3
  173. movsd 1 * SIZE(AA), %xmm0
  174. addsd %xmm2, %xmm4
  175. movsd 2 * SIZE(BB), %xmm2
  176. addsd %xmm3, %xmm5
  177. movsd 3 * SIZE(BB), %xmm3
  178. mulsd %xmm0, %xmm2
  179. mulsd %xmm0, %xmm3
  180. movsd 2 * SIZE(AA), %xmm0
  181. addsd %xmm2, %xmm4
  182. movsd 4 * SIZE(BB), %xmm2
  183. addsd %xmm3, %xmm5
  184. movsd 5 * SIZE(BB), %xmm3
  185. mulsd %xmm0, %xmm2
  186. mulsd %xmm0, %xmm3
  187. movsd 3 * SIZE(AA), %xmm0
  188. addsd %xmm2, %xmm4
  189. movsd 6 * SIZE(BB), %xmm2
  190. addsd %xmm3, %xmm5
  191. movsd 7 * SIZE(BB), %xmm3
  192. mulsd %xmm0, %xmm2
  193. mulsd %xmm0, %xmm3
  194. movsd 4 * SIZE(AA), %xmm0
  195. addl $4 * SIZE, AA
  196. addl $8 * SIZE, BB
  197. decl %eax
  198. jne .L22
  199. ALIGN_4
  /* Recompute the loop length and keep its low two bits: the 0..3 steps */
  /* the unrolled loop did not cover. */
  200. .L25:
  201. #if defined(LT) || defined(RN)
  202. movl KK, %eax
  203. #else
  204. movl K, %eax
  205. subl KK, %eax
  206. #endif
  207. andl $3, %eax # leftover steps: length & 3
  208. BRANCH
  209. je .L28
  210. ALIGN_3
  /* Remainder loop: one step per iteration, same pipelining as .L22. */
  211. .L26:
  212. addsd %xmm2, %xmm4
  213. movsd 0 * SIZE(BB), %xmm2
  214. addsd %xmm3, %xmm5
  215. movsd 1 * SIZE(BB), %xmm3
  216. mulsd %xmm0, %xmm2
  217. mulsd %xmm0, %xmm3
  218. movsd 1 * SIZE(AA), %xmm0
  219. addl $1 * SIZE, AA
  220. addl $2 * SIZE, BB
  221. decl %eax
  222. jg .L26
  223. ALIGN_4
  /* Fold the still-pending products into the accumulators. */
  224. .L28:
  225. addsd %xmm2, %xmm4
  226. addsd %xmm3, %xmm5
  /* LN/RT: re-derive AA and BB from AORIG and B using KK - 1 (LN) or */
  /* KK - 2 (RT); AA advances one element per unit, BB two. */
  227. #if defined(LN) || defined(RT)
  228. movl KK, %eax
  229. #ifdef LN
  230. subl $1, %eax
  231. #else
  232. subl $2, %eax
  233. #endif
  234. movl AORIG, AA
  235. leal (, %eax, SIZE), %eax
  236. leal (AA, %eax, 1), AA
  237. leal (B, %eax, 2), BB
  238. #endif
  /* Load the two current values (from BB for LN/LT, from AA for RN/RT) */
  /* and subtract the accumulated products. */
  239. #if defined(LN) || defined(LT)
  240. movsd 0 * SIZE(BB), %xmm0
  241. movsd 1 * SIZE(BB), %xmm1
  242. subsd %xmm4, %xmm0
  243. subsd %xmm5, %xmm1
  244. #else
  245. movsd 0 * SIZE(AA), %xmm0
  246. movsd 1 * SIZE(AA), %xmm1
  247. subsd %xmm4, %xmm0
  248. subsd %xmm5, %xmm1
  249. #endif
  /* LN/LT: scale both values by the element at AA[0]. No division is */
  /* performed, so that element is presumably stored pre-inverted - */
  /* TODO confirm against the packing code. */
  250. #if defined(LN) || defined(LT)
  251. movsd 0 * SIZE(AA), %xmm7
  252. mulsd %xmm7, %xmm0
  253. mulsd %xmm7, %xmm1
  254. #endif
  /* RN: x0 *= BB[0]; x1 = (x1 - BB[1] * x0) * BB[3]. The copy into xmm6 */
  /* is not read again in this single-row tile. */
  255. #ifdef RN
  256. movsd 0 * SIZE(BB), %xmm4
  257. mulsd %xmm4, %xmm0
  258. movsd 1 * SIZE(BB), %xmm5
  259. movaps %xmm5, %xmm6
  260. movsd 3 * SIZE(BB), %xmm7
  261. mulsd %xmm0, %xmm5
  262. subsd %xmm5, %xmm1
  263. mulsd %xmm7, %xmm1
  264. #endif
  /* RT: x1 *= BB[3]; x0 = (x0 - BB[2] * x1) * BB[0]. The copy into xmm6 */
  /* is not read again in this single-row tile. */
  265. #ifdef RT
  266. movsd 3 * SIZE(BB), %xmm4
  267. mulsd %xmm4, %xmm1
  268. movsd 2 * SIZE(BB), %xmm5
  269. movaps %xmm5, %xmm6
  270. movsd 0 * SIZE(BB), %xmm7
  271. mulsd %xmm1, %xmm5
  272. subsd %xmm5, %xmm0
  273. mulsd %xmm7, %xmm0
  274. #endif
  /* Write the results back to the buffer they were loaded from. */
  275. #if defined(LN) || defined(LT)
  276. movsd %xmm0, 0 * SIZE(BB)
  277. movsd %xmm1, 1 * SIZE(BB)
  278. #else
  279. movsd %xmm0, 0 * SIZE(AA)
  280. movsd %xmm1, 1 * SIZE(AA)
  281. #endif
  /* Store to C: one element in each of the two columns (CO1 and */
  /* CO1 + LDC). LN moves CO1 back before the store, the other variants */
  /* move it forward afterwards. */
  282. #ifdef LN
  283. subl $1 * SIZE, CO1
  284. #endif
  285. movsd %xmm0, 0 * SIZE(CO1)
  286. movsd %xmm1, 0 * SIZE(CO1, LDC)
  287. #ifndef LN
  288. addl $1 * SIZE, CO1
  289. #endif
  /* LT/RN: skip the remaining K - KK steps of both panels. */
  290. #if defined(LT) || defined(RN)
  291. movl K, %eax
  292. subl KK, %eax
  293. leal (,%eax, SIZE), %eax
  294. leal (AA, %eax, 1), AA
  295. leal (BB, %eax, 2), BB
  296. #endif
  /* One row has been handled: KK moves by one for LN (down) and LT (up). */
  297. #ifdef LN
  298. subl $1, KK
  299. #endif
  300. #ifdef LT
  301. addl $1, KK
  302. #endif
  /* RT: advance AORIG by K << BASE_SHIFT (one single-row panel). */
  303. #ifdef RT
  304. movl K, %eax
  305. sall $BASE_SHIFT, %eax
  306. addl %eax, AORIG
  307. #endif
  308. ALIGN_4
  /* Row pairs against the current two columns: %ebx = M >> 1 tiles. */
  309. .L20:
  310. movl M, %ebx
  311. sarl $1, %ebx
  312. jle .L29
  313. ALIGN_4
  /* Top of the 2x2 tile loop. */
  314. .L11:
  /* LN: step AORIG back by K << (1 + BASE_SHIFT) (one two-row panel). */
  315. #ifdef LN
  316. movl K, %eax
  317. sall $1 + BASE_SHIFT, %eax
  318. subl %eax, AORIG
  319. #endif
  /* LN/RT: AA = AORIG + 2 * KK * SIZE. */
  320. #if defined(LN) || defined(RT)
  321. movl KK, %eax
  322. movl AORIG, AA
  323. leal (, %eax, SIZE), %eax
  324. leal (AA, %eax, 2), AA
  325. #endif
  /* BB = B, plus KK << (1 + BASE_SHIFT) for LN/RT. */
  326. movl B, BB
  327. #if defined(LN) || defined(RT)
  328. movl KK, %eax
  329. sall $1 + BASE_SHIFT, %eax
  330. addl %eax, BB
  331. #endif
  /* Preload A[0], clear xmm2..xmm7 and prefetch the two C rows that */
  /* will be written at the end of the tile. */
  332. movsd 0 * SIZE(AA), %xmm0
  333. xorps %xmm2, %xmm2
  334. xorps %xmm3, %xmm3
  335. xorps %xmm4, %xmm4
  336. prefetcht0 3 * SIZE(CO1)
  337. xorps %xmm5, %xmm5
  338. prefetcht0 3 * SIZE(CO1, LDC)
  339. xorps %xmm6, %xmm6
  340. xorps %xmm7, %xmm7
  /* Inner-loop length: KK for LT/RN, K - KK otherwise; four steps per */
  /* main-loop iteration. */
  341. #if defined(LT) || defined(RN)
  342. movl KK, %eax
  343. #else
  344. movl K, %eax
  345. subl KK, %eax
  346. #endif
  347. sarl $2, %eax
  348. je .L15
  349. ALIGN_4
  /* Main loop, unrolled four times. With a0, a1 the two A elements and */
  /* b0, b1 the two B elements of a step, the accumulators are */
  /* xmm4 += a0*b0, xmm5 += a0*b1, xmm6 += a1*b0, xmm7 += a1*b1. */
  /* The a1 products (xmm2, xmm3) are added at the start of the next */
  /* step, so the last pair is still pending when the loop exits. */
  350. .L12:
  351. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  352. addsd %xmm2, %xmm6
  353. movsd 1 * SIZE(AA), %xmm2
  354. movaps %xmm0, %xmm1
  355. mulsd 0 * SIZE(BB), %xmm0
  356. addsd %xmm3, %xmm7
  357. mulsd 1 * SIZE(BB), %xmm1
  358. addsd %xmm0, %xmm4
  359. movsd 2 * SIZE(AA), %xmm0
  360. movaps %xmm2, %xmm3
  361. mulsd 0 * SIZE(BB), %xmm2
  362. addsd %xmm1, %xmm5
  363. mulsd 1 * SIZE(BB), %xmm3
  364. addsd %xmm2, %xmm6
  365. movsd 3 * SIZE(AA), %xmm2
  366. movaps %xmm0, %xmm1
  367. mulsd 2 * SIZE(BB), %xmm0
  368. addsd %xmm3, %xmm7
  369. mulsd 3 * SIZE(BB), %xmm1
  370. addsd %xmm0, %xmm4
  371. movsd 4 * SIZE(AA), %xmm0
  372. movaps %xmm2, %xmm3
  373. mulsd 2 * SIZE(BB), %xmm2
  374. addsd %xmm1, %xmm5
  375. mulsd 3 * SIZE(BB), %xmm3
  376. addsd %xmm2, %xmm6
  377. movsd 5 * SIZE(AA), %xmm2
  378. movaps %xmm0, %xmm1
  379. mulsd 4 * SIZE(BB), %xmm0
  380. addsd %xmm3, %xmm7
  381. mulsd 5 * SIZE(BB), %xmm1
  382. addsd %xmm0, %xmm4
  383. movsd 6 * SIZE(AA), %xmm0
  384. movaps %xmm2, %xmm3
  385. mulsd 4 * SIZE(BB), %xmm2
  386. addsd %xmm1, %xmm5
  387. mulsd 5 * SIZE(BB), %xmm3
  388. addsd %xmm2, %xmm6
  389. movsd 7 * SIZE(AA), %xmm2
  390. movaps %xmm0, %xmm1
  391. mulsd 6 * SIZE(BB), %xmm0
  392. addsd %xmm3, %xmm7
  393. mulsd 7 * SIZE(BB), %xmm1
  394. addsd %xmm0, %xmm4
  395. movsd 8 * SIZE(AA), %xmm0
  396. movaps %xmm2, %xmm3
  397. mulsd 6 * SIZE(BB), %xmm2
  398. addsd %xmm1, %xmm5
  399. mulsd 7 * SIZE(BB), %xmm3
  400. addl $8 * SIZE, BB
  401. addl $8 * SIZE, AA
  402. decl %eax
  403. jne .L12
  404. ALIGN_4
  /* Recompute the loop length and keep the 0..3 steps left over. */
  405. .L15:
  406. #if defined(LT) || defined(RN)
  407. movl KK, %eax
  408. #else
  409. movl K, %eax
  410. subl KK, %eax
  411. #endif
  412. andl $3, %eax # leftover steps: length & 3
  413. BRANCH
  414. je .L18
  415. ALIGN_3
  /* Remainder loop: one step per iteration, same pipelining as .L12. */
  416. .L16:
  417. addsd %xmm2, %xmm6
  418. movsd 1 * SIZE(AA), %xmm2
  419. movaps %xmm0, %xmm1
  420. mulsd 0 * SIZE(BB), %xmm0
  421. addsd %xmm3, %xmm7
  422. mulsd 1 * SIZE(BB), %xmm1
  423. addsd %xmm0, %xmm4
  424. movsd 2 * SIZE(AA), %xmm0
  425. movaps %xmm2, %xmm3
  426. mulsd 0 * SIZE(BB), %xmm2
  427. addsd %xmm1, %xmm5
  428. mulsd 1 * SIZE(BB), %xmm3
  429. addl $2 * SIZE, AA
  430. addl $2 * SIZE, BB
  431. decl %eax
  432. jg .L16
  433. ALIGN_4
  /* Fold the still-pending a1 products into xmm6 / xmm7. */
  434. .L18:
  435. addsd %xmm2, %xmm6
  436. addsd %xmm3, %xmm7
  /* LN/RT: re-derive AA and BB from AORIG and B using KK - 2 (both */
  /* branches of the inner conditional subtract the same amount here). */
  437. #if defined(LN) || defined(RT)
  438. movl KK, %eax
  439. #ifdef LN
  440. subl $2, %eax
  441. #else
  442. subl $2, %eax
  443. #endif
  444. movl AORIG, AA
  445. leal (, %eax, SIZE), %eax
  446. leal (AA, %eax, 2), AA
  447. leal (B, %eax, 2), BB
  448. #endif
  /* Load the four current values and subtract the accumulators. After */
  /* this, xmm0 / xmm1 pair with a0 (b0 / b1) and xmm2 / xmm3 with a1. */
  /* For RN/RT the buffer order is xmm0, xmm2, xmm1, xmm3. */
  449. #if defined(LN) || defined(LT)
  450. movsd 0 * SIZE(BB), %xmm0
  451. movsd 1 * SIZE(BB), %xmm1
  452. movsd 2 * SIZE(BB), %xmm2
  453. movsd 3 * SIZE(BB), %xmm3
  454. subsd %xmm4, %xmm0
  455. subsd %xmm5, %xmm1
  456. subsd %xmm6, %xmm2
  457. subsd %xmm7, %xmm3
  458. #else
  459. movsd 0 * SIZE(AA), %xmm0
  460. movsd 1 * SIZE(AA), %xmm2
  461. movsd 2 * SIZE(AA), %xmm1
  462. movsd 3 * SIZE(AA), %xmm3
  463. subsd %xmm4, %xmm0
  464. subsd %xmm6, %xmm2
  465. subsd %xmm5, %xmm1
  466. subsd %xmm7, %xmm3
  467. #endif
  /* The four variants below apply a 2x2 substitution. Diagonal entries */
  /* are applied with mulsd, never divided by, so they are presumably */
  /* stored pre-inverted - TODO confirm against the packing code. */
  /* LN: (xmm2, xmm3) *= AA[3]; (xmm0, xmm1) = ((xmm0, xmm1) - AA[2] * */
  /* (xmm2, xmm3)) * AA[0]. */
  468. #ifdef LN
  469. movsd 3 * SIZE(AA), %xmm4
  470. mulsd %xmm4, %xmm2
  471. movsd 2 * SIZE(AA), %xmm5
  472. mulsd %xmm4, %xmm3
  473. movsd 0 * SIZE(AA), %xmm7
  474. movaps %xmm5, %xmm6
  475. mulsd %xmm2, %xmm5
  476. mulsd %xmm3, %xmm6
  477. subsd %xmm5, %xmm0
  478. subsd %xmm6, %xmm1
  479. mulsd %xmm7, %xmm0
  480. mulsd %xmm7, %xmm1
  481. #endif
  /* LT: (xmm0, xmm1) *= AA[0]; (xmm2, xmm3) = ((xmm2, xmm3) - AA[1] * */
  /* (xmm0, xmm1)) * AA[3]. */
  482. #ifdef LT
  483. movsd 0 * SIZE(AA), %xmm4
  484. mulsd %xmm4, %xmm0
  485. movsd 1 * SIZE(AA), %xmm5
  486. mulsd %xmm4, %xmm1
  487. movsd 3 * SIZE(AA), %xmm7
  488. movaps %xmm5, %xmm6
  489. mulsd %xmm0, %xmm5
  490. mulsd %xmm1, %xmm6
  491. subsd %xmm5, %xmm2
  492. subsd %xmm6, %xmm3
  493. mulsd %xmm7, %xmm2
  494. mulsd %xmm7, %xmm3
  495. #endif
  /* RN: (xmm0, xmm2) *= BB[0]; (xmm1, xmm3) = ((xmm1, xmm3) - BB[1] * */
  /* (xmm0, xmm2)) * BB[3]. */
  496. #ifdef RN
  497. movsd 0 * SIZE(BB), %xmm4
  498. mulsd %xmm4, %xmm0
  499. movsd 1 * SIZE(BB), %xmm5
  500. mulsd %xmm4, %xmm2
  501. movsd 3 * SIZE(BB), %xmm7
  502. movaps %xmm5, %xmm6
  503. mulsd %xmm0, %xmm5
  504. mulsd %xmm2, %xmm6
  505. subsd %xmm5, %xmm1
  506. subsd %xmm6, %xmm3
  507. mulsd %xmm7, %xmm1
  508. mulsd %xmm7, %xmm3
  509. #endif
  /* RT: (xmm1, xmm3) *= BB[3]; (xmm0, xmm2) = ((xmm0, xmm2) - BB[2] * */
  /* (xmm1, xmm3)) * BB[0]. */
  510. #ifdef RT
  511. movsd 3 * SIZE(BB), %xmm4
  512. mulsd %xmm4, %xmm1
  513. movsd 2 * SIZE(BB), %xmm5
  514. mulsd %xmm4, %xmm3
  515. movsd 0 * SIZE(BB), %xmm7
  516. movaps %xmm5, %xmm6
  517. mulsd %xmm1, %xmm5
  518. mulsd %xmm3, %xmm6
  519. subsd %xmm5, %xmm0
  520. subsd %xmm6, %xmm2
  521. mulsd %xmm7, %xmm0
  522. mulsd %xmm7, %xmm2
  523. #endif
  /* Write the results back to the buffer they were loaded from, in the */
  /* same element order as the loads above. */
  524. #if defined(LN) || defined(LT)
  525. movsd %xmm0, 0 * SIZE(BB)
  526. movsd %xmm1, 1 * SIZE(BB)
  527. movsd %xmm2, 2 * SIZE(BB)
  528. movsd %xmm3, 3 * SIZE(BB)
  529. #else
  530. movsd %xmm0, 0 * SIZE(AA)
  531. movsd %xmm2, 1 * SIZE(AA)
  532. movsd %xmm1, 2 * SIZE(AA)
  533. movsd %xmm3, 3 * SIZE(AA)
  534. #endif
  /* Store the 2x2 tile to C: xmm0 / xmm2 go to the column at CO1, */
  /* xmm1 / xmm3 to the column at CO1 + LDC. LN moves CO1 back before */
  /* the store, the other variants move it forward afterwards. */
  535. #ifdef LN
  536. subl $2 * SIZE, CO1
  537. #endif
  538. movsd %xmm0, 0 * SIZE(CO1)
  539. movsd %xmm2, 1 * SIZE(CO1)
  540. movsd %xmm1, 0 * SIZE(CO1, LDC)
  541. movsd %xmm3, 1 * SIZE(CO1, LDC)
  542. #ifndef LN
  543. addl $2 * SIZE, CO1
  544. #endif
  /* LT/RN: skip the remaining K - KK steps of both panels. */
  545. #if defined(LT) || defined(RN)
  546. movl K, %eax
  547. subl KK, %eax
  548. leal (,%eax, SIZE), %eax
  549. leal (AA, %eax, 2), AA
  550. leal (BB, %eax, 2), BB
  551. #endif
  /* Two rows have been handled: KK moves by two for LN (down) and LT (up). */
  552. #ifdef LN
  553. subl $2, KK
  554. #endif
  555. #ifdef LT
  556. addl $2, KK
  557. #endif
  /* RT: advance AORIG by K << (1 + BASE_SHIFT) (one two-row panel). */
  558. #ifdef RT
  559. movl K, %eax
  560. sall $1 + BASE_SHIFT, %eax
  561. addl %eax, AORIG
  562. #endif
  563. decl %ebx # i --
  564. jg .L11
  565. ALIGN_4
  /* End of one two-column pass: move B to the next panel, update KK for */
  /* the right-side variants, and loop while J is still positive. */
  566. .L29:
  /* LN: B += 2 * K * SIZE. */
  567. #ifdef LN
  568. movl K, %eax
  569. leal (, %eax, SIZE), %eax
  570. leal (B, %eax, 2), B
  571. #endif
  /* LT/RN: BB has already been advanced past the panel, so reuse it. */
  572. #if defined(LT) || defined(RN)
  573. movl BB, B
  574. #endif
  /* Two columns have been handled: KK moves by two for RN (up) and RT (down). */
  575. #ifdef RN
  576. addl $2, KK
  577. #endif
  578. #ifdef RT
  579. subl $2, KK
  580. #endif
  581. decl J # j --
  582. jg .L10
  583. ALIGN_4
  /* Leftover single column: nothing to do when bit 0 of N is clear. */
  584. .L30:
  585. testl $1, N
  586. je .L999
  /* LT/RN start walking A from its stored pointer; the other variants */
  /* remember that pointer in AORIG and derive AA from it per tile. */
  587. #if defined(LT) || defined(RN)
  588. movl A, AA
  589. #else
  590. movl A, %eax
  591. movl %eax, AORIG
  592. #endif
  /* RT: step B back by K << BASE_SHIFT (one single-column panel). */
  593. #ifdef RT
  594. movl K, %eax
  595. sall $BASE_SHIFT, %eax
  596. subl %eax, B
  597. #endif
  /* RT steps C back by LDC before taking CO1; every other variant takes */
  /* CO1 first and then steps C forward by LDC. */
  598. #ifdef RT
  599. subl LDC, C
  600. #endif
  601. movl C, CO1
  602. #ifndef RT
  603. addl LDC, C
  604. #endif
  /* Reset KK for this pass: OFFSET + M for LN, OFFSET for LT. */
  605. #ifdef LN
  606. movl OFFSET, %eax
  607. addl M, %eax
  608. movl %eax, KK
  609. #endif
  610. #ifdef LT
  611. movl OFFSET, %eax
  612. movl %eax, KK
  613. #endif
  /* testl with mask 1 leaves SF and OF clear, so jle is taken exactly */
  /* when ZF is set: M is even and there is no single leftover row. */
  614. movl M, %ebx
  615. testl $1, %ebx # tests bit 0 of m (odd row count)
  616. jle .L40
  /* One leftover row (M odd) against the single leftover column. */
  /* LN: step AORIG back by K << BASE_SHIFT (one single-row panel). */
  617. #ifdef LN
  618. movl K, %eax
  619. sall $BASE_SHIFT, %eax
  620. subl %eax, AORIG
  621. #endif
  /* LN/RT: AA = AORIG + KK * SIZE. */
  622. #if defined(LN) || defined(RT)
  623. movl KK, %eax
  624. movl AORIG, AA
  625. leal (AA, %eax, SIZE), AA
  626. #endif
  /* BB = B, plus KK << BASE_SHIFT for LN/RT. */
  627. movl B, BB
  628. #if defined(LN) || defined(RT)
  629. movl KK, %eax
  630. sall $BASE_SHIFT, %eax
  631. addl %eax, BB
  632. #endif
  /* Preload the first A and B elements and clear the accumulators. The */
  /* xorps of xmm2 is immediately overwritten by the movsd that follows. */
  633. movsd 0 * SIZE(AA), %xmm0
  634. xorps %xmm2, %xmm2
  635. movsd 0 * SIZE(BB), %xmm2
  636. xorps %xmm3, %xmm3
  637. xorps %xmm4, %xmm4
  638. xorps %xmm5, %xmm5
  /* Inner-loop length: KK for LT/RN, K - KK otherwise; four steps per */
  /* main-loop iteration. */
  639. #if defined(LT) || defined(RN)
  640. movl KK, %eax
  641. #else
  642. movl K, %eax
  643. subl KK, %eax
  644. #endif
  645. sarl $2, %eax
  646. je .L45
  647. ALIGN_4
  /* Main loop, unrolled four times. Successive products alternate */
  /* between xmm4 and xmm5; the two partial sums are combined at .L48. */
  648. .L42:
  649. mulsd %xmm0, %xmm2
  650. movsd 1 * SIZE(AA), %xmm0
  651. addsd %xmm2, %xmm4
  652. movsd 1 * SIZE(BB), %xmm2
  653. mulsd %xmm0, %xmm2
  654. movsd 2 * SIZE(AA), %xmm0
  655. addsd %xmm2, %xmm5
  656. movsd 2 * SIZE(BB), %xmm2
  657. mulsd %xmm0, %xmm2
  658. movsd 3 * SIZE(AA), %xmm0
  659. addsd %xmm2, %xmm4
  660. movsd 3 * SIZE(BB), %xmm2
  661. mulsd %xmm0, %xmm2
  662. movsd 4 * SIZE(AA), %xmm0
  663. addsd %xmm2, %xmm5
  664. movsd 4 * SIZE(BB), %xmm2
  665. addl $4 * SIZE, AA
  666. addl $4 * SIZE, BB
  667. decl %eax
  668. jne .L42
  669. ALIGN_4
  /* Recompute the loop length and keep the 0..3 steps left over. */
  670. .L45:
  671. #if defined(LT) || defined(RN)
  672. movl KK, %eax
  673. #else
  674. movl K, %eax
  675. subl KK, %eax
  676. #endif
  677. andl $3, %eax # leftover steps: length & 3
  678. BRANCH
  679. je .L48
  680. ALIGN_3
  /* Remainder loop: one product per iteration, accumulated in xmm4. */
  681. .L46:
  682. mulsd %xmm0, %xmm2
  683. movsd 1 * SIZE(AA), %xmm0
  684. addsd %xmm2, %xmm4
  685. movsd 1 * SIZE(BB), %xmm2
  686. addl $1 * SIZE, AA
  687. addl $1 * SIZE, BB
  688. decl %eax
  689. jg .L46
  690. ALIGN_4
  /* Combine the two partial sums into xmm4. */
  691. .L48:
  692. addsd %xmm5, %xmm4
  /* LN/RT: re-derive AA and BB from AORIG and B using KK - 1 (both */
  /* branches of the inner conditional subtract the same amount here). */
  693. #if defined(LN) || defined(RT)
  694. movl KK, %eax
  695. #ifdef LN
  696. subl $1, %eax
  697. #else
  698. subl $1, %eax
  699. #endif
  700. movl AORIG, AA
  701. leal (, %eax, SIZE), %eax
  702. addl %eax, AA
  703. leal (B, %eax, 1), BB
  704. #endif
  /* Load the current value (from BB for LN/LT, from AA for RN/RT) and */
  /* subtract the accumulated sum. */
  705. #if defined(LN) || defined(LT)
  706. movsd 0 * SIZE(BB), %xmm0
  707. subsd %xmm4, %xmm0
  708. #else
  709. movsd 0 * SIZE(AA), %xmm0
  710. subsd %xmm4, %xmm0
  711. #endif
  /* Scale by the single element of the other operand (AA[0] for LN/LT, */
  /* BB[0] for RN/RT); presumably a pre-inverted diagonal - TODO confirm. */
  712. #if defined(LN) || defined(LT)
  713. mulsd 0 * SIZE(AA), %xmm0
  714. #endif
  715. #if defined(RN) || defined(RT)
  716. mulsd 0 * SIZE(BB), %xmm0
  717. #endif
  /* Write the result back to the buffer it was loaded from. */
  718. #if defined(LN) || defined(LT)
  719. movsd %xmm0, 0 * SIZE(BB)
  720. #else
  721. movsd %xmm0, 0 * SIZE(AA)
  722. #endif
  /* Store to C. LN moves CO1 back before the store, the other variants */
  /* move it forward afterwards. */
  723. #ifdef LN
  724. subl $1 * SIZE, CO1
  725. #endif
  726. movsd %xmm0, 0 * SIZE(CO1)
  727. #ifndef LN
  728. addl $1 * SIZE, CO1
  729. #endif
  /* LT/RN: skip the remaining K - KK steps of both panels. */
  730. #if defined(LT) || defined(RN)
  731. movl K, %eax
  732. subl KK, %eax
  733. leal (,%eax, SIZE), %eax
  734. addl %eax, AA
  735. addl %eax, BB
  736. #endif
  /* One row has been handled: KK moves by one for LN (down) and LT (up). */
  737. #ifdef LN
  738. subl $1, KK
  739. #endif
  740. #ifdef LT
  741. addl $1, KK
  742. #endif
  /* RT: advance AORIG by K << BASE_SHIFT (one single-row panel). */
  743. #ifdef RT
  744. movl K, %eax
  745. sall $BASE_SHIFT, %eax
  746. addl %eax, AORIG
  747. #endif
  748. ALIGN_4
  /* Row pairs against the single leftover column: %ebx = M >> 1 tiles. */
  749. .L40:
  750. movl M, %ebx
  751. sarl $1, %ebx
  752. jle .L49
  753. ALIGN_4
  /* Top of the 2x1 tile loop. */
  754. .L31:
  /* LN: step AORIG back by K << (1 + BASE_SHIFT) (one two-row panel). */
  755. #ifdef LN
  756. movl K, %eax
  757. sall $1 + BASE_SHIFT, %eax
  758. subl %eax, AORIG
  759. #endif
  /* LN/RT: AA = AORIG + 2 * KK * SIZE. */
  760. #if defined(LN) || defined(RT)
  761. movl KK, %eax
  762. movl AORIG, AA
  763. leal (, %eax, SIZE), %eax
  764. leal (AA, %eax, 2), AA
  765. #endif
  /* BB = B, plus KK << BASE_SHIFT for LN/RT. */
  766. movl B, BB
  767. #if defined(LN) || defined(RT)
  768. movl KK, %eax
  769. sall $BASE_SHIFT, %eax
  770. addl %eax, BB
  771. #endif
  /* Preload the first B element, clear the pending products (xmm0, */
  /* xmm2) and accumulators (xmm4, xmm6), and prefetch the C row. */
  772. movsd 0 * SIZE(BB), %xmm1
  773. xorps %xmm0, %xmm0
  774. prefetcht0 3 * SIZE(CO1)
  775. xorps %xmm2, %xmm2
  776. xorps %xmm4, %xmm4
  777. xorps %xmm6, %xmm6
  /* Inner-loop length: KK for LT/RN, K - KK otherwise; four steps per */
  /* main-loop iteration. */
  778. #if defined(LT) || defined(RN)
  779. movl KK, %eax
  780. #else
  781. movl K, %eax
  782. subl KK, %eax
  783. #endif
  784. sarl $2, %eax
  785. je .L35
  786. ALIGN_4
  /* Main loop, unrolled four times. Each step multiplies two A elements */
  /* (xmm0, xmm2) by one B element (xmm1); the matching addsd into xmm4 / */
  /* xmm6 is issued at the start of the following step, so the last pair */
  /* of products is still pending when the loop exits. */
  787. .L32:
  788. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  789. addsd %xmm0, %xmm4
  790. movsd 0 * SIZE(AA), %xmm0
  791. addsd %xmm2, %xmm6
  792. movsd 1 * SIZE(AA), %xmm2
  793. mulsd %xmm1, %xmm0
  794. mulsd %xmm1, %xmm2
  795. movsd 1 * SIZE(BB), %xmm1
  796. addsd %xmm0, %xmm4
  797. movsd 2 * SIZE(AA), %xmm0
  798. addsd %xmm2, %xmm6
  799. movsd 3 * SIZE(AA), %xmm2
  800. mulsd %xmm1, %xmm0
  801. mulsd %xmm1, %xmm2
  802. movsd 2 * SIZE(BB), %xmm1
  803. addsd %xmm0, %xmm4
  804. movsd 4 * SIZE(AA), %xmm0
  805. addsd %xmm2, %xmm6
  806. movsd 5 * SIZE(AA), %xmm2
  807. mulsd %xmm1, %xmm0
  808. mulsd %xmm1, %xmm2
  809. movsd 3 * SIZE(BB), %xmm1
  810. addsd %xmm0, %xmm4
  811. movsd 6 * SIZE(AA), %xmm0
  812. addsd %xmm2, %xmm6
  813. movsd 7 * SIZE(AA), %xmm2
  814. mulsd %xmm1, %xmm0
  815. mulsd %xmm1, %xmm2
  816. movsd 4 * SIZE(BB), %xmm1
  817. addl $8 * SIZE, AA
  818. addl $4 * SIZE, BB
  819. decl %eax
  820. jne .L32
  821. ALIGN_4
  /* Recompute the loop length and keep the 0..3 steps left over. */
  822. .L35:
  823. #if defined(LT) || defined(RN)
  824. movl KK, %eax
  825. #else
  826. movl K, %eax
  827. subl KK, %eax
  828. #endif
  829. andl $3, %eax # leftover steps: length & 3
  830. BRANCH
  831. je .L38
  832. ALIGN_3
  /* Remainder loop: one step per iteration, same pipelining as .L32. */
  833. .L36:
  834. addsd %xmm0, %xmm4
  835. movsd 0 * SIZE(AA), %xmm0
  836. addsd %xmm2, %xmm6
  837. movsd 1 * SIZE(AA), %xmm2
  838. mulsd %xmm1, %xmm0
  839. mulsd %xmm1, %xmm2
  840. movsd 1 * SIZE(BB), %xmm1
  841. addl $2 * SIZE, AA
  842. addl $1 * SIZE, BB
  843. decl %eax
  844. jg .L36
  845. ALIGN_4
  /* Fold the still-pending products into the accumulators. */
  846. .L38:
  847. addsd %xmm0, %xmm4
  848. addsd %xmm2, %xmm6
  /* LN/RT: re-derive AA and BB from AORIG and B using KK - 2 (LN) or */
  /* KK - 1 (RT); AA advances two elements per unit, BB one. */
  849. #if defined(LN) || defined(RT)
  850. movl KK, %eax
  851. #ifdef LN
  852. subl $2, %eax
  853. #else
  854. subl $1, %eax
  855. #endif
  856. movl AORIG, AA
  857. leal (, %eax, SIZE), %eax
  858. leal (AA, %eax, 2), AA
  859. leal (B, %eax, 1), BB
  860. #endif
  /* Load the two current values (from BB for LN/LT, from AA for RN/RT) */
  /* and subtract the accumulated products. */
  861. #if defined(LN) || defined(LT)
  862. movsd 0 * SIZE(BB), %xmm0
  863. movsd 1 * SIZE(BB), %xmm2
  864. subsd %xmm4, %xmm0
  865. subsd %xmm6, %xmm2
  866. #else
  867. movsd 0 * SIZE(AA), %xmm0
  868. movsd 1 * SIZE(AA), %xmm2
  869. subsd %xmm4, %xmm0
  870. subsd %xmm6, %xmm2
  871. #endif
  /* Diagonal entries are applied with mulsd, never divided by, so they */
  /* are presumably stored pre-inverted - TODO confirm. */
  /* LN: xmm2 *= AA[3]; xmm0 = (xmm0 - AA[2] * xmm2) * AA[0]. */
  872. #ifdef LN
  873. movsd 3 * SIZE(AA), %xmm4
  874. mulsd %xmm4, %xmm2
  875. movsd 2 * SIZE(AA), %xmm5
  876. mulsd %xmm2, %xmm5
  877. movsd 0 * SIZE(AA), %xmm7
  878. subsd %xmm5, %xmm0
  879. mulsd %xmm7, %xmm0
  880. #endif
  /* LT: xmm0 *= AA[0]; xmm2 = (xmm2 - AA[1] * xmm0) * AA[3]. */
  881. #ifdef LT
  882. movsd 0 * SIZE(AA), %xmm4
  883. mulsd %xmm4, %xmm0
  884. movsd 1 * SIZE(AA), %xmm5
  885. mulsd %xmm0, %xmm5
  886. movsd 3 * SIZE(AA), %xmm7
  887. subsd %xmm5, %xmm2
  888. mulsd %xmm7, %xmm2
  889. #endif
  /* RN/RT: scale both values by BB[0]. */
  890. #if defined(RN) || defined(RT)
  891. movsd 0 * SIZE(BB), %xmm4
  892. mulsd %xmm4, %xmm0
  893. mulsd %xmm4, %xmm2
  894. #endif
  /* Write the results back to the buffer they were loaded from. */
  895. #if defined(LN) || defined(LT)
  896. movsd %xmm0, 0 * SIZE(BB)
  897. movsd %xmm2, 1 * SIZE(BB)
  898. #else
  899. movsd %xmm0, 0 * SIZE(AA)
  900. movsd %xmm2, 1 * SIZE(AA)
  901. #endif
  /* Store two consecutive elements to C at CO1. LN moves CO1 back */
  /* before the store, the other variants move it forward afterwards. */
  902. #ifdef LN
  903. subl $2 * SIZE, CO1
  904. #endif
  905. movsd %xmm0, 0 * SIZE(CO1)
  906. movsd %xmm2, 1 * SIZE(CO1)
  907. #ifndef LN
  908. addl $2 * SIZE, CO1
  909. #endif
  /* LT/RN: skip the remaining K - KK steps of both panels. */
  910. #if defined(LT) || defined(RN)
  911. movl K, %eax
  912. subl KK, %eax
  913. leal (,%eax, SIZE), %eax
  914. leal (AA, %eax, 2), AA
  915. addl %eax, BB
  916. #endif
  /* Two rows have been handled: KK moves by two for LN (down) and LT (up). */
  917. #ifdef LN
  918. subl $2, KK
  919. #endif
  920. #ifdef LT
  921. addl $2, KK
  922. #endif
  /* RT: advance AORIG by K << (1 + BASE_SHIFT) (one two-row panel). */
  923. #ifdef RT
  924. movl K, %eax
  925. sall $1 + BASE_SHIFT, %eax
  926. addl %eax, AORIG
  927. #endif
  928. decl %ebx # i --
  929. jg .L31
  930. ALIGN_4
  /* End of the single-column pass: same bookkeeping as .L29 but for one */
  /* column. Control then falls through into the epilogue. */
  931. .L49:
  /* LN: B += K * SIZE. */
  932. #ifdef LN
  933. movl K, %eax
  934. leal (B, %eax, SIZE), B
  935. #endif
  /* LT/RN: BB has already been advanced past the panel, so reuse it. */
  936. #if defined(LT) || defined(RN)
  937. movl BB, B
  938. #endif
  /* One column has been handled: KK moves by one for RN (up) and RT (down). */
  939. #ifdef RN
  940. addl $1, KK
  941. #endif
  942. #ifdef RT
  943. subl $1, KK
  944. #endif
  945. ALIGN_4
  /* Exit: restore the registers in reverse push order, release the ARGS */
  /* bytes reserved in the prologue, and return. */
  946. .L999:
  947. popl %ebx
  948. popl %esi
  949. popl %edi
  950. popl %ebp
  951. addl $ARGS, %esp
  952. ret
  953. EPILOGUE