You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

trsm_kernel_LT_2x2_atom.S 20 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Stack frame bookkeeping. STACK (16) matches the four 4-byte pushl in the
     prologue; ARGS (16) matches the scratch area reserved there with
     "subl $ARGS, %esp". */
  40. #define STACK 16
  41. #define ARGS 16
  /* Incoming arguments, addressed from %esp as it stands after the prologue.
     ALPHA is defined but not referenced by the code below; A sits 8 bytes
     after it. */
  42. #define M 4 + STACK + ARGS(%esp)
  43. #define N 8 + STACK + ARGS(%esp)
  44. #define K 12 + STACK + ARGS(%esp)
  45. #define ALPHA 16 + STACK + ARGS(%esp)
  46. #define A 24 + STACK + ARGS(%esp)
  47. #define ARG_B 28 + STACK + ARGS(%esp)
  48. #define C 32 + STACK + ARGS(%esp)
  49. #define ARG_LDC 36 + STACK + ARGS(%esp)
  50. #define OFFSET 40 + STACK + ARGS(%esp)
  /* Local variables kept in the ARGS scratch area, just above the saved
     registers. KKK is defined but not referenced by the code below. */
  51. #define J 0 + STACK(%esp)
  52. #define KK 4 + STACK(%esp)
  53. #define KKK 8 + STACK(%esp)
  54. #define AORIG 12 + STACK(%esp)
  /* Prefetch instruction and look-ahead distance used inside the inner loops
     (the distance is multiplied by SIZE at each use site). */
  55. #define PREFETCH prefetcht0
  56. #define PREFETCHSIZE 84
  /* Register roles: AA and BB walk the A and B data, LDC holds the leading
     dimension of C after scaling by SIZE, B is the current base pointer into
     the B data, CO1 is the current output position in C. */
  57. #define AA %edx
  58. #define BB %ecx
  59. #define LDC %ebp
  60. #define B %edi
  61. #define CO1 %esi
  /* Entry: reserve the ARGS scratch area, save %ebp/%edi/%esi/%ebx (restored
     at .L999), load B and LDC, and initialise KK from OFFSET. The build
     variant is selected by exactly which of LN / LT / RN / RT is defined. */
  62. PROLOGUE
  63. subl $ARGS, %esp /* scratch space for J, KK, KKK, AORIG */
  64. pushl %ebp
  65. pushl %edi
  66. pushl %esi
  67. pushl %ebx
  68. PROFCODE
  69. movl ARG_B, B
  70. movl ARG_LDC, LDC
  71. movl OFFSET, %eax
  72. #ifdef RN
  73. negl %eax /* RN: KK starts at -OFFSET */
  74. #endif
  75. movl %eax, KK
  76. leal (, LDC, SIZE), LDC /* LDC *= SIZE (SIZE comes from common.h; presumably the element size in bytes) */
  /* LN: move C forward by M*SIZE and A forward by M*K*SIZE. */
  77. #ifdef LN
  78. movl M, %eax
  79. leal (, %eax, SIZE), %eax
  80. addl %eax, C
  81. imull K, %eax
  82. addl %eax, A
  83. #endif
  /* RT: move B forward by N*K*SIZE and C forward by N*LDC. */
  84. #ifdef RT
  85. movl N, %eax
  86. leal (, %eax, SIZE), %eax
  87. imull K, %eax
  88. addl %eax, B
  89. movl N, %eax
  90. imull LDC, %eax
  91. addl %eax, C
  92. #endif
  /* RT: KK = N - OFFSET (overrides the value stored above). */
  93. #ifdef RT
  94. movl N, %eax
  95. subl OFFSET, %eax
  96. movl %eax, KK
  97. #endif
  /* J = N >> 1 column pairs. The jle consumes the flags produced by sarl
     (the intervening movl does not modify flags) and skips the pair loop
     when there are none. */
  98. movl N, %eax
  99. sarl $1, %eax
  100. movl %eax, J
  101. jle .L30
  102. ALIGN_2
  /* .L10: top of the loop over column pairs (runs J times, see "decl J" at
     .L29). Sets up the A pointer, the C output pointer and KK for this pair,
     then starts the loop over row pairs. */
  103. .L10:
  104. #if defined(LT) || defined(RN)
  105. movl A, AA /* LT/RN: walk A directly from its start */
  106. #else
  107. movl A, %eax
  108. movl %eax, AORIG /* LN/RT: remember A in AORIG; AA is derived from it per tile */
  109. #endif
  110. #ifdef RT
  111. movl K, %eax
  112. sall $1 + BASE_SHIFT, %eax
  113. subl %eax, B /* RT: B -= K << (1 + BASE_SHIFT) */
  114. #endif
  115. leal (, LDC, 2), %eax /* eax = 2 * LDC: distance between column pairs in C */
  116. #ifdef RT
  117. subl %eax, C /* RT: step C back before using it */
  118. #endif
  119. movl C, CO1
  120. #ifndef RT
  121. addl %eax, C /* otherwise: advance C for the next pair after taking CO1 */
  122. #endif
  123. #ifdef LN
  124. movl OFFSET, %eax
  125. addl M, %eax
  126. movl %eax, KK /* LN: KK = OFFSET + M */
  127. #endif
  128. #ifdef LT
  129. movl OFFSET, %eax
  130. movl %eax, KK /* LT: KK = OFFSET */
  131. #endif
  132. movl M, %ebx
  133. sarl $1, %ebx /* ebx = M >> 1 row pairs */
  134. jle .L20 /* none: go handle a possible single leftover row */
  135. ALIGN_4
  /* .L11: set up one 2x2 tile. Positions AA and BB, clears the accumulators
     and computes the trip count of the 4x-unrolled inner loop. */
  136. .L11:
  137. #ifdef LN
  138. movl K, %eax
  139. sall $1 + BASE_SHIFT, %eax
  140. subl %eax, AORIG /* LN: AORIG -= K << (1 + BASE_SHIFT) */
  141. #endif
  142. #if defined(LN) || defined(RT)
  143. movl KK, %eax
  144. movl AORIG, AA
  145. leal (, %eax, SIZE), %eax
  146. leal (AA, %eax, 2), AA /* AA = AORIG + 2 * KK * SIZE */
  147. #endif
  148. movl B, BB
  149. #if defined(LN) || defined(RT)
  150. movl KK, %eax
  151. sall $1 + BASE_SHIFT, %eax
  152. addl %eax, BB /* BB = B + (KK << (1 + BASE_SHIFT)) */
  153. #endif
  /* xmm0 is preloaded with the first A value; xmm4..xmm7 are the four
     accumulators and xmm2/xmm3 carry products that are added one step late. */
  154. movsd 0 * SIZE(AA), %xmm0
  155. xorps %xmm2, %xmm2
  156. xorps %xmm3, %xmm3
  157. xorps %xmm4, %xmm4
  158. prefetcht0 3 * SIZE(CO1)
  159. xorps %xmm5, %xmm5
  160. prefetcht0 3 * SIZE(CO1, LDC)
  161. xorps %xmm6, %xmm6
  162. xorps %xmm7, %xmm7
  /* Iteration count: KK for LT/RN, K - KK for LN/RT. */
  163. #if defined(LT) || defined(RN)
  164. movl KK, %eax
  165. #else
  166. movl K, %eax
  167. subl KK, %eax
  168. #endif
  169. sarl $2, %eax /* number of 4-step unrolled iterations */
  170. je .L15
  171. ALIGN_4
  /* .L12: 2x2 inner loop, unrolled four times. Each step reads two A values
     and two B values and accumulates
       xmm4 += a0*b0, xmm5 += a0*b1, xmm6 += a1*b0, xmm7 += a1*b1.
     The a1 products are left in xmm2/xmm3 and only added at the start of the
     following step (or at .L16 / .L18), and xmm0 is always loaded one step
     ahead. The instruction order is deliberate; do not reorder. */
  172. .L12:
  173. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  /* step 0 */
  174. addsd %xmm2, %xmm6
  175. movsd 1 * SIZE(AA), %xmm2
  176. movaps %xmm0, %xmm1
  177. mulsd 0 * SIZE(BB), %xmm0
  178. addsd %xmm3, %xmm7
  179. mulsd 1 * SIZE(BB), %xmm1
  180. addsd %xmm0, %xmm4
  181. movsd 2 * SIZE(AA), %xmm0
  182. movaps %xmm2, %xmm3
  183. mulsd 0 * SIZE(BB), %xmm2
  184. addsd %xmm1, %xmm5
  185. mulsd 1 * SIZE(BB), %xmm3
  /* step 1 */
  186. addsd %xmm2, %xmm6
  187. movsd 3 * SIZE(AA), %xmm2
  188. movaps %xmm0, %xmm1
  189. mulsd 2 * SIZE(BB), %xmm0
  190. addsd %xmm3, %xmm7
  191. mulsd 3 * SIZE(BB), %xmm1
  192. addsd %xmm0, %xmm4
  193. movsd 4 * SIZE(AA), %xmm0
  194. movaps %xmm2, %xmm3
  195. mulsd 2 * SIZE(BB), %xmm2
  196. addsd %xmm1, %xmm5
  197. mulsd 3 * SIZE(BB), %xmm3
  /* step 2 */
  198. addsd %xmm2, %xmm6
  199. movsd 5 * SIZE(AA), %xmm2
  200. movaps %xmm0, %xmm1
  201. mulsd 4 * SIZE(BB), %xmm0
  202. addsd %xmm3, %xmm7
  203. mulsd 5 * SIZE(BB), %xmm1
  204. addsd %xmm0, %xmm4
  205. movsd 6 * SIZE(AA), %xmm0
  206. movaps %xmm2, %xmm3
  207. mulsd 4 * SIZE(BB), %xmm2
  208. addsd %xmm1, %xmm5
  209. mulsd 5 * SIZE(BB), %xmm3
  /* step 3 */
  210. addsd %xmm2, %xmm6
  211. movsd 7 * SIZE(AA), %xmm2
  212. movaps %xmm0, %xmm1
  213. mulsd 6 * SIZE(BB), %xmm0
  214. addsd %xmm3, %xmm7
  215. mulsd 7 * SIZE(BB), %xmm1
  216. addsd %xmm0, %xmm4
  217. movsd 8 * SIZE(AA), %xmm0
  218. movaps %xmm2, %xmm3
  219. mulsd 6 * SIZE(BB), %xmm2
  220. addsd %xmm1, %xmm5
  221. mulsd 7 * SIZE(BB), %xmm3
  /* Four steps consumed 8 values from each of A and B. */
  222. addl $8 * SIZE, BB
  223. addl $8 * SIZE, AA
  224. decl %eax
  225. jne .L12
  226. ALIGN_4
  /* .L15: leftover steps of the 2x2 product. The count is recomputed the same
     way as before the unrolled loop (KK for LT/RN, K - KK otherwise). */
  227. .L15:
  228. #if defined(LT) || defined(RN)
  229. movl KK, %eax
  230. #else
  231. movl K, %eax
  232. subl KK, %eax
  233. #endif
  /* NOTE: the trailing remark on the next line is stale. The mask is 3, so
     eax = count & 3, the 0..3 steps left over by the 4x unrolling. */
  234. andl $3, %eax # if (k & 1)
  235. BRANCH
  236. je .L18
  237. ALIGN_3
  /* .L16: one step of the same 2x2 update as in .L12, repeated eax times. */
  238. .L16:
  239. addsd %xmm2, %xmm6
  240. movsd 1 * SIZE(AA), %xmm2
  241. movaps %xmm0, %xmm1
  242. mulsd 0 * SIZE(BB), %xmm0
  243. addsd %xmm3, %xmm7
  244. mulsd 1 * SIZE(BB), %xmm1
  245. addsd %xmm0, %xmm4
  246. movsd 2 * SIZE(AA), %xmm0
  247. movaps %xmm2, %xmm3
  248. mulsd 0 * SIZE(BB), %xmm2
  249. addsd %xmm1, %xmm5
  250. mulsd 1 * SIZE(BB), %xmm3
  251. addl $2 * SIZE, AA
  252. addl $2 * SIZE, BB
  253. decl %eax
  254. jg .L16
  255. ALIGN_4
  /* .L18: finish the 2x2 tile. Adds the last pending products, subtracts the
     accumulated product from the stored right-hand side, applies the 2x2
     triangular solve for the selected variant, writes the result back to the
     packed buffer and to C, then advances pointers and KK. */
  256. .L18:
  257. addsd %xmm2, %xmm6
  258. addsd %xmm3, %xmm7
  /* LN/RT: reposition AA and BB at index KK - 2. Both preprocessor branches
     subtract 2 here. */
  259. #if defined(LN) || defined(RT)
  260. movl KK, %eax
  261. #ifdef LN
  262. subl $2, %eax
  263. #else
  264. subl $2, %eax
  265. #endif
  266. movl AORIG, AA
  267. leal (, %eax, SIZE), %eax
  268. leal (AA, %eax, 2), AA
  269. leal (B, %eax, 2), BB
  270. #endif
  /* Load the four right-hand-side values (from BB for LN/LT, from AA for
     RN/RT) and subtract the accumulators. Afterwards xmm0/xmm1 hold the
     a0*b0 / a0*b1 entries and xmm2/xmm3 the a1*b0 / a1*b1 entries. */
  271. #if defined(LN) || defined(LT)
  272. movsd 0 * SIZE(BB), %xmm0
  273. movsd 1 * SIZE(BB), %xmm1
  274. movsd 2 * SIZE(BB), %xmm2
  275. movsd 3 * SIZE(BB), %xmm3
  276. subsd %xmm4, %xmm0
  277. subsd %xmm5, %xmm1
  278. subsd %xmm6, %xmm2
  279. subsd %xmm7, %xmm3
  280. #else
  281. movsd 0 * SIZE(AA), %xmm0
  282. movsd 1 * SIZE(AA), %xmm2
  283. movsd 2 * SIZE(AA), %xmm1
  284. movsd 3 * SIZE(AA), %xmm3
  285. subsd %xmm4, %xmm0
  286. subsd %xmm6, %xmm2
  287. subsd %xmm5, %xmm1
  288. subsd %xmm7, %xmm3
  289. #endif
  /* LN: scale (xmm2, xmm3) by AA[3], eliminate with AA[2], scale (xmm0, xmm1)
     by AA[0]. Diagonal terms are applied by multiplication in all four
     variants (presumably they are stored pre-inverted; TODO confirm against
     the packing code). */
  290. #ifdef LN
  291. movsd 3 * SIZE(AA), %xmm4
  292. mulsd %xmm4, %xmm2
  293. movsd 2 * SIZE(AA), %xmm5
  294. mulsd %xmm4, %xmm3
  295. movsd 0 * SIZE(AA), %xmm7
  296. movaps %xmm5, %xmm6
  297. mulsd %xmm2, %xmm5
  298. mulsd %xmm3, %xmm6
  299. subsd %xmm5, %xmm0
  300. subsd %xmm6, %xmm1
  301. mulsd %xmm7, %xmm0
  302. mulsd %xmm7, %xmm1
  303. #endif
  /* LT: scale (xmm0, xmm1) by AA[0], eliminate with AA[1], scale (xmm2, xmm3)
     by AA[3]. */
  304. #ifdef LT
  305. movsd 0 * SIZE(AA), %xmm4
  306. mulsd %xmm4, %xmm0
  307. movsd 1 * SIZE(AA), %xmm5
  308. mulsd %xmm4, %xmm1
  309. movsd 3 * SIZE(AA), %xmm7
  310. movaps %xmm5, %xmm6
  311. mulsd %xmm0, %xmm5
  312. mulsd %xmm1, %xmm6
  313. subsd %xmm5, %xmm2
  314. subsd %xmm6, %xmm3
  315. mulsd %xmm7, %xmm2
  316. mulsd %xmm7, %xmm3
  317. #endif
  /* RN: scale (xmm0, xmm2) by BB[0], eliminate with BB[1], scale (xmm1, xmm3)
     by BB[3]. */
  318. #ifdef RN
  319. movsd 0 * SIZE(BB), %xmm4
  320. mulsd %xmm4, %xmm0
  321. movsd 1 * SIZE(BB), %xmm5
  322. mulsd %xmm4, %xmm2
  323. movsd 3 * SIZE(BB), %xmm7
  324. movaps %xmm5, %xmm6
  325. mulsd %xmm0, %xmm5
  326. mulsd %xmm2, %xmm6
  327. subsd %xmm5, %xmm1
  328. subsd %xmm6, %xmm3
  329. mulsd %xmm7, %xmm1
  330. mulsd %xmm7, %xmm3
  331. #endif
  /* RT: scale (xmm1, xmm3) by BB[3], eliminate with BB[2], scale (xmm0, xmm2)
     by BB[0]. */
  332. #ifdef RT
  333. movsd 3 * SIZE(BB), %xmm4
  334. mulsd %xmm4, %xmm1
  335. movsd 2 * SIZE(BB), %xmm5
  336. mulsd %xmm4, %xmm3
  337. movsd 0 * SIZE(BB), %xmm7
  338. movaps %xmm5, %xmm6
  339. mulsd %xmm1, %xmm5
  340. mulsd %xmm3, %xmm6
  341. subsd %xmm5, %xmm0
  342. subsd %xmm6, %xmm2
  343. mulsd %xmm7, %xmm0
  344. mulsd %xmm7, %xmm2
  345. #endif
  /* Write the solved values back to the buffer they were loaded from. */
  346. #if defined(LN) || defined(LT)
  347. movsd %xmm0, 0 * SIZE(BB)
  348. movsd %xmm1, 1 * SIZE(BB)
  349. movsd %xmm2, 2 * SIZE(BB)
  350. movsd %xmm3, 3 * SIZE(BB)
  351. #else
  352. movsd %xmm0, 0 * SIZE(AA)
  353. movsd %xmm2, 1 * SIZE(AA)
  354. movsd %xmm1, 2 * SIZE(AA)
  355. movsd %xmm3, 3 * SIZE(AA)
  356. #endif
  /* Store the tile into C: two values at CO1 and two at CO1 + LDC. LN steps
     CO1 back before the store; every other variant steps it forward after. */
  357. #ifdef LN
  358. subl $2 * SIZE, CO1
  359. #endif
  360. movsd %xmm0, 0 * SIZE(CO1)
  361. movsd %xmm2, 1 * SIZE(CO1)
  362. movsd %xmm1, 0 * SIZE(CO1, LDC)
  363. movsd %xmm3, 1 * SIZE(CO1, LDC)
  364. #ifndef LN
  365. addl $2 * SIZE, CO1
  366. #endif
  /* LT/RN: skip the remaining K - KK steps of this tile in both AA and BB. */
  367. #if defined(LT) || defined(RN)
  368. movl K, %eax
  369. subl KK, %eax
  370. leal (,%eax, SIZE), %eax
  371. leal (AA, %eax, 2), AA
  372. leal (BB, %eax, 2), BB
  373. #endif
  374. #ifdef LN
  375. subl $2, KK
  376. #endif
  377. #ifdef LT
  378. addl $2, KK
  379. #endif
  380. #ifdef RT
  381. movl K, %eax
  382. sall $1 + BASE_SHIFT, %eax
  383. addl %eax, AORIG /* RT: AORIG += K << (1 + BASE_SHIFT) */
  384. #endif
  385. decl %ebx # i --
  386. jg .L11
  387. ALIGN_4
  /* .L20: handle a single leftover row (M odd) against the current column
     pair, i.e. a 1x2 tile. */
  388. .L20:
  389. movl M, %ebx
  /* NOTE: the trailing remark on the next line is stale. The instruction
     tests bit 0 of M. Because testl clears OF and the result here is 0 or 1
     (SF = 0), the following jle is taken exactly when the bit is clear. */
  390. testl $1, %ebx # i = (m >> 2)
  391. jle .L29
  392. #ifdef LN
  393. movl K, %eax
  394. sall $BASE_SHIFT, %eax
  395. subl %eax, AORIG /* LN: AORIG -= K << BASE_SHIFT */
  396. #endif
  397. #if defined(LN) || defined(RT)
  398. movl KK, %eax
  399. movl AORIG, AA
  400. leal (AA, %eax, SIZE), AA /* AA = AORIG + KK * SIZE */
  401. #endif
  402. movl B, BB
  403. #if defined(LN) || defined(RT)
  404. movl KK, %eax
  405. sall $1 + BASE_SHIFT, %eax
  406. addl %eax, BB /* BB = B + (KK << (1 + BASE_SHIFT)) */
  407. #endif
  /* xmm0 = first A value; xmm4/xmm5 accumulate, xmm2/xmm3 carry products that
     are added one step late. */
  408. movsd 0 * SIZE(AA), %xmm0
  409. xorps %xmm2, %xmm2
  410. xorps %xmm3, %xmm3
  411. xorps %xmm4, %xmm4
  412. xorps %xmm5, %xmm5
  /* Iteration count: KK for LT/RN, K - KK for LN/RT. */
  413. #if defined(LT) || defined(RN)
  414. movl KK, %eax
  415. #else
  416. movl K, %eax
  417. subl KK, %eax
  418. #endif
  419. sarl $2, %eax /* number of 4-step unrolled iterations */
  420. je .L25
  421. ALIGN_4
  /* .L22: 1x2 inner loop, unrolled four times. Each step multiplies one A
     value (xmm0) by two B values and accumulates into xmm4 and xmm5; the
     products sit in xmm2/xmm3 and are added at the start of the next step.
     Per iteration AA advances by 4 values and BB by 8. */
  422. .L22:
  423. addsd %xmm2, %xmm4
  424. movsd 0 * SIZE(BB), %xmm2
  425. addsd %xmm3, %xmm5
  426. movsd 1 * SIZE(BB), %xmm3
  427. mulsd %xmm0, %xmm2
  428. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  429. mulsd %xmm0, %xmm3
  430. movsd 1 * SIZE(AA), %xmm0
  431. addsd %xmm2, %xmm4
  432. movsd 2 * SIZE(BB), %xmm2
  433. addsd %xmm3, %xmm5
  434. movsd 3 * SIZE(BB), %xmm3
  435. mulsd %xmm0, %xmm2
  436. mulsd %xmm0, %xmm3
  437. movsd 2 * SIZE(AA), %xmm0
  438. addsd %xmm2, %xmm4
  439. movsd 4 * SIZE(BB), %xmm2
  440. addsd %xmm3, %xmm5
  441. movsd 5 * SIZE(BB), %xmm3
  442. mulsd %xmm0, %xmm2
  443. mulsd %xmm0, %xmm3
  444. movsd 3 * SIZE(AA), %xmm0
  445. addsd %xmm2, %xmm4
  446. movsd 6 * SIZE(BB), %xmm2
  447. addsd %xmm3, %xmm5
  448. movsd 7 * SIZE(BB), %xmm3
  449. mulsd %xmm0, %xmm2
  450. mulsd %xmm0, %xmm3
  451. movsd 4 * SIZE(AA), %xmm0
  452. addl $4 * SIZE, AA
  453. addl $8 * SIZE, BB
  454. decl %eax
  455. jne .L22
  456. ALIGN_4
  /* .L25: leftover steps; the count is recomputed as before the loop. */
  457. .L25:
  458. #if defined(LT) || defined(RN)
  459. movl KK, %eax
  460. #else
  461. movl K, %eax
  462. subl KK, %eax
  463. #endif
  /* NOTE: the trailing remark on the next line is stale. The mask is 3, so
     eax = count & 3. */
  464. andl $3, %eax # if (k & 1)
  465. BRANCH
  466. je .L28
  467. ALIGN_3
  /* .L26: one 1x2 step, repeated eax times. */
  468. .L26:
  469. addsd %xmm2, %xmm4
  470. movsd 0 * SIZE(BB), %xmm2
  471. addsd %xmm3, %xmm5
  472. movsd 1 * SIZE(BB), %xmm3
  473. mulsd %xmm0, %xmm2
  474. mulsd %xmm0, %xmm3
  475. movsd 1 * SIZE(AA), %xmm0
  476. addl $1 * SIZE, AA
  477. addl $2 * SIZE, BB
  478. decl %eax
  479. jg .L26
  480. ALIGN_4
  /* .L28: finish the 1x2 tile. Adds the pending products, subtracts from the
     stored right-hand side, solves, writes back and stores one value to each
     of the two C columns. */
  481. .L28:
  482. addsd %xmm2, %xmm4
  483. addsd %xmm3, %xmm5
  /* LN/RT: reposition AA and BB; the index is KK - 1 for LN and KK - 2 for
     RT. */
  484. #if defined(LN) || defined(RT)
  485. movl KK, %eax
  486. #ifdef LN
  487. subl $1, %eax
  488. #else
  489. subl $2, %eax
  490. #endif
  491. movl AORIG, AA
  492. leal (, %eax, SIZE), %eax
  493. leal (AA, %eax, 1), AA
  494. leal (B, %eax, 2), BB
  495. #endif
  /* Load the two right-hand-side values (from BB for LN/LT, from AA for
     RN/RT) and subtract the accumulators. */
  496. #if defined(LN) || defined(LT)
  497. movsd 0 * SIZE(BB), %xmm0
  498. movsd 1 * SIZE(BB), %xmm1
  499. subsd %xmm4, %xmm0
  500. subsd %xmm5, %xmm1
  501. #else
  502. movsd 0 * SIZE(AA), %xmm0
  503. movsd 1 * SIZE(AA), %xmm1
  504. subsd %xmm4, %xmm0
  505. subsd %xmm5, %xmm1
  506. #endif
  /* LN/LT: only one A term applies; both values are scaled by AA[0]. */
  507. #if defined(LN) || defined(LT)
  508. movsd 0 * SIZE(AA), %xmm7
  509. mulsd %xmm7, %xmm0
  510. mulsd %xmm7, %xmm1
  511. #endif
  /* RN: scale xmm0 by BB[0], eliminate with BB[1], scale xmm1 by BB[3]. The
     copy placed in xmm6 is not read afterwards on this path. */
  512. #ifdef RN
  513. movsd 0 * SIZE(BB), %xmm4
  514. mulsd %xmm4, %xmm0
  515. movsd 1 * SIZE(BB), %xmm5
  516. movaps %xmm5, %xmm6
  517. movsd 3 * SIZE(BB), %xmm7
  518. mulsd %xmm0, %xmm5
  519. subsd %xmm5, %xmm1
  520. mulsd %xmm7, %xmm1
  521. #endif
  /* RT: scale xmm1 by BB[3], eliminate with BB[2], scale xmm0 by BB[0]. The
     copy placed in xmm6 is not read afterwards on this path. */
  522. #ifdef RT
  523. movsd 3 * SIZE(BB), %xmm4
  524. mulsd %xmm4, %xmm1
  525. movsd 2 * SIZE(BB), %xmm5
  526. movaps %xmm5, %xmm6
  527. movsd 0 * SIZE(BB), %xmm7
  528. mulsd %xmm1, %xmm5
  529. subsd %xmm5, %xmm0
  530. mulsd %xmm7, %xmm0
  531. #endif
  /* Write the solved values back to the buffer they were loaded from. */
  532. #if defined(LN) || defined(LT)
  533. movsd %xmm0, 0 * SIZE(BB)
  534. movsd %xmm1, 1 * SIZE(BB)
  535. #else
  536. movsd %xmm0, 0 * SIZE(AA)
  537. movsd %xmm1, 1 * SIZE(AA)
  538. #endif
  /* Store into C. LN steps CO1 back before the store; every other variant
     steps it forward after. */
  539. #ifdef LN
  540. subl $1 * SIZE, CO1
  541. #endif
  542. movsd %xmm0, 0 * SIZE(CO1)
  543. movsd %xmm1, 0 * SIZE(CO1, LDC)
  544. #ifndef LN
  545. addl $1 * SIZE, CO1
  546. #endif
  /* LT/RN: skip the remaining K - KK steps (one A value, two B values each). */
  547. #if defined(LT) || defined(RN)
  548. movl K, %eax
  549. subl KK, %eax
  550. leal (,%eax, SIZE), %eax
  551. leal (AA, %eax, 1), AA
  552. leal (BB, %eax, 2), BB
  553. #endif
  554. #ifdef LN
  555. subl $1, KK
  556. #endif
  557. #ifdef LT
  558. addl $1, KK
  559. #endif
  560. #ifdef RT
  561. movl K, %eax
  562. sall $BASE_SHIFT, %eax
  563. addl %eax, AORIG /* RT: AORIG += K << BASE_SHIFT */
  564. #endif
  565. ALIGN_4
  /* .L29: end of one column-pair iteration. Moves B to the next pair, updates
     KK for the RN/RT variants and loops while J > 0. */
  566. .L29:
  567. #ifdef LN
  568. movl K, %eax
  569. leal (, %eax, SIZE), %eax
  570. leal (B, %eax, 2), B /* LN: B += 2 * K * SIZE */
  571. #endif
  572. #if defined(LT) || defined(RN)
  573. movl BB, B /* LT/RN: BB already points past this pair */
  574. #endif
  575. #ifdef RN
  576. addl $2, KK
  577. #endif
  578. #ifdef RT
  579. subl $2, KK
  580. #endif
  581. decl J # j --
  582. jg .L10
  583. ALIGN_4
  /* .L30: handle the last single column when N is odd; otherwise exit through
     .L999. Mirrors the setup at .L10 with a column count of one. */
  584. .L30:
  585. testl $1, N
  586. je .L999
  587. #if defined(LT) || defined(RN)
  588. movl A, AA
  589. #else
  590. movl A, %eax
  591. movl %eax, AORIG
  592. #endif
  593. #ifdef RT
  594. movl K, %eax
  595. sall $BASE_SHIFT, %eax
  596. subl %eax, B /* RT: B -= K << BASE_SHIFT */
  597. #endif
  598. #ifdef RT
  599. subl LDC, C /* RT: step C back one column before using it */
  600. #endif
  601. movl C, CO1
  602. #ifndef RT
  603. addl LDC, C /* otherwise: advance C by one column after taking CO1 */
  604. #endif
  605. #ifdef LN
  606. movl OFFSET, %eax
  607. addl M, %eax
  608. movl %eax, KK /* LN: KK = OFFSET + M */
  609. #endif
  610. #ifdef LT
  611. movl OFFSET, %eax
  612. movl %eax, KK /* LT: KK = OFFSET */
  613. #endif
  614. movl M, %ebx
  615. sarl $1, %ebx /* ebx = M >> 1 row pairs */
  616. jle .L40 /* none: go handle a possible single leftover row */
  617. ALIGN_4
  /* .L31: set up one 2x1 tile (two rows, one column). */
  618. .L31:
  619. #ifdef LN
  620. movl K, %eax
  621. sall $1 + BASE_SHIFT, %eax
  622. subl %eax, AORIG /* LN: AORIG -= K << (1 + BASE_SHIFT) */
  623. #endif
  624. #if defined(LN) || defined(RT)
  625. movl KK, %eax
  626. movl AORIG, AA
  627. leal (, %eax, SIZE), %eax
  628. leal (AA, %eax, 2), AA /* AA = AORIG + 2 * KK * SIZE */
  629. #endif
  630. movl B, BB
  631. #if defined(LN) || defined(RT)
  632. movl KK, %eax
  633. sall $BASE_SHIFT, %eax
  634. addl %eax, BB /* BB = B + (KK << BASE_SHIFT) */
  635. #endif
  /* xmm1 = first B value; xmm4/xmm6 accumulate, xmm0/xmm2 carry products that
     are added one step late. */
  636. movsd 0 * SIZE(BB), %xmm1
  637. xorps %xmm0, %xmm0
  638. prefetcht0 3 * SIZE(CO1)
  639. xorps %xmm2, %xmm2
  640. xorps %xmm4, %xmm4
  641. xorps %xmm6, %xmm6
  /* Iteration count: KK for LT/RN, K - KK for LN/RT. */
  642. #if defined(LT) || defined(RN)
  643. movl KK, %eax
  644. #else
  645. movl K, %eax
  646. subl KK, %eax
  647. #endif
  648. sarl $2, %eax /* number of 4-step unrolled iterations */
  649. je .L35
  650. ALIGN_4
  /* .L32: 2x1 inner loop, unrolled four times. Each step multiplies two A
     values by one B value (xmm1) and accumulates into xmm4 and xmm6; the
     products sit in xmm0/xmm2 and are added at the start of the next step.
     Per iteration AA advances by 8 values and BB by 4. */
  651. .L32:
  652. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  653. addsd %xmm0, %xmm4
  654. movsd 0 * SIZE(AA), %xmm0
  655. addsd %xmm2, %xmm6
  656. movsd 1 * SIZE(AA), %xmm2
  657. mulsd %xmm1, %xmm0
  658. mulsd %xmm1, %xmm2
  659. movsd 1 * SIZE(BB), %xmm1
  660. addsd %xmm0, %xmm4
  661. movsd 2 * SIZE(AA), %xmm0
  662. addsd %xmm2, %xmm6
  663. movsd 3 * SIZE(AA), %xmm2
  664. mulsd %xmm1, %xmm0
  665. mulsd %xmm1, %xmm2
  666. movsd 2 * SIZE(BB), %xmm1
  667. addsd %xmm0, %xmm4
  668. movsd 4 * SIZE(AA), %xmm0
  669. addsd %xmm2, %xmm6
  670. movsd 5 * SIZE(AA), %xmm2
  671. mulsd %xmm1, %xmm0
  672. mulsd %xmm1, %xmm2
  673. movsd 3 * SIZE(BB), %xmm1
  674. addsd %xmm0, %xmm4
  675. movsd 6 * SIZE(AA), %xmm0
  676. addsd %xmm2, %xmm6
  677. movsd 7 * SIZE(AA), %xmm2
  678. mulsd %xmm1, %xmm0
  679. mulsd %xmm1, %xmm2
  680. movsd 4 * SIZE(BB), %xmm1
  681. addl $8 * SIZE, AA
  682. addl $4 * SIZE, BB
  683. decl %eax
  684. jne .L32
  685. ALIGN_4
  /* .L35: leftover steps; the count is recomputed as before the loop. */
  686. .L35:
  687. #if defined(LT) || defined(RN)
  688. movl KK, %eax
  689. #else
  690. movl K, %eax
  691. subl KK, %eax
  692. #endif
  /* NOTE: the trailing remark on the next line is stale. The mask is 3, so
     eax = count & 3. */
  693. andl $3, %eax # if (k & 1)
  694. BRANCH
  695. je .L38
  696. ALIGN_3
  /* .L36: one 2x1 step, repeated eax times. */
  697. .L36:
  698. addsd %xmm0, %xmm4
  699. movsd 0 * SIZE(AA), %xmm0
  700. addsd %xmm2, %xmm6
  701. movsd 1 * SIZE(AA), %xmm2
  702. mulsd %xmm1, %xmm0
  703. mulsd %xmm1, %xmm2
  704. movsd 1 * SIZE(BB), %xmm1
  705. addl $2 * SIZE, AA
  706. addl $1 * SIZE, BB
  707. decl %eax
  708. jg .L36
  709. ALIGN_4
  /* .L38: finish the 2x1 tile. Adds the pending products, subtracts from the
     stored right-hand side, solves, writes back and stores two values to the
     single C column. */
  710. .L38:
  711. addsd %xmm0, %xmm4
  712. addsd %xmm2, %xmm6
  /* LN/RT: reposition AA and BB; the index is KK - 2 for LN and KK - 1 for
     RT. */
  713. #if defined(LN) || defined(RT)
  714. movl KK, %eax
  715. #ifdef LN
  716. subl $2, %eax
  717. #else
  718. subl $1, %eax
  719. #endif
  720. movl AORIG, AA
  721. leal (, %eax, SIZE), %eax
  722. leal (AA, %eax, 2), AA
  723. leal (B, %eax, 1), BB
  724. #endif
  /* Load the two right-hand-side values (from BB for LN/LT, from AA for
     RN/RT) and subtract the accumulators. */
  725. #if defined(LN) || defined(LT)
  726. movsd 0 * SIZE(BB), %xmm0
  727. movsd 1 * SIZE(BB), %xmm2
  728. subsd %xmm4, %xmm0
  729. subsd %xmm6, %xmm2
  730. #else
  731. movsd 0 * SIZE(AA), %xmm0
  732. movsd 1 * SIZE(AA), %xmm2
  733. subsd %xmm4, %xmm0
  734. subsd %xmm6, %xmm2
  735. #endif
  /* LN: scale xmm2 by AA[3], eliminate with AA[2], scale xmm0 by AA[0]. */
  736. #ifdef LN
  737. movsd 3 * SIZE(AA), %xmm4
  738. mulsd %xmm4, %xmm2
  739. movsd 2 * SIZE(AA), %xmm5
  740. mulsd %xmm2, %xmm5
  741. movsd 0 * SIZE(AA), %xmm7
  742. subsd %xmm5, %xmm0
  743. mulsd %xmm7, %xmm0
  744. #endif
  /* LT: scale xmm0 by AA[0], eliminate with AA[1], scale xmm2 by AA[3]. */
  745. #ifdef LT
  746. movsd 0 * SIZE(AA), %xmm4
  747. mulsd %xmm4, %xmm0
  748. movsd 1 * SIZE(AA), %xmm5
  749. mulsd %xmm0, %xmm5
  750. movsd 3 * SIZE(AA), %xmm7
  751. subsd %xmm5, %xmm2
  752. mulsd %xmm7, %xmm2
  753. #endif
  /* RN/RT: only one B term applies; both values are scaled by BB[0]. */
  754. #if defined(RN) || defined(RT)
  755. movsd 0 * SIZE(BB), %xmm4
  756. mulsd %xmm4, %xmm0
  757. mulsd %xmm4, %xmm2
  758. #endif
  /* Write the solved values back to the buffer they were loaded from. */
  759. #if defined(LN) || defined(LT)
  760. movsd %xmm0, 0 * SIZE(BB)
  761. movsd %xmm2, 1 * SIZE(BB)
  762. #else
  763. movsd %xmm0, 0 * SIZE(AA)
  764. movsd %xmm2, 1 * SIZE(AA)
  765. #endif
  /* Store into C. LN steps CO1 back before the store; every other variant
     steps it forward after. */
  766. #ifdef LN
  767. subl $2 * SIZE, CO1
  768. #endif
  769. movsd %xmm0, 0 * SIZE(CO1)
  770. movsd %xmm2, 1 * SIZE(CO1)
  771. #ifndef LN
  772. addl $2 * SIZE, CO1
  773. #endif
  /* LT/RN: skip the remaining K - KK steps (two A values, one B value each). */
  774. #if defined(LT) || defined(RN)
  775. movl K, %eax
  776. subl KK, %eax
  777. leal (,%eax, SIZE), %eax
  778. leal (AA, %eax, 2), AA
  779. addl %eax, BB
  780. #endif
  781. #ifdef LN
  782. subl $2, KK
  783. #endif
  784. #ifdef LT
  785. addl $2, KK
  786. #endif
  787. #ifdef RT
  788. movl K, %eax
  789. sall $1 + BASE_SHIFT, %eax
  790. addl %eax, AORIG /* RT: AORIG += K << (1 + BASE_SHIFT) */
  791. #endif
  792. decl %ebx # i --
  793. jg .L31
  794. ALIGN_4
  /* .L40: handle the single leftover row (M odd) against the single leftover
     column, i.e. a 1x1 tile. */
  795. .L40:
  796. movl M, %ebx
  /* NOTE: the trailing remark on the next line is stale. The instruction
     tests bit 0 of M. Because testl clears OF and the result here is 0 or 1
     (SF = 0), the following jle is taken exactly when the bit is clear. */
  797. testl $1, %ebx # i = (m >> 2)
  798. jle .L49
  799. #ifdef LN
  800. movl K, %eax
  801. sall $BASE_SHIFT, %eax
  802. subl %eax, AORIG /* LN: AORIG -= K << BASE_SHIFT */
  803. #endif
  804. #if defined(LN) || defined(RT)
  805. movl KK, %eax
  806. movl AORIG, AA
  807. leal (AA, %eax, SIZE), AA /* AA = AORIG + KK * SIZE */
  808. #endif
  809. movl B, BB
  810. #if defined(LN) || defined(RT)
  811. movl KK, %eax
  812. sall $BASE_SHIFT, %eax
  813. addl %eax, BB /* BB = B + (KK << BASE_SHIFT) */
  814. #endif
  /* Preload the first A value (xmm0) and the first B value (xmm2); xmm4 and
     xmm5 are the two accumulators used by the loops at .L42 / .L46. */
  815. movsd 0 * SIZE(AA), %xmm0
  816. xorps %xmm2, %xmm2
  817. movsd 0 * SIZE(BB), %xmm2
  818. xorps %xmm3, %xmm3
  819. xorps %xmm4, %xmm4
  820. xorps %xmm5, %xmm5
  /* Iteration count: KK for LT/RN, K - KK for LN/RT. */
  821. #if defined(LT) || defined(RN)
  822. movl KK, %eax
  823. #else
  824. movl K, %eax
  825. subl KK, %eax
  826. #endif
  827. sarl $2, %eax /* number of 4-step unrolled iterations */
  828. je .L45
  829. ALIGN_4
  /* .L42: 1x1 inner loop (dot product), unrolled four times. Products are
     accumulated alternately into xmm4 and xmm5; the next A and B values are
     loaded into xmm0 / xmm2 one step ahead. Per iteration AA and BB each
     advance by 4 values. */
  830. .L42:
  831. mulsd %xmm0, %xmm2
  832. movsd 1 * SIZE(AA), %xmm0
  833. addsd %xmm2, %xmm4
  834. movsd 1 * SIZE(BB), %xmm2
  835. mulsd %xmm0, %xmm2
  836. movsd 2 * SIZE(AA), %xmm0
  837. addsd %xmm2, %xmm5
  838. movsd 2 * SIZE(BB), %xmm2
  839. mulsd %xmm0, %xmm2
  840. movsd 3 * SIZE(AA), %xmm0
  841. addsd %xmm2, %xmm4
  842. movsd 3 * SIZE(BB), %xmm2
  843. mulsd %xmm0, %xmm2
  844. movsd 4 * SIZE(AA), %xmm0
  845. addsd %xmm2, %xmm5
  846. movsd 4 * SIZE(BB), %xmm2
  847. addl $4 * SIZE, AA
  848. addl $4 * SIZE, BB
  849. decl %eax
  850. jne .L42
  851. ALIGN_4
  /* .L45: leftover steps; the count is recomputed as before the loop. */
  852. .L45:
  853. #if defined(LT) || defined(RN)
  854. movl KK, %eax
  855. #else
  856. movl K, %eax
  857. subl KK, %eax
  858. #endif
  /* NOTE: the trailing remark on the next line is stale. The mask is 3, so
     eax = count & 3. */
  859. andl $3, %eax # if (k & 1)
  860. BRANCH
  861. je .L48
  862. ALIGN_3
  /* .L46: one multiply-accumulate into xmm4, repeated eax times. */
  863. .L46:
  864. mulsd %xmm0, %xmm2
  865. movsd 1 * SIZE(AA), %xmm0
  866. addsd %xmm2, %xmm4
  867. movsd 1 * SIZE(BB), %xmm2
  868. addl $1 * SIZE, AA
  869. addl $1 * SIZE, BB
  870. decl %eax
  871. jg .L46
  872. ALIGN_4
  /* .L48: finish the 1x1 tile. Combines the two partial sums, subtracts from
     the stored right-hand side, scales by the single diagonal term, writes
     back and stores one value to C. */
  873. .L48:
  874. addsd %xmm5, %xmm4
  /* LN/RT: reposition AA and BB at index KK - 1. Both preprocessor branches
     subtract 1 here. */
  875. #if defined(LN) || defined(RT)
  876. movl KK, %eax
  877. #ifdef LN
  878. subl $1, %eax
  879. #else
  880. subl $1, %eax
  881. #endif
  882. movl AORIG, AA
  883. leal (, %eax, SIZE), %eax
  884. addl %eax, AA
  885. leal (B, %eax, 1), BB
  886. #endif
  /* Load the right-hand-side value (from BB for LN/LT, from AA for RN/RT)
     and subtract the accumulated product. */
  887. #if defined(LN) || defined(LT)
  888. movsd 0 * SIZE(BB), %xmm0
  889. subsd %xmm4, %xmm0
  890. #else
  891. movsd 0 * SIZE(AA), %xmm0
  892. subsd %xmm4, %xmm0
  893. #endif
  /* Scale by the single triangular term: AA[0] for LN/LT, BB[0] for RN/RT. */
  894. #if defined(LN) || defined(LT)
  895. mulsd 0 * SIZE(AA), %xmm0
  896. #endif
  897. #if defined(RN) || defined(RT)
  898. mulsd 0 * SIZE(BB), %xmm0
  899. #endif
  /* Write the solved value back to the buffer it was loaded from. */
  900. #if defined(LN) || defined(LT)
  901. movsd %xmm0, 0 * SIZE(BB)
  902. #else
  903. movsd %xmm0, 0 * SIZE(AA)
  904. #endif
  /* Store into C. LN steps CO1 back before the store; every other variant
     steps it forward after. */
  905. #ifdef LN
  906. subl $1 * SIZE, CO1
  907. #endif
  908. movsd %xmm0, 0 * SIZE(CO1)
  909. #ifndef LN
  910. addl $1 * SIZE, CO1
  911. #endif
  /* LT/RN: skip the remaining K - KK steps in both AA and BB. */
  912. #if defined(LT) || defined(RN)
  913. movl K, %eax
  914. subl KK, %eax
  915. leal (,%eax, SIZE), %eax
  916. addl %eax, AA
  917. addl %eax, BB
  918. #endif
  919. #ifdef LN
  920. subl $1, KK
  921. #endif
  922. #ifdef LT
  923. addl $1, KK
  924. #endif
  925. #ifdef RT
  926. movl K, %eax
  927. sall $BASE_SHIFT, %eax
  928. addl %eax, AORIG /* RT: AORIG += K << BASE_SHIFT */
  929. #endif
  930. ALIGN_4
  /* .L49: bookkeeping after the single leftover column. Updates B and KK the
     same way .L29 does, but with a step of one column. */
  931. .L49:
  932. #ifdef LN
  933. movl K, %eax
  934. leal (B, %eax, SIZE), B /* LN: B += K * SIZE */
  935. #endif
  936. #if defined(LT) || defined(RN)
  937. movl BB, B
  938. #endif
  939. #ifdef RN
  940. addl $1, KK
  941. #endif
  942. #ifdef RT
  943. subl $1, KK
  944. #endif
  945. ALIGN_4
  /* .L999: common exit. Restores the registers pushed in the prologue in
     reverse order, releases the ARGS scratch area and returns. */
  946. .L999:
  947. popl %ebx
  948. popl %esi
  949. popl %edi
  950. popl %ebp
  951. addl $ARGS, %esp
  952. ret
  953. EPILOGUE