
ztrsm_kernel_LT_1x2_sse2.S

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED.  IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT      */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"

#define STACK	16
#define ARGS	0

#define STACK_M		 4 + STACK + ARGS(%esi)
#define STACK_N		 8 + STACK + ARGS(%esi)
#define STACK_K		12 + STACK + ARGS(%esi)
#define STACK_ALPHA_R	16 + STACK + ARGS(%esi)
#define STACK_ALPHA_I	24 + STACK + ARGS(%esi)
#define STACK_A		32 + STACK + ARGS(%esi)
#define STACK_B		36 + STACK + ARGS(%esi)
#define STACK_C		40 + STACK + ARGS(%esi)
#define STACK_LDC	44 + STACK + ARGS(%esi)
#define STACK_OFFT	48 + STACK + ARGS(%esi)

#define POSINV		  0(%esp)
#define K		 16(%esp)
#define N		 20(%esp)
#define M		 24(%esp)
#define A		 28(%esp)
#define C		 32(%esp)
#define J		 36(%esp)
#define OLD_STACK	 40(%esp)
#define OFFSET		 44(%esp)
#define KK		 48(%esp)
#define KKK		 52(%esp)
#define AORIG		 56(%esp)
#define BORIG		 60(%esp)
#define BUFFER		128(%esp)

#define STACK_ALIGN	4096
#define STACK_OFFSET	1024

#if defined(OPTERON) || defined(BARCELONA)
#define PREFETCH	prefetch
#else
#define PREFETCH	prefetcht0
#endif

#define PREFETCHSIZE	(8 * 10 + 4)

#define AA	%edx
#define BB	%ecx
#define LDC	%ebp
#define B	%edi
#define CO1	%esi

#ifndef CONJ
#define NN
#else
#if defined(LN) || defined(LT)
#define CN
#else
#define NC
#endif
#endif
#define KERNEL1(address) \
	mulpd	%xmm0, %xmm2; \
	addpd	%xmm2, %xmm4; \
	PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \
	movapd	 2 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulpd	%xmm0, %xmm2; \
	addpd	%xmm2, %xmm5; \
	movapd	 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulpd	%xmm0, %xmm2; \
	mulpd	 6 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
	addpd	%xmm2, %xmm6; \
	movapd	16 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	addpd	%xmm0, %xmm7; \
	movapd	 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL2(address) \
	mulpd	%xmm0, %xmm3; \
	addpd	%xmm3, %xmm4; \
	movapd	10 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulpd	%xmm0, %xmm3; \
	addpd	%xmm3, %xmm5; \
	movapd	12 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulpd	%xmm0, %xmm3; \
	mulpd	14 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
	addpd	%xmm3, %xmm6; \
	movapd	24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	addpd	%xmm0, %xmm7; \
	movapd	 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL3(address) \
	mulpd	%xmm0, %xmm2; \
	addpd	%xmm2, %xmm4; \
	movapd	18 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulpd	%xmm0, %xmm2; \
	addpd	%xmm2, %xmm5; \
	movapd	20 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulpd	%xmm0, %xmm2; \
	mulpd	22 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
	addpd	%xmm2, %xmm6; \
	movapd	32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	addpd	%xmm0, %xmm7; \
	movapd	 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL4(address) \
	mulpd	%xmm0, %xmm3; \
	addpd	%xmm3, %xmm4; \
	movapd	26 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulpd	%xmm0, %xmm3; \
	addpd	%xmm3, %xmm5; \
	movapd	28 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulpd	%xmm0, %xmm3; \
	mulpd	30 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
	addpd	%xmm3, %xmm6; \
	movapd	40 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	addpd	%xmm0, %xmm7; \
	movapd	16 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL5(address) \
	PREFETCH (PREFETCHSIZE + 8) * SIZE + (address) * 1 * SIZE(AA); \
	mulpd	%xmm1, %xmm2; \
	addpd	%xmm2, %xmm4; \
	movapd	34 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulpd	%xmm1, %xmm2; \
	addpd	%xmm2, %xmm5; \
	movapd	36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulpd	%xmm1, %xmm2; \
	mulpd	38 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
	addpd	%xmm2, %xmm6; \
	movapd	48 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	addpd	%xmm1, %xmm7; \
	movapd	10 * SIZE + (address) * 1 * SIZE(AA), %xmm1

#define KERNEL6(address) \
	mulpd	%xmm1, %xmm3; \
	addpd	%xmm3, %xmm4; \
	movapd	42 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulpd	%xmm1, %xmm3; \
	addpd	%xmm3, %xmm5; \
	movapd	44 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulpd	%xmm1, %xmm3; \
	mulpd	46 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
	addpd	%xmm3, %xmm6; \
	movapd	56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	addpd	%xmm1, %xmm7; \
	movapd	12 * SIZE + (address) * 1 * SIZE(AA), %xmm1

#define KERNEL7(address) \
	mulpd	%xmm1, %xmm2; \
	addpd	%xmm2, %xmm4; \
	movapd	50 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulpd	%xmm1, %xmm2; \
	addpd	%xmm2, %xmm5; \
	movapd	52 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulpd	%xmm1, %xmm2; \
	mulpd	54 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
	addpd	%xmm2, %xmm6; \
	movapd	64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	addpd	%xmm1, %xmm7; \
	movapd	14 * SIZE + (address) * 1 * SIZE(AA), %xmm1

#define KERNEL8(address) \
	mulpd	%xmm1, %xmm3; \
	addpd	%xmm3, %xmm4; \
	movapd	58 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulpd	%xmm1, %xmm3; \
	addpd	%xmm3, %xmm5; \
	movapd	60 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulpd	%xmm1, %xmm3; \
	mulpd	62 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
	addpd	%xmm3, %xmm6; \
	movapd	72 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	addpd	%xmm1, %xmm7; \
	movapd	24 * SIZE + (address) * 1 * SIZE(AA), %xmm1
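
/* Illustrative sketch only (names are hypothetical, not from this file):
   each KERNELn macro above performs one unrolled step of the 1x2
   double-complex multiply-accumulate.  A is loaded as (re, im) pairs,
   while BUFFER holds every scalar of B duplicated across both lanes, so
   the accumulators xmm4..xmm7 collect split partial products that are
   later recombined (SHUFPD_1 plus the POSINV sign mask) into complex
   sums before the triangular-solve update.  Rough C reference for the
   non-conjugated combine:

       static void zmicro_1x2(int k, const double *a, const double *b,
                              double *c0, double *c1) {
           double rr0 = 0, ir0 = 0, ri0 = 0, ii0 = 0;   // xmm4, xmm5
           double rr1 = 0, ir1 = 0, ri1 = 0, ii1 = 0;   // xmm6, xmm7
           for (int l = 0; l < k; l++) {
               double ar = a[2 * l], ai = a[2 * l + 1];
               rr0 += ar * b[4 * l + 0];  ir0 += ai * b[4 * l + 0];
               ri0 += ar * b[4 * l + 1];  ii0 += ai * b[4 * l + 1];
               rr1 += ar * b[4 * l + 2];  ir1 += ai * b[4 * l + 2];
               ri1 += ar * b[4 * l + 3];  ii1 += ai * b[4 * l + 3];
           }
           c0[0] = rr0 - ii0;  c0[1] = ir0 + ri0;
           c1[0] = rr1 - ii1;  c1[1] = ir1 + ri1;
       }
*/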
	PROLOGUE

	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	EMMS

	movl	%esp, %esi		# save old stack
	subl	$128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp
	andl	$-STACK_ALIGN, %esp	# align stack
	addl	$STACK_OFFSET, %esp

	STACK_TOUCHING

	movl	STACK_M, %ebx
	movl	STACK_N, %eax
	movl	STACK_K, %ecx
	movl	STACK_A, %edx

	movl	%ebx, M
	movl	%eax, N
	movl	%ecx, K
	movl	%edx, A
	movl	%esi, OLD_STACK

	movl	STACK_B, B
	movl	STACK_C, %ebx
	movss	STACK_OFFT, %xmm4

	pcmpeqb	%xmm7, %xmm7
	psllq	$63, %xmm7		# generate sign-bit mask
	pxor	%xmm2, %xmm2

	movlpd	%xmm2, 0 + POSINV
	movlpd	%xmm7, 8 + POSINV

	movl	%ebx, C
	movl	STACK_LDC, LDC

	movss	%xmm4, OFFSET
	movss	%xmm4, KK

	sall	$ZBASE_SHIFT, LDC

#ifdef LN
	movl	M, %eax
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, C
	imull	K, %eax
	addl	%eax, A
#endif

#ifdef RT
	movl	N, %eax
	sall	$ZBASE_SHIFT, %eax
	imull	K, %eax
	addl	%eax, B

	movl	N, %eax
	imull	LDC, %eax
	addl	%eax, C
#endif

#ifdef RN
	negl	KK
#endif

#ifdef RT
	movl	N, %eax
	subl	OFFSET, %eax
	movl	%eax, KK
#endif

	movl	N, %eax
	sarl	$1, %eax
	movl	%eax, J			# j = (n >> 1)
	jle	.L100
	ALIGN_4
.L01:
#ifdef LN
	movl	OFFSET, %eax
	addl	M, %eax
	movl	%eax, KK
#endif

	leal	BUFFER, BB

#ifdef RT
	movl	K, %eax
	sall	$1 + ZBASE_SHIFT, %eax
	subl	%eax, B
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	B, BORIG
	sall	$1 + ZBASE_SHIFT, %eax
	addl	%eax, B
	leal	(BB, %eax, 2), BB
#endif

#if defined(LT)
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$1, %eax
	jle	.L03
	ALIGN_4

.L02:
	prefetchnta	56 * SIZE(B)

	movlpd	 0 * SIZE(B), %xmm0
	movlpd	 1 * SIZE(B), %xmm1
	movlpd	 2 * SIZE(B), %xmm2
	movlpd	 3 * SIZE(B), %xmm3
	movlpd	 4 * SIZE(B), %xmm4
	movlpd	 5 * SIZE(B), %xmm5
	movlpd	 6 * SIZE(B), %xmm6
	movlpd	 7 * SIZE(B), %xmm7

	movlpd	%xmm0,  0 * SIZE(BB)
	movlpd	%xmm0,  1 * SIZE(BB)
	movlpd	%xmm1,  2 * SIZE(BB)
	movlpd	%xmm1,  3 * SIZE(BB)
	movlpd	%xmm2,  4 * SIZE(BB)
	movlpd	%xmm2,  5 * SIZE(BB)
	movlpd	%xmm3,  6 * SIZE(BB)
	movlpd	%xmm3,  7 * SIZE(BB)
	movlpd	%xmm4,  8 * SIZE(BB)
	movlpd	%xmm4,  9 * SIZE(BB)
	movlpd	%xmm5, 10 * SIZE(BB)
	movlpd	%xmm5, 11 * SIZE(BB)
	movlpd	%xmm6, 12 * SIZE(BB)
	movlpd	%xmm6, 13 * SIZE(BB)
	movlpd	%xmm7, 14 * SIZE(BB)
	movlpd	%xmm7, 15 * SIZE(BB)

	addl	$ 8 * SIZE, B
	subl	$-16 * SIZE, BB
	decl	%eax
	jne	.L02
	ALIGN_4

.L03:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$1, %eax
	BRANCH
	jle	.L05

	movlpd	 0 * SIZE(B), %xmm0
	movlpd	 1 * SIZE(B), %xmm1
	movlpd	 2 * SIZE(B), %xmm2
	movlpd	 3 * SIZE(B), %xmm3

	movlpd	%xmm0,  0 * SIZE(BB)
	movlpd	%xmm0,  1 * SIZE(BB)
	movlpd	%xmm1,  2 * SIZE(BB)
	movlpd	%xmm1,  3 * SIZE(BB)
	movlpd	%xmm2,  4 * SIZE(BB)
	movlpd	%xmm2,  5 * SIZE(BB)
	movlpd	%xmm3,  6 * SIZE(BB)
	movlpd	%xmm3,  7 * SIZE(BB)

	addl	$4 * SIZE, B
	ALIGN_4
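
/* Illustrative sketch only (names are hypothetical, not from this file):
   the .L02 loop above and its .L03 tail duplicate every double of B into
   the aligned BUFFER, so the compute loops can fetch one scalar broadcast
   across both SSE2 lanes with a single movapd.  Roughly:

       static void duplicate_b(int n_doubles, const double *b, double *bb) {
           for (int i = 0; i < n_doubles; i++) {
               bb[2 * i]     = b[i];
               bb[2 * i + 1] = b[i];
           }
       }
*/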
.L05:
#if defined(LT) || defined(RN)
	movl	A, %eax
	movl	%eax, AA
#else
	movl	A, %eax
	movl	%eax, AORIG
#endif

#ifdef RT
	leal	(, LDC, 2), %eax
	subl	%eax, C
#endif
	movl	C, CO1
#ifndef RT
	leal	(, LDC, 2), %eax
	addl	%eax, C
#endif

	movl	M, %ebx
	testl	%ebx, %ebx
	jle	.L100
	ALIGN_4

.L10:
#ifdef LN
	movl	K, %eax
	sall	$ZBASE_SHIFT, %eax
	subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	AORIG, %eax
	movl	%eax, AA
	movl	KK, %eax
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, AA
#endif

	leal	BUFFER, BB

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$1 + ZBASE_SHIFT, %eax
	leal	(BB, %eax, 2), BB
#endif

	movapd	 0 * SIZE(AA), %xmm0
	pxor	%xmm4, %xmm4
	movapd	 8 * SIZE(AA), %xmm1
	pxor	%xmm5, %xmm5
	movapd	 0 * SIZE(BB), %xmm2
	pxor	%xmm6, %xmm6
	movapd	 8 * SIZE(BB), %xmm3
	pxor	%xmm7, %xmm7

#ifdef LN
	prefetchw	-2 * SIZE(CO1)
	prefetchw	-2 * SIZE(CO1, LDC)
#else
	prefetchw	 2 * SIZE(CO1)
	prefetchw	 2 * SIZE(CO1, LDC)
#endif

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif

#if 1
	andl	$-8, %eax
	sall	$4, %eax
	je	.L15

.L1X:
	KERNEL1(16 * 0)
	KERNEL2(16 * 0)
	KERNEL3(16 * 0)
	KERNEL4(16 * 0)
	KERNEL5(16 * 0)
	KERNEL6(16 * 0)
	KERNEL7(16 * 0)
	KERNEL8(16 * 0)
	cmpl	$128 * 1, %eax
	jle	.L12
	KERNEL1(16 * 1)
	KERNEL2(16 * 1)
	KERNEL3(16 * 1)
	KERNEL4(16 * 1)
	KERNEL5(16 * 1)
	KERNEL6(16 * 1)
	KERNEL7(16 * 1)
	KERNEL8(16 * 1)
	cmpl	$128 * 2, %eax
	jle	.L12
	KERNEL1(16 * 2)
	KERNEL2(16 * 2)
	KERNEL3(16 * 2)
	KERNEL4(16 * 2)
	KERNEL5(16 * 2)
	KERNEL6(16 * 2)
	KERNEL7(16 * 2)
	KERNEL8(16 * 2)
	cmpl	$128 * 3, %eax
	jle	.L12
	KERNEL1(16 * 3)
	KERNEL2(16 * 3)
	KERNEL3(16 * 3)
	KERNEL4(16 * 3)
	KERNEL5(16 * 3)
	KERNEL6(16 * 3)
	KERNEL7(16 * 3)
	KERNEL8(16 * 3)
	cmpl	$128 * 4, %eax
	jle	.L12
	KERNEL1(16 * 4)
	KERNEL2(16 * 4)
	KERNEL3(16 * 4)
	KERNEL4(16 * 4)
	KERNEL5(16 * 4)
	KERNEL6(16 * 4)
	KERNEL7(16 * 4)
	KERNEL8(16 * 4)
	cmpl	$128 * 5, %eax
	jle	.L12
	KERNEL1(16 * 5)
	KERNEL2(16 * 5)
	KERNEL3(16 * 5)
	KERNEL4(16 * 5)
	KERNEL5(16 * 5)
	KERNEL6(16 * 5)
	KERNEL7(16 * 5)
	KERNEL8(16 * 5)
	cmpl	$128 * 6, %eax
	jle	.L12
	KERNEL1(16 * 6)
	KERNEL2(16 * 6)
	KERNEL3(16 * 6)
	KERNEL4(16 * 6)
	KERNEL5(16 * 6)
	KERNEL6(16 * 6)
	KERNEL7(16 * 6)
	KERNEL8(16 * 6)
	cmpl	$128 * 7, %eax
	jle	.L12
	KERNEL1(16 * 7)
	KERNEL2(16 * 7)
	KERNEL3(16 * 7)
	KERNEL4(16 * 7)
	KERNEL5(16 * 7)
	KERNEL6(16 * 7)
	KERNEL7(16 * 7)
	KERNEL8(16 * 7)

	addl	$128 * 4 * SIZE, BB
	addl	$128 * 1 * SIZE, AA
	subl	$128 * 8, %eax
	jg	.L1X
	jmp	.L15

.L12:
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 4), BB
	ALIGN_4
#else
	sarl	$3, %eax
	je	.L15
	ALIGN_4

.L12:
	KERNEL1(16 * 0)
	KERNEL2(16 * 0)
	KERNEL3(16 * 0)
	KERNEL4(16 * 0)
	KERNEL5(16 * 0)
	KERNEL6(16 * 0)
	KERNEL7(16 * 0)
	KERNEL8(16 * 0)

	addl	$64 * SIZE, BB
	addl	$16 * SIZE, AA
	decl	%eax
	jne	.L11
	ALIGN_4
#endif
.L15:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# if (k & 7)
	BRANCH
	je	.L14
	ALIGN_4

.L13:
	mulpd	%xmm0, %xmm2
	addpd	%xmm2, %xmm4
	movapd	 2 * SIZE(BB), %xmm2
	mulpd	%xmm0, %xmm2
	addpd	%xmm2, %xmm5
	movapd	 4 * SIZE(BB), %xmm2
	mulpd	%xmm0, %xmm2
	mulpd	 6 * SIZE(BB), %xmm0
	addpd	%xmm2, %xmm6
	movapd	 8 * SIZE(BB), %xmm2
	addpd	%xmm0, %xmm7
	movapd	 2 * SIZE(AA), %xmm0

	addl	$2 * SIZE, AA
	addl	$8 * SIZE, BB
	decl	%eax
	jg	.L13
	ALIGN_4
.L14:
#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$1, %eax
#else
	subl	$2, %eax
#endif

	movl	AORIG, AA
	movl	BORIG, B
	leal	BUFFER, BB

	sall	$ZBASE_SHIFT, %eax
	addl	%eax, AA
	leal	(B,  %eax, 2), B
	leal	(BB, %eax, 4), BB
#endif

	movapd	POSINV, %xmm1

	SHUFPD_1 %xmm5, %xmm5
	SHUFPD_1 %xmm7, %xmm7

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)
	xorpd	%xmm1, %xmm5
	xorpd	%xmm1, %xmm7
#else
	xorpd	%xmm1, %xmm4
	xorpd	%xmm1, %xmm6
#endif

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
	subpd	%xmm5, %xmm4
	subpd	%xmm7, %xmm6
#else
	addpd	%xmm5, %xmm4
	addpd	%xmm7, %xmm6
#endif

#if defined(LN) || defined(LT)
	movapd	0 * SIZE(B), %xmm5
	movapd	2 * SIZE(B), %xmm7

	subpd	%xmm4, %xmm5
	subpd	%xmm6, %xmm7
#else
	movapd	0 * SIZE(AA), %xmm5
	movapd	2 * SIZE(AA), %xmm7

	subpd	%xmm4, %xmm5
	subpd	%xmm6, %xmm7
#endif

#ifndef CONJ
	SHUFPD_1 %xmm1, %xmm1
#endif

#if defined(LN) || defined(LT)
	movlpd	0 * SIZE(AA), %xmm2
	movhpd	0 * SIZE(AA), %xmm2
	movlpd	1 * SIZE(AA), %xmm3
	movhpd	1 * SIZE(AA), %xmm3

	pshufd	$0x4e, %xmm5, %xmm4
	pshufd	$0x4e, %xmm7, %xmm6

	xorpd	%xmm1, %xmm4
	xorpd	%xmm1, %xmm6

	mulpd	%xmm2, %xmm5
	mulpd	%xmm3, %xmm4
	mulpd	%xmm2, %xmm7
	mulpd	%xmm3, %xmm6

	addpd	%xmm4, %xmm5
	addpd	%xmm6, %xmm7
#endif

#ifdef RN
	movlpd	0 * SIZE(B), %xmm2
	movhpd	0 * SIZE(B), %xmm2
	movlpd	1 * SIZE(B), %xmm3
	movhpd	1 * SIZE(B), %xmm3

	pshufd	$0x4e, %xmm5, %xmm4
	xorpd	%xmm1, %xmm4
	mulpd	%xmm2, %xmm5
	mulpd	%xmm3, %xmm4
	addpd	%xmm4, %xmm5

	movlpd	2 * SIZE(B), %xmm2
	movhpd	2 * SIZE(B), %xmm2
	movlpd	3 * SIZE(B), %xmm3
	movhpd	3 * SIZE(B), %xmm3

	movapd	%xmm5, %xmm4
	pshufd	$0x4e, %xmm5, %xmm6
	xorpd	%xmm1, %xmm6
	mulpd	%xmm2, %xmm4
	mulpd	%xmm3, %xmm6
	subpd	%xmm4, %xmm7
	subpd	%xmm6, %xmm7

	movlpd	6 * SIZE(B), %xmm2
	movhpd	6 * SIZE(B), %xmm2
	movlpd	7 * SIZE(B), %xmm3
	movhpd	7 * SIZE(B), %xmm3

	pshufd	$0x4e, %xmm7, %xmm6
	xorpd	%xmm1, %xmm6
	mulpd	%xmm2, %xmm7
	mulpd	%xmm3, %xmm6
	addpd	%xmm6, %xmm7
#endif

#ifdef RT
	movlpd	6 * SIZE(B), %xmm2
	movhpd	6 * SIZE(B), %xmm2
	movlpd	7 * SIZE(B), %xmm3
	movhpd	7 * SIZE(B), %xmm3

	pshufd	$0x4e, %xmm7, %xmm6
	xorpd	%xmm1, %xmm6
	mulpd	%xmm2, %xmm7
	mulpd	%xmm3, %xmm6
	addpd	%xmm6, %xmm7

	movlpd	4 * SIZE(B), %xmm2
	movhpd	4 * SIZE(B), %xmm2
	movlpd	5 * SIZE(B), %xmm3
	movhpd	5 * SIZE(B), %xmm3

	movapd	%xmm7, %xmm4
	pshufd	$0x4e, %xmm7, %xmm6
	xorpd	%xmm1, %xmm6
	mulpd	%xmm2, %xmm4
	mulpd	%xmm3, %xmm6
	subpd	%xmm4, %xmm5
	subpd	%xmm6, %xmm5

	movlpd	0 * SIZE(B), %xmm2
	movhpd	0 * SIZE(B), %xmm2
	movlpd	1 * SIZE(B), %xmm3
	movhpd	1 * SIZE(B), %xmm3

	pshufd	$0x4e, %xmm5, %xmm4
	xorpd	%xmm1, %xmm4
	mulpd	%xmm2, %xmm5
	mulpd	%xmm3, %xmm4
	addpd	%xmm4, %xmm5
#endif

#ifdef LN
	subl	$2 * SIZE, CO1
#endif

	movlpd	%xmm5, 0 * SIZE(CO1)
	movhpd	%xmm5, 1 * SIZE(CO1)
	movlpd	%xmm7, 0 * SIZE(CO1, LDC)
	movhpd	%xmm7, 1 * SIZE(CO1, LDC)

#if defined(LN) || defined(LT)
	movapd	%xmm5, 0 * SIZE(B)
	movapd	%xmm7, 2 * SIZE(B)

	movlpd	%xmm5, 0 * SIZE(BB)
	movlpd	%xmm5, 1 * SIZE(BB)
	movhpd	%xmm5, 2 * SIZE(BB)
	movhpd	%xmm5, 3 * SIZE(BB)
	movlpd	%xmm7, 4 * SIZE(BB)
	movlpd	%xmm7, 5 * SIZE(BB)
	movhpd	%xmm7, 6 * SIZE(BB)
	movhpd	%xmm7, 7 * SIZE(BB)
#else
	movapd	%xmm5, 0 * SIZE(AA)
	movapd	%xmm7, 2 * SIZE(AA)
#endif

#ifndef LN
	addl	$2 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K, %eax
	subl	KK, %eax
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, AA
#ifdef LT
	addl	$4 * SIZE, B
#endif
#endif

#ifdef LN
	subl	$1, KK
	movl	BORIG, B
#endif
#ifdef LT
	addl	$1, KK
#endif

#ifdef RT
	movl	K, %eax
	movl	BORIG, B
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, AORIG
#endif

	decl	%ebx			# i --
	jg	.L10
	ALIGN_4
.L99:
#ifdef LN
	movl	K, %eax
	sall	$1 + ZBASE_SHIFT, %eax
	addl	%eax, B
#endif

#if defined(LT) || defined(RN)
	movl	K, %eax
	subl	KK, %eax
	sall	$1 + ZBASE_SHIFT, %eax
	addl	%eax, B
#endif

#ifdef RN
	addl	$2, KK
#endif

#ifdef RT
	subl	$2, KK
#endif

	decl	J			# j --
	jg	.L01
	ALIGN_4

.L100:
	movl	N, %eax
	andl	$1, %eax
	jle	.L500
	ALIGN_4

.L101:
#ifdef LN
	movl	OFFSET, %eax
	addl	M, %eax
	movl	%eax, KK
#endif

	leal	BUFFER, BB

#ifdef RT
	movl	K, %eax
	sall	$ZBASE_SHIFT, %eax
	subl	%eax, B
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	B, BORIG
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, B
	leal	(BB, %eax, 2), BB
#endif

#if defined(LT)
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$2, %eax
	jle	.L103
	ALIGN_4

.L102:
	prefetchnta	56 * SIZE(B)

	movlpd	 0 * SIZE(B), %xmm0
	movlpd	 1 * SIZE(B), %xmm1
	movlpd	 2 * SIZE(B), %xmm2
	movlpd	 3 * SIZE(B), %xmm3
	movlpd	 4 * SIZE(B), %xmm4
	movlpd	 5 * SIZE(B), %xmm5
	movlpd	 6 * SIZE(B), %xmm6
	movlpd	 7 * SIZE(B), %xmm7

	movlpd	%xmm0,  0 * SIZE(BB)
	movlpd	%xmm0,  1 * SIZE(BB)
	movlpd	%xmm1,  2 * SIZE(BB)
	movlpd	%xmm1,  3 * SIZE(BB)
	movlpd	%xmm2,  4 * SIZE(BB)
	movlpd	%xmm2,  5 * SIZE(BB)
	movlpd	%xmm3,  6 * SIZE(BB)
	movlpd	%xmm3,  7 * SIZE(BB)
	movlpd	%xmm4,  8 * SIZE(BB)
	movlpd	%xmm4,  9 * SIZE(BB)
	movlpd	%xmm5, 10 * SIZE(BB)
	movlpd	%xmm5, 11 * SIZE(BB)
	movlpd	%xmm6, 12 * SIZE(BB)
	movlpd	%xmm6, 13 * SIZE(BB)
	movlpd	%xmm7, 14 * SIZE(BB)
	movlpd	%xmm7, 15 * SIZE(BB)

	addl	$ 8 * SIZE, B
	subl	$-16 * SIZE, BB
	decl	%eax
	jne	.L102
	ALIGN_4

.L103:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$3, %eax
	BRANCH
	jle	.L105
	ALIGN_4

.L104:
	movlpd	0 * SIZE(B), %xmm0
	movlpd	1 * SIZE(B), %xmm1

	movlpd	%xmm0, 0 * SIZE(BB)
	movlpd	%xmm0, 1 * SIZE(BB)
	movlpd	%xmm1, 2 * SIZE(BB)
	movlpd	%xmm1, 3 * SIZE(BB)

	addl	$2 * SIZE, B
	addl	$4 * SIZE, BB
	decl	%eax
	jne	.L104
	ALIGN_4

.L105:
#if defined(LT) || defined(RN)
	movl	A, %eax
	movl	%eax, AA
#else
	movl	A, %eax
	movl	%eax, AORIG
#endif

#ifdef RT
	subl	LDC, C
#endif
	movl	C, CO1
#ifndef RT
	addl	LDC, C
#endif

	movl	M, %ebx
	testl	%ebx, %ebx
	jle	.L199
	ALIGN_4
.L110:
#ifdef LN
	movl	K, %eax
	sall	$ZBASE_SHIFT, %eax
	subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	AORIG, %eax
	movl	%eax, AA
	movl	KK, %eax
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, AA
#endif

	leal	BUFFER, BB

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$ZBASE_SHIFT, %eax
	leal	(BB, %eax, 2), BB
#endif

	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7

	movapd	0 * SIZE(AA), %xmm0
	movapd	8 * SIZE(AA), %xmm1
	movapd	0 * SIZE(BB), %xmm2
	movapd	8 * SIZE(BB), %xmm3

#ifdef LN
	prefetchw	-2 * SIZE(CO1)
#else
	prefetchw	 2 * SIZE(CO1)
#endif

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax
	je	.L112
	ALIGN_4

.L111:
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)

	mulpd	%xmm0, %xmm2
	mulpd	 2 * SIZE(BB), %xmm0
	addpd	%xmm2, %xmm4
	movapd	 4 * SIZE(BB), %xmm2
	addpd	%xmm0, %xmm5
	movapd	 2 * SIZE(AA), %xmm0

	mulpd	%xmm0, %xmm2
	mulpd	 6 * SIZE(BB), %xmm0
	addpd	%xmm2, %xmm6
	movapd	16 * SIZE(BB), %xmm2
	addpd	%xmm0, %xmm7
	movapd	 4 * SIZE(AA), %xmm0

	mulpd	%xmm0, %xmm3
	mulpd	10 * SIZE(BB), %xmm0
	addpd	%xmm3, %xmm4
	movapd	12 * SIZE(BB), %xmm3
	addpd	%xmm0, %xmm5
	movapd	 6 * SIZE(AA), %xmm0

	mulpd	%xmm0, %xmm3
	mulpd	14 * SIZE(BB), %xmm0
	addpd	%xmm3, %xmm6
	movapd	24 * SIZE(BB), %xmm3
	addpd	%xmm0, %xmm7
	movapd	16 * SIZE(AA), %xmm0

	mulpd	%xmm1, %xmm2
	mulpd	18 * SIZE(BB), %xmm1
	addpd	%xmm2, %xmm4
	movapd	20 * SIZE(BB), %xmm2
	addpd	%xmm1, %xmm5
	movapd	10 * SIZE(AA), %xmm1

	mulpd	%xmm1, %xmm2
	mulpd	22 * SIZE(BB), %xmm1
	addpd	%xmm2, %xmm6
	movapd	32 * SIZE(BB), %xmm2
	addpd	%xmm1, %xmm7
	movapd	12 * SIZE(AA), %xmm1

	mulpd	%xmm1, %xmm3
	mulpd	26 * SIZE(BB), %xmm1
	addpd	%xmm3, %xmm4
	movapd	28 * SIZE(BB), %xmm3
	addpd	%xmm1, %xmm5
	movapd	14 * SIZE(AA), %xmm1

	mulpd	%xmm1, %xmm3
	mulpd	30 * SIZE(BB), %xmm1
	addpd	%xmm3, %xmm6
	movapd	40 * SIZE(BB), %xmm3
	addpd	%xmm1, %xmm7
	movapd	24 * SIZE(AA), %xmm1

	addl	$16 * SIZE, AA
	addl	$32 * SIZE, BB
	decl	%eax
	jne	.L111
	ALIGN_4

.L112:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# if (k & 7)
	BRANCH
	je	.L114
	ALIGN_4
.L113:
	mulpd	%xmm0, %xmm2
	mulpd	2 * SIZE(BB), %xmm0
	addpd	%xmm2, %xmm4
	movapd	4 * SIZE(BB), %xmm2
	addpd	%xmm0, %xmm5
	movapd	2 * SIZE(AA), %xmm0

	addl	$2 * SIZE, AA
	addl	$4 * SIZE, BB
	decl	%eax
	jg	.L113
	ALIGN_4

.L114:
#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$1, %eax
#else
	subl	$1, %eax
#endif

	movl	AORIG, AA
	movl	BORIG, B
	leal	BUFFER, BB

	sall	$ZBASE_SHIFT, %eax
	addl	%eax, AA
	addl	%eax, B
	leal	(BB, %eax, 2), BB
#endif

	movapd	POSINV, %xmm1

	addpd	%xmm6, %xmm4
	addpd	%xmm7, %xmm5

	SHUFPD_1 %xmm5, %xmm5

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)
	xorpd	%xmm1, %xmm5
#else
	xorpd	%xmm1, %xmm4
#endif

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
	subpd	%xmm5, %xmm4
#else
	addpd	%xmm5, %xmm4
#endif

#if defined(LN) || defined(LT)
	movapd	0 * SIZE(B), %xmm5
	subpd	%xmm4, %xmm5
#else
	movapd	0 * SIZE(AA), %xmm5
	subpd	%xmm4, %xmm5
#endif

#ifndef CONJ
	SHUFPD_1 %xmm1, %xmm1
#endif

#if defined(LN) || defined(LT)
	movlpd	0 * SIZE(AA), %xmm2
	movhpd	0 * SIZE(AA), %xmm2
	movlpd	1 * SIZE(AA), %xmm3
	movhpd	1 * SIZE(AA), %xmm3

	pshufd	$0x4e, %xmm5, %xmm4
	xorpd	%xmm1, %xmm4
	mulpd	%xmm2, %xmm5
	mulpd	%xmm3, %xmm4
	addpd	%xmm4, %xmm5
#endif

#ifdef RN
	movlpd	0 * SIZE(B), %xmm2
	movhpd	0 * SIZE(B), %xmm2
	movlpd	1 * SIZE(B), %xmm3
	movhpd	1 * SIZE(B), %xmm3

	pshufd	$0x4e, %xmm5, %xmm4
	xorpd	%xmm1, %xmm4
	mulpd	%xmm2, %xmm5
	mulpd	%xmm3, %xmm4
	addpd	%xmm4, %xmm5
#endif

#ifdef RT
	movlpd	0 * SIZE(B), %xmm2
	movhpd	0 * SIZE(B), %xmm2
	movlpd	1 * SIZE(B), %xmm3
	movhpd	1 * SIZE(B), %xmm3

	pshufd	$0x4e, %xmm5, %xmm4
	xorpd	%xmm1, %xmm4
	mulpd	%xmm2, %xmm5
	mulpd	%xmm3, %xmm4
	addpd	%xmm4, %xmm5
#endif

#ifdef LN
	subl	$2 * SIZE, CO1
#endif

	movlpd	%xmm5, 0 * SIZE(CO1)
	movhpd	%xmm5, 1 * SIZE(CO1)

#if defined(LN) || defined(LT)
	movapd	%xmm5, 0 * SIZE(B)

	movlpd	%xmm5, 0 * SIZE(BB)
	movlpd	%xmm5, 1 * SIZE(BB)
	movhpd	%xmm5, 2 * SIZE(BB)
	movhpd	%xmm5, 3 * SIZE(BB)
#else
	movapd	%xmm5, 0 * SIZE(AA)
#endif

#ifndef LN
	addl	$2 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K, %eax
	subl	KK, %eax
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, AA
#ifdef LT
	addl	$2 * SIZE, B
#endif
#endif

#ifdef LN
	subl	$1, KK
	movl	BORIG, B
#endif
#ifdef LT
	addl	$1, KK
#endif

#ifdef RT
	movl	K, %eax
	movl	BORIG, B
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, AORIG
#endif

	decl	%ebx			# i --
	jg	.L110
	ALIGN_4

.L199:
#ifdef LN
	movl	K, %eax
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, B
#endif

#if defined(LT) || defined(RN)
	movl	K, %eax
	subl	KK, %eax
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, B
#endif

#ifdef RN
	addl	$1, KK
#endif

#ifdef RT
	subl	$1, KK
#endif
	ALIGN_4

.L500:
	movl	OLD_STACK, %esp

	EMMS

	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

	EPILOGUE