
ztrsm_kernel_RT_1x2_sse2.S 26 kB

  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
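/* Overview (inferred from the file name and the code below, not from       */
/* any original documentation): SSE2 kernel for the double-precision        */
/* complex triangular solve (ZTRSM), right-side transposed "RT" case,       */
/* with 1x2 register blocking on 32-bit x86.                                 */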
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define STACK 16
  41. #define ARGS 0
  42. #define STACK_M 4 + STACK + ARGS(%esi)
  43. #define STACK_N 8 + STACK + ARGS(%esi)
  44. #define STACK_K 12 + STACK + ARGS(%esi)
  45. #define STACK_ALPHA_R 16 + STACK + ARGS(%esi)
  46. #define STACK_ALPHA_I 24 + STACK + ARGS(%esi)
  47. #define STACK_A 32 + STACK + ARGS(%esi)
  48. #define STACK_B 36 + STACK + ARGS(%esi)
  49. #define STACK_C 40 + STACK + ARGS(%esi)
  50. #define STACK_LDC 44 + STACK + ARGS(%esi)
  51. #define STACK_OFFT 48 + STACK + ARGS(%esi)
  52. #define POSINV 0(%esp)
  53. #define K 16(%esp)
  54. #define N 20(%esp)
  55. #define M 24(%esp)
  56. #define A 28(%esp)
  57. #define C 32(%esp)
  58. #define J 36(%esp)
  59. #define OLD_STACK 40(%esp)
  60. #define OFFSET 44(%esp)
  61. #define KK 48(%esp)
  62. #define KKK 52(%esp)
  63. #define AORIG 56(%esp)
  64. #define BORIG 60(%esp)
  65. #define BUFFER 128(%esp)
  66. #define STACK_ALIGN 4096
  67. #define STACK_OFFSET 1024
  68. #if defined(OPTERON) || defined(BARCELONA)
  69. #define PREFETCH prefetch
  70. #else
  71. #define PREFETCH prefetcht0
  72. #endif
  73. #define PREFETCHSIZE (8 * 10 + 4)
  74. #define AA %edx
  75. #define BB %ecx
  76. #define LDC %ebp
  77. #define B %edi
  78. #define CO1 %esi
  79. #ifndef CONJ
  80. #define NN
  81. #else
  82. #if defined(LN) || defined(LT)
  83. #define CN
  84. #else
  85. #define NC
  86. #endif
  87. #endif
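/* The KERNEL1..KERNEL8 macros below appear to unroll eight consecutive
   k iterations: each one multiplies a packed (real, imag) pair of A kept
   in %xmm0/%xmm1 by four duplicated B values from the BUFFER area and
   accumulates into %xmm4..%xmm7, while prefetching A and preloading the
   operands needed by the next macro in the chain. */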
  88. #define KERNEL1(address) \
  89. mulpd %xmm0, %xmm2; \
  90. addpd %xmm2, %xmm4; \
  91. PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \
  92. movapd 2 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  93. mulpd %xmm0, %xmm2; \
  94. addpd %xmm2, %xmm5; \
  95. movapd 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  96. mulpd %xmm0, %xmm2; \
  97. mulpd 6 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
  98. addpd %xmm2, %xmm6; \
  99. movapd 16 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  100. addpd %xmm0, %xmm7; \
  101. movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0
  102. #define KERNEL2(address) \
  103. mulpd %xmm0, %xmm3; \
  104. addpd %xmm3, %xmm4; \
  105. movapd 10 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  106. mulpd %xmm0, %xmm3; \
  107. addpd %xmm3, %xmm5; \
  108. movapd 12 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  109. mulpd %xmm0, %xmm3; \
  110. mulpd 14 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
  111. addpd %xmm3, %xmm6; \
  112. movapd 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  113. addpd %xmm0, %xmm7; \
  114. movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0
  115. #define KERNEL3(address) \
  116. mulpd %xmm0, %xmm2; \
  117. addpd %xmm2, %xmm4; \
  118. movapd 18 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  119. mulpd %xmm0, %xmm2; \
  120. addpd %xmm2, %xmm5; \
  121. movapd 20 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  122. mulpd %xmm0, %xmm2; \
  123. mulpd 22 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
  124. addpd %xmm2, %xmm6; \
  125. movapd 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  126. addpd %xmm0, %xmm7; \
  127. movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0
  128. #define KERNEL4(address) \
  129. mulpd %xmm0, %xmm3; \
  130. addpd %xmm3, %xmm4; \
  131. movapd 26 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  132. mulpd %xmm0, %xmm3; \
  133. addpd %xmm3, %xmm5; \
  134. movapd 28 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  135. mulpd %xmm0, %xmm3; \
  136. mulpd 30 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
  137. addpd %xmm3, %xmm6; \
  138. movapd 40 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  139. addpd %xmm0, %xmm7; \
  140. movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0
  141. #define KERNEL5(address) \
  142. PREFETCH (PREFETCHSIZE + 8) * SIZE + (address) * 1 * SIZE(AA); \
  143. mulpd %xmm1, %xmm2; \
  144. addpd %xmm2, %xmm4; \
  145. movapd 34 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  146. mulpd %xmm1, %xmm2; \
  147. addpd %xmm2, %xmm5; \
  148. movapd 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  149. mulpd %xmm1, %xmm2; \
  150. mulpd 38 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
  151. addpd %xmm2, %xmm6; \
  152. movapd 48 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  153. addpd %xmm1, %xmm7; \
  154. movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1
  155. #define KERNEL6(address) \
  156. mulpd %xmm1, %xmm3; \
  157. addpd %xmm3, %xmm4; \
  158. movapd 42 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  159. mulpd %xmm1, %xmm3; \
  160. addpd %xmm3, %xmm5; \
  161. movapd 44 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  162. mulpd %xmm1, %xmm3; \
  163. mulpd 46 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
  164. addpd %xmm3, %xmm6; \
  165. movapd 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  166. addpd %xmm1, %xmm7; \
  167. movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1
  168. #define KERNEL7(address) \
  169. mulpd %xmm1, %xmm2; \
  170. addpd %xmm2, %xmm4; \
  171. movapd 50 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  172. mulpd %xmm1, %xmm2; \
  173. addpd %xmm2, %xmm5; \
  174. movapd 52 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  175. mulpd %xmm1, %xmm2; \
  176. mulpd 54 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
  177. addpd %xmm2, %xmm6; \
  178. movapd 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  179. addpd %xmm1, %xmm7; \
  180. movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1
  181. #define KERNEL8(address) \
  182. mulpd %xmm1, %xmm3; \
  183. addpd %xmm3, %xmm4; \
  184. movapd 58 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  185. mulpd %xmm1, %xmm3; \
  186. addpd %xmm3, %xmm5; \
  187. movapd 60 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  188. mulpd %xmm1, %xmm3; \
  189. mulpd 62 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
  190. addpd %xmm3, %xmm6; \
  191. movapd 72 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  192. addpd %xmm1, %xmm7; \
  193. movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1
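/* Prologue: save the callee-saved registers, switch to an aligned local
   stack area (the caller's %esp is kept in OLD_STACK), copy the stack
   arguments into that area, and build the POSINV constant (0.0 in the
   low half, a lone sign bit in the high half) that the solve code later
   XORs in to flip the sign of one lane of a packed pair. */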
  194. PROLOGUE
  195. pushl %ebp
  196. pushl %edi
  197. pushl %esi
  198. pushl %ebx
  199. PROFCODE
  200. movl %esp, %esi # save old stack
  201. subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp
  202. andl $-STACK_ALIGN, %esp # align stack
  203. addl $STACK_OFFSET, %esp
  204. STACK_TOUCHING
  205. movl STACK_M, %ebx
  206. movl STACK_N, %eax
  207. movl STACK_K, %ecx
  208. movl STACK_A, %edx
  209. movl %ebx, M
  210. movl %eax, N
  211. movl %ecx, K
  212. movl %edx, A
  213. movl %esi, OLD_STACK
  214. movl STACK_B, B
  215. movl STACK_C, %ebx
  216. movss STACK_OFFT, %xmm4
  217. pcmpeqb %xmm7, %xmm7
  218. psllq $63, %xmm7 # Generate mask
  219. pxor %xmm2, %xmm2
  220. movlpd %xmm2, 0 + POSINV
  221. movlpd %xmm7, 8 + POSINV
  222. movl %ebx, C
  223. movl STACK_LDC, LDC
  224. movss %xmm4, OFFSET
  225. movss %xmm4, KK
  226. sall $ZBASE_SHIFT, LDC
  227. #ifdef LN
  228. movl M, %eax
  229. sall $ZBASE_SHIFT, %eax
  230. addl %eax, C
  231. imull K, %eax
  232. addl %eax, A
  233. #endif
  234. #ifdef RT
  235. movl N, %eax
  236. sall $ZBASE_SHIFT, %eax
  237. imull K, %eax
  238. addl %eax, B
  239. movl N, %eax
  240. imull LDC, %eax
  241. addl %eax, C
  242. #endif
  243. #ifdef RN
  244. negl KK
  245. #endif
  246. #ifdef RT
  247. movl N, %eax
  248. subl OFFSET, %eax
  249. movl %eax, KK
  250. #endif
  251. movl N, %eax
  252. andl $1, %eax
  253. jle .L100
  254. ALIGN_4
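/* .L101 .. .L199: handle the single left-over column of C when N is odd
   (the test above branches straight to .L100 when N & 1 == 0). */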
  255. .L101:
  256. #ifdef LN
  257. movl OFFSET, %eax
  258. addl M, %eax
  259. movl %eax, KK
  260. #endif
  261. leal BUFFER, BB
  262. #ifdef RT
  263. movl K, %eax
  264. sall $ZBASE_SHIFT, %eax
  265. subl %eax, B
  266. #endif
  267. #if defined(LN) || defined(RT)
  268. movl KK, %eax
  269. movl B, BORIG
  270. sall $ZBASE_SHIFT, %eax
  271. addl %eax, B
  272. leal (BB, %eax, 2), BB
  273. #endif
  274. #if defined(LT)
  275. movl OFFSET, %eax
  276. movl %eax, KK
  277. #endif
  278. #if defined(LT) || defined(RN)
  279. movl KK, %eax
  280. #else
  281. movl K, %eax
  282. subl KK, %eax
  283. #endif
  284. sarl $2, %eax
  285. jle .L103
  286. ALIGN_4
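/* Copy a slice of B into BUFFER, storing every scalar twice in adjacent
   slots so the compute loop can load it with movapd as a ready-made
   broadcast pair. */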
  287. .L102:
  288. prefetchnta 56 * SIZE(B)
  289. movlpd 0 * SIZE(B), %xmm0
  290. movlpd 1 * SIZE(B), %xmm1
  291. movlpd 2 * SIZE(B), %xmm2
  292. movlpd 3 * SIZE(B), %xmm3
  293. movlpd 4 * SIZE(B), %xmm4
  294. movlpd 5 * SIZE(B), %xmm5
  295. movlpd 6 * SIZE(B), %xmm6
  296. movlpd 7 * SIZE(B), %xmm7
  297. movlpd %xmm0, 0 * SIZE(BB)
  298. movlpd %xmm0, 1 * SIZE(BB)
  299. movlpd %xmm1, 2 * SIZE(BB)
  300. movlpd %xmm1, 3 * SIZE(BB)
  301. movlpd %xmm2, 4 * SIZE(BB)
  302. movlpd %xmm2, 5 * SIZE(BB)
  303. movlpd %xmm3, 6 * SIZE(BB)
  304. movlpd %xmm3, 7 * SIZE(BB)
  305. movlpd %xmm4, 8 * SIZE(BB)
  306. movlpd %xmm4, 9 * SIZE(BB)
  307. movlpd %xmm5, 10 * SIZE(BB)
  308. movlpd %xmm5, 11 * SIZE(BB)
  309. movlpd %xmm6, 12 * SIZE(BB)
  310. movlpd %xmm6, 13 * SIZE(BB)
  311. movlpd %xmm7, 14 * SIZE(BB)
  312. movlpd %xmm7, 15 * SIZE(BB)
  313. addl $ 8 * SIZE, B
  314. subl $-16 * SIZE, BB
  315. decl %eax
  316. jne .L102
  317. ALIGN_4
  318. .L103:
  319. #if defined(LT) || defined(RN)
  320. movl KK, %eax
  321. #else
  322. movl K, %eax
  323. subl KK, %eax
  324. #endif
  325. andl $3, %eax
  326. BRANCH
  327. jle .L105
  328. ALIGN_4
  329. .L104:
  330. movlpd 0 * SIZE(B), %xmm0
  331. movlpd 1 * SIZE(B), %xmm1
  332. movlpd %xmm0, 0 * SIZE(BB)
  333. movlpd %xmm0, 1 * SIZE(BB)
  334. movlpd %xmm1, 2 * SIZE(BB)
  335. movlpd %xmm1, 3 * SIZE(BB)
  336. addl $2 * SIZE, B
  337. addl $4 * SIZE, BB
  338. decl %eax
  339. jne .L104
  340. ALIGN_4
  341. .L105:
  342. #if defined(LT) || defined(RN)
  343. movl A, %eax
  344. movl %eax, AA
  345. #else
  346. movl A, %eax
  347. movl %eax, AORIG
  348. #endif
  349. #ifdef RT
  350. subl LDC, C
  351. #endif
  352. movl C, CO1
  353. #ifndef RT
  354. addl LDC, C
  355. #endif
  356. movl M, %ebx
  357. testl %ebx, %ebx
  358. jle .L199
  359. ALIGN_4
  360. .L110:
  361. #ifdef LN
  362. movl K, %eax
  363. sall $ZBASE_SHIFT, %eax
  364. subl %eax, AORIG
  365. #endif
  366. #if defined(LN) || defined(RT)
  367. movl AORIG, %eax
  368. movl %eax, AA
  369. movl KK, %eax
  370. sall $ZBASE_SHIFT, %eax
  371. addl %eax, AA
  372. #endif
  373. leal BUFFER, BB
  374. #if defined(LN) || defined(RT)
  375. movl KK, %eax
  376. sall $ZBASE_SHIFT, %eax
  377. leal (BB, %eax, 2), BB
  378. #endif
  379. pxor %xmm4, %xmm4
  380. pxor %xmm5, %xmm5
  381. pxor %xmm6, %xmm6
  382. pxor %xmm7, %xmm7
  383. movapd 0 * SIZE(AA), %xmm0
  384. movapd 8 * SIZE(AA), %xmm1
  385. movapd 0 * SIZE(BB), %xmm2
  386. movapd 8 * SIZE(BB), %xmm3
  387. #ifdef LN
  388. prefetchw -2 * SIZE(CO1)
  389. #else
  390. prefetchw 2 * SIZE(CO1)
  391. #endif
  392. #if defined(LT) || defined(RN)
  393. movl KK, %eax
  394. #else
  395. movl K, %eax
  396. subl KK, %eax
  397. #endif
  398. sarl $3, %eax
  399. je .L112
  400. ALIGN_4
  401. .L111:
  402. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  403. mulpd %xmm0, %xmm2
  404. mulpd 2 * SIZE(BB), %xmm0
  405. addpd %xmm2, %xmm4
  406. movapd 4 * SIZE(BB), %xmm2
  407. addpd %xmm0, %xmm5
  408. movapd 2 * SIZE(AA), %xmm0
  409. mulpd %xmm0, %xmm2
  410. mulpd 6 * SIZE(BB), %xmm0
  411. addpd %xmm2, %xmm6
  412. movapd 16 * SIZE(BB), %xmm2
  413. addpd %xmm0, %xmm7
  414. movapd 4 * SIZE(AA), %xmm0
  415. mulpd %xmm0, %xmm3
  416. mulpd 10 * SIZE(BB), %xmm0
  417. addpd %xmm3, %xmm4
  418. movapd 12 * SIZE(BB), %xmm3
  419. addpd %xmm0, %xmm5
  420. movapd 6 * SIZE(AA), %xmm0
  421. mulpd %xmm0, %xmm3
  422. mulpd 14 * SIZE(BB), %xmm0
  423. addpd %xmm3, %xmm6
  424. movapd 24 * SIZE(BB), %xmm3
  425. addpd %xmm0, %xmm7
  426. movapd 16 * SIZE(AA), %xmm0
  427. mulpd %xmm1, %xmm2
  428. mulpd 18 * SIZE(BB), %xmm1
  429. addpd %xmm2, %xmm4
  430. movapd 20 * SIZE(BB), %xmm2
  431. addpd %xmm1, %xmm5
  432. movapd 10 * SIZE(AA), %xmm1
  433. mulpd %xmm1, %xmm2
  434. mulpd 22 * SIZE(BB), %xmm1
  435. addpd %xmm2, %xmm6
  436. movapd 32 * SIZE(BB), %xmm2
  437. addpd %xmm1, %xmm7
  438. movapd 12 * SIZE(AA), %xmm1
  439. mulpd %xmm1, %xmm3
  440. mulpd 26 * SIZE(BB), %xmm1
  441. addpd %xmm3, %xmm4
  442. movapd 28 * SIZE(BB), %xmm3
  443. addpd %xmm1, %xmm5
  444. movapd 14 * SIZE(AA), %xmm1
  445. mulpd %xmm1, %xmm3
  446. mulpd 30 * SIZE(BB), %xmm1
  447. addpd %xmm3, %xmm6
  448. movapd 40 * SIZE(BB), %xmm3
  449. addpd %xmm1, %xmm7
  450. movapd 24 * SIZE(AA), %xmm1
  451. addl $16 * SIZE, AA
  452. addl $32 * SIZE, BB
  453. decl %eax
  454. jne .L111
  455. ALIGN_4
  456. .L112:
  457. #if defined(LT) || defined(RN)
  458. movl KK, %eax
  459. #else
  460. movl K, %eax
  461. subl KK, %eax
  462. #endif
  463. andl $7, %eax # remainder: k & 7
  464. BRANCH
  465. je .L114
  466. ALIGN_4
  467. .L113:
  468. mulpd %xmm0, %xmm2
  469. mulpd 2 * SIZE(BB), %xmm0
  470. addpd %xmm2, %xmm4
  471. movapd 4 * SIZE(BB), %xmm2
  472. addpd %xmm0, %xmm5
  473. movapd 2 * SIZE(AA), %xmm0
  474. addl $2 * SIZE, AA
  475. addl $4 * SIZE, BB
  476. decl %eax
  477. jg .L113
  478. ALIGN_4
  479. .L114:
  480. #if defined(LN) || defined(RT)
  481. movl KK, %eax
  482. #ifdef LN
  483. subl $1, %eax
  484. #else
  485. subl $1, %eax
  486. #endif
  487. movl AORIG, AA
  488. movl BORIG, B
  489. leal BUFFER, BB
  490. sall $ZBASE_SHIFT, %eax
  491. addl %eax, AA
  492. addl %eax, B
  493. leal (BB, %eax, 2), BB
  494. #endif
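/* Post-processing for the single-column case: fold the accumulators into
   one complex value, apply the conjugation sign mask, subtract from the
   stored right-hand side, and scale by the diagonal entry using what
   appears to be a pshufd/xorpd complex-multiply sequence. */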
  495. movapd POSINV, %xmm1
  496. addpd %xmm6, %xmm4
  497. addpd %xmm7, %xmm5
  498. SHUFPD_1 %xmm5, %xmm5
  499. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  500. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  501. xorpd %xmm1, %xmm5
  502. #else
  503. xorpd %xmm1, %xmm4
  504. #endif
  505. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  506. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  507. subpd %xmm5, %xmm4
  508. #else
  509. addpd %xmm5, %xmm4
  510. #endif
  511. #if defined(LN) || defined(LT)
  512. movapd 0 * SIZE(B), %xmm5
  513. subpd %xmm4, %xmm5
  514. #else
  515. movapd 0 * SIZE(AA), %xmm5
  516. subpd %xmm4, %xmm5
  517. #endif
  518. #ifndef CONJ
  519. SHUFPD_1 %xmm1, %xmm1
  520. #endif
  521. #if defined(LN) || defined(LT)
  522. movlpd 0 * SIZE(AA), %xmm2
  523. movhpd 0 * SIZE(AA), %xmm2
  524. movlpd 1 * SIZE(AA), %xmm3
  525. movhpd 1 * SIZE(AA), %xmm3
  526. pshufd $0x4e, %xmm5, %xmm4
  527. xorpd %xmm1, %xmm4
  528. mulpd %xmm2, %xmm5
  529. mulpd %xmm3, %xmm4
  530. addpd %xmm4, %xmm5
  531. #endif
  532. #ifdef RN
  533. movlpd 0 * SIZE(B), %xmm2
  534. movhpd 0 * SIZE(B), %xmm2
  535. movlpd 1 * SIZE(B), %xmm3
  536. movhpd 1 * SIZE(B), %xmm3
  537. pshufd $0x4e, %xmm5, %xmm4
  538. xorpd %xmm1, %xmm4
  539. mulpd %xmm2, %xmm5
  540. mulpd %xmm3, %xmm4
  541. addpd %xmm4, %xmm5
  542. #endif
  543. #ifdef RT
  544. movlpd 0 * SIZE(B), %xmm2
  545. movhpd 0 * SIZE(B), %xmm2
  546. movlpd 1 * SIZE(B), %xmm3
  547. movhpd 1 * SIZE(B), %xmm3
  548. pshufd $0x4e, %xmm5, %xmm4
  549. xorpd %xmm1, %xmm4
  550. mulpd %xmm2, %xmm5
  551. mulpd %xmm3, %xmm4
  552. addpd %xmm4, %xmm5
  553. #endif
  554. #ifdef LN
  555. subl $2 * SIZE, CO1
  556. #endif
  557. movlpd %xmm5, 0 * SIZE(CO1)
  558. movhpd %xmm5, 1 * SIZE(CO1)
  559. #if defined(LN) || defined(LT)
  560. movapd %xmm5, 0 * SIZE(B)
  561. movlpd %xmm5, 0 * SIZE(BB)
  562. movlpd %xmm5, 1 * SIZE(BB)
  563. movhpd %xmm5, 2 * SIZE(BB)
  564. movhpd %xmm5, 3 * SIZE(BB)
  565. #else
  566. movapd %xmm5, 0 * SIZE(AA)
  567. #endif
  568. #ifndef LN
  569. addl $2 * SIZE, CO1
  570. #endif
  571. #if defined(LT) || defined(RN)
  572. movl K, %eax
  573. subl KK, %eax
  574. sall $ZBASE_SHIFT, %eax
  575. addl %eax, AA
  576. #ifdef LT
  577. addl $2 * SIZE, B
  578. #endif
  579. #endif
  580. #ifdef LN
  581. subl $1, KK
  582. movl BORIG, B
  583. #endif
  584. #ifdef LT
  585. addl $1, KK
  586. #endif
  587. #ifdef RT
  588. movl K, %eax
  589. movl BORIG, B
  590. sall $ZBASE_SHIFT, %eax
  591. addl %eax, AORIG
  592. #endif
  593. decl %ebx # i --
  594. jg .L110
  595. ALIGN_4
  596. .L199:
  597. #ifdef LN
  598. movl K, %eax
  599. sall $ZBASE_SHIFT, %eax
  600. addl %eax, B
  601. #endif
  602. #if defined(LT) || defined(RN)
  603. movl K, %eax
  604. subl KK, %eax
  605. sall $ZBASE_SHIFT, %eax
  606. addl %eax, B
  607. #endif
  608. #ifdef RN
  609. addl $1, KK
  610. #endif
  611. #ifdef RT
  612. subl $1, KK
  613. #endif
  614. ALIGN_4
  615. .L100:
  616. movl N, %eax
  617. sarl $1, %eax
  618. movl %eax, J # j = (n >> 1)
  619. jle .L500
  620. ALIGN_4
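/* .L01 .. .L99: main loop over pairs of columns of C (J = N >> 1); each
   pass of the inner .L10 loop appears to solve one 1x2 complex block of
   the result. */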
  621. .L01:
  622. #ifdef LN
  623. movl OFFSET, %eax
  624. addl M, %eax
  625. movl %eax, KK
  626. #endif
  627. leal BUFFER, BB
  628. #ifdef RT
  629. movl K, %eax
  630. sall $1 + ZBASE_SHIFT, %eax
  631. subl %eax, B
  632. #endif
  633. #if defined(LN) || defined(RT)
  634. movl KK, %eax
  635. movl B, BORIG
  636. sall $1 + ZBASE_SHIFT, %eax
  637. addl %eax, B
  638. leal (BB, %eax, 2), BB
  639. #endif
  640. #if defined(LT)
  641. movl OFFSET, %eax
  642. movl %eax, KK
  643. #endif
  644. #if defined(LT) || defined(RN)
  645. movl KK, %eax
  646. #else
  647. movl K, %eax
  648. subl KK, %eax
  649. #endif
  650. sarl $1, %eax
  651. jle .L03
  652. ALIGN_4
  653. .L02:
  654. prefetchnta 56 * SIZE(B)
  655. movlpd 0 * SIZE(B), %xmm0
  656. movlpd 1 * SIZE(B), %xmm1
  657. movlpd 2 * SIZE(B), %xmm2
  658. movlpd 3 * SIZE(B), %xmm3
  659. movlpd 4 * SIZE(B), %xmm4
  660. movlpd 5 * SIZE(B), %xmm5
  661. movlpd 6 * SIZE(B), %xmm6
  662. movlpd 7 * SIZE(B), %xmm7
  663. movlpd %xmm0, 0 * SIZE(BB)
  664. movlpd %xmm0, 1 * SIZE(BB)
  665. movlpd %xmm1, 2 * SIZE(BB)
  666. movlpd %xmm1, 3 * SIZE(BB)
  667. movlpd %xmm2, 4 * SIZE(BB)
  668. movlpd %xmm2, 5 * SIZE(BB)
  669. movlpd %xmm3, 6 * SIZE(BB)
  670. movlpd %xmm3, 7 * SIZE(BB)
  671. movlpd %xmm4, 8 * SIZE(BB)
  672. movlpd %xmm4, 9 * SIZE(BB)
  673. movlpd %xmm5, 10 * SIZE(BB)
  674. movlpd %xmm5, 11 * SIZE(BB)
  675. movlpd %xmm6, 12 * SIZE(BB)
  676. movlpd %xmm6, 13 * SIZE(BB)
  677. movlpd %xmm7, 14 * SIZE(BB)
  678. movlpd %xmm7, 15 * SIZE(BB)
  679. addl $ 8 * SIZE, B
  680. subl $-16 * SIZE, BB
  681. decl %eax
  682. jne .L02
  683. ALIGN_4
  684. .L03:
  685. #if defined(LT) || defined(RN)
  686. movl KK, %eax
  687. #else
  688. movl K, %eax
  689. subl KK, %eax
  690. #endif
  691. andl $1, %eax
  692. BRANCH
  693. jle .L05
  694. movlpd 0 * SIZE(B), %xmm0
  695. movlpd 1 * SIZE(B), %xmm1
  696. movlpd 2 * SIZE(B), %xmm2
  697. movlpd 3 * SIZE(B), %xmm3
  698. movlpd %xmm0, 0 * SIZE(BB)
  699. movlpd %xmm0, 1 * SIZE(BB)
  700. movlpd %xmm1, 2 * SIZE(BB)
  701. movlpd %xmm1, 3 * SIZE(BB)
  702. movlpd %xmm2, 4 * SIZE(BB)
  703. movlpd %xmm2, 5 * SIZE(BB)
  704. movlpd %xmm3, 6 * SIZE(BB)
  705. movlpd %xmm3, 7 * SIZE(BB)
  706. addl $4 * SIZE, B
  707. ALIGN_4
  708. .L05:
  709. #if defined(LT) || defined(RN)
  710. movl A, %eax
  711. movl %eax, AA
  712. #else
  713. movl A, %eax
  714. movl %eax, AORIG
  715. #endif
  716. #ifdef RT
  717. leal (, LDC, 2), %eax
  718. subl %eax, C
  719. #endif
  720. movl C, CO1
  721. #ifndef RT
  722. leal (, LDC, 2), %eax
  723. addl %eax, C
  724. #endif
  725. movl M, %ebx
  726. testl %ebx, %ebx
  727. jle .L100
  728. ALIGN_4
  729. .L10:
  730. #ifdef LN
  731. movl K, %eax
  732. sall $ZBASE_SHIFT, %eax
  733. subl %eax, AORIG
  734. #endif
  735. #if defined(LN) || defined(RT)
  736. movl AORIG, %eax
  737. movl %eax, AA
  738. movl KK, %eax
  739. sall $ZBASE_SHIFT, %eax
  740. addl %eax, AA
  741. #endif
  742. leal BUFFER, BB
  743. #if defined(LN) || defined(RT)
  744. movl KK, %eax
  745. sall $1 + ZBASE_SHIFT, %eax
  746. leal (BB, %eax, 2), BB
  747. #endif
  748. movapd 0 * SIZE(AA), %xmm0
  749. pxor %xmm4, %xmm4
  750. movapd 8 * SIZE(AA), %xmm1
  751. pxor %xmm5, %xmm5
  752. movapd 0 * SIZE(BB), %xmm2
  753. pxor %xmm6, %xmm6
  754. movapd 8 * SIZE(BB), %xmm3
  755. pxor %xmm7, %xmm7
  756. #ifdef LN
  757. prefetchw -2 * SIZE(CO1)
  758. prefetchw -2 * SIZE(CO1, LDC)
  759. #else
  760. prefetchw 2 * SIZE(CO1)
  761. prefetchw 2 * SIZE(CO1, LDC)
  762. #endif
  763. #if defined(LT) || defined(RN)
  764. movl KK, %eax
  765. #else
  766. movl K, %eax
  767. subl KK, %eax
  768. #endif
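/* The '#if 1' path below unrolls the k loop using the KERNEL1..8 macros
   with explicit address offsets; the trip count in %eax is pre-scaled by
   16 (sall $4), and .L12 advances AA/BB by whatever was consumed.  The
   disabled '#else' path is the plain eight-at-a-time loop. */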
  769. #if 1
  770. andl $-8, %eax
  771. sall $4, %eax
  772. je .L15
  773. .L1X:
  774. KERNEL1(16 * 0)
  775. KERNEL2(16 * 0)
  776. KERNEL3(16 * 0)
  777. KERNEL4(16 * 0)
  778. KERNEL5(16 * 0)
  779. KERNEL6(16 * 0)
  780. KERNEL7(16 * 0)
  781. KERNEL8(16 * 0)
  782. cmpl $128 * 1, %eax
  783. jle .L12
  784. KERNEL1(16 * 1)
  785. KERNEL2(16 * 1)
  786. KERNEL3(16 * 1)
  787. KERNEL4(16 * 1)
  788. KERNEL5(16 * 1)
  789. KERNEL6(16 * 1)
  790. KERNEL7(16 * 1)
  791. KERNEL8(16 * 1)
  792. cmpl $128 * 2, %eax
  793. jle .L12
  794. KERNEL1(16 * 2)
  795. KERNEL2(16 * 2)
  796. KERNEL3(16 * 2)
  797. KERNEL4(16 * 2)
  798. KERNEL5(16 * 2)
  799. KERNEL6(16 * 2)
  800. KERNEL7(16 * 2)
  801. KERNEL8(16 * 2)
  802. cmpl $128 * 3, %eax
  803. jle .L12
  804. KERNEL1(16 * 3)
  805. KERNEL2(16 * 3)
  806. KERNEL3(16 * 3)
  807. KERNEL4(16 * 3)
  808. KERNEL5(16 * 3)
  809. KERNEL6(16 * 3)
  810. KERNEL7(16 * 3)
  811. KERNEL8(16 * 3)
  812. cmpl $128 * 4, %eax
  813. jle .L12
  814. KERNEL1(16 * 4)
  815. KERNEL2(16 * 4)
  816. KERNEL3(16 * 4)
  817. KERNEL4(16 * 4)
  818. KERNEL5(16 * 4)
  819. KERNEL6(16 * 4)
  820. KERNEL7(16 * 4)
  821. KERNEL8(16 * 4)
  822. cmpl $128 * 5, %eax
  823. jle .L12
  824. KERNEL1(16 * 5)
  825. KERNEL2(16 * 5)
  826. KERNEL3(16 * 5)
  827. KERNEL4(16 * 5)
  828. KERNEL5(16 * 5)
  829. KERNEL6(16 * 5)
  830. KERNEL7(16 * 5)
  831. KERNEL8(16 * 5)
  832. cmpl $128 * 6, %eax
  833. jle .L12
  834. KERNEL1(16 * 6)
  835. KERNEL2(16 * 6)
  836. KERNEL3(16 * 6)
  837. KERNEL4(16 * 6)
  838. KERNEL5(16 * 6)
  839. KERNEL6(16 * 6)
  840. KERNEL7(16 * 6)
  841. KERNEL8(16 * 6)
  842. cmpl $128 * 7, %eax
  843. jle .L12
  844. KERNEL1(16 * 7)
  845. KERNEL2(16 * 7)
  846. KERNEL3(16 * 7)
  847. KERNEL4(16 * 7)
  848. KERNEL5(16 * 7)
  849. KERNEL6(16 * 7)
  850. KERNEL7(16 * 7)
  851. KERNEL8(16 * 7)
  852. addl $128 * 4 * SIZE, BB
  853. addl $128 * 1 * SIZE, AA
  854. subl $128 * 8, %eax
  855. jg .L1X
  856. jmp .L15
  857. .L12:
  858. leal (AA, %eax, 1), AA
  859. leal (BB, %eax, 4), BB
  860. ALIGN_4
  861. #else
  862. sarl $3, %eax
  863. je .L15
  864. ALIGN_4
  865. .L12:
  866. KERNEL1(16 * 0)
  867. KERNEL2(16 * 0)
  868. KERNEL3(16 * 0)
  869. KERNEL4(16 * 0)
  870. KERNEL5(16 * 0)
  871. KERNEL6(16 * 0)
  872. KERNEL7(16 * 0)
  873. KERNEL8(16 * 0)
  874. addl $64 * SIZE, BB
  875. addl $16 * SIZE, AA
  876. decl %eax
  877. jne .L12
  878. ALIGN_4
  879. #endif
  880. .L15:
  881. #if defined(LT) || defined(RN)
  882. movl KK, %eax
  883. #else
  884. movl K, %eax
  885. subl KK, %eax
  886. #endif
  887. andl $7, %eax # remainder: k & 7
  888. BRANCH
  889. je .L14
  890. ALIGN_4
  891. .L13:
  892. mulpd %xmm0, %xmm2
  893. addpd %xmm2, %xmm4
  894. movapd 2 * SIZE(BB), %xmm2
  895. mulpd %xmm0, %xmm2
  896. addpd %xmm2, %xmm5
  897. movapd 4 * SIZE(BB), %xmm2
  898. mulpd %xmm0, %xmm2
  899. mulpd 6 * SIZE(BB), %xmm0
  900. addpd %xmm2, %xmm6
  901. movapd 8 * SIZE(BB), %xmm2
  902. addpd %xmm0, %xmm7
  903. movapd 2 * SIZE(AA), %xmm0
  904. addl $2 * SIZE, AA
  905. addl $8 * SIZE, BB
  906. decl %eax
  907. jg .L13
  908. ALIGN_4
  909. .L14:
  910. #if defined(LN) || defined(RT)
  911. movl KK, %eax
  912. #ifdef LN
  913. subl $1, %eax
  914. #else
  915. subl $2, %eax
  916. #endif
  917. movl AORIG, AA
  918. movl BORIG, B
  919. leal BUFFER, BB
  920. sall $ZBASE_SHIFT, %eax
  921. addl %eax, AA
  922. leal (B, %eax, 2), B
  923. leal (BB, %eax, 4), BB
  924. #endif
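/* Fold the four accumulators into two packed complex values (one per
   column), flip signs with the POSINV mask according to the conjugation
   variant, subtract from the stored right-hand side, and then perform the
   triangular-solve update for this 1x2 block; the pshufd/xorpd/mulpd/addpd
   groups below appear to implement the complex multiplications involved. */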
  925. movapd POSINV, %xmm1
  926. SHUFPD_1 %xmm5, %xmm5
  927. SHUFPD_1 %xmm7, %xmm7
  928. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  929. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  930. xorpd %xmm1, %xmm5
  931. xorpd %xmm1, %xmm7
  932. #else
  933. xorpd %xmm1, %xmm4
  934. xorpd %xmm1, %xmm6
  935. #endif
  936. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  937. defined(RR) || defined(RC) || defined(CR) || defined(CC)
  938. subpd %xmm5, %xmm4
  939. subpd %xmm7, %xmm6
  940. #else
  941. addpd %xmm5, %xmm4
  942. addpd %xmm7, %xmm6
  943. #endif
  944. #if defined(LN) || defined(LT)
  945. movapd 0 * SIZE(B), %xmm5
  946. movapd 2 * SIZE(B), %xmm7
  947. subpd %xmm4, %xmm5
  948. subpd %xmm6, %xmm7
  949. #else
  950. movapd 0 * SIZE(AA), %xmm5
  951. movapd 2 * SIZE(AA), %xmm7
  952. subpd %xmm4, %xmm5
  953. subpd %xmm6, %xmm7
  954. #endif
  955. #ifndef CONJ
  956. SHUFPD_1 %xmm1, %xmm1
  957. #endif
  958. #if defined(LN) || defined(LT)
  959. movlpd 0 * SIZE(AA), %xmm2
  960. movhpd 0 * SIZE(AA), %xmm2
  961. movlpd 1 * SIZE(AA), %xmm3
  962. movhpd 1 * SIZE(AA), %xmm3
  963. pshufd $0x4e, %xmm5, %xmm4
  964. pshufd $0x4e, %xmm7, %xmm6
  965. xorpd %xmm1, %xmm4
  966. xorpd %xmm1, %xmm6
  967. mulpd %xmm2, %xmm5
  968. mulpd %xmm3, %xmm4
  969. mulpd %xmm2, %xmm7
  970. mulpd %xmm3, %xmm6
  971. addpd %xmm4, %xmm5
  972. addpd %xmm6, %xmm7
  973. #endif
  974. #ifdef RN
  975. movlpd 0 * SIZE(B), %xmm2
  976. movhpd 0 * SIZE(B), %xmm2
  977. movlpd 1 * SIZE(B), %xmm3
  978. movhpd 1 * SIZE(B), %xmm3
  979. pshufd $0x4e, %xmm5, %xmm4
  980. xorpd %xmm1, %xmm4
  981. mulpd %xmm2, %xmm5
  982. mulpd %xmm3, %xmm4
  983. addpd %xmm4, %xmm5
  984. movlpd 2 * SIZE(B), %xmm2
  985. movhpd 2 * SIZE(B), %xmm2
  986. movlpd 3 * SIZE(B), %xmm3
  987. movhpd 3 * SIZE(B), %xmm3
  988. movapd %xmm5, %xmm4
  989. pshufd $0x4e, %xmm5, %xmm6
  990. xorpd %xmm1, %xmm6
  991. mulpd %xmm2, %xmm4
  992. mulpd %xmm3, %xmm6
  993. subpd %xmm4, %xmm7
  994. subpd %xmm6, %xmm7
  995. movlpd 6 * SIZE(B), %xmm2
  996. movhpd 6 * SIZE(B), %xmm2
  997. movlpd 7 * SIZE(B), %xmm3
  998. movhpd 7 * SIZE(B), %xmm3
  999. pshufd $0x4e, %xmm7, %xmm6
  1000. xorpd %xmm1, %xmm6
  1001. mulpd %xmm2, %xmm7
  1002. mulpd %xmm3, %xmm6
  1003. addpd %xmm6, %xmm7
  1004. #endif
  1005. #ifdef RT
  1006. movlpd 6 * SIZE(B), %xmm2
  1007. movhpd 6 * SIZE(B), %xmm2
  1008. movlpd 7 * SIZE(B), %xmm3
  1009. movhpd 7 * SIZE(B), %xmm3
  1010. pshufd $0x4e, %xmm7, %xmm6
  1011. xorpd %xmm1, %xmm6
  1012. mulpd %xmm2, %xmm7
  1013. mulpd %xmm3, %xmm6
  1014. addpd %xmm6, %xmm7
  1015. movlpd 4 * SIZE(B), %xmm2
  1016. movhpd 4 * SIZE(B), %xmm2
  1017. movlpd 5 * SIZE(B), %xmm3
  1018. movhpd 5 * SIZE(B), %xmm3
  1019. movapd %xmm7, %xmm4
  1020. pshufd $0x4e, %xmm7, %xmm6
  1021. xorpd %xmm1, %xmm6
  1022. mulpd %xmm2, %xmm4
  1023. mulpd %xmm3, %xmm6
  1024. subpd %xmm4, %xmm5
  1025. subpd %xmm6, %xmm5
  1026. movlpd 0 * SIZE(B), %xmm2
  1027. movhpd 0 * SIZE(B), %xmm2
  1028. movlpd 1 * SIZE(B), %xmm3
  1029. movhpd 1 * SIZE(B), %xmm3
  1030. pshufd $0x4e, %xmm5, %xmm4
  1031. xorpd %xmm1, %xmm4
  1032. mulpd %xmm2, %xmm5
  1033. mulpd %xmm3, %xmm4
  1034. addpd %xmm4, %xmm5
  1035. #endif
  1036. #ifdef LN
  1037. subl $2 * SIZE, CO1
  1038. #endif
  1039. movlpd %xmm5, 0 * SIZE(CO1)
  1040. movhpd %xmm5, 1 * SIZE(CO1)
  1041. movlpd %xmm7, 0 * SIZE(CO1, LDC)
  1042. movhpd %xmm7, 1 * SIZE(CO1, LDC)
  1043. #if defined(LN) || defined(LT)
  1044. movapd %xmm5, 0 * SIZE(B)
  1045. movapd %xmm7, 2 * SIZE(B)
  1046. movlpd %xmm5, 0 * SIZE(BB)
  1047. movlpd %xmm5, 1 * SIZE(BB)
  1048. movhpd %xmm5, 2 * SIZE(BB)
  1049. movhpd %xmm5, 3 * SIZE(BB)
  1050. movlpd %xmm7, 4 * SIZE(BB)
  1051. movlpd %xmm7, 5 * SIZE(BB)
  1052. movhpd %xmm7, 6 * SIZE(BB)
  1053. movhpd %xmm7, 7 * SIZE(BB)
  1054. #else
  1055. movapd %xmm5, 0 * SIZE(AA)
  1056. movapd %xmm7, 2 * SIZE(AA)
  1057. #endif
  1058. #ifndef LN
  1059. addl $2 * SIZE, CO1
  1060. #endif
  1061. #if defined(LT) || defined(RN)
  1062. movl K, %eax
  1063. subl KK, %eax
  1064. sall $ZBASE_SHIFT, %eax
  1065. addl %eax, AA
  1066. #ifdef LT
  1067. addl $4 * SIZE, B
  1068. #endif
  1069. #endif
  1070. #ifdef LN
  1071. subl $1, KK
  1072. movl BORIG, B
  1073. #endif
  1074. #ifdef LT
  1075. addl $1, KK
  1076. #endif
  1077. #ifdef RT
  1078. movl K, %eax
  1079. movl BORIG, B
  1080. sall $ZBASE_SHIFT, %eax
  1081. addl %eax, AORIG
  1082. #endif
  1083. decl %ebx # i --
  1084. jg .L10
  1085. ALIGN_4
  1086. .L99:
  1087. #ifdef LN
  1088. movl K, %eax
  1089. sall $1 + ZBASE_SHIFT, %eax
  1090. addl %eax, B
  1091. #endif
  1092. #if defined(LT) || defined(RN)
  1093. movl K, %eax
  1094. subl KK, %eax
  1095. sall $1 + ZBASE_SHIFT, %eax
  1096. addl %eax, B
  1097. #endif
  1098. #ifdef RN
  1099. addl $2, KK
  1100. #endif
  1101. #ifdef RT
  1102. subl $2, KK
  1103. #endif
  1104. decl J # j --
  1105. jg .L01
  1106. ALIGN_4
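/* Epilogue: restore the caller's stack pointer and the callee-saved
   registers. */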
  1107. .L500:
  1108. movl OLD_STACK, %esp
  1109. popl %ebx
  1110. popl %esi
  1111. popl %edi
  1112. popl %ebp
  1113. ret
  1114. EPILOGUE