
trsm_kernel_RT_4x2_atom.S

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
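
/* TRSM kernel with 4x2 unrolling; the "atom" suffix and the prefetch */
/* tuning below suggest this variant targets the Intel Atom. The      */
/* LN/LT/RN/RT macros select which triangular-solve variant is built. */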
#define ASSEMBLER
#include "common.h"

#define M %rdi
#define N %rsi
#define K %rdx
#define A %rcx
#define B %r8
#define C %r9
#define LDC %r10

#define I %r11
#define AO %r13
#define BO %r14
#define CO1 %r15
#define CO2 %rbx
#define KK %rbp
#define BB %r12

#ifndef WINDOWS_ABI
#define STACKSIZE 128
#define OLD_LDC 8 + STACKSIZE(%rsp)
#define OLD_OFFSET 16 + STACKSIZE(%rsp)
#define OFFSET 48(%rsp)
#define J 56(%rsp)
#define KKK 64(%rsp)
#define AORIG 72(%rsp)
#else
#define STACKSIZE 256
#define OLD_A 40 + STACKSIZE(%rsp)
#define OLD_B 48 + STACKSIZE(%rsp)
#define OLD_C 56 + STACKSIZE(%rsp)
#define OLD_LDC 64 + STACKSIZE(%rsp)
#define OLD_OFFSET 72 + STACKSIZE(%rsp)
#define OFFSET 224(%rsp)
#define J 232(%rsp)
#define KKK 240(%rsp)
#define AORIG 248(%rsp)
#endif

#define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 8 + 3)
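
/* PREFETCHSIZE is the look-ahead distance, in elements, used by */
/* the PREFETCH macro inside the compute loops below.            */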
	PROLOGUE
	PROFCODE
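
/* Reserve the stack frame and save callee-saved registers  */
/* (plus %rdi, %rsi and xmm6-xmm15 under the Windows ABI).  */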
	subq $STACKSIZE, %rsp

	movq %rbx, 0(%rsp)
	movq %rbp, 8(%rsp)
	movq %r12, 16(%rsp)
	movq %r13, 24(%rsp)
	movq %r14, 32(%rsp)
	movq %r15, 40(%rsp)

#ifdef WINDOWS_ABI
	movq %rdi, 48(%rsp)
	movq %rsi, 56(%rsp)
	movups %xmm6, 64(%rsp)
	movups %xmm7, 80(%rsp)
	movups %xmm8, 96(%rsp)
	movups %xmm9, 112(%rsp)
	movups %xmm10, 128(%rsp)
	movups %xmm11, 144(%rsp)
	movups %xmm12, 160(%rsp)
	movups %xmm13, 176(%rsp)
	movups %xmm14, 192(%rsp)
	movups %xmm15, 208(%rsp)

	movq ARG1, M
	movq ARG2, N
	movq ARG3, K
	movq OLD_A, A
	movq OLD_B, B
	movq OLD_C, C
#endif

	movq OLD_LDC, LDC
	movq OLD_OFFSET, KK
	movq KK, OFFSET

	leaq (, LDC, SIZE), LDC

#ifdef LN
	leaq (, M, SIZE), %rax
	addq %rax, C
	imulq K, %rax
	addq %rax, A
#endif

#ifdef RT
	leaq (, N, SIZE), %rax
	imulq K, %rax
	addq %rax, B
	movq N, %rax
	imulq LDC, %rax
	addq %rax, C
#endif

#ifdef RN
	negq KK
#endif

#ifdef RT
	movq N, %rax
	subq OFFSET, %rax
	movq %rax, KK
#endif
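
/* If N is odd, process the single leftover column of B first; */
/* the main loop starting at .L10 works on column pairs.       */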
	testq $1, N
	je .L40
	ALIGN_4

#if defined(LT) || defined(RN)
	movq A, AO
#else
	movq A, AORIG
#endif

#ifdef RT
	movq K, %rax
	salq $0 + BASE_SHIFT, %rax
	subq %rax, B
	subq LDC, C
#endif

	movq C, CO1
#ifndef RT
	addq LDC, C
#endif

#ifdef LN
	movq OFFSET, %rax
	addq M, %rax
	movq %rax, KK
#endif

#ifdef LT
	movq OFFSET, %rax
	movq %rax, KK
#endif

	movq M, I
	sarq $2, I
	jle .L50
	ALIGN_4
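
/* M/4 loop for the single-column case: 4x1 blocks of C. */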
.L41:
#ifdef LN
	movq K, %rax
	salq $2 + BASE_SHIFT, %rax
	subq %rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq KK, %rax
	leaq (, %rax, SIZE), %rax
	movq AORIG, AO
	leaq (AO, %rax, 4), AO
	leaq (B, %rax, 1), BO
#else
	movq B, BO
#endif

	movsd 0 * SIZE(AO), %xmm0
	xorps %xmm9, %xmm9
	movsd 1 * SIZE(AO), %xmm1
	xorps %xmm11, %xmm11
	movsd 2 * SIZE(AO), %xmm2
	xorps %xmm13, %xmm13
	movsd 3 * SIZE(AO), %xmm3
	xorps %xmm15, %xmm15

	movsd 0 * SIZE(BO), %xmm4
	xorps %xmm8, %xmm8
	movsd 1 * SIZE(BO), %xmm5
	xorps %xmm10, %xmm10

	prefetcht0 3 * SIZE(CO1)
	xorps %xmm12, %xmm12
	xorps %xmm14, %xmm14

#if defined(LT) || defined(RN)
	movq KK, %rax
#else
	movq K, %rax
	subq KK, %rax
#endif
	sarq $2, %rax
	je .L45
	ALIGN_4
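
/* Inner K loop, unrolled 4x; loads are interleaved with the */
/* multiply-adds to overlap memory latency with arithmetic.  */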
.L42:
	addsd %xmm9, %xmm8
	movsd 4 * SIZE(AO), %xmm9
	mulsd %xmm4, %xmm0
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
	addsd %xmm11, %xmm10
	movsd 5 * SIZE(AO), %xmm11
	mulsd %xmm4, %xmm1
	addsd %xmm13, %xmm12
	movsd 6 * SIZE(AO), %xmm13
	mulsd %xmm4, %xmm2
	addsd %xmm15, %xmm14
	movsd 7 * SIZE(AO), %xmm15
	mulsd %xmm4, %xmm3
	movsd 2 * SIZE(BO), %xmm4
	addsd %xmm0, %xmm8
	movsd 8 * SIZE(AO), %xmm0
	mulsd %xmm5, %xmm9
	addsd %xmm1, %xmm10
	movsd 9 * SIZE(AO), %xmm1
	mulsd %xmm5, %xmm11
	addsd %xmm2, %xmm12
	movsd 10 * SIZE(AO), %xmm2
	mulsd %xmm5, %xmm13
	addsd %xmm3, %xmm14
	movsd 11 * SIZE(AO), %xmm3
	mulsd %xmm5, %xmm15
	movsd 3 * SIZE(BO), %xmm5
	addsd %xmm9, %xmm8
	movsd 12 * SIZE(AO), %xmm9
	mulsd %xmm4, %xmm0
	PREFETCH (PREFETCHSIZE + 8) * SIZE(AO)
	addsd %xmm11, %xmm10
	movsd 13 * SIZE(AO), %xmm11
	mulsd %xmm4, %xmm1
	addsd %xmm13, %xmm12
	movsd 14 * SIZE(AO), %xmm13
	mulsd %xmm4, %xmm2
	addsd %xmm15, %xmm14
	movsd 15 * SIZE(AO), %xmm15
	mulsd %xmm4, %xmm3
	movsd 4 * SIZE(BO), %xmm4
	subq $-16 * SIZE, AO
	addsd %xmm0, %xmm8
	movsd 0 * SIZE(AO), %xmm0
	mulsd %xmm5, %xmm9
	addsd %xmm1, %xmm10
	movsd 1 * SIZE(AO), %xmm1
	mulsd %xmm5, %xmm11
	addq $4 * SIZE, BO
	addsd %xmm2, %xmm12
	movsd 2 * SIZE(AO), %xmm2
	mulsd %xmm5, %xmm13
	decq %rax
	addsd %xmm3, %xmm14
	movsd 3 * SIZE(AO), %xmm3
	mulsd %xmm5, %xmm15
	movsd 1 * SIZE(BO), %xmm5
	jne .L42
	ALIGN_4

.L45:
#if defined(LT) || defined(RN)
	movq KK, %rax
#else
	movq K, %rax
	subq KK, %rax
#endif
	addsd %xmm9, %xmm8
	addsd %xmm11, %xmm10
	addsd %xmm13, %xmm12
	addsd %xmm15, %xmm14

	andq $3, %rax
	BRANCH
	BRANCH
	je .L49
	ALIGN_4
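
/* Tail loop for the remaining K % 4 iterations. */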
.L46:
	mulsd %xmm4, %xmm0
	mulsd %xmm4, %xmm1
	mulsd %xmm4, %xmm2
	mulsd %xmm4, %xmm3
	movsd 1 * SIZE(BO), %xmm4
	addsd %xmm0, %xmm8
	movsd 4 * SIZE(AO), %xmm0
	addsd %xmm1, %xmm10
	movsd 5 * SIZE(AO), %xmm1
	addsd %xmm2, %xmm12
	movsd 6 * SIZE(AO), %xmm2
	addsd %xmm3, %xmm14
	movsd 7 * SIZE(AO), %xmm3
	addq $4 * SIZE, AO
	addq $1 * SIZE, BO
	decq %rax
	BRANCH
	jg .L46
	ALIGN_4
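
/* Triangular solve on the 4x1 block, then the result is stored */
/* both to C and back into the packed buffer.                   */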
.L49:
#if defined(LN) || defined(RT)
	movq KK, %rax
#ifdef LN
	subq $4, %rax
#else
	subq $1, %rax
#endif
	leaq (, %rax, SIZE), %rax
	movq AORIG, AO
	leaq (AO, %rax, 4), AO
	leaq (B, %rax, 1), BO
#endif

#if defined(LN) || defined(LT)
	movsd 0 * SIZE(BO), %xmm0
	movsd 1 * SIZE(BO), %xmm2
	movsd 2 * SIZE(BO), %xmm4
	movsd 3 * SIZE(BO), %xmm6
	subsd %xmm8, %xmm0
	subsd %xmm10, %xmm2
	subsd %xmm12, %xmm4
	subsd %xmm14, %xmm6
#else
	movsd 0 * SIZE(AO), %xmm0
	movsd 1 * SIZE(AO), %xmm2
	movsd 2 * SIZE(AO), %xmm4
	movsd 3 * SIZE(AO), %xmm6
	subsd %xmm8, %xmm0
	subsd %xmm10, %xmm2
	subsd %xmm12, %xmm4
	subsd %xmm14, %xmm6
#endif

#ifdef LN
	movsd 15 * SIZE(AO), %xmm8
	mulsd %xmm8, %xmm6
	movsd 14 * SIZE(AO), %xmm9
	mulsd %xmm6, %xmm9
	movsd 13 * SIZE(AO), %xmm11
	subsd %xmm9, %xmm4
	movsd 12 * SIZE(AO), %xmm13
	mulsd %xmm6, %xmm11
	movsd 10 * SIZE(AO), %xmm8
	subsd %xmm11, %xmm2
	movsd 9 * SIZE(AO), %xmm9
	mulsd %xmm6, %xmm13
	movsd 8 * SIZE(AO), %xmm11
	subsd %xmm13, %xmm0
	mulsd %xmm8, %xmm4
	movsd 5 * SIZE(AO), %xmm8
	mulsd %xmm4, %xmm9
	subsd %xmm9, %xmm2
	movsd 4 * SIZE(AO), %xmm9
	mulsd %xmm4, %xmm11
	subsd %xmm11, %xmm0
	movsd 0 * SIZE(AO), %xmm11
	mulsd %xmm8, %xmm2
	mulsd %xmm2, %xmm9
	subsd %xmm9, %xmm0
	mulsd %xmm11, %xmm0
#endif

#ifdef LT
	movsd 0 * SIZE(AO), %xmm8
	mulsd %xmm8, %xmm0
	movsd 1 * SIZE(AO), %xmm9
	mulsd %xmm0, %xmm9
	movsd 2 * SIZE(AO), %xmm11
	subsd %xmm9, %xmm2
	movsd 3 * SIZE(AO), %xmm13
	mulsd %xmm0, %xmm11
	movsd 5 * SIZE(AO), %xmm8
	subsd %xmm11, %xmm4
	movsd 6 * SIZE(AO), %xmm9
	mulsd %xmm0, %xmm13
	movsd 7 * SIZE(AO), %xmm11
	subsd %xmm13, %xmm6
	mulsd %xmm8, %xmm2
	movsd 10 * SIZE(AO), %xmm8
	mulsd %xmm2, %xmm9
	subsd %xmm9, %xmm4
	movsd 11 * SIZE(AO), %xmm9
	mulsd %xmm2, %xmm11
	subsd %xmm11, %xmm6
	mulsd %xmm8, %xmm4
	movsd 15 * SIZE(AO), %xmm8
	mulsd %xmm4, %xmm9
	subsd %xmm9, %xmm6
	mulsd %xmm8, %xmm6
#endif

#if defined(RN) || defined(RT)
	movsd 0 * SIZE(BO), %xmm8
	mulsd %xmm8, %xmm0
	mulsd %xmm8, %xmm2
	mulsd %xmm8, %xmm4
	mulsd %xmm8, %xmm6
#endif

#ifdef LN
	subq $4 * SIZE, CO1
#endif

	movsd %xmm0, 0 * SIZE(CO1)
	movsd %xmm2, 1 * SIZE(CO1)
	movsd %xmm4, 2 * SIZE(CO1)
	movsd %xmm6, 3 * SIZE(CO1)

#if defined(LN) || defined(LT)
	movsd %xmm0, 0 * SIZE(BO)
	movsd %xmm2, 1 * SIZE(BO)
	movsd %xmm4, 2 * SIZE(BO)
	movsd %xmm6, 3 * SIZE(BO)
#else
	movsd %xmm0, 0 * SIZE(AO)
	movsd %xmm2, 1 * SIZE(AO)
	movsd %xmm4, 2 * SIZE(AO)
	movsd %xmm6, 3 * SIZE(AO)
#endif

#ifndef LN
	addq $4 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movq K, %rax
	subq KK, %rax
	leaq (, %rax, SIZE), %rax
	leaq (AO, %rax, 4), AO
	leaq (BO, %rax, 1), BO
#endif

#ifdef LN
	subq $4, KK
#endif

#ifdef LT
	addq $4, KK
#endif

#ifdef RT
	movq K, %rax
	salq $2 + BASE_SHIFT, %rax
	addq %rax, AORIG
#endif

	decq I # i --
	jg .L41
	ALIGN_4

.L50:
	testq $2, M
	je .L60

#ifdef LN
	movq K, %rax
	salq $1 + BASE_SHIFT, %rax
	subq %rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq KK, %rax
	leaq (, %rax, SIZE), %rax
	movq AORIG, AO
	leaq (AO, %rax, 2), AO
	leaq (B, %rax, 1), BO
#else
	movq B, BO
#endif

	movsd 0 * SIZE(AO), %xmm0
	xorps %xmm2, %xmm2
	movsd 1 * SIZE(AO), %xmm1
	xorps %xmm3, %xmm3

	movsd 0 * SIZE(BO), %xmm4
	xorps %xmm8, %xmm8
	movsd 1 * SIZE(BO), %xmm5
	xorps %xmm10, %xmm10

#if defined(LT) || defined(RN)
	movq KK, %rax
#else
	movq K, %rax
	subq KK, %rax
#endif
	sarq $2, %rax
	je .L55
	ALIGN_4

.L52:
	addsd %xmm2, %xmm8
	movsd 2 * SIZE(AO), %xmm2
	mulsd %xmm4, %xmm0
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
	addsd %xmm3, %xmm10
	movsd 3 * SIZE(AO), %xmm3
	mulsd %xmm4, %xmm1
	movsd 2 * SIZE(BO), %xmm4
	addsd %xmm0, %xmm8
	movsd 4 * SIZE(AO), %xmm0
	mulsd %xmm5, %xmm2
	addq $8 * SIZE, AO
	addsd %xmm1, %xmm10
	movsd -3 * SIZE(AO), %xmm1
	mulsd %xmm5, %xmm3
	movsd 3 * SIZE(BO), %xmm5
	addsd %xmm2, %xmm8
	movsd -2 * SIZE(AO), %xmm2
	mulsd %xmm4, %xmm0
	addq $4 * SIZE, BO
	addsd %xmm3, %xmm10
	movsd -1 * SIZE(AO), %xmm3
	mulsd %xmm4, %xmm1
	movsd 0 * SIZE(BO), %xmm4
	addsd %xmm0, %xmm8
	movsd 0 * SIZE(AO), %xmm0
	mulsd %xmm5, %xmm2
	decq %rax
	addsd %xmm1, %xmm10
	movsd 1 * SIZE(AO), %xmm1
	mulsd %xmm5, %xmm3
	movsd 1 * SIZE(BO), %xmm5
	jne .L52
	ALIGN_4

.L55:
#if defined(LT) || defined(RN)
	movq KK, %rax
#else
	movq K, %rax
	subq KK, %rax
#endif
	addsd %xmm2, %xmm8
	addsd %xmm3, %xmm10

	andq $3, %rax
	BRANCH
	je .L59
	ALIGN_4

.L56:
	mulsd %xmm4, %xmm0
	mulsd %xmm4, %xmm1
	movsd 1 * SIZE(BO), %xmm4
	addsd %xmm0, %xmm8
	movsd 2 * SIZE(AO), %xmm0
	addsd %xmm1, %xmm10
	movsd 3 * SIZE(AO), %xmm1
	addq $2 * SIZE, AO
	addq $1 * SIZE, BO
	decq %rax
	BRANCH
	jg .L56
	ALIGN_4

.L59:
#if defined(LN) || defined(RT)
	movq KK, %rax
#ifdef LN
	subq $2, %rax
#else
	subq $1, %rax
#endif
	leaq (, %rax, SIZE), %rax
	movq AORIG, AO
	leaq (AO, %rax, 2), AO
	leaq (B, %rax, 1), BO
#endif

#if defined(LN) || defined(LT)
	movsd 0 * SIZE(BO), %xmm0
	movsd 1 * SIZE(BO), %xmm2
	subsd %xmm8, %xmm0
	subsd %xmm10, %xmm2
#else
	movsd 0 * SIZE(AO), %xmm0
	movsd 1 * SIZE(AO), %xmm2
	subsd %xmm8, %xmm0
	subsd %xmm10, %xmm2
#endif

#ifdef LN
	movsd 3 * SIZE(AO), %xmm8
	movsd 2 * SIZE(AO), %xmm9
	movsd 0 * SIZE(AO), %xmm11
	mulsd %xmm8, %xmm2
	mulsd %xmm2, %xmm9
	subsd %xmm9, %xmm0
	mulsd %xmm11, %xmm0
#endif

#ifdef LT
	movsd 0 * SIZE(AO), %xmm8
	movsd 1 * SIZE(AO), %xmm9
	movsd 3 * SIZE(AO), %xmm11
	mulsd %xmm8, %xmm0
	mulsd %xmm0, %xmm9
	subsd %xmm9, %xmm2
	mulsd %xmm11, %xmm2
#endif

#if defined(RN) || defined(RT)
	movsd 0 * SIZE(BO), %xmm8
	mulsd %xmm8, %xmm0
	mulsd %xmm8, %xmm2
#endif

#ifdef LN
	subq $2 * SIZE, CO1
#endif

	movsd %xmm0, 0 * SIZE(CO1)
	movsd %xmm2, 1 * SIZE(CO1)

#if defined(LN) || defined(LT)
	movsd %xmm0, 0 * SIZE(BO)
	movsd %xmm2, 1 * SIZE(BO)
#else
	movsd %xmm0, 0 * SIZE(AO)
	movsd %xmm2, 1 * SIZE(AO)
#endif

#ifndef LN
	addq $2 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movq K, %rax
	subq KK, %rax
	leaq (, %rax, SIZE), %rax
	leaq (AO, %rax, 2), AO
	leaq (BO, %rax, 1), BO
#endif

#ifdef LN
	subq $2, KK
#endif

#ifdef LT
	addq $2, KK
#endif

#ifdef RT
	movq K, %rax
	salq $1 + BASE_SHIFT, %rax
	addq %rax, AORIG
#endif
	ALIGN_4

.L60:
	testq $1, M
	je .L69

#ifdef LN
	movq K, %rax
	salq $0 + BASE_SHIFT, %rax
	subq %rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq KK, %rax
	leaq (, %rax, SIZE), %rax
	movq AORIG, AO
	leaq (AO, %rax, 1), AO
	leaq (B, %rax, 1), BO
#else
	movq B, BO
#endif

	movsd 0 * SIZE(AO), %xmm0
	xorps %xmm5, %xmm5
	movsd 1 * SIZE(AO), %xmm2
	xorps %xmm7, %xmm7

	movsd 0 * SIZE(BO), %xmm1
	xorps %xmm8, %xmm8
	movsd 1 * SIZE(BO), %xmm3
	xorps %xmm9, %xmm9

	movsd 2 * SIZE(AO), %xmm4
	movsd 3 * SIZE(AO), %xmm6

#if defined(LT) || defined(RN)
	movq KK, %rax
#else
	movq K, %rax
	subq KK, %rax
#endif
	sarq $2, %rax
	je .L65
	ALIGN_4

.L62:
	addsd %xmm5, %xmm8
	movsd 2 * SIZE(BO), %xmm5
	mulsd %xmm0, %xmm1
	movsd 4 * SIZE(AO), %xmm0
	addsd %xmm7, %xmm9
	movsd 3 * SIZE(BO), %xmm7
	mulsd %xmm2, %xmm3
	movsd 5 * SIZE(AO), %xmm2
	addsd %xmm1, %xmm8
	movsd 4 * SIZE(BO), %xmm1
	mulsd %xmm4, %xmm5
	movsd 6 * SIZE(AO), %xmm4
	addsd %xmm3, %xmm9
	movsd 5 * SIZE(BO), %xmm3
	mulsd %xmm6, %xmm7
	movsd 7 * SIZE(AO), %xmm6
	addq $4 * SIZE, AO
	addq $4 * SIZE, BO
	decq %rax
	jne .L62

	addsd %xmm5, %xmm8
	addsd %xmm7, %xmm9
	ALIGN_4

.L65:
#if defined(LT) || defined(RN)
	movq KK, %rax
#else
	movq K, %rax
	subq KK, %rax
#endif
	andq $3, %rax
	BRANCH
	je .L68
	ALIGN_4

.L66:
	movsd 0 * SIZE(AO), %xmm0
	movsd 0 * SIZE(BO), %xmm1
	mulsd %xmm0, %xmm1
	addsd %xmm1, %xmm8
	addq $1 * SIZE, AO
	addq $1 * SIZE, BO
	decq %rax
	BRANCH
	jg .L66
	ALIGN_4

.L68:
	addsd %xmm9, %xmm8

#if defined(LN) || defined(RT)
	movq KK, %rax
#ifdef LN
	subq $1, %rax
#else
	subq $1, %rax
#endif
	leaq (, %rax, SIZE), %rax
	movq AORIG, AO
	leaq (AO, %rax, 1), AO
	leaq (B, %rax, 1), BO
#endif

#if defined(LN) || defined(LT)
	movsd 0 * SIZE(BO), %xmm0
	subsd %xmm8, %xmm0
#else
	movsd 0 * SIZE(AO), %xmm0
	subsd %xmm8, %xmm0
#endif

#if defined(LN) || defined(LT)
	movsd 0 * SIZE(AO), %xmm8
	mulsd %xmm8, %xmm0
#endif

#if defined(RN) || defined(RT)
	movsd 0 * SIZE(BO), %xmm8
	mulsd %xmm8, %xmm0
#endif

#ifdef LN
	subq $1 * SIZE, CO1
#endif

	movsd %xmm0, 0 * SIZE(CO1)

#if defined(LN) || defined(LT)
	movsd %xmm0, 0 * SIZE(BO)
#else
	movsd %xmm0, 0 * SIZE(AO)
#endif

#ifndef LN
	addq $1 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movq K, %rax
	subq KK, %rax
	leaq (, %rax, SIZE), %rax
	leaq (AO, %rax, 1), AO
	leaq (BO, %rax, 1), BO
#endif

#ifdef LN
	subq $1, KK
#endif

#ifdef LT
	addq $1, KK
#endif

#ifdef RT
	movq K, %rax
	salq $0 + BASE_SHIFT, %rax
	addq %rax, AORIG
#endif
	ALIGN_4

.L69:
#ifdef LN
	leaq (, K, SIZE), %rax
	leaq (B, %rax, 1), B
#endif

#if defined(LT) || defined(RN)
	movq BO, B
#endif

#ifdef RN
	addq $1, KK
#endif

#ifdef RT
	subq $1, KK
#endif
	ALIGN_2

.L40:
	movq N, J
	sarq $1, J
	jle .L999
	ALIGN_4
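
/* Main loop over pairs of columns of B (J = N / 2). */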
.L10:
#if defined(LT) || defined(RN)
	movq A, AO
#else
	movq A, AORIG
#endif

#ifdef RT
	movq K, %rax
	salq $1 + BASE_SHIFT, %rax
	subq %rax, B
	leaq (, LDC, 2), %rax
	subq %rax, C
#endif

	movq C, CO1
	leaq (C, LDC, 1), CO2
#ifndef RT
	leaq (C, LDC, 2), C
#endif

#ifdef LN
	movq OFFSET, %rax
	addq M, %rax
	movq %rax, KK
#endif

	movq K, %rax
	salq $BASE_SHIFT + 1, %rax
	leaq (B, %rax), BB

#ifdef LT
	movq OFFSET, %rax
	movq %rax, KK
#endif

	movq M, I
	sarq $2, I
	jle .L20
	ALIGN_4

.L11:
#ifdef LN
	movq K, %rax
	salq $2 + BASE_SHIFT, %rax
	subq %rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq KK, %rax
	leaq (, %rax, SIZE), %rax
	movq AORIG, AO
	leaq (AO, %rax, 4), AO
	leaq (B, %rax, 2), BO
#else
	movq B, BO
#endif

	prefetcht0 0 * SIZE(BB)
	subq $-8 * SIZE, BB

	movsd 0 * SIZE(AO), %xmm0
	xorps %xmm2, %xmm2
	movsd 1 * SIZE(AO), %xmm4
	xorps %xmm5, %xmm5
	movsd 2 * SIZE(AO), %xmm5
	xorps %xmm6, %xmm6
	xorps %xmm7, %xmm7

	movsd 0 * SIZE(BO), %xmm1
	xorps %xmm8, %xmm8
	xorps %xmm9, %xmm9
	movsd 1 * SIZE(BO), %xmm3
	xorps %xmm10, %xmm10
	xorps %xmm11, %xmm11

	prefetcht0 3 * SIZE(CO1)
	xorps %xmm12, %xmm12
	xorps %xmm13, %xmm13
	prefetcht0 3 * SIZE(CO2)
	xorps %xmm14, %xmm14
	xorps %xmm15, %xmm15

#if defined(LT) || defined(RN)
	movq KK, %rax
#else
	movq K, %rax
	subq KK, %rax
#endif
	sarq $2, %rax
	je .L15
	ALIGN_4
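
/* 4x2 inner K loop, unrolled 4x: eight accumulators, one per */
/* element of the 4x2 block of C.                             */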
.L12:
	addsd %xmm2, %xmm13
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
	movaps %xmm0, %xmm2
	mulsd %xmm1, %xmm0
	addsd %xmm7, %xmm14
	movsd 3 * SIZE(AO), %xmm7
	mulsd %xmm3, %xmm2
	addsd %xmm6, %xmm15
	PREFETCH (PREFETCHSIZE + 0) * SIZE(BO)
	movaps %xmm4, %xmm6
	mulsd %xmm1, %xmm4
	addsd %xmm0, %xmm8
	movsd 4 * SIZE(AO), %xmm0
	mulsd %xmm3, %xmm6
	addsd %xmm2, %xmm9
	movaps %xmm5, %xmm2
	mulsd %xmm1, %xmm5
	addsd %xmm4, %xmm10
	movsd 5 * SIZE(AO), %xmm4
	mulsd %xmm3, %xmm2
	addsd %xmm6, %xmm11
	movaps %xmm7, %xmm6
	mulsd %xmm1, %xmm7
	movsd 2 * SIZE(BO), %xmm1
	addsd %xmm5, %xmm12
	movsd 6 * SIZE(AO), %xmm5
	mulsd %xmm3, %xmm6
	movsd 3 * SIZE(BO), %xmm3
	addsd %xmm2, %xmm13
	movaps %xmm0, %xmm2
	mulsd %xmm1, %xmm0
	addsd %xmm7, %xmm14
	movsd 7 * SIZE(AO), %xmm7
	mulsd %xmm3, %xmm2
	addsd %xmm6, %xmm15
	movaps %xmm4, %xmm6
	mulsd %xmm1, %xmm4
	addsd %xmm0, %xmm8
	movsd 8 * SIZE(AO), %xmm0
	mulsd %xmm3, %xmm6
	addsd %xmm2, %xmm9
	movaps %xmm5, %xmm2
	mulsd %xmm1, %xmm5
	addsd %xmm4, %xmm10
	movsd 9 * SIZE(AO), %xmm4
	mulsd %xmm3, %xmm2
	addsd %xmm6, %xmm11
	movaps %xmm7, %xmm6
	mulsd %xmm1, %xmm7
	movsd 4 * SIZE(BO), %xmm1
	addsd %xmm5, %xmm12
	movsd 10 * SIZE(AO), %xmm5
	mulsd %xmm3, %xmm6
	movsd 5 * SIZE(BO), %xmm3
	addsd %xmm2, %xmm13
	PREFETCH (PREFETCHSIZE + 8) * SIZE(AO)
	movaps %xmm0, %xmm2
	mulsd %xmm1, %xmm0
	addsd %xmm7, %xmm14
	movsd 11 * SIZE(AO), %xmm7
	mulsd %xmm3, %xmm2
	addsd %xmm6, %xmm15
	movaps %xmm4, %xmm6
	mulsd %xmm1, %xmm4
	addsd %xmm0, %xmm8
	movsd 12 * SIZE(AO), %xmm0
	mulsd %xmm3, %xmm6
	addsd %xmm2, %xmm9
	movaps %xmm5, %xmm2
	mulsd %xmm1, %xmm5
	addsd %xmm4, %xmm10
	movsd 13 * SIZE(AO), %xmm4
	mulsd %xmm3, %xmm2
	addsd %xmm6, %xmm11
	movaps %xmm7, %xmm6
	mulsd %xmm1, %xmm7
	movsd 6 * SIZE(BO), %xmm1
	addsd %xmm5, %xmm12
	movsd 14 * SIZE(AO), %xmm5
	mulsd %xmm3, %xmm6
	movsd 7 * SIZE(BO), %xmm3
	addsd %xmm2, %xmm13
	movaps %xmm0, %xmm2
	mulsd %xmm1, %xmm0
	addsd %xmm7, %xmm14
	movsd 15 * SIZE(AO), %xmm7
	mulsd %xmm3, %xmm2
	subq $-16 * SIZE, AO
	addsd %xmm6, %xmm15
	movaps %xmm4, %xmm6
	mulsd %xmm1, %xmm4
	addsd %xmm0, %xmm8
	movsd 0 * SIZE(AO), %xmm0
	mulsd %xmm3, %xmm6
	addsd %xmm2, %xmm9
	movaps %xmm5, %xmm2
	mulsd %xmm1, %xmm5
	addq $8 * SIZE, BO
	addsd %xmm4, %xmm10
	movsd 1 * SIZE(AO), %xmm4
	mulsd %xmm3, %xmm2
	decq %rax
	addsd %xmm6, %xmm11
	movaps %xmm7, %xmm6
	mulsd %xmm1, %xmm7
	movsd 0 * SIZE(BO), %xmm1
	addsd %xmm5, %xmm12
	movsd 2 * SIZE(AO), %xmm5
	mulsd %xmm3, %xmm6
	movsd 1 * SIZE(BO), %xmm3
	jne .L12
	ALIGN_4

.L15:
#if defined(LT) || defined(RN)
	movq KK, %rax
#else
	movq K, %rax
	subq KK, %rax
#endif
	andq $3, %rax
	BRANCH
	je .L19
	ALIGN_4

.L16:
	addsd %xmm2, %xmm13
	movaps %xmm0, %xmm2
	mulsd %xmm1, %xmm0
	addsd %xmm7, %xmm14
	movsd 3 * SIZE(AO), %xmm7
	mulsd %xmm3, %xmm2
	addsd %xmm6, %xmm15
	movaps %xmm4, %xmm6
	mulsd %xmm1, %xmm4
	addsd %xmm0, %xmm8
	movsd 4 * SIZE(AO), %xmm0
	mulsd %xmm3, %xmm6
	addsd %xmm2, %xmm9
	movaps %xmm5, %xmm2
	mulsd %xmm1, %xmm5
	addsd %xmm4, %xmm10
	movsd 5 * SIZE(AO), %xmm4
	mulsd %xmm3, %xmm2
	addsd %xmm6, %xmm11
	movaps %xmm7, %xmm6
	mulsd %xmm1, %xmm7
	movsd 2 * SIZE(BO), %xmm1
	addsd %xmm5, %xmm12
	movsd 6 * SIZE(AO), %xmm5
	mulsd %xmm3, %xmm6
	movsd 3 * SIZE(BO), %xmm3
	addq $4 * SIZE, AO
	addq $2 * SIZE, BO
	decq %rax
	BRANCH
	jg .L16
	ALIGN_4

.L19:
	addsd %xmm2, %xmm13
	addsd %xmm7, %xmm14
	addsd %xmm6, %xmm15

#if defined(LN) || defined(RT)
	movq KK, %rax
#ifdef LN
	subq $4, %rax
#else
	subq $2, %rax
#endif
	leaq (, %rax, SIZE), %rax
	movq AORIG, AO
	leaq (AO, %rax, 4), AO
	leaq (B, %rax, 2), BO
#endif

#if defined(LN) || defined(LT)
	movsd 0 * SIZE(BO), %xmm0
	movsd 1 * SIZE(BO), %xmm1
	movsd 2 * SIZE(BO), %xmm2
	movsd 3 * SIZE(BO), %xmm3
	movsd 4 * SIZE(BO), %xmm4
	movsd 5 * SIZE(BO), %xmm5
	movsd 6 * SIZE(BO), %xmm6
	movsd 7 * SIZE(BO), %xmm7
	subsd %xmm8, %xmm0
	subsd %xmm9, %xmm1
	subsd %xmm10, %xmm2
	subsd %xmm11, %xmm3
	subsd %xmm12, %xmm4
	subsd %xmm13, %xmm5
	subsd %xmm14, %xmm6
	subsd %xmm15, %xmm7
#else
	movsd 0 * SIZE(AO), %xmm0
	movsd 1 * SIZE(AO), %xmm2
	movsd 2 * SIZE(AO), %xmm4
	movsd 3 * SIZE(AO), %xmm6
	movsd 4 * SIZE(AO), %xmm1
	movsd 5 * SIZE(AO), %xmm3
	movsd 6 * SIZE(AO), %xmm5
	movsd 7 * SIZE(AO), %xmm7
	subsd %xmm8, %xmm0
	subsd %xmm10, %xmm2
	subsd %xmm12, %xmm4
	subsd %xmm14, %xmm6
	subsd %xmm9, %xmm1
	subsd %xmm11, %xmm3
	subsd %xmm13, %xmm5
	subsd %xmm15, %xmm7
#endif

#ifdef LN
	movsd 15 * SIZE(AO), %xmm8
	mulsd %xmm8, %xmm6
	movsd 14 * SIZE(AO), %xmm9
	mulsd %xmm8, %xmm7
	movsd 13 * SIZE(AO), %xmm11
	movaps %xmm9, %xmm10
	movsd 12 * SIZE(AO), %xmm13
	mulsd %xmm6, %xmm9
	movsd 10 * SIZE(AO), %xmm8
	mulsd %xmm7, %xmm10
	subsd %xmm9, %xmm4
	movsd 9 * SIZE(AO), %xmm9
	subsd %xmm10, %xmm5
	movaps %xmm11, %xmm12
	mulsd %xmm6, %xmm11
	mulsd %xmm7, %xmm12
	subsd %xmm11, %xmm2
	movsd 8 * SIZE(AO), %xmm11
	subsd %xmm12, %xmm3
	movaps %xmm13, %xmm14
	mulsd %xmm6, %xmm13
	mulsd %xmm7, %xmm14
	subsd %xmm13, %xmm0
	subsd %xmm14, %xmm1
	mulsd %xmm8, %xmm4
	mulsd %xmm8, %xmm5
	movsd 5 * SIZE(AO), %xmm8
	movaps %xmm9, %xmm10
	mulsd %xmm4, %xmm9
	mulsd %xmm5, %xmm10
	subsd %xmm9, %xmm2
	movsd 4 * SIZE(AO), %xmm9
	subsd %xmm10, %xmm3
	movaps %xmm11, %xmm12
	mulsd %xmm4, %xmm11
	mulsd %xmm5, %xmm12
	subsd %xmm11, %xmm0
	movsd 0 * SIZE(AO), %xmm11
	subsd %xmm12, %xmm1
	mulsd %xmm8, %xmm2
	mulsd %xmm8, %xmm3
	movaps %xmm9, %xmm10
	mulsd %xmm2, %xmm9
	mulsd %xmm3, %xmm10
	subsd %xmm9, %xmm0
	subsd %xmm10, %xmm1
	mulsd %xmm11, %xmm0
	mulsd %xmm11, %xmm1
#endif

#ifdef LT
	movsd 0 * SIZE(AO), %xmm8
	mulsd %xmm8, %xmm0
	movsd 1 * SIZE(AO), %xmm9
	mulsd %xmm8, %xmm1
	movsd 2 * SIZE(AO), %xmm11
	movaps %xmm9, %xmm10
	movsd 3 * SIZE(AO), %xmm13
	mulsd %xmm0, %xmm9
	movsd 5 * SIZE(AO), %xmm8
	mulsd %xmm1, %xmm10
	subsd %xmm9, %xmm2
	movsd 6 * SIZE(AO), %xmm9
	subsd %xmm10, %xmm3
	movaps %xmm11, %xmm12
	mulsd %xmm0, %xmm11
	mulsd %xmm1, %xmm12
	subsd %xmm11, %xmm4
	movsd 7 * SIZE(AO), %xmm11
	subsd %xmm12, %xmm5
	movaps %xmm13, %xmm14
	mulsd %xmm0, %xmm13
	mulsd %xmm1, %xmm14
	subsd %xmm13, %xmm6
	subsd %xmm14, %xmm7
	mulsd %xmm8, %xmm2
	mulsd %xmm8, %xmm3
	movsd 10 * SIZE(AO), %xmm8
	movaps %xmm9, %xmm10
	mulsd %xmm2, %xmm9
	mulsd %xmm3, %xmm10
	subsd %xmm9, %xmm4
	movsd 11 * SIZE(AO), %xmm9
	subsd %xmm10, %xmm5
	movaps %xmm11, %xmm12
	mulsd %xmm2, %xmm11
	mulsd %xmm3, %xmm12
	subsd %xmm11, %xmm6
	subsd %xmm12, %xmm7
	mulsd %xmm8, %xmm4
	mulsd %xmm8, %xmm5
	movsd 15 * SIZE(AO), %xmm8
	movaps %xmm9, %xmm10
	mulsd %xmm4, %xmm9
	mulsd %xmm5, %xmm10
	subsd %xmm9, %xmm6
	subsd %xmm10, %xmm7
	mulsd %xmm8, %xmm6
	mulsd %xmm8, %xmm7
#endif

#ifdef RN
	movsd 0 * SIZE(BO), %xmm8
	mulsd %xmm8, %xmm0
	movsd 1 * SIZE(BO), %xmm9
	mulsd %xmm8, %xmm2
	movsd 3 * SIZE(BO), %xmm13
	mulsd %xmm8, %xmm4
	mulsd %xmm8, %xmm6
	movaps %xmm9, %xmm10
	movaps %xmm9, %xmm11
	movaps %xmm9, %xmm12
	mulsd %xmm0, %xmm9
	mulsd %xmm2, %xmm10
	mulsd %xmm4, %xmm11
	mulsd %xmm6, %xmm12
	subsd %xmm9, %xmm1
	subsd %xmm10, %xmm3
	subsd %xmm11, %xmm5
	subsd %xmm12, %xmm7
	mulsd %xmm13, %xmm1
	mulsd %xmm13, %xmm3
	mulsd %xmm13, %xmm5
	mulsd %xmm13, %xmm7
#endif

#ifdef RT
	movsd 3 * SIZE(BO), %xmm8
	mulsd %xmm8, %xmm1
	movsd 2 * SIZE(BO), %xmm9
	mulsd %xmm8, %xmm3
	movsd 0 * SIZE(BO), %xmm13
	mulsd %xmm8, %xmm5
	mulsd %xmm8, %xmm7
	movaps %xmm9, %xmm10
	movaps %xmm9, %xmm11
	movaps %xmm9, %xmm12
	mulsd %xmm1, %xmm9
	mulsd %xmm3, %xmm10
	mulsd %xmm5, %xmm11
	mulsd %xmm7, %xmm12
	subsd %xmm9, %xmm0
	subsd %xmm10, %xmm2
	subsd %xmm11, %xmm4
	subsd %xmm12, %xmm6
	mulsd %xmm13, %xmm0
	mulsd %xmm13, %xmm2
	mulsd %xmm13, %xmm4
	mulsd %xmm13, %xmm6
#endif

#ifdef LN
	subq $4 * SIZE, CO1
	subq $4 * SIZE, CO2
#endif

	movsd %xmm0, 0 * SIZE(CO1)
	movsd %xmm2, 1 * SIZE(CO1)
	movsd %xmm4, 2 * SIZE(CO1)
	movsd %xmm6, 3 * SIZE(CO1)

	movsd %xmm1, 0 * SIZE(CO2)
	movsd %xmm3, 1 * SIZE(CO2)
	movsd %xmm5, 2 * SIZE(CO2)
	movsd %xmm7, 3 * SIZE(CO2)

#if defined(LN) || defined(LT)
	movsd %xmm0, 0 * SIZE(BO)
	movsd %xmm1, 1 * SIZE(BO)
	movsd %xmm2, 2 * SIZE(BO)
	movsd %xmm3, 3 * SIZE(BO)
	movsd %xmm4, 4 * SIZE(BO)
	movsd %xmm5, 5 * SIZE(BO)
	movsd %xmm6, 6 * SIZE(BO)
	movsd %xmm7, 7 * SIZE(BO)
#else
	movsd %xmm0, 0 * SIZE(AO)
	movsd %xmm2, 1 * SIZE(AO)
	movsd %xmm4, 2 * SIZE(AO)
	movsd %xmm6, 3 * SIZE(AO)
	movsd %xmm1, 4 * SIZE(AO)
	movsd %xmm3, 5 * SIZE(AO)
	movsd %xmm5, 6 * SIZE(AO)
	movsd %xmm7, 7 * SIZE(AO)
#endif

#ifndef LN
	addq $4 * SIZE, CO1
	addq $4 * SIZE, CO2
#endif

#if defined(LT) || defined(RN)
	movq K, %rax
	subq KK, %rax
	leaq (, %rax, SIZE), %rax
	leaq (AO, %rax, 4), AO
	leaq (BO, %rax, 2), BO
#endif

#ifdef LN
	subq $4, KK
#endif

#ifdef LT
	addq $4, KK
#endif

#ifdef RT
	movq K, %rax
	salq $2 + BASE_SHIFT, %rax
	addq %rax, AORIG
#endif

	decq I # i --
	jg .L11
	ALIGN_4

.L20:
	testq $2, M
	BRANCH
	je .L30

#ifdef LN
	movq K, %rax
	salq $1 + BASE_SHIFT, %rax
	subq %rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq KK, %rax
	leaq (, %rax, SIZE), %rax
	movq AORIG, AO
	leaq (AO, %rax, 2), AO
	leaq (B, %rax, 2), BO
#else
	movq B, BO
#endif

	movsd 0 * SIZE(AO), %xmm0
	xorps %xmm2, %xmm2
	movsd 1 * SIZE(AO), %xmm4
	xorps %xmm5, %xmm5
	movsd 2 * SIZE(AO), %xmm5
	xorps %xmm6, %xmm6
	movsd 3 * SIZE(AO), %xmm7

	movsd 0 * SIZE(BO), %xmm1
	xorps %xmm8, %xmm8
	xorps %xmm9, %xmm9
	movsd 1 * SIZE(BO), %xmm3
	xorps %xmm10, %xmm10
	xorps %xmm11, %xmm11

#if defined(LT) || defined(RN)
	movq KK, %rax
#else
	movq K, %rax
	subq KK, %rax
#endif
	sarq $2, %rax
	je .L25
	ALIGN_4

.L22:
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
	addsd %xmm2, %xmm9
	movaps %xmm0, %xmm2
	mulsd %xmm1, %xmm0
	addsd %xmm6, %xmm11
	movaps %xmm4, %xmm6
	mulsd %xmm1, %xmm4
	movsd 2 * SIZE(BO), %xmm1
	addsd %xmm0, %xmm8
	movsd 4 * SIZE(AO), %xmm0
	mulsd %xmm3, %xmm2
	addsd %xmm4, %xmm10
	movsd 5 * SIZE(AO), %xmm4
	mulsd %xmm3, %xmm6
	movsd 3 * SIZE(BO), %xmm3
	addsd %xmm2, %xmm9
	movaps %xmm5, %xmm2
	mulsd %xmm1, %xmm5
	addsd %xmm6, %xmm11
	movaps %xmm7, %xmm6
	mulsd %xmm1, %xmm7
	movsd 4 * SIZE(BO), %xmm1
	addsd %xmm5, %xmm8
	movsd 6 * SIZE(AO), %xmm5
	mulsd %xmm3, %xmm2
	addsd %xmm7, %xmm10
	movsd 7 * SIZE(AO), %xmm7
	mulsd %xmm3, %xmm6
	movsd 5 * SIZE(BO), %xmm3
	addsd %xmm2, %xmm9
	movaps %xmm0, %xmm2
	mulsd %xmm1, %xmm0
	addsd %xmm6, %xmm11
	movaps %xmm4, %xmm6
	mulsd %xmm1, %xmm4
	movsd 6 * SIZE(BO), %xmm1
	addsd %xmm0, %xmm8
	movsd 8 * SIZE(AO), %xmm0
	mulsd %xmm3, %xmm2
	addsd %xmm4, %xmm10
	movsd 9 * SIZE(AO), %xmm4
	mulsd %xmm3, %xmm6
	movsd 7 * SIZE(BO), %xmm3
	addsd %xmm2, %xmm9
	movaps %xmm5, %xmm2
	mulsd %xmm1, %xmm5
	addsd %xmm6, %xmm11
	movaps %xmm7, %xmm6
	mulsd %xmm1, %xmm7
	movsd 8 * SIZE(BO), %xmm1
	addsd %xmm5, %xmm8
	movsd 10 * SIZE(AO), %xmm5
	mulsd %xmm3, %xmm2
	addsd %xmm7, %xmm10
	movsd 11 * SIZE(AO), %xmm7
	mulsd %xmm3, %xmm6
	movsd 9 * SIZE(BO), %xmm3
	addq $8 * SIZE, AO
	addq $8 * SIZE, BO
	decq %rax
	jne .L22
	ALIGN_4

.L25:
#if defined(LT) || defined(RN)
	movq KK, %rax
#else
	movq K, %rax
	subq KK, %rax
#endif
	andq $3, %rax
	BRANCH
	je .L29
	ALIGN_4

.L26:
	addsd %xmm2, %xmm9
	movaps %xmm0, %xmm2
	mulsd %xmm1, %xmm0
	addsd %xmm6, %xmm11
	movaps %xmm4, %xmm6
	mulsd %xmm1, %xmm4
	movsd 2 * SIZE(BO), %xmm1
	mulsd %xmm3, %xmm2
	addsd %xmm0, %xmm8
	movsd 2 * SIZE(AO), %xmm0
	mulsd %xmm3, %xmm6
	movsd 3 * SIZE(BO), %xmm3
	addsd %xmm4, %xmm10
	movsd 3 * SIZE(AO), %xmm4
	addq $2 * SIZE, AO
	addq $2 * SIZE, BO
	decq %rax
	BRANCH
	jg .L26
	ALIGN_4

.L29:
	addsd %xmm2, %xmm9
	addsd %xmm6, %xmm11

#if defined(LN) || defined(RT)
	movq KK, %rax
#ifdef LN
	subq $2, %rax
#else
	subq $2, %rax
#endif
	leaq (, %rax, SIZE), %rax
	movq AORIG, AO
	leaq (AO, %rax, 2), AO
	leaq (B, %rax, 2), BO
#endif

#if defined(LN) || defined(LT)
	movsd 0 * SIZE(BO), %xmm0
	movsd 1 * SIZE(BO), %xmm1
	movsd 2 * SIZE(BO), %xmm2
	movsd 3 * SIZE(BO), %xmm3
	subsd %xmm8, %xmm0
	subsd %xmm9, %xmm1
	subsd %xmm10, %xmm2
	subsd %xmm11, %xmm3
#else
	movsd 0 * SIZE(AO), %xmm0
	movsd 1 * SIZE(AO), %xmm2
	movsd 2 * SIZE(AO), %xmm1
	movsd 3 * SIZE(AO), %xmm3
	subsd %xmm8, %xmm0
	subsd %xmm10, %xmm2
	subsd %xmm9, %xmm1
	subsd %xmm11, %xmm3
#endif

#ifdef LN
	movsd 3 * SIZE(AO), %xmm8
	mulsd %xmm8, %xmm2
	movsd 2 * SIZE(AO), %xmm9
	mulsd %xmm8, %xmm3
	movsd 0 * SIZE(AO), %xmm13
	movaps %xmm9, %xmm10
	mulsd %xmm2, %xmm9
	mulsd %xmm3, %xmm10
	subsd %xmm9, %xmm0
	subsd %xmm10, %xmm1
	mulsd %xmm13, %xmm0
	mulsd %xmm13, %xmm1
#endif

#ifdef LT
	movsd 0 * SIZE(AO), %xmm8
	mulsd %xmm8, %xmm0
	movsd 1 * SIZE(AO), %xmm9
	mulsd %xmm8, %xmm1
	movsd 3 * SIZE(AO), %xmm13
	movaps %xmm9, %xmm10
	mulsd %xmm0, %xmm9
	mulsd %xmm1, %xmm10
	subsd %xmm9, %xmm2
	subsd %xmm10, %xmm3
	mulsd %xmm13, %xmm2
	mulsd %xmm13, %xmm3
#endif

#ifdef RN
	movsd 0 * SIZE(BO), %xmm8
	mulsd %xmm8, %xmm0
	movsd 1 * SIZE(BO), %xmm9
	mulsd %xmm8, %xmm2
	movsd 3 * SIZE(BO), %xmm13
	movaps %xmm9, %xmm10
	mulsd %xmm0, %xmm9
	mulsd %xmm2, %xmm10
	subsd %xmm9, %xmm1
	subsd %xmm10, %xmm3
	mulsd %xmm13, %xmm1
	mulsd %xmm13, %xmm3
#endif

#ifdef RT
	movsd 3 * SIZE(BO), %xmm8
	mulsd %xmm8, %xmm1
	movsd 2 * SIZE(BO), %xmm9
	mulsd %xmm8, %xmm3
	movsd 0 * SIZE(BO), %xmm13
	movaps %xmm9, %xmm10
	mulsd %xmm1, %xmm9
	mulsd %xmm3, %xmm10
	subsd %xmm9, %xmm0
	subsd %xmm10, %xmm2
	mulsd %xmm13, %xmm0
	mulsd %xmm13, %xmm2
#endif

#ifdef LN
	subq $2 * SIZE, CO1
	subq $2 * SIZE, CO2
#endif

	movsd %xmm0, 0 * SIZE(CO1)
	movsd %xmm2, 1 * SIZE(CO1)
	movsd %xmm1, 0 * SIZE(CO2)
	movsd %xmm3, 1 * SIZE(CO2)

#if defined(LN) || defined(LT)
	movsd %xmm0, 0 * SIZE(BO)
	movsd %xmm1, 1 * SIZE(BO)
	movsd %xmm2, 2 * SIZE(BO)
	movsd %xmm3, 3 * SIZE(BO)
#else
	movsd %xmm0, 0 * SIZE(AO)
	movsd %xmm2, 1 * SIZE(AO)
	movsd %xmm1, 2 * SIZE(AO)
	movsd %xmm3, 3 * SIZE(AO)
#endif

#ifndef LN
	addq $2 * SIZE, CO1
	addq $2 * SIZE, CO2
#endif

#if defined(LT) || defined(RN)
	movq K, %rax
	subq KK, %rax
	leaq (, %rax, SIZE), %rax
	leaq (AO, %rax, 2), AO
	leaq (BO, %rax, 2), BO
#endif

#ifdef LN
	subq $2, KK
#endif

#ifdef LT
	addq $2, KK
#endif

#ifdef RT
	movq K, %rax
	salq $1 + BASE_SHIFT, %rax
	addq %rax, AORIG
#endif
	ALIGN_4

.L30:
	testq $1, M
	je .L39

#ifdef LN
	movq K, %rax
	salq $0 + BASE_SHIFT, %rax
	subq %rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq KK, %rax
	leaq (, %rax, SIZE), %rax
	movq AORIG, AO
	leaq (AO, %rax, 1), AO
	leaq (B, %rax, 2), BO
#else
	movq B, BO
#endif

	movsd 0 * SIZE(AO), %xmm0
	xorps %xmm7, %xmm7
	movsd 1 * SIZE(AO), %xmm2
	xorps %xmm5, %xmm5

	movsd 0 * SIZE(BO), %xmm1
	xorps %xmm8, %xmm8
	xorps %xmm9, %xmm9
	movsd 1 * SIZE(BO), %xmm3

#if defined(LT) || defined(RN)
	movq KK, %rax
#else
	movq K, %rax
	subq KK, %rax
#endif
	sarq $2, %rax
	je .L35
	ALIGN_4

.L32:
	addsd %xmm5, %xmm8
	movsd 2 * SIZE(BO), %xmm5
	mulsd %xmm0, %xmm1
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
	addsd %xmm7, %xmm9
	movsd 3 * SIZE(BO), %xmm7
	mulsd %xmm0, %xmm3
	movsd 2 * SIZE(AO), %xmm0
	addsd %xmm1, %xmm8
	movsd 4 * SIZE(BO), %xmm1
	mulsd %xmm2, %xmm5
	addsd %xmm3, %xmm9
	movsd 5 * SIZE(BO), %xmm3
	mulsd %xmm2, %xmm7
	movsd 3 * SIZE(AO), %xmm2
	addsd %xmm5, %xmm8
	movsd 6 * SIZE(BO), %xmm5
	mulsd %xmm0, %xmm1
	addsd %xmm7, %xmm9
	movsd 7 * SIZE(BO), %xmm7
	mulsd %xmm0, %xmm3
	movsd 4 * SIZE(AO), %xmm0
	addsd %xmm1, %xmm8
	movsd 8 * SIZE(BO), %xmm1
	mulsd %xmm2, %xmm5
	addsd %xmm3, %xmm9
	movsd 9 * SIZE(BO), %xmm3
	mulsd %xmm2, %xmm7
	movsd 5 * SIZE(AO), %xmm2
	addq $4 * SIZE, AO
	addq $8 * SIZE, BO
	decq %rax
	jne .L32
	ALIGN_4

.L35:
#if defined(LT) || defined(RN)
	movq KK, %rax
#else
	movq K, %rax
	subq KK, %rax
#endif
	addsd %xmm5, %xmm8
	addsd %xmm7, %xmm9

	andq $3, %rax
	BRANCH
	BRANCH
	je .L38
	ALIGN_4

.L36:
	mulsd %xmm0, %xmm1
	addq $2 * SIZE, BO
	mulsd %xmm0, %xmm3
	movsd 1 * SIZE(AO), %xmm0
	addsd %xmm1, %xmm8
	movsd 0 * SIZE(BO), %xmm1
	addsd %xmm3, %xmm9
	movsd 1 * SIZE(BO), %xmm3
	addq $1 * SIZE, AO
	decq %rax
	BRANCH
	jg .L36
	ALIGN_4

.L38:
#if defined(LN) || defined(RT)
	movq KK, %rax
#ifdef LN
	subq $1, %rax
#else
	subq $2, %rax
#endif
	leaq (, %rax, SIZE), %rax
	movq AORIG, AO
	leaq (AO, %rax, 1), AO
	leaq (B, %rax, 2), BO
#endif

#if defined(LN) || defined(LT)
	movsd 0 * SIZE(BO), %xmm0
	movsd 1 * SIZE(BO), %xmm1
	subsd %xmm8, %xmm0
	subsd %xmm9, %xmm1
#else
	movsd 0 * SIZE(AO), %xmm0
	movsd 1 * SIZE(AO), %xmm1
	subsd %xmm8, %xmm0
	subsd %xmm9, %xmm1
#endif

#if defined(LN) || defined(LT)
	movsd 0 * SIZE(AO), %xmm8
	mulsd %xmm8, %xmm0
	mulsd %xmm8, %xmm1
#endif

#ifdef RN
	movsd 0 * SIZE(BO), %xmm8
	mulsd %xmm8, %xmm0
	movsd 1 * SIZE(BO), %xmm9
	mulsd %xmm0, %xmm9
	movsd 3 * SIZE(BO), %xmm13
	subsd %xmm9, %xmm1
	mulsd %xmm13, %xmm1
#endif

#ifdef RT
	movsd 3 * SIZE(BO), %xmm8
	mulsd %xmm8, %xmm1
	movsd 2 * SIZE(BO), %xmm9
	mulsd %xmm1, %xmm9
	movsd 0 * SIZE(BO), %xmm13
	subsd %xmm9, %xmm0
	mulsd %xmm13, %xmm0
#endif

#ifdef LN
	subq $1 * SIZE, CO1
	subq $1 * SIZE, CO2
#endif

	movsd %xmm0, 0 * SIZE(CO1)
	movsd %xmm1, 0 * SIZE(CO2)

#if defined(LN) || defined(LT)
	movsd %xmm0, 0 * SIZE(BO)
	movsd %xmm1, 1 * SIZE(BO)
#else
	movsd %xmm0, 0 * SIZE(AO)
	movsd %xmm1, 1 * SIZE(AO)
#endif

#ifndef LN
	addq $1 * SIZE, CO1
	addq $1 * SIZE, CO2
#endif

#if defined(LT) || defined(RN)
	movq K, %rax
	subq KK, %rax
	leaq (, %rax, SIZE), %rax
	leaq (AO, %rax, 1), AO
	leaq (BO, %rax, 2), BO
#endif

#ifdef LN
	subq $1, KK
#endif

#ifdef LT
	addq $1, KK
#endif

#ifdef RT
	movq K, %rax
	salq $0 + BASE_SHIFT, %rax
	addq %rax, AORIG
#endif
	ALIGN_4

.L39:
#ifdef LN
	leaq (, K, SIZE), %rax
	leaq (B, %rax, 2), B
#endif

#if defined(LT) || defined(RN)
	movq BO, B
#endif

#ifdef RN
	addq $2, KK
#endif

#ifdef RT
	subq $2, KK
#endif

	decq J # j --
	jg .L10
	ALIGN_4
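
/* Restore callee-saved state and return. */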
.L999:
	movq 0(%rsp), %rbx
	movq 8(%rsp), %rbp
	movq 16(%rsp), %r12
	movq 24(%rsp), %r13
	movq 32(%rsp), %r14
	movq 40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq 48(%rsp), %rdi
	movq 56(%rsp), %rsi
	movups 64(%rsp), %xmm6
	movups 80(%rsp), %xmm7
	movups 96(%rsp), %xmm8
	movups 112(%rsp), %xmm9
	movups 128(%rsp), %xmm10
	movups 144(%rsp), %xmm11
	movups 160(%rsp), %xmm12
	movups 176(%rsp), %xmm13
	movups 192(%rsp), %xmm14
	movups 208(%rsp), %xmm15
#endif

	addq $STACKSIZE, %rsp
	ret

	EPILOGUE