/* trsm_kernel_LN_4x2_atom.S (37 kB) */
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define M %rdi
  41. #define N %rsi
  42. #define K %rdx
  43. #define A %rcx
  44. #define B %r8
  45. #define C %r9
  46. #define LDC %r10
  47. #define I %r11
  48. #define AO %r13
  49. #define BO %r14
  50. #define CO1 %r15
  51. #define CO2 %rbx
  52. #define KK %rbp
  53. #define BB %r12
  54. #ifndef WINDOWS_ABI
  55. #define STACKSIZE 128
  56. #define OLD_LDC 8 + STACKSIZE(%rsp)
  57. #define OLD_OFFSET 16 + STACKSIZE(%rsp)
  58. #define OFFSET 48(%rsp)
  59. #define J 56(%rsp)
  60. #define KKK 64(%rsp)
  61. #define AORIG 72(%rsp)
  62. #else
  63. #define STACKSIZE 256
  64. #define OLD_A 40 + STACKSIZE(%rsp)
  65. #define OLD_B 48 + STACKSIZE(%rsp)
  66. #define OLD_C 56 + STACKSIZE(%rsp)
  67. #define OLD_LDC 64 + STACKSIZE(%rsp)
  68. #define OLD_OFFSET 72 + STACKSIZE(%rsp)
  69. #define OFFSET 224(%rsp)
  70. #define J 232(%rsp)
  71. #define KKK 240(%rsp)
  72. #define AORIG 248(%rsp)
  73. #endif
  74. #define PREFETCH prefetcht0
  75. #define PREFETCHSIZE (8 * 8 + 3)
  76. PROLOGUE
  77. PROFCODE
  78. subq $STACKSIZE, %rsp
  79. movq %rbx, 0(%rsp)
  80. movq %rbp, 8(%rsp)
  81. movq %r12, 16(%rsp)
  82. movq %r13, 24(%rsp)
  83. movq %r14, 32(%rsp)
  84. movq %r15, 40(%rsp)
  85. #ifdef WINDOWS_ABI
  86. movq %rdi, 48(%rsp)
  87. movq %rsi, 56(%rsp)
  88. movups %xmm6, 64(%rsp)
  89. movups %xmm7, 80(%rsp)
  90. movups %xmm8, 96(%rsp)
  91. movups %xmm9, 112(%rsp)
  92. movups %xmm10, 128(%rsp)
  93. movups %xmm11, 144(%rsp)
  94. movups %xmm12, 160(%rsp)
  95. movups %xmm13, 176(%rsp)
  96. movups %xmm14, 192(%rsp)
  97. movups %xmm15, 208(%rsp)
  98. movq ARG1, M
  99. movq ARG2, N
  100. movq ARG3, K
  101. movq OLD_A, A
  102. movq OLD_B, B
  103. movq OLD_C, C
  104. #endif
  105. movq OLD_LDC, LDC
  106. movq OLD_OFFSET, KK
  107. movq KK, OFFSET
  108. leaq (, LDC, SIZE), LDC
  109. #ifdef LN
  110. leaq (, M, SIZE), %rax
  111. addq %rax, C
  112. imulq K, %rax
  113. addq %rax, A
  114. #endif
  115. #ifdef RT
  116. leaq (, N, SIZE), %rax
  117. imulq K, %rax
  118. addq %rax, B
  119. movq N, %rax
  120. imulq LDC, %rax
  121. addq %rax, C
  122. #endif
  123. #ifdef RN
  124. negq KK
  125. #endif
  126. #ifdef RT
  127. movq N, %rax
  128. subq OFFSET, %rax
  129. movq %rax, KK
  130. #endif
  131. movq N, J
  132. sarq $1, J
  133. jle .L40
  134. ALIGN_4
  135. .L10:
  136. #if defined(LT) || defined(RN)
  137. movq A, AO
  138. #else
  139. movq A, AORIG
  140. #endif
  141. #ifdef RT
  142. movq K, %rax
  143. salq $1 + BASE_SHIFT, %rax
  144. subq %rax, B
  145. leaq (, LDC, 2), %rax
  146. subq %rax, C
  147. #endif
  148. movq C, CO1
  149. leaq (C, LDC, 1), CO2
  150. #ifndef RT
  151. leaq (C, LDC, 2), C
  152. #endif
  153. #ifdef LN
  154. movq OFFSET, %rax
  155. addq M, %rax
  156. movq %rax, KK
  157. #endif
  158. movq K, %rax
  159. salq $BASE_SHIFT + 1, %rax
  160. leaq (B, %rax), BB
  161. #ifdef LT
  162. movq OFFSET, %rax
  163. movq %rax, KK
  164. #endif
  165. testq $1, M
  166. je .L20
  167. #ifdef LN
  168. movq K, %rax
  169. salq $0 + BASE_SHIFT, %rax
  170. subq %rax, AORIG
  171. #endif
  172. #if defined(LN) || defined(RT)
  173. movq KK, %rax
  174. leaq (, %rax, SIZE), %rax
  175. movq AORIG, AO
  176. leaq (AO, %rax, 1), AO
  177. leaq (B, %rax, 2), BO
  178. #else
  179. movq B, BO
  180. #endif
  181. movsd 0 * SIZE(AO), %xmm0
  182. xorps %xmm7, %xmm7
  183. movsd 1 * SIZE(AO), %xmm2
  184. xorps %xmm5, %xmm5
  185. movsd 0 * SIZE(BO), %xmm1
  186. xorps %xmm8, %xmm8
  187. xorps %xmm9, %xmm9
  188. movsd 1 * SIZE(BO), %xmm3
  189. #if defined(LT) || defined(RN)
  190. movq KK, %rax
  191. #else
  192. movq K, %rax
  193. subq KK, %rax
  194. #endif
  195. sarq $2, %rax
  196. je .L35
  197. ALIGN_4
  198. .L32:
  199. addsd %xmm5, %xmm8
  200. movsd 2 * SIZE(BO), %xmm5
  201. mulsd %xmm0, %xmm1
  202. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  203. addsd %xmm7, %xmm9
  204. movsd 3 * SIZE(BO), %xmm7
  205. mulsd %xmm0, %xmm3
  206. movsd 2 * SIZE(AO), %xmm0
  207. addsd %xmm1, %xmm8
  208. movsd 4 * SIZE(BO), %xmm1
  209. mulsd %xmm2, %xmm5
  210. addsd %xmm3, %xmm9
  211. movsd 5 * SIZE(BO), %xmm3
  212. mulsd %xmm2, %xmm7
  213. movsd 3 * SIZE(AO), %xmm2
  214. addsd %xmm5, %xmm8
  215. movsd 6 * SIZE(BO), %xmm5
  216. mulsd %xmm0, %xmm1
  217. addsd %xmm7, %xmm9
  218. movsd 7 * SIZE(BO), %xmm7
  219. mulsd %xmm0, %xmm3
  220. movsd 4 * SIZE(AO), %xmm0
  221. addsd %xmm1, %xmm8
  222. movsd 8 * SIZE(BO), %xmm1
  223. mulsd %xmm2, %xmm5
  224. addsd %xmm3, %xmm9
  225. movsd 9 * SIZE(BO), %xmm3
  226. mulsd %xmm2, %xmm7
  227. movsd 5 * SIZE(AO), %xmm2
  228. addq $4 * SIZE, AO
  229. addq $8 * SIZE, BO
  230. decq %rax
  231. jne .L32
  232. ALIGN_4
  233. .L35:
  234. #if defined(LT) || defined(RN)
  235. movq KK, %rax
  236. #else
  237. movq K, %rax
  238. subq KK, %rax
  239. #endif
  240. addsd %xmm5, %xmm8
  241. addsd %xmm7, %xmm9
  242. andq $3, %rax
  243. BRANCH
  244. BRANCH
  245. je .L38
  246. ALIGN_4
  247. .L36:
  248. mulsd %xmm0, %xmm1
  249. addq $2 * SIZE, BO
  250. mulsd %xmm0, %xmm3
  251. movsd 1 * SIZE(AO), %xmm0
  252. addsd %xmm1, %xmm8
  253. movsd 0 * SIZE(BO), %xmm1
  254. addsd %xmm3, %xmm9
  255. movsd 1 * SIZE(BO), %xmm3
  256. addq $1 * SIZE, AO
  257. decq %rax
  258. BRANCH
  259. jg .L36
  260. ALIGN_4
  261. .L38:
  262. #if defined(LN) || defined(RT)
  263. movq KK, %rax
  264. #ifdef LN
  265. subq $1, %rax
  266. #else
  267. subq $2, %rax
  268. #endif
  269. leaq (, %rax, SIZE), %rax
  270. movq AORIG, AO
  271. leaq (AO, %rax, 1), AO
  272. leaq (B, %rax, 2), BO
  273. #endif
  274. #if defined(LN) || defined(LT)
  275. movsd 0 * SIZE(BO), %xmm0
  276. movsd 1 * SIZE(BO), %xmm1
  277. subsd %xmm8, %xmm0
  278. subsd %xmm9, %xmm1
  279. #else
  280. movsd 0 * SIZE(AO), %xmm0
  281. movsd 1 * SIZE(AO), %xmm1
  282. subsd %xmm8, %xmm0
  283. subsd %xmm9, %xmm1
  284. #endif
  285. #if defined(LN) || defined(LT)
  286. movsd 0 * SIZE(AO), %xmm8
  287. mulsd %xmm8, %xmm0
  288. mulsd %xmm8, %xmm1
  289. #endif
  290. #ifdef RN
  291. movsd 0 * SIZE(BO), %xmm8
  292. mulsd %xmm8, %xmm0
  293. movsd 1 * SIZE(BO), %xmm9
  294. mulsd %xmm0, %xmm9
  295. movsd 3 * SIZE(BO), %xmm13
  296. subsd %xmm9, %xmm1
  297. mulsd %xmm13, %xmm1
  298. #endif
  299. #ifdef RT
  300. movsd 3 * SIZE(BO), %xmm8
  301. mulsd %xmm8, %xmm1
  302. movsd 2 * SIZE(BO), %xmm9
  303. mulsd %xmm1, %xmm9
  304. movsd 0 * SIZE(BO), %xmm13
  305. subsd %xmm9, %xmm0
  306. mulsd %xmm13, %xmm0
  307. #endif
  308. #ifdef LN
  309. subq $1 * SIZE, CO1
  310. subq $1 * SIZE, CO2
  311. #endif
  312. movsd %xmm0, 0 * SIZE(CO1)
  313. movsd %xmm1, 0 * SIZE(CO2)
  314. #if defined(LN) || defined(LT)
  315. movsd %xmm0, 0 * SIZE(BO)
  316. movsd %xmm1, 1 * SIZE(BO)
  317. #else
  318. movsd %xmm0, 0 * SIZE(AO)
  319. movsd %xmm1, 1 * SIZE(AO)
  320. #endif
  321. #ifndef LN
  322. addq $1 * SIZE, CO1
  323. addq $1 * SIZE, CO2
  324. #endif
  325. #if defined(LT) || defined(RN)
  326. movq K, %rax
  327. subq KK, %rax
  328. leaq (,%rax, SIZE), %rax
  329. leaq (AO, %rax, 1), AO
  330. leaq (BO, %rax, 2), BO
  331. #endif
  332. #ifdef LN
  333. subq $1, KK
  334. #endif
  335. #ifdef LT
  336. addq $1, KK
  337. #endif
  338. #ifdef RT
  339. movq K, %rax
  340. salq $0 + BASE_SHIFT, %rax
  341. addq %rax, AORIG
  342. #endif
  343. ALIGN_4
  344. .L20:
  345. testq $2, M
  346. BRANCH
  347. je .L30
  348. #ifdef LN
  349. movq K, %rax
  350. salq $1 + BASE_SHIFT, %rax
  351. subq %rax, AORIG
  352. #endif
  353. #if defined(LN) || defined(RT)
  354. movq KK, %rax
  355. leaq (, %rax, SIZE), %rax
  356. movq AORIG, AO
  357. leaq (AO, %rax, 2), AO
  358. leaq (B, %rax, 2), BO
  359. #else
  360. movq B, BO
  361. #endif
  362. movsd 0 * SIZE(AO), %xmm0
  363. xorps %xmm2, %xmm2
  364. movsd 1 * SIZE(AO), %xmm4
  365. xorps %xmm5, %xmm5
  366. movsd 2 * SIZE(AO), %xmm5
  367. xorps %xmm6, %xmm6
  368. movsd 3 * SIZE(AO), %xmm7
  369. movsd 0 * SIZE(BO), %xmm1
  370. xorps %xmm8, %xmm8
  371. xorps %xmm9, %xmm9
  372. movsd 1 * SIZE(BO), %xmm3
  373. xorps %xmm10, %xmm10
  374. xorps %xmm11, %xmm11
  375. #if defined(LT) || defined(RN)
  376. movq KK, %rax
  377. #else
  378. movq K, %rax
  379. subq KK, %rax
  380. #endif
  381. sarq $2, %rax
  382. je .L25
  383. ALIGN_4
  384. .L22:
  385. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  386. addsd %xmm2, %xmm9
  387. movaps %xmm0, %xmm2
  388. mulsd %xmm1, %xmm0
  389. addsd %xmm6, %xmm11
  390. movaps %xmm4, %xmm6
  391. mulsd %xmm1, %xmm4
  392. movsd 2 * SIZE(BO), %xmm1
  393. addsd %xmm0, %xmm8
  394. movsd 4 * SIZE(AO), %xmm0
  395. mulsd %xmm3, %xmm2
  396. addsd %xmm4, %xmm10
  397. movsd 5 * SIZE(AO), %xmm4
  398. mulsd %xmm3, %xmm6
  399. movsd 3 * SIZE(BO), %xmm3
  400. addsd %xmm2, %xmm9
  401. movaps %xmm5, %xmm2
  402. mulsd %xmm1, %xmm5
  403. addsd %xmm6, %xmm11
  404. movaps %xmm7, %xmm6
  405. mulsd %xmm1, %xmm7
  406. movsd 4 * SIZE(BO), %xmm1
  407. addsd %xmm5, %xmm8
  408. movsd 6 * SIZE(AO), %xmm5
  409. mulsd %xmm3, %xmm2
  410. addsd %xmm7, %xmm10
  411. movsd 7 * SIZE(AO), %xmm7
  412. mulsd %xmm3, %xmm6
  413. movsd 5 * SIZE(BO), %xmm3
  414. addsd %xmm2, %xmm9
  415. movaps %xmm0, %xmm2
  416. mulsd %xmm1, %xmm0
  417. addsd %xmm6, %xmm11
  418. movaps %xmm4, %xmm6
  419. mulsd %xmm1, %xmm4
  420. movsd 6 * SIZE(BO), %xmm1
  421. addsd %xmm0, %xmm8
  422. movsd 8 * SIZE(AO), %xmm0
  423. mulsd %xmm3, %xmm2
  424. addsd %xmm4, %xmm10
  425. movsd 9 * SIZE(AO), %xmm4
  426. mulsd %xmm3, %xmm6
  427. movsd 7 * SIZE(BO), %xmm3
  428. addsd %xmm2, %xmm9
  429. movaps %xmm5, %xmm2
  430. mulsd %xmm1, %xmm5
  431. addsd %xmm6, %xmm11
  432. movaps %xmm7, %xmm6
  433. mulsd %xmm1, %xmm7
  434. movsd 8 * SIZE(BO), %xmm1
  435. addsd %xmm5, %xmm8
  436. movsd 10 * SIZE(AO), %xmm5
  437. mulsd %xmm3, %xmm2
  438. addsd %xmm7, %xmm10
  439. movsd 11 * SIZE(AO), %xmm7
  440. mulsd %xmm3, %xmm6
  441. movsd 9 * SIZE(BO), %xmm3
  442. addq $8 * SIZE, AO
  443. addq $8 * SIZE, BO
  444. decq %rax
  445. jne .L22
  446. ALIGN_4
  447. .L25:
  448. #if defined(LT) || defined(RN)
  449. movq KK, %rax
  450. #else
  451. movq K, %rax
  452. subq KK, %rax
  453. #endif
  454. andq $3, %rax
  455. BRANCH
  456. je .L29
  457. ALIGN_4
  458. .L26:
  459. addsd %xmm2, %xmm9
  460. movaps %xmm0, %xmm2
  461. mulsd %xmm1, %xmm0
  462. addsd %xmm6, %xmm11
  463. movaps %xmm4, %xmm6
  464. mulsd %xmm1, %xmm4
  465. movsd 2 * SIZE(BO), %xmm1
  466. mulsd %xmm3, %xmm2
  467. addsd %xmm0, %xmm8
  468. movsd 2 * SIZE(AO), %xmm0
  469. mulsd %xmm3, %xmm6
  470. movsd 3 * SIZE(BO), %xmm3
  471. addsd %xmm4, %xmm10
  472. movsd 3 * SIZE(AO), %xmm4
  473. addq $2 * SIZE, AO
  474. addq $2 * SIZE, BO
  475. decq %rax
  476. BRANCH
  477. jg .L26
  478. ALIGN_4
  479. .L29:
  480. addsd %xmm2, %xmm9
  481. addsd %xmm6, %xmm11
  482. #if defined(LN) || defined(RT)
  483. movq KK, %rax
  484. #ifdef LN
  485. subq $2, %rax
  486. #else
  487. subq $2, %rax
  488. #endif
  489. leaq (, %rax, SIZE), %rax
  490. movq AORIG, AO
  491. leaq (AO, %rax, 2), AO
  492. leaq (B, %rax, 2), BO
  493. #endif
  494. #if defined(LN) || defined(LT)
  495. movsd 0 * SIZE(BO), %xmm0
  496. movsd 1 * SIZE(BO), %xmm1
  497. movsd 2 * SIZE(BO), %xmm2
  498. movsd 3 * SIZE(BO), %xmm3
  499. subsd %xmm8, %xmm0
  500. subsd %xmm9, %xmm1
  501. subsd %xmm10, %xmm2
  502. subsd %xmm11, %xmm3
  503. #else
  504. movsd 0 * SIZE(AO), %xmm0
  505. movsd 1 * SIZE(AO), %xmm2
  506. movsd 2 * SIZE(AO), %xmm1
  507. movsd 3 * SIZE(AO), %xmm3
  508. subsd %xmm8, %xmm0
  509. subsd %xmm10, %xmm2
  510. subsd %xmm9, %xmm1
  511. subsd %xmm11, %xmm3
  512. #endif
  513. #ifdef LN
  514. movsd 3 * SIZE(AO), %xmm8
  515. mulsd %xmm8, %xmm2
  516. movsd 2 * SIZE(AO), %xmm9
  517. mulsd %xmm8, %xmm3
  518. movsd 0 * SIZE(AO), %xmm13
  519. movaps %xmm9, %xmm10
  520. mulsd %xmm2, %xmm9
  521. mulsd %xmm3, %xmm10
  522. subsd %xmm9, %xmm0
  523. subsd %xmm10, %xmm1
  524. mulsd %xmm13, %xmm0
  525. mulsd %xmm13, %xmm1
  526. #endif
  527. #ifdef LT
  528. movsd 0 * SIZE(AO), %xmm8
  529. mulsd %xmm8, %xmm0
  530. movsd 1 * SIZE(AO), %xmm9
  531. mulsd %xmm8, %xmm1
  532. movsd 3 * SIZE(AO), %xmm13
  533. movaps %xmm9, %xmm10
  534. mulsd %xmm0, %xmm9
  535. mulsd %xmm1, %xmm10
  536. subsd %xmm9, %xmm2
  537. subsd %xmm10, %xmm3
  538. mulsd %xmm13, %xmm2
  539. mulsd %xmm13, %xmm3
  540. #endif
  541. #ifdef RN
  542. movsd 0 * SIZE(BO), %xmm8
  543. mulsd %xmm8, %xmm0
  544. movsd 1 * SIZE(BO), %xmm9
  545. mulsd %xmm8, %xmm2
  546. movsd 3 * SIZE(BO), %xmm13
  547. movaps %xmm9, %xmm10
  548. mulsd %xmm0, %xmm9
  549. mulsd %xmm2, %xmm10
  550. subsd %xmm9, %xmm1
  551. subsd %xmm10, %xmm3
  552. mulsd %xmm13, %xmm1
  553. mulsd %xmm13, %xmm3
  554. #endif
  555. #ifdef RT
  556. movsd 3 * SIZE(BO), %xmm8
  557. mulsd %xmm8, %xmm1
  558. movsd 2 * SIZE(BO), %xmm9
  559. mulsd %xmm8, %xmm3
  560. movsd 0 * SIZE(BO), %xmm13
  561. movaps %xmm9, %xmm10
  562. mulsd %xmm1, %xmm9
  563. mulsd %xmm3, %xmm10
  564. subsd %xmm9, %xmm0
  565. subsd %xmm10, %xmm2
  566. mulsd %xmm13, %xmm0
  567. mulsd %xmm13, %xmm2
  568. #endif
  569. #ifdef LN
  570. subq $2 * SIZE, CO1
  571. subq $2 * SIZE, CO2
  572. #endif
  573. movsd %xmm0, 0 * SIZE(CO1)
  574. movsd %xmm2, 1 * SIZE(CO1)
  575. movsd %xmm1, 0 * SIZE(CO2)
  576. movsd %xmm3, 1 * SIZE(CO2)
  577. #if defined(LN) || defined(LT)
  578. movsd %xmm0, 0 * SIZE(BO)
  579. movsd %xmm1, 1 * SIZE(BO)
  580. movsd %xmm2, 2 * SIZE(BO)
  581. movsd %xmm3, 3 * SIZE(BO)
  582. #else
  583. movsd %xmm0, 0 * SIZE(AO)
  584. movsd %xmm2, 1 * SIZE(AO)
  585. movsd %xmm1, 2 * SIZE(AO)
  586. movsd %xmm3, 3 * SIZE(AO)
  587. #endif
  588. #ifndef LN
  589. addq $2 * SIZE, CO1
  590. addq $2 * SIZE, CO2
  591. #endif
  592. #if defined(LT) || defined(RN)
  593. movq K, %rax
  594. subq KK, %rax
  595. leaq (,%rax, SIZE), %rax
  596. leaq (AO, %rax, 2), AO
  597. leaq (BO, %rax, 2), BO
  598. #endif
  599. #ifdef LN
  600. subq $2, KK
  601. #endif
  602. #ifdef LT
  603. addq $2, KK
  604. #endif
  605. #ifdef RT
  606. movq K, %rax
  607. salq $1 + BASE_SHIFT, %rax
  608. addq %rax, AORIG
  609. #endif
  610. ALIGN_4
  611. .L30:
  612. movq M, I
  613. sarq $2, I
  614. jle .L39
  615. ALIGN_4
  616. .L11:
  617. #ifdef LN
  618. movq K, %rax
  619. salq $2 + BASE_SHIFT, %rax
  620. subq %rax, AORIG
  621. #endif
  622. #if defined(LN) || defined(RT)
  623. movq KK, %rax
  624. leaq (, %rax, SIZE), %rax
  625. movq AORIG, AO
  626. leaq (AO, %rax, 4), AO
  627. leaq (B, %rax, 2), BO
  628. #else
  629. movq B, BO
  630. #endif
  631. prefetcht0 0 * SIZE(BB)
  632. subq $-8 * SIZE, BB
  633. movsd 0 * SIZE(AO), %xmm0
  634. xorps %xmm2, %xmm2
  635. movsd 1 * SIZE(AO), %xmm4
  636. xorps %xmm5, %xmm5
  637. movsd 2 * SIZE(AO), %xmm5
  638. xorps %xmm6, %xmm6
  639. xorps %xmm7, %xmm7
  640. movsd 0 * SIZE(BO), %xmm1
  641. xorps %xmm8, %xmm8
  642. xorps %xmm9, %xmm9
  643. movsd 1 * SIZE(BO), %xmm3
  644. xorps %xmm10, %xmm10
  645. xorps %xmm11, %xmm11
  646. prefetcht0 3 * SIZE(CO1)
  647. xorps %xmm12, %xmm12
  648. xorps %xmm13, %xmm13
  649. prefetcht0 3 * SIZE(CO2)
  650. xorps %xmm14, %xmm14
  651. xorps %xmm15, %xmm15
  652. #if defined(LT) || defined(RN)
  653. movq KK, %rax
  654. #else
  655. movq K, %rax
  656. subq KK, %rax
  657. #endif
  658. sarq $2, %rax
  659. je .L15
  660. ALIGN_4
  661. .L12:
  662. addsd %xmm2, %xmm13
  663. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  664. movaps %xmm0, %xmm2
  665. mulsd %xmm1, %xmm0
  666. addsd %xmm7, %xmm14
  667. movsd 3 * SIZE(AO), %xmm7
  668. mulsd %xmm3, %xmm2
  669. addsd %xmm6, %xmm15
  670. PREFETCH (PREFETCHSIZE + 0) * SIZE(BO)
  671. movaps %xmm4, %xmm6
  672. mulsd %xmm1, %xmm4
  673. addsd %xmm0, %xmm8
  674. movsd 4 * SIZE(AO), %xmm0
  675. mulsd %xmm3, %xmm6
  676. addsd %xmm2, %xmm9
  677. movaps %xmm5, %xmm2
  678. mulsd %xmm1, %xmm5
  679. addsd %xmm4, %xmm10
  680. movsd 5 * SIZE(AO), %xmm4
  681. mulsd %xmm3, %xmm2
  682. addsd %xmm6, %xmm11
  683. movaps %xmm7, %xmm6
  684. mulsd %xmm1, %xmm7
  685. movsd 2 * SIZE(BO), %xmm1
  686. addsd %xmm5, %xmm12
  687. movsd 6 * SIZE(AO), %xmm5
  688. mulsd %xmm3, %xmm6
  689. movsd 3 * SIZE(BO), %xmm3
  690. addsd %xmm2, %xmm13
  691. movaps %xmm0, %xmm2
  692. mulsd %xmm1, %xmm0
  693. addsd %xmm7, %xmm14
  694. movsd 7 * SIZE(AO), %xmm7
  695. mulsd %xmm3, %xmm2
  696. addsd %xmm6, %xmm15
  697. movaps %xmm4, %xmm6
  698. mulsd %xmm1, %xmm4
  699. addsd %xmm0, %xmm8
  700. movsd 8 * SIZE(AO), %xmm0
  701. mulsd %xmm3, %xmm6
  702. addsd %xmm2, %xmm9
  703. movaps %xmm5, %xmm2
  704. mulsd %xmm1, %xmm5
  705. addsd %xmm4, %xmm10
  706. movsd 9 * SIZE(AO), %xmm4
  707. mulsd %xmm3, %xmm2
  708. addsd %xmm6, %xmm11
  709. movaps %xmm7, %xmm6
  710. mulsd %xmm1, %xmm7
  711. movsd 4 * SIZE(BO), %xmm1
  712. addsd %xmm5, %xmm12
  713. movsd 10 * SIZE(AO), %xmm5
  714. mulsd %xmm3, %xmm6
  715. movsd 5 * SIZE(BO), %xmm3
  716. addsd %xmm2, %xmm13
  717. PREFETCH (PREFETCHSIZE + 8) * SIZE(AO)
  718. movaps %xmm0, %xmm2
  719. mulsd %xmm1, %xmm0
  720. addsd %xmm7, %xmm14
  721. movsd 11 * SIZE(AO), %xmm7
  722. mulsd %xmm3, %xmm2
  723. addsd %xmm6, %xmm15
  724. movaps %xmm4, %xmm6
  725. mulsd %xmm1, %xmm4
  726. addsd %xmm0, %xmm8
  727. movsd 12 * SIZE(AO), %xmm0
  728. mulsd %xmm3, %xmm6
  729. addsd %xmm2, %xmm9
  730. movaps %xmm5, %xmm2
  731. mulsd %xmm1, %xmm5
  732. addsd %xmm4, %xmm10
  733. movsd 13 * SIZE(AO), %xmm4
  734. mulsd %xmm3, %xmm2
  735. addsd %xmm6, %xmm11
  736. movaps %xmm7, %xmm6
  737. mulsd %xmm1, %xmm7
  738. movsd 6 * SIZE(BO), %xmm1
  739. addsd %xmm5, %xmm12
  740. movsd 14 * SIZE(AO), %xmm5
  741. mulsd %xmm3, %xmm6
  742. movsd 7 * SIZE(BO), %xmm3
  743. addsd %xmm2, %xmm13
  744. movaps %xmm0, %xmm2
  745. mulsd %xmm1, %xmm0
  746. addsd %xmm7, %xmm14
  747. movsd 15 * SIZE(AO), %xmm7
  748. mulsd %xmm3, %xmm2
  749. subq $-16 * SIZE, AO
  750. addsd %xmm6, %xmm15
  751. movaps %xmm4, %xmm6
  752. mulsd %xmm1, %xmm4
  753. addsd %xmm0, %xmm8
  754. movsd 0 * SIZE(AO), %xmm0
  755. mulsd %xmm3, %xmm6
  756. addsd %xmm2, %xmm9
  757. movaps %xmm5, %xmm2
  758. mulsd %xmm1, %xmm5
  759. addq $ 8 * SIZE, BO
  760. addsd %xmm4, %xmm10
  761. movsd 1 * SIZE(AO), %xmm4
  762. mulsd %xmm3, %xmm2
  763. decq %rax
  764. addsd %xmm6, %xmm11
  765. movaps %xmm7, %xmm6
  766. mulsd %xmm1, %xmm7
  767. movsd 0 * SIZE(BO), %xmm1
  768. addsd %xmm5, %xmm12
  769. movsd 2 * SIZE(AO), %xmm5
  770. mulsd %xmm3, %xmm6
  771. movsd 1 * SIZE(BO), %xmm3
  772. jne .L12
  773. ALIGN_4
  774. .L15:
  775. #if defined(LT) || defined(RN)
  776. movq KK, %rax
  777. #else
  778. movq K, %rax
  779. subq KK, %rax
  780. #endif
  781. andq $3, %rax
  782. BRANCH
  783. je .L19
  784. ALIGN_4
  785. .L16:
  786. addsd %xmm2, %xmm13
  787. movaps %xmm0, %xmm2
  788. mulsd %xmm1, %xmm0
  789. addsd %xmm7, %xmm14
  790. movsd 3 * SIZE(AO), %xmm7
  791. mulsd %xmm3, %xmm2
  792. addsd %xmm6, %xmm15
  793. movaps %xmm4, %xmm6
  794. mulsd %xmm1, %xmm4
  795. addsd %xmm0, %xmm8
  796. movsd 4 * SIZE(AO), %xmm0
  797. mulsd %xmm3, %xmm6
  798. addsd %xmm2, %xmm9
  799. movaps %xmm5, %xmm2
  800. mulsd %xmm1, %xmm5
  801. addsd %xmm4, %xmm10
  802. movsd 5 * SIZE(AO), %xmm4
  803. mulsd %xmm3, %xmm2
  804. addsd %xmm6, %xmm11
  805. movaps %xmm7, %xmm6
  806. mulsd %xmm1, %xmm7
  807. movsd 2 * SIZE(BO), %xmm1
  808. addsd %xmm5, %xmm12
  809. movsd 6 * SIZE(AO), %xmm5
  810. mulsd %xmm3, %xmm6
  811. movsd 3 * SIZE(BO), %xmm3
  812. addq $4 * SIZE, AO
  813. addq $2 * SIZE, BO
  814. decq %rax
  815. BRANCH
  816. jg .L16
  817. ALIGN_4
  818. .L19:
  819. addsd %xmm2, %xmm13
  820. addsd %xmm7, %xmm14
  821. addsd %xmm6, %xmm15
  822. #if defined(LN) || defined(RT)
  823. movq KK, %rax
  824. #ifdef LN
  825. subq $4, %rax
  826. #else
  827. subq $2, %rax
  828. #endif
  829. leaq (, %rax, SIZE), %rax
  830. movq AORIG, AO
  831. leaq (AO, %rax, 4), AO
  832. leaq (B, %rax, 2), BO
  833. #endif
  834. #if defined(LN) || defined(LT)
  835. movsd 0 * SIZE(BO), %xmm0
  836. movsd 1 * SIZE(BO), %xmm1
  837. movsd 2 * SIZE(BO), %xmm2
  838. movsd 3 * SIZE(BO), %xmm3
  839. movsd 4 * SIZE(BO), %xmm4
  840. movsd 5 * SIZE(BO), %xmm5
  841. movsd 6 * SIZE(BO), %xmm6
/* ---------------------------------------------------------------
 * 4x2 tile, solve phase.
 * On entry xmm8..xmm15 hold the accumulated GEMM partial products
 * for this tile.  Load the packed right-hand side (from BO for
 * left-side variants, from AO for right-side variants), subtract
 * the accumulators, then perform the small triangular solve for
 * whichever of LN/LT/RN/RT this file was compiled as.
 * NOTE(review): the solve multiplies by the diagonal elements
 * (mulsd, never divsd), so the packed triangular factor presumably
 * stores its diagonal pre-inverted — confirm against the packing
 * routine.
 * (The matching #if for the arm ending at "851. #else" opens before
 * this excerpt.) */
842. movsd 7 * SIZE(BO), %xmm7
843. subsd %xmm8, %xmm0
844. subsd %xmm9, %xmm1
845. subsd %xmm10, %xmm2
846. subsd %xmm11, %xmm3
847. subsd %xmm12, %xmm4
848. subsd %xmm13, %xmm5
849. subsd %xmm14, %xmm6
850. subsd %xmm15, %xmm7
851. #else
/* Right-side variants: RHS comes from the packed A buffer, with the
 * two columns interleaved (even regs = col 0, odd regs = col 1). */
852. movsd 0 * SIZE(AO), %xmm0
853. movsd 1 * SIZE(AO), %xmm2
854. movsd 2 * SIZE(AO), %xmm4
855. movsd 3 * SIZE(AO), %xmm6
856. movsd 4 * SIZE(AO), %xmm1
857. movsd 5 * SIZE(AO), %xmm3
858. movsd 6 * SIZE(AO), %xmm5
859. movsd 7 * SIZE(AO), %xmm7
860. subsd %xmm8, %xmm0
861. subsd %xmm10, %xmm2
862. subsd %xmm12, %xmm4
863. subsd %xmm14, %xmm6
864. subsd %xmm9, %xmm1
865. subsd %xmm11, %xmm3
866. subsd %xmm13, %xmm5
867. subsd %xmm15, %xmm7
868. #endif
/* LN: 4x4 lower-triangular backward substitution on a packed (and
 * apparently transposed) A tile, applied to both columns at once:
 * start from row 3 (offset 15 = last diagonal), eliminate upward.
 * xmm6/xmm7 = row3 of the two columns, xmm4/xmm5 = row2, etc. */
869. #ifdef LN
870. movsd 15 * SIZE(AO), %xmm8
871. mulsd %xmm8, %xmm6
872. movsd 14 * SIZE(AO), %xmm9
873. mulsd %xmm8, %xmm7
874. movsd 13 * SIZE(AO), %xmm11
875. movaps %xmm9, %xmm10
876. movsd 12 * SIZE(AO), %xmm13
877. mulsd %xmm6, %xmm9
878. movsd 10 * SIZE(AO), %xmm8
879. mulsd %xmm7, %xmm10
880. subsd %xmm9, %xmm4
881. movsd 9 * SIZE(AO), %xmm9
882. subsd %xmm10, %xmm5
883. movaps %xmm11, %xmm12
884. mulsd %xmm6, %xmm11
885. mulsd %xmm7, %xmm12
886. subsd %xmm11, %xmm2
887. movsd 8 * SIZE(AO), %xmm11
888. subsd %xmm12, %xmm3
889. movaps %xmm13, %xmm14
890. mulsd %xmm6, %xmm13
891. mulsd %xmm7, %xmm14
892. subsd %xmm13, %xmm0
893. subsd %xmm14, %xmm1
/* Row 2 solved (diagonal at offset 10), then eliminate into rows 1,0. */
894. mulsd %xmm8, %xmm4
895. mulsd %xmm8, %xmm5
896. movsd 5 * SIZE(AO), %xmm8
897. movaps %xmm9, %xmm10
898. mulsd %xmm4, %xmm9
899. mulsd %xmm5, %xmm10
900. subsd %xmm9, %xmm2
901. movsd 4 * SIZE(AO), %xmm9
902. subsd %xmm10, %xmm3
903. movaps %xmm11, %xmm12
904. mulsd %xmm4, %xmm11
905. mulsd %xmm5, %xmm12
906. subsd %xmm11, %xmm0
907. movsd 0 * SIZE(AO), %xmm11
908. subsd %xmm12, %xmm1
/* Row 1 (diagonal at offset 5), then row 0 (diagonal at offset 0). */
909. mulsd %xmm8, %xmm2
910. mulsd %xmm8, %xmm3
911. movaps %xmm9, %xmm10
912. mulsd %xmm2, %xmm9
913. mulsd %xmm3, %xmm10
914. subsd %xmm9, %xmm0
915. subsd %xmm10, %xmm1
916. mulsd %xmm11, %xmm0
917. mulsd %xmm11, %xmm1
918. #endif
/* LT: forward substitution, mirror of LN — start from row 0
 * (diagonal at offset 0), eliminate downward into rows 1..3. */
919. #ifdef LT
920. movsd 0 * SIZE(AO), %xmm8
921. mulsd %xmm8, %xmm0
922. movsd 1 * SIZE(AO), %xmm9
923. mulsd %xmm8, %xmm1
924. movsd 2 * SIZE(AO), %xmm11
925. movaps %xmm9, %xmm10
926. movsd 3 * SIZE(AO), %xmm13
927. mulsd %xmm0, %xmm9
928. movsd 5 * SIZE(AO), %xmm8
929. mulsd %xmm1, %xmm10
930. subsd %xmm9, %xmm2
931. movsd 6 * SIZE(AO), %xmm9
932. subsd %xmm10, %xmm3
933. movaps %xmm11, %xmm12
934. mulsd %xmm0, %xmm11
935. mulsd %xmm1, %xmm12
936. subsd %xmm11, %xmm4
937. movsd 7 * SIZE(AO), %xmm11
938. subsd %xmm12, %xmm5
939. movaps %xmm13, %xmm14
940. mulsd %xmm0, %xmm13
941. mulsd %xmm1, %xmm14
942. subsd %xmm13, %xmm6
943. subsd %xmm14, %xmm7
/* Row 1 solved (diagonal at offset 5), eliminate into rows 2,3. */
944. mulsd %xmm8, %xmm2
945. mulsd %xmm8, %xmm3
946. movsd 10 * SIZE(AO), %xmm8
947. movaps %xmm9, %xmm10
948. mulsd %xmm2, %xmm9
949. mulsd %xmm3, %xmm10
950. subsd %xmm9, %xmm4
951. movsd 11 * SIZE(AO), %xmm9
952. subsd %xmm10, %xmm5
953. movaps %xmm11, %xmm12
954. mulsd %xmm2, %xmm11
955. mulsd %xmm3, %xmm12
956. subsd %xmm11, %xmm6
957. subsd %xmm12, %xmm7
/* Rows 2 and 3 (diagonals at offsets 10 and 15). */
958. mulsd %xmm8, %xmm4
959. mulsd %xmm8, %xmm5
960. movsd 15 * SIZE(AO), %xmm8
961. movaps %xmm9, %xmm10
962. mulsd %xmm4, %xmm9
963. mulsd %xmm5, %xmm10
964. subsd %xmm9, %xmm6
965. subsd %xmm10, %xmm7
966. mulsd %xmm8, %xmm6
967. mulsd %xmm8, %xmm7
968. #endif
/* RN: 2x2 upper-triangular solve on the B factor, forward order:
 * col 0 (xmm0/2/4/6) first via B[0], eliminate into col 1 with B[1],
 * then scale col 1 by B[3]. */
969. #ifdef RN
970. movsd 0 * SIZE(BO), %xmm8
971. mulsd %xmm8, %xmm0
972. movsd 1 * SIZE(BO), %xmm9
973. mulsd %xmm8, %xmm2
974. movsd 3 * SIZE(BO), %xmm13
975. mulsd %xmm8, %xmm4
976. mulsd %xmm8, %xmm6
977. movaps %xmm9, %xmm10
978. movaps %xmm9, %xmm11
979. movaps %xmm9, %xmm12
980. mulsd %xmm0, %xmm9
981. mulsd %xmm2, %xmm10
982. mulsd %xmm4, %xmm11
983. mulsd %xmm6, %xmm12
984. subsd %xmm9, %xmm1
985. subsd %xmm10, %xmm3
986. subsd %xmm11, %xmm5
987. subsd %xmm12, %xmm7
988. mulsd %xmm13, %xmm1
989. mulsd %xmm13, %xmm3
990. mulsd %xmm13, %xmm5
991. mulsd %xmm13, %xmm7
992. #endif
/* RT: same 2x2 solve in reverse order — col 1 via B[3] first,
 * eliminate into col 0 with B[2], then scale col 0 by B[0]. */
993. #ifdef RT
994. movsd 3 * SIZE(BO), %xmm8
995. mulsd %xmm8, %xmm1
996. movsd 2 * SIZE(BO), %xmm9
997. mulsd %xmm8, %xmm3
998. movsd 0 * SIZE(BO), %xmm13
999. mulsd %xmm8, %xmm5
1000. mulsd %xmm8, %xmm7
1001. movaps %xmm9, %xmm10
1002. movaps %xmm9, %xmm11
1003. movaps %xmm9, %xmm12
1004. mulsd %xmm1, %xmm9
1005. mulsd %xmm3, %xmm10
1006. mulsd %xmm5, %xmm11
1007. mulsd %xmm7, %xmm12
1008. subsd %xmm9, %xmm0
1009. subsd %xmm10, %xmm2
1010. subsd %xmm11, %xmm4
1011. subsd %xmm12, %xmm6
1012. mulsd %xmm13, %xmm0
1013. mulsd %xmm13, %xmm2
1014. mulsd %xmm13, %xmm4
1015. mulsd %xmm13, %xmm6
1016. #endif
/* ---------------------------------------------------------------
 * 4x2 tile, store phase and loop bookkeeping.
 * Write the solved tile to C (both columns), write it back into the
 * packed buffer (BO for left variants, AO for right variants) so
 * later tiles see the updated values, then advance the C/KK/AORIG
 * bookkeeping for the chosen variant and close the M- and N-loops. */
1017. #ifdef LN
/* LN walks C right-to-left: step the column pointers back first. */
1018. subq $4 * SIZE, CO1
1019. subq $4 * SIZE, CO2
1020. #endif
1021. movsd %xmm0, 0 * SIZE(CO1)
1022. movsd %xmm2, 1 * SIZE(CO1)
1023. movsd %xmm4, 2 * SIZE(CO1)
1024. movsd %xmm6, 3 * SIZE(CO1)
1025. movsd %xmm1, 0 * SIZE(CO2)
1026. movsd %xmm3, 1 * SIZE(CO2)
1027. movsd %xmm5, 2 * SIZE(CO2)
1028. movsd %xmm7, 3 * SIZE(CO2)
/* Write the solution back into the packed operand it was read from
 * (layout matches the load order used in the solve phase above). */
1029. #if defined(LN) || defined(LT)
1030. movsd %xmm0, 0 * SIZE(BO)
1031. movsd %xmm1, 1 * SIZE(BO)
1032. movsd %xmm2, 2 * SIZE(BO)
1033. movsd %xmm3, 3 * SIZE(BO)
1034. movsd %xmm4, 4 * SIZE(BO)
1035. movsd %xmm5, 5 * SIZE(BO)
1036. movsd %xmm6, 6 * SIZE(BO)
1037. movsd %xmm7, 7 * SIZE(BO)
1038. #else
1039. movsd %xmm0, 0 * SIZE(AO)
1040. movsd %xmm2, 1 * SIZE(AO)
1041. movsd %xmm4, 2 * SIZE(AO)
1042. movsd %xmm6, 3 * SIZE(AO)
1043. movsd %xmm1, 4 * SIZE(AO)
1044. movsd %xmm3, 5 * SIZE(AO)
1045. movsd %xmm5, 6 * SIZE(AO)
1046. movsd %xmm7, 7 * SIZE(AO)
1047. #endif
1048. #ifndef LN
1049. addq $4 * SIZE, CO1
1050. addq $4 * SIZE, CO2
1051. #endif
/* Advance AO/BO past the K-KK remainder so both point at the next
 * packed panel (only needed for the forward-walking variants). */
1052. #if defined(LT) || defined(RN)
1053. movq K, %rax
1054. subq KK, %rax
1055. leaq (,%rax, SIZE), %rax
1056. leaq (AO, %rax, 4), AO
1057. leaq (BO, %rax, 2), BO
1058. #endif
1059. #ifdef LN
1060. subq $4, KK
1061. #endif
1062. #ifdef LT
1063. addq $4, KK
1064. #endif
1065. #ifdef RT
/* RT re-derives the A origin each tile: skip one full 4-wide panel. */
1066. movq K, %rax
1067. salq $2 + BASE_SHIFT, %rax
1068. addq %rax, AORIG
1069. #endif
1070. decq I # i --
1071. jg .L11
1072. ALIGN_4
/* End of the N=2 M-loop; update B/KK for the next pair of columns.
 * (.L11 / .L10 are defined before this excerpt.) */
1073. .L39:
1074. #ifdef LN
1075. leaq (, K, SIZE), %rax
1076. leaq (B, %rax, 2), B
1077. #endif
1078. #if defined(LT) || defined(RN)
1079. movq BO, B
1080. #endif
1081. #ifdef RN
1082. addq $2, KK
1083. #endif
1084. #ifdef RT
1085. subq $2, KK
1086. #endif
1087. decq J # j --
1088. jg .L10
1089. ALIGN_4
/* ---------------------------------------------------------------
 * .L40: remainder column path, N % 2 == 1 (single RHS column).
 * Re-initialises the per-column pointers (C, KK, A origin) and then
 * handles the M % 4 == 1 case (1x1 tile): a scalar dot product over
 * the K range followed by a trivial 1x1 "solve" (one multiply). */
1090. .L40:
1091. testq $1, N
1092. je .L999
1093. ALIGN_4
1094. #if defined(LT) || defined(RN)
1095. movq A, AO
1096. #else
1097. movq A, AORIG
1098. #endif
1099. #ifdef RT
/* RT walks B/C backwards: step B back one column panel, C one row. */
1100. movq K, %rax
1101. salq $0 + BASE_SHIFT, %rax
1102. subq %rax, B
1103. subq LDC, C
1104. #endif
1105. movq C, CO1
1106. #ifndef RT
1107. addq LDC, C
1108. #endif
1109. #ifdef LN
1110. movq OFFSET, %rax
1111. addq M, %rax
1112. movq %rax, KK
1113. #endif
1114. #ifdef LT
1115. movq OFFSET, %rax
1116. movq %rax, KK
1117. #endif
1118. testq $1, M
1119. je .L50
/* --- M%4 == 1: 1x1 micro-tile --- */
1120. #ifdef LN
1121. movq K, %rax
1122. salq $0 + BASE_SHIFT, %rax
1123. subq %rax, AORIG
1124. #endif
1125. #if defined(LN) || defined(RT)
1126. movq KK, %rax
1127. leaq (, %rax, SIZE), %rax
1128. movq AORIG, AO
1129. leaq (AO, %rax, 1), AO
1130. leaq (B, %rax, 1), BO
1131. #else
1132. movq B, BO
1133. #endif
/* Four independent products in flight (xmm1/3/5/7 sources feeding
 * accumulators xmm8/xmm9) to hide mulsd latency; xmm5/xmm7 also act
 * as delayed partial sums folded in after the loop. */
1134. movsd 0 * SIZE(AO), %xmm0
1135. xorps %xmm5, %xmm5
1136. movsd 1 * SIZE(AO), %xmm2
1137. xorps %xmm7, %xmm7
1138. movsd 0 * SIZE(BO), %xmm1
1139. xorps %xmm8, %xmm8
1140. movsd 1 * SIZE(BO), %xmm3
1141. xorps %xmm9, %xmm9
1142. movsd 2 * SIZE(AO), %xmm4
1143. movsd 3 * SIZE(AO), %xmm6
1144. #if defined(LT) || defined(RN)
1145. movq KK, %rax
1146. #else
1147. movq K, %rax
1148. subq KK, %rax
1149. #endif
/* Main loop: 4 iterations of k per pass. */
1150. sarq $2, %rax
1151. je .L65
1152. ALIGN_4
1153. .L62:
1154. addsd %xmm5, %xmm8
1155. movsd 2 * SIZE(BO), %xmm5
1156. mulsd %xmm0, %xmm1
1157. movsd 4 * SIZE(AO), %xmm0
1158. addsd %xmm7, %xmm9
1159. movsd 3 * SIZE(BO), %xmm7
1160. mulsd %xmm2, %xmm3
1161. movsd 5 * SIZE(AO), %xmm2
1162. addsd %xmm1, %xmm8
1163. movsd 4 * SIZE(BO), %xmm1
1164. mulsd %xmm4, %xmm5
1165. movsd 6 * SIZE(AO), %xmm4
1166. addsd %xmm3, %xmm9
1167. movsd 5 * SIZE(BO), %xmm3
1168. mulsd %xmm6, %xmm7
1169. movsd 7 * SIZE(AO), %xmm6
1170. addq $4 * SIZE, AO
1171. addq $4 * SIZE, BO
1172. decq %rax
1173. jne .L62
/* Fold in the two products still in flight when the loop exits. */
1174. addsd %xmm5, %xmm8
1175. addsd %xmm7, %xmm9
1176. ALIGN_4
/* K % 4 remainder, one product per iteration. */
1177. .L65:
1178. #if defined(LT) || defined(RN)
1179. movq KK, %rax
1180. #else
1181. movq K, %rax
1182. subq KK, %rax
1183. #endif
1184. andq $3, %rax
1185. BRANCH
1186. je .L68
1187. ALIGN_4
1188. .L66:
1189. movsd 0 * SIZE(AO), %xmm0
1190. movsd 0 * SIZE(BO), %xmm1
1191. mulsd %xmm0, %xmm1
1192. addsd %xmm1, %xmm8
1193. addq $1 * SIZE, AO
1194. addq $1 * SIZE, BO
1195. decq %rax
1196. BRANCH
1197. jg .L66
1198. ALIGN_4
/* 1x1 solve: xmm8 = full dot product; subtract from RHS, scale by
 * the (presumably pre-inverted — mulsd, not divsd) 1x1 diagonal. */
1199. .L68:
1200. addsd %xmm9, %xmm8
1201. #if defined(LN) || defined(RT)
1202. movq KK, %rax
/* NOTE(review): both arms of this #ifdef subtract 1 — the
 * conditional is redundant and could be a single subq $1, %rax. */
1203. #ifdef LN
1204. subq $1, %rax
1205. #else
1206. subq $1, %rax
1207. #endif
1208. leaq (, %rax, SIZE), %rax
1209. movq AORIG, AO
1210. leaq (AO, %rax, 1), AO
1211. leaq (B, %rax, 1), BO
1212. #endif
1213. #if defined(LN) || defined(LT)
1214. movsd 0 * SIZE(BO), %xmm0
1215. subsd %xmm8, %xmm0
1216. #else
1217. movsd 0 * SIZE(AO), %xmm0
1218. subsd %xmm8, %xmm0
1219. #endif
1220. #if defined(LN) || defined(LT)
1221. movsd 0 * SIZE(AO), %xmm8
1222. mulsd %xmm8, %xmm0
1223. #endif
1224. #if defined(RN) || defined(RT)
1225. movsd 0 * SIZE(BO), %xmm8
1226. mulsd %xmm8, %xmm0
1227. #endif
1228. #ifdef LN
1229. subq $1 * SIZE, CO1
1230. #endif
1231. movsd %xmm0, 0 * SIZE(CO1)
/* Write the solution back into the packed buffer it came from. */
1232. #if defined(LN) || defined(LT)
1233. movsd %xmm0, 0 * SIZE(BO)
1234. #else
1235. movsd %xmm0, 0 * SIZE(AO)
1236. #endif
1237. #ifndef LN
1238. addq $1 * SIZE, CO1
1239. #endif
1240. #if defined(LT) || defined(RN)
1241. movq K, %rax
1242. subq KK, %rax
1243. leaq (,%rax, SIZE), %rax
1244. leaq (AO, %rax, 1), AO
1245. leaq (BO, %rax, 1), BO
1246. #endif
1247. #ifdef LN
1248. subq $1, KK
1249. #endif
1250. #ifdef LT
1251. addq $1, KK
1252. #endif
1253. #ifdef RT
1254. movq K, %rax
1255. salq $0 + BASE_SHIFT, %rax
1256. addq %rax, AORIG
1257. #endif
1258. ALIGN_4
/* ---------------------------------------------------------------
 * .L50: single-column path, M % 4 == 2 (2x1 tile).
 * Two-row dot products over K (accumulators xmm8/xmm10, with
 * xmm2/xmm3 carrying in-flight products), then a 2x2 triangular
 * solve against A for LN/LT or a scalar scale against B for RN/RT. */
1259. .L50:
1260. testq $2, M
1261. je .L60
1262. #ifdef LN
1263. movq K, %rax
1264. salq $1 + BASE_SHIFT, %rax
1265. subq %rax, AORIG
1266. #endif
1267. #if defined(LN) || defined(RT)
1268. movq KK, %rax
1269. leaq (, %rax, SIZE), %rax
1270. movq AORIG, AO
1271. leaq (AO, %rax, 2), AO
1272. leaq (B, %rax, 1), BO
1273. #else
1274. movq B, BO
1275. #endif
1276. movsd 0 * SIZE(AO), %xmm0
1277. xorps %xmm2, %xmm2
1278. movsd 1 * SIZE(AO), %xmm1
1279. xorps %xmm3, %xmm3
1280. movsd 0 * SIZE(BO), %xmm4
1281. xorps %xmm8, %xmm8
1282. movsd 1 * SIZE(BO), %xmm5
1283. xorps %xmm10, %xmm10
1284. #if defined(LT) || defined(RN)
1285. movq KK, %rax
1286. #else
1287. movq K, %rax
1288. subq KK, %rax
1289. #endif
/* Main loop: 4 k-iterations per pass, software-pipelined so each
 * addsd consumes the product started on the previous round. */
1290. sarq $2, %rax
1291. je .L55
1292. ALIGN_4
1293. .L52:
1294. addsd %xmm2, %xmm8
1295. movsd 2 * SIZE(AO), %xmm2
1296. mulsd %xmm4, %xmm0
1297. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
1298. addsd %xmm3, %xmm10
1299. movsd 3 * SIZE(AO), %xmm3
1300. mulsd %xmm4, %xmm1
1301. movsd 2 * SIZE(BO), %xmm4
1302. addsd %xmm0, %xmm8
1303. movsd 4 * SIZE(AO), %xmm0
1304. mulsd %xmm5, %xmm2
1305. addq $8 * SIZE, AO
1306. addsd %xmm1, %xmm10
/* AO was already advanced, hence the negative offsets below. */
1307. movsd -3 * SIZE(AO), %xmm1
1308. mulsd %xmm5, %xmm3
1309. movsd 3 * SIZE(BO), %xmm5
1310. addsd %xmm2, %xmm8
1311. movsd -2 * SIZE(AO), %xmm2
1312. mulsd %xmm4, %xmm0
1313. addq $4 * SIZE, BO
1314. addsd %xmm3, %xmm10
1315. movsd -1 * SIZE(AO), %xmm3
1316. mulsd %xmm4, %xmm1
1317. movsd 0 * SIZE(BO), %xmm4
1318. addsd %xmm0, %xmm8
1319. movsd 0 * SIZE(AO), %xmm0
1320. mulsd %xmm5, %xmm2
1321. decq %rax
1322. addsd %xmm1, %xmm10
1323. movsd 1 * SIZE(AO), %xmm1
1324. mulsd %xmm5, %xmm3
1325. movsd 1 * SIZE(BO), %xmm5
1326. jne .L52
1327. ALIGN_4
/* Fold the in-flight products, then handle K % 4 one k at a time. */
1328. .L55:
1329. #if defined(LT) || defined(RN)
1330. movq KK, %rax
1331. #else
1332. movq K, %rax
1333. subq KK, %rax
1334. #endif
1335. addsd %xmm2, %xmm8
1336. addsd %xmm3, %xmm10
1337. andq $3, %rax
1338. BRANCH
1339. je .L59
1340. ALIGN_4
1341. .L56:
1342. mulsd %xmm4, %xmm0
1343. mulsd %xmm4, %xmm1
1344. movsd 1 * SIZE(BO), %xmm4
1345. addsd %xmm0, %xmm8
1346. movsd 2 * SIZE(AO), %xmm0
1347. addsd %xmm1, %xmm10
1348. movsd 3 * SIZE(AO), %xmm1
1349. addq $2 * SIZE, AO
1350. addq $1 * SIZE, BO
1351. decq %rax
1352. BRANCH
1353. jg .L56
1354. ALIGN_4
/* Solve phase: rewind AO/BO to the tile start, load RHS, subtract
 * the dot products, then the 2x2 (or 1x1) substitution. */
1355. .L59:
1356. #if defined(LN) || defined(RT)
1357. movq KK, %rax
1358. #ifdef LN
1359. subq $2, %rax
1360. #else
1361. subq $1, %rax
1362. #endif
1363. leaq (, %rax, SIZE), %rax
1364. movq AORIG, AO
1365. leaq (AO, %rax, 2), AO
1366. leaq (B, %rax, 1), BO
1367. #endif
1368. #if defined(LN) || defined(LT)
1369. movsd 0 * SIZE(BO), %xmm0
1370. movsd 1 * SIZE(BO), %xmm2
1371. subsd %xmm8, %xmm0
1372. subsd %xmm10, %xmm2
1373. #else
1374. movsd 0 * SIZE(AO), %xmm0
1375. movsd 1 * SIZE(AO), %xmm2
1376. subsd %xmm8, %xmm0
1377. subsd %xmm10, %xmm2
1378. #endif
/* LN: backward 2x2 substitution (row 1 first, diagonal at offset 3);
 * diagonals are multiplied, so presumably stored pre-inverted. */
1379. #ifdef LN
1380. movsd 3 * SIZE(AO), %xmm8
1381. movsd 2 * SIZE(AO), %xmm9
1382. movsd 0 * SIZE(AO), %xmm11
1383. mulsd %xmm8, %xmm2
1384. mulsd %xmm2, %xmm9
1385. subsd %xmm9, %xmm0
1386. mulsd %xmm11,%xmm0
1387. #endif
/* LT: forward 2x2 substitution (row 0 first, diagonal at offset 0). */
1388. #ifdef LT
1389. movsd 0 * SIZE(AO), %xmm8
1390. movsd 1 * SIZE(AO), %xmm9
1391. movsd 3 * SIZE(AO), %xmm11
1392. mulsd %xmm8, %xmm0
1393. mulsd %xmm0, %xmm9
1394. subsd %xmm9, %xmm2
1395. mulsd %xmm11,%xmm2
1396. #endif
/* RN/RT: single column on the right — just scale by B's 1x1 entry. */
1397. #if defined(RN) || defined(RT)
1398. movsd 0 * SIZE(BO), %xmm8
1399. mulsd %xmm8, %xmm0
1400. mulsd %xmm8, %xmm2
1401. #endif
1402. #ifdef LN
1403. subq $2 * SIZE, CO1
1404. #endif
1405. movsd %xmm0, 0 * SIZE(CO1)
1406. movsd %xmm2, 1 * SIZE(CO1)
1407. #if defined(LN) || defined(LT)
1408. movsd %xmm0, 0 * SIZE(BO)
1409. movsd %xmm2, 1 * SIZE(BO)
1410. #else
1411. movsd %xmm0, 0 * SIZE(AO)
1412. movsd %xmm2, 1 * SIZE(AO)
1413. #endif
1414. #ifndef LN
1415. addq $2 * SIZE, CO1
1416. #endif
1417. #if defined(LT) || defined(RN)
1418. movq K, %rax
1419. subq KK, %rax
1420. leaq (,%rax, SIZE), %rax
1421. leaq (AO, %rax, 2), AO
1422. leaq (BO, %rax, 1), BO
1423. #endif
1424. #ifdef LN
1425. subq $2, KK
1426. #endif
1427. #ifdef LT
1428. addq $2, KK
1429. #endif
1430. #ifdef RT
1431. movq K, %rax
1432. salq $1 + BASE_SHIFT, %rax
1433. addq %rax, AORIG
1434. #endif
1435. ALIGN_4
/* ---------------------------------------------------------------
 * .L60/.L41: single-column path, main M-loop (4x1 tiles).
 * Four-row dot products over K with four accumulators
 * (xmm8/10/12/14) plus four in-flight product registers
 * (xmm9/11/13/15), then a 4x4 triangular solve (LN backward,
 * LT forward) or a scalar scale (RN/RT). */
1436. .L60:
1437. movq M, I
1438. sarq $2, I
1439. jle .L69
1440. ALIGN_4
1441. .L41:
1442. #ifdef LN
1443. movq K, %rax
1444. salq $2 + BASE_SHIFT, %rax
1445. subq %rax, AORIG
1446. #endif
1447. #if defined(LN) || defined(RT)
1448. movq KK, %rax
1449. leaq (, %rax, SIZE), %rax
1450. movq AORIG, AO
1451. leaq (AO, %rax, 4), AO
1452. leaq (B, %rax, 1), BO
1453. #else
1454. movq B, BO
1455. #endif
/* Prime the pipeline: first 4 A values, first 2 B values, zero all
 * eight accumulator/in-flight registers, prefetch the C line. */
1456. movsd 0 * SIZE(AO), %xmm0
1457. xorps %xmm9, %xmm9
1458. movsd 1 * SIZE(AO), %xmm1
1459. xorps %xmm11, %xmm11
1460. movsd 2 * SIZE(AO), %xmm2
1461. xorps %xmm13, %xmm13
1462. movsd 3 * SIZE(AO), %xmm3
1463. xorps %xmm15, %xmm15
1464. movsd 0 * SIZE(BO), %xmm4
1465. xorps %xmm8, %xmm8
1466. movsd 1 * SIZE(BO), %xmm5
1467. xorps %xmm10, %xmm10
1468. prefetcht0 3 * SIZE(CO1)
1469. xorps %xmm12, %xmm12
1470. xorps %xmm14, %xmm14
1471. #if defined(LT) || defined(RN)
1472. movq KK, %rax
1473. #else
1474. movq K, %rax
1475. subq KK, %rax
1476. #endif
/* Main loop: 4 k-iterations (16 A values, 4 B values) per pass. */
1477. sarq $2, %rax
1478. je .L45
1479. ALIGN_4
1480. .L42:
1481. addsd %xmm9, %xmm8
1482. movsd 4 * SIZE(AO), %xmm9
1483. mulsd %xmm4, %xmm0
1484. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
1485. addsd %xmm11, %xmm10
1486. movsd 5 * SIZE(AO), %xmm11
1487. mulsd %xmm4, %xmm1
1488. addsd %xmm13, %xmm12
1489. movsd 6 * SIZE(AO), %xmm13
1490. mulsd %xmm4, %xmm2
1491. addsd %xmm15, %xmm14
1492. movsd 7 * SIZE(AO), %xmm15
1493. mulsd %xmm4, %xmm3
1494. movsd 2 * SIZE(BO), %xmm4
1495. addsd %xmm0, %xmm8
1496. movsd 8 * SIZE(AO), %xmm0
1497. mulsd %xmm5, %xmm9
1498. addsd %xmm1, %xmm10
1499. movsd 9 * SIZE(AO), %xmm1
1500. mulsd %xmm5, %xmm11
1501. addsd %xmm2, %xmm12
1502. movsd 10 * SIZE(AO), %xmm2
1503. mulsd %xmm5, %xmm13
1504. addsd %xmm3, %xmm14
1505. movsd 11 * SIZE(AO), %xmm3
1506. mulsd %xmm5, %xmm15
1507. movsd 3 * SIZE(BO), %xmm5
1508. addsd %xmm9, %xmm8
1509. movsd 12 * SIZE(AO), %xmm9
1510. mulsd %xmm4, %xmm0
1511. PREFETCH (PREFETCHSIZE + 8) * SIZE(AO)
1512. addsd %xmm11, %xmm10
1513. movsd 13 * SIZE(AO), %xmm11
1514. mulsd %xmm4, %xmm1
1515. addsd %xmm13, %xmm12
1516. movsd 14 * SIZE(AO), %xmm13
1517. mulsd %xmm4, %xmm2
1518. addsd %xmm15, %xmm14
1519. movsd 15 * SIZE(AO), %xmm15
1520. mulsd %xmm4, %xmm3
1521. movsd 4 * SIZE(BO), %xmm4
/* subq of a negative = add 16*SIZE; encodes with a short immediate. */
1522. subq $-16 * SIZE, AO
1523. addsd %xmm0, %xmm8
1524. movsd 0 * SIZE(AO), %xmm0
1525. mulsd %xmm5, %xmm9
1526. addsd %xmm1, %xmm10
1527. movsd 1 * SIZE(AO), %xmm1
1528. mulsd %xmm5, %xmm11
1529. addq $ 4 * SIZE, BO
1530. addsd %xmm2, %xmm12
1531. movsd 2 * SIZE(AO), %xmm2
1532. mulsd %xmm5, %xmm13
1533. decq %rax
1534. addsd %xmm3, %xmm14
1535. movsd 3 * SIZE(AO), %xmm3
1536. mulsd %xmm5, %xmm15
1537. movsd 1 * SIZE(BO), %xmm5
1538. jne .L42
1539. ALIGN_4
/* Fold in-flight products, then K % 4 remainder one k at a time. */
1540. .L45:
1541. #if defined(LT) || defined(RN)
1542. movq KK, %rax
1543. #else
1544. movq K, %rax
1545. subq KK, %rax
1546. #endif
1547. addsd %xmm9, %xmm8
1548. addsd %xmm11, %xmm10
1549. addsd %xmm13, %xmm12
1550. addsd %xmm15, %xmm14
1551. andq $3, %rax
1552. BRANCH
1553. BRANCH
1554. je .L49
1555. ALIGN_4
1556. .L46:
1557. mulsd %xmm4, %xmm0
1558. mulsd %xmm4, %xmm1
1559. mulsd %xmm4, %xmm2
1560. mulsd %xmm4, %xmm3
1561. movsd 1 * SIZE(BO), %xmm4
1562. addsd %xmm0, %xmm8
1563. movsd 4 * SIZE(AO), %xmm0
1564. addsd %xmm1, %xmm10
1565. movsd 5 * SIZE(AO), %xmm1
1566. addsd %xmm2, %xmm12
1567. movsd 6 * SIZE(AO), %xmm2
1568. addsd %xmm3, %xmm14
1569. movsd 7 * SIZE(AO), %xmm3
1570. addq $4 * SIZE, AO
1571. addq $1 * SIZE, BO
1572. decq %rax
1573. BRANCH
1574. jg .L46
1575. ALIGN_4
/* Solve phase: rewind to the tile start, load RHS, subtract the
 * accumulated dot products. */
1576. .L49:
1577. #if defined(LN) || defined(RT)
1578. movq KK, %rax
1579. #ifdef LN
1580. subq $4, %rax
1581. #else
1582. subq $1, %rax
1583. #endif
1584. leaq (, %rax, SIZE), %rax
1585. movq AORIG, AO
1586. leaq (AO, %rax, 4), AO
1587. leaq (B, %rax, 1), BO
1588. #endif
1589. #if defined(LN) || defined(LT)
1590. movsd 0 * SIZE(BO), %xmm0
1591. movsd 1 * SIZE(BO), %xmm2
1592. movsd 2 * SIZE(BO), %xmm4
1593. movsd 3 * SIZE(BO), %xmm6
1594. subsd %xmm8, %xmm0
1595. subsd %xmm10, %xmm2
1596. subsd %xmm12, %xmm4
1597. subsd %xmm14, %xmm6
1598. #else
1599. movsd 0 * SIZE(AO), %xmm0
1600. movsd 1 * SIZE(AO), %xmm2
1601. movsd 2 * SIZE(AO), %xmm4
1602. movsd 3 * SIZE(AO), %xmm6
1603. subsd %xmm8, %xmm0
1604. subsd %xmm10, %xmm2
1605. subsd %xmm12, %xmm4
1606. subsd %xmm14, %xmm6
1607. #endif
/* LN: backward 4x4 substitution, single column — same offsets as the
 * 4x2 path above (diagonals at 15/10/5/0, multiplied not divided). */
1608. #ifdef LN
1609. movsd 15 * SIZE(AO), %xmm8
1610. mulsd %xmm8, %xmm6
1611. movsd 14 * SIZE(AO), %xmm9
1612. mulsd %xmm6, %xmm9
1613. movsd 13 * SIZE(AO), %xmm11
1614. subsd %xmm9, %xmm4
1615. movsd 12 * SIZE(AO), %xmm13
1616. mulsd %xmm6, %xmm11
1617. movsd 10 * SIZE(AO), %xmm8
1618. subsd %xmm11, %xmm2
1619. movsd 9 * SIZE(AO), %xmm9
1620. mulsd %xmm6, %xmm13
1621. movsd 8 * SIZE(AO), %xmm11
1622. subsd %xmm13, %xmm0
1623. mulsd %xmm8, %xmm4
1624. movsd 5 * SIZE(AO), %xmm8
1625. mulsd %xmm4, %xmm9
1626. subsd %xmm9, %xmm2
1627. movsd 4 * SIZE(AO), %xmm9
1628. mulsd %xmm4, %xmm11
1629. subsd %xmm11, %xmm0
1630. movsd 0 * SIZE(AO), %xmm11
1631. mulsd %xmm8, %xmm2
1632. mulsd %xmm2, %xmm9
1633. subsd %xmm9, %xmm0
1634. mulsd %xmm11, %xmm0
1635. #endif
/* LT: forward 4x4 substitution (diagonals at 0/5/10/15). */
1636. #ifdef LT
1637. movsd 0 * SIZE(AO), %xmm8
1638. mulsd %xmm8, %xmm0
1639. movsd 1 * SIZE(AO), %xmm9
1640. mulsd %xmm0, %xmm9
1641. movsd 2 * SIZE(AO), %xmm11
1642. subsd %xmm9, %xmm2
1643. movsd 3 * SIZE(AO), %xmm13
1644. mulsd %xmm0, %xmm11
1645. movsd 5 * SIZE(AO), %xmm8
1646. subsd %xmm11, %xmm4
1647. movsd 6 * SIZE(AO), %xmm9
1648. mulsd %xmm0, %xmm13
1649. movsd 7 * SIZE(AO), %xmm11
1650. subsd %xmm13, %xmm6
1651. mulsd %xmm8, %xmm2
1652. movsd 10 * SIZE(AO), %xmm8
1653. mulsd %xmm2, %xmm9
1654. subsd %xmm9, %xmm4
1655. movsd 11 * SIZE(AO), %xmm9
1656. mulsd %xmm2, %xmm11
1657. subsd %xmm11, %xmm6
1658. mulsd %xmm8, %xmm4
1659. movsd 15 * SIZE(AO), %xmm8
1660. mulsd %xmm4, %xmm9
1661. subsd %xmm9, %xmm6
1662. mulsd %xmm8, %xmm6
1663. #endif
/* RN/RT: single right-hand column — scale all four by B's entry. */
1664. #if defined(RN) || defined(RT)
1665. movsd 0 * SIZE(BO), %xmm8
1666. mulsd %xmm8, %xmm0
1667. mulsd %xmm8, %xmm2
1668. mulsd %xmm8, %xmm4
1669. mulsd %xmm8, %xmm6
1670. #endif
/* Store to C and back into the packed buffer; bump bookkeeping. */
1671. #ifdef LN
1672. subq $4 * SIZE, CO1
1673. #endif
1674. movsd %xmm0, 0 * SIZE(CO1)
1675. movsd %xmm2, 1 * SIZE(CO1)
1676. movsd %xmm4, 2 * SIZE(CO1)
1677. movsd %xmm6, 3 * SIZE(CO1)
1678. #if defined(LN) || defined(LT)
1679. movsd %xmm0, 0 * SIZE(BO)
1680. movsd %xmm2, 1 * SIZE(BO)
1681. movsd %xmm4, 2 * SIZE(BO)
1682. movsd %xmm6, 3 * SIZE(BO)
1683. #else
1684. movsd %xmm0, 0 * SIZE(AO)
1685. movsd %xmm2, 1 * SIZE(AO)
1686. movsd %xmm4, 2 * SIZE(AO)
1687. movsd %xmm6, 3 * SIZE(AO)
1688. #endif
1689. #ifndef LN
1690. addq $4 * SIZE, CO1
1691. #endif
1692. #if defined(LT) || defined(RN)
1693. movq K, %rax
1694. subq KK, %rax
1695. leaq (,%rax, SIZE), %rax
1696. leaq (AO, %rax, 4), AO
1697. leaq (BO, %rax, 1), BO
1698. #endif
1699. #ifdef LN
1700. subq $4, KK
1701. #endif
1702. #ifdef LT
1703. addq $4, KK
1704. #endif
1705. #ifdef RT
1706. movq K, %rax
1707. salq $2 + BASE_SHIFT, %rax
1708. addq %rax, AORIG
1709. #endif
1710. decq I # i --
1711. jg .L41
1712. ALIGN_4
/* ---------------------------------------------------------------
 * .L69: close of the single-column path (advance B/KK), then
 * .L999: function epilogue — restore the callee-saved registers
 * spilled by the prologue (which lies before this excerpt; offsets
 * here must mirror its save layout), release the frame, return. */
1713. .L69:
1714. #ifdef LN
1715. leaq (, K, SIZE), %rax
1716. leaq (B, %rax, 1), B
1717. #endif
1718. #if defined(LT) || defined(RN)
1719. movq BO, B
1720. #endif
1721. #ifdef RN
1722. addq $1, KK
1723. #endif
1724. #ifdef RT
1725. subq $1, KK
1726. #endif
1727. ALIGN_2
1728. .L999:
1729. movq 0(%rsp), %rbx
1730. movq 8(%rsp), %rbp
1731. movq 16(%rsp), %r12
1732. movq 24(%rsp), %r13
1733. movq 32(%rsp), %r14
1734. movq 40(%rsp), %r15
1735. #ifdef WINDOWS_ABI
/* Win64 additionally treats rdi/rsi and xmm6-xmm15 as callee-saved. */
1736. movq 48(%rsp), %rdi
1737. movq 56(%rsp), %rsi
1738. movups 64(%rsp), %xmm6
1739. movups 80(%rsp), %xmm7
1740. movups 96(%rsp), %xmm8
1741. movups 112(%rsp), %xmm9
1742. movups 128(%rsp), %xmm10
1743. movups 144(%rsp), %xmm11
1744. movups 160(%rsp), %xmm12
1745. movups 176(%rsp), %xmm13
1746. movups 192(%rsp), %xmm14
1747. movups 208(%rsp), %xmm15
1748. #endif
1749. addq $STACKSIZE, %rsp
1750. ret
1751. EPILOGUE