
ztrsm_kernel_LT_1x2_sse3.S 19 kB

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
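
/* ztrsm_kernel_LT_1x2_sse3.S: double-complex TRSM micro-kernel for   */
/* 32-bit x86 using SSE3 (movddup), solving a 1x2 block of C per      */
/* iteration; the LN/LT/RN/RT conditionals select the side/transpose  */
/* variant at build time. Taken from the GotoBLAS2/OpenBLAS kernel    */
/* tree; the explanatory comments below are editorial additions, not  */
/* part of the original source.                                       */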
#define ASSEMBLER
#include "common.h"

#define STACK 16
#define ARGS 16

#define M 4 + STACK + ARGS(%esp)
#define N 8 + STACK + ARGS(%esp)
#define K 12 + STACK + ARGS(%esp)
#define ALPHA_R 16 + STACK + ARGS(%esp)
#define ALPHA_I 24 + STACK + ARGS(%esp)
#define A 32 + STACK + ARGS(%esp)
#define ARG_B 36 + STACK + ARGS(%esp)
#define C 40 + STACK + ARGS(%esp)
#define ARG_LDC 44 + STACK + ARGS(%esp)
#define OFFSET 48 + STACK + ARGS(%esp)

#define J 0 + STACK(%esp)
#define KK 4 + STACK(%esp)
#define KKK 8 + STACK(%esp)
#define AORIG 12 + STACK(%esp)
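
/* Function arguments are addressed relative to %esp above the saved  */
/* registers; J, KK, KKK and AORIG are scratch slots in the local     */
/* stack frame.                                                       */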
#ifdef PENTIUM4
#define PREFETCH prefetcht1
#define PREFETCHSIZE 84
#endif

#if defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCH prefetcht1
#define PREFETCHSIZE 84
#endif

#ifdef PENTIUMM
#define PREFETCH prefetcht1
#define PREFETCHSIZE 84
#endif

#define AA %edx
#define BB %ecx
#define LDC %ebp
#define B %edi
#define CO1 %esi

#define ADDSUB addpd
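
/* AA/BB walk the packed A and B panels and CO1 the current column    */
/* of C; LDC is the leading dimension of C, scaled to bytes. xmm4-7   */
/* accumulate the real and imaginary partial products of the 1x2      */
/* block. KERNEL1-8 below form one 8x-unrolled pass of the update     */
/* loop: each step broadcasts one B element with movddup (SSE3) and   */
/* multiplies it into a complex pair of A held in xmm0/xmm1.          */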
#define KERNEL1(address) \
        mulpd %xmm0, %xmm2; \
        PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \
        addpd %xmm2, %xmm4; \
        movddup 1 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
        mulpd %xmm0, %xmm2; \
        ADDSUB %xmm2, %xmm5; \
        movddup 2 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
        mulpd %xmm0, %xmm2; \
        addpd %xmm2, %xmm6; \
        movddup 3 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
        mulpd %xmm0, %xmm2; \
        movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \
        ADDSUB %xmm2, %xmm7; \
        movddup 4 * SIZE + (address) * 2 * SIZE(BB), %xmm2

#define KERNEL2(address) \
        mulpd %xmm0, %xmm2; \
        addpd %xmm2, %xmm4; \
        movddup 5 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
        mulpd %xmm0, %xmm2; \
        ADDSUB %xmm2, %xmm5; \
        movddup 6 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
        mulpd %xmm0, %xmm2; \
        addpd %xmm2, %xmm6; \
        movddup 7 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
        mulpd %xmm0, %xmm2; \
        movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \
        ADDSUB %xmm2, %xmm7; \
        movddup 16 * SIZE + (address) * 2 * SIZE(BB), %xmm2

#define KERNEL3(address) \
        mulpd %xmm0, %xmm3; \
        addpd %xmm3, %xmm4; \
        movddup 9 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
        mulpd %xmm0, %xmm3; \
        ADDSUB %xmm3, %xmm5; \
        movddup 10 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
        mulpd %xmm0, %xmm3; \
        addpd %xmm3, %xmm6; \
        movddup 11 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
        mulpd %xmm0, %xmm3; \
        movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \
        ADDSUB %xmm3, %xmm7; \
        movddup 12 * SIZE + (address) * 2 * SIZE(BB), %xmm3

#define KERNEL4(address) \
        mulpd %xmm0, %xmm3; \
        addpd %xmm3, %xmm4; \
        movddup 13 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
        mulpd %xmm0, %xmm3; \
        ADDSUB %xmm3, %xmm5; \
        movddup 14 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
        mulpd %xmm0, %xmm3; \
        addpd %xmm3, %xmm6; \
        movddup 15 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
        mulpd %xmm0, %xmm3; \
        movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \
        ADDSUB %xmm3, %xmm7; \
        movddup 24 * SIZE + (address) * 2 * SIZE(BB), %xmm3

#define KERNEL5(address) \
        mulpd %xmm1, %xmm2; \
        addpd %xmm2, %xmm4; \
        movddup 17 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
        mulpd %xmm1, %xmm2; \
        ADDSUB %xmm2, %xmm5; \
        movddup 18 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
        mulpd %xmm1, %xmm2; \
        addpd %xmm2, %xmm6; \
        movddup 19 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
        mulpd %xmm1, %xmm2; \
        movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \
        ADDSUB %xmm2, %xmm7; \
        movddup 20 * SIZE + (address) * 2 * SIZE(BB), %xmm2

#define KERNEL6(address) \
        mulpd %xmm1, %xmm2; \
        addpd %xmm2, %xmm4; \
        movddup 21 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
        mulpd %xmm1, %xmm2; \
        ADDSUB %xmm2, %xmm5; \
        movddup 22 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
        mulpd %xmm1, %xmm2; \
        addpd %xmm2, %xmm6; \
        movddup 23 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
        mulpd %xmm1, %xmm2; \
        movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \
        ADDSUB %xmm2, %xmm7

#define KERNEL7(address) \
        movddup 32 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
        mulpd %xmm1, %xmm3; \
        addpd %xmm3, %xmm4; \
        movddup 25 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
        mulpd %xmm1, %xmm3; \
        ADDSUB %xmm3, %xmm5; \
        movddup 26 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
        mulpd %xmm1, %xmm3; \
        addpd %xmm3, %xmm6; \
        movddup 27 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
        mulpd %xmm1, %xmm3; \
        movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \
        ADDSUB %xmm3, %xmm7; \
        movddup 28 * SIZE + (address) * 2 * SIZE(BB), %xmm3

#define KERNEL8(address) \
        mulpd %xmm1, %xmm3; \
        addpd %xmm3, %xmm4; \
        movddup 29 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
        mulpd %xmm1, %xmm3; \
        ADDSUB %xmm3, %xmm5; \
        movddup 30 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
        mulpd %xmm1, %xmm3; \
        addpd %xmm3, %xmm6; \
        movddup 31 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
        mulpd %xmm1, %xmm3; \
        movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \
        ADDSUB %xmm3, %xmm7; \
        movddup 40 * SIZE + (address) * 2 * SIZE(BB), %xmm3
        PROLOGUE

        subl $ARGS, %esp

        pushl %ebp
        pushl %edi
        pushl %esi
        pushl %ebx

        PROFCODE

        movl ARG_B, B
        movl ARG_LDC, LDC
        movl OFFSET, %eax
#ifdef RN
        negl %eax
#endif
        movl %eax, KK

        sall $ZBASE_SHIFT, LDC

#ifdef LN
        movl M, %eax
        sall $ZBASE_SHIFT, %eax
        addl %eax, C
        imull K, %eax
        addl %eax, A
#endif

#ifdef RT
        movl N, %eax
        sall $ZBASE_SHIFT, %eax
        imull K, %eax
        addl %eax, B
        movl N, %eax
        imull LDC, %eax
        addl %eax, C
#endif

#ifdef RT
        movl N, %eax
        subl OFFSET, %eax
        movl %eax, KK
#endif

        movl N, %eax
        sarl $1, %eax
        movl %eax, J  # j = (n >> 1)
        jle .L100
        ALIGN_4
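
/* Main loop: two columns of B and C are solved per iteration. */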
.L01:
#if defined(LT) || defined(RN)
        movl A, AA
#else
        movl A, %eax
        movl %eax, AORIG
#endif

#ifdef RT
        movl K, %eax
        sall $1 + ZBASE_SHIFT, %eax
        subl %eax, B
#endif

        leal (, LDC, 2), %eax

#ifdef RT
        subl %eax, C
#endif
        movl C, CO1
#ifndef RT
        addl %eax, C
#endif

#ifdef LN
        movl OFFSET, %eax
        addl M, %eax
        movl %eax, KK
#endif

#ifdef LT
        movl OFFSET, %eax
        movl %eax, KK
#endif

        movl M, %ebx
        testl %ebx, %ebx
        jle .L100
        ALIGN_4

.L10:
#ifdef LN
        movl K, %eax
        sall $ZBASE_SHIFT, %eax
        subl %eax, AORIG
#endif

#if defined(LN) || defined(RT)
        movl KK, %eax
        movl AORIG, AA
        sall $ZBASE_SHIFT, %eax
        addl %eax, AA
#endif

        movl B, BB

#if defined(LN) || defined(RT)
        movl KK, %eax
        sall $1 + ZBASE_SHIFT, %eax
        addl %eax, BB
#endif

        movapd 0 * SIZE(AA), %xmm0
        pxor %xmm4, %xmm4
        movapd 8 * SIZE(AA), %xmm1
        pxor %xmm5, %xmm5
        movddup 0 * SIZE(BB), %xmm2
        pxor %xmm6, %xmm6
        movddup 8 * SIZE(BB), %xmm3
        pxor %xmm7, %xmm7

#ifdef LN
        prefetchnta -2 * SIZE(CO1)
        prefetchnta -2 * SIZE(CO1, LDC, 1)
#else
        prefetchnta 2 * SIZE(CO1)
        prefetchnta 2 * SIZE(CO1, LDC, 1)
#endif

#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        sarl $3, %eax
        je .L12
        ALIGN_4
.L11:
        KERNEL1(16 * 0)
        KERNEL2(16 * 0)
        KERNEL3(16 * 0)
        KERNEL4(16 * 0)
        KERNEL5(16 * 0)
        KERNEL6(16 * 0)
        KERNEL7(16 * 0)
        KERNEL8(16 * 0)

        addl $32 * SIZE, BB
        addl $16 * SIZE, AA
        decl %eax
        jne .L11
        ALIGN_4

.L12:
#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        andl $7, %eax  # if (k & 7)
        BRANCH
        je .L14
        ALIGN_4
.L13:
        mulpd %xmm0, %xmm2
        addpd %xmm2, %xmm4
        movddup 1 * SIZE(BB), %xmm2
        mulpd %xmm0, %xmm2
        ADDSUB %xmm2, %xmm5
        movddup 2 * SIZE(BB), %xmm2
        mulpd %xmm0, %xmm2
        addpd %xmm2, %xmm6
        movddup 3 * SIZE(BB), %xmm2
        mulpd %xmm0, %xmm2
        movapd 2 * SIZE(AA), %xmm0
        ADDSUB %xmm2, %xmm7
        movddup 4 * SIZE(BB), %xmm2

        addl $2 * SIZE, AA
        addl $4 * SIZE, BB
        decl %eax
        jg .L13
        ALIGN_4

.L14:
#if defined(LN) || defined(RT)
        movl KK, %eax
#ifdef LN
        subl $1, %eax
#else
        subl $2, %eax
#endif
        movl AORIG, AA
        sall $ZBASE_SHIFT, %eax
        leal (AA, %eax, 1), AA
        leal (B, %eax, 2), BB
#endif

        pcmpeqb %xmm1, %xmm1
        psllq $63, %xmm1
        shufps $0x40, %xmm1, %xmm1
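
/* xmm1 is now a {+0.0, -0.0} sign mask: pcmpeqb sets all ones,       */
/* psllq keeps only each qword's sign bit, and shufps clears the low  */
/* qword, so xorpd with xmm1 negates the imaginary lane only.         */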
        SHUFPD_1 %xmm5, %xmm5
        SHUFPD_1 %xmm7, %xmm7

#ifndef CONJ
        xorpd %xmm1, %xmm5
        xorpd %xmm1, %xmm7

        subpd %xmm5, %xmm4
        subpd %xmm7, %xmm6
#else
#if defined(LN) || defined(LT)
        xorpd %xmm1, %xmm4
        xorpd %xmm1, %xmm6
#else
        xorpd %xmm1, %xmm5
        xorpd %xmm1, %xmm7
#endif
        addpd %xmm5, %xmm4
        addpd %xmm7, %xmm6
#endif

#if defined(LN) || defined(LT)
        movapd 0 * SIZE(BB), %xmm5
        movapd 2 * SIZE(BB), %xmm7
        subpd %xmm4, %xmm5
        subpd %xmm6, %xmm7
#else
        movapd 0 * SIZE(AA), %xmm5
        movapd 2 * SIZE(AA), %xmm7
        subpd %xmm4, %xmm5
        subpd %xmm6, %xmm7
#endif

#ifndef CONJ
        SHUFPD_1 %xmm1, %xmm1
#endif

#if defined(LN) || defined(LT)
        movddup 0 * SIZE(AA), %xmm2
        movddup 1 * SIZE(AA), %xmm3
        movapd %xmm5, %xmm4
        movapd %xmm7, %xmm6
        SHUFPD_1 %xmm4, %xmm4
        SHUFPD_1 %xmm6, %xmm6
        xorpd %xmm1, %xmm4
        xorpd %xmm1, %xmm6
        mulpd %xmm2, %xmm5
        mulpd %xmm3, %xmm4
        mulpd %xmm2, %xmm7
        mulpd %xmm3, %xmm6
        addpd %xmm4, %xmm5
        addpd %xmm6, %xmm7
#endif

#ifdef RN
        movddup 0 * SIZE(BB), %xmm2
        movddup 1 * SIZE(BB), %xmm3
        movapd %xmm5, %xmm4
        SHUFPD_1 %xmm4, %xmm4
        xorpd %xmm1, %xmm4
        mulpd %xmm2, %xmm5
        mulpd %xmm3, %xmm4
        addpd %xmm4, %xmm5

        movddup 2 * SIZE(BB), %xmm2
        movddup 3 * SIZE(BB), %xmm3
        movapd %xmm5, %xmm4
        movapd %xmm5, %xmm6
        SHUFPD_1 %xmm6, %xmm6
        xorpd %xmm1, %xmm6
        mulpd %xmm2, %xmm4
        mulpd %xmm3, %xmm6
        subpd %xmm4, %xmm7
        subpd %xmm6, %xmm7

        movddup 6 * SIZE(BB), %xmm2
        movddup 7 * SIZE(BB), %xmm3
        movapd %xmm7, %xmm6
        SHUFPD_1 %xmm6, %xmm6
        xorpd %xmm1, %xmm6
        mulpd %xmm2, %xmm7
        mulpd %xmm3, %xmm6
        addpd %xmm6, %xmm7
#endif

#ifdef RT
        movddup 6 * SIZE(BB), %xmm2
        movddup 7 * SIZE(BB), %xmm3
        movapd %xmm7, %xmm6
        SHUFPD_1 %xmm6, %xmm6
        xorpd %xmm1, %xmm6
        mulpd %xmm2, %xmm7
        mulpd %xmm3, %xmm6
        addpd %xmm6, %xmm7

        movddup 4 * SIZE(BB), %xmm2
        movddup 5 * SIZE(BB), %xmm3
        movapd %xmm7, %xmm4
        movapd %xmm7, %xmm6
        SHUFPD_1 %xmm6, %xmm6
        xorpd %xmm1, %xmm6
        mulpd %xmm2, %xmm4
        mulpd %xmm3, %xmm6
        subpd %xmm4, %xmm5
        subpd %xmm6, %xmm5

        movddup 0 * SIZE(BB), %xmm2
        movddup 1 * SIZE(BB), %xmm3
        movapd %xmm5, %xmm4
        SHUFPD_1 %xmm4, %xmm4
        xorpd %xmm1, %xmm4
        mulpd %xmm2, %xmm5
        mulpd %xmm3, %xmm4
        addpd %xmm4, %xmm5
#endif

#ifdef LN
        subl $2 * SIZE, CO1
#endif
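
/* Store the solved 1x2 block to C and also back into the packed     */
/* panel (BB or AA), which subsequent TRSM steps re-read.             */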
        movlpd %xmm5, 0 * SIZE(CO1)
        movhpd %xmm5, 1 * SIZE(CO1)
        movlpd %xmm7, 0 * SIZE(CO1, LDC)
        movhpd %xmm7, 1 * SIZE(CO1, LDC)

#if defined(LN) || defined(LT)
        movapd %xmm5, 0 * SIZE(BB)
        movapd %xmm7, 2 * SIZE(BB)
#else
        movapd %xmm5, 0 * SIZE(AA)
        movapd %xmm7, 2 * SIZE(AA)
#endif
#ifndef LN
        addl $2 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
        movl K, %eax
        subl KK, %eax
        sall $ZBASE_SHIFT, %eax
        addl %eax, AA
        leal (BB, %eax, 2), BB
#endif

#ifdef LN
        subl $1, KK
#endif

#ifdef LT
        addl $1, KK
#endif

#ifdef RT
        movl K, %eax
        sall $ZBASE_SHIFT, %eax
        addl %eax, AORIG
#endif

        decl %ebx  # i --
        jg .L10
        ALIGN_4

.L99:
#ifdef LN
        movl K, %eax
        sall $1 + ZBASE_SHIFT, %eax
        addl %eax, B
#endif

#if defined(LT) || defined(RN)
        movl BB, B
#endif

#ifdef RN
        addl $2, KK
#endif

#ifdef RT
        subl $2, KK
#endif

        decl J  # j --
        jg .L01
        ALIGN_4

.L100:
        movl N, %eax
        testl $1, %eax
        jle .L500
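
/* Remainder: N is odd, one last column of B/C to solve. */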
#if defined(LT) || defined(RN)
        movl A, AA
#else
        movl A, %eax
        movl %eax, AORIG
#endif

#ifdef RT
        movl K, %eax
        sall $ZBASE_SHIFT, %eax
        subl %eax, B
#endif

#ifdef RT
        subl LDC, C
#endif
        movl C, CO1
#ifndef RT
        addl LDC, C
#endif

#ifdef LN
        movl OFFSET, %eax
        addl M, %eax
        movl %eax, KK
#endif

#ifdef LT
        movl OFFSET, %eax
        movl %eax, KK
#endif

        movl M, %ebx
        testl %ebx, %ebx
        jle .L500
        ALIGN_4
.L110:
#ifdef LN
        movl K, %eax
        sall $ZBASE_SHIFT, %eax
        subl %eax, AORIG
#endif

#if defined(LN) || defined(RT)
        movl KK, %eax
        movl AORIG, AA
        sall $ZBASE_SHIFT, %eax
        addl %eax, AA
#endif

        movl B, BB

#if defined(LN) || defined(RT)
        movl KK, %eax
        sall $ZBASE_SHIFT, %eax
        addl %eax, BB
#endif

        movapd 0 * SIZE(AA), %xmm0
        pxor %xmm4, %xmm4
        movapd 8 * SIZE(AA), %xmm1
        pxor %xmm5, %xmm5
        movddup 0 * SIZE(BB), %xmm2
        pxor %xmm6, %xmm6
        movddup 8 * SIZE(BB), %xmm3
        pxor %xmm7, %xmm7

#ifdef LN
        prefetchnta -2 * SIZE(CO1)
#else
        prefetchnta 2 * SIZE(CO1)
#endif

#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        sarl $3, %eax
        je .L112
        ALIGN_4

.L111:
        PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
        mulpd %xmm0, %xmm2
        addpd %xmm2, %xmm4
        movddup 1 * SIZE(BB), %xmm2
        mulpd %xmm0, %xmm2
        movapd 2 * SIZE(AA), %xmm0
        ADDSUB %xmm2, %xmm5
        movddup 2 * SIZE(BB), %xmm2
        mulpd %xmm0, %xmm2
        addpd %xmm2, %xmm6
        movddup 3 * SIZE(BB), %xmm2
        mulpd %xmm0, %xmm2
        movapd 4 * SIZE(AA), %xmm0
        ADDSUB %xmm2, %xmm7
        movddup 4 * SIZE(BB), %xmm2
        mulpd %xmm0, %xmm2
        addpd %xmm2, %xmm4
        movddup 5 * SIZE(BB), %xmm2
        mulpd %xmm0, %xmm2
        movapd 6 * SIZE(AA), %xmm0
        ADDSUB %xmm2, %xmm5
        movddup 6 * SIZE(BB), %xmm2
        mulpd %xmm0, %xmm2
        addpd %xmm2, %xmm6
        movddup 7 * SIZE(BB), %xmm2
        mulpd %xmm0, %xmm2
        movapd 16 * SIZE(AA), %xmm0
        ADDSUB %xmm2, %xmm7
        movddup 16 * SIZE(BB), %xmm2
        mulpd %xmm1, %xmm3
        addpd %xmm3, %xmm4
        movddup 9 * SIZE(BB), %xmm3
        mulpd %xmm1, %xmm3
        movapd 10 * SIZE(AA), %xmm1
        ADDSUB %xmm3, %xmm5
        movddup 10 * SIZE(BB), %xmm3
        mulpd %xmm1, %xmm3
        addpd %xmm3, %xmm6
        movddup 11 * SIZE(BB), %xmm3
        mulpd %xmm1, %xmm3
        movapd 12 * SIZE(AA), %xmm1
        ADDSUB %xmm3, %xmm7
        movddup 12 * SIZE(BB), %xmm3
        mulpd %xmm1, %xmm3
        addpd %xmm3, %xmm4
        movddup 13 * SIZE(BB), %xmm3
        mulpd %xmm1, %xmm3
        movapd 14 * SIZE(AA), %xmm1
        ADDSUB %xmm3, %xmm5
        movddup 14 * SIZE(BB), %xmm3
        mulpd %xmm1, %xmm3
        addpd %xmm3, %xmm6
        movddup 15 * SIZE(BB), %xmm3
        mulpd %xmm1, %xmm3
        movapd 24 * SIZE(AA), %xmm1
        ADDSUB %xmm3, %xmm7
        movddup 24 * SIZE(BB), %xmm3

        addl $16 * SIZE, AA
        addl $16 * SIZE, BB
        decl %eax
        jne .L111
        ALIGN_4

.L112:
#if defined(LT) || defined(RN)
        movl KK, %eax
#else
        movl K, %eax
        subl KK, %eax
#endif
        andl $7, %eax  # if (k & 7)
        BRANCH
        je .L114
        ALIGN_4
.L113:
        mulpd %xmm0, %xmm2
        addpd %xmm2, %xmm4
        movddup 1 * SIZE(BB), %xmm2
        mulpd %xmm0, %xmm2
        movapd 2 * SIZE(AA), %xmm0
        ADDSUB %xmm2, %xmm5
        movddup 2 * SIZE(BB), %xmm2

        addl $2 * SIZE, AA
        addl $2 * SIZE, BB
        decl %eax
        jg .L113
        ALIGN_4

.L114:
        addpd %xmm6, %xmm4
        addpd %xmm7, %xmm5

#if defined(LN) || defined(RT)
        movl KK, %eax
        subl $1, %eax
        movl AORIG, AA
        sall $ZBASE_SHIFT, %eax
        leal (AA, %eax, 1), AA
        leal (B, %eax, 1), BB
#endif

        pcmpeqb %xmm1, %xmm1
        psllq $63, %xmm1
        shufps $0x40, %xmm1, %xmm1

        SHUFPD_1 %xmm5, %xmm5

#ifndef CONJ
        xorpd %xmm1, %xmm5
        subpd %xmm5, %xmm4
#else
#if defined(LN) || defined(LT)
        xorpd %xmm1, %xmm4
#else
        xorpd %xmm1, %xmm5
#endif
        addpd %xmm5, %xmm4
#endif

#if defined(LN) || defined(LT)
        movapd 0 * SIZE(BB), %xmm5
        subpd %xmm4, %xmm5
#else
        movapd 0 * SIZE(AA), %xmm5
        subpd %xmm4, %xmm5
#endif

#ifndef CONJ
        SHUFPD_1 %xmm1, %xmm1
#endif

#if defined(LN) || defined(LT)
        movddup 0 * SIZE(AA), %xmm2
        movddup 1 * SIZE(AA), %xmm3
        movapd %xmm5, %xmm4
        SHUFPD_1 %xmm4, %xmm4
        xorpd %xmm1, %xmm4
        mulpd %xmm2, %xmm5
        mulpd %xmm3, %xmm4
        addpd %xmm4, %xmm5
#endif

#if defined(RN) || defined(RT)
        movddup 0 * SIZE(BB), %xmm2
        movddup 1 * SIZE(BB), %xmm3
        movapd %xmm5, %xmm4
        SHUFPD_1 %xmm4, %xmm4
        xorpd %xmm1, %xmm4
        mulpd %xmm2, %xmm5
        mulpd %xmm3, %xmm4
        addpd %xmm4, %xmm5
#endif

#ifdef LN
        subl $2 * SIZE, CO1
#endif

        movlpd %xmm5, 0 * SIZE(CO1)
        movhpd %xmm5, 1 * SIZE(CO1)

#if defined(LN) || defined(LT)
        movapd %xmm5, 0 * SIZE(BB)
#else
        movapd %xmm5, 0 * SIZE(AA)
#endif

#ifndef LN
        addl $2 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
        movl K, %eax
        subl KK, %eax
        sall $ZBASE_SHIFT, %eax
        addl %eax, AA
        addl %eax, BB
#endif

#ifdef LN
        subl $1, KK
#endif

#ifdef LT
        addl $1, KK
#endif

#ifdef RT
        movl K, %eax
        sall $ZBASE_SHIFT, %eax
        addl %eax, AORIG
#endif

        decl %ebx  # i --
        jg .L110
#ifdef LN
        movl K, %eax
        sall $ZBASE_SHIFT, %eax
        addl %eax, B
#endif

#if defined(LT) || defined(RN)
        movl BB, B
#endif

#ifdef RN
        addl $1, KK
#endif

#ifdef RT
        subl $1, KK
#endif
        ALIGN_4

.L500:
        popl %ebx
        popl %esi
        popl %edi
        popl %ebp
        addl $ARGS, %esp
        ret
        EPILOGUE
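
For readers decoding the SIMD above: the KERNEL macros keep the a*re(b) and a*im(b) products in separate accumulators (xmm4/xmm6 and xmm5/xmm7), and the SHUFPD_1/xorpd sequence after the loops combines them into proper complex products. Below is a minimal C sketch of that non-CONJ combination; the identifier names are illustrative and not from the source.

#include <stdio.h>

/* Complex multiply-accumulate, c += a * b, mirroring the kernel's
   split-accumulator scheme. a, b, c are {real, imag} pairs. */
static void cmuladd(const double a[2], const double b[2], double c[2]) {
    /* Two accumulators, as in xmm4 / xmm5 of the kernel: */
    double acc_r[2] = { a[0] * b[0], a[1] * b[0] };  /* mulpd with movddup(b_re) */
    double acc_i[2] = { a[0] * b[1], a[1] * b[1] };  /* mulpd with movddup(b_im) */

    /* SHUFPD_1: swap the two lanes of the imaginary accumulator. */
    double t = acc_i[0]; acc_i[0] = acc_i[1]; acc_i[1] = t;

    /* xorpd with the {+0.0, -0.0} mask negates one lane; subpd combines: */
    c[0] += acc_r[0] - acc_i[0];   /* re += ar*br - ai*bi */
    c[1] += acc_r[1] + acc_i[1];   /* im += ai*br + ar*bi */
}

int main(void) {
    double a[2] = { 1.0, 2.0 }, b[2] = { 3.0, 4.0 }, c[2] = { 0.0, 0.0 };
    cmuladd(a, b, c);   /* (1 + 2i)(3 + 4i) = -5 + 10i */
    printf("%g %+gi\n", c[0], c[1]);
    return 0;
}

Keeping the two product streams separate is what lets the hot loop run with no shuffles at all: the single lane swap and sign flip are paid once per accumulated block rather than once per multiply.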