
ztrsm_kernel_RT_1x2_sse3.S 19 kB
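An x86 (32-bit) SSE3 assembly kernel from the GotoBLAS/OpenBLAS lineage: the inner kernel for ZTRSM (double-complex triangular solve), register-blocked 1x2. The source covers the LN/LT/RN/RT cases via preprocessor flags; judging by the filename, this copy is built for the right-side, transposed (RT) configuration.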

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"

#define STACK	16
#define ARGS	16

#define M	 4 + STACK + ARGS(%esp)
#define N	 8 + STACK + ARGS(%esp)
#define K	12 + STACK + ARGS(%esp)
#define ALPHA_R	16 + STACK + ARGS(%esp)
#define ALPHA_I	24 + STACK + ARGS(%esp)
#define A	32 + STACK + ARGS(%esp)
#define ARG_B	36 + STACK + ARGS(%esp)
#define C	40 + STACK + ARGS(%esp)
#define ARG_LDC	44 + STACK + ARGS(%esp)
#define OFFSET	48 + STACK + ARGS(%esp)

#define J	 0 + STACK(%esp)
#define KK	 4 + STACK(%esp)
#define KKK	 8 + STACK(%esp)
#define AORIG	12 + STACK(%esp)
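
/*
 * Incoming arguments live on the caller's stack at the offsets above
 * (M/N/K counts, ALPHA, pointers to the packed A and B panels, C and
 * its leading dimension, and OFFSET for the LN/LT/RN/RT sub-matrix
 * bookkeeping).  J, KK, KKK and AORIG are scratch slots in the ARGS
 * area that the prologue reserves; KKK appears unused in this file.
 */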
#ifdef PENTIUM4
#define PREFETCH     prefetcht1
#define PREFETCHSIZE 84
#endif

#if defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCH     prefetcht1
#define PREFETCHSIZE 84
#endif

#ifdef PENTIUMM
#define PREFETCH     prefetcht1
#define PREFETCHSIZE 84
#endif

#define AA	%edx
#define BB	%ecx
#define LDC	%ebp
#define B	%edi
#define CO1	%esi

#define ADDSUB	addpd
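
/*
 * Register roles: AA walks the packed A panel, BB the packed B panel,
 * B holds the base of the packed B, CO1 points into the current
 * column(s) of C, and LDC is the row stride of C (scaled to bytes by
 * ZBASE_SHIFT in the prologue).  ADDSUB is fixed to addpd here;
 * conjugation is handled later through an XOR sign mask rather than
 * by swapping this macro.
 */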
#define KERNEL1(address) \
	mulpd	%xmm0, %xmm2; \
	PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \
	addpd	%xmm2, %xmm4; \
	movddup	 1 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
	mulpd	%xmm0, %xmm2; \
	ADDSUB	%xmm2, %xmm5; \
	movddup	 2 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
	mulpd	%xmm0, %xmm2; \
	addpd	%xmm2, %xmm6; \
	movddup	 3 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
	mulpd	%xmm0, %xmm2; \
	movapd	 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \
	ADDSUB	%xmm2, %xmm7; \
	movddup	 4 * SIZE + (address) * 2 * SIZE(BB), %xmm2

#define KERNEL2(address) \
	mulpd	%xmm0, %xmm2; \
	addpd	%xmm2, %xmm4; \
	movddup	 5 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
	mulpd	%xmm0, %xmm2; \
	ADDSUB	%xmm2, %xmm5; \
	movddup	 6 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
	mulpd	%xmm0, %xmm2; \
	addpd	%xmm2, %xmm6; \
	movddup	 7 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
	mulpd	%xmm0, %xmm2; \
	movapd	 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \
	ADDSUB	%xmm2, %xmm7; \
	movddup	16 * SIZE + (address) * 2 * SIZE(BB), %xmm2

#define KERNEL3(address) \
	mulpd	%xmm0, %xmm3; \
	addpd	%xmm3, %xmm4; \
	movddup	 9 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
	mulpd	%xmm0, %xmm3; \
	ADDSUB	%xmm3, %xmm5; \
	movddup	10 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
	mulpd	%xmm0, %xmm3; \
	addpd	%xmm3, %xmm6; \
	movddup	11 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
	mulpd	%xmm0, %xmm3; \
	movapd	 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \
	ADDSUB	%xmm3, %xmm7; \
	movddup	12 * SIZE + (address) * 2 * SIZE(BB), %xmm3

#define KERNEL4(address) \
	mulpd	%xmm0, %xmm3; \
	addpd	%xmm3, %xmm4; \
	movddup	13 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
	mulpd	%xmm0, %xmm3; \
	ADDSUB	%xmm3, %xmm5; \
	movddup	14 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
	mulpd	%xmm0, %xmm3; \
	addpd	%xmm3, %xmm6; \
	movddup	15 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
	mulpd	%xmm0, %xmm3; \
	movapd	16 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \
	ADDSUB	%xmm3, %xmm7; \
	movddup	24 * SIZE + (address) * 2 * SIZE(BB), %xmm3

#define KERNEL5(address) \
	mulpd	%xmm1, %xmm2; \
	addpd	%xmm2, %xmm4; \
	movddup	17 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
	mulpd	%xmm1, %xmm2; \
	ADDSUB	%xmm2, %xmm5; \
	movddup	18 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
	mulpd	%xmm1, %xmm2; \
	addpd	%xmm2, %xmm6; \
	movddup	19 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
	mulpd	%xmm1, %xmm2; \
	movapd	10 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \
	ADDSUB	%xmm2, %xmm7; \
	movddup	20 * SIZE + (address) * 2 * SIZE(BB), %xmm2

#define KERNEL6(address) \
	mulpd	%xmm1, %xmm2; \
	addpd	%xmm2, %xmm4; \
	movddup	21 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
	mulpd	%xmm1, %xmm2; \
	ADDSUB	%xmm2, %xmm5; \
	movddup	22 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
	mulpd	%xmm1, %xmm2; \
	addpd	%xmm2, %xmm6; \
	movddup	23 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
	mulpd	%xmm1, %xmm2; \
	movapd	12 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \
	ADDSUB	%xmm2, %xmm7

#define KERNEL7(address) \
	movddup	32 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
	mulpd	%xmm1, %xmm3; \
	addpd	%xmm3, %xmm4; \
	movddup	25 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
	mulpd	%xmm1, %xmm3; \
	ADDSUB	%xmm3, %xmm5; \
	movddup	26 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
	mulpd	%xmm1, %xmm3; \
	addpd	%xmm3, %xmm6; \
	movddup	27 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
	mulpd	%xmm1, %xmm3; \
	movapd	14 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \
	ADDSUB	%xmm3, %xmm7; \
	movddup	28 * SIZE + (address) * 2 * SIZE(BB), %xmm3

#define KERNEL8(address) \
	mulpd	%xmm1, %xmm3; \
	addpd	%xmm3, %xmm4; \
	movddup	29 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
	mulpd	%xmm1, %xmm3; \
	ADDSUB	%xmm3, %xmm5; \
	movddup	30 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
	mulpd	%xmm1, %xmm3; \
	addpd	%xmm3, %xmm6; \
	movddup	31 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
	mulpd	%xmm1, %xmm3; \
	movapd	24 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \
	ADDSUB	%xmm3, %xmm7; \
	movddup	40 * SIZE + (address) * 2 * SIZE(BB), %xmm3
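
/*
 * KERNEL1..KERNEL8 together form one 8x-unrolled pass over k for the
 * 1x2 block.  xmm0/xmm1 hold one complex element of A as (re, im),
 * movddup broadcasts one scalar (Re or Im) of B, and the four
 * accumulators collect the partial products of two complex
 * multiplies, roughly (a sketch, per column of B):
 *
 *     xmm4 += (a.re, a.im) * b.re      ; addpd
 *     xmm5 += (a.re, a.im) * b.im      ; ADDSUB
 *
 * The two halves are recombined into a full complex product after the
 * loop by the SHUFPD_1 / xorpd / subpd sequence.
 */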
	PROLOGUE

	subl	$ARGS, %esp

	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	ARG_B,   B
	movl	ARG_LDC, LDC
	movl	OFFSET, %eax
#ifdef RN
	negl	%eax
#endif
	movl	%eax, KK

	sall	$ZBASE_SHIFT, LDC

#ifdef LN
	movl	M, %eax
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, C
	imull	K, %eax
	addl	%eax, A
#endif

#ifdef RT
	movl	N, %eax
	sall	$ZBASE_SHIFT, %eax
	imull	K, %eax
	addl	%eax, B

	movl	N, %eax
	imull	LDC, %eax
	addl	%eax, C
#endif

#ifdef RT
	movl	N, %eax
	subl	OFFSET, %eax
	movl	%eax, KK
#endif
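
/* If N is odd, solve the single leftover column first with the 1x1 micro-kernel below. */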
	movl	N, %eax
	testl	$1, %eax
	jle	.L100

#if defined(LT) || defined(RN)
	movl	A, AA
#else
	movl	A, %eax
	movl	%eax, AORIG
#endif

#ifdef RT
	movl	K, %eax
	sall	$ZBASE_SHIFT, %eax
	subl	%eax, B
#endif

#ifdef RT
	subl	LDC, C
#endif
	movl	C, CO1
#ifndef RT
	addl	LDC, C
#endif

#ifdef LN
	movl	OFFSET, %eax
	addl	M, %eax
	movl	%eax, KK
#endif

#ifdef LT
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	movl	M, %ebx
	testl	%ebx, %ebx
	jle	.L500
	ALIGN_4

.L110:
#ifdef LN
	movl	K, %eax
	sall	$ZBASE_SHIFT, %eax
	subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, AA
#endif

	movl	B, BB
#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, BB
#endif

	movapd	0 * SIZE(AA), %xmm0
	pxor	%xmm4, %xmm4
	movapd	8 * SIZE(AA), %xmm1
	pxor	%xmm5, %xmm5
	movddup	0 * SIZE(BB), %xmm2
	pxor	%xmm6, %xmm6
	movddup	8 * SIZE(BB), %xmm3
	pxor	%xmm7, %xmm7

#ifdef LN
	prefetchnta	-2 * SIZE(CO1)
#else
	prefetchnta	 2 * SIZE(CO1)
#endif

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax
	je	.L112
	ALIGN_4
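/* Single-column k-loop: k/8 iterations, each consuming eight complex elements of A and B. */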
.L111:
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
	mulpd	%xmm0, %xmm2
	addpd	%xmm2, %xmm4
	movddup	 1 * SIZE(BB), %xmm2
	mulpd	%xmm0, %xmm2
	movapd	 2 * SIZE(AA), %xmm0
	ADDSUB	%xmm2, %xmm5
	movddup	 2 * SIZE(BB), %xmm2
	mulpd	%xmm0, %xmm2
	addpd	%xmm2, %xmm6
	movddup	 3 * SIZE(BB), %xmm2
	mulpd	%xmm0, %xmm2
	movapd	 4 * SIZE(AA), %xmm0
	ADDSUB	%xmm2, %xmm7
	movddup	 4 * SIZE(BB), %xmm2
	mulpd	%xmm0, %xmm2
	addpd	%xmm2, %xmm4
	movddup	 5 * SIZE(BB), %xmm2
	mulpd	%xmm0, %xmm2
	movapd	 6 * SIZE(AA), %xmm0
	ADDSUB	%xmm2, %xmm5
	movddup	 6 * SIZE(BB), %xmm2
	mulpd	%xmm0, %xmm2
	addpd	%xmm2, %xmm6
	movddup	 7 * SIZE(BB), %xmm2
	mulpd	%xmm0, %xmm2
	movapd	16 * SIZE(AA), %xmm0
	ADDSUB	%xmm2, %xmm7
	movddup	16 * SIZE(BB), %xmm2
	mulpd	%xmm1, %xmm3
	addpd	%xmm3, %xmm4
	movddup	 9 * SIZE(BB), %xmm3
	mulpd	%xmm1, %xmm3
	movapd	10 * SIZE(AA), %xmm1
	ADDSUB	%xmm3, %xmm5
	movddup	10 * SIZE(BB), %xmm3
	mulpd	%xmm1, %xmm3
	addpd	%xmm3, %xmm6
	movddup	11 * SIZE(BB), %xmm3
	mulpd	%xmm1, %xmm3
	movapd	12 * SIZE(AA), %xmm1
	ADDSUB	%xmm3, %xmm7
	movddup	12 * SIZE(BB), %xmm3
	mulpd	%xmm1, %xmm3
	addpd	%xmm3, %xmm4
	movddup	13 * SIZE(BB), %xmm3
	mulpd	%xmm1, %xmm3
	movapd	14 * SIZE(AA), %xmm1
	ADDSUB	%xmm3, %xmm5
	movddup	14 * SIZE(BB), %xmm3
	mulpd	%xmm1, %xmm3
	addpd	%xmm3, %xmm6
	movddup	15 * SIZE(BB), %xmm3
	mulpd	%xmm1, %xmm3
	movapd	24 * SIZE(AA), %xmm1
	ADDSUB	%xmm3, %xmm7
	movddup	24 * SIZE(BB), %xmm3

	addl	$16 * SIZE, AA
	addl	$16 * SIZE, BB
	decl	%eax
	jne	.L111
	ALIGN_4

.L112:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# remainder: k & 7
	BRANCH
	je	.L114
	ALIGN_4

.L113:
	mulpd	%xmm0, %xmm2
	addpd	%xmm2, %xmm4
	movddup	1 * SIZE(BB), %xmm2
	mulpd	%xmm0, %xmm2
	movapd	2 * SIZE(AA), %xmm0
	ADDSUB	%xmm2, %xmm5
	movddup	2 * SIZE(BB), %xmm2

	addl	$2 * SIZE, AA
	addl	$2 * SIZE, BB
	decl	%eax
	jg	.L113
	ALIGN_4

.L114:
	addpd	%xmm6, %xmm4
	addpd	%xmm7, %xmm5

#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$1, %eax
#else
	subl	$1, %eax
#endif
	movl	AORIG, AA
	sall	$ZBASE_SHIFT, %eax
	leal	(AA, %eax, 1), AA
	leal	(B,  %eax, 1), BB
#endif
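
/*
 * Build a {+0.0, -0.0} mask in xmm1 (sign bit set only in the high,
 * imaginary lane) and recombine the a*Re(b) / a*Im(b) halves into the
 * final complex product; the CONJ paths flip which half gets the sign
 * change.
 */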
	pcmpeqb	%xmm1, %xmm1
	psllq	$63, %xmm1
	shufps	$0x40, %xmm1, %xmm1

	SHUFPD_1 %xmm5, %xmm5

#ifndef CONJ
	xorpd	%xmm1, %xmm5
	subpd	%xmm5, %xmm4
#else
#if defined(LN) || defined(LT)
	xorpd	%xmm1, %xmm4
#else
	xorpd	%xmm1, %xmm5
#endif
	addpd	%xmm5, %xmm4
#endif
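
/*
 * TRSM step for the 1x1 block: load the packed right-hand side,
 * subtract the accumulated product, then complex-multiply by the
 * diagonal element (which GotoBLAS-style packing stores pre-inverted,
 * so this multiply stands in for the divide).
 */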
#if defined(LN) || defined(LT)
	movapd	0 * SIZE(BB), %xmm5
	subpd	%xmm4, %xmm5
#else
	movapd	0 * SIZE(AA), %xmm5
	subpd	%xmm4, %xmm5
#endif

#ifndef CONJ
	SHUFPD_1 %xmm1, %xmm1
#endif

#if defined(LN) || defined(LT)
	movddup	0 * SIZE(AA), %xmm2
	movddup	1 * SIZE(AA), %xmm3

	movapd	%xmm5, %xmm4
	SHUFPD_1 %xmm4, %xmm4
	xorpd	%xmm1, %xmm4
	mulpd	%xmm2, %xmm5
	mulpd	%xmm3, %xmm4
	addpd	%xmm4, %xmm5
#endif

#if defined(RN) || defined(RT)
	movddup	0 * SIZE(BB), %xmm2
	movddup	1 * SIZE(BB), %xmm3

	movapd	%xmm5, %xmm4
	SHUFPD_1 %xmm4, %xmm4
	xorpd	%xmm1, %xmm4
	mulpd	%xmm2, %xmm5
	mulpd	%xmm3, %xmm4
	addpd	%xmm4, %xmm5
#endif

#ifdef LN
	subl	$2 * SIZE, CO1
#endif

	movlpd	%xmm5, 0 * SIZE(CO1)
	movhpd	%xmm5, 1 * SIZE(CO1)

#if defined(LN) || defined(LT)
	movapd	%xmm5, 0 * SIZE(BB)
#else
	movapd	%xmm5, 0 * SIZE(AA)
#endif

#ifndef LN
	addl	$2 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K, %eax
	subl	KK, %eax
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, AA
	addl	%eax, BB
#endif

#ifdef LN
	subl	$1, KK
#endif
#ifdef LT
	addl	$1, KK
#endif
#ifdef RT
	movl	K, %eax
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, AORIG
#endif

	decl	%ebx			# i --
	jg	.L110

#ifdef LN
	movl	K, %eax
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, B
#endif
#if defined(LT) || defined(RN)
	movl	BB, B
#endif
#ifdef RN
	addl	$1, KK
#endif
#ifdef RT
	subl	$1, KK
#endif
	ALIGN_4

.L100:
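/* Main loop over column pairs of C (j = n / 2); each row iteration solves a 1x2 block. */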
	movl	N, %eax
	sarl	$1, %eax
	movl	%eax, J			# j = n / 2
	jle	.L500
	ALIGN_4

.L01:
#if defined(LT) || defined(RN)
	movl	A, AA
#else
	movl	A, %eax
	movl	%eax, AORIG
#endif

#ifdef RT
	movl	K, %eax
	sall	$1 + ZBASE_SHIFT, %eax
	subl	%eax, B
#endif

	leal	(, LDC, 2), %eax

#ifdef RT
	subl	%eax, C
#endif
	movl	C, CO1
#ifndef RT
	addl	%eax, C
#endif

#ifdef LN
	movl	OFFSET, %eax
	addl	M, %eax
	movl	%eax, KK
#endif

#ifdef LT
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	movl	M, %ebx
	testl	%ebx, %ebx
	jle	.L500
	ALIGN_4

.L10:
#ifdef LN
	movl	K, %eax
	sall	$ZBASE_SHIFT, %eax
	subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, AA
#endif

	movl	B, BB
#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$1 + ZBASE_SHIFT, %eax
	addl	%eax, BB
#endif

	movapd	0 * SIZE(AA), %xmm0
	pxor	%xmm4, %xmm4
	movapd	8 * SIZE(AA), %xmm1
	pxor	%xmm5, %xmm5
	movddup	0 * SIZE(BB), %xmm2
	pxor	%xmm6, %xmm6
	movddup	8 * SIZE(BB), %xmm3
	pxor	%xmm7, %xmm7

#ifdef LN
	prefetcht0	-2 * SIZE(CO1)
	prefetcht0	-2 * SIZE(CO1, LDC, 1)
#else
	prefetchnta	 2 * SIZE(CO1)
	prefetchnta	 2 * SIZE(CO1, LDC, 1)
#endif

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax
	je	.L12
	ALIGN_4

.L11:
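	/* Eight macro invocations = one fully unrolled k/8 iteration (see KERNEL1..8 above). */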
	KERNEL1(16 * 0)
	KERNEL2(16 * 0)
	KERNEL3(16 * 0)
	KERNEL4(16 * 0)
	KERNEL5(16 * 0)
	KERNEL6(16 * 0)
	KERNEL7(16 * 0)
	KERNEL8(16 * 0)

	addl	$32 * SIZE, BB
	addl	$16 * SIZE, AA
	decl	%eax
	jne	.L11
	ALIGN_4

.L12:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# remainder: k & 7
	BRANCH
	je	.L14
	ALIGN_4

.L13:
	mulpd	%xmm0, %xmm2
	addpd	%xmm2, %xmm4
	movddup	1 * SIZE(BB), %xmm2
	mulpd	%xmm0, %xmm2
	ADDSUB	%xmm2, %xmm5
	movddup	2 * SIZE(BB), %xmm2
	mulpd	%xmm0, %xmm2
	addpd	%xmm2, %xmm6
	movddup	3 * SIZE(BB), %xmm2
	mulpd	%xmm0, %xmm2
	movapd	2 * SIZE(AA), %xmm0
	ADDSUB	%xmm2, %xmm7
	movddup	4 * SIZE(BB), %xmm2

	addl	$2 * SIZE, AA
	addl	$4 * SIZE, BB
	decl	%eax
	jg	.L13
	ALIGN_4

.L14:
#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$1, %eax
#else
	subl	$2, %eax
#endif
	movl	AORIG, AA
	sall	$ZBASE_SHIFT, %eax
	leal	(AA, %eax, 1), AA
	leal	(B,  %eax, 2), BB
#endif
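
/*
 * Same sign-mask recombination as the single-column case, applied to
 * both accumulator pairs (xmm4/xmm5 for the first column, xmm6/xmm7
 * for the second).
 */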
	pcmpeqb	%xmm1, %xmm1
	psllq	$63, %xmm1
	shufps	$0x40, %xmm1, %xmm1

	SHUFPD_1 %xmm5, %xmm5
	SHUFPD_1 %xmm7, %xmm7

#ifndef CONJ
	xorpd	%xmm1, %xmm5
	xorpd	%xmm1, %xmm7
	subpd	%xmm5, %xmm4
	subpd	%xmm7, %xmm6
#else
#if defined(LN) || defined(LT)
	xorpd	%xmm1, %xmm4
	xorpd	%xmm1, %xmm6
#else
	xorpd	%xmm1, %xmm5
	xorpd	%xmm1, %xmm7
#endif
	addpd	%xmm5, %xmm4
	addpd	%xmm7, %xmm6
#endif

#if defined(LN) || defined(LT)
	movapd	0 * SIZE(BB), %xmm5
	movapd	2 * SIZE(BB), %xmm7
	subpd	%xmm4, %xmm5
	subpd	%xmm6, %xmm7
#else
	movapd	0 * SIZE(AA), %xmm5
	movapd	2 * SIZE(AA), %xmm7
	subpd	%xmm4, %xmm5
	subpd	%xmm6, %xmm7
#endif

#ifndef CONJ
	SHUFPD_1 %xmm1, %xmm1
#endif

#if defined(LN) || defined(LT)
	movddup	0 * SIZE(AA), %xmm2
	movddup	1 * SIZE(AA), %xmm3

	movapd	%xmm5, %xmm4
	movapd	%xmm7, %xmm6
	SHUFPD_1 %xmm4, %xmm4
	SHUFPD_1 %xmm6, %xmm6
	xorpd	%xmm1, %xmm4
	xorpd	%xmm1, %xmm6
	mulpd	%xmm2, %xmm5
	mulpd	%xmm3, %xmm4
	mulpd	%xmm2, %xmm7
	mulpd	%xmm3, %xmm6
	addpd	%xmm4, %xmm5
	addpd	%xmm6, %xmm7
#endif
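
/*
 * RN: forward substitution against the 2x2 triangular block of B,
 * roughly x1 = b1 * inv(d11); x2 = (b2 - x1 * b12) * inv(d22) in
 * complex arithmetic; packed offsets 0/1, 2/3 and 6/7 hold the re/im
 * parts of inv(d11), b12 and inv(d22) (naming is interpretive).
 */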
#ifdef RN
	movddup	0 * SIZE(BB), %xmm2
	movddup	1 * SIZE(BB), %xmm3

	movapd	%xmm5, %xmm4
	SHUFPD_1 %xmm4, %xmm4
	xorpd	%xmm1, %xmm4
	mulpd	%xmm2, %xmm5
	mulpd	%xmm3, %xmm4
	addpd	%xmm4, %xmm5

	movddup	2 * SIZE(BB), %xmm2
	movddup	3 * SIZE(BB), %xmm3

	movapd	%xmm5, %xmm4
	movapd	%xmm5, %xmm6
	SHUFPD_1 %xmm6, %xmm6
	xorpd	%xmm1, %xmm6
	mulpd	%xmm2, %xmm4
	mulpd	%xmm3, %xmm6
	subpd	%xmm4, %xmm7
	subpd	%xmm6, %xmm7

	movddup	6 * SIZE(BB), %xmm2
	movddup	7 * SIZE(BB), %xmm3

	movapd	%xmm7, %xmm6
	SHUFPD_1 %xmm6, %xmm6
	xorpd	%xmm1, %xmm6
	mulpd	%xmm2, %xmm7
	mulpd	%xmm3, %xmm6
	addpd	%xmm6, %xmm7
#endif
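
/* RT: the same 2x2 solve in reverse order (x2 first, then x1), walking the packed offsets backwards. */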
#ifdef RT
	movddup	6 * SIZE(BB), %xmm2
	movddup	7 * SIZE(BB), %xmm3

	movapd	%xmm7, %xmm6
	SHUFPD_1 %xmm6, %xmm6
	xorpd	%xmm1, %xmm6
	mulpd	%xmm2, %xmm7
	mulpd	%xmm3, %xmm6
	addpd	%xmm6, %xmm7

	movddup	4 * SIZE(BB), %xmm2
	movddup	5 * SIZE(BB), %xmm3

	movapd	%xmm7, %xmm4
	movapd	%xmm7, %xmm6
	SHUFPD_1 %xmm6, %xmm6
	xorpd	%xmm1, %xmm6
	mulpd	%xmm2, %xmm4
	mulpd	%xmm3, %xmm6
	subpd	%xmm4, %xmm5
	subpd	%xmm6, %xmm5

	movddup	0 * SIZE(BB), %xmm2
	movddup	1 * SIZE(BB), %xmm3

	movapd	%xmm5, %xmm4
	SHUFPD_1 %xmm4, %xmm4
	xorpd	%xmm1, %xmm4
	mulpd	%xmm2, %xmm5
	mulpd	%xmm3, %xmm4
	addpd	%xmm4, %xmm5
#endif

#ifdef LN
	subl	$2 * SIZE, CO1
#endif
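
/* Store the solved 1x2 block both to C and back into the packed panel, so later blocks see the updated values. */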
	movlpd	%xmm5, 0 * SIZE(CO1)
	movhpd	%xmm5, 1 * SIZE(CO1)
	movlpd	%xmm7, 0 * SIZE(CO1, LDC)
	movhpd	%xmm7, 1 * SIZE(CO1, LDC)

#if defined(LN) || defined(LT)
	movapd	%xmm5, 0 * SIZE(BB)
	movapd	%xmm7, 2 * SIZE(BB)
#else
	movapd	%xmm5, 0 * SIZE(AA)
	movapd	%xmm7, 2 * SIZE(AA)
#endif

#ifndef LN
	addl	$2 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K, %eax
	subl	KK, %eax
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, AA
	leal	(BB, %eax, 2), BB
#endif

#ifdef LN
	subl	$1, KK
#endif
#ifdef LT
	addl	$1, KK
#endif
#ifdef RT
	movl	K, %eax
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, AORIG
#endif

	decl	%ebx			# i --
	jg	.L10
	ALIGN_4

.L99:
#ifdef LN
	movl	K, %eax
	sall	$1 + ZBASE_SHIFT, %eax
	addl	%eax, B
#endif
#if defined(LT) || defined(RN)
	movl	BB, B
#endif
#ifdef RN
	addl	$2, KK
#endif
#ifdef RT
	subl	$2, KK
#endif

	decl	J			# j --
	jg	.L01
	ALIGN_4

.L500:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp

	addl	$ARGS, %esp
	ret

	EPILOGUE