You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

qgemv_t.S 12 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Build configuration and stack-frame layout for the kernel that follows. */
  /* FLD, FST, SIZE, BASE_SHIFT, PROLOGUE, etc. are not defined in this file; presumably they come from common.h. */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* P = number of rows of A (elements of x) processed per pass of the outer row-block loop. */
  /* It is 88 when PENTIUM is defined; otherwise 1000 unless P was already defined. */
  40. #ifdef PENTIUM
  41. #define P 88
  42. #endif
  43. #ifndef P
  44. #define P 1000
  45. #endif
  /* STACK = bytes taken by the four registers pushed in the prologue (4 x 4 bytes). */
  46. #define STACK 16
  /* ARGS = bytes subtracted from %esp in the prologue; the five local slots below live in that area. */
  47. #define ARGS 24
  /* ---- Local scratch slots, addressed relative to %esp after the prologue ---- */
  48. #define NLDA 0 + STACK(%esp) /* byte adjustment added to A after each row block: (P - N * lda) elements */
  49. #define XP 4 + STACK(%esp) /* pointer to the contiguous x data used for the current row block */
  50. #define MIN_M 8 + STACK(%esp) /* rows in the current row block: min(M - IS, P) */
  51. #define J 12 + STACK(%esp) /* column-loop counter */
  52. #define IS 16 + STACK(%esp) /* first row index of the current row block */
  /* ---- Incoming stack arguments ---- */
  /* The leading 4 presumably skips the return address; confirm against the calling convention in common.h. */
  53. #define M 4 + STACK + ARGS(%esp) /* total length of x and of each column dot product */
  54. #define N 8 + STACK + ARGS(%esp) /* number of columns of A, and of y elements updated */
  55. #define K 12 + STACK + ARGS(%esp) /* not referenced anywhere in the visible code */
  56. #define ALPHA 16 + STACK + ARGS(%esp) /* scalar multiplier, loaded with FLD; next argument is 16 bytes further on */
  57. #define A 32 + STACK + ARGS(%esp) /* matrix pointer; overwritten in place as the kernel advances through A */
  58. #define LDA 36 + STACK + ARGS(%esp) /* column stride; converted in place from elements to bytes */
  59. #define X 40 + STACK + ARGS(%esp) /* x vector pointer */
  60. #define INCX 44 + STACK + ARGS(%esp) /* x stride; converted in place from elements to bytes */
  61. #define Y 48 + STACK + ARGS(%esp) /* y vector pointer */
  62. #define INCY 52 + STACK + ARGS(%esp) /* y stride; converted in place from elements to bytes */
  63. #define BUFFER 56 + STACK + ARGS(%esp) /* scratch area that receives a packed copy of x when INCX is not unit stride */
  /*
   * Kernel body. The symbol name is supplied by PROLOGUE and is not visible here.
   *
   * What the visible code does: for each of the N columns of A (columns are
   * LDA elements apart) it accumulates the dot product of x with that column,
   * multiplies the sum by ALPHA and adds it to the matching y element
   * (y elements are INCY apart). In other words y += ALPHA * A^T * x.
   *
   * Rows are processed in blocks of at most P. For every row block the column
   * loops run again and add their partial sums into y.
   *
   * x handling: when the byte stride of x equals SIZE the kernel reads x in
   * place; otherwise it first packs the MIN_M elements of the block into BUFFER.
   *
   * x87 register stack: ALPHA is loaded once and stays at the bottom of the
   * stack for the whole routine; it is released at .L79.
   * NOTE(review): the stack only balances if FST stores AND pops (for example
   * fstpt). FLD and FST are macros defined outside this file - confirm.
   *
   * Registers: %edi = running x pointer, %ebp = running y pointer,
   * %ebx/%ecx = column pointers, %edx = lda in bytes, %esi = x data pointer,
   * %eax = inner loop counter.
   */
  64. PROLOGUE
  65. subl $ARGS, %esp /* reserve the local scratch slots (NLDA, XP, MIN_M, J, IS) */
  66. pushl %ebp /* four pushes = STACK bytes */
  67. pushl %edi
  68. pushl %esi
  69. pushl %ebx
  70. PROFCODE
  71. FLD ALPHA /* alpha now sits on the x87 stack until .L79 */
  72. movl X, %edi # X
  73. movl $0, IS /* IS = 0: start with the first row block */
  74. movl M, %ebx
  75. movl N, %eax
  /* Nothing to do when M <= 0 or N <= 0; .L79 still releases alpha. */
  76. testl %ebx, %ebx
  77. jle .L79
  78. testl %eax, %eax
  79. jle .L79
  /* Convert INCX and INCY from element counts to byte counts, in place. */
  80. movl INCX, %esi
  81. sall $BASE_SHIFT, %esi
  82. movl %esi, INCX
  83. movl INCY, %esi
  84. sall $BASE_SHIFT, %esi
  85. movl %esi, INCY
  /* NLDA = (P - N * lda) in bytes. The column loops advance A by N * lda, */
  /* so adding NLDA afterwards leaves A exactly P elements further down column 0. */
  86. movl LDA, %ebx
  87. imull %ebx, %eax /* eax = N * lda (elements) */
  88. movl $P, %esi
  89. subl %eax, %esi
  90. sall $BASE_SHIFT, %esi
  91. movl %esi, NLDA
  /* Convert LDA to bytes, in place. */
  92. movl %ebx, %esi
  93. sall $BASE_SHIFT, %esi
  94. movl %esi, LDA
  95. ALIGN_2
  /* ===== Outer loop: one pass per row block of at most P rows ===== */
  96. .L32:
  97. movl IS, %esi
  98. movl $P, %edx
  99. movl M, %eax
  100. subl %esi, %eax /* eax = rows remaining = M - IS */
  101. cmpl %edx, %eax
  /* eax = min(remaining, P). The PENTIUM build uses a branch instead of cmov. */
  102. #ifdef PENTIUM
  103. jle .L33
  104. movl %edx, %eax
  105. .L33:
  106. #else
  107. cmovg %edx, %eax
  108. #endif
  109. movl %eax, MIN_M
  /* Default XP = X + IS * SIZE, used directly when x has unit stride. */
  110. movl IS, %ecx
  111. sall $BASE_SHIFT, %ecx
  112. leal (%edi,%ecx, 1), %ecx
  113. movl INCX, %ebx
  114. movl %ecx, XP
  115. cmpl $SIZE, %ebx
  116. je .L34 /* byte stride equals SIZE: no packing needed */
  /* Strided x: pack MIN_M elements into BUFFER and point XP at it. */
  /* %edi keeps advancing through x across row blocks on this path. */
  /* NOTE(review): assumes BUFFER can hold at least MIN_M elements - confirm with the caller. */
  117. movl BUFFER, %esi
  118. movl MIN_M, %ecx
  119. movl %esi, XP
  120. sarl $2, %ecx /* number of groups of four elements */
  121. jle .L35
  122. ALIGN_3
  /* Copy loop, four elements per iteration. */
  123. .L36:
  124. FLD (%edi)
  125. addl %ebx, %edi
  126. FST 0 * SIZE(%esi)
  127. FLD (%edi)
  128. addl %ebx, %edi
  129. FST 1 * SIZE(%esi)
  130. FLD (%edi)
  131. addl %ebx, %edi
  132. FST 2 * SIZE(%esi)
  133. FLD (%edi)
  134. addl %ebx, %edi
  135. FST 3 * SIZE(%esi)
  136. addl $4 * SIZE, %esi
  137. decl %ecx
  138. jg .L36
  139. ALIGN_3
  /* Copy the remaining MIN_M mod 4 elements one at a time. */
  140. .L35:
  141. movl MIN_M, %ecx
  142. andl $3,%ecx
  143. jle .L34
  144. ALIGN_2
  145. .L42:
  146. FLD (%edi)
  147. addl %ebx, %edi
  148. FST (%esi)
  149. addl $SIZE, %esi
  150. decl %ecx
  151. jg .L42
  152. ALIGN_3
  153. /* Main Routine */
  154. .L34:
  155. movl Y, %ebp # coffset = y
  156. movl N, %esi
  157. sarl $2, %esi /* J = N / 4: number of four-column groups */
  158. movl %esi, J /* mov does not change flags, so jle still tests the sarl result */
  159. jle .L47
  160. ALIGN_3
  /* ===== Four columns at a time ===== */
  /* x87 stack inside the loop, top first: bt (current x element), ct1, ct2, ct3, ct4, alpha. */
  /* Column pointers: ebx = col 0, ecx = col 1, ebx + 2 * lda = col 2, ecx + 2 * lda = col 3. */
  161. .L48:
  162. movl A, %ebx # a_offset = a
  163. fldz
  164. movl LDA, %edx
  165. fldz
  166. leal (%ebx, %edx), %ecx # a_offset2 = a + lda
  167. fldz
  168. leal (%ebx, %edx, 4), %eax
  169. fldz
  170. movl %eax, A /* A += 4 * lda for the next column group */
  171. movl XP, %esi
  172. FLD (%esi) /* bt = first x element of the block */
  173. movl MIN_M, %eax
  174. sarl $2,%eax /* number of groups of four rows */
  175. jle .L51
  176. ALIGN_3
  /* Prefetch distance in elements. */
  177. #define PRESIZE 8
  /* Inner loop, four rows per iteration. For each x element: */
  /*   columns 0..2: load at, at *= bt, add into ct1..ct3 and pop; */
  /*   column 3:     load at, fmulp consumes bt, add the product into ct4 and pop; */
  /*   then the next x element is loaded as the new bt. */
  178. .L80:
  /* PENTIUM3 variant: same arithmetic, with prefetches and pointer updates interleaved. */
  179. #ifdef PENTIUM3
  180. prefetcht0 PRESIZE * SIZE(%ebx, %edx, 2)
  181. FLD 0 * SIZE(%ebx) # at = *(a_offset + 0 * lda)
  182. fmul %st(1),%st # at1 *= bt1
  183. prefetcht0 PRESIZE * SIZE(%ecx)
  184. faddp %st,%st(2) # ct1 += at1
  185. FLD 0 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda)
  186. prefetcht0 PRESIZE * SIZE(%ecx, %edx, 2)
  187. fmul %st(1),%st # at1 *= bt1
  188. faddp %st,%st(3) # ct2 += at1
  189. prefetcht0 PRESIZE * SIZE(%ebx)
  190. FLD 0 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda)
  191. fmul %st(1),%st
  192. faddp %st,%st(4)
  193. FLD 0 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda)
  194. fmulp %st, %st(1)
  195. faddp %st,%st(4)
  /* row 1 */
  196. FLD 1 * SIZE(%esi)
  197. FLD 1 * SIZE(%ebx) # at = *(a_offset + 0 * lda)
  198. fmul %st(1),%st # at1 *= bt1
  199. faddp %st,%st(2) # ct1 += at1
  200. FLD 1 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda)
  201. fmul %st(1),%st # at1 *= bt1
  202. faddp %st,%st(3) # ct2 += at1
  203. FLD 1 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda)
  204. fmul %st(1),%st
  205. faddp %st,%st(4)
  206. FLD 1 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda)
  207. fmulp %st, %st(1)
  208. faddp %st,%st(4)
  /* row 2 */
  209. FLD 2 * SIZE(%esi)
  210. FLD 2 * SIZE(%ebx) # at = *(a_offset + 0 * lda)
  211. fmul %st(1),%st # at1 *= bt1
  212. faddp %st,%st(2) # ct1 += at1
  213. FLD 2 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda)
  214. fmul %st(1),%st # at1 *= bt1
  215. faddp %st,%st(3) # ct2 += at1
  216. FLD 2 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda)
  217. fmul %st(1),%st
  218. faddp %st,%st(4)
  219. FLD 2 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda)
  220. fmulp %st, %st(1)
  221. faddp %st,%st(4)
  /* row 3 */
  222. FLD 3 * SIZE(%esi)
  223. FLD 3 * SIZE(%ebx) # at = *(a_offset + 0 * lda)
  224. fmul %st(1),%st # at1 *= bt1
  225. faddp %st,%st(2) # ct1 += at1
  226. FLD 3 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda)
  227. fmul %st(1),%st # at1 *= bt1
  228. faddp %st,%st(3) # ct2 += at1
  229. FLD 3 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda)
  230. fmul %st(1),%st
  231. faddp %st,%st(4)
  232. FLD 3 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda)
  233. fmulp %st, %st(1)
  234. addl $4 * SIZE, %ebx
  235. faddp %st,%st(4)
  236. addl $4 * SIZE, %ecx
  /* Read-ahead of the next x element; on the last iteration this reads one element past the rows consumed. */
  /* NOTE(review): with unit-stride x this can touch memory just past the block (past x on the final block) - confirm that is acceptable. */
  237. FLD 4 * SIZE(%esi)
  238. addl $4 * SIZE, %esi
  239. #else
  /* Generic variant: optional prefetch of all four columns, then the same arithmetic. */
  240. #if defined(HAS_PREFETCH)
  241. prefetcht0 PRESIZE * SIZE(%ebx)
  242. prefetcht0 PRESIZE * SIZE(%ebx, %edx, 2)
  243. prefetcht0 PRESIZE * SIZE(%ecx)
  244. prefetcht0 PRESIZE * SIZE(%ecx, %edx, 2)
  245. #endif
  /* row 0 (bt was loaded before the loop or at the end of the previous iteration) */
  246. FLD 0 * SIZE(%ebx) # at = *(a_offset + 0 * lda)
  247. fmul %st(1),%st # at1 *= bt1
  248. faddp %st,%st(2) # ct1 += at1
  249. FLD 0 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda)
  250. fmul %st(1),%st # at1 *= bt1
  251. faddp %st,%st(3) # ct2 += at1
  252. FLD 0 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda)
  253. fmul %st(1),%st
  254. faddp %st,%st(4)
  255. FLD 0 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda)
  256. fmulp %st, %st(1)
  257. faddp %st,%st(4)
  /* row 1 */
  258. FLD 1 * SIZE(%esi)
  259. FLD 1 * SIZE(%ebx) # at = *(a_offset + 0 * lda)
  260. fmul %st(1),%st # at1 *= bt1
  261. faddp %st,%st(2) # ct1 += at1
  262. FLD 1 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda)
  263. fmul %st(1),%st # at1 *= bt1
  264. faddp %st,%st(3) # ct2 += at1
  265. FLD 1 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda)
  266. fmul %st(1),%st
  267. faddp %st,%st(4)
  268. FLD 1 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda)
  269. fmulp %st, %st(1)
  270. faddp %st,%st(4)
  /* row 2 */
  271. FLD 2 * SIZE(%esi)
  272. FLD 2 * SIZE(%ebx) # at = *(a_offset + 0 * lda)
  273. fmul %st(1),%st # at1 *= bt1
  274. faddp %st,%st(2) # ct1 += at1
  275. FLD 2 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda)
  276. fmul %st(1),%st # at1 *= bt1
  277. faddp %st,%st(3) # ct2 += at1
  278. FLD 2 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda)
  279. fmul %st(1),%st
  280. faddp %st,%st(4)
  281. FLD 2 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda)
  282. fmulp %st, %st(1)
  283. faddp %st,%st(4)
  /* row 3 */
  284. FLD 3 * SIZE(%esi)
  285. FLD 3 * SIZE(%ebx) # at = *(a_offset + 0 * lda)
  286. fmul %st(1),%st # at1 *= bt1
  287. faddp %st,%st(2) # ct1 += at1
  288. FLD 3 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda)
  289. fmul %st(1),%st # at1 *= bt1
  290. faddp %st,%st(3) # ct2 += at1
  291. FLD 3 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda)
  292. fmul %st(1),%st
  293. faddp %st,%st(4)
  294. FLD 3 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda)
  295. fmulp %st, %st(1)
  296. faddp %st,%st(4)
  /* Read-ahead of the next x element; on the last iteration this reads one element past the rows consumed. */
  /* NOTE(review): with unit-stride x this can touch memory just past the block (past x on the final block) - confirm that is acceptable. */
  297. FLD 4 * SIZE(%esi)
  298. addl $4 * SIZE, %ebx
  299. addl $4 * SIZE, %ecx
  300. addl $4 * SIZE, %esi
  301. #endif
  302. decl %eax
  303. jg .L80
  304. ALIGN_3
  /* Remaining MIN_M mod 4 rows for this four-column group, one row per iteration. */
  305. .L51:
  306. movl MIN_M, %eax
  307. andl $3, %eax
  308. je .L81
  309. ALIGN_3
  310. .L52:
  311. FLD (%ebx) # at = *(a_offset + 0 * lda)
  312. fmul %st(1),%st # at1 *= bt1
  313. faddp %st,%st(2) # ct1 += at1
  314. FLD (%ecx) # at1 = *(a_offset2 + 0 * lda)
  315. fmul %st(1),%st # at1 *= bt1
  316. faddp %st,%st(3) # ct2 += at1
  317. FLD (%ebx, %edx, 2) # at = *(a_offset + 2 * lda)
  318. fmul %st(1),%st
  319. faddp %st,%st(4)
  320. FLD (%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda)
  321. fmulp %st, %st(1)
  322. faddp %st,%st(4)
  323. FLD 1 * SIZE(%esi) /* read-ahead of the next x element, same caveat as in .L80 */
  324. addl $SIZE, %ebx
  325. addl $SIZE, %ecx
  326. addl $SIZE, %esi
  327. decl %eax
  328. jg .L52
  329. ALIGN_3
  /* Scale the four sums by alpha and add them into four consecutive y elements. */
  330. .L81:
  331. ffreep %st(0) /* discard the read-ahead x element; stack: ct1, ct2, ct3, ct4, alpha */
  332. fxch %st(4) /* bring alpha to the top; ct1 moves to st(4) */
  333. fmul %st, %st(4) /* ct1 *= alpha */
  334. fmul %st, %st(1) /* ct2 *= alpha */
  335. fmul %st, %st(2) /* ct3 *= alpha */
  336. fmul %st, %st(3) /* ct4 *= alpha */
  337. fxch %st(4) /* restore order: ct1, ct2, ct3, ct4, alpha */
  338. movl INCY, %eax
  /* Each FLD / faddp / FST triple does y += ct and consumes that ct (assumes FST pops). */
  339. FLD (%ebp)
  340. faddp %st, %st(1)
  341. FST (%ebp)
  342. addl %eax, %ebp
  343. FLD (%ebp)
  344. faddp %st, %st(1)
  345. FST (%ebp)
  346. addl %eax, %ebp
  347. FLD (%ebp)
  348. faddp %st, %st(1)
  349. FST (%ebp)
  350. addl %eax, %ebp
  351. FLD (%ebp)
  352. faddp %st, %st(1)
  353. FST (%ebp)
  354. addl %eax, %ebp
  355. decl J
  356. jg .L48
  357. ALIGN_3
  /* ===== Remaining N mod 4 columns, one column at a time ===== */
  358. .L47:
  359. movl N, %esi
  360. andl $3,%esi
  361. movl %esi, J /* mov does not change flags, so jle still tests the andl result */
  362. jle .L60
  363. ALIGN_2
  /* One column: four partial sums are kept to shorten the add dependency chain, then reduced at .L70. */
  /* x87 stack after setup, top first: ct1, ct2, ct3, ct4, alpha. */
  364. .L61:
  365. movl A, %ebx # a_offset = a
  366. fldz # ct4 = ZERO
  367. movl LDA, %edx
  368. fldz # ct3 = ZERO
  369. addl %ebx, %edx
  370. fldz # ct2 = ZERO
  371. movl %edx, A /* A += lda for the next column */
  372. fldz # ct1 = ZERO
  373. movl XP, %esi
  374. movl MIN_M, %eax
  375. sarl $3,%eax /* number of groups of eight rows */
  376. jle .L64
  377. ALIGN_3
  /* Eight rows per iteration; rows 0 and 4 go to ct1, 1 and 5 to ct2, 2 and 6 to ct3, 3 and 7 to ct4. */
  378. .L65:
  379. #ifdef HAS_PREFETCH
  /* NOTE(review): the two prefetches are identical, so the second is redundant; */
  /* the intended second target (x data, or the following cache line) cannot be determined from this file. */
  380. prefetcht0 PRESIZE * 2 * SIZE(%ebx)
  381. prefetcht0 PRESIZE * 2 * SIZE(%ebx)
  382. #endif
  383. FLD 0 * SIZE(%esi)
  384. FLD 0 * SIZE(%ebx)
  385. fmulp %st, %st(1) /* product x * a replaces both operands */
  386. faddp %st,%st(1) /* ct1 += product */
  387. FLD 1 * SIZE(%esi)
  388. FLD 1 * SIZE(%ebx)
  389. fmulp %st, %st(1)
  390. faddp %st,%st(2) /* ct2 += product */
  391. FLD 2 * SIZE(%esi)
  392. FLD 2 * SIZE(%ebx)
  393. fmulp %st, %st(1)
  394. faddp %st,%st(3) /* ct3 += product */
  395. FLD 3 * SIZE(%esi)
  396. FLD 3 * SIZE(%ebx)
  397. fmulp %st, %st(1)
  398. faddp %st,%st(4) /* ct4 += product */
  399. FLD 4 * SIZE(%esi)
  400. FLD 4 * SIZE(%ebx)
  401. fmulp %st, %st(1)
  402. faddp %st,%st(1)
  403. FLD 5 * SIZE(%esi)
  404. FLD 5 * SIZE(%ebx)
  405. fmulp %st, %st(1)
  406. faddp %st,%st(2)
  407. FLD 6 * SIZE(%esi)
  408. FLD 6 * SIZE(%ebx)
  409. fmulp %st, %st(1)
  410. faddp %st,%st(3)
  411. FLD 7 * SIZE(%esi)
  412. FLD 7 * SIZE(%ebx)
  413. fmulp %st, %st(1)
  414. faddp %st,%st(4)
  415. addl $8 * SIZE, %esi
  416. addl $8 * SIZE, %ebx
  417. decl %eax
  418. jg .L65
  419. ALIGN_3
  /* Remaining MIN_M mod 8 rows, all accumulated into ct1. */
  420. .L64:
  421. movl MIN_M, %eax
  422. andl $7, %eax
  423. jle .L70
  424. ALIGN_3
  425. .L71:
  426. FLD (%esi)
  427. FLD (%ebx)
  428. fmulp %st, %st(1)
  429. faddp %st,%st(1)
  430. addl $SIZE, %esi
  431. addl $SIZE, %ebx
  432. decl %eax
  433. jg .L71
  434. ALIGN_3
  /* Reduce the four partial sums, scale by alpha and add into the y element. */
  435. .L70:
  436. faddp %st, %st(1)
  437. faddp %st, %st(1)
  438. faddp %st, %st(1) /* stack is now: sum, alpha */
  439. fmul %st(1),%st /* sum *= alpha */
  440. FLD (%ebp)
  441. faddp %st, %st(1)
  442. FST (%ebp) /* y element updated; only alpha remains on the stack (assumes FST pops) */
  443. addl INCY, %ebp
  444. decl J
  445. jg .L61
  446. ALIGN_3
  /* ===== End of row block: move A to the next block of rows and loop while IS < M ===== */
  447. .L60:
  448. movl A, %ebx
  449. addl NLDA, %ebx /* undo the N * lda column advance and step down P rows */
  450. movl %ebx, A
  451. addl $P, IS
  452. movl M, %esi
  453. cmpl %esi, IS
  454. jl .L32
  455. ALIGN_3
  /* Common exit, also reached directly by the early-out checks after FLD ALPHA. */
  456. .L79:
  457. ffreep %st(0) /* release alpha from the x87 stack */
  458. popl %ebx /* restore registers in reverse push order */
  459. popl %esi
  460. popl %edi
  461. popl %ebp
  462. addl $ARGS, %esp /* drop the local scratch area */
  463. ret
  464. EPILOGUE