You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zaxpy.S 12 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* ASSEMBLER is defined before common.h is included.  The LD/ST/MUL/ADD/SUB,
     SIZE, SXADDQ/SXSUBL and PROLOGUE/PROFCODE/EPILOGUE macros used by the
     kernel are not defined in this file; presumably they come from common.h. */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Look-ahead distance, in units of SIZE, used as the offset of the early
     loads issued at the top of the unit-stride main loop. */
  40. #define PREFETCHSIZE 40
  /* ADD1 combines the two products that form the first word of each output
     pair, ADD2 the two products that form the second word.
     Without CONJ: ADD1 = SUB, ADD2 = ADD.  With CONJ the two are swapped. */
  41. #ifndef CONJ
  42. #define ADD1 SUB
  43. #define ADD2 ADD
  44. #else
  45. #define ADD1 ADD
  46. #define ADD2 SUB
  47. #endif
  /*
   * Complex "a*x plus y" kernel.  For every pair of words (x0, x1) read
   * through $18 and (y0, y1) read through $20 it stores back to y:
   *     y0 = ADD1($f29*x0, $f30*x1) + y0
   *     y1 = ADD2($f30*x0, $f29*x1) + y1
   * i.e. (assuming MUL/ADD/SUB are the usual three-operand FP macros)
   * y += alpha * x with alpha = ($f29, $f30), or y += alpha * conj(x)
   * when CONJ is defined.
   *
   * Register roles as used in this routine:
   *   $16        element count n (n <= 0 jumps straight to $End)
   *   $18        source pointer, copied from $21 on entry; only read
   *   $19        source stride, loaded from 0($sp) on entry
   *   $20        destination pointer, loaded from 8($sp); read and written
   *   $21        destination stride, loaded from 16($sp) on entry
   *   $4         n >> 2, number of 4-element unrolled iterations
   *   $5         n & 3, number of leftover elements
   *   $6         loop counter for the leftover loops
   *   $24        read-ahead destination pointer (strided path only)
   *   $f29,$f30  copies of $f19,$f20, the scalar multiplier
   *   $f0-$f7    source operands, $f8,$f28,$f10-$f15 destination operands
   *   $f16-$f19  results, $f20-$f27 products
   *
   * Two code paths: a unit-stride path (both strides equal 1) and a
   * general strided path starting at $Sub.  Each has a 4-element unrolled,
   * software-pipelined main loop followed by a 1-element remainder loop.
   */
  48. PROLOGUE
  49. PROFCODE
  /* NOTE(review): .frame declares a 16-byte frame, but the code below
     allocates 64 bytes with lda $sp, -64($sp) -- confirm this is intended. */
  50. .frame $sp, 16, $26, 0
  /* Fetch the stack-passed arguments before $sp is adjusted, and move the
     scalar and the source pointer into the registers used by the loops. */
  51. ldl $19, 0($sp)
  52. fmov $f19, $f29
  53. ldq $20, 8($sp)
  54. fmov $f20, $f30
  55. mov $21, $18
  56. ldl $21, 16($sp)
  /* Allocate 64 bytes and spill $f2-$f8 (restored before each ret).
     Interleaved integer ops: $1 = (src stride == 1), $2 = (dst stride == 1),
     $5 = n & 3. */
  57. lda $sp, -64($sp)
  58. nop
  59. stt $f2, 0($sp)
  60. cmpeq $19, 1, $1
  61. stt $f3, 8($sp)
  62. cmpeq $21, 1, $2
  63. stt $f4, 16($sp)
  64. and $16, 3, $5
  65. stt $f5, 24($sp)
  66. stt $f6, 32($sp)
  67. stt $f7, 40($sp)
  68. stt $f8, 48($sp)
  69. #ifndef PROFILE
  70. .prologue 0
  71. #else
  72. .prologue 1
  73. #endif
  /* $1 = both strides are 1.  Nothing to do for n <= 0; otherwise
     $4 = n / 4 and the strided path is taken unless both strides are 1. */
  74. and $1, $2, $1
  75. ble $16, $End
  76. sra $16, 2, $4
  77. beq $1, $Sub
  /* ---- Unit-stride path ---- */
  /* Fewer than 4 elements: go directly to the remainder loop. */
  78. ble $4, $Remain
  /* One unrolled group is handled by $MainLoopEnd, so loop $4 - 1 times. */
  79. subq $4, 1, $4
  /* Preload the first group: 8 source words and 8 destination words. */
  80. LD $f0, 0*SIZE($18)
  81. LD $f1, 1*SIZE($18)
  82. LD $f2, 2*SIZE($18)
  83. LD $f3, 3*SIZE($18)
  84. LD $f4, 4*SIZE($18)
  85. LD $f5, 5*SIZE($18)
  86. LD $f6, 6*SIZE($18)
  87. LD $f7, 7*SIZE($18)
  88. LD $f8, 0*SIZE($20)
  89. LD $f28, 1*SIZE($20)
  90. LD $f10, 2*SIZE($20)
  91. LD $f11, 3*SIZE($20)
  92. LD $f12, 4*SIZE($20)
  93. LD $f13, 5*SIZE($20)
  94. LD $f14, 6*SIZE($20)
  95. LD $f15, 7*SIZE($20)
  /* Source pointer now addresses the next group; the destination pointer
     is advanced inside the loop instead. */
  96. addq $18, 8*SIZE, $18
  97. ble $4, $MainLoopEnd
  98. .align 4
  /* Software-pipelined main loop: computes the group already held in
     registers while loading the next group.  After $20 is advanced in the
     middle of the body, non-negative offsets from $20 load the next group
     and negative offsets store the current one.  The unop/nop instructions
     are presumably issue-slot padding. */
  99. $MainLoop:
  /* Loads whose destinations are $f31/$31; with the PREFETCHSIZE offset
     these are presumably prefetch hints whose data is discarded. */
  100. ldt $f31, PREFETCHSIZE * SIZE($20)
  101. ldl $31, PREFETCHSIZE * SIZE($18)
  102. MUL $f29, $f0, $f20
  103. LD $f31, 9*SIZE($18)
  104. MUL $f30, $f1, $f21
  105. unop
  106. MUL $f30, $f0, $f22
  107. LD $f0, 0*SIZE($18)
  108. MUL $f29, $f1, $f23
  109. LD $f1, 1*SIZE($18)
  110. MUL $f29, $f2, $f24
  111. unop
  112. MUL $f30, $f3, $f25
  113. nop
  114. MUL $f30, $f2, $f26
  115. LD $f2, 2*SIZE($18)
  116. MUL $f29, $f3, $f27
  117. LD $f3, 3*SIZE($18)
  118. ADD1 $f20, $f21, $f16
  119. MUL $f29, $f4, $f20
  120. ADD2 $f22, $f23, $f17
  121. MUL $f30, $f5, $f21
  122. ADD1 $f24, $f25, $f18
  123. unop
  124. MUL $f30, $f4, $f22
  125. LD $f4, 4*SIZE($18)
  126. ADD2 $f26, $f27, $f19
  /* Advance the destination pointer; see the loop header comment. */
  127. addq $20, 8*SIZE, $20
  128. MUL $f29, $f5, $f23
  129. LD $f5, 5*SIZE($18)
  130. ADD $f16, $f8, $f16
  131. LD $f8, 0*SIZE($20)
  132. MUL $f29, $f6, $f24
  133. unop
  134. ADD $f17, $f28, $f17
  135. LD $f28, 1*SIZE($20)
  136. MUL $f30, $f7, $f25
  137. unop
  138. ADD $f18, $f10, $f18
  139. LD $f10, 2*SIZE($20)
  140. MUL $f30, $f6, $f26
  141. LD $f6, 6*SIZE($18)
  142. ADD $f19, $f11, $f19
  143. LD $f11, 3*SIZE($20)
  144. MUL $f29, $f7, $f27
  145. LD $f7, 7*SIZE($18)
  /* Store elements 0-1 of the current group, start combining 2-3. */
  146. ST $f16,-8*SIZE($20)
  147. ADD1 $f20, $f21, $f16
  148. ST $f17,-7*SIZE($20)
  149. ADD2 $f22, $f23, $f17
  150. ST $f18,-6*SIZE($20)
  151. ADD1 $f24, $f25, $f18
  152. ST $f19,-5*SIZE($20)
  153. ADD2 $f26, $f27, $f19
  154. ADD $f16, $f12, $f16
  155. LD $f12, 4*SIZE($20)
  156. ADD $f17, $f13, $f17
  157. LD $f13, 5*SIZE($20)
  158. ADD $f18, $f14, $f18
  159. LD $f14, 6*SIZE($20)
  160. ADD $f19, $f15, $f19
  161. LD $f15, 7*SIZE($20)
  /* Store elements 2-3, advance the source pointer, count down. */
  162. ST $f16,-4*SIZE($20)
  163. addq $18, 8*SIZE, $18
  164. ST $f17,-3*SIZE($20)
  165. subq $4, 1, $4
  166. ST $f18,-2*SIZE($20)
  167. nop
  168. ST $f19,-1*SIZE($20)
  169. bgt $4, $MainLoop
  170. .align 4
  /* Pipeline drain: finish the last preloaded group without loading more. */
  171. $MainLoopEnd:
  172. MUL $f29, $f0, $f20
  173. MUL $f30, $f1, $f21
  174. MUL $f30, $f0, $f22
  175. MUL $f29, $f1, $f23
  176. MUL $f29, $f2, $f24
  177. MUL $f30, $f3, $f25
  178. MUL $f30, $f2, $f26
  179. MUL $f29, $f3, $f27
  180. ADD1 $f20, $f21, $f16
  181. MUL $f29, $f4, $f20
  182. ADD2 $f22, $f23, $f17
  183. MUL $f30, $f5, $f21
  184. ADD1 $f24, $f25, $f18
  185. MUL $f30, $f4, $f22
  186. ADD2 $f26, $f27, $f19
  187. MUL $f29, $f5, $f23
  188. ADD $f16, $f8, $f16
  189. MUL $f29, $f6, $f24
  190. ADD $f17, $f28, $f17
  191. MUL $f30, $f7, $f25
  192. ADD $f18, $f10, $f18
  193. MUL $f30, $f6, $f26
  194. ADD $f19, $f11, $f19
  195. MUL $f29, $f7, $f27
  196. ST $f16, 0*SIZE($20)
  197. ADD1 $f20, $f21, $f16
  198. ST $f17, 1*SIZE($20)
  199. ADD2 $f22, $f23, $f17
  200. ST $f18, 2*SIZE($20)
  201. ADD1 $f24, $f25, $f18
  202. ST $f19, 3*SIZE($20)
  203. ADD2 $f26, $f27, $f19
  204. ADD $f16, $f12, $f16
  205. ADD $f17, $f13, $f17
  206. ADD $f18, $f14, $f18
  207. ADD $f19, $f15, $f19
  208. ST $f16, 4*SIZE($20)
  209. ST $f17, 5*SIZE($20)
  210. ST $f18, 6*SIZE($20)
  211. ST $f19, 7*SIZE($20)
  212. unop
  /* Step the destination pointer past the group just stored. */
  213. addq $20, 8*SIZE, $20
  214. unop
  /* No leftover elements (n & 3 == 0): done. */
  215. ble $5, $End
  216. .align 4
  /* Unit-stride remainder: $5 leftover elements, one per iteration.
     $6 = $5 - 1 because the final element is handled by $RemainLoopEnd. */
  217. $Remain:
  218. subq $5, 1, $6
  219. ble $5, $End
  220. LD $f0, 0*SIZE($18)
  221. LD $f1, 1*SIZE($18)
  222. LD $f8, 0*SIZE($20)
  223. LD $f28, 1*SIZE($20)
  224. addq $18, 2*SIZE, $18
  225. ble $6, $RemainLoopEnd
  226. .align 4
  /* Compute the element held in registers while loading the next one;
     the result is stored at negative offsets after $20 has advanced. */
  227. $RemainLoop:
  228. MUL $f29, $f0, $f20
  229. subq $6, 1, $6
  230. MUL $f30, $f1, $f21
  231. addq $20, 2*SIZE, $20
  232. MUL $f30, $f0, $f22
  233. LD $f0, 0*SIZE($18)
  234. MUL $f29, $f1, $f23
  235. LD $f1, 1*SIZE($18)
  236. ADD1 $f20, $f21, $f16
  237. ADD2 $f22, $f23, $f17
  238. ADD $f16, $f8, $f16
  239. LD $f8, 0*SIZE($20)
  240. ADD $f17, $f28, $f17
  241. LD $f28, 1*SIZE($20)
  242. ST $f16,-2*SIZE($20)
  243. addq $18, 2*SIZE, $18
  244. ST $f17,-1*SIZE($20)
  245. bgt $6, $RemainLoop
  246. .align 4
  /* Last leftover element: compute and store, no further loads. */
  247. $RemainLoopEnd:
  248. MUL $f29, $f0, $f20
  249. MUL $f30, $f1, $f21
  250. MUL $f30, $f0, $f22
  251. MUL $f29, $f1, $f23
  252. ADD1 $f20, $f21, $f16
  253. ADD2 $f22, $f23, $f17
  254. ADD $f16, $f8, $f16
  255. ADD $f17, $f28, $f17
  256. ST $f16, 0*SIZE($20)
  257. nop
  258. ST $f17, 1*SIZE($20)
  259. nop
  260. .align 4
  /* Epilogue for the unit-stride path (also used when n <= 0):
     restore $f2-$f8, release the 64-byte frame, return. */
  261. $End:
  262. ldt $f2, 0($sp)
  263. ldt $f3, 8($sp)
  264. ldt $f4, 16($sp)
  265. ldt $f5, 24($sp)
  266. ldt $f6, 32($sp)
  267. ldt $f7, 40($sp)
  268. ldt $f8, 48($sp)
  269. lda $sp, 64($sp)
  270. ret
  271. .align 4
  /* ---- General strided path ---- */
  271a_comment_placeholder_removed
  272. $Sub:
  /* NOTE(review): $22 is computed by the next two instructions but is not
     read by any later instruction in this routine -- looks like dead code. */
  273. SXSUBL $16, SIZE, $22
  274. addq $22, $22, $22 # Complex
  275. .align 4
  /* Double both strides: each complex element occupies two SIZE words.
     SXADDQ below presumably adds the stride scaled by SIZE to a pointer. */
  276. addq $19, $19, $19 # Complex
  277. addq $21, $21, $21 # Complex
  278. ble $4, $SubRemain
  /* Preload four source elements, stepping $18 by the source stride. */
  279. LD $f0, 0*SIZE($18)
  280. LD $f1, 1*SIZE($18)
  281. SXADDQ $19, $18, $18
  282. LD $f2, 0*SIZE($18)
  283. LD $f3, 1*SIZE($18)
  284. SXADDQ $19, $18, $18
  285. LD $f4, 0*SIZE($18)
  286. LD $f5, 1*SIZE($18)
  287. SXADDQ $19, $18, $18
  288. LD $f6, 0*SIZE($18)
  289. LD $f7, 1*SIZE($18)
  290. SXADDQ $19, $18, $18
  /* Preload four destination elements through the read-ahead pointer $24;
     $20 stays behind as the store pointer. */
  291. LD $f8, 0*SIZE($20)
  292. LD $f28, 1*SIZE($20)
  293. SXADDQ $21, $20, $24
  294. LD $f10, 0*SIZE($24)
  295. LD $f11, 1*SIZE($24)
  296. SXADDQ $21, $24, $24
  297. LD $f12, 0*SIZE($24)
  298. LD $f13, 1*SIZE($24)
  299. SXADDQ $21, $24, $24
  300. LD $f14, 0*SIZE($24)
  301. LD $f15, 1*SIZE($24)
  302. SXADDQ $21, $24, $24
  /* One group is handled by $SubMainLoopEnd, so loop $4 - 1 times. */
  303. subq $4, 1, $4
  304. ble $4, $SubMainLoopEnd
  305. .align 4
  /* Strided software-pipelined loop: same arithmetic as the unit-stride
     loop; loads read through $18 (source) and $24 (destination read-ahead),
     stores go through $20, each stepped by its own stride. */
  306. $SubMainLoop:
  307. MUL $f29, $f0, $f20
  308. unop
  309. MUL $f30, $f1, $f21
  310. unop
  311. MUL $f30, $f0, $f22
  312. LD $f0, 0*SIZE($18)
  313. MUL $f29, $f1, $f23
  314. LD $f1, 1*SIZE($18)
  315. MUL $f29, $f2, $f24
  316. SXADDQ $19, $18, $18
  317. MUL $f30, $f3, $f25
  318. unop
  319. MUL $f30, $f2, $f26
  320. LD $f2, 0*SIZE($18)
  321. MUL $f29, $f3, $f27
  322. LD $f3, 1*SIZE($18)
  323. ADD1 $f20, $f21, $f16
  324. SXADDQ $19, $18, $18
  325. MUL $f29, $f4, $f20
  326. unop
  327. ADD2 $f22, $f23, $f17
  328. unop
  329. MUL $f30, $f5, $f21
  330. unop
  331. ADD1 $f24, $f25, $f18
  332. unop
  333. MUL $f30, $f4, $f22
  334. LD $f4, 0*SIZE($18)
  335. ADD2 $f26, $f27, $f19
  336. unop
  337. MUL $f29, $f5, $f23
  338. LD $f5, 1*SIZE($18)
  339. ADD $f16, $f8, $f16
  340. LD $f8, 0*SIZE($24)
  341. MUL $f29, $f6, $f24
  342. SXADDQ $19, $18, $18
  343. ADD $f17, $f28, $f17
  344. LD $f28, 1*SIZE($24)
  345. MUL $f30, $f7, $f25
  346. SXADDQ $21, $24, $24
  347. ADD $f18, $f10, $f18
  348. LD $f10, 0*SIZE($24)
  349. MUL $f30, $f6, $f26
  350. LD $f6, 0*SIZE($18)
  351. ADD $f19, $f11, $f19
  352. LD $f11, 1*SIZE($24)
  353. MUL $f29, $f7, $f27
  354. LD $f7, 1*SIZE($18)
  /* Store elements 0 and 1 of the current group. */
  355. ST $f16, 0*SIZE($20)
  356. SXADDQ $19, $18, $18
  357. ADD1 $f20, $f21, $f16
  358. unop
  359. ST $f17, 1*SIZE($20)
  360. SXADDQ $21, $20, $20
  361. ADD2 $f22, $f23, $f17
  362. unop
  363. ST $f18, 0*SIZE($20)
  364. SXADDQ $21, $24, $24
  365. ADD1 $f24, $f25, $f18
  366. unop
  367. ST $f19, 1*SIZE($20)
  368. unop
  369. ADD2 $f26, $f27, $f19
  370. SXADDQ $21, $20, $20
  371. ADD $f16, $f12, $f16
  372. unop
  373. LD $f12, 0*SIZE($24)
  374. unop
  375. ADD $f17, $f13, $f17
  376. unop
  377. LD $f13, 1*SIZE($24)
  378. SXADDQ $21, $24, $24
  379. ADD $f18, $f14, $f18
  380. subq $4, 1, $4
  381. LD $f14, 0*SIZE($24)
  382. unop
  383. ADD $f19, $f15, $f19
  384. unop
  385. LD $f15, 1*SIZE($24)
  386. SXADDQ $21, $24, $24
  /* Store elements 2 and 3 of the current group. */
  387. ST $f16, 0*SIZE($20)
  388. ST $f17, 1*SIZE($20)
  389. SXADDQ $21, $20, $20
  390. unop
  391. ST $f18, 0*SIZE($20)
  392. ST $f19, 1*SIZE($20)
  393. SXADDQ $21, $20, $20
  394. bgt $4, $SubMainLoop
  395. .align 4
  /* Pipeline drain for the strided path: finish the last preloaded group. */
  396. $SubMainLoopEnd:
  397. MUL $f29, $f0, $f20
  398. MUL $f30, $f1, $f21
  399. MUL $f30, $f0, $f22
  400. MUL $f29, $f1, $f23
  401. MUL $f29, $f2, $f24
  402. MUL $f30, $f3, $f25
  403. MUL $f30, $f2, $f26
  404. MUL $f29, $f3, $f27
  405. ADD1 $f20, $f21, $f16
  406. MUL $f29, $f4, $f20
  407. ADD2 $f22, $f23, $f17
  408. MUL $f30, $f5, $f21
  409. ADD1 $f24, $f25, $f18
  410. MUL $f30, $f4, $f22
  411. ADD2 $f26, $f27, $f19
  412. MUL $f29, $f5, $f23
  413. ADD $f16, $f8, $f16
  414. MUL $f29, $f6, $f24
  415. ADD $f17, $f28, $f17
  416. MUL $f30, $f7, $f25
  417. ADD $f18, $f10, $f18
  418. MUL $f30, $f6, $f26
  419. ADD $f19, $f11, $f19
  420. MUL $f29, $f7, $f27
  421. ST $f16, 0*SIZE($20)
  422. ADD1 $f20, $f21, $f16
  423. ST $f17, 1*SIZE($20)
  424. ADD2 $f22, $f23, $f17
  425. SXADDQ $21, $20, $20
  426. nop
  427. ST $f18, 0*SIZE($20)
  428. ADD1 $f24, $f25, $f18
  429. ST $f19, 1*SIZE($20)
  430. ADD2 $f26, $f27, $f19
  431. SXADDQ $21, $20, $20
  432. ADD $f16, $f12, $f16
  433. ADD $f17, $f13, $f17
  434. ADD $f18, $f14, $f18
  435. ADD $f19, $f15, $f19
  436. ST $f16, 0*SIZE($20)
  437. ST $f17, 1*SIZE($20)
  438. SXADDQ $21, $20, $20
  439. ST $f18, 0*SIZE($20)
  440. ST $f19, 1*SIZE($20)
  441. SXADDQ $21, $20, $20
  /* No leftover elements (n & 3 == 0): done. */
  442. ble $5, $SubEnd
  443. .align 4
  /* Strided remainder: $5 leftover elements, one per iteration.
     $6 = $5 - 1 because the final element is handled by $SubRemainLoopEnd.
     $24 is re-seeded one destination stride ahead of $20. */
  444. $SubRemain:
  445. subq $5, 1, $6
  446. ble $5, $SubEnd
  447. LD $f0, 0*SIZE($18)
  448. LD $f1, 1*SIZE($18)
  449. LD $f8, 0*SIZE($20)
  450. LD $f28, 1*SIZE($20)
  451. SXADDQ $19, $18, $18
  452. SXADDQ $21, $20, $24
  453. ble $6, $SubRemainLoopEnd
  454. .align 4
  /* Compute the element held in registers while loading the next one. */
  455. $SubRemainLoop:
  456. MUL $f29, $f0, $f20
  457. MUL $f30, $f1, $f21
  458. MUL $f30, $f0, $f22
  459. LD $f0, 0*SIZE($18)
  460. MUL $f29, $f1, $f23
  461. LD $f1, 1*SIZE($18)
  462. ADD1 $f20, $f21, $f16
  463. SXADDQ $19, $18, $18
  464. ADD2 $f22, $f23, $f17
  465. nop
  466. ADD $f16, $f8, $f16
  467. LD $f8, 0*SIZE($24)
  468. ADD $f17, $f28, $f17
  469. LD $f28, 1*SIZE($24)
  470. SXADDQ $21, $24, $24
  471. subq $6, 1, $6
  472. ST $f16, 0*SIZE($20)
  473. ST $f17, 1*SIZE($20)
  474. SXADDQ $21, $20, $20
  475. bgt $6, $SubRemainLoop
  476. .align 4
  /* Last leftover element: compute and store, no further loads. */
  477. $SubRemainLoopEnd:
  478. MUL $f29, $f0, $f20
  479. MUL $f30, $f1, $f21
  480. MUL $f30, $f0, $f22
  481. MUL $f29, $f1, $f23
  482. ADD1 $f20, $f21, $f16
  483. ADD2 $f22, $f23, $f17
  484. ADD $f16, $f8, $f16
  485. ADD $f17, $f28, $f17
  486. ST $f16, 0*SIZE($20)
  487. nop
  488. ST $f17, 1*SIZE($20)
  489. nop
  490. .align 4
  /* Epilogue for the strided path; same sequence as at $End:
     restore $f2-$f8, release the 64-byte frame, return. */
  491. $SubEnd:
  492. ldt $f2, 0($sp)
  493. ldt $f3, 8($sp)
  494. ldt $f4, 16($sp)
  495. ldt $f5, 24($sp)
  496. ldt $f6, 32($sp)
  497. ldt $f7, 40($sp)
  498. ldt $f8, 48($sp)
  499. lda $sp, 64($sp)
  500. ret
  501. EPILOGUE