You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

rot.S 12 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Register aliases used by the kernel below. */
  /* Integer registers: element count, the two vector pointers and their strides. */
  40. #define N r3
  41. #define X r4
  42. #define INCX r5
  43. #define Y r6
  44. #define INCY r7
  /* Offset register handed to dcbtst in the unrolled loop; loaded with L1_PREFETCHSIZE. */
  45. #define PREA r8
  /* Store-side copies of X and Y used by the non-unit-stride path. */
  46. #define XX r9
  47. #define YY r10
  /* Floating-point multipliers applied to every (x, y) element pair. */
  48. #define C f1
  49. #define S f2
  /* Bytes reserved below SP to save f14-f17 (four 8-byte slots). */
  50. #define STACKSIZE 32
  /* Kernel entry: applies, for each of N element pairs,                 */
  /*     x' = C*x + S*y                                                   */
  /*     y' = C*y - S*x                                                   */
  /* and writes the results back over X and Y.                            */
  /* NOTE(review): PROLOGUE, PROFCODE, LL(), LFD/STFD, FMUL/FMADD/FNMSUB, */
  /* SIZE, BASE_SHIFT and L1_PREFETCHSIZE are not defined in this file    */
  /* (presumably they come from common.h); the formula above assumes the  */
  /* FP macros map to the PowerPC fmul/fmadd/fnmsub family - confirm.     */
  51. #ifndef NEEDPARAM
  52. PROLOGUE
  53. PROFCODE
  /* Reserve STACKSIZE bytes and save f14-f17, which the loops use as     */
  /* scratch; they are reloaded at LL(999) before returning.              */
  54. addi SP, SP, -STACKSIZE
  55. li r0, 0
  56. stfd f14, 0(SP)
  57. stfd f15, 8(SP)
  58. stfd f16, 16(SP)
  59. stfd f17, 24(SP)
  /* Scale both strides by 2^BASE_SHIFT (presumably elements -> bytes;    */
  /* they are compared against SIZE and used as byte offsets below).      */
  60. slwi INCX, INCX, BASE_SHIFT
  61. slwi INCY, INCY, BASE_SHIFT
  62. li PREA, L1_PREFETCHSIZE
  /* Nothing to do when N <= 0. */
  63. cmpwi cr0, N, 0
  64. ble- LL(999)
  /* Take the strided path at LL(100) unless both scaled strides equal SIZE. */
  65. cmpwi cr0, INCX, SIZE
  66. bne- cr0, LL(100)
  67. cmpwi cr0, INCY, SIZE
  68. bne- cr0, LL(100)
  /* CTR = N / 16 = number of 16-element groups; none -> remainder at LL(50). */
  69. srawi. r0, N, 4
  70. mtspr CTR, r0
  71. beq- cr0, LL(50)
  /* Preload the first four x values (f0,f4,f6,f8) and y values           */
  /* (f3,f5,f7,f9) so the pipelined loop starts with data in registers.   */
  72. LFD f0, 0 * SIZE(X)
  73. LFD f4, 1 * SIZE(X)
  74. LFD f6, 2 * SIZE(X)
  75. LFD f8, 3 * SIZE(X)
  76. LFD f3, 0 * SIZE(Y)
  77. LFD f5, 1 * SIZE(Y)
  78. LFD f7, 2 * SIZE(Y)
  79. LFD f9, 3 * SIZE(Y)
  /* Decrement CTR; if only one group exists, skip the loop and drain it at LL(12). */
  80. bdz LL(12)
  /* Unit-stride main loop: 16 element pairs per iteration, processed as  */
  /* four quads.                                                          */
  /* On entry f0/f4/f6/f8 hold x[0..3] and f3/f5/f7/f9 hold y[0..3] of    */
  /* the current group. Each quad computes results into f10-f17, then     */
  /* loads the NEXT quad's inputs before storing the current results.     */
  /* The last quad therefore loads elements 16..19, i.e. the first quad   */
  /* of the following group, which is why the final group is handled      */
  /* separately at LL(12).                                                */
  /* Per pair (assuming FMADD = a*c+b and FNMSUB = -(a*c-b), as for the   */
  /* PowerPC fmadd/fnmsub instructions - macros are defined elsewhere):   */
  /*     even result regs (f10,f12,f14,f16): C*x + S*y  -> stored to X    */
  /*     odd  result regs (f11,f13,f15,f17): C*y - S*x  -> stored to Y    */
  81. .align 4
  82. LL(10):
  /* Quad 0: products C*x and C*y for elements 0..3. */
  83. FMUL f10, C, f0
  84. FMUL f11, C, f3
  85. FMUL f12, C, f4
  86. FMUL f13, C, f5
  87. FMUL f14, C, f6
  88. FMUL f15, C, f7
  89. FMUL f16, C, f8
  90. FMUL f17, C, f9
  /* Quad 0: fold in the S terms. */
  91. FMADD f10, S, f3, f10
  92. FNMSUB f11, S, f0, f11
  93. FMADD f12, S, f5, f12
  94. FNMSUB f13, S, f4, f13
  95. FMADD f14, S, f7, f14
  96. FNMSUB f15, S, f6, f15
  97. FMADD f16, S, f9, f16
  98. FNMSUB f17, S, f8, f17
  /* Load inputs for elements 4..7. */
  99. LFD f0, 4 * SIZE(X)
  100. LFD f4, 5 * SIZE(X)
  101. LFD f6, 6 * SIZE(X)
  102. LFD f8, 7 * SIZE(X)
  103. LFD f3, 4 * SIZE(Y)
  104. LFD f5, 5 * SIZE(Y)
  105. LFD f7, 6 * SIZE(Y)
  106. LFD f9, 7 * SIZE(Y)
  /* Store results for elements 0..3. */
  107. STFD f10, 0 * SIZE(X)
  108. STFD f12, 1 * SIZE(X)
  109. STFD f14, 2 * SIZE(X)
  110. STFD f16, 3 * SIZE(X)
  111. STFD f11, 0 * SIZE(Y)
  112. STFD f13, 1 * SIZE(Y)
  113. STFD f15, 2 * SIZE(Y)
  114. STFD f17, 3 * SIZE(Y)
  /* Quad 1: compute elements 4..7. */
  115. FMUL f10, C, f0
  116. FMUL f11, C, f3
  117. FMUL f12, C, f4
  118. FMUL f13, C, f5
  119. FMUL f14, C, f6
  120. FMUL f15, C, f7
  121. FMUL f16, C, f8
  122. FMUL f17, C, f9
  123. FMADD f10, S, f3, f10
  124. FNMSUB f11, S, f0, f11
  125. FMADD f12, S, f5, f12
  126. FNMSUB f13, S, f4, f13
  127. FMADD f14, S, f7, f14
  128. FNMSUB f15, S, f6, f15
  129. FMADD f16, S, f9, f16
  130. FNMSUB f17, S, f8, f17
  /* Load inputs for elements 8..11. */
  131. LFD f0, 8 * SIZE(X)
  132. LFD f4, 9 * SIZE(X)
  133. LFD f6, 10 * SIZE(X)
  134. LFD f8, 11 * SIZE(X)
  135. LFD f3, 8 * SIZE(Y)
  136. LFD f5, 9 * SIZE(Y)
  137. LFD f7, 10 * SIZE(Y)
  138. LFD f9, 11 * SIZE(Y)
  /* Store results for elements 4..7. */
  139. STFD f10, 4 * SIZE(X)
  140. STFD f12, 5 * SIZE(X)
  141. STFD f14, 6 * SIZE(X)
  142. STFD f16, 7 * SIZE(X)
  143. STFD f11, 4 * SIZE(Y)
  144. STFD f13, 5 * SIZE(Y)
  145. STFD f15, 6 * SIZE(Y)
  146. STFD f17, 7 * SIZE(Y)
  /* Quad 2: compute elements 8..11. */
  147. FMUL f10, C, f0
  148. FMUL f11, C, f3
  149. FMUL f12, C, f4
  150. FMUL f13, C, f5
  151. FMUL f14, C, f6
  152. FMUL f15, C, f7
  153. FMUL f16, C, f8
  154. FMUL f17, C, f9
  155. FMADD f10, S, f3, f10
  156. FNMSUB f11, S, f0, f11
  157. FMADD f12, S, f5, f12
  158. FNMSUB f13, S, f4, f13
  159. FMADD f14, S, f7, f14
  160. FNMSUB f15, S, f6, f15
  161. FMADD f16, S, f9, f16
  162. FNMSUB f17, S, f8, f17
  /* Load inputs for elements 12..15. */
  163. LFD f0, 12 * SIZE(X)
  164. LFD f4, 13 * SIZE(X)
  165. LFD f6, 14 * SIZE(X)
  166. LFD f8, 15 * SIZE(X)
  167. LFD f3, 12 * SIZE(Y)
  168. LFD f5, 13 * SIZE(Y)
  169. LFD f7, 14 * SIZE(Y)
  170. LFD f9, 15 * SIZE(Y)
  /* Store results for elements 8..11. */
  171. STFD f10, 8 * SIZE(X)
  172. STFD f12, 9 * SIZE(X)
  173. STFD f14, 10 * SIZE(X)
  174. STFD f16, 11 * SIZE(X)
  175. STFD f11, 8 * SIZE(Y)
  176. STFD f13, 9 * SIZE(Y)
  177. STFD f15, 10 * SIZE(Y)
  178. STFD f17, 11 * SIZE(Y)
  /* Quad 3: compute elements 12..15. */
  179. FMUL f10, C, f0
  180. FMUL f11, C, f3
  181. FMUL f12, C, f4
  182. FMUL f13, C, f5
  183. FMUL f14, C, f6
  184. FMUL f15, C, f7
  185. FMUL f16, C, f8
  186. FMUL f17, C, f9
  187. FMADD f10, S, f3, f10
  188. FNMSUB f11, S, f0, f11
  189. FMADD f12, S, f5, f12
  190. FNMSUB f13, S, f4, f13
  191. FMADD f14, S, f7, f14
  192. FNMSUB f15, S, f6, f15
  193. FMADD f16, S, f9, f16
  194. FNMSUB f17, S, f8, f17
  /* Load elements 16..19: the first quad of the NEXT 16-element group. */
  195. LFD f0, 16 * SIZE(X)
  196. LFD f4, 17 * SIZE(X)
  197. LFD f6, 18 * SIZE(X)
  198. LFD f8, 19 * SIZE(X)
  199. LFD f3, 16 * SIZE(Y)
  200. LFD f5, 17 * SIZE(Y)
  201. LFD f7, 18 * SIZE(Y)
  202. LFD f9, 19 * SIZE(Y)
  /* Store results for elements 12..15. */
  203. STFD f10, 12 * SIZE(X)
  204. STFD f12, 13 * SIZE(X)
  205. STFD f14, 14 * SIZE(X)
  206. STFD f16, 15 * SIZE(X)
  207. STFD f11, 12 * SIZE(Y)
  208. STFD f13, 13 * SIZE(Y)
  209. STFD f15, 14 * SIZE(Y)
  210. STFD f17, 15 * SIZE(Y)
  /* Cache hint for the address X + PREA. Non-POWER6 builds issue it      */
  /* before the pointers advance; POWER6 builds issue it afterwards.      */
  211. #ifndef POWER6
  212. dcbtst X, PREA
  213. #endif
  /* Advance both pointers to the next 16-element group. */
  214. addi X, X, 16 * SIZE
  215. addi Y, Y, 16 * SIZE
  /* NOTE(review): the POWER6 path issues the identical hint on X twice   */
  /* and never touches Y; the second one may have been meant to be        */
  /* "dcbtst Y, PREA" - confirm before changing, since this only affects  */
  /* prefetching, not results.                                            */
  216. #ifdef POWER6
  217. dcbtst X, PREA
  218. dcbtst X, PREA
  219. #endif
  /* Loop while more than one group remains; the last group drains at LL(12). */
  220. bdnz LL(10)
  /* Drain of the final 16-element group on the unit-stride path.         */
  /* Entered with x[0..3] in f0/f4/f6/f8 and y[0..3] in f3/f5/f7/f9       */
  /* already loaded (by the entry preload or by the last LL(10) pass).    */
  /* Unlike LL(10), each quad stores its results before loading the next  */
  /* quad, and nothing beyond element 15 of the group is read.            */
  /* Even result registers (C*x + S*y) go to X, odd ones (C*y - S*x) to   */
  /* Y - assuming FMADD/FNMSUB follow PowerPC fmadd/fnmsub semantics      */
  /* (macros defined outside this file).                                  */
  221. .align 4
  222. LL(12):
  /* Quad 0: compute and store elements 0..3. */
  223. FMUL f10, C, f0
  224. FMUL f11, C, f3
  225. FMUL f12, C, f4
  226. FMUL f13, C, f5
  227. FMUL f14, C, f6
  228. FMUL f15, C, f7
  229. FMUL f16, C, f8
  230. FMUL f17, C, f9
  231. FMADD f10, S, f3, f10
  232. FNMSUB f11, S, f0, f11
  233. FMADD f12, S, f5, f12
  234. FNMSUB f13, S, f4, f13
  235. FMADD f14, S, f7, f14
  236. FNMSUB f15, S, f6, f15
  237. FMADD f16, S, f9, f16
  238. FNMSUB f17, S, f8, f17
  239. STFD f10, 0 * SIZE(X)
  240. STFD f12, 1 * SIZE(X)
  241. STFD f14, 2 * SIZE(X)
  242. STFD f16, 3 * SIZE(X)
  243. STFD f11, 0 * SIZE(Y)
  244. STFD f13, 1 * SIZE(Y)
  245. STFD f15, 2 * SIZE(Y)
  246. STFD f17, 3 * SIZE(Y)
  /* Quad 1: load, compute and store elements 4..7. */
  247. LFD f0, 4 * SIZE(X)
  248. LFD f4, 5 * SIZE(X)
  249. LFD f6, 6 * SIZE(X)
  250. LFD f8, 7 * SIZE(X)
  251. LFD f3, 4 * SIZE(Y)
  252. LFD f5, 5 * SIZE(Y)
  253. LFD f7, 6 * SIZE(Y)
  254. LFD f9, 7 * SIZE(Y)
  255. FMUL f10, C, f0
  256. FMUL f11, C, f3
  257. FMUL f12, C, f4
  258. FMUL f13, C, f5
  259. FMUL f14, C, f6
  260. FMUL f15, C, f7
  261. FMUL f16, C, f8
  262. FMUL f17, C, f9
  263. FMADD f10, S, f3, f10
  264. FNMSUB f11, S, f0, f11
  265. FMADD f12, S, f5, f12
  266. FNMSUB f13, S, f4, f13
  267. FMADD f14, S, f7, f14
  268. FNMSUB f15, S, f6, f15
  269. FMADD f16, S, f9, f16
  270. FNMSUB f17, S, f8, f17
  271. STFD f10, 4 * SIZE(X)
  272. STFD f12, 5 * SIZE(X)
  273. STFD f14, 6 * SIZE(X)
  274. STFD f16, 7 * SIZE(X)
  275. STFD f11, 4 * SIZE(Y)
  276. STFD f13, 5 * SIZE(Y)
  277. STFD f15, 6 * SIZE(Y)
  278. STFD f17, 7 * SIZE(Y)
  /* Quad 2: load, compute and store elements 8..11. */
  279. LFD f0, 8 * SIZE(X)
  280. LFD f4, 9 * SIZE(X)
  281. LFD f6, 10 * SIZE(X)
  282. LFD f8, 11 * SIZE(X)
  283. LFD f3, 8 * SIZE(Y)
  284. LFD f5, 9 * SIZE(Y)
  285. LFD f7, 10 * SIZE(Y)
  286. LFD f9, 11 * SIZE(Y)
  287. FMUL f10, C, f0
  288. FMUL f11, C, f3
  289. FMUL f12, C, f4
  290. FMUL f13, C, f5
  291. FMUL f14, C, f6
  292. FMUL f15, C, f7
  293. FMUL f16, C, f8
  294. FMUL f17, C, f9
  295. FMADD f10, S, f3, f10
  296. FNMSUB f11, S, f0, f11
  297. FMADD f12, S, f5, f12
  298. FNMSUB f13, S, f4, f13
  299. FMADD f14, S, f7, f14
  300. FNMSUB f15, S, f6, f15
  301. FMADD f16, S, f9, f16
  302. FNMSUB f17, S, f8, f17
  303. STFD f10, 8 * SIZE(X)
  304. STFD f12, 9 * SIZE(X)
  305. STFD f14, 10 * SIZE(X)
  306. STFD f16, 11 * SIZE(X)
  307. STFD f11, 8 * SIZE(Y)
  308. STFD f13, 9 * SIZE(Y)
  309. STFD f15, 10 * SIZE(Y)
  310. STFD f17, 11 * SIZE(Y)
  /* Quad 3: load, compute and store elements 12..15. */
  311. LFD f0, 12 * SIZE(X)
  312. LFD f4, 13 * SIZE(X)
  313. LFD f6, 14 * SIZE(X)
  314. LFD f8, 15 * SIZE(X)
  315. LFD f3, 12 * SIZE(Y)
  316. LFD f5, 13 * SIZE(Y)
  317. LFD f7, 14 * SIZE(Y)
  318. LFD f9, 15 * SIZE(Y)
  319. FMUL f10, C, f0
  320. FMUL f11, C, f3
  321. FMUL f12, C, f4
  322. FMUL f13, C, f5
  323. FMUL f14, C, f6
  324. FMUL f15, C, f7
  325. FMUL f16, C, f8
  326. FMUL f17, C, f9
  327. FMADD f10, S, f3, f10
  328. FNMSUB f11, S, f0, f11
  329. FMADD f12, S, f5, f12
  330. FNMSUB f13, S, f4, f13
  331. FMADD f14, S, f7, f14
  332. FNMSUB f15, S, f6, f15
  333. FMADD f16, S, f9, f16
  334. FNMSUB f17, S, f8, f17
  335. STFD f10, 12 * SIZE(X)
  336. STFD f12, 13 * SIZE(X)
  337. STFD f14, 14 * SIZE(X)
  338. STFD f16, 15 * SIZE(X)
  339. STFD f11, 12 * SIZE(Y)
  340. STFD f13, 13 * SIZE(Y)
  341. STFD f15, 14 * SIZE(Y)
  342. STFD f17, 15 * SIZE(Y)
  /* Step past the group so the remainder loop starts at the first unprocessed element. */
  343. addi X, X, 16 * SIZE
  344. addi Y, Y, 16 * SIZE
  /* Unit-stride remainder: handle the N mod 16 elements left after the   */
  /* 16-wide groups, one pair per iteration.                              */
  345. .align 4
  346. LL(50):
  /* CTR = N & 15; return through LL(999) if nothing is left. */
  347. andi. r0, N, 15
  348. mtspr CTR, r0
  349. beq LL(999)
  350. .align 4
  351. LL(60):
  /* f3 = x, f4 = y for the current element. */
  352. LFD f3, 0 * SIZE(X)
  353. LFD f4, 0 * SIZE(Y)
  /* f10 = C*x + S*y, f11 = C*y - S*x (assuming FMADD/FNMSUB follow       */
  /* PowerPC fmadd/fnmsub semantics; the macros are defined elsewhere).   */
  354. FMUL f10, C, f3
  355. FMUL f11, C, f4
  356. FMADD f10, S, f4, f10
  357. FNMSUB f11, S, f3, f11
  /* Write results back in place and advance by one element. */
  358. STFD f10, 0 * SIZE(X)
  359. STFD f11, 0 * SIZE(Y)
  360. addi X, X, 1 * SIZE
  361. addi Y, Y, 1 * SIZE
  362. bdnz LL(60)
  /* Skip over the strided path to the common epilogue. */
  363. b LL(999)
  /* Non-unit-stride path, reached when either scaled stride differs from */
  /* SIZE. Processes 8 element pairs per iteration as two batches of 4.   */
  /* Loads use X/Y and stores use the trailing copies XX/YY, all stepped  */
  /* with update-form indexed accesses (LFDUX/STFDUX are macros defined   */
  /* elsewhere - presumably the lfdux/stfdux family, which add the index  */
  /* register to the base before the access and write the new address     */
  /* back to the base).                                                   */
  364. .align 4
  365. LL(100):
  /* Back both pointers up by one stride so the first update-form access  */
  /* lands on element 0.                                                  */
  366. sub X, X, INCX
  367. sub Y, Y, INCY
  /* XX/YY are the write cursors; they start from the same biased addresses. */
  368. mr XX, X
  369. mr YY, Y
  /* CTR = N / 8; if zero, go straight to the one-at-a-time loop at LL(150). */
  370. srawi. r0, N, 3
  371. mtspr CTR, r0
  372. beq- LL(150)
  373. .align 4
  374. LL(110):
  /* Batch 0: load four (x, y) pairs. */
  375. LFDUX f0, X, INCX
  376. LFDUX f3, Y, INCY
  377. LFDUX f4, X, INCX
  378. LFDUX f5, Y, INCY
  379. LFDUX f6, X, INCX
  380. LFDUX f7, Y, INCY
  381. LFDUX f8, X, INCX
  382. LFDUX f9, Y, INCY
  /* Products C*x (even result regs) and C*y (odd result regs). */
  383. FMUL f10, C, f0
  384. FMUL f11, C, f3
  385. FMUL f12, C, f4
  386. FMUL f13, C, f5
  387. FMUL f14, C, f6
  388. FMUL f15, C, f7
  389. FMUL f16, C, f8
  390. FMUL f17, C, f9
  /* Even regs become C*x + S*y, odd regs become C*y - S*x (assuming      */
  /* FMADD/FNMSUB follow PowerPC fmadd/fnmsub semantics).                 */
  391. FMADD f10, S, f3, f10
  392. FNMSUB f11, S, f0, f11
  393. FMADD f12, S, f5, f12
  394. FNMSUB f13, S, f4, f13
  395. FMADD f14, S, f7, f14
  396. FNMSUB f15, S, f6, f15
  397. FMADD f16, S, f9, f16
  398. FNMSUB f17, S, f8, f17
  /* Store the four results through the write cursors. */
  399. STFDUX f10, XX, INCX
  400. STFDUX f11, YY, INCY
  401. STFDUX f12, XX, INCX
  402. STFDUX f13, YY, INCY
  403. STFDUX f14, XX, INCX
  404. STFDUX f15, YY, INCY
  405. STFDUX f16, XX, INCX
  406. STFDUX f17, YY, INCY
  /* Batch 1: same sequence for the next four pairs. */
  407. LFDUX f0, X, INCX
  408. LFDUX f3, Y, INCY
  409. LFDUX f4, X, INCX
  410. LFDUX f5, Y, INCY
  411. LFDUX f6, X, INCX
  412. LFDUX f7, Y, INCY
  413. LFDUX f8, X, INCX
  414. LFDUX f9, Y, INCY
  415. FMUL f10, C, f0
  416. FMUL f11, C, f3
  417. FMUL f12, C, f4
  418. FMUL f13, C, f5
  419. FMUL f14, C, f6
  420. FMUL f15, C, f7
  421. FMUL f16, C, f8
  422. FMUL f17, C, f9
  423. FMADD f10, S, f3, f10
  424. FNMSUB f11, S, f0, f11
  425. FMADD f12, S, f5, f12
  426. FNMSUB f13, S, f4, f13
  427. FMADD f14, S, f7, f14
  428. FNMSUB f15, S, f6, f15
  429. FMADD f16, S, f9, f16
  430. FNMSUB f17, S, f8, f17
  431. STFDUX f10, XX, INCX
  432. STFDUX f11, YY, INCY
  433. STFDUX f12, XX, INCX
  434. STFDUX f13, YY, INCY
  435. STFDUX f14, XX, INCX
  436. STFDUX f15, YY, INCY
  437. STFDUX f16, XX, INCX
  438. STFDUX f17, YY, INCY
  /* Repeat for each remaining group of 8. */
  439. bdnz LL(110)
  /* Non-unit-stride remainder: handle the N mod 8 pairs left after       */
  /* LL(110), one pair per iteration, continuing with the same X/Y read   */
  /* cursors and XX/YY write cursors.                                     */
  440. .align 4
  441. LL(150):
  /* CTR = N & 7; return through LL(999) if nothing is left. */
  442. andi. r0, N, 7
  443. mtspr CTR, r0
  444. beq LL(999)
  445. .align 4
  446. LL(160):
  /* f0 = x, f3 = y for the next element. */
  447. LFDUX f0, X, INCX
  448. LFDUX f3, Y, INCY
  /* f10 = C*x + S*y, f11 = C*y - S*x (assuming FMADD/FNMSUB follow       */
  /* PowerPC fmadd/fnmsub semantics; the macros are defined elsewhere).   */
  449. FMUL f10, C, f0
  450. FMUL f11, C, f3
  451. FMADD f10, S, f3, f10
  452. FNMSUB f11, S, f0, f11
  453. STFDUX f10, XX, INCX
  454. STFDUX f11, YY, INCY
  /* When CTR reaches zero, execution falls through into LL(999). */
  455. bdnz LL(160)
  /* Common exit for every path: reload f14-f17 from the slots written at */
  /* entry, release the STACKSIZE bytes reserved there, and return.       */
  456. .align 4
  457. LL(999):
  458. lfd f14, 0(SP)
  459. lfd f15, 8(SP)
  460. lfd f16, 16(SP)
  461. lfd f17, 24(SP)
  462. addi SP, SP, STACKSIZE
  463. blr
  464. EPILOGUE
  /* Closes the #ifndef NEEDPARAM that wraps the whole kernel body. */
  465. #endif