You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zrot.S 12 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define N r3
  41. #define X r4
  42. #define INCX r5
  43. #define Y r6
  44. #define INCY r7
  45. #define PREA r8
  46. #define XX r9
  47. #define YY r10
  48. #define INCXM1 r11
  49. #define INCYM1 r12
  50. #define C f1
  51. #define S f2
  52. #define STACKSIZE 32
  53. #ifndef NEEDPARAM
  54. PROLOGUE
  55. PROFCODE
  56. addi SP, SP, -STACKSIZE
  57. li r0, 0
  58. stfd f14, 0(SP)
  59. stfd f15, 8(SP)
  60. stfd f16, 16(SP)
  61. stfd f17, 24(SP)
  62. slwi INCX, INCX, ZBASE_SHIFT
  63. slwi INCY, INCY, ZBASE_SHIFT
  64. subi INCXM1, INCX, SIZE
  65. subi INCYM1, INCY, SIZE
  66. li PREA, L1_PREFETCHSIZE
  67. cmpwi cr0, N, 0
  68. ble- LL(999)
  69. cmpwi cr0, INCX, 2 * SIZE
  70. bne- cr0, LL(100)
  71. cmpwi cr0, INCY, 2 * SIZE
  72. bne- cr0, LL(100)
  73. srawi. r0, N, 3
  74. mtspr CTR, r0
  75. beq- cr0, LL(50)
  76. LFD f0, 0 * SIZE(X)
  77. LFD f4, 1 * SIZE(X)
  78. LFD f6, 2 * SIZE(X)
  79. LFD f8, 3 * SIZE(X)
  80. LFD f3, 0 * SIZE(Y)
  81. LFD f5, 1 * SIZE(Y)
  82. LFD f7, 2 * SIZE(Y)
  83. LFD f9, 3 * SIZE(Y)
  84. bdz LL(12)
  85. .align 4
  86. LL(10):
  87. FMUL f10, C, f0
  88. FMUL f11, C, f3
  89. FMUL f12, C, f4
  90. FMUL f13, C, f5
  91. FMUL f14, C, f6
  92. FMUL f15, C, f7
  93. FMUL f16, C, f8
  94. FMUL f17, C, f9
  95. FMADD f10, S, f3, f10
  96. FNMSUB f11, S, f0, f11
  97. FMADD f12, S, f5, f12
  98. FNMSUB f13, S, f4, f13
  99. FMADD f14, S, f7, f14
  100. FNMSUB f15, S, f6, f15
  101. FMADD f16, S, f9, f16
  102. FNMSUB f17, S, f8, f17
  103. LFD f0, 4 * SIZE(X)
  104. LFD f4, 5 * SIZE(X)
  105. LFD f6, 6 * SIZE(X)
  106. LFD f8, 7 * SIZE(X)
  107. LFD f3, 4 * SIZE(Y)
  108. LFD f5, 5 * SIZE(Y)
  109. LFD f7, 6 * SIZE(Y)
  110. LFD f9, 7 * SIZE(Y)
  111. STFD f10, 0 * SIZE(X)
  112. STFD f12, 1 * SIZE(X)
  113. STFD f14, 2 * SIZE(X)
  114. STFD f16, 3 * SIZE(X)
  115. STFD f11, 0 * SIZE(Y)
  116. STFD f13, 1 * SIZE(Y)
  117. STFD f15, 2 * SIZE(Y)
  118. STFD f17, 3 * SIZE(Y)
  119. FMUL f10, C, f0
  120. FMUL f11, C, f3
  121. FMUL f12, C, f4
  122. FMUL f13, C, f5
  123. FMUL f14, C, f6
  124. FMUL f15, C, f7
  125. FMUL f16, C, f8
  126. FMUL f17, C, f9
  127. FMADD f10, S, f3, f10
  128. FNMSUB f11, S, f0, f11
  129. FMADD f12, S, f5, f12
  130. FNMSUB f13, S, f4, f13
  131. FMADD f14, S, f7, f14
  132. FNMSUB f15, S, f6, f15
  133. FMADD f16, S, f9, f16
  134. FNMSUB f17, S, f8, f17
  135. LFD f0, 8 * SIZE(X)
  136. LFD f4, 9 * SIZE(X)
  137. LFD f6, 10 * SIZE(X)
  138. LFD f8, 11 * SIZE(X)
  139. LFD f3, 8 * SIZE(Y)
  140. LFD f5, 9 * SIZE(Y)
  141. LFD f7, 10 * SIZE(Y)
  142. LFD f9, 11 * SIZE(Y)
  143. STFD f10, 4 * SIZE(X)
  144. STFD f12, 5 * SIZE(X)
  145. STFD f14, 6 * SIZE(X)
  146. STFD f16, 7 * SIZE(X)
  147. STFD f11, 4 * SIZE(Y)
  148. STFD f13, 5 * SIZE(Y)
  149. STFD f15, 6 * SIZE(Y)
  150. STFD f17, 7 * SIZE(Y)
  151. FMUL f10, C, f0
  152. FMUL f11, C, f3
  153. FMUL f12, C, f4
  154. FMUL f13, C, f5
  155. FMUL f14, C, f6
  156. FMUL f15, C, f7
  157. FMUL f16, C, f8
  158. FMUL f17, C, f9
  159. FMADD f10, S, f3, f10
  160. FNMSUB f11, S, f0, f11
  161. FMADD f12, S, f5, f12
  162. FNMSUB f13, S, f4, f13
  163. FMADD f14, S, f7, f14
  164. FNMSUB f15, S, f6, f15
  165. FMADD f16, S, f9, f16
  166. FNMSUB f17, S, f8, f17
  167. LFD f0, 12 * SIZE(X)
  168. LFD f4, 13 * SIZE(X)
  169. LFD f6, 14 * SIZE(X)
  170. LFD f8, 15 * SIZE(X)
  171. LFD f3, 12 * SIZE(Y)
  172. LFD f5, 13 * SIZE(Y)
  173. LFD f7, 14 * SIZE(Y)
  174. LFD f9, 15 * SIZE(Y)
  175. STFD f10, 8 * SIZE(X)
  176. STFD f12, 9 * SIZE(X)
  177. STFD f14, 10 * SIZE(X)
  178. STFD f16, 11 * SIZE(X)
  179. STFD f11, 8 * SIZE(Y)
  180. STFD f13, 9 * SIZE(Y)
  181. STFD f15, 10 * SIZE(Y)
  182. STFD f17, 11 * SIZE(Y)
  183. FMUL f10, C, f0
  184. FMUL f11, C, f3
  185. FMUL f12, C, f4
  186. FMUL f13, C, f5
  187. FMUL f14, C, f6
  188. FMUL f15, C, f7
  189. FMUL f16, C, f8
  190. FMUL f17, C, f9
  191. FMADD f10, S, f3, f10
  192. FNMSUB f11, S, f0, f11
  193. FMADD f12, S, f5, f12
  194. FNMSUB f13, S, f4, f13
  195. FMADD f14, S, f7, f14
  196. FNMSUB f15, S, f6, f15
  197. FMADD f16, S, f9, f16
  198. FNMSUB f17, S, f8, f17
  199. LFD f0, 16 * SIZE(X)
  200. LFD f4, 17 * SIZE(X)
  201. LFD f6, 18 * SIZE(X)
  202. LFD f8, 19 * SIZE(X)
  203. LFD f3, 16 * SIZE(Y)
  204. LFD f5, 17 * SIZE(Y)
  205. LFD f7, 18 * SIZE(Y)
  206. LFD f9, 19 * SIZE(Y)
  207. STFD f10, 12 * SIZE(X)
  208. STFD f12, 13 * SIZE(X)
  209. STFD f14, 14 * SIZE(X)
  210. STFD f16, 15 * SIZE(X)
  211. STFD f11, 12 * SIZE(Y)
  212. STFD f13, 13 * SIZE(Y)
  213. STFD f15, 14 * SIZE(Y)
  214. STFD f17, 15 * SIZE(Y)
  215. #ifndef POWER6
  216. dcbtst X, PREA
  217. #endif
  218. addi X, X, 16 * SIZE
  219. addi Y, Y, 16 * SIZE
  220. #ifdef POWER6
  221. dcbtst X, PREA
  222. dcbtst X, PREA
  223. #endif
  224. bdnz LL(10)
  225. .align 4
  226. LL(12):
  227. FMUL f10, C, f0
  228. FMUL f11, C, f3
  229. FMUL f12, C, f4
  230. FMUL f13, C, f5
  231. FMUL f14, C, f6
  232. FMUL f15, C, f7
  233. FMUL f16, C, f8
  234. FMUL f17, C, f9
  235. FMADD f10, S, f3, f10
  236. FNMSUB f11, S, f0, f11
  237. FMADD f12, S, f5, f12
  238. FNMSUB f13, S, f4, f13
  239. FMADD f14, S, f7, f14
  240. FNMSUB f15, S, f6, f15
  241. FMADD f16, S, f9, f16
  242. FNMSUB f17, S, f8, f17
  243. STFD f10, 0 * SIZE(X)
  244. STFD f12, 1 * SIZE(X)
  245. STFD f14, 2 * SIZE(X)
  246. STFD f16, 3 * SIZE(X)
  247. STFD f11, 0 * SIZE(Y)
  248. STFD f13, 1 * SIZE(Y)
  249. STFD f15, 2 * SIZE(Y)
  250. STFD f17, 3 * SIZE(Y)
  251. LFD f0, 4 * SIZE(X)
  252. LFD f4, 5 * SIZE(X)
  253. LFD f6, 6 * SIZE(X)
  254. LFD f8, 7 * SIZE(X)
  255. LFD f3, 4 * SIZE(Y)
  256. LFD f5, 5 * SIZE(Y)
  257. LFD f7, 6 * SIZE(Y)
  258. LFD f9, 7 * SIZE(Y)
  259. FMUL f10, C, f0
  260. FMUL f11, C, f3
  261. FMUL f12, C, f4
  262. FMUL f13, C, f5
  263. FMUL f14, C, f6
  264. FMUL f15, C, f7
  265. FMUL f16, C, f8
  266. FMUL f17, C, f9
  267. FMADD f10, S, f3, f10
  268. FNMSUB f11, S, f0, f11
  269. FMADD f12, S, f5, f12
  270. FNMSUB f13, S, f4, f13
  271. FMADD f14, S, f7, f14
  272. FNMSUB f15, S, f6, f15
  273. FMADD f16, S, f9, f16
  274. FNMSUB f17, S, f8, f17
  275. STFD f10, 4 * SIZE(X)
  276. STFD f12, 5 * SIZE(X)
  277. STFD f14, 6 * SIZE(X)
  278. STFD f16, 7 * SIZE(X)
  279. STFD f11, 4 * SIZE(Y)
  280. STFD f13, 5 * SIZE(Y)
  281. STFD f15, 6 * SIZE(Y)
  282. STFD f17, 7 * SIZE(Y)
  283. LFD f0, 8 * SIZE(X)
  284. LFD f4, 9 * SIZE(X)
  285. LFD f6, 10 * SIZE(X)
  286. LFD f8, 11 * SIZE(X)
  287. LFD f3, 8 * SIZE(Y)
  288. LFD f5, 9 * SIZE(Y)
  289. LFD f7, 10 * SIZE(Y)
  290. LFD f9, 11 * SIZE(Y)
  291. FMUL f10, C, f0
  292. FMUL f11, C, f3
  293. FMUL f12, C, f4
  294. FMUL f13, C, f5
  295. FMUL f14, C, f6
  296. FMUL f15, C, f7
  297. FMUL f16, C, f8
  298. FMUL f17, C, f9
  299. FMADD f10, S, f3, f10
  300. FNMSUB f11, S, f0, f11
  301. FMADD f12, S, f5, f12
  302. FNMSUB f13, S, f4, f13
  303. FMADD f14, S, f7, f14
  304. FNMSUB f15, S, f6, f15
  305. FMADD f16, S, f9, f16
  306. FNMSUB f17, S, f8, f17
  307. STFD f10, 8 * SIZE(X)
  308. STFD f12, 9 * SIZE(X)
  309. STFD f14, 10 * SIZE(X)
  310. STFD f16, 11 * SIZE(X)
  311. STFD f11, 8 * SIZE(Y)
  312. STFD f13, 9 * SIZE(Y)
  313. STFD f15, 10 * SIZE(Y)
  314. STFD f17, 11 * SIZE(Y)
  315. LFD f0, 12 * SIZE(X)
  316. LFD f4, 13 * SIZE(X)
  317. LFD f6, 14 * SIZE(X)
  318. LFD f8, 15 * SIZE(X)
  319. LFD f3, 12 * SIZE(Y)
  320. LFD f5, 13 * SIZE(Y)
  321. LFD f7, 14 * SIZE(Y)
  322. LFD f9, 15 * SIZE(Y)
  323. FMUL f10, C, f0
  324. FMUL f11, C, f3
  325. FMUL f12, C, f4
  326. FMUL f13, C, f5
  327. FMUL f14, C, f6
  328. FMUL f15, C, f7
  329. FMUL f16, C, f8
  330. FMUL f17, C, f9
  331. FMADD f10, S, f3, f10
  332. FNMSUB f11, S, f0, f11
  333. FMADD f12, S, f5, f12
  334. FNMSUB f13, S, f4, f13
  335. FMADD f14, S, f7, f14
  336. FNMSUB f15, S, f6, f15
  337. FMADD f16, S, f9, f16
  338. FNMSUB f17, S, f8, f17
  339. STFD f10, 12 * SIZE(X)
  340. STFD f12, 13 * SIZE(X)
  341. STFD f14, 14 * SIZE(X)
  342. STFD f16, 15 * SIZE(X)
  343. STFD f11, 12 * SIZE(Y)
  344. STFD f13, 13 * SIZE(Y)
  345. STFD f15, 14 * SIZE(Y)
  346. STFD f17, 15 * SIZE(Y)
  347. addi X, X, 16 * SIZE
  348. addi Y, Y, 16 * SIZE
  349. .align 4
  350. LL(50):
  351. andi. r0, N, 7
  352. mtspr CTR, r0
  353. beq LL(999)
  354. .align 4
  355. LL(60):
  356. LFD f3, 0 * SIZE(X)
  357. LFD f4, 0 * SIZE(Y)
  358. LFD f5, 1 * SIZE(X)
  359. LFD f6, 1 * SIZE(Y)
  360. FMUL f10, C, f3
  361. FMUL f11, C, f4
  362. FMUL f12, C, f5
  363. FMUL f13, C, f6
  364. FMADD f10, S, f4, f10
  365. FNMSUB f11, S, f3, f11
  366. FMADD f12, S, f6, f12
  367. FNMSUB f13, S, f5, f13
  368. STFD f10, 0 * SIZE(X)
  369. STFD f11, 0 * SIZE(Y)
  370. STFD f12, 1 * SIZE(X)
  371. STFD f13, 1 * SIZE(Y)
  372. addi X, X, 2 * SIZE
  373. addi Y, Y, 2 * SIZE
  374. bdnz LL(60)
  375. b LL(999)
  376. .align 4
  377. LL(100):
  378. sub X, X, INCXM1
  379. sub Y, Y, INCYM1
  380. mr XX, X
  381. mr YY, Y
  382. srawi. r0, N, 2
  383. mtspr CTR, r0
  384. beq- LL(150)
  385. .align 4
  386. LL(110):
  387. LFDX f0, X, INCXM1
  388. LFDX f3, Y, INCYM1
  389. LFDUX f4, X, INCX
  390. LFDUX f5, Y, INCY
  391. LFDX f6, X, INCXM1
  392. LFDX f7, Y, INCYM1
  393. LFDUX f8, X, INCX
  394. LFDUX f9, Y, INCY
  395. FMUL f10, C, f0
  396. FMUL f11, C, f3
  397. FMUL f12, C, f4
  398. FMUL f13, C, f5
  399. FMUL f14, C, f6
  400. FMUL f15, C, f7
  401. FMUL f16, C, f8
  402. FMUL f17, C, f9
  403. FMADD f10, S, f3, f10
  404. FNMSUB f11, S, f0, f11
  405. FMADD f12, S, f5, f12
  406. FNMSUB f13, S, f4, f13
  407. FMADD f14, S, f7, f14
  408. FNMSUB f15, S, f6, f15
  409. FMADD f16, S, f9, f16
  410. FNMSUB f17, S, f8, f17
  411. STFDX f10, XX, INCXM1
  412. STFDX f11, YY, INCYM1
  413. STFDUX f12, XX, INCX
  414. STFDUX f13, YY, INCY
  415. STFDX f14, XX, INCXM1
  416. STFDX f15, YY, INCYM1
  417. STFDUX f16, XX, INCX
  418. STFDUX f17, YY, INCY
  419. LFDX f0, X, INCXM1
  420. LFDX f3, Y, INCYM1
  421. LFDUX f4, X, INCX
  422. LFDUX f5, Y, INCY
  423. LFDX f6, X, INCXM1
  424. LFDX f7, Y, INCYM1
  425. LFDUX f8, X, INCX
  426. LFDUX f9, Y, INCY
  427. FMUL f10, C, f0
  428. FMUL f11, C, f3
  429. FMUL f12, C, f4
  430. FMUL f13, C, f5
  431. FMUL f14, C, f6
  432. FMUL f15, C, f7
  433. FMUL f16, C, f8
  434. FMUL f17, C, f9
  435. FMADD f10, S, f3, f10
  436. FNMSUB f11, S, f0, f11
  437. FMADD f12, S, f5, f12
  438. FNMSUB f13, S, f4, f13
  439. FMADD f14, S, f7, f14
  440. FNMSUB f15, S, f6, f15
  441. FMADD f16, S, f9, f16
  442. FNMSUB f17, S, f8, f17
  443. STFDX f10, XX, INCXM1
  444. STFDX f11, YY, INCYM1
  445. STFDUX f12, XX, INCX
  446. STFDUX f13, YY, INCY
  447. STFDX f14, XX, INCXM1
  448. STFDX f15, YY, INCYM1
  449. STFDUX f16, XX, INCX
  450. STFDUX f17, YY, INCY
  451. bdnz LL(110)
  452. .align 4
  453. LL(150):
  454. andi. r0, N, 3
  455. mtspr CTR, r0
  456. beq LL(999)
  457. .align 4
  458. LL(160):
  459. LFDX f0, X, INCXM1
  460. LFDX f3, Y, INCYM1
  461. LFDUX f4, X, INCX
  462. LFDUX f5, Y, INCY
  463. FMUL f10, C, f0
  464. FMUL f11, C, f3
  465. FMUL f12, C, f4
  466. FMUL f13, C, f5
  467. FMADD f10, S, f3, f10
  468. FNMSUB f11, S, f0, f11
  469. FMADD f12, S, f5, f12
  470. FNMSUB f13, S, f4, f13
  471. STFDX f10, XX, INCXM1
  472. STFDX f11, YY, INCYM1
  473. STFDUX f12, XX, INCX
  474. STFDUX f13, YY, INCY
  475. bdnz LL(160)
  476. .align 4
  477. LL(999):
  478. lfd f14, 0(SP)
  479. lfd f15, 8(SP)
  480. lfd f16, 16(SP)
  481. lfd f17, 24(SP)
  482. addi SP, SP, STACKSIZE
  483. blr
  484. EPILOGUE
  485. #endif