You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zasum_cell.S 9.4 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Register and constant names used by the routine below. */
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define N r3 /* element count; tested against 0, then split into N / 8 and N & 7 */
  41. #define X r4 /* base address the values are loaded from */
  42. #define INCX r5 /* stride; shifted left by ZBASE_SHIFT before use */
  43. #define PREA r8 /* offset handed to dcbt (set to 8 * 16 * SIZE) */
  44. #define INCXM1 r9 /* holds INCX - SIZE once INCX has been scaled */
  45. #define FZERO f0 /* loaded with 0.0; f0 then serves as one of the accumulators */
  46. #define STACKSIZE 16 /* bytes of stack reserved on entry and released on exit */
  /*
   * Sum of absolute values over a strided vector.
   *
   * For each of N elements, two consecutive SIZE-wide values are loaded from
   * X, passed through fabs and accumulated. Judging by the file name
   * (zasum_cell.S) this is presumably the complex ?asum kernel, i.e. the two
   * values are the real and imaginary parts -- confirm against the callers.
   *
   * Inputs:  N (r3)    element count
   *          X (r4)    base address of the data
   *          INCX (r5) stride, scaled by 2^ZBASE_SHIFT below (presumably
   *                    elements -> bytes)
   *          With F_INTERFACE defined, N and INCX hold addresses and the
   *          values are loaded through them first.
   * Result:  left in f1 (presumably the floating-point return register of
   *          the target ABI). Zero when N <= 0 or the scaled INCX <= 0.
   * Scratch: r0, r8 (PREA), r9 (INCXM1), CTR, cr0, f0-f11, and STACKSIZE
   *          bytes of stack below the incoming SP.
   *
   * Four accumulators (f0-f3) are used round-robin and folded together at
   * LL(999). LFD/LFDX/LFDUX/FADD/SIZE come from common.h; the comments below
   * assume they are the plain, indexed and indexed-with-update loads and the
   * add for the configured precision -- TODO confirm.
   */
  47. PROLOGUE
  48. PROFCODE
  49. addi SP, SP, -STACKSIZE /* reserve STACKSIZE bytes of scratch stack */
  50. li r0, 0
  51. stw r0, 0(SP) /* store an all-zero word so it can be reloaded as a float */
  52. #ifdef F_INTERFACE
  53. LDINT N, 0(N) /* replace N and INCX with the values stored at the addresses they hold */
  54. LDINT INCX, 0(INCX)
  55. #endif
  56. lfs FZERO, 0(SP) /* FZERO (f0) = 0.0: first accumulator */
  57. slwi INCX, INCX, ZBASE_SHIFT /* INCX <<= ZBASE_SHIFT (presumably element stride -> byte stride) */
  58. fmr f1, FZERO /* f1, f2, f3 = 0.0: the other three accumulators */
  59. li PREA, 8 * 16 * SIZE /* offset used by the dcbt hints in the main loops */
  60. fmr f2, FZERO
  61. subi INCXM1, INCX, SIZE /* INCXM1 = INCX - SIZE, used by the strided path */
  62. cmpwi cr0, N, 0
  63. fmr f3, FZERO
  64. ble- LL(999) /* N <= 0: return the zero sum */
  65. cmpwi cr0, INCX, 0
  66. ble- LL(999) /* scaled INCX <= 0: return the zero sum */
  67. cmpwi cr0, INCX, SIZE * 2
  68. bne- cr0, LL(20) /* stride is not 2 * SIZE: take the indexed-load path */
  /* ---- Unit-stride path (INCX == 2 * SIZE): 8 elements = 16 values per pass ---- */
  69. srawi. r0, N, 3 /* r0 = N >> 3 = number of full 8-element passes */
  70. mtspr CTR, r0
  71. beq- cr0, LL(15) /* no full pass: go straight to the remainder code */
  72. .align 4
  /* Pipeline prologue: load values 0..4 and take fabs of values 0..3 so the
     adds in the loop always consume results computed earlier. */
  73. LFD f8, 0 * SIZE(X)
  74. LFD f9, 1 * SIZE(X)
  75. fabs f4, f8
  76. LFD f10, 2 * SIZE(X)
  77. fabs f5, f9
  78. LFD f11, 3 * SIZE(X)
  79. fabs f6, f10
  80. LFD f8, 4 * SIZE(X)
  81. fabs f7, f11
  82. bdz LL(13) /* CTR reaches 0: only one pass, skip the steady-state loop */
  83. .align 4
  /* Steady-state loop. Each pass adds the 16 fabs results belonging to the
     current pass while loading values 5..20, i.e. it ends in the same state
     as the prologue above, but for the next pass. */
  84. LL(12):
  85. FADD f0, f0, f4
  86. dcbt X, PREA /* cache-touch hint for address X + PREA */
  87. fabs f4, f8
  88. LFD f9, 5 * SIZE(X)
  89. FADD f1, f1, f5
  90. nop
  91. fabs f5, f9
  92. LFD f10, 6 * SIZE(X)
  93. FADD f2, f2, f6
  94. nop
  95. fabs f6, f10
  96. LFD f11, 7 * SIZE(X)
  97. FADD f3, f3, f7
  98. nop
  99. fabs f7, f11
  100. LFD f8, 8 * SIZE(X)
  101. FADD f0, f0, f4
  102. nop
  103. fabs f4, f8
  104. LFD f9, 9 * SIZE(X)
  105. FADD f1, f1, f5
  106. nop
  107. fabs f5, f9
  108. LFD f10, 10 * SIZE(X)
  109. FADD f2, f2, f6
  110. nop
  111. fabs f6, f10
  112. LFD f11, 11 * SIZE(X)
  113. FADD f3, f3, f7
  114. nop
  115. fabs f7, f11
  116. LFD f8, 12 * SIZE(X)
  117. FADD f0, f0, f4
  118. nop
  119. fabs f4, f8
  120. LFD f9, 13 * SIZE(X)
  121. FADD f1, f1, f5
  122. nop
  123. fabs f5, f9
  124. LFD f10, 14 * SIZE(X)
  125. FADD f2, f2, f6
  126. nop
  127. fabs f6, f10
  128. LFD f11, 15 * SIZE(X)
  129. FADD f3, f3, f7
  130. nop
  131. fabs f7, f11
  132. LFD f8, 16 * SIZE(X) /* values 16 and up belong to the next pass */
  133. FADD f0, f0, f4
  134. nop
  135. fabs f4, f8
  136. LFD f9, 17 * SIZE(X)
  137. FADD f1, f1, f5
  138. addi X, X, 16 * SIZE /* advance X by one pass; offsets below are relative to the new X */
  139. fabs f5, f9
  140. LFD f10, 2 * SIZE(X)
  141. FADD f2, f2, f6
  142. nop
  143. fabs f6, f10
  144. LFD f11, 3 * SIZE(X)
  145. FADD f3, f3, f7
  146. LFD f8, 4 * SIZE(X)
  147. fabs f7, f11
  148. bdnz LL(12) /* loop while CTR, after decrement, is non-zero */
  149. .align 4
  /* Pipeline drain: finish the last full pass. Only values 5..15 of this
     pass are loaded here, so nothing beyond the pass is read. */
  150. LL(13):
  151. FADD f0, f0, f4
  152. nop
  153. fabs f4, f8
  154. LFD f9, 5 * SIZE(X)
  155. FADD f1, f1, f5
  156. nop
  157. fabs f5, f9
  158. LFD f10, 6 * SIZE(X)
  159. FADD f2, f2, f6
  160. nop
  161. fabs f6, f10
  162. LFD f11, 7 * SIZE(X)
  163. FADD f3, f3, f7
  164. nop
  165. fabs f7, f11
  166. LFD f8, 8 * SIZE(X)
  167. FADD f0, f0, f4
  168. nop
  169. fabs f4, f8
  170. LFD f9, 9 * SIZE(X)
  171. FADD f1, f1, f5
  172. nop
  173. fabs f5, f9
  174. LFD f10, 10 * SIZE(X)
  175. FADD f2, f2, f6
  176. nop
  177. fabs f6, f10
  178. LFD f11, 11 * SIZE(X)
  179. FADD f3, f3, f7
  180. nop
  181. fabs f7, f11
  182. LFD f8, 12 * SIZE(X)
  183. FADD f0, f0, f4
  184. nop
  185. fabs f4, f8
  186. LFD f9, 13 * SIZE(X)
  187. FADD f1, f1, f5
  188. nop
  189. fabs f5, f9
  190. LFD f10, 14 * SIZE(X)
  191. FADD f2, f2, f6
  192. addi X, X, 16 * SIZE /* X now points just past this pass */
  193. fabs f6, f10
  194. LFD f11, -1 * SIZE(X) /* value 15 of the pass, addressed from the advanced X */
  195. FADD f3, f3, f7
  196. fabs f7, f11
  197. FADD f0, f0, f4
  198. FADD f1, f1, f5
  199. FADD f2, f2, f6
  200. FADD f3, f3, f7
  201. .align 4
  /* Unit-stride remainder: handle N & 7 elements as 4 + 2 + 1. */
  202. LL(15):
  203. andi. r0, N, 7
  204. beq LL(999) /* N is a multiple of 8: nothing left to add */
  205. andi. r0, N, 4
  206. beq LL(16)
  /* 4 elements = 8 values */
  207. LFD f8, 0 * SIZE(X)
  208. LFD f9, 1 * SIZE(X)
  209. fabs f4, f8
  210. LFD f10, 2 * SIZE(X)
  211. fabs f5, f9
  212. LFD f11, 3 * SIZE(X)
  213. fabs f6, f10
  214. LFD f8, 4 * SIZE(X)
  215. fabs f7, f11
  216. FADD f0, f0, f4
  217. nop
  218. fabs f4, f8
  219. LFD f9, 5 * SIZE(X)
  220. FADD f1, f1, f5
  221. nop
  222. fabs f5, f9
  223. LFD f10, 6 * SIZE(X)
  224. FADD f2, f2, f6
  225. addi X, X, 8 * SIZE /* step past the 8 values; the next load uses a negative offset */
  226. fabs f6, f10
  227. LFD f11, -1 * SIZE(X)
  228. FADD f3, f3, f7
  229. fabs f7, f11
  230. FADD f0, f0, f4
  231. FADD f1, f1, f5
  232. FADD f2, f2, f6
  233. FADD f3, f3, f7
  234. .align 4
  235. LL(16):
  236. andi. r0, N, 2
  237. beq LL(17)
  /* 2 elements = 4 values */
  238. LFD f8, 0 * SIZE(X)
  239. LFD f9, 1 * SIZE(X)
  240. fabs f4, f8
  241. LFD f10, 2 * SIZE(X)
  242. fabs f5, f9
  243. LFD f11, 3 * SIZE(X)
  244. fabs f6, f10
  245. addi X, X, 4 * SIZE
  246. fabs f7, f11
  247. nop
  248. FADD f0, f0, f4
  249. FADD f1, f1, f5
  250. FADD f2, f2, f6
  251. FADD f3, f3, f7
  252. .align 4
  253. LL(17):
  254. andi. r0, N, 1
  255. beq LL(999)
  /* 1 element = 2 values */
  256. LFD f8, 0 * SIZE(X)
  257. LFD f9, 1 * SIZE(X)
  258. fabs f4, f8
  259. fabs f5, f9
  260. FADD f0, f0, f4
  261. addi X, X, 2 * SIZE /* NOTE(review): X is not read again on this path */
  262. FADD f1, f1, f5
  263. b LL(999)
  264. .align 4
  /* ---- General-stride path ----
     X is biased by -INCXM1 so that each element is fetched with the pair
       LFDX  X, INCXM1  -> reads X + INCXM1 (first value of the element)
       LFDUX X, INCX    -> X += INCX, then reads X (second value)
     leaving X ready for the next element. Scheduling mirrors the
     unit-stride path: 8 elements = 16 values per pass. */
  265. LL(20):
  266. sub X, X, INCXM1
  267. srawi. r0, N, 3 /* r0 = N >> 3 = number of full 8-element passes */
  268. mtspr CTR, r0
  269. beq- cr0, LL(25) /* no full pass: go straight to the remainder code */
  /* Pipeline prologue: 5 loads, 4 fabs. */
  270. LFDX f8, X, INCXM1
  271. LFDUX f9, X, INCX
  272. fabs f4, f8
  273. LFDX f10, X, INCXM1
  274. fabs f5, f9
  275. LFDUX f11, X, INCX
  276. fabs f6, f10
  277. LFDX f8, X, INCXM1
  278. fabs f7, f11
  279. bdz LL(23) /* only one pass: skip the steady-state loop */
  280. .align 4
  /* Steady-state loop: 16 adds and 16 loads per pass. */
  281. LL(22):
  282. FADD f0, f0, f4
  283. dcbt X, PREA /* cache-touch hint for address X + PREA */
  284. fabs f4, f8
  285. LFDUX f9, X, INCX
  286. FADD f1, f1, f5
  287. nop
  288. fabs f5, f9
  289. LFDX f10, X, INCXM1
  290. FADD f2, f2, f6
  291. nop
  292. fabs f6, f10
  293. LFDUX f11, X, INCX
  294. FADD f3, f3, f7
  295. nop
  296. fabs f7, f11
  297. LFDX f8, X, INCXM1
  298. FADD f0, f0, f4
  299. nop
  300. fabs f4, f8
  301. LFDUX f9, X, INCX
  302. FADD f1, f1, f5
  303. nop
  304. fabs f5, f9
  305. LFDX f10, X, INCXM1
  306. FADD f2, f2, f6
  307. nop
  308. fabs f6, f10
  309. LFDUX f11, X, INCX
  310. FADD f3, f3, f7
  311. nop
  312. fabs f7, f11
  313. LFDX f8, X, INCXM1
  314. FADD f0, f0, f4
  315. nop
  316. fabs f4, f8
  317. LFDUX f9, X, INCX
  318. FADD f1, f1, f5
  319. nop
  320. fabs f5, f9
  321. LFDX f10, X, INCXM1
  322. FADD f2, f2, f6
  323. nop
  324. fabs f6, f10
  325. LFDUX f11, X, INCX
  326. FADD f3, f3, f7
  327. nop
  328. fabs f7, f11
  329. LFDX f8, X, INCXM1
  330. FADD f0, f0, f4
  331. nop
  332. fabs f4, f8
  333. LFDUX f9, X, INCX
  334. FADD f1, f1, f5
  335. nop
  336. fabs f5, f9
  337. LFDX f10, X, INCXM1
  338. FADD f2, f2, f6
  339. nop
  340. fabs f6, f10
  341. LFDUX f11, X, INCX
  342. FADD f3, f3, f7
  343. LFDX f8, X, INCXM1
  344. fabs f7, f11
  345. bdnz LL(22) /* loop while CTR, after decrement, is non-zero */
  346. .align 4
  /* Pipeline drain: 11 more loads complete the last full pass. */
  347. LL(23):
  348. FADD f0, f0, f4
  349. nop
  350. fabs f4, f8
  351. LFDUX f9, X, INCX
  352. FADD f1, f1, f5
  353. nop
  354. fabs f5, f9
  355. LFDX f10, X, INCXM1
  356. FADD f2, f2, f6
  357. nop
  358. fabs f6, f10
  359. LFDUX f11, X, INCX
  360. FADD f3, f3, f7
  361. nop
  362. fabs f7, f11
  363. LFDX f8, X, INCXM1
  364. FADD f0, f0, f4
  365. nop
  366. fabs f4, f8
  367. LFDUX f9, X, INCX
  368. FADD f1, f1, f5
  369. nop
  370. fabs f5, f9
  371. LFDX f10, X, INCXM1
  372. FADD f2, f2, f6
  373. nop
  374. fabs f6, f10
  375. LFDUX f11, X, INCX
  376. FADD f3, f3, f7
  377. nop
  378. fabs f7, f11
  379. LFDX f8, X, INCXM1
  380. FADD f0, f0, f4
  381. nop
  382. fabs f4, f8
  383. LFDUX f9, X, INCX
  384. FADD f1, f1, f5
  385. nop
  386. fabs f5, f9
  387. LFDX f10, X, INCXM1
  388. FADD f2, f2, f6
  389. nop
  390. fabs f6, f10
  391. LFDUX f11, X, INCX
  392. FADD f3, f3, f7
  393. fabs f7, f11
  394. FADD f0, f0, f4
  395. FADD f1, f1, f5
  396. FADD f2, f2, f6
  397. FADD f3, f3, f7
  398. .align 4
  /* General-stride remainder: handle N & 7 elements as 4 + 2 + 1. */
  399. LL(25):
  400. andi. r0, N, 7
  401. beq LL(999) /* N is a multiple of 8: nothing left to add */
  402. andi. r0, N, 4
  403. beq LL(26)
  /* 4 elements = 8 values */
  404. LFDX f8, X, INCXM1
  405. LFDUX f9, X, INCX
  406. fabs f4, f8
  407. LFDX f10, X, INCXM1
  408. fabs f5, f9
  409. LFDUX f11, X, INCX
  410. fabs f6, f10
  411. LFDX f8, X, INCXM1
  412. fabs f7, f11
  413. FADD f0, f0, f4
  414. nop
  415. fabs f4, f8
  416. LFDUX f9, X, INCX
  417. FADD f1, f1, f5
  418. nop
  419. fabs f5, f9
  420. LFDX f10, X, INCXM1
  421. FADD f2, f2, f6
  422. fabs f6, f10
  423. LFDUX f11, X, INCX
  424. FADD f3, f3, f7
  425. fabs f7, f11
  426. FADD f0, f0, f4
  427. FADD f1, f1, f5
  428. FADD f2, f2, f6
  429. FADD f3, f3, f7
  430. .align 4
  431. LL(26):
  432. andi. r0, N, 2
  433. beq LL(27)
  /* 2 elements = 4 values */
  434. LFDX f8, X, INCXM1
  435. LFDUX f9, X, INCX
  436. fabs f4, f8
  437. LFDX f10, X, INCXM1
  438. fabs f5, f9
  439. LFDUX f11, X, INCX
  440. fabs f6, f10
  441. fabs f7, f11
  442. FADD f0, f0, f4
  443. FADD f1, f1, f5
  444. FADD f2, f2, f6
  445. FADD f3, f3, f7
  446. .align 4
  447. LL(27):
  448. andi. r0, N, 1
  449. beq LL(999)
  /* 1 element = 2 values; falls through into LL(999) */
  450. LFDX f8, X, INCXM1
  451. LFDUX f9, X, INCX
  452. fabs f4, f8
  453. fabs f5, f9
  454. FADD f0, f0, f4
  455. FADD f1, f1, f5
  456. .align 4
  /* Common exit: fold the four partial sums and return. */
  457. LL(999):
  458. FADD f0, f0, f1
  459. FADD f2, f2, f3
  460. FADD f1, f0, f2 /* total = (f0 + f1) + (f2 + f3), left in f1 */
  461. addi SP, SP, STACKSIZE /* release the scratch stack reserved on entry */
  462. blr
  463. EPILOGUE