You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

asum_cell.S 9.6 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /*
   * Register aliases and scratch-area size used by the routine below.
   *   N         - element count (r3)
   *   X         - address of the current element (r4)
   *   INCX      - stride (r5); shifted left by BASE_SHIFT before use
   *   PREA      - offset from X that is handed to dcbt (r8)
   *   FZERO     - f0; loaded with 0.0 and then used as the first accumulator
   *   STACKSIZE - number of bytes the stack pointer is lowered by
   */
  40. #define N r3
  41. #define X r4
  42. #define INCX r5
  43. #define PREA r8
  44. #define FZERO f0
  45. #define STACKSIZE 16
  /*
   * Sum of absolute values: accumulates fabs() of N elements read from X
   * with stride INCX and leaves the total in f1.
   *
   * Inputs (see the register aliases defined above this routine):
   *   N    (r3) - element count; N <= 0 yields zero.
   *   X    (r4) - address of the first element.
   *   INCX (r5) - stride. After the shift by BASE_SHIFT, a value <= 0
   *               yields zero and a value equal to SIZE selects the
   *               contiguous path; anything else takes the strided path.
   * When F_INTERFACE is defined, N and INCX arrive as addresses and are
   * fetched through LDINT first.
   *
   * Registers written: r0, r8 (PREA), CTR, cr0, f0-f11.
   *   f0-f3  four independent partial sums
   *   f4-f7  absolute values waiting to be added
   *   f8-f11 raw loaded elements
   *
   * PROLOGUE, PROFCODE, EPILOGUE, LL, LFD, LFDUX, FADD, LDINT, SIZE and
   * BASE_SHIFT are not defined in this file (presumably they come from
   * common.h).
   * NOTE(review): LFD / FADD presumably expand to the single- or
   * double-precision instruction depending on the build - confirm there.
   */
  46. PROLOGUE
  47. PROFCODE
  /* Lower SP by STACKSIZE and store an integer zero word at 0(SP); it is
     reloaded further down as a single-precision float to obtain 0.0. */
  48. addi SP, SP, -STACKSIZE
  49. li r0, 0
  50. stw r0, 0(SP)
  51. #ifdef F_INTERFACE
  /* N and INCX hold addresses here; replace them with the values they
     point to. */
  52. LDINT N, 0(N)
  53. LDINT INCX, 0(INCX)
  54. #endif
  /* FZERO (f0) = 0.0, read back from the zero word stored above. */
  55. lfs FZERO, 0(SP)
  /* Shift INCX left by BASE_SHIFT (presumably elements -> bytes; the
     compare against SIZE below depends on it), clear the remaining
     accumulators f1-f3, and set PREA, the offset later passed to dcbt. */
  56. slwi INCX, INCX, BASE_SHIFT
  57. fmr f1, FZERO
  58. li PREA, 8 * 16 * SIZE
  59. fmr f2, FZERO
  /* Early exits: N <= 0 or scaled INCX <= 0 jump to LL(999). f3 is
     cleared before the first branch, so all four accumulators are zero
     and the result is zero. */
  60. cmpwi cr0, N, 0
  61. fmr f3, FZERO
  62. ble- LL(999)
  63. cmpwi cr0, INCX, 0
  64. ble- LL(999)
  /* Scaled stride different from SIZE -> strided path at LL(20). */
  65. cmpwi cr0, INCX, SIZE
  66. bne- cr0, LL(20)
  /* Contiguous path. CTR = N >> 4 = number of full 16-element blocks;
     if there are none, go directly to the remainder code at LL(15). */
  67. srawi. r0, N, 4
  68. mtspr CTR, r0
  69. beq- cr0, LL(15)
  70. .align 4
  /* Pipeline prologue: load elements 0-4 and take fabs of elements 0-3.
     bdz decrements CTR; when this is the only full block, skip the
     steady-state loop and fall into the drain at LL(13). */
  71. LFD f8, 0 * SIZE(X)
  72. LFD f9, 1 * SIZE(X)
  73. fabs f4, f8
  74. LFD f10, 2 * SIZE(X)
  75. fabs f5, f9
  76. LFD f11, 3 * SIZE(X)
  77. fabs f6, f10
  78. LFD f8, 4 * SIZE(X)
  79. fabs f7, f11
  80. bdz LL(13)
  81. .align 4
  /* Steady state, one 16-element block per iteration. The 16 FADDs rotate
     through f0-f3 while the fabs and load for later elements are
     interleaved between them. On entry elements 0-4 of the block are
     already loaded, so the loop loads elements 5-20; the last five of
     those belong to the next block. dcbt is a cache-touch hint for the
     address X + PREA.
     NOTE(review): the nops presumably pad issue slots for the target
     core and the ordering looks hand-scheduled - measure before
     reordering or removing anything. */
  82. LL(12):
  83. FADD f0, f0, f4
  84. dcbt X, PREA
  85. fabs f4, f8
  86. LFD f9, 5 * SIZE(X)
  87. FADD f1, f1, f5
  88. nop
  89. fabs f5, f9
  90. LFD f10, 6 * SIZE(X)
  91. FADD f2, f2, f6
  92. nop
  93. fabs f6, f10
  94. LFD f11, 7 * SIZE(X)
  95. FADD f3, f3, f7
  96. nop
  97. fabs f7, f11
  98. LFD f8, 8 * SIZE(X)
  99. FADD f0, f0, f4
  100. nop
  101. fabs f4, f8
  102. LFD f9, 9 * SIZE(X)
  103. FADD f1, f1, f5
  104. nop
  105. fabs f5, f9
  106. LFD f10, 10 * SIZE(X)
  107. FADD f2, f2, f6
  108. nop
  109. fabs f6, f10
  110. LFD f11, 11 * SIZE(X)
  111. FADD f3, f3, f7
  112. nop
  113. fabs f7, f11
  114. LFD f8, 12 * SIZE(X)
  115. FADD f0, f0, f4
  116. nop
  117. fabs f4, f8
  118. LFD f9, 13 * SIZE(X)
  119. FADD f1, f1, f5
  120. nop
  121. fabs f5, f9
  122. LFD f10, 14 * SIZE(X)
  123. FADD f2, f2, f6
  124. nop
  125. fabs f6, f10
  126. LFD f11, 15 * SIZE(X)
  127. FADD f3, f3, f7
  128. nop
  129. fabs f7, f11
  /* From here on the loads fetch the first elements of the next block. */
  130. LFD f8, 16 * SIZE(X)
  131. FADD f0, f0, f4
  132. nop
  133. fabs f4, f8
  134. LFD f9, 17 * SIZE(X)
  135. FADD f1, f1, f5
  /* X now points at the next block; the offsets 2, 3 and 4 below are
     relative to the advanced X. */
  136. addi X, X, 16 * SIZE
  137. fabs f5, f9
  138. LFD f10, 2 * SIZE(X)
  139. FADD f2, f2, f6
  140. nop
  141. fabs f6, f10
  142. LFD f11, 3 * SIZE(X)
  143. FADD f3, f3, f7
  144. LFD f8, 4 * SIZE(X)
  145. fabs f7, f11
  146. bdnz LL(12)
  147. .align 4
  /* Drain for the last full block: elements 0-4 are already loaded, so
     only elements 5-15 are fetched (element 15 as -1 * SIZE after X has
     been advanced past the block); the four pending absolute values are
     added at the end. */
  148. LL(13):
  149. FADD f0, f0, f4
  150. nop
  151. fabs f4, f8
  152. LFD f9, 5 * SIZE(X)
  153. FADD f1, f1, f5
  154. nop
  155. fabs f5, f9
  156. LFD f10, 6 * SIZE(X)
  157. FADD f2, f2, f6
  158. nop
  159. fabs f6, f10
  160. LFD f11, 7 * SIZE(X)
  161. FADD f3, f3, f7
  162. nop
  163. fabs f7, f11
  164. LFD f8, 8 * SIZE(X)
  165. FADD f0, f0, f4
  166. nop
  167. fabs f4, f8
  168. LFD f9, 9 * SIZE(X)
  169. FADD f1, f1, f5
  170. nop
  171. fabs f5, f9
  172. LFD f10, 10 * SIZE(X)
  173. FADD f2, f2, f6
  174. nop
  175. fabs f6, f10
  176. LFD f11, 11 * SIZE(X)
  177. FADD f3, f3, f7
  178. nop
  179. fabs f7, f11
  180. LFD f8, 12 * SIZE(X)
  181. FADD f0, f0, f4
  182. nop
  183. fabs f4, f8
  184. LFD f9, 13 * SIZE(X)
  185. FADD f1, f1, f5
  186. nop
  187. fabs f5, f9
  188. LFD f10, 14 * SIZE(X)
  189. FADD f2, f2, f6
  190. addi X, X, 16 * SIZE
  191. fabs f6, f10
  192. LFD f11, -1 * SIZE(X)
  193. FADD f3, f3, f7
  194. fabs f7, f11
  195. FADD f0, f0, f4
  196. FADD f1, f1, f5
  197. FADD f2, f2, f6
  198. FADD f3, f3, f7
  199. .align 4
  /* Contiguous remainder: the low four bits of N are handled as optional
     groups of 8, 4, 2 and 1 elements. Nothing left -> LL(999). */
  200. LL(15):
  201. andi. r0, N, 15
  202. beq LL(999)
  /* Group of 8 (bit 3 of N). */
  203. andi. r0, N, 8
  204. beq LL(16)
  205. LFD f8, 0 * SIZE(X)
  206. LFD f9, 1 * SIZE(X)
  207. fabs f4, f8
  208. LFD f10, 2 * SIZE(X)
  209. fabs f5, f9
  210. LFD f11, 3 * SIZE(X)
  211. fabs f6, f10
  212. LFD f8, 4 * SIZE(X)
  213. fabs f7, f11
  214. FADD f0, f0, f4
  215. nop
  216. fabs f4, f8
  217. LFD f9, 5 * SIZE(X)
  218. FADD f1, f1, f5
  219. nop
  220. fabs f5, f9
  221. LFD f10, 6 * SIZE(X)
  222. FADD f2, f2, f6
  223. addi X, X, 8 * SIZE
  224. fabs f6, f10
  225. LFD f11, -1 * SIZE(X)
  226. FADD f3, f3, f7
  227. fabs f7, f11
  228. FADD f0, f0, f4
  229. FADD f1, f1, f5
  230. FADD f2, f2, f6
  231. FADD f3, f3, f7
  232. .align 4
  /* Group of 4 (bit 2 of N). */
  233. LL(16):
  234. andi. r0, N, 4
  235. beq LL(17)
  236. LFD f8, 0 * SIZE(X)
  237. LFD f9, 1 * SIZE(X)
  238. fabs f4, f8
  239. LFD f10, 2 * SIZE(X)
  240. fabs f5, f9
  241. LFD f11, 3 * SIZE(X)
  242. fabs f6, f10
  243. addi X, X, 4 * SIZE
  244. fabs f7, f11
  245. nop
  246. FADD f0, f0, f4
  247. FADD f1, f1, f5
  248. FADD f2, f2, f6
  249. FADD f3, f3, f7
  250. .align 4
  /* Group of 2 (bit 1 of N). */
  251. LL(17):
  252. andi. r0, N, 2
  253. beq LL(18)
  254. LFD f8, 0 * SIZE(X)
  255. LFD f9, 1 * SIZE(X)
  256. fabs f4, f8
  257. fabs f5, f9
  258. FADD f0, f0, f4
  259. addi X, X, 2 * SIZE
  260. FADD f1, f1, f5
  261. nop
  262. .align 4
  /* Final single element (bit 0 of N), then jump to the common exit. */
  263. LL(18):
  264. andi. r0, N, 1
  265. beq LL(999)
  266. LFD f8, 0 * SIZE(X)
  267. fabs f4, f8
  268. FADD f0, f0, f4
  269. b LL(999)
  270. .align 4
  /* Strided path (scaled INCX > 0 and != SIZE). X is moved back by INCX
     first because every access below goes through LFDUX.
     NOTE(review): LFDUX is defined outside this file; presumably an
     update-form indexed load that adds INCX to X before loading, which is
     what the pre-decrement implies - confirm in common.h.
     The structure mirrors the contiguous path: CTR = N >> 4 full blocks,
     pipeline prologue, steady-state loop, drain, remainder. */
  271. LL(20):
  272. sub X, X, INCX
  273. srawi. r0, N, 4
  274. mtspr CTR, r0
  275. beq- cr0, LL(25)
  276. .align 4
  /* Pipeline prologue: five loads, fabs of the first four. */
  277. LFDUX f8, X, INCX
  278. LFDUX f9, X, INCX
  279. fabs f4, f8
  280. LFDUX f10, X, INCX
  281. fabs f5, f9
  282. LFDUX f11, X, INCX
  283. fabs f6, f10
  284. LFDUX f8, X, INCX
  285. fabs f7, f11
  286. bdz LL(23)
  287. .align 4
  /* Steady state: 16 FADDs and 16 LFDUX loads per iteration, with the
     loads again running five elements ahead of the adds. dcbt uses the
     same fixed PREA offset as the contiguous loop; it is not scaled by
     the stride. */
  288. LL(22):
  289. FADD f0, f0, f4
  290. dcbt X, PREA
  291. fabs f4, f8
  292. LFDUX f9, X, INCX
  293. FADD f1, f1, f5
  294. nop
  295. fabs f5, f9
  296. LFDUX f10, X, INCX
  297. FADD f2, f2, f6
  298. nop
  299. fabs f6, f10
  300. LFDUX f11, X, INCX
  301. FADD f3, f3, f7
  302. nop
  303. fabs f7, f11
  304. LFDUX f8, X, INCX
  305. FADD f0, f0, f4
  306. nop
  307. fabs f4, f8
  308. LFDUX f9, X, INCX
  309. FADD f1, f1, f5
  310. nop
  311. fabs f5, f9
  312. LFDUX f10, X, INCX
  313. FADD f2, f2, f6
  314. nop
  315. fabs f6, f10
  316. LFDUX f11, X, INCX
  317. FADD f3, f3, f7
  318. nop
  319. fabs f7, f11
  320. LFDUX f8, X, INCX
  321. FADD f0, f0, f4
  322. nop
  323. fabs f4, f8
  324. LFDUX f9, X, INCX
  325. FADD f1, f1, f5
  326. nop
  327. fabs f5, f9
  328. LFDUX f10, X, INCX
  329. FADD f2, f2, f6
  330. nop
  331. fabs f6, f10
  332. LFDUX f11, X, INCX
  333. FADD f3, f3, f7
  334. nop
  335. fabs f7, f11
  336. LFDUX f8, X, INCX
  337. FADD f0, f0, f4
  338. nop
  339. fabs f4, f8
  340. LFDUX f9, X, INCX
  341. FADD f1, f1, f5
  342. nop
  343. fabs f5, f9
  344. LFDUX f10, X, INCX
  345. FADD f2, f2, f6
  346. nop
  347. fabs f6, f10
  348. LFDUX f11, X, INCX
  349. FADD f3, f3, f7
  350. LFDUX f8, X, INCX
  351. fabs f7, f11
  352. bdnz LL(22)
  353. .align 4
  /* Drain for the last full block: the remaining 11 loads, followed by
     the four pending adds. */
  354. LL(23):
  355. FADD f0, f0, f4
  356. nop
  357. fabs f4, f8
  358. LFDUX f9, X, INCX
  359. FADD f1, f1, f5
  360. nop
  361. fabs f5, f9
  362. LFDUX f10, X, INCX
  363. FADD f2, f2, f6
  364. nop
  365. fabs f6, f10
  366. LFDUX f11, X, INCX
  367. FADD f3, f3, f7
  368. nop
  369. fabs f7, f11
  370. LFDUX f8, X, INCX
  371. FADD f0, f0, f4
  372. nop
  373. fabs f4, f8
  374. LFDUX f9, X, INCX
  375. FADD f1, f1, f5
  376. nop
  377. fabs f5, f9
  378. LFDUX f10, X, INCX
  379. FADD f2, f2, f6
  380. nop
  381. fabs f6, f10
  382. LFDUX f11, X, INCX
  383. FADD f3, f3, f7
  384. nop
  385. fabs f7, f11
  386. LFDUX f8, X, INCX
  387. FADD f0, f0, f4
  388. nop
  389. fabs f4, f8
  390. LFDUX f9, X, INCX
  391. FADD f1, f1, f5
  392. nop
  393. fabs f5, f9
  394. LFDUX f10, X, INCX
  395. FADD f2, f2, f6
  396. nop
  397. fabs f6, f10
  398. LFDUX f11, X, INCX
  399. FADD f3, f3, f7
  400. fabs f7, f11
  401. FADD f0, f0, f4
  402. FADD f1, f1, f5
  403. FADD f2, f2, f6
  404. FADD f3, f3, f7
  405. .align 4
  /* Strided remainder: optional groups of 8, 4, 2 and 1 elements selected
     by the low four bits of N. Nothing left -> LL(999). */
  406. LL(25):
  407. andi. r0, N, 15
  408. beq LL(999)
  /* Group of 8 (bit 3 of N). */
  409. andi. r0, N, 8
  410. beq LL(26)
  411. LFDUX f8, X, INCX
  412. LFDUX f9, X, INCX
  413. fabs f4, f8
  414. LFDUX f10, X, INCX
  415. fabs f5, f9
  416. LFDUX f11, X, INCX
  417. fabs f6, f10
  418. LFDUX f8, X, INCX
  419. fabs f7, f11
  420. FADD f0, f0, f4
  421. nop
  422. fabs f4, f8
  423. LFDUX f9, X, INCX
  424. FADD f1, f1, f5
  425. nop
  426. fabs f5, f9
  427. LFDUX f10, X, INCX
  428. FADD f2, f2, f6
  429. fabs f6, f10
  430. LFDUX f11, X, INCX
  431. FADD f3, f3, f7
  432. fabs f7, f11
  433. FADD f0, f0, f4
  434. FADD f1, f1, f5
  435. FADD f2, f2, f6
  436. FADD f3, f3, f7
  437. .align 4
  /* Group of 4 (bit 2 of N). */
  438. LL(26):
  439. andi. r0, N, 4
  440. beq LL(27)
  441. LFDUX f8, X, INCX
  442. LFDUX f9, X, INCX
  443. fabs f4, f8
  444. LFDUX f10, X, INCX
  445. fabs f5, f9
  446. LFDUX f11, X, INCX
  447. fabs f6, f10
  448. fabs f7, f11
  449. FADD f0, f0, f4
  450. FADD f1, f1, f5
  451. FADD f2, f2, f6
  452. FADD f3, f3, f7
  453. .align 4
  /* Group of 2 (bit 1 of N). */
  454. LL(27):
  455. andi. r0, N, 2
  456. beq LL(28)
  457. LFDUX f8, X, INCX
  458. LFDUX f9, X, INCX
  459. fabs f4, f8
  460. fabs f5, f9
  461. FADD f0, f0, f4
  462. FADD f1, f1, f5
  463. .align 4
  /* Final single element (bit 0 of N); falls through to the exit. */
  464. LL(28):
  465. andi. r0, N, 1
  466. beq LL(999)
  467. LFDUX f8, X, INCX
  468. fabs f4, f8
  469. FADD f0, f0, f4
  470. .align 4
  /* Common exit: reduce the four partial sums as (f0 + f1) + (f2 + f3)
     into f1, raise SP back by STACKSIZE and return. */
  471. LL(999):
  472. FADD f0, f0, f1
  473. FADD f2, f2, f3
  474. FADD f1, f0, f2
  475. addi SP, SP, STACKSIZE
  476. blr
  477. EPILOGUE