You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

dot.S 8.1 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391
  1. /***************************************************************************
  2. Copyright (c) 2021, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  /*
   * Register aliases used by the dot kernel below.
   * NOTE(review): $r4..$r8 are presumably the first five integer argument
   * registers of the LoongArch calling convention - confirm against the psABI.
   */

  /* Element count and the two vectors with their strides. */
  #define N     $r4
  #define X     $r5
  #define INCX  $r6
  #define Y     $r7
  #define INCY  $r8

  /* Integer scratch: loop counter and a temporary. */
  #define I     $r17
  #define TEMP  $r18

  /* Floating-point accumulators; their sum is the returned value. */
  #define s1    $f22
  #define s2    $f8

  /* Values loaded from X. */
  #define a1    $f23
  #define a2    $f9
  #define a3    $f10
  #define a4    $f11

  /* Values loaded from Y. */
  #define b1    $f12
  #define b2    $f13
  #define b3    $f14
  #define b4    $f15
  /*
   * Dot-product kernel for LoongArch64.
   *
   * Inputs (see the register aliases above):
   *   N     element count; N <= 0 returns 0
   *   X, Y  base addresses of the two vectors
   *   INCX, INCY  strides in elements (scaled to bytes here by BASE_SHIFT)
   * Output:
   *   $f0 = s1 + s2, where s1/s2 are two partial sums of x[i] * y[i].
   *
   * Build variants:
   *   DSDOT        each loaded element is widened with fcvt.d.s and the
   *                accumulation is done with fmadd.d / fadd.d (double).
   *   F_INTERFACE  N/INCX/INCY arrive as pointers and are dereferenced first;
   *                negative strides rewind the base pointer so the walk
   *                starts at the far end of the vector.
   *
   * Layout:
   *   .L12/.L13  unit-stride main loop, 8 elements per iteration, loads for
   *              the next elements are interleaved with the multiply-adds
   *   .L16       unit-stride remainder (N & 7)
   *   .L23       strided main loop, 8 elements per iteration
   *   .L26       strided remainder (N & 7)
   *   .L999      combine the two accumulators and return
   */
  PROLOGUE
#ifdef F_INTERFACE
  /* Scalars are passed by reference: replace each pointer by its value.
   * NOTE(review): assumes LDINT expands to an ld.w/ld.d style load taking
   * "rd, base, offset" like every other load in this file - confirm in
   * common.h. */
  LDINT     N, N, 0
  LDINT     INCX, INCX, 0
  LDINT     INCY, INCY, 0
#endif
  /* Clear both accumulators. */
#ifdef DSDOT
  /* The accumulators are used as doubles in this variant, so write all
   * 64 bits rather than relying on the width MTC happens to select. */
  movgr2fr.d s1, $r0
  movgr2fr.d s2, $r0
#else
  MTC       s1, $r0
  MTC       s2, $r0
#endif
  slli.d    INCX, INCX, BASE_SHIFT      /* stride: elements -> bytes */
  li.d      TEMP, SIZE
  slli.d    INCY, INCY, BASE_SHIFT
  bge       $r0, N, .L999               /* N <= 0: result is 0 */
  srai.d    I, N, 3                     /* I = number of 8-element groups */
  bne       INCX, TEMP, .L20            /* any non-unit stride -> strided path */
  bne       INCY, TEMP, .L20
  bge       $r0, I, .L15                /* fewer than 8 elements */
  /* Preload the first four pairs. */
  LD        a1, X, 0 * SIZE
  LD        b1, Y, 0 * SIZE
  LD        a2, X, 1 * SIZE
  LD        b2, Y, 1 * SIZE
  LD        a3, X, 2 * SIZE
  LD        b3, Y, 2 * SIZE
  LD        a4, X, 3 * SIZE
  addi.d    I, I, -1
  LD        b4, Y, 3 * SIZE
  bge       $r0, I, .L13                /* exactly one group: drain only */
  .align 3
.L12:
  /* Steady state: consume pairs 0..7, reload pairs 4..11 as they free up. */
#ifdef DSDOT
  fcvt.d.s  a1, a1
  fcvt.d.s  b1, b1
  fmadd.d   s1, b1, a1, s1
#else
  MADD      s1, b1, a1, s1
#endif
  LD        a1, X, 4 * SIZE
  LD        b1, Y, 4 * SIZE
#ifdef DSDOT
  fcvt.d.s  a2, a2
  fcvt.d.s  b2, b2
  fmadd.d   s2, b2, a2, s2
#else
  MADD      s2, b2, a2, s2
#endif
  LD        a2, X, 5 * SIZE
  LD        b2, Y, 5 * SIZE
#ifdef DSDOT
  fcvt.d.s  a3, a3
  fcvt.d.s  b3, b3
  fmadd.d   s1, b3, a3, s1
#else
  MADD      s1, b3, a3, s1
#endif
  LD        a3, X, 6 * SIZE
  LD        b3, Y, 6 * SIZE
#ifdef DSDOT
  fcvt.d.s  a4, a4
  fcvt.d.s  b4, b4
  fmadd.d   s2, b4, a4, s2
#else
  MADD      s2, b4, a4, s2
#endif
  LD        a4, X, 7 * SIZE
  LD        b4, Y, 7 * SIZE
#ifdef DSDOT
  fcvt.d.s  a1, a1
  fcvt.d.s  b1, b1
  fmadd.d   s1, b1, a1, s1
#else
  MADD      s1, b1, a1, s1
#endif
  LD        a1, X, 8 * SIZE
  LD        b1, Y, 8 * SIZE
#ifdef DSDOT
  fcvt.d.s  a2, a2
  fcvt.d.s  b2, b2
  fmadd.d   s2, b2, a2, s2
#else
  MADD      s2, b2, a2, s2
#endif
  LD        a2, X, 9 * SIZE
  LD        b2, Y, 9 * SIZE
#ifdef DSDOT
  fcvt.d.s  a3, a3
  fcvt.d.s  b3, b3
  fmadd.d   s1, b3, a3, s1
#else
  MADD      s1, b3, a3, s1
#endif
  LD        a3, X, 10 * SIZE
  LD        b3, Y, 10 * SIZE
#ifdef DSDOT
  fcvt.d.s  a4, a4
  fcvt.d.s  b4, b4
  fmadd.d   s2, b4, a4, s2
#else
  MADD      s2, b4, a4, s2
#endif
  LD        a4, X, 11 * SIZE
  LD        b4, Y, 11 * SIZE
  addi.d    I, I, -1
  addi.d    X, X, 8 * SIZE
  addi.d    Y, Y, 8 * SIZE
  blt       $r0, I, .L12
  .align 3
.L13:
  /* Last full group: pairs 0..3 are already loaded, load only 4..7. */
#ifdef DSDOT
  fcvt.d.s  a1, a1
  fcvt.d.s  b1, b1
  fmadd.d   s1, b1, a1, s1
#else
  MADD      s1, b1, a1, s1
#endif
  LD        a1, X, 4 * SIZE
  LD        b1, Y, 4 * SIZE
#ifdef DSDOT
  fcvt.d.s  a2, a2
  fcvt.d.s  b2, b2
  fmadd.d   s2, b2, a2, s2
#else
  MADD      s2, b2, a2, s2
#endif
  LD        a2, X, 5 * SIZE
  LD        b2, Y, 5 * SIZE
#ifdef DSDOT
  fcvt.d.s  a3, a3
  fcvt.d.s  b3, b3
  fmadd.d   s1, b3, a3, s1
#else
  MADD      s1, b3, a3, s1
#endif
  LD        a3, X, 6 * SIZE
  LD        b3, Y, 6 * SIZE
#ifdef DSDOT
  fcvt.d.s  a4, a4
  fcvt.d.s  b4, b4
  fmadd.d   s2, b4, a4, s2
#else
  MADD      s2, b4, a4, s2
#endif
  LD        a4, X, 7 * SIZE
  LD        b4, Y, 7 * SIZE
#ifdef DSDOT
  fcvt.d.s  a1, a1
  fcvt.d.s  b1, b1
  fmadd.d   s1, b1, a1, s1
#else
  MADD      s1, b1, a1, s1
#endif
  addi.d    X, X, 8 * SIZE
#ifdef DSDOT
  fcvt.d.s  a2, a2
  fcvt.d.s  b2, b2
  fmadd.d   s2, b2, a2, s2
#else
  MADD      s2, b2, a2, s2
#endif
  addi.d    Y, Y, 8 * SIZE
#ifdef DSDOT
  fcvt.d.s  a3, a3
  fcvt.d.s  b3, b3
  fmadd.d   s1, b3, a3, s1
#else
  MADD      s1, b3, a3, s1
#endif
#ifdef DSDOT
  fcvt.d.s  a4, a4
  fcvt.d.s  b4, b4
  fmadd.d   s2, b4, a4, s2
#else
  MADD      s2, b4, a4, s2
#endif
  .align 3
.L15:
  andi      I, N, 7                     /* leftover elements, unit stride */
  bge       $r0, I, .L999
  .align 3
.L16:
  LD        a1, X, 0 * SIZE
  LD        b1, Y, 0 * SIZE
#ifdef DSDOT
  fcvt.d.s  a1, a1
  fcvt.d.s  b1, b1
  fmadd.d   s1, b1, a1, s1
#else
  MADD      s1, b1, a1, s1
#endif
  addi.d    I, I, -1
  addi.d    X, X, SIZE
  addi.d    Y, Y, SIZE
  blt       $r0, I, .L16
  b         .L999
  .align 3
.L20:
#ifdef F_INTERFACE
  /* For a negative stride, move the base back by (N - 1) * stride bytes
   * (the stride is negative, so the pointer moves forward to the last
   * element in memory order). Uses native LoongArch mul.d/sub.d. */
  bge       INCX, $r0, .L21
  addi.d    TEMP, N, -1
  mul.d     TEMP, TEMP, INCX
  sub.d     X, X, TEMP
  .align 3
.L21:
  bge       INCY, $r0, .L22
  addi.d    TEMP, N, -1
  mul.d     TEMP, TEMP, INCY
  sub.d     Y, Y, TEMP
  .align 3
.L22:
#endif
  bge       $r0, I, .L25                /* fewer than 8 elements */
  .align 3
.L23:
  /* Strided main loop: eight load/advance/accumulate steps, alternating
   * between the two accumulators. */
  LD        a1, X, 0 * SIZE
  add.d     X, X, INCX
  LD        b1, Y, 0 * SIZE
  add.d     Y, Y, INCY
#ifdef DSDOT
  fcvt.d.s  a1, a1
  fcvt.d.s  b1, b1
  fmadd.d   s1, b1, a1, s1
#else
  MADD      s1, b1, a1, s1
#endif
  LD        a1, X, 0 * SIZE
  add.d     X, X, INCX
  LD        b1, Y, 0 * SIZE
  add.d     Y, Y, INCY
#ifdef DSDOT
  fcvt.d.s  a1, a1
  fcvt.d.s  b1, b1
  fmadd.d   s2, b1, a1, s2
#else
  MADD      s2, b1, a1, s2
#endif
  LD        a1, X, 0 * SIZE
  add.d     X, X, INCX
  LD        b1, Y, 0 * SIZE
  add.d     Y, Y, INCY
#ifdef DSDOT
  fcvt.d.s  a1, a1
  fcvt.d.s  b1, b1
  fmadd.d   s1, b1, a1, s1
#else
  MADD      s1, b1, a1, s1
#endif
  LD        a1, X, 0 * SIZE
  add.d     X, X, INCX
  LD        b1, Y, 0 * SIZE
  add.d     Y, Y, INCY
#ifdef DSDOT
  fcvt.d.s  a1, a1
  fcvt.d.s  b1, b1
  fmadd.d   s2, b1, a1, s2
#else
  MADD      s2, b1, a1, s2
#endif
  LD        a1, X, 0 * SIZE
  add.d     X, X, INCX
  LD        b1, Y, 0 * SIZE
  add.d     Y, Y, INCY
#ifdef DSDOT
  fcvt.d.s  a1, a1
  fcvt.d.s  b1, b1
  fmadd.d   s1, b1, a1, s1
#else
  MADD      s1, b1, a1, s1
#endif
  LD        a1, X, 0 * SIZE
  add.d     X, X, INCX
  LD        b1, Y, 0 * SIZE
  add.d     Y, Y, INCY
#ifdef DSDOT
  fcvt.d.s  a1, a1
  fcvt.d.s  b1, b1
  fmadd.d   s2, b1, a1, s2
#else
  MADD      s2, b1, a1, s2
#endif
  LD        a1, X, 0 * SIZE
  add.d     X, X, INCX
  LD        b1, Y, 0 * SIZE
  add.d     Y, Y, INCY
#ifdef DSDOT
  fcvt.d.s  a1, a1
  fcvt.d.s  b1, b1
  fmadd.d   s1, b1, a1, s1
#else
  MADD      s1, b1, a1, s1
#endif
  LD        a1, X, 0 * SIZE
  add.d     X, X, INCX
  LD        b1, Y, 0 * SIZE
  add.d     Y, Y, INCY
  addi.d    I, I, -1
#ifdef DSDOT
  fcvt.d.s  a1, a1
  fcvt.d.s  b1, b1
  fmadd.d   s2, b1, a1, s2
#else
  MADD      s2, b1, a1, s2
#endif
  blt       $r0, I, .L23
  .align 3
.L25:
  andi      I, N, 7                     /* leftover elements, strided */
  bge       $r0, I, .L999
  .align 3
.L26:
  LD        a1, X, 0 * SIZE
  add.d     X, X, INCX
  LD        b1, Y, 0 * SIZE
  add.d     Y, Y, INCY
  addi.d    I, I, -1
#ifdef DSDOT
  fcvt.d.s  a1, a1
  fcvt.d.s  b1, b1
  fmadd.d   s1, b1, a1, s1
#else
  MADD      s1, b1, a1, s1
#endif
  blt       $r0, I, .L26
  .align 3
.L999:
  /* Fold the two partial sums into the result register $f0. */
#ifdef DSDOT
  fadd.d    $f0, s1, s2
#else
  ADD       $f0, s1, s2
#endif
  /* NOTE(review): copies the loop counter into $r4 before returning; the
   * dot product itself is in $f0. Looks like a leftover - confirm no caller
   * reads $r4 before removing it. */
  move      $r4, $r17
  jirl      $r0, $r1, 0x0               /* return to the address in $r1 */
  EPILOGUE