You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zdot.S 7.8 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330
  1. /***************************************************************************
  2. Copyright (c) 2020, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  /*
   * Preprocessor prelude and register aliases for the kernel below.
   * Macros used by the kernel but not defined in this file (PROLOGUE,
   * EPILOGUE, LD, LDINT, MADD, MOV, MTC, ADD, SUB, SIZE, ZBASE_SHIFT) are
   * expected to come from common.h.
   */
#define ASSEMBLER
#include "common.h"

/* Integer registers holding the incoming arguments (read before being written). */
#define N      $r4     /* element count; <= 0 returns the zeroed accumulators      */
#define X      $r5     /* base address used for every load of the first operand   */
#define INCX   $r6     /* stride of X; scaled by ZBASE_SHIFT before use            */
#define Y      $r7     /* base address used for every load of the second operand  */
#define INCY   $r8     /* stride of Y; scaled by ZBASE_SHIFT before use            */

/* Integer scratch registers. */
#define I      $r17    /* loop counter: N >> 2 for the unrolled loops, N & 3 for tails */
#define TEMP   $r18    /* holds 2 * SIZE for the stride test; scratch afterwards       */

/* Floating-point values loaded from X: (a1, a2) and (a3, a4) are consecutive pairs. */
#define a1     $f10
#define a2     $f11
#define a3     $f12
#define a4     $f13

/* Floating-point values loaded from Y: (b1, b2) and (b3, b4) are consecutive pairs. */
#define b1     $f14
#define b2     $f15
#define b3     $f16
#define b4     $f17

/* Partial-sum accumulators, combined into $f0/$f1 at .L999. */
#define s1     $f22    /* += Y[0] * X[0] */
#define s2     $f8     /* += Y[0] * X[1] */
#define s3     $f23    /* += Y[1] * X[0] */
#define s4     $f9     /* += Y[1] * X[1] */
  /*
   * Complex dot-product kernel.
   *
   * Inputs (register aliases defined at the top of the file):
   *   N           element count
   *   X, INCX     first operand and its stride
   *   Y, INCY     second operand and its stride
   * Each element is a pair of SIZE-byte values at offsets 0 and 1 * SIZE.
   * INCX/INCY are shifted left by ZBASE_SHIFT and then used as byte strides;
   * a shifted stride equal to 2 * SIZE selects the contiguous path.
   *
   * Accumulators (MADD d, a, b, c is presumably d = a * b + c; the macro
   * comes from common.h):
   *   s1 += Y[0] * X[0]      s2 += Y[0] * X[1]
   *   s3 += Y[1] * X[0]      s4 += Y[1] * X[1]
   *
   * Result:
   *   without CONJ:  $f0 = s1 - s4,  $f1 = s3 + s2
   *   with    CONJ:  $f0 = s1 + s4,  $f1 = s3 - s2
   *
   * Both unrolled loops are software pipelined: the operands of the next
   * element are loaded while the current one is being accumulated, so the
   * instruction order inside the loops is significant.
   *
   * Changes relative to the previous text of this block:
   *   - removed the viewer line-number prefix in front of every line;
   *   - F_INTERFACE stride fix-up: the MIPS mnemonics mult/mflo/dsub were
   *     replaced by mul.d/sub.d, and bgez was written as bge reg, $r0;
   *   - F_INTERFACE argument loads use the (dest, base, offset) operand
   *     order that LD uses everywhere else in this file.
   */
    PROLOGUE

#ifdef F_INTERFACE
    /* Arguments arrive by reference: replace each pointer by its value. */
    /* NOTE(review): assumes LDINT expands to a LoongArch load - confirm in common.h. */
    LDINT   N,    N,    0
    LDINT   INCX, INCX, 0
    LDINT   INCY, INCY, 0
#endif

    /* s1 = s2 = s3 = s4 = value transferred from the zero register. */
    MTC     s1, $r0
    MOV     s2, s1
    MOV     s3, s2
    MOV     s4, s3

    slli.d  INCX, INCX, ZBASE_SHIFT
    li.d    TEMP, 2 * SIZE              /* byte stride of the contiguous case */
    slli.d  INCY, INCY, ZBASE_SHIFT

    bge     $r0, N, .L999               /* N <= 0: return the zeroed sums */
    srai.d  I, N, 2                     /* I = number of 4-element groups */

    bne     INCX, TEMP, .L20            /* either stride non-contiguous -> strided path */
    bne     INCY, TEMP, .L20

    /* ---------------- contiguous path ---------------- */
    bge     $r0, I, .L15                /* fewer than 4 elements: tail only */

    /* Pipeline prologue: load element 0. */
    LD      a1, X, 0 * SIZE
    LD      a2, X, 1 * SIZE
    LD      b1, Y, 0 * SIZE
    addi.d  I, I, -1
    LD      b2, Y, 1 * SIZE
    bge     $r0, I, .L14                /* exactly one group: go straight to the drain */
    .align 3

.L13:
    /* Steady state: accumulate 4 elements, preload element 0 of the next group. */
    MADD    s1, b1, a1, s1
    LD      a3, X, 2 * SIZE
    MADD    s2, b1, a2, s2
    LD      a4, X, 3 * SIZE
    MADD    s3, b2, a1, s3
    LD      b3, Y, 2 * SIZE
    MADD    s4, b2, a2, s4
    LD      b4, Y, 3 * SIZE

    MADD    s1, b3, a3, s1
    LD      a1, X, 4 * SIZE
    MADD    s2, b3, a4, s2
    LD      a2, X, 5 * SIZE
    MADD    s3, b4, a3, s3
    LD      b1, Y, 4 * SIZE
    MADD    s4, b4, a4, s4
    LD      b2, Y, 5 * SIZE

    MADD    s1, b1, a1, s1
    LD      a3, X, 6 * SIZE
    MADD    s2, b1, a2, s2
    LD      a4, X, 7 * SIZE
    MADD    s3, b2, a1, s3
    LD      b3, Y, 6 * SIZE
    MADD    s4, b2, a2, s4
    LD      b4, Y, 7 * SIZE

    MADD    s1, b3, a3, s1
    LD      a1, X, 8 * SIZE             /* element 0 of the next group */
    MADD    s2, b3, a4, s2
    LD      a2, X, 9 * SIZE
    MADD    s3, b4, a3, s3
    LD      b1, Y, 8 * SIZE
    MADD    s4, b4, a4, s4
    LD      b2, Y, 9 * SIZE

    addi.d  I, I, -1
    addi.d  X, X, 8 * SIZE
    addi.d  Y, Y, 8 * SIZE
    blt     $r0, I, .L13
    .align 3

.L14:
    /* Pipeline drain: last group of 4, no load past offset 7 * SIZE. */
    MADD    s1, b1, a1, s1
    LD      a3, X, 2 * SIZE
    MADD    s2, b1, a2, s2
    LD      a4, X, 3 * SIZE
    MADD    s3, b2, a1, s3
    LD      b3, Y, 2 * SIZE
    MADD    s4, b2, a2, s4
    LD      b4, Y, 3 * SIZE

    MADD    s1, b3, a3, s1
    LD      a1, X, 4 * SIZE
    MADD    s2, b3, a4, s2
    LD      a2, X, 5 * SIZE
    MADD    s3, b4, a3, s3
    LD      b1, Y, 4 * SIZE
    MADD    s4, b4, a4, s4
    LD      b2, Y, 5 * SIZE

    MADD    s1, b1, a1, s1
    LD      a3, X, 6 * SIZE
    MADD    s2, b1, a2, s2
    LD      a4, X, 7 * SIZE
    MADD    s3, b2, a1, s3
    LD      b3, Y, 6 * SIZE
    MADD    s4, b2, a2, s4
    LD      b4, Y, 7 * SIZE

    MADD    s1, b3, a3, s1
    addi.d  X, X, 8 * SIZE
    MADD    s2, b3, a4, s2
    addi.d  Y, Y, 8 * SIZE
    MADD    s3, b4, a3, s3
    MADD    s4, b4, a4, s4
    .align 3

.L15:
    /* Contiguous tail: the remaining N & 3 elements, one per iteration. */
    andi    I, N, 3
    bge     $r0, I, .L999

    LD      a1, X, 0 * SIZE
    LD      a2, X, 1 * SIZE
    LD      b1, Y, 0 * SIZE
    addi.d  I, I, -1
    LD      b2, Y, 1 * SIZE
    bge     $r0, I, .L17                /* single leftover element */
    .align 3

.L16:
    /* Accumulate the current element while loading the next one. */
    MADD    s1, b1, a1, s1
    addi.d  I, I, -1
    MADD    s2, b1, a2, s2
    LD      b1, Y, 2 * SIZE
    MADD    s3, b2, a1, s3
    LD      a1, X, 2 * SIZE
    MADD    s4, b2, a2, s4
    LD      a2, X, 3 * SIZE
    LD      b2, Y, 3 * SIZE
    addi.d  X, X, 2 * SIZE
    addi.d  Y, Y, 2 * SIZE
    blt     $r0, I, .L16
    .align 3

.L17:
    /* Accumulate the last element that was loaded. */
    MADD    s1, b1, a1, s1
    MADD    s2, b1, a2, s2
    MADD    s3, b2, a1, s3
    MADD    s4, b2, a2, s4
    b       .L999
    .align 3

    /* ---------------- strided path ---------------- */
.L20:
#ifdef F_INTERFACE
    /* Negative byte stride: X -= (N - 1) * INCX. */
    bge     INCX, $r0, .L21
    addi.d  TEMP, N, -1
    mul.d   TEMP, TEMP, INCX
    sub.d   X, X, TEMP
    .align 3

.L21:
    /* Negative byte stride: Y -= (N - 1) * INCY. */
    bge     INCY, $r0, .L22
    addi.d  TEMP, N, -1
    mul.d   TEMP, TEMP, INCY
    sub.d   Y, Y, TEMP
    .align 3

.L22:
#endif
    bge     $r0, I, .L25                /* fewer than 4 elements: tail only */

    /* Pipeline prologue: load element 0 and step both pointers. */
    LD      a1, X, 0 * SIZE
    LD      a2, X, 1 * SIZE
    LD      b1, Y, 0 * SIZE
    LD      b2, Y, 1 * SIZE
    add.d   X, X, INCX
    addi.d  I, I, -1
    add.d   Y, Y, INCY
    bge     $r0, I, .L24                /* exactly one group: go straight to the drain */
    .align 3

.L23:
    /* Steady state: accumulate 4 elements, preload element 0 of the next group. */
    MADD    s1, b1, a1, s1
    LD      a3, X, 0 * SIZE
    MADD    s2, b1, a2, s2
    LD      a4, X, 1 * SIZE
    MADD    s3, b2, a1, s3
    LD      b3, Y, 0 * SIZE
    MADD    s4, b2, a2, s4
    LD      b4, Y, 1 * SIZE
    add.d   X, X, INCX
    add.d   Y, Y, INCY

    MADD    s1, b3, a3, s1
    LD      a1, X, 0 * SIZE
    MADD    s2, b3, a4, s2
    LD      a2, X, 1 * SIZE
    MADD    s3, b4, a3, s3
    LD      b1, Y, 0 * SIZE
    MADD    s4, b4, a4, s4
    LD      b2, Y, 1 * SIZE
    add.d   X, X, INCX
    add.d   Y, Y, INCY

    MADD    s1, b1, a1, s1
    LD      a3, X, 0 * SIZE
    MADD    s2, b1, a2, s2
    LD      a4, X, 1 * SIZE
    MADD    s3, b2, a1, s3
    LD      b3, Y, 0 * SIZE
    MADD    s4, b2, a2, s4
    LD      b4, Y, 1 * SIZE
    add.d   X, X, INCX
    add.d   Y, Y, INCY

    MADD    s1, b3, a3, s1
    LD      a1, X, 0 * SIZE             /* element 0 of the next group */
    MADD    s2, b3, a4, s2
    LD      a2, X, 1 * SIZE
    MADD    s3, b4, a3, s3
    LD      b1, Y, 0 * SIZE
    MADD    s4, b4, a4, s4
    LD      b2, Y, 1 * SIZE
    add.d   X, X, INCX
    addi.d  I, I, -1
    add.d   Y, Y, INCY
    blt     $r0, I, .L23
    .align 3

.L24:
    /* Pipeline drain: last group of 4, no preload of a following element. */
    MADD    s1, b1, a1, s1
    LD      a3, X, 0 * SIZE
    MADD    s2, b1, a2, s2
    LD      a4, X, 1 * SIZE
    MADD    s3, b2, a1, s3
    LD      b3, Y, 0 * SIZE
    MADD    s4, b2, a2, s4
    LD      b4, Y, 1 * SIZE
    add.d   X, X, INCX
    add.d   Y, Y, INCY

    MADD    s1, b3, a3, s1
    LD      a1, X, 0 * SIZE
    MADD    s2, b3, a4, s2
    LD      a2, X, 1 * SIZE
    MADD    s3, b4, a3, s3
    LD      b1, Y, 0 * SIZE
    MADD    s4, b4, a4, s4
    LD      b2, Y, 1 * SIZE
    add.d   X, X, INCX
    add.d   Y, Y, INCY

    MADD    s1, b1, a1, s1
    LD      a3, X, 0 * SIZE
    MADD    s2, b1, a2, s2
    LD      a4, X, 1 * SIZE
    MADD    s3, b2, a1, s3
    LD      b3, Y, 0 * SIZE
    MADD    s4, b2, a2, s4
    LD      b4, Y, 1 * SIZE

    MADD    s1, b3, a3, s1
    add.d   X, X, INCX
    MADD    s2, b3, a4, s2
    add.d   Y, Y, INCY
    MADD    s3, b4, a3, s3
    MADD    s4, b4, a4, s4
    .align 3

.L25:
    /* Strided tail: the remaining N & 3 elements, load-then-accumulate. */
    andi    I, N, 3
    bge     $r0, I, .L999
    .align 3

.L26:
    LD      a1, X, 0 * SIZE
    LD      a2, X, 1 * SIZE
    LD      b1, Y, 0 * SIZE
    LD      b2, Y, 1 * SIZE

    MADD    s1, b1, a1, s1
    MADD    s2, b1, a2, s2
    MADD    s3, b2, a1, s3
    MADD    s4, b2, a2, s4

    add.d   X, X, INCX
    add.d   Y, Y, INCY
    addi.d  I, I, -1
    blt     $r0, I, .L26
    .align 3

.L999:
    /* Combine the four partial sums into $f0 and $f1. */
#ifndef CONJ
    SUB     $f0, s1, s4
#else
    ADD     $f0, s1, s4
#endif

#ifndef CONJ
    ADD     $f1, s3, s2
#else
    SUB     $f1, s3, s2
#endif

    jirl    $r0, $r1, 0x0               /* jump to the address in $r1, no link */

    EPILOGUE