You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zgemv_t.S 14 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556
  1. /***************************************************************************
  2. Copyright (c) 2020, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  /* Integer register aliases used by the kernel body.
     M / N      : counts that drive the inner (I) and outer (J) loops.
     A          : pointer to the current column; stepped by LDA per column.
     LDA        : column stride (shifted by ZBASE_SHIFT at entry).
     X / INCX   : source vector and its stride.
     Y / INCY   : result vector and its stride (INCY is fetched from the stack).
     BUFFER     : scratch area fetched from the stack; receives a packed copy of x. */
  29. #define M $r4
  30. #define N $r5
  31. #define A $r7
  32. #define LDA $r8
  33. #define X $r9
  34. #define INCX $r10
  35. #define Y $r11
  36. #define INCY $r6
  37. #define BUFFER $r17
  /* XORIG: start of the x data the dot products read (X itself or BUFFER). */
  38. #define XORIG $r18
  /* XX: running x pointer; YY: store pointer for y (also the write pointer while packing x). */
  39. #define XX $r12
  40. #define YY $r13
  /* I: inner loop counter; J: outer (column) loop counter. */
  41. #define I $r14
  42. #define J $r15
  /* AO1 / AO2: pointers into the two columns processed together.
     $r23 and $r24 are spilled at entry and reloaded at .L999. */
  43. #define AO1 $r23
  44. #define AO2 $r24
  /* Scalar multiplier applied when the accumulated sums are added to y. */
  45. #define ALPHA_R $f0
  46. #define ALPHA_I $f1
  /* a1..a8: matrix operands (a1..a4 are reused for y values during the update). */
  47. #define a1 $f22
  48. #define a2 $f8
  49. #define a3 $f23
  50. #define a4 $f9
  51. #define a5 $f10
  52. #define a6 $f11
  53. #define a7 $f12
  54. #define a8 $f13
  /* y1..y4: floating-point accumulators. */
  55. #define y1 $f14
  56. #define y2 $f15
  57. #define y3 $f16
  58. #define y4 $f17
  /* x1..x4: x operands. x5..x8 are defined but not referenced by the code visible in this file. */
  59. #define x1 $f3
  60. #define x2 $f4
  61. #define x3 $f2
  62. #define x4 $f5
  63. #define x5 $f6
  64. #define x6 $f7
  65. #define x7 $f18
  66. #define x8 $f19
  /* MADD1..MADD4 pick the sign of each of the four partial products per element pair.
     The kernel always issues them as:
       MADD1 acc0, a[0], x[0]     MADD2 acc1, a[0], x[1]
       MADD3 acc0, a[1], x[1]     MADD4 acc1, a[1], x[0]
     Assuming MADD adds the product to the accumulator and NMSUB subtracts it
     (both come from the included headers - TODO confirm), and that offsets 0/1
     hold the real/imaginary parts, the four variants below correspond to
     a*x, conj(a)*x, a*conj(x) and conj(a)*conj(x) respectively. */
  /* Neither operand conjugated: signs +, +, -, + */
  67. #if !defined(CONJ) && !defined(XCONJ)
  68. #define MADD1 MADD
  69. #define MADD2 MADD
  70. #define MADD3 NMSUB
  71. #define MADD4 MADD
  72. #endif
  /* CONJ only: signs +, +, +, - */
  73. #if defined(CONJ) && !defined(XCONJ)
  74. #define MADD1 MADD
  75. #define MADD2 MADD
  76. #define MADD3 MADD
  77. #define MADD4 NMSUB
  78. #endif
  /* XCONJ only: signs +, -, +, + */
  79. #if !defined(CONJ) && defined(XCONJ)
  80. #define MADD1 MADD
  81. #define MADD2 NMSUB
  82. #define MADD3 MADD
  83. #define MADD4 MADD
  84. #endif
  /* CONJ and XCONJ: signs +, -, -, - */
  85. #if defined(CONJ) && defined(XCONJ)
  86. #define MADD1 MADD
  87. #define MADD2 NMSUB
  88. #define MADD3 NMSUB
  89. #define MADD4 NMSUB
  90. #endif
  /* Kernel entry: fetch stack arguments, build the frame, scale strides,
     reject empty problems, and decide whether x must be packed into BUFFER. */
  91. PROLOGUE
  /* INCY and BUFFER are read from the caller's stack before $sp is moved. */
  92. LDARG INCY, $sp, 0
  93. LDARG BUFFER, $sp, 8
  /* Frame is 16 bytes when __64BIT__ is defined, otherwise 32 bytes. */
  94. #ifdef __64BIT__
  95. addi.d $sp, $sp, -16
  96. #else
  97. addi.d $sp, $sp, -32
  98. #endif
  /* y1 is initialised from $r0 (the zero register); later code copies it into y2..y4. */
  99. MTC y1, $r0
  /* Spill $r23/$r24 (used as AO1/AO2); they are reloaded at .L999. */
  100. SDARG $r23, $sp, 0
  101. SDARG $r24, $sp, 8
  /* LDA, INCX and INCY are shifted left by ZBASE_SHIFT - presumably converting
     element counts into byte strides; ZBASE_SHIFT comes from the included headers. */
  102. slli.d LDA, LDA, ZBASE_SHIFT
  103. #ifndef __64BIT__
  104. fst.d $f18, $sp, 16
  105. fst.d $f19, $sp, 24
  106. #endif
  107. slli.d INCX, INCX, ZBASE_SHIFT
  /* Nothing to do when M <= 0 ... */
  108. bge $r0, M, .L999
  109. slli.d INCY, INCY, ZBASE_SHIFT
  /* ... or when N <= 0. */
  110. bge $r0, N, .L999
  /* If the scaled INCX equals 2 * SIZE, x is read in place (XORIG = X) and packing is skipped. */
  111. li.d I, 2 * SIZE
  112. move XORIG, X
  113. beq INCX, I, .L10
  /* Otherwise x is copied into BUFFER: I = M >> 2 groups of four pairs first;
     YY serves as the write pointer during the copy. */
  114. srai.d I, M, 2
  115. move XORIG, BUFFER
  116. move YY, BUFFER
  117. bge $r0, I, .L05
  /* Pack strided x into contiguous storage at BUFFER.
     .L02 copies four (offset 0, offset 1) pairs per iteration, stepping X by INCX
     after each pair; .L06 copies the remaining M & 3 pairs one at a time. */
  118. .align 3
  119. .L02:
  120. LD a1, X, 0 * SIZE
  121. LD a2, X, 1 * SIZE
  122. add.d X, X, INCX
  123. LD a3, X, 0 * SIZE
  124. LD a4, X, 1 * SIZE
  125. add.d X, X, INCX
  126. LD a5, X, 0 * SIZE
  127. LD a6, X, 1 * SIZE
  128. add.d X, X, INCX
  129. LD a7, X, 0 * SIZE
  130. LD a8, X, 1 * SIZE
  131. add.d X, X, INCX
  132. addi.d I, I, -1
  /* YY is advanced first, so the eight stores below use negative offsets. */
  133. addi.d YY, YY, 8 * SIZE
  134. ST a1, YY, -8 * SIZE
  135. ST a2, YY, -7 * SIZE
  136. ST a3, YY, -6 * SIZE
  137. ST a4, YY, -5 * SIZE
  138. ST a5, YY, -4 * SIZE
  139. ST a6, YY, -3 * SIZE
  140. ST a7, YY, -2 * SIZE
  141. ST a8, YY, -1 * SIZE
  142. blt $r0, I, .L02
  143. .align 3
  144. .L05:
  /* Remainder of the copy: M & 3 pairs. */
  145. andi I, M, 3
  146. bge $r0, I, .L10
  147. .align 3
  148. .L06:
  149. LD a1, X, 0 * SIZE
  150. LD a2, X, 1 * SIZE
  151. add.d X, X, INCX
  152. ST a1, YY, 0 * SIZE
  153. ST a2, YY, 1 * SIZE
  154. addi.d I, I, -1
  155. addi.d YY, YY, 2 * SIZE
  156. blt $r0, I, .L06
  /* Main loop: two columns of A per outer iteration (J = N >> 1 iterations).
     For each pair of columns the code accumulates sums of products against x
     into y1/y2 (column at AO1) and y3/y4 (column at AO2), then adds the
     alpha-scaled sums to two consecutive entries of y. */
  157. .align 3
  158. .L10:
  159. srai.d J, N, 1
  /* YY restarts at Y: y is read through Y and written through YY. */
  160. move YY, Y
  161. bge $r0, J, .L20
  162. .align 3
  163. .L11:
  /* AO1 = current column, AO2 = AO1 + LDA, and A is stepped past both.
     y2..y4 are copied from y1, which holds the value MTC wrote from $r0
     (at entry, or at the end of the previous pass through .L19). */
  164. move AO1, A
  165. MOV y2, y1
  166. add.d AO2, A, LDA
  167. MOV y3, y1
  168. add.d A, AO2, LDA
  169. MOV y4, y1
  /* I = M >> 2 groups of four element pairs; XX rewinds to the start of x. */
  170. srai.d I, M, 2
  171. move XX, XORIG
  172. bge $r0, I, .L15
  /* Preload the operands for the first group (x3 is loaded inside the loop body). */
  173. LD x1, XX, 0 * SIZE
  174. LD x2, XX, 1 * SIZE
  175. LD x4, XX, 3 * SIZE
  176. LD a1, AO1, 0 * SIZE
  177. LD a3, AO2, 0 * SIZE
  178. LD a2, AO1, 1 * SIZE
  179. LD a4, AO2, 1 * SIZE
  180. LD a5, AO1, 2 * SIZE
  181. LD a7, AO2, 2 * SIZE
  182. LD a6, AO1, 3 * SIZE
  183. LD a8, AO2, 3 * SIZE
  /* With exactly one group, skip the steady-state loop and go straight to the drain. */
  184. addi.d I, I, -1
  185. bge $r0, I, .L13
  186. .align 3
  /* Steady state: each pass consumes 8 * SIZE of x and of both columns while
     loading the operands for the next pass between the multiply-adds. */
  187. .L12:
  188. MADD1 y1, a1, x1, y1
  189. LD x3, XX, 2 * SIZE
  190. MADD2 y2, a1, x2, y2
  191. LD a1, AO1, 4 * SIZE
  192. MADD1 y3, a3, x1, y3
  193. MADD2 y4, a3, x2, y4
  194. LD a3, AO2, 4 * SIZE
  195. MADD3 y1, a2, x2, y1
  196. MADD4 y2, a2, x1, y2
  197. LD a2, AO1, 5 * SIZE
  198. MADD3 y3, a4, x2, y3
  199. LD x2, XX, 5 * SIZE
  200. MADD4 y4, a4, x1, y4
  201. LD a4, AO2, 5 * SIZE
  202. MADD1 y1, a5, x3, y1
  203. LD x1, XX, 4 * SIZE
  204. MADD2 y2, a5, x4, y2
  205. LD a5, AO1, 6 * SIZE
  206. MADD1 y3, a7, x3, y3
  207. MADD2 y4, a7, x4, y4
  208. LD a7, AO2, 6 * SIZE
  209. MADD3 y1, a6, x4, y1
  210. addi.d I, I, -1
  211. MADD4 y2, a6, x3, y2
  212. LD a6, AO1, 7 * SIZE
  213. MADD3 y3, a8, x4, y3
  214. LD x4, XX, 7 * SIZE
  215. MADD4 y4, a8, x3, y4
  216. LD a8, AO2, 7 * SIZE
  /* Second half of the group; loads from offset 8 * SIZE onward belong to the next pass. */
  217. MADD1 y1, a1, x1, y1
  218. LD x3, XX, 6 * SIZE
  219. MADD2 y2, a1, x2, y2
  220. LD a1, AO1, 8 * SIZE
  221. MADD1 y3, a3, x1, y3
  222. MADD2 y4, a3, x2, y4
  223. LD a3, AO2, 8 * SIZE
  224. MADD3 y1, a2, x2, y1
  225. MADD4 y2, a2, x1, y2
  226. LD a2, AO1, 9 * SIZE
  227. MADD3 y3, a4, x2, y3
  228. LD x2, XX, 9 * SIZE
  229. MADD4 y4, a4, x1, y4
  230. LD a4, AO2, 9 * SIZE
  231. MADD1 y1, a5, x3, y1
  232. LD x1, XX, 8 * SIZE
  233. MADD2 y2, a5, x4, y2
  234. LD a5, AO1, 10 * SIZE
  235. MADD1 y3, a7, x3, y3
  /* XX is bumped here, so the later "XX, 3 * SIZE" load addresses the next group. */
  236. addi.d XX, XX, 8 * SIZE
  237. MADD2 y4, a7, x4, y4
  238. LD a7, AO2, 10 * SIZE
  239. MADD3 y1, a6, x4, y1
  /* AO2 is bumped before AO1: AO2 loads below use offset 3, AO1 loads still use offset 11. */
  240. addi.d AO2, AO2, 8 * SIZE
  241. MADD4 y2, a6, x3, y2
  242. LD a6, AO1, 11 * SIZE
  243. MADD3 y3, a8, x4, y3
  244. LD x4, XX, 3 * SIZE
  245. MADD4 y4, a8, x3, y4
  246. LD a8, AO2, 3 * SIZE
  247. addi.d AO1, AO1, 8 * SIZE
  248. blt $r0, I, .L12
  249. .align 3
  /* Drain: same arithmetic as one pass of .L12 for the last group of four,
     but no load goes beyond offset 7 * SIZE. */
  250. .L13:
  251. MADD1 y1, a1, x1, y1
  252. LD x3, XX, 2 * SIZE
  253. MADD2 y2, a1, x2, y2
  254. LD a1, AO1, 4 * SIZE
  255. MADD1 y3, a3, x1, y3
  256. MADD2 y4, a3, x2, y4
  257. LD a3, AO2, 4 * SIZE
  258. MADD3 y1, a2, x2, y1
  259. MADD4 y2, a2, x1, y2
  260. LD a2, AO1, 5 * SIZE
  261. MADD3 y3, a4, x2, y3
  262. LD x2, XX, 5 * SIZE
  263. MADD4 y4, a4, x1, y4
  264. LD a4, AO2, 5 * SIZE
  265. MADD1 y1, a5, x3, y1
  266. LD x1, XX, 4 * SIZE
  267. MADD2 y2, a5, x4, y2
  268. LD a5, AO1, 6 * SIZE
  269. MADD1 y3, a7, x3, y3
  270. MADD2 y4, a7, x4, y4
  271. LD a7, AO2, 6 * SIZE
  272. MADD3 y1, a6, x4, y1
  273. MADD4 y2, a6, x3, y2
  274. LD a6, AO1, 7 * SIZE
  275. MADD3 y3, a8, x4, y3
  276. LD x4, XX, 7 * SIZE
  277. MADD4 y4, a8, x3, y4
  278. LD a8, AO2, 7 * SIZE
  279. MADD1 y1, a1, x1, y1
  280. LD x3, XX, 6 * SIZE
  281. MADD2 y2, a1, x2, y2
  282. MADD1 y3, a3, x1, y3
  283. MADD2 y4, a3, x2, y4
  284. MADD3 y1, a2, x2, y1
  285. MADD4 y2, a2, x1, y2
  286. MADD3 y3, a4, x2, y3
  287. MADD4 y4, a4, x1, y4
  288. MADD1 y1, a5, x3, y1
  289. MADD2 y2, a5, x4, y2
  290. MADD1 y3, a7, x3, y3
  291. MADD2 y4, a7, x4, y4
  292. MADD3 y1, a6, x4, y1
  293. addi.d XX, XX, 8 * SIZE
  294. MADD4 y2, a6, x3, y2
  295. addi.d AO1, AO1, 8 * SIZE
  296. MADD3 y3, a8, x4, y3
  297. addi.d AO2, AO2, 8 * SIZE
  298. MADD4 y4, a8, x3, y4
  299. .align 3
  /* Remainder: two element pairs when bit 1 of M is set. */
  300. .L15:
  301. andi I, M, 2
  302. bge $r0, I, .L17
  303. LD x1, XX, 0 * SIZE
  304. LD x2, XX, 1 * SIZE
  305. LD x3, XX, 2 * SIZE
  306. LD x4, XX, 3 * SIZE
  307. LD a1, AO1, 0 * SIZE
  308. LD a3, AO2, 0 * SIZE
  309. LD a2, AO1, 1 * SIZE
  310. LD a4, AO2, 1 * SIZE
  311. LD a5, AO1, 2 * SIZE
  312. LD a7, AO2, 2 * SIZE
  313. LD a6, AO1, 3 * SIZE
  314. LD a8, AO2, 3 * SIZE
  315. MADD1 y1, a1, x1, y1
  316. MADD2 y2, a1, x2, y2
  317. MADD1 y3, a3, x1, y3
  318. MADD2 y4, a3, x2, y4
  319. MADD3 y1, a2, x2, y1
  320. MADD4 y2, a2, x1, y2
  321. MADD3 y3, a4, x2, y3
  322. MADD4 y4, a4, x1, y4
  323. MADD1 y1, a5, x3, y1
  324. MADD2 y2, a5, x4, y2
  325. MADD1 y3, a7, x3, y3
  326. MADD2 y4, a7, x4, y4
  327. MADD3 y1, a6, x4, y1
  328. addi.d XX, XX, 4 * SIZE
  329. MADD4 y2, a6, x3, y2
  330. addi.d AO1, AO1, 4 * SIZE
  331. MADD3 y3, a8, x4, y3
  332. addi.d AO2, AO2, 4 * SIZE
  333. MADD4 y4, a8, x3, y4
  334. .align 3
  /* Remainder: one last element pair when M is odd. */
  335. .L17:
  336. andi I, M, 1
  337. .align 3
  338. bge $r0, I, .L19
  339. .L18:
  340. LD x1, XX, 0 * SIZE
  341. LD x2, XX, 1 * SIZE
  342. LD a1, AO1, 0 * SIZE
  343. LD a3, AO2, 0 * SIZE
  344. MADD1 y1, a1, x1, y1
  345. LD a2, AO1, 1 * SIZE
  346. MADD2 y2, a1, x2, y2
  347. LD a4, AO2, 1 * SIZE
  348. MADD1 y3, a3, x1, y3
  349. MADD2 y4, a3, x2, y4
  350. MADD3 y1, a2, x2, y1
  351. MADD4 y2, a2, x1, y2
  352. MADD3 y3, a4, x2, y3
  353. MADD4 y4, a4, x1, y4
  354. .align 3
  /* Update two entries of y. a1/a2 and a3/a4 are reused to hold the y values:
       a1 += y1 * ALPHA_R, then NMSUB with y2 * ALPHA_I
       a2 += y1 * ALPHA_I + y2 * ALPHA_R
     and likewise a3/a4 from y3/y4. If NMSUB subtracts the product (defined in
     the included headers - TODO confirm) this is y += alpha * sum as a complex product. */
  355. .L19:
  356. LD a1, Y, 0 * SIZE
  357. LD a2, Y, 1 * SIZE
  358. add.d Y, Y, INCY
  359. LD a3, Y, 0 * SIZE
  360. LD a4, Y, 1 * SIZE
  361. add.d Y, Y, INCY
  362. MADD a1, y1, ALPHA_R, a1
  363. MADD a2, y1, ALPHA_I, a2
  364. MADD a3, y3, ALPHA_R, a3
  365. MADD a4, y3, ALPHA_I, a4
  366. NMSUB a1, y2, ALPHA_I, a1
  367. MADD a2, y2, ALPHA_R, a2
  368. NMSUB a3, y4, ALPHA_I, a3
  /* y1 is no longer needed here; reset it from $r0 for the next column pair (or for .L20). */
  369. MTC y1, $r0
  370. MADD a4, y4, ALPHA_R, a4
  371. addi.d J, J, -1
  /* Results are stored through YY, which trails Y by the two entries just loaded. */
  372. ST a1, YY, 0 * SIZE
  373. ST a2, YY, 1 * SIZE
  374. add.d YY, YY, INCY
  375. ST a3, YY, 0 * SIZE
  376. ST a4, YY, 1 * SIZE
  377. add.d YY, YY, INCY
  378. blt $r0, J, .L11
  /* Leftover single column, taken when N is odd.
     Only AO1 is used. The MADD1/MADD2 products accumulate into y1/y2 and the
     MADD3/MADD4 products into y3/y4; the two halves are summed at .L29 before
     the alpha-scaled result is added to one entry of y. */
  379. .align 3
  380. .L20:
  381. andi J, N, 1
  /* y1 holds the value MTC wrote from $r0; it seeds y2..y4. */
  382. MOV y2, y1
  383. srai.d I, M, 2
  /* No odd column: go to the exit path. */
  384. bge $r0, J, .L999
  385. MOV y3, y1
  386. move AO1, A
  387. MOV y4, y1
  388. move XX, XORIG
  389. bge $r0, I, .L25
  /* Preload operands for the first group of four element pairs (x3 is loaded in the loop). */
  390. LD a1, AO1, 0 * SIZE
  391. LD x1, XX, 0 * SIZE
  392. LD a2, AO1, 1 * SIZE
  393. LD x2, XX, 1 * SIZE
  394. LD a5, AO1, 2 * SIZE
  395. LD x4, XX, 3 * SIZE
  396. addi.d I, I, -1
  397. LD a6, AO1, 3 * SIZE
  /* With exactly one group, skip the steady-state loop and go straight to the drain. */
  398. bge $r0, I, .L23
  399. .align 3
  /* Steady state: four element pairs per pass, loading the next pass's operands
     (offsets 8..11 * SIZE) before XX and AO1 are advanced at the bottom. */
  400. .L22:
  401. MADD1 y1, a1, x1, y1
  402. LD x3, XX, 2 * SIZE
  403. MADD2 y2, a1, x2, y2
  404. LD a1, AO1, 4 * SIZE
  405. MADD3 y3, a2, x2, y3
  406. LD x2, XX, 5 * SIZE
  407. MADD4 y4, a2, x1, y4
  408. LD a2, AO1, 5 * SIZE
  409. MADD1 y1, a5, x3, y1
  410. LD x1, XX, 4 * SIZE
  411. MADD2 y2, a5, x4, y2
  412. LD a5, AO1, 6 * SIZE
  413. MADD3 y3, a6, x4, y3
  414. LD x4, XX, 7 * SIZE
  415. MADD4 y4, a6, x3, y4
  416. LD a6, AO1, 7 * SIZE
  417. MADD1 y1, a1, x1, y1
  418. LD x3, XX, 6 * SIZE
  419. MADD2 y2, a1, x2, y2
  420. LD a1, AO1, 8 * SIZE
  421. MADD3 y3, a2, x2, y3
  422. LD x2, XX, 9 * SIZE
  423. MADD4 y4, a2, x1, y4
  424. LD a2, AO1, 9 * SIZE
  425. MADD1 y1, a5, x3, y1
  426. LD x1, XX, 8 * SIZE
  427. MADD2 y2, a5, x4, y2
  428. LD a5, AO1, 10 * SIZE
  429. MADD3 y3, a6, x4, y3
  430. LD x4, XX, 11 * SIZE
  431. MADD4 y4, a6, x3, y4
  432. LD a6, AO1, 11 * SIZE
  433. addi.d I, I, -1
  434. addi.d XX, XX, 8 * SIZE
  435. addi.d AO1, AO1, 8 * SIZE
  436. blt $r0, I, .L22
  437. .align 3
  /* Drain: final group of four; no load goes beyond offset 7 * SIZE. */
  438. .L23:
  439. MADD1 y1, a1, x1, y1
  440. LD x3, XX, 2 * SIZE
  441. MADD2 y2, a1, x2, y2
  442. LD a1, AO1, 4 * SIZE
  443. MADD3 y3, a2, x2, y3
  444. LD x2, XX, 5 * SIZE
  445. MADD4 y4, a2, x1, y4
  446. LD a2, AO1, 5 * SIZE
  447. MADD1 y1, a5, x3, y1
  448. LD x1, XX, 4 * SIZE
  449. MADD2 y2, a5, x4, y2
  450. LD a5, AO1, 6 * SIZE
  451. MADD3 y3, a6, x4, y3
  452. LD x4, XX, 7 * SIZE
  453. MADD4 y4, a6, x3, y4
  454. LD a6, AO1, 7 * SIZE
  455. MADD1 y1, a1, x1, y1
  456. LD x3, XX, 6 * SIZE
  457. MADD2 y2, a1, x2, y2
  458. MADD3 y3, a2, x2, y3
  459. MADD4 y4, a2, x1, y4
  460. MADD1 y1, a5, x3, y1
  461. MADD2 y2, a5, x4, y2
  462. MADD3 y3, a6, x4, y3
  463. addi.d XX, XX, 8 * SIZE
  464. MADD4 y4, a6, x3, y4
  465. addi.d AO1, AO1, 8 * SIZE
  466. .align 3
  /* Remainder: two element pairs when bit 1 of M is set. */
  467. .L25:
  468. andi I, M, 2
  469. bge $r0, I, .L27
  470. LD a1, AO1, 0 * SIZE
  471. LD x1, XX, 0 * SIZE
  472. LD a2, AO1, 1 * SIZE
  473. LD x2, XX, 1 * SIZE
  474. LD a5, AO1, 2 * SIZE
  475. MADD1 y1, a1, x1, y1
  476. LD x3, XX, 2 * SIZE
  477. MADD2 y2, a1, x2, y2
  478. LD a6, AO1, 3 * SIZE
  479. MADD3 y3, a2, x2, y3
  480. LD x4, XX, 3 * SIZE
  481. MADD4 y4, a2, x1, y4
  482. MADD1 y1, a5, x3, y1
  483. MADD2 y2, a5, x4, y2
  484. MADD3 y3, a6, x4, y3
  485. addi.d XX, XX, 4 * SIZE
  486. MADD4 y4, a6, x3, y4
  487. addi.d AO1, AO1, 4 * SIZE
  488. .align 3
  /* Remainder: one last element pair when M is odd. */
  489. .L27:
  490. andi I, M, 1
  491. .align 3
  492. bge $r0, I, .L29
  493. .L28:
  494. LD a1, AO1, 0 * SIZE
  495. LD x1, XX, 0 * SIZE
  496. LD a2, AO1, 1 * SIZE
  497. LD x2, XX, 1 * SIZE
  498. MADD1 y1, a1, x1, y1
  499. MADD2 y2, a1, x2, y2
  500. MADD3 y3, a2, x2, y3
  501. MADD4 y4, a2, x1, y4
  502. .align 3
  /* Fold the split accumulators (y1 += y3, y2 += y4), then update one entry of y
     with the same ALPHA_R / ALPHA_I sequence used for the column pairs.
     The entry is read through Y and written through YY. */
  503. .L29:
  504. LD a1, Y, 0 * SIZE
  505. LD a2, Y, 1 * SIZE
  506. ADD y1, y1, y3
  507. ADD y2, y2, y4
  508. MADD a1, y1, ALPHA_R, a1
  509. MADD a2, y1, ALPHA_I, a2
  510. NMSUB a1, y2, ALPHA_I, a1
  511. MADD a2, y2, ALPHA_R, a2
  512. ST a1, YY, 0 * SIZE
  513. ST a2, YY, 1 * SIZE
  /* Common exit: reload the registers spilled at entry, release the frame, return. */
  514. .align 3
  515. .L999:
  /* Reload $r23/$r24 from the slots written at entry. */
  516. LDARG $r23, $sp, 0
  517. LDARG $r24, $sp, 8
  /* $f18/$f19 are only saved and restored when __64BIT__ is not defined. */
  518. #ifndef __64BIT__
  519. fld.d $f18, $sp, 16
  520. fld.d $f19, $sp, 24
  521. #endif
  /* Frame release mirrors the allocation at entry (16 or 32 bytes). */
  522. #ifdef __64BIT__
  523. addi.d $sp, $sp, 16
  524. #else
  525. addi.d $sp, $sp, 32
  526. #endif
  /* NOTE(review): $r4 and $f0 are overwritten with $r17 (BUFFER) and $f22 (a1)
     just before returning; the reason is not evident from this file - confirm
     whether any caller relies on these values. */
  527. move $r4, $r17
  528. fmov.d $f0, $f22
  /* Jump to the address in $r1 without writing a link register. */
  529. jirl $r0, $r1, 0x0
  530. EPILOGUE