You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

dgemm_ncopy_16.S 20 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693
  1. /*******************************************************************************
  2. Copyright (c) 2021, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  /*
   * Register aliases used by the routine below.
   *   I, J     : inner (row) and outer (column-panel) loop counters.
   *   S1..S16  : pointers into up to sixteen lda-strided source columns.
   *   TD, TS   : running destination pointer / source pointer of the next panel.
   *   TL, T0   : strides derived from LDA (see the setup code after PROLOGUE).
   * NOTE: TL shares $r7 with LDA and T0 shares $r6 with SRC, so SRC has to be
   * copied elsewhere before T0 is written.
   */
  29. /* Function parameters */
  30. #define M $r4 // param 1: m
  31. #define N $r5 // param 2: n
  32. #define SRC $r6 // param 3: src
  33. #define LDA $r7 // param 4: lda
  34. #define DST $r8 // param 5: dst
  /* Loop counters */
  35. #define I $r9
  36. #define J $r10
  /* Source pointers, one per column of the panel being copied */
  37. #define S1 $r12
  38. #define S2 $r13
  39. #define S3 $r14
  40. #define S4 $r15
  41. #define S5 $r16
  42. #define S6 $r17
  43. #define S7 $r18
  44. #define S8 $r19
  45. #define S9 $r20
  /* S10..S16, TD and TS live in $r23-$r31, which the prologue spills to the stack */
  46. #define S10 $r23
  47. #define S11 $r24
  48. #define S12 $r25
  49. #define S13 $r26
  50. #define S14 $r27
  51. #define S15 $r28
  52. #define S16 $r29
  53. #define TD $r30
  54. #define TS $r31
  /* TL reuses the LDA register, T0 reuses the SRC register */
  55. #define TL $r7
  56. #define T0 $r6
  57. #define ZERO $r0
  /* Scalar floating-point registers used by the one-row-at-a-time tail loops */
  58. #define F0 $f0
  59. #define F1 $f1
  60. #define F2 $f2
  61. #define F3 $f3
  62. #define F4 $f4
  63. #define F5 $f5
  64. #define F6 $f6
  65. #define F7 $f7
  66. /* LASX vectors */
  /* U0..U15: vectors loaded from the source (also reused as temporaries) */
  67. #define U0 $xr0
  68. #define U1 $xr1
  69. #define U2 $xr2
  70. #define U3 $xr3
  71. #define U4 $xr4
  72. #define U5 $xr5
  73. #define U6 $xr6
  74. #define U7 $xr7
  75. #define U8 $xr8
  76. #define U9 $xr9
  77. #define U10 $xr10
  78. #define U11 $xr11
  79. #define U12 $xr12
  80. #define U13 $xr13
  81. #define U14 $xr14
  82. #define U15 $xr15
  /* D0..D15: shuffled vectors that are written to the destination */
  83. #define D0 $xr16
  84. #define D1 $xr17
  85. #define D2 $xr18
  86. #define D3 $xr19
  87. #define D4 $xr20
  88. #define D5 $xr21
  89. #define D6 $xr22
  90. #define D7 $xr23
  91. #define D8 $xr24
  92. #define D9 $xr25
  93. #define D10 $xr26
  94. #define D11 $xr27
  95. #define D12 $xr28
  96. #define D13 $xr29
  97. #define D14 $xr30
  98. #define D15 $xr31
  /*
   * Entry: reserve a 0x90-byte stack frame, spill $r23-$r31 and $f23-$f31
   * (they are reloaded at .L_N0), then set up the cursors, the strides and
   * the count of full 16-column panels.
   */
  99. PROLOGUE
  100. addi.d $sp, $sp, -0x90
  101. SDARG $r23, $sp, 0x00
  102. SDARG $r24, $sp, 0x08
  103. SDARG $r25, $sp, 0x10
  104. SDARG $r26, $sp, 0x18
  105. SDARG $r27, $sp, 0x20
  106. SDARG $r28, $sp, 0x28
  107. SDARG $r29, $sp, 0x30
  108. SDARG $r30, $sp, 0x38
  109. SDARG $r31, $sp, 0x40
  110. ST $f23, $sp, 0x48
  111. ST $f24, $sp, 0x50
  112. ST $f25, $sp, 0x58
  113. ST $f26, $sp, 0x60
  114. ST $f27, $sp, 0x68
  115. ST $f28, $sp, 0x70
  116. ST $f29, $sp, 0x78
  117. ST $f30, $sp, 0x80
  118. ST $f31, $sp, 0x88
  // TD / TS become the running destination / source cursors. SRC must be
  // copied before T0 is written below because T0 is the same register as SRC.
  119. move TD, DST
  120. move TS, SRC
  // TL = LDA * 8: byte stride between consecutive source columns
  // (elements are moved as 8-byte doubles, see the fld.d/fst.d tails).
  121. slli.d TL, LDA, 0x03
  // T0 = 2 * TL
  122. slli.d T0, TL, 0x01
  // J = N >> 4: number of full 16-column panels; zero -> go to the remainders.
  123. srai.d J, N, 0x04
  124. beq J, ZERO, .L_N8
  /*
   * Start of one 16-column panel. S1..S16 are pointed at 16 consecutive
   * columns starting at TS (S1..S4 by single strides, S5..S16 each as
   * "two pointers back + 2*TL"), and TS is advanced to the column after them.
   */
  125. .L_J1: /* J-- */
  126. move S1, TS
  127. add.d S2, TS, TL
  // I = M >> 3: number of 8-row passes of the vector loop .L_I1
  128. srai.d I, M, 0x03
  129. add.d S3, S2, TL
  // The panel counter is decremented here; it is tested at .L_I0.
  130. addi.d J, J, -1
  131. add.d S4, S3, TL
  132. add.d S5, S3, T0
  133. add.d S6, S4, T0
  134. add.d S7, S5, T0
  135. add.d S8, S6, T0
  136. add.d S9, S7, T0
  137. add.d S10, S8, T0
  138. add.d S11, S9, T0
  139. add.d S12, S10, T0
  140. add.d S13, S11, T0
  141. add.d S14, S12, T0
  142. add.d S15, S13, T0
  143. add.d S16, S14, T0
  // TS = S15 + 2*TL: first column of the next panel
  144. add.d TS, S15, T0
  // Fewer than 8 rows: only the scalar row loop runs.
  145. beq I, ZERO, .L_I7
  /*
   * Vector loop for a 16-column panel: each pass reads 8 rows (0x40 bytes)
   * from each of the 16 columns and writes 8 * 16 doubles (4 * 0x100 bytes)
   * to TD. The pass is done as two identical halves of 4 rows each
   * (source offsets 0x00 and 0x20).
   *
   * Within a half, per the LASX instruction definitions:
   *   - xvpackev.d / xvpackod.d pair up the even / odd elements of two
   *     neighbouring columns,
   *   - xvand.v Ux, Dx, Dx keeps a copy of Dx before it is overwritten,
   *   - xvpermi.q merges 128-bit halves so that every D register ends up
   *     holding one row of four consecutive columns.
   * The trailing "// n" numbers give the 32-byte slot (n * 0x20 from the TD
   * value at the start of the half) in which that vector is stored.
   */
  146. .L_I1: /* I-- */
  // First half: 4 rows from each column.
  147. xvld U0, S1, 0x00
  148. xvld U1, S2, 0x00
  149. xvld U2, S3, 0x00
  150. xvld U3, S4, 0x00
  151. xvld U4, S5, 0x00
  152. xvld U5, S6, 0x00
  153. xvld U6, S7, 0x00
  154. xvld U7, S8, 0x00
  155. xvld U8, S9, 0x00
  156. xvld U9, S10, 0x00
  157. xvld U10, S11, 0x00
  158. xvld U11, S12, 0x00
  159. xvld U12, S13, 0x00
  160. xvld U13, S14, 0x00
  161. xvld U14, S15, 0x00
  162. xvld U15, S16, 0x00
  // Interleave column pairs (S1,S2), (S3,S4), ... (S15,S16).
  163. xvpackev.d D0, U1, U0
  164. xvpackod.d D1, U1, U0
  165. xvpackev.d D2, U3, U2
  166. xvpackod.d D3, U3, U2
  167. xvpackev.d D4, U5, U4
  168. xvpackod.d D5, U5, U4
  169. xvpackev.d D6, U7, U6
  170. xvpackod.d D7, U7, U6
  171. xvpackev.d D8, U9, U8
  172. xvpackod.d D9, U9, U8
  173. xvpackev.d D10, U11, U10
  174. xvpackod.d D11, U11, U10
  175. xvpackev.d D12, U13, U12
  176. xvpackod.d D13, U13, U12
  177. xvpackev.d D14, U15, U14
  178. xvpackod.d D15, U15, U14
  // Columns S1..S8: combine 128-bit halves (U0/U4/U1/U5 hold saved copies).
  179. xvand.v U0, D0, D0
  180. xvpermi.q D0, D2, 0x02 // 0
  181. xvand.v U4, D4, D4
  182. xvpermi.q D4, D6, 0x02 // 1
  183. xvand.v U1, D1, D1
  184. xvpermi.q D1, D3, 0x02 // 4
  185. xvand.v U5, D5, D5
  186. xvpermi.q D5, D7, 0x02 // 5
  187. xvpermi.q D2, U0, 0x31 // 8
  188. xvpermi.q D6, U4, 0x31 // 9
  189. xvpermi.q D3, U1, 0x31 // 12
  190. xvpermi.q D7, U5, 0x31 // 13
  // Columns S9..S16: same sequence.
  191. xvand.v U8, D8, D8
  192. xvpermi.q D8, D10, 0x02 // 2
  193. xvand.v U12, D12, D12
  194. xvpermi.q D12, D14, 0x02 // 3
  195. xvand.v U9, D9, D9
  196. xvpermi.q D9, D11, 0x02 // 6
  197. xvand.v U13, D13, D13
  198. xvpermi.q D13, D15, 0x02 // 7
  199. xvpermi.q D10, U8, 0x31 // 10
  200. xvpermi.q D14, U12, 0x31 // 11
  201. xvpermi.q D11, U9, 0x31 // 14
  202. xvpermi.q D15, U13, 0x31 // 15
  // Store slots 0..7, then bump TD and store slots 8..15.
  203. xvst D0, TD, 0x00 // 0
  204. xvst D4, TD, 0x20 // 1
  205. xvst D8, TD, 0x40 // 2
  206. xvst D12, TD, 0x60 // 3
  207. xvst D1, TD, 0x80 // 4
  208. xvst D5, TD, 0xA0 // 5
  209. xvst D9, TD, 0xC0 // 6
  210. xvst D13, TD, 0xE0 // 7
  211. addi.d TD, TD, 0x100
  212. xvst D2, TD, 0x00 // 8
  213. xvst D6, TD, 0x20 // 9
  214. xvst D10, TD, 0x40 // 10
  215. xvst D14, TD, 0x60 // 11
  216. xvst D3, TD, 0x80 // 12
  217. xvst D7, TD, 0xA0 // 13
  218. xvst D11, TD, 0xC0 // 14
  219. xvst D15, TD, 0xE0 // 15
  220. addi.d TD, TD, 0x100
  // Second half: the next 4 rows (source offset 0x20), same shuffle and stores.
  221. xvld U0, S1, 0x20
  222. xvld U1, S2, 0x20
  223. xvld U2, S3, 0x20
  224. xvld U3, S4, 0x20
  225. xvld U4, S5, 0x20
  226. xvld U5, S6, 0x20
  227. xvld U6, S7, 0x20
  228. xvld U7, S8, 0x20
  229. xvld U8, S9, 0x20
  230. xvld U9, S10, 0x20
  231. xvld U10, S11, 0x20
  232. xvld U11, S12, 0x20
  233. xvld U12, S13, 0x20
  234. xvld U13, S14, 0x20
  235. xvld U14, S15, 0x20
  236. xvld U15, S16, 0x20
  237. xvpackev.d D0, U1, U0
  238. xvpackod.d D1, U1, U0
  239. xvpackev.d D2, U3, U2
  240. xvpackod.d D3, U3, U2
  241. xvpackev.d D4, U5, U4
  242. xvpackod.d D5, U5, U4
  243. xvpackev.d D6, U7, U6
  244. xvpackod.d D7, U7, U6
  245. xvpackev.d D8, U9, U8
  246. xvpackod.d D9, U9, U8
  247. xvpackev.d D10, U11, U10
  248. xvpackod.d D11, U11, U10
  249. xvpackev.d D12, U13, U12
  250. xvpackod.d D13, U13, U12
  251. xvpackev.d D14, U15, U14
  252. xvpackod.d D15, U15, U14
  253. xvand.v U0, D0, D0
  254. xvpermi.q D0, D2, 0x02 // 0
  255. xvand.v U4, D4, D4
  256. xvpermi.q D4, D6, 0x02 // 1
  257. xvand.v U1, D1, D1
  258. xvpermi.q D1, D3, 0x02 // 4
  259. xvand.v U5, D5, D5
  260. xvpermi.q D5, D7, 0x02 // 5
  261. xvpermi.q D2, U0, 0x31 // 8
  262. xvpermi.q D6, U4, 0x31 // 9
  263. xvpermi.q D3, U1, 0x31 // 12
  264. xvpermi.q D7, U5, 0x31 // 13
  265. xvand.v U8, D8, D8
  266. xvpermi.q D8, D10, 0x02 // 2
  267. xvand.v U12, D12, D12
  268. xvpermi.q D12, D14, 0x02 // 3
  269. xvand.v U9, D9, D9
  270. xvpermi.q D9, D11, 0x02 // 6
  271. xvand.v U13, D13, D13
  272. xvpermi.q D13, D15, 0x02 // 7
  273. xvpermi.q D10, U8, 0x31 // 10
  274. xvpermi.q D14, U12, 0x31 // 11
  275. xvpermi.q D11, U9, 0x31 // 14
  276. xvpermi.q D15, U13, 0x31 // 15
  277. xvst D0, TD, 0x00 // 0
  278. xvst D4, TD, 0x20 // 1
  279. xvst D8, TD, 0x40 // 2
  280. xvst D12, TD, 0x60 // 3
  281. xvst D1, TD, 0x80 // 4
  282. xvst D5, TD, 0xA0 // 5
  283. xvst D9, TD, 0xC0 // 6
  284. xvst D13, TD, 0xE0 // 7
  285. addi.d TD, TD, 0x100
  286. xvst D2, TD, 0x00 // 8
  287. xvst D6, TD, 0x20 // 9
  288. xvst D10, TD, 0x40 // 10
  289. xvst D14, TD, 0x60 // 11
  290. xvst D3, TD, 0x80 // 12
  291. xvst D7, TD, 0xA0 // 13
  292. xvst D11, TD, 0xC0 // 14
  293. xvst D15, TD, 0xE0 // 15
  294. addi.d TD, TD, 0x100
  // Advance every column pointer by the 8 rows (0x40 bytes) just consumed.
  295. addi.d S1, S1, 0x40
  296. addi.d S2, S2, 0x40
  297. addi.d S3, S3, 0x40
  298. addi.d S4, S4, 0x40
  299. addi.d S5, S5, 0x40
  300. addi.d S6, S6, 0x40
  301. addi.d S7, S7, 0x40
  302. addi.d S8, S8, 0x40
  303. addi.d S9, S9, 0x40
  304. addi.d S10, S10, 0x40
  305. addi.d S11, S11, 0x40
  306. addi.d S12, S12, 0x40
  307. addi.d S13, S13, 0x40
  308. addi.d S14, S14, 0x40
  309. addi.d S15, S15, 0x40
  310. addi.d S16, S16, 0x40
  // Loop while I > 0.
  311. addi.d I, I, -1
  312. blt ZERO, I, .L_I1
  /*
   * Row tail of a 16-column panel: the M & 7 rows left over by .L_I1 are
   * copied one row per pass with scalar loads/stores. Each pass writes
   * 16 doubles (0x80 bytes): first from S1..S8, then from S9..S16.
   * .L_I0 closes the panel loop.
   */
  313. .L_I7:
  314. andi I, M, 0x07
  315. beq I, ZERO, .L_I0
  316. .L_II1: /* I-- */
  // One element from each of S1..S8; pointer bumps are interleaved with stores.
  317. fld.d F0, S1, 0x00
  318. fld.d F1, S2, 0x00
  319. fld.d F2, S3, 0x00
  320. fld.d F3, S4, 0x00
  321. fld.d F4, S5, 0x00
  322. fld.d F5, S6, 0x00
  323. fld.d F6, S7, 0x00
  324. fld.d F7, S8, 0x00
  325. fst.d F0, TD, 0x00
  326. addi.d S1, S1, 0x08
  327. fst.d F1, TD, 0x08
  328. addi.d S2, S2, 0x08
  329. fst.d F2, TD, 0x10
  330. addi.d S3, S3, 0x08
  331. fst.d F3, TD, 0x18
  332. addi.d S4, S4, 0x08
  333. fst.d F4, TD, 0x20
  334. addi.d S5, S5, 0x08
  335. fst.d F5, TD, 0x28
  336. addi.d S6, S6, 0x08
  337. fst.d F6, TD, 0x30
  338. addi.d S7, S7, 0x08
  339. fst.d F7, TD, 0x38
  340. addi.d S8, S8, 0x08
  341. addi.d TD, TD, 0x40
  // Same for S9..S16, reusing F0..F7.
  342. fld.d F0, S9, 0x00
  343. fld.d F1, S10, 0x00
  344. fld.d F2, S11, 0x00
  345. fld.d F3, S12, 0x00
  346. fld.d F4, S13, 0x00
  347. fld.d F5, S14, 0x00
  348. fld.d F6, S15, 0x00
  349. fld.d F7, S16, 0x00
  350. fst.d F0, TD, 0x00
  351. addi.d S9, S9, 0x08
  352. fst.d F1, TD, 0x08
  353. addi.d S10, S10, 0x08
  354. fst.d F2, TD, 0x10
  355. addi.d S11, S11, 0x08
  356. fst.d F3, TD, 0x18
  357. addi.d S12, S12, 0x08
  358. fst.d F4, TD, 0x20
  359. addi.d S13, S13, 0x08
  360. fst.d F5, TD, 0x28
  361. addi.d S14, S14, 0x08
  362. fst.d F6, TD, 0x30
  363. addi.d S15, S15, 0x08
  364. fst.d F7, TD, 0x38
  365. addi.d S16, S16, 0x08
  366. addi.d TD, TD, 0x40
  367. addi.d I, I, -1
  368. blt ZERO, I, .L_II1
  // J was already decremented at the top of .L_J1; repeat while panels remain.
  369. .L_I0:
  370. blt ZERO, J, .L_J1
  /*
   * Remainder panel of 8 columns, taken when bit 3 of N is set.
   * S1..S8 are pointed at 8 consecutive columns starting at TS and TS is
   * advanced past them.
   */
  371. .L_N8:
  372. andi J, N, 0x08
  373. beq ZERO, J, .L_N4
  374. move S1, TS
  375. add.d S2, TS, TL
  // I = M >> 3: number of 8-row passes of .L_8I1
  376. srai.d I, M, 0x03
  377. add.d S3, S2, TL
  378. add.d S4, S2, T0
  379. add.d S5, S3, T0
  380. add.d S6, S4, T0
  381. add.d S7, S5, T0
  382. add.d S8, S6, T0
  // TS = S7 + 2*TL: first column after this panel
  383. add.d TS, S7, T0
  384. beq I, ZERO, .L_8I3
  /*
   * Vector loop for the 8-column panel: each pass reads 8 rows (0x40 bytes)
   * from each of S1..S8 and writes 8 * 8 doubles (2 * 0x100 bytes) to TD,
   * in two identical halves of 4 rows (source offsets 0x00 and 0x20).
   * The pack / copy / permute sequence is the one used by the 16-column
   * loop; the "// n" numbers give the 32-byte store slot of each vector
   * within the half.
   */
  385. .L_8I1: /* I-- */
  // First half: 4 rows from each column.
  386. xvld U0, S1, 0x00
  387. xvld U1, S2, 0x00
  388. xvld U2, S3, 0x00
  389. xvld U3, S4, 0x00
  390. xvld U4, S5, 0x00
  391. xvld U5, S6, 0x00
  392. xvld U6, S7, 0x00
  393. xvld U7, S8, 0x00
  394. xvpackev.d D0, U1, U0
  395. xvpackod.d D1, U1, U0
  396. xvpackev.d D2, U3, U2
  397. xvpackod.d D3, U3, U2
  398. xvpackev.d D4, U5, U4
  399. xvpackod.d D5, U5, U4
  400. xvpackev.d D6, U7, U6
  401. xvpackod.d D7, U7, U6
  // xvand.v keeps a copy of the register the following xvpermi.q overwrites.
  402. xvand.v U0, D0, D0
  403. xvpermi.q D0, D2, 0x02 // 0
  404. xvand.v U4, D4, D4
  405. xvpermi.q D4, D6, 0x02 // 1
  406. xvand.v U1, D1, D1
  407. xvpermi.q D1, D3, 0x02 // 2
  408. xvand.v U5, D5, D5
  409. xvpermi.q D5, D7, 0x02 // 3
  410. xvpermi.q D2, U0, 0x31 // 4
  411. xvpermi.q D6, U4, 0x31 // 5
  412. xvpermi.q D3, U1, 0x31 // 6
  413. xvpermi.q D7, U5, 0x31 // 7
  414. xvst D0, TD, 0x00
  415. xvst D4, TD, 0x20
  416. xvst D1, TD, 0x40
  417. xvst D5, TD, 0x60
  418. xvst D2, TD, 0x80
  419. xvst D6, TD, 0xA0
  420. xvst D3, TD, 0xC0
  421. xvst D7, TD, 0xE0
  422. addi.d TD, TD, 0x100
  // Second half: the next 4 rows (source offset 0x20).
  423. xvld U0, S1, 0x20
  424. xvld U1, S2, 0x20
  425. xvld U2, S3, 0x20
  426. xvld U3, S4, 0x20
  427. xvld U4, S5, 0x20
  428. xvld U5, S6, 0x20
  429. xvld U6, S7, 0x20
  430. xvld U7, S8, 0x20
  431. xvpackev.d D0, U1, U0
  432. xvpackod.d D1, U1, U0
  433. xvpackev.d D2, U3, U2
  434. xvpackod.d D3, U3, U2
  435. xvpackev.d D4, U5, U4
  436. xvpackod.d D5, U5, U4
  437. xvpackev.d D6, U7, U6
  438. xvpackod.d D7, U7, U6
  439. xvand.v U0, D0, D0
  440. xvpermi.q D0, D2, 0x02 // 0
  441. xvand.v U4, D4, D4
  442. xvpermi.q D4, D6, 0x02 // 1
  443. xvand.v U1, D1, D1
  444. xvpermi.q D1, D3, 0x02 // 2
  445. xvand.v U5, D5, D5
  446. xvpermi.q D5, D7, 0x02 // 3
  447. xvpermi.q D2, U0, 0x31 // 4
  448. xvpermi.q D6, U4, 0x31 // 5
  449. xvpermi.q D3, U1, 0x31 // 6
  450. xvpermi.q D7, U5, 0x31 // 7
  451. xvst D0, TD, 0x00
  452. xvst D4, TD, 0x20
  453. xvst D1, TD, 0x40
  454. xvst D5, TD, 0x60
  455. xvst D2, TD, 0x80
  456. xvst D6, TD, 0xA0
  457. xvst D3, TD, 0xC0
  458. xvst D7, TD, 0xE0
  459. addi.d TD, TD, 0x100
  // Advance every column pointer by the 8 rows (0x40 bytes) just consumed.
  460. addi.d S1, S1, 0x40
  461. addi.d S2, S2, 0x40
  462. addi.d S3, S3, 0x40
  463. addi.d S4, S4, 0x40
  464. addi.d S5, S5, 0x40
  465. addi.d S6, S6, 0x40
  466. addi.d S7, S7, 0x40
  467. addi.d S8, S8, 0x40
  468. addi.d I, I, -1
  469. blt ZERO, I, .L_8I1
  /*
   * Row tail of the 8-column panel: the M & 7 rows left over by .L_8I1 are
   * copied one row (8 doubles, 0x40 bytes) per pass with scalar loads/stores.
   */
  470. .L_8I3:
  471. andi I, M, 0x07
  472. beq I, ZERO, .L_N4
  473. .L_8I11:
  474. fld.d F0, S1, 0x00
  475. fld.d F1, S2, 0x00
  476. fld.d F2, S3, 0x00
  477. fld.d F3, S4, 0x00
  478. fld.d F4, S5, 0x00
  479. fld.d F5, S6, 0x00
  480. fld.d F6, S7, 0x00
  481. fld.d F7, S8, 0x00
  // Stores are interleaved with the one-element advance of each column pointer.
  482. fst.d F0, TD, 0x00
  483. addi.d S1, S1, 0x08
  484. fst.d F1, TD, 0x08
  485. addi.d S2, S2, 0x08
  486. fst.d F2, TD, 0x10
  487. addi.d S3, S3, 0x08
  488. fst.d F3, TD, 0x18
  489. addi.d S4, S4, 0x08
  490. fst.d F4, TD, 0x20
  491. addi.d S5, S5, 0x08
  492. fst.d F5, TD, 0x28
  493. addi.d S6, S6, 0x08
  494. fst.d F6, TD, 0x30
  495. addi.d S7, S7, 0x08
  496. fst.d F7, TD, 0x38
  497. addi.d S8, S8, 0x08
  498. addi.d TD, TD, 0x40
  499. addi.d I, I, -1
  500. blt ZERO, I, .L_8I11
  /*
   * Remainder panel of 4 columns, taken when bit 2 of N is set.
   * The vector loop .L_4I1 handles 4 rows per pass (I = M >> 2) and stores
   * four 32-byte vectors; the scalar loop .L_4II1 then copies the remaining
   * M & 3 rows, 4 doubles per pass.
   */
  501. .L_N4:
  502. andi J, N, 0x04
  503. beq ZERO, J, .L_N2
  504. move S1, TS
  505. add.d S2, TS, TL
  506. srai.d I, M, 0x02
  507. add.d S3, S2, TL
  508. add.d S4, S2, T0
  // TS = S3 + 2*TL: first column after this panel
  509. add.d TS, S3, T0
  510. beq I, ZERO, .L_I3
  511. .L_4I1: /* I-- */
  512. xvld U0, S1, 0x00
  513. xvld U1, S2, 0x00
  514. xvld U2, S3, 0x00
  515. xvld U3, S4, 0x00
  516. xvpackev.d D0, U1, U0
  517. xvpackod.d D1, U1, U0
  518. xvpackev.d D2, U3, U2
  519. xvpackod.d D3, U3, U2
  // U0/U1 keep copies of D0/D1, which the next xvpermi.q overwrites.
  520. xvand.v U0, D0, D0
  521. xvpermi.q D0, D2, 0x02 // 0
  522. xvand.v U1, D1, D1
  523. xvpermi.q D1, D3, 0x02 // 1
  524. xvpermi.q D2, U0, 0x31 // 2
  525. xvpermi.q D3, U1, 0x31 // 3
  526. xvst D0, TD, 0x00
  527. xvst D1, TD, 0x20
  528. xvst D2, TD, 0x40
  529. xvst D3, TD, 0x60
  // 4 rows (0x20 bytes) consumed per column, 0x80 bytes produced.
  530. addi.d S1, S1, 0x20
  531. addi.d S2, S2, 0x20
  532. addi.d S3, S3, 0x20
  533. addi.d S4, S4, 0x20
  534. addi.d TD, TD, 0x80
  535. addi.d I, I, -1
  536. blt ZERO, I, .L_4I1
  537. .L_I3:
  538. andi I, M, 0x03
  539. beq I, ZERO, .L_N2
  540. .L_4II1:
  541. fld.d F0, S1, 0x00
  542. fld.d F1, S2, 0x00
  543. fld.d F2, S3, 0x00
  544. fld.d F3, S4, 0x00
  545. fst.d F0, TD, 0x00
  546. addi.d S1, S1, 0x08
  547. fst.d F1, TD, 0x08
  548. addi.d S2, S2, 0x08
  549. fst.d F2, TD, 0x10
  550. addi.d S3, S3, 0x08
  551. fst.d F3, TD, 0x18
  552. addi.d S4, S4, 0x08
  553. addi.d TD, TD, 0x20
  554. addi.d I, I, -1
  555. blt ZERO, I, .L_4II1
  /*
   * Remainder panel of 2 columns, taken when bit 1 of N is set.
   *
   * In : TS = first column of the pair, TL = column stride in bytes,
   *      TD = destination cursor, M = row count.
   * Out: TS advanced by two columns, TD advanced by 2 * M doubles.
   *
   * The destination receives, for every row, the element of the first
   * column followed by the element of the second column.
   */
  .L_N2:
      andi       J,     N,     0x02
      beq        ZERO,  J,     .L_N1

      move       S1,    TS                  // first column of the pair
      add.d      S2,    TS,    TL           // second column of the pair
      srai.d     I,     M,     0x01         // I = number of row pairs
      add.d      TS,    S2,    TL           // TS -> first column after the pair
      beq        I,     ZERO,  .L_NI1

  .L_2I1:  /* I-- : two rows of both columns per pass */
      /*
       * Load exactly the two elements per column that this pass stores.
       * A 32-byte xvld would also fetch the following two rows, which on
       * the final pass lie beyond the last row of the column (and, for the
       * last column of the matrix, beyond the source buffer).
       */
      fld.d      F0,    S1,    0x00         // row i,   column 0
      fld.d      F1,    S2,    0x00         // row i,   column 1
      fld.d      F2,    S1,    0x08         // row i+1, column 0
      fld.d      F3,    S2,    0x08         // row i+1, column 1

      fst.d      F0,    TD,    0x00
      fst.d      F1,    TD,    0x08
      fst.d      F2,    TD,    0x10
      fst.d      F3,    TD,    0x18

      addi.d     S1,    S1,    0x10
      addi.d     S2,    S2,    0x10
      addi.d     TD,    TD,    0x20

      addi.d     I,     I,     -1
      blt        ZERO,  I,     .L_2I1

  .L_NI1:  /* odd M: one last row of both columns */
      andi       I,     M,     0x01
      beq        I,     ZERO,  .L_N1

      fld.d      F0,    S1,    0x00
      fld.d      F1,    S2,    0x00

      fst.d      F0,    TD,    0x00
      addi.d     S1,    S1,    0x08
      fst.d      F1,    TD,    0x08
      addi.d     S2,    S2,    0x08
      addi.d     TD,    TD,    0x10
  /*
   * Last remainder: a single column, taken when bit 0 of N is set.
   * The M elements starting at TS are copied to TD one double at a time;
   * M itself is used as the countdown because it is not needed afterwards.
   */
  .L_N1:
      andi       J,     N,     0x01
      beqz       J,     .L_N0               // N is even: nothing left to copy

      or         S1,    TS,    ZERO         // S1 = TS
      beqz       M,     .L_N0               // no rows

  .L_M1:
      fld.d      F0,    S1,    0x00
      addi.d     M,     M,     -1
      addi.d     S1,    S1,    0x08
      fst.d      F0,    TD,    0x00
      addi.d     TD,    TD,    0x08
      bgtz       M,     .L_M1               // continue while M > 0
  /*
   * Common exit: reload $r23-$r31 and $f23-$f31 from the slots written in
   * the prologue, release the 0x90-byte frame and return.
   */
  598. .L_N0:
  599. LDARG $r23, $sp, 0x00
  600. LDARG $r24, $sp, 0x08
  601. LDARG $r25, $sp, 0x10
  602. LDARG $r26, $sp, 0x18
  603. LDARG $r27, $sp, 0x20
  604. LDARG $r28, $sp, 0x28
  605. LDARG $r29, $sp, 0x30
  606. LDARG $r30, $sp, 0x38
  607. LDARG $r31, $sp, 0x40
  608. LD $f23, $sp, 0x48
  609. LD $f24, $sp, 0x50
  610. LD $f25, $sp, 0x58
  611. LD $f26, $sp, 0x60
  612. LD $f27, $sp, 0x68
  613. LD $f28, $sp, 0x70
  614. LD $f29, $sp, 0x78
  615. LD $f30, $sp, 0x80
  616. LD $f31, $sp, 0x88
  617. addi.d $sp, $sp, 0x90
  // Jump to the address in $r1; the link value is discarded into $r0.
  618. jirl $r0, $r1, 0x00
  619. EPILOGUE