You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

loongarch64_asm.S 23 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783
  1. /*******************************************************************************
  2. Copyright (c) 2023, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #if __loongarch_grlen == 64 // 64-bit general-purpose registers: PTR_* map to the .d instruction forms
  28. #define LA_REG int64_t
  29. #define REG_SIZE 8 // bytes per general-purpose register
  30. #define REG_LOG 3 // log2(REG_SIZE), used as a shift count when computing stack offsets
  31. #define PTR_ADDI addi.d
  32. #define PTR_ADD add.d
  33. #define PTR_SUB sub.d
  34. #define PTR_LD ld.d
  35. #define PTR_ST st.d
  36. #define PTR_SLLI slli.d
  37. #define PTR_SRLI srli.d
  38. #define PTR_SRAI srai.d
  39. #define PTR_MUL mul.d
  40. #define PTR_ALSL alsl.d
  41. #elif __loongarch_grlen == 32 // 32-bit general-purpose registers: PTR_* map to the .w instruction forms
  42. #define LA_REG int32_t
  43. #define REG_SIZE 4 // bytes per general-purpose register
  44. #define REG_LOG 2 // log2(REG_SIZE)
  45. #define PTR_ADDI addi.w
  46. #define PTR_ADD add.w
  47. #define PTR_SUB sub.w
  48. #define PTR_LD ld.w
  49. #define PTR_ST st.w
  50. #define PTR_SLLI slli.w
  51. #define PTR_SRLI srli.w
  52. #define PTR_SRAI srai.w
  53. #define PTR_MUL mul.w
  54. #define PTR_ALSL alsl.w
  55. #else
  56. // Neither value matched (an undefined __loongarch_grlen evaluates to 0 inside #if), which
  57. // indicates an early internal toolchain. Fall back to the 64-bit definitions for maximum compatibility:
  58. #define LA_REG int64_t
  59. #define REG_SIZE 8
  60. #define REG_LOG 3
  61. #define PTR_ADDI addi.d
  62. #define PTR_ADD add.d
  63. #define PTR_SUB sub.d
  64. #define PTR_LD ld.d
  65. #define PTR_ST st.d
  66. #define PTR_SLLI slli.d
  67. #define PTR_SRLI srli.d
  68. #define PTR_SRAI srai.d
  69. #define PTR_MUL mul.d
  70. #define PTR_ALSL alsl.d
  71. #endif
  72. #if __loongarch_frlen == 64 // 64-bit floating-point registers: use fld.d / fst.d
  73. #define FREG_SIZE 8 // bytes per floating-point register
  74. #define FREG_LOG 3 // log2(FREG_SIZE), used as a shift count when computing stack offsets
  75. #define PTR_FLD fld.d
  76. #define PTR_FST fst.d
  77. #elif __loongarch_frlen == 32 // 32-bit floating-point registers: use fld.s / fst.s
  78. #define FREG_SIZE 4 // bytes per floating-point register
  79. #define FREG_LOG 2 // log2(FREG_SIZE)
  80. #define PTR_FLD fld.s
  81. #define PTR_FST fst.s
  82. #else
  83. // Neither value matched (an undefined __loongarch_frlen evaluates to 0 inside #if), which
  84. // indicates an early internal toolchain. Fall back to the 64-bit definitions for maximum compatibility:
  85. #define FREG_SIZE 8
  86. #define FREG_LOG 3
  87. #define PTR_FLD fld.d
  88. #define PTR_FST fst.d
  89. #endif
  90. .altmacro // Enable alternate macro mode; the recursive push/pop macros below rely on its %expr argument evaluation
  91. /*
  92. * push_if_used / pop_if_used: save and restore static registers on the stack.
  93. * regs : number of static general-purpose registers ($s0 upwards) to save, 0 <= regs <= 9
  94. * fregs: number of static floating-point registers ($fs0 upwards) to save, 0 <= fregs <= 8
  95. * NOTE(review): $sp is moved by exactly regs * REG_SIZE and fregs * FREG_SIZE bytes with no rounding - confirm that callers do not need a stricter stack alignment. */
  96. .macro push_if_used regs, fregs
  97. .if \regs > 0
  98. PTR_ADDI $sp, $sp, -(\regs << REG_LOG) // reserve regs * REG_SIZE bytes
  99. push_regs 0, \regs - 1
  100. .endif
  101. .if \fregs > 0
  102. PTR_ADDI $sp, $sp, -(\fregs << FREG_LOG) // reserve fregs * FREG_SIZE bytes below the general-purpose area
  103. push_fregs 0, \fregs - 1
  104. .endif
  105. .endm // End push_if_used
  106. .macro pop_if_used regs, fregs // Undo push_if_used: must be given the same regs / fregs counts
  107. .if \fregs > 0 // floating-point registers were pushed last, so they are restored first
  108. pop_fregs 0, \fregs - 1
  109. PTR_ADDI $sp, $sp, \fregs << FREG_LOG // release the floating-point save area
  110. .endif
  111. .if \regs > 0
  112. pop_regs 0, \regs - 1
  113. PTR_ADDI $sp, $sp, \regs << REG_LOG // release the general-purpose save area
  114. .endif
  115. .endm // End pop_if_used
  116. .macro push_regs from, to // Store static registers $s<from> .. $s<to> at $sp + (index << REG_LOG); $sp itself is not changed
  117. #ifdef __clang__ // Unrolled variant used under clang: it ignores from and always starts at $s0
  118. .if \to >= 0
  119. PTR_ST $s0, $sp, 0 << REG_LOG
  120. .endif
  121. .if \to >= 1
  122. PTR_ST $s1, $sp, 1 << REG_LOG
  123. .endif
  124. .if \to >= 2
  125. PTR_ST $s2, $sp, 2 << REG_LOG
  126. .endif
  127. .if \to >= 3
  128. PTR_ST $s3, $sp, 3 << REG_LOG
  129. .endif
  130. .if \to >= 4
  131. PTR_ST $s4, $sp, 4 << REG_LOG
  132. .endif
  133. .if \to >= 5
  134. PTR_ST $s5, $sp, 5 << REG_LOG
  135. .endif
  136. .if \to >= 6
  137. PTR_ST $s6, $sp, 6 << REG_LOG
  138. .endif
  139. .if \to >= 7
  140. PTR_ST $s7, $sp, 7 << REG_LOG
  141. .endif
  142. .if \to >= 8
  143. PTR_ST $s8, $sp, 8 << REG_LOG
  144. .endif
  145. #else // Recursive variant: store $s<from>, then recurse with %from + 1 (evaluated by altmacro) until from == to
  146. PTR_ST $s\()\from, $sp, \from << REG_LOG
  147. .if \to - \from // non-zero while registers remain; NOTE(review): recursion does not terminate if from > to
  148. push_regs %from + 1, \to
  149. .endif
  150. #endif
  151. .endm // End push_regs
  152. .macro pop_regs from, to // Reload static registers $s<from> .. $s<to> from $sp + (index << REG_LOG); $sp itself is not changed
  153. #ifdef __clang__ // Unrolled variant used under clang: it ignores from and always starts at $s0
  154. .if \to >= 0
  155. PTR_LD $s0, $sp, 0 << REG_LOG
  156. .endif
  157. .if \to >= 1
  158. PTR_LD $s1, $sp, 1 << REG_LOG
  159. .endif
  160. .if \to >= 2
  161. PTR_LD $s2, $sp, 2 << REG_LOG
  162. .endif
  163. .if \to >= 3
  164. PTR_LD $s3, $sp, 3 << REG_LOG
  165. .endif
  166. .if \to >= 4
  167. PTR_LD $s4, $sp, 4 << REG_LOG
  168. .endif
  169. .if \to >= 5
  170. PTR_LD $s5, $sp, 5 << REG_LOG
  171. .endif
  172. .if \to >= 6
  173. PTR_LD $s6, $sp, 6 << REG_LOG
  174. .endif
  175. .if \to >= 7
  176. PTR_LD $s7, $sp, 7 << REG_LOG
  177. .endif
  178. .if \to >= 8
  179. PTR_LD $s8, $sp, 8 << REG_LOG
  180. .endif
  181. #else // Recursive variant: load $s<from>, then recurse with %from + 1 (evaluated by altmacro) until from == to
  182. PTR_LD $s\()\from, $sp, \from << REG_LOG
  183. .if \to - \from // non-zero while registers remain; NOTE(review): recursion does not terminate if from > to
  184. pop_regs %from + 1, \to
  185. .endif
  186. #endif
  187. .endm // End pop_regs
  188. .macro push_fregs from, to // Store static floating-point registers $fs<from> .. $fs<to> at $sp + (index << FREG_LOG); $sp itself is not changed
  189. #ifdef __clang__ // Unrolled variant used under clang: it ignores from and always starts at $fs0
  190. .if \to >= 0
  191. PTR_FST $fs0, $sp, 0 << FREG_LOG
  192. .endif
  193. .if \to >= 1
  194. PTR_FST $fs1, $sp, 1 << FREG_LOG
  195. .endif
  196. .if \to >= 2
  197. PTR_FST $fs2, $sp, 2 << FREG_LOG
  198. .endif
  199. .if \to >= 3
  200. PTR_FST $fs3, $sp, 3 << FREG_LOG
  201. .endif
  202. .if \to >= 4
  203. PTR_FST $fs4, $sp, 4 << FREG_LOG
  204. .endif
  205. .if \to >= 5
  206. PTR_FST $fs5, $sp, 5 << FREG_LOG
  207. .endif
  208. .if \to >= 6
  209. PTR_FST $fs6, $sp, 6 << FREG_LOG
  210. .endif
  211. .if \to >= 7
  212. PTR_FST $fs7, $sp, 7 << FREG_LOG
  213. .endif
  214. #else // Recursive variant: store $fs<from>, then recurse with %from + 1 (evaluated by altmacro) until from == to
  215. PTR_FST $fs\()\from, $sp, \from << FREG_LOG
  216. .if \to - \from // non-zero while registers remain; NOTE(review): recursion does not terminate if from > to
  217. push_fregs %from + 1, \to
  218. .endif
  219. #endif
  220. .endm // End push_fregs
  221. .macro pop_fregs from, to // Reload static floating-point registers $fs<from> .. $fs<to> from $sp + (index << FREG_LOG); $sp itself is not changed
  222. #ifdef __clang__ // Unrolled variant used under clang: it ignores from and always starts at $fs0
  223. .if \to >= 0
  224. PTR_FLD $fs0, $sp, 0 << FREG_LOG
  225. .endif
  226. .if \to >= 1
  227. PTR_FLD $fs1, $sp, 1 << FREG_LOG
  228. .endif
  229. .if \to >= 2
  230. PTR_FLD $fs2, $sp, 2 << FREG_LOG
  231. .endif
  232. .if \to >= 3
  233. PTR_FLD $fs3, $sp, 3 << FREG_LOG
  234. .endif
  235. .if \to >= 4
  236. PTR_FLD $fs4, $sp, 4 << FREG_LOG
  237. .endif
  238. .if \to >= 5
  239. PTR_FLD $fs5, $sp, 5 << FREG_LOG
  240. .endif
  241. .if \to >= 6
  242. PTR_FLD $fs6, $sp, 6 << FREG_LOG
  243. .endif
  244. .if \to >= 7
  245. PTR_FLD $fs7, $sp, 7 << FREG_LOG
  246. .endif
  247. #else // Recursive variant: load $fs<from>, then recurse with %from + 1 (evaluated by altmacro) until from == to
  248. PTR_FLD $fs\()\from, $sp, \from << FREG_LOG
  249. .if \to - \from // non-zero while registers remain; NOTE(review): recursion does not terminate if from > to
  250. pop_fregs %from + 1, \to
  251. .endif
  252. #endif
  253. .endm // End pop_fregs
  254. //
  255. // Instruction Related Macros
  256. //
  257. // GLD
  258. // Emits <pre_op>ld[.<suf_op>] out, src, offset (immediate) for each (out, src, offset) triple; suf_op 0 (the default) means no suffix. Extra triples in more are handled by recursion.
  259. .macro GLD pre_op:req, suf_op=0, out:req, src:req, offset:req/* imm */, more:vararg
  260. .ifeqs "\suf_op", "0"
  261. \pre_op\()ld \out, \src, \offset
  262. .else
  263. \pre_op\()ld.\suf_op \out, \src, \offset
  264. .endif
  265. .ifnb \more
  266. GLD \pre_op, \suf_op, \more
  267. .endif
  268. .endm
  269. //
  270. // GLD_INC
  271. // Same as GLD, but after every load the src register is advanced by the immediate inc (PTR_ADDI src, src, inc).
  272. .macro GLD_INC pre_op:req, suf_op=0, inc:req, out:req, src:req, offset:req/* imm */, more:vararg
  273. .ifeqs "\suf_op", "0"
  274. \pre_op\()ld \out, \src, \offset
  275. .else
  276. \pre_op\()ld.\suf_op \out, \src, \offset
  277. .endif
  278. PTR_ADDI \src, \src, \inc // post-increment the base register
  279. .ifnb \more
  280. GLD_INC \pre_op, \suf_op, \inc, \more
  281. .endif
  282. .endm
  283. //
  284. // GLDX is the same as GLD except that the offset operand is a register (emits the ldx form)
  285. //
  286. .macro GLDX pre_op:req, suf_op=0, out:req, src:req, offset:req/* reg */, more:vararg
  287. .ifeqs "\suf_op", "0"
  288. \pre_op\()ldx \out, \src, \offset
  289. .else
  290. \pre_op\()ldx.\suf_op \out, \src, \offset
  291. .endif
  292. .ifnb \more
  293. GLDX \pre_op, \suf_op, \more
  294. .endif
  295. .endm
  296. //
  297. // GLDREPL
  298. // Emits <pre_op>ldrepl.<suf_op> out, src, offset (immediate) for each (out, src, offset) triple; suf_op is mandatory here.
  299. .macro GLDREPL pre_op:req, suf_op:req, out:req, src:req, offset:req/* imm */, more:vararg
  300. \pre_op\()ldrepl.\suf_op \out, \src, \offset
  301. .ifnb \more
  302. GLDREPL \pre_op, \suf_op, \more
  303. .endif
  304. .endm
  305. //
  306. // GST
  307. // Emits <pre_op>st[.<suf_op>] src, dst, offset (immediate) for each (src, dst, offset) triple; suf_op 0 (the default) means no suffix.
  308. .macro GST pre_op:req, suf_op=0, src:req, dst:req, offset:req/* imm */, more:vararg
  309. .ifeqs "\suf_op", "0"
  310. \pre_op\()st \src, \dst, \offset
  311. .else
  312. \pre_op\()st.\suf_op \src, \dst, \offset
  313. .endif
  314. .ifnb \more
  315. GST \pre_op, \suf_op, \more
  316. .endif
  317. .endm
  318. //
  319. // GMUL
  320. // Emits <pre_op>mul.<suf_op> out, in0, in1 for each (out, in0, in1) triple; pre_op is optional and may be left empty.
  321. .macro GMUL pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
  322. \pre_op\()mul.\suf_op \out, \in0, \in1
  323. .ifnb \more
  324. GMUL \pre_op, \suf_op, \more
  325. .endif
  326. .endm
  327. //
  328. // GMADD
  329. // Emits <pre_op>madd.<suf_op> out, in0, in1, in2 for each (out, in0, in1, in2) group; pre_op is optional and may be left empty.
  330. .macro GMADD pre_op, suf_op:req, out:req, in0:req, in1:req, in2:req, more:vararg
  331. \pre_op\()madd.\suf_op \out, \in0, \in1, \in2
  332. .ifnb \more
  333. GMADD \pre_op, \suf_op, \more
  334. .endif
  335. .endm
  336. //
  337. // GADD
  338. // Emits <pre_op>add.<suf_op> out, in0, in1 for each (out, in0, in1) triple; pre_op is optional and may be left empty.
  339. .macro GADD pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
  340. \pre_op\()add.\suf_op \out, \in0, \in1
  341. .ifnb \more
  342. GADD \pre_op, \suf_op, \more
  343. .endif
  344. .endm
  345. //
  346. // GADDI
  347. // Emits <pre_op>addi.<suf_op> out, in0, in1 for each (out, in0, in1) triple; pre_op is optional and may be left empty.
  348. .macro GADDI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
  349. \pre_op\()addi.\suf_op \out, \in0, \in1
  350. .ifnb \more
  351. GADDI \pre_op, \suf_op, \more
  352. .endif
  353. .endm
  354. //
  355. // GSUB
  356. // Emits <pre_op>sub.<suf_op> out, in0, in1 for each (out, in0, in1) triple; pre_op is optional and may be left empty.
  357. .macro GSUB pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
  358. \pre_op\()sub.\suf_op \out, \in0, \in1
  359. .ifnb \more
  360. GSUB \pre_op, \suf_op, \more
  361. .endif
  362. .endm
  363. //
  364. // GSLLI
  365. // Emits <pre_op>slli.<suf_op> out, in0, in1 for each (out, in0, in1) triple; pre_op is optional and may be left empty.
  366. .macro GSLLI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
  367. \pre_op\()slli.\suf_op \out, \in0, \in1
  368. .ifnb \more
  369. GSLLI \pre_op, \suf_op, \more
  370. .endif
  371. .endm
  372. //
  373. // GINSVE0
  374. // Emits <pre_op>insve0.<suf_op> out, in0, in1 for each (out, in0, in1) triple.
  375. .macro GINSVE0 pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg
  376. \pre_op\()insve0.\suf_op \out, \in0, \in1
  377. .ifnb \more
  378. GINSVE0 \pre_op, \suf_op, \more
  379. .endif
  380. .endm
  381. //
  382. // GXOR
  383. // Bitwise XOR, out = in0 ^ in1, for each (out, in0, in1) triple. A non-empty pre_op (v / xv) emits <pre_op>xor.v; an empty pre_op emits the scalar xor. suf_op is kept in the signature for compatibility but does not appear in the emitted instruction.
  384. .macro GXOR pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
  385. .ifnb \pre_op
  386. \pre_op\()xor.v \out, \in0, \in1
  387. .else
  388. xor \out, \in0, \in1 // the general-purpose xor instruction takes no width suffix
  389. .endif
  390. .ifnb \more
  391. GXOR \pre_op, \suf_op, \more
  392. .endif
  393. .endm
  394. //
  395. // GPERMI
  396. // Emits <pre_op>permi.<suf_op> out, in0, in1 for each (out, in0, in1) triple.
  397. .macro GPERMI pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg
  398. \pre_op\()permi.\suf_op \out, \in0, \in1
  399. .ifnb \more
  400. GPERMI \pre_op, \suf_op, \more
  401. .endif
  402. .endm
  403. //
  404. // GNMSUB
  405. // Emits <pre_op>nmsub.<suf_op> out, in0, in1, in2 for each (out, in0, in1, in2) group.
  406. .macro GNMSUB pre_op:req, suf_op:req, out:req, in0:req, in1:req, in2:req, more:vararg
  407. \pre_op\()nmsub.\suf_op \out, \in0, \in1, \in2
  408. .ifnb \more
  409. GNMSUB \pre_op, \suf_op, \more
  410. .endif
  411. .endm
  412. //
  413. // GPRELD
  414. // Emits preld in0, in1, in2 for each (in0, in1, in2) triple; takes no pre_op / suf_op.
  415. .macro GPRELD in0:req, in1:req, in2:req, more:vararg
  416. preld \in0, \in1, \in2
  417. .ifnb \more
  418. GPRELD \more
  419. .endif
  420. .endm
  421. //
  422. // GPACKEV
  423. // Emits <pre_op>packev.<suf_op> out, in0, in1 for each (out, in0, in1) triple.
  424. .macro GPACKEV pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg
  425. \pre_op\()packev.\suf_op \out, \in0, \in1
  426. .ifnb \more
  427. GPACKEV \pre_op, \suf_op, \more
  428. .endif
  429. .endm
  430. //
  431. // GPACKOD
  432. // Emits <pre_op>packod.<suf_op> out, in0, in1 for each (out, in0, in1) triple.
  433. .macro GPACKOD pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg
  434. \pre_op\()packod.\suf_op \out, \in0, \in1
  435. .ifnb \more
  436. GPACKOD \pre_op, \suf_op, \more
  437. .endif
  438. .endm
  439. //
  440. // GSHUF4I
  441. // Emits <pre_op>shuf4i.<suf_op> out, in0, in1 for each (out, in0, in1) triple; in1 is an immediate.
  442. .macro GSHUF4I pre_op:req, suf_op:req, out:req, in0:req, in1:req /* imm */, more:vararg
  443. \pre_op\()shuf4i.\suf_op \out, \in0, \in1
  444. .ifnb \more
  445. GSHUF4I \pre_op, \suf_op, \more
  446. .endif
  447. .endm
  448. .macro TRANSF2G name, pre_op:req, suf_op:req, more:vararg // Invoke macro name with a floating-point pre_op/suf_op pair translated as below; any other pair emits nothing
  449. .ifeqs "\pre_op\()\suf_op", "vfs" // vf + s  ->  v, w
  450. \name v, w, \more
  451. .endif
  452. .ifeqs "\pre_op\()\suf_op", "vfd" // vf + d  ->  v, d
  453. \name v, d, \more
  454. .endif
  455. .ifeqs "\pre_op\()\suf_op", "xvfs" // xvf + s  ->  xv, w
  456. \name xv, w, \more
  457. .endif
  458. .ifeqs "\pre_op\()\suf_op", "xvfd" // xvf + d  ->  xv, d
  459. \name xv, d, \more
  460. .endif
  461. .endm
  462. //
  463. // Compound instructions
  464. //
  465. // GACC: Accumulate the values of vector registers
  466. // For each (out, in) pair, combines the elements of in through a permute/pack-and-add sequence selected by pre_op + suf_op. in is overwritten as scratch for every variant except vfd and vd; an unrecognized pre_op/suf_op pair emits nothing.
  467. .macro GACC pre_op:req, suf_op:req, out:req, in:req, more:vararg
  468. .ifeqs "\pre_op\()\suf_op", "xvfd" // adds are emitted as xvfadd.d
  469. xvpermi.q \out, \in, 0x01
  470. \pre_op\()add.\suf_op \in, \out, \in
  471. xvpackod.d \out, \in, \in
  472. \pre_op\()add.\suf_op \out, \out, \in
  473. .endif
  474. .ifeqs "\pre_op\()\suf_op", "xvfs" // adds are emitted as xvfadd.s
  475. xvpermi.q \out, \in, 0x01
  476. \pre_op\()add.\suf_op \in, \out, \in
  477. xvpackod.d \out, \in, \in
  478. \pre_op\()add.\suf_op \out, \out, \in
  479. xvpackod.w \in, \out, \out
  480. \pre_op\()add.\suf_op \out, \out, \in
  481. .endif
  482. .ifeqs "\pre_op\()\suf_op", "vfd" // adds are emitted as vfadd.d; in is left unmodified
  483. vpackod.d \out, \in, \in
  484. \pre_op\()add.\suf_op \out, \out, \in
  485. .endif
  486. .ifeqs "\pre_op\()\suf_op", "vfs" // adds are emitted as vfadd.s
  487. vpackod.d \out, \in, \in
  488. \pre_op\()add.\suf_op \out, \out, \in
  489. vpackod.w \in, \out, \out
  490. \pre_op\()add.\suf_op \out, \out, \in
  491. .endif
  492. .ifeqs "\pre_op\()\suf_op", "xvd" // adds are emitted as xvadd.d
  493. xvpermi.q \out, \in, 0x01
  494. \pre_op\()add.\suf_op \in, \out, \in
  495. xvpackod.d \out, \in, \in
  496. \pre_op\()add.\suf_op \out, \out, \in
  497. .endif
  498. .ifeqs "\pre_op\()\suf_op", "xvw" // adds are emitted as xvadd.w
  499. xvpermi.q \out, \in, 0x01
  500. \pre_op\()add.\suf_op \in, \out, \in
  501. xvpackod.d \out, \in, \in
  502. \pre_op\()add.\suf_op \out, \out, \in
  503. xvpackod.w \in, \out, \out
  504. \pre_op\()add.\suf_op \out, \out, \in
  505. .endif
  506. .ifeqs "\pre_op\()\suf_op", "xvh" // adds are emitted as xvadd.h
  507. xvpermi.q \out, \in, 0x01
  508. \pre_op\()add.\suf_op \in, \out, \in
  509. xvpackod.d \out, \in, \in
  510. \pre_op\()add.\suf_op \out, \out, \in
  511. xvpackod.w \in, \out, \out
  512. \pre_op\()add.\suf_op \out, \out, \in
  513. xvpackod.h \in, \out, \out
  514. \pre_op\()add.\suf_op \out, \out, \in
  515. .endif
  516. .ifeqs "\pre_op\()\suf_op", "xvb" // adds are emitted as xvadd.b
  517. xvpermi.q \out, \in, 0x01
  518. \pre_op\()add.\suf_op \in, \out, \in
  519. xvpackod.d \out, \in, \in
  520. \pre_op\()add.\suf_op \out, \out, \in
  521. xvpackod.w \in, \out, \out
  522. \pre_op\()add.\suf_op \out, \out, \in
  523. xvpackod.h \in, \out, \out
  524. \pre_op\()add.\suf_op \out, \out, \in
  525. xvpackod.b \in, \out, \out
  526. \pre_op\()add.\suf_op \out, \out, \in
  527. .endif
  528. .ifeqs "\pre_op\()\suf_op", "vd" // adds are emitted as vadd.d; in is left unmodified
  529. vpackod.d \out, \in, \in
  530. \pre_op\()add.\suf_op \out, \out, \in
  531. .endif
  532. .ifeqs "\pre_op\()\suf_op", "vw" // adds are emitted as vadd.w
  533. vpackod.d \out, \in, \in
  534. \pre_op\()add.\suf_op \out, \out, \in
  535. vpackod.w \in, \out, \out
  536. \pre_op\()add.\suf_op \out, \out, \in
  537. .endif
  538. .ifeqs "\pre_op\()\suf_op", "vh" // adds are emitted as vadd.h
  539. vpackod.d \out, \in, \in
  540. \pre_op\()add.\suf_op \out, \out, \in
  541. vpackod.w \in, \out, \out
  542. \pre_op\()add.\suf_op \out, \out, \in
  543. vpackod.h \in, \out, \out
  544. \pre_op\()add.\suf_op \out, \out, \in
  545. .endif
  546. .ifeqs "\pre_op\()\suf_op", "vb" // adds are emitted as vadd.b
  547. vpackod.d \out, \in, \in
  548. \pre_op\()add.\suf_op \out, \out, \in
  549. vpackod.w \in, \out, \out
  550. \pre_op\()add.\suf_op \out, \out, \in
  551. vpackod.h \in, \out, \out
  552. \pre_op\()add.\suf_op \out, \out, \in
  553. vpackod.b \in, \out, \out
  554. \pre_op\()add.\suf_op \out, \out, \in
  555. .endif
  556. .ifnb \more
  557. GACC \pre_op, \suf_op, \more
  558. .endif
  559. .endm
  560. //
  561. // GMOV
  562. // Vector register copy: emits <pre_op>or.v out, in, in (in OR-ed with itself) for each (out, in) pair, processed left to right.
  563. .macro GMOV pre_op:req, out:req, in:req, more:vararg
  564. \pre_op\()or.v \out, \in, \in
  565. .ifnb \more
  566. GMOV \pre_op, \more
  567. .endif
  568. .endm
  569. //
  570. // GCOMPLEXACC: Complex accumulate the values of vector registers
  571. // pre_op: xvf or vf, selects between LASX (xvf) and LSX (vf) instructions
  572. // suf_op: s or d, selects between single-precision and double-precision complex numbers
  573. // Note: when pre_op = xvf and suf_op = s, in is overwritten. Any other pre_op/suf_op pair than the four below emits nothing.
  574. //
  575. .macro GCOMPLEXACC pre_op:req, suf_op:req, out:req, in:req, more:vararg
  576. .ifeqs "\pre_op\()\suf_op", "xvfd"
  577. xvpermi.q \out, \in, 0x01
  578. \pre_op\()add.\suf_op \out, \out, \in
  579. .endif
  580. .ifeqs "\pre_op\()\suf_op", "xvfs"
  581. xvpermi.q \out, \in, 0x01
  582. \pre_op\()add.\suf_op \in, \out, \in // this is the step that overwrites in
  583. xvpackod.d \out, \in, \in
  584. \pre_op\()add.\suf_op \out, \out, \in
  585. .endif
  586. .ifeqs "\pre_op\()\suf_op", "vfd" // nothing to add in this variant: out is just a copy of in
  587. vor.v \out, \in, \in
  588. .endif
  589. .ifeqs "\pre_op\()\suf_op", "vfs"
  590. vpackod.d \out, \in, \in
  591. \pre_op\()add.\suf_op \out, \out, \in
  592. .endif
  593. .ifnb \more
  594. GCOMPLEXACC \pre_op, \suf_op, \more
  595. .endif
  596. .endm
  597. //
  598. // GCOMPLEXMUL: Complex multiplication, out = in0 * in1
  599. // xconj: default value 0.
  600. // if !(xconj)
  601. // out_r = in0_r * in1_r - in0_i * in1_i;
  602. // out_i = in0_r * in1_i + in0_i * in1_r;
  603. // else
  604. // out_r = in0_r * in1_r + in0_i * in1_i;
  605. // out_i = in0_r * in1_i - in0_i * in1_r;
  606. // pre_op: xvf or vf, selects between LASX (xvf) and LSX (vf) instructions
  607. // suf_op: s or d, selects between single-precision and double-precision complex numbers
  608. // tmp0, tmp1, tmp2: scratch vector registers, overwritten by the macro. Further (out, in0, in1, tmp0, tmp1, tmp2) groups may follow in more.
  609. .macro GCOMPLEXMUL xconj=0, pre_op:req, suf_op:req, out:req, in0:req, in1:req, tmp0:req, tmp1:req, tmp2:req, more:vararg
  610. TRANSF2G GXOR, \pre_op, s, \tmp1, \tmp1, \tmp1
  611. TRANSF2G GPACKEV, \pre_op, \suf_op, \tmp0, \in0, \in0
  612. \pre_op\()sub.\suf_op \tmp1, \tmp1, \in0 // tmp1 = 0 - in0 (tmp1 was cleared by the GXOR with itself above)
  613. .ifeqs "\xconj", "0" // xconj only changes the operand order of the packod that builds the in0 imaginary-part factor
  614. TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \in0, \tmp1
  615. .else
  616. TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \tmp1, \in0
  617. .endif
  618. .ifeqs "\suf_op", "s" // the shuffle immediate differs between single (0xb1) and double (0x0b) precision
  619. TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0xb1
  620. .else
  621. TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0x0b
  622. .endif
  623. \pre_op\()mul.\suf_op \out, \tmp0, \in1 // out = tmp0 * in1
  624. \pre_op\()madd.\suf_op \out, \tmp1, \tmp2, \out // out = tmp1 * tmp2 + out
  625. .ifnb \more
  626. GCOMPLEXMUL \xconj, \pre_op, \suf_op, \more
  627. .endif
  628. .endm
  629. //
  630. // GCOMPLEXMADD: Complex multiply-accumulate, out = in0 * in1 + in2
  631. // xconj: default value 0
  632. // conj: default value 0
  633. // if !(CONJ)
  634. // if !(XCONJ)
  635. // out_r = in0_r * in1_r - in0_i * in1_i + in2_r;
  636. // out_i = in0_r * in1_i + in0_i * in1_r + in2_i;
  637. // else
  638. // out_r = in0_r * in1_r + in0_i * in1_i + in2_r;
  639. // out_i = in0_r * in1_i - in0_i * in1_r + in2_i;
  640. // else
  641. // if !(XCONJ)
  642. // out_r = in0_r * in1_r + in0_i * in1_i + in2_r;
  643. // out_i = in2_i - (in0_r * in1_i - in0_i * in1_r);
  644. // else
  645. // out_r = in0_r * in1_r - in0_i * in1_i + in2_r;
  646. // out_i = in2_i - (in0_r * in1_i + in0_i * in1_r);
  647. // pre_op: xvf or vf, selects between LASX (xvf) and LSX (vf) instructions
  648. // suf_op: s or d, selects between single-precision and double-precision complex numbers
  649. // tmp0, tmp1, tmp2: scratch vector registers, overwritten by the macro. Further (out, in0, in1, in2, tmp0, tmp1, tmp2) groups may follow in more.
  650. .macro GCOMPLEXMADD xconj=0, conj=0, pre_op:req, suf_op:req, out:req, in0:req, in1:req, in2:req, tmp0:req, tmp1:req, tmp2:req, more:vararg
  651. TRANSF2G GXOR, \pre_op, s, \tmp1, \tmp1, \tmp1
  652. TRANSF2G GPACKEV, \pre_op, \suf_op, \tmp0, \in0, \in0
  653. \pre_op\()madd.\suf_op \tmp2, \tmp0, \in1, \in2 // tmp2 = tmp0 * in1 + in2
  654. .ifeqs "\conj\()\suf_op", "1s" // conj = 1, single precision
  655. \pre_op\()nmsub.\suf_op \tmp0, \tmp0, \in1, \in2 // tmp0 = in2 - tmp0 * in1 (matches the in2_i - (...) form above)
  656. TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp0, \tmp0, 0xb1
  657. TRANSF2G GPACKEV, \pre_op, \suf_op, \out, \tmp0, \tmp2
  658. .endif
  659. .ifeqs "\conj\()\suf_op", "1d" // conj = 1, double precision: same steps with the 0x0b shuffle immediate
  660. \pre_op\()nmsub.\suf_op \tmp0, \tmp0, \in1, \in2
  661. TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp0, \tmp0, 0x0b
  662. TRANSF2G GPACKEV, \pre_op, \suf_op, \out, \tmp0, \tmp2
  663. .endif
  664. .ifeqs "\conj", "0"
  665. \pre_op\()add.\suf_op \out, \tmp2, \tmp1 // out = tmp2 + 0, since tmp1 is still zero at this point
  666. .endif
  667. \pre_op\()sub.\suf_op \tmp1, \tmp1, \in0 // tmp1 = 0 - in0
  668. .ifeqs "\xconj\()\conj\()\suf_op", "00s" // the eight cases below differ only in the GPACKOD operands (per xconj/conj) and the shuffle immediate (per suf_op)
  669. TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \in0, \tmp1
  670. TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0xb1
  671. .endif
  672. .ifeqs "\xconj\()\conj\()\suf_op", "10s"
  673. TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \tmp1, \in0
  674. TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0xb1
  675. .endif
  676. .ifeqs "\xconj\()\conj\()\suf_op", "01s"
  677. TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \in0, \in0
  678. TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0xb1
  679. .endif
  680. .ifeqs "\xconj\()\conj\()\suf_op", "11s"
  681. TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \tmp1, \tmp1
  682. TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0xb1
  683. .endif
  684. .ifeqs "\xconj\()\conj\()\suf_op", "00d"
  685. TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \in0, \tmp1
  686. TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0x0b
  687. .endif
  688. .ifeqs "\xconj\()\conj\()\suf_op", "10d"
  689. TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \tmp1, \in0
  690. TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0x0b
  691. .endif
  692. .ifeqs "\xconj\()\conj\()\suf_op", "01d"
  693. TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \in0, \in0
  694. TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0x0b
  695. .endif
  696. .ifeqs "\xconj\()\conj\()\suf_op", "11d"
  697. TRANSF2G GPACKOD, \pre_op, \suf_op, \tmp1, \tmp1, \tmp1
  698. TRANSF2G GSHUF4I, \pre_op, \suf_op, \tmp2, \in1, 0x0b
  699. .endif
  700. \pre_op\()madd.\suf_op \out, \tmp1, \tmp2, \out // out = tmp1 * tmp2 + out
  701. .ifnb \more
  702. GCOMPLEXMADD \xconj, \conj, \pre_op, \suf_op, \more
  703. .endif
  704. .endm
  705. //
  706. // Media Related Macros
  707. // GSBUTTERFLY: emits <pre_op>ilvl.<suf_op> out0, in0, in1 followed by <pre_op>ilvh.<suf_op> out1, in0, in1. out0 is written before in0/in1 are read for out1.
  708. .macro GSBUTTERFLY pre_op, suf_op, out0, out1, in0, in1
  709. \pre_op\()ilvl.\suf_op \out0, \in0, \in1
  710. \pre_op\()ilvh.\suf_op \out1, \in0, \in1
  711. .endm
  712. .macro GINTERLACE pre_op, suf_op, out0, out1, in0, in1 // Emits <pre_op>pickev.<suf_op> into out0, then <pre_op>pickod.<suf_op> into out1, both reading in0, in1
  713. \pre_op\()pickev.\suf_op \out0, \in0, \in1
  714. \pre_op\()pickod.\suf_op \out1, \in0, \in1
  715. .endm
  716. //
  717. // GTRANSPOSE4x4_D: Transpose a 4x4 block of double-word elements held in the vectors in0..in3 into out0..out3.
  718. // There is no pre_op parameter: only xv instructions are emitted (128-bit vector instructions are not supported). vt0 and vt1 are scratch registers and are overwritten.
  719. //
  720. .macro GTRANSPOSE4x4_D in0, in1, in2, in3, out0, out1, out2, out3, \
  721. vt0, vt1
  722. GSBUTTERFLY xv, d, \vt0, \out1, \in1, \in0
  723. GSBUTTERFLY xv, d, \vt1, \out3, \in3, \in2
  724. GMOV xv, \out0, \vt0, \out2, \vt1, \vt1, \out3
  725. GPERMI xv, q, \out0, \out2, 0x02, \out2, \vt0, 0x31, \out3, \out1, 0x31, \out1, \vt1, 0x02
  726. .endm
  727. .macro GTRANSPOSE8x8_W out0, out1, out2, out3, out4, out5, out6, out7, \
  728. in0, in1, in2, in3, in4, in5, in6, in7, \
  729. tmp0, tmp1, tmp2, tmp3 // Presumably the 8x8 word-element counterpart of GTRANSPOSE4x4_D (name-based; TODO confirm). Note the outputs come first in the parameter list; tmp0..tmp3 are scratch and are overwritten
  730. GSBUTTERFLY xv, w, \tmp0, \tmp2, \in2, \in0
  731. GSBUTTERFLY xv, w, \tmp1, \tmp3, \in3, \in1
  732. GSBUTTERFLY xv, w, \out0, \out1, \tmp1, \tmp0
  733. GSBUTTERFLY xv, w, \out2, \out3, \tmp3, \tmp2
  734. GSBUTTERFLY xv, w, \tmp0, \tmp2, \in6, \in4
  735. GSBUTTERFLY xv, w, \tmp1, \tmp3, \in7, \in5
  736. GSBUTTERFLY xv, w, \out4, \out5, \tmp1, \tmp0
  737. GSBUTTERFLY xv, w, \out6, \out7, \tmp3, \tmp2
  738. GMOV xv, \tmp0, \out0, \tmp1, \out1, \tmp2, \out2, \tmp3, \out3
  739. GPERMI xv, q, \out0, \out4, 0x02, \out1, \out5, 0x02, \
  740. \out2, \out6, 0x02, \out3, \out7, 0x02, \
  741. \out4, \tmp0, 0x31, \out5, \tmp1, 0x31, \
  742. \out6, \tmp2, 0x31, \out7, \tmp3, 0x31
  743. .endm