|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792 |
- /*******************************************************************************
- Copyright (c) 2016, The OpenBLAS Project
- All rights reserved.
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are
- met:
- 1. Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- 2. Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in
- the documentation and/or other materials provided with the
- distribution.
- 3. Neither the name of the OpenBLAS project nor the names of
- its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
- USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *******************************************************************************/
-
- #ifndef __MACROS_MSA_H__
- #define __MACROS_MSA_H__
-
- #include <stdint.h>
- #include <msa.h>
-
- #define ENABLE_PREFETCH
-
- #ifdef ENABLE_PREFETCH
- inline static void prefetch_load_lf(unsigned char *src)
- {
- __asm__ __volatile__("pref 0, 0(%[src]) \n\t" : : [src] "r" (src));
- }
-
- #define PREFETCH(PTR) prefetch_load_lf((unsigned char *)(PTR));
-
- #define STRNG(X) #X
- #define PREF_OFFSET(src_ptr, offset) \
- __asm__ __volatile__("pref 0, " STRNG(offset) "(%[src]) \n\t" : : [src] "r" (src_ptr));
-
- #else
- #define PREFETCH(PTR)
- #define PREF_OFFSET(src_ptr, offset)
- #endif
-
- #define LD_W(RTYPE, psrc) *((RTYPE *)(psrc))
- #define LD_SP(...) LD_W(v4f32, __VA_ARGS__)
-
- #define LD_D(RTYPE, psrc) *((RTYPE *)(psrc))
- #define LD_DP(...) LD_D(v2f64, __VA_ARGS__)
-
- #define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
- #define ST_SP(...) ST_W(v4f32, __VA_ARGS__)
-
- #define ST_D(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
- #define ST_DP(...) ST_D(v2f64, __VA_ARGS__)
-
- #define COPY_FLOAT_TO_VECTOR(a) ( { \
- v4f32 out; \
- out = __msa_cast_to_vector_float(a); \
- out = (v4f32) __msa_splati_w((v4i32) out, 0); \
- out; \
- } )
-
- #define COPY_DOUBLE_TO_VECTOR(a) ( { \
- v2f64 out; \
- out = __msa_cast_to_vector_double(a); \
- out = (v2f64) __msa_splati_d((v2i64) out, 0); \
- out; \
- } )
-
- /* Description : Load 2 variables with stride
- Arguments : Inputs - psrc, stride
- Outputs - out0, out1
- */
- #define LD_GP2_INC(psrc, stride, out0, out1) \
- { \
- out0 = *(psrc); \
- (psrc) += stride; \
- out1 = *(psrc); \
- (psrc) += stride; \
- }
-
- #define LD_GP3_INC(psrc, stride, out0, \
- out1, out2) \
- { \
- LD_GP2_INC(psrc, stride, out0, out1); \
- out2 = *(psrc); \
- (psrc) += stride; \
- }
-
- #define LD_GP4_INC(psrc, stride, out0, \
- out1, out2, out3) \
- { \
- LD_GP2_INC(psrc, stride, out0, out1); \
- LD_GP2_INC(psrc, stride, out2, out3); \
- }
-
- #define LD_GP5_INC(psrc, stride, out0, \
- out1, out2, out3, out4) \
- { \
- LD_GP2_INC(psrc, stride, out0, out1); \
- LD_GP2_INC(psrc, stride, out2, out3); \
- out4 = *(psrc); \
- (psrc) += stride; \
- }
-
- #define LD_GP6_INC(psrc, stride, out0, \
- out1, out2, out3, \
- out4, out5) \
- { \
- LD_GP2_INC(psrc, stride, out0, out1); \
- LD_GP2_INC(psrc, stride, out2, out3); \
- LD_GP2_INC(psrc, stride, out4, out5); \
- }
-
- #define LD_GP7_INC(psrc, stride, out0, \
- out1, out2, out3, \
- out4, out5, out6) \
- { \
- LD_GP2_INC(psrc, stride, out0, out1); \
- LD_GP2_INC(psrc, stride, out2, out3); \
- LD_GP2_INC(psrc, stride, out4, out5); \
- out6 = *(psrc); \
- (psrc) += stride; \
- }
-
- #define LD_GP8_INC(psrc, stride, out0, out1, out2, \
- out3, out4, out5, out6, out7) \
- { \
- LD_GP4_INC(psrc, stride, out0, out1, out2, out3); \
- LD_GP4_INC(psrc, stride, out4, out5, out6, out7); \
- }
-
- /* Description : Load 2 vectors of single precision floating point elements with stride
- Arguments : Inputs - psrc, stride
- Outputs - out0, out1
- Return Type - single precision floating point
- */
- #define LD_SP2(psrc, stride, out0, out1) \
- { \
- out0 = LD_SP((psrc)); \
- out1 = LD_SP((psrc) + stride); \
- }
-
- #define LD_SP4(psrc, stride, out0, out1, out2, out3) \
- { \
- LD_SP2(psrc, stride, out0, out1) \
- LD_SP2(psrc + 2 * stride, stride, out2, out3) \
- }
-
- #define LD_SP2_INC(psrc, stride, out0, out1) \
- { \
- out0 = LD_SP((psrc)); \
- (psrc) += stride; \
- out1 = LD_SP((psrc)); \
- (psrc) += stride; \
- }
-
- #define LD_SP3_INC(psrc, stride, out0, \
- out1, out2) \
- { \
- LD_SP2_INC(psrc, stride, out0, out1); \
- out2 = LD_SP((psrc)); \
- (psrc) += stride; \
- }
-
- #define LD_SP4_INC(psrc, stride, out0, \
- out1, out2, out3) \
- { \
- LD_SP2_INC(psrc, stride, out0, out1); \
- LD_SP2_INC(psrc, stride, out2, out3); \
- }
-
- #define LD_SP5_INC(psrc, stride, out0, \
- out1, out2, out3, out4) \
- { \
- LD_SP2_INC(psrc, stride, out0, out1); \
- LD_SP2_INC(psrc, stride, out2, out3); \
- out4 = LD_SP((psrc)); \
- (psrc) += stride; \
- }
-
- #define LD_SP6_INC(psrc, stride, out0, \
- out1, out2, out3, \
- out4, out5) \
- { \
- LD_SP2_INC(psrc, stride, out0, out1); \
- LD_SP2_INC(psrc, stride, out2, out3); \
- LD_SP2_INC(psrc, stride, out4, out5); \
- }
-
- #define LD_SP7_INC(psrc, stride, out0, \
- out1, out2, out3, \
- out4, out5, out6) \
- { \
- LD_SP2_INC(psrc, stride, out0, out1); \
- LD_SP2_INC(psrc, stride, out2, out3); \
- LD_SP2_INC(psrc, stride, out4, out5); \
- out6 = LD_SP((psrc)); \
- (psrc) += stride; \
- }
-
- #define LD_SP8_INC(psrc, stride, out0, out1, out2, \
- out3, out4, out5, out6, out7) \
- { \
- LD_SP4_INC(psrc, stride, out0, out1, out2, out3); \
- LD_SP4_INC(psrc, stride, out4, out5, out6, out7); \
- }
-
- #define LD_SP16_INC(psrc, stride, out0, out1, out2, \
- out3, out4, out5, out6, out7, out8, \
- out9, out10, out11, out12, out13, \
- out14, out15) \
- { \
- LD_SP8_INC(psrc, stride, out0, out1, out2, \
- out3, out4, out5, out6, out7); \
- LD_SP8_INC(psrc, stride, out8, out9, out10, \
- out11, out12, out13, out14, out15); \
- }
-
- /* Description : Load 2 vectors of double precision floating point elements with stride
- Arguments : Inputs - psrc, stride
- Outputs - out0, out1
- Return Type - double precision floating point
- */
- #define LD_DP2(psrc, stride, out0, out1) \
- { \
- out0 = LD_DP((psrc)); \
- out1 = LD_DP((psrc) + stride); \
- }
-
- #define LD_DP4(psrc, stride, out0, out1, out2, out3) \
- { \
- LD_DP2(psrc, stride, out0, out1) \
- LD_DP2(psrc + 2 * stride, stride, out2, out3) \
- }
-
- #define LD_DP2_INC(psrc, stride, out0, out1) \
- { \
- out0 = LD_DP(psrc); \
- (psrc) += stride; \
- out1 = LD_DP(psrc); \
- (psrc) += stride; \
- }
-
- #define LD_DP3_INC(psrc, stride, out0, \
- out1, out2) \
- { \
- LD_DP2_INC(psrc, stride, out0, out1); \
- out2 = LD_DP((psrc)); \
- (psrc) += stride; \
- }
-
- #define LD_DP4_INC(psrc, stride, out0, \
- out1, out2, out3) \
- { \
- LD_DP2_INC(psrc, stride, out0, out1); \
- LD_DP2_INC(psrc, stride, out2, out3); \
- }
-
- #define LD_DP5_INC(psrc, stride, out0, \
- out1, out2, out3, out4) \
- { \
- LD_DP2_INC(psrc, stride, out0, out1); \
- LD_DP2_INC(psrc, stride, out2, out3); \
- out4 = LD_DP((psrc)); \
- (psrc) += stride; \
- }
-
- #define LD_DP6_INC(psrc, stride, out0, \
- out1, out2, out3, \
- out4, out5) \
- { \
- LD_DP2_INC(psrc, stride, out0, out1); \
- LD_DP2_INC(psrc, stride, out2, out3); \
- LD_DP2_INC(psrc, stride, out4, out5); \
- }
-
- #define LD_DP7_INC(psrc, stride, out0, \
- out1, out2, out3, \
- out4, out5, out6) \
- { \
- LD_DP2_INC(psrc, stride, out0, out1); \
- LD_DP2_INC(psrc, stride, out2, out3); \
- LD_DP2_INC(psrc, stride, out4, out5); \
- out6 = LD_DP((psrc)); \
- (psrc) += stride; \
- }
-
- #define LD_DP8_INC(psrc, stride, out0, out1, out2, \
- out3, out4, out5, out6, out7) \
- { \
- LD_DP4_INC(psrc, stride, out0, out1, out2, out3); \
- LD_DP4_INC(psrc, stride, out4, out5, out6, out7); \
- }
-
- #define LD_DP16_INC(psrc, stride, out0, out1, out2, \
- out3, out4, out5, out6, out7, out8, \
- out9, out10, out11, out12, out13, \
- out14, out15) \
- { \
- LD_DP8_INC(psrc, stride, out0, out1, out2, \
- out3, out4, out5, out6, out7); \
- LD_DP8_INC(psrc, stride, out8, out9, out10, \
- out11, out12, out13, out14, out15); \
- }
-
- /* Description : Store GP variable with stride
- Arguments : Inputs - in0, in1, pdst, stride
- Details : Store 4 single precision floating point elements from 'in0' to (pdst)
- Store 4 single precision floating point elements from 'in1' to (pdst + stride)
- */
- #define ST_GP2_INC(in0, in1, \
- pdst, stride) \
- { \
- *(pdst) = in0; \
- (pdst) += stride; \
- *(pdst) = in1; \
- (pdst) += stride; \
- }
-
- #define ST_GP3_INC(in0, in1, in2, \
- pdst, stride) \
- { \
- ST_GP2_INC(in0, in1, pdst, stride); \
- *(pdst) = in2; \
- (pdst) += stride; \
- }
-
- #define ST_GP4_INC(in0, in1, in2, in3, \
- pdst, stride) \
- { \
- ST_GP2_INC(in0, in1, pdst, stride); \
- ST_GP2_INC(in2, in3, pdst, stride); \
- }
-
- #define ST_GP5_INC(in0, in1, in2, in3, \
- in4, pdst, stride) \
- { \
- ST_GP2_INC(in0, in1, pdst, stride); \
- ST_GP2_INC(in2, in3, pdst, stride); \
- *(pdst) = in4; \
- (pdst) += stride; \
- }
-
- #define ST_GP6_INC(in0, in1, in2, in3, \
- in4, in5, pdst, stride) \
- { \
- ST_GP2_INC(in0, in1, pdst, stride); \
- ST_GP2_INC(in2, in3, pdst, stride); \
- ST_GP2_INC(in4, in5, pdst, stride); \
- }
-
- #define ST_GP7_INC(in0, in1, in2, in3, in4, \
- in5, in6, pdst, stride) \
- { \
- ST_GP2_INC(in0, in1, pdst, stride); \
- ST_GP2_INC(in2, in3, pdst, stride); \
- ST_GP2_INC(in4, in5, pdst, stride); \
- *(pdst) = in6; \
- (pdst) += stride; \
- }
-
- #define ST_GP8_INC(in0, in1, in2, in3, in4, in5, \
- in6, in7, pdst, stride) \
- { \
- ST_GP4_INC(in0, in1, in2, in3, pdst, stride); \
- ST_GP4_INC(in4, in5, in6, in7, pdst, stride); \
- }
-
- /* Description : Store vectors of single precision floating point elements with stride
- Arguments : Inputs - in0, in1, pdst, stride
- Details : Store 4 single precision floating point elements from 'in0' to (pdst)
- Store 4 single precision floating point elements from 'in1' to (pdst + stride)
- */
- #define ST_SP2(in0, in1, pdst, stride) \
- { \
- ST_SP(in0, (pdst)); \
- ST_SP(in1, (pdst) + stride); \
- }
-
- #define ST_SP4(in0, in1, in2, in3, pdst, stride) \
- { \
- ST_SP2(in0, in1, (pdst), stride); \
- ST_SP2(in2, in3, (pdst + 2 * stride), stride); \
- }
-
- #define ST_SP8(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
- { \
- ST_SP4(in0, in1, in2, in3, (pdst), stride); \
- ST_SP4(in4, in5, in6, in7, (pdst + 4 * stride), stride); \
- }
-
- #define ST_SP2_INC(in0, in1, pdst, stride) \
- { \
- ST_SP(in0, (pdst)); \
- (pdst) += stride; \
- ST_SP(in1, (pdst)); \
- (pdst) += stride; \
- }
-
- #define ST_SP3_INC(in0, in1, in2, \
- pdst, stride) \
- { \
- ST_SP2_INC(in0, in1, pdst, stride); \
- ST_SP(in2, (pdst)); \
- (pdst) += stride; \
- }
-
- #define ST_SP4_INC(in0, in1, in2, in3, \
- pdst, stride) \
- { \
- ST_SP2_INC(in0, in1, pdst, stride); \
- ST_SP2_INC(in2, in3, pdst, stride); \
- }
-
- #define ST_SP5_INC(in0, in1, in2, in3, \
- in4, pdst, stride) \
- { \
- ST_SP2_INC(in0, in1, pdst, stride); \
- ST_SP2_INC(in2, in3, pdst, stride); \
- ST_SP(in4, (pdst)); \
- (pdst) += stride; \
- }
-
- #define ST_SP6_INC(in0, in1, in2, in3, \
- in4, in5, pdst, stride) \
- { \
- ST_SP2_INC(in0, in1, pdst, stride); \
- ST_SP2_INC(in2, in3, pdst, stride); \
- ST_SP2_INC(in4, in5, pdst, stride); \
- }
-
- #define ST_SP7_INC(in0, in1, in2, in3, in4, \
- in5, in6, pdst, stride) \
- { \
- ST_SP2_INC(in0, in1, pdst, stride); \
- ST_SP2_INC(in2, in3, pdst, stride); \
- ST_SP2_INC(in4, in5, pdst, stride); \
- ST_SP(in6, (pdst)); \
- (pdst) += stride; \
- }
-
- #define ST_SP8_INC(in0, in1, in2, in3, in4, in5, \
- in6, in7, pdst, stride) \
- { \
- ST_SP4_INC(in0, in1, in2, in3, pdst, stride); \
- ST_SP4_INC(in4, in5, in6, in7, pdst, stride); \
- }
-
- #define ST_SP16_INC(in0, in1, in2, in3, in4, in5, in6, \
- in7, in8, in9, in10, in11, in12, \
- in13, in14, in15, pdst, stride) \
- { \
- ST_SP8_INC(in0, in1, in2, in3, in4, in5, in6, \
- in7, pdst, stride); \
- ST_SP8_INC(in8, in9, in10, in11, in12, in13, in14, \
- in15, pdst, stride); \
- }
-
- /* Description : Store vectors of double precision floating point elements with stride
- Arguments : Inputs - in0, in1, pdst, stride
- Details : Store 2 double precision floating point elements from 'in0' to (pdst)
- Store 2 double precision floating point elements from 'in1' to (pdst + stride)
- */
- #define ST_DP2(in0, in1, pdst, stride) \
- { \
- ST_DP(in0, (pdst)); \
- ST_DP(in1, (pdst) + stride); \
- }
-
- #define ST_DP4(in0, in1, in2, in3, pdst, stride) \
- { \
- ST_DP2(in0, in1, (pdst), stride); \
- ST_DP2(in2, in3, (pdst) + 2 * stride, stride); \
- }
-
- #define ST_DP8(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
- { \
- ST_DP4(in0, in1, in2, in3, (pdst), stride); \
- ST_DP4(in4, in5, in6, in7, (pdst) + 4 * stride, stride); \
- }
-
- #define ST_DP2_INC(in0, in1, pdst, stride) \
- { \
- ST_DP(in0, (pdst)); \
- (pdst) += stride; \
- ST_DP(in1, (pdst)); \
- (pdst) += stride; \
- }
-
- #define ST_DP3_INC(in0, in1, in2, \
- pdst, stride) \
- { \
- ST_DP2_INC(in0, in1, pdst, stride); \
- ST_DP(in2, (pdst)); \
- (pdst) += stride; \
- }
-
- #define ST_DP4_INC(in0, in1, in2, in3, \
- pdst, stride) \
- { \
- ST_DP2_INC(in0, in1, pdst, stride); \
- ST_DP2_INC(in2, in3, pdst, stride); \
- }
-
- #define ST_DP5_INC(in0, in1, in2, in3, \
- in4, pdst, stride) \
- { \
- ST_DP2_INC(in0, in1, pdst, stride); \
- ST_DP2_INC(in2, in3, pdst, stride); \
- ST_DP(in4, (pdst)); \
- (pdst) += stride; \
- }
-
- #define ST_DP6_INC(in0, in1, in2, in3, \
- in4, in5, pdst, stride) \
- { \
- ST_DP2_INC(in0, in1, pdst, stride); \
- ST_DP2_INC(in2, in3, pdst, stride); \
- ST_DP2_INC(in4, in5, pdst, stride); \
- }
-
- #define ST_DP7_INC(in0, in1, in2, in3, in4, \
- in5, in6, pdst, stride) \
- { \
- ST_DP2_INC(in0, in1, pdst, stride); \
- ST_DP2_INC(in2, in3, pdst, stride); \
- ST_DP2_INC(in4, in5, pdst, stride); \
- ST_DP(in6, (pdst)); \
- (pdst) += stride; \
- }
-
- #define ST_DP8_INC(in0, in1, in2, in3, in4, in5, \
- in6, in7, pdst, stride) \
- { \
- ST_DP4_INC(in0, in1, in2, in3, pdst, stride); \
- ST_DP4_INC(in4, in5, in6, in7, pdst, stride); \
- }
-
- #define ST_DP16_INC(in0, in1, in2, in3, in4, in5, in6, \
- in7, in8, in9, in10, in11, in12, \
- in13, in14, in15, pdst, stride) \
- { \
- ST_DP8_INC(in0, in1, in2, in3, in4, in5, in6, \
- in7, pdst, stride); \
- ST_DP8_INC(in8, in9, in10, in11, in12, in13, in14, \
- in15, pdst, stride); \
- }
-
- /* Description : shuffle elements in vector as shf_val
- Arguments : Inputs - in0, in1
- Outputs - out0, out1
- Return Type - as per RTYPE
- */
- #define SHF_W2(RTYPE, in0, in1, out0, out1, shf_val) \
- { \
- out0 = (RTYPE) __msa_shf_w((v4i32) in0, shf_val); \
- out1 = (RTYPE) __msa_shf_w((v4i32) in1, shf_val); \
- }
- #define SHF_W2_SP(...) SHF_W2(v4f32, __VA_ARGS__)
- #define SHF_W2_DP(...) SHF_W2(v2f64, __VA_ARGS__)
-
- #define SHF_W3(RTYPE, in0, in1, in2, out0, out1, out2, \
- shf_val) \
- { \
- out0 = (RTYPE) __msa_shf_w((v4i32) in0, shf_val); \
- out1 = (RTYPE) __msa_shf_w((v4i32) in1, shf_val); \
- out2 = (RTYPE) __msa_shf_w((v4i32) in2, shf_val); \
- }
- #define SHF_W3_SP(...) SHF_W3(v4f32, __VA_ARGS__)
-
- #define SHF_W4(RTYPE, in0, in1, in2, in3, \
- out0, out1, out2, out3, shf_val) \
- { \
- SHF_W2(RTYPE, in0, in1, out0, out1, shf_val); \
- SHF_W2(RTYPE, in2, in3, out2, out3, shf_val); \
- }
- #define SHF_W4_SP(...) SHF_W4(v4f32, __VA_ARGS__)
- #define SHF_W4_DP(...) SHF_W4(v2f64, __VA_ARGS__)
-
- /* Description : Interleave both left and right half of input vectors
- Arguments : Inputs - in0, in1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Right half of byte elements from 'in0' and 'in1' are
- interleaved and written to 'out0'
- */
- #define ILVRL_W2(RTYPE, in0, in1, out0, out1) \
- { \
- out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1); \
- out1 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1); \
- }
- #define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
- #define ILVRL_W2_SP(...) ILVRL_W2(v4f32, __VA_ARGS__)
-
- #define ILVRL_D2(RTYPE, in0, in1, out0, out1) \
- { \
- out0 = (RTYPE) __msa_ilvr_d((v2i64) in0, (v2i64) in1); \
- out1 = (RTYPE) __msa_ilvl_d((v2i64) in0, (v2i64) in1); \
- }
- #define ILVRL_D2_SP(...) ILVRL_D2(v4f32, __VA_ARGS__)
- #define ILVRL_D2_DP(...) ILVRL_D2(v2f64, __VA_ARGS__)
-
- /* Description : Indexed word element values are replicated to all
- elements in output vector
- Arguments : Inputs - in, stidx
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : 'stidx' element value from 'in' vector is replicated to all
- elements in 'out0' vector
- 'stidx + 1' element value from 'in' vector is replicated to all
- elements in 'out1' vector
- Valid index range for word operation is 0-3
- */
- #define SPLATI_W2(RTYPE, in, stidx, out0, out1) \
- { \
- out0 = (RTYPE) __msa_splati_w((v4i32) in, stidx); \
- out1 = (RTYPE) __msa_splati_w((v4i32) in, (stidx+1)); \
- }
- #define SPLATI_W2_SP(...) SPLATI_W2(v4f32, __VA_ARGS__)
-
- #define SPLATI_W4(RTYPE, in, out0, out1, out2, out3) \
- { \
- SPLATI_W2(RTYPE, in, 0, out0, out1); \
- SPLATI_W2(RTYPE, in, 2, out2, out3); \
- }
- #define SPLATI_W4_SP(...) SPLATI_W4(v4f32, __VA_ARGS__)
-
- #define SPLATI_D2(RTYPE, in, out0, out1) \
- { \
- out0 = (RTYPE) __msa_splati_d((v2i64) in, 0); \
- out1 = (RTYPE) __msa_splati_d((v2i64) in, 1); \
- }
- #define SPLATI_D2_DP(...) SPLATI_D2(v2f64, __VA_ARGS__)
-
- /* Description : Pack even double word elements of vector pairs
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Even double word elements of 'in0' are copied to the left half
- of 'out0' & even double word elements of 'in1' are copied to
- the right half of 'out0'.
- */
- #define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
- { \
- out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1); \
- out1 = (RTYPE) __msa_pckev_d((v2i64) in2, (v2i64) in3); \
- }
- #define PCKEV_D2_SP(...) PCKEV_D2(v4f32, __VA_ARGS__)
- #define PCKEV_D2_SD(...) PCKEV_D2(v2f64, __VA_ARGS__)
-
- #define PCKEV_D3(RTYPE, in0, in1, in2, in3, in4, in5, \
- out0, out1, out2) \
- { \
- out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1); \
- out1 = (RTYPE) __msa_pckev_d((v2i64) in2, (v2i64) in3); \
- out2 = (RTYPE) __msa_pckev_d((v2i64) in4, (v2i64) in5); \
- }
- #define PCKEV_D3_SP(...) PCKEV_D3(v4f32, __VA_ARGS__)
-
- #define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) \
- { \
- PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
- PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
- }
- #define PCKEV_D4_SP(...) PCKEV_D4(v4f32, __VA_ARGS__)
-
- /* Description : pack both even and odd half of input vectors
- Arguments : Inputs - in0, in1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Even double word elements of 'in0' and 'in1' are copied to the
- 'out0' & odd double word elements of 'in0' and 'in1' are
- copied to the 'out1'.
- */
- #define PCKEVOD_W2(RTYPE, in0, in1, out0, out1) \
- { \
- out0 = (RTYPE) __msa_pckev_w((v4i32) in0, (v4i32) in1); \
- out1 = (RTYPE) __msa_pckod_w((v4i32) in0, (v4i32) in1); \
- }
- #define PCKEVOD_W2_SP(...) PCKEVOD_W2(v4f32, __VA_ARGS__)
-
- #define PCKEVOD_D2(RTYPE, in0, in1, out0, out1) \
- { \
- out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1); \
- out1 = (RTYPE) __msa_pckod_d((v2i64) in0, (v2i64) in1); \
- }
- #define PCKEVOD_D2_DP(...) PCKEVOD_D2(v2f64, __VA_ARGS__)
-
- /* Description : Multiplication of pairs of vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Details : Each element from 'in0' is multiplied with elements from 'in1'
- and the result is written to 'out0'
- */
- #define MUL2(in0, in1, in2, in3, out0, out1) \
- { \
- out0 = in0 * in1; \
- out1 = in2 * in3; \
- }
- #define MUL3(in0, in1, in2, in3, in4, in5, \
- out0, out1, out2) \
- { \
- out0 = in0 * in1; \
- out1 = in2 * in3; \
- out2 = in4 * in5; \
- }
- #define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) \
- { \
- MUL2(in0, in1, in2, in3, out0, out1); \
- MUL2(in4, in5, in6, in7, out2, out3); \
- }
-
- /* Description : Multiplication of pairs of vectors and added in output
- Arguments : Inputs - in0, in1, vec, out0, out1
- Outputs - out0, out1
- Details : Each element from 'in0' is multiplied with elements from 'vec'
- and the result is added to 'out0'
- */
- #define FMADD2(in0, in1, vec, inout0, inout1) \
- { \
- inout0 += in0 * vec; \
- inout1 += in1 * vec; \
- }
- #define FMADD3(in0, in1, in2, vec, \
- inout0, inout1, inout2) \
- { \
- inout0 += in0 * vec; \
- inout1 += in1 * vec; \
- inout2 += in2 * vec; \
- }
- #define FMADD4(in0, in1, in2, in3, vec, \
- inout0, inout1, inout2, inout3) \
- { \
- FMADD2(in0, in1, vec, inout0, inout1); \
- FMADD2(in2, in3, vec, inout2, inout3); \
- }
-
- /* Description : Addition of 2 pairs of variables
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Details : Each element in 'in0' is added to 'in1' and result is written
- to 'out0'.
- */
- #define ADD2(in0, in1, in2, in3, out0, out1) \
- { \
- out0 = in0 + in1; \
- out1 = in2 + in3; \
- }
- #define ADD3(in0, in1, in2, in3, in4, in5, \
- out0, out1, out2) \
- { \
- out0 = in0 + in1; \
- out1 = in2 + in3; \
- out2 = in4 + in5; \
- }
- #define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) \
- { \
- ADD2(in0, in1, in2, in3, out0, out1); \
- ADD2(in4, in5, in6, in7, out2, out3); \
- }
-
- /* Description : Transpose 4x4 block with word elements in vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1, out2, out3
- Return Type - as per RTYPE
- */
- #define TRANSPOSE4x4_W(RTYPE, in0, in1, in2, in3, \
- out0, out1, out2, out3) \
- { \
- v4i32 s0_m, s1_m, s2_m, s3_m; \
- \
- ILVRL_W2_SW(in1, in0, s0_m, s1_m); \
- ILVRL_W2_SW(in3, in2, s2_m, s3_m); \
- ILVRL_D2(RTYPE, s2_m, s0_m, out0, out1); \
- ILVRL_D2(RTYPE, s3_m, s1_m, out2, out3); \
- }
- #define TRANSPOSE4x4_SP_SP(...) TRANSPOSE4x4_W(v4f32, __VA_ARGS__)
-
- #endif /* __MACROS_MSA_H__ */
|