|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419 |
- /*********************************************************************/
- /* Copyright 2009, 2010 The University of Texas at Austin. */
- /* All rights reserved. */
- /* */
- /* Redistribution and use in source and binary forms, with or */
- /* without modification, are permitted provided that the following */
- /* conditions are met: */
- /* */
- /* 1. Redistributions of source code must retain the above */
- /* copyright notice, this list of conditions and the following */
- /* disclaimer. */
- /* */
- /* 2. Redistributions in binary form must reproduce the above */
- /* copyright notice, this list of conditions and the following */
- /* disclaimer in the documentation and/or other materials */
- /* provided with the distribution. */
- /* */
- /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
- /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
- /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
- /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
- /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
- /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
- /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
- /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
- /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
- /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
- /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
- /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
- /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
- /* POSSIBILITY OF SUCH DAMAGE. */
- /* */
- /* The views and conclusions contained in the software and */
- /* documentation are those of the authors and should not be */
- /* interpreted as representing official policies, either expressed */
- /* or implied, of The University of Texas at Austin. */
- /*********************************************************************/
-
- #ifndef COMMON_IA64
- #define COMMON_IA64
-
- #ifndef ASSEMBLER
-
/* Fallback value for MAP_WRITECOMBINED when nothing has defined it yet. */
#if !defined(MAP_WRITECOMBINED)
#define MAP_WRITECOMBINED 0x10000
#endif

/* MB and WMB both expand to nothing in this header. */
#define MB
#define WMB

/* The Intel compiler (__ECC) path relies on intrinsics from <ia64intrin.h>. */
#if defined(__ECC)
#include <ia64intrin.h>
#endif

/* Value-less flag macro.
 * NOTE(review): presumably tells generic code that rpcc() yields a 64-bit
 * counter; confirm against the users of RPCC64BIT. */
#define RPCC64BIT
-
- #ifndef __ECC
- static __inline void blas_lock(volatile unsigned long *address){
-
- unsigned long ret;
-
- do {
- while (*address) {YIELDING;};
-
- __asm__ __volatile__ ("mov ar.ccv=r0\n;;\n"
- "cmpxchg4.acq %0=[%2],%1,ar.ccv\n"
- : "=r"(ret) : "r"(1), "r"(address)
- : "ar.ccv", "memory");
- } while (ret);
- }
- #define BLAS_LOCK_DEFINED
-
/*
 * Read the ar.itc application register (GCC inline-asm variant).
 *
 * Returns the register contents as an unsigned long.
 */
static __inline unsigned long rpcc(void) {
  unsigned long itc;

  __asm__ __volatile__ ("mov %0=ar.itc" : "=r"(itc));

  return itc;
}
#define RPCC_DEFINED
-
-
/*
 * Read the ar.fpsr application register (GCC inline-asm variant).
 *
 * Returns the current register contents; IDEBUG_START uses this to save the
 * value that IDEBUG_END later restores through ldmxcsr().
 */
static __inline unsigned long stmxcsr(void){
  unsigned long fpsr;

  __asm__ __volatile__ ("mov.m %0=ar.fpsr" : "=r" (fpsr));

  return fpsr;
}
-
/*
 * Write `fp` into the ar.fpsr application register (GCC inline-asm variant).
 *
 * fp: the value to install, typically one previously obtained from stmxcsr().
 */
static __inline void ldmxcsr(unsigned long fp) {
  __asm__ __volatile__ ("mov.m ar.fpsr=%0" :: "r" (fp));
}
-
/*
 * Copy floating-point register f9 into `res` (GCC variant); `res` must be an
 * lvalue usable with the "=f" output constraint.
 *
 * Spelled with __asm__ rather than the bare `asm` keyword so that the macro
 * matches every other inline-asm statement in this header and still compiles
 * when GNU keywords are disabled (-std=c99 / -std=c11 / -ansi).
 */
#define GET_IMAGE(res) __asm__ __volatile__("mov %0 = f9" : "=f"(res) : : "memory")
-
- #else
-
/*
 * Spin until the lock word at `address` has been acquired (Intel compiler
 * variant).
 *
 * The loop keeps going while the word reads non-zero; once it reads zero,
 * _InterlockedCompareExchange tries to swap 0 -> 1 on the word viewed as a
 * volatile int.  A zero result from the intrinsic ends the loop.
 */
static __inline void blas_lock(volatile unsigned long *address){
  for (;;) {
    if (*address) {
      continue;               /* still held: keep polling */
    }
    if (!_InterlockedCompareExchange((volatile int *) address, 1, 0)) {
      return;                 /* previous value was 0: lock is ours */
    }
  }
}
#define BLAS_LOCK_DEFINED
-
- static __inline unsigned int rpcc(void) {
- return __getReg(_IA64_REG_AR_ITC);
- }
- #define RPCC_DEFINED
-
- static __inline unsigned int stmxcsr(void) {
- return __getReg(_IA64_REG_AR_FPSR);
- }
-
- static __inline void ldmxcsr(unsigned long fp) {
-
- return __setReg(_IA64_REG_AR_FPSR, fp);
-
- }
-
/*
 * Store floating-point register 9 into `res` (Intel compiler variant):
 * __stfd when DOUBLE is defined, __stfs otherwise.
 *
 * The argument is parenthesized before its address is taken so that any
 * lvalue expression passed as `res` is handled as a single operand.
 */
#ifdef DOUBLE
#define GET_IMAGE(res) __stfd(&(res), 9)
#else
#define GET_IMAGE(res) __stfs(&(res), 9)
#endif
-
- #endif
-
/* Expands to nothing on this platform. */
#define GET_IMAGE_CANCEL

#if defined(ENABLE_SSE_EXCEPTION)

/*
 * IDEBUG_START opens a brace scope, saves the current status word returned by
 * stmxcsr() in fp_sse_mode, and installs a copy with the FE_UNDERFLOW,
 * FE_OVERFLOW, FE_UNNORMAL and FE_INVALID bits cleared.  It must be paired
 * with IDEBUG_END, which restores the saved word and closes the scope.
 */
#define IDEBUG_START \
  { \
    unsigned long fp_sse_mode = stmxcsr(); \
    unsigned long new_fp_mode = \
      (fp_sse_mode & ~(FE_UNDERFLOW | FE_OVERFLOW | FE_UNNORMAL | FE_INVALID)); \
    ldmxcsr(new_fp_mode);

#define IDEBUG_END \
    ldmxcsr(fp_sse_mode); \
  }

#endif
-
- #ifdef SMP
-
- #ifdef USE64BITINT
-
- /* 64bit version */
-
- extern unsigned long blas_quick_divide_table[];
-
- #ifndef __ECC
- static __inline long blas_quickdivide(unsigned long int x, unsigned long int y){
- unsigned long ret;
-
- if (y <= 1) return x;
-
- __asm__ __volatile__("setf.sig f6 = %1\n\t"
- "ldf8 f7 = [%2];;\n\t"
- "xmpy.hu f6= f6, f7;;\n\t"
- "getf.sig %0 = f6;;\n"
- : "=r"(ret)
- : "r"(x), "r"(&blas_quick_divide_table[y]) : "f6", "f7"
- );
-
- return ret;
- }
- #else
- /* Using Intel Compiler */
- static __inline long blas_quickdivide(unsigned long int x, unsigned long int y){
- if (y <= 1) return x;
- return _m64_xmahu(x, blas_quick_divide_table[y], 0);
- }
- #endif
-
- #else
- /* 32bit version */
- extern unsigned int blas_quick_divide_table[];
-
- static __inline int blas_quickdivide(unsigned int x, unsigned int y){
- if (y <= 1) return x;
- return (int)((x * (unsigned long)blas_quick_divide_table[y]) >> 32);
- }
- #endif
- #endif
-
- #endif
-
/*
 * Disabled (#if 0): precision-dependent aliases for the GEMM copy and kernel
 * routines.  Nothing in this region is seen by the compiler.
 */
#if 0
#if defined(DOUBLE)

#define GEMM_NCOPY  dgemm_ncopy
#define GEMM_TCOPY  dgemm_tcopy
#define ZGEMM_NCOPY zgemm_ncopy
#define ZGEMM_TCOPY zgemm_tcopy
#define GEMM_KERNEL dgemm_kernel

#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define ZGEMM_KERNEL zgemm_kernel_n
#endif
#if defined(CN) || defined(CT) || defined(RN) || defined(RT)
#define ZGEMM_KERNEL zgemm_kernel_l
#endif
#if defined(NC) || defined(TC) || defined(NR) || defined(TR)
#define ZGEMM_KERNEL zgemm_kernel_r
#endif
#if defined(CC) || defined(CR) || defined(RC) || defined(RR)
#define ZGEMM_KERNEL zgemm_kernel_b
#endif

#else

#define GEMM_NCOPY  sgemm_ncopy
#define GEMM_TCOPY  sgemm_tcopy
#define ZGEMM_NCOPY cgemm_ncopy
#define ZGEMM_TCOPY cgemm_tcopy
#define GEMM_KERNEL sgemm_kernel

#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define ZGEMM_KERNEL cgemm_kernel_n
#endif
#if defined(CN) || defined(CT) || defined(RN) || defined(RT)
#define ZGEMM_KERNEL cgemm_kernel_l
#endif
#if defined(NC) || defined(TC) || defined(NR) || defined(TR)
#define ZGEMM_KERNEL cgemm_kernel_r
#endif
#if defined(CC) || defined(CR) || defined(RC) || defined(RR)
#define ZGEMM_KERNEL cgemm_kernel_b
#endif

#endif
#endif
-
/*
 * Integer-width dependent assembler mnemonics.
 *
 *   LDINT    load of one integer argument (ld8 / ld4)
 *   INTSIZE  size of that integer in bytes (8 / 4)
 *   CMP4GE   greater-or-equal compare
 *   CMP4NE   not-equal compare
 *   CMP4EQ   equal compare
 *
 * With USE64BITINT the full-width cmp.* forms are used, otherwise the cmp4.*
 * forms.  CMP4NE must map to cmp.ne in the 64-bit case, mirroring cmp4.ne in
 * the 32-bit case; it previously expanded to cmp.ge, so "not equal" tests
 * were assembled as "greater or equal".
 */
#ifdef USE64BITINT
#define LDINT   ld8
#define INTSIZE 8
#define CMP4GE  cmp.ge
#define CMP4NE  cmp.ne
#define CMP4EQ  cmp.eq
#else
#define LDINT   ld4
#define INTSIZE 4
#define CMP4GE  cmp4.ge
#define CMP4NE  cmp4.ne
#define CMP4EQ  cmp4.eq
#endif

/* NOTE(review): expands to a move into r0; presumably used to force a fault
 * at that point -- confirm with the kernels that use HALT. */
#define HALT mov r0 = 0
-
/*
 * Precision-dependent assembler mnemonics.  Three configurations exist:
 * XDOUBLE (ldfe/stfe forms), DOUBLE (ldfd/stfd forms with .d arithmetic) and
 * the default (ldfs/stfs forms).  Note that the XDOUBLE branch does not
 * define the LDF8 and STF8 families.
 */
#if defined(XDOUBLE)

#define LD8        ld8
#define ST8        st8
#define LDFD       ldfe
#define LDFPD      ldfpe
#define LDFD_T1    ldfe.t1
#define LDFD_NT1   ldfe.nt1
#define LDFD_NT2   ldfe.nt2
#define LDFD_NTA   ldfe.nta
#define LDFPD_NT1  ldfpe.nt1
#define LDFPD_NT2  ldfpe.nt2
#define LDFPD_NTA  ldfpe.nta
#define STFD       stfe
#define STFD_NTA   stfe.nta
#define FADD       fadd
#define FSUB       fsub
#define FMPY       fmpy
#define FMA        fma
#define FMS        fms
#define FNMA       fnma
#define FPMA       fpma
#define SETF       setf.d

#elif defined(DOUBLE)

#define LD8        ld8
#define ST8        st8
#define LDF8       ldf8
#define LDF8_NT1   ldf8.nt1
#define LDF8_NTA   ldf8.nta
#define STF8       stf8
#define STF8_NTA   stf8.nta
#define LDFD       ldfd
#define LDFPD      ldfpd
#define LDFD_T1    ldfd.t1
#define LDFD_NT1   ldfd.nt1
#define LDFD_NT2   ldfd.nt2
#define LDFD_NTA   ldfd.nta
#define LDFPD_NT1  ldfpd.nt1
#define LDFPD_NT2  ldfpd.nt2
#define LDFPD_NTA  ldfpd.nta
#define STFD       stfd
#define STFD_NTA   stfd.nta
#define FADD       fadd.d
#define FSUB       fsub.d
#define FMPY       fmpy.d
#define FMA        fma.d
#define FMS        fms.d
#define FNMA       fnma.d
#define FPMA       fpma.d
#define SETF       setf.d

#else

#define LD8        ld4
#define ST8        st4
#define LDF8       ldfs
#define LDF8_NT1   ldfs.nt1
#define LDF8_NTA   ldfs.nta
#define STF8       stfs
#define STF8_NTA   stfs.nta
#define LDFD       ldfs
#define LDFPD      ldfps
#define LDFD_T1    ldfs.t1
#define LDFD_NT1   ldfs.nt1
#define LDFD_NT2   ldfs.nt2
#define LDFD_NTA   ldfs.nta
#define LDFPD_NT1  ldfps.nt1
#define LDFPD_NT2  ldfps.nt2
#define LDFPD_NTA  ldfps.nta
#define STFD       stfs
#define STFD_NTA   stfs.nta
/* The .s-suffixed arithmetic forms are switched off; the unsuffixed
 * mnemonics below are the ones in effect. */
#if 0
#define FADD       fadd.s
#define FSUB       fsub.s
#define FMPY       fmpy.s
#define FMA        fma.s
#define FMS        fms.s
#define FNMA       fnma.s
#define FPMA       fpma.s
#else
#define FADD       fadd
#define FSUB       fsub
#define FMPY       fmpy
#define FMA        fma
#define FMS        fms
#define FNMA       fnma
#define FPMA       fpma
#endif
#define SETF       setf.s

#endif
-
/* REALNAME is the symbol emitted by PROLOGUE/EPILOGUE: ASMFNAME when
 * F_INTERFACE is defined, ASMNAME otherwise. */
#if defined(F_INTERFACE)
#define REALNAME ASMFNAME
#else
#define REALNAME ASMNAME
#endif

/* Return-convention flag per Fortran interface: the G77, G95 and INTEL
 * interfaces select RETURN_BY_STACK, GFORT selects RETURN_BY_REGS. */
#if defined(F_INTERFACE_G77) || defined(F_INTERFACE_G95) || defined(F_INTERFACE_INTEL)
#define RETURN_BY_STACK
#endif

#if defined(F_INTERFACE_GFORT)
#define RETURN_BY_REGS
#endif
-
/*
 * Assembler entry sequence for the routine named REALNAME: switches to the
 * text section with 128-byte alignment, exports the symbol, opens its .proc
 * region and places the label.
 */
#define PROLOGUE              \
        .explicit;            \
        .text;                \
        .align 128;           \
        .global REALNAME;     \
        .proc REALNAME;       \
REALNAME:
-
-
/*
 * PROFCODE is empty unless PROFILE is defined.  With PROFILE it reserves an
 * 8-byte data slot labelled .LP0, sets up out1..out3 from r1, b0 and the
 * @ltoff address of that slot, and calls _mcount.
 */
#ifndef PROFILE
#define PROFCODE
#else
#define PROFCODE                                  \
        .data;                                    \
        .align 8;                                 \
.LP0:;                                            \
        data8 0;                                  \
        .text;                                    \
        alloc out0 = ar.pfs, 8, 0, 4, 0;          \
        mov out1 = r1;                            \
        mov out2 = b0;                            \
        addl out3 = @ltoff(.LP0), r1;;;           \
        br.call.sptk.many b0 = _mcount;;
#endif
-
/* GNUSTACK emits an empty .note.GNU-stack section on Linux/ELF builds and
 * expands to nothing everywhere else. */
#if !(defined(__linux__) && defined(__ELF__))
#define GNUSTACK
#else
#define GNUSTACK .section .note.GNU-stack,"",@progbits
#endif

/* Assembler exit sequence: closes the .proc region opened for REALNAME and
 * appends GNUSTACK. */
#define EPILOGUE          \
        .endp REALNAME ;  \
        GNUSTACK
-
/* Address from which BASE_ADDRESS is derived by subtraction. */
#define START_ADDRESS 0x20000fc800000000UL

/* SEEK_ADDRESS is always left undefined: the page-size dependent definitions
 * that follow are disabled with #if 0. */
#undef SEEK_ADDRESS

#if 0
#ifdef CONFIG_IA64_PAGE_SIZE_4KB
#define SEEK_ADDRESS
#endif

#ifdef CONFIG_IA64_PAGE_SIZE_8KB
#define SEEK_ADDRESS
#endif
#endif

/* 128 << 20 bytes = 128 MiB. */
#define BUFFER_SIZE (128 << 20)

/* 16 KiB unless PAGESIZE was already defined before this point. */
#if !defined(PAGESIZE)
#define PAGESIZE (16UL << 10)
#endif

/* 4 << 20 bytes = 4 MiB. */
#define HUGE_PAGESIZE ( 4 << 20)

/* START_ADDRESS lowered by one BUFFER_SIZE for each of MAX_CPU_NUMBER;
 * BLASULONG and MAX_CPU_NUMBER come from outside this header. */
#define BASE_ADDRESS (START_ADDRESS - (BLASULONG)BUFFER_SIZE * MAX_CPU_NUMBER)
-
- #endif
|