You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

common_ia64.h 10 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #ifndef COMMON_IA64
  39. #define COMMON_IA64
  40. #ifndef ASSEMBLER
  /* Fallback definition used only when nothing earlier defined
     MAP_WRITECOMBINED (presumably an mmap() flag -- confirm). */
  #ifndef MAP_WRITECOMBINED
  #define MAP_WRITECOMBINED 0x10000
  #endif

  /* The three barrier macros expand to nothing in this header. */
  #define MB
  #define WMB
  #define RMB

  /* The Intel compiler (__ECC) path below relies on intrinsics from this header. */
  #ifdef __ECC
  #include <ia64intrin.h>
  #endif

  /* Marker macro; presumably tells users of rpcc() that the counter is
     64 bits wide -- confirm against the callers. */
  #define RPCC64BIT
  51. #ifndef __ECC
  52. static __inline void blas_lock(volatile unsigned long *address){
  53. unsigned long ret;
  54. do {
  55. while (*address) {YIELDING;};
  56. __asm__ __volatile__ ("mov ar.ccv=r0\n;;\n"
  57. "cmpxchg4.acq %0=[%2],%1,ar.ccv\n"
  58. : "=r"(ret) : "r"(1), "r"(address)
  59. : "ar.ccv", "memory");
  60. } while (ret);
  61. }
  62. #define BLAS_LOCK_DEFINED
  /* Read the ar.itc application register and return its value. */
  static __inline unsigned long rpcc(void) {
    unsigned long clocks;

    __asm__ __volatile__ ("mov %0=ar.itc" : "=r"(clocks));
    return clocks;
  }
  /* Marker macro; presumably tells generic code that rpcc() is provided
     here -- confirm. */
  #define RPCC_DEFINED
  /* Read the ar.fpsr application register and return its value.
     Counterpart of ldmxcsr(). */
  static __inline unsigned long stmxcsr(void){
    unsigned long fp;

    __asm__ __volatile__ ("mov.m %0=ar.fpsr" : "=r" (fp));
    return fp;
  }
  /* Write `fp` into the ar.fpsr application register.
     Counterpart of stmxcsr(). */
  static __inline void ldmxcsr(unsigned long fp) {
    __asm__ __volatile__ ("mov.m ar.fpsr=%0" :: "r" (fp));
  }
  /* Copy floating-point register f9 into `res` (an lvalue bound to an FP
     register by the "=f" constraint).  Spelled __asm__ like the other
     inline assembly in this header, so it also builds in strict ISO modes
     where plain `asm` is not a keyword. */
  #define GET_IMAGE(res) __asm__ __volatile__("mov %0 = f9" : "=f"(res) : : "memory")
  78. #else
  /*
   * Intel-compiler variant of the spin lock.
   *
   * Keeps looping while the lock word reads non-zero, or while
   * _InterlockedCompareExchange(address, 1, 0) yields a non-zero result.
   * The loop body is empty.  The exchange accesses the word through a
   * `volatile int *`.
   */
  static __inline void blas_lock(volatile unsigned long *address){
    while (*address || _InterlockedCompareExchange((volatile int *) address,1,0))
      ;
  }
  /* Marker macro; presumably tells generic code that this header supplies
     blas_lock -- confirm. */
  #define BLAS_LOCK_DEFINED
  84. static __inline unsigned int rpcc(void) {
  85. return __getReg(_IA64_REG_AR_ITC);
  86. }
  87. #define RPCC_DEFINED
  88. static __inline unsigned int stmxcsr(void) {
  89. return __getReg(_IA64_REG_AR_FPSR);
  90. }
  91. static __inline void ldmxcsr(unsigned long fp) {
  92. return __setReg(_IA64_REG_AR_FPSR, fp);
  93. }
  /* Intel-compiler variant of GET_IMAGE: passes the address of `res` and
     register number 9 to __stfd (DOUBLE) or __stfs (otherwise); presumably
     this stores FP register f9 into `res` -- confirm against the intrinsic
     reference.  The argument is parenthesized so any lvalue expression can
     be passed. */
  #ifdef DOUBLE
  #define GET_IMAGE(res) __stfd(&(res), 9)
  #else
  #define GET_IMAGE(res) __stfs(&(res), 9)
  #endif
  99. #endif
  /* Defined empty for this architecture. */
  #define GET_IMAGE_CANCEL

  #ifdef ENABLE_SSE_EXCEPTION

  /*
   * IDEBUG_START / IDEBUG_END must be used as a pair in the same function:
   * IDEBUG_START opens a brace scope that IDEBUG_END closes.
   *
   * IDEBUG_START saves the current stmxcsr() value in fp_sse_mode, clears
   * the FE_UNDERFLOW, FE_OVERFLOW, FE_UNNORMAL and FE_INVALID bits and
   * loads the result with ldmxcsr().  IDEBUG_END loads the saved value
   * back.  (Presumably clearing those bits enables the corresponding
   * floating-point traps -- confirm against the FPSR layout.)
   */
  #define IDEBUG_START \
  { \
    unsigned long fp_sse_mode, new_fp_mode; \
    fp_sse_mode = stmxcsr();\
    new_fp_mode = (fp_sse_mode & ~(FE_UNDERFLOW | FE_OVERFLOW | FE_UNNORMAL | FE_INVALID));\
    ldmxcsr(new_fp_mode);

  #define IDEBUG_END \
    ldmxcsr(fp_sse_mode); \
  }

  #endif
  112. #ifdef SMP
  113. #ifdef USE64BITINT
  /* 64bit version: table indexed by the divisor in blas_quickdivide();
     defined in another translation unit. */
  extern unsigned long blas_quick_divide_table[];
  116. #ifndef __ECC
  117. static __inline long blas_quickdivide(unsigned long int x, unsigned long int y){
  118. unsigned long ret;
  119. if (y <= 1) return x;
  120. __asm__ __volatile__("setf.sig f6 = %1\n\t"
  121. "ldf8 f7 = [%2];;\n\t"
  122. "xmpy.hu f6= f6, f7;;\n\t"
  123. "getf.sig %0 = f6;;\n"
  124. : "=r"(ret)
  125. : "r"(x), "r"(&blas_quick_divide_table[y]) : "f6", "f7"
  126. );
  127. return ret;
  128. }
  129. #else
  130. /* Using Intel Compiler */
  131. static __inline long blas_quickdivide(unsigned long int x, unsigned long int y){
  132. if (y <= 1) return x;
  133. return _m64_xmahu(x, blas_quick_divide_table[y], 0);
  134. }
  135. #endif
  136. #else
  /* 32bit version: table indexed by the divisor; defined in another
     translation unit. */
  extern unsigned int blas_quick_divide_table[];

  /*
   * Table-driven division helper (32-bit variant).
   *
   * Returns x unchanged when y <= 1.  Otherwise multiplies x by
   * blas_quick_divide_table[y] in `unsigned long` arithmetic and returns
   * bits 32 and up of the product.  The shift by 32 relies on
   * `unsigned long` being wider than 32 bits.
   */
  static __inline int blas_quickdivide(unsigned int x, unsigned int y){
    if (y <= 1) return x;
    return (int)((x * (unsigned long)blas_quick_divide_table[y]) >> 32);
  }
  143. #endif
  144. #endif
  145. #endif
  /* Disabled (#if 0): per-precision names for the GEMM copy routines and
     kernels.  Nothing in this region reaches the compiler. */
  #if 0
  #ifdef DOUBLE
  #define GEMM_NCOPY dgemm_ncopy
  #define GEMM_TCOPY dgemm_tcopy
  #define ZGEMM_NCOPY zgemm_ncopy
  #define ZGEMM_TCOPY zgemm_tcopy
  #define GEMM_KERNEL dgemm_kernel
  #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  #define ZGEMM_KERNEL zgemm_kernel_n
  #endif
  #if defined(CN) || defined(CT) || defined(RN) || defined(RT)
  #define ZGEMM_KERNEL zgemm_kernel_l
  #endif
  #if defined(NC) || defined(TC) || defined(NR) || defined(TR)
  #define ZGEMM_KERNEL zgemm_kernel_r
  #endif
  #if defined(CC) || defined(CR) || defined(RC) || defined(RR)
  #define ZGEMM_KERNEL zgemm_kernel_b
  #endif
  #else
  #define GEMM_NCOPY sgemm_ncopy
  #define GEMM_TCOPY sgemm_tcopy
  #define ZGEMM_NCOPY cgemm_ncopy
  #define ZGEMM_TCOPY cgemm_tcopy
  #define GEMM_KERNEL sgemm_kernel
  #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  #define ZGEMM_KERNEL cgemm_kernel_n
  #endif
  #if defined(CN) || defined(CT) || defined(RN) || defined(RT)
  #define ZGEMM_KERNEL cgemm_kernel_l
  #endif
  #if defined(NC) || defined(TC) || defined(NR) || defined(TR)
  #define ZGEMM_KERNEL cgemm_kernel_r
  #endif
  #if defined(CC) || defined(CR) || defined(RC) || defined(RR)
  #define ZGEMM_KERNEL cgemm_kernel_b
  #endif
  #endif
  #endif
  /*
   * Integer-width dependent mnemonics for the assembly kernels.
   *
   * With USE64BITINT the macros select the 8-byte load and the plain cmp.*
   * compares; otherwise the 4-byte load and the cmp4.* compares.  Each
   * CMP4xx macro maps to the compare with the same relation in both
   * branches (GE -> ge, NE -> ne, EQ -> eq).
   */
  #ifdef USE64BITINT
  #define LDINT ld8
  #define INTSIZE 8
  #define CMP4GE cmp.ge
  #define CMP4NE cmp.ne
  #define CMP4EQ cmp.eq
  #else
  #define LDINT ld4
  #define INTSIZE 4
  #define CMP4GE cmp4.ge
  #define CMP4NE cmp4.ne
  #define CMP4EQ cmp4.eq
  #endif
  /* Expands to an instruction that targets r0.  Presumably this faults on
     purpose because r0 cannot be written on IA-64 -- confirm. */
  #define HALT mov r0 = 0
  /*
   * Precision dependent mnemonics for the assembly kernels.
   *
   * One of three sets is chosen: XDOUBLE, DOUBLE, or the default set used
   * when neither is defined.  Each set maps the generic names (LDFD, STFD,
   * FADD, FMA, ...) to the load/store/arithmetic mnemonic for that
   * precision.
   *
   * NOTE(review): the XDOUBLE set does not define the LDF8 / STF8 family
   * that the other two sets provide.
   */
  #ifdef XDOUBLE

  #define LD8 ld8
  #define ST8 st8
  #define LDFD ldfe
  #define LDFPD ldfpe
  #define LDFD_T1 ldfe.t1
  #define LDFD_NT1 ldfe.nt1
  #define LDFD_NT2 ldfe.nt2
  #define LDFD_NTA ldfe.nta
  #define LDFPD_NT1 ldfpe.nt1
  #define LDFPD_NT2 ldfpe.nt2
  #define LDFPD_NTA ldfpe.nta
  #define STFD stfe
  #define STFD_NTA stfe.nta
  #define FADD fadd
  #define FSUB fsub
  #define FMPY fmpy
  #define FMA fma
  #define FMS fms
  #define FNMA fnma
  #define FPMA fpma
  #define SETF setf.d

  #elif defined(DOUBLE)

  #define LD8 ld8
  #define ST8 st8
  #define LDF8 ldf8
  #define LDF8_NT1 ldf8.nt1
  #define LDF8_NTA ldf8.nta
  #define STF8 stf8
  #define STF8_NTA stf8.nta
  #define LDFD ldfd
  #define LDFPD ldfpd
  #define LDFD_T1 ldfd.t1
  #define LDFD_NT1 ldfd.nt1
  #define LDFD_NT2 ldfd.nt2
  #define LDFD_NTA ldfd.nta
  #define LDFPD_NT1 ldfpd.nt1
  #define LDFPD_NT2 ldfpd.nt2
  #define LDFPD_NTA ldfpd.nta
  #define STFD stfd
  #define STFD_NTA stfd.nta
  #define FADD fadd.d
  #define FSUB fsub.d
  #define FMPY fmpy.d
  #define FMA fma.d
  #define FMS fms.d
  #define FNMA fnma.d
  #define FPMA fpma.d
  #define SETF setf.d

  #else

  /* Default set: 4-byte integer and single-precision FP accesses. */
  #define LD8 ld4
  #define ST8 st4
  #define LDF8 ldfs
  #define LDF8_NT1 ldfs.nt1
  #define LDF8_NTA ldfs.nta
  #define STF8 stfs
  #define STF8_NTA stfs.nta
  #define LDFD ldfs
  #define LDFPD ldfps
  #define LDFD_T1 ldfs.t1
  #define LDFD_NT1 ldfs.nt1
  #define LDFD_NT2 ldfs.nt2
  #define LDFD_NTA ldfs.nta
  #define LDFPD_NT1 ldfps.nt1
  #define LDFPD_NT2 ldfps.nt2
  #define LDFPD_NTA ldfps.nta
  #define STFD stfs
  #define STFD_NTA stfs.nta
  /* The ".s" arithmetic forms are disabled; the unsuffixed forms in the
     #else branch are the ones in effect. */
  #if 0
  #define FADD fadd.s
  #define FSUB fsub.s
  #define FMPY fmpy.s
  #define FMA fma.s
  #define FMS fms.s
  #define FNMA fnma.s
  #define FPMA fpma.s
  #else
  #define FADD fadd
  #define FSUB fsub
  #define FMPY fmpy
  #define FMA fma
  #define FMS fms
  #define FNMA fnma
  #define FPMA fpma
  #endif
  #define SETF setf.s

  #endif
  /* Symbol name used by PROLOGUE/EPILOGUE: ASMNAME unless F_INTERFACE is
     defined, in which case ASMFNAME is used.  Both are defined elsewhere. */
  #ifndef F_INTERFACE
  #define REALNAME ASMNAME
  #else
  #define REALNAME ASMFNAME
  #endif

  /* Return-convention marker per Fortran interface: G77, G95 and INTEL
     select RETURN_BY_STACK, GFORT selects RETURN_BY_REGS. */
  #ifdef F_INTERFACE_G77
  #define RETURN_BY_STACK
  #endif

  #ifdef F_INTERFACE_G95
  #define RETURN_BY_STACK
  #endif

  #ifdef F_INTERFACE_GFORT
  #define RETURN_BY_REGS
  #endif

  #ifdef F_INTERFACE_INTEL
  #define RETURN_BY_STACK
  #endif
  /* Assembly function header: emits .explicit, switches to .text, aligns
     to 128, exports REALNAME, opens its .proc and places the entry label. */
  #define PROLOGUE \
    .explicit; \
    .text; \
    .align 128; \
    .global REALNAME; \
    .proc REALNAME; \
  REALNAME:
  /*
   * PROFCODE: profiling hook for assembly routines.
   *
   * With PROFILE defined it reserves an 8-byte zeroed slot (.LP0) in .data,
   * then in .text allocates a register frame, copies r1 and b0 into out1
   * and out2, forms out3 from @ltoff(.LP0) and r1, and calls _mcount.
   * Without PROFILE it expands to nothing.
   */
  #ifdef PROFILE
  #define PROFCODE \
    .data; \
    .align 8; \
  .LP0:; \
    data8 0; \
    .text; \
    alloc out0 = ar.pfs, 8, 0, 4, 0; \
    mov out1 = r1; \
    mov out2 = b0; \
    addl out3 = @ltoff(.LP0), r1;;; \
    br.call.sptk.many b0 = _mcount;;
  #else
  #define PROFCODE
  #endif
  325. #if defined(__linux__) && defined(__ELF__)
  326. #define GNUSTACK .section .note.GNU-stack,"",@progbits
  327. #else
  328. #define GNUSTACK
  329. #endif
  330. #define EPILOGUE \
  331. .endp REALNAME ; \
  332. GNUSTACK
  /* Upper bound from which BASE_ADDRESS is derived below. */
  #define START_ADDRESS 0x20000fc800000000UL

  /* SEEK_ADDRESS is always left undefined: the page-size checks that would
     define it are disabled with #if 0. */
  #undef SEEK_ADDRESS

  #if 0
  #ifdef CONFIG_IA64_PAGE_SIZE_4KB
  #define SEEK_ADDRESS
  #endif

  #ifdef CONFIG_IA64_PAGE_SIZE_8KB
  #define SEEK_ADDRESS
  #endif
  #endif

  /* 128 MiB. */
  #define BUFFER_SIZE (128 << 20)

  /* 16 KiB unless PAGESIZE was already defined. */
  #ifndef PAGESIZE
  #define PAGESIZE (16UL << 10)
  #endif
  /* 4 MiB. */
  #define HUGE_PAGESIZE ( 4 << 20)

  /* START_ADDRESS lowered by one BUFFER_SIZE per CPU; BLASULONG and
     MAX_CPU_NUMBER come from outside this header. */
  #define BASE_ADDRESS (START_ADDRESS - (BLASULONG)BUFFER_SIZE * MAX_CPU_NUMBER)
  349. #endif