You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

common.h 21 kB

13 years ago
10 years ago
12 years ago
12 years ago
12 years ago
12 years ago
12 years ago
12 years ago
13 years ago
13 years ago
7 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #ifndef COMMON_H
  39. #define COMMON_H
  40. #ifdef __cplusplus
  41. extern "C" {
  42. /* Assume C declarations for C++ */
  43. #endif /* __cplusplus */
  44. #ifndef _GNU_SOURCE
  45. #define _GNU_SOURCE
  46. #endif
  47. #ifndef __USE_XOPEN
  48. #define __USE_XOPEN
  49. #endif
  50. #ifndef __USE_SVID
  51. #define __USE_SVID
  52. #endif
  53. #ifdef BUILD_KERNEL
  54. #include "config_kernel.h"
  55. #else
  56. #include "config.h"
  57. #endif
  58. #undef ENABLE_SSE_EXCEPTION
  59. #if defined(SMP_SERVER) || defined(SMP_ONDEMAND)
  60. #define SMP
  61. #endif
  62. #if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX)
  63. #define WINDOWS_ABI
  64. #define OS_WINDOWS
  65. #ifdef DOUBLE
  66. #define DOUBLE_DEFINED DOUBLE
  67. #undef DOUBLE
  68. #endif
  69. #endif
  70. #if !defined(NOINCLUDE) && !defined(ASSEMBLER)
  71. #include <stdio.h>
  72. #include <stdlib.h>
  73. #include <string.h>
  74. #if !defined(_MSC_VER)
  75. #include <unistd.h>
  76. #elif _MSC_VER < 1900
  77. #define snprintf _snprintf
  78. #endif
  79. #include <time.h>
  80. #ifdef OS_LINUX
  81. #include <malloc.h>
  82. #include <sched.h>
  83. #endif
  84. #if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_ANDROID)
  85. #include <sched.h>
  86. #endif
  87. #ifdef OS_ANDROID
  88. #define NO_SYSV_IPC
  89. //Android NDK only supports complex.h since Android 5.0
  90. #if __ANDROID_API__ < 21
  91. #define FORCE_OPENBLAS_COMPLEX_STRUCT
  92. #endif
  93. #endif
  94. #ifdef OS_HAIKU
  95. #define NO_SYSV_IPC
  96. #endif
  97. #ifdef OS_WINDOWS
  98. #ifdef ATOM
  99. #define GOTO_ATOM ATOM
  100. #undef ATOM
  101. #endif
  102. #include <windows.h>
  103. #include <math.h>
  104. #ifdef GOTO_ATOM
  105. #define ATOM GOTO_ATOM
  106. #undef GOTO_ATOM
  107. #endif
  108. #else
  109. #include <sys/mman.h>
  110. #ifndef NO_SYSV_IPC
  111. #include <sys/shm.h>
  112. #endif
  113. #include <sys/time.h>
  114. #include <time.h>
  115. #include <unistd.h>
  116. #include <math.h>
  117. #if defined(SMP) || defined(USE_LOCKING)
  118. #include <pthread.h>
  119. #endif
  120. #endif
  121. #if defined(OS_SUNOS)
  122. #include <thread.h>
  123. #endif
  124. #ifdef __DECC
  125. #include <c_asm.h>
  126. #include <machine/builtins.h>
  127. #endif
  128. #if defined(ARCH_IA64) && defined(ENABLE_SSE_EXCEPTION)
  129. #include <fenv.h>
  130. #endif
  131. #endif
  132. #if defined(OS_WINDOWS) && defined(DOUBLE_DEFINED)
  133. #define DOUBLE DOUBLE_DEFINED
  134. #undef DOUBLE_DEFINED
  135. #endif
  136. #undef DEBUG_INFO
  137. #define SMP_DEBUG
  138. #undef MALLOC_DEBUG
  139. #undef SMP_ALLOC_DEBUG
  140. #ifndef ZERO
  141. #ifdef XDOUBLE
  142. #define ZERO 0.e0L
  143. #elif defined DOUBLE
  144. #define ZERO 0.e0
  145. #else
  146. #define ZERO 0.e0f
  147. #endif
  148. #endif
  149. #ifndef ONE
  150. #ifdef XDOUBLE
  151. #define ONE 1.e0L
  152. #elif defined DOUBLE
  153. #define ONE 1.e0
  154. #else
  155. #define ONE 1.e0f
  156. #endif
  157. #endif
  158. #define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
  159. #define ALLOCA_ALIGN 63UL
  160. #define NUM_BUFFERS MAX(50,(MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER))
  161. #ifdef NEEDBUNDERSCORE
  162. #define BLASFUNC(FUNC) FUNC##_
  163. #else
  164. #define BLASFUNC(FUNC) FUNC
  165. #endif
  166. #undef USE_PTHREAD_LOCK
  167. #undef USE_PTHREAD_SPINLOCK
  168. #if defined(USE_PTHREAD_LOCK) && defined(USE_PTHREAD_SPINLOCK)
  169. #error "You can't specify both LOCK operation!"
  170. #endif
  171. #if defined(SMP) || defined(USE_LOCKING)
  172. #define USE_PTHREAD_LOCK
  173. #undef USE_PTHREAD_SPINLOCK
  174. #endif
  175. #ifdef OS_WINDOWS
  176. #undef USE_PTHREAD_LOCK
  177. #undef USE_PTHREAD_SPINLOCK
  178. #endif
  179. #if defined(USE_PTHREAD_LOCK)
  180. #define LOCK_COMMAND(x) pthread_mutex_lock(x)
  181. #define UNLOCK_COMMAND(x) pthread_mutex_unlock(x)
  182. #elif defined(USE_PTHREAD_SPINLOCK)
  183. #ifndef ASSEMBLER
  184. typedef volatile int pthread_spinlock_t;
  185. int pthread_spin_lock (pthread_spinlock_t *__lock);
  186. int pthread_spin_unlock (pthread_spinlock_t *__lock);
  187. #endif
  188. #define LOCK_COMMAND(x) pthread_spin_lock(x)
  189. #define UNLOCK_COMMAND(x) pthread_spin_unlock(x)
  190. #else
  191. #define LOCK_COMMAND(x) blas_lock(x)
  192. #define UNLOCK_COMMAND(x) blas_unlock(x)
  193. #endif
  194. #define GOTO_SHMID 0x510510
  195. #if 0
  196. #ifndef __CUDACC__
  197. #define __global__
  198. #define __device__
  199. #define __host__
  200. #define __shared__
  201. #endif
  202. #endif
  203. #ifndef ASSEMBLER
  204. #ifdef QUAD_PRECISION
  205. typedef struct {
  206. unsigned long x[2];
  207. } xdouble;
  208. #elif defined EXPRECISION
  209. #define xdouble long double
  210. #else
  211. #define xdouble double
  212. #endif
  213. #if defined(OS_WINDOWS) && defined(__64BIT__)
  214. typedef long long BLASLONG;
  215. typedef unsigned long long BLASULONG;
  216. #else
  217. typedef long BLASLONG;
  218. typedef unsigned long BLASULONG;
  219. #endif
  220. #ifndef BFLOAT16
  221. typedef unsigned short bfloat16;
  222. #define HALFCONVERSION 1
  223. #endif
  224. #ifdef USE64BITINT
  225. typedef BLASLONG blasint;
  226. #if defined(OS_WINDOWS) && defined(__64BIT__)
  227. #define blasabs(x) llabs(x)
  228. #else
  229. #define blasabs(x) labs(x)
  230. #endif
  231. #else
  232. typedef int blasint;
  233. #define blasabs(x) abs(x)
  234. #endif
  235. #else
  236. #ifdef USE64BITINT
  237. #define INTSHIFT 3
  238. #define INTSIZE 8
  239. #else
  240. #define INTSHIFT 2
  241. #define INTSIZE 4
  242. #endif
  243. #endif
  244. #ifdef XDOUBLE
  245. #define FLOAT xdouble
  246. #ifdef QUAD_PRECISION
  247. #define XFLOAT xidouble
  248. #endif
  249. #ifdef QUAD_PRECISION
  250. #define SIZE 32
  251. #define BASE_SHIFT 5
  252. #define ZBASE_SHIFT 6
  253. #else
  254. #define SIZE 16
  255. #define BASE_SHIFT 4
  256. #define ZBASE_SHIFT 5
  257. #endif
  258. #elif defined(DOUBLE)
  259. #define FLOAT double
  260. #define SIZE 8
  261. #define BASE_SHIFT 3
  262. #define ZBASE_SHIFT 4
  263. #elif defined(HALF)
  264. #define IFLOAT bfloat16
  265. #define XFLOAT IFLOAT
  266. #define FLOAT float
  267. #define SIZE 2
  268. #define BASE_SHIFT 1
  269. #define ZBASE_SHIFT 2
  270. #else
  271. #define FLOAT float
  272. #define SIZE 4
  273. #define BASE_SHIFT 2
  274. #define ZBASE_SHIFT 3
  275. #endif
  276. #ifndef XFLOAT
  277. #define XFLOAT FLOAT
  278. #endif
  279. #ifndef IFLOAT
  280. #define IFLOAT FLOAT
  281. #endif
  282. #ifndef COMPLEX
  283. #define COMPSIZE 1
  284. #else
  285. #define COMPSIZE 2
  286. #endif
  287. #define Address_H(x) (((x)+(1<<15))>>16)
  288. #define Address_L(x) ((x)-((Address_H(x))<<16))
  289. #ifndef MAX_CPU_NUMBER
  290. #define MAX_CPU_NUMBER 2
  291. #endif
  292. #if defined(OS_SUNOS)
  293. #define YIELDING thr_yield()
  294. #endif
  295. #if defined(OS_WINDOWS)
  296. #if defined(_MSC_VER) && !defined(__clang__)
  297. #define YIELDING YieldProcessor()
  298. #else
  299. #define YIELDING SwitchToThread()
  300. #endif
  301. #endif
  302. #if defined(ARMV7) || defined(ARMV6) || defined(ARMV8) || defined(ARMV5)
  303. #define YIELDING asm volatile ("nop;nop;nop;nop;nop;nop;nop;nop; \n");
  304. #endif
  305. #ifdef BULLDOZER
  306. #ifndef YIELDING
  307. #define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
  308. #endif
  309. #endif
  310. #ifdef POWER8
  311. #ifndef YIELDING
  312. #define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
  313. #endif
  314. #endif
  315. #ifdef POWER9
  316. #ifndef YIELDING
  317. #define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
  318. #endif
  319. #endif
  320. /*
  321. #ifdef PILEDRIVER
  322. #ifndef YIELDING
  323. #define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
  324. #endif
  325. #endif
  326. */
  327. /*
  328. #ifdef STEAMROLLER
  329. #ifndef YIELDING
  330. #define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
  331. #endif
  332. #endif
  333. */
  334. #ifndef YIELDING
  335. #define YIELDING sched_yield()
  336. #endif
  337. /***
  338. To allocate job_t on the heap or the stack.
  339. Please see https://github.com/xianyi/OpenBLAS/issues/246
  340. ***/
  341. #if defined(OS_WINDOWS)
  342. #define GETRF_MEM_ALLOC_THRESHOLD 32
  343. #define BLAS3_MEM_ALLOC_THRESHOLD 32
  344. #endif
  345. #ifndef GETRF_MEM_ALLOC_THRESHOLD
  346. #define GETRF_MEM_ALLOC_THRESHOLD 80
  347. #endif
  348. #ifndef BLAS3_MEM_ALLOC_THRESHOLD
  349. #define BLAS3_MEM_ALLOC_THRESHOLD 160
  350. #endif
  351. #ifdef QUAD_PRECISION
  352. #include "common_quad.h"
  353. #endif
  354. #ifdef ARCH_ALPHA
  355. #include "common_alpha.h"
  356. #endif
  357. #ifdef ARCH_X86
  358. #include "common_x86.h"
  359. #endif
  360. #ifdef ARCH_X86_64
  361. #include "common_x86_64.h"
  362. #endif
  363. #ifdef ARCH_IA64
  364. #include "common_ia64.h"
  365. #endif
  366. #ifdef ARCH_POWER
  367. #include "common_power.h"
  368. #endif
  369. #ifdef sparc
  370. #include "common_sparc.h"
  371. #endif
  372. #ifdef ARCH_MIPS
  373. #include "common_mips.h"
  374. #endif
  375. #ifdef ARCH_MIPS64
  376. #include "common_mips64.h"
  377. #endif
  378. #ifdef ARCH_ARM
  379. #include "common_arm.h"
  380. #endif
  381. #ifdef ARCH_ARM64
  382. #include "common_arm64.h"
  383. #endif
  384. #ifdef ARCH_ZARCH
  385. #include "common_zarch.h"
  386. #endif
  387. #ifndef ASSEMBLER
  388. #ifdef OS_WINDOWSSTORE
  389. typedef char env_var_t[MAX_PATH];
  390. #define readenv(p, n) 0
  391. #else
  392. #if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)
  393. typedef char env_var_t[MAX_PATH];
  394. #define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p))
  395. #else
  396. typedef char* env_var_t;
  397. #define readenv(p, n) ((p)=getenv(n))
  398. #endif
  399. #endif
  400. #if !defined(RPCC_DEFINED) && !defined(OS_WINDOWS)
  401. #ifdef _POSIX_MONOTONIC_CLOCK
  402. #if defined(__GLIBC_PREREQ) // split the #if condition into two lines, otherwise it will fail at __GLIBC_PREREQ(2, 17)
  403. #if __GLIBC_PREREQ(2, 17) // don't require -lrt
  404. #define USE_MONOTONIC
  405. #endif
  406. #elif defined(OS_ANDROID)
  407. #define USE_MONOTONIC
  408. #endif
  409. #endif
  410. /* use similar scale as x86 rdtsc for timeouts to work correctly */
  /*
   * Generic timestamp counter: returns the current time expressed in
   * nanoseconds (seconds * 1e9 + sub-second part).
   *
   * With USE_MONOTONIC the value comes from clock_gettime(CLOCK_MONOTONIC);
   * otherwise from gettimeofday(), whose microsecond field is scaled by 1000.
   * The return codes of both calls are not checked.
   */
  static inline unsigned long long rpcc(void){
    unsigned long long seconds;
    unsigned long long subsec_ns;
  #ifdef USE_MONOTONIC
    struct timespec now;
    clock_gettime(CLOCK_MONOTONIC, &now);
    seconds   = (unsigned long long)now.tv_sec;
    subsec_ns = now.tv_nsec;
  #else
    struct timeval now;
    gettimeofday(&now,NULL);
    seconds   = (unsigned long long)now.tv_sec;
    /* microseconds -> nanoseconds, computed in the native field type first */
    subsec_ns = now.tv_usec * 1000;
  #endif
    return seconds * 1000000000ull + subsec_ns;
  }
  422. #define RPCC_DEFINED
  423. #define RPCC64BIT
  424. #endif // !RPCC_DEFINED
  425. #if !defined(BLAS_LOCK_DEFINED) && defined(__GNUC__)
  426. static void __inline blas_lock(volatile BLASULONG *address){
  427. do {
  428. while (*address) {YIELDING;};
  429. } while (!__sync_bool_compare_and_swap(address, 0, 1));
  430. }
  431. #define BLAS_LOCK_DEFINED
  432. #endif
  433. #ifndef RPCC_DEFINED
  434. #error "rpcc() implementation is missing for your platform"
  435. #endif
  436. #ifndef BLAS_LOCK_DEFINED
  437. #error "blas_lock() implementation is missing for your platform"
  438. #endif
  439. #endif // !ASSEMBLER
  440. #ifdef OS_LINUX
  441. #include "common_linux.h"
  442. #endif
  443. #define MMAP_ACCESS (PROT_READ | PROT_WRITE)
  444. #ifdef __NetBSD__
  445. #define MMAP_POLICY (MAP_PRIVATE | MAP_ANON)
  446. #else
  447. #define MMAP_POLICY (MAP_PRIVATE | MAP_ANONYMOUS)
  448. #endif
  449. #ifndef ASSEMBLER
  450. /* C99 supports complex floating numbers natively, which GCC also offers as an
  451. extension since version 3.0. If neither are available, use a compatible
  452. structure as fallback (see Clause 6.2.5.13 of the C99 standard). */
  453. #if ((defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \
  454. (__GNUC__ >= 3 && !defined(__cplusplus))) && !(defined(FORCE_OPENBLAS_COMPLEX_STRUCT))) && !defined(_MSC_VER)
  455. #define OPENBLAS_COMPLEX_C99
  456. #ifndef __cplusplus
  457. #include <complex.h>
  458. #endif
  459. typedef float _Complex openblas_complex_float;
  460. typedef double _Complex openblas_complex_double;
  461. typedef xdouble _Complex openblas_complex_xdouble;
  462. #define openblas_make_complex_float(real, imag) ((real) + ((imag) * _Complex_I))
  463. #define openblas_make_complex_double(real, imag) ((real) + ((imag) * _Complex_I))
  464. #define openblas_make_complex_xdouble(real, imag) ((real) + ((imag) * _Complex_I))
  465. #else
  466. #define OPENBLAS_COMPLEX_STRUCT
  467. typedef struct { float real, imag; } openblas_complex_float;
  468. typedef struct { double real, imag; } openblas_complex_double;
  469. typedef struct { xdouble real, imag; } openblas_complex_xdouble;
  470. #define openblas_make_complex_float(real, imag) {(real), (imag)}
  471. #define openblas_make_complex_double(real, imag) {(real), (imag)}
  472. #define openblas_make_complex_xdouble(real, imag) {(real), (imag)}
  473. #endif
  474. #endif
  475. #include "param.h"
  476. #include "common_param.h"
  477. #ifndef STDERR
  478. #define STDERR stderr
  479. #endif
  480. #ifndef MASK
  481. #define MASK(a, b) (((a) + ((b) - 1)) & ~((b) - 1))
  482. #endif
  483. #if defined(XDOUBLE) || defined(DOUBLE)
  484. #define FLOATRET FLOAT
  485. #else
  486. #ifdef NEED_F2CCONV
  487. #define FLOATRET double
  488. #else
  489. #define FLOATRET float
  490. #endif
  491. #endif
  492. #ifndef ASSEMBLER
  493. #ifndef NOINCLUDE
  494. /* Inclusion of a standard header file is needed for definition of __STDC_*
  495. predefined macros with some compilers (e.g. GCC 4.7 on Linux). This occurs
  496. as a side effect of including either <features.h> or <stdc-predef.h>. */
  497. #include <stdio.h>
  498. #endif // NOINCLUDE
  499. #ifdef XDOUBLE
  500. #define OPENBLAS_COMPLEX_FLOAT openblas_complex_xdouble
  501. #define OPENBLAS_MAKE_COMPLEX_FLOAT(r,i) openblas_make_complex_xdouble(r,i)
  502. #elif defined(DOUBLE)
  503. #define OPENBLAS_COMPLEX_FLOAT openblas_complex_double
  504. #define OPENBLAS_MAKE_COMPLEX_FLOAT(r,i) openblas_make_complex_double(r,i)
  505. #else
  506. #define OPENBLAS_COMPLEX_FLOAT openblas_complex_float
  507. #define OPENBLAS_MAKE_COMPLEX_FLOAT(r,i) openblas_make_complex_float(r,i)
  508. #endif
  509. #if defined(C_PGI) || defined(C_SUN)
  510. #if defined(__STDC_IEC_559_COMPLEX__)
  511. #define CREAL(X) creal(X)
  512. #define CIMAG(X) cimag(X)
  513. #else
  514. #define CREAL(X) (*((FLOAT *)&X + 0))
  515. #define CIMAG(X) (*((FLOAT *)&X + 1))
  516. #endif
  517. #else
  518. #ifdef OPENBLAS_COMPLEX_STRUCT
  519. #define CREAL(Z) ((Z).real)
  520. #define CIMAG(Z) ((Z).imag)
  521. #else
  522. #define CREAL __real__
  523. #define CIMAG __imag__
  524. #endif
  525. #endif
  526. #endif // ASSEMBLER
  527. #ifndef IFLUSH
  528. #define IFLUSH
  529. #endif
  530. #ifndef IFLUSH_HALF
  531. #define IFLUSH_HALF
  532. #endif
  533. #if defined(C_GCC) && (( __GNUC__ <= 3) || ((__GNUC__ == 4) && (__GNUC_MINOR__ < 2)))
  534. #ifdef USE_OPENMP
  535. #undef USE_OPENMP
  536. #endif
  537. #endif
  538. #if defined(C_MSVC)
  539. #define inline __inline
  540. #endif
  541. #ifndef ASSEMBLER
  /*
   * MIN(a,b) / MAX(a,b): smaller / larger of two values; when the values
   * compare equal, `a` is returned.
   *
   * Every argument use is parenthesized so that arguments containing
   * operators of lower precedence than the comparison (e.g. `x | y`,
   * `c ? p : q`) are compared as a whole.  Each argument is still evaluated
   * more than once, so do not pass expressions with side effects.
   * An existing definition of either macro is left untouched.
   */
  #ifndef MIN
  #define MIN(a,b) ((a) > (b) ? (b) : (a))
  #endif
  #ifndef MAX
  #define MAX(a,b) ((a) < (b) ? (b) : (a))
  #endif
  548. #define TOUPPER(a) {if ((a) > 0x60) (a) -= 0x20;}
  549. #if defined(__FreeBSD__) || defined(__APPLE__)
  550. #define MAP_ANONYMOUS MAP_ANON
  551. #endif
  552. /* Common Memory Management Routine */
  553. void blas_set_parameter(void);
  554. int blas_get_cpu_number(void);
  555. void *blas_memory_alloc (int);
  556. void blas_memory_free (void *);
  557. void *blas_memory_alloc_nolock (int); //use malloc without blas_lock
  558. void blas_memory_free_nolock (void *);
  559. int get_num_procs (void);
  560. #if defined(OS_LINUX) && defined(SMP) && !defined(NO_AFFINITY)
  561. int get_num_nodes (void);
  562. int get_num_proc (int);
  563. int get_node_equal (void);
  564. #endif
  565. void goto_set_num_threads(int);
  566. void gotoblas_affinity_init(void);
  567. void gotoblas_affinity_quit(void);
  568. void gotoblas_dynamic_init(void);
  569. void gotoblas_dynamic_quit(void);
  570. void gotoblas_profile_init(void);
  571. void gotoblas_profile_quit(void);
  572. int support_avx512(void);
  573. #ifdef USE_OPENMP
  574. #ifndef C_MSVC
  575. int omp_in_parallel(void);
  576. int omp_get_num_procs(void);
  577. #else
  578. __declspec(dllimport) int __cdecl omp_in_parallel(void);
  579. __declspec(dllimport) int __cdecl omp_get_num_procs(void);
  580. #endif
  581. #if (__STDC_VERSION__ >= 201112L)
  582. #if defined(C_GCC) && ( __GNUC__ < 7)
  583. // workaround for GCC bug 65467
  584. #ifndef _Atomic
  585. #define _Atomic volatile
  586. #endif
  587. #endif
  588. #include <stdatomic.h>
  589. #else
  590. #ifndef _Atomic
  591. #define _Atomic volatile
  592. #endif
  593. #endif
  594. #else
  595. #ifdef __ELF__
  596. int omp_in_parallel (void) __attribute__ ((weak));
  597. int omp_get_num_procs(void) __attribute__ ((weak));
  598. #endif
  599. #endif
  600. static __inline void blas_unlock(volatile BLASULONG *address){
  601. MB;
  602. *address = 0;
  603. }
  /*
   * readenv_atoi(env): return the value of environment variable `env`
   * converted with atoi(), or 0 when the variable cannot be read.
   *
   * Three platform variants:
   *   - OS_WINDOWSSTORE: always returns 0.
   *   - OS_WINDOWS: goes through the readenv() macro, which is either
   *     GetEnvironmentVariable() (non-zero on success) or, under Cygwin,
   *     getenv() (non-NULL on success).
   *   - everything else: getenv() directly.
   */
  #ifdef OS_WINDOWSSTORE
  static __inline int readenv_atoi(char *env) {
    return 0;
  }
  #else
  #ifdef OS_WINDOWS
  static __inline int readenv_atoi(char *env) {
    /* Zero-initialised so that atoi() never parses indeterminate bytes. */
    env_var_t p = {0};
    /* Both readenv() forms are truthy on success.  The previous code had
       the branches swapped: it returned 0 when the variable was found and
       called atoi() on an unread buffer / NULL pointer when it was not. */
    return readenv(p,env) ? atoi(p) : 0;
  }
  #else
  static __inline int readenv_atoi(char *env) {
    char *value = getenv(env);
    return value ? atoi(value) : 0;
  }
  #endif
  #endif
  624. #if !defined(XDOUBLE) || !defined(QUAD_PRECISION)
  625. static __inline void compinv(FLOAT *b, FLOAT ar, FLOAT ai){
  626. #ifndef UNIT
  627. FLOAT ratio, den;
  628. if (
  629. #ifdef XDOUBLE
  630. (fabsl(ar)) >= (fabsl(ai))
  631. #elif defined DOUBLE
  632. (fabs (ar)) >= (fabs (ai))
  633. #else
  634. (fabsf(ar)) >= (fabsf(ai))
  635. #endif
  636. ) {
  637. ratio = ai / ar;
  638. den = (FLOAT)(ONE / (ar * (ONE + ratio * ratio)));
  639. ar = den;
  640. ai = -ratio * den;
  641. } else {
  642. ratio = ar / ai;
  643. den = (FLOAT)(ONE /(ai * (ONE + ratio * ratio)));
  644. ar = ratio * den;
  645. ai = -den;
  646. }
  647. b[0] = ar;
  648. b[1] = ai;
  649. #else
  650. b[0] = ONE;
  651. b[1] = ZERO;
  652. #endif
  653. }
  654. #endif
  655. #ifdef MALLOC_DEBUG
  656. void *blas_debug_alloc(int);
  657. void *blas_debug_free(void *);
  658. #undef malloc
  659. #undef free
  660. #define malloc(a) blas_debug_alloc(a)
  661. #define free(a) blas_debug_free (a)
  662. #endif
  663. #ifndef COPYOVERHEAD
  664. #define GEMMRETTYPE int
  665. #else
  666. typedef struct {
  667. double outercopy;
  668. double innercopy;
  669. double kernel;
  670. double mflops;
  671. } copyoverhead_t;
  672. #define GEMMRETTYPE copyoverhead_t
  673. #endif
  674. #endif
  675. #ifndef BUILD_KERNEL
  676. #define KNAME(A, B) A
  677. #else
  678. #define KNAME(A, B) A##B
  679. #endif
  680. #include "common_interface.h"
  681. #ifdef SANITY_CHECK
  682. #include "common_reference.h"
  683. #endif
  684. #include "common_macro.h"
  685. #include "common_level1.h"
  686. #include "common_level2.h"
  687. #include "common_level3.h"
  688. #include "common_lapack.h"
  689. #ifdef CBLAS
  690. # define OPENBLAS_CONST /* see comment in cblas.h */
  691. # include "cblas.h"
  692. #endif
  693. #ifndef ASSEMBLER
  694. #include "common_stackalloc.h"
  695. #if 0
  696. #include "symcopy.h"
  697. #endif
  698. #if defined(SMP_SERVER) && defined(SMP_ONDEMAND)
  699. #error Both SMP_SERVER and SMP_ONDEMAND are specified.
  700. #endif
  701. #if defined(SMP_SERVER) || defined(SMP_ONDEMAND)
  702. #include "common_thread.h"
  703. #endif
  704. #endif
  705. #define INFO_NUM 99
  706. #ifndef DEFAULT_CPU_NUMBER
  707. #define DEFAULT_CPU_NUMBER 4
  708. #endif
  709. #ifndef IDEBUG_START
  710. #define IDEBUG_START
  711. #endif
  712. #ifndef IDEBUG_END
  713. #define IDEBUG_END
  714. #endif
  715. #if !defined(ASSEMBLER) && defined(FUNCTION_PROFILE)
  716. typedef struct {
  717. int func;
  718. unsigned long long calls, fops, area, cycles, tcycles;
  719. } func_profile_t;
  720. extern func_profile_t function_profile_table[];
  721. extern int gotoblas_profile;
  722. #ifdef XDOUBLE
  723. #define NUMOPT QNUMOPT
  724. #elif defined DOUBLE
  725. #define NUMOPT DNUMOPT
  726. #else
  727. #define NUMOPT SNUMOPT
  728. #endif
  729. #define FUNCTION_PROFILE_START() { unsigned long long profile_start = rpcc(), profile_end;
  730. #ifdef SMP
  731. #define FUNCTION_PROFILE_END(COMP, AREA, OPS) \
  732. if (gotoblas_profile) { \
  733. profile_end = rpcc(); \
  734. function_profile_table[PROFILE_FUNC_NAME].calls ++; \
  735. function_profile_table[PROFILE_FUNC_NAME].area += SIZE * COMPSIZE * (AREA); \
  736. function_profile_table[PROFILE_FUNC_NAME].fops += (COMP) * (OPS) / NUMOPT; \
  737. function_profile_table[PROFILE_FUNC_NAME].cycles += (profile_end - profile_start); \
  738. function_profile_table[PROFILE_FUNC_NAME].tcycles += blas_cpu_number * (profile_end - profile_start); \
  739. } \
  740. }
  741. #else
  742. #define FUNCTION_PROFILE_END(COMP, AREA, OPS) \
  743. if (gotoblas_profile) { \
  744. profile_end = rpcc(); \
  745. function_profile_table[PROFILE_FUNC_NAME].calls ++; \
  746. function_profile_table[PROFILE_FUNC_NAME].area += SIZE * COMPSIZE * (AREA); \
  747. function_profile_table[PROFILE_FUNC_NAME].fops += (COMP) * (OPS) / NUMOPT; \
  748. function_profile_table[PROFILE_FUNC_NAME].cycles += (profile_end - profile_start); \
  749. function_profile_table[PROFILE_FUNC_NAME].tcycles += (profile_end - profile_start); \
  750. } \
  751. }
  752. #endif
  753. #else
  754. #define FUNCTION_PROFILE_START()
  755. #define FUNCTION_PROFILE_END(COMP, AREA, OPS)
  756. #endif
  757. #if 1
  758. #define PRINT_DEBUG_CNAME
  759. #define PRINT_DEBUG_NAME
  760. #else
  761. #define PRINT_DEBUG_CNAME if (readenv_atoi("GOTO_DEBUG")) fprintf(stderr, "GotoBLAS : %s\n", CHAR_CNAME)
  762. #define PRINT_DEBUG_NAME if (readenv_atoi("GOTO_DEBUG")) fprintf(stderr, "GotoBLAS : %s\n", CHAR_NAME)
  763. #endif
  764. #ifdef __cplusplus
  765. }
  766. #endif /* __cplusplus */
  767. #endif