You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

parameter.c 18 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #include <stdio.h>
  39. #include <string.h>
  40. #include "common.h"
  41. int get_L2_size(void);
  42. #define DEFAULT_GEMM_P 128
  43. #define DEFAULT_GEMM_Q 128
  44. #define DEFAULT_GEMM_R 128
  45. #define DEFAULT_GEMM_OFFSET_A 0
  46. #define DEFAULT_GEMM_OFFSET_B 0
  47. /* Global Parameter */
  48. #if GEMM_OFFSET_A == gemm_offset_a
  49. BLASLONG gemm_offset_a = DEFAULT_GEMM_OFFSET_A;
  50. #else
  51. BLASLONG gemm_offset_a = GEMM_OFFSET_A;
  52. #endif
  53. #if GEMM_OFFSET_B == gemm_offset_b
  54. BLASLONG gemm_offset_b = DEFAULT_GEMM_OFFSET_B;
  55. #else
  56. BLASLONG gemm_offset_b = GEMM_OFFSET_B;
  57. #endif
  58. #if SGEMM_P == sgemm_p
  59. BLASLONG sgemm_p = DEFAULT_GEMM_P;
  60. #else
  61. BLASLONG sgemm_p = SGEMM_P;
  62. #endif
  63. #if DGEMM_P == dgemm_p
  64. BLASLONG dgemm_p = DEFAULT_GEMM_P;
  65. #else
  66. BLASLONG dgemm_p = DGEMM_P;
  67. #endif
  68. #if CGEMM_P == cgemm_p
  69. BLASLONG cgemm_p = DEFAULT_GEMM_P;
  70. #else
  71. BLASLONG cgemm_p = CGEMM_P;
  72. #endif
  73. #if ZGEMM_P == zgemm_p
  74. BLASLONG zgemm_p = DEFAULT_GEMM_P;
  75. #else
  76. BLASLONG zgemm_p = ZGEMM_P;
  77. #endif
  78. #if SGEMM_Q == sgemm_q
  79. BLASLONG sgemm_q = DEFAULT_GEMM_Q;
  80. #else
  81. BLASLONG sgemm_q = SGEMM_Q;
  82. #endif
  83. #if DGEMM_Q == dgemm_q
  84. BLASLONG dgemm_q = DEFAULT_GEMM_Q;
  85. #else
  86. BLASLONG dgemm_q = DGEMM_Q;
  87. #endif
  88. #if CGEMM_Q == cgemm_q
  89. BLASLONG cgemm_q = DEFAULT_GEMM_Q;
  90. #else
  91. BLASLONG cgemm_q = CGEMM_Q;
  92. #endif
  93. #if ZGEMM_Q == zgemm_q
  94. BLASLONG zgemm_q = DEFAULT_GEMM_Q;
  95. #else
  96. BLASLONG zgemm_q = ZGEMM_Q;
  97. #endif
  98. #if SGEMM_R == sgemm_r
  99. BLASLONG sgemm_r = DEFAULT_GEMM_R;
  100. #else
  101. BLASLONG sgemm_r = SGEMM_R;
  102. #endif
  103. #if DGEMM_R == dgemm_r
  104. BLASLONG dgemm_r = DEFAULT_GEMM_R;
  105. #else
  106. BLASLONG dgemm_r = DGEMM_R;
  107. #endif
  108. #if CGEMM_R == cgemm_r
  109. BLASLONG cgemm_r = DEFAULT_GEMM_R;
  110. #else
  111. BLASLONG cgemm_r = CGEMM_R;
  112. #endif
  113. #if ZGEMM_R == zgemm_r
  114. BLASLONG zgemm_r = DEFAULT_GEMM_R;
  115. #else
  116. BLASLONG zgemm_r = ZGEMM_R;
  117. #endif
  118. #if defined(EXPRECISION) || defined(QUAD_PRECISION)
  119. #if QGEMM_P == qgemm_p
  120. BLASLONG qgemm_p = DEFAULT_GEMM_P;
  121. #else
  122. BLASLONG qgemm_p = QGEMM_P;
  123. #endif
  124. #if XGEMM_P == xgemm_p
  125. BLASLONG xgemm_p = DEFAULT_GEMM_P;
  126. #else
  127. BLASLONG xgemm_p = XGEMM_P;
  128. #endif
  129. #if QGEMM_Q == qgemm_q
  130. BLASLONG qgemm_q = DEFAULT_GEMM_Q;
  131. #else
  132. BLASLONG qgemm_q = QGEMM_Q;
  133. #endif
  134. #if XGEMM_Q == xgemm_q
  135. BLASLONG xgemm_q = DEFAULT_GEMM_Q;
  136. #else
  137. BLASLONG xgemm_q = XGEMM_Q;
  138. #endif
  139. #if QGEMM_R == qgemm_r
  140. BLASLONG qgemm_r = DEFAULT_GEMM_R;
  141. #else
  142. BLASLONG qgemm_r = QGEMM_R;
  143. #endif
  144. #if XGEMM_R == xgemm_r
  145. BLASLONG xgemm_r = DEFAULT_GEMM_R;
  146. #else
  147. BLASLONG xgemm_r = XGEMM_R;
  148. #endif
  149. #endif
  150. #if defined(ARCH_X86) || defined(ARCH_X86_64)
  /*
   * Returns the L2 cache size reported by CPUID, or 0 when no known L2
   * descriptor is found.  The value is in the units of the CPUID encoding
   * (presumably kilobytes -- confirm against the vendor manuals).
   *
   * Cores listed in the first branch read the size from bits 31..16 of ECX of
   * leaf 0x80000006.  All other cores scan the one-byte cache descriptors
   * returned by leaf 2 and map the first recognised one to a size.
   */
  int get_L2_size(void){
    int eax, ebx, ecx, edx;

  #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
      defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
      defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC)

    cpuid(0x80000006, &eax, &ebx, &ecx, &edx);

    return BITMASK(ecx, 16, 0xffff);

  #else

    int regs[4];
    int r, shift;

    cpuid(2, &eax, &ebx, &ecx, &edx);

    regs[0] = eax;
    regs[1] = ebx;
    regs[2] = ecx;
    regs[3] = edx;

    /* Same scan order as before: EAX bytes 1..3, then EBX, ECX, EDX bytes 0..3. */
    for (r = 0; r < 4; r++) {

      /* Bit 31 set means the register holds no valid descriptors; its
         bytes must not be interpreted. */
      if (BITMASK(regs[r], 31, 0x1)) {
        continue;
      }

      /* The low byte of EAX is not a descriptor, so start at byte 1 there. */
      for (shift = (r == 0) ? 8 : 0; shift <= 24; shift += 8) {

        switch (BITMASK(regs[r], shift, 0xff)) {
        case 0x3b :
        case 0x41 :
        case 0x79 :
          return  128;

        case 0x3c :
        case 0x42 :
        case 0x7a :
        case 0x7e :
        case 0x82 :
          return  256;

        case 0x43 :
        case 0x7b :
        case 0x7f :
        case 0x83 :
        case 0x86 :
          return  512;

        case 0x44 :
        case 0x78 :
        case 0x7c :
        case 0x84 :
        case 0x87 :
          return 1024;

        case 0x45 :
        case 0x7d :
        case 0x85 :
          return 2048;

        case 0x49 :
          return 4096;

        default :
          /* Not an L2 descriptor this table knows about; keep scanning. */
          break;
        }
      }
    }

    /* No recognised L2 descriptor: report "unknown". */
    return 0;

  #endif
  }
  218. void blas_set_parameter(void){
  219. char *p;
  220. int factor;
  221. int size = get_L2_size();
  222. #if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS)
  223. size >>= 7;
  224. #if defined(CORE_BANIAS) && (HAVE_HIT > 1)
  225. sgemm_p = 64 / HAVE_HIT * size;
  226. dgemm_p = 32 / HAVE_HIT * size;
  227. cgemm_p = 32 / HAVE_HIT * size;
  228. zgemm_p = 16 / HAVE_HIT * size;
  229. #ifdef EXPRECISION
  230. qgemm_p = 16 / HAVE_HIT * size;
  231. xgemm_p = 8 / HAVE_HIT * size;
  232. #endif
  233. #ifdef QUAD_PRECISION
  234. qgemm_p = 8 / HAVE_HIT * size;
  235. xgemm_p = 4 / HAVE_HIT * size;
  236. #endif
  237. #else
  238. sgemm_p = 64 * size;
  239. dgemm_p = 32 * size;
  240. cgemm_p = 32 * size;
  241. zgemm_p = 16 * size;
  242. #ifdef EXPRECISION
  243. qgemm_p = 16 * size;
  244. xgemm_p = 8 * size;
  245. #endif
  246. #ifdef QUAD_PRECISION
  247. qgemm_p = 8 * size;
  248. xgemm_p = 4 * size;
  249. #endif
  250. #endif
  251. #endif
  252. #if defined(CORE_NORTHWOOD)
  253. size >>= 7;
  254. #ifdef ALLOC_HUGETLB
  255. sgemm_p = 128 * size;
  256. dgemm_p = 64 * size;
  257. cgemm_p = 64 * size;
  258. zgemm_p = 32 * size;
  259. #ifdef EXPRECISION
  260. qgemm_p = 32 * size;
  261. xgemm_p = 16 * size;
  262. #endif
  263. #ifdef QUAD_PRECISION
  264. qgemm_p = 16 * size;
  265. xgemm_p = 8 * size;
  266. #endif
  267. #else
  268. sgemm_p = 96 * size;
  269. dgemm_p = 48 * size;
  270. cgemm_p = 48 * size;
  271. zgemm_p = 24 * size;
  272. #ifdef EXPRECISION
  273. qgemm_p = 24 * size;
  274. xgemm_p = 12 * size;
  275. #endif
  276. #ifdef QUAD_PRECISION
  277. qgemm_p = 12 * size;
  278. xgemm_p = 6 * size;
  279. #endif
  280. #endif
  281. #endif
  282. #if defined(CORE_CORE2)
  283. size >>= 9;
  284. sgemm_p = 92 * size;
  285. dgemm_p = 46 * size;
  286. cgemm_p = 46 * size;
  287. zgemm_p = 23 * size;
  288. #ifdef EXPRECISION
  289. qgemm_p = 23 * size;
  290. xgemm_p = 11 * size;
  291. #endif
  292. #ifdef QUAD_PRECISION
  293. qgemm_p = 11 * size;
  294. xgemm_p = 5 * size;
  295. #endif
  296. #endif
  297. #if defined(PENRYN)
  298. size >>= 9;
  299. sgemm_p = 1024;
  300. dgemm_p = 512;
  301. cgemm_p = 512;
  302. zgemm_p = 256;
  303. #ifdef EXPRECISION
  304. qgemm_p = 256;
  305. xgemm_p = 128;
  306. #endif
  307. #ifdef QUAD_PRECISION
  308. qgemm_p = 21 * size + 4;
  309. xgemm_p = 10 * size + 2;
  310. #endif
  311. #endif
  312. #if defined(DUNNINGTON)
  313. size >>= 9;
  314. sgemm_p = 384;
  315. dgemm_p = 384;
  316. cgemm_p = 384;
  317. zgemm_p = 384;
  318. #ifdef EXPRECISION
  319. qgemm_p = 384;
  320. xgemm_p = 384;
  321. #endif
  322. #ifdef QUAD_PRECISION
  323. qgemm_p = 21 * size + 4;
  324. xgemm_p = 10 * size + 2;
  325. #endif
  326. #endif
  327. #if defined(NEHALEM)
  328. sgemm_p = 1024;
  329. dgemm_p = 512;
  330. cgemm_p = 512;
  331. zgemm_p = 256;
  332. #ifdef EXPRECISION
  333. qgemm_p = 256;
  334. xgemm_p = 128;
  335. #endif
  336. #endif
  337. #if defined(SANDYBRIDGE)
  338. sgemm_p = 1024;
  339. dgemm_p = 512;
  340. cgemm_p = 512;
  341. zgemm_p = 256;
  342. #ifdef EXPRECISION
  343. qgemm_p = 256;
  344. xgemm_p = 128;
  345. #endif
  346. #endif
  347. #if defined(CORE_PRESCOTT) || defined(GENERIC)
  348. size >>= 6;
  349. if (size > 16) size = 16;
  350. sgemm_p = 56 * size;
  351. dgemm_p = 28 * size;
  352. cgemm_p = 28 * size;
  353. zgemm_p = 14 * size;
  354. #ifdef EXPRECISION
  355. qgemm_p = 14 * size;
  356. xgemm_p = 7 * size;
  357. #endif
  358. #ifdef QUAD_PRECISION
  359. qgemm_p = 7 * size;
  360. xgemm_p = 3 * size;
  361. #endif
  362. #endif
  363. #if defined(CORE_OPTERON)
  364. sgemm_p = 224 + 14 * (size >> 5);
  365. dgemm_p = 112 + 14 * (size >> 6);
  366. cgemm_p = 116 + 14 * (size >> 6);
  367. zgemm_p = 58 + 14 * (size >> 7);
  368. #ifdef EXPRECISION
  369. qgemm_p = 58 + 14 * (size >> 7);
  370. xgemm_p = 29 + 14 * (size >> 8);
  371. #endif
  372. #ifdef QUAD_PRECISION
  373. qgemm_p = 29 + 14 * (size >> 8);
  374. xgemm_p = 15 + 14 * (size >> 9);
  375. #endif
  376. #endif
  377. #if defined(ATOM)
  378. size >>= 8;
  379. sgemm_p = 256;
  380. dgemm_p = 128;
  381. cgemm_p = 128;
  382. zgemm_p = 64;
  383. #ifdef EXPRECISION
  384. qgemm_p = 64;
  385. xgemm_p = 32;
  386. #endif
  387. #ifdef QUAD_PRECISION
  388. qgemm_p = 32;
  389. xgemm_p = 16;
  390. #endif
  391. #endif
  392. #if defined(CORE_BARCELONA) || defined(CORE_BOBCAT)
  393. size >>= 8;
  394. sgemm_p = 232 * size;
  395. dgemm_p = 116 * size;
  396. cgemm_p = 116 * size;
  397. zgemm_p = 58 * size;
  398. #ifdef EXPRECISION
  399. qgemm_p = 58 * size;
  400. xgemm_p = 26 * size;
  401. #endif
  402. #ifdef QUAD_PRECISION
  403. qgemm_p = 26 * size;
  404. xgemm_p = 13 * size;
  405. #endif
  406. #endif
  407. p = getenv("GOTO_BLOCK_FACTOR");
  408. if (p) {
  409. factor = atoi(p);
  410. if (factor < 10) factor = 10;
  411. if (factor > 200) factor = 200;
  412. sgemm_p = ((long)((double)sgemm_p * (double)factor * 1.e-2)) & ~7L;
  413. dgemm_p = ((long)((double)dgemm_p * (double)factor * 1.e-2)) & ~7L;
  414. cgemm_p = ((long)((double)cgemm_p * (double)factor * 1.e-2)) & ~7L;
  415. zgemm_p = ((long)((double)zgemm_p * (double)factor * 1.e-2)) & ~7L;
  416. #ifdef EXPRECISION
  417. qgemm_p = ((long)((double)qgemm_p * (double)factor * 1.e-2)) & ~7L;
  418. xgemm_p = ((long)((double)xgemm_p * (double)factor * 1.e-2)) & ~7L;
  419. #endif
  420. }
  421. if (sgemm_p == 0) sgemm_p = 64;
  422. if (dgemm_p == 0) dgemm_p = 64;
  423. if (cgemm_p == 0) cgemm_p = 64;
  424. if (zgemm_p == 0) zgemm_p = 64;
  425. #ifdef EXPRECISION
  426. if (qgemm_p == 0) qgemm_p = 64;
  427. if (xgemm_p == 0) xgemm_p = 64;
  428. #endif
  429. #ifdef QUAD_PRECISION
  430. if (qgemm_p == 0) qgemm_p = 64;
  431. if (xgemm_p == 0) xgemm_p = 64;
  432. #endif
  433. sgemm_p = (sgemm_p + SGEMM_UNROLL_M - 1) & ~(SGEMM_UNROLL_M - 1);
  434. dgemm_p = (dgemm_p + DGEMM_UNROLL_M - 1) & ~(DGEMM_UNROLL_M - 1);
  435. cgemm_p = (cgemm_p + CGEMM_UNROLL_M - 1) & ~(CGEMM_UNROLL_M - 1);
  436. zgemm_p = (zgemm_p + ZGEMM_UNROLL_M - 1) & ~(ZGEMM_UNROLL_M - 1);
  437. #ifdef QUAD_PRECISION
  438. qgemm_p = (qgemm_p + QGEMM_UNROLL_M - 1) & ~(QGEMM_UNROLL_M - 1);
  439. xgemm_p = (xgemm_p + XGEMM_UNROLL_M - 1) & ~(XGEMM_UNROLL_M - 1);
  440. #endif
  441. sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15;
  442. dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15;
  443. cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15;
  444. zgemm_r = (((BUFFER_SIZE - ((ZGEMM_P * ZGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (ZGEMM_Q * 16)) - 15) & ~15;
  445. #if defined(EXPRECISION) || defined(QUAD_PRECISION)
  446. qgemm_r = (((BUFFER_SIZE - ((QGEMM_P * QGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (QGEMM_Q * 16)) - 15) & ~15;
  447. xgemm_r = (((BUFFER_SIZE - ((XGEMM_P * XGEMM_Q * 32 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (XGEMM_Q * 32)) - 15) & ~15;
  448. #endif
  449. #if 0
  450. fprintf(stderr, "SGEMM ... %3d, %3d, %3d\n", SGEMM_P, SGEMM_Q, SGEMM_R);
  451. fprintf(stderr, "DGEMM ... %3d, %3d, %3d\n", DGEMM_P, DGEMM_Q, DGEMM_R);
  452. fprintf(stderr, "CGEMM ... %3d, %3d, %3d\n", CGEMM_P, CGEMM_Q, CGEMM_R);
  453. fprintf(stderr, "ZGEMM ... %3d, %3d, %3d\n", ZGEMM_P, ZGEMM_Q, ZGEMM_R);
  454. #endif
  455. return;
  456. }
  457. #if 0
  /*
   * Debug helper: queries CPUID for the APIC ID and a core count and prints
   * them to stderr.  Always returns 0.
   */
  int get_current_cpu_info(void){

    /* Zero-initialised so that a build for neither CORE_PRESCOTT nor
       CORE_OPTERON never increments or prints an indeterminate value. */
    int nlprocs   = 0;
    int ncores    = 0;
    int cmplegacy = 0;
    int htt       = 0;
    int apicid    = 0;

  #if defined(CORE_PRESCOTT) || defined(CORE_OPTERON)
    int eax, ebx, ecx, edx;

    cpuid(1, &eax, &ebx, &ecx, &edx);
    nlprocs = BITMASK(ebx, 16, 0xff);
    apicid  = BITMASK(ebx, 24, 0xff);
    htt     = BITMASK(edx, 28, 0x01);
  #endif

  #if defined(CORE_PRESCOTT)
    cpuid(4, &eax, &ebx, &ecx, &edx);
    ncores = BITMASK(eax, 26, 0x3f);

    if (htt == 0) nlprocs = 0;
  #endif

  #if defined(CORE_OPTERON)
    cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
    ncores = BITMASK(ecx, 0, 0xff);

    cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
    cmplegacy = BITMASK(ecx, 1, 0x01);

    if (htt == 0) {
      nlprocs   = 0;
      ncores    = 0;
      cmplegacy = 0;
    }
  #endif

    /* These are gathered but not reported; silence set-but-unused warnings. */
    (void)nlprocs;
    (void)cmplegacy;

    /* The value read above is incremented by one before it is printed. */
    ncores ++;

    fprintf(stderr, "APICID = %d  Number of core = %d\n", apicid, ncores);

    return 0;
  }
  489. #endif
  490. #endif
  491. #if defined(ARCH_IA64)
  492. static inline BLASULONG cpuid(BLASULONG regnum){
  493. BLASULONG value;
  494. #ifndef __ECC
  495. asm ("mov %0=cpuid[%r1]" : "=r"(value) : "rO"(regnum));
  496. #else
  497. value = __getIndReg(_IA64_REG_INDR_CPUID, regnum);
  498. #endif
  499. return value;
  500. }
  501. #if 1
  502. void blas_set_parameter(void){
  503. BLASULONG cpuid3, size;
  504. cpuid3 = cpuid(3);
  505. size = BITMASK(cpuid3, 16, 0xff);
  506. sgemm_p = 192 * (size + 1);
  507. dgemm_p = 96 * (size + 1);
  508. cgemm_p = 96 * (size + 1);
  509. zgemm_p = 48 * (size + 1);
  510. #ifdef EXPRECISION
  511. qgemm_p = 64 * (size + 1);
  512. xgemm_p = 32 * (size + 1);
  513. #endif
  514. #ifdef QUAD_PRECISION
  515. qgemm_p = 32 * (size + 1);
  516. xgemm_p = 16 * (size + 1);
  517. #endif
  518. sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15;
  519. dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15;
  520. cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15;
  521. zgemm_r = (((BUFFER_SIZE - ((ZGEMM_P * ZGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (ZGEMM_Q * 16)) - 15) & ~15;
  522. #if defined(EXPRECISION) || defined(QUAD_PRECISION)
  523. qgemm_r = (((BUFFER_SIZE - ((QGEMM_P * QGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (QGEMM_Q * 16)) - 15) & ~15;
  524. xgemm_r = (((BUFFER_SIZE - ((XGEMM_P * XGEMM_Q * 32 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (XGEMM_Q * 32)) - 15) & ~15;
  525. #endif
  526. return;
  527. }
  528. #else
  529. #define IA64_SYS_NAME "/sys/devices/system/cpu/cpu0/cache/index3/size"
  530. #define IA64_PROC_NAME "/proc/pal/cpu0/cache_info"
  531. void blas_set_parameter(void){
  532. BLASULONG cpuid3;
  533. int size = 0;
  534. #if 1
  535. char buffer[128];
  536. FILE *infile;
  537. if ((infile = fopen(IA64_SYS_NAME, "r")) != NULL) {
  538. fgets(buffer, sizeof(buffer), infile);
  539. fclose(infile);
  540. size = atoi(buffer) / 1536;
  541. }
  542. if (size <= 0) {
  543. if ((infile = fopen(IA64_PROC_NAME, "r")) != NULL) {
  544. while(fgets(buffer, sizeof(buffer), infile) != NULL) {
  545. if ((!strncmp("Data/Instruction Cache level 3", buffer, 30))) break;
  546. }
  547. fgets(buffer, sizeof(buffer), infile);
  548. fclose(infile);
  549. *strstr(buffer, "bytes") = (char)NULL;
  550. size = atoi(strchr(buffer, ':') + 1) / 1572864;
  551. }
  552. }
  553. #endif
  554. /* The last resort */
  555. if (size <= 0) {
  556. cpuid3 = cpuid(3);
  557. size = BITMASK(cpuid3, 16, 0xff) + 1;
  558. }
  559. sgemm_p = 320 * size;
  560. dgemm_p = 160 * size;
  561. cgemm_p = 160 * size;
  562. zgemm_p = 80 * size;
  563. #ifdef EXPRECISION
  564. qgemm_p = 80 * size;
  565. xgemm_p = 40 * size;
  566. #endif
  567. sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15;
  568. dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15;
  569. cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15;
  570. zgemm_r = (((BUFFER_SIZE - ((ZGEMM_P * ZGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (ZGEMM_Q * 16)) - 15) & ~15;
  571. #ifdef EXPRECISION
  572. qgemm_r = (((BUFFER_SIZE - ((QGEMM_P * QGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (QGEMM_Q * 16)) - 15) & ~15;
  573. xgemm_r = (((BUFFER_SIZE - ((XGEMM_P * XGEMM_Q * 32 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (XGEMM_Q * 32)) - 15) & ~15;
  574. #endif
  575. return;
  576. }
  577. #endif
  578. #endif
  579. #if defined(ARCH_MIPS64)
  /*
   * MIPS64 (Loongson 3A / 3B): sets dgemm_r.  In SMP builds the value depends
   * on blas_num_threads; otherwise the single-thread value is always used.
   * Nothing is changed when neither LOONGSON3A nor LOONGSON3B is defined.
   */
  void blas_set_parameter(void){

  #if defined(LOONGSON3A)
  #ifdef SMP
    /* 1024 for a single thread, 200 for multiple threads. */
    dgemm_r = (blas_num_threads == 1) ? 1024 : 200;
  #else
    dgemm_r = 1024;
  #endif
  #endif

  #if defined(LOONGSON3B)
  #ifdef SMP
    /* 640 for one or two threads, 160 beyond that. */
    dgemm_r = (blas_num_threads == 1 || blas_num_threads == 2) ? 640 : 160;
  #else
    dgemm_r = 640;
  #endif
  #endif

  }
  608. #endif