You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

parameter.c 22 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #include <stdio.h>
  39. #include <string.h>
  40. #include "common.h"
  41. extern int openblas_block_factor(void);
  42. int get_L2_size(void);
/* Fallback values used by the global definitions in this file whenever the
   corresponding configuration macro does not yield a non-zero constant. */
#define DEFAULT_GEMM_P 128
#define DEFAULT_GEMM_Q 128
#define DEFAULT_GEMM_R 128
#define DEFAULT_GEMM_OFFSET_A 0
#define DEFAULT_GEMM_OFFSET_B 0
/* Global Parameter */
/* Each definition compares a configuration macro with the name of the
   variable being defined.  Inside an #if expression every identifier that
   is still not a macro after expansion evaluates to 0, so the test reduces
   to "0 == 0" (true) when the macro expands to the variable's own name, is
   undefined, or is the constant 0; the DEFAULT_* value is then used.  When
   the macro is a non-zero constant the test is false and the variable is
   initialised from the macro.
   NOTE(review): GEMM_OFFSET_A / GEMM_OFFSET_B are presumably supplied by
   common.h -- confirm. */
#if GEMM_OFFSET_A == gemm_offset_a
BLASLONG gemm_offset_a = DEFAULT_GEMM_OFFSET_A;
#else
BLASLONG gemm_offset_a = GEMM_OFFSET_A;
#endif
#if GEMM_OFFSET_B == gemm_offset_b
BLASLONG gemm_offset_b = DEFAULT_GEMM_OFFSET_B;
#else
BLASLONG gemm_offset_b = GEMM_OFFSET_B;
#endif
/* "P" blocking parameters, one global per GEMM variant.
   For each variable the #if compares the configuration macro with the
   variable's own name; identifiers that are not macros evaluate to 0 in
   #if, so the test is true (and DEFAULT_GEMM_P is used) unless the macro
   expands to a non-zero constant, in which case that constant is the
   initial value. */
#if SBGEMM_P == sbgemm_p
BLASLONG sbgemm_p = DEFAULT_GEMM_P;
#else
BLASLONG sbgemm_p = SBGEMM_P;
#endif
#if SHGEMM_P == shgemm_p
BLASLONG shgemm_p = DEFAULT_GEMM_P;
#else
BLASLONG shgemm_p = SHGEMM_P;
#endif
#if BGEMM_P == bgemm_p
BLASLONG bgemm_p = DEFAULT_GEMM_P;
#else
BLASLONG bgemm_p = BGEMM_P;
#endif
#if SGEMM_P == sgemm_p
BLASLONG sgemm_p = DEFAULT_GEMM_P;
#else
BLASLONG sgemm_p = SGEMM_P;
#endif
#if DGEMM_P == dgemm_p
BLASLONG dgemm_p = DEFAULT_GEMM_P;
#else
BLASLONG dgemm_p = DGEMM_P;
#endif
#if CGEMM_P == cgemm_p
BLASLONG cgemm_p = DEFAULT_GEMM_P;
#else
BLASLONG cgemm_p = CGEMM_P;
#endif
#if ZGEMM_P == zgemm_p
BLASLONG zgemm_p = DEFAULT_GEMM_P;
#else
BLASLONG zgemm_p = ZGEMM_P;
#endif
/* "Q" blocking parameters, one global per GEMM variant.
   For each variable the #if compares the configuration macro with the
   variable's own name; identifiers that are not macros evaluate to 0 in
   #if, so the test is true (and DEFAULT_GEMM_Q is used) unless the macro
   expands to a non-zero constant, in which case that constant is the
   initial value. */
#if SBGEMM_Q == sbgemm_q
BLASLONG sbgemm_q = DEFAULT_GEMM_Q;
#else
BLASLONG sbgemm_q = SBGEMM_Q;
#endif
#if SHGEMM_Q == shgemm_q
BLASLONG shgemm_q = DEFAULT_GEMM_Q;
#else
BLASLONG shgemm_q = SHGEMM_Q;
#endif
#if BGEMM_Q == bgemm_q
BLASLONG bgemm_q = DEFAULT_GEMM_Q;
#else
BLASLONG bgemm_q = BGEMM_Q;
#endif
#if SGEMM_Q == sgemm_q
BLASLONG sgemm_q = DEFAULT_GEMM_Q;
#else
BLASLONG sgemm_q = SGEMM_Q;
#endif
#if DGEMM_Q == dgemm_q
BLASLONG dgemm_q = DEFAULT_GEMM_Q;
#else
BLASLONG dgemm_q = DGEMM_Q;
#endif
#if CGEMM_Q == cgemm_q
BLASLONG cgemm_q = DEFAULT_GEMM_Q;
#else
BLASLONG cgemm_q = CGEMM_Q;
#endif
#if ZGEMM_Q == zgemm_q
BLASLONG zgemm_q = DEFAULT_GEMM_Q;
#else
BLASLONG zgemm_q = ZGEMM_Q;
#endif
/* "R" blocking parameters, one global per GEMM variant.
   For each variable the #if compares the configuration macro with the
   variable's own name; identifiers that are not macros evaluate to 0 in
   #if, so the test is true (and DEFAULT_GEMM_R is used) unless the macro
   expands to a non-zero constant, in which case that constant is the
   initial value. */
#if SBGEMM_R == sbgemm_r
BLASLONG sbgemm_r = DEFAULT_GEMM_R;
#else
BLASLONG sbgemm_r = SBGEMM_R;
#endif
#if SHGEMM_R == shgemm_r
BLASLONG shgemm_r = DEFAULT_GEMM_R;
#else
BLASLONG shgemm_r = SHGEMM_R;
#endif
#if BGEMM_R == bgemm_r
BLASLONG bgemm_r = DEFAULT_GEMM_R;
#else
BLASLONG bgemm_r = BGEMM_R;
#endif
#if SGEMM_R == sgemm_r
BLASLONG sgemm_r = DEFAULT_GEMM_R;
#else
BLASLONG sgemm_r = SGEMM_R;
#endif
#if DGEMM_R == dgemm_r
BLASLONG dgemm_r = DEFAULT_GEMM_R;
#else
BLASLONG dgemm_r = DGEMM_R;
#endif
#if CGEMM_R == cgemm_r
BLASLONG cgemm_r = DEFAULT_GEMM_R;
#else
BLASLONG cgemm_r = CGEMM_R;
#endif
#if ZGEMM_R == zgemm_r
BLASLONG zgemm_r = DEFAULT_GEMM_R;
#else
BLASLONG zgemm_r = ZGEMM_R;
#endif
/* P/Q/R globals for the q* and x* GEMM variants; they are only defined when
   the build enables EXPRECISION or QUAD_PRECISION.
   As for the other precisions, each #if compares the configuration macro
   with the variable's own name: identifiers that are not macros evaluate to
   0 in #if, so the DEFAULT_* value is used unless the macro expands to a
   non-zero constant. */
#if defined(EXPRECISION) || defined(QUAD_PRECISION)
#if QGEMM_P == qgemm_p
BLASLONG qgemm_p = DEFAULT_GEMM_P;
#else
BLASLONG qgemm_p = QGEMM_P;
#endif
#if XGEMM_P == xgemm_p
BLASLONG xgemm_p = DEFAULT_GEMM_P;
#else
BLASLONG xgemm_p = XGEMM_P;
#endif
#if QGEMM_Q == qgemm_q
BLASLONG qgemm_q = DEFAULT_GEMM_Q;
#else
BLASLONG qgemm_q = QGEMM_Q;
#endif
#if XGEMM_Q == xgemm_q
BLASLONG xgemm_q = DEFAULT_GEMM_Q;
#else
BLASLONG xgemm_q = XGEMM_Q;
#endif
#if QGEMM_R == qgemm_r
BLASLONG qgemm_r = DEFAULT_GEMM_R;
#else
BLASLONG qgemm_r = QGEMM_R;
#endif
#if XGEMM_R == xgemm_r
BLASLONG xgemm_r = DEFAULT_GEMM_R;
#else
BLASLONG xgemm_r = XGEMM_R;
#endif
#endif
  196. #if defined(ARCH_X86) || defined(ARCH_X86_64)
  197. int get_L2_size(void){
  198. int eax, ebx, ecx, edx;
  199. #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
  200. defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
  201. defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \
  202. defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || \
  203. defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
  204. cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
  205. return BITMASK(ecx, 16, 0xffff);
  206. #else
  207. int info[15];
  208. int i;
  209. cpuid(2, &eax, &ebx, &ecx, &edx);
  210. info[ 0] = BITMASK(eax, 8, 0xff);
  211. info[ 1] = BITMASK(eax, 16, 0xff);
  212. info[ 2] = BITMASK(eax, 24, 0xff);
  213. info[ 3] = BITMASK(ebx, 0, 0xff);
  214. info[ 4] = BITMASK(ebx, 8, 0xff);
  215. info[ 5] = BITMASK(ebx, 16, 0xff);
  216. info[ 6] = BITMASK(ebx, 24, 0xff);
  217. info[ 7] = BITMASK(ecx, 0, 0xff);
  218. info[ 8] = BITMASK(ecx, 8, 0xff);
  219. info[ 9] = BITMASK(ecx, 16, 0xff);
  220. info[10] = BITMASK(ecx, 24, 0xff);
  221. info[11] = BITMASK(edx, 0, 0xff);
  222. info[12] = BITMASK(edx, 8, 0xff);
  223. info[13] = BITMASK(edx, 16, 0xff);
  224. info[14] = BITMASK(edx, 24, 0xff);
  225. for (i = 0; i < 15; i++){
  226. switch (info[i]){
  227. case 0x3b :
  228. case 0x41 :
  229. case 0x79 :
  230. return 128;
  231. break;
  232. case 0x3c :
  233. case 0x42 :
  234. case 0x7a :
  235. case 0x7e :
  236. case 0x82 :
  237. return 256;
  238. break;
  239. case 0x43 :
  240. case 0x7b :
  241. case 0x7f :
  242. case 0x83 :
  243. case 0x86 :
  244. return 512;
  245. break;
  246. case 0x44 :
  247. case 0x78 :
  248. case 0x7c :
  249. case 0x84 :
  250. case 0x87 :
  251. return 1024;
  252. break;
  253. case 0x45 :
  254. case 0x7d :
  255. case 0x85 :
  256. return 2048;
  257. case 0x49 :
  258. return 4096;
  259. break;
  260. }
  261. }
  262. /* Never reached */
  263. return 0;
  264. #endif
  265. }
/*
 * x86 / x86-64: compute the run-time GEMM blocking parameters.
 *
 * Steps, in order:
 *   1. Pick `size`: the constant 16 for the targets listed in the first
 *      #if, otherwise the value returned by get_L2_size().
 *   2. Per-target sections overwrite the *_p globals, either from `size`
 *      or with fixed constants.  Targets that have no section here leave
 *      the *_p globals at their current values.
 *   3. If openblas_block_factor() returns a positive value it is clamped to
 *      [10, 200], applied as a percentage and the result is rounded down
 *      to a multiple of 8.
 *   4. Any *_p that ended up 0 is replaced by 64, then each *_p is rounded
 *      up to a multiple of the matching *_UNROLL_M.
 *   5. The *_r globals are recomputed from BUFFER_SIZE and the P/Q macros.
 *
 * Takes no arguments and returns nothing; the results are stored in the
 * file-level *_p / *_r globals.
 */
void blas_set_parameter(void){

  int factor;
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || \
    defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || \
    defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
  /* These targets do not query CPUID; a fixed value is used instead. */
  int size = 16;
#else
  int size = get_L2_size();
#endif

#if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS)
  size >>= 7;
#if defined(CORE_BANIAS) && (HAVE_HIT > 1)
  /* Same base multipliers as the #else branch, divided by HAVE_HIT. */
  sgemm_p = 64 / HAVE_HIT * size;
  dgemm_p = 32 / HAVE_HIT * size;
  cgemm_p = 32 / HAVE_HIT * size;
  zgemm_p = 16 / HAVE_HIT * size;
#ifdef EXPRECISION
  qgemm_p = 16 / HAVE_HIT * size;
  xgemm_p = 8 / HAVE_HIT * size;
#endif
#ifdef QUAD_PRECISION
  qgemm_p = 8 / HAVE_HIT * size;
  xgemm_p = 4 / HAVE_HIT * size;
#endif
#else
  sgemm_p = 64 * size;
  dgemm_p = 32 * size;
  cgemm_p = 32 * size;
  zgemm_p = 16 * size;
#ifdef EXPRECISION
  qgemm_p = 16 * size;
  xgemm_p = 8 * size;
#endif
#ifdef QUAD_PRECISION
  qgemm_p = 8 * size;
  xgemm_p = 4 * size;
#endif
#endif
#endif

#if defined(CORE_NORTHWOOD)
  size >>= 7;
#ifdef ALLOC_HUGETLB
  /* Larger multipliers are used when ALLOC_HUGETLB is defined. */
  sgemm_p = 128 * size;
  dgemm_p = 64 * size;
  cgemm_p = 64 * size;
  zgemm_p = 32 * size;
#ifdef EXPRECISION
  qgemm_p = 32 * size;
  xgemm_p = 16 * size;
#endif
#ifdef QUAD_PRECISION
  qgemm_p = 16 * size;
  xgemm_p = 8 * size;
#endif
#else
  sgemm_p = 96 * size;
  dgemm_p = 48 * size;
  cgemm_p = 48 * size;
  zgemm_p = 24 * size;
#ifdef EXPRECISION
  qgemm_p = 24 * size;
  xgemm_p = 12 * size;
#endif
#ifdef QUAD_PRECISION
  qgemm_p = 12 * size;
  xgemm_p = 6 * size;
#endif
#endif
#endif

#if defined(CORE_CORE2)
  size >>= 9;
  sgemm_p = 92 * size;
  dgemm_p = 46 * size;
  cgemm_p = 46 * size;
  zgemm_p = 23 * size;
#ifdef EXPRECISION
  qgemm_p = 23 * size;
  xgemm_p = 11 * size;
#endif
#ifdef QUAD_PRECISION
  qgemm_p = 11 * size;
  xgemm_p = 5 * size;
#endif
#endif

#if defined(PENRYN)
  /* `size` is only read by the QUAD_PRECISION lines of this section; the
     other values are fixed constants. */
  size >>= 9;
  sgemm_p = 1024;
  dgemm_p = 512;
  cgemm_p = 512;
  zgemm_p = 256;
#ifdef EXPRECISION
  qgemm_p = 256;
  xgemm_p = 128;
#endif
#ifdef QUAD_PRECISION
  qgemm_p = 21 * size + 4;
  xgemm_p = 10 * size + 2;
#endif
#endif

#if defined(DUNNINGTON)
  /* As for PENRYN, `size` is only read by the QUAD_PRECISION lines. */
  size >>= 9;
  sgemm_p = 384;
  dgemm_p = 384;
  cgemm_p = 384;
  zgemm_p = 384;
#ifdef EXPRECISION
  qgemm_p = 384;
  xgemm_p = 384;
#endif
#ifdef QUAD_PRECISION
  qgemm_p = 21 * size + 4;
  xgemm_p = 10 * size + 2;
#endif
#endif

#if defined(NEHALEM)
  /* Fixed values; `size` is not consulted. */
  sgemm_p = 1024;
  dgemm_p = 512;
  cgemm_p = 512;
  zgemm_p = 256;
#ifdef EXPRECISION
  qgemm_p = 256;
  xgemm_p = 128;
#endif
#endif

#if defined(SANDYBRIDGE)
  /* Fixed values; `size` is not consulted. */
  sgemm_p = 1024;
  dgemm_p = 512;
  cgemm_p = 512;
  zgemm_p = 256;
#ifdef EXPRECISION
  qgemm_p = 256;
  xgemm_p = 128;
#endif
#endif

#if defined(CORE_PRESCOTT) || defined(GENERIC)
  size >>= 6;
  /* Cap the scale factor so the resulting P values stay bounded. */
  if (size > 16) size = 16;
  sgemm_p = 56 * size;
  dgemm_p = 28 * size;
  cgemm_p = 28 * size;
  zgemm_p = 14 * size;
#ifdef EXPRECISION
  qgemm_p = 14 * size;
  xgemm_p = 7 * size;
#endif
#ifdef QUAD_PRECISION
  qgemm_p = 7 * size;
  xgemm_p = 3 * size;
#endif
#endif

#if defined(CORE_OPTERON)
  /* Base value plus a term that grows with `size`; `size` itself is left
     unmodified and shifted per assignment. */
  sgemm_p = 224 + 14 * (size >> 5);
  dgemm_p = 112 + 14 * (size >> 6);
  cgemm_p = 116 + 14 * (size >> 6);
  zgemm_p = 58 + 14 * (size >> 7);
#ifdef EXPRECISION
  qgemm_p = 58 + 14 * (size >> 7);
  xgemm_p = 29 + 14 * (size >> 8);
#endif
#ifdef QUAD_PRECISION
  qgemm_p = 29 + 14 * (size >> 8);
  xgemm_p = 15 + 14 * (size >> 9);
#endif
#endif

#if defined(ATOM)
  /* `size` is shifted but not read again in this section; all values are
     fixed constants. */
  size >>= 8;
  sgemm_p = 256;
  dgemm_p = 128;
  cgemm_p = 128;
  zgemm_p = 64;
#ifdef EXPRECISION
  qgemm_p = 64;
  xgemm_p = 32;
#endif
#ifdef QUAD_PRECISION
  qgemm_p = 32;
  xgemm_p = 16;
#endif
#endif

#if defined(CORE_BARCELONA) || defined(CORE_BOBCAT)
  size >>= 8;
  sgemm_p = 232 * size;
  dgemm_p = 116 * size;
  cgemm_p = 116 * size;
  zgemm_p = 58 * size;
#ifdef EXPRECISION
  qgemm_p = 58 * size;
  xgemm_p = 26 * size;
#endif
#ifdef QUAD_PRECISION
  qgemm_p = 26 * size;
  xgemm_p = 13 * size;
#endif
#endif

  /* Optional scaling: a positive factor is treated as a percentage,
     clamped to 10..200, and the scaled P is rounded down to a multiple of
     8 by the "& ~7L" mask. */
  factor=openblas_block_factor();
  if (factor>0) {
    if (factor < 10) factor = 10;
    if (factor > 200) factor = 200;
    sgemm_p = ((long)((double)sgemm_p * (double)factor * 1.e-2)) & ~7L;
    dgemm_p = ((long)((double)dgemm_p * (double)factor * 1.e-2)) & ~7L;
    cgemm_p = ((long)((double)cgemm_p * (double)factor * 1.e-2)) & ~7L;
    zgemm_p = ((long)((double)zgemm_p * (double)factor * 1.e-2)) & ~7L;
    /* NOTE(review): qgemm_p / xgemm_p are scaled only under EXPRECISION,
       although they also exist under QUAD_PRECISION -- confirm that this
       is intentional. */
#ifdef EXPRECISION
    qgemm_p = ((long)((double)qgemm_p * (double)factor * 1.e-2)) & ~7L;
    xgemm_p = ((long)((double)xgemm_p * (double)factor * 1.e-2)) & ~7L;
#endif
  }

  /* A P of zero (e.g. after the shifts or the rounding above) is replaced
     by 64. */
  if (sgemm_p == 0) sgemm_p = 64;
  if (dgemm_p == 0) dgemm_p = 64;
  if (cgemm_p == 0) cgemm_p = 64;
  if (zgemm_p == 0) zgemm_p = 64;
#ifdef EXPRECISION
  if (qgemm_p == 0) qgemm_p = 64;
  if (xgemm_p == 0) xgemm_p = 64;
#endif
#ifdef QUAD_PRECISION
  if (qgemm_p == 0) qgemm_p = 64;
  if (xgemm_p == 0) xgemm_p = 64;
#endif

  /* Round every P up to the next multiple of its *_UNROLL_M. */
  sgemm_p = ((sgemm_p + SGEMM_UNROLL_M - 1)/SGEMM_UNROLL_M) * SGEMM_UNROLL_M;
  dgemm_p = ((dgemm_p + DGEMM_UNROLL_M - 1)/DGEMM_UNROLL_M) * DGEMM_UNROLL_M;
  cgemm_p = ((cgemm_p + CGEMM_UNROLL_M - 1)/CGEMM_UNROLL_M) * CGEMM_UNROLL_M;
  zgemm_p = ((zgemm_p + ZGEMM_UNROLL_M - 1)/ZGEMM_UNROLL_M) * ZGEMM_UNROLL_M;
  /* NOTE(review): this rounding is applied to qgemm_p / xgemm_p only under
     QUAD_PRECISION, not under EXPRECISION -- confirm. */
#ifdef QUAD_PRECISION
  qgemm_p = ((qgemm_p + QGEMM_UNROLL_M - 1)/QGEMM_UNROLL_M) * QGEMM_UNROLL_M;
  xgemm_p = ((xgemm_p + XGEMM_UNROLL_M - 1)/XGEMM_UNROLL_M) * XGEMM_UNROLL_M;
#endif

  /* R values.  Every expression has the same shape:
       P * Q * <bytes> + GEMM_OFFSET_A is rounded up with
       "(x + GEMM_ALIGN) & ~GEMM_ALIGN", subtracted from BUFFER_SIZE,
       divided by Q * <bytes>, reduced by 15 and rounded down to a
       multiple of 16 ("& ~15").
     <bytes> is the literal constant in each line (4, 8, 16 or 32).
     NOTE(review): the "& ~GEMM_ALIGN" usage implies GEMM_ALIGN is a
     low-bit mask rather than a power of two -- confirm in common.h. */
#ifdef BUILD_BFLOAT16
  sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15;
  bgemm_r = (((BUFFER_SIZE - ((BGEMM_P * BGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (BGEMM_Q * 4)) - 15) & ~15;
#endif
#ifdef BUILD_HFLOAT16
  shgemm_r = (((BUFFER_SIZE - ((SHGEMM_P * SHGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SHGEMM_Q * 4)) - 15) & ~15;
#endif
  sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15;
  dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15;
  cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15;
  zgemm_r = (((BUFFER_SIZE - ((ZGEMM_P * ZGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (ZGEMM_Q * 16)) - 15) & ~15;
#if defined(EXPRECISION) || defined(QUAD_PRECISION)
  qgemm_r = (((BUFFER_SIZE - ((QGEMM_P * QGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (QGEMM_Q * 16)) - 15) & ~15;
  xgemm_r = (((BUFFER_SIZE - ((XGEMM_P * XGEMM_Q * 32 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (XGEMM_Q * 32)) - 15) & ~15;
#endif

  /* Disabled debug dump of the resulting parameters. */
#if 0
  fprintf(stderr, "SGEMM ... %3d, %3d, %3d\n", SGEMM_P, SGEMM_Q, SGEMM_R);
  fprintf(stderr, "DGEMM ... %3d, %3d, %3d\n", DGEMM_P, DGEMM_Q, DGEMM_R);
  fprintf(stderr, "CGEMM ... %3d, %3d, %3d\n", CGEMM_P, CGEMM_Q, CGEMM_R);
  fprintf(stderr, "ZGEMM ... %3d, %3d, %3d\n", ZGEMM_P, ZGEMM_Q, ZGEMM_R);
#endif

  return;
}
/* Disabled (#if 0) diagnostic helper: it is never compiled.
 *
 * As written it would read several CPUID leaves and print the APIC id and
 * the core count to stderr, always returning 0.
 *
 * NOTE(review), should this ever be re-enabled:
 *   - `ncores` is assigned only under CORE_PRESCOTT or CORE_OPTERON; with
 *     neither defined it is incremented and printed uninitialised.
 *   - `nlprocs` and `cmplegacy` are assigned but never read.
 */
#if 0

int get_current_cpu_info(void){

  int nlprocs, ncores, cmplegacy;
  int htt = 0;
  int apicid = 0;

#if defined(CORE_PRESCOTT) || defined(CORE_OPTERON)
  int eax, ebx, ecx, edx;

  /* Leaf 1: fields taken from EBX bits 16-23 and 24-31, and EDX bit 28. */
  cpuid(1, &eax, &ebx, &ecx, &edx);
  nlprocs = BITMASK(ebx, 16, 0xff);
  apicid = BITMASK(ebx, 24, 0xff);
  htt = BITMASK(edx, 28, 0x01);
#endif

#if defined(CORE_PRESCOTT)
  /* Leaf 4: 6-bit field starting at EAX bit 26. */
  cpuid(4, &eax, &ebx, &ecx, &edx);
  ncores = BITMASK(eax, 26, 0x3f);

  if (htt == 0) nlprocs = 0;
#endif

#if defined(CORE_OPTERON)
  /* Leaf 0x80000008: low byte of ECX. */
  cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
  ncores = BITMASK(ecx, 0, 0xff);

  /* Leaf 0x80000001: ECX bit 1. */
  cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
  cmplegacy = BITMASK(ecx, 1, 0x01);

  if (htt == 0) {
    nlprocs = 0;
    ncores = 0;
    cmplegacy = 0;
  }
#endif

  /* The value read above is incremented by one before it is printed. */
  ncores ++;

  fprintf(stderr, "APICID = %d Number of core = %d\n", apicid, ncores);

  return 0;
}
#endif
  549. #endif
  550. #if defined(ARCH_IA64)
  551. static inline BLASULONG cpuid(BLASULONG regnum){
  552. BLASULONG value;
  553. #ifndef __ECC
  554. asm ("mov %0=cpuid[%r1]" : "=r"(value) : "rO"(regnum));
  555. #else
  556. value = __getIndReg(_IA64_REG_INDR_CPUID, regnum);
  557. #endif
  558. return value;
  559. }
  560. #if 1
  561. void blas_set_parameter(void){
  562. BLASULONG cpuid3, size;
  563. cpuid3 = cpuid(3);
  564. size = BITMASK(cpuid3, 16, 0xff);
  565. sbgemm_p = 192 * (size + 1);
  566. shgemm_p = 192 * (size + 1);
  567. sgemm_p = 192 * (size + 1);
  568. dgemm_p = 96 * (size + 1);
  569. cgemm_p = 96 * (size + 1);
  570. zgemm_p = 48 * (size + 1);
  571. #ifdef EXPRECISION
  572. qgemm_p = 64 * (size + 1);
  573. xgemm_p = 32 * (size + 1);
  574. #endif
  575. #ifdef QUAD_PRECISION
  576. qgemm_p = 32 * (size + 1);
  577. xgemm_p = 16 * (size + 1);
  578. #endif
  579. #ifdef BUILD_BFLOAT16
  580. sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15;
  581. bgemm_r = (((BUFFER_SIZE - ((BGEMM_P * BGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (BGEMM_Q * 4)) - 15) & ~15;
  582. #endif
  583. #ifdef BUILD_HFLOAT16
  584. shgemm_r = (((BUFFER_SIZE - ((SHGEMM_P * SHGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SHGEMM_Q * 4)) - 15) & ~15;
  585. #endif
  586. sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15;
  587. dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15;
  588. cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15;
  589. zgemm_r = (((BUFFER_SIZE - ((ZGEMM_P * ZGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (ZGEMM_Q * 16)) - 15) & ~15;
  590. #if defined(EXPRECISION) || defined(QUAD_PRECISION)
  591. qgemm_r = (((BUFFER_SIZE - ((QGEMM_P * QGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (QGEMM_Q * 16)) - 15) & ~15;
  592. xgemm_r = (((BUFFER_SIZE - ((XGEMM_P * XGEMM_Q * 32 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (XGEMM_Q * 32)) - 15) & ~15;
  593. #endif
  594. return;
  595. }
  596. #else
  597. #define IA64_SYS_NAME "/sys/devices/system/cpu/cpu0/cache/index3/size"
  598. #define IA64_PROC_NAME "/proc/pal/cpu0/cache_info"
  599. void blas_set_parameter(void){
  600. BLASULONG cpuid3;
  601. int size = 0;
  602. #if 1
  603. char buffer[128];
  604. FILE *infile;
  605. if ((infile = fopen(IA64_SYS_NAME, "r")) != NULL) {
  606. fgets(buffer, sizeof(buffer), infile);
  607. fclose(infile);
  608. size = atoi(buffer) / 1536;
  609. }
  610. if (size <= 0) {
  611. if ((infile = fopen(IA64_PROC_NAME, "r")) != NULL) {
  612. while(fgets(buffer, sizeof(buffer), infile) != NULL) {
  613. if ((!strncmp("Data/Instruction Cache level 3", buffer, 30))) break;
  614. }
  615. fgets(buffer, sizeof(buffer), infile);
  616. fclose(infile);
  617. *strstr(buffer, "bytes") = (char)NULL;
  618. size = atoi(strchr(buffer, ':') + 1) / 1572864;
  619. }
  620. }
  621. #endif
  622. /* The last resort */
  623. if (size <= 0) {
  624. cpuid3 = cpuid(3);
  625. size = BITMASK(cpuid3, 16, 0xff) + 1;
  626. }
  627. sgemm_p = 320 * size;
  628. dgemm_p = 160 * size;
  629. cgemm_p = 160 * size;
  630. zgemm_p = 80 * size;
  631. #ifdef EXPRECISION
  632. qgemm_p = 80 * size;
  633. xgemm_p = 40 * size;
  634. #endif
  635. sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15;
  636. dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15;
  637. cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15;
  638. zgemm_r = (((BUFFER_SIZE - ((ZGEMM_P * ZGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (ZGEMM_Q * 16)) - 15) & ~15;
  639. #ifdef EXPRECISION
  640. qgemm_r = (((BUFFER_SIZE - ((QGEMM_P * QGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (QGEMM_Q * 16)) - 15) & ~15;
  641. xgemm_r = (((BUFFER_SIZE - ((XGEMM_P * XGEMM_Q * 32 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (XGEMM_Q * 32)) - 15) & ~15;
  642. #endif
  643. return;
  644. }
  645. #endif
  646. #endif
  647. #if defined(ARCH_MIPS64)
  648. void blas_set_parameter(void){
  649. #if defined(LOONGSON3R3) || defined(LOONGSON3R4)
  650. #ifdef SMP
  651. if(blas_num_threads == 1){
  652. #endif
  653. //single thread
  654. dgemm_r = 1024;
  655. #ifdef SMP
  656. }else{
  657. //multi thread
  658. dgemm_r = 200;
  659. }
  660. #endif
  661. #endif
  662. }
  663. #endif
  664. #if defined(ARCH_LOONGARCH64)
  665. int get_L3_size() {
  666. int ret = 0, id = 0x14;
  667. __asm__ volatile (
  668. "cpucfg %[ret], %[id]"
  669. : [ret]"=r"(ret)
  670. : [id]"r"(id)
  671. : "memory"
  672. );
  673. return ((ret & 0xffff) + 1) * pow(2, ((ret >> 16) & 0xff)) * pow(2, ((ret >> 24) & 0x7f)) / 1024 / 1024; // MB
  674. }
  675. void blas_set_parameter(void){
  676. #if defined(LA464)
  677. int L3_size = get_L3_size();
  678. #ifdef SMP
  679. if(blas_num_threads == 1){
  680. #endif
  681. //single thread
  682. if (L3_size == 32){ // 3C5000 and 3D5000
  683. sgemm_p = 256;
  684. sgemm_q = 384;
  685. sgemm_r = 8192;
  686. dgemm_p = 112;
  687. dgemm_q = 289;
  688. dgemm_r = 4096;
  689. cgemm_p = 128;
  690. cgemm_q = 256;
  691. cgemm_r = 4096;
  692. zgemm_p = 128;
  693. zgemm_q = 128;
  694. zgemm_r = 2048;
  695. } else { // 3A5000 and 3C5000L
  696. sgemm_p = 256;
  697. sgemm_q = 384;
  698. sgemm_r = 4096;
  699. dgemm_p = 112;
  700. dgemm_q = 300;
  701. dgemm_r = 3024;
  702. cgemm_p = 128;
  703. cgemm_q = 256;
  704. cgemm_r = 2048;
  705. zgemm_p = 128;
  706. zgemm_q = 128;
  707. zgemm_r = 1024;
  708. }
  709. #ifdef SMP
  710. }else{
  711. //multi thread
  712. if (L3_size == 32){ // 3C5000 and 3D5000
  713. sgemm_p = 256;
  714. sgemm_q = 384;
  715. sgemm_r = 1024;
  716. dgemm_p = 112;
  717. dgemm_q = 289;
  718. dgemm_r = 342;
  719. cgemm_p = 128;
  720. cgemm_q = 256;
  721. cgemm_r = 512;
  722. zgemm_p = 128;
  723. zgemm_q = 128;
  724. zgemm_r = 512;
  725. } else { // 3A5000 and 3C5000L
  726. sgemm_p = 256;
  727. sgemm_q = 384;
  728. sgemm_r = 2048;
  729. dgemm_p = 112;
  730. dgemm_q = 300;
  731. dgemm_r = 738;
  732. cgemm_p = 128;
  733. cgemm_q = 256;
  734. cgemm_r = 1024;
  735. zgemm_p = 128;
  736. zgemm_q = 128;
  737. zgemm_r = 1024;
  738. }
  739. }
  740. #endif
  741. #endif
  742. }
  743. #endif
  744. #if defined(ARCH_ARM64)
  745. void blas_set_parameter(void)
  746. {
  747. }
  748. #endif