|
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864 |
- /*********************************************************************/
- /* Copyright 2009, 2010 The University of Texas at Austin. */
- /* All rights reserved. */
- /* */
- /* Redistribution and use in source and binary forms, with or */
- /* without modification, are permitted provided that the following */
- /* conditions are met: */
- /* */
- /* 1. Redistributions of source code must retain the above */
- /* copyright notice, this list of conditions and the following */
- /* disclaimer. */
- /* */
- /* 2. Redistributions in binary form must reproduce the above */
- /* copyright notice, this list of conditions and the following */
- /* disclaimer in the documentation and/or other materials */
- /* provided with the distribution. */
- /* */
- /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
- /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
- /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
- /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
- /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
- /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
- /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
- /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
- /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
- /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
- /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
- /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
- /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
- /* POSSIBILITY OF SUCH DAMAGE. */
- /* */
- /* The views and conclusions contained in the software and */
- /* documentation are those of the authors and should not be */
- /* interpreted as representing official policies, either expressed */
- /* or implied, of The University of Texas at Austin. */
- /*********************************************************************/
-
- #include <stdio.h>
- #include <string.h>
- #include "common.h"
-
/* Routines this file calls; openblas_block_factor() is defined elsewhere. */
extern int openblas_block_factor(void);
extern int get_L2_size(void);

/* Fallback values for the run-time tunable blocking parameters below. */
#define DEFAULT_GEMM_OFFSET_A 0
#define DEFAULT_GEMM_OFFSET_B 0
#define DEFAULT_GEMM_P 128
#define DEFAULT_GEMM_Q 128
#define DEFAULT_GEMM_R 128
-
- /* Global Parameter */
- #if GEMM_OFFSET_A == gemm_offset_a
- BLASLONG gemm_offset_a = DEFAULT_GEMM_OFFSET_A;
- #else
- BLASLONG gemm_offset_a = GEMM_OFFSET_A;
- #endif
-
- #if GEMM_OFFSET_B == gemm_offset_b
- BLASLONG gemm_offset_b = DEFAULT_GEMM_OFFSET_B;
- #else
- BLASLONG gemm_offset_b = GEMM_OFFSET_B;
- #endif
-
- #if SBGEMM_P == sbgemm_p
- BLASLONG sbgemm_p = DEFAULT_GEMM_P;
- #else
- BLASLONG sbgemm_p = SBGEMM_P;
- #endif
- #if SHGEMM_P == shgemm_p
- BLASLONG shgemm_p = DEFAULT_GEMM_P;
- #else
- BLASLONG shgemm_p = SHGEMM_P;
- #endif
- #if SGEMM_P == sgemm_p
- BLASLONG sgemm_p = DEFAULT_GEMM_P;
- #else
- BLASLONG sgemm_p = SGEMM_P;
- #endif
- #if DGEMM_P == dgemm_p
- BLASLONG dgemm_p = DEFAULT_GEMM_P;
- #else
- BLASLONG dgemm_p = DGEMM_P;
- #endif
- #if CGEMM_P == cgemm_p
- BLASLONG cgemm_p = DEFAULT_GEMM_P;
- #else
- BLASLONG cgemm_p = CGEMM_P;
- #endif
- #if ZGEMM_P == zgemm_p
- BLASLONG zgemm_p = DEFAULT_GEMM_P;
- #else
- BLASLONG zgemm_p = ZGEMM_P;
- #endif
-
- #if SBGEMM_Q == sbgemm_q
- BLASLONG sbgemm_q = DEFAULT_GEMM_Q;
- #else
- BLASLONG sbgemm_q = SBGEMM_Q;
- #endif
- #if SHGEMM_Q == shgemm_q
- BLASLONG shgemm_q = DEFAULT_GEMM_Q;
- #else
- BLASLONG shgemm_q = SHGEMM_Q;
- #endif
- #if SGEMM_Q == sgemm_q
- BLASLONG sgemm_q = DEFAULT_GEMM_Q;
- #else
- BLASLONG sgemm_q = SGEMM_Q;
- #endif
- #if DGEMM_Q == dgemm_q
- BLASLONG dgemm_q = DEFAULT_GEMM_Q;
- #else
- BLASLONG dgemm_q = DGEMM_Q;
- #endif
- #if CGEMM_Q == cgemm_q
- BLASLONG cgemm_q = DEFAULT_GEMM_Q;
- #else
- BLASLONG cgemm_q = CGEMM_Q;
- #endif
- #if ZGEMM_Q == zgemm_q
- BLASLONG zgemm_q = DEFAULT_GEMM_Q;
- #else
- BLASLONG zgemm_q = ZGEMM_Q;
- #endif
-
- #if SBGEMM_R == sbgemm_r
- BLASLONG sbgemm_r = DEFAULT_GEMM_R;
- #else
- BLASLONG sbgemm_r = SBGEMM_R;
- #endif
- #if SHGEMM_R == shgemm_r
- BLASLONG shgemm_r = DEFAULT_GEMM_R;
- #else
- BLASLONG shgemm_r = SHGEMM_R;
- #endif
- #if SGEMM_R == sgemm_r
- BLASLONG sgemm_r = DEFAULT_GEMM_R;
- #else
- BLASLONG sgemm_r = SGEMM_R;
- #endif
- #if DGEMM_R == dgemm_r
- BLASLONG dgemm_r = DEFAULT_GEMM_R;
- #else
- BLASLONG dgemm_r = DGEMM_R;
- #endif
- #if CGEMM_R == cgemm_r
- BLASLONG cgemm_r = DEFAULT_GEMM_R;
- #else
- BLASLONG cgemm_r = CGEMM_R;
- #endif
- #if ZGEMM_R == zgemm_r
- BLASLONG zgemm_r = DEFAULT_GEMM_R;
- #else
- BLASLONG zgemm_r = ZGEMM_R;
- #endif
-
- #if defined(EXPRECISION) || defined(QUAD_PRECISION)
- #if QGEMM_P == qgemm_p
- BLASLONG qgemm_p = DEFAULT_GEMM_P;
- #else
- BLASLONG qgemm_p = QGEMM_P;
- #endif
- #if XGEMM_P == xgemm_p
- BLASLONG xgemm_p = DEFAULT_GEMM_P;
- #else
- BLASLONG xgemm_p = XGEMM_P;
- #endif
- #if QGEMM_Q == qgemm_q
- BLASLONG qgemm_q = DEFAULT_GEMM_Q;
- #else
- BLASLONG qgemm_q = QGEMM_Q;
- #endif
- #if XGEMM_Q == xgemm_q
- BLASLONG xgemm_q = DEFAULT_GEMM_Q;
- #else
- BLASLONG xgemm_q = XGEMM_Q;
- #endif
- #if QGEMM_R == qgemm_r
- BLASLONG qgemm_r = DEFAULT_GEMM_R;
- #else
- BLASLONG qgemm_r = QGEMM_R;
- #endif
- #if XGEMM_R == xgemm_r
- BLASLONG xgemm_r = DEFAULT_GEMM_R;
- #else
- BLASLONG xgemm_r = XGEMM_R;
- #endif
- #endif
-
- #if defined(ARCH_X86) || defined(ARCH_X86_64)
-
/*
 * Return the L2 cache size figure that blas_set_parameter() scales its
 * block sizes from.
 *
 * For the cores listed in the #if below the value is extracted from ECX of
 * CPUID leaf 0x80000006.  For every other core the descriptor bytes of
 * CPUID leaf 2 are scanned in register order (EAX, EBX, ECX, EDX; the
 * lowest byte of EAX is skipped) and the first byte found in the table of
 * known descriptors decides the result.
 *
 * Returns 0 when none of the descriptor bytes is known.
 * NOTE(review): the returned numbers look like kilobytes (see Intel's
 * descriptor table) - confirm before relying on the unit.
 */
int get_L2_size(void){

  int eax, ebx, ecx, edx;

#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
    defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
    defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \
    defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || \
    defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)

  cpuid(0x80000006, &eax, &ebx, &ecx, &edx);

  return BITMASK(ecx, 16, 0xffff);

#else

  /* Known leaf-2 descriptor bytes and the size each one stands for. */
  static const struct {
    int descriptor;
    int size;
  } known[] = {
    { 0x3b,  128 }, { 0x41,  128 }, { 0x79,  128 },
    { 0x3c,  256 }, { 0x42,  256 }, { 0x7a,  256 }, { 0x7e,  256 }, { 0x82,  256 },
    { 0x43,  512 }, { 0x7b,  512 }, { 0x7f,  512 }, { 0x83,  512 }, { 0x86,  512 },
    { 0x44, 1024 }, { 0x78, 1024 }, { 0x7c, 1024 }, { 0x84, 1024 }, { 0x87, 1024 },
    { 0x45, 2048 }, { 0x7d, 2048 }, { 0x85, 2048 },
    { 0x49, 4096 },
  };
  const int nknown = (int)(sizeof(known) / sizeof(known[0]));

  int bytes[15];
  int pos, k;

  cpuid(2, &eax, &ebx, &ecx, &edx);

  /* EAX contributes three bytes, the other registers four each. */
  bytes[ 0] = BITMASK(eax,  8, 0xff);
  bytes[ 1] = BITMASK(eax, 16, 0xff);
  bytes[ 2] = BITMASK(eax, 24, 0xff);

  bytes[ 3] = BITMASK(ebx,  0, 0xff);
  bytes[ 4] = BITMASK(ebx,  8, 0xff);
  bytes[ 5] = BITMASK(ebx, 16, 0xff);
  bytes[ 6] = BITMASK(ebx, 24, 0xff);

  bytes[ 7] = BITMASK(ecx,  0, 0xff);
  bytes[ 8] = BITMASK(ecx,  8, 0xff);
  bytes[ 9] = BITMASK(ecx, 16, 0xff);
  bytes[10] = BITMASK(ecx, 24, 0xff);

  bytes[11] = BITMASK(edx,  0, 0xff);
  bytes[12] = BITMASK(edx,  8, 0xff);
  bytes[13] = BITMASK(edx, 16, 0xff);
  bytes[14] = BITMASK(edx, 24, 0xff);

  /* The first recognised byte, in scan order, wins. */
  for (pos = 0; pos < 15; pos++) {
    for (k = 0; k < nknown; k++) {
      if (bytes[pos] == known[k].descriptor) {
        return known[k].size;
      }
    }
  }

  /* No recognised descriptor: report 0. */
  return 0;
#endif
}
-
- void blas_set_parameter(void){
-
- int factor;
- #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || \
- defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || \
- defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
- int size = 16;
- #else
- int size = get_L2_size();
- #endif
-
- #if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS)
- size >>= 7;
-
- #if defined(CORE_BANIAS) && (HAVE_HIT > 1)
- sgemm_p = 64 / HAVE_HIT * size;
- dgemm_p = 32 / HAVE_HIT * size;
- cgemm_p = 32 / HAVE_HIT * size;
- zgemm_p = 16 / HAVE_HIT * size;
- #ifdef EXPRECISION
- qgemm_p = 16 / HAVE_HIT * size;
- xgemm_p = 8 / HAVE_HIT * size;
- #endif
- #ifdef QUAD_PRECISION
- qgemm_p = 8 / HAVE_HIT * size;
- xgemm_p = 4 / HAVE_HIT * size;
- #endif
- #else
- sgemm_p = 64 * size;
- dgemm_p = 32 * size;
- cgemm_p = 32 * size;
- zgemm_p = 16 * size;
- #ifdef EXPRECISION
- qgemm_p = 16 * size;
- xgemm_p = 8 * size;
- #endif
- #ifdef QUAD_PRECISION
- qgemm_p = 8 * size;
- xgemm_p = 4 * size;
- #endif
- #endif
- #endif
-
- #if defined(CORE_NORTHWOOD)
- size >>= 7;
-
- #ifdef ALLOC_HUGETLB
- sgemm_p = 128 * size;
- dgemm_p = 64 * size;
- cgemm_p = 64 * size;
- zgemm_p = 32 * size;
- #ifdef EXPRECISION
- qgemm_p = 32 * size;
- xgemm_p = 16 * size;
- #endif
- #ifdef QUAD_PRECISION
- qgemm_p = 16 * size;
- xgemm_p = 8 * size;
- #endif
- #else
- sgemm_p = 96 * size;
- dgemm_p = 48 * size;
- cgemm_p = 48 * size;
- zgemm_p = 24 * size;
- #ifdef EXPRECISION
- qgemm_p = 24 * size;
- xgemm_p = 12 * size;
- #endif
- #ifdef QUAD_PRECISION
- qgemm_p = 12 * size;
- xgemm_p = 6 * size;
- #endif
- #endif
- #endif
-
- #if defined(CORE_CORE2)
-
- size >>= 9;
-
- sgemm_p = 92 * size;
- dgemm_p = 46 * size;
- cgemm_p = 46 * size;
- zgemm_p = 23 * size;
-
- #ifdef EXPRECISION
- qgemm_p = 23 * size;
- xgemm_p = 11 * size;
- #endif
- #ifdef QUAD_PRECISION
- qgemm_p = 11 * size;
- xgemm_p = 5 * size;
- #endif
- #endif
-
- #if defined(PENRYN)
-
- size >>= 9;
-
- sgemm_p = 1024;
- dgemm_p = 512;
- cgemm_p = 512;
- zgemm_p = 256;
-
- #ifdef EXPRECISION
- qgemm_p = 256;
- xgemm_p = 128;
- #endif
- #ifdef QUAD_PRECISION
- qgemm_p = 21 * size + 4;
- xgemm_p = 10 * size + 2;
- #endif
- #endif
-
- #if defined(DUNNINGTON)
-
- size >>= 9;
-
- sgemm_p = 384;
- dgemm_p = 384;
- cgemm_p = 384;
- zgemm_p = 384;
-
- #ifdef EXPRECISION
- qgemm_p = 384;
- xgemm_p = 384;
- #endif
- #ifdef QUAD_PRECISION
- qgemm_p = 21 * size + 4;
- xgemm_p = 10 * size + 2;
- #endif
- #endif
-
- #if defined(NEHALEM)
- sgemm_p = 1024;
- dgemm_p = 512;
- cgemm_p = 512;
- zgemm_p = 256;
- #ifdef EXPRECISION
- qgemm_p = 256;
- xgemm_p = 128;
- #endif
- #endif
-
- #if defined(SANDYBRIDGE)
- sgemm_p = 1024;
- dgemm_p = 512;
- cgemm_p = 512;
- zgemm_p = 256;
- #ifdef EXPRECISION
- qgemm_p = 256;
- xgemm_p = 128;
- #endif
- #endif
-
- #if defined(CORE_PRESCOTT) || defined(GENERIC)
- size >>= 6;
-
- if (size > 16) size = 16;
-
- sgemm_p = 56 * size;
- dgemm_p = 28 * size;
- cgemm_p = 28 * size;
- zgemm_p = 14 * size;
- #ifdef EXPRECISION
- qgemm_p = 14 * size;
- xgemm_p = 7 * size;
- #endif
- #ifdef QUAD_PRECISION
- qgemm_p = 7 * size;
- xgemm_p = 3 * size;
- #endif
- #endif
-
- #if defined(CORE_OPTERON)
- sgemm_p = 224 + 14 * (size >> 5);
- dgemm_p = 112 + 14 * (size >> 6);
- cgemm_p = 116 + 14 * (size >> 6);
- zgemm_p = 58 + 14 * (size >> 7);
- #ifdef EXPRECISION
- qgemm_p = 58 + 14 * (size >> 7);
- xgemm_p = 29 + 14 * (size >> 8);
- #endif
- #ifdef QUAD_PRECISION
- qgemm_p = 29 + 14 * (size >> 8);
- xgemm_p = 15 + 14 * (size >> 9);
- #endif
- #endif
-
- #if defined(ATOM)
- size >>= 8;
-
- sgemm_p = 256;
- dgemm_p = 128;
- cgemm_p = 128;
- zgemm_p = 64;
- #ifdef EXPRECISION
- qgemm_p = 64;
- xgemm_p = 32;
- #endif
- #ifdef QUAD_PRECISION
- qgemm_p = 32;
- xgemm_p = 16;
- #endif
- #endif
-
- #if defined(CORE_BARCELONA) || defined(CORE_BOBCAT)
- size >>= 8;
-
- sgemm_p = 232 * size;
- dgemm_p = 116 * size;
- cgemm_p = 116 * size;
- zgemm_p = 58 * size;
- #ifdef EXPRECISION
- qgemm_p = 58 * size;
- xgemm_p = 26 * size;
- #endif
- #ifdef QUAD_PRECISION
- qgemm_p = 26 * size;
- xgemm_p = 13 * size;
- #endif
- #endif
-
- factor=openblas_block_factor();
- if (factor>0) {
- if (factor < 10) factor = 10;
- if (factor > 200) factor = 200;
-
- sgemm_p = ((long)((double)sgemm_p * (double)factor * 1.e-2)) & ~7L;
- dgemm_p = ((long)((double)dgemm_p * (double)factor * 1.e-2)) & ~7L;
- cgemm_p = ((long)((double)cgemm_p * (double)factor * 1.e-2)) & ~7L;
- zgemm_p = ((long)((double)zgemm_p * (double)factor * 1.e-2)) & ~7L;
- #ifdef EXPRECISION
- qgemm_p = ((long)((double)qgemm_p * (double)factor * 1.e-2)) & ~7L;
- xgemm_p = ((long)((double)xgemm_p * (double)factor * 1.e-2)) & ~7L;
- #endif
- }
-
- if (sgemm_p == 0) sgemm_p = 64;
- if (dgemm_p == 0) dgemm_p = 64;
- if (cgemm_p == 0) cgemm_p = 64;
- if (zgemm_p == 0) zgemm_p = 64;
- #ifdef EXPRECISION
- if (qgemm_p == 0) qgemm_p = 64;
- if (xgemm_p == 0) xgemm_p = 64;
- #endif
-
- #ifdef QUAD_PRECISION
- if (qgemm_p == 0) qgemm_p = 64;
- if (xgemm_p == 0) xgemm_p = 64;
- #endif
-
- sgemm_p = ((sgemm_p + SGEMM_UNROLL_M - 1)/SGEMM_UNROLL_M) * SGEMM_UNROLL_M;
- dgemm_p = ((dgemm_p + DGEMM_UNROLL_M - 1)/DGEMM_UNROLL_M) * DGEMM_UNROLL_M;
- cgemm_p = ((cgemm_p + CGEMM_UNROLL_M - 1)/CGEMM_UNROLL_M) * CGEMM_UNROLL_M;
- zgemm_p = ((zgemm_p + ZGEMM_UNROLL_M - 1)/ZGEMM_UNROLL_M) * ZGEMM_UNROLL_M;
- #ifdef QUAD_PRECISION
- qgemm_p = ((qgemm_p + QGEMM_UNROLL_M - 1)/QGEMM_UNROLL_M) * QGEMM_UNROLL_M;
- xgemm_p = ((xgemm_p + XGEMM_UNROLL_M - 1)/XGEMM_UNROLL_M) * XGEMM_UNROLL_M;
- #endif
-
- #ifdef BUILD_BFLOAT16
- sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15;
- #endif
- #ifdef BUILD_HFLOAT16
- shgemm_r = (((BUFFER_SIZE - ((SHGEMM_P * SHGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SHGEMM_Q * 4)) - 15) & ~15;
- #endif
- sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15;
- dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15;
- cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15;
- zgemm_r = (((BUFFER_SIZE - ((ZGEMM_P * ZGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (ZGEMM_Q * 16)) - 15) & ~15;
- #if defined(EXPRECISION) || defined(QUAD_PRECISION)
- qgemm_r = (((BUFFER_SIZE - ((QGEMM_P * QGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (QGEMM_Q * 16)) - 15) & ~15;
- xgemm_r = (((BUFFER_SIZE - ((XGEMM_P * XGEMM_Q * 32 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (XGEMM_Q * 32)) - 15) & ~15;
- #endif
-
- #if 0
- fprintf(stderr, "SGEMM ... %3d, %3d, %3d\n", SGEMM_P, SGEMM_Q, SGEMM_R);
- fprintf(stderr, "DGEMM ... %3d, %3d, %3d\n", DGEMM_P, DGEMM_Q, DGEMM_R);
- fprintf(stderr, "CGEMM ... %3d, %3d, %3d\n", CGEMM_P, CGEMM_Q, CGEMM_R);
- fprintf(stderr, "ZGEMM ... %3d, %3d, %3d\n", ZGEMM_P, ZGEMM_Q, ZGEMM_R);
- #endif
-
- return;
- }
-
- #if 0
-
/*
 * Diagnostic helper: query CPUID for the APIC id and a core count and
 * print them to stderr.  Always returns 0.
 *
 * The CPUID queries are compiled only for CORE_PRESCOTT / CORE_OPTERON
 * builds.  All counters start at 0 so that the final increment and the
 * fprintf never read an indeterminate value when neither macro is
 * defined; in that case the function reports APICID 0 and one core.
 */
int get_current_cpu_info(void){

  int nlprocs = 0;
  int ncores = 0;
  int cmplegacy = 0;
  int htt = 0;
  int apicid = 0;

#if defined(CORE_PRESCOTT) || defined(CORE_OPTERON)
  int eax, ebx, ecx, edx;

  /* Leaf 1: logical processor count, APIC id and the HTT flag. */
  cpuid(1, &eax, &ebx, &ecx, &edx);
  nlprocs = BITMASK(ebx, 16, 0xff);
  apicid  = BITMASK(ebx, 24, 0xff);
  htt     = BITMASK(edx, 28, 0x01);
#endif

#if defined(CORE_PRESCOTT)
  cpuid(4, &eax, &ebx, &ecx, &edx);
  ncores = BITMASK(eax, 26, 0x3f);

  if (htt == 0) nlprocs = 0;
#endif

#if defined(CORE_OPTERON)
  cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
  ncores = BITMASK(ecx, 0, 0xff);

  cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
  cmplegacy = BITMASK(ecx, 1, 0x01);

  if (htt == 0) {
    nlprocs = 0;
    ncores = 0;
    cmplegacy = 0;
  }
#endif

  /* Both values are gathered but not reported; keep compilers quiet. */
  (void)nlprocs;
  (void)cmplegacy;

  /* The count is printed after adding one to the raw field value. */
  ncores ++;

  fprintf(stderr, "APICID = %d Number of core = %d\n", apicid, ncores);

  return 0;
}
- #endif
-
- #endif
-
- #if defined(ARCH_IA64)
-
- static inline BLASULONG cpuid(BLASULONG regnum){
- BLASULONG value;
-
- #ifndef __ECC
- asm ("mov %0=cpuid[%r1]" : "=r"(value) : "rO"(regnum));
- #else
- value = __getIndReg(_IA64_REG_INDR_CPUID, regnum);
- #endif
-
- return value;
- }
-
- #if 1
-
- void blas_set_parameter(void){
-
- BLASULONG cpuid3, size;
-
- cpuid3 = cpuid(3);
-
- size = BITMASK(cpuid3, 16, 0xff);
-
- sbgemm_p = 192 * (size + 1);
- shgemm_p = 192 * (size + 1);
- sgemm_p = 192 * (size + 1);
- dgemm_p = 96 * (size + 1);
- cgemm_p = 96 * (size + 1);
- zgemm_p = 48 * (size + 1);
- #ifdef EXPRECISION
- qgemm_p = 64 * (size + 1);
- xgemm_p = 32 * (size + 1);
- #endif
- #ifdef QUAD_PRECISION
- qgemm_p = 32 * (size + 1);
- xgemm_p = 16 * (size + 1);
- #endif
-
- #ifdef BUILD_BFLOAT16
- sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15;
- #endif
- #ifdef BUILD_HFLOAT16
- shgemm_r = (((BUFFER_SIZE - ((SHGEMM_P * SHGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SHGEMM_Q * 4)) - 15) & ~15;
- #endif
- sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15;
- dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15;
- cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15;
- zgemm_r = (((BUFFER_SIZE - ((ZGEMM_P * ZGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (ZGEMM_Q * 16)) - 15) & ~15;
- #if defined(EXPRECISION) || defined(QUAD_PRECISION)
- qgemm_r = (((BUFFER_SIZE - ((QGEMM_P * QGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (QGEMM_Q * 16)) - 15) & ~15;
- xgemm_r = (((BUFFER_SIZE - ((XGEMM_P * XGEMM_Q * 32 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (XGEMM_Q * 32)) - 15) & ~15;
- #endif
-
- return;
- }
-
- #else
-
- #define IA64_SYS_NAME "/sys/devices/system/cpu/cpu0/cache/index3/size"
- #define IA64_PROC_NAME "/proc/pal/cpu0/cache_info"
-
- void blas_set_parameter(void){
-
- BLASULONG cpuid3;
- int size = 0;
-
- #if 1
- char buffer[128];
- FILE *infile;
-
- if ((infile = fopen(IA64_SYS_NAME, "r")) != NULL) {
-
- fgets(buffer, sizeof(buffer), infile);
- fclose(infile);
-
- size = atoi(buffer) / 1536;
- }
-
- if (size <= 0) {
- if ((infile = fopen(IA64_PROC_NAME, "r")) != NULL) {
-
- while(fgets(buffer, sizeof(buffer), infile) != NULL) {
- if ((!strncmp("Data/Instruction Cache level 3", buffer, 30))) break;
- }
-
- fgets(buffer, sizeof(buffer), infile);
-
- fclose(infile);
-
- *strstr(buffer, "bytes") = (char)NULL;
-
- size = atoi(strchr(buffer, ':') + 1) / 1572864;
- }
- }
- #endif
-
- /* The last resort */
-
- if (size <= 0) {
- cpuid3 = cpuid(3);
-
- size = BITMASK(cpuid3, 16, 0xff) + 1;
- }
-
- sgemm_p = 320 * size;
- dgemm_p = 160 * size;
- cgemm_p = 160 * size;
- zgemm_p = 80 * size;
- #ifdef EXPRECISION
- qgemm_p = 80 * size;
- xgemm_p = 40 * size;
- #endif
-
- sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15;
- dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15;
- cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15;
- zgemm_r = (((BUFFER_SIZE - ((ZGEMM_P * ZGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (ZGEMM_Q * 16)) - 15) & ~15;
- #ifdef EXPRECISION
- qgemm_r = (((BUFFER_SIZE - ((QGEMM_P * QGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (QGEMM_Q * 16)) - 15) & ~15;
- xgemm_r = (((BUFFER_SIZE - ((XGEMM_P * XGEMM_Q * 32 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (XGEMM_Q * 32)) - 15) & ~15;
- #endif
-
- return;
- }
-
- #endif
-
- #endif
-
- #if defined(ARCH_MIPS64)
/*
 * MIPS64: on LOONGSON3R3 / LOONGSON3R4 builds choose dgemm_r by thread
 * mode - 1024 when running single-threaded (always the case without
 * SMP), 200 when blas_num_threads is anything other than 1.  On other
 * MIPS64 targets nothing is changed.
 */
void blas_set_parameter(void){
#if defined(LOONGSON3R3) || defined(LOONGSON3R4)
#ifdef SMP
  dgemm_r = (blas_num_threads == 1) ? 1024 : 200;
#else
  dgemm_r = 1024;
#endif
#endif
}
- #endif
-
- #if defined(ARCH_LOONGARCH64)
/*
 * Return the L3 cache size in MB, as described by CPUCFG word 0x14.
 *
 * The word is decoded as
 *   bits 15..0   number of ways minus one
 *   bits 23..16  log2 of one size factor
 *   bits 30..24  log2 of a second size factor
 * and the size in bytes is ways * 2^(sum of both logs).
 * NOTE(review): field meanings (index count / line size) are taken from
 * the LoongArch CPUCFG description - confirm against the manual.
 *
 * The computation uses integer shifts only, so it needs neither libm nor
 * <math.h>.  The result is truncated to whole MB and saturates at the
 * largest positive int instead of overflowing.
 */
int get_L3_size(void) {
  int ret = 0, id = 0x14;
  unsigned int cfg, ways, log2_bytes_per_way, shift;
  unsigned long long mbytes;

  __asm__ volatile (
    "cpucfg %[ret], %[id]"
    : [ret]"=r"(ret)
    : [id]"r"(id)
    : "memory"
  );

  cfg = (unsigned int)ret;
  ways = (cfg & 0xffff) + 1;
  log2_bytes_per_way = ((cfg >> 16) & 0xff) + ((cfg >> 24) & 0x7f);

  /* Dividing by 1024 * 1024 is a right shift by 20 bits. */
  if (log2_bytes_per_way < 20) {
    return (int)(ways >> (20 - log2_bytes_per_way));
  }

  shift = log2_bytes_per_way - 20;
  if (shift > 30) {
    return 0x7fffffff;
  }

  /* ways <= 2^16 and shift <= 30, so this fits in 64 bits. */
  mbytes = (unsigned long long)ways << shift;

  return (mbytes > 0x7fffffffULL) ? 0x7fffffff : (int)mbytes; // MB
}
-
/*
 * LoongArch64: fixed blocking parameters for LA464 cores; other cores are
 * left untouched.
 *
 * The P and Q values are the same in every configuration except dgemm_q,
 * which depends on whether get_L3_size() reports 32 (3C5000 and 3D5000)
 * or something else (3A5000 and 3C5000L).  The R values additionally
 * depend on the thread mode: the single-thread set is used without SMP
 * or when blas_num_threads is 1, the multi-thread set otherwise.
 */
void blas_set_parameter(void){
#if defined(LA464)
  const int l3_is_32 = (get_L3_size() == 32);
  int single_thread = 1;

#ifdef SMP
  single_thread = (blas_num_threads == 1);
#endif

  /* Values shared by the single- and multi-thread configurations. */
  sgemm_p = 256;
  sgemm_q = 384;

  dgemm_p = 112;
  dgemm_q = l3_is_32 ? 289 : 300;

  cgemm_p = 128;
  cgemm_q = 256;

  zgemm_p = 128;
  zgemm_q = 128;

  if (single_thread) {
    sgemm_r = l3_is_32 ? 8192 : 4096;
    dgemm_r = l3_is_32 ? 4096 : 3024;
    cgemm_r = l3_is_32 ? 4096 : 2048;
    zgemm_r = l3_is_32 ? 2048 : 1024;
  } else {
    sgemm_r = l3_is_32 ? 1024 : 2048;
    dgemm_r = l3_is_32 ?  342 :  738;
    cgemm_r = l3_is_32 ?  512 : 1024;
    zgemm_r = l3_is_32 ?  512 : 1024;
  }
#endif
}
- #endif
-
- #if defined(ARCH_ARM64)
-
/* ARM64: this function changes none of the blocking parameters. */
void blas_set_parameter(void) {
  /* intentionally empty */
}
-
- #endif
|