You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

setparam-ref.c 34 kB

11 years ago
11 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #include <stdio.h>
  39. #include <string.h>
  40. #include "common.h"
  41. #ifdef BUILD_KERNEL
  42. #include "kernelTS.h"
  43. #endif
  44. #undef DEBUG
  45. static void init_parameter(void);
  /*
   * Statically initialised dispatch/parameter table of type gotoblas_t.
   *
   * This is a purely positional initializer: every value below is matched to
   * a struct member by its position, so the order of entries (and the shape
   * of every #if/#else branch, each of which must contribute the same number
   * of entries) has to track the gotoblas_t declaration, which is not visible
   * in this file.
   *
   * Recurring pattern: wherever *GEMM_DEFAULT_UNROLL_M differs from
   * *GEMM_DEFAULT_UNROLL_N the "i*" copy routines are listed first, otherwise
   * the "o*" routines are listed in their place; the "o*" routines are then
   * always listed again unconditionally.
   *
   * NOTE(review): the per-section labels below are inferred from the
   * identifier prefixes (s/d/q/c/z/x) only -- confirm against gotoblas_t.
   */
  46. gotoblas_t TABLE_NAME = {
  47. DTB_DEFAULT_ENTRIES ,
  48. GEMM_DEFAULT_OFFSET_A, GEMM_DEFAULT_OFFSET_B, GEMM_DEFAULT_ALIGN,
  /* s* section.  The three zeros are placeholders; init_parameter() later
     assigns TABLE_NAME.sgemm_p / sgemm_q, so presumably these are the
     run-time blocking fields -- TODO confirm. */
  49. 0, 0, 0,
  50. SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N,
  51. #ifdef SGEMM_DEFAULT_UNROLL_MN
  52. SGEMM_DEFAULT_UNROLL_MN,
  53. #else
  54. MAX(SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N),
  55. #endif
  /* Flag entry: 1 when HAVE_EXCLUSIVE_CACHE is defined, 0 otherwise. */
  56. #ifdef HAVE_EXCLUSIVE_CACHE
  57. 1,
  58. #else
  59. 0,
  60. #endif
  61. samax_kTS, samin_kTS, smax_kTS, smin_kTS,
  62. isamax_kTS, isamin_kTS, ismax_kTS, ismin_kTS,
  63. snrm2_kTS, sasum_kTS, scopy_kTS, sdot_kTS,
  64. dsdot_kTS,
  65. srot_kTS, saxpy_kTS, sscal_kTS, sswap_kTS,
  66. sgemv_nTS, sgemv_tTS, sger_kTS,
  67. ssymv_LTS, ssymv_UTS,
  68. sgemm_kernelTS, sgemm_betaTS,
  69. #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N
  70. sgemm_incopyTS, sgemm_itcopyTS,
  71. #else
  72. sgemm_oncopyTS, sgemm_otcopyTS,
  73. #endif
  74. sgemm_oncopyTS, sgemm_otcopyTS,
  75. strsm_kernel_LNTS, strsm_kernel_LTTS, strsm_kernel_RNTS, strsm_kernel_RTTS,
  76. #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N
  77. strsm_iunucopyTS, strsm_iunncopyTS, strsm_iutucopyTS, strsm_iutncopyTS,
  78. strsm_ilnucopyTS, strsm_ilnncopyTS, strsm_iltucopyTS, strsm_iltncopyTS,
  79. #else
  80. strsm_ounucopyTS, strsm_ounncopyTS, strsm_outucopyTS, strsm_outncopyTS,
  81. strsm_olnucopyTS, strsm_olnncopyTS, strsm_oltucopyTS, strsm_oltncopyTS,
  82. #endif
  83. strsm_ounucopyTS, strsm_ounncopyTS, strsm_outucopyTS, strsm_outncopyTS,
  84. strsm_olnucopyTS, strsm_olnncopyTS, strsm_oltucopyTS, strsm_oltncopyTS,
  85. strmm_kernel_RNTS, strmm_kernel_RTTS, strmm_kernel_LNTS, strmm_kernel_LTTS,
  86. #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N
  87. strmm_iunucopyTS, strmm_iunncopyTS, strmm_iutucopyTS, strmm_iutncopyTS,
  88. strmm_ilnucopyTS, strmm_ilnncopyTS, strmm_iltucopyTS, strmm_iltncopyTS,
  89. #else
  90. strmm_ounucopyTS, strmm_ounncopyTS, strmm_outucopyTS, strmm_outncopyTS,
  91. strmm_olnucopyTS, strmm_olnncopyTS, strmm_oltucopyTS, strmm_oltncopyTS,
  92. #endif
  93. strmm_ounucopyTS, strmm_ounncopyTS, strmm_outucopyTS, strmm_outncopyTS,
  94. strmm_olnucopyTS, strmm_olnncopyTS, strmm_oltucopyTS, strmm_oltncopyTS,
  95. #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N
  96. ssymm_iutcopyTS, ssymm_iltcopyTS,
  97. #else
  98. ssymm_outcopyTS, ssymm_oltcopyTS,
  99. #endif
  100. ssymm_outcopyTS, ssymm_oltcopyTS,
  /* The two LAPACK helper slots become NULL when NO_LAPACK is defined. */
  101. #ifndef NO_LAPACK
  102. sneg_tcopyTS, slaswp_ncopyTS,
  103. #else
  104. NULL,NULL,
  105. #endif
  /* d* section; same layout as the s* section except that there is no
     exclusive-cache flag and no counterpart of the dsdot entry. */
  106. 0, 0, 0,
  107. DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N,
  108. #ifdef DGEMM_DEFAULT_UNROLL_MN
  109. DGEMM_DEFAULT_UNROLL_MN,
  110. #else
  111. MAX(DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N),
  112. #endif
  113. damax_kTS, damin_kTS, dmax_kTS, dmin_kTS,
  114. idamax_kTS, idamin_kTS, idmax_kTS, idmin_kTS,
  115. dnrm2_kTS, dasum_kTS, dcopy_kTS, ddot_kTS,
  116. drot_kTS, daxpy_kTS, dscal_kTS, dswap_kTS,
  117. dgemv_nTS, dgemv_tTS, dger_kTS,
  118. dsymv_LTS, dsymv_UTS,
  119. dgemm_kernelTS, dgemm_betaTS,
  120. #if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N
  121. dgemm_incopyTS, dgemm_itcopyTS,
  122. #else
  123. dgemm_oncopyTS, dgemm_otcopyTS,
  124. #endif
  125. dgemm_oncopyTS, dgemm_otcopyTS,
  126. dtrsm_kernel_LNTS, dtrsm_kernel_LTTS, dtrsm_kernel_RNTS, dtrsm_kernel_RTTS,
  127. #if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N
  128. dtrsm_iunucopyTS, dtrsm_iunncopyTS, dtrsm_iutucopyTS, dtrsm_iutncopyTS,
  129. dtrsm_ilnucopyTS, dtrsm_ilnncopyTS, dtrsm_iltucopyTS, dtrsm_iltncopyTS,
  130. #else
  131. dtrsm_ounucopyTS, dtrsm_ounncopyTS, dtrsm_outucopyTS, dtrsm_outncopyTS,
  132. dtrsm_olnucopyTS, dtrsm_olnncopyTS, dtrsm_oltucopyTS, dtrsm_oltncopyTS,
  133. #endif
  134. dtrsm_ounucopyTS, dtrsm_ounncopyTS, dtrsm_outucopyTS, dtrsm_outncopyTS,
  135. dtrsm_olnucopyTS, dtrsm_olnncopyTS, dtrsm_oltucopyTS, dtrsm_oltncopyTS,
  136. dtrmm_kernel_RNTS, dtrmm_kernel_RTTS, dtrmm_kernel_LNTS, dtrmm_kernel_LTTS,
  137. #if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N
  138. dtrmm_iunucopyTS, dtrmm_iunncopyTS, dtrmm_iutucopyTS, dtrmm_iutncopyTS,
  139. dtrmm_ilnucopyTS, dtrmm_ilnncopyTS, dtrmm_iltucopyTS, dtrmm_iltncopyTS,
  140. #else
  141. dtrmm_ounucopyTS, dtrmm_ounncopyTS, dtrmm_outucopyTS, dtrmm_outncopyTS,
  142. dtrmm_olnucopyTS, dtrmm_olnncopyTS, dtrmm_oltucopyTS, dtrmm_oltncopyTS,
  143. #endif
  144. dtrmm_ounucopyTS, dtrmm_ounncopyTS, dtrmm_outucopyTS, dtrmm_outncopyTS,
  145. dtrmm_olnucopyTS, dtrmm_olnncopyTS, dtrmm_oltucopyTS, dtrmm_oltncopyTS,
  146. #if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N
  147. dsymm_iutcopyTS, dsymm_iltcopyTS,
  148. #else
  149. dsymm_outcopyTS, dsymm_oltcopyTS,
  150. #endif
  151. dsymm_outcopyTS, dsymm_oltcopyTS,
  152. #ifndef NO_LAPACK
  153. dneg_tcopyTS, dlaswp_ncopyTS,
  154. #else
  155. NULL, NULL,
  156. #endif
  /* q* section; present only when EXPRECISION is defined. */
  157. #ifdef EXPRECISION
  158. 0, 0, 0,
  159. QGEMM_DEFAULT_UNROLL_M, QGEMM_DEFAULT_UNROLL_N, MAX(QGEMM_DEFAULT_UNROLL_M, QGEMM_DEFAULT_UNROLL_N),
  160. qamax_kTS, qamin_kTS, qmax_kTS, qmin_kTS,
  161. iqamax_kTS, iqamin_kTS, iqmax_kTS, iqmin_kTS,
  162. qnrm2_kTS, qasum_kTS, qcopy_kTS, qdot_kTS,
  163. qrot_kTS, qaxpy_kTS, qscal_kTS, qswap_kTS,
  164. qgemv_nTS, qgemv_tTS, qger_kTS,
  165. qsymv_LTS, qsymv_UTS,
  166. qgemm_kernelTS, qgemm_betaTS,
  167. #if QGEMM_DEFAULT_UNROLL_M != QGEMM_DEFAULT_UNROLL_N
  168. qgemm_incopyTS, qgemm_itcopyTS,
  169. #else
  170. qgemm_oncopyTS, qgemm_otcopyTS,
  171. #endif
  172. qgemm_oncopyTS, qgemm_otcopyTS,
  173. qtrsm_kernel_LNTS, qtrsm_kernel_LTTS, qtrsm_kernel_RNTS, qtrsm_kernel_RTTS,
  174. #if QGEMM_DEFAULT_UNROLL_M != QGEMM_DEFAULT_UNROLL_N
  175. qtrsm_iunucopyTS, qtrsm_iunncopyTS, qtrsm_iutucopyTS, qtrsm_iutncopyTS,
  176. qtrsm_ilnucopyTS, qtrsm_ilnncopyTS, qtrsm_iltucopyTS, qtrsm_iltncopyTS,
  177. #else
  178. qtrsm_ounucopyTS, qtrsm_ounncopyTS, qtrsm_outucopyTS, qtrsm_outncopyTS,
  179. qtrsm_olnucopyTS, qtrsm_olnncopyTS, qtrsm_oltucopyTS, qtrsm_oltncopyTS,
  180. #endif
  181. qtrsm_ounucopyTS, qtrsm_ounncopyTS, qtrsm_outucopyTS, qtrsm_outncopyTS,
  182. qtrsm_olnucopyTS, qtrsm_olnncopyTS, qtrsm_oltucopyTS, qtrsm_oltncopyTS,
  183. qtrmm_kernel_RNTS, qtrmm_kernel_RTTS, qtrmm_kernel_LNTS, qtrmm_kernel_LTTS,
  184. #if QGEMM_DEFAULT_UNROLL_M != QGEMM_DEFAULT_UNROLL_N
  185. qtrmm_iunucopyTS, qtrmm_iunncopyTS, qtrmm_iutucopyTS, qtrmm_iutncopyTS,
  186. qtrmm_ilnucopyTS, qtrmm_ilnncopyTS, qtrmm_iltucopyTS, qtrmm_iltncopyTS,
  187. #else
  188. qtrmm_ounucopyTS, qtrmm_ounncopyTS, qtrmm_outucopyTS, qtrmm_outncopyTS,
  189. qtrmm_olnucopyTS, qtrmm_olnncopyTS, qtrmm_oltucopyTS, qtrmm_oltncopyTS,
  190. #endif
  191. qtrmm_ounucopyTS, qtrmm_ounncopyTS, qtrmm_outucopyTS, qtrmm_outncopyTS,
  192. qtrmm_olnucopyTS, qtrmm_olnncopyTS, qtrmm_oltucopyTS, qtrmm_oltncopyTS,
  193. #if QGEMM_DEFAULT_UNROLL_M != QGEMM_DEFAULT_UNROLL_N
  194. qsymm_iutcopyTS, qsymm_iltcopyTS,
  195. #else
  196. qsymm_outcopyTS, qsymm_oltcopyTS,
  197. #endif
  198. qsymm_outcopyTS, qsymm_oltcopyTS,
  199. #ifndef NO_LAPACK
  200. qneg_tcopyTS, qlaswp_ncopyTS,
  201. #else
  202. NULL, NULL,
  203. #endif
  204. #endif
  /* c* section. */
  205. 0, 0, 0,
  206. CGEMM_DEFAULT_UNROLL_M, CGEMM_DEFAULT_UNROLL_N,
  207. #ifdef CGEMM_DEFAULT_UNROLL_MN
  208. CGEMM_DEFAULT_UNROLL_MN,
  209. #else
  210. MAX(CGEMM_DEFAULT_UNROLL_M, CGEMM_DEFAULT_UNROLL_N),
  211. #endif
  212. camax_kTS, camin_kTS, icamax_kTS, icamin_kTS,
  213. cnrm2_kTS, casum_kTS, ccopy_kTS,
  214. cdotu_kTS, cdotc_kTS, csrot_kTS,
  215. caxpy_kTS, caxpyc_kTS, cscal_kTS, cswap_kTS,
  216. cgemv_nTS, cgemv_tTS, cgemv_rTS, cgemv_cTS,
  217. cgemv_oTS, cgemv_uTS, cgemv_sTS, cgemv_dTS,
  218. cgeru_kTS, cgerc_kTS, cgerv_kTS, cgerd_kTS,
  219. csymv_LTS, csymv_UTS,
  220. chemv_LTS, chemv_UTS, chemv_MTS, chemv_VTS,
  221. cgemm_kernel_nTS, cgemm_kernel_lTS, cgemm_kernel_rTS, cgemm_kernel_bTS,
  222. cgemm_betaTS,
  223. #if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N
  224. cgemm_incopyTS, cgemm_itcopyTS,
  225. #else
  226. cgemm_oncopyTS, cgemm_otcopyTS,
  227. #endif
  228. cgemm_oncopyTS, cgemm_otcopyTS,
  229. ctrsm_kernel_LNTS, ctrsm_kernel_LTTS, ctrsm_kernel_LRTS, ctrsm_kernel_LCTS,
  230. ctrsm_kernel_RNTS, ctrsm_kernel_RTTS, ctrsm_kernel_RRTS, ctrsm_kernel_RCTS,
  231. #if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N
  232. ctrsm_iunucopyTS, ctrsm_iunncopyTS, ctrsm_iutucopyTS, ctrsm_iutncopyTS,
  233. ctrsm_ilnucopyTS, ctrsm_ilnncopyTS, ctrsm_iltucopyTS, ctrsm_iltncopyTS,
  234. #else
  235. ctrsm_ounucopyTS, ctrsm_ounncopyTS, ctrsm_outucopyTS, ctrsm_outncopyTS,
  236. ctrsm_olnucopyTS, ctrsm_olnncopyTS, ctrsm_oltucopyTS, ctrsm_oltncopyTS,
  237. #endif
  238. ctrsm_ounucopyTS, ctrsm_ounncopyTS, ctrsm_outucopyTS, ctrsm_outncopyTS,
  239. ctrsm_olnucopyTS, ctrsm_olnncopyTS, ctrsm_oltucopyTS, ctrsm_oltncopyTS,
  240. ctrmm_kernel_RNTS, ctrmm_kernel_RTTS, ctrmm_kernel_RRTS, ctrmm_kernel_RCTS,
  241. ctrmm_kernel_LNTS, ctrmm_kernel_LTTS, ctrmm_kernel_LRTS, ctrmm_kernel_LCTS,
  242. #if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N
  243. ctrmm_iunucopyTS, ctrmm_iunncopyTS, ctrmm_iutucopyTS, ctrmm_iutncopyTS,
  244. ctrmm_ilnucopyTS, ctrmm_ilnncopyTS, ctrmm_iltucopyTS, ctrmm_iltncopyTS,
  245. #else
  246. ctrmm_ounucopyTS, ctrmm_ounncopyTS, ctrmm_outucopyTS, ctrmm_outncopyTS,
  247. ctrmm_olnucopyTS, ctrmm_olnncopyTS, ctrmm_oltucopyTS, ctrmm_oltncopyTS,
  248. #endif
  249. ctrmm_ounucopyTS, ctrmm_ounncopyTS, ctrmm_outucopyTS, ctrmm_outncopyTS,
  250. ctrmm_olnucopyTS, ctrmm_olnncopyTS, ctrmm_oltucopyTS, ctrmm_oltncopyTS,
  251. #if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N
  252. csymm_iutcopyTS, csymm_iltcopyTS,
  253. #else
  254. csymm_outcopyTS, csymm_oltcopyTS,
  255. #endif
  256. csymm_outcopyTS, csymm_oltcopyTS,
  257. #if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N
  258. chemm_iutcopyTS, chemm_iltcopyTS,
  259. #else
  260. chemm_outcopyTS, chemm_oltcopyTS,
  261. #endif
  262. chemm_outcopyTS, chemm_oltcopyTS,
  /* cgemm3m section; falls back to the SGEMM unroll values when
     CGEMM3M_DEFAULT_UNROLL_M is not defined. */
  263. 0, 0, 0,
  264. #ifdef CGEMM3M_DEFAULT_UNROLL_M
  265. CGEMM3M_DEFAULT_UNROLL_M, CGEMM3M_DEFAULT_UNROLL_N, MAX(CGEMM3M_DEFAULT_UNROLL_M, CGEMM3M_DEFAULT_UNROLL_N),
  266. #else
  267. SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N, MAX(SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N),
  268. #endif
  269. cgemm3m_kernelTS,
  270. cgemm3m_incopybTS, cgemm3m_incopyrTS,
  271. cgemm3m_incopyiTS, cgemm3m_itcopybTS,
  272. cgemm3m_itcopyrTS, cgemm3m_itcopyiTS,
  273. cgemm3m_oncopybTS, cgemm3m_oncopyrTS,
  274. cgemm3m_oncopyiTS, cgemm3m_otcopybTS,
  275. cgemm3m_otcopyrTS, cgemm3m_otcopyiTS,
  276. csymm3m_iucopybTS, csymm3m_ilcopybTS,
  277. csymm3m_iucopyrTS, csymm3m_ilcopyrTS,
  278. csymm3m_iucopyiTS, csymm3m_ilcopyiTS,
  279. csymm3m_oucopybTS, csymm3m_olcopybTS,
  280. csymm3m_oucopyrTS, csymm3m_olcopyrTS,
  281. csymm3m_oucopyiTS, csymm3m_olcopyiTS,
  282. chemm3m_iucopybTS, chemm3m_ilcopybTS,
  283. chemm3m_iucopyrTS, chemm3m_ilcopyrTS,
  284. chemm3m_iucopyiTS, chemm3m_ilcopyiTS,
  285. chemm3m_oucopybTS, chemm3m_olcopybTS,
  286. chemm3m_oucopyrTS, chemm3m_olcopyrTS,
  287. chemm3m_oucopyiTS, chemm3m_olcopyiTS,
  288. #ifndef NO_LAPACK
  289. cneg_tcopyTS, claswp_ncopyTS,
  290. #else
  291. NULL, NULL,
  292. #endif
  /* z* section. */
  293. 0, 0, 0,
  294. ZGEMM_DEFAULT_UNROLL_M, ZGEMM_DEFAULT_UNROLL_N,
  295. #ifdef ZGEMM_DEFAULT_UNROLL_MN
  296. ZGEMM_DEFAULT_UNROLL_MN,
  297. #else
  298. MAX(ZGEMM_DEFAULT_UNROLL_M, ZGEMM_DEFAULT_UNROLL_N),
  299. #endif
  300. zamax_kTS, zamin_kTS, izamax_kTS, izamin_kTS,
  301. znrm2_kTS, zasum_kTS, zcopy_kTS,
  302. zdotu_kTS, zdotc_kTS, zdrot_kTS,
  303. zaxpy_kTS, zaxpyc_kTS, zscal_kTS, zswap_kTS,
  304. zgemv_nTS, zgemv_tTS, zgemv_rTS, zgemv_cTS,
  305. zgemv_oTS, zgemv_uTS, zgemv_sTS, zgemv_dTS,
  306. zgeru_kTS, zgerc_kTS, zgerv_kTS, zgerd_kTS,
  307. zsymv_LTS, zsymv_UTS,
  308. zhemv_LTS, zhemv_UTS, zhemv_MTS, zhemv_VTS,
  309. zgemm_kernel_nTS, zgemm_kernel_lTS, zgemm_kernel_rTS, zgemm_kernel_bTS,
  310. zgemm_betaTS,
  311. #if ZGEMM_DEFAULT_UNROLL_M != ZGEMM_DEFAULT_UNROLL_N
  312. zgemm_incopyTS, zgemm_itcopyTS,
  313. #else
  314. zgemm_oncopyTS, zgemm_otcopyTS,
  315. #endif
  316. zgemm_oncopyTS, zgemm_otcopyTS,
  317. ztrsm_kernel_LNTS, ztrsm_kernel_LTTS, ztrsm_kernel_LRTS, ztrsm_kernel_LCTS,
  318. ztrsm_kernel_RNTS, ztrsm_kernel_RTTS, ztrsm_kernel_RRTS, ztrsm_kernel_RCTS,
  319. #if ZGEMM_DEFAULT_UNROLL_M != ZGEMM_DEFAULT_UNROLL_N
  320. ztrsm_iunucopyTS, ztrsm_iunncopyTS, ztrsm_iutucopyTS, ztrsm_iutncopyTS,
  321. ztrsm_ilnucopyTS, ztrsm_ilnncopyTS, ztrsm_iltucopyTS, ztrsm_iltncopyTS,
  322. #else
  323. ztrsm_ounucopyTS, ztrsm_ounncopyTS, ztrsm_outucopyTS, ztrsm_outncopyTS,
  324. ztrsm_olnucopyTS, ztrsm_olnncopyTS, ztrsm_oltucopyTS, ztrsm_oltncopyTS,
  325. #endif
  326. ztrsm_ounucopyTS, ztrsm_ounncopyTS, ztrsm_outucopyTS, ztrsm_outncopyTS,
  327. ztrsm_olnucopyTS, ztrsm_olnncopyTS, ztrsm_oltucopyTS, ztrsm_oltncopyTS,
  328. ztrmm_kernel_RNTS, ztrmm_kernel_RTTS, ztrmm_kernel_RRTS, ztrmm_kernel_RCTS,
  329. ztrmm_kernel_LNTS, ztrmm_kernel_LTTS, ztrmm_kernel_LRTS, ztrmm_kernel_LCTS,
  330. #if ZGEMM_DEFAULT_UNROLL_M != ZGEMM_DEFAULT_UNROLL_N
  331. ztrmm_iunucopyTS, ztrmm_iunncopyTS, ztrmm_iutucopyTS, ztrmm_iutncopyTS,
  332. ztrmm_ilnucopyTS, ztrmm_ilnncopyTS, ztrmm_iltucopyTS, ztrmm_iltncopyTS,
  333. #else
  334. ztrmm_ounucopyTS, ztrmm_ounncopyTS, ztrmm_outucopyTS, ztrmm_outncopyTS,
  335. ztrmm_olnucopyTS, ztrmm_olnncopyTS, ztrmm_oltucopyTS, ztrmm_oltncopyTS,
  336. #endif
  337. ztrmm_ounucopyTS, ztrmm_ounncopyTS, ztrmm_outucopyTS, ztrmm_outncopyTS,
  338. ztrmm_olnucopyTS, ztrmm_olnncopyTS, ztrmm_oltucopyTS, ztrmm_oltncopyTS,
  339. #if ZGEMM_DEFAULT_UNROLL_M != ZGEMM_DEFAULT_UNROLL_N
  340. zsymm_iutcopyTS, zsymm_iltcopyTS,
  341. #else
  342. zsymm_outcopyTS, zsymm_oltcopyTS,
  343. #endif
  344. zsymm_outcopyTS, zsymm_oltcopyTS,
  345. #if ZGEMM_DEFAULT_UNROLL_M != ZGEMM_DEFAULT_UNROLL_N
  346. zhemm_iutcopyTS, zhemm_iltcopyTS,
  347. #else
  348. zhemm_outcopyTS, zhemm_oltcopyTS,
  349. #endif
  350. zhemm_outcopyTS, zhemm_oltcopyTS,
  /* zgemm3m section; falls back to the DGEMM unroll values when
     ZGEMM3M_DEFAULT_UNROLL_M is not defined. */
  351. 0, 0, 0,
  352. #ifdef ZGEMM3M_DEFAULT_UNROLL_M
  353. ZGEMM3M_DEFAULT_UNROLL_M, ZGEMM3M_DEFAULT_UNROLL_N, MAX(ZGEMM3M_DEFAULT_UNROLL_M, ZGEMM3M_DEFAULT_UNROLL_N),
  354. #else
  355. DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N, MAX(DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N),
  356. #endif
  357. zgemm3m_kernelTS,
  358. zgemm3m_incopybTS, zgemm3m_incopyrTS,
  359. zgemm3m_incopyiTS, zgemm3m_itcopybTS,
  360. zgemm3m_itcopyrTS, zgemm3m_itcopyiTS,
  361. zgemm3m_oncopybTS, zgemm3m_oncopyrTS,
  362. zgemm3m_oncopyiTS, zgemm3m_otcopybTS,
  363. zgemm3m_otcopyrTS, zgemm3m_otcopyiTS,
  364. zsymm3m_iucopybTS, zsymm3m_ilcopybTS,
  365. zsymm3m_iucopyrTS, zsymm3m_ilcopyrTS,
  366. zsymm3m_iucopyiTS, zsymm3m_ilcopyiTS,
  367. zsymm3m_oucopybTS, zsymm3m_olcopybTS,
  368. zsymm3m_oucopyrTS, zsymm3m_olcopyrTS,
  369. zsymm3m_oucopyiTS, zsymm3m_olcopyiTS,
  370. zhemm3m_iucopybTS, zhemm3m_ilcopybTS,
  371. zhemm3m_iucopyrTS, zhemm3m_ilcopyrTS,
  372. zhemm3m_iucopyiTS, zhemm3m_ilcopyiTS,
  373. zhemm3m_oucopybTS, zhemm3m_olcopybTS,
  374. zhemm3m_oucopyrTS, zhemm3m_olcopyrTS,
  375. zhemm3m_oucopyiTS, zhemm3m_olcopyiTS,
  376. #ifndef NO_LAPACK
  377. zneg_tcopyTS, zlaswp_ncopyTS,
  378. #else
  379. NULL, NULL,
  380. #endif
  /* x* and xgemm3m sections; present only when EXPRECISION is defined.
     The xgemm3m unroll entries reuse the QGEMM defaults. */
  381. #ifdef EXPRECISION
  382. 0, 0, 0,
  383. XGEMM_DEFAULT_UNROLL_M, XGEMM_DEFAULT_UNROLL_N, MAX(XGEMM_DEFAULT_UNROLL_M, XGEMM_DEFAULT_UNROLL_N),
  384. xamax_kTS, xamin_kTS, ixamax_kTS, ixamin_kTS,
  385. xnrm2_kTS, xasum_kTS, xcopy_kTS,
  386. xdotu_kTS, xdotc_kTS, xqrot_kTS,
  387. xaxpy_kTS, xaxpyc_kTS, xscal_kTS, xswap_kTS,
  388. xgemv_nTS, xgemv_tTS, xgemv_rTS, xgemv_cTS,
  389. xgemv_oTS, xgemv_uTS, xgemv_sTS, xgemv_dTS,
  390. xgeru_kTS, xgerc_kTS, xgerv_kTS, xgerd_kTS,
  391. xsymv_LTS, xsymv_UTS,
  392. xhemv_LTS, xhemv_UTS, xhemv_MTS, xhemv_VTS,
  393. xgemm_kernel_nTS, xgemm_kernel_lTS, xgemm_kernel_rTS, xgemm_kernel_bTS,
  394. xgemm_betaTS,
  395. #if XGEMM_DEFAULT_UNROLL_M != XGEMM_DEFAULT_UNROLL_N
  396. xgemm_incopyTS, xgemm_itcopyTS,
  397. #else
  398. xgemm_oncopyTS, xgemm_otcopyTS,
  399. #endif
  400. xgemm_oncopyTS, xgemm_otcopyTS,
  401. xtrsm_kernel_LNTS, xtrsm_kernel_LTTS, xtrsm_kernel_LRTS, xtrsm_kernel_LCTS,
  402. xtrsm_kernel_RNTS, xtrsm_kernel_RTTS, xtrsm_kernel_RRTS, xtrsm_kernel_RCTS,
  403. #if XGEMM_DEFAULT_UNROLL_M != XGEMM_DEFAULT_UNROLL_N
  404. xtrsm_iunucopyTS, xtrsm_iunncopyTS, xtrsm_iutucopyTS, xtrsm_iutncopyTS,
  405. xtrsm_ilnucopyTS, xtrsm_ilnncopyTS, xtrsm_iltucopyTS, xtrsm_iltncopyTS,
  406. #else
  407. xtrsm_ounucopyTS, xtrsm_ounncopyTS, xtrsm_outucopyTS, xtrsm_outncopyTS,
  408. xtrsm_olnucopyTS, xtrsm_olnncopyTS, xtrsm_oltucopyTS, xtrsm_oltncopyTS,
  409. #endif
  410. xtrsm_ounucopyTS, xtrsm_ounncopyTS, xtrsm_outucopyTS, xtrsm_outncopyTS,
  411. xtrsm_olnucopyTS, xtrsm_olnncopyTS, xtrsm_oltucopyTS, xtrsm_oltncopyTS,
  412. xtrmm_kernel_RNTS, xtrmm_kernel_RTTS, xtrmm_kernel_RRTS, xtrmm_kernel_RCTS,
  413. xtrmm_kernel_LNTS, xtrmm_kernel_LTTS, xtrmm_kernel_LRTS, xtrmm_kernel_LCTS,
  414. #if XGEMM_DEFAULT_UNROLL_M != XGEMM_DEFAULT_UNROLL_N
  415. xtrmm_iunucopyTS, xtrmm_iunncopyTS, xtrmm_iutucopyTS, xtrmm_iutncopyTS,
  416. xtrmm_ilnucopyTS, xtrmm_ilnncopyTS, xtrmm_iltucopyTS, xtrmm_iltncopyTS,
  417. #else
  418. xtrmm_ounucopyTS, xtrmm_ounncopyTS, xtrmm_outucopyTS, xtrmm_outncopyTS,
  419. xtrmm_olnucopyTS, xtrmm_olnncopyTS, xtrmm_oltucopyTS, xtrmm_oltncopyTS,
  420. #endif
  421. xtrmm_ounucopyTS, xtrmm_ounncopyTS, xtrmm_outucopyTS, xtrmm_outncopyTS,
  422. xtrmm_olnucopyTS, xtrmm_olnncopyTS, xtrmm_oltucopyTS, xtrmm_oltncopyTS,
  423. #if XGEMM_DEFAULT_UNROLL_M != XGEMM_DEFAULT_UNROLL_N
  424. xsymm_iutcopyTS, xsymm_iltcopyTS,
  425. #else
  426. xsymm_outcopyTS, xsymm_oltcopyTS,
  427. #endif
  428. xsymm_outcopyTS, xsymm_oltcopyTS,
  429. #if XGEMM_DEFAULT_UNROLL_M != XGEMM_DEFAULT_UNROLL_N
  430. xhemm_iutcopyTS, xhemm_iltcopyTS,
  431. #else
  432. xhemm_outcopyTS, xhemm_oltcopyTS,
  433. #endif
  434. xhemm_outcopyTS, xhemm_oltcopyTS,
  435. 0, 0, 0,
  436. QGEMM_DEFAULT_UNROLL_M, QGEMM_DEFAULT_UNROLL_N, MAX(QGEMM_DEFAULT_UNROLL_M, QGEMM_DEFAULT_UNROLL_N),
  437. xgemm3m_kernelTS,
  438. xgemm3m_incopybTS, xgemm3m_incopyrTS,
  439. xgemm3m_incopyiTS, xgemm3m_itcopybTS,
  440. xgemm3m_itcopyrTS, xgemm3m_itcopyiTS,
  441. xgemm3m_oncopybTS, xgemm3m_oncopyrTS,
  442. xgemm3m_oncopyiTS, xgemm3m_otcopybTS,
  443. xgemm3m_otcopyrTS, xgemm3m_otcopyiTS,
  444. xsymm3m_iucopybTS, xsymm3m_ilcopybTS,
  445. xsymm3m_iucopyrTS, xsymm3m_ilcopyrTS,
  446. xsymm3m_iucopyiTS, xsymm3m_ilcopyiTS,
  447. xsymm3m_oucopybTS, xsymm3m_olcopybTS,
  448. xsymm3m_oucopyrTS, xsymm3m_olcopyrTS,
  449. xsymm3m_oucopyiTS, xsymm3m_olcopyiTS,
  450. xhemm3m_iucopybTS, xhemm3m_ilcopybTS,
  451. xhemm3m_iucopyrTS, xhemm3m_ilcopyrTS,
  452. xhemm3m_iucopyiTS, xhemm3m_ilcopyiTS,
  453. xhemm3m_oucopybTS, xhemm3m_olcopybTS,
  454. xhemm3m_oucopyrTS, xhemm3m_olcopyrTS,
  455. xhemm3m_oucopyiTS, xhemm3m_olcopyiTS,
  456. #ifndef NO_LAPACK
  457. xneg_tcopyTS, xlaswp_ncopyTS,
  458. #else
  459. NULL, NULL,
  460. #endif
  461. #endif
  /* Pointer to this file's static init_parameter(), followed by the
     *NUMOPT constants and the axpby / omatcopy / imatcopy / geadd entries. */
  462. init_parameter,
  463. SNUMOPT, DNUMOPT, QNUMOPT,
  464. saxpby_kTS, daxpby_kTS, caxpby_kTS, zaxpby_kTS,
  465. somatcopy_k_cnTS, somatcopy_k_ctTS, somatcopy_k_rnTS, somatcopy_k_rtTS,
  466. domatcopy_k_cnTS, domatcopy_k_ctTS, domatcopy_k_rnTS, domatcopy_k_rtTS,
  467. comatcopy_k_cnTS, comatcopy_k_ctTS, comatcopy_k_rnTS, comatcopy_k_rtTS,
  468. comatcopy_k_cncTS, comatcopy_k_ctcTS, comatcopy_k_rncTS, comatcopy_k_rtcTS,
  469. zomatcopy_k_cnTS, zomatcopy_k_ctTS, zomatcopy_k_rnTS, zomatcopy_k_rtTS,
  470. zomatcopy_k_cncTS, zomatcopy_k_ctcTS, zomatcopy_k_rncTS, zomatcopy_k_rtcTS,
  471. simatcopy_k_cnTS, simatcopy_k_ctTS, simatcopy_k_rnTS, simatcopy_k_rtTS,
  472. dimatcopy_k_cnTS, dimatcopy_k_ctTS, dimatcopy_k_rnTS, dimatcopy_k_rtTS,
  473. cimatcopy_k_cnTS, cimatcopy_k_ctTS, cimatcopy_k_rnTS, cimatcopy_k_rtTS,
  474. cimatcopy_k_cncTS, cimatcopy_k_ctcTS, cimatcopy_k_rncTS, cimatcopy_k_rtcTS,
  475. zimatcopy_k_cnTS, zimatcopy_k_ctTS, zimatcopy_k_rnTS, zimatcopy_k_rtTS,
  476. zimatcopy_k_cncTS, zimatcopy_k_ctcTS, zimatcopy_k_rncTS, zimatcopy_k_rtcTS,
  477. sgeadd_kTS, dgeadd_kTS, cgeadd_kTS, zgeadd_kTS
  478. };
  479. #ifdef ARCH_X86
  /*
   * Legacy L2 cache size lookup based on the CPUID leaf 2 descriptor bytes.
   *
   * Executes cpuid(2, ...) once and gathers 15 bytes: the upper three bytes
   * of EAX followed by all four bytes of EBX, ECX and EDX.  The bytes are
   * scanned in that order and the size associated with the first recognised
   * L2 descriptor is returned.
   *
   * Returns the L2 size in KB according to the descriptor table, or 0 when
   * none of the 15 bytes is a known L2 descriptor.
   *
   * NOTE(review): BITMASK(reg, shift, mask) is defined outside this file;
   * it is assumed to evaluate to (reg >> shift) & mask -- confirm.
   * NOTE(review): the leaf 2 convention also marks a register as not
   * holding descriptors when its bit 31 is set; that flag is not checked
   * here (unchanged from the original behaviour).
   */
  static int get_l2_size_old(void){
    int i, eax, ebx, ecx, edx;
    int info[15];

    cpuid(2, &eax, &ebx, &ecx, &edx);

    /* The low byte of EAX is deliberately not collected: in leaf 2 it is
       not a cache descriptor. */
    info[ 0] = BITMASK(eax,  8, 0xff);
    info[ 1] = BITMASK(eax, 16, 0xff);
    info[ 2] = BITMASK(eax, 24, 0xff);

    info[ 3] = BITMASK(ebx,  0, 0xff);
    info[ 4] = BITMASK(ebx,  8, 0xff);
    info[ 5] = BITMASK(ebx, 16, 0xff);
    info[ 6] = BITMASK(ebx, 24, 0xff);

    info[ 7] = BITMASK(ecx,  0, 0xff);
    info[ 8] = BITMASK(ecx,  8, 0xff);
    info[ 9] = BITMASK(ecx, 16, 0xff);
    info[10] = BITMASK(ecx, 24, 0xff);

    info[11] = BITMASK(edx,  0, 0xff);
    info[12] = BITMASK(edx,  8, 0xff);
    info[13] = BITMASK(edx, 16, 0xff);
    info[14] = BITMASK(edx, 24, 0xff);

    for (i = 0; i < 15; i++){

      switch (info[i]){

        /* This table is from http://www.sandpile.org/ia32/cpuid.htm */

      case 0x1a :
        return 96;

      case 0x39 :
      case 0x3b :
      case 0x41 :
      case 0x79 :
      case 0x81 :
        return 128;

      case 0x3a :
        return 192;

      case 0x21 :
      case 0x3c :
      case 0x42 :
      case 0x7a :
      case 0x7e :
      case 0x82 :
        return 256;

      case 0x3d :
        return 384;

      case 0x3e :
      case 0x43 :
      case 0x7b :
      case 0x7f :
      case 0x83 :
      case 0x86 :
        return 512;

      case 0x44 :
      case 0x78 :
      case 0x7c :
      case 0x84 :
      case 0x87 :
        return 1024;

      case 0x45 :
      case 0x7d :
      case 0x85 :
        return 2048;

      case 0x48 :
        /* Descriptor 0x48 is a 3 MB L2 cache: 3 * 1024 KB = 3072 KB
           (the previous value 3184 was a typo). */
        return 3072;

      case 0x49 :
        return 4096;

      case 0x4e :
        return 6144;

      default :
        /* Not an L2 descriptor known to this table; keep scanning. */
        break;
      }
    }

    return 0;
  }
  548. #endif
  /*
   * Reports the L2 cache size taken from CPUID leaf 0x80000006, i.e. the
   * value BITMASK(ecx, 16, 0xffff) of the returned ECX register.
   *
   * When ARCH_X86 is defined and that value is not positive, the result of
   * get_l2_size_old() is returned instead.  Without ARCH_X86 the extracted
   * value is returned as is.
   *
   * NOTE(review): the unit is presumably KB, matching the table used by
   * get_l2_size_old() -- confirm against the CPUID documentation.
   */
  static __inline__ int get_l2_size(void){
    int eax, ebx, ecx, edx;
    int size;

    cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
    size = BITMASK(ecx, 16, 0xffff);

  #ifdef ARCH_X86
    /* Extended leaf gave nothing usable: fall back to the leaf 2 lookup. */
    if (!(size > 0)) size = get_l2_size_old();
  #endif

    return size;
  }
  /*
   * Reports the L3 cache size derived from CPUID leaf 0x80000006: the field
   * BITMASK(edx, 18, 0x3fff) of the returned EDX register multiplied by 512.
   *
   * NOTE(review): the field is presumably expressed in 512 KB units, which
   * would make the result a size in KB -- confirm against the CPUID
   * documentation.
   */
  static __inline__ int get_l3_size(void){
    int reg_a, reg_b, reg_c, reg_d;

    cpuid(0x80000006, &reg_a, &reg_b, &reg_c, &reg_d);

    return BITMASK(reg_d, 18, 0x3fff) * 512;
  }
  565. static void init_parameter(void) {
  566. int l2 = get_l2_size();
  567. (void) l2; /* dirty trick to suppress unused variable warning for targets */
  568. /* where the GEMM unrolling parameters do not depend on l2 */
  569. TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q;
  570. TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q;
  571. TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q;
  572. TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q;
  573. #ifdef CGEMM3M_DEFAULT_Q
  574. TABLE_NAME.cgemm3m_q = CGEMM3M_DEFAULT_Q;
  575. #else
  576. TABLE_NAME.cgemm3m_q = SGEMM_DEFAULT_Q;
  577. #endif
  578. #ifdef ZGEMM3M_DEFAULT_Q
  579. TABLE_NAME.zgemm3m_q = ZGEMM3M_DEFAULT_Q;
  580. #else
  581. TABLE_NAME.zgemm3m_q = DGEMM_DEFAULT_Q;
  582. #endif
  583. #ifdef EXPRECISION
  584. TABLE_NAME.qgemm_q = QGEMM_DEFAULT_Q;
  585. TABLE_NAME.xgemm_q = XGEMM_DEFAULT_Q;
  586. TABLE_NAME.xgemm3m_q = QGEMM_DEFAULT_Q;
  587. #endif
  588. #if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS) || defined(CORE_YONAH) || defined(CORE_ATHLON)
  589. #ifdef DEBUG
  590. fprintf(stderr, "Katmai, Coppermine, Banias, Athlon\n");
  591. #endif
  592. TABLE_NAME.sgemm_p = 64 * (l2 >> 7);
  593. TABLE_NAME.dgemm_p = 32 * (l2 >> 7);
  594. TABLE_NAME.cgemm_p = 32 * (l2 >> 7);
  595. TABLE_NAME.zgemm_p = 16 * (l2 >> 7);
  596. #ifdef EXPRECISION
  597. TABLE_NAME.qgemm_p = 16 * (l2 >> 7);
  598. TABLE_NAME.xgemm_p = 8 * (l2 >> 7);
  599. #endif
  600. #endif
  601. #ifdef CORE_NORTHWOOD
  602. #ifdef DEBUG
  603. fprintf(stderr, "Northwood\n");
  604. #endif
  605. TABLE_NAME.sgemm_p = 96 * (l2 >> 7);
  606. TABLE_NAME.dgemm_p = 48 * (l2 >> 7);
  607. TABLE_NAME.cgemm_p = 48 * (l2 >> 7);
  608. TABLE_NAME.zgemm_p = 24 * (l2 >> 7);
  609. #ifdef EXPRECISION
  610. TABLE_NAME.qgemm_p = 24 * (l2 >> 7);
  611. TABLE_NAME.xgemm_p = 12 * (l2 >> 7);
  612. #endif
  613. #endif
  614. #ifdef ATOM
  615. #ifdef DEBUG
  616. fprintf(stderr, "Atom\n");
  617. #endif
  618. TABLE_NAME.sgemm_p = 256;
  619. TABLE_NAME.dgemm_p = 128;
  620. TABLE_NAME.cgemm_p = 128;
  621. TABLE_NAME.zgemm_p = 64;
  622. #ifdef EXPRECISION
  623. TABLE_NAME.qgemm_p = 64;
  624. TABLE_NAME.xgemm_p = 32;
  625. #endif
  626. #endif
  627. #ifdef CORE_PRESCOTT
  628. #ifdef DEBUG
  629. fprintf(stderr, "Prescott\n");
  630. #endif
  631. TABLE_NAME.sgemm_p = 56 * (l2 >> 7);
  632. TABLE_NAME.dgemm_p = 28 * (l2 >> 7);
  633. TABLE_NAME.cgemm_p = 28 * (l2 >> 7);
  634. TABLE_NAME.zgemm_p = 14 * (l2 >> 7);
  635. #ifdef EXPRECISION
  636. TABLE_NAME.qgemm_p = 14 * (l2 >> 7);
  637. TABLE_NAME.xgemm_p = 7 * (l2 >> 7);
  638. #endif
  639. #endif
  640. #ifdef CORE2
  641. #ifdef DEBUG
  642. fprintf(stderr, "Core2\n");
  643. #endif
  644. TABLE_NAME.sgemm_p = 92 * (l2 >> 9) + 8;
  645. TABLE_NAME.dgemm_p = 46 * (l2 >> 9) + 8;
  646. TABLE_NAME.cgemm_p = 46 * (l2 >> 9) + 4;
  647. TABLE_NAME.zgemm_p = 23 * (l2 >> 9) + 4;
  648. #ifdef EXPRECISION
  649. TABLE_NAME.qgemm_p = 92 * (l2 >> 9) + 8;
  650. TABLE_NAME.xgemm_p = 46 * (l2 >> 9) + 4;
  651. #endif
  652. #endif
  653. #ifdef PENRYN
  654. #ifdef DEBUG
  655. fprintf(stderr, "Penryn\n");
  656. #endif
  657. TABLE_NAME.sgemm_p = 42 * (l2 >> 9) + 8;
  658. TABLE_NAME.dgemm_p = 42 * (l2 >> 9) + 8;
  659. TABLE_NAME.cgemm_p = 21 * (l2 >> 9) + 4;
  660. TABLE_NAME.zgemm_p = 21 * (l2 >> 9) + 4;
  661. #ifdef EXPRECISION
  662. TABLE_NAME.qgemm_p = 42 * (l2 >> 9) + 8;
  663. TABLE_NAME.xgemm_p = 21 * (l2 >> 9) + 4;
  664. #endif
  665. #endif
  666. #ifdef DUNNINGTON
  667. #ifdef DEBUG
  668. fprintf(stderr, "Dunnington\n");
  669. #endif
  670. TABLE_NAME.sgemm_p = 42 * (l2 >> 9) + 8;
  671. TABLE_NAME.dgemm_p = 42 * (l2 >> 9) + 8;
  672. TABLE_NAME.cgemm_p = 21 * (l2 >> 9) + 4;
  673. TABLE_NAME.zgemm_p = 21 * (l2 >> 9) + 4;
  674. #ifdef EXPRECISION
  675. TABLE_NAME.qgemm_p = 42 * (l2 >> 9) + 8;
  676. TABLE_NAME.xgemm_p = 21 * (l2 >> 9) + 4;
  677. #endif
  678. #endif
  679. #ifdef NEHALEM
  680. #ifdef DEBUG
  681. fprintf(stderr, "Nehalem\n");
  682. #endif
  683. TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
  684. TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
  685. TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
  686. TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
  687. #ifdef EXPRECISION
  688. TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
  689. TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
  690. #endif
  691. #endif
  692. #ifdef SANDYBRIDGE
  693. #ifdef DEBUG
  694. fprintf(stderr, "Sandybridge\n");
  695. #endif
  696. TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
  697. TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
  698. TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
  699. TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
  700. #ifdef EXPRECISION
  701. TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
  702. TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
  703. #endif
  704. #endif
  705. #ifdef HASWELL
  706. #ifdef DEBUG
  707. fprintf(stderr, "Haswell\n");
  708. #endif
  709. TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
  710. TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
  711. TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
  712. TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
  713. #ifdef EXPRECISION
  714. TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
  715. TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
  716. #endif
  717. #endif
  718. #ifdef OPTERON
  719. #ifdef DEBUG
  720. fprintf(stderr, "Opteron\n");
  721. #endif
  722. TABLE_NAME.sgemm_p = 224 + 56 * (l2 >> 7);
  723. TABLE_NAME.dgemm_p = 112 + 28 * (l2 >> 7);
  724. TABLE_NAME.cgemm_p = 112 + 28 * (l2 >> 7);
  725. TABLE_NAME.zgemm_p = 56 + 14 * (l2 >> 7);
  726. #ifdef EXPRECISION
  727. TABLE_NAME.qgemm_p = 56 + 14 * (l2 >> 7);
  728. TABLE_NAME.xgemm_p = 28 + 7 * (l2 >> 7);
  729. #endif
  730. #endif
  731. #ifdef BARCELONA
  732. #ifdef DEBUG
  733. fprintf(stderr, "Barcelona\n");
  734. #endif
  735. TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
  736. TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
  737. TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
  738. TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
  739. #ifdef EXPRECISION
  740. TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
  741. TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
  742. #endif
  743. #endif
  744. #ifdef BOBCAT
  745. #ifdef DEBUG
  746. fprintf(stderr, "Bobcate\n");
  747. #endif
  748. TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
  749. TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
  750. TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
  751. TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
  752. #ifdef EXPRECISION
  753. TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
  754. TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
  755. #endif
  756. #endif
  757. #ifdef BULLDOZER
  758. #ifdef DEBUG
  759. fprintf(stderr, "Bulldozer\n");
  760. #endif
  761. TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
  762. TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
  763. TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
  764. TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
  765. #ifdef EXPRECISION
  766. TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
  767. TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
  768. #endif
  769. #endif
  770. #ifdef EXCAVATOR
  771. #ifdef DEBUG
  772. fprintf(stderr, "Excavator\n");
  773. #endif
  774. TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
  775. TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
  776. TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
  777. TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
  778. #ifdef EXPRECISION
  779. TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
  780. TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
  781. #endif
  782. #endif
  783. #ifdef PILEDRIVER
  784. #ifdef DEBUG
  785. fprintf(stderr, "Piledriver\n");
  786. #endif
  787. TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
  788. TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
  789. TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
  790. TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
  791. #ifdef EXPRECISION
  792. TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
  793. TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
  794. #endif
  795. #endif
  796. #ifdef STEAMROLLER
  797. #ifdef DEBUG
  798. fprintf(stderr, "Steamroller\n");
  799. #endif
  800. TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
  801. TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
  802. TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
  803. TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
  804. #ifdef EXPRECISION
  805. TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
  806. TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
  807. #endif
  808. #endif
  809. #ifdef ZEN
  810. #ifdef DEBUG
  811. fprintf(stderr, "Zen\n");
  812. #endif
  813. TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
  814. TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
  815. TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
  816. TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
  817. #ifdef EXPRECISION
  818. TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
  819. TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
  820. #endif
  821. #endif
  822. #ifdef NANO
  823. #ifdef DEBUG
  824. fprintf(stderr, "NANO\n");
  825. #endif
  826. TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
  827. TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
  828. TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
  829. TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
  830. #ifdef EXPRECISION
  831. TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
  832. TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
  833. #endif
  834. #endif
  835. #ifdef CGEMM3M_DEFAULT_P
  836. TABLE_NAME.cgemm3m_p = CGEMM3M_DEFAULT_P;
  837. #else
  838. TABLE_NAME.cgemm3m_p = TABLE_NAME.sgemm_p;
  839. #endif
  840. #ifdef ZGEMM3M_DEFAULT_P
  841. TABLE_NAME.zgemm3m_p = ZGEMM3M_DEFAULT_P;
  842. #else
  843. TABLE_NAME.zgemm3m_p = TABLE_NAME.dgemm_p;
  844. #endif
  845. #ifdef EXPRECISION
  846. TABLE_NAME.xgemm3m_p = TABLE_NAME.qgemm_p;
  847. #endif
  848. TABLE_NAME.sgemm_p = ((TABLE_NAME.sgemm_p + SGEMM_DEFAULT_UNROLL_M - 1)/SGEMM_DEFAULT_UNROLL_M) * SGEMM_DEFAULT_UNROLL_M;
  849. TABLE_NAME.dgemm_p = ((TABLE_NAME.dgemm_p + DGEMM_DEFAULT_UNROLL_M - 1)/DGEMM_DEFAULT_UNROLL_M) * DGEMM_DEFAULT_UNROLL_M;
  850. TABLE_NAME.cgemm_p = ((TABLE_NAME.cgemm_p + CGEMM_DEFAULT_UNROLL_M - 1)/CGEMM_DEFAULT_UNROLL_M) * CGEMM_DEFAULT_UNROLL_M;
  851. TABLE_NAME.zgemm_p = ((TABLE_NAME.zgemm_p + ZGEMM_DEFAULT_UNROLL_M - 1)/ZGEMM_DEFAULT_UNROLL_M) * ZGEMM_DEFAULT_UNROLL_M;
  852. #ifdef CGEMM3M_DEFAULT_UNROLL_M
  853. TABLE_NAME.cgemm3m_p = ((TABLE_NAME.cgemm3m_p + CGEMM3M_DEFAULT_UNROLL_M - 1)/CGEMM3M_DEFAULT_UNROLL_M) * CGEMM3M_DEFAULT_UNROLL_M;
  854. #else
  855. TABLE_NAME.cgemm3m_p = ((TABLE_NAME.cgemm3m_p + SGEMM_DEFAULT_UNROLL_M - 1)/SGEMM_DEFAULT_UNROLL_M) * SGEMM_DEFAULT_UNROLL_M;
  856. #endif
  857. #ifdef ZGEMM3M_DEFAULT_UNROLL_M
  858. TABLE_NAME.zgemm3m_p = ((TABLE_NAME.zgemm3m_p + ZGEMM3M_DEFAULT_UNROLL_M - 1)/ZGEMM3M_DEFAULT_UNROLL_M) * ZGEMM3M_DEFAULT_UNROLL_M;
  859. #else
  860. TABLE_NAME.zgemm3m_p = ((TABLE_NAME.zgemm3m_p + DGEMM_DEFAULT_UNROLL_M - 1)/DGEMM_DEFAULT_UNROLL_M) * DGEMM_DEFAULT_UNROLL_M;
  861. #endif
  862. #ifdef QUAD_PRECISION
  863. TABLE_NAME.qgemm_p = ((TABLE_NAME.qgemm_p + QGEMM_DEFAULT_UNROLL_M - 1)/QGEMM_DEFAULT_UNROLL_M) * QGEMM_DEFAULT_UNROLL_M;
  864. TABLE_NAME.xgemm_p = ((TABLE_NAME.xgemm_p + XGEMM_DEFAULT_UNROLL_M - 1)/XGEMM_DEFAULT_UNROLL_M) * XGEMM_DEFAULT_UNROLL_M;
  865. TABLE_NAME.xgemm3m_p = ((TABLE_NAME.xgemm3m_p + QGEMM_DEFAULT_UNROLL_M - 1)/QGEMM_DEFAULT_UNROLL_M) * QGEMM_DEFAULT_UNROLL_M;
  866. #endif
  867. #ifdef DEBUG
  868. fprintf(stderr, "L2 = %8d DGEMM_P .. %d\n", l2, TABLE_NAME.dgemm_p);
  869. #endif
  870. TABLE_NAME.sgemm_r = (((BUFFER_SIZE -
  871. ((TABLE_NAME.sgemm_p * TABLE_NAME.sgemm_q * 4 + TABLE_NAME.offsetA
  872. + TABLE_NAME.align) & ~TABLE_NAME.align)
  873. ) / (TABLE_NAME.sgemm_q * 4) - 15) & ~15);
  874. TABLE_NAME.dgemm_r = (((BUFFER_SIZE -
  875. ((TABLE_NAME.dgemm_p * TABLE_NAME.dgemm_q * 8 + TABLE_NAME.offsetA
  876. + TABLE_NAME.align) & ~TABLE_NAME.align)
  877. ) / (TABLE_NAME.dgemm_q * 8) - 15) & ~15);
  878. #ifdef EXPRECISION
  879. TABLE_NAME.qgemm_r = (((BUFFER_SIZE -
  880. ((TABLE_NAME.qgemm_p * TABLE_NAME.qgemm_q * 16 + TABLE_NAME.offsetA
  881. + TABLE_NAME.align) & ~TABLE_NAME.align)
  882. ) / (TABLE_NAME.qgemm_q * 16) - 15) & ~15);
  883. #endif
  884. TABLE_NAME.cgemm_r = (((BUFFER_SIZE -
  885. ((TABLE_NAME.cgemm_p * TABLE_NAME.cgemm_q * 8 + TABLE_NAME.offsetA
  886. + TABLE_NAME.align) & ~TABLE_NAME.align)
  887. ) / (TABLE_NAME.cgemm_q * 8) - 15) & ~15);
  888. TABLE_NAME.zgemm_r = (((BUFFER_SIZE -
  889. ((TABLE_NAME.zgemm_p * TABLE_NAME.zgemm_q * 16 + TABLE_NAME.offsetA
  890. + TABLE_NAME.align) & ~TABLE_NAME.align)
  891. ) / (TABLE_NAME.zgemm_q * 16) - 15) & ~15);
  892. TABLE_NAME.cgemm3m_r = (((BUFFER_SIZE -
  893. ((TABLE_NAME.cgemm3m_p * TABLE_NAME.cgemm3m_q * 8 + TABLE_NAME.offsetA
  894. + TABLE_NAME.align) & ~TABLE_NAME.align)
  895. ) / (TABLE_NAME.cgemm3m_q * 8) - 15) & ~15);
  896. TABLE_NAME.zgemm3m_r = (((BUFFER_SIZE -
  897. ((TABLE_NAME.zgemm3m_p * TABLE_NAME.zgemm3m_q * 16 + TABLE_NAME.offsetA
  898. + TABLE_NAME.align) & ~TABLE_NAME.align)
  899. ) / (TABLE_NAME.zgemm3m_q * 16) - 15) & ~15);
  900. #ifdef EXPRECISION
  901. TABLE_NAME.xgemm_r = (((BUFFER_SIZE -
  902. ((TABLE_NAME.xgemm_p * TABLE_NAME.xgemm_q * 32 + TABLE_NAME.offsetA
  903. + TABLE_NAME.align) & ~TABLE_NAME.align)
  904. ) / (TABLE_NAME.xgemm_q * 32) - 15) & ~15);
  905. TABLE_NAME.xgemm3m_r = (((BUFFER_SIZE -
  906. ((TABLE_NAME.xgemm3m_p * TABLE_NAME.xgemm3m_q * 32 + TABLE_NAME.offsetA
  907. + TABLE_NAME.align) & ~TABLE_NAME.align)
  908. ) / (TABLE_NAME.xgemm3m_q * 32) - 15) & ~15);
  909. #endif
  910. }