You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

7 months ago
7 months ago
7 months ago
7 months ago
7 months ago
7 months ago
7 months ago
7 months ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706
/*********************************************************************/
/* Copyright 2022, The OpenBLAS Project. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/*********************************************************************/
  34. #include <stdio.h>
  35. #include <stdlib.h>
  36. #include "common.h"
  /* Build-time configuration for the GEMMT / GEMMTR interface routine. */
/*
 * ERROR_NAME is the routine name that the interface below hands to xerbla
 * when an argument check fails.  It is selected from:
 *   COMPLEX  - complex vs. real build,
 *   RNAME    - "...GEMMTR" spelling vs. space-padded "...GEMMT " spelling,
 *   XDOUBLE / DOUBLE / BFLOAT16 - precision prefix.
 *
 * SMP_THRESHOLD_MIN differs between the real and complex builds; it is not
 * referenced anywhere in the visible part of this file.
 */
#ifndef COMPLEX

#define SMP_THRESHOLD_MIN 65536.0

#  ifdef RNAME
#    ifdef XDOUBLE
#      define ERROR_NAME "QGEMMTR"
#    elif defined(DOUBLE)
#      define ERROR_NAME "DGEMMTR"
#    elif defined(BFLOAT16)
#      define ERROR_NAME "SBGEMMTR"
#    else
#      define ERROR_NAME "SGEMMTR"
#    endif
#  else /* !RNAME */
#    ifdef XDOUBLE
#      define ERROR_NAME "QGEMMT "
#    elif defined(DOUBLE)
#      define ERROR_NAME "DGEMMT "
#    elif defined(BFLOAT16)
#      define ERROR_NAME "SBGEMMT "
#    else
#      define ERROR_NAME "SGEMMT "
#    endif
#  endif /* RNAME */

#else /* COMPLEX */

#define SMP_THRESHOLD_MIN 8192.0

#  ifdef RNAME
#    ifdef XDOUBLE
#      define ERROR_NAME "XGEMMTR"
#    elif defined(DOUBLE)
#      define ERROR_NAME "ZGEMMTR"
#    else
#      define ERROR_NAME "CGEMMTR"
#    endif
#  else /* !RNAME */
#    ifdef XDOUBLE
#      define ERROR_NAME "XGEMMT "
#    elif defined(DOUBLE)
#      define ERROR_NAME "ZGEMMT "
#    else
#      define ERROR_NAME "CGEMMT "
#    endif
#  endif /* RNAME */

#endif /* COMPLEX */

/*
 * Scale factor for the single-thread cut-off used by the interface routine
 * (work below 2304 * GEMM_MULTITHREAD_THRESHOLD stays on one thread).
 * The build system may predefine it; 4 is the fallback.
 */
#ifndef GEMM_MULTITHREAD_THRESHOLD
#define GEMM_MULTITHREAD_THRESHOLD 4
#endif
  83. #ifndef CBLAS
  84. void NAME(char *UPLO, char *TRANSA, char *TRANSB,
  85. blasint * M, blasint * K,
  86. FLOAT * Alpha,
  87. IFLOAT * a, blasint * ldA,
  88. IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC)
  89. {
  90. blasint m, k;
  91. blasint lda, ldb, ldc;
  92. int transa, transb, uplo;
  93. blasint info;
  94. char transA, transB, Uplo;
  95. blasint nrowa, nrowb;
  96. #if defined(COMPLEX)
  97. blasint ncolb;
  98. #endif
  99. IFLOAT *buffer;
  100. IFLOAT *aa, *bb;
  101. FLOAT *cc;
  102. #if defined(COMPLEX)
  103. FLOAT alpha_r, alpha_i, beta_r, beta_i;
  104. #else
  105. FLOAT alpha, beta;
  106. #endif
  107. PRINT_DEBUG_NAME;
  108. m = *M;
  109. k = *K;
  110. #if defined(COMPLEX)
  111. FLOAT *alpha = Alpha;
  112. alpha_r = *(Alpha + 0);
  113. alpha_i = *(Alpha + 1);
  114. beta_r = *(Beta + 0);
  115. beta_i = *(Beta + 1);
  116. #else
  117. alpha = *Alpha;
  118. beta = *Beta;
  119. #endif
  120. lda = *ldA;
  121. ldb = *ldB;
  122. ldc = *ldC;
  123. transA = *TRANSA;
  124. transB = *TRANSB;
  125. Uplo = *UPLO;
  126. TOUPPER(transA);
  127. TOUPPER(transB);
  128. TOUPPER(Uplo);
  129. transa = -1;
  130. transb = -1;
  131. uplo = -1;
  132. if (transA == 'N')
  133. transa = 0;
  134. if (transA == 'T')
  135. transa = 1;
  136. #ifndef COMPLEX
  137. if (transA == 'R')
  138. transa = 0;
  139. if (transA == 'C')
  140. transa = 1;
  141. #else
  142. if (transA == 'R')
  143. transa = 2;
  144. if (transA == 'C')
  145. transa = 3;
  146. #endif
  147. if (transB == 'N')
  148. transb = 0;
  149. if (transB == 'T')
  150. transb = 1;
  151. #ifndef COMPLEX
  152. if (transB == 'R')
  153. transb = 0;
  154. if (transB == 'C')
  155. transb = 1;
  156. #else
  157. if (transB == 'R')
  158. transb = 2;
  159. if (transB == 'C')
  160. transb = 3;
  161. #endif
  162. if (Uplo == 'U')
  163. uplo = 0;
  164. if (Uplo == 'L')
  165. uplo = 1;
  166. nrowa = m;
  167. if (transa & 1) nrowa = k;
  168. nrowb = k;
  169. #if defined(COMPLEX)
  170. ncolb = m;
  171. #endif
  172. if (transb & 1) {
  173. nrowb = m;
  174. #if defined(COMPLEX)
  175. ncolb = k;
  176. #endif
  177. }
  178. info = 0;
  179. if (ldc < MAX(1, m))
  180. info = 13;
  181. if (ldb < MAX(1, nrowb))
  182. info = 10;
  183. if (lda < MAX(1, nrowa))
  184. info = 8;
  185. if (k < 0)
  186. info = 5;
  187. if (m < 0)
  188. info = 4;
  189. if (transb < 0)
  190. info = 3;
  191. if (transa < 0)
  192. info = 2;
  193. if (uplo < 0)
  194. info = 1;
  195. if (info != 0) {
  196. BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME));
  197. return;
  198. }
  199. #else
  200. void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
  201. enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint m,
  202. blasint k,
  203. #ifndef COMPLEX
  204. FLOAT alpha,
  205. IFLOAT * A, blasint LDA,
  206. IFLOAT * B, blasint LDB, FLOAT beta, FLOAT * c, blasint ldc)
  207. {
  208. #else
  209. void *valpha,
  210. void *va, blasint LDA,
  211. void *vb, blasint LDB, void *vbeta, void *vc, blasint ldc)
  212. {
  213. FLOAT *alpha = (FLOAT *) valpha;
  214. FLOAT *beta = (FLOAT *) vbeta;
  215. FLOAT *A = (FLOAT *) va;
  216. FLOAT *B = (FLOAT *) vb;
  217. FLOAT *c = (FLOAT *) vc;
  218. #endif
  219. FLOAT *aa, *bb, *cc;
  220. int transa, transb, uplo;
  221. blasint info;
  222. blasint lda, ldb;
  223. FLOAT *a, *b;
  224. #if defined(COMPLEX)
  225. blasint nrowb, ncolb;
  226. #endif
  227. XFLOAT *buffer;
  228. PRINT_DEBUG_CNAME;
  229. uplo = -1;
  230. transa = -1;
  231. transb = -1;
  232. info = 0;
  233. if (order == CblasColMajor) {
  234. if (Uplo == CblasUpper) uplo = 0;
  235. if (Uplo == CblasLower) uplo = 1;
  236. if (TransA == CblasNoTrans)
  237. transa = 0;
  238. if (TransA == CblasTrans)
  239. transa = 1;
  240. #ifndef COMPLEX
  241. if (TransA == CblasConjNoTrans)
  242. transa = 0;
  243. if (TransA == CblasConjTrans)
  244. transa = 1;
  245. #else
  246. if (TransA == CblasConjNoTrans)
  247. transa = 2;
  248. if (TransA == CblasConjTrans)
  249. transa = 3;
  250. #endif
  251. if (TransB == CblasNoTrans)
  252. transb = 0;
  253. if (TransB == CblasTrans)
  254. transb = 1;
  255. #ifndef COMPLEX
  256. if (TransB == CblasConjNoTrans)
  257. transb = 0;
  258. if (TransB == CblasConjTrans)
  259. transb = 1;
  260. #else
  261. if (TransB == CblasConjNoTrans)
  262. transb = 2;
  263. if (TransB == CblasConjTrans)
  264. transb = 3;
  265. #endif
  266. a = (void *)A;
  267. b = (void *)B;
  268. lda = LDA;
  269. ldb = LDB;
  270. info = -1;
  271. blasint nrowa;
  272. #if !defined(COMPLEX)
  273. blasint nrowb;
  274. #endif
  275. nrowa = m;
  276. if (transa & 1) nrowa = k;
  277. nrowb = k;
  278. #if defined(COMPLEX)
  279. ncolb = m;
  280. #endif
  281. if (transb & 1) {
  282. nrowb = m;
  283. #if defined(COMPLEX)
  284. ncolb = k;
  285. #endif
  286. }
  287. if (ldc < MAX(1, m))
  288. info = 13;
  289. if (ldb < MAX(1, nrowb))
  290. info = 10;
  291. if (lda < MAX(1, nrowa))
  292. info = 8;
  293. if (k < 0)
  294. info = 5;
  295. if (m < 0)
  296. info = 4;
  297. if (transb < 0)
  298. info = 3;
  299. if (transa < 0)
  300. info = 2;
  301. if (uplo < 0)
  302. info = 1;
  303. }
  304. if (order == CblasRowMajor) {
  305. a = (void *)B;
  306. b = (void *)A;
  307. lda = LDB;
  308. ldb = LDA;
  309. if (Uplo == CblasUpper) uplo = 1;
  310. if (Uplo == CblasLower) uplo = 0;
  311. if (TransB == CblasNoTrans)
  312. transa = 0;
  313. if (TransB == CblasTrans)
  314. transa = 1;
  315. #ifndef COMPLEX
  316. if (TransB == CblasConjNoTrans)
  317. transa = 0;
  318. if (TransB == CblasConjTrans)
  319. transa = 1;
  320. #else
  321. if (TransB == CblasConjNoTrans)
  322. transa = 2;
  323. if (TransB == CblasConjTrans)
  324. transa = 3;
  325. #endif
  326. if (TransA == CblasNoTrans)
  327. transb = 0;
  328. if (TransA == CblasTrans)
  329. transb = 1;
  330. #ifndef COMPLEX
  331. if (TransA == CblasConjNoTrans)
  332. transb = 0;
  333. if (TransA == CblasConjTrans)
  334. transb = 1;
  335. #else
  336. if (TransA == CblasConjNoTrans)
  337. transb = 2;
  338. if (TransA == CblasConjTrans)
  339. transb = 3;
  340. #endif
  341. info = -1;
  342. blasint ncola;
  343. #if !defined(COMPLEX)
  344. blasint ncolb;
  345. #endif
  346. ncola = m;
  347. if (transa & 1) ncola = k;
  348. ncolb = k;
  349. #if defined(COMPLEX)
  350. nrowb = m;
  351. #endif
  352. if (transb & 1) {
  353. #if defined(COMPLEX)
  354. nrowb = k;
  355. #endif
  356. ncolb = m;
  357. }
  358. if (ldc < MAX(1,m))
  359. info = 13;
  360. if (ldb < MAX(1, ncolb))
  361. info = 8;
  362. if (lda < MAX(1, ncola))
  363. info = 10;
  364. if (k < 0)
  365. info = 5;
  366. if (m < 0)
  367. info = 4;
  368. if (transb < 0)
  369. info = 2;
  370. if (transa < 0)
  371. info = 3;
  372. if (uplo < 0)
  373. info = 1;
  374. }
  375. if (info >= 0) {
  376. BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME));
  377. return;
  378. }
  379. #if defined(COMPLEX)
  380. FLOAT alpha_r = *(alpha + 0);
  381. FLOAT alpha_i = *(alpha + 1);
  382. FLOAT beta_r = *(beta + 0);
  383. FLOAT beta_i = *(beta + 1);
  384. #endif
  385. #endif
  386. int buffer_size;
  387. blasint l;
  388. blasint i, j;
  389. #ifdef SMP
  390. int nthreads;
  391. #endif
  392. #if defined(COMPLEX)
  393. #ifdef SMP
  394. static int (*gemv_thread[]) (BLASLONG, BLASLONG, FLOAT *, FLOAT *,
  395. BLASLONG, FLOAT *, BLASLONG, FLOAT *,
  396. BLASLONG, FLOAT *, int) = {
  397. #ifdef XDOUBLE
  398. xgemv_thread_n, xgemv_thread_t, xgemv_thread_r, xgemv_thread_c,
  399. xgemv_thread_o, xgemv_thread_u, xgemv_thread_s,
  400. xgemv_thread_d,
  401. #elif defined DOUBLE
  402. zgemv_thread_n, zgemv_thread_t, zgemv_thread_r, zgemv_thread_c,
  403. zgemv_thread_o, zgemv_thread_u, zgemv_thread_s,
  404. zgemv_thread_d,
  405. #else
  406. cgemv_thread_n, cgemv_thread_t, cgemv_thread_r, cgemv_thread_c,
  407. cgemv_thread_o, cgemv_thread_u, cgemv_thread_s,
  408. cgemv_thread_d,
  409. #endif
  410. };
  411. #endif
  412. int (*gemv[]) (BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *,
  413. BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG,
  414. FLOAT *) = {
  415. GEMV_N, GEMV_T, GEMV_R, GEMV_C, GEMV_O, GEMV_U, GEMV_S, GEMV_D,};
  416. #else
  417. #ifdef SMP
  418. static int (*gemv_thread[]) (BLASLONG, BLASLONG, FLOAT, FLOAT *,
  419. BLASLONG, FLOAT *, BLASLONG, FLOAT *,
  420. BLASLONG, FLOAT *, int) = {
  421. #ifdef XDOUBLE
  422. qgemv_thread_n, qgemv_thread_t,
  423. #elif defined DOUBLE
  424. dgemv_thread_n, dgemv_thread_t,
  425. #else
  426. sgemv_thread_n, sgemv_thread_t,
  427. #endif
  428. };
  429. #endif
  430. int (*gemv[]) (BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG,
  431. FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = {
  432. GEMV_N, GEMV_T,};
  433. #endif
  434. if (m == 0)
  435. return;
  436. IDEBUG_START;
  437. #if defined(COMPLEX)
  438. if (transb > 1){
  439. #ifndef CBLAS
  440. IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb);
  441. #else
  442. if (order == CblasColMajor)
  443. IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb);
  444. if (order == CblasRowMajor)
  445. IMATCOPY_K_RNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb);
  446. #endif
  447. }
  448. #endif
  449. const blasint incb = ((transb & 1) == 0) ? 1 : ldb;
  450. if (uplo == 1) {
  451. for (i = 0; i < m; i++) {
  452. j = m - i;
  453. l = j;
  454. #if defined(COMPLEX)
  455. aa = a + i * 2;
  456. bb = b + i * ldb * 2;
  457. if (transa & 1) {
  458. aa = a + lda * i * 2;
  459. }
  460. if (transb & 1)
  461. bb = b + i * 2;
  462. cc = c + i * 2 * ldc + i * 2;
  463. #else
  464. aa = a + i;
  465. bb = b + i * ldb;
  466. if (transa & 1) {
  467. aa = a + lda * i;
  468. }
  469. if (transb & 1)
  470. bb = b + i;
  471. cc = c + i * ldc + i;
  472. #endif
  473. #if defined(COMPLEX)
  474. if (beta_r != ONE || beta_i != ZERO)
  475. SCAL_K(l, 0, 0, beta_r, beta_i, cc, 1, NULL, 0,
  476. NULL, 0);
  477. if (alpha_r == ZERO && alpha_i == ZERO)
  478. continue;
  479. #else
  480. if (beta != ONE)
  481. SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0);
  482. if (alpha == ZERO)
  483. continue;
  484. #endif
  485. IDEBUG_START;
  486. buffer_size = 2 * (j + k) + 128 / sizeof(FLOAT);
  487. #ifdef WINDOWS_ABI
  488. buffer_size += 160 / sizeof(FLOAT);
  489. #endif
  490. // for alignment
  491. buffer_size = (buffer_size + 3) & ~3;
  492. STACK_ALLOC(buffer_size, IFLOAT, buffer);
  493. #ifdef SMP
  494. if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD)
  495. nthreads = 1;
  496. else
  497. nthreads = num_cpu_avail(2);
  498. if (nthreads == 1) {
  499. #endif
  500. #if defined(COMPLEX)
  501. if (!(transa & 1))
  502. (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i,
  503. aa, lda, bb, incb, cc, 1,
  504. buffer);
  505. else
  506. (gemv[(int)transa]) (k, j, 0, alpha_r, alpha_i,
  507. aa, lda, bb, incb, cc, 1,
  508. buffer);
  509. #else
  510. if (!(transa & 1))
  511. (gemv[(int)transa]) (j, k, 0, alpha, aa, lda,
  512. bb, incb, cc, 1, buffer);
  513. else
  514. (gemv[(int)transa]) (k, j, 0, alpha, aa, lda,
  515. bb, incb, cc, 1, buffer);
  516. #endif
  517. #ifdef SMP
  518. } else {
  519. if (!(transa & 1))
  520. (gemv_thread[(int)transa]) (j, k, alpha, aa,
  521. lda, bb, incb, cc,
  522. 1, buffer,
  523. nthreads);
  524. else
  525. (gemv_thread[(int)transa]) (k, j, alpha, aa,
  526. lda, bb, incb, cc,
  527. 1, buffer,
  528. nthreads);
  529. }
  530. #endif
  531. STACK_FREE(buffer);
  532. }
  533. } else {
  534. for (i = 0; i < m; i++) {
  535. j = i + 1;
  536. l = j;
  537. #if defined COMPLEX
  538. bb = b + i * ldb * 2;
  539. if (transb & 1) {
  540. bb = b + i * 2;
  541. }
  542. cc = c + i * 2 * ldc;
  543. #else
  544. bb = b + i * ldb;
  545. if (transb & 1) {
  546. bb = b + i;
  547. }
  548. cc = c + i * ldc;
  549. #endif
  550. #if defined(COMPLEX)
  551. if (beta_r != ONE || beta_i != ZERO)
  552. SCAL_K(l, 0, 0, beta_r, beta_i, cc, 1, NULL, 0,
  553. NULL, 0);
  554. if (alpha_r == ZERO && alpha_i == ZERO)
  555. continue;
  556. #else
  557. if (beta != ONE)
  558. SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0);
  559. if (alpha == ZERO)
  560. continue;
  561. #endif
  562. IDEBUG_START;
  563. buffer_size = 2 * (j + k) + 128 / sizeof(FLOAT);
  564. #ifdef WINDOWS_ABI
  565. buffer_size += 160 / sizeof(FLOAT);
  566. #endif
  567. // for alignment
  568. buffer_size = (buffer_size + 3) & ~3;
  569. STACK_ALLOC(buffer_size, IFLOAT, buffer);
  570. #ifdef SMP
  571. if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD)
  572. nthreads = 1;
  573. else
  574. nthreads = num_cpu_avail(2);
  575. if (nthreads == 1) {
  576. #endif
  577. #if defined(COMPLEX)
  578. if (!(transa & 1))
  579. (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i,
  580. a, lda, bb, incb, cc, 1,
  581. buffer);
  582. else
  583. (gemv[(int)transa]) (k, j, 0, alpha_r, alpha_i,
  584. a, lda, bb, incb, cc, 1,
  585. buffer);
  586. #else
  587. if (!(transa & 1))
  588. (gemv[(int)transa]) (j, k, 0, alpha, a, lda, bb,
  589. incb, cc, 1, buffer);
  590. else
  591. (gemv[(int)transa]) (k, j, 0, alpha, a, lda, bb,
  592. incb, cc, 1, buffer);
  593. #endif
  594. #ifdef SMP
  595. } else {
  596. if (!(transa & 1))
  597. (gemv_thread[(int)transa]) (j, k, alpha, a, lda,
  598. bb, incb, cc, 1,
  599. buffer, nthreads);
  600. else
  601. (gemv_thread[(int)transa]) (k, j, alpha, a, lda,
  602. bb, incb, cc, 1,
  603. buffer, nthreads);
  604. }
  605. #endif
  606. STACK_FREE(buffer);
  607. }
  608. }
  609. IDEBUG_END;
  610. /* transform B back if necessary */
  611. #if defined(COMPLEX)
  612. if (transb > 1){
  613. #ifndef CBLAS
  614. IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb);
  615. #else
  616. if (order == CblasColMajor)
  617. IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb);
  618. if (order == CblasRowMajor)
  619. IMATCOPY_K_RNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb);
  620. #endif
  621. }
  622. #endif
  623. return;
  624. }