
gemm_vec.c

/*
 * Copyright (c) IBM Corporation 2020.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *    1. Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *
 *    2. Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *    3. Neither the name of the OpenBLAS project nor the names of
 *       its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include "common.h"
#include <vecintrin.h>

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#ifdef COMPLEX
#error "Handling for complex numbers is not supported in this kernel"
#endif

#ifdef DOUBLE
#define UNROLL_M DGEMM_DEFAULT_UNROLL_M
#define UNROLL_N DGEMM_DEFAULT_UNROLL_N
#else
#define UNROLL_M SGEMM_DEFAULT_UNROLL_M
#define UNROLL_N SGEMM_DEFAULT_UNROLL_N
#endif

static const size_t unroll_m = UNROLL_M;
static const size_t unroll_n = UNROLL_N;
/* Handling of triangular matrices */
#ifdef TRMMKERNEL
static const bool trmm = true;
static const bool left =
#ifdef LEFT
        true;
#else
        false;
#endif

static const bool backwards =
#if defined(LEFT) != defined(TRANSA)
        true;
#else
        false;
#endif
#else
static const bool trmm = false;
static const bool left = false;
static const bool backwards = false;
#endif /* TRMMKERNEL */
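
/*
 * Example: a TRMM kernel built with LEFT defined but TRANSA undefined has a
 * triangular A and backwards == true, so the GEBP code below skips the
 * leading "off" columns of A and rows of B in each block (see GEBP_block).
 */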
/*
 * Background:
 *
 * The algorithm of GotoBLAS / OpenBLAS breaks down the matrix multiplication
 * problem by splitting all matrices into partitions multiple times, so that
 * the submatrices fit into the L1 or L2 caches. As a result, each
 * multiplication of submatrices can stream data fast from the L1 and L2
 * caches. In between, it copies and rearranges the submatrices to enable
 * contiguous memory accesses and improve locality in both caches and TLBs.
 *
 * At the heart of the algorithm is this kernel, which multiplies a "block
 * matrix" A (small dimensions) with a "panel matrix" B (small number of rows)
 * and adds the result into a "panel matrix" C; GotoBLAS calls this operation
 * GEBP. This kernel further partitions GEBP twice, such that (1) submatrices
 * of C and B fit into the L1 cache (GEBP_column_block) and (2) a block of C
 * fits into the registers, while multiplying panels from A and B streamed
 * from the L2 and L1 caches, respectively (GEBP_block).
 *
 *
 * Algorithm GEBP(A, B, C, m, n, k, alpha):
 *
 * The problem is calculating C += alpha * (A * B)
 * C is an m x n matrix, A is an m x k matrix, B is a k x n matrix.
 *
 * - C is in column-major order, with an offset of ldc to the element in the
 *   next column (same row).
 * - A is in row-major order yet stores SGEMM_UNROLL_M elements of each column
 *   contiguously while walking along rows.
 * - B is in column-major order but packs SGEMM_UNROLL_N elements of a row
 *   contiguously.
 * If the numbers of rows and columns are not multiples of SGEMM_UNROLL_M or
 * SGEMM_UNROLL_N, the remaining elements are arranged in blocks with
 * power-of-2 dimensions (e.g., 5 remaining columns would be in a block-of-4
 * and a block-of-1).
 *
 * Note that packing A and B into that form is taken care of by the caller in
 * driver/level3/level3.c (actually done by "copy kernels").
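 *
 * Illustration: for a block of A with 4 rows and k = 3 columns, and assuming
 * SGEMM_UNROLL_M = 4 (a configuration chosen purely for this example), the
 * packed buffer holds the rows of each column contiguously:
 *
 *     A_packed = { a00, a10, a20, a30,  a01, a11, a21, a31,  a02, a12, a22, a32 }
 *
 * so element (i, k) of the block sits at A_packed[k * 4 + i], which matches
 * the A[i + k * ROWS] indexing used by the inner kernels below.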
 *
 * Steps:
 * - Partition C and B into blocks of n_r (SGEMM_UNROLL_N) columns, C_j and
 *   B_j. Now, B_j should fit into the L1 cache.
 * - For each partition, calculate C_j += alpha * (A * B_j) by
 *   (1) calculating C_aux := A * B_j (see below) and
 *   (2) unpacking C_j = C_j + alpha * C_aux.
 *
 *
 * Algorithm for calculating C_aux:
 *
 * - Further partition C_aux and A into groups of m_r (SGEMM_UNROLL_M) rows,
 *   such that the m_r x n_r submatrix of C_aux can be held in registers. Each
 *   submatrix of C_aux can be calculated independently, and the registers are
 *   added back into C_j.
 *
 * - For each row block of C_aux (using a row block of A and the full B_j):
 *   - Stream over all columns of A, multiply with elements from B, and
 *     accumulate in registers (using different inner kernels to exploit
 *     vectorization for varying block sizes).
 *   - Add alpha * (row block of C_aux) back into C_j.
 *
 * Note that there are additional mechanics for handling triangular matrices,
 * calculating B := alpha * (A * B) where either of the matrices A or B can be
 * triangular. In the case of A, the macro "LEFT" is defined. In addition, A
 * can optionally be transposed.
 * The code effectively skips an "offset" number of columns in A and rows of B
 * in each block, to save unnecessary work by exploiting the triangular
 * nature. To handle all cases, the code discerns (1) a "left" mode when A is
 * triangular and (2) "forward" / "backwards" modes where only the first
 * "offset" columns/rows of A/B are used or where the first "offset"
 * columns/rows are skipped, respectively.
 *
 * Reference:
 *
 * The summary above is based on staring at various kernel implementations
 * and:
 * K. Goto and R. A. Van de Geijn, Anatomy of High-Performance Matrix
 * Multiplication, ACM Transactions on Mathematical Software, Vol. 34, No. 3,
 * May 2008.
 */
#define VLEN_BYTES 16
#define VLEN_FLOATS (VLEN_BYTES / sizeof(FLOAT))

typedef FLOAT vector_float __attribute__((vector_size(VLEN_BYTES)));
/**
 * Load a vector into a register, and hint at 8-byte alignment to improve
 * performance. gcc-9 and newer create these hints by themselves. For older
 * compiler versions, use inline assembly to express the hint explicitly.
 * Provide an explicit hex encoding to cater for binutils versions that do not
 * know about vector loads with alignment hints yet.
 *
 * Note that, for block sizes where we apply vectorization, vectors in A will
 * always be 8-byte aligned.
 */
static inline vector_float vec_load_hinted(FLOAT const *restrict a) {
        vector_float const *restrict addr = (vector_float const *restrict)a;
        vector_float y;

#if __GNUC__ < 9 && !defined(__clang__)
        // hex-encode vl %[out],%[addr],3
        asm(".insn vrx,0xe70000003006,%[out],%[addr],3"
            : [out] "=v"(y)
            : [addr] "R"(*addr));
#else
        y = *addr;
#endif

        return y;
}
/**
 * Calculate a row block of C_i of size ROWSxCOLS using vector intrinsics.
 *
 * @param[in] A Pointer to the current block of input matrix A.
 * @param[in] bk Number of columns in A.
 * @param[in] B Pointer to the current block of input matrix B.
 * @param[inout] C Pointer to the current block of output matrix C.
 * @param[in] ldc Offset between elements in adjacent columns in C.
 * @param[in] alpha Scalar factor.
 */
#define VECTOR_BLOCK(ROWS, COLS) \
        static inline void GEBP_block_##ROWS##_##COLS( \
            FLOAT const *restrict A, BLASLONG bk, FLOAT const *restrict B, \
            FLOAT *restrict C, BLASLONG ldc, FLOAT alpha) { \
                _Static_assert( \
                    ROWS % VLEN_FLOATS == 0, \
                    "rows in block must be multiples of vector length"); \
                vector_float Caux[ROWS / VLEN_FLOATS][COLS]; \
 \
                for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \
                        vector_float A0 = \
                            vec_load_hinted(A + i * VLEN_FLOATS); \
                        for (BLASLONG j = 0; j < COLS; j++) \
                                Caux[i][j] = A0 * B[j]; \
                } \
 \
                /* \
                 * Stream over the row block of A, which is packed \
                 * column by column, multiply by coefficients in B, and add \
                 * up into temporaries Caux (which the compiler will hold in \
                 * registers). Vectorization: multiply column vectors from A \
                 * with scalars from B and add up in column vectors of Caux. \
                 * That equates to unrolling the loop over rows (in i) and \
                 * executing each unrolled iteration as a vector element. \
                 */ \
                for (BLASLONG k = 1; k < bk; k++) { \
                        for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \
                                vector_float Ak = vec_load_hinted( \
                                    A + i * VLEN_FLOATS + k * ROWS); \
 \
                                for (BLASLONG j = 0; j < COLS; j++) \
                                        Caux[i][j] += Ak * B[j + k * COLS]; \
                        } \
                } \
 \
                /* \
                 * Unpack the row block of C_aux into the outer C_i, \
                 * multiply by alpha, and add up (or assign for TRMM). \
                 */ \
                for (BLASLONG j = 0; j < COLS; j++) { \
                        for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \
                                vector_float *C_ij = \
                                    (vector_float *)(C + i * VLEN_FLOATS + \
                                                     j * ldc); \
                                if (trmm) { \
                                        *C_ij = alpha * Caux[i][j]; \
                                } else { \
                                        *C_ij += alpha * Caux[i][j]; \
                                } \
                        } \
                } \
        }
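
/*
 * For example, VECTOR_BLOCK(8, 2) defines GEBP_block_8_2, which keeps an 8x2
 * block of C_aux in (8 / VLEN_FLOATS) * 2 vector registers and, in scalar
 * terms, accumulates
 *
 *     Caux[r][j] = sum over k of A[r + k * 8] * B[j + k * 2]
 *
 * where each vector multiply-add covers VLEN_FLOATS consecutive rows r.
 */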
#if UNROLL_M == 16
VECTOR_BLOCK(16, 2)
VECTOR_BLOCK(16, 1)
#endif
#if UNROLL_N == 8
VECTOR_BLOCK(8, 8)
VECTOR_BLOCK(4, 8)
#endif
#ifndef DOUBLE
VECTOR_BLOCK(8, 4)
#endif
VECTOR_BLOCK(8, 2)
VECTOR_BLOCK(8, 1)
VECTOR_BLOCK(4, 4)
VECTOR_BLOCK(4, 2)
VECTOR_BLOCK(4, 1)
/**
 * Calculate a row block of C_i of size ROWSxCOLS using scalar operations.
 * Simple implementation for smaller block sizes.
 *
 * @param[in] A Pointer to the current block of input matrix A.
 * @param[in] k Number of columns in A.
 * @param[in] B Pointer to the current block of input matrix B.
 * @param[inout] C Pointer to the current block of output matrix C.
 * @param[in] ldc Offset between elements in adjacent columns in C.
 * @param[in] alpha Scalar factor.
 */
#define SCALAR_BLOCK(ROWS, COLS) \
        static inline void GEBP_block_##ROWS##_##COLS( \
            FLOAT const *restrict A, BLASLONG k, FLOAT const *restrict B, \
            FLOAT *restrict C, BLASLONG ldc, FLOAT alpha) { \
                FLOAT Caux[ROWS][COLS] __attribute__((aligned(16))); \
 \
                /* \
                 * Peel off the first iteration (i.e., column of A) for \
                 * initializing Caux. \
                 */ \
                for (BLASLONG i = 0; i < ROWS; i++) \
                        for (BLASLONG j = 0; j < COLS; j++) \
                                Caux[i][j] = A[i] * B[j]; \
 \
                for (BLASLONG kk = 1; kk < k; kk++) \
                        for (BLASLONG i = 0; i < ROWS; i++) \
                                for (BLASLONG j = 0; j < COLS; j++) \
                                        Caux[i][j] += \
                                            A[i + kk * ROWS] * B[j + kk * COLS]; \
 \
                for (BLASLONG i = 0; i < ROWS; i++) \
                        for (BLASLONG j = 0; j < COLS; j++) \
                                if (trmm) { \
                                        C[i + j * ldc] = alpha * Caux[i][j]; \
                                } else { \
                                        C[i + j * ldc] += alpha * Caux[i][j]; \
                                } \
        }
#ifdef DOUBLE
VECTOR_BLOCK(2, 4)
VECTOR_BLOCK(2, 2)
VECTOR_BLOCK(2, 1)
#else
SCALAR_BLOCK(2, 4)
SCALAR_BLOCK(2, 2)
SCALAR_BLOCK(2, 1)
#endif
SCALAR_BLOCK(1, 4)
SCALAR_BLOCK(1, 2)
SCALAR_BLOCK(1, 1)
/**
 * Calculate a row block that fits 4x4 vector registers using a loop unrolled
 * by 2 with explicit interleaving to better overlap loads and computation.
 * This function fits 16x4 blocks for SGEMM and 8x4 blocks for DGEMM.
 */
#ifdef DOUBLE
static inline void GEBP_block_8_4(
#else /* float */
static inline void GEBP_block_16_4(
#endif
    FLOAT const *restrict A, BLASLONG bk, FLOAT const *restrict B,
    FLOAT *restrict C, BLASLONG ldc, FLOAT alpha) {
#define VEC_ROWS 4
#define VEC_COLS 4
#define ROWS (VEC_ROWS * VLEN_FLOATS)
#define COLS (VEC_COLS)

        /*
         * Hold intermediate results in vector registers.
         * Since we need to force the compiler's hand in places, we need to
         * use individual variables, in contrast to the generic
         * implementation's arrays.
         */
#define INIT_ROW_OF_C(ROW) \
        vector_float A##ROW = vec_load_hinted(A + ROW * VLEN_FLOATS); \
        vector_float C_##ROW##_0 = A##ROW * B[0]; \
        vector_float C_##ROW##_1 = A##ROW * B[1]; \
        vector_float C_##ROW##_2 = A##ROW * B[2]; \
        vector_float C_##ROW##_3 = A##ROW * B[3];

        INIT_ROW_OF_C(0)
        INIT_ROW_OF_C(1)
        INIT_ROW_OF_C(2)
        INIT_ROW_OF_C(3)
#undef INIT_ROW_OF_C

        if (bk > 1) {
                BLASLONG k = 1;
                vector_float Ak[VEC_ROWS], Aknext[VEC_ROWS];
                vector_float Bk[VEC_COLS], Bknext[VEC_COLS];

                /*
                 * Note that in several places, we enforce an instruction
                 * sequence that we identified empirically by utilizing dummy
                 * asm statements.
                 */
                for (BLASLONG j = 0; j < VEC_COLS; j++)
                        Bk[j] = vec_splats(B[j + k * COLS]);
                asm("");

                for (BLASLONG i = 0; i < VEC_ROWS; i++)
                        Ak[i] = vec_load_hinted(A + i * VLEN_FLOATS + k * ROWS);

                for (; k < (bk - 2); k += 2) {
                        /*
                         * Load inputs for (k+1) into registers.
                         * Loading from B first is advantageous.
                         */
                        for (BLASLONG j = 0; j < VEC_COLS; j++)
                                Bknext[j] = vec_splats(B[j + (k + 1) * COLS]);
                        asm("");
                        for (BLASLONG i = 0; i < VEC_ROWS; i++)
                                Aknext[i] = vec_load_hinted(
                                    A + i * VLEN_FLOATS + (k + 1) * ROWS);

                        /*
                         * To achieve better instruction-level parallelism,
                         * make sure to first load input data for (k+1) before
                         * initiating compute for k. We enforce that ordering
                         * with a pseudo asm statement.
                         * Note that we need to massage this particular
                         * "barrier" depending on the gcc version.
                         */
#if __GNUC__ > 7 || defined(__clang__)
#define BARRIER_READ_BEFORE_COMPUTE(SUFFIX) \
        do { \
                asm("" \
                    : "+v"(C_0_0), "+v"(C_0_1), "+v"(C_0_2), "+v"(C_0_3), \
                      "+v"(C_1_0), "+v"(C_1_1), "+v"(C_1_2), "+v"(C_1_3) \
                    : "v"(B##SUFFIX[0]), "v"(B##SUFFIX[1]), "v"(B##SUFFIX[2]), \
                      "v"(B##SUFFIX[3]), "v"(A##SUFFIX[0]), "v"(A##SUFFIX[1]), \
                      "v"(A##SUFFIX[2]), "v"(A##SUFFIX[3])); \
                asm("" \
                    : "+v"(C_2_0), "+v"(C_2_1), "+v"(C_2_2), "+v"(C_2_3), \
                      "+v"(C_3_0), "+v"(C_3_1), "+v"(C_3_2), "+v"(C_3_3) \
                    : "v"(B##SUFFIX[0]), "v"(B##SUFFIX[1]), "v"(B##SUFFIX[2]), \
                      "v"(B##SUFFIX[3]), "v"(A##SUFFIX[0]), "v"(A##SUFFIX[1]), \
                      "v"(A##SUFFIX[2]), "v"(A##SUFFIX[3])); \
        } while (0)
#else /* __GNUC__ <= 7 */
#define BARRIER_READ_BEFORE_COMPUTE(SUFFIX) \
        do { \
                asm(""); \
        } while (0)
#endif
                        BARRIER_READ_BEFORE_COMPUTE(knext);

                        /* Compute for (k) */
                        C_0_0 += Ak[0] * Bk[0];
                        C_1_0 += Ak[1] * Bk[0];
                        C_2_0 += Ak[2] * Bk[0];
                        C_3_0 += Ak[3] * Bk[0];

                        C_0_1 += Ak[0] * Bk[1];
                        C_1_1 += Ak[1] * Bk[1];
                        C_2_1 += Ak[2] * Bk[1];
                        C_3_1 += Ak[3] * Bk[1];

                        C_0_2 += Ak[0] * Bk[2];
                        C_1_2 += Ak[1] * Bk[2];
                        C_2_2 += Ak[2] * Bk[2];
                        C_3_2 += Ak[3] * Bk[2];

                        C_0_3 += Ak[0] * Bk[3];
                        C_1_3 += Ak[1] * Bk[3];
                        C_2_3 += Ak[2] * Bk[3];
                        C_3_3 += Ak[3] * Bk[3];

                        asm("");

                        /*
                         * Load inputs for (k+2) into registers.
                         * First load from B.
                         */
                        for (BLASLONG j = 0; j < VEC_COLS; j++)
                                Bk[j] = vec_splats(B[j + (k + 2) * COLS]);
                        asm("");
                        for (BLASLONG i = 0; i < VEC_ROWS; i++)
                                Ak[i] = vec_load_hinted(
                                    A + i * VLEN_FLOATS + (k + 2) * ROWS);

                        /*
                         * As above, make sure to first schedule the loads for
                         * (k+2) before compute for (k+1).
                         */
                        BARRIER_READ_BEFORE_COMPUTE(k);

                        /* Compute on (k+1) */
                        C_0_0 += Aknext[0] * Bknext[0];
                        C_1_0 += Aknext[1] * Bknext[0];
                        C_2_0 += Aknext[2] * Bknext[0];
                        C_3_0 += Aknext[3] * Bknext[0];

                        C_0_1 += Aknext[0] * Bknext[1];
                        C_1_1 += Aknext[1] * Bknext[1];
                        C_2_1 += Aknext[2] * Bknext[1];
                        C_3_1 += Aknext[3] * Bknext[1];

                        C_0_2 += Aknext[0] * Bknext[2];
                        C_1_2 += Aknext[1] * Bknext[2];
                        C_2_2 += Aknext[2] * Bknext[2];
                        C_3_2 += Aknext[3] * Bknext[2];

                        C_0_3 += Aknext[0] * Bknext[3];
                        C_1_3 += Aknext[1] * Bknext[3];
                        C_2_3 += Aknext[2] * Bknext[3];
                        C_3_3 += Aknext[3] * Bknext[3];
                }

                /* Wrap up the remaining k's */
                for (; k < bk; k++) {
                        vector_float Ak;

#define COMPUTE_WRAPUP_ROW(ROW) \
        Ak = vec_load_hinted(A + ROW * VLEN_FLOATS + k * ROWS); \
        C_##ROW##_0 += Ak * B[0 + k * COLS]; \
        C_##ROW##_1 += Ak * B[1 + k * COLS]; \
        C_##ROW##_2 += Ak * B[2 + k * COLS]; \
        C_##ROW##_3 += Ak * B[3 + k * COLS];

                        COMPUTE_WRAPUP_ROW(0)
                        COMPUTE_WRAPUP_ROW(1)
                        COMPUTE_WRAPUP_ROW(2)
                        COMPUTE_WRAPUP_ROW(3)
#undef COMPUTE_WRAPUP_ROW
                }
        }

        /*
         * Unpack the row block of C_aux into the outer C_i, multiply by
         * alpha, and add up (or assign for TRMM).
         */
#define WRITE_BACK_C(ROW, COL) \
        do { \
                vector_float *Cij = \
                    (vector_float *)(C + ROW * VLEN_FLOATS + COL * ldc); \
                if (trmm) { \
                        *Cij = alpha * C_##ROW##_##COL; \
                } else { \
                        *Cij += alpha * C_##ROW##_##COL; \
                } \
        } while (0)

        WRITE_BACK_C(0, 0); WRITE_BACK_C(0, 1); WRITE_BACK_C(0, 2); WRITE_BACK_C(0, 3);
        WRITE_BACK_C(1, 0); WRITE_BACK_C(1, 1); WRITE_BACK_C(1, 2); WRITE_BACK_C(1, 3);
        WRITE_BACK_C(2, 0); WRITE_BACK_C(2, 1); WRITE_BACK_C(2, 2); WRITE_BACK_C(2, 3);
        WRITE_BACK_C(3, 0); WRITE_BACK_C(3, 1); WRITE_BACK_C(3, 2); WRITE_BACK_C(3, 3);
#undef WRITE_BACK_C

#undef ROWS
#undef VEC_ROWS
#undef COLS
#undef VEC_COLS
#undef BARRIER_READ_BEFORE_COMPUTE
}
/**
 * Handle calculation for row blocks in C_i of any size by dispatching into
 * macro-defined (inline) functions or by deferring to a simple generic
 * implementation. Note that the compiler can remove this awkward-looking
 * dispatching code while inlining.
 *
 * @param[in] m Number of rows in block C_i.
 * @param[in] n Number of columns in block C_i.
 * @param[in] first_row Index of the first row of block C_i (relative to C).
 * @param[in] A Pointer to input matrix A (note: all of it).
 * @param[in] k Number of columns in A and rows in B.
 * @param[in] B Pointer to the current column block (panel) of input matrix B.
 * @param[inout] C Pointer to the current column block (panel) of output matrix C.
 * @param[in] ldc Offset between elements in adjacent columns in C.
 * @param[in] alpha Scalar factor.
 * @param[in] offset Number of columns of A and rows of B to skip (for triangular matrices).
 * @param[in] off Running offset for handling triangular matrices.
 */
static inline void GEBP_block(BLASLONG m, BLASLONG n, BLASLONG first_row,
                              const FLOAT *restrict A, BLASLONG k,
                              const FLOAT *restrict B, FLOAT *restrict C,
                              BLASLONG ldc, FLOAT alpha, BLASLONG offset,
                              BLASLONG off)
{
        if (trmm && left)
                off = offset + first_row;

        A += first_row * k;
        C += first_row;

        if (trmm) {
                if (backwards) {
                        A += off * m;
                        B += off * n;
                        k -= off;
                } else {
                        if (left) {
                                k = off + m;
                        } else {
                                k = off + n;
                        }
                }
        }

        /* Dispatch into the implementation for each block size: */
#define BLOCK(bm, bn) \
        if (m == bm && n == bn) { \
                GEBP_block_##bm##_##bn(A, k, B, C, ldc, alpha); \
                return; \
        }

#if UNROLL_M == 16
        BLOCK(16, 4); BLOCK(16, 2); BLOCK(16, 1);
#endif
#if UNROLL_N == 8
        BLOCK(8, 8); BLOCK(4, 8);
#endif
        BLOCK(8, 4); BLOCK(8, 2); BLOCK(8, 1);
        BLOCK(4, 4); BLOCK(4, 2); BLOCK(4, 1);
        BLOCK(2, 4); BLOCK(2, 2); BLOCK(2, 1);
        BLOCK(1, 4); BLOCK(1, 2); BLOCK(1, 1);

#undef BLOCK
}
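
/*
 * Example: for m == 8 and n == 4, the dispatch chain above reduces to a
 * single inlined call to GEBP_block_8_4.
 */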
/**
 * Handle a column block (panel) of C and B while calculating
 * C += alpha * (A * B).
 *
 * @param[in] num_cols Number of columns in the block (in C and B).
 * @param[in] first_col First column of the current block (in C and B).
 * @param[in] A Pointer to input matrix A.
 * @param[in] bk Number of columns in A and rows in B.
 * @param[in] B Pointer to input matrix B (note: all of it).
 * @param[in] bm Number of rows in C and A.
 * @param[inout] C Pointer to output matrix C (note: all of it).
 * @param[in] ldc Offset between elements in adjacent columns in C.
 * @param[in] alpha Scalar factor.
 * @param[in] offset Number of columns of A and rows of B to skip (for triangular matrices).
 */
static inline void GEBP_column_block(BLASLONG num_cols, BLASLONG first_col,
                                     const FLOAT *restrict A, BLASLONG bk,
                                     const FLOAT *restrict B, BLASLONG bm,
                                     FLOAT *restrict C, BLASLONG ldc,
                                     FLOAT alpha, BLASLONG const offset) {

        FLOAT *restrict C_i = C + first_col * ldc;
        /*
         * B is in column-major order with n_r row elements packed
         * contiguously; the exact layout does not matter here, since we
         * always advance by whole blocks of bk * n_r elements.
         */
        const FLOAT *restrict B_i = B + first_col * bk;

        BLASLONG off = 0;
        if (trmm) {
                if (left) {
                        off = offset;
                } else {
                        off = -offset + first_col;
                }
        }

        /*
         * Calculate C_aux := A * B_j,
         * then unpack C_i += alpha * C_aux.
         *
         * For that purpose, further partition C_aux and A into blocks of m_r
         * (unroll_m) rows, or powers of 2 if smaller.
         */
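        /*
         * Example: with unroll_m == 8 and bm == 13, the loops below emit row
         * blocks of 8, 4, and 1 rows.
         */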
        BLASLONG row = 0;
        for (BLASLONG block_size = unroll_m; block_size > 0; block_size /= 2)
                for (; bm - row >= block_size; row += block_size)
                        GEBP_block(block_size, num_cols, row, A, bk, B_i, C_i,
                                   ldc, alpha, offset, off);
}
/**
 * Inner kernel for matrix-matrix multiplication: C += alpha * (A * B), where
 * C is an m-by-n matrix, A is m-by-k, and B is k-by-n. Note that A, B, and C
 * are pointers to submatrices of the actual matrices.
 *
 * For triangular matrix multiplication, calculate B := alpha * (A * B), where
 * A or B can be triangular (in the case of A, the macro LEFT will be
 * defined).
 *
 * @param[in] bm Number of rows in C and A.
 * @param[in] bn Number of columns in C and B.
 * @param[in] bk Number of columns in A and rows in B.
 * @param[in] alpha Scalar factor.
 * @param[in] ba Pointer to input matrix A.
 * @param[in] bb Pointer to input matrix B.
 * @param[inout] C Pointer to output matrix C.
 * @param[in] ldc Offset between elements in adjacent columns in C.
 * @param[in] offset Number of columns of A and rows of B to skip (for triangular matrices).
 * @returns 0 on success.
 */
int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha,
          FLOAT *restrict ba, FLOAT *restrict bb,
          FLOAT *restrict C, BLASLONG ldc
#ifdef TRMMKERNEL
          , BLASLONG offset
#endif
          )
{
        if ((bm == 0) || (bn == 0) || (bk == 0) || (alpha == ZERO))
                return 0;

        /*
         * The interface code allocates buffers for ba and bb at page
         * granularity (i.e., using mmap(MAP_ANONYMOUS)), so enable the
         * compiler to make use of that fact in vector load operations.
         */
        ba = __builtin_assume_aligned(ba, 16);
        bb = __builtin_assume_aligned(bb, 16);

        /*
         * Use offset and off even when compiled as SGEMMKERNEL to simplify
         * function signatures and function calls.
         */
#ifndef TRMMKERNEL
        BLASLONG const offset = 0;
#endif

        /*
         * Partition B and C into blocks of n_r (unroll_n) columns, called B_i
         * and C_i. For each partition, calculate C_i += alpha * (A * B_i).
         *
         * For remaining columns that do not fill up a block of n_r,
         * iteratively use smaller block sizes of powers of 2.
         */
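        /*
         * Example: with unroll_n == 4 and bn == 7, the loops below emit
         * column blocks of 4, 2, and 1 columns.
         */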
        BLASLONG col = 0;
        for (BLASLONG block_size = unroll_n; block_size > 0; block_size /= 2)
                for (; bn - col >= block_size; col += block_size)
                        GEBP_column_block(block_size, col, ba, bk, bb, bm, C,
                                          ldc, alpha, offset);

        return 0;
}