OSchip
/
OpenBLAS

/*
 * Copyright (c) IBM Corporation 2020.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *    1. Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *
 *    2. Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *    3. Neither the name of the OpenBLAS project nor the names of
 *       its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include "common.h"
#include <vecintrin.h>

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#ifdef COMPLEX
#error "Handling for complex numbers is not supported in this kernel"
#endif

#ifdef DOUBLE
#define UNROLL_M DGEMM_DEFAULT_UNROLL_M
#define UNROLL_N DGEMM_DEFAULT_UNROLL_N
#else
#define UNROLL_M SGEMM_DEFAULT_UNROLL_M
#define UNROLL_N SGEMM_DEFAULT_UNROLL_N
#endif

static const size_t unroll_m = UNROLL_M;
static const size_t unroll_n = UNROLL_N;

/* Handling of triangular matrices */
#ifdef TRMMKERNEL
static const bool trmm = true;
static const bool left =
#ifdef LEFT
	true;
#else
	false;
#endif

static const bool backwards =
#if defined(LEFT) != defined(TRANSA)
	true;
#else
	false;
#endif

#else
static const bool trmm = false;
static const bool left = false;
static const bool backwards = false;
#endif /* TRMMKERNEL */

/*
 * Background:
 *
 * The algorithm of GotoBLAS / OpenBLAS breaks down the matrix multiplication
 * problem by splitting all matrices into partitions multiple times, so that the
 * submatrices fit into the L1 or L2 caches. As a result, each multiplication of
 * submatrices can stream data fast from L1 and L2 caches. Inbetween, it copies
 * and rearranges the submatrices to enable contiguous memory accesses to
 * improve locality in both caches and TLBs.
 *
 * At the heart of the algorithm is this kernel, which multiplies, a "Block
 * matrix" A (small dimensions) with a "Panel matrix" B (number of rows is
 * small) and adds the result into a "Panel matrix" C; GotoBLAS calls this
 * operation GEBP. This kernel further partitions GEBP twice, such that (1)
 * submatrices of C and B fit into the L1 caches (GEBP_column_block) and (2) a
 * block of C fits into the registers, while multiplying panels from A and B
 * streamed from the L2 and L1 cache, respectively (GEBP_block).
 *
 *
 * Algorithm GEBP(A, B, C, m, n, k, alpha):
 *
 * The problem is calculating C += alpha * (A * B)
 * C is an m x n matrix, A is an m x k matrix, B is an k x n matrix.
 *
 * - C is in column-major-order, with an offset of ldc to the element in the
 *   next column (same row).
 * - A is in row-major-order yet stores SGEMM_UNROLL_M elements of each column
 *   contiguously while walking along rows.
 * - B is in column-major-order but packs SGEMM_UNROLL_N elements of a row
 *   contiguously.
 * If the numbers of rows and columns are not multiples of SGEMM_UNROLL_M or
 * SGEMM_UNROLL_N, the remaining elements are arranged in blocks with power-of-2
 * dimensions (e.g., 5 remaining columns would be in a block-of-4 and a
 * block-of-1).
 *
 * Note that packing A and B into that form is taken care of by the caller in
 * driver/level3/level3.c (actually done by "copy kernels").
 *
 * Steps:
 * - Partition C and B into blocks of n_r (SGEMM_UNROLL_N) columns, C_j and B_j.
 *   Now, B_j should fit into the L1 cache.
 * - For each partition, calculate C_j += alpha * (A * B_j) by
 *     (1) Calculate C_aux := A * B_j (see below)
 *     (2) unpack C_j = C_j + alpha * C_aux
 *
 *
 * Algorithm for Calculating C_aux:
 *
 * - Further partition C_aux and A into groups of m_r (SGEMM_UNROLL_M) rows,
 *   such that the m_r x n_r-submatrix of C_aux can be held in registers. Each
 *   submatrix of C_aux can be calculated independently, and the registers are
 *   added back into C_j.
 *
 * - For each row-block of C_aux:
 *   (uses a row block of A and full B_j)
 *    - stream over all columns of A, multiply with elements from B and
 *      accumulate in registers. (use different inner-kernels to exploit
 *      vectorization for varying block sizes)
 *    - add alpha * row block of C_aux back into C_j.
 *
 * Note that there are additional mechanics for handling triangular matrices,
 * calculating B := alpha (A * B) where either of the matrices A or B can be
 * triangular. In case of A, the macro "LEFT" is defined. In addition, A can
 * optionally be transposed.
 * The code effectively skips an "offset" number of columns in A and rows of B
 * in each block, to save unnecessary work by exploiting the triangular nature.
 * To handle all cases, the code discerns (1) a "left" mode when A is triangular
 * and (2) "forward" / "backwards" modes where only the first "offset"
 * columns/rows of A/B are used or where the first "offset" columns/rows are
 * skipped, respectively.
 *
 * Reference:
 *
 * The summary above is based on staring at various kernel implementations and:
 * K. Goto and R. A. Van de Geijn, Anatomy of High-Performance Matrix
 * Multiplication, in ACM Transactions of Mathematical Software, Vol.  34, No.
 * 3, May 2008.
 */

#define VLEN_BYTES 16
#define VLEN_FLOATS (VLEN_BYTES / sizeof(FLOAT))

typedef FLOAT vector_float __attribute__ ((vector_size (16)));

/**
 * Load a vector into register, and hint on 8-byte alignment to improve
 * performance. gcc-9 and newer will create these hints by itself. For older
 * compiler versions, use inline assembly to explicitly express the hint.
 * Provide explicit hex encoding to cater for binutils versions that do not know
 * about vector-load with alignment hints yet.
 *
 * Note that, for block sizes where we apply vectorization, vectors in A will
 * always be 8-byte aligned.
 */
static inline vector_float vec_load_hinted(FLOAT const *restrict a) {
	vector_float const *restrict addr = (vector_float const *restrict)a;
	vector_float y;

#if __GNUC__ < 9
	// hex-encode vl %[out],%[addr],3
	asm(".insn vrx,0xe70000003006,%[out],%[addr],3"
	    : [ out ] "=v"(y)
	    : [ addr ] "R"(*addr));
#else
	y = *addr;
#endif

	return y;
}

/**
 * Calculate for a row-block in C_i of size ROWSxCOLS using vector intrinsics.
 *
 * @param[in] 	A	Pointer current block of input matrix A.
 * @param[in]	k	Number of columns in A.
 * @param[in]	B	Pointer current block of input matrix B.
 * @param[inout] C	Pointer current block of output matrix C.
 * @param[in]	ldc	Offset between elements in adjacent columns in C.
 * @param[in]	alpha	Scalar factor.
 */
#define VECTOR_BLOCK(ROWS, COLS)                                              \
	static inline void GEBP_block_##ROWS##_##COLS(                        \
	    FLOAT const *restrict A, BLASLONG bk, FLOAT const *restrict B,    \
	    FLOAT *restrict C, BLASLONG ldc, FLOAT alpha) {                   \
		_Static_assert(                                               \
		    ROWS % VLEN_FLOATS == 0,                                  \
		    "rows in block must be multiples of vector length");      \
		vector_float Caux[ROWS / VLEN_FLOATS][COLS];                  \
                                                                              \
		for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) {           \
			vector_float A0 =                                     \
			    vec_load_hinted(A + i * VLEN_FLOATS);             \
			for (BLASLONG j = 0; j < COLS; j++)                   \
				Caux[i][j] = A0 * B[j];                       \
		}                                                             \
                                                                              \
		/*                                                            \
		 * Stream over the row-block of A, which is packed            \
		 * column-by-column, multiply by coefficients in B and add up \
		 * into temporaries Caux (which the compiler will hold in     \
		 * registers). Vectorization: Multiply column vectors from A  \
		 * with scalars from B and add up in column vectors of Caux.  \
		 * That equates to unrolling the loop over rows (in i) and    \
		 * executing each unrolled iteration as a vector element.     \
		 */                                                           \
		for (BLASLONG k = 1; k < bk; k++) {                           \
			for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) {   \
				vector_float Ak = vec_load_hinted(            \
				    A + i * VLEN_FLOATS + k * ROWS);          \
                                                                              \
				for (BLASLONG j = 0; j < COLS; j++)           \
					Caux[i][j] += Ak * B[j + k * COLS];   \
			}                                                     \
		}                                                             \
                                                                              \
		/*                                                            \
		 * Unpack row-block of C_aux into outer C_i, multiply by      \
		 * alpha and add up.                                          \
		 */                                                           \
		for (BLASLONG j = 0; j < COLS; j++) {                         \
			for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) {   \
				vector_float *C_ij =                          \
				    (vector_float *)(C + i * VLEN_FLOATS +    \
						     j * ldc);                \
				if (trmm) {                                   \
					*C_ij = alpha * Caux[i][j];           \
				} else {                                      \
					*C_ij += alpha * Caux[i][j];          \
				}                                             \
			}                                                     \
		}                                                             \
	}


#if UNROLL_M == 16
VECTOR_BLOCK(16, 2)
VECTOR_BLOCK(16, 1)
#endif
#if UNROLL_N == 8
VECTOR_BLOCK(8, 8)
VECTOR_BLOCK(4, 8)
#endif
#ifndef DOUBLE
VECTOR_BLOCK(8, 4)
#endif
VECTOR_BLOCK(8, 2)
VECTOR_BLOCK(8, 1)
VECTOR_BLOCK(4, 4)
VECTOR_BLOCK(4, 2)
VECTOR_BLOCK(4, 1)

#ifdef DOUBLE
VECTOR_BLOCK(2, 4)
VECTOR_BLOCK(2, 2)
VECTOR_BLOCK(2, 1)
#endif


/**
 * Calculate a row-block that fits 4x4 vector registers using a loop
 * unrolled-by-2 with explicit interleaving to better overlap loads and
 * computation.
 * This function fits 16x4 blocks for SGEMM and 8x4 blocks for DGEMM.
 */
#ifdef DOUBLE
static inline void GEBP_block_8_4(
#else // float
static inline void GEBP_block_16_4(
#endif
    FLOAT const *restrict A, BLASLONG bk, FLOAT const *restrict B,
    FLOAT *restrict C, BLASLONG ldc, FLOAT alpha) {
#define VEC_ROWS 4
#define VEC_COLS 4
#define ROWS VEC_ROWS * VLEN_FLOATS
#define COLS (VEC_COLS)

	/*
	 * Hold intermediate results in vector registers.
	 * Since we need to force the compiler's hand in places, we need to use
	 * individual variables in contrast to the generic implementation's
	 * arrays.
	 */
#define INIT_ROW_OF_C(ROW)                                        \
    vector_float A##ROW = vec_load_hinted(A + ROW * VLEN_FLOATS); \
    vector_float C_##ROW##_0 = A##ROW * B[0];                     \
    vector_float C_##ROW##_1 = A##ROW * B[1];                     \
    vector_float C_##ROW##_2 = A##ROW * B[2];                     \
    vector_float C_##ROW##_3 = A##ROW * B[3];

	INIT_ROW_OF_C(0)
	INIT_ROW_OF_C(1)
	INIT_ROW_OF_C(2)
	INIT_ROW_OF_C(3)
#undef INIT_ROW_OF_C

	if (bk > 1) {
		BLASLONG k = 1;
		vector_float Ak[VEC_ROWS], Aknext[VEC_ROWS];
		vector_float Bk[VEC_COLS], Bknext[VEC_COLS];

		/*
		 * Note that in several places, we enforce an instruction
		 * sequence that we identified empirically by utilizing dummy
		 * asm statements.
		 */

		for (BLASLONG j = 0; j < VEC_COLS; j++)
			Bk[j] = vec_splats(B[j + k * COLS]);
		asm("");

		for (BLASLONG i = 0; i < VEC_ROWS; i++)
			Ak[i] = vec_load_hinted(A + i * VLEN_FLOATS + k * ROWS);

		for (; k < (bk - 2); k += 2) {
			/*
			 * Load inputs for (k+1) into registers.
			 * Loading from B first is advantageous.
			 */
			for (BLASLONG j = 0; j < VEC_COLS; j++)
				Bknext[j] = vec_splats(B[j + (k + 1) * COLS]);
			asm("");
			for (BLASLONG i = 0; i < VEC_ROWS; i++)
				Aknext[i] = vec_load_hinted(A + i * VLEN_FLOATS +
						(k + 1) * ROWS);

			/*
			 * To achieve better instruction-level parallelism,
			 * make sure to first load input data for (k+1) before
			 * initiating compute for k. We enforce that ordering
			 * with a pseudo asm statement.
			 * Note that we need to massage this particular "barrier"
			 * depending on the gcc version.
			 */
#if __GNUC__ > 7
#define BARRIER_READ_BEFORE_COMPUTE(SUFFIX)                                    \
    do {                                                                       \
	asm(""                                                                 \
	    : "+v"(C_0_0), "+v"(C_0_1), "+v"(C_0_2), "+v"(C_0_3), "+v"(C_1_0), \
	      "+v"(C_1_1), "+v"(C_1_2), "+v"(C_1_3)                            \
	    : "v"(B##SUFFIX[0]), "v"(B##SUFFIX[1]), "v"(B##SUFFIX[2]),         \
	      "v"(B##SUFFIX[3]), "v"(A##SUFFIX[0]), "v"(A##SUFFIX[1]),         \
	      "v"(A##SUFFIX[2]), "v"(A##SUFFIX[3]));                           \
	asm(""                                                                 \
	    : "+v"(C_2_0), "+v"(C_2_1), "+v"(C_2_2), "+v"(C_2_3), "+v"(C_3_0), \
	      "+v"(C_3_1), "+v"(C_3_2), "+v"(C_3_3)                            \
	    : "v"(B##SUFFIX[0]), "v"(B##SUFFIX[1]), "v"(B##SUFFIX[2]),         \
	      "v"(B##SUFFIX[3]), "v"(A##SUFFIX[0]), "v"(A##SUFFIX[1]),         \
	      "v"(A##SUFFIX[2]), "v"(A##SUFFIX[3]));                           \
    } while (0)
#else // __GNUC__ <= 7
#define BARRIER_READ_BEFORE_COMPUTE(SUFFIX) \
    do {                                    \
	asm("");                            \
    } while (0)
#endif

			BARRIER_READ_BEFORE_COMPUTE(knext);

			/* Compute for (k) */
			C_0_0 += Ak[0] * Bk[0];
			C_1_0 += Ak[1] * Bk[0];
			C_2_0 += Ak[2] * Bk[0];
			C_3_0 += Ak[3] * Bk[0];

			C_0_1 += Ak[0] * Bk[1];
			C_1_1 += Ak[1] * Bk[1];
			C_2_1 += Ak[2] * Bk[1];
			C_3_1 += Ak[3] * Bk[1];

			C_0_2 += Ak[0] * Bk[2];
			C_1_2 += Ak[1] * Bk[2];
			C_2_2 += Ak[2] * Bk[2];
			C_3_2 += Ak[3] * Bk[2];

			C_0_3 += Ak[0] * Bk[3];
			C_1_3 += Ak[1] * Bk[3];
			C_2_3 += Ak[2] * Bk[3];
			C_3_3 += Ak[3] * Bk[3];

			asm("");

			/*
			 * Load inputs for (k+2) into registers.
			 * First load from B.
			 */
			for (BLASLONG j = 0; j < VEC_COLS; j++)
				Bk[j] = vec_splats(B[j + (k + 2) * COLS]);
			asm("");
			for (BLASLONG i = 0; i < VEC_ROWS; i++)
				Ak[i] = vec_load_hinted(A + i * VLEN_FLOATS + (k + 2) * ROWS);

			/*
			 * As above, make sure to first schedule the loads for (k+2)
			 * before compute for (k+1).
			 */
			BARRIER_READ_BEFORE_COMPUTE(k);

			/* Compute on (k+1) */
			C_0_0 += Aknext[0] * Bknext[0];
			C_1_0 += Aknext[1] * Bknext[0];
			C_2_0 += Aknext[2] * Bknext[0];
			C_3_0 += Aknext[3] * Bknext[0];

			C_0_1 += Aknext[0] * Bknext[1];
			C_1_1 += Aknext[1] * Bknext[1];
			C_2_1 += Aknext[2] * Bknext[1];
			C_3_1 += Aknext[3] * Bknext[1];

			C_0_2 += Aknext[0] * Bknext[2];
			C_1_2 += Aknext[1] * Bknext[2];
			C_2_2 += Aknext[2] * Bknext[2];
			C_3_2 += Aknext[3] * Bknext[2];

			C_0_3 += Aknext[0] * Bknext[3];
			C_1_3 += Aknext[1] * Bknext[3];
			C_2_3 += Aknext[2] * Bknext[3];
			C_3_3 += Aknext[3] * Bknext[3];
		}

		/* Wrapup remaining k's */
		for (; k < bk; k++) {
			vector_float Ak;

#define COMPUTE_WRAPUP_ROW(ROW)                             \
    Ak = vec_load_hinted(A + ROW * VLEN_FLOATS + k * ROWS); \
    C_##ROW##_0 += Ak * B[0 + k * COLS];                    \
    C_##ROW##_1 += Ak * B[1 + k * COLS];                    \
    C_##ROW##_2 += Ak * B[2 + k * COLS];                    \
    C_##ROW##_3 += Ak * B[3 + k * COLS];

			COMPUTE_WRAPUP_ROW(0)
			COMPUTE_WRAPUP_ROW(1)
			COMPUTE_WRAPUP_ROW(2)
			COMPUTE_WRAPUP_ROW(3)
#undef COMPUTE_WRAPUP_ROW
		}
	}

	/*
	 * Unpack row-block of C_aux into outer C_i, multiply by
	 * alpha and add up (or assign for TRMM).
	 */
#define WRITE_BACK_C(ROW, COL)                                   \
    do {                                                         \
	vector_float *Cij =                                      \
	    (vector_float *)(C + ROW * VLEN_FLOATS + COL * ldc); \
	if (trmm) {                                              \
	    *Cij = alpha * C_##ROW##_##COL;                   \
	} else {                                                 \
	    *Cij += alpha * C_##ROW##_##COL;                  \
	}                                                        \
    } while (0)

	WRITE_BACK_C(0, 0); WRITE_BACK_C(0, 1); WRITE_BACK_C(0, 2); WRITE_BACK_C(0, 3);
	WRITE_BACK_C(1, 0); WRITE_BACK_C(1, 1); WRITE_BACK_C(1, 2); WRITE_BACK_C(1, 3);
	WRITE_BACK_C(2, 0); WRITE_BACK_C(2, 1); WRITE_BACK_C(2, 2); WRITE_BACK_C(2, 3);
	WRITE_BACK_C(3, 0); WRITE_BACK_C(3, 1); WRITE_BACK_C(3, 2); WRITE_BACK_C(3, 3);
#undef WRITE_BACK_C

#undef ROWS
#undef VEC_ROWS
#undef COLS
#undef VEC_COLS
#undef BARRIER_READ_BEFORE_COMPUTE
}

/**
 * Handle calculation for row blocks in C_i of any size by dispatching into
 * macro-defined (inline) functions or by deferring to a simple generic
 * implementation. Note that the compiler can remove this awkward-looking
 * dispatching code while inlineing.
 *
 * @param[in]	m	Number of rows in block C_i.
 * @param[in]	n	Number of columns in block C_i.
 * @param[in]	first_row Index of first row of the block C_i (relative to C).
 * @param[in]	A	Pointer to input matrix A (note: all of it).
 * @param[in]	k	Number of columns in A and rows in B.
 * @param[in]	B	Pointer to current column block (panel) of input matrix B.
 * @param[inout] C	Pointer to current column block (panel) of output matrix C.
 * @param[in]	ldc	Offset between elements in adjacent columns in C.
 * @param[in]	alpha	Scalar factor.
 * @param[in]	offset  Number of columns of A and rows of B to skip (for triangular matrices).
 * @param[in]	off	Running offset for handling triangular matrices.
 */
static inline void GEBP_block(BLASLONG m, BLASLONG n,
		       BLASLONG first_row,
		       const FLOAT * restrict A, BLASLONG k,
		       const FLOAT * restrict B,
		       FLOAT *restrict C, BLASLONG ldc,
		       FLOAT alpha,
		       BLASLONG offset, BLASLONG off)
{
	if (trmm && left)
		off = offset + first_row;

	A += first_row * k;
	C += first_row;

	if (trmm) {
		if (backwards) {
			A += off * m;
			B += off * n;
			k -= off;
		} else {
			if (left) {
				k = off + m;
			} else {
				k = off + n;
			}
		}
	}

#define BLOCK(bm, bn)                                           \
	if (m == bm && n == bn) {                               \
		GEBP_block_##bm##_##bn(A, k, B, C, ldc, alpha); \
		return;                                         \
	}

#if UNROLL_M == 16
	BLOCK(16, 4); BLOCK(16, 2); BLOCK(16, 1);
#endif
#if UNROLL_N == 8
	BLOCK(8, 8); BLOCK(4, 8);
#endif
	BLOCK(8, 4); BLOCK(8, 2); BLOCK(8, 1);
	BLOCK(4, 4); BLOCK(4, 2); BLOCK(4, 1);

	#ifdef DOUBLE
	BLOCK(2, 4);
	BLOCK(2, 2);
	#endif

#undef BLOCK

	/* simple implementation for smaller block sizes: */
	FLOAT Caux[m][n] __attribute__ ((aligned (16)));

	/*
	 * Peel off first iteration (i.e., column of A) for initializing Caux
	 */
	for (BLASLONG i = 0; i < m; i++)
		for (BLASLONG j = 0; j < n; j++)
			Caux[i][j] = A[i] * B[j];

	for (BLASLONG kk = 1; kk < k; kk++)
		for (BLASLONG i = 0; i < m; i++)
			for (BLASLONG j = 0; j < n; j++)
				Caux[i][j] += A[i + kk * m] * B[j + kk * n];

	for (BLASLONG i = 0; i < m; i++)
		for (BLASLONG j = 0; j < n; j++)
			if (trmm) {
				C[i + j * ldc] = alpha * Caux[i][j];
			} else {
				C[i + j * ldc] += alpha * Caux[i][j];
			}
}

/**
 * Handle a column block (panel) of C and B while calculating C += alpha(A * B).
 *
 * @param[in]	num_cols	Number of columns in the block (in C and B).
 * @param[in]	first_col	First column of the current block (in C and B).
 * @param[in]	A	Pointer to input matrix A.
 * @param[in]	bk	Number of columns in A and rows in B.
 * @param[in]	B	Pointer to input matrix B (note: all of it).
 * @param[in]	bm	Number of rows in C and A.
 * @param[inout] C	Pointer to output matrix C (note: all of it).
 * @param[in]	ldc	Offset between elements in adjacent columns in C.
 * @param[in]	alpha	Scalar factor.
 * @param[in]	offset	Number of columns of A and rows of B to skip (for triangular matrices).
 */
static inline void GEBP_column_block(BLASLONG num_cols, BLASLONG first_col,
			const FLOAT *restrict A, BLASLONG bk,
			const FLOAT *restrict B, BLASLONG bm,
			FLOAT *restrict C, BLASLONG ldc,
			FLOAT alpha,
			BLASLONG const offset) {

	FLOAT *restrict C_i = C + first_col * ldc;
	/*
	 * B is in column-order with n_r packed row elements, which does
	 * not matter -- we always move in full such blocks of
	 * column*pack
	 */
	const FLOAT *restrict B_i = B + first_col * bk;

	BLASLONG off = 0;
	if (trmm) {
		if (left) {
			off = offset;
		} else {
			off = -offset + first_col;
		}
	}

	/*
	 * Calculate C_aux := A * B_j
	 * then unpack C_i += alpha * C_aux.
	 *
	 * For that purpose, further partition C_aux and A into blocks
	 * of m_r (unroll_m) rows, or powers-of-2 if smaller.
	 */
	BLASLONG row = 0;
	for (BLASLONG block_size = unroll_m; block_size > 0; block_size /= 2)
		for (; bm - row >= block_size; row += block_size)
			GEBP_block(block_size, num_cols, row, A, bk, B_i, C_i,
				   ldc, alpha, offset, off);
}

/**
 * Inner kernel for matrix-matrix multiplication. C += alpha (A * B)
 * where C is an m-by-n matrix, A is m-by-k and B is k-by-n. Note that A, B, and
 * C are pointers to submatrices of the actual matrices.
 *
 * For triangular matrix multiplication, calculate B := alpha (A * B) where A
 * or B can be triangular (in case of A, the macro LEFT will be defined).
 *
 * @param[in]	bm	Number of rows in C and A.
 * @param[in]	bn	Number of columns in C and B.
 * @param[in]	bk	Number of columns in A and rows in B.
 * @param[in]	alpha	Scalar factor.
 * @param[in]	ba	Pointer to input matrix A.
 * @param[in]	bb	Pointer to input matrix B.
 * @param[inout] C	Pointer to output matrix C.
 * @param[in]	ldc	Offset between elements in adjacent columns in C.
 * @param[in]	offset	Number of columns of A and rows of B to skip (for triangular matrices).
 * @returns 0 on success.
 */
int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha,
	  FLOAT *restrict ba, FLOAT *restrict bb,
	  FLOAT *restrict C, BLASLONG ldc
#ifdef TRMMKERNEL
	  , BLASLONG offset
#endif
	  )
{
	if ( (bm == 0) || (bn == 0) || (bk == 0) || (alpha == ZERO))
		return 0;

	/*
	 * interface code allocates buffers for ba and bb at page
	 * granularity (i.e., using mmap(MAP_ANONYMOUS), so enable the compiler
	 * to make use of the fact in vector load operations.
	 */
	ba = __builtin_assume_aligned(ba, 16);
	bb = __builtin_assume_aligned(bb, 16);

	/*
	 * Use offset and off even when compiled as SGEMMKERNEL to simplify
	 * function signatures and function calls.
	 */
#ifndef TRMMKERNEL
	BLASLONG const offset = 0;
#endif

	/*
	 * Partition B and C into blocks of n_r (unroll_n) columns, called B_i
	 * and C_i. For each partition, calculate C_i += alpha * (A * B_j).
	 *
	 * For remaining columns that do not fill up a block of n_r, iteratively
	 * use smaller block sizes of powers of 2.
	 */
	BLASLONG col = 0;
	for (BLASLONG block_size = unroll_n; block_size > 0; block_size /= 2)
		for (; bn - col >= block_size; col += block_size)
			GEBP_column_block(block_size, col, ba, bk, bb, bm, C, ldc, alpha, offset);

   return 0;
}