|
|
@@ -64,419 +64,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * INIT32x8: zero all 16 AVX-512 accumulators of the 32x8 micro-kernel.
 * row0-row7 accumulate the first 16 rows of the A panel, row0b-row7b
 * the next 16 (loaded through AO and AOb respectively in KERNEL32x8_SUB).
 *
 * Fix: the original omitted row7, leaving it uninitialized even though
 * KERNEL32x8_SUB accumulates into it and SAVE32x8 stores it.
 */
#define INIT32x8() \
	row0 = _mm512_setzero_ps(); \
	row1 = _mm512_setzero_ps(); \
	row2 = _mm512_setzero_ps(); \
	row3 = _mm512_setzero_ps(); \
	row4 = _mm512_setzero_ps(); \
	row5 = _mm512_setzero_ps(); \
	row6 = _mm512_setzero_ps(); \
	row7 = _mm512_setzero_ps(); \
	row0b = _mm512_setzero_ps(); \
	row1b = _mm512_setzero_ps(); \
	row2b = _mm512_setzero_ps(); \
	row3b = _mm512_setzero_ps(); \
	row4b = _mm512_setzero_ps(); \
	row5b = _mm512_setzero_ps(); \
	row6b = _mm512_setzero_ps(); \
	row7b = _mm512_setzero_ps();
|
|
|
/*
 * KERNEL32x8_SUB: one k-step of the 32x8 micro-kernel.
 * Loads 32 floats of the A panel (16 via AO, 16 via AOb), broadcasts each
 * of the 8 current B values, and accumulates the 16 outer-product partial
 * sums (rowN for A lanes 0-15, rowNb for lanes 16-31).
 * Advances BO by 8 and both A pointers by 16 for the next k iteration.
 */
#define KERNEL32x8_SUB() \
	zmm0 = _mm512_loadu_ps(AO); \
	zmm0b = _mm512_loadu_ps(AOb); \
	zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 0)); \
	zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 1)); \
	row0 += zmm0 * zmm2; \
	row1 += zmm0 * zmm3; \
	row0b += zmm0b * zmm2; \
	row1b += zmm0b * zmm3; \
	zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 2)); \
	zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 3)); \
	row2 += zmm0 * zmm2; \
	row3 += zmm0 * zmm3; \
	row2b += zmm0b * zmm2; \
	row3b += zmm0b * zmm3; \
	zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 4)); \
	zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 5)); \
	row4 += zmm0 * zmm2; \
	row5 += zmm0 * zmm3; \
	row4b += zmm0b * zmm2; \
	row5b += zmm0b * zmm3; \
	zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 6)); \
	zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 7)); \
	row6 += zmm0 * zmm2; \
	row7 += zmm0 * zmm3; \
	row6b += zmm0b * zmm2; \
	row7b += zmm0b * zmm3; \
	BO += 8; \
	AO += 16; \
	AOb += 16;
|
|
|
|
/*
 * SAVE32x8: write back the 32x8 tile, C += ALPHA * accumulated(A*B).
 * Each accumulator is scaled by ALPHA, the existing C vector at
 * CO1 + j * ldc is added, and the result is stored back. The first 16
 * columns of the tile (row0-row7) are written at offset 0, the second 16
 * (row0b-row7b) at offset +16. ldc is the stride (in floats) between
 * consecutive output vectors of C.
 */
#define SAVE32x8(ALPHA) \
	zmm0 = _mm512_set1_ps(ALPHA); \
	row0 *= zmm0; \
	row1 *= zmm0; \
	row2 *= zmm0; \
	row3 *= zmm0; \
	row4 *= zmm0; \
	row5 *= zmm0; \
	row6 *= zmm0; \
	row7 *= zmm0; \
	row0b *= zmm0; \
	row1b *= zmm0; \
	row2b *= zmm0; \
	row3b *= zmm0; \
	row4b *= zmm0; \
	row5b *= zmm0; \
	row6b *= zmm0; \
	row7b *= zmm0; \
	row0 += _mm512_loadu_ps(CO1 + 0 * ldc); \
	row1 += _mm512_loadu_ps(CO1 + 1 * ldc); \
	row2 += _mm512_loadu_ps(CO1 + 2 * ldc); \
	row3 += _mm512_loadu_ps(CO1 + 3 * ldc); \
	row4 += _mm512_loadu_ps(CO1 + 4 * ldc); \
	row5 += _mm512_loadu_ps(CO1 + 5 * ldc); \
	row6 += _mm512_loadu_ps(CO1 + 6 * ldc); \
	row7 += _mm512_loadu_ps(CO1 + 7 * ldc); \
	_mm512_storeu_ps(CO1 + 0 * ldc, row0); \
	_mm512_storeu_ps(CO1 + 1 * ldc, row1); \
	_mm512_storeu_ps(CO1 + 2 * ldc, row2); \
	_mm512_storeu_ps(CO1 + 3 * ldc, row3); \
	_mm512_storeu_ps(CO1 + 4 * ldc, row4); \
	_mm512_storeu_ps(CO1 + 5 * ldc, row5); \
	_mm512_storeu_ps(CO1 + 6 * ldc, row6); \
	_mm512_storeu_ps(CO1 + 7 * ldc, row7); \
	row0b += _mm512_loadu_ps(CO1 + 0 * ldc + 16); \
	row1b += _mm512_loadu_ps(CO1 + 1 * ldc + 16); \
	row2b += _mm512_loadu_ps(CO1 + 2 * ldc + 16); \
	row3b += _mm512_loadu_ps(CO1 + 3 * ldc + 16); \
	row4b += _mm512_loadu_ps(CO1 + 4 * ldc + 16); \
	row5b += _mm512_loadu_ps(CO1 + 5 * ldc + 16); \
	row6b += _mm512_loadu_ps(CO1 + 6 * ldc + 16); \
	row7b += _mm512_loadu_ps(CO1 + 7 * ldc + 16); \
	_mm512_storeu_ps(CO1 + 0 * ldc + 16, row0b); \
	_mm512_storeu_ps(CO1 + 1 * ldc + 16, row1b); \
	_mm512_storeu_ps(CO1 + 2 * ldc + 16, row2b); \
	_mm512_storeu_ps(CO1 + 3 * ldc + 16, row3b); \
	_mm512_storeu_ps(CO1 + 4 * ldc + 16, row4b); \
	_mm512_storeu_ps(CO1 + 5 * ldc + 16, row5b); \
	_mm512_storeu_ps(CO1 + 6 * ldc + 16, row6b); \
	_mm512_storeu_ps(CO1 + 7 * ldc + 16, row7b); \

|
|
|
|
/*
 * INIT16x8: zero the 8 AVX-512 accumulators (row0-row7) of the
 * 16x8 micro-kernel before the k loop.
 */
#define INIT16x8() \
	row0 = _mm512_setzero_ps(); \
	row1 = _mm512_setzero_ps(); \
	row2 = _mm512_setzero_ps(); \
	row3 = _mm512_setzero_ps(); \
	row4 = _mm512_setzero_ps(); \
	row5 = _mm512_setzero_ps(); \
	row6 = _mm512_setzero_ps(); \
	row7 = _mm512_setzero_ps(); \

|
|
|
/*
 * KERNEL16x8_SUB: one k-step of the 16x8 micro-kernel.
 * Loads 16 floats of A, broadcasts each of the 8 current B values and
 * accumulates the 8 outer-product partial sums; then advances BO by 8
 * and AO by 16.
 */
#define KERNEL16x8_SUB() \
	zmm0 = _mm512_loadu_ps(AO); \
	zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 0)); \
	zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 1)); \
	row0 += zmm0 * zmm2; \
	row1 += zmm0 * zmm3; \
	zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 2)); \
	zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 3)); \
	row2 += zmm0 * zmm2; \
	row3 += zmm0 * zmm3; \
	zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 4)); \
	zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 5)); \
	row4 += zmm0 * zmm2; \
	row5 += zmm0 * zmm3; \
	zmm2 = _mm512_broadcastss_ps(_mm_load_ss(BO + 6)); \
	zmm3 = _mm512_broadcastss_ps(_mm_load_ss(BO + 7)); \
	row6 += zmm0 * zmm2; \
	row7 += zmm0 * zmm3; \
	BO += 8; \
	AO += 16;
|
|
|
|
|
/*
 * SAVE16x8: write back the 16x8 tile, C += ALPHA * accumulated(A*B).
 * Scales each accumulator by ALPHA, adds the existing 16-float C vector
 * at CO1 + j * ldc, and stores the sum back in place.
 */
#define SAVE16x8(ALPHA) \
	zmm0 = _mm512_set1_ps(ALPHA); \
	row0 *= zmm0; \
	row1 *= zmm0; \
	row2 *= zmm0; \
	row3 *= zmm0; \
	row4 *= zmm0; \
	row5 *= zmm0; \
	row6 *= zmm0; \
	row7 *= zmm0; \
	row0 += _mm512_loadu_ps(CO1 + 0 * ldc); \
	row1 += _mm512_loadu_ps(CO1 + 1 * ldc); \
	row2 += _mm512_loadu_ps(CO1 + 2 * ldc); \
	row3 += _mm512_loadu_ps(CO1 + 3 * ldc); \
	row4 += _mm512_loadu_ps(CO1 + 4 * ldc); \
	row5 += _mm512_loadu_ps(CO1 + 5 * ldc); \
	row6 += _mm512_loadu_ps(CO1 + 6 * ldc); \
	row7 += _mm512_loadu_ps(CO1 + 7 * ldc); \
	_mm512_storeu_ps(CO1 + 0 * ldc, row0); \
	_mm512_storeu_ps(CO1 + 1 * ldc, row1); \
	_mm512_storeu_ps(CO1 + 2 * ldc, row2); \
	_mm512_storeu_ps(CO1 + 3 * ldc, row3); \
	_mm512_storeu_ps(CO1 + 4 * ldc, row4); \
	_mm512_storeu_ps(CO1 + 5 * ldc, row5); \
	_mm512_storeu_ps(CO1 + 6 * ldc, row6); \
	_mm512_storeu_ps(CO1 + 7 * ldc, row7);
|
|
|
|
|
|
/*******************************************************************************************/ |
|
|
|
|
|
|
|
/*
 * INIT8x8: zero the 8 AVX2 (256-bit) accumulators of the
 * 8x8 micro-kernel before the k loop.
 */
#define INIT8x8() \
	row0 = _mm256_setzero_ps(); \
	row1 = _mm256_setzero_ps(); \
	row2 = _mm256_setzero_ps(); \
	row3 = _mm256_setzero_ps(); \
	row4 = _mm256_setzero_ps(); \
	row5 = _mm256_setzero_ps(); \
	row6 = _mm256_setzero_ps(); \
	row7 = _mm256_setzero_ps(); \

|
|
|
|
/*
 * KERNEL8x8_SUB: one k-step of the 8x8 micro-kernel (AVX2).
 * Loads 8 floats of A, broadcasts each of the 8 current B values and
 * accumulates the 8 partial sums; then advances BO and AO by 8.
 */
#define KERNEL8x8_SUB() \
	ymm0 = _mm256_loadu_ps(AO); \
	ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 0)); \
	ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 1)); \
	row0 += ymm0 * ymm2; \
	row1 += ymm0 * ymm3; \
	ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 2)); \
	ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 3)); \
	row2 += ymm0 * ymm2; \
	row3 += ymm0 * ymm3; \
	ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 4)); \
	ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 5)); \
	row4 += ymm0 * ymm2; \
	row5 += ymm0 * ymm3; \
	ymm2 = _mm256_broadcastss_ps(_mm_load_ss(BO + 6)); \
	ymm3 = _mm256_broadcastss_ps(_mm_load_ss(BO + 7)); \
	row6 += ymm0 * ymm2; \
	row7 += ymm0 * ymm3; \
	BO += 8; \
	AO += 8;
|
|
|
|
|
/*
 * SAVE8x8: write back the 8x8 tile, C += ALPHA * accumulated(A*B).
 * Scales each 256-bit accumulator by ALPHA, adds the existing 8-float
 * C vector at CO1 + j * ldc, and stores the sum back in place.
 */
#define SAVE8x8(ALPHA) \
	ymm0 = _mm256_set1_ps(ALPHA); \
	row0 *= ymm0; \
	row1 *= ymm0; \
	row2 *= ymm0; \
	row3 *= ymm0; \
	row4 *= ymm0; \
	row5 *= ymm0; \
	row6 *= ymm0; \
	row7 *= ymm0; \
	row0 += _mm256_loadu_ps(CO1 + 0 * ldc); \
	row1 += _mm256_loadu_ps(CO1 + 1 * ldc); \
	row2 += _mm256_loadu_ps(CO1 + 2 * ldc); \
	row3 += _mm256_loadu_ps(CO1 + 3 * ldc); \
	row4 += _mm256_loadu_ps(CO1 + 4 * ldc); \
	row5 += _mm256_loadu_ps(CO1 + 5 * ldc); \
	row6 += _mm256_loadu_ps(CO1 + 6 * ldc); \
	row7 += _mm256_loadu_ps(CO1 + 7 * ldc); \
	_mm256_storeu_ps(CO1 + 0 * ldc, row0); \
	_mm256_storeu_ps(CO1 + 1 * ldc, row1); \
	_mm256_storeu_ps(CO1 + 2 * ldc, row2); \
	_mm256_storeu_ps(CO1 + 3 * ldc, row3); \
	_mm256_storeu_ps(CO1 + 4 * ldc, row4); \
	_mm256_storeu_ps(CO1 + 5 * ldc, row5); \
	_mm256_storeu_ps(CO1 + 6 * ldc, row6); \
	_mm256_storeu_ps(CO1 + 7 * ldc, row7); \

|
|
|
|
|
|
|
|
/*******************************************************************************************/ |
|
|
|
|
|
|
|
/*
 * INIT4x8: zero the 8 SSE (128-bit) accumulators of the
 * 4x8 micro-kernel before the k loop.
 */
#define INIT4x8() \
	row0 = _mm_setzero_ps(); \
	row1 = _mm_setzero_ps(); \
	row2 = _mm_setzero_ps(); \
	row3 = _mm_setzero_ps(); \
	row4 = _mm_setzero_ps(); \
	row5 = _mm_setzero_ps(); \
	row6 = _mm_setzero_ps(); \
	row7 = _mm_setzero_ps(); \

|
|
|
|
|
/*
 * KERNEL4x8_SUB: one k-step of the 4x8 micro-kernel (SSE/AVX2 broadcast).
 * Loads 4 floats of A, broadcasts each of the 8 current B values and
 * accumulates the 8 partial sums; then advances BO by 8 and AO by 4.
 */
#define KERNEL4x8_SUB() \
	xmm0 = _mm_loadu_ps(AO); \
	xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 0)); \
	xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 1)); \
	row0 += xmm0 * xmm2; \
	row1 += xmm0 * xmm3; \
	xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 2)); \
	xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 3)); \
	row2 += xmm0 * xmm2; \
	row3 += xmm0 * xmm3; \
	xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 4)); \
	xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 5)); \
	row4 += xmm0 * xmm2; \
	row5 += xmm0 * xmm3; \
	xmm2 = _mm_broadcastss_ps(_mm_load_ss(BO + 6)); \
	xmm3 = _mm_broadcastss_ps(_mm_load_ss(BO + 7)); \
	row6 += xmm0 * xmm2; \
	row7 += xmm0 * xmm3; \
	BO += 8; \
	AO += 4;
|
|
|
|
|
/*
 * SAVE4x8: write back the 4x8 tile, C += ALPHA * accumulated(A*B).
 * Scales each 128-bit accumulator by ALPHA, adds the existing 4-float
 * C vector at CO1 + j * ldc, and stores the sum back in place.
 */
#define SAVE4x8(ALPHA) \
	xmm0 = _mm_set1_ps(ALPHA); \
	row0 *= xmm0; \
	row1 *= xmm0; \
	row2 *= xmm0; \
	row3 *= xmm0; \
	row4 *= xmm0; \
	row5 *= xmm0; \
	row6 *= xmm0; \
	row7 *= xmm0; \
	row0 += _mm_loadu_ps(CO1 + 0 * ldc); \
	row1 += _mm_loadu_ps(CO1 + 1 * ldc); \
	row2 += _mm_loadu_ps(CO1 + 2 * ldc); \
	row3 += _mm_loadu_ps(CO1 + 3 * ldc); \
	row4 += _mm_loadu_ps(CO1 + 4 * ldc); \
	row5 += _mm_loadu_ps(CO1 + 5 * ldc); \
	row6 += _mm_loadu_ps(CO1 + 6 * ldc); \
	row7 += _mm_loadu_ps(CO1 + 7 * ldc); \
	_mm_storeu_ps(CO1 + 0 * ldc, row0); \
	_mm_storeu_ps(CO1 + 1 * ldc, row1); \
	_mm_storeu_ps(CO1 + 2 * ldc, row2); \
	_mm_storeu_ps(CO1 + 3 * ldc, row3); \
	_mm_storeu_ps(CO1 + 4 * ldc, row4); \
	_mm_storeu_ps(CO1 + 5 * ldc, row5); \
	_mm_storeu_ps(CO1 + 6 * ldc, row6); \
	_mm_storeu_ps(CO1 + 7 * ldc, row7); \

|
|
|
|
|
/*******************************************************************************************/ |
|
|
|
|
|
|
|
/*
 * INIT2x8: zero the 16 scalar float accumulators of the 2x8
 * micro-kernel (rowNa for the first A element, rowNb for the second).
 */
#define INIT2x8() \
	row0a = row0b = 0; \
	row1a = row1b = 0; \
	row2a = row2b = 0; \
	row3a = row3b = 0; \
	row4a = row4b = 0; \
	row5a = row5b = 0; \
	row6a = row6b = 0; \
	row7a = row7b = 0; \

|
|
|
|
/*
 * KERNEL2x8_SUB: one k-step of the scalar 2x8 micro-kernel.
 * xmm0/xmm1 hold the two current A values (plain floats here despite the
 * register-style names); each pair of B values updates two accumulator
 * pairs. Advances BO by 8 and AO by 2.
 */
#define KERNEL2x8_SUB() \
	xmm0 = *(AO); \
	xmm1 = *(AO + 1); \
	xmm2 = *(BO + 0); \
	xmm3 = *(BO + 1); \
	row0a += xmm0 * xmm2; \
	row0b += xmm1 * xmm2; \
	row1a += xmm0 * xmm3; \
	row1b += xmm1 * xmm3; \
	xmm2 = *(BO + 2); \
	xmm3 = *(BO + 3); \
	row2a += xmm0 * xmm2; \
	row2b += xmm1 * xmm2; \
	row3a += xmm0 * xmm3; \
	row3b += xmm1 * xmm3; \
	xmm2 = *(BO + 4); \
	xmm3 = *(BO + 5); \
	row4a += xmm0 * xmm2; \
	row4b += xmm1 * xmm2; \
	row5a += xmm0 * xmm3; \
	row5b += xmm1 * xmm3; \
	xmm2 = *(BO + 6); \
	xmm3 = *(BO + 7); \
	row6a += xmm0 * xmm2; \
	row6b += xmm1 * xmm2; \
	row7a += xmm0 * xmm3; \
	row7b += xmm1 * xmm3; \
	BO += 8; \
	AO += 2;
|
|
|
|
|
/*
 * SAVE2x8: write back the 2x8 tile, C += ALPHA * accumulated(A*B).
 * Scales every scalar accumulator by ALPHA and adds it into the two
 * C elements of each of the 8 output positions (offsets 0 and 1 at
 * CO1 + j * ldc).
 */
#define SAVE2x8(ALPHA) \
	xmm0 = ALPHA; \
	row0a *= xmm0; \
	row0b *= xmm0; \
	row1a *= xmm0; \
	row1b *= xmm0; \
	row2a *= xmm0; \
	row2b *= xmm0; \
	row3a *= xmm0; \
	row3b *= xmm0; \
	row4a *= xmm0; \
	row4b *= xmm0; \
	row5a *= xmm0; \
	row5b *= xmm0; \
	row6a *= xmm0; \
	row6b *= xmm0; \
	row7a *= xmm0; \
	row7b *= xmm0; \
	*(CO1 + 0 * ldc + 0) += row0a; \
	*(CO1 + 0 * ldc + 1) += row0b; \
	*(CO1 + 1 * ldc + 0) += row1a; \
	*(CO1 + 1 * ldc + 1) += row1b; \
	*(CO1 + 2 * ldc + 0) += row2a; \
	*(CO1 + 2 * ldc + 1) += row2b; \
	*(CO1 + 3 * ldc + 0) += row3a; \
	*(CO1 + 3 * ldc + 1) += row3b; \
	*(CO1 + 4 * ldc + 0) += row4a; \
	*(CO1 + 4 * ldc + 1) += row4b; \
	*(CO1 + 5 * ldc + 0) += row5a; \
	*(CO1 + 5 * ldc + 1) += row5b; \
	*(CO1 + 6 * ldc + 0) += row6a; \
	*(CO1 + 6 * ldc + 1) += row6b; \
	*(CO1 + 7 * ldc + 0) += row7a; \
	*(CO1 + 7 * ldc + 1) += row7b; \

|
|
|
|
|
|
|
|
|
|
/*******************************************************************************************/ |
|
|
|
|
|
|
|
/*
 * INIT1x8: zero the 8 scalar accumulators of the 1x8 micro-kernel.
 */
#define INIT1x8() \
	row0 = row1 = row2 = row3 = row4 = row5 = row6 = row7 = 0;
|
|
|
|
/*
 * KERNEL1x8_SUB: one k-step of the scalar 1x8 micro-kernel.
 * xmm0 holds the single current A value (a plain float); each of the
 * 8 current B values updates one accumulator. Advances BO by 8 and
 * AO by 1.
 */
#define KERNEL1x8_SUB() \
	xmm0 = *(AO ); \
	xmm2 = *(BO + 0); \
	xmm3 = *(BO + 1); \
	row0 += xmm0 * xmm2; \
	row1 += xmm0 * xmm3; \
	xmm2 = *(BO + 2); \
	xmm3 = *(BO + 3); \
	row2 += xmm0 * xmm2; \
	row3 += xmm0 * xmm3; \
	xmm2 = *(BO + 4); \
	xmm3 = *(BO + 5); \
	row4 += xmm0 * xmm2; \
	row5 += xmm0 * xmm3; \
	xmm2 = *(BO + 6); \
	xmm3 = *(BO + 7); \
	row6 += xmm0 * xmm2; \
	row7 += xmm0 * xmm3; \
	BO += 8; \
	AO += 1;
|
|
|
|
|
|
/*
 * SAVE1x8: write back the 1x8 tile, C += ALPHA * accumulated(A*B).
 * Scales each scalar accumulator by ALPHA and adds it into the single
 * C element at CO1 + j * ldc.
 */
#define SAVE1x8(ALPHA) \
	xmm0 = ALPHA; \
	row0 *= xmm0; \
	row1 *= xmm0; \
	row2 *= xmm0; \
	row3 *= xmm0; \
	row4 *= xmm0; \
	row5 *= xmm0; \
	row6 *= xmm0; \
	row7 *= xmm0; \
	*(CO1 + 0 * ldc) += row0; \
	*(CO1 + 1 * ldc) += row1; \
	*(CO1 + 2 * ldc) += row2; \
	*(CO1 + 3 * ldc) += row3; \
	*(CO1 + 4 * ldc) += row4; \
	*(CO1 + 5 * ldc) += row5; \
	*(CO1 + 6 * ldc) += row6; \
	*(CO1 + 7 * ldc) += row7; \

|
|
|
|
|
|
|
|
|
|
|
@@ -1184,142 +771,6 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f |
|
|
|
return 0; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// L8_0 |
|
|
|
while (N >= 8 && 0) { |
|
|
|
float *CO1; |
|
|
|
float *AO; |
|
|
|
int i; |
|
|
|
// L8_10 |
|
|
|
CO1 = C; |
|
|
|
C += 8 * ldc; |
|
|
|
|
|
|
|
AO = A; |
|
|
|
|
|
|
|
i = m; |
|
|
|
|
|
|
|
while (i >= 32 && 0) { |
|
|
|
float *BO, *AOb; |
|
|
|
// L8_11 |
|
|
|
__m512 zmm0, zmm0b, zmm2, zmm3, row0, row1, row2, row3, row4, row5, row6, row7, row0b, row1b, row2b, row3b, row4b, row5b, row6b, row7b; |
|
|
|
BO = B; |
|
|
|
int kloop = K; |
|
|
|
AOb = AO + 16 * K; |
|
|
|
|
|
|
|
INIT32x8() |
|
|
|
|
|
|
|
while (kloop > 0) { |
|
|
|
// L12_17 |
|
|
|
KERNEL32x8_SUB() |
|
|
|
kloop--; |
|
|
|
} |
|
|
|
// L8_19 |
|
|
|
SAVE32x8(alpha) |
|
|
|
CO1 += 32; |
|
|
|
AO += 16 * K; |
|
|
|
|
|
|
|
i -= 32; |
|
|
|
} |
|
|
|
while (i >= 16) { |
|
|
|
float *BO; |
|
|
|
// L8_11 |
|
|
|
__m512 zmm0, zmm2, zmm3, row0, row1, row2, row3, row4, row5, row6, row7; |
|
|
|
BO = B; |
|
|
|
int kloop = K; |
|
|
|
|
|
|
|
INIT16x8() |
|
|
|
|
|
|
|
while (kloop > 0) { |
|
|
|
KERNEL16x8_SUB() |
|
|
|
kloop--; |
|
|
|
} |
|
|
|
SAVE16x8(alpha) |
|
|
|
CO1 += 16; |
|
|
|
|
|
|
|
i -= 16; |
|
|
|
} |
|
|
|
while (i >= 8) { |
|
|
|
float *BO; |
|
|
|
// L8_11 |
|
|
|
__m256 ymm0, ymm2, ymm3, row0, row1, row2, row3, row4, row5, row6, row7; |
|
|
|
BO = B; |
|
|
|
int kloop = K; |
|
|
|
|
|
|
|
INIT8x8() |
|
|
|
|
|
|
|
while (kloop > 0) { |
|
|
|
// L12_17 |
|
|
|
KERNEL8x8_SUB() |
|
|
|
kloop--; |
|
|
|
} |
|
|
|
// L8_19 |
|
|
|
SAVE8x8(alpha) |
|
|
|
CO1 += 8; |
|
|
|
|
|
|
|
i -= 8; |
|
|
|
} |
|
|
|
while (i >= 4) { |
|
|
|
// L8_11 |
|
|
|
float *BO; |
|
|
|
__m128 xmm0, xmm2, xmm3, row0, row1, row2, row3, row4, row5, row6, row7; |
|
|
|
BO = B; |
|
|
|
int kloop = K; |
|
|
|
|
|
|
|
INIT4x8() |
|
|
|
// L8_16 |
|
|
|
while (kloop > 0) { |
|
|
|
// L12_17 |
|
|
|
KERNEL4x8_SUB() |
|
|
|
kloop--; |
|
|
|
} |
|
|
|
// L8_19 |
|
|
|
SAVE4x8(alpha) |
|
|
|
CO1 += 4; |
|
|
|
|
|
|
|
i -= 4; |
|
|
|
} |
|
|
|
|
|
|
|
/************************************************************************** |
|
|
|
* Rest of M |
|
|
|
***************************************************************************/ |
|
|
|
|
|
|
|
while (i >= 2) { |
|
|
|
float *BO; |
|
|
|
float xmm0, xmm1, xmm2, xmm3, row0a, row1a, row2a, row3a, row4a, row5a, row6a, row7a, row0b, row1b, row2b, row3b, row4b, row5b, row6b, row7b; |
|
|
|
BO = B; |
|
|
|
|
|
|
|
INIT2x8() |
|
|
|
int kloop = K; |
|
|
|
|
|
|
|
while (kloop > 0) { |
|
|
|
KERNEL2x8_SUB() |
|
|
|
kloop--; |
|
|
|
} |
|
|
|
SAVE2x8(alpha) |
|
|
|
CO1 += 2; |
|
|
|
i -= 2; |
|
|
|
} |
|
|
|
// L13_40 |
|
|
|
while (i >= 1) { |
|
|
|
float *BO; |
|
|
|
float xmm0, xmm2, xmm3, row0, row1, row2, row3, row4, row5, row6, row7; |
|
|
|
int kloop = K; |
|
|
|
BO = B; |
|
|
|
INIT1x8() |
|
|
|
|
|
|
|
while (kloop > 0) { |
|
|
|
KERNEL1x8_SUB() |
|
|
|
kloop--; |
|
|
|
} |
|
|
|
SAVE1x8(alpha) |
|
|
|
CO1 += 1; |
|
|
|
i -= 1; |
|
|
|
} |
|
|
|
|
|
|
|
B += K * 8; |
|
|
|
N -= 8; |
|
|
|
} |
|
|
|
|
|
|
|
while (N >= 4) { |
|
|
|
float *CO1; |
|
|
|
float *AO; |
|
|
|