Update llama.cpp

Update with all new changes
2 years ago · fb9e38d3e8
--- a/LLama/runtimes/ggml-metal.metal
+++ b/LLama/runtimes/ggml-metal.metal
@@ -428,7 +428,7 @@ kernel void kernel_mul_mat_q4_0_f32(
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);
    if (ith == 0) {
        for (uint i = 16; i < nth; i += 16) sum[0] += sum[i];
        for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
        dst[r1*ne0 + r0] = sum[0];
    }
 }
@@ -497,7 +497,7 @@ kernel void kernel_mul_mat_q4_1_f32(
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);
    if (ith == 0) {
        for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
        for (uint i = 16; i < nth; i += 16) sum[0] += sum[i];
        dst[r1*ne0 + r0] = sum[0];
    }
 }
@@ -775,47 +775,76 @@ kernel void kernel_cpy_f32_f32(

 //============================================ k-quants ======================================================

 // Super-block size for the k-quant formats. Defaults to 256 when the build
 // does not define QK_K; a build-provided value must be 256 or 64.
 #ifndef QK_K
 #define QK_K 256
 #else
 static_assert(QK_K == 256 || QK_K == 64, "QK_K must be 256 or 64");
 #endif

 // Byte length of the packed `scales` array in the block structs that use
 // K_SCALE_SIZE: 12 when QK_K == 256, 4 otherwise (i.e. QK_K == 64).
 #if QK_K == 256
 #define K_SCALE_SIZE 12
 #else
 #define K_SCALE_SIZE 4
 #endif

 // One super-block of 2-bit quantized weights (QK_K values).
 // Each `scales` byte packs a 4-bit scale (low nibble) and a 4-bit min (high
 // nibble); each `qs` byte packs four 2-bit quants.
 // NOTE(review): two closing lines (block_q2_k / block_q2_K) appear below; this
 // text looks like a diff with its +/- markers stripped — confirm which name is live.
 typedef struct {
    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
    uint8_t qs[QK_K/4];      // quants
    half d;           // super-block scale for quantized scales
    half dmin;        // super-block scale for quantized mins
 } block_q2_k;
 } block_q2_K;
 // 84 bytes / block (for QK_K == 256: 16 + 64 + 2 + 2)

 // One super-block of 3-bit quantized weights: the low 2 bits of each quant
 // live in `qs` (four per byte) and the high bit lives in `hmask`.
 // NOTE(review): the complete struct below is the old `block_q3_k`; the lines
 // after it are only the tail of the new `block_q3_K` (its head is not repeated
 // here) — this text looks like a diff with its +/- markers stripped.
 typedef struct {
    uint8_t hmask[QK_K/8];     // quants - high bit
    uint8_t qs[QK_K/4];        // quants - low 2 bits
    uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
    half d;                    // super-block scale
 } block_q3_k;
 // 110 bytes / block (for QK_K == 256: 32 + 64 + 12 + 2)

 // New layout: the scales array shrinks to 2 bytes when QK_K == 64. In that
 // configuration the dequantization code reads four 4-bit scales from these
 // two bytes and subtracts 8 from each.
 #if QK_K == 64
    uint8_t scales[2];
 #else
    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
 #endif
    half d;             // super-block scale
 } block_q3_K;

 // One super-block of 4-bit quantized weights; the layout depends on QK_K.
 // NOTE(review): in the #else struct, two `scales` declarations and two closing
 // lines appear (old block_q4_k vs new block_q4_K) — this text looks like a diff
 // with its +/- markers stripped; only one of each pair exists in the real file.
 #if QK_K == 64
 typedef struct {
    half    d[2];          // super-block scales/mins: d[0] scales the quants, d[1] scales the mins
    uint8_t scales[2];     // per byte: low nibble = 4-bit scale, high nibble = 4-bit min
    uint8_t qs[QK_K/2];    // 4-bit quants
 } block_q4_K;
 #else
 typedef struct {
    half d;             // super-block scale for quantized scales
    half dmin;          // super-block scale for quantized mins
    uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
    uint8_t qs[QK_K/2];        // 4-bit quants
 } block_q4_k;
 // 144 bytes / block (for QK_K == 256: 2 + 2 + 12 + 128)
 } block_q4_K;
 #endif

 // One super-block of 5-bit quantized weights: the low 4 bits of each quant
 // live in `qs` and the high bit lives in `qh`. The layout depends on QK_K.
 // NOTE(review): the #else struct shows two closing lines (old block_q5_k vs new
 // block_q5_K) — this text looks like a diff with its +/- markers stripped.
 #if QK_K == 64
 typedef struct {
    half  d;                     // super-block scale (this layout has no separate min field)
    int8_t  scales[QK_K/16];     // 8-bit block scales
    uint8_t qh[QK_K/8];          // quants, high bit
    uint8_t qs[QK_K/2];          // quants, low 4 bits
 } block_q5_K;
 #else
 typedef struct {
    half d;                      // super-block scale for quantized scales
    half dmin;                   // super-block scale for quantized mins
    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
    uint8_t qh[QK_K/8];          // quants, high bit
    uint8_t qs[QK_K/2];          // quants, low 4 bits
 } block_q5_k;
 } block_q5_K;
 // 176 bytes / block (for QK_K == 256: 2 + 2 + 12 + 32 + 128)
 #endif

 // One super-block of 6-bit quantized weights: the low 4 bits of each quant
 // live in `ql` (two per byte) and the upper 2 bits live in `qh` (four per byte).
 // NOTE(review): two closing lines (block_q6_k / block_q6_K) appear below; this
 // text looks like a diff with its +/- markers stripped — confirm which name is live.
 typedef struct {
    uint8_t ql[QK_K/2];      // quants, lower 4 bits
    uint8_t qh[QK_K/4];      // quants, upper 2 bits
    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
    half d;                  // super-block scale
 } block_q6_k;
 } block_q6_K;
 // 210 bytes / block (for QK_K == 256: 128 + 64 + 16 + 2)

 static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) {
@@ -836,7 +865,7 @@ static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) {

 //========================================== dequantization =============================

 // Dequantizes k values stored as q2_K blocks at `x` into floats at `y`.
 // k must be a multiple of QK_K (asserted); nb = k / QK_K.
 // NOTE(review): old/new variants of the signature appear side by side and '@@'
 // hunk headers hide part of the body (diff text with +/- markers stripped);
 // the comments below describe the visible lines only.
 static void dequantize_row_q2_k(device const block_q2_k * x, device float * y, int k) {
 static void dequantize_row_q2_K(device const block_q2_K * x, device float * y, int k) {
    assert(k % QK_K == 0);
    const int nb = k / QK_K;

@@ -847,6 +876,7 @@ static void dequantize_row_q2_k(device const block_q2_k * x, device float * y, i

        device const uint8_t * q = x[i].qs;

 #if QK_K == 256
        int is = 0;
        float dl, ml;
        for (int n = 0; n < QK_K; n += 128) {
@@ -865,14 +895,29 @@ static void dequantize_row_q2_k(device const block_q2_k * x, device float * y, i
            }
            q += 32;
        }
 #else
        // QK_K == 64: four sub-blocks of 16 values. For sub-block j the low
        // nibble of scales[j] scales the quants and the high nibble scales the
        // min. `d` and `min` are defined in lines hidden by the hunk above.
        float dl1 = d * (x[i].scales[0] & 0xF), ml1 = min * (x[i].scales[0] >> 4);
        float dl2 = d * (x[i].scales[1] & 0xF), ml2 = min * (x[i].scales[1] >> 4);
        float dl3 = d * (x[i].scales[2] & 0xF), ml3 = min * (x[i].scales[2] >> 4);
        float dl4 = d * (x[i].scales[3] & 0xF), ml4 = min * (x[i].scales[3] >> 4);
        // Byte q[l] packs four 2-bit quants, one per sub-block (bits 0-1, 2-3,
        // 4-5, 6-7); each output is scale * quant - min.
        for (int l = 0; l < 16; ++l) {
            y[l+ 0] = dl1 * ((q[l] >> 0) & 3) - ml1;
            y[l+16] = dl2 * ((q[l] >> 2) & 3) - ml2;
            y[l+32] = dl3 * ((q[l] >> 4) & 3) - ml3;
            y[l+48] = dl4 * ((q[l] >> 6) & 3) - ml4;
        }
        // Advance the output to the next super-block.
        y += QK_K;
 #endif

    }
 }

 // Dequantizes k values stored as q3_K blocks at `x` into floats at `y`.
 // k must be a multiple of QK_K (asserted); nb = k / QK_K.
 // NOTE(review): old/new variants of the signature appear side by side and a '@@'
 // hunk header hides most of the QK_K == 256 path (diff text with +/- markers
 // stripped); the comments below describe the visible lines only.
 static void dequantize_row_q3_k(device const block_q3_k * x, device float * y, int k) {
 static void dequantize_row_q3_K(device const block_q3_K * x, device float * y, int k) {
    assert(k % QK_K == 0);
    const int nb = k / QK_K;

 #if QK_K == 256

    const uint16_t kmask1 = 0x0303;
    const uint16_t kmask2 = 0x0f0f;

@@ -918,22 +963,49 @@ static void dequantize_row_q3_k(device const block_q3_k * x, device float * y, i
            }
            q += 32;
        }
    }
 #else
    // QK_K == 64: one pass per super-block of 64 values.
    for (int i = 0; i < nb; i++) {

        const float d_all = (float)(x[i].d);

        device const uint8_t * q = x[i].qs;
        device const uint8_t * hm = x[i].hmask;

        // Four 4-bit scales packed into scales[0..1]; each is offset by -8 and
        // multiplied by the super-block scale. Each covers 16 output values.
        const float d1 = d_all * ((x[i].scales[0] & 0xF) - 8);
        const float d2 = d_all * ((x[i].scales[0] >>  4) - 8);
        const float d3 = d_all * ((x[i].scales[1] & 0xF) - 8);
        const float d4 = d_all * ((x[i].scales[1] >>  4) - 8);

        // Each quant is its 2 low bits from qs, minus 4 when the matching bit
        // of the hmask byte is clear. Bit b of hm[l] belongs to output y[l + 8*b].
        for (int l = 0; l < 8; ++l) {
            uint8_t h = hm[l];
            y[l+ 0] = d1 * ((int8_t)((q[l+0] >> 0) & 3) - ((h & 0x01) ? 0 : 4));
            y[l+ 8] = d1 * ((int8_t)((q[l+8] >> 0) & 3) - ((h & 0x02) ? 0 : 4));
            y[l+16] = d2 * ((int8_t)((q[l+0] >> 2) & 3) - ((h & 0x04) ? 0 : 4));
            y[l+24] = d2 * ((int8_t)((q[l+8] >> 2) & 3) - ((h & 0x08) ? 0 : 4));
            y[l+32] = d3 * ((int8_t)((q[l+0] >> 4) & 3) - ((h & 0x10) ? 0 : 4));
            y[l+40] = d3 * ((int8_t)((q[l+8] >> 4) & 3) - ((h & 0x20) ? 0 : 4));
            y[l+48] = d4 * ((int8_t)((q[l+0] >> 6) & 3) - ((h & 0x40) ? 0 : 4));
            y[l+56] = d4 * ((int8_t)((q[l+8] >> 6) & 3) - ((h & 0x80) ? 0 : 4));
        }
        // Advance the output to the next super-block.
        y += QK_K;
    }
 #endif

 }

 // Dequantizes k values stored as q4_K blocks at `x` into floats at `y`.
 // k must be a multiple of QK_K (asserted); nb = k / QK_K.
 // NOTE(review): old/new variants of some lines appear side by side (the two
 // signatures, and `q` declared both before and inside the #if) and a '@@' hunk
 // header hides part of the QK_K == 256 path (diff text with +/- markers
 // stripped); the comments below describe the visible lines only.
 static void dequantize_row_q4_k(device const block_q4_k * x, device float * y, int k) {
 static void dequantize_row_q4_K(device const block_q4_K * x, device float * y, int k) {
    assert(k % QK_K == 0);
    const int nb = k / QK_K;


    for (int i = 0; i < nb; i++) {

        device const uint8_t * q = x[i].qs;

 #if QK_K == 256
        const float d = x[i].d;
        const float min = x[i].dmin;

        device const uint8_t * q = x[i].qs;
        device const uint8_t * scales = x[i].scales;

        int is = 0;
@@ -945,14 +1017,29 @@ static void dequantize_row_q4_k(device const block_q4_k * x, device float * y, i
            for (int l = 0; l < 32; ++l) *y++ = d2 * (q[l]  >> 4) - m2;
            q += 32; is += 2;
        }
 #else
        // QK_K == 64: two sub-blocks of 32 values. d[0] is the super-block
        // scale for the quants and d[1] the super-block scale for the mins;
        // both are loaded together as one half2.
        device const uint8_t * s = x[i].scales;
        device const half2 * dh = (device const half2 *)x[i].d;
        const float2 d = (float2)dh[0];
        // Low nibble of s[j] = sub-block scale, high nibble = sub-block min.
        const float d1 = d[0] * (s[0] & 0xF);
        const float d2 = d[0] * (s[1] & 0xF);
        const float m1 = d[1] * (s[0] >>  4);
        const float m2 = d[1] * (s[1] >>  4);
        // Low nibble of q[l] feeds the first sub-block, high nibble the second.
        for (int l = 0; l < 32; ++l) {
            y[l+ 0] = d1 * (q[l] & 0xF) - m1;
            y[l+32] = d2 * (q[l] >>  4) - m2;
        }
        // Advance the output to the next super-block.
        y += QK_K;
 #endif

    }
 }

 // Dequantizes k values stored as q5_K blocks at `x` into floats at `y`.
 // k must be a multiple of QK_K (asserted); nb = k / QK_K.
 // NOTE(review): old/new variants of the signature appear side by side and a '@@'
 // hunk header hides most of the QK_K == 256 path (diff text with +/- markers
 // stripped); the comments below describe the visible lines only.
 static void dequantize_row_q5_k(device const block_q5_k * x, device float * y, int k) {
 static void dequantize_row_q5_K(device const block_q5_K * x, device float * y, int k) {
    assert(k % QK_K == 0);
    const int nb = k / QK_K;

 #if QK_K == 256
   for (int i = 0; i < nb; i++) {

        const float d = (float)(x[i].d);
@@ -973,10 +1060,32 @@ static void dequantize_row_q5_k(device const block_q5_k * x, device float * y, i
            u1 <<= 2; u2 <<= 2;
        }
    }
 #else
    // QK_K == 64: one pass per super-block of 64 values.
    for (int i = 0; i < nb; i++) {

        const float d = (float)x[i].d;

        device const uint8_t * ql = x[i].qs;
        device const uint8_t * qh = x[i].qh;
        device const int8_t  * sc = x[i].scales;

        // Each quant is a 4-bit nibble from ql, minus 16 when the matching bit
        // of qh[l] is clear. Bit b of qh[l] belongs to output y[l + 8*b];
        // sc[j] scales outputs 16*j .. 16*j+15.
        for (int l = 0; l < 8; ++l) {
            y[l+ 0] = d * sc[0] * ((ql[l+ 0] & 0xF) - (qh[l] & 0x01 ? 0 : 16));
            y[l+ 8] = d * sc[0] * ((ql[l+ 8] & 0xF) - (qh[l] & 0x02 ? 0 : 16));
            y[l+16] = d * sc[1] * ((ql[l+16] & 0xF) - (qh[l] & 0x04 ? 0 : 16));
            y[l+24] = d * sc[1] * ((ql[l+24] & 0xF) - (qh[l] & 0x08 ? 0 : 16));
            y[l+32] = d * sc[2] * ((ql[l+ 0] >>  4) - (qh[l] & 0x10 ? 0 : 16));
            y[l+40] = d * sc[2] * ((ql[l+ 8] >>  4) - (qh[l] & 0x20 ? 0 : 16));
            y[l+48] = d * sc[3] * ((ql[l+16] >>  4) - (qh[l] & 0x40 ? 0 : 16));
            y[l+56] = d * sc[3] * ((ql[l+24] >>  4) - (qh[l] & 0x80 ? 0 : 16));
        }
        // Advance the output to the next super-block.
        y += QK_K;
    }
 #endif

 }

 // Dequantizes k values stored as q6_K blocks at `x` into floats at `y`.
 // k must be a multiple of QK_K (asserted); nb = k / QK_K.
 // NOTE(review): old/new variants of the signature appear side by side and '@@'
 // hunk headers hide part of the body, including the declarations of `ql`, `qh`
 // and `sc` (diff text with +/- markers stripped); the comments below describe
 // the visible lines only.
 static void dequantize_row_q6_k(device const block_q6_k * x, device float * y, int k) {
 static void dequantize_row_q6_K(device const block_q6_K * x, device float * y, int k) {
    assert(k % QK_K == 0);
    const int nb = k / QK_K;

@@ -988,6 +1097,7 @@ static void dequantize_row_q6_k(device const block_q6_k * x, device float * y, i

        const float d = x[i].d;

 #if QK_K == 256
        for (int n = 0; n < QK_K; n += 128) {
            for (int l = 0; l < 32; ++l) {
                int is = l/16;
@@ -1005,10 +1115,23 @@ static void dequantize_row_q6_k(device const block_q6_k * x, device float * y, i
            qh += 32;
            sc += 8;
        }
 #else
        // QK_K == 64: each 6-bit quant is a 4-bit nibble from ql combined with
        // a 2-bit field of qh[l] shifted into bits 4-5, then offset by -32.
        // The four quants built per l go to y[l], y[l+16], y[l+32], y[l+48],
        // each scaled by d * sc[j].
        for (int l = 0; l < 16; ++l) {
            const int8_t q1 = (int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
            const int8_t q2 = (int8_t)((ql[l+16] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
            const int8_t q3 = (int8_t)((ql[l+ 0]  >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
            const int8_t q4 = (int8_t)((ql[l+16]  >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
            y[l+ 0] = d * sc[0] * q1;
            y[l+16] = d * sc[1] * q2;
            y[l+32] = d * sc[2] * q3;
            y[l+48] = d * sc[3] * q4;
        }
        // Advance the output by one super-block (64 values in this branch).
        y  += 64;
 #endif
    }
 }

 // Row-gather kernel for q2_K data: for index i it reads row number r = src1[i]
 // and dequantizes ne00 values from the src0 row at byte offset r*nb01 into dst
 // at byte offset i*nb1.
 // NOTE(review): old/new variants of the name and of the dequantize call appear
 // side by side, and a '@@' hunk header hides the rest of the parameter list
 // (where tpig, nb01, nb1 and ne00 are presumably declared) — diff text with
 // +/- markers stripped.
 kernel void kernel_get_rows_q2_k(
 kernel void kernel_get_rows_q2_K(
        device const  void * src0,
        device const   int * src1,
        device       float * dst,
@@ -1019,12 +1142,12 @@ kernel void kernel_get_rows_q2_k(
    const int i = tpig;
    const int r = ((device int32_t *) src1)[i];

    dequantize_row_q2_k(
            (device const block_q2_k *) ((device char *) src0 + r*nb01),
    dequantize_row_q2_K(
            (device const block_q2_K *) ((device char *) src0 + r*nb01),
                       (device float *) ((device char *)  dst + i*nb1), ne00);
 }

 // Row-gather kernel for q3_K data: for index i it reads row number r = src1[i]
 // and dequantizes ne00 values from the src0 row at byte offset r*nb01 into dst
 // at byte offset i*nb1.
 // NOTE(review): old/new variants of the name and of the dequantize call appear
 // side by side, and a '@@' hunk header hides the rest of the parameter list
 // (where tpig, nb01, nb1 and ne00 are presumably declared) — diff text with
 // +/- markers stripped.
 kernel void kernel_get_rows_q3_k(
 kernel void kernel_get_rows_q3_K(
        device const  void * src0,
        device const   int * src1,
        device       float * dst,
@@ -1035,12 +1158,12 @@ kernel void kernel_get_rows_q3_k(
    const int i = tpig;
    const int r = ((device int32_t *) src1)[i];

    dequantize_row_q3_k(
            (device const block_q3_k *) ((device char *) src0 + r*nb01),
    dequantize_row_q3_K(
            (device const block_q3_K *) ((device char *) src0 + r*nb01),
                       (device float *) ((device char *)  dst + i*nb1), ne00);
 }

 // Row-gather kernel for q4_K data: for index i it reads row number r = src1[i]
 // and dequantizes ne00 values from the src0 row at byte offset r*nb01 into dst
 // at byte offset i*nb1.
 // NOTE(review): old/new variants of the name and of the dequantize call appear
 // side by side, and a '@@' hunk header hides the rest of the parameter list
 // (where tpig, nb01, nb1 and ne00 are presumably declared) — diff text with
 // +/- markers stripped.
 kernel void kernel_get_rows_q4_k(
 kernel void kernel_get_rows_q4_K(
        device const  void * src0,
        device const   int * src1,
        device       float * dst,
@@ -1051,12 +1174,12 @@ kernel void kernel_get_rows_q4_k(
    const int i = tpig;
    const int r = ((device int32_t *) src1)[i];

    dequantize_row_q4_k(
            (device const block_q4_k *) ((device char *) src0 + r*nb01),
    dequantize_row_q4_K(
            (device const block_q4_K *) ((device char *) src0 + r*nb01),
                       (device float *) ((device char *)  dst + i*nb1), ne00);
 }

 // Row-gather kernel for q5_K data: for index i it reads row number r = src1[i]
 // and dequantizes ne00 values from the src0 row at byte offset r*nb01 into dst
 // at byte offset i*nb1.
 // NOTE(review): old/new variants of the name and of the dequantize call appear
 // side by side, and a '@@' hunk header hides the rest of the parameter list
 // (where tpig, nb01, nb1 and ne00 are presumably declared) — diff text with
 // +/- markers stripped.
 kernel void kernel_get_rows_q5_k(
 kernel void kernel_get_rows_q5_K(
        device const  void * src0,
        device const   int * src1,
        device       float * dst,
@@ -1067,12 +1190,12 @@ kernel void kernel_get_rows_q5_k(
    const int i = tpig;
    const int r = ((device int32_t *) src1)[i];

    dequantize_row_q5_k(
            (device const block_q5_k *) ((device char *) src0 + r*nb01),
    dequantize_row_q5_K(
            (device const block_q5_K *) ((device char *) src0 + r*nb01),
                       (device float *) ((device char *)  dst + i*nb1), ne00);
 }

 // Row-gather kernel for q6_K data: for index i it reads row number r = src1[i]
 // and dequantizes ne00 values from the src0 row at byte offset r*nb01 into dst
 // at byte offset i*nb1.
 // NOTE(review): old/new variants of the name and of the dequantize call appear
 // side by side, and a '@@' hunk header hides the rest of the parameter list
 // (where tpig, nb01, nb1 and ne00 are presumably declared) — diff text with
 // +/- markers stripped.
 kernel void kernel_get_rows_q6_k(
 kernel void kernel_get_rows_q6_K(
        device const  void * src0,
        device const   int * src1,
        device       float * dst,
@@ -1083,14 +1206,14 @@ kernel void kernel_get_rows_q6_k(
    const int i = tpig;
    const int r = ((device int32_t *) src1)[i];

    dequantize_row_q6_k(
            (device const block_q6_k *) ((device char *) src0 + r*nb01),
    dequantize_row_q6_K(
            (device const block_q6_K *) ((device char *) src0 + r*nb01),
                       (device float *) ((device char *)  dst + i*nb1), ne00);
 }

 //====================================== dot products =========================

 // Dot product of one q2_K-quantized src0 row (selected by tgpig.x) with one
 // float src1 row (selected by tgpig.y). Each thread accumulates a partial sum
 // in `sumf` and stores it to sum[ith]; a threadgroup reduction follows.
 // NOTE(review): old/new variants of several lines appear side by side (kernel
 // name, `x` declaration, `float sumf`, `sum[ith] = sumf`) and '@@' hunk headers
 // hide the rest of the parameter list and parts of the body, including the
 // declarations of `nb` and `sum` — diff text with +/- markers stripped. The
 // comments below describe the visible lines only.
 kernel void kernel_mul_mat_q2_k_f32(
 kernel void kernel_mul_mat_q2_K_f32(
        device const  void * src0,
        device const float * src1,
        device       float * dst,
@@ -1107,12 +1230,15 @@ kernel void kernel_mul_mat_q2_k_f32(
    const int64_t r0 = tgpig.x;
    const int64_t r1 = tgpig.y;

    // x: first block of src0 row r0 (nb blocks per row); yy: start of src1 row r1.
    device const block_q2_k * x = (device const block_q2_k *) src0 + r0*nb;
    device const block_q2_K * x = (device const block_q2_K *) src0 + r0*nb;
    device const float     * yy = (device const float      *) src1 + r1*ne10;

    // nth: total threads in the threadgroup; ith: this thread's flattened index.
    const int nth = tptg.x*tptg.y;
    const int ith = tptg.y*tpitg.x + tpitg.y;

    float sumf = 0;

 #if QK_K == 256
    const int tid = tpitg.y;    // 0...16
    const int il  = tid/4;      // 0...3
    const int ir  = tid%4;      // 0...3
@@ -1125,9 +1251,6 @@ kernel void kernel_mul_mat_q2_k_f32(
    const int y_offset = 64*il + n*ir;
    const int q_offset = 32*ip + n*ir;

    sum[ith] = 0.0f;

    float sumf = 0;
    // Blocks of the row are distributed across threads along tpitg.x.
    for (int i = tpitg.x; i < nb; i += tptg.x) {

        device const uint8_t * q = x[i].qs + q_offset;
@@ -1140,7 +1263,6 @@ kernel void kernel_mul_mat_q2_k_f32(

        device const float   * y = yy + i*QK_K + y_offset;

        //float4 s = {0.f, 0.f, 0.f, 0.f};
        float2 s = {0.f, 0.f};
        float smin = 0;
        for (int l = 0; l < n; ++l) {
@@ -1155,25 +1277,38 @@ kernel void kernel_mul_mat_q2_k_f32(
        sumf += dall * (s[0] * d1 + s[1] * d2) - dmin * smin;

    }
    sum[ith] = sumf;
 #else
    // QK_K == 64: each thread handles 4 consecutive qs bytes (16 values)
    // starting at byte il of every block it visits.
    const int il = 4 * tpitg.x;

    //int mask1 = (ith%4 == 0);
    //int mask2 = (ith%16 == 0);
    // aux holds the four scale bytes split into nibbles: d[0..3] view the four
    // 4-bit scales, m[0..3] view the four 4-bit mins.
    uint32_t aux[2];
    thread const uint8_t * d = (thread const uint8_t *)aux;
    thread const uint8_t * m = (thread const uint8_t *)aux + 4;

    //threadgroup_barrier(mem_flags::mem_threadgroup);
    //for (int i = 1; i < 4; ++i) sum[ith] += mask1 * sum[ith + i];
    //threadgroup_barrier(mem_flags::mem_threadgroup);
    //for (int i = 4; i < 16; i += 4) sum[ith] += mask2 * sum[ith + i];
    //threadgroup_barrier(mem_flags::mem_threadgroup);
    //if (ith == 0) {
    //    for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
    //    dst[r1*ne0 + r0] = sum[0];
    //}
    // In this branch blocks are distributed across threads along tpitg.y.
    for (int i = tpitg.y; i < nb; i += tptg.y) {

        device const uint8_t * q = x[i].qs + il;
        device const float   * y = yy + i*QK_K + il;

        const float dall = (float)x[i].d;
        const float dmin = (float)x[i].dmin;

        // Load scales[0..3] as one 32-bit word and separate low nibbles
        // (scales) from high nibbles (mins).
        device const uint32_t * a = (device const uint32_t *)x[i].scales;
        aux[0] = a[0] & 0x0f0f0f0f;
        aux[1] = (a[0] >> 4) & 0x0f0f0f0f;

        // Byte q[l] packs one 2-bit quant for each of the four 16-value
        // sub-blocks; each term is y * (scale * quant - min).
        for (int l = 0; l < 4; ++l) {
            sumf += y[l+ 0] * (dall * d[0] * ((q[l] >> 0) & 3) - dmin * m[0])
                  + y[l+16] * (dall * d[1] * ((q[l] >> 2) & 3) - dmin * m[1])
                  + y[l+32] * (dall * d[2] * ((q[l] >> 4) & 3) - dmin * m[2])
                  + y[l+48] * (dall * d[3] * ((q[l] >> 6) & 3) - dmin * m[3]);
        }
    }
 #endif

    // Publish this thread's partial sum for the threadgroup reduction.
    sum[ith] = sumf;

    //
    // Accumulate the sum from all threads in the threadgroup
    // This version is slightly faster than the commented out one below,
    // which I copy-pasted from ggerganov's q4_0 dot product for metal.
    //
    threadgroup_barrier(mem_flags::mem_threadgroup);
    if (ith%4 == 0) {
@@ -1190,7 +1325,7 @@ kernel void kernel_mul_mat_q2_k_f32(
    }
 }

 // Dot product of one q3_K-quantized src0 row (selected by tgpig.x) with one
 // float src1 row (selected by tgpig.y). Each thread stores a partial sum to
 // sum[ith]; a threadgroup reduction follows.
 // NOTE(review): old/new variants of several lines appear side by side (kernel
 // name, `x` declaration, and the kmask/m3/m4 constants both before and inside
 // the #if) and '@@' hunk headers hide part of the parameter list and body,
 // including the declaration of `sum` — diff text with +/- markers stripped.
 // The comments below describe the visible lines only.
 kernel void kernel_mul_mat_q3_k_f32(
 kernel void kernel_mul_mat_q3_K_f32(
        device const  void * src0,
        device const float * src1,
        device       float * dst,
@@ -1203,23 +1338,25 @@ kernel void kernel_mul_mat_q3_k_f32(
        uint2 tpitg[[thread_position_in_threadgroup]],
        uint2  tptg[[threads_per_threadgroup]]) {

    const uint16_t kmask1 = 0x0303;
    const uint16_t kmask2 = 0x0f0f;

    const uint8_t m3 = 3;
    const int8_t  m4 = 4;

    // Number of super-blocks per src0 row.
    const int nb = ne00/QK_K;

    const int64_t r0 = tgpig.x;
    const int64_t r1 = tgpig.y;

    // x: first block of src0 row r0; yy: start of src1 row r1.
    device const block_q3_k * x = (device const block_q3_k *) src0 + r0*nb;
    device const block_q3_K * x = (device const block_q3_K *) src0 + r0*nb;
    device const float     * yy = (device const float      *) src1 + r1*ne10;

    // nth: total threads in the threadgroup; ith: this thread's flattened index.
    const int nth = tptg.x*tptg.y;
    const int ith = tptg.y*tpitg.x + tpitg.y;

 #if QK_K == 256

    const uint8_t m3 = 3;
    const int8_t  m4 = 4;

    const uint16_t kmask1 = 0x0303;
    const uint16_t kmask2 = 0x0f0f;

    const int tid = tpitg.y;        // expecting 16
    const int ip  = tid/8;          // 0 or 1
    const int il  = tid/2 - 4*ip;   // 0...3
@@ -1273,6 +1410,39 @@ kernel void kernel_mul_mat_q3_k_f32(

    //sum[ith] = sumf;
    sum[ith] = sumf1 - 32.f*sumf2;
 #else
    // QK_K == 64: each thread handles 4 consecutive qs bytes starting at il.
    // hmask has only 8 bytes, so `in` selects the byte and `im` selects which
    // of the two adjacent mask bits applies to this thread's values.
    const int il = 4 * tpitg.x;  // 0, 4, 8, 12
    const int im = il/8;         // 0, 0, 1, 1
    const int in = il%8;         // 0, 4, 0, 4

    float sumf = 0;

    // Blocks of the row are distributed across threads along tpitg.y.
    for (int i = tpitg.y; i < nb; i += tptg.y) {

        const float d_all = (float)(x[i].d);

        device const uint8_t * q = x[i].qs + il;
        device const uint8_t * h = x[i].hmask + in;
        device const float   * y = yy + i * QK_K + il;

        // Four 4-bit scales from scales[0..1], each offset by -8 and
        // multiplied by the super-block scale.
        const float d1 = d_all * ((x[i].scales[0] & 0xF) - 8);
        const float d2 = d_all * ((x[i].scales[0] >>  4) - 8);
        const float d3 = d_all * ((x[i].scales[1] & 0xF) - 8);
        const float d4 = d_all * ((x[i].scales[1] >>  4) - 8);

        // Each quant is its 2 low bits from q[l], minus 4 when the matching
        // high bit (after shifting the mask byte by im) is clear.
        for (int l = 0; l < 4; ++l) {
            const uint8_t hm = h[l] >> im;
            sumf += y[l+ 0] * d1 * ((int8_t)((q[l+0] >> 0) & 3) - ((hm & 0x01) ? 0 : 4))
                  + y[l+16] * d2 * ((int8_t)((q[l+0] >> 2) & 3) - ((hm & 0x04) ? 0 : 4))
                  + y[l+32] * d3 * ((int8_t)((q[l+0] >> 4) & 3) - ((hm & 0x10) ? 0 : 4))
                  + y[l+48] * d4 * ((int8_t)((q[l+0] >> 6) & 3) - ((hm & 0x40) ? 0 : 4));
        }

    }

    // Publish this thread's partial sum for the threadgroup reduction.
    sum[ith] = sumf;

 #endif

    //
    // Accumulate the sum from all threads in the threadgroup
@@ -1293,7 +1463,7 @@ kernel void kernel_mul_mat_q3_k_f32(

 }

 // Dot product of one q4_K-quantized src0 row (selected by tgpig.x) with one
 // float src1 row (selected by tgpig.y). Each thread accumulates a partial sum
 // in `sumf` and stores it to sum[ith].
 // NOTE(review): old/new variants of several lines appear side by side (kernel
 // name, `x`/`yy` declarations, the kmask constants both before and inside the
 // #if, `float sumf`) and '@@' hunk headers hide part of the parameter list and
 // body, including the declaration of `sum` and the reduction — diff text with
 // +/- markers stripped. The comments below describe the visible lines only.
 kernel void kernel_mul_mat_q4_k_f32(
 kernel void kernel_mul_mat_q4_K_f32(
        device const  void * src0,
        device const float * src1,
        device       float * dst,
@@ -1305,21 +1475,25 @@ kernel void kernel_mul_mat_q4_k_f32(
        uint2 tpitg[[thread_position_in_threadgroup]],
        uint2  tptg[[threads_per_threadgroup]]) {

    const uint16_t kmask1 = 0x3f3f;
    const uint16_t kmask2 = 0x0f0f;
    const uint16_t kmask3 = 0xc0c0;

    // Number of super-blocks per src0 row.
    const int nb = ne00/QK_K;

    const int64_t r0 = tgpig.x;
    const int64_t r1 = tgpig.y;

    device const block_q4_k * x = (device const block_q4_k *) src0 + r0*nb;
    device const float     * yy = (device const float      *) src1 + r1*ne10;

    // nth: total threads in the threadgroup; ith: this thread's flattened index.
    const int nth = tptg.x*tptg.y;
    const int ith = tptg.y*tpitg.x + tpitg.y;

    // x: first block of src0 row r0; yy: start of src1 row r1.
    device const block_q4_K * x = (device const block_q4_K *) src0 + r0*nb;
    device const float     * yy = (device const float      *) src1 + r1*ne10;

    float sumf = 0;

 #if QK_K == 256

    const uint16_t kmask1 = 0x3f3f;
    const uint16_t kmask2 = 0x0f0f;
    const uint16_t kmask3 = 0xc0c0;

    const int tid = tpitg.y;   // 0...16
    const int il  = tid/4;     // 0...3
    const int ir  = tid - 4*il;// 0...3
@@ -1332,11 +1506,8 @@ kernel void kernel_mul_mat_q4_k_f32(
    const int q_offset = 32*im + l0;
    const int y_offset = 64*im + l0;

    sum[ith] = 0.0f;

    uchar2 sc1, sc2, sc3, sc4;

    float sumf = 0;
    // Blocks of the row are distributed across threads along tpitg.x.
    for (int i = tpitg.x; i < nb; i += tptg.x) {

        device const uint8_t * q1 = (x + i)->qs + q_offset;
@@ -1365,6 +1536,30 @@ kernel void kernel_mul_mat_q4_k_f32(
        sumf += dall * (s[0] * sc1[0] + s[1] * sc1[1] + s[2] * sc3[0] + s[3] * sc3[1]) - dmin * smin;

    }
 #else
    // QK_K == 64: aux16 holds the two scale bytes split into nibbles, viewed
    // bytewise through `scales`: scales[0..1] are the 4-bit sub-block scales,
    // scales[2..3] the 4-bit sub-block mins.
    uint16_t aux16[2];
    thread const uint8_t * scales = (thread const uint8_t *)aux16;

    // Each thread handles qs bytes il..il+3 and il+16..il+19 of every block it visits.
    const int il  = 4*tpitg.x;

    // In this branch blocks are distributed across threads along tpitg.y.
    for (int i = tpitg.y; i < nb; i += tptg.y) {

        device const uint8_t * q = x[i].qs + il;
        device const float   * y = yy + i * QK_K + il;

        // d[0] scales the quants, d[1] scales the mins.
        const float d = (float)x[i].d[0];
        const float m = (float)x[i].d[1];

        // Load scales[0..1] as one 16-bit word and separate low nibbles
        // (scales) from high nibbles (mins).
        device const uint16_t * a = (device const uint16_t *)x[i].scales;
        aux16[0] = a[0] & 0x0f0f;
        aux16[1] = (a[0] >> 4) & 0x0f0f;

        // Low nibbles of q feed the first 32 values (scale/min index 0), high
        // nibbles the last 32 (index 1); the min term is subtracted once per
        // y value it applies to.
        for (int l = 0; l < 4; ++l) {
            sumf += d * scales[0] * (y[l+ 0] * (q[l] & 0xF) + y[l+16] * (q[l+16] & 0xF)) - m * scales[2] * (y[l+ 0] + y[l+16])
                  + d * scales[1] * (y[l+32] * (q[l] >>  4) + y[l+48] * (q[l+16] >>  4)) - m * scales[3] * (y[l+32] + y[l+48]);
        }
    }
 #endif

    // Publish this thread's partial sum for the threadgroup reduction.
    sum[ith] = sumf;

@@ -1401,7 +1596,7 @@ kernel void kernel_mul_mat_q4_k_f32(
    //}
 }

 // Dot product of one q5_K-quantized src0 row (selected by tgpig.x) with one
 // float src1 row (selected by tgpig.y). Each thread accumulates a partial sum
 // in `sumf` and stores it to sum[ith].
 // NOTE(review): old/new variants of several lines appear side by side (kernel
 // name, `x` declaration, the kmask constants both before and inside the #if,
 // `float sumf`) and '@@' hunk headers hide part of the parameter list and
 // body, including the declaration of `sum` and the reduction — diff text with
 // +/- markers stripped. The comments below describe the visible lines only.
 kernel void kernel_mul_mat_q5_k_f32(
 kernel void kernel_mul_mat_q5_K_f32(
        device const  void * src0,
        device const float * src1,
        device       float * dst,
@@ -1413,21 +1608,25 @@ kernel void kernel_mul_mat_q5_k_f32(
        uint2 tpitg[[thread_position_in_threadgroup]],
        uint2  tptg[[threads_per_threadgroup]]) {

    const uint16_t kmask1 = 0x3f3f;
    const uint16_t kmask2 = 0x0f0f;
    const uint16_t kmask3 = 0xc0c0;

    // Number of super-blocks per src0 row.
    const int nb = ne00/QK_K;

    const int64_t r0 = tgpig.x;
    const int64_t r1 = tgpig.y;

    // x: first block of src0 row r0; yy: start of src1 row r1.
    device const block_q5_k * x = (device const block_q5_k *) src0 + r0*nb;
    device const block_q5_K * x = (device const block_q5_K *) src0 + r0*nb;
    device const float     * yy = (device const float      *) src1 + r1*ne10;

    // nth: total threads in the threadgroup; ith: this thread's flattened index.
    const int nth = tptg.x*tptg.y;
    const int ith = tptg.y*tpitg.x + tpitg.y;

    float sumf = 0;

 #if QK_K == 256

    const uint16_t kmask1 = 0x3f3f;
    const uint16_t kmask2 = 0x0f0f;
    const uint16_t kmask3 = 0xc0c0;

    const int tid = tpitg.y;   // 0...16
    const int il  = tid/4;     // 0...3
    const int ir  = tid - 4*il;// 0...3
@@ -1447,7 +1646,6 @@ kernel void kernel_mul_mat_q5_k_f32(

    uchar2 sc1, sc2, sc3, sc4;

    float sumf = 0;
    // Blocks of the row are distributed across threads along tpitg.x.
    for (int i = tpitg.x; i < nb; i += tptg.x) {

        device const uint8_t * q1 = (x + i)->qs + q_offset;
@@ -1479,6 +1677,28 @@ kernel void kernel_mul_mat_q5_k_f32(
        sumf += dall * (s[0] * sc1[0] + s[1] * sc1[1] + s[2] * sc3[0] + s[3] * sc3[1]) - dmin * smin;

    }
 #else
    // QK_K == 64: each thread handles qs bytes il..il+3 and il+16..il+19.
    // qh has only 8 bytes, so `in` selects the byte and `im` selects which of
    // the two adjacent high-bit positions applies to this thread's values.
    const int il  = 4 * tpitg.x;  // 0, 4, 8, 12
    const int im  = il/8;         // 0, 0, 1, 1
    const int in  = il%8;         // 0, 4, 0, 4

    // In this branch blocks are distributed across threads along tpitg.y.
    for (int i = tpitg.y; i < nb; i += tptg.y) {

        const float d = (float)x[i].d;
        device const uint8_t * q = x[i].qs + il;
        device const uint8_t * h = x[i].qh + in;
        device const int8_t  * s = x[i].scales;
        device const float   * y = yy + i*QK_K + il;

        // Each quant is a 4-bit nibble from q, minus 16 when the matching
        // high bit (after shifting the qh byte by im) is clear; s[j] scales
        // the j-th group of 16 values.
        for (int l = 0; l < 4; ++l) {
            const uint8_t hl = h[l] >> im;
            sumf += y[l+ 0] * d * s[0] * ((q[l+ 0] & 0xF) - (hl & 0x01 ? 0 : 16))
                  + y[l+16] * d * s[1] * ((q[l+16] & 0xF) - (hl & 0x04 ? 0 : 16))
                  + y[l+32] * d * s[2] * ((q[l+ 0] >>  4) - (hl & 0x10 ? 0 : 16))
                  + y[l+48] * d * s[3] * ((q[l+16] >>  4) - (hl & 0x40 ? 0 : 16));
        }
    }
 #endif
    // Publish this thread's partial sum for the threadgroup reduction.
    sum[ith] = sumf;

    //
@@ -1500,7 +1720,7 @@ kernel void kernel_mul_mat_q5_k_f32(

 }

 kernel void kernel_mul_mat_q6_k_f32(
 kernel void kernel_mul_mat_q6_K_f32(
        device const  void * src0,
        device const float * src1,
        device       float * dst,
@@ -1522,12 +1742,15 @@ kernel void kernel_mul_mat_q6_k_f32(
    const int64_t r0 = tgpig.x;
    const int64_t r1 = tgpig.y;

    device const block_q6_k * x = (device const block_q6_k *) src0 + r0*nb;
    device const block_q6_K * x = (device const block_q6_K *) src0 + r0*nb;
    device const float     * yy = (device const float      *) src1 + r1*ne10;

    const int nth = tptg.x*tptg.y;
    const int ith = tptg.y*tpitg.x + tpitg.y;

    float sumf = 0;

 #if QK_K == 256
    // Note: we absolutely assume that tptg.y = 16 and QK_K = 256!
    const int iqs  = 16 * tpitg.y;
    const int ip   = iqs / 128;         // 0 or 1
@@ -1540,7 +1763,6 @@ kernel void kernel_mul_mat_q6_k_f32(
    const int q_offset_l = 64*ip + l0;
    const int q_offset_h = 32*ip + l0;

    float sumf = 0;
    for (int i = tpitg.x; i < nb; i += tptg.x) {

        device const uint8_t * ql = x[i].ql + q_offset_l;
@@ -1562,6 +1784,28 @@ kernel void kernel_mul_mat_q6_k_f32(
        sumf += dall * (sums[0] * sc[0] + sums[1] * sc[2] + sums[2] * sc[4] + sums[3] * sc[6]);

    }
 #else
    const int il  = 4*tpitg.x;    // 0, 4, 8, 12

    for (int i = tpitg.y; i < nb; i += tptg.y) {
        device const float * y = yy + i * QK_K + il;
        device const uint8_t * ql = x[i].ql + il;
        device const uint8_t * qh = x[i].qh + il;
        device const int8_t  * s  = x[i].scales;

        const float d = x[i].d;

        float4 sums = {0.f, 0.f, 0.f, 0.f};
        for (int l = 0; l < 4; ++l) {
            sums[0] += y[l+ 0] * ((int8_t)((ql[l+ 0] & 0xF) | ((qh[l] & kmask1) << 4)) - 32);
            sums[1] += y[l+16] * ((int8_t)((ql[l+16] & 0xF) | ((qh[l] & kmask2) << 2)) - 32);
            sums[2] += y[l+32] * ((int8_t)((ql[l+ 0] >>  4) | ((qh[l] & kmask3) >> 0)) - 32);
            sums[3] += y[l+48] * ((int8_t)((ql[l+16] >>  4) | ((qh[l] & kmask4) >> 2)) - 32);
        }
        sumf += d * (sums[0] * s[0] + sums[1] * s[1] + sums[2] * s[2] + sums[3] * s[3]);
    }

 #endif

    sum[ith] = sumf;

--- a/LLama/runtimes/libllama.dylib
+++ b/LLama/runtimes/libllama.dylib