BENCH: add benchmarks using codspeed.io

1 year ago · 9f28161837
--- a/.github/workflows/codspeed-bench.yml
+++ b/.github/workflows/codspeed-bench.yml
@@ -0,0 +1,150 @@
 name: Run codspeed benchmarks

 on: [push, pull_request]

 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

 permissions:
  contents: read # to fetch code (actions/checkout)

 jobs:
  benchmarks:
    if: "github.repository == 'OpenMathLib/OpenBLAS'"
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest]
        fortran: [gfortran]
        build: [make]
        pyver: ["3.12"]
    runs-on: ${{ matrix.os }}
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v3
        with:
            python-version: ${{ matrix.pyver }}

      - name: Print system information
        run: |
          if [ "$RUNNER_OS" == "Linux" ]; then
            cat /proc/cpuinfo
          fi

      - name: Install Dependencies
        run: |
          if [ "$RUNNER_OS" == "Linux" ]; then
            sudo apt-get update
            sudo apt-get install -y gfortran cmake ccache libtinfo5
          else
            echo "::error::$RUNNER_OS not supported"
            exit 1
          fi

      - name: Compilation cache
        uses: actions/cache@v3
        with:
          path: ~/.ccache
          # We include the commit sha in the cache key, as new cache entries are
          # only created if there is no existing entry for the key yet.
          # GNU make and cmake call the compilers differently. It looks like
          # that causes the cache to mismatch. Keep the ccache for both build
          # tools separate to avoid polluting each other.
          key: ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}-${{ github.sha }}
          # Restore a matching ccache cache entry. Prefer same branch and same Fortran compiler.
          restore-keys: |
            ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}
            ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}
            ccache-${{ runner.os }}-${{ matrix.build }}

      - name: Write out the .pc
        run: |
             cd benchmark/pybench
             cat > openblas.pc << EOF
             libdir=${{ github.workspace }}
             includedir= ${{ github.workspace }}
             openblas_config= OpenBLAS 0.3.27 DYNAMIC_ARCH NO_AFFINITY Haswell MAX_THREADS=64
             version=0.0.99
             extralib=-lm -lpthread -lgfortran -lquadmath -L${{ github.workspace }} -lopenblas
             Name: openblas
             Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
             Version: ${version}
             URL: https://github.com/xianyi/OpenBLAS
             Libs: ${{ github.workspace }}/libopenblas.so -Wl,-rpath,${{ github.workspace }}
             Libs.private: -lm -lpthread -lgfortran -lquadmath -L${{ github.workspace }} -lopenblas
             Cflags: -I${{ github.workspace}}
             EOF
             cat openblas.pc

      - name: Configure ccache
        run: |
          if [ "${{ matrix.build }}" = "make" ]; then
            # Add ccache to path
            if [ "$RUNNER_OS" = "Linux" ]; then
              echo "/usr/lib/ccache" >> $GITHUB_PATH
            elif [ "$RUNNER_OS" = "macOS" ]; then
              echo "$(brew --prefix)/opt/ccache/libexec" >> $GITHUB_PATH
            else
              echo "::error::$RUNNER_OS not supported"
              exit 1
            fi
          fi
          # Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota (5 GB).
          test -d ~/.ccache || mkdir -p ~/.ccache
          echo "max_size = 300M" > ~/.ccache/ccache.conf
          echo "compression = true" >> ~/.ccache/ccache.conf
          ccache -s

      - name: Build OpenBLAS
        run: |
          case "${{ matrix.build }}" in
            "make")
              make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 FC="ccache ${{ matrix.fortran }}"
              ;;
            "cmake")
              mkdir build && cd build
              cmake -DDYNAMIC_ARCH=1 \
                    -DNOFORTRAN=0 \
                    -DBUILD_WITHOUT_LAPACK=0 \
                    -DCMAKE_VERBOSE_MAKEFILE=ON \
                    -DCMAKE_BUILD_TYPE=Release \
                    -DCMAKE_Fortran_COMPILER=${{ matrix.fortran }} \
                    -DCMAKE_C_COMPILER_LAUNCHER=ccache \
                    -DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \
                    ..
              cmake --build .
              ;;
            *)
              echo "::error::Configuration not supported"
              exit 1
              ;;
          esac

      - name: Show ccache status
        continue-on-error: true
        run: ccache -s

      - name: Install benchmark dependencies
        run: pip install meson ninja numpy pytest pytest-codspeed --user

      - name: Build the wrapper
        run: |
          cd benchmark/pybench
          export PKG_CONFIG_PATH=$PWD
          meson setup build  --prefix=$PWD/build-install
          meson install -C build
          #
          # sanity check
          cd build/openblas_wrap
          python -c'import _flapack; print(dir(_flapack))'

      - name: Run benchmarks
        uses: CodSpeedHQ/action@v2
        with:
          token: ${{ secrets.CODSPEED_TOKEN }}
          run: |
            cd benchmark/pybench
            export PYTHONPATH=$PWD/build-install/lib/python${{matrix.pyver}}/site-packages/
            OPENBLAS_NUM_THREADS=1 pytest benchmarks/bench_blas.py --codspeed

--- a/.gitignore
+++ b/.gitignore
@@ -109,3 +109,4 @@ benchmark/smallscaling
 CMakeCache.txt
 CMakeFiles/*
 .vscode
 **/__pycache__
--- a/benchmark/pybench/benchmarks/bench_blas.py
+++ b/benchmark/pybench/benchmarks/bench_blas.py
@@ -0,0 +1,185 @@
 import pytest
 import numpy as np
 from openblas_wrap import (
    # level 1
    dnrm2, ddot, daxpy,
    # level 3
    dgemm, dsyrk,
    # lapack
    dgesv,                   # linalg.solve
    dgesdd, dgesdd_lwork,    # linalg.svd
    dsyev, dsyev_lwork,      # linalg.eigh
 )

 # ### BLAS level 1 ###

 # dnrm2

 dnrm2_sizes = [100, 1000]

 def run_dnrm2(n, x, incx):
    res = dnrm2(x, n, incx=incx)
    return res


@pytest.mark.parametrize('n', dnrm2_sizes)
 def test_nrm2(benchmark, n):
    rndm = np.random.RandomState(1234)
    x = np.array(rndm.uniform(size=(n,)), dtype=float)
    result = benchmark(run_dnrm2, n, x, 1)


 # ddot

 ddot_sizes = [100, 1000]

 def run_ddot(x, y,):
    res = ddot(x, y)
    return res


@pytest.mark.parametrize('n', ddot_sizes)
 def test_dot(benchmark, n):
    rndm = np.random.RandomState(1234)
    x = np.array(rndm.uniform(size=(n,)), dtype=float)
    y = np.array(rndm.uniform(size=(n,)), dtype=float)
    result = benchmark(run_ddot, x, y)


 # daxpy

 daxpy_sizes = [100, 1000]

 def run_daxpy(x, y,):
    res = daxpy(x, y, a=2.0)
    return res


@pytest.mark.parametrize('n', daxpy_sizes)
 def test_daxpy(benchmark, n):
    rndm = np.random.RandomState(1234)
    x = np.array(rndm.uniform(size=(n,)), dtype=float)
    y = np.array(rndm.uniform(size=(n,)), dtype=float)
    result = benchmark(run_daxpy, x, y)




 # ### BLAS level 3 ###

 # dgemm

 gemm_sizes = [100, 1000]

 def run_gemm(a, b, c):
    alpha = 1.0
    res = dgemm(alpha, a, b, c=c, overwrite_c=True)
    return res


@pytest.mark.parametrize('n', gemm_sizes)
 def test_gemm(benchmark, n):
    rndm = np.random.RandomState(1234)
    a = np.array(rndm.uniform(size=(n, n)), dtype=float, order='F')
    b = np.array(rndm.uniform(size=(n, n)), dtype=float, order='F')
    c = np.empty((n, n), dtype=float, order='F')
    result = benchmark(run_gemm, a, b, c)
    assert result is c


 # dsyrk

 syrk_sizes = [100, 1000]


 def run_syrk(a, c):
    res = dsyrk(1.0, a, c=c, overwrite_c=True)
    return res


@pytest.mark.parametrize('n', syrk_sizes)
 def test_syrk(benchmark, n):
    rndm = np.random.RandomState(1234)
    a = np.array(rndm.uniform(size=(n, n)), dtype=float, order='F')
    c = np.empty((n, n), dtype=float, order='F')
    result = benchmark(run_syrk, a, c)
    assert result is c


 # ### LAPACK ###

 # linalg.solve

 gesv_sizes = [100, 1000]


 def run_gesv(a, b):
    res = dgesv(a, b, overwrite_a=True, overwrite_b=True)
    return res


@pytest.mark.parametrize('n', gesv_sizes)
 def test_gesv(benchmark, n):
    rndm = np.random.RandomState(1234)
    a = (np.array(rndm.uniform(size=(n, n)), dtype=float, order='F') +
         np.eye(n, order='F'))
    b = np.array(rndm.uniform(size=(n, 1)), order='F')
    lu, piv, x, info = benchmark(run_gesv, a, b)
    assert lu is a
    assert x is b
    assert info == 0


 # linalg.svd

 gesdd_sizes = [(100, 5), (1000, 222)]


 def run_gesdd(a, lwork):
    res = dgesdd(a, lwork=lwork, full_matrices=False, overwrite_a=False)
    return res


@pytest.mark.parametrize('mn', gesdd_sizes)
 def test_gesdd(benchmark, mn):
    m, n = mn
    rndm = np.random.RandomState(1234)
    a = np.array(rndm.uniform(size=(m, n)), dtype=float, order='F')

    lwork, info = dgesdd_lwork(m, n)
    lwork = int(lwork)
    assert info == 0

    u, s, vt, info = benchmark(run_gesdd, a, lwork)

    assert info == 0
    np.testing.assert_allclose(u @ np.diag(s) @ vt, a, atol=1e-13)


 # linalg.eigh

 syev_sizes = [50, 200]


 def run_syev(a, lwork):
    res = dsyev(a, lwork=lwork, overwrite_a=True)
    return res


@pytest.mark.parametrize('n', syev_sizes)
 def test_syev(benchmark, n):
    rndm = np.random.RandomState(1234)
    a = rndm.uniform(size=(n, n))
    a = np.asarray(a + a.T, dtype=float, order='F')
    a_ = a.copy()

    lwork, info = dsyev_lwork(n)
    lwork = int(lwork)
    assert info == 0

    w, v, info = benchmark(run_syev, a, lwork)

    assert info == 0
    assert a is v  # overwrite_a=True


--- a/benchmark/pybench/meson.build
+++ b/benchmark/pybench/meson.build
@@ -0,0 +1,48 @@
 #
 # Taken from SciPy (of course)
 #
 project(
  'openblas-wrap',
  'c', 'fortran',
  version: '0.1',
  license: 'BSD-3',
  meson_version: '>= 1.1.0',
  default_options: [
    'buildtype=debugoptimized',
    'b_ndebug=if-release',
    'c_std=c17',
    'fortran_std=legacy',
  ],
 )

 py3 = import('python').find_installation(pure: false)
 py3_dep = py3.dependency()

 cc = meson.get_compiler('c')

 _global_c_args = cc.get_supported_arguments(
  '-Wno-unused-but-set-variable',
  '-Wno-unused-function',
  '-Wno-conversion',
  '-Wno-misleading-indentation',
 )
 add_project_arguments(_global_c_args, language : 'c')

 # We need -lm for all C code (assuming it uses math functions, which is safe to
 # assume for SciPy). For C++ it isn't needed, because libstdc++/libc++ is
 # guaranteed to depend on it. For Fortran code, Meson already adds `-lm`.
 m_dep = cc.find_library('m', required : false)
 if m_dep.found()
  add_project_link_arguments('-lm', language : 'c')
 endif

 generate_f2pymod = find_program('openblas_wrap/generate_f2pymod.py')

 openblas = dependency('openblas', method: 'pkg-config', required: true)
 openblas_dep = declare_dependency(
  dependencies: openblas,
  compile_args: []
 )


 subdir('openblas_wrap')
--- a/benchmark/pybench/openblas_wrap/init.py
+++ b/benchmark/pybench/openblas_wrap/init.py
@@ -0,0 +1,28 @@
 """
 Trampoline to hide the LAPACK details (scipy.lapack.linalg or scipy_openblas32 or...)
 from benchmarking.
 """

 __version__ = "0.1"  


 #from scipy.linalg.blas import (
 from ._flapack import (
    # level 1
    dnrm2 as dnrm2,
    ddot as ddot,
    daxpy as daxpy,
    # level 3
    dgemm as dgemm,
    dsyrk as dsyrk,
 )

 #from scipy.linalg.lapack import (
 from openblas_wrap._flapack import (
    # linalg.solve
    dgesv as dgesv,
    # linalg.svd
    dgesdd as dgesdd, dgesdd_lwork as dgesdd_lwork,
    # linalg.eigh
    dsyev as dsyev, dsyev_lwork as dsyev_lwork
 )
--- a/benchmark/pybench/openblas_wrap/blas_lapack.pyf.src
+++ b/benchmark/pybench/openblas_wrap/blas_lapack.pyf.src
@@ -0,0 +1,326 @@
 !
 ! Taken from scipy/linalg
 !
 ! Shorthand notations
 !
 ! <tchar=s,d,cs,zd>
 ! <tchar2c=cs,zd>
 !
 ! <prefix2=s,d>
 ! <prefix2c=c,z>
 ! <prefix3=s,sc>
 ! <prefix4=d,dz>
 ! <prefix6=s,d,c,z,c,z>
 !
 ! <ftype2=real,double precision>
 ! <ftype2c=complex,double complex>
 ! <ftype3=real,complex>
 ! <ftype4=double precision,double complex>
 ! <ftypereal3=real,real>
 ! <ftypereal4=double precision,double precision>
 ! <ftype6=real,double precision,complex,double complex,\2,\3>
 ! <ftype6creal=real,double precision,complex,double complex,\0,\1>
 !
 ! <ctype2=float,double>
 ! <ctype2c=complex_float,complex_double>
 ! <ctype3=float,complex_float>
 ! <ctype4=double,complex_double>
 ! <ctypereal3=float,float>
 ! <ctypereal4=double,double>
 ! <ctype6=float,double,complex_float,complex_double,\2,\3>
 ! <ctype6creal=float,double,complex_float,complex_double,\0,\1>
 !
 !
 ! Level 1 BLAS
 !


 python module _flapack
    usercode '''
 #define F_INT int
 '''

 interface


 subroutine <prefix>axpy(n,a,x,offx,incx,y,offy,incy)
  ! Calculate z = a*x+y, where a is scalar.

  callstatement (*f2py_func)(&n,&a,x+offx,&incx,y+offy,&incy)
  callprotoargument F_INT*,<ctype>*,<ctype>*,F_INT*,<ctype>*,F_INT*

  <ftype> dimension(*), intent(in) :: x
  <ftype> dimension(*), intent(in,out,out=z) :: y
  <ftype> optional, intent(in):: a=<1.0,\0,(1.0\,0.0),\2>
  integer optional, intent(in),check(incx>0||incx<0) :: incx = 1
  integer optional, intent(in),check(incy>0||incy<0) :: incy = 1
  integer optional, intent(in),depend(x) :: offx=0
  integer optional, intent(in),depend(y) :: offy=0
  check(offx>=0 && offx<len(x)) :: offx
  check(offy>=0 && offy<len(y)) :: offy
  integer optional, intent(in),depend(x,incx,offx,y,incy,offy) :: &
       n = (len(x)-offx)/abs(incx)
  check(len(x)-offx>(n-1)*abs(incx)) :: n
  check(len(y)-offy>(n-1)*abs(incy)) :: n

 end subroutine <prefix>axpy

 function ddot(n,x,offx,incx,y,offy,incy) result (xy)
  ! Computes a vector-vector dot product.

  callstatement ddot_return_value = (*f2py_func)(&n,x+offx,&incx,y+offy,&incy)
  callprotoargument F_INT*,double*,F_INT*,double*,F_INT*
  intent(c) ddot
  fortranname F_FUNC(ddot,DDOT)

  double precision dimension(*), intent(in) :: x
  double precision dimension(*), intent(in) :: y
  double precision ddot,xy
  integer optional, intent(in),check(incx>0||incx<0) :: incx = 1
  integer optional, intent(in),check(incy>0||incy<0) :: incy = 1
  integer optional, intent(in),depend(x) :: offx=0
  integer optional, intent(in),depend(y) :: offy=0
  check(offx>=0 && offx<len(x)) :: offx
  check(offy>=0 && offy<len(y)) :: offy
  integer optional, intent(in),depend(x,incx,offx,y,incy,offy) :: &
       n = (len(x)-offx)/abs(incx)
  check(len(x)-offx>(n-1)*abs(incx)) :: n
  check(len(y)-offy>(n-1)*abs(incy)) :: n

 end function ddot


 function <prefix4>nrm2(n,x,offx,incx) result(n2)

  <ftypereal4> <prefix4>nrm2, n2

  callstatement <prefix4>nrm2_return_value = (*f2py_func)(&n,x+offx,&incx)
  callprotoargument F_INT*,<ctype4>*,F_INT*
  intent(c) <prefix4>nrm2
  fortranname F_FUNC(<prefix4>nrm2,<D,DZ>NRM2)

  <ftype4> dimension(*),intent(in) :: x

  integer optional, intent(in),check(incx>0) :: incx = 1

  integer optional,intent(in),depend(x) :: offx=0
  check(offx>=0 && offx<len(x)) :: offx

  integer optional,intent(in),depend(x,incx,offx) :: n = (len(x)-offx)/abs(incx)
  check(len(x)-offx>(n-1)*abs(incx)) :: n

 end function <prefix4>nrm2

 !
 ! Level 3 BLAS
 !


 subroutine <prefix>gemm(m,n,k,alpha,a,b,beta,c,trans_a,trans_b,lda,ka,ldb,kb)
  ! Computes a scalar-matrix-matrix product and adds the result to a
  ! scalar-matrix product.
  !
  ! c = gemm(alpha,a,b,beta=0,c=0,trans_a=0,trans_b=0,overwrite_c=0)
  ! Calculate C <- alpha * op(A) * op(B) + beta * C

  callstatement (*f2py_func)((trans_a?(trans_a==2?"C":"T"):"N"), &
       (trans_b?(trans_b==2?"C":"T"):"N"),&m,&n,&k,&alpha,a,&lda,b,&ldb,&beta,c,&m)
  callprotoargument char*,char*,F_INT*,F_INT*,F_INT*,<ctype>*,<ctype>*,F_INT*,<ctype>*, &
       F_INT*,<ctype>*,<ctype>*,F_INT*

  integer optional,intent(in),check(trans_a>=0 && trans_a <=2) :: trans_a = 0
  integer optional,intent(in),check(trans_b>=0 && trans_b <=2) :: trans_b = 0
  <ftype> intent(in) :: alpha
  <ftype> intent(in),optional :: beta = <0.0,\0,(0.0\,0.0),\2>

  <ftype> dimension(lda,ka),intent(in) :: a
  <ftype> dimension(ldb,kb),intent(in) :: b
  <ftype> dimension(m,n),intent(in,out,copy),depend(m,n),optional :: c
  check(shape(c,0)==m && shape(c,1)==n) :: c

  integer depend(a),intent(hide) :: lda = shape(a,0)
  integer depend(a),intent(hide) :: ka = shape(a,1)
  integer depend(b),intent(hide) :: ldb = shape(b,0)
  integer depend(b),intent(hide) :: kb = shape(b,1)

  integer depend(a,trans_a,ka,lda),intent(hide):: m = (trans_a?ka:lda)
  integer depend(a,trans_a,ka,lda),intent(hide):: k = (trans_a?lda:ka)
  integer depend(b,trans_b,kb,ldb,k),intent(hide),check(trans_b?kb==k:ldb==k) :: &
       n = (trans_b?ldb:kb)

 end subroutine <prefix>gemm


 subroutine <prefix6><sy,\0,\0,\0,he,he>rk(n,k,alpha,a,beta,c,trans,lower,lda,ka)
  !  performs one of the symmetric rank k operations
  !     C := alpha*A*A**T + beta*C,  or   C := alpha*A**T*A + beta*C,
  !
  ! c = syrk(alpha,a,beta=0,c=0,trans=0,lower=0,overwrite_c=0)
  !
  callstatement (*f2py_func)((lower?"L":"U"), &
        (trans?(trans==2?"C":"T"):"N"), &n,&k,&alpha,a,&lda,&beta,c,&n)
  callprotoargument char*,char*,F_INT*,F_INT*,<ctype6>*,<ctype6>*,F_INT*,<ctype6>*, &
        <ctype6>*,F_INT*

  integer optional, intent(in),check(lower==0||lower==1) :: lower = 0
  integer optional,intent(in),check(trans>=0 && trans <=2) :: trans = 0

  <ftype6> intent(in) :: alpha
  <ftype6> intent(in),optional :: beta = <0.0,\0,(0.0\,0.0),\2,\2,\2>

  <ftype6> dimension(lda,ka),intent(in) :: a
  <ftype6> dimension(n,n),intent(in,out,copy),depend(n),optional :: c
  check(shape(c,0)==n && shape(c,1)==n) :: c

  integer depend(a),intent(hide) :: lda = shape(a,0)
  integer depend(a),intent(hide) :: ka = shape(a,1)

  integer depend(a, trans, ka, lda), intent(hide) :: n = (trans ? ka : lda)
  integer depend(a, trans, ka, lda), intent(hide) :: k = (trans ? lda : ka)

 end subroutine <prefix6><sy,\0,\0,\0,he,he>rk


 !
 ! LAPACK
 !

 subroutine <prefix>gesv(n,nrhs,a,piv,b,info)
    ! lu,piv,x,info = gesv(a,b,overwrite_a=0,overwrite_b=0)
    ! Solve A * X = B.
    ! A = P * L * U
    ! U is upper diagonal triangular, L is unit lower triangular,
    ! piv pivots columns.

    callstatement {F_INT i;(*f2py_func)(&n,&nrhs,a,&n,piv,b,&n,&info);for(i=0;i\<n;--piv[i++]);}
    callprotoargument F_INT*,F_INT*,<ctype>*,F_INT*,F_INT*,<ctype>*,F_INT*,F_INT*

    integer depend(a),intent(hide):: n = shape(a,0)
    integer depend(b),intent(hide):: nrhs = shape(b,1)
    <ftype> dimension(n,n),check(shape(a,0)==shape(a,1)) :: a
    integer dimension(n),depend(n),intent(out) :: piv
    <ftype> dimension(n,nrhs),check(shape(a,0)==shape(b,0)),depend(n) :: b
    integer intent(out)::info
    intent(in,out,copy,out=x) b
    intent(in,out,copy,out=lu) a
 end subroutine <prefix>gesv


 subroutine <prefix2>gesdd(m,n,minmn,u0,u1,vt0,vt1,a,compute_uv,full_matrices,u,s,vt,work,lwork,iwork,info)
    ! u,s,vt,info = gesdd(a,compute_uv=1,lwork=..,overwrite_a=0)
    ! Compute the singular value decomposition (SVD) using divide and conquer:
    !   A = U * SIGMA * transpose(V)
    ! A - M x N matrix
    ! U - M x M matrix or min(M,N) x N if full_matrices=False
    ! SIGMA - M x N zero matrix with a main diagonal filled with min(M,N)
    !               singular values
    ! transpose(V) - N x N matrix or N x min(M,N) if full_matrices=False

    callstatement (*f2py_func)((compute_uv?(full_matrices?"A":"S"):"N"),&m,&n,a,&m,s,u,&u0,vt,&vt0,work,&lwork,iwork,&info)
    callprotoargument char*,F_INT*,F_INT*,<ctype2>*,F_INT*,<ctype2>*,<ctype2>*,F_INT*,<ctype2>*,F_INT*,<ctype2>*,F_INT*,F_INT*,F_INT*

    integer intent(in),optional,check(compute_uv==0||compute_uv==1):: compute_uv = 1
    integer intent(in),optional,check(full_matrices==0||full_matrices==1):: full_matrices = 1
    integer intent(hide),depend(a):: m = shape(a,0)
    integer intent(hide),depend(a):: n = shape(a,1)
    integer intent(hide),depend(m,n):: minmn = MIN(m,n)
    integer intent(hide),depend(compute_uv,minmn) :: u0 = (compute_uv?m:1)
    integer intent(hide),depend(compute_uv,minmn, full_matrices) :: u1 = (compute_uv?(full_matrices?m:minmn):1)
    integer intent(hide),depend(compute_uv,minmn, full_matrices) :: vt0 = (compute_uv?(full_matrices?n:minmn):1)
    integer intent(hide),depend(compute_uv,minmn) :: vt1 = (compute_uv?n:1)
    <ftype2> dimension(m,n),intent(in,copy,aligned8) :: a
    <ftype2> dimension(minmn),intent(out),depend(minmn) :: s
    <ftype2> dimension(u0,u1),intent(out),depend(u0, u1) :: u
    <ftype2> dimension(vt0,vt1),intent(out),depend(vt0, vt1) :: vt
    <ftype2> dimension(lwork),intent(hide,cache),depend(lwork) :: work
    integer optional,intent(in),depend(minmn,compute_uv) &
        :: lwork = max((compute_uv?4*minmn*minmn+MAX(m,n)+9*minmn:MAX(14*minmn+4,10*minmn+2+25*(25+8))+MAX(m,n)),1)
    integer intent(hide,cache),dimension(8*minmn),depend(minmn) :: iwork
    integer intent(out)::info

 end subroutine <prefix2>gesdd

 subroutine <prefix2>gesdd_lwork(m,n,minmn,u0,vt0,a,compute_uv,full_matrices,u,s,vt,work,lwork,iwork,info)
    ! LWORK computation for (S/D)GESDD

    fortranname <prefix2>gesdd
    callstatement (*f2py_func)((compute_uv?(full_matrices?"A":"S"):"N"),&m,&n,&a,&m,&s,&u,&u0,&vt,&vt0,&work,&lwork,&iwork,&info)
    callprotoargument char*,F_INT*,F_INT*,<ctype2>*,F_INT*,<ctype2>*,<ctype2>*,F_INT*,<ctype2>*,F_INT*,<ctype2>*,F_INT*,F_INT*,F_INT*

    integer intent(in),optional,check(compute_uv==0||compute_uv==1):: compute_uv = 1
    integer intent(in),optional,check(full_matrices==0||full_matrices==1):: full_matrices = 1
    integer intent(in) :: m
    integer intent(in) :: n
    integer intent(hide),depend(m,n):: minmn = MIN(m,n)
    integer intent(hide),depend(compute_uv,minmn) :: u0 = (compute_uv?m:1)
    integer intent(hide),depend(compute_uv,minmn, full_matrices) :: vt0 = (compute_uv?(full_matrices?n:minmn):1)
    <ftype2> intent(hide) :: a
    <ftype2> intent(hide) :: s
    <ftype2> intent(hide) :: u
    <ftype2> intent(hide) :: vt
    <ftype2> intent(out) :: work
    integer intent(hide) :: lwork = -1
    integer intent(hide) :: iwork
    integer intent(out) :: info

 end subroutine <prefix2>gesdd_lwork


 subroutine <prefix2>syev(compute_v,lower,n,w,a,lda,work,lwork,info)
    ! w,v,info = syev(a,compute_v=1,lower=0,lwork=3*n-1,overwrite_a=0)
    ! Compute all eigenvalues and, optionally, eigenvectors of a
    ! real symmetric matrix A.
    !
    ! Performance tip:
    !   If compute_v=0 then set also overwrite_a=1.

    callstatement (*f2py_func)((compute_v?"V":"N"),(lower?"L":"U"),&n,a,&lda,w,work,&lwork,&info)
    callprotoargument char*,char*,F_INT*,<ctype2>*,F_INT*,<ctype2>*,<ctype2>*,F_INT*,F_INT*

    integer optional,intent(in):: compute_v = 1
    check(compute_v==1||compute_v==0) compute_v
    integer optional,intent(in),check(lower==0||lower==1) :: lower = 0

    integer intent(hide),depend(a):: n = shape(a,0)
    integer intent(hide),depend(a):: lda = MAX(1,shape(a,0))
    <ftype2> dimension(n,n),check(shape(a,0)==shape(a,1)) :: a
    intent(in,copy,out,out=v) :: a

    <ftype2> dimension(n),intent(out),depend(n) :: w

    integer optional,intent(in),depend(n) :: lwork=max(3*n-1,1)
    check(lwork>=3*n-1) :: lwork
    <ftype2> dimension(lwork),intent(hide),depend(lwork) :: work

    integer intent(out) :: info

 end subroutine <prefix2>syev


 subroutine <prefix2>syev_lwork(lower,n,w,a,lda,work,lwork,info)
    ! LWORK routines for syev

    fortranname <prefix2>syev

    callstatement (*f2py_func)("N",(lower?"L":"U"),&n,&a,&lda,&w,&work,&lwork,&info)
    callprotoargument char*,char*,F_INT*,<ctype2>*,F_INT*,<ctype2>*,<ctype2>*,F_INT*,F_INT*
    
     integer intent(in):: n
     integer optional,intent(in),check(lower==0||lower==1) :: lower = 0
     
     integer intent(hide),depend(n):: lda = MAX(1, n)
     <ftype2> intent(hide):: a
     <ftype2> intent(hide):: w
     integer intent(hide):: lwork = -1
    
     <ftype2> intent(out):: work
     integer intent(out):: info
     
 end subroutine <prefix2>syev_lwork

 end interface

 end python module _flapack



--- a/benchmark/pybench/openblas_wrap/generate_f2pymod.py
+++ b/benchmark/pybench/openblas_wrap/generate_f2pymod.py
@@ -0,0 +1,299 @@
 #!/usr/bin/env python3
 """
 Process f2py template files (`filename.pyf.src` -> `filename.pyf`)

 Usage: python generate_pyf.py filename.pyf.src -o filename.pyf
 """

 import os
 import sys
 import re
 import subprocess
 import argparse


 # START OF CODE VENDORED FROM `numpy.distutils.from_template`
 #############################################################
 """
 process_file(filename)

  takes templated file .xxx.src and produces .xxx file where .xxx
  is .pyf .f90 or .f using the following template rules:

  '<..>' denotes a template.

  All function and subroutine blocks in a source file with names that
  contain '<..>' will be replicated according to the rules in '<..>'.

  The number of comma-separated words in '<..>' will determine the number of
  replicates.

  '<..>' may have two different forms, named and short. For example,

  named:
   <p=d,s,z,c> where anywhere inside a block '<p>' will be replaced with
   'd', 's', 'z', and 'c' for each replicate of the block.

   <_c>  is already defined: <_c=s,d,c,z>
   <_t>  is already defined: <_t=real,double precision,complex,double complex>

  short:
   <s,d,c,z>, a short form of the named, useful when no <p> appears inside
   a block.

  In general, '<..>' contains a comma separated list of arbitrary
  expressions. If these expression must contain a comma|leftarrow|rightarrow,
  then prepend the comma|leftarrow|rightarrow with a backslash.

  If an expression matches '\\<index>' then it will be replaced
  by <index>-th expression.

  Note that all '<..>' forms in a block must have the same number of
  comma-separated entries.

 Predefined named template rules:
  <prefix=s,d,c,z>
  <ftype=real,double precision,complex,double complex>
  <ftypereal=real,double precision,\\0,\\1>
  <ctype=float,double,complex_float,complex_double>
  <ctypereal=float,double,\\0,\\1>
 """

 routine_start_re = re.compile(
    r'(\n|\A)((     (\$|\*))|)\s*(subroutine|function)\b',
    re.I
 )
 routine_end_re = re.compile(r'\n\s*end\s*(subroutine|function)\b.*(\n|\Z)', re.I)
 function_start_re = re.compile(r'\n     (\$|\*)\s*function\b', re.I)

 def parse_structure(astr):
    """ Return a list of tuples for each function or subroutine each
    tuple is the start and end of a subroutine or function to be
    expanded.
    """

    spanlist = []
    ind = 0
    while True:
        m = routine_start_re.search(astr, ind)
        if m is None:
            break
        start = m.start()
        if function_start_re.match(astr, start, m.end()):
            while True:
                i = astr.rfind('\n', ind, start)
                if i==-1:
                    break
                start = i
                if astr[i:i+7]!='\n     $':
                    break
        start += 1
        m = routine_end_re.search(astr, m.end())
        ind = end = m and m.end()-1 or len(astr)
        spanlist.append((start, end))
    return spanlist

 template_re = re.compile(r"<\s*(\w[\w\d]*)\s*>")
 named_re = re.compile(r"<\s*(\w[\w\d]*)\s*=\s*(.*?)\s*>")
 list_re = re.compile(r"<\s*((.*?))\s*>")

 def find_repl_patterns(astr):
    reps = named_re.findall(astr)
    names = {}
    for rep in reps:
        name = rep[0].strip() or unique_key(names)
        repl = rep[1].replace(r'\,', '@comma@')
        thelist = conv(repl)
        names[name] = thelist
    return names

 def find_and_remove_repl_patterns(astr):
    names = find_repl_patterns(astr)
    astr = re.subn(named_re, '', astr)[0]
    return astr, names

 item_re = re.compile(r"\A\\(?P<index>\d+)\Z")
 def conv(astr):
    b = astr.split(',')
    l = [x.strip() for x in b]
    for i in range(len(l)):
        m = item_re.match(l[i])
        if m:
            j = int(m.group('index'))
            l[i] = l[j]
    return ','.join(l)

 def unique_key(adict):
    """ Obtain a unique key given a dictionary."""
    allkeys = list(adict.keys())
    done = False
    n = 1
    while not done:
        newkey = '__l%s' % (n)
        if newkey in allkeys:
            n += 1
        else:
            done = True
    return newkey


 template_name_re = re.compile(r'\A\s*(\w[\w\d]*)\s*\Z')
 def expand_sub(substr, names):
    substr = substr.replace(r'\>', '@rightarrow@')
    substr = substr.replace(r'\<', '@leftarrow@')
    lnames = find_repl_patterns(substr)
    substr = named_re.sub(r"<\1>", substr)  # get rid of definition templates

    def listrepl(mobj):
        thelist = conv(mobj.group(1).replace(r'\,', '@comma@'))
        if template_name_re.match(thelist):
            return "<%s>" % (thelist)
        name = None
        for key in lnames.keys():    # see if list is already in dictionary
            if lnames[key] == thelist:
                name = key
        if name is None:      # this list is not in the dictionary yet
            name = unique_key(lnames)
            lnames[name] = thelist
        return "<%s>" % name

    substr = list_re.sub(listrepl, substr) # convert all lists to named templates
                                           # newnames are constructed as needed

    numsubs = None
    base_rule = None
    rules = {}
    for r in template_re.findall(substr):
        if r not in rules:
            thelist = lnames.get(r, names.get(r, None))
            if thelist is None:
                raise ValueError('No replicates found for <%s>' % (r))
            if r not in names and not thelist.startswith('_'):
                names[r] = thelist
            rule = [i.replace('@comma@', ',') for i in thelist.split(',')]
            num = len(rule)

            if numsubs is None:
                numsubs = num
                rules[r] = rule
                base_rule = r
            elif num == numsubs:
                rules[r] = rule
            else:
                print("Mismatch in number of replacements (base <{}={}>) "
                      "for <{}={}>. Ignoring."
                      .format(base_rule, ','.join(rules[base_rule]), r, thelist))
    if not rules:
        return substr

    def namerepl(mobj):
        name = mobj.group(1)
        return rules.get(name, (k+1)*[name])[k]

    newstr = ''
    for k in range(numsubs):
        newstr += template_re.sub(namerepl, substr) + '\n\n'

    newstr = newstr.replace('@rightarrow@', '>')
    newstr = newstr.replace('@leftarrow@', '<')
    return newstr

 def process_str(allstr):
    newstr = allstr
    writestr = ''

    struct = parse_structure(newstr)

    oldend = 0
    names = {}
    names.update(_special_names)
    for sub in struct:
        cleanedstr, defs = find_and_remove_repl_patterns(newstr[oldend:sub[0]])
        writestr += cleanedstr
        names.update(defs)
        writestr += expand_sub(newstr[sub[0]:sub[1]], names)
        oldend =  sub[1]
    writestr += newstr[oldend:]

    return writestr

 include_src_re = re.compile(
    r"(\n|\A)\s*include\s*['\"](?P<name>[\w\d./\\]+\.src)['\"]",
    re.I
 )

 def resolve_includes(source):
    d = os.path.dirname(source)
    with open(source) as fid:
        lines = []
        for line in fid:
            m = include_src_re.match(line)
            if m:
                fn = m.group('name')
                if not os.path.isabs(fn):
                    fn = os.path.join(d, fn)
                if os.path.isfile(fn):
                    lines.extend(resolve_includes(fn))
                else:
                    lines.append(line)
            else:
                lines.append(line)
    return lines

 def process_file(source):
    lines = resolve_includes(source)
    return process_str(''.join(lines))

 _special_names = find_repl_patterns('''
 <_c=s,d,c,z>
 <_t=real,double precision,complex,double complex>
 <prefix=s,d,c,z>
 <ftype=real,double precision,complex,double complex>
 <ctype=float,double,complex_float,complex_double>
 <ftypereal=real,double precision,\\0,\\1>
 <ctypereal=float,double,\\0,\\1>
 ''')

 # END OF CODE VENDORED FROM `numpy.distutils.from_template`
 ###########################################################


 def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("infile", type=str,
                        help="Path to the input file")
    parser.add_argument("-o", "--outdir", type=str,
                        help="Path to the output directory")
    args = parser.parse_args()

    if not args.infile.endswith(('.pyf', '.pyf.src', '.f.src')):
        raise ValueError(f"Input file has unknown extension: {args.infile}")

    outdir_abs = os.path.join(os.getcwd(), args.outdir)

    # Write out the .pyf/.f file
    if args.infile.endswith(('.pyf.src', '.f.src')):
        code = process_file(args.infile)
        fname_pyf = os.path.join(args.outdir,
                                 os.path.splitext(os.path.split(args.infile)[1])[0])

        with open(fname_pyf, 'w') as f:
            f.write(code)
    else:
        fname_pyf = args.infile

    # Now invoke f2py to generate the C API module file
    if args.infile.endswith(('.pyf.src', '.pyf')):
        p = subprocess.Popen([sys.executable, '-m', 'numpy.f2py', fname_pyf,
                            '--build-dir', outdir_abs], #'--quiet'],
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                            cwd=os.getcwd())
        out, err = p.communicate()
        if not (p.returncode == 0):
            raise RuntimeError(f"Writing {args.outfile} with f2py failed!\n"
                            f"{out}\n"
                            r"{err}")


 if __name__ == "__main__":
    main()
--- a/benchmark/pybench/openblas_wrap/meson.build
+++ b/benchmark/pybench/openblas_wrap/meson.build
@@ -0,0 +1,50 @@
 # find numpy & f2py includes
 inc_numpy = run_command(py3,
  ['-c', 'import os; os.chdir(".."); import numpy; print(numpy.get_include())'],
  check : true
 ).stdout().strip()

 inc_f2py = run_command(py3,
    ['-c', 'import os; os.chdir(".."); import numpy.f2py; print(numpy.f2py.get_include())'],
    check : true
 ).stdout().strip()


 inc_np = include_directories(inc_numpy, inc_f2py)
 fortranobject_c = inc_f2py / 'fortranobject.c'


 fortranobject_lib = static_library('_fortranobject',
  fortranobject_c,
 #  c_args: numpy_nodepr_api,
  dependencies: py3_dep,
  include_directories: [inc_np, inc_f2py],
  gnu_symbol_visibility: 'hidden',
 )
 fortranobject_dep = declare_dependency(
  link_with: fortranobject_lib,
  include_directories: [inc_np, inc_f2py],
 )


 # f2py generated wrappers

 flapack_module = custom_target('flapack_module',
  output: ['_flapackmodule.c'],
  input: 'blas_lapack.pyf.src',
  command: [generate_f2pymod, '@INPUT@', '-o', '@OUTDIR@'],
 )

 py3.extension_module('_flapack',
  flapack_module,
  link_args: [],  # version_link_args,
  dependencies: [openblas_dep, fortranobject_dep],
  install: true,
  subdir: 'openblas_wrap'
 )


 py3.install_sources(
  ['__init__.py'],
  subdir: 'openblas_wrap'
 )
--- a/benchmark/pybench/scipy_openblas.pc
+++ b/benchmark/pybench/scipy_openblas.pc
@@ -0,0 +1,12 @@
 libdir=/home/br/repos/OpenBLAS/
 includedir=/home/br/repos/OpenBLAS/
 openblas_config= OpenBLAS 0.3.27 DYNAMIC_ARCH NO_AFFINITY Haswell MAX_THREADS=64
 version=0.3.27
 extralib=-lm -lpthread -lgfortran -lquadmath -L${libdir} -lopenblas
 Name: openblas
 Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
 Version: ${version}
 URL: https://github.com/xianyi/OpenBLAS
 Libs: -L${libdir} -lopenblas
 Libs.private: ${extralib}
 Cflags: -I${includedir}