You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

dgemv_n_4.c 18 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617
  1. /***************************************************************************
  2. Copyright (c) 2019, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. #include "common.h"
  /* Row-block size: CNAME walks the rows of A in chunks of at most NBMAX
   * rows, and the last partial chunk size is derived from m & (NBMAX - 1). */
  28. #define NBMAX 2048
  /*
   * dgemv_kernel_4x4 - add four scaled columns of A to y (s390x vector asm).
   *
   * For every processed element i:
   *   y[i] += (alpha * x[0]) * ap[0][i] + (alpha * x[1]) * ap[1][i]
   *         + (alpha * x[2]) * ap[2][i] + (alpha * x[3]) * ap[3][i]
   * evaluated as four successive fused multiply-adds into y[i].
   *
   * n     - element count. Blocks of 16 elements are handled first, then
   *         blocks of 4; the low two bits of n are ignored, so a remainder
   *         of (n & 3) elements is NOT processed by this kernel.
   * ap    - four column pointers; each column is read for n elements.
   * x     - the four multipliers x[0..3].
   * y     - accumulator vector, read and updated in place.
   * alpha - pointer to the scalar that pre-scales x[0..3].
   */
  29. static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
  30. FLOAT *alpha) {
  31. register FLOAT *ap0 = ap[0];
  32. register FLOAT *ap1 = ap[1];
  33. register FLOAT *ap2 = ap[2];
  34. register FLOAT *ap3 = ap[3];
  /* Replicate x[0..3] into both lanes of v0..v3 and alpha into v4,
   * then pre-scale so that vK holds alpha * x[K]. */
  35. __asm__("vlrepg %%v0,0(%[x])\n\t"
  36. "vlrepg %%v1,8(%[x])\n\t"
  37. "vlrepg %%v2,16(%[x])\n\t"
  38. "vlrepg %%v3,24(%[x])\n\t"
  39. "vlrepg %%v4,%[alpha]\n\t"
  40. "vfmdb %%v0,%%v0,%%v4\n\t"
  41. "vfmdb %%v1,%%v1,%%v4\n\t"
  42. "vfmdb %%v2,%%v2,%%v4\n\t"
  43. "vfmdb %%v3,%%v3,%%v4\n\t"
  /* r1 = running byte offset (starts at 0), shared by y and all columns.
   * r0 = (n & -16) >> 4 = number of 16-element blocks; skip to the tail
   * at label 1 when that count is zero. */
  44. "xgr %%r1,%%r1\n\t"
  45. "lghi %%r0,-16\n\t"
  46. "ngr %%r0,%[n]\n\t"
  47. "ltgr %%r0,%%r0\n\t"
  48. "jz 1f\n\t"
  49. "srlg %%r0,%%r0,4\n\t"
  /* Main loop: one pass handles 16 doubles (128 bytes) in two halves. */
  50. "0:\n\t"
  /* Prefetch 1024 bytes ahead: code 1 for the columns (fetch access),
   * code 2 for y (store access). */
  51. "pfd 1,1024(%%r1,%[ap0])\n\t"
  52. "pfd 1,1024(%%r1,%[ap1])\n\t"
  53. "pfd 1,1024(%%r1,%[ap2])\n\t"
  54. "pfd 1,1024(%%r1,%[ap3])\n\t"
  55. "pfd 2,1024(%%r1,%[y])\n\t"
  /* First half (bytes 0..63): four 16-byte vectors from each column
   * (v16..v31) and four from y (v4..v7). */
  56. "vl %%v16,0(%%r1,%[ap0])\n\t"
  57. "vl %%v17,0(%%r1,%[ap1])\n\t"
  58. "vl %%v18,0(%%r1,%[ap2])\n\t"
  59. "vl %%v19,0(%%r1,%[ap3])\n\t"
  60. "vl %%v20,16(%%r1,%[ap0])\n\t"
  61. "vl %%v21,16(%%r1,%[ap1])\n\t"
  62. "vl %%v22,16(%%r1,%[ap2])\n\t"
  63. "vl %%v23,16(%%r1,%[ap3])\n\t"
  64. "vl %%v24,32(%%r1,%[ap0])\n\t"
  65. "vl %%v25,32(%%r1,%[ap1])\n\t"
  66. "vl %%v26,32(%%r1,%[ap2])\n\t"
  67. "vl %%v27,32(%%r1,%[ap3])\n\t"
  68. "vl %%v28,48(%%r1,%[ap0])\n\t"
  69. "vl %%v29,48(%%r1,%[ap1])\n\t"
  70. "vl %%v30,48(%%r1,%[ap2])\n\t"
  71. "vl %%v31,48(%%r1,%[ap3])\n\t"
  72. "vl %%v4,0(%%r1,%[y])\n\t"
  73. "vl %%v5,16(%%r1,%[y])\n\t"
  74. "vl %%v6,32(%%r1,%[y])\n\t"
  75. "vl %%v7,48(%%r1,%[y])\n\t"
  /* y += column_k * (alpha * x[k]) for k = 0, 1, 2, 3 in that order. */
  76. "vfmadb %%v4,%%v16,%%v0,%%v4\n\t"
  77. "vfmadb %%v5,%%v20,%%v0,%%v5\n\t"
  78. "vfmadb %%v6,%%v24,%%v0,%%v6\n\t"
  79. "vfmadb %%v7,%%v28,%%v0,%%v7\n\t"
  80. "vfmadb %%v4,%%v17,%%v1,%%v4\n\t"
  81. "vfmadb %%v5,%%v21,%%v1,%%v5\n\t"
  82. "vfmadb %%v6,%%v25,%%v1,%%v6\n\t"
  83. "vfmadb %%v7,%%v29,%%v1,%%v7\n\t"
  84. "vfmadb %%v4,%%v18,%%v2,%%v4\n\t"
  85. "vfmadb %%v5,%%v22,%%v2,%%v5\n\t"
  86. "vfmadb %%v6,%%v26,%%v2,%%v6\n\t"
  87. "vfmadb %%v7,%%v30,%%v2,%%v7\n\t"
  88. "vfmadb %%v4,%%v19,%%v3,%%v4\n\t"
  89. "vfmadb %%v5,%%v23,%%v3,%%v5\n\t"
  90. "vfmadb %%v6,%%v27,%%v3,%%v6\n\t"
  91. "vfmadb %%v7,%%v31,%%v3,%%v7\n\t"
  92. "vst %%v4,0(%%r1,%[y])\n\t"
  93. "vst %%v5,16(%%r1,%[y])\n\t"
  94. "vst %%v6,32(%%r1,%[y])\n\t"
  95. "vst %%v7,48(%%r1,%[y])\n\t"
  /* Second half (bytes 64..127): same sequence, reusing the registers. */
  96. "vl %%v16,64(%%r1,%[ap0])\n\t"
  97. "vl %%v17,64(%%r1,%[ap1])\n\t"
  98. "vl %%v18,64(%%r1,%[ap2])\n\t"
  99. "vl %%v19,64(%%r1,%[ap3])\n\t"
  100. "vl %%v20,80(%%r1,%[ap0])\n\t"
  101. "vl %%v21,80(%%r1,%[ap1])\n\t"
  102. "vl %%v22,80(%%r1,%[ap2])\n\t"
  103. "vl %%v23,80(%%r1,%[ap3])\n\t"
  104. "vl %%v24,96(%%r1,%[ap0])\n\t"
  105. "vl %%v25,96(%%r1,%[ap1])\n\t"
  106. "vl %%v26,96(%%r1,%[ap2])\n\t"
  107. "vl %%v27,96(%%r1,%[ap3])\n\t"
  108. "vl %%v28,112(%%r1,%[ap0])\n\t"
  109. "vl %%v29,112(%%r1,%[ap1])\n\t"
  110. "vl %%v30,112(%%r1,%[ap2])\n\t"
  111. "vl %%v31,112(%%r1,%[ap3])\n\t"
  112. "vl %%v4,64(%%r1,%[y])\n\t"
  113. "vl %%v5,80(%%r1,%[y])\n\t"
  114. "vl %%v6,96(%%r1,%[y])\n\t"
  115. "vl %%v7,112(%%r1,%[y])\n\t"
  116. "vfmadb %%v4,%%v16,%%v0,%%v4\n\t"
  117. "vfmadb %%v5,%%v20,%%v0,%%v5\n\t"
  118. "vfmadb %%v6,%%v24,%%v0,%%v6\n\t"
  119. "vfmadb %%v7,%%v28,%%v0,%%v7\n\t"
  120. "vfmadb %%v4,%%v17,%%v1,%%v4\n\t"
  121. "vfmadb %%v5,%%v21,%%v1,%%v5\n\t"
  122. "vfmadb %%v6,%%v25,%%v1,%%v6\n\t"
  123. "vfmadb %%v7,%%v29,%%v1,%%v7\n\t"
  124. "vfmadb %%v4,%%v18,%%v2,%%v4\n\t"
  125. "vfmadb %%v5,%%v22,%%v2,%%v5\n\t"
  126. "vfmadb %%v6,%%v26,%%v2,%%v6\n\t"
  127. "vfmadb %%v7,%%v30,%%v2,%%v7\n\t"
  128. "vfmadb %%v4,%%v19,%%v3,%%v4\n\t"
  129. "vfmadb %%v5,%%v23,%%v3,%%v5\n\t"
  130. "vfmadb %%v6,%%v27,%%v3,%%v6\n\t"
  131. "vfmadb %%v7,%%v31,%%v3,%%v7\n\t"
  132. "vst %%v4,64(%%r1,%[y])\n\t"
  133. "vst %%v5,80(%%r1,%[y])\n\t"
  134. "vst %%v6,96(%%r1,%[y])\n\t"
  135. "vst %%v7,112(%%r1,%[y])\n\t"
  /* Advance the offset by 128 bytes; brctg decrements r0 and loops
   * while it is non-zero. */
  136. "agfi %%r1,128\n\t"
  137. "brctg %%r0,0b\n\t"
  /* Tail: r0 = (n & 12) >> 2 = number of remaining 4-element blocks;
   * jump to label 3 when there are none. */
  138. "1:\n\t"
  139. "lghi %%r0,12\n\t"
  140. "ngr %%r0,%[n]\n\t"
  141. "ltgr %%r0,%%r0\n\t"
  142. "jz 3f\n\t"
  143. "srlg %%r0,%%r0,2\n\t"
  /* Tail loop: one pass handles 4 doubles (32 bytes). */
  144. "2:\n\t"
  145. "vl %%v16,0(%%r1,%[ap0])\n\t"
  146. "vl %%v17,0(%%r1,%[ap1])\n\t"
  147. "vl %%v18,0(%%r1,%[ap2])\n\t"
  148. "vl %%v19,0(%%r1,%[ap3])\n\t"
  149. "vl %%v20,16(%%r1,%[ap0])\n\t"
  150. "vl %%v21,16(%%r1,%[ap1])\n\t"
  151. "vl %%v22,16(%%r1,%[ap2])\n\t"
  152. "vl %%v23,16(%%r1,%[ap3])\n\t"
  153. "vl %%v4,0(%%r1,%[y])\n\t"
  154. "vl %%v5,16(%%r1,%[y])\n\t"
  155. "vfmadb %%v4,%%v16,%%v0,%%v4\n\t"
  156. "vfmadb %%v5,%%v20,%%v0,%%v5\n\t"
  157. "vfmadb %%v4,%%v17,%%v1,%%v4\n\t"
  158. "vfmadb %%v5,%%v21,%%v1,%%v5\n\t"
  159. "vfmadb %%v4,%%v18,%%v2,%%v4\n\t"
  160. "vfmadb %%v5,%%v22,%%v2,%%v5\n\t"
  161. "vfmadb %%v4,%%v19,%%v3,%%v4\n\t"
  162. "vfmadb %%v5,%%v23,%%v3,%%v5\n\t"
  163. "vst %%v4,0(%%r1,%[y])\n\t"
  164. "vst %%v5,16(%%r1,%[y])\n\t"
  165. "agfi %%r1,32\n\t"
  166. "brctg %%r0,2b\n\t"
  167. "3:\n\t"
  168. "nop 0"
  /* Operands: the unnamed memory operands emit no code; they declare to
   * the compiler that the asm reads and writes y[0..n) and reads n
   * elements of each column plus x[0..3]. r0 and r1 are scratch and are
   * listed as clobbers together with every vector register used. */
  169. : "+m"(*(FLOAT (*)[n]) y)
  170. : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap0),[ap0] "a"(ap0),
  171. "m"(*(const FLOAT (*)[n]) ap1),[ap1] "a"(ap1),
  172. "m"(*(const FLOAT (*)[n]) ap2),[ap2] "a"(ap2),
  173. "m"(*(const FLOAT (*)[n]) ap3),[ap3] "a"(ap3),
  174. "m"(*(const FLOAT (*)[4]) x),[x] "a"(x),[alpha] "Q"(*alpha),
  175. [n] "r"(n)
  176. : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
  177. "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
  178. "v26", "v27", "v28", "v29", "v30", "v31");
  179. }
  /*
   * dgemv_kernel_4x2 - add two scaled columns of A to y (s390x vector asm).
   *
   * For every processed element i:
   *   y[i] += (alpha * x[0]) * ap[0][i] + (alpha * x[1]) * ap[1][i]
   * evaluated as two successive fused multiply-adds into y[i].
   *
   * n     - element count. Blocks of 16 elements are handled first, then
   *         blocks of 4; a remainder of (n & 3) elements is NOT processed.
   * ap    - column pointers; only ap[0] and ap[1] are read.
   * x     - the two multipliers x[0..1].
   * y     - accumulator vector, read and updated in place.
   * alpha - pointer to the scalar that pre-scales x[0..1].
   */
  180. static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
  181. FLOAT *alpha) {
  182. register FLOAT *ap0 = ap[0];
  183. register FLOAT *ap1 = ap[1];
  /* v0 = alpha * x[0], v1 = alpha * x[1], each replicated in both lanes.
   * v2 only holds alpha temporarily and is reused for y below. */
  184. __asm__("vlrepg %%v0,0(%[x])\n\t"
  185. "vlrepg %%v1,8(%[x])\n\t"
  186. "vlrepg %%v2,%[alpha]\n\t"
  187. "vfmdb %%v0,%%v0,%%v2\n\t"
  188. "vfmdb %%v1,%%v1,%%v2\n\t"
  /* r1 = running byte offset (starts at 0); r0 = (n & -16) >> 4 =
   * number of 16-element blocks, skip to label 1 when zero. */
  189. "xgr %%r1,%%r1\n\t"
  190. "lghi %%r0,-16\n\t"
  191. "ngr %%r0,%[n]\n\t"
  192. "ltgr %%r0,%%r0\n\t"
  193. "jz 1f\n\t"
  194. "srlg %%r0,%%r0,4\n\t"
  /* Main loop: one pass handles 16 doubles (128 bytes). */
  195. "0:\n\t"
  /* Prefetch 1024 bytes ahead (code 1: fetch access, code 2: store). */
  196. "pfd 1,1024(%%r1,%[ap0])\n\t"
  197. "pfd 1,1024(%%r1,%[ap1])\n\t"
  198. "pfd 2,1024(%%r1,%[y])\n\t"
  /* Even registers v16..v30 take column 0, odd v17..v31 take column 1. */
  199. "vl %%v16,0(%%r1,%[ap0])\n\t"
  200. "vl %%v17,0(%%r1,%[ap1])\n\t"
  201. "vl %%v18,16(%%r1,%[ap0])\n\t"
  202. "vl %%v19,16(%%r1,%[ap1])\n\t"
  203. "vl %%v20,32(%%r1,%[ap0])\n\t"
  204. "vl %%v21,32(%%r1,%[ap1])\n\t"
  205. "vl %%v22,48(%%r1,%[ap0])\n\t"
  206. "vl %%v23,48(%%r1,%[ap1])\n\t"
  207. "vl %%v24,64(%%r1,%[ap0])\n\t"
  208. "vl %%v25,64(%%r1,%[ap1])\n\t"
  209. "vl %%v26,80(%%r1,%[ap0])\n\t"
  210. "vl %%v27,80(%%r1,%[ap1])\n\t"
  211. "vl %%v28,96(%%r1,%[ap0])\n\t"
  212. "vl %%v29,96(%%r1,%[ap1])\n\t"
  213. "vl %%v30,112(%%r1,%[ap0])\n\t"
  214. "vl %%v31,112(%%r1,%[ap1])\n\t"
  /* The eight y vectors of this block live in v2..v9. */
  215. "vl %%v2,0(%%r1,%[y])\n\t"
  216. "vl %%v3,16(%%r1,%[y])\n\t"
  217. "vl %%v4,32(%%r1,%[y])\n\t"
  218. "vl %%v5,48(%%r1,%[y])\n\t"
  219. "vl %%v6,64(%%r1,%[y])\n\t"
  220. "vl %%v7,80(%%r1,%[y])\n\t"
  221. "vl %%v8,96(%%r1,%[y])\n\t"
  222. "vl %%v9,112(%%r1,%[y])\n\t"
  /* y += column0 * v0, then y += column1 * v1. */
  223. "vfmadb %%v2,%%v16,%%v0,%%v2\n\t"
  224. "vfmadb %%v3,%%v18,%%v0,%%v3\n\t"
  225. "vfmadb %%v4,%%v20,%%v0,%%v4\n\t"
  226. "vfmadb %%v5,%%v22,%%v0,%%v5\n\t"
  227. "vfmadb %%v6,%%v24,%%v0,%%v6\n\t"
  228. "vfmadb %%v7,%%v26,%%v0,%%v7\n\t"
  229. "vfmadb %%v8,%%v28,%%v0,%%v8\n\t"
  230. "vfmadb %%v9,%%v30,%%v0,%%v9\n\t"
  231. "vfmadb %%v2,%%v17,%%v1,%%v2\n\t"
  232. "vfmadb %%v3,%%v19,%%v1,%%v3\n\t"
  233. "vfmadb %%v4,%%v21,%%v1,%%v4\n\t"
  234. "vfmadb %%v5,%%v23,%%v1,%%v5\n\t"
  235. "vfmadb %%v6,%%v25,%%v1,%%v6\n\t"
  236. "vfmadb %%v7,%%v27,%%v1,%%v7\n\t"
  237. "vfmadb %%v8,%%v29,%%v1,%%v8\n\t"
  238. "vfmadb %%v9,%%v31,%%v1,%%v9\n\t"
  239. "vst %%v2,0(%%r1,%[y])\n\t"
  240. "vst %%v3,16(%%r1,%[y])\n\t"
  241. "vst %%v4,32(%%r1,%[y])\n\t"
  242. "vst %%v5,48(%%r1,%[y])\n\t"
  243. "vst %%v6,64(%%r1,%[y])\n\t"
  244. "vst %%v7,80(%%r1,%[y])\n\t"
  245. "vst %%v8,96(%%r1,%[y])\n\t"
  246. "vst %%v9,112(%%r1,%[y])\n\t"
  /* Advance by 128 bytes; brctg decrements r0 and loops while non-zero. */
  247. "agfi %%r1,128\n\t"
  248. "brctg %%r0,0b\n\t"
  /* Tail: r0 = (n & 12) >> 2 = number of remaining 4-element blocks. */
  249. "1:\n\t"
  250. "lghi %%r0,12\n\t"
  251. "ngr %%r0,%[n]\n\t"
  252. "ltgr %%r0,%%r0\n\t"
  253. "jz 3f\n\t"
  254. "srlg %%r0,%%r0,2\n\t"
  /* Tail loop: one pass handles 4 doubles (32 bytes). */
  255. "2:\n\t"
  256. "vl %%v16,0(%%r1,%[ap0])\n\t"
  257. "vl %%v17,0(%%r1,%[ap1])\n\t"
  258. "vl %%v18,16(%%r1,%[ap0])\n\t"
  259. "vl %%v19,16(%%r1,%[ap1])\n\t"
  260. "vl %%v2,0(%%r1,%[y])\n\t"
  261. "vl %%v3,16(%%r1,%[y])\n\t"
  262. "vfmadb %%v2,%%v16,%%v0,%%v2\n\t"
  263. "vfmadb %%v3,%%v18,%%v0,%%v3\n\t"
  264. "vfmadb %%v2,%%v17,%%v1,%%v2\n\t"
  265. "vfmadb %%v3,%%v19,%%v1,%%v3\n\t"
  266. "vst %%v2,0(%%r1,%[y])\n\t"
  267. "vst %%v3,16(%%r1,%[y])\n\t"
  268. "agfi %%r1,32\n\t"
  269. "brctg %%r0,2b\n\t"
  270. "3:\n\t"
  271. "nop 0"
  /* Operands: the unnamed memory operands emit no code; they declare that
   * the asm reads and writes y[0..n) and reads n elements of each column
   * plus x[0..1]. r0, r1 and all used vector registers are clobbered. */
  272. : "+m"(*(FLOAT (*)[n]) y)
  273. : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap0),[ap0] "a"(ap0),
  274. "m"(*(const FLOAT (*)[n]) ap1),[ap1] "a"(ap1),
  275. "m"(*(const FLOAT (*)[2]) x),[x] "a"(x),[alpha] "Q"(*alpha),
  276. [n] "r"(n)
  277. : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
  278. "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
  279. "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
  280. }
  /*
   * dgemv_kernel_4x1 - add one scaled column of A to y (s390x vector asm).
   *
   * For every processed element i:
   *   y[i] += (alpha * x[0]) * a0[i]
   * computed with a fused multiply-add.
   *
   * n     - element count. Blocks of 16 elements are handled first, then
   *         blocks of 4; a remainder of (n & 3) elements is NOT processed.
   * a0    - the column, read for n elements.
   * x     - pointer to the single multiplier x[0].
   * y     - accumulator vector, read and updated in place.
   * alpha - pointer to the scalar that pre-scales x[0].
   */
  281. static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y,
  282. FLOAT *alpha) {
  /* v0 = alpha * x[0] replicated in both lanes; v16 holds alpha only
   * temporarily and is reused for column data below. */
  283. __asm__("vlrepg %%v0,0(%[x])\n\t"
  284. "vlrepg %%v16,%[alpha]\n\t"
  285. "vfmdb %%v0,%%v0,%%v16\n\t"
  /* r1 = running byte offset (starts at 0); r0 = (n & -16) >> 4 =
   * number of 16-element blocks, skip to label 1 when zero. */
  286. "xgr %%r1,%%r1\n\t"
  287. "lghi %%r0,-16\n\t"
  288. "ngr %%r0,%[n]\n\t"
  289. "ltgr %%r0,%%r0\n\t"
  290. "jz 1f\n\t"
  291. "srlg %%r0,%%r0,4\n\t"
  /* Main loop: one pass handles 16 doubles (128 bytes). */
  292. "0:\n\t"
  /* Prefetch 1024 bytes ahead (code 1: fetch access, code 2: store). */
  293. "pfd 1,1024(%%r1,%[a0])\n\t"
  294. "pfd 2,1024(%%r1,%[y])\n\t"
  /* Column data in v16..v23, matching y data in v24..v31. */
  295. "vl %%v16,0(%%r1,%[a0])\n\t"
  296. "vl %%v17,16(%%r1,%[a0])\n\t"
  297. "vl %%v18,32(%%r1,%[a0])\n\t"
  298. "vl %%v19,48(%%r1,%[a0])\n\t"
  299. "vl %%v20,64(%%r1,%[a0])\n\t"
  300. "vl %%v21,80(%%r1,%[a0])\n\t"
  301. "vl %%v22,96(%%r1,%[a0])\n\t"
  302. "vl %%v23,112(%%r1,%[a0])\n\t"
  303. "vl %%v24,0(%%r1,%[y])\n\t"
  304. "vl %%v25,16(%%r1,%[y])\n\t"
  305. "vl %%v26,32(%%r1,%[y])\n\t"
  306. "vl %%v27,48(%%r1,%[y])\n\t"
  307. "vl %%v28,64(%%r1,%[y])\n\t"
  308. "vl %%v29,80(%%r1,%[y])\n\t"
  309. "vl %%v30,96(%%r1,%[y])\n\t"
  310. "vl %%v31,112(%%r1,%[y])\n\t"
  /* y += column * v0. */
  311. "vfmadb %%v24,%%v16,%%v0,%%v24\n\t"
  312. "vfmadb %%v25,%%v17,%%v0,%%v25\n\t"
  313. "vfmadb %%v26,%%v18,%%v0,%%v26\n\t"
  314. "vfmadb %%v27,%%v19,%%v0,%%v27\n\t"
  315. "vfmadb %%v28,%%v20,%%v0,%%v28\n\t"
  316. "vfmadb %%v29,%%v21,%%v0,%%v29\n\t"
  317. "vfmadb %%v30,%%v22,%%v0,%%v30\n\t"
  318. "vfmadb %%v31,%%v23,%%v0,%%v31\n\t"
  319. "vst %%v24,0(%%r1,%[y])\n\t"
  320. "vst %%v25,16(%%r1,%[y])\n\t"
  321. "vst %%v26,32(%%r1,%[y])\n\t"
  322. "vst %%v27,48(%%r1,%[y])\n\t"
  323. "vst %%v28,64(%%r1,%[y])\n\t"
  324. "vst %%v29,80(%%r1,%[y])\n\t"
  325. "vst %%v30,96(%%r1,%[y])\n\t"
  326. "vst %%v31,112(%%r1,%[y])\n\t"
  /* Advance by 128 bytes; brctg decrements r0 and loops while non-zero. */
  327. "agfi %%r1,128\n\t"
  328. "brctg %%r0,0b\n\t"
  /* Tail: r0 = (n & 12) >> 2 = number of remaining 4-element blocks. */
  329. "1:\n\t"
  330. "lghi %%r0,12\n\t"
  331. "ngr %%r0,%[n]\n\t"
  332. "ltgr %%r0,%%r0\n\t"
  333. "jz 3f\n\t"
  334. "srlg %%r0,%%r0,2\n\t"
  /* Tail loop: one pass handles 4 doubles (32 bytes). */
  335. "2:\n\t"
  336. "vl %%v16,0(%%r1,%[a0])\n\t"
  337. "vl %%v17,16(%%r1,%[a0])\n\t"
  338. "vl %%v18,0(%%r1,%[y])\n\t"
  339. "vl %%v19,16(%%r1,%[y])\n\t"
  340. "vfmadb %%v18,%%v16,%%v0,%%v18\n\t"
  341. "vfmadb %%v19,%%v17,%%v0,%%v19\n\t"
  342. "vst %%v18,0(%%r1,%[y])\n\t"
  343. "vst %%v19,16(%%r1,%[y])\n\t"
  344. "agfi %%r1,32\n\t"
  345. "brctg %%r0,2b\n\t"
  346. "3:\n\t"
  347. "nop 0"
  /* Operands: the unnamed memory operands emit no code; they declare that
   * the asm reads and writes y[0..n) and reads a0[0..n) plus x[0].
   * r0, r1 and all used vector registers are clobbered. */
  348. : "+m"(*(FLOAT (*)[n]) y)
  349. : [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0),
  350. "m"(*(const FLOAT (*)[1]) x),[x] "a"(x),[alpha] "Q"(*alpha),
  351. [n] "r"(n)
  352. : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21",
  353. "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
  354. "v31");
  355. }
  356. static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) {
  357. BLASLONG i;
  358. for (i = 0; i < n; i++) {
  359. *dest += src[i];
  360. dest += inc_dest;
  361. }
  362. }
  363. int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a,
  364. BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
  365. FLOAT *buffer) {
  366. BLASLONG i;
  367. FLOAT *a_ptr;
  368. FLOAT *x_ptr;
  369. FLOAT *y_ptr;
  370. FLOAT *ap[4];
  371. BLASLONG n1;
  372. BLASLONG m1;
  373. BLASLONG m2;
  374. BLASLONG m3;
  375. BLASLONG n2;
  376. BLASLONG lda4 = lda << 2;
  377. FLOAT xbuffer[8], *ybuffer;
  378. if (m < 1)
  379. return (0);
  380. if (n < 1)
  381. return (0);
  382. ybuffer = buffer;
  383. n1 = n >> 2;
  384. n2 = n & 3;
  385. m3 = m & 3;
  386. m1 = m & -4;
  387. m2 = (m & (NBMAX - 1)) - m3;
  388. y_ptr = y;
  389. BLASLONG NB = NBMAX;
  390. while (NB == NBMAX) {
  391. m1 -= NB;
  392. if (m1 < 0) {
  393. if (m2 == 0)
  394. break;
  395. NB = m2;
  396. }
  397. a_ptr = a;
  398. x_ptr = x;
  399. ap[0] = a_ptr;
  400. ap[1] = a_ptr + lda;
  401. ap[2] = ap[1] + lda;
  402. ap[3] = ap[2] + lda;
  403. if (inc_y != 1)
  404. memset(ybuffer, 0, NB * 8);
  405. else
  406. ybuffer = y_ptr;
  407. if (inc_x == 1) {
  408. for (i = 0; i < n1; i++) {
  409. dgemv_kernel_4x4(NB, ap, x_ptr, ybuffer, &alpha);
  410. ap[0] += lda4;
  411. ap[1] += lda4;
  412. ap[2] += lda4;
  413. ap[3] += lda4;
  414. a_ptr += lda4;
  415. x_ptr += 4;
  416. }
  417. if (n2 & 2) {
  418. dgemv_kernel_4x2(NB, ap, x_ptr, ybuffer, &alpha);
  419. a_ptr += lda * 2;
  420. x_ptr += 2;
  421. }
  422. if (n2 & 1) {
  423. dgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer, &alpha);
  424. /* a_ptr += lda;
  425. x_ptr += 1; */
  426. }
  427. } else {
  428. for (i = 0; i < n1; i++) {
  429. xbuffer[0] = x_ptr[0];
  430. x_ptr += inc_x;
  431. xbuffer[1] = x_ptr[0];
  432. x_ptr += inc_x;
  433. xbuffer[2] = x_ptr[0];
  434. x_ptr += inc_x;
  435. xbuffer[3] = x_ptr[0];
  436. x_ptr += inc_x;
  437. dgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, &alpha);
  438. ap[0] += lda4;
  439. ap[1] += lda4;
  440. ap[2] += lda4;
  441. ap[3] += lda4;
  442. a_ptr += lda4;
  443. }
  444. for (i = 0; i < n2; i++) {
  445. xbuffer[0] = x_ptr[0];
  446. x_ptr += inc_x;
  447. dgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, &alpha);
  448. a_ptr += lda;
  449. }
  450. }
  451. a += NB;
  452. if (inc_y != 1) {
  453. add_y(NB, ybuffer, y_ptr, inc_y);
  454. y_ptr += NB * inc_y;
  455. } else
  456. y_ptr += NB;
  457. }
  458. if (m3 == 0)
  459. return (0);
  460. if (m3 == 3) {
  461. a_ptr = a;
  462. x_ptr = x;
  463. FLOAT temp0 = 0.0;
  464. FLOAT temp1 = 0.0;
  465. FLOAT temp2 = 0.0;
  466. if (lda == 3 && inc_x == 1) {
  467. for (i = 0; i < (n & -4); i += 4) {
  468. temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1];
  469. temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1];
  470. temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1];
  471. temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3];
  472. temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3];
  473. temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3];
  474. a_ptr += 12;
  475. x_ptr += 4;
  476. }
  477. for (; i < n; i++) {
  478. temp0 += a_ptr[0] * x_ptr[0];
  479. temp1 += a_ptr[1] * x_ptr[0];
  480. temp2 += a_ptr[2] * x_ptr[0];
  481. a_ptr += 3;
  482. x_ptr++;
  483. }
  484. } else {
  485. for (i = 0; i < n; i++) {
  486. temp0 += a_ptr[0] * x_ptr[0];
  487. temp1 += a_ptr[1] * x_ptr[0];
  488. temp2 += a_ptr[2] * x_ptr[0];
  489. a_ptr += lda;
  490. x_ptr += inc_x;
  491. }
  492. }
  493. y_ptr[0] += alpha * temp0;
  494. y_ptr += inc_y;
  495. y_ptr[0] += alpha * temp1;
  496. y_ptr += inc_y;
  497. y_ptr[0] += alpha * temp2;
  498. return (0);
  499. }
  500. if (m3 == 2) {
  501. a_ptr = a;
  502. x_ptr = x;
  503. FLOAT temp0 = 0.0;
  504. FLOAT temp1 = 0.0;
  505. if (lda == 2 && inc_x == 1) {
  506. for (i = 0; i < (n & -4); i += 4) {
  507. temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
  508. temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
  509. temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
  510. temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3];
  511. a_ptr += 8;
  512. x_ptr += 4;
  513. }
  514. for (; i < n; i++) {
  515. temp0 += a_ptr[0] * x_ptr[0];
  516. temp1 += a_ptr[1] * x_ptr[0];
  517. a_ptr += 2;
  518. x_ptr++;
  519. }
  520. } else {
  521. for (i = 0; i < n; i++) {
  522. temp0 += a_ptr[0] * x_ptr[0];
  523. temp1 += a_ptr[1] * x_ptr[0];
  524. a_ptr += lda;
  525. x_ptr += inc_x;
  526. }
  527. }
  528. y_ptr[0] += alpha * temp0;
  529. y_ptr += inc_y;
  530. y_ptr[0] += alpha * temp1;
  531. return (0);
  532. }
  533. if (m3 == 1) {
  534. a_ptr = a;
  535. x_ptr = x;
  536. FLOAT temp = 0.0;
  537. if (lda == 1 && inc_x == 1) {
  538. for (i = 0; i < (n & -4); i += 4) {
  539. temp +=
  540. a_ptr[i] * x_ptr[i] + a_ptr[i + 1] * x_ptr[i + 1] + a_ptr[i +
  541. 2] *
  542. x_ptr[i + 2] + a_ptr[i + 3] * x_ptr[i + 3];
  543. }
  544. for (; i < n; i++) {
  545. temp += a_ptr[i] * x_ptr[i];
  546. }
  547. } else {
  548. for (i = 0; i < n; i++) {
  549. temp += a_ptr[0] * x_ptr[0];
  550. a_ptr += lda;
  551. x_ptr += inc_x;
  552. }
  553. }
  554. y_ptr[0] += alpha * temp;
  555. return (0);
  556. }
  557. return (0);
  558. }