You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

dgemv_n_4.c 19 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607
  1. /***************************************************************************
  2. Copyright (c) 2019, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. #include "common.h"
  /*
   * Row-block size used by CNAME: at most NBMAX rows of the matrix are handed
   * to the vector kernels per pass.  CNAME computes (m & (NBMAX - 1)), so this
   * value has to remain a power of two.
   */
  28. #define NBMAX 2048
  /*
   * dgemv_kernel_4x4 - accumulate four scaled vectors into y:
   *
   *     y[i] += ap[0][i]*(x[0]*alpha) + ap[1][i]*(x[1]*alpha)
   *           + ap[2][i]*(x[2]*alpha) + ap[3][i]*(x[3]*alpha)
   *
   * implemented with z/Architecture vector instructions.
   *
   * n     - element count.  The main loop handles (n & -16) elements and the
   *         tail loop another (n & 12); a remainder of (n & 3) elements is
   *         never touched by this kernel.
   * ap    - array of four source pointers; only ap[0]..ap[3] are read.
   * x     - four multipliers, x[0]..x[3].
   * y     - vector that is read, accumulated into and written back in place.
   * alpha - pointer to the scalar applied to every x[k] before the loops.
   *
   * NOTE(review): the "db" instruction forms and the 8-byte offsets treat
   * FLOAT as a 64-bit double - confirm against common.h.
   */
  29. static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
  30. FLOAT *alpha) {
  /* v0..v3 = x[0..3] replicated into both 64-bit lanes. */
  31. __asm__("vlrepg %%v0,0(%[x])\n\t"
  32. "vlrepg %%v1,8(%[x])\n\t"
  33. "vlrepg %%v2,16(%[x])\n\t"
  34. "vlrepg %%v3,24(%[x])\n\t"
  /* v4 = alpha replicated; fold alpha into the multipliers once, up front. */
  35. "vlrepg %%v4,%[alpha]\n\t"
  36. "vfmdb %%v0,%%v0,%%v4\n\t"
  37. "vfmdb %%v1,%%v1,%%v4\n\t"
  38. "vfmdb %%v2,%%v2,%%v4\n\t"
  39. "vfmdb %%v3,%%v3,%%v4\n\t"
  /* r1 = running byte offset shared by ap[0..3] and y, starting at 0. */
  40. "xgr %%r1,%%r1\n\t"
  /* r0 = n & -16; skip the main loop when zero, otherwise r0 /= 16 gives
     the number of main-loop trips. */
  41. "lghi %%r0,-16\n\t"
  42. "ngr %%r0,%[n]\n\t"
  43. "ltgr %%r0,%%r0\n\t"
  44. "jz 1f\n\t"
  45. "srlg %%r0,%%r0,4\n\t"
  /* Main loop: 16 elements (128 bytes) per trip, handled as two 64-byte
     halves of four vectors each. */
  46. "0:\n\t"
  /* Prefetch 1024 bytes ahead: code 1 (fetch access) for the four sources,
     code 2 (store access) for y. */
  47. "pfd 1,1024(%%r1,%[ap0])\n\t"
  48. "pfd 1,1024(%%r1,%[ap1])\n\t"
  49. "pfd 1,1024(%%r1,%[ap2])\n\t"
  50. "pfd 1,1024(%%r1,%[ap3])\n\t"
  51. "pfd 2,1024(%%r1,%[y])\n\t"
  /* First half: load bytes 0..63 of each source into v16..v31. */
  52. "vl %%v16,0(%%r1,%[ap0])\n\t"
  53. "vl %%v17,0(%%r1,%[ap1])\n\t"
  54. "vl %%v18,0(%%r1,%[ap2])\n\t"
  55. "vl %%v19,0(%%r1,%[ap3])\n\t"
  56. "vl %%v20,16(%%r1,%[ap0])\n\t"
  57. "vl %%v21,16(%%r1,%[ap1])\n\t"
  58. "vl %%v22,16(%%r1,%[ap2])\n\t"
  59. "vl %%v23,16(%%r1,%[ap3])\n\t"
  60. "vl %%v24,32(%%r1,%[ap0])\n\t"
  61. "vl %%v25,32(%%r1,%[ap1])\n\t"
  62. "vl %%v26,32(%%r1,%[ap2])\n\t"
  63. "vl %%v27,32(%%r1,%[ap3])\n\t"
  64. "vl %%v28,48(%%r1,%[ap0])\n\t"
  65. "vl %%v29,48(%%r1,%[ap1])\n\t"
  66. "vl %%v30,48(%%r1,%[ap2])\n\t"
  67. "vl %%v31,48(%%r1,%[ap3])\n\t"
  /* v4..v7 = current y values (v4 no longer needs to hold alpha). */
  68. "vl %%v4,0(%%r1,%[y])\n\t"
  69. "vl %%v5,16(%%r1,%[y])\n\t"
  70. "vl %%v6,32(%%r1,%[y])\n\t"
  71. "vl %%v7,48(%%r1,%[y])\n\t"
  /* Multiply-add one source at a time: ap0*v0, then ap1*v1, ap2*v2, ap3*v3. */
  72. "vfmadb %%v4,%%v16,%%v0,%%v4\n\t"
  73. "vfmadb %%v5,%%v20,%%v0,%%v5\n\t"
  74. "vfmadb %%v6,%%v24,%%v0,%%v6\n\t"
  75. "vfmadb %%v7,%%v28,%%v0,%%v7\n\t"
  76. "vfmadb %%v4,%%v17,%%v1,%%v4\n\t"
  77. "vfmadb %%v5,%%v21,%%v1,%%v5\n\t"
  78. "vfmadb %%v6,%%v25,%%v1,%%v6\n\t"
  79. "vfmadb %%v7,%%v29,%%v1,%%v7\n\t"
  80. "vfmadb %%v4,%%v18,%%v2,%%v4\n\t"
  81. "vfmadb %%v5,%%v22,%%v2,%%v5\n\t"
  82. "vfmadb %%v6,%%v26,%%v2,%%v6\n\t"
  83. "vfmadb %%v7,%%v30,%%v2,%%v7\n\t"
  84. "vfmadb %%v4,%%v19,%%v3,%%v4\n\t"
  85. "vfmadb %%v5,%%v23,%%v3,%%v5\n\t"
  86. "vfmadb %%v6,%%v27,%%v3,%%v6\n\t"
  87. "vfmadb %%v7,%%v31,%%v3,%%v7\n\t"
  /* Write the first 64 bytes of y back. */
  88. "vst %%v4,0(%%r1,%[y])\n\t"
  89. "vst %%v5,16(%%r1,%[y])\n\t"
  90. "vst %%v6,32(%%r1,%[y])\n\t"
  91. "vst %%v7,48(%%r1,%[y])\n\t"
  /* Second half: identical sequence for bytes 64..127. */
  92. "vl %%v16,64(%%r1,%[ap0])\n\t"
  93. "vl %%v17,64(%%r1,%[ap1])\n\t"
  94. "vl %%v18,64(%%r1,%[ap2])\n\t"
  95. "vl %%v19,64(%%r1,%[ap3])\n\t"
  96. "vl %%v20,80(%%r1,%[ap0])\n\t"
  97. "vl %%v21,80(%%r1,%[ap1])\n\t"
  98. "vl %%v22,80(%%r1,%[ap2])\n\t"
  99. "vl %%v23,80(%%r1,%[ap3])\n\t"
  100. "vl %%v24,96(%%r1,%[ap0])\n\t"
  101. "vl %%v25,96(%%r1,%[ap1])\n\t"
  102. "vl %%v26,96(%%r1,%[ap2])\n\t"
  103. "vl %%v27,96(%%r1,%[ap3])\n\t"
  104. "vl %%v28,112(%%r1,%[ap0])\n\t"
  105. "vl %%v29,112(%%r1,%[ap1])\n\t"
  106. "vl %%v30,112(%%r1,%[ap2])\n\t"
  107. "vl %%v31,112(%%r1,%[ap3])\n\t"
  108. "vl %%v4,64(%%r1,%[y])\n\t"
  109. "vl %%v5,80(%%r1,%[y])\n\t"
  110. "vl %%v6,96(%%r1,%[y])\n\t"
  111. "vl %%v7,112(%%r1,%[y])\n\t"
  112. "vfmadb %%v4,%%v16,%%v0,%%v4\n\t"
  113. "vfmadb %%v5,%%v20,%%v0,%%v5\n\t"
  114. "vfmadb %%v6,%%v24,%%v0,%%v6\n\t"
  115. "vfmadb %%v7,%%v28,%%v0,%%v7\n\t"
  116. "vfmadb %%v4,%%v17,%%v1,%%v4\n\t"
  117. "vfmadb %%v5,%%v21,%%v1,%%v5\n\t"
  118. "vfmadb %%v6,%%v25,%%v1,%%v6\n\t"
  119. "vfmadb %%v7,%%v29,%%v1,%%v7\n\t"
  120. "vfmadb %%v4,%%v18,%%v2,%%v4\n\t"
  121. "vfmadb %%v5,%%v22,%%v2,%%v5\n\t"
  122. "vfmadb %%v6,%%v26,%%v2,%%v6\n\t"
  123. "vfmadb %%v7,%%v30,%%v2,%%v7\n\t"
  124. "vfmadb %%v4,%%v19,%%v3,%%v4\n\t"
  125. "vfmadb %%v5,%%v23,%%v3,%%v5\n\t"
  126. "vfmadb %%v6,%%v27,%%v3,%%v6\n\t"
  127. "vfmadb %%v7,%%v31,%%v3,%%v7\n\t"
  128. "vst %%v4,64(%%r1,%[y])\n\t"
  129. "vst %%v5,80(%%r1,%[y])\n\t"
  130. "vst %%v6,96(%%r1,%[y])\n\t"
  131. "vst %%v7,112(%%r1,%[y])\n\t"
  /* Advance the offset by 128 bytes; decrement r0 and loop while non-zero. */
  132. "agfi %%r1,128\n\t"
  133. "brctg %%r0,0b\n\t"
  /* Tail: r0 = n & 12; when non-zero, r0 /= 4 gives the number of trips,
     each covering 4 elements (32 bytes) at the offset left in r1. */
  134. "1:\n\t"
  135. "lghi %%r0,12\n\t"
  136. "ngr %%r0,%[n]\n\t"
  137. "ltgr %%r0,%%r0\n\t"
  138. "jz 3f\n\t"
  139. "srlg %%r0,%%r0,2\n\t"
  140. "2:\n\t"
  141. "vl %%v16,0(%%r1,%[ap0])\n\t"
  142. "vl %%v17,0(%%r1,%[ap1])\n\t"
  143. "vl %%v18,0(%%r1,%[ap2])\n\t"
  144. "vl %%v19,0(%%r1,%[ap3])\n\t"
  145. "vl %%v20,16(%%r1,%[ap0])\n\t"
  146. "vl %%v21,16(%%r1,%[ap1])\n\t"
  147. "vl %%v22,16(%%r1,%[ap2])\n\t"
  148. "vl %%v23,16(%%r1,%[ap3])\n\t"
  149. "vl %%v4,0(%%r1,%[y])\n\t"
  150. "vl %%v5,16(%%r1,%[y])\n\t"
  151. "vfmadb %%v4,%%v16,%%v0,%%v4\n\t"
  152. "vfmadb %%v5,%%v20,%%v0,%%v5\n\t"
  153. "vfmadb %%v4,%%v17,%%v1,%%v4\n\t"
  154. "vfmadb %%v5,%%v21,%%v1,%%v5\n\t"
  155. "vfmadb %%v4,%%v18,%%v2,%%v4\n\t"
  156. "vfmadb %%v5,%%v22,%%v2,%%v5\n\t"
  157. "vfmadb %%v4,%%v19,%%v3,%%v4\n\t"
  158. "vfmadb %%v5,%%v23,%%v3,%%v5\n\t"
  159. "vst %%v4,0(%%r1,%[y])\n\t"
  160. "vst %%v5,16(%%r1,%[y])\n\t"
  161. "agfi %%r1,32\n\t"
  162. "brctg %%r0,2b\n\t"
  163. "3:\n\t"
  164. "nop"
  /* Output: the n elements at y are declared as read-and-written memory. */
  165. : "+m"(*(FLOAT (*)[n]) y)
  /* Inputs: each "m" operand declares memory the asm reads (n elements per
     source, 4 elements of x, *alpha); each "a" operand supplies the matching
     pointer in an address register. */
  166. : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]),
  167. "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]),
  168. "m"(*(const FLOAT (*)[n]) ap[2]),[ap2] "a"(ap[2]),
  169. "m"(*(const FLOAT (*)[n]) ap[3]),[ap3] "a"(ap[3]),
  170. "m"(*(const FLOAT (*)[4]) x),[x] "a"(x),[alpha] "m"(*alpha),[n] "r"(n)
  /* Clobbers: condition code, the two scratch GPRs and every vector register
     the sequence writes. */
  171. : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
  172. "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
  173. "v26", "v27", "v28", "v29", "v30", "v31");
  174. }
  /*
   * dgemv_kernel_4x2 - accumulate two scaled vectors into y:
   *
   *     y[i] += ap[0][i]*(x[0]*alpha) + ap[1][i]*(x[1]*alpha)
   *
   * implemented with z/Architecture vector instructions.
   *
   * n     - element count.  The main loop handles (n & -16) elements and the
   *         tail loop another (n & 12); a remainder of (n & 3) elements is
   *         never touched by this kernel.
   * ap    - array of source pointers; only ap[0] and ap[1] are read.
   * x     - two multipliers, x[0] and x[1].
   * y     - vector that is read, accumulated into and written back in place.
   * alpha - pointer to the scalar applied to x[0] and x[1] before the loops.
   *
   * NOTE(review): the "db" instruction forms and the 8-byte offsets treat
   * FLOAT as a 64-bit double - confirm against common.h.
   */
  175. static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y,
  176. FLOAT *alpha) {
  /* v0, v1 = x[0], x[1] replicated into both lanes; v2 = alpha replicated
     (v2 is reused for y data later); scale the multipliers once. */
  177. __asm__("vlrepg %%v0,0(%[x])\n\t"
  178. "vlrepg %%v1,8(%[x])\n\t"
  179. "vlrepg %%v2,%[alpha]\n\t"
  180. "vfmdb %%v0,%%v0,%%v2\n\t"
  181. "vfmdb %%v1,%%v1,%%v2\n\t"
  /* r1 = running byte offset (0); r0 = (n & -16) / 16 main-loop trips, with
     the loop skipped entirely when n & -16 is zero. */
  182. "xgr %%r1,%%r1\n\t"
  183. "lghi %%r0,-16\n\t"
  184. "ngr %%r0,%[n]\n\t"
  185. "ltgr %%r0,%%r0\n\t"
  186. "jz 1f\n\t"
  187. "srlg %%r0,%%r0,4\n\t"
  /* Main loop: 16 elements (128 bytes) per trip in a single pass. */
  188. "0:\n\t"
  /* Prefetch 1024 bytes ahead: code 1 (fetch) for sources, code 2 (store)
     for y. */
  189. "pfd 1,1024(%%r1,%[ap0])\n\t"
  190. "pfd 1,1024(%%r1,%[ap1])\n\t"
  191. "pfd 2,1024(%%r1,%[y])\n\t"
  /* Even registers v16..v30 take ap[0] data, odd v17..v31 take ap[1] data. */
  192. "vl %%v16,0(%%r1,%[ap0])\n\t"
  193. "vl %%v17,0(%%r1,%[ap1])\n\t"
  194. "vl %%v18,16(%%r1,%[ap0])\n\t"
  195. "vl %%v19,16(%%r1,%[ap1])\n\t"
  196. "vl %%v20,32(%%r1,%[ap0])\n\t"
  197. "vl %%v21,32(%%r1,%[ap1])\n\t"
  198. "vl %%v22,48(%%r1,%[ap0])\n\t"
  199. "vl %%v23,48(%%r1,%[ap1])\n\t"
  200. "vl %%v24,64(%%r1,%[ap0])\n\t"
  201. "vl %%v25,64(%%r1,%[ap1])\n\t"
  202. "vl %%v26,80(%%r1,%[ap0])\n\t"
  203. "vl %%v27,80(%%r1,%[ap1])\n\t"
  204. "vl %%v28,96(%%r1,%[ap0])\n\t"
  205. "vl %%v29,96(%%r1,%[ap1])\n\t"
  206. "vl %%v30,112(%%r1,%[ap0])\n\t"
  207. "vl %%v31,112(%%r1,%[ap1])\n\t"
  /* v2..v9 = the 128 bytes of y for this trip. */
  208. "vl %%v2,0(%%r1,%[y])\n\t"
  209. "vl %%v3,16(%%r1,%[y])\n\t"
  210. "vl %%v4,32(%%r1,%[y])\n\t"
  211. "vl %%v5,48(%%r1,%[y])\n\t"
  212. "vl %%v6,64(%%r1,%[y])\n\t"
  213. "vl %%v7,80(%%r1,%[y])\n\t"
  214. "vl %%v8,96(%%r1,%[y])\n\t"
  215. "vl %%v9,112(%%r1,%[y])\n\t"
  /* y += ap[0] * v0 */
  216. "vfmadb %%v2,%%v16,%%v0,%%v2\n\t"
  217. "vfmadb %%v3,%%v18,%%v0,%%v3\n\t"
  218. "vfmadb %%v4,%%v20,%%v0,%%v4\n\t"
  219. "vfmadb %%v5,%%v22,%%v0,%%v5\n\t"
  220. "vfmadb %%v6,%%v24,%%v0,%%v6\n\t"
  221. "vfmadb %%v7,%%v26,%%v0,%%v7\n\t"
  222. "vfmadb %%v8,%%v28,%%v0,%%v8\n\t"
  223. "vfmadb %%v9,%%v30,%%v0,%%v9\n\t"
  /* y += ap[1] * v1 */
  224. "vfmadb %%v2,%%v17,%%v1,%%v2\n\t"
  225. "vfmadb %%v3,%%v19,%%v1,%%v3\n\t"
  226. "vfmadb %%v4,%%v21,%%v1,%%v4\n\t"
  227. "vfmadb %%v5,%%v23,%%v1,%%v5\n\t"
  228. "vfmadb %%v6,%%v25,%%v1,%%v6\n\t"
  229. "vfmadb %%v7,%%v27,%%v1,%%v7\n\t"
  230. "vfmadb %%v8,%%v29,%%v1,%%v8\n\t"
  231. "vfmadb %%v9,%%v31,%%v1,%%v9\n\t"
  /* Store the updated 128 bytes of y. */
  232. "vst %%v2,0(%%r1,%[y])\n\t"
  233. "vst %%v3,16(%%r1,%[y])\n\t"
  234. "vst %%v4,32(%%r1,%[y])\n\t"
  235. "vst %%v5,48(%%r1,%[y])\n\t"
  236. "vst %%v6,64(%%r1,%[y])\n\t"
  237. "vst %%v7,80(%%r1,%[y])\n\t"
  238. "vst %%v8,96(%%r1,%[y])\n\t"
  239. "vst %%v9,112(%%r1,%[y])\n\t"
  /* Advance the offset by 128 bytes; decrement r0 and loop while non-zero. */
  240. "agfi %%r1,128\n\t"
  241. "brctg %%r0,0b\n\t"
  /* Tail: r0 = n & 12; when non-zero, r0 /= 4 gives the number of trips,
     each covering 4 elements (32 bytes) at the offset left in r1. */
  242. "1:\n\t"
  243. "lghi %%r0,12\n\t"
  244. "ngr %%r0,%[n]\n\t"
  245. "ltgr %%r0,%%r0\n\t"
  246. "jz 3f\n\t"
  247. "srlg %%r0,%%r0,2\n\t"
  248. "2:\n\t"
  249. "vl %%v16,0(%%r1,%[ap0])\n\t"
  250. "vl %%v17,0(%%r1,%[ap1])\n\t"
  251. "vl %%v18,16(%%r1,%[ap0])\n\t"
  252. "vl %%v19,16(%%r1,%[ap1])\n\t"
  253. "vl %%v2,0(%%r1,%[y])\n\t"
  254. "vl %%v3,16(%%r1,%[y])\n\t"
  255. "vfmadb %%v2,%%v16,%%v0,%%v2\n\t"
  256. "vfmadb %%v3,%%v18,%%v0,%%v3\n\t"
  257. "vfmadb %%v2,%%v17,%%v1,%%v2\n\t"
  258. "vfmadb %%v3,%%v19,%%v1,%%v3\n\t"
  259. "vst %%v2,0(%%r1,%[y])\n\t"
  260. "vst %%v3,16(%%r1,%[y])\n\t"
  261. "agfi %%r1,32\n\t"
  262. "brctg %%r0,2b\n\t"
  263. "3:\n\t"
  264. "nop"
  /* Output: the n elements at y are declared as read-and-written memory. */
  265. : "+m"(*(FLOAT (*)[n]) y)
  /* Inputs: "m" operands declare the memory read (n elements per source,
     2 elements of x, *alpha); "a" operands supply the pointers. */
  266. : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]),
  267. "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]),
  268. "m"(*(const FLOAT (*)[2]) x),[x] "a"(x),[alpha] "m"(*alpha),[n] "r"(n)
  /* Clobbers: condition code, the two scratch GPRs and every vector register
     the sequence writes (including v8 and v9). */
  269. : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
  270. "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
  271. "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
  272. }
  /*
   * dgemv_kernel_4x1 - accumulate one scaled vector into y:
   *
   *     y[i] += a0[i] * (x[0] * alpha)
   *
   * implemented with z/Architecture vector instructions.
   *
   * n     - element count.  The main loop handles (n & -16) elements and the
   *         tail loop another (n & 12); a remainder of (n & 3) elements is
   *         never touched by this kernel.
   * a0    - source vector.
   * x     - pointer to the single multiplier x[0].
   * y     - vector that is read, accumulated into and written back in place.
   * alpha - pointer to the scalar applied to x[0] before the loops.
   *
   * NOTE(review): the "db" instruction forms and the 8-byte element size
   * treat FLOAT as a 64-bit double - confirm against common.h.
   */
  273. static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y,
  274. FLOAT *alpha) {
  /* v0 = x[0] replicated into both lanes, scaled by alpha (held briefly in
     v16, which is reused for source data later). */
  275. __asm__("vlrepg %%v0,0(%[x])\n\t"
  276. "vlrepg %%v16,%[alpha]\n\t"
  277. "vfmdb %%v0,%%v0,%%v16\n\t"
  /* r1 = running byte offset (0); r0 = (n & -16) / 16 main-loop trips, with
     the loop skipped entirely when n & -16 is zero. */
  278. "xgr %%r1,%%r1\n\t"
  279. "lghi %%r0,-16\n\t"
  280. "ngr %%r0,%[n]\n\t"
  281. "ltgr %%r0,%%r0\n\t"
  282. "jz 1f\n\t"
  283. "srlg %%r0,%%r0,4\n\t"
  /* Main loop: 16 elements (128 bytes) per trip. */
  284. "0:\n\t"
  /* Prefetch 1024 bytes ahead: code 1 (fetch) for a0, code 2 (store) for y. */
  285. "pfd 1,1024(%%r1,%[a0])\n\t"
  286. "pfd 2,1024(%%r1,%[y])\n\t"
  /* v16..v23 = 128 bytes of a0. */
  287. "vl %%v16,0(%%r1,%[a0])\n\t"
  288. "vl %%v17,16(%%r1,%[a0])\n\t"
  289. "vl %%v18,32(%%r1,%[a0])\n\t"
  290. "vl %%v19,48(%%r1,%[a0])\n\t"
  291. "vl %%v20,64(%%r1,%[a0])\n\t"
  292. "vl %%v21,80(%%r1,%[a0])\n\t"
  293. "vl %%v22,96(%%r1,%[a0])\n\t"
  294. "vl %%v23,112(%%r1,%[a0])\n\t"
  /* v24..v31 = the matching 128 bytes of y. */
  295. "vl %%v24,0(%%r1,%[y])\n\t"
  296. "vl %%v25,16(%%r1,%[y])\n\t"
  297. "vl %%v26,32(%%r1,%[y])\n\t"
  298. "vl %%v27,48(%%r1,%[y])\n\t"
  299. "vl %%v28,64(%%r1,%[y])\n\t"
  300. "vl %%v29,80(%%r1,%[y])\n\t"
  301. "vl %%v30,96(%%r1,%[y])\n\t"
  302. "vl %%v31,112(%%r1,%[y])\n\t"
  /* y += a0 * v0 */
  303. "vfmadb %%v24,%%v16,%%v0,%%v24\n\t"
  304. "vfmadb %%v25,%%v17,%%v0,%%v25\n\t"
  305. "vfmadb %%v26,%%v18,%%v0,%%v26\n\t"
  306. "vfmadb %%v27,%%v19,%%v0,%%v27\n\t"
  307. "vfmadb %%v28,%%v20,%%v0,%%v28\n\t"
  308. "vfmadb %%v29,%%v21,%%v0,%%v29\n\t"
  309. "vfmadb %%v30,%%v22,%%v0,%%v30\n\t"
  310. "vfmadb %%v31,%%v23,%%v0,%%v31\n\t"
  /* Store the updated 128 bytes of y. */
  311. "vst %%v24,0(%%r1,%[y])\n\t"
  312. "vst %%v25,16(%%r1,%[y])\n\t"
  313. "vst %%v26,32(%%r1,%[y])\n\t"
  314. "vst %%v27,48(%%r1,%[y])\n\t"
  315. "vst %%v28,64(%%r1,%[y])\n\t"
  316. "vst %%v29,80(%%r1,%[y])\n\t"
  317. "vst %%v30,96(%%r1,%[y])\n\t"
  318. "vst %%v31,112(%%r1,%[y])\n\t"
  /* Advance the offset by 128 bytes; decrement r0 and loop while non-zero. */
  319. "agfi %%r1,128\n\t"
  320. "brctg %%r0,0b\n\t"
  /* Tail: r0 = n & 12; when non-zero, r0 /= 4 gives the number of trips,
     each covering 4 elements (32 bytes) at the offset left in r1. */
  321. "1:\n\t"
  322. "lghi %%r0,12\n\t"
  323. "ngr %%r0,%[n]\n\t"
  324. "ltgr %%r0,%%r0\n\t"
  325. "jz 3f\n\t"
  326. "srlg %%r0,%%r0,2\n\t"
  327. "2:\n\t"
  328. "vl %%v16,0(%%r1,%[a0])\n\t"
  329. "vl %%v17,16(%%r1,%[a0])\n\t"
  330. "vl %%v18,0(%%r1,%[y])\n\t"
  331. "vl %%v19,16(%%r1,%[y])\n\t"
  332. "vfmadb %%v18,%%v16,%%v0,%%v18\n\t"
  333. "vfmadb %%v19,%%v17,%%v0,%%v19\n\t"
  334. "vst %%v18,0(%%r1,%[y])\n\t"
  335. "vst %%v19,16(%%r1,%[y])\n\t"
  336. "agfi %%r1,32\n\t"
  337. "brctg %%r0,2b\n\t"
  338. "3:\n\t"
  339. "nop"
  /* Output: the n elements at y are declared as read-and-written memory. */
  340. : "+m"(*(FLOAT (*)[n]) y)
  /* Inputs: "m" operands declare the memory read (n elements of a0, one
     element of x, *alpha); "a" operands supply the pointers. */
  341. : [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0),
  342. "m"(*(const FLOAT (*)[1]) x),[x] "a"(x),[alpha] "m"(*alpha),
  343. [n] "r"(n)
  /* Clobbers: condition code, the two scratch GPRs and every vector register
     the sequence writes. */
  344. : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21",
  345. "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
  346. "v31");
  347. }
  348. static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) {
  349. BLASLONG i;
  350. for (i = 0; i < n; i++) {
  351. *dest += src[i];
  352. dest += inc_dest;
  353. }
  354. }
  355. int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a,
  356. BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
  357. FLOAT *buffer) {
  358. BLASLONG i;
  359. FLOAT *a_ptr;
  360. FLOAT *x_ptr;
  361. FLOAT *y_ptr;
  362. FLOAT *ap[4];
  363. BLASLONG n1;
  364. BLASLONG m1;
  365. BLASLONG m2;
  366. BLASLONG m3;
  367. BLASLONG n2;
  368. BLASLONG lda4 = lda << 2;
  369. FLOAT xbuffer[8], *ybuffer;
  370. if (m < 1)
  371. return (0);
  372. if (n < 1)
  373. return (0);
  374. ybuffer = buffer;
  375. n1 = n >> 2;
  376. n2 = n & 3;
  377. m3 = m & 3;
  378. m1 = m & -4;
  379. m2 = (m & (NBMAX - 1)) - m3;
  380. y_ptr = y;
  381. BLASLONG NB = NBMAX;
  382. while (NB == NBMAX) {
  383. m1 -= NB;
  384. if (m1 < 0) {
  385. if (m2 == 0)
  386. break;
  387. NB = m2;
  388. }
  389. a_ptr = a;
  390. x_ptr = x;
  391. ap[0] = a_ptr;
  392. ap[1] = a_ptr + lda;
  393. ap[2] = ap[1] + lda;
  394. ap[3] = ap[2] + lda;
  395. if (inc_y != 1)
  396. memset(ybuffer, 0, NB * 8);
  397. else
  398. ybuffer = y_ptr;
  399. if (inc_x == 1) {
  400. for (i = 0; i < n1; i++) {
  401. dgemv_kernel_4x4(NB, ap, x_ptr, ybuffer, &alpha);
  402. ap[0] += lda4;
  403. ap[1] += lda4;
  404. ap[2] += lda4;
  405. ap[3] += lda4;
  406. a_ptr += lda4;
  407. x_ptr += 4;
  408. }
  409. if (n2 & 2) {
  410. dgemv_kernel_4x2(NB, ap, x_ptr, ybuffer, &alpha);
  411. a_ptr += lda * 2;
  412. x_ptr += 2;
  413. }
  414. if (n2 & 1) {
  415. dgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer, &alpha);
  416. /* a_ptr += lda;
  417. x_ptr += 1; */
  418. }
  419. } else {
  420. for (i = 0; i < n1; i++) {
  421. xbuffer[0] = x_ptr[0];
  422. x_ptr += inc_x;
  423. xbuffer[1] = x_ptr[0];
  424. x_ptr += inc_x;
  425. xbuffer[2] = x_ptr[0];
  426. x_ptr += inc_x;
  427. xbuffer[3] = x_ptr[0];
  428. x_ptr += inc_x;
  429. dgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, &alpha);
  430. ap[0] += lda4;
  431. ap[1] += lda4;
  432. ap[2] += lda4;
  433. ap[3] += lda4;
  434. a_ptr += lda4;
  435. }
  436. for (i = 0; i < n2; i++) {
  437. xbuffer[0] = x_ptr[0];
  438. x_ptr += inc_x;
  439. dgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, &alpha);
  440. a_ptr += lda;
  441. }
  442. }
  443. a += NB;
  444. if (inc_y != 1) {
  445. add_y(NB, ybuffer, y_ptr, inc_y);
  446. y_ptr += NB * inc_y;
  447. } else
  448. y_ptr += NB;
  449. }
  450. if (m3 == 0)
  451. return (0);
  452. if (m3 == 3) {
  453. a_ptr = a;
  454. x_ptr = x;
  455. FLOAT temp0 = 0.0;
  456. FLOAT temp1 = 0.0;
  457. FLOAT temp2 = 0.0;
  458. if (lda == 3 && inc_x == 1) {
  459. for (i = 0; i < (n & -4); i += 4) {
  460. temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1];
  461. temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1];
  462. temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1];
  463. temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3];
  464. temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3];
  465. temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3];
  466. a_ptr += 12;
  467. x_ptr += 4;
  468. }
  469. for (; i < n; i++) {
  470. temp0 += a_ptr[0] * x_ptr[0];
  471. temp1 += a_ptr[1] * x_ptr[0];
  472. temp2 += a_ptr[2] * x_ptr[0];
  473. a_ptr += 3;
  474. x_ptr++;
  475. }
  476. } else {
  477. for (i = 0; i < n; i++) {
  478. temp0 += a_ptr[0] * x_ptr[0];
  479. temp1 += a_ptr[1] * x_ptr[0];
  480. temp2 += a_ptr[2] * x_ptr[0];
  481. a_ptr += lda;
  482. x_ptr += inc_x;
  483. }
  484. }
  485. y_ptr[0] += alpha * temp0;
  486. y_ptr += inc_y;
  487. y_ptr[0] += alpha * temp1;
  488. y_ptr += inc_y;
  489. y_ptr[0] += alpha * temp2;
  490. return (0);
  491. }
  492. if (m3 == 2) {
  493. a_ptr = a;
  494. x_ptr = x;
  495. FLOAT temp0 = 0.0;
  496. FLOAT temp1 = 0.0;
  497. if (lda == 2 && inc_x == 1) {
  498. for (i = 0; i < (n & -4); i += 4) {
  499. temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
  500. temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
  501. temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
  502. temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3];
  503. a_ptr += 8;
  504. x_ptr += 4;
  505. }
  506. for (; i < n; i++) {
  507. temp0 += a_ptr[0] * x_ptr[0];
  508. temp1 += a_ptr[1] * x_ptr[0];
  509. a_ptr += 2;
  510. x_ptr++;
  511. }
  512. } else {
  513. for (i = 0; i < n; i++) {
  514. temp0 += a_ptr[0] * x_ptr[0];
  515. temp1 += a_ptr[1] * x_ptr[0];
  516. a_ptr += lda;
  517. x_ptr += inc_x;
  518. }
  519. }
  520. y_ptr[0] += alpha * temp0;
  521. y_ptr += inc_y;
  522. y_ptr[0] += alpha * temp1;
  523. return (0);
  524. }
  525. if (m3 == 1) {
  526. a_ptr = a;
  527. x_ptr = x;
  528. FLOAT temp = 0.0;
  529. if (lda == 1 && inc_x == 1) {
  530. for (i = 0; i < (n & -4); i += 4) {
  531. temp +=
  532. a_ptr[i] * x_ptr[i] + a_ptr[i + 1] * x_ptr[i + 1] + a_ptr[i +
  533. 2] *
  534. x_ptr[i + 2] + a_ptr[i + 3] * x_ptr[i + 3];
  535. }
  536. for (; i < n; i++) {
  537. temp += a_ptr[i] * x_ptr[i];
  538. }
  539. } else {
  540. for (i = 0; i < n; i++) {
  541. temp += a_ptr[0] * x_ptr[0];
  542. a_ptr += lda;
  543. x_ptr += inc_x;
  544. }
  545. }
  546. y_ptr[0] += alpha * temp;
  547. return (0);
  548. }
  549. return (0);
  550. }