You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

dgemv_t_4.c 23 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760
  1. /***************************************************************************
  2. Copyright (c) 2019, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. #include "common.h"
  28. #define NBMAX 2048
  /*
   * dgemv_kernel_4x4: compute four dot products against one shared x vector.
   *
   *   n  - element count; only the first (n & -4) elements are processed
   *        (a 16-element unrolled loop followed by a 4-element loop), so any
   *        n % 4 remainder is ignored by this kernel.
   *   ap - array of four pointers; ap[0..3] each address one vector that is
   *        read contiguously.
   *   x  - contiguous input vector shared by all four dot products.
   *   y  - output; y[0..3] are overwritten (not accumulated into) with
   *        dot(ap[k], x) for k = 0..3.
   *
   * Elements are handled as 8-byte values in 16-byte vector registers (two
   * per register), as shown by the 16-byte load stride and the 8-byte
   * store offsets.
   */
  29. static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
  30. register FLOAT *ap0 = ap[0];
  31. register FLOAT *ap1 = ap[1];
  32. register FLOAT *ap2 = ap[2];
  33. register FLOAT *ap3 = ap[3];
  /* v0-v7 are the accumulators: v0..v3 collect ap0..ap3 for the x chunks
     loaded into v16/v18/v20/v22, v4..v7 collect ap0..ap3 for the x chunks
     loaded into v17/v19/v21/v23. */
  34. __asm__("vzero %%v0\n\t"
  35. "vzero %%v1\n\t"
  36. "vzero %%v2\n\t"
  37. "vzero %%v3\n\t"
  38. "vzero %%v4\n\t"
  39. "vzero %%v5\n\t"
  40. "vzero %%v6\n\t"
  41. "vzero %%v7\n\t"
  /* r1 = running byte offset into x and every ap vector, starting at 0. */
  42. "xgr %%r1,%%r1\n\t"
  /* r0 = (n & -16) >> 4 = number of 16-element iterations; jump to label 1
     when there are none. */
  43. "lghi %%r0,-16\n\t"
  44. "ngr %%r0,%[n]\n\t"
  45. "ltgr %%r0,%%r0\n\t"
  46. "jz 1f\n\t"
  47. "srlg %%r0,%%r0,4\n\t"
  /* Main loop: 128 bytes (16 elements) of x and of each ap vector per pass. */
  48. "0:\n\t"
  /* Prefetch hints 1024 bytes ahead of the current offset. */
  49. "pfd 1,1024(%%r1,%[ap0])\n\t"
  50. "pfd 1,1024(%%r1,%[ap1])\n\t"
  51. "pfd 1,1024(%%r1,%[ap2])\n\t"
  52. "pfd 1,1024(%%r1,%[ap3])\n\t"
  53. "pfd 1,1024(%%r1,%[x])\n\t"
  /* Load eight consecutive 16-byte chunks of x into v16..v23. */
  54. "vl %%v16,0(%%r1,%[x])\n\t"
  55. "vl %%v17,16(%%r1,%[x])\n\t"
  56. "vl %%v18,32(%%r1,%[x])\n\t"
  57. "vl %%v19,48(%%r1,%[x])\n\t"
  58. "vl %%v20,64(%%r1,%[x])\n\t"
  59. "vl %%v21,80(%%r1,%[x])\n\t"
  60. "vl %%v22,96(%%r1,%[x])\n\t"
  61. "vl %%v23,112(%%r1,%[x])\n\t"
  /* Each load of an ap chunk is followed by a multiply-add of that chunk
     with the matching x chunk into the accumulator for that ap vector. */
  62. "vl %%v24,0(%%r1,%[ap0])\n\t"
  63. "vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
  64. "vl %%v25,0(%%r1,%[ap1])\n\t"
  65. "vfmadb %%v1,%%v16,%%v25,%%v1\n\t"
  66. "vl %%v26,0(%%r1,%[ap2])\n\t"
  67. "vfmadb %%v2,%%v16,%%v26,%%v2\n\t"
  68. "vl %%v27,0(%%r1,%[ap3])\n\t"
  69. "vfmadb %%v3,%%v16,%%v27,%%v3\n\t"
  70. "vl %%v28,16(%%r1,%[ap0])\n\t"
  71. "vfmadb %%v4,%%v17,%%v28,%%v4\n\t"
  72. "vl %%v29,16(%%r1,%[ap1])\n\t"
  73. "vfmadb %%v5,%%v17,%%v29,%%v5\n\t"
  74. "vl %%v30,16(%%r1,%[ap2])\n\t"
  75. "vfmadb %%v6,%%v17,%%v30,%%v6\n\t"
  76. "vl %%v31,16(%%r1,%[ap3])\n\t"
  77. "vfmadb %%v7,%%v17,%%v31,%%v7\n\t"
  78. "vl %%v24,32(%%r1,%[ap0])\n\t"
  79. "vfmadb %%v0,%%v18,%%v24,%%v0\n\t"
  80. "vl %%v25,32(%%r1,%[ap1])\n\t"
  81. "vfmadb %%v1,%%v18,%%v25,%%v1\n\t"
  82. "vl %%v26,32(%%r1,%[ap2])\n\t"
  83. "vfmadb %%v2,%%v18,%%v26,%%v2\n\t"
  84. "vl %%v27,32(%%r1,%[ap3])\n\t"
  85. "vfmadb %%v3,%%v18,%%v27,%%v3\n\t"
  86. "vl %%v28,48(%%r1,%[ap0])\n\t"
  87. "vfmadb %%v4,%%v19,%%v28,%%v4\n\t"
  88. "vl %%v29,48(%%r1,%[ap1])\n\t"
  89. "vfmadb %%v5,%%v19,%%v29,%%v5\n\t"
  90. "vl %%v30,48(%%r1,%[ap2])\n\t"
  91. "vfmadb %%v6,%%v19,%%v30,%%v6\n\t"
  92. "vl %%v31,48(%%r1,%[ap3])\n\t"
  93. "vfmadb %%v7,%%v19,%%v31,%%v7\n\t"
  94. "vl %%v24,64(%%r1,%[ap0])\n\t"
  95. "vfmadb %%v0,%%v20,%%v24,%%v0\n\t"
  96. "vl %%v25,64(%%r1,%[ap1])\n\t"
  97. "vfmadb %%v1,%%v20,%%v25,%%v1\n\t"
  98. "vl %%v26,64(%%r1,%[ap2])\n\t"
  99. "vfmadb %%v2,%%v20,%%v26,%%v2\n\t"
  100. "vl %%v27,64(%%r1,%[ap3])\n\t"
  101. "vfmadb %%v3,%%v20,%%v27,%%v3\n\t"
  102. "vl %%v28,80(%%r1,%[ap0])\n\t"
  103. "vfmadb %%v4,%%v21,%%v28,%%v4\n\t"
  104. "vl %%v29,80(%%r1,%[ap1])\n\t"
  105. "vfmadb %%v5,%%v21,%%v29,%%v5\n\t"
  106. "vl %%v30,80(%%r1,%[ap2])\n\t"
  107. "vfmadb %%v6,%%v21,%%v30,%%v6\n\t"
  108. "vl %%v31,80(%%r1,%[ap3])\n\t"
  109. "vfmadb %%v7,%%v21,%%v31,%%v7\n\t"
  110. "vl %%v24,96(%%r1,%[ap0])\n\t"
  111. "vfmadb %%v0,%%v22,%%v24,%%v0\n\t"
  112. "vl %%v25,96(%%r1,%[ap1])\n\t"
  113. "vfmadb %%v1,%%v22,%%v25,%%v1\n\t"
  114. "vl %%v26,96(%%r1,%[ap2])\n\t"
  115. "vfmadb %%v2,%%v22,%%v26,%%v2\n\t"
  116. "vl %%v27,96(%%r1,%[ap3])\n\t"
  117. "vfmadb %%v3,%%v22,%%v27,%%v3\n\t"
  118. "vl %%v28,112(%%r1,%[ap0])\n\t"
  119. "vfmadb %%v4,%%v23,%%v28,%%v4\n\t"
  120. "vl %%v29,112(%%r1,%[ap1])\n\t"
  121. "vfmadb %%v5,%%v23,%%v29,%%v5\n\t"
  122. "vl %%v30,112(%%r1,%[ap2])\n\t"
  123. "vfmadb %%v6,%%v23,%%v30,%%v6\n\t"
  124. "vl %%v31,112(%%r1,%[ap3])\n\t"
  125. "vfmadb %%v7,%%v23,%%v31,%%v7\n\t"
  /* Advance the byte offset by 128 and loop while r0 counts down. */
  126. "agfi %%r1,128\n\t"
  127. "brctg %%r0,0b\n\t"
  /* Remainder: r0 = (n & 12) >> 2 = number of leftover 4-element groups. */
  128. "1:\n\t"
  129. "lghi %%r0,12\n\t"
  130. "ngr %%r0,%[n]\n\t"
  131. "ltgr %%r0,%%r0\n\t"
  132. "jz 3f\n\t"
  133. "srlg %%r0,%%r0,2\n\t"
  /* Remainder loop: 32 bytes (4 elements) per pass, same accumulators. */
  134. "2:\n\t"
  135. "vl %%v16,0(%%r1,%[x])\n\t"
  136. "vl %%v17,16(%%r1,%[x])\n\t"
  137. "vl %%v24,0(%%r1,%[ap0])\n\t"
  138. "vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
  139. "vl %%v25,0(%%r1,%[ap1])\n\t"
  140. "vfmadb %%v1,%%v16,%%v25,%%v1\n\t"
  141. "vl %%v26,0(%%r1,%[ap2])\n\t"
  142. "vfmadb %%v2,%%v16,%%v26,%%v2\n\t"
  143. "vl %%v27,0(%%r1,%[ap3])\n\t"
  144. "vfmadb %%v3,%%v16,%%v27,%%v3\n\t"
  145. "vl %%v28,16(%%r1,%[ap0])\n\t"
  146. "vfmadb %%v4,%%v17,%%v28,%%v4\n\t"
  147. "vl %%v29,16(%%r1,%[ap1])\n\t"
  148. "vfmadb %%v5,%%v17,%%v29,%%v5\n\t"
  149. "vl %%v30,16(%%r1,%[ap2])\n\t"
  150. "vfmadb %%v6,%%v17,%%v30,%%v6\n\t"
  151. "vl %%v31,16(%%r1,%[ap3])\n\t"
  152. "vfmadb %%v7,%%v17,%%v31,%%v7\n\t"
  153. "agfi %%r1,32\n\t"
  154. "brctg %%r0,2b\n\t"
  /* Reduction: fold the second accumulator set into the first, so v0..v3
     hold the two-lane partial sums for ap0..ap3. */
  155. "3:\n\t"
  156. "vfadb %%v0,%%v0,%%v4\n\t"
  157. "vfadb %%v1,%%v1,%%v5\n\t"
  158. "vfadb %%v2,%%v2,%%v6\n\t"
  159. "vfadb %%v3,%%v3,%%v7\n\t"
  /* Horizontal add per result: replicate element 1 of vN into v4, add it
     to the scalar fN and store 8 bytes to y. This relies on the
     z/Architecture overlay of floating-point register fN with element 0
     of vector register vN. */
  160. "vrepg %%v4,%%v0,1\n\t"
  161. "adbr %%f0,%%f4\n\t"
  162. "std %%f0,0(%[y])\n\t"
  163. "vrepg %%v4,%%v1,1\n\t"
  164. "adbr %%f1,%%f4\n\t"
  165. "std %%f1,8(%[y])\n\t"
  166. "vrepg %%v4,%%v2,1\n\t"
  167. "adbr %%f2,%%f4\n\t"
  168. "std %%f2,16(%[y])\n\t"
  169. "vrepg %%v4,%%v3,1\n\t"
  170. "adbr %%f3,%%f4\n\t"
  171. "std %%f3,24(%[y])"
  /* Operands: the "m" entries describe the memory read or written so the
     compiler knows about it; the "a" entries supply the base addresses
     used inside the template. */
  172. : "=m"(*(FLOAT (*)[4]) y)
  173. : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap0),[ap0] "a"(ap0),
  174. "m"(*(const FLOAT (*)[n]) ap1),[ap1] "a"(ap1),
  175. "m"(*(const FLOAT (*)[n]) ap2),[ap2] "a"(ap2),
  176. "m"(*(const FLOAT (*)[n]) ap3),[ap3] "a"(ap3),
  177. "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n)
  178. : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
  179. "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
  180. "v26", "v27", "v28", "v29", "v30", "v31");
  181. }
  /*
   * dgemv_kernel_4x2: compute two dot products against one shared x vector.
   *
   *   n  - element count; only the first (n & -4) elements are processed
   *        (16-element unrolled loop, then a 4-element loop).
   *   ap - array of pointers; only ap[0] and ap[1] are read here.
   *   x  - contiguous input vector shared by both dot products.
   *   y  - output; y[0] and y[1] are overwritten with dot(ap[0], x) and
   *        dot(ap[1], x).
   */
  182. static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
  183. register FLOAT *ap0 = ap[0];
  184. register FLOAT *ap1 = ap[1];
  /* v0-v7 are the accumulators: even-numbered ones (v0,v2,v4,v6) collect
     ap0 products, odd-numbered ones (v1,v3,v5,v7) collect ap1 products. */
  185. __asm__("vzero %%v0\n\t"
  186. "vzero %%v1\n\t"
  187. "vzero %%v2\n\t"
  188. "vzero %%v3\n\t"
  189. "vzero %%v4\n\t"
  190. "vzero %%v5\n\t"
  191. "vzero %%v6\n\t"
  192. "vzero %%v7\n\t"
  /* r1 = running byte offset, starting at 0. */
  193. "xgr %%r1,%%r1\n\t"
  /* r0 = (n & -16) >> 4 = number of 16-element iterations; skip if zero. */
  194. "lghi %%r0,-16\n\t"
  195. "ngr %%r0,%[n]\n\t"
  196. "ltgr %%r0,%%r0\n\t"
  197. "jz 1f\n\t"
  198. "srlg %%r0,%%r0,4\n\t"
  /* Main loop: 128 bytes (16 elements) of x, ap0 and ap1 per pass. */
  199. "0:\n\t"
  200. "pfd 1,1024(%%r1,%[ap0])\n\t"
  201. "pfd 1,1024(%%r1,%[ap1])\n\t"
  202. "pfd 1,1024(%%r1,%[x])\n\t"
  /* Load eight consecutive 16-byte chunks of x into v16..v23. */
  203. "vl %%v16,0(%%r1,%[x])\n\t"
  204. "vl %%v17,16(%%r1,%[x])\n\t"
  205. "vl %%v18,32(%%r1,%[x])\n\t"
  206. "vl %%v19,48(%%r1,%[x])\n\t"
  207. "vl %%v20,64(%%r1,%[x])\n\t"
  208. "vl %%v21,80(%%r1,%[x])\n\t"
  209. "vl %%v22,96(%%r1,%[x])\n\t"
  210. "vl %%v23,112(%%r1,%[x])\n\t"
  /* Chunks rotate through the four accumulator pairs (v0/v1, v2/v3,
     v4/v5, v6/v7), so each pair is reused every fourth chunk. */
  211. "vl %%v24,0(%%r1,%[ap0])\n\t"
  212. "vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
  213. "vl %%v25,0(%%r1,%[ap1])\n\t"
  214. "vfmadb %%v1,%%v16,%%v25,%%v1\n\t"
  215. "vl %%v26,16(%%r1,%[ap0])\n\t"
  216. "vfmadb %%v2,%%v17,%%v26,%%v2\n\t"
  217. "vl %%v27,16(%%r1,%[ap1])\n\t"
  218. "vfmadb %%v3,%%v17,%%v27,%%v3\n\t"
  219. "vl %%v28,32(%%r1,%[ap0])\n\t"
  220. "vfmadb %%v4,%%v18,%%v28,%%v4\n\t"
  221. "vl %%v29,32(%%r1,%[ap1])\n\t"
  222. "vfmadb %%v5,%%v18,%%v29,%%v5\n\t"
  223. "vl %%v30,48(%%r1,%[ap0])\n\t"
  224. "vfmadb %%v6,%%v19,%%v30,%%v6\n\t"
  225. "vl %%v31,48(%%r1,%[ap1])\n\t"
  226. "vfmadb %%v7,%%v19,%%v31,%%v7\n\t"
  227. "vl %%v24,64(%%r1,%[ap0])\n\t"
  228. "vfmadb %%v0,%%v20,%%v24,%%v0\n\t"
  229. "vl %%v25,64(%%r1,%[ap1])\n\t"
  230. "vfmadb %%v1,%%v20,%%v25,%%v1\n\t"
  231. "vl %%v26,80(%%r1,%[ap0])\n\t"
  232. "vfmadb %%v2,%%v21,%%v26,%%v2\n\t"
  233. "vl %%v27,80(%%r1,%[ap1])\n\t"
  234. "vfmadb %%v3,%%v21,%%v27,%%v3\n\t"
  235. "vl %%v28,96(%%r1,%[ap0])\n\t"
  236. "vfmadb %%v4,%%v22,%%v28,%%v4\n\t"
  237. "vl %%v29,96(%%r1,%[ap1])\n\t"
  238. "vfmadb %%v5,%%v22,%%v29,%%v5\n\t"
  239. "vl %%v30,112(%%r1,%[ap0])\n\t"
  240. "vfmadb %%v6,%%v23,%%v30,%%v6\n\t"
  241. "vl %%v31,112(%%r1,%[ap1])\n\t"
  242. "vfmadb %%v7,%%v23,%%v31,%%v7\n\t"
  243. "agfi %%r1,128\n\t"
  244. "brctg %%r0,0b\n\t"
  /* Remainder: r0 = (n & 12) >> 2 = number of leftover 4-element groups. */
  245. "1:\n\t"
  246. "lghi %%r0,12\n\t"
  247. "ngr %%r0,%[n]\n\t"
  248. "ltgr %%r0,%%r0\n\t"
  249. "jz 3f\n\t"
  250. "srlg %%r0,%%r0,2\n\t"
  /* Remainder loop: 32 bytes per pass, using only v0..v3. */
  251. "2:\n\t"
  252. "vl %%v16,0(%%r1,%[x])\n\t"
  253. "vl %%v17,16(%%r1,%[x])\n\t"
  254. "vl %%v24,0(%%r1,%[ap0])\n\t"
  255. "vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
  256. "vl %%v25,0(%%r1,%[ap1])\n\t"
  257. "vfmadb %%v1,%%v16,%%v25,%%v1\n\t"
  258. "vl %%v26,16(%%r1,%[ap0])\n\t"
  259. "vfmadb %%v2,%%v17,%%v26,%%v2\n\t"
  260. "vl %%v27,16(%%r1,%[ap1])\n\t"
  261. "vfmadb %%v3,%%v17,%%v27,%%v3\n\t"
  262. "agfi %%r1,32\n\t"
  263. "brctg %%r0,2b\n\t"
  /* Reduction: v0 = v0+v2+v4+v6 (ap0), v1 = v1+v3+v5+v7 (ap1). */
  264. "3:\n\t"
  265. "vfadb %%v0,%%v0,%%v2\n\t"
  266. "vfadb %%v0,%%v0,%%v4\n\t"
  267. "vfadb %%v0,%%v0,%%v6\n\t"
  268. "vfadb %%v1,%%v1,%%v3\n\t"
  269. "vfadb %%v1,%%v1,%%v5\n\t"
  270. "vfadb %%v1,%%v1,%%v7\n\t"
  /* Horizontal add: replicate element 1 into v2, add to the scalar view
     of the accumulator and store 8 bytes. Relies on the z/Architecture
     overlay of fN with element 0 of vN. */
  271. "vrepg %%v2,%%v0,1\n\t"
  272. "adbr %%f0,%%f2\n\t"
  273. "std %%f0,0(%[y])\n\t"
  274. "vrepg %%v2,%%v1,1\n\t"
  275. "adbr %%f1,%%f2\n\t"
  276. "std %%f1,8(%[y])"
  /* Operands: "m" entries describe the memory touched, "a" entries are the
     base addresses used in the template. */
  277. : "=m"(*(FLOAT (*)[2]) y)
  278. : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap0),[ap0] "a"(ap0),
  279. "m"(*(const FLOAT (*)[n]) ap1),[ap1] "a"(ap1),
  280. "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n)
  281. : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
  282. "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
  283. "v26", "v27", "v28", "v29", "v30", "v31");
  284. }
  /*
   * dgemv_kernel_4x1: compute a single dot product of a0 with x.
   *
   *   n  - element count; only the first (n & -4) elements are processed
   *        (16-element unrolled loop, then a 4-element loop).
   *   a0 - contiguous vector read as the first operand.
   *   x  - contiguous vector read as the second operand.
   *   y  - output; y[0] is overwritten with dot(a0, x).
   */
  285. static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) {
  /* v0-v7 are eight independent accumulators, one per 16-byte chunk of
     the 128-byte main-loop step. */
  286. __asm__("vzero %%v0\n\t"
  287. "vzero %%v1\n\t"
  288. "vzero %%v2\n\t"
  289. "vzero %%v3\n\t"
  290. "vzero %%v4\n\t"
  291. "vzero %%v5\n\t"
  292. "vzero %%v6\n\t"
  293. "vzero %%v7\n\t"
  /* r1 = running byte offset, starting at 0. */
  294. "xgr %%r1,%%r1\n\t"
  /* r0 = (n & -16) >> 4 = number of 16-element iterations; skip if zero. */
  295. "lghi %%r0,-16\n\t"
  296. "ngr %%r0,%[n]\n\t"
  297. "ltgr %%r0,%%r0\n\t"
  298. "jz 1f\n\t"
  299. "srlg %%r0,%%r0,4\n\t"
  /* Main loop: 128 bytes (16 elements) of x and a0 per pass. */
  300. "0:\n\t"
  301. "pfd 1,1024(%%r1,%[a0])\n\t"
  302. "pfd 1,1024(%%r1,%[x])\n\t"
  303. "vl %%v16,0(%%r1,%[x])\n\t"
  304. "vl %%v17,16(%%r1,%[x])\n\t"
  305. "vl %%v18,32(%%r1,%[x])\n\t"
  306. "vl %%v19,48(%%r1,%[x])\n\t"
  307. "vl %%v20,64(%%r1,%[x])\n\t"
  308. "vl %%v21,80(%%r1,%[x])\n\t"
  309. "vl %%v22,96(%%r1,%[x])\n\t"
  310. "vl %%v23,112(%%r1,%[x])\n\t"
  /* Chunk k of a0 times chunk k of x is accumulated into vk. */
  311. "vl %%v24,0(%%r1,%[a0])\n\t"
  312. "vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
  313. "vl %%v25,16(%%r1,%[a0])\n\t"
  314. "vfmadb %%v1,%%v17,%%v25,%%v1\n\t"
  315. "vl %%v26,32(%%r1,%[a0])\n\t"
  316. "vfmadb %%v2,%%v18,%%v26,%%v2\n\t"
  317. "vl %%v27,48(%%r1,%[a0])\n\t"
  318. "vfmadb %%v3,%%v19,%%v27,%%v3\n\t"
  319. "vl %%v28,64(%%r1,%[a0])\n\t"
  320. "vfmadb %%v4,%%v20,%%v28,%%v4\n\t"
  321. "vl %%v29,80(%%r1,%[a0])\n\t"
  322. "vfmadb %%v5,%%v21,%%v29,%%v5\n\t"
  323. "vl %%v30,96(%%r1,%[a0])\n\t"
  324. "vfmadb %%v6,%%v22,%%v30,%%v6\n\t"
  325. "vl %%v31,112(%%r1,%[a0])\n\t"
  326. "vfmadb %%v7,%%v23,%%v31,%%v7\n\t"
  327. "agfi %%r1,128\n\t"
  328. "brctg %%r0,0b\n\t"
  /* Remainder: r0 = (n & 12) >> 2 = number of leftover 4-element groups. */
  329. "1:\n\t"
  330. "lghi %%r0,12\n\t"
  331. "ngr %%r0,%[n]\n\t"
  332. "ltgr %%r0,%%r0\n\t"
  333. "jz 3f\n\t"
  334. "srlg %%r0,%%r0,2\n\t"
  /* Remainder loop: 32 bytes per pass, using only v0 and v1. */
  335. "2:\n\t"
  336. "vl %%v16,0(%%r1,%[x])\n\t"
  337. "vl %%v17,16(%%r1,%[x])\n\t"
  338. "vl %%v24,0(%%r1,%[a0])\n\t"
  339. "vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
  340. "vl %%v25,16(%%r1,%[a0])\n\t"
  341. "vfmadb %%v1,%%v17,%%v25,%%v1\n\t"
  342. "agfi %%r1,32\n\t"
  343. "brctg %%r0,2b\n\t"
  /* Reduction: sum all eight accumulators into v0. */
  344. "3:\n\t"
  345. "vfadb %%v0,%%v0,%%v1\n\t"
  346. "vfadb %%v0,%%v0,%%v2\n\t"
  347. "vfadb %%v0,%%v0,%%v3\n\t"
  348. "vfadb %%v0,%%v0,%%v4\n\t"
  349. "vfadb %%v0,%%v0,%%v5\n\t"
  350. "vfadb %%v0,%%v0,%%v6\n\t"
  351. "vfadb %%v0,%%v0,%%v7\n\t"
  /* Horizontal add of the two lanes of v0, then store 8 bytes to y[0].
     Relies on the z/Architecture overlay of f0 with element 0 of v0. */
  352. "vrepg %%v1,%%v0,1\n\t"
  353. "adbr %%f0,%%f1\n\t"
  354. "std %%f0,0(%[y])"
  /* Operands: "m" entries describe the memory touched, "a" entries are the
     base addresses used in the template. */
  355. : "=m"(*(FLOAT (*)[1]) y)
  356. : [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0),
  357. "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n)
  358. : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
  359. "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
  360. "v26", "v27", "v28", "v29", "v30", "v31");
  361. }
  362. static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
  363. BLASLONG i;
  364. for (i = 0; i < n; i++) {
  365. dest[i] = *src;
  366. src += inc_src;
  367. }
  368. }
  /*
   * add_y_kernel_4: dest[i] += src[i] * da over contiguous vectors.
   *
   *   n    - element count; only the first (n & -4) elements are updated
   *          (16-element unrolled loop, then a 4-element loop), so a caller
   *          that needs every element updated must pass a multiple of 4.
   *   da   - scalar multiplier, replicated into both lanes of v0.
   *   src  - contiguous input vector (read only).
   *   dest - contiguous vector updated in place.
   */
  369. static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) {
  /* v0 = da replicated from memory; r1 = running byte offset, starting at 0. */
  370. __asm__("vlrepg %%v0,%[da]\n\t"
  371. "xgr %%r1,%%r1\n\t"
  /* r0 = (n & -16) >> 4 = number of 16-element iterations; skip if zero. */
  372. "lghi %%r0,-16\n\t"
  373. "ngr %%r0,%[n]\n\t"
  374. "ltgr %%r0,%%r0\n\t"
  375. "jz 1f\n\t"
  376. "srlg %%r0,%%r0,4\n\t"
  /* Main loop: 128 bytes (16 elements) of src and dest per pass. */
  377. "0:\n\t"
  /* Prefetch hints 1024 bytes ahead; dest uses a different pfd code than
     src because it is also written. */
  378. "pfd 1,1024(%%r1,%[src])\n\t"
  379. "pfd 2,1024(%%r1,%[dest])\n\t"
  /* Load eight consecutive 16-byte chunks of src into v16..v23. */
  380. "vl %%v16,0(%%r1,%[src])\n\t"
  381. "vl %%v17,16(%%r1,%[src])\n\t"
  382. "vl %%v18,32(%%r1,%[src])\n\t"
  383. "vl %%v19,48(%%r1,%[src])\n\t"
  384. "vl %%v20,64(%%r1,%[src])\n\t"
  385. "vl %%v21,80(%%r1,%[src])\n\t"
  386. "vl %%v22,96(%%r1,%[src])\n\t"
  387. "vl %%v23,112(%%r1,%[src])\n\t"
  /* For each chunk: load dest, multiply-add src chunk times v0 into it,
     store it back to the same location. */
  388. "vl %%v24, 0(%%r1,%[dest])\n\t"
  389. "vfmadb %%v24,%%v16,%%v0,%%v24\n\t"
  390. "vst %%v24, 0(%%r1,%[dest])\n\t"
  391. "vl %%v25, 16(%%r1,%[dest])\n\t"
  392. "vfmadb %%v25,%%v17,%%v0,%%v25\n\t"
  393. "vst %%v25, 16(%%r1,%[dest])\n\t"
  394. "vl %%v26, 32(%%r1,%[dest])\n\t"
  395. "vfmadb %%v26,%%v18,%%v0,%%v26\n\t"
  396. "vst %%v26, 32(%%r1,%[dest])\n\t"
  397. "vl %%v27, 48(%%r1,%[dest])\n\t"
  398. "vfmadb %%v27,%%v19,%%v0,%%v27\n\t"
  399. "vst %%v27, 48(%%r1,%[dest])\n\t"
  400. "vl %%v28, 64(%%r1,%[dest])\n\t"
  401. "vfmadb %%v28,%%v20,%%v0,%%v28\n\t"
  402. "vst %%v28, 64(%%r1,%[dest])\n\t"
  403. "vl %%v29, 80(%%r1,%[dest])\n\t"
  404. "vfmadb %%v29,%%v21,%%v0,%%v29\n\t"
  405. "vst %%v29, 80(%%r1,%[dest])\n\t"
  406. "vl %%v30, 96(%%r1,%[dest])\n\t"
  407. "vfmadb %%v30,%%v22,%%v0,%%v30\n\t"
  408. "vst %%v30, 96(%%r1,%[dest])\n\t"
  409. "vl %%v31, 112(%%r1,%[dest])\n\t"
  410. "vfmadb %%v31,%%v23,%%v0,%%v31\n\t"
  411. "vst %%v31, 112(%%r1,%[dest])\n\t"
  412. "agfi %%r1,128\n\t"
  413. "brctg %%r0,0b\n\t"
  /* Remainder: r0 = (n & 12) >> 2 = number of leftover 4-element groups. */
  414. "1:\n\t"
  415. "lghi %%r0,12\n\t"
  416. "ngr %%r0,%[n]\n\t"
  417. "ltgr %%r0,%%r0\n\t"
  418. "jz 3f\n\t"
  419. "srlg %%r0,%%r0,2\n\t"
  /* Remainder loop: 32 bytes (4 elements) per pass. */
  420. "2:\n\t"
  421. "vl %%v16,0(%%r1,%[src])\n\t"
  422. "vl %%v17,16(%%r1,%[src])\n\t"
  423. "vl %%v24, 0(%%r1,%[dest])\n\t"
  424. "vfmadb %%v24,%%v16,%%v0,%%v24\n\t"
  425. "vst %%v24, 0(%%r1,%[dest])\n\t"
  426. "vl %%v25, 16(%%r1,%[dest])\n\t"
  427. "vfmadb %%v25,%%v17,%%v0,%%v25\n\t"
  428. "vst %%v25, 16(%%r1,%[dest])\n\t"
  429. "agfi %%r1,32\n\t"
  430. "brctg %%r0,2b\n\t"
  /* Label 3 needs an instruction to attach to; nop 0 is that placeholder. */
  431. "3:\n\t"
  432. "nop 0"
  /* Operands: dest is a read-write memory operand, da is passed as a
     memory operand for vlrepg, src is read-only memory. */
  433. : "+m"(*(FLOAT (*)[n]) dest)
  434. : [dest] "a"(dest),[da] "Q"(da), "m"(*(const FLOAT (*)[n]) src),
  435. [src] "a"(src),[n] "r"(n)
  436. : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21",
  437. "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
  438. "v31");
  439. }
  440. static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest,
  441. BLASLONG inc_dest) {
  442. if (inc_dest == 1)
  443. add_y_kernel_4(n, da, src, dest);
  444. else {
  445. BLASLONG i;
  446. for (i = 0; i < n; i++) {
  447. *dest += src[i] * da;
  448. dest += inc_dest;
  449. }
  450. }
  451. }
  /*
   * Driver entry point (the name is supplied by the CNAME macro).
   *
   * For every j in [0, n) this adds
   *     alpha * sum over i in [0, m) of a[i + j * lda] * x[i * inc_x]
   * to y[j * inc_y]. Each output is the dot product of one lda-strided
   * vector of a with x, scaled by alpha.
   *
   *   m      - length of x and of each vector of a; returns at once if m < 1.
   *   n      - number of outputs in y; returns at once if n < 1.
   *   dummy1 - not used in this function.
   *   alpha  - scalar applied to every dot product before it is added to y.
   *   a      - matrix data; vector j starts at a + j * lda.
   *   lda    - distance in elements between consecutive vectors of a.
   *   x      - input vector with stride inc_x.
   *   y      - output vector with stride inc_y, updated in place.
   *   buffer - caller-provided scratch space. The first min(m, NBMAX)
   *            elements hold a contiguous copy of x when inc_x != 1, and
   *            the area after that holds up to NBMAX partial results.
   *            NOTE(review): the required size is not checked here and
   *            must be guaranteed by the caller.
   *
   * Always returns 0.
   */
  452. int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a,
  453. BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
  454. FLOAT *buffer) {
  455. BLASLONG register i;
  456. BLASLONG register j;
  457. FLOAT *a_ptr;
  458. FLOAT *x_ptr;
  459. FLOAT *y_ptr;
  460. BLASLONG n0;
  461. BLASLONG n1;
  462. BLASLONG m1;
  463. BLASLONG m2;
  464. BLASLONG m3;
  465. BLASLONG n2;
  /* Receives the two results of the 4x2 kernel or the one of the 4x1 kernel. */
  466. FLOAT ybuffer[2] __attribute__ ((aligned(16)));
  467. FLOAT *xbuffer;
  468. FLOAT *ytemp;
  469. if (m < 1)
  470. return (0);
  471. if (n < 1)
  472. return (0);
  /* Scratch layout: x copy first, partial results right after it. */
  473. xbuffer = buffer;
  474. ytemp = buffer + (m < NBMAX ? m : NBMAX);
  /* Split of the n outputs: n0 groups of NBMAX, then n1 groups of 4 from
     the remainder, then n2 (0..3) leftover outputs. */
  475. n0 = n / NBMAX;
  476. n1 = (n % NBMAX) >> 2;
  477. n2 = n & 3;
  /* Split of the m elements: m3 (0..3) leftovers are handled by the scalar
     code after the loop; m1 is m rounded down to a multiple of 4; m2 is
     the size of the final partial block (a multiple of 4 below NBMAX). */
  478. m3 = m & 3;
  479. m1 = m & -4;
  480. m2 = (m & (NBMAX - 1)) - m3;
  /* Walk the m dimension in blocks of NB elements. NB stays NBMAX for the
     full blocks; once m1 goes negative NB becomes m2 for one last pass,
     which also ends the loop, or the loop stops at once if m2 is 0. */
  481. BLASLONG NB = NBMAX;
  482. while (NB == NBMAX) {
  483. m1 -= NB;
  484. if (m1 < 0) {
  485. if (m2 == 0)
  486. break;
  487. NB = m2;
  488. }
  /* Every block starts again at the first output and first vector of a;
     a and x themselves are advanced at the bottom of the loop. */
  489. y_ptr = y;
  490. a_ptr = a;
  491. x_ptr = x;
  /* The kernels read x contiguously: use it in place when inc_x is 1,
     otherwise gather NB elements into the scratch copy. */
  492. if (inc_x == 1)
  493. xbuffer = x_ptr;
  494. else
  495. copy_x(NB, x_ptr, xbuffer, inc_x);
  496. FLOAT *ap[4];
  497. FLOAT *yp;
  498. BLASLONG register lda4 = 4 * lda;
  /* ap[0..3] address four consecutive vectors of a for the 4x4 kernel. */
  499. ap[0] = a_ptr;
  500. ap[1] = a_ptr + lda;
  501. ap[2] = ap[1] + lda;
  502. ap[3] = ap[2] + lda;
  /* Full groups of NBMAX outputs: NBMAX / 4 kernel calls fill ytemp, then
     add_y folds alpha * ytemp into y. */
  503. if (n0 > 0) {
  504. BLASLONG nb1 = NBMAX / 4;
  505. for (j = 0; j < n0; j++) {
  506. yp = ytemp;
  507. for (i = 0; i < nb1; i++) {
  508. dgemv_kernel_4x4(NB, ap, xbuffer, yp);
  509. ap[0] += lda4;
  510. ap[1] += lda4;
  511. ap[2] += lda4;
  512. ap[3] += lda4;
  513. yp += 4;
  514. }
  515. add_y(nb1 * 4, alpha, ytemp, y_ptr, inc_y);
  516. y_ptr += nb1 * inc_y * 4;
  /* a_ptr is kept in step with ap[0]. */
  517. a_ptr += nb1 * lda4;
  518. }
  519. }
  /* Remaining groups of four outputs. */
  520. yp = ytemp;
  521. for (i = 0; i < n1; i++) {
  522. dgemv_kernel_4x4(NB, ap, xbuffer, yp);
  523. ap[0] += lda4;
  524. ap[1] += lda4;
  525. ap[2] += lda4;
  526. ap[3] += lda4;
  527. yp += 4;
  528. }
  529. if (n1 > 0) {
  530. add_y(n1 * 4, alpha, ytemp, y_ptr, inc_y);
  531. y_ptr += n1 * inc_y * 4;
  532. a_ptr += n1 * lda4;
  533. }
  /* Two leftover outputs: the 4x2 kernel reads ap[0] and ap[1]; a_ptr is
     then moved past them so that the single-output case below starts at
     the right vector. */
  534. if (n2 & 2) {
  535. dgemv_kernel_4x2(NB, ap, xbuffer, ybuffer);
  536. a_ptr += lda * 2;
  537. *y_ptr += ybuffer[0] * alpha;
  538. y_ptr += inc_y;
  539. *y_ptr += ybuffer[1] * alpha;
  540. y_ptr += inc_y;
  541. }
  /* One leftover output, taken from a_ptr. */
  542. if (n2 & 1) {
  543. dgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer);
  544. // a_ptr += lda;
  545. *y_ptr += ybuffer[0] * alpha;
  546. // y_ptr += inc_y;
  547. }
  /* Advance to the next block of NB elements along m. */
  548. a += NB;
  549. x += NB * inc_x;
  550. }
  /* Scalar tail for the last m3 (1..3) elements along m. At this point a
     and x address the first element that the kernels did not process. */
  551. if (m3 == 0)
  552. return (0);
  553. x_ptr = x;
  554. a_ptr = a;
  /* Three leftover elements: the x values are scaled by alpha up front. */
  555. if (m3 == 3) {
  556. FLOAT xtemp0 = *x_ptr * alpha;
  557. x_ptr += inc_x;
  558. FLOAT xtemp1 = *x_ptr * alpha;
  559. x_ptr += inc_x;
  560. FLOAT xtemp2 = *x_ptr * alpha;
  561. FLOAT *aj = a_ptr;
  562. y_ptr = y;
  /* lda == 3 means the vectors of a are packed back to back, so fixed
     offsets can be used, four outputs per iteration. */
  563. if (lda == 3 && inc_y == 1) {
  564. for (j = 0; j < (n & -4); j += 4) {
  565. y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
  566. y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2;
  567. y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2;
  568. y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2;
  569. aj += 12;
  570. }
  571. for (; j < n; j++) {
  572. y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
  573. aj += 3;
  574. }
  575. } else {
  /* General lda with contiguous y: unrolled by four, then a remainder. */
  576. if (inc_y == 1) {
  577. BLASLONG register lda2 = lda << 1;
  578. BLASLONG register lda4 = lda << 2;
  579. BLASLONG register lda3 = lda2 + lda;
  580. for (j = 0; j < (n & -4); j += 4) {
  581. y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
  582. y_ptr[j + 1] +=
  583. *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda +
  584. 2) * xtemp2;
  585. y_ptr[j + 2] +=
  586. *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 +
  587. 2) * xtemp2;
  588. y_ptr[j + 3] +=
  589. *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 +
  590. 2) * xtemp2;
  591. aj += lda4;
  592. }
  593. for (; j < n; j++) {
  594. y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
  595. aj += lda;
  596. }
  597. } else {
  /* Strided y: one output per iteration. */
  598. for (j = 0; j < n; j++) {
  599. *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
  600. y_ptr += inc_y;
  601. aj += lda;
  602. }
  603. }
  604. }
  605. return (0);
  606. }
  /* Two leftover elements: same structure as the three-element case. */
  607. if (m3 == 2) {
  608. FLOAT xtemp0 = *x_ptr * alpha;
  609. x_ptr += inc_x;
  610. FLOAT xtemp1 = *x_ptr * alpha;
  611. FLOAT *aj = a_ptr;
  612. y_ptr = y;
  /* lda == 2: vectors of a are packed back to back. */
  613. if (lda == 2 && inc_y == 1) {
  614. for (j = 0; j < (n & -4); j += 4) {
  615. y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1;
  616. y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1;
  617. y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1;
  618. y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1;
  619. aj += 8;
  620. }
  621. for (; j < n; j++) {
  622. y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1;
  623. aj += 2;
  624. }
  625. } else {
  626. if (inc_y == 1) {
  627. BLASLONG register lda2 = lda << 1;
  628. BLASLONG register lda4 = lda << 2;
  629. BLASLONG register lda3 = lda2 + lda;
  630. for (j = 0; j < (n & -4); j += 4) {
  631. y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1;
  632. y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1;
  633. y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1;
  634. y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1;
  635. aj += lda4;
  636. }
  637. for (; j < n; j++) {
  638. y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1;
  639. aj += lda;
  640. }
  641. } else {
  642. for (j = 0; j < n; j++) {
  643. *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1;
  644. y_ptr += inc_y;
  645. aj += lda;
  646. }
  647. }
  648. }
  649. return (0);
  650. }
  /* One leftover element (m3 == 1). */
  651. FLOAT xtemp = *x_ptr * alpha;
  652. FLOAT *aj = a_ptr;
  653. y_ptr = y;
  /* lda == 1: the needed elements of a are contiguous, indexed by j. */
  654. if (lda == 1 && inc_y == 1) {
  655. for (j = 0; j < (n & -4); j += 4) {
  656. y_ptr[j] += aj[j] * xtemp;
  657. y_ptr[j + 1] += aj[j + 1] * xtemp;
  658. y_ptr[j + 2] += aj[j + 2] * xtemp;
  659. y_ptr[j + 3] += aj[j + 3] * xtemp;
  660. }
  661. for (; j < n; j++) {
  662. y_ptr[j] += aj[j] * xtemp;
  663. }
  664. } else {
  665. if (inc_y == 1) {
  666. BLASLONG register lda2 = lda << 1;
  667. BLASLONG register lda4 = lda << 2;
  668. BLASLONG register lda3 = lda2 + lda;
  669. for (j = 0; j < (n & -4); j += 4) {
  670. y_ptr[j] += *aj * xtemp;
  671. y_ptr[j + 1] += *(aj + lda) * xtemp;
  672. y_ptr[j + 2] += *(aj + lda2) * xtemp;
  673. y_ptr[j + 3] += *(aj + lda3) * xtemp;
  674. aj += lda4;
  675. }
  676. for (; j < n; j++) {
  677. y_ptr[j] += *aj * xtemp;
  678. aj += lda;
  679. }
  680. } else {
  681. for (j = 0; j < n; j++) {
  682. *y_ptr += *aj * xtemp;
  683. y_ptr += inc_y;
  684. aj += lda;
  685. }
  686. }
  687. }
  688. return (0);
  689. }