You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

dgemv_t_4.c 24 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752
  /***************************************************************************
  Copyright (c) 2019, The OpenBLAS Project
  All rights reserved.
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are
  met:
  1. Redistributions of source code must retain the above copyright
  notice, this list of conditions and the following disclaimer.
  2. Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer in
  the documentation and/or other materials provided with the
  distribution.
  3. Neither the name of the OpenBLAS project nor the names of
  its contributors may be used to endorse or promote products
  derived from this software without specific prior written permission.
  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *****************************************************************************/
  27. #include "common.h"
  28. #define NBMAX 2048
  29. static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
  30. __asm__("vzero %%v0\n\t"
  31. "vzero %%v1\n\t"
  32. "vzero %%v2\n\t"
  33. "vzero %%v3\n\t"
  34. "vzero %%v4\n\t"
  35. "vzero %%v5\n\t"
  36. "vzero %%v6\n\t"
  37. "vzero %%v7\n\t"
  38. "xgr %%r1,%%r1\n\t"
  39. "lghi %%r0,-16\n\t"
  40. "ngr %%r0,%[n]\n\t"
  41. "ltgr %%r0,%%r0\n\t"
  42. "jz 1f\n\t"
  43. "srlg %%r0,%%r0,4\n\t"
  44. "0:\n\t"
  45. "pfd 1,1024(%%r1,%[ap0])\n\t"
  46. "pfd 1,1024(%%r1,%[ap1])\n\t"
  47. "pfd 1,1024(%%r1,%[ap2])\n\t"
  48. "pfd 1,1024(%%r1,%[ap3])\n\t"
  49. "pfd 1,1024(%%r1,%[x])\n\t"
  50. "vl %%v16,0(%%r1,%[x])\n\t"
  51. "vl %%v17,16(%%r1,%[x])\n\t"
  52. "vl %%v18,32(%%r1,%[x])\n\t"
  53. "vl %%v19,48(%%r1,%[x])\n\t"
  54. "vl %%v20,64(%%r1,%[x])\n\t"
  55. "vl %%v21,80(%%r1,%[x])\n\t"
  56. "vl %%v22,96(%%r1,%[x])\n\t"
  57. "vl %%v23,112(%%r1,%[x])\n\t"
  58. "vl %%v24,0(%%r1,%[ap0])\n\t"
  59. "vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
  60. "vl %%v25,0(%%r1,%[ap1])\n\t"
  61. "vfmadb %%v1,%%v16,%%v25,%%v1\n\t"
  62. "vl %%v26,0(%%r1,%[ap2])\n\t"
  63. "vfmadb %%v2,%%v16,%%v26,%%v2\n\t"
  64. "vl %%v27,0(%%r1,%[ap3])\n\t"
  65. "vfmadb %%v3,%%v16,%%v27,%%v3\n\t"
  66. "vl %%v28,16(%%r1,%[ap0])\n\t"
  67. "vfmadb %%v4,%%v17,%%v28,%%v4\n\t"
  68. "vl %%v29,16(%%r1,%[ap1])\n\t"
  69. "vfmadb %%v5,%%v17,%%v29,%%v5\n\t"
  70. "vl %%v30,16(%%r1,%[ap2])\n\t"
  71. "vfmadb %%v6,%%v17,%%v30,%%v6\n\t"
  72. "vl %%v31,16(%%r1,%[ap3])\n\t"
  73. "vfmadb %%v7,%%v17,%%v31,%%v7\n\t"
  74. "vl %%v24,32(%%r1,%[ap0])\n\t"
  75. "vfmadb %%v0,%%v18,%%v24,%%v0\n\t"
  76. "vl %%v25,32(%%r1,%[ap1])\n\t"
  77. "vfmadb %%v1,%%v18,%%v25,%%v1\n\t"
  78. "vl %%v26,32(%%r1,%[ap2])\n\t"
  79. "vfmadb %%v2,%%v18,%%v26,%%v2\n\t"
  80. "vl %%v27,32(%%r1,%[ap3])\n\t"
  81. "vfmadb %%v3,%%v18,%%v27,%%v3\n\t"
  82. "vl %%v28,48(%%r1,%[ap0])\n\t"
  83. "vfmadb %%v4,%%v19,%%v28,%%v4\n\t"
  84. "vl %%v29,48(%%r1,%[ap1])\n\t"
  85. "vfmadb %%v5,%%v19,%%v29,%%v5\n\t"
  86. "vl %%v30,48(%%r1,%[ap2])\n\t"
  87. "vfmadb %%v6,%%v19,%%v30,%%v6\n\t"
  88. "vl %%v31,48(%%r1,%[ap3])\n\t"
  89. "vfmadb %%v7,%%v19,%%v31,%%v7\n\t"
  90. "vl %%v24,64(%%r1,%[ap0])\n\t"
  91. "vfmadb %%v0,%%v20,%%v24,%%v0\n\t"
  92. "vl %%v25,64(%%r1,%[ap1])\n\t"
  93. "vfmadb %%v1,%%v20,%%v25,%%v1\n\t"
  94. "vl %%v26,64(%%r1,%[ap2])\n\t"
  95. "vfmadb %%v2,%%v20,%%v26,%%v2\n\t"
  96. "vl %%v27,64(%%r1,%[ap3])\n\t"
  97. "vfmadb %%v3,%%v20,%%v27,%%v3\n\t"
  98. "vl %%v28,80(%%r1,%[ap0])\n\t"
  99. "vfmadb %%v4,%%v21,%%v28,%%v4\n\t"
  100. "vl %%v29,80(%%r1,%[ap1])\n\t"
  101. "vfmadb %%v5,%%v21,%%v29,%%v5\n\t"
  102. "vl %%v30,80(%%r1,%[ap2])\n\t"
  103. "vfmadb %%v6,%%v21,%%v30,%%v6\n\t"
  104. "vl %%v31,80(%%r1,%[ap3])\n\t"
  105. "vfmadb %%v7,%%v21,%%v31,%%v7\n\t"
  106. "vl %%v24,96(%%r1,%[ap0])\n\t"
  107. "vfmadb %%v0,%%v22,%%v24,%%v0\n\t"
  108. "vl %%v25,96(%%r1,%[ap1])\n\t"
  109. "vfmadb %%v1,%%v22,%%v25,%%v1\n\t"
  110. "vl %%v26,96(%%r1,%[ap2])\n\t"
  111. "vfmadb %%v2,%%v22,%%v26,%%v2\n\t"
  112. "vl %%v27,96(%%r1,%[ap3])\n\t"
  113. "vfmadb %%v3,%%v22,%%v27,%%v3\n\t"
  114. "vl %%v28,112(%%r1,%[ap0])\n\t"
  115. "vfmadb %%v4,%%v23,%%v28,%%v4\n\t"
  116. "vl %%v29,112(%%r1,%[ap1])\n\t"
  117. "vfmadb %%v5,%%v23,%%v29,%%v5\n\t"
  118. "vl %%v30,112(%%r1,%[ap2])\n\t"
  119. "vfmadb %%v6,%%v23,%%v30,%%v6\n\t"
  120. "vl %%v31,112(%%r1,%[ap3])\n\t"
  121. "vfmadb %%v7,%%v23,%%v31,%%v7\n\t"
  122. "agfi %%r1,128\n\t"
  123. "brctg %%r0,0b\n\t"
  124. "1:\n\t"
  125. "lghi %%r0,12\n\t"
  126. "ngr %%r0,%[n]\n\t"
  127. "ltgr %%r0,%%r0\n\t"
  128. "jz 3f\n\t"
  129. "srlg %%r0,%%r0,2\n\t"
  130. "2:\n\t"
  131. "vl %%v16,0(%%r1,%[x])\n\t"
  132. "vl %%v17,16(%%r1,%[x])\n\t"
  133. "vl %%v24,0(%%r1,%[ap0])\n\t"
  134. "vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
  135. "vl %%v25,0(%%r1,%[ap1])\n\t"
  136. "vfmadb %%v1,%%v16,%%v25,%%v1\n\t"
  137. "vl %%v26,0(%%r1,%[ap2])\n\t"
  138. "vfmadb %%v2,%%v16,%%v26,%%v2\n\t"
  139. "vl %%v27,0(%%r1,%[ap3])\n\t"
  140. "vfmadb %%v3,%%v16,%%v27,%%v3\n\t"
  141. "vl %%v28,16(%%r1,%[ap0])\n\t"
  142. "vfmadb %%v4,%%v17,%%v28,%%v4\n\t"
  143. "vl %%v29,16(%%r1,%[ap1])\n\t"
  144. "vfmadb %%v5,%%v17,%%v29,%%v5\n\t"
  145. "vl %%v30,16(%%r1,%[ap2])\n\t"
  146. "vfmadb %%v6,%%v17,%%v30,%%v6\n\t"
  147. "vl %%v31,16(%%r1,%[ap3])\n\t"
  148. "vfmadb %%v7,%%v17,%%v31,%%v7\n\t"
  149. "agfi %%r1,32\n\t"
  150. "brctg %%r0,2b\n\t"
  151. "3:\n\t"
  152. "vfadb %%v0,%%v0,%%v4\n\t"
  153. "vfadb %%v1,%%v1,%%v5\n\t"
  154. "vfadb %%v2,%%v2,%%v6\n\t"
  155. "vfadb %%v3,%%v3,%%v7\n\t"
  156. "vrepg %%v4,%%v0,1\n\t"
  157. "adbr %%f0,%%f4\n\t"
  158. "std %%f0,0(%[y])\n\t"
  159. "vrepg %%v4,%%v1,1\n\t"
  160. "adbr %%f1,%%f4\n\t"
  161. "std %%f1,8(%[y])\n\t"
  162. "vrepg %%v4,%%v2,1\n\t"
  163. "adbr %%f2,%%f4\n\t"
  164. "std %%f2,16(%[y])\n\t"
  165. "vrepg %%v4,%%v3,1\n\t"
  166. "adbr %%f3,%%f4\n\t"
  167. "std %%f3,24(%[y])"
  168. : "=m"(*(FLOAT (*)[4]) y)
  169. : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]),
  170. "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]),
  171. "m"(*(const FLOAT (*)[n]) ap[2]),[ap2] "a"(ap[2]),
  172. "m"(*(const FLOAT (*)[n]) ap[3]),[ap3] "a"(ap[3]),
  173. "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n)
  174. : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
  175. "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
  176. "v26", "v27", "v28", "v29", "v30", "v31");
  177. }
  178. static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
  179. __asm__("vzero %%v0\n\t"
  180. "vzero %%v1\n\t"
  181. "vzero %%v2\n\t"
  182. "vzero %%v3\n\t"
  183. "vzero %%v4\n\t"
  184. "vzero %%v5\n\t"
  185. "vzero %%v6\n\t"
  186. "vzero %%v7\n\t"
  187. "xgr %%r1,%%r1\n\t"
  188. "lghi %%r0,-16\n\t"
  189. "ngr %%r0,%[n]\n\t"
  190. "ltgr %%r0,%%r0\n\t"
  191. "jz 1f\n\t"
  192. "srlg %%r0,%%r0,4\n\t"
  193. "0:\n\t"
  194. "pfd 1,1024(%%r1,%[ap0])\n\t"
  195. "pfd 1,1024(%%r1,%[ap1])\n\t"
  196. "pfd 1,1024(%%r1,%[x])\n\t"
  197. "vl %%v16,0(%%r1,%[x])\n\t"
  198. "vl %%v17,16(%%r1,%[x])\n\t"
  199. "vl %%v18,32(%%r1,%[x])\n\t"
  200. "vl %%v19,48(%%r1,%[x])\n\t"
  201. "vl %%v20,64(%%r1,%[x])\n\t"
  202. "vl %%v21,80(%%r1,%[x])\n\t"
  203. "vl %%v22,96(%%r1,%[x])\n\t"
  204. "vl %%v23,112(%%r1,%[x])\n\t"
  205. "vl %%v24,0(%%r1,%[ap0])\n\t"
  206. "vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
  207. "vl %%v25,0(%%r1,%[ap1])\n\t"
  208. "vfmadb %%v1,%%v16,%%v25,%%v1\n\t"
  209. "vl %%v26,16(%%r1,%[ap0])\n\t"
  210. "vfmadb %%v2,%%v17,%%v26,%%v2\n\t"
  211. "vl %%v27,16(%%r1,%[ap1])\n\t"
  212. "vfmadb %%v3,%%v17,%%v27,%%v3\n\t"
  213. "vl %%v28,32(%%r1,%[ap0])\n\t"
  214. "vfmadb %%v4,%%v18,%%v28,%%v4\n\t"
  215. "vl %%v29,32(%%r1,%[ap1])\n\t"
  216. "vfmadb %%v5,%%v18,%%v29,%%v5\n\t"
  217. "vl %%v30,48(%%r1,%[ap0])\n\t"
  218. "vfmadb %%v6,%%v19,%%v30,%%v6\n\t"
  219. "vl %%v31,48(%%r1,%[ap1])\n\t"
  220. "vfmadb %%v7,%%v19,%%v31,%%v7\n\t"
  221. "vl %%v24,64(%%r1,%[ap0])\n\t"
  222. "vfmadb %%v0,%%v20,%%v24,%%v0\n\t"
  223. "vl %%v25,64(%%r1,%[ap1])\n\t"
  224. "vfmadb %%v1,%%v20,%%v25,%%v1\n\t"
  225. "vl %%v26,80(%%r1,%[ap0])\n\t"
  226. "vfmadb %%v2,%%v21,%%v26,%%v2\n\t"
  227. "vl %%v27,80(%%r1,%[ap1])\n\t"
  228. "vfmadb %%v3,%%v21,%%v27,%%v3\n\t"
  229. "vl %%v28,96(%%r1,%[ap0])\n\t"
  230. "vfmadb %%v4,%%v22,%%v28,%%v4\n\t"
  231. "vl %%v29,96(%%r1,%[ap1])\n\t"
  232. "vfmadb %%v5,%%v22,%%v29,%%v5\n\t"
  233. "vl %%v30,112(%%r1,%[ap0])\n\t"
  234. "vfmadb %%v6,%%v23,%%v30,%%v6\n\t"
  235. "vl %%v31,112(%%r1,%[ap1])\n\t"
  236. "vfmadb %%v7,%%v23,%%v31,%%v7\n\t"
  237. "agfi %%r1,128\n\t"
  238. "brctg %%r0,0b\n\t"
  239. "1:\n\t"
  240. "lghi %%r0,12\n\t"
  241. "ngr %%r0,%[n]\n\t"
  242. "ltgr %%r0,%%r0\n\t"
  243. "jz 3f\n\t"
  244. "srlg %%r0,%%r0,2\n\t"
  245. "2:\n\t"
  246. "vl %%v16,0(%%r1,%[x])\n\t"
  247. "vl %%v17,16(%%r1,%[x])\n\t"
  248. "vl %%v24,0(%%r1,%[ap0])\n\t"
  249. "vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
  250. "vl %%v25,0(%%r1,%[ap1])\n\t"
  251. "vfmadb %%v1,%%v16,%%v25,%%v1\n\t"
  252. "vl %%v26,16(%%r1,%[ap0])\n\t"
  253. "vfmadb %%v2,%%v17,%%v26,%%v2\n\t"
  254. "vl %%v27,16(%%r1,%[ap1])\n\t"
  255. "vfmadb %%v3,%%v17,%%v27,%%v3\n\t"
  256. "agfi %%r1,32\n\t"
  257. "brctg %%r0,2b\n\t"
  258. "3:\n\t"
  259. "vfadb %%v0,%%v0,%%v2\n\t"
  260. "vfadb %%v0,%%v0,%%v4\n\t"
  261. "vfadb %%v0,%%v0,%%v6\n\t"
  262. "vfadb %%v1,%%v1,%%v3\n\t"
  263. "vfadb %%v1,%%v1,%%v5\n\t"
  264. "vfadb %%v1,%%v1,%%v7\n\t"
  265. "vrepg %%v2,%%v0,1\n\t"
  266. "adbr %%f0,%%f2\n\t"
  267. "std %%f0,0(%[y])\n\t"
  268. "vrepg %%v2,%%v1,1\n\t"
  269. "adbr %%f1,%%f2\n\t"
  270. "std %%f1,8(%[y])"
  271. : "=m"(*(FLOAT (*)[2]) y)
  272. : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]),
  273. "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]),
  274. "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n)
  275. : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
  276. "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
  277. "v26", "v27", "v28", "v29", "v30", "v31");
  278. }
  279. static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) {
  280. __asm__("vzero %%v0\n\t"
  281. "vzero %%v1\n\t"
  282. "vzero %%v2\n\t"
  283. "vzero %%v3\n\t"
  284. "vzero %%v4\n\t"
  285. "vzero %%v5\n\t"
  286. "vzero %%v6\n\t"
  287. "vzero %%v7\n\t"
  288. "xgr %%r1,%%r1\n\t"
  289. "lghi %%r0,-16\n\t"
  290. "ngr %%r0,%[n]\n\t"
  291. "ltgr %%r0,%%r0\n\t"
  292. "jz 1f\n\t"
  293. "srlg %%r0,%%r0,4\n\t"
  294. "0:\n\t"
  295. "pfd 1,1024(%%r1,%[a0])\n\t"
  296. "pfd 1,1024(%%r1,%[x])\n\t"
  297. "vl %%v16,0(%%r1,%[x])\n\t"
  298. "vl %%v17,16(%%r1,%[x])\n\t"
  299. "vl %%v18,32(%%r1,%[x])\n\t"
  300. "vl %%v19,48(%%r1,%[x])\n\t"
  301. "vl %%v20,64(%%r1,%[x])\n\t"
  302. "vl %%v21,80(%%r1,%[x])\n\t"
  303. "vl %%v22,96(%%r1,%[x])\n\t"
  304. "vl %%v23,112(%%r1,%[x])\n\t"
  305. "vl %%v24,0(%%r1,%[a0])\n\t"
  306. "vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
  307. "vl %%v25,16(%%r1,%[a0])\n\t"
  308. "vfmadb %%v1,%%v17,%%v25,%%v1\n\t"
  309. "vl %%v26,32(%%r1,%[a0])\n\t"
  310. "vfmadb %%v2,%%v18,%%v26,%%v2\n\t"
  311. "vl %%v27,48(%%r1,%[a0])\n\t"
  312. "vfmadb %%v3,%%v19,%%v27,%%v3\n\t"
  313. "vl %%v28,64(%%r1,%[a0])\n\t"
  314. "vfmadb %%v4,%%v20,%%v28,%%v4\n\t"
  315. "vl %%v29,80(%%r1,%[a0])\n\t"
  316. "vfmadb %%v5,%%v21,%%v29,%%v5\n\t"
  317. "vl %%v30,96(%%r1,%[a0])\n\t"
  318. "vfmadb %%v6,%%v22,%%v30,%%v6\n\t"
  319. "vl %%v31,112(%%r1,%[a0])\n\t"
  320. "vfmadb %%v7,%%v23,%%v31,%%v7\n\t"
  321. "agfi %%r1,128\n\t"
  322. "brctg %%r0,0b\n\t"
  323. "1:\n\t"
  324. "lghi %%r0,12\n\t"
  325. "ngr %%r0,%[n]\n\t"
  326. "ltgr %%r0,%%r0\n\t"
  327. "jz 3f\n\t"
  328. "srlg %%r0,%%r0,2\n\t"
  329. "2:\n\t"
  330. "vl %%v16,0(%%r1,%[x])\n\t"
  331. "vl %%v17,16(%%r1,%[x])\n\t"
  332. "vl %%v24,0(%%r1,%[a0])\n\t"
  333. "vfmadb %%v0,%%v16,%%v24,%%v0\n\t"
  334. "vl %%v25,16(%%r1,%[a0])\n\t"
  335. "vfmadb %%v1,%%v17,%%v25,%%v1\n\t"
  336. "agfi %%r1,32\n\t"
  337. "brctg %%r0,2b\n\t"
  338. "3:\n\t"
  339. "vfadb %%v0,%%v0,%%v1\n\t"
  340. "vfadb %%v0,%%v0,%%v2\n\t"
  341. "vfadb %%v0,%%v0,%%v3\n\t"
  342. "vfadb %%v0,%%v0,%%v4\n\t"
  343. "vfadb %%v0,%%v0,%%v5\n\t"
  344. "vfadb %%v0,%%v0,%%v6\n\t"
  345. "vfadb %%v0,%%v0,%%v7\n\t"
  346. "vrepg %%v1,%%v0,1\n\t"
  347. "adbr %%f0,%%f1\n\t"
  348. "std %%f0,0(%[y])"
  349. : "=m"(*(FLOAT (*)[1]) y)
  350. : [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0),
  351. "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n)
  352. : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
  353. "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
  354. "v26", "v27", "v28", "v29", "v30", "v31");
  355. }
  356. static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
  357. BLASLONG i;
  358. for (i = 0; i < n; i++) {
  359. dest[i] = *src;
  360. src += inc_src;
  361. }
  362. }
  363. static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) {
  364. __asm__("vlrepg %%v0,%[da]\n\t"
  365. "xgr %%r1,%%r1\n\t"
  366. "lghi %%r0,-16\n\t"
  367. "ngr %%r0,%[n]\n\t"
  368. "ltgr %%r0,%%r0\n\t"
  369. "jz 1f\n\t"
  370. "srlg %%r0,%%r0,4\n\t"
  371. "0:\n\t"
  372. "pfd 1,1024(%%r1,%[src])\n\t"
  373. "pfd 2,1024(%%r1,%[dest])\n\t"
  374. "vl %%v16,0(%%r1,%[src])\n\t"
  375. "vl %%v17,16(%%r1,%[src])\n\t"
  376. "vl %%v18,32(%%r1,%[src])\n\t"
  377. "vl %%v19,48(%%r1,%[src])\n\t"
  378. "vl %%v20,64(%%r1,%[src])\n\t"
  379. "vl %%v21,80(%%r1,%[src])\n\t"
  380. "vl %%v22,96(%%r1,%[src])\n\t"
  381. "vl %%v23,112(%%r1,%[src])\n\t"
  382. "vl %%v24, 0(%%r1,%[dest])\n\t"
  383. "vfmadb %%v24,%%v16,%%v0,%%v24\n\t"
  384. "vst %%v24, 0(%%r1,%[dest])\n\t"
  385. "vl %%v25, 16(%%r1,%[dest])\n\t"
  386. "vfmadb %%v25,%%v17,%%v0,%%v25\n\t"
  387. "vst %%v25, 16(%%r1,%[dest])\n\t"
  388. "vl %%v26, 32(%%r1,%[dest])\n\t"
  389. "vfmadb %%v26,%%v18,%%v0,%%v26\n\t"
  390. "vst %%v26, 32(%%r1,%[dest])\n\t"
  391. "vl %%v27, 48(%%r1,%[dest])\n\t"
  392. "vfmadb %%v27,%%v19,%%v0,%%v27\n\t"
  393. "vst %%v27, 48(%%r1,%[dest])\n\t"
  394. "vl %%v28, 64(%%r1,%[dest])\n\t"
  395. "vfmadb %%v28,%%v20,%%v0,%%v28\n\t"
  396. "vst %%v28, 64(%%r1,%[dest])\n\t"
  397. "vl %%v29, 80(%%r1,%[dest])\n\t"
  398. "vfmadb %%v29,%%v21,%%v0,%%v29\n\t"
  399. "vst %%v29, 80(%%r1,%[dest])\n\t"
  400. "vl %%v30, 96(%%r1,%[dest])\n\t"
  401. "vfmadb %%v30,%%v22,%%v0,%%v30\n\t"
  402. "vst %%v30, 96(%%r1,%[dest])\n\t"
  403. "vl %%v31, 112(%%r1,%[dest])\n\t"
  404. "vfmadb %%v31,%%v23,%%v0,%%v31\n\t"
  405. "vst %%v31, 112(%%r1,%[dest])\n\t"
  406. "agfi %%r1,128\n\t"
  407. "brctg %%r0,0b\n\t"
  408. "1:\n\t"
  409. "lghi %%r0,12\n\t"
  410. "ngr %%r0,%[n]\n\t"
  411. "ltgr %%r0,%%r0\n\t"
  412. "jz 3f\n\t"
  413. "srlg %%r0,%%r0,2\n\t"
  414. "2:\n\t"
  415. "vl %%v16,0(%%r1,%[src])\n\t"
  416. "vl %%v17,16(%%r1,%[src])\n\t"
  417. "vl %%v24, 0(%%r1,%[dest])\n\t"
  418. "vfmadb %%v24,%%v16,%%v0,%%v24\n\t"
  419. "vst %%v24, 0(%%r1,%[dest])\n\t"
  420. "vl %%v25, 16(%%r1,%[dest])\n\t"
  421. "vfmadb %%v25,%%v17,%%v0,%%v25\n\t"
  422. "vst %%v25, 16(%%r1,%[dest])\n\t"
  423. "agfi %%r1,32\n\t"
  424. "brctg %%r0,2b\n\t"
  425. "3:\n\t"
  426. "nop"
  427. : "+m"(*(FLOAT (*)[n]) dest)
  428. : [dest] "a"(dest),[da] "m"(da), "m"(*(const FLOAT (*)[n]) src),
  429. [src] "a"(src),[n] "r"(n)
  430. : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21",
  431. "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
  432. "v31");
  433. }
  434. static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest,
  435. BLASLONG inc_dest) {
  436. if (inc_dest == 1)
  437. add_y_kernel_4(n, da, src, dest);
  438. else {
  439. BLASLONG i;
  440. for (i = 0; i < n; i++) {
  441. *dest += src[i] * da;
  442. dest += inc_dest;
  443. }
  444. }
  445. }
  446. int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a,
  447. BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
  448. FLOAT *buffer) {
  449. BLASLONG register i;
  450. BLASLONG register j;
  451. FLOAT *a_ptr;
  452. FLOAT *x_ptr;
  453. FLOAT *y_ptr;
  454. BLASLONG n0;
  455. BLASLONG n1;
  456. BLASLONG m1;
  457. BLASLONG m2;
  458. BLASLONG m3;
  459. BLASLONG n2;
  460. FLOAT ybuffer[2] __attribute__ ((aligned(16)));
  461. FLOAT *xbuffer;
  462. FLOAT *ytemp;
  463. if (m < 1)
  464. return (0);
  465. if (n < 1)
  466. return (0);
  467. xbuffer = buffer;
  468. ytemp = buffer + (m < NBMAX ? m : NBMAX);
  469. n0 = n / NBMAX;
  470. n1 = (n % NBMAX) >> 2;
  471. n2 = n & 3;
  472. m3 = m & 3;
  473. m1 = m & -4;
  474. m2 = (m & (NBMAX - 1)) - m3;
  475. BLASLONG NB = NBMAX;
  476. while (NB == NBMAX) {
  477. m1 -= NB;
  478. if (m1 < 0) {
  479. if (m2 == 0)
  480. break;
  481. NB = m2;
  482. }
  483. y_ptr = y;
  484. a_ptr = a;
  485. x_ptr = x;
  486. if (inc_x == 1)
  487. xbuffer = x_ptr;
  488. else
  489. copy_x(NB, x_ptr, xbuffer, inc_x);
  490. FLOAT *ap[4];
  491. FLOAT *yp;
  492. BLASLONG register lda4 = 4 * lda;
  493. ap[0] = a_ptr;
  494. ap[1] = a_ptr + lda;
  495. ap[2] = ap[1] + lda;
  496. ap[3] = ap[2] + lda;
  497. if (n0 > 0) {
  498. BLASLONG nb1 = NBMAX / 4;
  499. for (j = 0; j < n0; j++) {
  500. yp = ytemp;
  501. for (i = 0; i < nb1; i++) {
  502. dgemv_kernel_4x4(NB, ap, xbuffer, yp);
  503. ap[0] += lda4;
  504. ap[1] += lda4;
  505. ap[2] += lda4;
  506. ap[3] += lda4;
  507. yp += 4;
  508. }
  509. add_y(nb1 * 4, alpha, ytemp, y_ptr, inc_y);
  510. y_ptr += nb1 * inc_y * 4;
  511. a_ptr += nb1 * lda4;
  512. }
  513. }
  514. yp = ytemp;
  515. for (i = 0; i < n1; i++) {
  516. dgemv_kernel_4x4(NB, ap, xbuffer, yp);
  517. ap[0] += lda4;
  518. ap[1] += lda4;
  519. ap[2] += lda4;
  520. ap[3] += lda4;
  521. yp += 4;
  522. }
  523. if (n1 > 0) {
  524. add_y(n1 * 4, alpha, ytemp, y_ptr, inc_y);
  525. y_ptr += n1 * inc_y * 4;
  526. a_ptr += n1 * lda4;
  527. }
  528. if (n2 & 2) {
  529. dgemv_kernel_4x2(NB, ap, xbuffer, ybuffer);
  530. a_ptr += lda * 2;
  531. *y_ptr += ybuffer[0] * alpha;
  532. y_ptr += inc_y;
  533. *y_ptr += ybuffer[1] * alpha;
  534. y_ptr += inc_y;
  535. }
  536. if (n2 & 1) {
  537. dgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer);
  538. // a_ptr += lda;
  539. *y_ptr += ybuffer[0] * alpha;
  540. // y_ptr += inc_y;
  541. }
  542. a += NB;
  543. x += NB * inc_x;
  544. }
  545. if (m3 == 0)
  546. return (0);
  547. x_ptr = x;
  548. a_ptr = a;
  549. if (m3 == 3) {
  550. FLOAT xtemp0 = *x_ptr * alpha;
  551. x_ptr += inc_x;
  552. FLOAT xtemp1 = *x_ptr * alpha;
  553. x_ptr += inc_x;
  554. FLOAT xtemp2 = *x_ptr * alpha;
  555. FLOAT *aj = a_ptr;
  556. y_ptr = y;
  557. if (lda == 3 && inc_y == 1) {
  558. for (j = 0; j < (n & -4); j += 4) {
  559. y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
  560. y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2;
  561. y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2;
  562. y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2;
  563. aj += 12;
  564. }
  565. for (; j < n; j++) {
  566. y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
  567. aj += 3;
  568. }
  569. } else {
  570. if (inc_y == 1) {
  571. BLASLONG register lda2 = lda << 1;
  572. BLASLONG register lda4 = lda << 2;
  573. BLASLONG register lda3 = lda2 + lda;
  574. for (j = 0; j < (n & -4); j += 4) {
  575. y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
  576. y_ptr[j + 1] +=
  577. *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda +
  578. 2) * xtemp2;
  579. y_ptr[j + 2] +=
  580. *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 +
  581. 2) * xtemp2;
  582. y_ptr[j + 3] +=
  583. *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 +
  584. 2) * xtemp2;
  585. aj += lda4;
  586. }
  587. for (; j < n; j++) {
  588. y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
  589. aj += lda;
  590. }
  591. } else {
  592. for (j = 0; j < n; j++) {
  593. *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
  594. y_ptr += inc_y;
  595. aj += lda;
  596. }
  597. }
  598. }
  599. return (0);
  600. }
  601. if (m3 == 2) {
  602. FLOAT xtemp0 = *x_ptr * alpha;
  603. x_ptr += inc_x;
  604. FLOAT xtemp1 = *x_ptr * alpha;
  605. FLOAT *aj = a_ptr;
  606. y_ptr = y;
  607. if (lda == 2 && inc_y == 1) {
  608. for (j = 0; j < (n & -4); j += 4) {
  609. y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1;
  610. y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1;
  611. y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1;
  612. y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1;
  613. aj += 8;
  614. }
  615. for (; j < n; j++) {
  616. y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1;
  617. aj += 2;
  618. }
  619. } else {
  620. if (inc_y == 1) {
  621. BLASLONG register lda2 = lda << 1;
  622. BLASLONG register lda4 = lda << 2;
  623. BLASLONG register lda3 = lda2 + lda;
  624. for (j = 0; j < (n & -4); j += 4) {
  625. y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1;
  626. y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1;
  627. y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1;
  628. y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1;
  629. aj += lda4;
  630. }
  631. for (; j < n; j++) {
  632. y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1;
  633. aj += lda;
  634. }
  635. } else {
  636. for (j = 0; j < n; j++) {
  637. *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1;
  638. y_ptr += inc_y;
  639. aj += lda;
  640. }
  641. }
  642. }
  643. return (0);
  644. }
  645. FLOAT xtemp = *x_ptr * alpha;
  646. FLOAT *aj = a_ptr;
  647. y_ptr = y;
  648. if (lda == 1 && inc_y == 1) {
  649. for (j = 0; j < (n & -4); j += 4) {
  650. y_ptr[j] += aj[j] * xtemp;
  651. y_ptr[j + 1] += aj[j + 1] * xtemp;
  652. y_ptr[j + 2] += aj[j + 2] * xtemp;
  653. y_ptr[j + 3] += aj[j + 3] * xtemp;
  654. }
  655. for (; j < n; j++) {
  656. y_ptr[j] += aj[j] * xtemp;
  657. }
  658. } else {
  659. if (inc_y == 1) {
  660. BLASLONG register lda2 = lda << 1;
  661. BLASLONG register lda4 = lda << 2;
  662. BLASLONG register lda3 = lda2 + lda;
  663. for (j = 0; j < (n & -4); j += 4) {
  664. y_ptr[j] += *aj * xtemp;
  665. y_ptr[j + 1] += *(aj + lda) * xtemp;
  666. y_ptr[j + 2] += *(aj + lda2) * xtemp;
  667. y_ptr[j + 3] += *(aj + lda3) * xtemp;
  668. aj += lda4;
  669. }
  670. for (; j < n; j++) {
  671. y_ptr[j] += *aj * xtemp;
  672. aj += lda;
  673. }
  674. } else {
  675. for (j = 0; j < n; j++) {
  676. *y_ptr += *aj * xtemp;
  677. y_ptr += inc_y;
  678. aj += lda;
  679. }
  680. }
  681. }
  682. return (0);
  683. }