You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

sgemv_t_4.c 23 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745
  1. /***************************************************************************
  2. Copyright (c) 2019, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. #include "common.h"
  28. #define NBMAX 2048
  29. static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
  30. __asm__("vzero %%v0\n\t"
  31. "vzero %%v1\n\t"
  32. "vzero %%v2\n\t"
  33. "vzero %%v3\n\t"
  34. "vzero %%v4\n\t"
  35. "vzero %%v5\n\t"
  36. "vzero %%v6\n\t"
  37. "vzero %%v7\n\t"
  38. "xgr %%r1,%%r1\n\t"
  39. "lghi %%r0,-32\n\t"
  40. "ngr %%r0,%[n]\n\t"
  41. "ltgr %%r0,%%r0\n\t"
  42. "jz 1f\n\t"
  43. "srlg %%r0,%%r0,5\n\t"
  44. "0:\n\t"
  45. "pfd 1,1024(%%r1,%[ap0])\n\t"
  46. "pfd 1,1024(%%r1,%[ap1])\n\t"
  47. "pfd 1,1024(%%r1,%[ap2])\n\t"
  48. "pfd 1,1024(%%r1,%[ap3])\n\t"
  49. "pfd 1,1024(%%r1,%[x])\n\t"
  50. "vl %%v16,0(%%r1,%[x])\n\t"
  51. "vl %%v17,16(%%r1,%[x])\n\t"
  52. "vl %%v18,32(%%r1,%[x])\n\t"
  53. "vl %%v19,48(%%r1,%[x])\n\t"
  54. "vl %%v20,64(%%r1,%[x])\n\t"
  55. "vl %%v21,80(%%r1,%[x])\n\t"
  56. "vl %%v22,96(%%r1,%[x])\n\t"
  57. "vl %%v23,112(%%r1,%[x])\n\t"
  58. "vl %%v24,0(%%r1,%[ap0])\n\t"
  59. "vfmasb %%v0,%%v16,%%v24,%%v0\n\t"
  60. "vl %%v25,0(%%r1,%[ap1])\n\t"
  61. "vfmasb %%v1,%%v16,%%v25,%%v1\n\t"
  62. "vl %%v26,0(%%r1,%[ap2])\n\t"
  63. "vfmasb %%v2,%%v16,%%v26,%%v2\n\t"
  64. "vl %%v27,0(%%r1,%[ap3])\n\t"
  65. "vfmasb %%v3,%%v16,%%v27,%%v3\n\t"
  66. "vl %%v28,16(%%r1,%[ap0])\n\t"
  67. "vfmasb %%v4,%%v17,%%v28,%%v4\n\t"
  68. "vl %%v29,16(%%r1,%[ap1])\n\t"
  69. "vfmasb %%v5,%%v17,%%v29,%%v5\n\t"
  70. "vl %%v30,16(%%r1,%[ap2])\n\t"
  71. "vfmasb %%v6,%%v17,%%v30,%%v6\n\t"
  72. "vl %%v31,16(%%r1,%[ap3])\n\t"
  73. "vfmasb %%v7,%%v17,%%v31,%%v7\n\t"
  74. "vl %%v24,32(%%r1,%[ap0])\n\t"
  75. "vfmasb %%v0,%%v18,%%v24,%%v0\n\t"
  76. "vl %%v25,32(%%r1,%[ap1])\n\t"
  77. "vfmasb %%v1,%%v18,%%v25,%%v1\n\t"
  78. "vl %%v26,32(%%r1,%[ap2])\n\t"
  79. "vfmasb %%v2,%%v18,%%v26,%%v2\n\t"
  80. "vl %%v27,32(%%r1,%[ap3])\n\t"
  81. "vfmasb %%v3,%%v18,%%v27,%%v3\n\t"
  82. "vl %%v28,48(%%r1,%[ap0])\n\t"
  83. "vfmasb %%v4,%%v19,%%v28,%%v4\n\t"
  84. "vl %%v29,48(%%r1,%[ap1])\n\t"
  85. "vfmasb %%v5,%%v19,%%v29,%%v5\n\t"
  86. "vl %%v30,48(%%r1,%[ap2])\n\t"
  87. "vfmasb %%v6,%%v19,%%v30,%%v6\n\t"
  88. "vl %%v31,48(%%r1,%[ap3])\n\t"
  89. "vfmasb %%v7,%%v19,%%v31,%%v7\n\t"
  90. "vl %%v24,64(%%r1,%[ap0])\n\t"
  91. "vfmasb %%v0,%%v20,%%v24,%%v0\n\t"
  92. "vl %%v25,64(%%r1,%[ap1])\n\t"
  93. "vfmasb %%v1,%%v20,%%v25,%%v1\n\t"
  94. "vl %%v26,64(%%r1,%[ap2])\n\t"
  95. "vfmasb %%v2,%%v20,%%v26,%%v2\n\t"
  96. "vl %%v27,64(%%r1,%[ap3])\n\t"
  97. "vfmasb %%v3,%%v20,%%v27,%%v3\n\t"
  98. "vl %%v28,80(%%r1,%[ap0])\n\t"
  99. "vfmasb %%v4,%%v21,%%v28,%%v4\n\t"
  100. "vl %%v29,80(%%r1,%[ap1])\n\t"
  101. "vfmasb %%v5,%%v21,%%v29,%%v5\n\t"
  102. "vl %%v30,80(%%r1,%[ap2])\n\t"
  103. "vfmasb %%v6,%%v21,%%v30,%%v6\n\t"
  104. "vl %%v31,80(%%r1,%[ap3])\n\t"
  105. "vfmasb %%v7,%%v21,%%v31,%%v7\n\t"
  106. "vl %%v24,96(%%r1,%[ap0])\n\t"
  107. "vfmasb %%v0,%%v22,%%v24,%%v0\n\t"
  108. "vl %%v25,96(%%r1,%[ap1])\n\t"
  109. "vfmasb %%v1,%%v22,%%v25,%%v1\n\t"
  110. "vl %%v26,96(%%r1,%[ap2])\n\t"
  111. "vfmasb %%v2,%%v22,%%v26,%%v2\n\t"
  112. "vl %%v27,96(%%r1,%[ap3])\n\t"
  113. "vfmasb %%v3,%%v22,%%v27,%%v3\n\t"
  114. "vl %%v28,112(%%r1,%[ap0])\n\t"
  115. "vfmasb %%v4,%%v23,%%v28,%%v4\n\t"
  116. "vl %%v29,112(%%r1,%[ap1])\n\t"
  117. "vfmasb %%v5,%%v23,%%v29,%%v5\n\t"
  118. "vl %%v30,112(%%r1,%[ap2])\n\t"
  119. "vfmasb %%v6,%%v23,%%v30,%%v6\n\t"
  120. "vl %%v31,112(%%r1,%[ap3])\n\t"
  121. "vfmasb %%v7,%%v23,%%v31,%%v7\n\t"
  122. "agfi %%r1,128\n\t"
  123. "brctg %%r0,0b\n\t"
  124. "1:\n\t"
  125. "lghi %%r0,28\n\t"
  126. "ngr %%r0,%[n]\n\t"
  127. "ltgr %%r0,%%r0\n\t"
  128. "jz 3f\n\t"
  129. "srlg %%r0,%%r0,2\n\t"
  130. "2:\n\t"
  131. "vl %%v16,0(%%r1,%[x])\n\t"
  132. "vl %%v24,0(%%r1,%[ap0])\n\t"
  133. "vfmasb %%v0,%%v16,%%v24,%%v0\n\t"
  134. "vl %%v25,0(%%r1,%[ap1])\n\t"
  135. "vfmasb %%v1,%%v16,%%v25,%%v1\n\t"
  136. "vl %%v26,0(%%r1,%[ap2])\n\t"
  137. "vfmasb %%v2,%%v16,%%v26,%%v2\n\t"
  138. "vl %%v27,0(%%r1,%[ap3])\n\t"
  139. "vfmasb %%v3,%%v16,%%v27,%%v3\n\t"
  140. "agfi %%r1,16\n\t"
  141. "brctg %%r0,2b\n\t"
  142. "3:\n\t"
  143. "vfasb %%v0,%%v0,%%v4\n\t"
  144. "vfasb %%v1,%%v1,%%v5\n\t"
  145. "vfasb %%v2,%%v2,%%v6\n\t"
  146. "vfasb %%v3,%%v3,%%v7\n\t"
  147. "veslg %%v4,%%v0,32\n\t"
  148. "vfasb %%v0,%%v0,%%v4\n\t"
  149. "vrepg %%v4,%%v0,1\n\t"
  150. "aebr %%f0,%%f4\n\t"
  151. "ste %%f0,0(%[y])\n\t"
  152. "veslg %%v4,%%v1,32\n\t"
  153. "vfasb %%v1,%%v1,%%v4\n\t"
  154. "vrepg %%v4,%%v1,1\n\t"
  155. "aebr %%f1,%%f4\n\t"
  156. "ste %%f1,4(%[y])\n\t"
  157. "veslg %%v4,%%v2,32\n\t"
  158. "vfasb %%v2,%%v2,%%v4\n\t"
  159. "vrepg %%v4,%%v2,1\n\t"
  160. "aebr %%f2,%%f4\n\t"
  161. "ste %%f2,8(%[y])\n\t"
  162. "veslg %%v4,%%v3,32\n\t"
  163. "vfasb %%v3,%%v3,%%v4\n\t"
  164. "vrepg %%v4,%%v3,1\n\t"
  165. "aebr %%f3,%%f4\n\t"
  166. "ste %%f3,12(%[y])"
  167. : "=m"(*(FLOAT (*)[4]) y)
  168. : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]),
  169. "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]),
  170. "m"(*(const FLOAT (*)[n]) ap[2]),[ap2] "a"(ap[2]),
  171. "m"(*(const FLOAT (*)[n]) ap[3]),[ap3] "a"(ap[3]),
  172. "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n)
  173. : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
  174. "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
  175. "v26", "v27", "v28", "v29", "v30", "v31");
  176. }
  177. static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
  178. __asm__("vzero %%v0\n\t"
  179. "vzero %%v1\n\t"
  180. "vzero %%v2\n\t"
  181. "vzero %%v3\n\t"
  182. "vzero %%v4\n\t"
  183. "vzero %%v5\n\t"
  184. "vzero %%v6\n\t"
  185. "vzero %%v7\n\t"
  186. "xgr %%r1,%%r1\n\t"
  187. "lghi %%r0,-32\n\t"
  188. "ngr %%r0,%[n]\n\t"
  189. "ltgr %%r0,%%r0\n\t"
  190. "jz 1f\n\t"
  191. "srlg %%r0,%%r0,5\n\t"
  192. "0:\n\t"
  193. "pfd 1,1024(%%r1,%[ap0])\n\t"
  194. "pfd 1,1024(%%r1,%[ap1])\n\t"
  195. "pfd 1,1024(%%r1,%[x])\n\t"
  196. "vl %%v16,0(%%r1,%[x])\n\t"
  197. "vl %%v17,16(%%r1,%[x])\n\t"
  198. "vl %%v18,32(%%r1,%[x])\n\t"
  199. "vl %%v19,48(%%r1,%[x])\n\t"
  200. "vl %%v20,64(%%r1,%[x])\n\t"
  201. "vl %%v21,80(%%r1,%[x])\n\t"
  202. "vl %%v22,96(%%r1,%[x])\n\t"
  203. "vl %%v23,112(%%r1,%[x])\n\t"
  204. "vl %%v24,0(%%r1,%[ap0])\n\t"
  205. "vfmasb %%v0,%%v16,%%v24,%%v0\n\t"
  206. "vl %%v25,0(%%r1,%[ap1])\n\t"
  207. "vfmasb %%v1,%%v16,%%v25,%%v1\n\t"
  208. "vl %%v26,16(%%r1,%[ap0])\n\t"
  209. "vfmasb %%v2,%%v17,%%v26,%%v2\n\t"
  210. "vl %%v27,16(%%r1,%[ap1])\n\t"
  211. "vfmasb %%v3,%%v17,%%v27,%%v3\n\t"
  212. "vl %%v28,32(%%r1,%[ap0])\n\t"
  213. "vfmasb %%v4,%%v18,%%v28,%%v4\n\t"
  214. "vl %%v29,32(%%r1,%[ap1])\n\t"
  215. "vfmasb %%v5,%%v18,%%v29,%%v5\n\t"
  216. "vl %%v30,48(%%r1,%[ap0])\n\t"
  217. "vfmasb %%v6,%%v19,%%v30,%%v6\n\t"
  218. "vl %%v31,48(%%r1,%[ap1])\n\t"
  219. "vfmasb %%v7,%%v19,%%v31,%%v7\n\t"
  220. "vl %%v24,64(%%r1,%[ap0])\n\t"
  221. "vfmasb %%v0,%%v20,%%v24,%%v0\n\t"
  222. "vl %%v25,64(%%r1,%[ap1])\n\t"
  223. "vfmasb %%v1,%%v20,%%v25,%%v1\n\t"
  224. "vl %%v26,80(%%r1,%[ap0])\n\t"
  225. "vfmasb %%v2,%%v21,%%v26,%%v2\n\t"
  226. "vl %%v27,80(%%r1,%[ap1])\n\t"
  227. "vfmasb %%v3,%%v21,%%v27,%%v3\n\t"
  228. "vl %%v28,96(%%r1,%[ap0])\n\t"
  229. "vfmasb %%v4,%%v22,%%v28,%%v4\n\t"
  230. "vl %%v29,96(%%r1,%[ap1])\n\t"
  231. "vfmasb %%v5,%%v22,%%v29,%%v5\n\t"
  232. "vl %%v30,112(%%r1,%[ap0])\n\t"
  233. "vfmasb %%v6,%%v23,%%v30,%%v6\n\t"
  234. "vl %%v31,112(%%r1,%[ap1])\n\t"
  235. "vfmasb %%v7,%%v23,%%v31,%%v7\n\t"
  236. "agfi %%r1,128\n\t"
  237. "brctg %%r0,0b\n\t"
  238. "1:\n\t"
  239. "lghi %%r0,28\n\t"
  240. "ngr %%r0,%[n]\n\t"
  241. "ltgr %%r0,%%r0\n\t"
  242. "jz 3f\n\t"
  243. "srlg %%r0,%%r0,2\n\t"
  244. "2:\n\t"
  245. "vl %%v16,0(%%r1,%[x])\n\t"
  246. "vl %%v24,0(%%r1,%[ap0])\n\t"
  247. "vfmasb %%v0,%%v16,%%v24,%%v0\n\t"
  248. "vl %%v25,0(%%r1,%[ap1])\n\t"
  249. "vfmasb %%v1,%%v16,%%v25,%%v1\n\t"
  250. "agfi %%r1,16\n\t"
  251. "brctg %%r0,2b\n\t"
  252. "3:\n\t"
  253. "vfasb %%v0,%%v0,%%v2\n\t"
  254. "vfasb %%v0,%%v0,%%v4\n\t"
  255. "vfasb %%v0,%%v0,%%v6\n\t"
  256. "vfasb %%v1,%%v1,%%v3\n\t"
  257. "vfasb %%v1,%%v1,%%v5\n\t"
  258. "vfasb %%v1,%%v1,%%v7\n\t"
  259. "veslg %%v2,%%v0,32\n\t"
  260. "vfasb %%v0,%%v0,%%v2\n\t"
  261. "vrepg %%v2,%%v0,1\n\t"
  262. "aebr %%f0,%%f2\n\t"
  263. "ste %%f0,0(%[y])\n\t"
  264. "veslg %%v2,%%v1,32\n\t"
  265. "vfasb %%v1,%%v1,%%v2\n\t"
  266. "vrepg %%v2,%%v1,1\n\t"
  267. "aebr %%f1,%%f2\n\t"
  268. "ste %%f1,4(%[y])"
  269. : "=m"(*(FLOAT (*)[2]) y)
  270. : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]),
  271. "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]),
  272. "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n)
  273. : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
  274. "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
  275. "v26", "v27", "v28", "v29", "v30", "v31");
  276. }
  277. static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) {
  278. __asm__("vzero %%v0\n\t"
  279. "vzero %%v1\n\t"
  280. "vzero %%v2\n\t"
  281. "vzero %%v3\n\t"
  282. "vzero %%v4\n\t"
  283. "vzero %%v5\n\t"
  284. "vzero %%v6\n\t"
  285. "vzero %%v7\n\t"
  286. "xgr %%r1,%%r1\n\t"
  287. "lghi %%r0,-32\n\t"
  288. "ngr %%r0,%[n]\n\t"
  289. "ltgr %%r0,%%r0\n\t"
  290. "jz 1f\n\t"
  291. "srlg %%r0,%%r0,5\n\t"
  292. "0:\n\t"
  293. "pfd 1,1024(%%r1,%[a0])\n\t"
  294. "pfd 1,1024(%%r1,%[x])\n\t"
  295. "vl %%v16,0(%%r1,%[x])\n\t"
  296. "vl %%v17,16(%%r1,%[x])\n\t"
  297. "vl %%v18,32(%%r1,%[x])\n\t"
  298. "vl %%v19,48(%%r1,%[x])\n\t"
  299. "vl %%v20,64(%%r1,%[x])\n\t"
  300. "vl %%v21,80(%%r1,%[x])\n\t"
  301. "vl %%v22,96(%%r1,%[x])\n\t"
  302. "vl %%v23,112(%%r1,%[x])\n\t"
  303. "vl %%v24,0(%%r1,%[a0])\n\t"
  304. "vfmasb %%v0,%%v16,%%v24,%%v0\n\t"
  305. "vl %%v25,16(%%r1,%[a0])\n\t"
  306. "vfmasb %%v1,%%v17,%%v25,%%v1\n\t"
  307. "vl %%v26,32(%%r1,%[a0])\n\t"
  308. "vfmasb %%v2,%%v18,%%v26,%%v2\n\t"
  309. "vl %%v27,48(%%r1,%[a0])\n\t"
  310. "vfmasb %%v3,%%v19,%%v27,%%v3\n\t"
  311. "vl %%v28,64(%%r1,%[a0])\n\t"
  312. "vfmasb %%v4,%%v20,%%v28,%%v4\n\t"
  313. "vl %%v29,80(%%r1,%[a0])\n\t"
  314. "vfmasb %%v5,%%v21,%%v29,%%v5\n\t"
  315. "vl %%v30,96(%%r1,%[a0])\n\t"
  316. "vfmasb %%v6,%%v22,%%v30,%%v6\n\t"
  317. "vl %%v31,112(%%r1,%[a0])\n\t"
  318. "vfmasb %%v7,%%v23,%%v31,%%v7\n\t"
  319. "agfi %%r1,128\n\t"
  320. "brctg %%r0,0b\n\t"
  321. "1:\n\t"
  322. "lghi %%r0,28\n\t"
  323. "ngr %%r0,%[n]\n\t"
  324. "ltgr %%r0,%%r0\n\t"
  325. "jz 3f\n\t"
  326. "srlg %%r0,%%r0,2\n\t"
  327. "2:\n\t"
  328. "vl %%v16,0(%%r1,%[x])\n\t"
  329. "vl %%v24,0(%%r1,%[a0])\n\t"
  330. "vfmasb %%v0,%%v16,%%v24,%%v0\n\t"
  331. "agfi %%r1,16\n\t"
  332. "brctg %%r0,2b\n\t"
  333. "3:\n\t"
  334. "vfasb %%v0,%%v0,%%v1\n\t"
  335. "vfasb %%v0,%%v0,%%v2\n\t"
  336. "vfasb %%v0,%%v0,%%v3\n\t"
  337. "vfasb %%v0,%%v0,%%v4\n\t"
  338. "vfasb %%v0,%%v0,%%v5\n\t"
  339. "vfasb %%v0,%%v0,%%v6\n\t"
  340. "vfasb %%v0,%%v0,%%v7\n\t"
  341. "veslg %%v1,%%v0,32\n\t"
  342. "vfasb %%v0,%%v0,%%v1\n\t"
  343. "vrepg %%v1,%%v0,1\n\t"
  344. "aebr %%f0,%%f1\n\t"
  345. "ste %%f0,0(%[y])"
  346. : "=m"(*(FLOAT (*)[1]) y)
  347. : [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0),
  348. "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n)
  349. : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
  350. "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
  351. "v26", "v27", "v28", "v29", "v30", "v31");
  352. }
  353. static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
  354. BLASLONG i;
  355. for (i = 0; i < n; i++) {
  356. dest[i] = *src;
  357. src += inc_src;
  358. }
  359. }
  360. static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) {
  361. __asm__("vlrepf %%v0,%[da]\n\t"
  362. "xgr %%r1,%%r1\n\t"
  363. "lghi %%r0,-32\n\t"
  364. "ngr %%r0,%[n]\n\t"
  365. "ltgr %%r0,%%r0\n\t"
  366. "jz 1f\n\t"
  367. "srlg %%r0,%%r0,5\n\t"
  368. "0:\n\t"
  369. "pfd 1,1024(%%r1,%[src])\n\t"
  370. "pfd 2,1024(%%r1,%[dest])\n\t"
  371. "vl %%v16,0(%%r1,%[src])\n\t"
  372. "vl %%v17,16(%%r1,%[src])\n\t"
  373. "vl %%v18,32(%%r1,%[src])\n\t"
  374. "vl %%v19,48(%%r1,%[src])\n\t"
  375. "vl %%v20,64(%%r1,%[src])\n\t"
  376. "vl %%v21,80(%%r1,%[src])\n\t"
  377. "vl %%v22,96(%%r1,%[src])\n\t"
  378. "vl %%v23,112(%%r1,%[src])\n\t"
  379. "vl %%v24, 0(%%r1,%[dest])\n\t"
  380. "vfmasb %%v24,%%v16,%%v0,%%v24\n\t"
  381. "vst %%v24, 0(%%r1,%[dest])\n\t"
  382. "vl %%v25, 16(%%r1,%[dest])\n\t"
  383. "vfmasb %%v25,%%v17,%%v0,%%v25\n\t"
  384. "vst %%v25, 16(%%r1,%[dest])\n\t"
  385. "vl %%v26, 32(%%r1,%[dest])\n\t"
  386. "vfmasb %%v26,%%v18,%%v0,%%v26\n\t"
  387. "vst %%v26, 32(%%r1,%[dest])\n\t"
  388. "vl %%v27, 48(%%r1,%[dest])\n\t"
  389. "vfmasb %%v27,%%v19,%%v0,%%v27\n\t"
  390. "vst %%v27, 48(%%r1,%[dest])\n\t"
  391. "vl %%v28, 64(%%r1,%[dest])\n\t"
  392. "vfmasb %%v28,%%v20,%%v0,%%v28\n\t"
  393. "vst %%v28, 64(%%r1,%[dest])\n\t"
  394. "vl %%v29, 80(%%r1,%[dest])\n\t"
  395. "vfmasb %%v29,%%v21,%%v0,%%v29\n\t"
  396. "vst %%v29, 80(%%r1,%[dest])\n\t"
  397. "vl %%v30, 96(%%r1,%[dest])\n\t"
  398. "vfmasb %%v30,%%v22,%%v0,%%v30\n\t"
  399. "vst %%v30, 96(%%r1,%[dest])\n\t"
  400. "vl %%v31, 112(%%r1,%[dest])\n\t"
  401. "vfmasb %%v31,%%v23,%%v0,%%v31\n\t"
  402. "vst %%v31, 112(%%r1,%[dest])\n\t"
  403. "agfi %%r1,128\n\t"
  404. "brctg %%r0,0b\n\t"
  405. "1:\n\t"
  406. "lghi %%r0,28\n\t"
  407. "ngr %%r0,%[n]\n\t"
  408. "ltgr %%r0,%%r0\n\t"
  409. "jz 3f\n\t"
  410. "srlg %%r0,%%r0,2\n\t"
  411. "2:\n\t"
  412. "vl %%v16,0(%%r1,%[src])\n\t"
  413. "vl %%v24, 0(%%r1,%[dest])\n\t"
  414. "vfmasb %%v24,%%v16,%%v0,%%v24\n\t"
  415. "vst %%v24, 0(%%r1,%[dest])\n\t"
  416. "agfi %%r1,16\n\t"
  417. "brctg %%r0,2b\n\t"
  418. "3:\n\t"
  419. "nop"
  420. : "+m"(*(FLOAT (*)[n]) dest)
  421. : [dest] "a"(dest),[da] "m"(da), "m"(*(const FLOAT (*)[n]) src),
  422. [src] "a"(src),[n] "r"(n)
  423. : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21",
  424. "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
  425. "v31");
  426. }
  427. static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest,
  428. BLASLONG inc_dest) {
  429. if (inc_dest == 1)
  430. add_y_kernel_4(n, da, src, dest);
  431. else {
  432. BLASLONG i;
  433. for (i = 0; i < n; i++) {
  434. *dest += src[i] * da;
  435. dest += inc_dest;
  436. }
  437. }
  438. }
  439. int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a,
  440. BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
  441. FLOAT *buffer) {
  442. BLASLONG register i;
  443. BLASLONG register j;
  444. FLOAT *a_ptr;
  445. FLOAT *x_ptr;
  446. FLOAT *y_ptr;
  447. BLASLONG n0;
  448. BLASLONG n1;
  449. BLASLONG m1;
  450. BLASLONG m2;
  451. BLASLONG m3;
  452. BLASLONG n2;
  453. FLOAT ybuffer[2] __attribute__ ((aligned(16)));
  454. FLOAT *xbuffer;
  455. FLOAT *ytemp;
  456. if (m < 1)
  457. return (0);
  458. if (n < 1)
  459. return (0);
  460. xbuffer = buffer;
  461. ytemp = buffer + (m < NBMAX ? m : NBMAX);
  462. n0 = n / NBMAX;
  463. n1 = (n % NBMAX) >> 2;
  464. n2 = n & 3;
  465. m3 = m & 3;
  466. m1 = m & -4;
  467. m2 = (m & (NBMAX - 1)) - m3;
  468. BLASLONG NB = NBMAX;
  469. while (NB == NBMAX) {
  470. m1 -= NB;
  471. if (m1 < 0) {
  472. if (m2 == 0)
  473. break;
  474. NB = m2;
  475. }
  476. y_ptr = y;
  477. a_ptr = a;
  478. x_ptr = x;
  479. if (inc_x == 1)
  480. xbuffer = x_ptr;
  481. else
  482. copy_x(NB, x_ptr, xbuffer, inc_x);
  483. FLOAT *ap[4];
  484. FLOAT *yp;
  485. BLASLONG register lda4 = 4 * lda;
  486. ap[0] = a_ptr;
  487. ap[1] = a_ptr + lda;
  488. ap[2] = ap[1] + lda;
  489. ap[3] = ap[2] + lda;
  490. if (n0 > 0) {
  491. BLASLONG nb1 = NBMAX / 4;
  492. for (j = 0; j < n0; j++) {
  493. yp = ytemp;
  494. for (i = 0; i < nb1; i++) {
  495. sgemv_kernel_4x4(NB, ap, xbuffer, yp);
  496. ap[0] += lda4;
  497. ap[1] += lda4;
  498. ap[2] += lda4;
  499. ap[3] += lda4;
  500. yp += 4;
  501. }
  502. add_y(nb1 * 4, alpha, ytemp, y_ptr, inc_y);
  503. y_ptr += nb1 * inc_y * 4;
  504. a_ptr += nb1 * lda4;
  505. }
  506. }
  507. yp = ytemp;
  508. for (i = 0; i < n1; i++) {
  509. sgemv_kernel_4x4(NB, ap, xbuffer, yp);
  510. ap[0] += lda4;
  511. ap[1] += lda4;
  512. ap[2] += lda4;
  513. ap[3] += lda4;
  514. yp += 4;
  515. }
  516. if (n1 > 0) {
  517. add_y(n1 * 4, alpha, ytemp, y_ptr, inc_y);
  518. y_ptr += n1 * inc_y * 4;
  519. a_ptr += n1 * lda4;
  520. }
  521. if (n2 & 2) {
  522. sgemv_kernel_4x2(NB, ap, xbuffer, ybuffer);
  523. a_ptr += lda * 2;
  524. *y_ptr += ybuffer[0] * alpha;
  525. y_ptr += inc_y;
  526. *y_ptr += ybuffer[1] * alpha;
  527. y_ptr += inc_y;
  528. }
  529. if (n2 & 1) {
  530. sgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer);
  531. // a_ptr += lda;
  532. *y_ptr += ybuffer[0] * alpha;
  533. // y_ptr += inc_y;
  534. }
  535. a += NB;
  536. x += NB * inc_x;
  537. }
  538. if (m3 == 0)
  539. return (0);
  540. x_ptr = x;
  541. a_ptr = a;
  542. if (m3 == 3) {
  543. FLOAT xtemp0 = *x_ptr * alpha;
  544. x_ptr += inc_x;
  545. FLOAT xtemp1 = *x_ptr * alpha;
  546. x_ptr += inc_x;
  547. FLOAT xtemp2 = *x_ptr * alpha;
  548. FLOAT *aj = a_ptr;
  549. y_ptr = y;
  550. if (lda == 3 && inc_y == 1) {
  551. for (j = 0; j < (n & -4); j += 4) {
  552. y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
  553. y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2;
  554. y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2;
  555. y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2;
  556. aj += 12;
  557. }
  558. for (; j < n; j++) {
  559. y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
  560. aj += 3;
  561. }
  562. } else {
  563. if (inc_y == 1) {
  564. BLASLONG register lda2 = lda << 1;
  565. BLASLONG register lda4 = lda << 2;
  566. BLASLONG register lda3 = lda2 + lda;
  567. for (j = 0; j < (n & -4); j += 4) {
  568. y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
  569. y_ptr[j + 1] +=
  570. *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda +
  571. 2) * xtemp2;
  572. y_ptr[j + 2] +=
  573. *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 +
  574. 2) * xtemp2;
  575. y_ptr[j + 3] +=
  576. *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 +
  577. 2) * xtemp2;
  578. aj += lda4;
  579. }
  580. for (; j < n; j++) {
  581. y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
  582. aj += lda;
  583. }
  584. } else {
  585. for (j = 0; j < n; j++) {
  586. *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
  587. y_ptr += inc_y;
  588. aj += lda;
  589. }
  590. }
  591. }
  592. return (0);
  593. }
  594. if (m3 == 2) {
  595. FLOAT xtemp0 = *x_ptr * alpha;
  596. x_ptr += inc_x;
  597. FLOAT xtemp1 = *x_ptr * alpha;
  598. FLOAT *aj = a_ptr;
  599. y_ptr = y;
  600. if (lda == 2 && inc_y == 1) {
  601. for (j = 0; j < (n & -4); j += 4) {
  602. y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1;
  603. y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1;
  604. y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1;
  605. y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1;
  606. aj += 8;
  607. }
  608. for (; j < n; j++) {
  609. y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1;
  610. aj += 2;
  611. }
  612. } else {
  613. if (inc_y == 1) {
  614. BLASLONG register lda2 = lda << 1;
  615. BLASLONG register lda4 = lda << 2;
  616. BLASLONG register lda3 = lda2 + lda;
  617. for (j = 0; j < (n & -4); j += 4) {
  618. y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1;
  619. y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1;
  620. y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1;
  621. y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1;
  622. aj += lda4;
  623. }
  624. for (; j < n; j++) {
  625. y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1;
  626. aj += lda;
  627. }
  628. } else {
  629. for (j = 0; j < n; j++) {
  630. *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1;
  631. y_ptr += inc_y;
  632. aj += lda;
  633. }
  634. }
  635. }
  636. return (0);
  637. }
  638. FLOAT xtemp = *x_ptr * alpha;
  639. FLOAT *aj = a_ptr;
  640. y_ptr = y;
  641. if (lda == 1 && inc_y == 1) {
  642. for (j = 0; j < (n & -4); j += 4) {
  643. y_ptr[j] += aj[j] * xtemp;
  644. y_ptr[j + 1] += aj[j + 1] * xtemp;
  645. y_ptr[j + 2] += aj[j + 2] * xtemp;
  646. y_ptr[j + 3] += aj[j + 3] * xtemp;
  647. }
  648. for (; j < n; j++) {
  649. y_ptr[j] += aj[j] * xtemp;
  650. }
  651. } else {
  652. if (inc_y == 1) {
  653. BLASLONG register lda2 = lda << 1;
  654. BLASLONG register lda4 = lda << 2;
  655. BLASLONG register lda3 = lda2 + lda;
  656. for (j = 0; j < (n & -4); j += 4) {
  657. y_ptr[j] += *aj * xtemp;
  658. y_ptr[j + 1] += *(aj + lda) * xtemp;
  659. y_ptr[j + 2] += *(aj + lda2) * xtemp;
  660. y_ptr[j + 3] += *(aj + lda3) * xtemp;
  661. aj += lda4;
  662. }
  663. for (; j < n; j++) {
  664. y_ptr[j] += *aj * xtemp;
  665. aj += lda;
  666. }
  667. } else {
  668. for (j = 0; j < n; j++) {
  669. *y_ptr += *aj * xtemp;
  670. y_ptr += inc_y;
  671. aj += lda;
  672. }
  673. }
  674. }
  675. return (0);
  676. }