/* sgemv_t_4.c */
  /***************************************************************************
  Copyright (c) 2019, The OpenBLAS Project
  All rights reserved.
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are
  met:
  1. Redistributions of source code must retain the above copyright
  notice, this list of conditions and the following disclaimer.
  2. Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer in
  the documentation and/or other materials provided with the
  distribution.
  3. Neither the name of the OpenBLAS project nor the names of
  its contributors may be used to endorse or promote products
  derived from this software without specific prior written permission.
  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *****************************************************************************/
  27. #include "common.h"
  28. #define NBMAX 2048
  29. static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
  30. register FLOAT *ap0 = ap[0];
  31. register FLOAT *ap1 = ap[1];
  32. register FLOAT *ap2 = ap[2];
  33. register FLOAT *ap3 = ap[3];
  34. __asm__("vzero %%v0\n\t"
  35. "vzero %%v1\n\t"
  36. "vzero %%v2\n\t"
  37. "vzero %%v3\n\t"
  38. "vzero %%v4\n\t"
  39. "vzero %%v5\n\t"
  40. "vzero %%v6\n\t"
  41. "vzero %%v7\n\t"
  42. "xgr %%r1,%%r1\n\t"
  43. "lghi %%r0,-32\n\t"
  44. "ngr %%r0,%[n]\n\t"
  45. "ltgr %%r0,%%r0\n\t"
  46. "jz 1f\n\t"
  47. "srlg %%r0,%%r0,5\n\t"
  48. "0:\n\t"
  49. "pfd 1,1024(%%r1,%[ap0])\n\t"
  50. "pfd 1,1024(%%r1,%[ap1])\n\t"
  51. "pfd 1,1024(%%r1,%[ap2])\n\t"
  52. "pfd 1,1024(%%r1,%[ap3])\n\t"
  53. "pfd 1,1024(%%r1,%[x])\n\t"
  54. "vl %%v16,0(%%r1,%[x])\n\t"
  55. "vl %%v17,16(%%r1,%[x])\n\t"
  56. "vl %%v18,32(%%r1,%[x])\n\t"
  57. "vl %%v19,48(%%r1,%[x])\n\t"
  58. "vl %%v20,64(%%r1,%[x])\n\t"
  59. "vl %%v21,80(%%r1,%[x])\n\t"
  60. "vl %%v22,96(%%r1,%[x])\n\t"
  61. "vl %%v23,112(%%r1,%[x])\n\t"
  62. "vl %%v24,0(%%r1,%[ap0])\n\t"
  63. "vfmasb %%v0,%%v16,%%v24,%%v0\n\t"
  64. "vl %%v25,0(%%r1,%[ap1])\n\t"
  65. "vfmasb %%v1,%%v16,%%v25,%%v1\n\t"
  66. "vl %%v26,0(%%r1,%[ap2])\n\t"
  67. "vfmasb %%v2,%%v16,%%v26,%%v2\n\t"
  68. "vl %%v27,0(%%r1,%[ap3])\n\t"
  69. "vfmasb %%v3,%%v16,%%v27,%%v3\n\t"
  70. "vl %%v28,16(%%r1,%[ap0])\n\t"
  71. "vfmasb %%v4,%%v17,%%v28,%%v4\n\t"
  72. "vl %%v29,16(%%r1,%[ap1])\n\t"
  73. "vfmasb %%v5,%%v17,%%v29,%%v5\n\t"
  74. "vl %%v30,16(%%r1,%[ap2])\n\t"
  75. "vfmasb %%v6,%%v17,%%v30,%%v6\n\t"
  76. "vl %%v31,16(%%r1,%[ap3])\n\t"
  77. "vfmasb %%v7,%%v17,%%v31,%%v7\n\t"
  78. "vl %%v24,32(%%r1,%[ap0])\n\t"
  79. "vfmasb %%v0,%%v18,%%v24,%%v0\n\t"
  80. "vl %%v25,32(%%r1,%[ap1])\n\t"
  81. "vfmasb %%v1,%%v18,%%v25,%%v1\n\t"
  82. "vl %%v26,32(%%r1,%[ap2])\n\t"
  83. "vfmasb %%v2,%%v18,%%v26,%%v2\n\t"
  84. "vl %%v27,32(%%r1,%[ap3])\n\t"
  85. "vfmasb %%v3,%%v18,%%v27,%%v3\n\t"
  86. "vl %%v28,48(%%r1,%[ap0])\n\t"
  87. "vfmasb %%v4,%%v19,%%v28,%%v4\n\t"
  88. "vl %%v29,48(%%r1,%[ap1])\n\t"
  89. "vfmasb %%v5,%%v19,%%v29,%%v5\n\t"
  90. "vl %%v30,48(%%r1,%[ap2])\n\t"
  91. "vfmasb %%v6,%%v19,%%v30,%%v6\n\t"
  92. "vl %%v31,48(%%r1,%[ap3])\n\t"
  93. "vfmasb %%v7,%%v19,%%v31,%%v7\n\t"
  94. "vl %%v24,64(%%r1,%[ap0])\n\t"
  95. "vfmasb %%v0,%%v20,%%v24,%%v0\n\t"
  96. "vl %%v25,64(%%r1,%[ap1])\n\t"
  97. "vfmasb %%v1,%%v20,%%v25,%%v1\n\t"
  98. "vl %%v26,64(%%r1,%[ap2])\n\t"
  99. "vfmasb %%v2,%%v20,%%v26,%%v2\n\t"
  100. "vl %%v27,64(%%r1,%[ap3])\n\t"
  101. "vfmasb %%v3,%%v20,%%v27,%%v3\n\t"
  102. "vl %%v28,80(%%r1,%[ap0])\n\t"
  103. "vfmasb %%v4,%%v21,%%v28,%%v4\n\t"
  104. "vl %%v29,80(%%r1,%[ap1])\n\t"
  105. "vfmasb %%v5,%%v21,%%v29,%%v5\n\t"
  106. "vl %%v30,80(%%r1,%[ap2])\n\t"
  107. "vfmasb %%v6,%%v21,%%v30,%%v6\n\t"
  108. "vl %%v31,80(%%r1,%[ap3])\n\t"
  109. "vfmasb %%v7,%%v21,%%v31,%%v7\n\t"
  110. "vl %%v24,96(%%r1,%[ap0])\n\t"
  111. "vfmasb %%v0,%%v22,%%v24,%%v0\n\t"
  112. "vl %%v25,96(%%r1,%[ap1])\n\t"
  113. "vfmasb %%v1,%%v22,%%v25,%%v1\n\t"
  114. "vl %%v26,96(%%r1,%[ap2])\n\t"
  115. "vfmasb %%v2,%%v22,%%v26,%%v2\n\t"
  116. "vl %%v27,96(%%r1,%[ap3])\n\t"
  117. "vfmasb %%v3,%%v22,%%v27,%%v3\n\t"
  118. "vl %%v28,112(%%r1,%[ap0])\n\t"
  119. "vfmasb %%v4,%%v23,%%v28,%%v4\n\t"
  120. "vl %%v29,112(%%r1,%[ap1])\n\t"
  121. "vfmasb %%v5,%%v23,%%v29,%%v5\n\t"
  122. "vl %%v30,112(%%r1,%[ap2])\n\t"
  123. "vfmasb %%v6,%%v23,%%v30,%%v6\n\t"
  124. "vl %%v31,112(%%r1,%[ap3])\n\t"
  125. "vfmasb %%v7,%%v23,%%v31,%%v7\n\t"
  126. "agfi %%r1,128\n\t"
  127. "brctg %%r0,0b\n\t"
  128. "1:\n\t"
  129. "lghi %%r0,28\n\t"
  130. "ngr %%r0,%[n]\n\t"
  131. "ltgr %%r0,%%r0\n\t"
  132. "jz 3f\n\t"
  133. "srlg %%r0,%%r0,2\n\t"
  134. "2:\n\t"
  135. "vl %%v16,0(%%r1,%[x])\n\t"
  136. "vl %%v24,0(%%r1,%[ap0])\n\t"
  137. "vfmasb %%v0,%%v16,%%v24,%%v0\n\t"
  138. "vl %%v25,0(%%r1,%[ap1])\n\t"
  139. "vfmasb %%v1,%%v16,%%v25,%%v1\n\t"
  140. "vl %%v26,0(%%r1,%[ap2])\n\t"
  141. "vfmasb %%v2,%%v16,%%v26,%%v2\n\t"
  142. "vl %%v27,0(%%r1,%[ap3])\n\t"
  143. "vfmasb %%v3,%%v16,%%v27,%%v3\n\t"
  144. "agfi %%r1,16\n\t"
  145. "brctg %%r0,2b\n\t"
  146. "3:\n\t"
  147. "vfasb %%v0,%%v0,%%v4\n\t"
  148. "vfasb %%v1,%%v1,%%v5\n\t"
  149. "vfasb %%v2,%%v2,%%v6\n\t"
  150. "vfasb %%v3,%%v3,%%v7\n\t"
  151. "veslg %%v4,%%v0,32\n\t"
  152. "vfasb %%v0,%%v0,%%v4\n\t"
  153. "vrepg %%v4,%%v0,1\n\t"
  154. "aebr %%f0,%%f4\n\t"
  155. "ste %%f0,0(%[y])\n\t"
  156. "veslg %%v4,%%v1,32\n\t"
  157. "vfasb %%v1,%%v1,%%v4\n\t"
  158. "vrepg %%v4,%%v1,1\n\t"
  159. "aebr %%f1,%%f4\n\t"
  160. "ste %%f1,4(%[y])\n\t"
  161. "veslg %%v4,%%v2,32\n\t"
  162. "vfasb %%v2,%%v2,%%v4\n\t"
  163. "vrepg %%v4,%%v2,1\n\t"
  164. "aebr %%f2,%%f4\n\t"
  165. "ste %%f2,8(%[y])\n\t"
  166. "veslg %%v4,%%v3,32\n\t"
  167. "vfasb %%v3,%%v3,%%v4\n\t"
  168. "vrepg %%v4,%%v3,1\n\t"
  169. "aebr %%f3,%%f4\n\t"
  170. "ste %%f3,12(%[y])"
  171. : "=m"(*(FLOAT (*)[4]) y)
  172. : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap0),[ap0] "a"(ap0),
  173. "m"(*(const FLOAT (*)[n]) ap1),[ap1] "a"(ap1),
  174. "m"(*(const FLOAT (*)[n]) ap2),[ap2] "a"(ap2),
  175. "m"(*(const FLOAT (*)[n]) ap3),[ap3] "a"(ap3),
  176. "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n)
  177. : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
  178. "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
  179. "v26", "v27", "v28", "v29", "v30", "v31");
  180. }
  181. static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
  182. register FLOAT *ap0 = ap[0];
  183. register FLOAT *ap1 = ap[1];
  184. __asm__("vzero %%v0\n\t"
  185. "vzero %%v1\n\t"
  186. "vzero %%v2\n\t"
  187. "vzero %%v3\n\t"
  188. "vzero %%v4\n\t"
  189. "vzero %%v5\n\t"
  190. "vzero %%v6\n\t"
  191. "vzero %%v7\n\t"
  192. "xgr %%r1,%%r1\n\t"
  193. "lghi %%r0,-32\n\t"
  194. "ngr %%r0,%[n]\n\t"
  195. "ltgr %%r0,%%r0\n\t"
  196. "jz 1f\n\t"
  197. "srlg %%r0,%%r0,5\n\t"
  198. "0:\n\t"
  199. "pfd 1,1024(%%r1,%[ap0])\n\t"
  200. "pfd 1,1024(%%r1,%[ap1])\n\t"
  201. "pfd 1,1024(%%r1,%[x])\n\t"
  202. "vl %%v16,0(%%r1,%[x])\n\t"
  203. "vl %%v17,16(%%r1,%[x])\n\t"
  204. "vl %%v18,32(%%r1,%[x])\n\t"
  205. "vl %%v19,48(%%r1,%[x])\n\t"
  206. "vl %%v20,64(%%r1,%[x])\n\t"
  207. "vl %%v21,80(%%r1,%[x])\n\t"
  208. "vl %%v22,96(%%r1,%[x])\n\t"
  209. "vl %%v23,112(%%r1,%[x])\n\t"
  210. "vl %%v24,0(%%r1,%[ap0])\n\t"
  211. "vfmasb %%v0,%%v16,%%v24,%%v0\n\t"
  212. "vl %%v25,0(%%r1,%[ap1])\n\t"
  213. "vfmasb %%v1,%%v16,%%v25,%%v1\n\t"
  214. "vl %%v26,16(%%r1,%[ap0])\n\t"
  215. "vfmasb %%v2,%%v17,%%v26,%%v2\n\t"
  216. "vl %%v27,16(%%r1,%[ap1])\n\t"
  217. "vfmasb %%v3,%%v17,%%v27,%%v3\n\t"
  218. "vl %%v28,32(%%r1,%[ap0])\n\t"
  219. "vfmasb %%v4,%%v18,%%v28,%%v4\n\t"
  220. "vl %%v29,32(%%r1,%[ap1])\n\t"
  221. "vfmasb %%v5,%%v18,%%v29,%%v5\n\t"
  222. "vl %%v30,48(%%r1,%[ap0])\n\t"
  223. "vfmasb %%v6,%%v19,%%v30,%%v6\n\t"
  224. "vl %%v31,48(%%r1,%[ap1])\n\t"
  225. "vfmasb %%v7,%%v19,%%v31,%%v7\n\t"
  226. "vl %%v24,64(%%r1,%[ap0])\n\t"
  227. "vfmasb %%v0,%%v20,%%v24,%%v0\n\t"
  228. "vl %%v25,64(%%r1,%[ap1])\n\t"
  229. "vfmasb %%v1,%%v20,%%v25,%%v1\n\t"
  230. "vl %%v26,80(%%r1,%[ap0])\n\t"
  231. "vfmasb %%v2,%%v21,%%v26,%%v2\n\t"
  232. "vl %%v27,80(%%r1,%[ap1])\n\t"
  233. "vfmasb %%v3,%%v21,%%v27,%%v3\n\t"
  234. "vl %%v28,96(%%r1,%[ap0])\n\t"
  235. "vfmasb %%v4,%%v22,%%v28,%%v4\n\t"
  236. "vl %%v29,96(%%r1,%[ap1])\n\t"
  237. "vfmasb %%v5,%%v22,%%v29,%%v5\n\t"
  238. "vl %%v30,112(%%r1,%[ap0])\n\t"
  239. "vfmasb %%v6,%%v23,%%v30,%%v6\n\t"
  240. "vl %%v31,112(%%r1,%[ap1])\n\t"
  241. "vfmasb %%v7,%%v23,%%v31,%%v7\n\t"
  242. "agfi %%r1,128\n\t"
  243. "brctg %%r0,0b\n\t"
  244. "1:\n\t"
  245. "lghi %%r0,28\n\t"
  246. "ngr %%r0,%[n]\n\t"
  247. "ltgr %%r0,%%r0\n\t"
  248. "jz 3f\n\t"
  249. "srlg %%r0,%%r0,2\n\t"
  250. "2:\n\t"
  251. "vl %%v16,0(%%r1,%[x])\n\t"
  252. "vl %%v24,0(%%r1,%[ap0])\n\t"
  253. "vfmasb %%v0,%%v16,%%v24,%%v0\n\t"
  254. "vl %%v25,0(%%r1,%[ap1])\n\t"
  255. "vfmasb %%v1,%%v16,%%v25,%%v1\n\t"
  256. "agfi %%r1,16\n\t"
  257. "brctg %%r0,2b\n\t"
  258. "3:\n\t"
  259. "vfasb %%v0,%%v0,%%v2\n\t"
  260. "vfasb %%v0,%%v0,%%v4\n\t"
  261. "vfasb %%v0,%%v0,%%v6\n\t"
  262. "vfasb %%v1,%%v1,%%v3\n\t"
  263. "vfasb %%v1,%%v1,%%v5\n\t"
  264. "vfasb %%v1,%%v1,%%v7\n\t"
  265. "veslg %%v2,%%v0,32\n\t"
  266. "vfasb %%v0,%%v0,%%v2\n\t"
  267. "vrepg %%v2,%%v0,1\n\t"
  268. "aebr %%f0,%%f2\n\t"
  269. "ste %%f0,0(%[y])\n\t"
  270. "veslg %%v2,%%v1,32\n\t"
  271. "vfasb %%v1,%%v1,%%v2\n\t"
  272. "vrepg %%v2,%%v1,1\n\t"
  273. "aebr %%f1,%%f2\n\t"
  274. "ste %%f1,4(%[y])"
  275. : "=m"(*(FLOAT (*)[2]) y)
  276. : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap0),[ap0] "a"(ap0),
  277. "m"(*(const FLOAT (*)[n]) ap1),[ap1] "a"(ap1),
  278. "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n)
  279. : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
  280. "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
  281. "v26", "v27", "v28", "v29", "v30", "v31");
  282. }
  283. static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) {
  284. __asm__("vzero %%v0\n\t"
  285. "vzero %%v1\n\t"
  286. "vzero %%v2\n\t"
  287. "vzero %%v3\n\t"
  288. "vzero %%v4\n\t"
  289. "vzero %%v5\n\t"
  290. "vzero %%v6\n\t"
  291. "vzero %%v7\n\t"
  292. "xgr %%r1,%%r1\n\t"
  293. "lghi %%r0,-32\n\t"
  294. "ngr %%r0,%[n]\n\t"
  295. "ltgr %%r0,%%r0\n\t"
  296. "jz 1f\n\t"
  297. "srlg %%r0,%%r0,5\n\t"
  298. "0:\n\t"
  299. "pfd 1,1024(%%r1,%[a0])\n\t"
  300. "pfd 1,1024(%%r1,%[x])\n\t"
  301. "vl %%v16,0(%%r1,%[x])\n\t"
  302. "vl %%v17,16(%%r1,%[x])\n\t"
  303. "vl %%v18,32(%%r1,%[x])\n\t"
  304. "vl %%v19,48(%%r1,%[x])\n\t"
  305. "vl %%v20,64(%%r1,%[x])\n\t"
  306. "vl %%v21,80(%%r1,%[x])\n\t"
  307. "vl %%v22,96(%%r1,%[x])\n\t"
  308. "vl %%v23,112(%%r1,%[x])\n\t"
  309. "vl %%v24,0(%%r1,%[a0])\n\t"
  310. "vfmasb %%v0,%%v16,%%v24,%%v0\n\t"
  311. "vl %%v25,16(%%r1,%[a0])\n\t"
  312. "vfmasb %%v1,%%v17,%%v25,%%v1\n\t"
  313. "vl %%v26,32(%%r1,%[a0])\n\t"
  314. "vfmasb %%v2,%%v18,%%v26,%%v2\n\t"
  315. "vl %%v27,48(%%r1,%[a0])\n\t"
  316. "vfmasb %%v3,%%v19,%%v27,%%v3\n\t"
  317. "vl %%v28,64(%%r1,%[a0])\n\t"
  318. "vfmasb %%v4,%%v20,%%v28,%%v4\n\t"
  319. "vl %%v29,80(%%r1,%[a0])\n\t"
  320. "vfmasb %%v5,%%v21,%%v29,%%v5\n\t"
  321. "vl %%v30,96(%%r1,%[a0])\n\t"
  322. "vfmasb %%v6,%%v22,%%v30,%%v6\n\t"
  323. "vl %%v31,112(%%r1,%[a0])\n\t"
  324. "vfmasb %%v7,%%v23,%%v31,%%v7\n\t"
  325. "agfi %%r1,128\n\t"
  326. "brctg %%r0,0b\n\t"
  327. "1:\n\t"
  328. "lghi %%r0,28\n\t"
  329. "ngr %%r0,%[n]\n\t"
  330. "ltgr %%r0,%%r0\n\t"
  331. "jz 3f\n\t"
  332. "srlg %%r0,%%r0,2\n\t"
  333. "2:\n\t"
  334. "vl %%v16,0(%%r1,%[x])\n\t"
  335. "vl %%v24,0(%%r1,%[a0])\n\t"
  336. "vfmasb %%v0,%%v16,%%v24,%%v0\n\t"
  337. "agfi %%r1,16\n\t"
  338. "brctg %%r0,2b\n\t"
  339. "3:\n\t"
  340. "vfasb %%v0,%%v0,%%v1\n\t"
  341. "vfasb %%v0,%%v0,%%v2\n\t"
  342. "vfasb %%v0,%%v0,%%v3\n\t"
  343. "vfasb %%v0,%%v0,%%v4\n\t"
  344. "vfasb %%v0,%%v0,%%v5\n\t"
  345. "vfasb %%v0,%%v0,%%v6\n\t"
  346. "vfasb %%v0,%%v0,%%v7\n\t"
  347. "veslg %%v1,%%v0,32\n\t"
  348. "vfasb %%v0,%%v0,%%v1\n\t"
  349. "vrepg %%v1,%%v0,1\n\t"
  350. "aebr %%f0,%%f1\n\t"
  351. "ste %%f0,0(%[y])"
  352. : "=m"(*(FLOAT (*)[1]) y)
  353. : [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0),
  354. "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n)
  355. : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
  356. "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
  357. "v26", "v27", "v28", "v29", "v30", "v31");
  358. }
  359. static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
  360. BLASLONG i;
  361. for (i = 0; i < n; i++) {
  362. dest[i] = *src;
  363. src += inc_src;
  364. }
  365. }
  366. static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) {
  367. __asm__("vlrepf %%v0,%[da]\n\t"
  368. "xgr %%r1,%%r1\n\t"
  369. "lghi %%r0,-32\n\t"
  370. "ngr %%r0,%[n]\n\t"
  371. "ltgr %%r0,%%r0\n\t"
  372. "jz 1f\n\t"
  373. "srlg %%r0,%%r0,5\n\t"
  374. "0:\n\t"
  375. "pfd 1,1024(%%r1,%[src])\n\t"
  376. "pfd 2,1024(%%r1,%[dest])\n\t"
  377. "vl %%v16,0(%%r1,%[src])\n\t"
  378. "vl %%v17,16(%%r1,%[src])\n\t"
  379. "vl %%v18,32(%%r1,%[src])\n\t"
  380. "vl %%v19,48(%%r1,%[src])\n\t"
  381. "vl %%v20,64(%%r1,%[src])\n\t"
  382. "vl %%v21,80(%%r1,%[src])\n\t"
  383. "vl %%v22,96(%%r1,%[src])\n\t"
  384. "vl %%v23,112(%%r1,%[src])\n\t"
  385. "vl %%v24, 0(%%r1,%[dest])\n\t"
  386. "vfmasb %%v24,%%v16,%%v0,%%v24\n\t"
  387. "vst %%v24, 0(%%r1,%[dest])\n\t"
  388. "vl %%v25, 16(%%r1,%[dest])\n\t"
  389. "vfmasb %%v25,%%v17,%%v0,%%v25\n\t"
  390. "vst %%v25, 16(%%r1,%[dest])\n\t"
  391. "vl %%v26, 32(%%r1,%[dest])\n\t"
  392. "vfmasb %%v26,%%v18,%%v0,%%v26\n\t"
  393. "vst %%v26, 32(%%r1,%[dest])\n\t"
  394. "vl %%v27, 48(%%r1,%[dest])\n\t"
  395. "vfmasb %%v27,%%v19,%%v0,%%v27\n\t"
  396. "vst %%v27, 48(%%r1,%[dest])\n\t"
  397. "vl %%v28, 64(%%r1,%[dest])\n\t"
  398. "vfmasb %%v28,%%v20,%%v0,%%v28\n\t"
  399. "vst %%v28, 64(%%r1,%[dest])\n\t"
  400. "vl %%v29, 80(%%r1,%[dest])\n\t"
  401. "vfmasb %%v29,%%v21,%%v0,%%v29\n\t"
  402. "vst %%v29, 80(%%r1,%[dest])\n\t"
  403. "vl %%v30, 96(%%r1,%[dest])\n\t"
  404. "vfmasb %%v30,%%v22,%%v0,%%v30\n\t"
  405. "vst %%v30, 96(%%r1,%[dest])\n\t"
  406. "vl %%v31, 112(%%r1,%[dest])\n\t"
  407. "vfmasb %%v31,%%v23,%%v0,%%v31\n\t"
  408. "vst %%v31, 112(%%r1,%[dest])\n\t"
  409. "agfi %%r1,128\n\t"
  410. "brctg %%r0,0b\n\t"
  411. "1:\n\t"
  412. "lghi %%r0,28\n\t"
  413. "ngr %%r0,%[n]\n\t"
  414. "ltgr %%r0,%%r0\n\t"
  415. "jz 3f\n\t"
  416. "srlg %%r0,%%r0,2\n\t"
  417. "2:\n\t"
  418. "vl %%v16,0(%%r1,%[src])\n\t"
  419. "vl %%v24, 0(%%r1,%[dest])\n\t"
  420. "vfmasb %%v24,%%v16,%%v0,%%v24\n\t"
  421. "vst %%v24, 0(%%r1,%[dest])\n\t"
  422. "agfi %%r1,16\n\t"
  423. "brctg %%r0,2b\n\t"
  424. "3:\n\t"
  425. "nop 0"
  426. : "+m"(*(FLOAT (*)[n]) dest)
  427. : [dest] "a"(dest),[da] "Q"(da), "m"(*(const FLOAT (*)[n]) src),
  428. [src] "a"(src),[n] "r"(n)
  429. : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21",
  430. "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
  431. "v31");
  432. }
  433. static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest,
  434. BLASLONG inc_dest) {
  435. if (inc_dest == 1)
  436. add_y_kernel_4(n, da, src, dest);
  437. else {
  438. BLASLONG i;
  439. for (i = 0; i < n; i++) {
  440. *dest += src[i] * da;
  441. dest += inc_dest;
  442. }
  443. }
  444. }
  445. int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a,
  446. BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
  447. FLOAT *buffer) {
  448. BLASLONG register i;
  449. BLASLONG register j;
  450. FLOAT *a_ptr;
  451. FLOAT *x_ptr;
  452. FLOAT *y_ptr;
  453. BLASLONG n0;
  454. BLASLONG n1;
  455. BLASLONG m1;
  456. BLASLONG m2;
  457. BLASLONG m3;
  458. BLASLONG n2;
  459. FLOAT ybuffer[2] __attribute__ ((aligned(16)));
  460. FLOAT *xbuffer;
  461. FLOAT *ytemp;
  462. if (m < 1)
  463. return (0);
  464. if (n < 1)
  465. return (0);
  466. xbuffer = buffer;
  467. ytemp = buffer + (m < NBMAX ? m : NBMAX);
  468. n0 = n / NBMAX;
  469. n1 = (n % NBMAX) >> 2;
  470. n2 = n & 3;
  471. m3 = m & 3;
  472. m1 = m & -4;
  473. m2 = (m & (NBMAX - 1)) - m3;
  474. BLASLONG NB = NBMAX;
  475. while (NB == NBMAX) {
  476. m1 -= NB;
  477. if (m1 < 0) {
  478. if (m2 == 0)
  479. break;
  480. NB = m2;
  481. }
  482. y_ptr = y;
  483. a_ptr = a;
  484. x_ptr = x;
  485. if (inc_x == 1)
  486. xbuffer = x_ptr;
  487. else
  488. copy_x(NB, x_ptr, xbuffer, inc_x);
  489. FLOAT *ap[4];
  490. FLOAT *yp;
  491. BLASLONG register lda4 = 4 * lda;
  492. ap[0] = a_ptr;
  493. ap[1] = a_ptr + lda;
  494. ap[2] = ap[1] + lda;
  495. ap[3] = ap[2] + lda;
  496. if (n0 > 0) {
  497. BLASLONG nb1 = NBMAX / 4;
  498. for (j = 0; j < n0; j++) {
  499. yp = ytemp;
  500. for (i = 0; i < nb1; i++) {
  501. sgemv_kernel_4x4(NB, ap, xbuffer, yp);
  502. ap[0] += lda4;
  503. ap[1] += lda4;
  504. ap[2] += lda4;
  505. ap[3] += lda4;
  506. yp += 4;
  507. }
  508. add_y(nb1 * 4, alpha, ytemp, y_ptr, inc_y);
  509. y_ptr += nb1 * inc_y * 4;
  510. a_ptr += nb1 * lda4;
  511. }
  512. }
  513. yp = ytemp;
  514. for (i = 0; i < n1; i++) {
  515. sgemv_kernel_4x4(NB, ap, xbuffer, yp);
  516. ap[0] += lda4;
  517. ap[1] += lda4;
  518. ap[2] += lda4;
  519. ap[3] += lda4;
  520. yp += 4;
  521. }
  522. if (n1 > 0) {
  523. add_y(n1 * 4, alpha, ytemp, y_ptr, inc_y);
  524. y_ptr += n1 * inc_y * 4;
  525. a_ptr += n1 * lda4;
  526. }
  527. if (n2 & 2) {
  528. sgemv_kernel_4x2(NB, ap, xbuffer, ybuffer);
  529. a_ptr += lda * 2;
  530. *y_ptr += ybuffer[0] * alpha;
  531. y_ptr += inc_y;
  532. *y_ptr += ybuffer[1] * alpha;
  533. y_ptr += inc_y;
  534. }
  535. if (n2 & 1) {
  536. sgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer);
  537. // a_ptr += lda;
  538. *y_ptr += ybuffer[0] * alpha;
  539. // y_ptr += inc_y;
  540. }
  541. a += NB;
  542. x += NB * inc_x;
  543. }
  544. if (m3 == 0)
  545. return (0);
  546. x_ptr = x;
  547. a_ptr = a;
  548. if (m3 == 3) {
  549. FLOAT xtemp0 = *x_ptr * alpha;
  550. x_ptr += inc_x;
  551. FLOAT xtemp1 = *x_ptr * alpha;
  552. x_ptr += inc_x;
  553. FLOAT xtemp2 = *x_ptr * alpha;
  554. FLOAT *aj = a_ptr;
  555. y_ptr = y;
  556. if (lda == 3 && inc_y == 1) {
  557. for (j = 0; j < (n & -4); j += 4) {
  558. y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
  559. y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2;
  560. y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2;
  561. y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2;
  562. aj += 12;
  563. }
  564. for (; j < n; j++) {
  565. y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
  566. aj += 3;
  567. }
  568. } else {
  569. if (inc_y == 1) {
  570. BLASLONG register lda2 = lda << 1;
  571. BLASLONG register lda4 = lda << 2;
  572. BLASLONG register lda3 = lda2 + lda;
  573. for (j = 0; j < (n & -4); j += 4) {
  574. y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
  575. y_ptr[j + 1] +=
  576. *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda +
  577. 2) * xtemp2;
  578. y_ptr[j + 2] +=
  579. *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 +
  580. 2) * xtemp2;
  581. y_ptr[j + 3] +=
  582. *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 +
  583. 2) * xtemp2;
  584. aj += lda4;
  585. }
  586. for (; j < n; j++) {
  587. y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
  588. aj += lda;
  589. }
  590. } else {
  591. for (j = 0; j < n; j++) {
  592. *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
  593. y_ptr += inc_y;
  594. aj += lda;
  595. }
  596. }
  597. }
  598. return (0);
  599. }
  600. if (m3 == 2) {
  601. FLOAT xtemp0 = *x_ptr * alpha;
  602. x_ptr += inc_x;
  603. FLOAT xtemp1 = *x_ptr * alpha;
  604. FLOAT *aj = a_ptr;
  605. y_ptr = y;
  606. if (lda == 2 && inc_y == 1) {
  607. for (j = 0; j < (n & -4); j += 4) {
  608. y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1;
  609. y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1;
  610. y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1;
  611. y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1;
  612. aj += 8;
  613. }
  614. for (; j < n; j++) {
  615. y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1;
  616. aj += 2;
  617. }
  618. } else {
  619. if (inc_y == 1) {
  620. BLASLONG register lda2 = lda << 1;
  621. BLASLONG register lda4 = lda << 2;
  622. BLASLONG register lda3 = lda2 + lda;
  623. for (j = 0; j < (n & -4); j += 4) {
  624. y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1;
  625. y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1;
  626. y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1;
  627. y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1;
  628. aj += lda4;
  629. }
  630. for (; j < n; j++) {
  631. y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1;
  632. aj += lda;
  633. }
  634. } else {
  635. for (j = 0; j < n; j++) {
  636. *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1;
  637. y_ptr += inc_y;
  638. aj += lda;
  639. }
  640. }
  641. }
  642. return (0);
  643. }
  644. FLOAT xtemp = *x_ptr * alpha;
  645. FLOAT *aj = a_ptr;
  646. y_ptr = y;
  647. if (lda == 1 && inc_y == 1) {
  648. for (j = 0; j < (n & -4); j += 4) {
  649. y_ptr[j] += aj[j] * xtemp;
  650. y_ptr[j + 1] += aj[j + 1] * xtemp;
  651. y_ptr[j + 2] += aj[j + 2] * xtemp;
  652. y_ptr[j + 3] += aj[j + 3] * xtemp;
  653. }
  654. for (; j < n; j++) {
  655. y_ptr[j] += aj[j] * xtemp;
  656. }
  657. } else {
  658. if (inc_y == 1) {
  659. BLASLONG register lda2 = lda << 1;
  660. BLASLONG register lda4 = lda << 2;
  661. BLASLONG register lda3 = lda2 + lda;
  662. for (j = 0; j < (n & -4); j += 4) {
  663. y_ptr[j] += *aj * xtemp;
  664. y_ptr[j + 1] += *(aj + lda) * xtemp;
  665. y_ptr[j + 2] += *(aj + lda2) * xtemp;
  666. y_ptr[j + 3] += *(aj + lda3) * xtemp;
  667. aj += lda4;
  668. }
  669. for (; j < n; j++) {
  670. y_ptr[j] += *aj * xtemp;
  671. aj += lda;
  672. }
  673. } else {
  674. for (j = 0; j < n; j++) {
  675. *y_ptr += *aj * xtemp;
  676. y_ptr += inc_y;
  677. aj += lda;
  678. }
  679. }
  680. }
  681. return (0);
  682. }