You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zgemv_n.S 21 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* ASSEMBLER is defined before including common.h. The header presumably */
  /* supplies PROLOGUE, SAVESP, EPILOGUE, LDF, STF, FADD, FSUB, FMUL, FCLR, */
  /* SIZE, ZBASE_SHIFT and STACK_START, none of which are defined in this file. */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Look-ahead distance, in units of SIZE, used by the prefetch instructions */
  /* inside the unrolled loops. */
  40. #ifdef DOUBLE
  41. #define PREFETCHSIZE 44
  42. #else
  43. #define PREFETCHSIZE 88
  44. #endif
  /* Integer register roles. */
  /* M    - number of complex entries handled per column (length of y). */
  /* N    - number of columns of A that are processed. */
  /* A    - pointer to the next column of A, LDA is the column stride. */
  /* X    - pointer into x, INCX is its stride. */
  /* Y    - pointer to the caller's y, INCY is its stride. */
  /* BUFFER - scratch vector used for accumulation when INCY is not 2 * SIZE. */
  /* I, J - loop counters (entries within a column, columns). */
  /* A1, A2 - pointers walking the two columns currently being processed. */
  /* A3, A4 - defined but not referenced in the code visible here. */
  /* Y1   - running pointer into the vector being updated. */
  /* YY   - base of the accumulation vector (either Y or BUFFER). */
  45. #define M %i0
  46. #define N %i1
  47. #define A %i5
  48. #define LDA %i2
  49. #define X %i3
  50. #define INCX %i4
  51. #define Y %l0
  52. #define INCY %l1
  53. #define BUFFER %l2
  54. #define I %l3
  55. #define J %l5
  56. #define A1 %o0
  57. #define A2 %o1
  58. #define A3 %o2
  59. #define A4 %o3
  60. #define Y1 %l4
  61. #define YY %l6
  /* Floating-point register roles. */
  /* t1..t4  - product temporaries. */
  /* y1..y8  - accumulators for eight consecutive values of the result. */
  /* a1..a16 - values loaded from A (a1..a8 are also reused as scratch for */
  /*           the alpha * x products and for BUFFER values in the copy-back). */
  /* x1..x4  - alpha-scaled entries of x for the current column or column pair. */
  /* FZERO, ALPHA_R and ALPHA_I deliberately share registers with a14, a15 and */
  /* a16. FZERO is only used by the buffer zero-fill, and ALPHA_R / ALPHA_I are */
  /* reloaded from the stack and consumed at the top of each column pass before */
  /* a15 / a16 are loaded from A. */
  /* The DOUBLE variant uses even-numbered registers only. */
  62. #ifdef DOUBLE
  63. #define t1 %f0
  64. #define t2 %f2
  65. #define t3 %f4
  66. #define t4 %f6
  67. #define y1 %f8
  68. #define y2 %f10
  69. #define y3 %f12
  70. #define y4 %f14
  71. #define y5 %f16
  72. #define y6 %f18
  73. #define y7 %f20
  74. #define y8 %f22
  75. #define a1 %f24
  76. #define a2 %f26
  77. #define a3 %f28
  78. #define a4 %f30
  79. #define a5 %f32
  80. #define a6 %f34
  81. #define a7 %f36
  82. #define a8 %f38
  83. #define a9 %f40
  84. #define a10 %f42
  85. #define a11 %f44
  86. #define a12 %f46
  87. #define a13 %f48
  88. #define a14 %f50
  89. #define a15 %f52
  90. #define a16 %f54
  91. #define x1 %f56
  92. #define x2 %f58
  93. #define x3 %f60
  94. #define x4 %f62
  95. #define FZERO %f50
  96. #define ALPHA_R %f52
  97. #define ALPHA_I %f54
  98. #else
  /* Single-precision variant: same roles, consecutive registers. */
  99. #define t1 %f0
  100. #define t2 %f1
  101. #define t3 %f2
  102. #define t4 %f3
  103. #define y1 %f4
  104. #define y2 %f5
  105. #define y3 %f6
  106. #define y4 %f7
  107. #define y5 %f8
  108. #define y6 %f9
  109. #define y7 %f10
  110. #define y8 %f11
  111. #define a1 %f12
  112. #define a2 %f13
  113. #define a3 %f14
  114. #define a4 %f15
  115. #define a5 %f16
  116. #define a6 %f17
  117. #define a7 %f18
  118. #define a8 %f19
  119. #define a9 %f20
  120. #define a10 %f21
  121. #define a11 %f22
  122. #define a12 %f23
  123. #define a13 %f24
  124. #define a14 %f25
  125. #define a15 %f26
  126. #define a16 %f27
  127. #define x1 %f28
  128. #define x2 %f29
  129. #define x3 %f30
  130. #define x4 %f31
  131. #define FZERO %f25
  132. #define ALPHA_R %f26
  133. #define ALPHA_I %f27
  134. #endif
  /* Stack slots that hold the real and imaginary parts of alpha. The kernel */
  /* stores alpha there on entry and reloads it with LDF for every column pass. */
  /* Offsets differ between the 32-bit build (4-byte or 8-byte values starting */
  /* at STACK_START + 16) and the 64-bit build (8-byte slots at + 32 and + 40). */
  135. #ifndef __64BIT__
  136. #define STACK_ALPHA_R [%sp + STACK_START + 16]
  137. #ifndef DOUBLE
  138. #define STACK_ALPHA_I [%sp + STACK_START + 20]
  139. #else
  140. #define STACK_ALPHA_I [%sp + STACK_START + 24]
  141. #endif
  142. #else
  143. #define STACK_ALPHA_R [%sp + STACK_START + 32]
  144. #define STACK_ALPHA_I [%sp + STACK_START + 40]
  145. #endif
  /* FSUBX / FADDX are applied to the products formed from the odd-indexed */
  /* value of each pair loaded from A (presumably the imaginary part). */
  /* Defining CONJ swaps the two operations, which flips the sign of those */
  /* terms, i.e. it selects the variant that uses the conjugate of A. */
  146. #ifndef CONJ
  147. #define FSUBX FSUB
  148. #define FADDX FADD
  149. #else
  150. #define FSUBX FADD
  151. #define FADDX FSUB
  152. #endif
  /* Kernel entry. The code below accumulates, for every processed column j, */
  /* (alpha * x[j]) * A(:, j) into y, with values stored as interleaved pairs. */
  /* This section only gathers the arguments: alpha is written to the */
  /* STACK_ALPHA_R / STACK_ALPHA_I slots and the pointer and stride arguments */
  /* that did not arrive in registers are loaded from the stack. */
  153. PROLOGUE
  154. SAVESP
  155. #ifndef __64BIT__
  156. #ifdef DOUBLE
  /* 32-bit, double precision: alpha arrives in integer registers %i3..%i5 and */
  /* is spilled before those registers are reused for X, INCX and A. Only one */
  /* word of the 8-byte ALPHA_I slot is written here. The other word at + 28 is */
  /* presumably already in place as passed by the caller - TODO confirm. */
  157. st %i3, [%sp + STACK_START + 16] /* ALPHA_R */
  158. st %i4, [%sp + STACK_START + 20]
  159. st %i5, [%sp + STACK_START + 24] /* ALPHA_I */
  160. ld [%sp + STACK_START + 32], A
  161. ld [%sp + STACK_START + 36], LDA
  162. ld [%sp + STACK_START + 40], X
  163. ld [%sp + STACK_START + 44], INCX
  164. ld [%sp + STACK_START + 48], Y
  165. ld [%sp + STACK_START + 52], INCY
  166. ld [%sp + STACK_START + 56], BUFFER
  167. #else
  /* 32-bit, single precision: alpha is in %i3 and %i4, A is not loaded here */
  /* and so keeps the value that arrived in %i5. */
  168. st %i3, [%sp + STACK_START + 16] /* ALPHA_R */
  169. st %i4, [%sp + STACK_START + 20] /* ALPHA_I */
  170. ld [%sp + STACK_START + 28], LDA
  171. ld [%sp + STACK_START + 32], X
  172. ld [%sp + STACK_START + 36], INCX
  173. ld [%sp + STACK_START + 40], Y
  174. ld [%sp + STACK_START + 44], INCY
  175. ld [%sp + STACK_START + 48], BUFFER
  176. #endif
  177. #else
  /* 64-bit: the remaining integer arguments are loaded as 8-byte values and */
  /* alpha is stored from floating-point registers (%f6 / %f8 for double, */
  /* %f7 / %f9 for single). A is not loaded here and stays in %i5. */
  178. ldx [%sp + STACK_START + 56], LDA
  179. ldx [%sp + STACK_START + 64], X
  180. ldx [%sp + STACK_START + 72], INCX
  181. ldx [%sp + STACK_START + 80], Y
  182. ldx [%sp + STACK_START + 88], INCY
  183. ldx [%sp + STACK_START + 96], BUFFER
  184. #ifdef DOUBLE
  185. std %f6, STACK_ALPHA_R
  186. std %f8, STACK_ALPHA_I
  187. #else
  188. st %f7, STACK_ALPHA_R
  189. st %f9, STACK_ALPHA_I
  190. #endif
  191. #endif
  /* Scale the strides by ZBASE_SHIFT and handle the trivial cases. */
  /* The instruction following each branch sits in its delay slot, so the INCX */
  /* and INCY shifts and the mov Y, YY below execute whether or not the branch */
  /* is taken. Nothing is done when M or N is not positive. */
  192. sll LDA, ZBASE_SHIFT, LDA
  193. cmp M, 0
  194. ble %icc, .LL999
  195. sll INCX, ZBASE_SHIFT, INCX
  196. cmp N, 0
  197. ble %icc, .LL999
  198. sll INCY, ZBASE_SHIFT, INCY
  /* When the scaled INCY equals 2 * SIZE the result is accumulated directly */
  /* into Y (YY = Y) and the zero-fill below is skipped. */
  199. cmp INCY, 2 * SIZE
  200. be %icc, .LL20
  201. mov Y, YY
  /* Otherwise accumulate into BUFFER. FCLR presumably zeroes FZERO (the */
  /* argument looks like the encoded register number) - TODO confirm in common.h. */
  202. #ifdef DOUBLE
  203. FCLR(19)
  204. #else
  205. FCLR(25)
  206. #endif
  /* J = (M + 3) >> 2 passes, each storing eight zeros, so BUFFER is cleared */
  /* for M entries rounded up to a multiple of four. */
  207. add M, 3, J
  208. sra J, 2, J
  209. mov BUFFER, YY
  210. mov BUFFER, Y1
  211. .LL01:
  212. STF FZERO, [Y1 + 0 * SIZE]
  213. nop
  214. STF FZERO, [Y1 + 1 * SIZE]
  215. STF FZERO, [Y1 + 2 * SIZE]
  216. STF FZERO, [Y1 + 3 * SIZE]
  217. STF FZERO, [Y1 + 4 * SIZE]
  218. nop
  219. STF FZERO, [Y1 + 5 * SIZE]
  220. deccc J
  221. STF FZERO, [Y1 + 6 * SIZE]
  222. nop
  223. STF FZERO, [Y1 + 7 * SIZE]
  224. bg,pn %icc, .LL01
  225. add Y1, 8 * SIZE, Y1
  /* Column-pair loop. J = N >> 1 pairs, skipped entirely when that is zero. */
  226. .LL20:
  227. sra N, 1, J
  228. cmp J, 0
  229. ble,pn %icc, .LL30
  230. nop
  /* Per pair: restart Y1 at the accumulation base, point A1 / A2 at the two */
  /* columns, advance A by two columns, and reload alpha from the stack */
  /* (ALPHA_R / ALPHA_I share registers with a15 / a16, which are overwritten */
  /* by loads from A later in the pass). */
  231. .LL21:
  232. mov YY, Y1
  233. mov A, A1
  234. LDF STACK_ALPHA_R, ALPHA_R
  235. LDF STACK_ALPHA_I, ALPHA_I
  236. add A, LDA, A2
  237. add A2, LDA, A
  /* Fetch two consecutive entries of x, (x1, x2) and (x3, x4). */
  238. LDF [X + 0 * SIZE], x1
  239. LDF [X + 1 * SIZE], x2
  240. add X, INCX, X
  241. LDF [X + 0 * SIZE], x3
  242. LDF [X + 1 * SIZE], x4
  243. add X, INCX, X
  /* Form the four partial products of alpha with each entry. */
  244. FMUL ALPHA_R, x1, a1
  245. FMUL ALPHA_I, x2, a4
  246. FMUL ALPHA_I, x1, a2
  247. FMUL ALPHA_R, x2, a3
  248. FMUL ALPHA_R, x3, a5
  249. FMUL ALPHA_I, x4, a8
  250. FMUL ALPHA_I, x3, a6
  251. FMUL ALPHA_R, x4, a7
  /* Combine them into alpha * x, overwriting x1..x4. With XCONJ defined the */
  /* signs are swapped, which corresponds to using the conjugate of x. */
  252. #ifndef XCONJ
  253. FSUB a1, a4, x1
  254. FADD a2, a3, x2
  255. FSUB a5, a8, x3
  256. FADD a6, a7, x4
  257. #else
  258. FADD a1, a4, x1
  259. FSUB a2, a3, x2
  260. FADD a5, a8, x3
  261. FSUB a6, a7, x4
  262. #endif
  /* Pipeline prologue for a column pair. I = M >> 2 counts groups of four */
  /* entries (8 * SIZE values). With no full group, go straight to the tails */
  /* at .LL27. */
  263. sra M, 2, I
  264. cmp I, 0
  265. ble,pn %icc, .LL27
  266. nop
  /* Preload the first eight values of each column: a1..a4 and a9..a12 from */
  /* the first column, a5..a8 and a13..a16 from the second. */
  267. LDF [A1 + 0 * SIZE], a1
  268. LDF [A1 + 1 * SIZE], a2
  269. LDF [A1 + 2 * SIZE], a3
  270. LDF [A1 + 3 * SIZE], a4
  271. LDF [A1 + 4 * SIZE], a9
  272. LDF [A1 + 5 * SIZE], a10
  273. LDF [A1 + 6 * SIZE], a11
  274. LDF [A1 + 7 * SIZE], a12
  275. LDF [A2 + 0 * SIZE], a5
  276. LDF [A2 + 1 * SIZE], a6
  277. LDF [A2 + 2 * SIZE], a7
  278. LDF [A2 + 3 * SIZE], a8
  279. LDF [A2 + 4 * SIZE], a13
  280. LDF [A2 + 5 * SIZE], a14
  281. LDF [A2 + 6 * SIZE], a15
  282. LDF [A2 + 7 * SIZE], a16
  283. LDF [Y1 + 0 * SIZE], y1
  284. LDF [Y1 + 1 * SIZE], y2
  285. LDF [Y1 + 2 * SIZE], y3
  /* Start the first products. From here on each register is reloaded with a */
  /* value from the next group as soon as its last use has been issued, so */
  /* offsets of 8 and above read ahead of the current group. */
  286. FMUL a1, x1, t1
  287. deccc I
  288. FMUL a1, x2, t2
  289. LDF [A1 + 8 * SIZE], a1
  290. FMUL a3, x1, t3
  291. FMUL a3, x2, t4
  /* Only one group in total: .LL26 finishes it. The load of a3 is in the */
  /* delay slot and executes either way. */
  292. ble,pn %icc, .LL26
  293. LDF [A1 + 10 * SIZE], a3
  /* First half of the group: update y1..y4 with the terms from both columns. */
  294. FADD y1, t1, y1
  295. LDF [Y1 + 3 * SIZE], y4
  296. FMUL a2, x2, t1
  297. FADD y2, t2, y2
  298. FMUL a2, x1, t2
  299. LDF [A1 + 9 * SIZE], a2
  300. FADD y3, t3, y3
  301. LDF [Y1 + 4 * SIZE], y5
  302. FMUL a4, x2, t3
  303. FADD y4, t4, y4
  304. FMUL a4, x1, t4
  305. LDF [A1 + 11 * SIZE], a4
  306. FSUBX y1, t1, y1
  307. LDF [Y1 + 5 * SIZE], y6
  308. FMUL a5, x3, t1
  309. FADDX y2, t2, y2
  310. FMUL a5, x4, t2
  311. LDF [A2 + 8 * SIZE], a5
  312. FSUBX y3, t3, y3
  313. LDF [Y1 + 6 * SIZE], y7
  314. FMUL a7, x3, t3
  315. FADDX y4, t4, y4
  316. FMUL a7, x4, t4
  317. LDF [A2 + 10 * SIZE], a7
  318. FADD y1, t1, y1
  319. LDF [Y1 + 7 * SIZE], y8
  320. FMUL a6, x4, t1
  321. FADD y2, t2, y2
  322. FMUL a6, x3, t2
  323. LDF [A2 + 9 * SIZE], a6
  324. FADD y3, t3, y3
  325. FMUL a8, x4, t3
  326. FADD y4, t4, y4
  327. FMUL a8, x3, t4
  328. LDF [A2 + 11 * SIZE], a8
  329. FSUBX y1, t1, y1
  330. FMUL a9, x1, t1
  331. FADDX y2, t2, y2
  332. FMUL a9, x2, t2
  333. LDF [A1 + 12 * SIZE], a9
  334. FSUBX y3, t3, y3
  335. deccc I
  336. FMUL a11, x1, t3
  337. FADDX y4, t4, y4
  338. FMUL a11, x2, t4
  /* Exactly two groups: skip the steady-state loop and drain through .LL23. */
  339. ble,pn %icc, .LL23
  340. LDF [A1 + 14 * SIZE], a11
  /* Steady-state body of the pipelined column-pair loop. Each pass finishes */
  /* y5..y8 of the current group, stores all eight results of that group, and */
  /* computes y1..y4 of the next group while reloading values from A ahead of */
  /* use. A1, A2 and Y1 advance by 8 * SIZE part way through the body, so the */
  /* offsets that follow each add are relative to the already advanced pointer. */
  /* Instruction order matters here: every reload of a register comes after */
  /* the last FMUL that reads its previous value. */
  341. .LL22:
  342. FADD y5, t1, y5
  343. prefetch [A1 + PREFETCHSIZE * SIZE], 1
  344. FMUL a10, x2, t1
  345. LDF [Y1 + 7 * SIZE], y8
  346. FADD y6, t2, y6
  347. FMUL a10, x1, t2
  348. LDF [A1 + 13 * SIZE], a10
  349. FADD y7, t3, y7
  350. FMUL a12, x2, t3
  351. STF y1, [Y1 + 0 * SIZE]
  352. FADD y8, t4, y8
  353. FMUL a12, x1, t4
  354. LDF [A1 + 15 * SIZE], a12
  355. FSUBX y5, t1, y5
  356. FMUL a13, x3, t1
  357. STF y2, [Y1 + 1 * SIZE]
  358. FADDX y6, t2, y6
  359. FMUL a13, x4, t2
  360. LDF [A2 + 12 * SIZE], a13
  361. FSUBX y7, t3, y7
  362. FMUL a15, x3, t3
  363. STF y3, [Y1 + 2 * SIZE]
  364. FADDX y8, t4, y8
  365. FMUL a15, x4, t4
  366. LDF [A2 + 14 * SIZE], a15
  367. FADD y5, t1, y5
  368. FMUL a14, x4, t1
  369. STF y4, [Y1 + 3 * SIZE]
  370. FADD y6, t2, y6
  371. FMUL a14, x3, t2
  372. LDF [A2 + 13 * SIZE], a14
  373. FADD y7, t3, y7
  374. FMUL a16, x4, t3
  375. LDF [Y1 + 8 * SIZE], y1
  376. FADD y8, t4, y8
  377. FMUL a16, x3, t4
  378. LDF [A2 + 15 * SIZE], a16
  379. FSUBX y5, t1, y5
  380. FMUL a1, x1, t1
  381. LDF [Y1 + 9 * SIZE], y2
  382. FADDX y6, t2, y6
  383. FMUL a1, x2, t2
  384. LDF [A1 + 16 * SIZE], a1
  385. FSUBX y7, t3, y7
  386. FMUL a3, x1, t3
  387. LDF [Y1 + 10 * SIZE], y3
  388. FADDX y8, t4, y8
  389. FMUL a3, x2, t4
  390. LDF [A1 + 18 * SIZE], a3
  /* y5..y8 are complete here. The next group's y1..y4 start accumulating. */
  391. FADD y1, t1, y1
  392. prefetch [A2 + PREFETCHSIZE * SIZE], 1
  393. FMUL a2, x2, t1
  394. LDF [Y1 + 11 * SIZE], y4
  395. FADD y2, t2, y2
  396. FMUL a2, x1, t2
  397. LDF [A1 + 17 * SIZE], a2
  398. FADD y3, t3, y3
  399. FMUL a4, x2, t3
  400. STF y5, [Y1 + 4 * SIZE]
  401. FADD y4, t4, y4
  402. FMUL a4, x1, t4
  403. LDF [A1 + 19 * SIZE], a4
  404. FSUBX y1, t1, y1
  405. FMUL a5, x3, t1
  406. STF y6, [Y1 + 5 * SIZE]
  407. FADDX y2, t2, y2
  408. FMUL a5, x4, t2
  409. LDF [A2 + 16 * SIZE], a5
  410. FSUBX y3, t3, y3
  411. FMUL a7, x3, t3
  412. STF y7, [Y1 + 6 * SIZE]
  413. FADDX y4, t4, y4
  414. deccc I
  415. FMUL a7, x4, t4
  416. LDF [A2 + 18 * SIZE], a7
  417. FADD y1, t1, y1
  418. FMUL a6, x4, t1
  419. STF y8, [Y1 + 7 * SIZE]
  420. FADD y2, t2, y2
  421. FMUL a6, x3, t2
  422. LDF [A2 + 17 * SIZE], a6
  423. FADD y3, t3, y3
  424. add A1, 8 * SIZE, A1
  425. FMUL a8, x4, t3
  426. LDF [Y1 + 12 * SIZE], y5
  427. FADD y4, t4, y4
  428. FMUL a8, x3, t4
  429. LDF [A2 + 19 * SIZE], a8
  430. FSUBX y1, t1, y1
  431. add A2, 8 * SIZE, A2
  432. FMUL a9, x1, t1
  433. LDF [Y1 + 13 * SIZE], y6
  434. FADDX y2, t2, y2
  435. add Y1, 8 * SIZE, Y1
  436. FMUL a9, x2, t2
  437. LDF [A1 + 12 * SIZE], a9
  438. FSUBX y3, t3, y3
  439. FMUL a11, x1, t3
  440. LDF [Y1 + 6 * SIZE], y7
  441. FADDX y4, t4, y4
  442. FMUL a11, x2, t4
  /* Loop while groups remain (condition codes from the deccc above). The */
  /* load of a11 is in the branch delay slot. */
  443. bg,pn %icc, .LL22
  444. LDF [A1 + 14 * SIZE], a11
  /* Drain stage, reached when exactly one more group follows the current one. */
  /* It completes y5..y8 of the current group, stores y1..y4, preloads y1..y3 */
  /* and the first products of the final group, and advances A1, A2 and Y1 by */
  /* 8 * SIZE. y5..y8 are then stored at negative offsets from the already */
  /* advanced Y1 before control falls through into .LL26. */
  445. .LL23:
  446. FADD y5, t1, y5
  447. FMUL a10, x2, t1
  448. LDF [Y1 + 7 * SIZE], y8
  449. FADD y6, t2, y6
  450. FMUL a10, x1, t2
  451. LDF [A1 + 13 * SIZE], a10
  452. FADD y7, t3, y7
  453. FMUL a12, x2, t3
  454. STF y1, [Y1 + 0 * SIZE]
  455. FADD y8, t4, y8
  456. FMUL a12, x1, t4
  457. LDF [A1 + 15 * SIZE], a12
  458. FSUBX y5, t1, y5
  459. FMUL a13, x3, t1
  460. STF y2, [Y1 + 1 * SIZE]
  461. FADDX y6, t2, y6
  462. FMUL a13, x4, t2
  463. LDF [A2 + 12 * SIZE], a13
  464. FSUBX y7, t3, y7
  465. FMUL a15, x3, t3
  466. STF y3, [Y1 + 2 * SIZE]
  467. FADDX y8, t4, y8
  468. FMUL a15, x4, t4
  469. LDF [A2 + 14 * SIZE], a15
  470. FADD y5, t1, y5
  471. FMUL a14, x4, t1
  472. STF y4, [Y1 + 3 * SIZE]
  473. FADD y6, t2, y6
  474. FMUL a14, x3, t2
  475. LDF [A2 + 13 * SIZE], a14
  476. FADD y7, t3, y7
  477. FMUL a16, x4, t3
  478. LDF [Y1 + 8 * SIZE], y1
  479. FADD y8, t4, y8
  480. FMUL a16, x3, t4
  481. LDF [A2 + 15 * SIZE], a16
  482. FSUBX y5, t1, y5
  483. add A1, 8 * SIZE, A1
  484. FMUL a1, x1, t1
  485. LDF [Y1 + 9 * SIZE], y2
  486. FADDX y6, t2, y6
  487. add A2, 8 * SIZE, A2
  488. FMUL a1, x2, t2
  /* These two reloads of a1 and a3 read past the final group. The values are */
  /* not used by .LL26. */
  489. LDF [A1 + 8 * SIZE], a1
  490. FSUBX y7, t3, y7
  491. FMUL a3, x1, t3
  492. LDF [Y1 + 10 * SIZE], y3
  493. FADDX y8, t4, y8
  494. add Y1, 8 * SIZE, Y1
  495. FMUL a3, x2, t4
  496. LDF [A1 + 10 * SIZE], a3
  497. STF y5, [Y1 - 4 * SIZE]
  498. STF y6, [Y1 - 3 * SIZE]
  499. STF y7, [Y1 - 2 * SIZE]
  500. STF y8, [Y1 - 1 * SIZE]
  /* Final group of four entries for a column pair. On entry y1..y3 are loaded, */
  /* t1..t4 hold the first products, and a2, a4..a16 hold the values of this */
  /* group. No further loads from A are issued. The block loads y4..y8, */
  /* completes all eight accumulations, stores them, and advances A1, A2 and */
  /* Y1 by 8 * SIZE for the tail handling that follows. */
  501. .LL26:
  502. FADD y1, t1, y1
  503. LDF [Y1 + 3 * SIZE], y4
  504. FMUL a2, x2, t1
  505. FADD y2, t2, y2
  506. FMUL a2, x1, t2
  507. FADD y3, t3, y3
  508. LDF [Y1 + 4 * SIZE], y5
  509. FMUL a4, x2, t3
  510. FADD y4, t4, y4
  511. FMUL a4, x1, t4
  512. FSUBX y1, t1, y1
  513. LDF [Y1 + 5 * SIZE], y6
  514. FMUL a5, x3, t1
  515. FADDX y2, t2, y2
  516. FMUL a5, x4, t2
  517. FSUBX y3, t3, y3
  518. LDF [Y1 + 6 * SIZE], y7
  519. FADDX y4, t4, y4
  520. FMUL a7, x4, t4
  521. FADD y1, t1, y1
  522. LDF [Y1 + 7 * SIZE], y8
  523. FMUL a7, x3, t3
  524. FMUL a6, x4, t1
  525. FADD y2, t2, y2
  526. FMUL a6, x3, t2
  527. FADD y3, t3, y3
  528. FMUL a8, x4, t3
  529. FADD y4, t4, y4
  530. FMUL a8, x3, t4
  531. FSUBX y1, t1, y1
  532. FMUL a9, x1, t1
  533. FADDX y2, t2, y2
  534. FMUL a9, x2, t2
  535. FSUBX y3, t3, y3
  536. FMUL a11, x1, t3
  537. FADDX y4, t4, y4
  538. FMUL a11, x2, t4
  /* Second half of the group: y5..y8 from a9..a16. */
  539. FADD y5, t1, y5
  540. FMUL a10, x2, t1
  541. FADD y6, t2, y6
  542. FMUL a10, x1, t2
  543. FADD y7, t3, y7
  544. FMUL a12, x2, t3
  545. FADD y8, t4, y8
  546. FMUL a12, x1, t4
  547. FSUBX y5, t1, y5
  548. FMUL a13, x3, t1
  549. FADDX y6, t2, y6
  550. FMUL a13, x4, t2
  551. FSUBX y7, t3, y7
  552. FMUL a15, x3, t3
  553. FADDX y8, t4, y8
  554. FMUL a15, x4, t4
  555. FADD y5, t1, y5
  556. FMUL a14, x4, t1
  557. FADD y6, t2, y6
  558. FMUL a14, x3, t2
  559. FADD y7, t3, y7
  560. FMUL a16, x4, t3
  561. FADD y8, t4, y8
  562. FMUL a16, x3, t4
  563. STF y1, [Y1 + 0 * SIZE]
  564. FSUBX y5, t1, y5
  565. STF y2, [Y1 + 1 * SIZE]
  566. FADDX y6, t2, y6
  567. STF y3, [Y1 + 2 * SIZE]
  568. FSUBX y7, t3, y7
  569. STF y4, [Y1 + 3 * SIZE]
  570. FADDX y8, t4, y8
  571. STF y5, [Y1 + 4 * SIZE]
  572. add A1, 8 * SIZE, A1
  573. STF y6, [Y1 + 5 * SIZE]
  574. add A2, 8 * SIZE, A2
  575. STF y7, [Y1 + 6 * SIZE]
  576. STF y8, [Y1 + 7 * SIZE]
  577. add Y1, 8 * SIZE, Y1
  /* Tail for a column pair when bit 1 of M is set: two more entries (four */
  /* values) from each column. andcc yields 0 or 2, so the ble is taken only */
  /* when the bit is clear. For each entry the update is */
  /*   y1 = y1 + a1*x1 FSUBX a2*x2 + a5*x3 FSUBX a6*x4 */
  /*   y2 = y2 + a1*x2 FADDX a2*x1 + a5*x4 FADDX a6*x3 */
  /* where (a1, a2) comes from the first column and (a5, a6) from the second, */
  /* and the second entry uses (a3, a4), (a7, a8), y3 and y4 the same way. */
  578. .LL27:
  579. andcc M, 2, I
  580. ble,pn %icc, .LL28
  581. nop
  582. LDF [A1 + 0 * SIZE], a1
  583. LDF [A1 + 1 * SIZE], a2
  584. LDF [A1 + 2 * SIZE], a3
  585. LDF [A1 + 3 * SIZE], a4
  586. LDF [Y1 + 0 * SIZE], y1
  587. LDF [Y1 + 1 * SIZE], y2
  588. LDF [Y1 + 2 * SIZE], y3
  589. LDF [Y1 + 3 * SIZE], y4
  590. FMUL a1, x1, t1
  591. LDF [A2 + 0 * SIZE], a5
  592. FMUL a1, x2, t2
  593. LDF [A2 + 1 * SIZE], a6
  594. FMUL a3, x1, t3
  595. LDF [A2 + 2 * SIZE], a7
  596. FMUL a3, x2, t4
  597. LDF [A2 + 3 * SIZE], a8
  598. FADD y1, t1, y1
  599. FMUL a2, x2, t1
  600. FADD y2, t2, y2
  601. FMUL a2, x1, t2
  602. FADD y3, t3, y3
  603. FMUL a4, x2, t3
  604. FADD y4, t4, y4
  605. FMUL a4, x1, t4
  606. FSUBX y1, t1, y1
  607. FMUL a5, x3, t1
  608. FADDX y2, t2, y2
  609. FMUL a5, x4, t2
  610. FSUBX y3, t3, y3
  611. FMUL a7, x3, t3
  612. FADDX y4, t4, y4
  613. FMUL a7, x4, t4
  614. FADD y1, t1, y1
  615. FMUL a6, x4, t1
  616. FADD y2, t2, y2
  617. FMUL a6, x3, t2
  618. FADD y3, t3, y3
  619. FMUL a8, x4, t3
  620. FADD y4, t4, y4
  621. FMUL a8, x3, t4
  622. FSUBX y1, t1, y1
  623. FADDX y2, t2, y2
  624. FSUBX y3, t3, y3
  625. FADDX y4, t4, y4
  /* Store the four results and step all three pointers past them. */
  626. STF y1, [Y1 + 0 * SIZE]
  627. add A1, 4 * SIZE, A1
  628. STF y2, [Y1 + 1 * SIZE]
  629. add A2, 4 * SIZE, A2
  630. STF y3, [Y1 + 2 * SIZE]
  631. nop
  632. STF y4, [Y1 + 3 * SIZE]
  633. add Y1, 4 * SIZE, Y1
  /* Tail for a column pair when M is odd: one last entry. (a1, a2) is read */
  /* from the first column and (a3, a4) from the second. Unlike the wider */
  /* blocks, the second product pair of the first column goes to t3 / t4 so */
  /* that t1 / t2 can be reused for the second column. */
  634. .LL28:
  635. andcc M, 1, I
  636. ble,pn %icc, .LL29
  637. nop
  638. LDF [A1 + 0 * SIZE], a1
  639. LDF [A1 + 1 * SIZE], a2
  640. LDF [A2 + 0 * SIZE], a3
  641. LDF [A2 + 1 * SIZE], a4
  642. LDF [Y1 + 0 * SIZE], y1
  643. LDF [Y1 + 1 * SIZE], y2
  644. FMUL a1, x1, t1
  645. FMUL a1, x2, t2
  646. FMUL a2, x2, t3
  647. FMUL a2, x1, t4
  648. FADD y1, t1, y1
  649. FMUL a3, x3, t1
  650. FADD y2, t2, y2
  651. FMUL a3, x4, t2
  652. FSUBX y1, t3, y1
  653. FMUL a4, x4, t3
  654. FADDX y2, t4, y2
  655. FMUL a4, x3, t4
  656. FADD y1, t1, y1
  657. FADD y2, t2, y2
  658. FSUBX y1, t3, y1
  659. FADDX y2, t4, y2
  660. STF y1, [Y1 + 0 * SIZE]
  661. STF y2, [Y1 + 1 * SIZE]
  /* End of one column pair: repeat from .LL21 while pairs remain. */
  662. .LL29:
  663. deccc J
  664. bg %icc, .LL21
  665. nop
  /* Leftover column when N is odd. If bit 0 of N is clear there is nothing */
  /* left to accumulate and control goes to the copy-back at .LL990. */
  666. .LL30:
  667. andcc N, 1, J
  668. ble,pn %icc, .LL990
  669. nop
  /* Set up the single column: Y1 restarts at the accumulation base, A1 points */
  /* at the column, alpha is reloaded from the stack and one entry of x is */
  /* read. X and A are not advanced because this is the last column. */
  670. .LL31:
  671. mov YY, Y1
  672. mov A, A1
  673. LDF STACK_ALPHA_R, ALPHA_R
  674. LDF STACK_ALPHA_I, ALPHA_I
  675. LDF [X + 0 * SIZE], x1
  676. LDF [X + 1 * SIZE], x2
  677. FMUL ALPHA_R, x1, a1 /* AC */
  678. FMUL ALPHA_I, x1, a2 /* AD */
  679. FMUL ALPHA_R, x2, a3 /* BC */
  680. FMUL ALPHA_I, x2, a4 /* BD */
  /* Combine into alpha * x in (x1, x2). XCONJ swaps the signs, matching the */
  /* column-pair path. */
  681. #ifndef XCONJ
  682. FSUB a1, a4, x1
  683. FADD a2, a3, x2
  684. #else
  685. FADD a1, a4, x1
  686. FSUB a2, a3, x2
  687. #endif
  /* Pipeline prologue for the single leftover column. I = M >> 2 groups of */
  /* four entries. With no full group, go to the tails at .LL37. */
  688. sra M, 2, I
  689. cmp I, 0
  690. ble,pn %icc, .LL37
  691. nop
  /* Preload the eight values of the first group from A and all eight */
  /* matching values of the result. */
  692. LDF [A1 + 0 * SIZE], a1
  693. LDF [A1 + 1 * SIZE], a2
  694. LDF [A1 + 2 * SIZE], a3
  695. LDF [A1 + 3 * SIZE], a4
  696. LDF [A1 + 4 * SIZE], a9
  697. LDF [A1 + 5 * SIZE], a10
  698. LDF [A1 + 6 * SIZE], a11
  699. LDF [A1 + 7 * SIZE], a12
  700. LDF [Y1 + 0 * SIZE], y1
  701. LDF [Y1 + 1 * SIZE], y2
  702. LDF [Y1 + 2 * SIZE], y3
  703. LDF [Y1 + 3 * SIZE], y4
  704. LDF [Y1 + 4 * SIZE], y5
  705. LDF [Y1 + 5 * SIZE], y6
  706. LDF [Y1 + 6 * SIZE], y7
  707. LDF [Y1 + 7 * SIZE], y8
  /* Start the first products and immediately reload a1 and a3 with values */
  /* from the next group (offsets 8 and 10 read ahead of the current group). */
  708. FMUL a1, x1, t1
  709. deccc I
  710. FMUL a1, x2, t2
  711. LDF [A1 + 8 * SIZE], a1
  712. FMUL a3, x1, t3
  713. FMUL a3, x2, t4
  /* Only one group: .LL33 finishes it. The load of a3 is in the delay slot. */
  714. ble,pn %icc, .LL33
  715. LDF [A1 + 10 * SIZE], a3
  /* Steady-state loop for the single leftover column. Each pass completes and */
  /* stores the eight results of the current group, reloads a1..a4 and a9..a12 */
  /* with the next group's values as soon as their last use has been issued, */
  /* and reloads y1..y8 for the next group. A1 advances by 8 * SIZE part way */
  /* through, so the two loads after that add use offsets relative to the new */
  /* pointer. */
  716. .LL32:
  717. FADD y1, t1, y1
  718. prefetch [A1 + PREFETCHSIZE * SIZE], 1
  719. FMUL a2, x2, t1
  720. FADD y2, t2, y2
  721. FMUL a2, x1, t2
  722. LDF [A1 + 9 * SIZE], a2
  723. FADD y3, t3, y3
  724. FMUL a4, x2, t3
  725. FADD y4, t4, y4
  726. FMUL a4, x1, t4
  727. LDF [A1 + 11 * SIZE], a4
  728. FSUBX y1, t1, y1
  729. FMUL a9, x1, t1
  730. FADDX y2, t2, y2
  731. FMUL a9, x2, t2
  732. LDF [A1 + 12 * SIZE], a9
  733. FSUBX y3, t3, y3
  734. FMUL a11, x1, t3
  735. FADDX y4, t4, y4
  736. FMUL a11, x2, t4
  737. LDF [A1 + 14 * SIZE], a11
  738. STF y1, [Y1 + 0 * SIZE]
  739. STF y2, [Y1 + 1 * SIZE]
  740. STF y3, [Y1 + 2 * SIZE]
  741. STF y4, [Y1 + 3 * SIZE]
  742. FADD y5, t1, y5
  743. FMUL a10, x2, t1
  744. LDF [Y1 + 8 * SIZE], y1
  745. FADD y6, t2, y6
  746. FMUL a10, x1, t2
  747. LDF [A1 + 13 * SIZE], a10
  748. FADD y7, t3, y7
  749. deccc I
  750. FMUL a12, x2, t3
  751. LDF [Y1 + 9 * SIZE], y2
  752. FADD y8, t4, y8
  753. FMUL a12, x1, t4
  754. LDF [A1 + 15 * SIZE], a12
  755. FSUBX y5, t1, y5
  756. add A1, 8 * SIZE, A1
  757. FMUL a1, x1, t1
  758. LDF [Y1 + 10 * SIZE], y3
  759. FADDX y6, t2, y6
  760. FMUL a1, x2, t2
  761. LDF [A1 + 8 * SIZE], a1
  762. FSUBX y7, t3, y7
  763. FMUL a3, x1, t3
  764. LDF [Y1 + 11 * SIZE], y4
  765. FADDX y8, t4, y8
  766. FMUL a3, x2, t4
  767. LDF [A1 + 10 * SIZE], a3
  768. STF y5, [Y1 + 4 * SIZE]
  769. STF y6, [Y1 + 5 * SIZE]
  770. STF y7, [Y1 + 6 * SIZE]
  771. STF y8, [Y1 + 7 * SIZE]
  772. LDF [Y1 + 12 * SIZE], y5
  773. LDF [Y1 + 13 * SIZE], y6
  774. LDF [Y1 + 14 * SIZE], y7
  775. add Y1, 8 * SIZE, Y1
  /* Loop while groups remain (condition codes from the deccc above). The y8 */
  /* load in the delay slot already uses the advanced Y1. */
  776. bg,pn %icc, .LL32
  777. LDF [Y1 + 7 * SIZE], y8
  /* Final group of four entries for the single leftover column. y1..y8, */
  /* a2, a4, a9..a12 and the first products t1..t4 are already in registers, */
  /* so no loads are issued. The block completes the eight accumulations, */
  /* stores them and advances A1 and Y1 by 8 * SIZE. */
  778. .LL33:
  779. FADD y1, t1, y1
  780. FMUL a2, x2, t1
  781. FADD y2, t2, y2
  782. FMUL a2, x1, t2
  783. FADD y3, t3, y3
  784. FMUL a4, x2, t3
  785. FADD y4, t4, y4
  786. FMUL a4, x1, t4
  787. FSUBX y1, t1, y1
  788. FMUL a9, x1, t1
  789. FADDX y2, t2, y2
  790. FMUL a9, x2, t2
  791. FSUBX y3, t3, y3
  792. FMUL a11, x1, t3
  793. FADDX y4, t4, y4
  794. FMUL a11, x2, t4
  795. FADD y5, t1, y5
  796. FMUL a10, x2, t1
  797. FADD y6, t2, y6
  798. FMUL a10, x1, t2
  799. FADD y7, t3, y7
  800. FMUL a12, x2, t3
  801. FADD y8, t4, y8
  802. FMUL a12, x1, t4
  803. FSUBX y5, t1, y5
  804. FADDX y6, t2, y6
  805. FSUBX y7, t3, y7
  806. FADDX y8, t4, y8
  807. STF y1, [Y1 + 0 * SIZE]
  808. STF y2, [Y1 + 1 * SIZE]
  809. STF y3, [Y1 + 2 * SIZE]
  810. STF y4, [Y1 + 3 * SIZE]
  811. STF y5, [Y1 + 4 * SIZE]
  812. STF y6, [Y1 + 5 * SIZE]
  813. STF y7, [Y1 + 6 * SIZE]
  814. STF y8, [Y1 + 7 * SIZE]
  815. add A1, 8 * SIZE, A1
  816. add Y1, 8 * SIZE, Y1
  /* Tail for the single leftover column when bit 1 of M is set: two entries. */
  /*   y1 = y1 + a1*x1 FSUBX a2*x2,  y2 = y2 + a1*x2 FADDX a2*x1 */
  /* and likewise for (a3, a4) into (y3, y4). */
  817. .LL37:
  818. andcc M, 2, I
  819. ble,pn %icc, .LL38
  820. nop
  821. LDF [A1 + 0 * SIZE], a1
  822. LDF [A1 + 1 * SIZE], a2
  823. LDF [A1 + 2 * SIZE], a3
  824. LDF [A1 + 3 * SIZE], a4
  825. LDF [Y1 + 0 * SIZE], y1
  826. FMUL a1, x1, t1
  827. LDF [Y1 + 1 * SIZE], y2
  828. FMUL a1, x2, t2
  829. LDF [Y1 + 2 * SIZE], y3
  830. FMUL a3, x1, t3
  831. LDF [Y1 + 3 * SIZE], y4
  832. FMUL a3, x2, t4
  833. FADD y1, t1, y1
  834. FMUL a2, x2, t1
  835. FADD y2, t2, y2
  836. FMUL a2, x1, t2
  837. FADD y3, t3, y3
  838. FMUL a4, x2, t3
  839. FADD y4, t4, y4
  840. FMUL a4, x1, t4
  841. FSUBX y1, t1, y1
  842. FADDX y2, t2, y2
  843. FSUBX y3, t3, y3
  844. FADDX y4, t4, y4
  845. STF y1, [Y1 + 0 * SIZE]
  846. STF y2, [Y1 + 1 * SIZE]
  847. STF y3, [Y1 + 2 * SIZE]
  848. STF y4, [Y1 + 3 * SIZE]
  849. add A1, 4 * SIZE, A1
  850. add Y1, 4 * SIZE, Y1
  /* Tail for the single leftover column when M is odd: one last entry. */
  /*   y1 = y1 + a1*x1 FSUBX a2*x2,  y2 = y2 + a1*x2 FADDX a2*x1 */
  /* The pointers are not advanced because nothing follows in this column. */
  851. .LL38:
  852. andcc M, 1, I
  853. ble,pn %icc, .LL990
  854. nop
  855. LDF [A1 + 0 * SIZE], a1
  856. LDF [A1 + 1 * SIZE], a2
  857. LDF [Y1 + 0 * SIZE], y1
  858. LDF [Y1 + 1 * SIZE], y2
  859. FMUL a1, x1, t1
  860. FMUL a1, x2, t2
  861. FMUL a2, x2, t3
  862. FMUL a2, x1, t4
  863. FADD y1, t1, y1
  864. FADD y2, t2, y2
  865. FSUBX y1, t3, y1
  866. FADDX y2, t4, y2
  867. STF y1, [Y1 + 0 * SIZE]
  868. STF y2, [Y1 + 1 * SIZE]
  /* Copy-back stage. When the scaled INCY equals 2 * SIZE the sums were */
  /* accumulated directly into Y and the routine returns. Otherwise the sums */
  /* sit in BUFFER and are added to the strided Y here. Y is the read pointer, */
  /* Y1 (set in the branch delay slot) is the write pointer that trails it. */
  869. .LL990:
  870. cmp INCY, 2 * SIZE
  871. be %icc, .LL999
  872. mov Y, Y1
  873. sra M, 2, I
  874. cmp I, 0
  875. ble,pn %icc, .LL995
  876. nop
  /* Four entries per pass: read four pairs from BUFFER (a1..a8) and four */
  /* pairs from Y at stride INCY (y1..y8), add, and write back through Y1. */
  877. .LL991:
  878. LDF [BUFFER + 0 * SIZE], a1
  879. LDF [BUFFER + 1 * SIZE], a2
  880. LDF [Y + 0 * SIZE], y1
  881. LDF [Y + 1 * SIZE], y2
  882. add Y, INCY, Y
  883. LDF [BUFFER + 2 * SIZE], a3
  884. LDF [BUFFER + 3 * SIZE], a4
  885. LDF [Y + 0 * SIZE], y3
  886. LDF [Y + 1 * SIZE], y4
  887. add Y, INCY, Y
  888. LDF [BUFFER + 4 * SIZE], a5
  889. LDF [BUFFER + 5 * SIZE], a6
  890. LDF [Y + 0 * SIZE], y5
  891. LDF [Y + 1 * SIZE], y6
  892. add Y, INCY, Y
  893. LDF [BUFFER + 6 * SIZE], a7
  894. LDF [BUFFER + 7 * SIZE], a8
  895. LDF [Y + 0 * SIZE], y7
  896. LDF [Y + 1 * SIZE], y8
  897. add Y, INCY, Y
  898. FADD y1, a1, y1
  899. FADD y2, a2, y2
  900. FADD y3, a3, y3
  901. FADD y4, a4, y4
  902. FADD y5, a5, y5
  903. FADD y6, a6, y6
  904. FADD y7, a7, y7
  905. FADD y8, a8, y8
  906. STF y1, [Y1 + 0 * SIZE]
  907. STF y2, [Y1 + 1 * SIZE]
  908. add Y1, INCY, Y1
  909. STF y3, [Y1 + 0 * SIZE]
  910. STF y4, [Y1 + 1 * SIZE]
  911. add Y1, INCY, Y1
  912. STF y5, [Y1 + 0 * SIZE]
  913. STF y6, [Y1 + 1 * SIZE]
  914. add Y1, INCY, Y1
  915. STF y7, [Y1 + 0 * SIZE]
  916. STF y8, [Y1 + 1 * SIZE]
  917. add Y1, INCY, Y1
  /* BUFFER is advanced in the branch delay slot, on every pass. */
  918. deccc I
  919. bg,pn %icc, .LL991
  920. add BUFFER, 8 * SIZE, BUFFER
  /* Copy-back tail when bit 1 of M is set: add two more entries from BUFFER */
  /* to the strided Y, then step BUFFER past them. */
  921. .LL995:
  922. andcc M, 2, I
  923. ble,pn %icc, .LL996
  924. nop
  925. LDF [BUFFER + 0 * SIZE], a1
  926. LDF [BUFFER + 1 * SIZE], a2
  927. LDF [Y + 0 * SIZE], y1
  928. LDF [Y + 1 * SIZE], y2
  929. add Y, INCY, Y
  930. LDF [BUFFER + 2 * SIZE], a3
  931. LDF [BUFFER + 3 * SIZE], a4
  932. LDF [Y + 0 * SIZE], y3
  933. LDF [Y + 1 * SIZE], y4
  934. add Y, INCY, Y
  935. FADD y1, a1, y1
  936. FADD y2, a2, y2
  937. FADD y3, a3, y3
  938. FADD y4, a4, y4
  939. STF y1, [Y1 + 0 * SIZE]
  940. STF y2, [Y1 + 1 * SIZE]
  941. add Y1, INCY, Y1
  942. STF y3, [Y1 + 0 * SIZE]
  943. STF y4, [Y1 + 1 * SIZE]
  944. add Y1, INCY, Y1
  945. add BUFFER, 4 * SIZE, BUFFER
  /* Copy-back tail when M is odd: add the last entry from BUFFER to Y. */
  946. .LL996:
  947. andcc M, 1, I
  948. ble,pn %icc, .LL999
  949. nop
  950. LDF [BUFFER + 0 * SIZE], a1
  951. LDF [BUFFER + 1 * SIZE], a2
  952. LDF [Y + 0 * SIZE], y1
  953. LDF [Y + 1 * SIZE], y2
  954. FADD y1, a1, y1
  955. FADD y2, a2, y2
  956. STF y1, [Y1 + 0 * SIZE]
  957. STF y2, [Y1 + 1 * SIZE]
  /* Common exit, also the target of the early outs for non-positive M or N. */
  /* The clr in the delay slot of the return zeroes %o0, presumably the */
  /* value handed back to the caller - TODO confirm against the C prototype. */
  958. .LL999:
  959. return %i7 + 8
  960. clr %o0
  961. EPILOGUE