You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zgemv_t.S 28 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Preprocessor setup for this routine: symbolic names for the integer */
  /* and floating-point registers, the stack slots that hold alpha, and  */
  /* the prefetch distance. The body below uses only these names.        */
  /* ASSEMBLER is defined before including common.h; presumably it       */
  /* selects the assembly-side parts of that header - TODO confirm.      */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Block length: at .LL10 MIN_M is set to M - IS and then clamped to   */
  /* at most P.                                                          */
  40. #define P 4000
  /* Incoming-argument registers. N is consumed four LDA-strided vectors */
  /* at a time (J = N >> 2 at .LL100); M bounds the element count of     */
  /* each pass via MIN_M.                                                */
  41. #define M %i0
  42. #define N %i1
  /* A is reloaded from the stack only in the 32-bit DOUBLE prologue     */
  /* path; in the other paths it keeps whatever arrived in %i5.          */
  43. #define A %i5
  44. #define LDA %i2
  /* In the 32-bit prologue %i3/%i4 are first spilled to the alpha stack */
  /* slots and only then reloaded as X/INCX.                             */
  45. #define X %i3
  46. #define INCX %i4
  /* Y, INCY and BUFFER are loaded from stack slots in the prologue.     */
  47. #define Y %l0
  48. #define INCY %l1
  49. #define BUFFER %l2
  /* I: inner loop counter; IS: offset into M, cleared before .LL10;     */
  /* J: outer loop counter (also a scratch copy of P at .LL10).          */
  50. #define I %l3
  51. #define IS %l4
  52. #define J %l5
  /* MIN_M = min(M - IS, P), computed at .LL10.                          */
  53. #define MIN_M %l6
  /* XP: base of the x data the kernels read - X + offset when           */
  /* INCX == 2 * SIZE, otherwise BUFFER (filled by .LL11/.LL16).         */
  54. #define XP %l7
  /* A1..A4: four pointers spaced LDA apart, starting at A (.LL110).     */
  55. #define A1 %o0
  56. #define A2 %o1
  57. #define A3 %o2
  58. #define A4 %o3
  /* X1: walking pointer over the x data, initialised from XP.           */
  59. #define X1 %o4
  /* Y1: walking pointer - BUFFER write cursor in the copy loops, then   */
  /* the y read cursor in .LL119.                                        */
  60. #define Y1 %o5
  /* PNLDA is set to (P << ZBASE_SHIFT) - LDA * N after the prologue;    */
  /* its use is not visible in this part of the file.                    */
  61. #define PNLDA %g1
  /* Y2: y write cursor in .LL119 (copied from Y1 before Y1 advances).   */
  /* NOTE(review): %o7 is conventionally the call return-address         */
  /* register on SPARC, presumably why the author flagged it - confirm   */
  /* that nothing here needs it preserved.                               */
  62. #define Y2 %o7 /* Danger? */
  /* Floating-point aliases. DOUBLE uses the even-numbered registers     */
  /* %f0..%f62; the other build uses %f0..%f31 consecutively.            */
  /*   t1..t4  : FMUL results, folded into the accumulators one step     */
  /*             later                                                   */
  /*   c1..c16 : accumulators (FADD/FADDX cN, tN, cN)                    */
  /*   a1..a8  : values loaded via A1..A4; also reused as staging in the */
  /*             x copy loops and for the y values in .LL119             */
  /*   b1..b4  : x values loaded via X1; b3/b4 are reloaded with         */
  /*             ALPHA_R/ALPHA_I at .LL115                               */
  63. #ifdef DOUBLE
  64. #define t1 %f0
  65. #define t2 %f2
  66. #define t3 %f4
  67. #define t4 %f6
  68. #define c1 %f8
  69. #define c2 %f10
  70. #define c3 %f12
  71. #define c4 %f14
  72. #define c5 %f16
  73. #define c6 %f18
  74. #define c7 %f20
  75. #define c8 %f22
  76. #define c9 %f24
  77. #define c10 %f26
  78. #define c11 %f28
  79. #define c12 %f30
  80. #define c13 %f32
  81. #define c14 %f34
  82. #define c15 %f36
  83. #define c16 %f38
  84. #define a1 %f40
  85. #define a2 %f42
  86. #define a3 %f44
  87. #define a4 %f46
  88. #define a5 %f48
  89. #define a6 %f50
  90. #define a7 %f52
  91. #define a8 %f54
  92. #define b1 %f56
  93. #define b2 %f58
  94. #define b3 %f60
  95. #define b4 %f62
  96. #else
  97. #define t1 %f0
  98. #define t2 %f1
  99. #define t3 %f2
  100. #define t4 %f3
  101. #define c1 %f4
  102. #define c2 %f5
  103. #define c3 %f6
  104. #define c4 %f7
  105. #define c5 %f8
  106. #define c6 %f9
  107. #define c7 %f10
  108. #define c8 %f11
  109. #define c9 %f12
  110. #define c10 %f13
  111. #define c11 %f14
  112. #define c12 %f15
  113. #define c13 %f16
  114. #define c14 %f17
  115. #define c15 %f18
  116. #define c16 %f19
  117. #define a1 %f20
  118. #define a2 %f21
  119. #define a3 %f22
  120. #define a4 %f23
  121. #define a5 %f24
  122. #define a6 %f25
  123. #define a7 %f26
  124. #define a8 %f27
  125. #define b1 %f28
  126. #define b2 %f29
  127. #define b3 %f30
  128. #define b4 %f31
  129. #endif
  /* Stack slots holding the two alpha components. The prologue stores   */
  /* alpha there (from integer registers in the 32-bit build, from FP    */
  /* registers in the 64-bit build); .LL115 loads them back into b3/b4.  */
  /* Offsets depend on the ABI width and, in 32-bit mode, on precision.  */
  130. #ifndef __64BIT__
  131. #define ALPHA_R [%sp + STACK_START + 16]
  132. #ifndef DOUBLE
  133. #define ALPHA_I [%sp + STACK_START + 20]
  134. #else
  135. #define ALPHA_I [%sp + STACK_START + 24]
  136. #endif
  137. #else
  138. #define ALPHA_R [%sp + STACK_START + 32]
  139. #define ALPHA_I [%sp + STACK_START + 40]
  140. #endif
  /* Prefetch look-ahead, in units of SIZE, added to A1..A4 in the       */
  /* unrolled loops (prefetch [An + PREFETCHSIZE * SIZE], 1).            */
  141. #ifdef DOUBLE
  142. #define PREFETCHSIZE 18
  143. #else
  144. #define PREFETCHSIZE 36
  145. #endif
  146. PROLOGUE
  147. SAVESP
  148. nop
  149. #ifndef __64BIT__
  150. #ifdef DOUBLE
  151. st %i3, [%sp + STACK_START + 16] /* ALPHA_R */
  152. st %i4, [%sp + STACK_START + 20]
  153. st %i5, [%sp + STACK_START + 24] /* ALPHA_I */
  154. ld [%sp + STACK_START + 32], A
  155. ld [%sp + STACK_START + 36], LDA
  156. ld [%sp + STACK_START + 40], X
  157. ld [%sp + STACK_START + 44], INCX
  158. ld [%sp + STACK_START + 48], Y
  159. ld [%sp + STACK_START + 52], INCY
  160. ld [%sp + STACK_START + 56], BUFFER
  161. #else
  162. st %i3, [%sp + STACK_START + 16] /* ALPHA_R */
  163. st %i4, [%sp + STACK_START + 20] /* ALPHA_I */
  164. ld [%sp + STACK_START + 28], LDA
  165. ld [%sp + STACK_START + 32], X
  166. ld [%sp + STACK_START + 36], INCX
  167. ld [%sp + STACK_START + 40], Y
  168. ld [%sp + STACK_START + 44], INCY
  169. ld [%sp + STACK_START + 48], BUFFER
  170. #endif
  171. #else
  172. ldx [%sp + STACK_START + 56], LDA
  173. ldx [%sp + STACK_START + 64], X
  174. ldx [%sp + STACK_START + 72], INCX
  175. ldx [%sp + STACK_START + 80], Y
  176. ldx [%sp + STACK_START + 88], INCY
  177. ldx [%sp + STACK_START + 96], BUFFER
  178. #ifdef DOUBLE
  179. std %f6, ALPHA_R
  180. std %f8, ALPHA_I
  181. #else
  182. st %f7, ALPHA_R
  183. st %f9, ALPHA_I
  184. #endif
  185. #endif
  186. clr IS
  187. mov P, I
  188. sll LDA, ZBASE_SHIFT, LDA
  189. sll I, ZBASE_SHIFT, I
  190. smul LDA, N, PNLDA
  191. sll INCX, ZBASE_SHIFT, INCX
  192. sll INCY, ZBASE_SHIFT, INCY
  193. sub I, PNLDA, PNLDA
  194. .LL10:
  195. sll IS, ZBASE_SHIFT, I
  196. sub M, IS, MIN_M
  197. mov P, J
  198. cmp MIN_M, J
  199. nop
  200. movg %icc, J, MIN_M
  201. nop
  202. cmp INCX, 2 * SIZE
  203. beq .LL100
  204. add X, I, XP
  205. sra MIN_M, 2, I
  206. mov BUFFER, XP
  207. cmp I, 0
  208. ble,pn %icc, .LL15
  209. mov BUFFER, Y1
  210. .LL11:
  211. LDF [X + 0 * SIZE], a1
  212. LDF [X + 1 * SIZE], a2
  213. add X, INCX, X
  214. LDF [X + 0 * SIZE], a3
  215. LDF [X + 1 * SIZE], a4
  216. add X, INCX, X
  217. LDF [X + 0 * SIZE], a5
  218. LDF [X + 1 * SIZE], a6
  219. add X, INCX, X
  220. LDF [X + 0 * SIZE], a7
  221. LDF [X + 1 * SIZE], a8
  222. add X, INCX, X
  223. STF a1, [Y1 + 0 * SIZE]
  224. add I, -1, I
  225. STF a2, [Y1 + 1 * SIZE]
  226. cmp I, 0
  227. STF a3, [Y1 + 2 * SIZE]
  228. STF a4, [Y1 + 3 * SIZE]
  229. STF a5, [Y1 + 4 * SIZE]
  230. STF a6, [Y1 + 5 * SIZE]
  231. STF a7, [Y1 + 6 * SIZE]
  232. STF a8, [Y1 + 7 * SIZE]
  233. bg,pn %icc, .LL11
  234. add Y1, 8 * SIZE, Y1
  235. .LL15:
  236. and MIN_M, 3, I
  237. cmp I, 0
  238. ble,pn %icc, .LL100
  239. nop
  240. .LL16:
  241. LDF [X + 0 * SIZE], a1
  242. LDF [X + 1 * SIZE], a2
  243. add X, INCX, X
  244. add I, -1, I
  245. cmp I, 0
  246. nop
  247. STF a1, [Y1 + 0 * SIZE]
  248. STF a2, [Y1 + 1 * SIZE]
  249. bg,pn %icc, .LL16
  250. add Y1, 2 * SIZE, Y1
  251. .LL100:
  252. sra N, 2, J
  253. cmp J, 0
  254. ble %icc, .LL200
  255. mov Y, Y1
  256. .LL110:
  257. FCLR(0)
  258. FMOV t1, c1
  259. sra MIN_M, 2, I
  260. FMOV t1, c2
  261. add A, LDA, A2
  262. FMOV t1, c3
  263. mov A, A1
  264. FMOV t1, c4
  265. add A2, LDA, A3
  266. FMOV t1, c5
  267. FMOV t1, c6
  268. FMOV t1, c7
  269. FMOV t1, c8
  270. FMOV t1, c9
  271. FMOV t1, c10
  272. FMOV t1, c11
  273. FMOV t1, c12
  274. FMOV t1, c13
  275. FMOV t1, c14
  276. FMOV t1, c15
  277. FMOV t1, c16
  278. add A3, LDA, A4
  279. FMOV t1, t2
  280. mov XP, X1
  281. FMOV t1, t3
  282. add A4, LDA, A
  283. cmp I, 0
  284. ble %icc, .LL115
  285. FMOV t1, t4
  286. LDF [A1 + 0 * SIZE], a1
  287. nop
  288. LDF [A1 + 1 * SIZE], a2
  289. add A1, 2 * SIZE, A1
  290. LDF [A2 + 0 * SIZE], a3
  291. LDF [A2 + 1 * SIZE], a4
  292. add A2, 2 * SIZE, A2
  293. LDF [A3 + 0 * SIZE], a5
  294. LDF [A3 + 1 * SIZE], a6
  295. add A3, 2 * SIZE, A3
  296. LDF [A4 + 0 * SIZE], a7
  297. LDF [A4 + 1 * SIZE], a8
  298. add A4, 2 * SIZE, A4
  299. LDF [X1 + 0 * SIZE], b1
  300. nop
  301. LDF [X1 + 1 * SIZE], b2
  302. nop
  303. LDF [X1 + 2 * SIZE], b3
  304. add X1, 4 * SIZE, X1
  305. deccc I
  306. ble .LL112
  307. prefetch [Y1 + 7 * SIZE], 2
  308. #ifndef XCONJ
  309. #define FADDX FADD
  310. #else
  311. #define FADDX FSUB
  312. #endif
  313. .LL111:
  314. FADD c13, t1, c13
  315. prefetch [A1 + PREFETCHSIZE * SIZE], 1
  316. FMUL a1, b1, t1
  317. nop
  318. FADDX c14, t2, c14
  319. nop
  320. FMUL a1, b2, t2
  321. LDF [A1 + 0 * SIZE], a1
  322. FADD c15, t3, c15
  323. nop
  324. FMUL a2, b1, t3
  325. LDF [X1 - 1 * SIZE], b4
  326. FADD c16, t4, c16
  327. nop
  328. FMUL a2, b2, t4
  329. LDF [A1 + 1 * SIZE], a2
  330. FADD c1, t1, c1
  331. nop
  332. FMUL a3, b1, t1
  333. nop
  334. FADDX c2, t2, c2
  335. nop
  336. FMUL a3, b2, t2
  337. LDF [A2 + 0 * SIZE], a3
  338. FADD c3, t3, c3
  339. nop
  340. FMUL a4, b1, t3
  341. nop
  342. FADD c4, t4, c4
  343. nop
  344. FMUL a4, b2, t4
  345. LDF [A2 + 1 * SIZE], a4
  346. FADD c5, t1, c5
  347. nop
  348. FMUL a5, b1, t1
  349. nop
  350. FADDX c6, t2, c6
  351. nop
  352. FMUL a5, b2, t2
  353. LDF [A3 + 0 * SIZE], a5
  354. FADD c7, t3, c7
  355. nop
  356. FMUL a6, b1, t3
  357. nop
  358. FADD c8, t4, c8
  359. nop
  360. FMUL a6, b2, t4
  361. LDF [A3 + 1 * SIZE], a6
  362. FADD c9, t1, c9
  363. nop
  364. FMUL a7, b1, t1
  365. nop
  366. FADDX c10, t2, c10
  367. nop
  368. FMUL a7, b2, t2
  369. LDF [A4 + 0 * SIZE], a7
  370. FADD c11, t3, c11
  371. nop
  372. FMUL a8, b1, t3
  373. LDF [X1 + 0 * SIZE], b1
  374. FADD c12, t4, c12
  375. nop
  376. FMUL a8, b2, t4
  377. LDF [A4 + 1 * SIZE], a8
  378. FADD c13, t1, c13
  379. nop
  380. FMUL a1, b3, t1
  381. prefetch [A2 + PREFETCHSIZE * SIZE], 1
  382. FADDX c14, t2, c14
  383. nop
  384. FMUL a1, b4, t2
  385. LDF [A1 + 2 * SIZE], a1
  386. FADD c15, t3, c15
  387. nop
  388. FMUL a2, b3, t3
  389. LDF [X1 + 1 * SIZE], b2
  390. FADD c16, t4, c16
  391. nop
  392. FMUL a2, b4, t4
  393. LDF [A1 + 3 * SIZE], a2
  394. FADD c1, t1, c1
  395. nop
  396. FMUL a3, b3, t1
  397. nop
  398. FADDX c2, t2, c2
  399. nop
  400. FMUL a3, b4, t2
  401. LDF [A2 + 2 * SIZE], a3
  402. FADD c3, t3, c3
  403. nop
  404. FMUL a4, b3, t3
  405. nop
  406. FADD c4, t4, c4
  407. nop
  408. FMUL a4, b4, t4
  409. LDF [A2 + 3 * SIZE], a4
  410. FADD c5, t1, c5
  411. nop
  412. FMUL a5, b3, t1
  413. nop
  414. FADDX c6, t2, c6
  415. nop
  416. FMUL a5, b4, t2
  417. LDF [A3 + 2 * SIZE], a5
  418. FADD c7, t3, c7
  419. nop
  420. FMUL a6, b3, t3
  421. nop
  422. FADD c8, t4, c8
  423. nop
  424. FMUL a6, b4, t4
  425. LDF [A3 + 3 * SIZE], a6
  426. FADD c9, t1, c9
  427. nop
  428. FMUL a7, b3, t1
  429. nop
  430. FADDX c10, t2, c10
  431. nop
  432. FMUL a7, b4, t2
  433. LDF [A4 + 2 * SIZE], a7
  434. FADD c11, t3, c11
  435. nop
  436. FMUL a8, b3, t3
  437. LDF [X1 + 2 * SIZE], b3
  438. FADD c12, t4, c12
  439. nop
  440. FMUL a8, b4, t4
  441. LDF [A4 + 3 * SIZE], a8
  442. FADD c13, t1, c13
  443. prefetch [A3 + PREFETCHSIZE * SIZE], 1
  444. FMUL a1, b1, t1
  445. nop
  446. FADDX c14, t2, c14
  447. nop
  448. FMUL a1, b2, t2
  449. LDF [A1 + 4 * SIZE], a1
  450. FADD c15, t3, c15
  451. nop
  452. FMUL a2, b1, t3
  453. LDF [X1 + 3 * SIZE], b4
  454. FADD c16, t4, c16
  455. nop
  456. FMUL a2, b2, t4
  457. LDF [A1 + 5 * SIZE], a2
  458. FADD c1, t1, c1
  459. nop
  460. FMUL a3, b1, t1
  461. nop
  462. FADDX c2, t2, c2
  463. nop
  464. FMUL a3, b2, t2
  465. LDF [A2 + 4 * SIZE], a3
  466. FADD c3, t3, c3
  467. nop
  468. FMUL a4, b1, t3
  469. nop
  470. FADD c4, t4, c4
  471. nop
  472. FMUL a4, b2, t4
  473. LDF [A2 + 5 * SIZE], a4
  474. FADD c5, t1, c5
  475. nop
  476. FMUL a5, b1, t1
  477. nop
  478. FADDX c6, t2, c6
  479. nop
  480. FMUL a5, b2, t2
  481. LDF [A3 + 4 * SIZE], a5
  482. FADD c7, t3, c7
  483. deccc I
  484. FMUL a6, b1, t3
  485. nop
  486. FADD c8, t4, c8
  487. nop
  488. FMUL a6, b2, t4
  489. LDF [A3 + 5 * SIZE], a6
  490. FADD c9, t1, c9
  491. nop
  492. FMUL a7, b1, t1
  493. nop
  494. FADDX c10, t2, c10
  495. nop
  496. FMUL a7, b2, t2
  497. LDF [A4 + 4 * SIZE], a7
  498. FADD c11, t3, c11
  499. nop
  500. FMUL a8, b1, t3
  501. LDF [X1 + 4 * SIZE], b1
  502. FADD c12, t4, c12
  503. nop
  504. FMUL a8, b2, t4
  505. LDF [A4 + 5 * SIZE], a8
  506. FADD c13, t1, c13
  507. prefetch [A4 + PREFETCHSIZE * SIZE], 1
  508. FMUL a1, b3, t1
  509. nop
  510. FADDX c14, t2, c14
  511. nop
  512. FMUL a1, b4, t2
  513. LDF [A1 + 6 * SIZE], a1
  514. FADD c15, t3, c15
  515. nop
  516. FMUL a2, b3, t3
  517. LDF [X1 + 5 * SIZE], b2
  518. FADD c16, t4, c16
  519. nop
  520. FMUL a2, b4, t4
  521. LDF [A1 + 7 * SIZE], a2
  522. FADD c1, t1, c1
  523. add A1, 8 * SIZE, A1
  524. FMUL a3, b3, t1
  525. nop
  526. FADDX c2, t2, c2
  527. nop
  528. FMUL a3, b4, t2
  529. LDF [A2 + 6 * SIZE], a3
  530. FADD c3, t3, c3
  531. nop
  532. FMUL a4, b3, t3
  533. nop
  534. FADD c4, t4, c4
  535. nop
  536. FMUL a4, b4, t4
  537. LDF [A2 + 7 * SIZE], a4
  538. FADD c5, t1, c5
  539. add A2, 8 * SIZE, A2
  540. FMUL a5, b3, t1
  541. nop
  542. FADDX c6, t2, c6
  543. nop
  544. FMUL a5, b4, t2
  545. LDF [A3 + 6 * SIZE], a5
  546. FADD c7, t3, c7
  547. add A4, 8 * SIZE, A4
  548. FMUL a6, b3, t3
  549. nop
  550. FADD c8, t4, c8
  551. nop
  552. FMUL a6, b4, t4
  553. LDF [A3 + 7 * SIZE], a6
  554. FADD c9, t1, c9
  555. add A3, 8 * SIZE, A3
  556. FMUL a7, b3, t1
  557. nop
  558. FADDX c10, t2, c10
  559. add X1, 8 * SIZE, X1
  560. FMUL a7, b4, t2
  561. LDF [A4 - 2 * SIZE], a7
  562. FADD c11, t3, c11
  563. nop
  564. FMUL a8, b3, t3
  565. LDF [X1 - 2 * SIZE], b3
  566. FADD c12, t4, c12
  567. FMUL a8, b4, t4
  568. bg,pn %icc, .LL111
  569. LDF [A4 - 1 * SIZE], a8
  570. .LL112:
  571. FADD c13, t1, c13
  572. nop
  573. FMUL a1, b1, t1
  574. LDF [X1 - 1 * SIZE], b4
  575. FADDX c14, t2, c14
  576. nop
  577. FMUL a1, b2, t2
  578. LDF [A1 + 0 * SIZE], a1
  579. FADD c15, t3, c15
  580. nop
  581. FMUL a2, b1, t3
  582. LDF [X1 - 1 * SIZE], b4
  583. FADD c16, t4, c16
  584. nop
  585. FMUL a2, b2, t4
  586. LDF [A1 + 1 * SIZE], a2
  587. FADD c1, t1, c1
  588. nop
  589. FMUL a3, b1, t1
  590. nop
  591. FADDX c2, t2, c2
  592. nop
  593. FMUL a3, b2, t2
  594. LDF [A2 + 0 * SIZE], a3
  595. FADD c3, t3, c3
  596. nop
  597. FMUL a4, b1, t3
  598. nop
  599. FADD c4, t4, c4
  600. nop
  601. FMUL a4, b2, t4
  602. LDF [A2 + 1 * SIZE], a4
  603. FADD c5, t1, c5
  604. nop
  605. FMUL a5, b1, t1
  606. nop
  607. FADDX c6, t2, c6
  608. nop
  609. FMUL a5, b2, t2
  610. LDF [A3 + 0 * SIZE], a5
  611. FADD c7, t3, c7
  612. nop
  613. FMUL a6, b1, t3
  614. nop
  615. FADD c8, t4, c8
  616. nop
  617. FMUL a6, b2, t4
  618. LDF [A3 + 1 * SIZE], a6
  619. FADD c9, t1, c9
  620. nop
  621. FMUL a7, b1, t1
  622. nop
  623. FADDX c10, t2, c10
  624. nop
  625. FMUL a7, b2, t2
  626. LDF [A4 + 0 * SIZE], a7
  627. FADD c11, t3, c11
  628. nop
  629. FMUL a8, b1, t3
  630. LDF [X1 + 0 * SIZE], b1
  631. FADD c12, t4, c12
  632. nop
  633. FMUL a8, b2, t4
  634. LDF [A4 + 1 * SIZE], a8
  635. FADD c13, t1, c13
  636. nop
  637. FMUL a1, b3, t1
  638. LDF [X1 + 1 * SIZE], b2
  639. FADDX c14, t2, c14
  640. nop
  641. FMUL a1, b4, t2
  642. LDF [A1 + 2 * SIZE], a1
  643. FADD c15, t3, c15
  644. nop
  645. FMUL a2, b3, t3
  646. nop
  647. FADD c16, t4, c16
  648. nop
  649. FMUL a2, b4, t4
  650. LDF [A1 + 3 * SIZE], a2
  651. FADD c1, t1, c1
  652. nop
  653. FMUL a3, b3, t1
  654. nop
  655. FADDX c2, t2, c2
  656. nop
  657. FMUL a3, b4, t2
  658. LDF [A2 + 2 * SIZE], a3
  659. FADD c3, t3, c3
  660. nop
  661. FMUL a4, b3, t3
  662. nop
  663. FADD c4, t4, c4
  664. nop
  665. FMUL a4, b4, t4
  666. LDF [A2 + 3 * SIZE], a4
  667. FADD c5, t1, c5
  668. nop
  669. FMUL a5, b3, t1
  670. nop
  671. FADDX c6, t2, c6
  672. nop
  673. FMUL a5, b4, t2
  674. LDF [A3 + 2 * SIZE], a5
  675. FADD c7, t3, c7
  676. nop
  677. FMUL a6, b3, t3
  678. nop
  679. FADD c8, t4, c8
  680. nop
  681. FMUL a6, b4, t4
  682. LDF [A3 + 3 * SIZE], a6
  683. FADD c9, t1, c9
  684. nop
  685. FMUL a7, b3, t1
  686. nop
  687. FADDX c10, t2, c10
  688. nop
  689. FMUL a7, b4, t2
  690. LDF [A4 + 2 * SIZE], a7
  691. FADD c11, t3, c11
  692. nop
  693. FMUL a8, b3, t3
  694. LDF [X1 + 2 * SIZE], b3
  695. FADD c12, t4, c12
  696. nop
  697. FMUL a8, b4, t4
  698. LDF [A4 + 3 * SIZE], a8
  699. FADD c13, t1, c13
  700. nop
  701. FMUL a1, b1, t1
  702. LDF [X1 + 3 * SIZE], b4
  703. FADDX c14, t2, c14
  704. add X1, 4 * SIZE, X1
  705. FMUL a1, b2, t2
  706. LDF [A1 + 4 * SIZE], a1
  707. FADD c15, t3, c15
  708. nop
  709. FMUL a2, b1, t3
  710. nop
  711. FADD c16, t4, c16
  712. nop
  713. FMUL a2, b2, t4
  714. LDF [A1 + 5 * SIZE], a2
  715. FADD c1, t1, c1
  716. add A1, 6 * SIZE, A1
  717. FMUL a3, b1, t1
  718. nop
  719. FADDX c2, t2, c2
  720. nop
  721. FMUL a3, b2, t2
  722. LDF [A2 + 4 * SIZE], a3
  723. FADD c3, t3, c3
  724. nop
  725. FMUL a4, b1, t3
  726. nop
  727. FADD c4, t4, c4
  728. nop
  729. FMUL a4, b2, t4
  730. LDF [A2 + 5 * SIZE], a4
  731. FADD c5, t1, c5
  732. add A2, 6 * SIZE, A2
  733. FMUL a5, b1, t1
  734. nop
  735. FADDX c6, t2, c6
  736. nop
  737. FMUL a5, b2, t2
  738. LDF [A3 + 4 * SIZE], a5
  739. FADD c7, t3, c7
  740. nop
  741. FMUL a6, b1, t3
  742. nop
  743. FADD c8, t4, c8
  744. nop
  745. FMUL a6, b2, t4
  746. LDF [A3 + 5 * SIZE], a6
  747. FADD c9, t1, c9
  748. add A3, 6 * SIZE, A3
  749. FMUL a7, b1, t1
  750. nop
  751. FADDX c10, t2, c10
  752. nop
  753. FMUL a7, b2, t2
  754. LDF [A4 + 4 * SIZE], a7
  755. FADD c11, t3, c11
  756. nop
  757. FMUL a8, b1, t3
  758. nop
  759. FADD c12, t4, c12
  760. nop
  761. FMUL a8, b2, t4
  762. LDF [A4 + 5 * SIZE], a8
  763. FADD c13, t1, c13
  764. add A4, 6 * SIZE, A4
  765. FMUL a1, b3, t1
  766. nop
  767. FADDX c14, t2, c14
  768. nop
  769. FMUL a1, b4, t2
  770. nop
  771. FADD c15, t3, c15
  772. FMUL a2, b3, t3
  773. FADD c16, t4, c16
  774. FMUL a2, b4, t4
  775. FADD c1, t1, c1
  776. FMUL a3, b3, t1
  777. FADDX c2, t2, c2
  778. FMUL a3, b4, t2
  779. FADD c3, t3, c3
  780. FMUL a4, b3, t3
  781. FADD c4, t4, c4
  782. FMUL a4, b4, t4
  783. FADD c5, t1, c5
  784. FMUL a5, b3, t1
  785. FADDX c6, t2, c6
  786. FMUL a5, b4, t2
  787. FADD c7, t3, c7
  788. FMUL a6, b3, t3
  789. FADD c8, t4, c8
  790. FMUL a6, b4, t4
  791. FADD c9, t1, c9
  792. FMUL a7, b3, t1
  793. FADDX c10, t2, c10
  794. FMUL a7, b4, t2
  795. FADD c11, t3, c11
  796. FMUL a8, b3, t3
  797. FADD c12, t4, c12
  798. FMUL a8, b4, t4
  799. .LL115:
  800. andcc MIN_M, 3, I
  801. LDF ALPHA_R, b3
  802. mov Y1, Y2
  803. ble,pn %icc, .LL119
  804. LDF ALPHA_I, b4
  805. .L116:
  806. LDF [A1 + 0 * SIZE], a1
  807. LDF [A1 + 1 * SIZE], a2
  808. add A1, 2 * SIZE, A1
  809. LDF [X1 + 0 * SIZE], b1
  810. LDF [X1 + 1 * SIZE], b2
  811. add X1, 2 * SIZE, X1
  812. LDF [A2 + 0 * SIZE], a3
  813. LDF [A2 + 1 * SIZE], a4
  814. add A2, 2 * SIZE, A2
  815. LDF [A3 + 0 * SIZE], a5
  816. LDF [A3 + 1 * SIZE], a6
  817. add A3, 2 * SIZE, A3
  818. LDF [A4 + 0 * SIZE], a7
  819. LDF [A4 + 1 * SIZE], a8
  820. add A4, 2 * SIZE, A4
  821. FADD c13, t1, c13
  822. FMUL a1, b1, t1
  823. FADDX c14, t2, c14
  824. FMUL a1, b2, t2
  825. FADD c15, t3, c15
  826. FMUL a2, b1, t3
  827. FADD c16, t4, c16
  828. FMUL a2, b2, t4
  829. FADD c1, t1, c1
  830. FMUL a3, b1, t1
  831. FADDX c2, t2, c2
  832. FMUL a3, b2, t2
  833. FADD c3, t3, c3
  834. FMUL a4, b1, t3
  835. FADD c4, t4, c4
  836. FMUL a4, b2, t4
  837. FADD c5, t1, c5
  838. FMUL a5, b1, t1
  839. FADDX c6, t2, c6
  840. FMUL a5, b2, t2
  841. FADD c7, t3, c7
  842. FMUL a6, b1, t3
  843. FADD c8, t4, c8
  844. FMUL a6, b2, t4
  845. FADD c9, t1, c9
  846. FMUL a7, b1, t1
  847. FADDX c10, t2, c10
  848. FMUL a7, b2, t2
  849. FADD c11, t3, c11
  850. FMUL a8, b1, t3
  851. FADD c12, t4, c12
  852. FMUL a8, b2, t4
  853. deccc I
  854. bg %icc, .L116
  855. nop
  856. .LL119:
  857. FADD c13, t1, c13
  858. LDF [Y1 + 0 * SIZE], a1
  859. FADDX c14, t2, c14
  860. LDF [Y1 + 1 * SIZE] ,a2
  861. add Y1, INCY, Y1
  862. FADD c15, t3, c15
  863. LDF [Y1 + 0 * SIZE], a3
  864. FADD c16, t4, c16
  865. LDF [Y1 + 1 * SIZE] ,a4
  866. add Y1, INCY, Y1
  867. #if (!defined(XCONJ) && !defined(CONJ)) || (defined(XCONJ) && defined(CONJ))
  868. FSUB c1, c4, c1
  869. LDF [Y1 + 0 * SIZE], a5
  870. FSUB c5, c8, c5
  871. LDF [Y1 + 1 * SIZE] ,a6
  872. add Y1, INCY, Y1
  873. FSUB c9, c12, c9
  874. LDF [Y1 + 0 * SIZE], a7
  875. FSUB c13, c16, c13
  876. LDF [Y1 + 1 * SIZE] ,a8
  877. add Y1, INCY, Y1
  878. #else
  879. FADD c1, c4, c1
  880. LDF [Y1 + 0 * SIZE], a5
  881. FADD c5, c8, c5
  882. LDF [Y1 + 1 * SIZE] ,a6
  883. add Y1, INCY, Y1
  884. FADD c9, c12, c9
  885. LDF [Y1 + 0 * SIZE], a7
  886. FADD c13, c16, c13
  887. LDF [Y1 + 1 * SIZE] ,a8
  888. add Y1, INCY, Y1
  889. #endif
  890. #ifndef CONJ
  891. FADD c2, c3, c2
  892. FCLR(0)
  893. FADD c6, c7, c6
  894. FADD c10, c11, c10
  895. FADD c14, c15, c14
  896. #else
  897. FSUB c2, c3, c2
  898. FCLR(0)
  899. FSUB c6, c7, c6
  900. FSUB c10, c11, c10
  901. FSUB c14, c15, c14
  902. #endif
  903. FMUL b3, c1, c3
  904. FMOV t1, t2
  905. FMUL b4, c1, c4
  906. FMOV t1, t3
  907. FMUL b4, c2, c1
  908. FMOV t1, t4
  909. FMUL b3, c2, c2
  910. FMUL b3, c5, c7
  911. FMUL b4, c5, c8
  912. FMUL b4, c6, c5
  913. FMUL b3, c6, c6
  914. FMUL b3, c9, c11
  915. FMUL b4, c9, c12
  916. FMUL b4, c10, c9
  917. FMUL b3, c10, c10
  918. FMUL b3, c13, c15
  919. FSUB c3, c1, c1
  920. FMUL b4, c13, c16
  921. FADD c2, c4, c2
  922. FMUL b4, c14, c13
  923. FSUB c7, c5, c5
  924. FMUL b3, c14, c14
  925. FADD c6, c8, c6
  926. FSUB c11, c9, c9
  927. FADD c10, c12, c10
  928. FSUB c15, c13, c13
  929. FADD c14, c16, c14
  930. FADD a1, c1, a1
  931. FADD a2, c2, a2
  932. FADD a3, c5, a3
  933. FADD a4, c6, a4
  934. STF a1, [Y2 + 0 * SIZE]
  935. FADD a5, c9, a5
  936. STF a2, [Y2 + 1 * SIZE]
  937. FADD a6, c10, a6
  938. add Y2, INCY, Y2
  939. STF a3, [Y2 + 0 * SIZE]
  940. FADD a7, c13, a7
  941. STF a4, [Y2 + 1 * SIZE]
  942. FADD a8, c14, a8
  943. add Y2, INCY, Y2
  944. STF a5, [Y2 + 0 * SIZE]
  945. FMOV t1, c1
  946. add J, -1, J
  947. STF a6, [Y2 + 1 * SIZE]
  948. FMOV t1, c2
  949. cmp J, 0
  950. add Y2, INCY, Y2
  951. STF a7, [Y2 + 0 * SIZE]
  952. FMOV t1, c3
  953. STF a8, [Y2 + 1 * SIZE]
  954. FMOV t1, c4
  955. add Y2, INCY, Y2
  956. FMOV t1, c5
  957. bg %icc, .LL110
  958. FMOV t1, c6
  959. .LL200:
  960. FCLR(0)
  961. and N, 2, J
  962. cmp J, 0
  963. FMOV t1, c1
  964. ble %icc, .LL300
  965. FMOV t1, c2
  966. sra MIN_M, 2, I
  967. FMOV t1, t2
  968. add A, LDA, A2
  969. FMOV t1, c3
  970. mov A, A1
  971. FMOV t1, t3
  972. cmp I, 0
  973. FMOV t1, c4
  974. FMOV t1, c5
  975. FMOV t1, c6
  976. FMOV t1, c7
  977. FMOV t1, c8
  978. add A2, LDA, A
  979. FMOV t1, t4
  980. ble %icc, .LL215
  981. mov XP, X1
  982. LDF [A1 + 0 * SIZE], a1
  983. LDF [A1 + 1 * SIZE], a2
  984. LDF [A1 + 2 * SIZE], a5
  985. LDF [A1 + 3 * SIZE], a6
  986. add A1, 4 * SIZE, A1
  987. LDF [A2 + 0 * SIZE], a3
  988. LDF [A2 + 1 * SIZE], a4
  989. LDF [A2 + 2 * SIZE], a7
  990. LDF [A2 + 3 * SIZE], a8
  991. add A2, 4 * SIZE, A2
  992. LDF [X1 + 0 * SIZE], b1
  993. add I, -1, I
  994. LDF [X1 + 1 * SIZE], b2
  995. cmp I, 0
  996. LDF [X1 + 2 * SIZE], b3
  997. LDF [X1 + 3 * SIZE], b4
  998. ble %icc, .LL212
  999. add X1, 4 * SIZE, X1
  /* .LL211: software-pipelined main loop over two matrix columns (A1, A2). */
  /* Each pass consumes 8*SIZE from A1, A2 and X1, in two half-passes of 4*SIZE. */
  /* t1..t4 hold products issued one step earlier: every FADD/FADDX folds the */
  /* previous product into its accumulator just before the next FMUL overwrites */
  /* that temporary. Products of A1 data (a1,a2,a5,a6) end up in c1..c4 and */
  /* products of A2 data (a3,a4,a7,a8) end up in c5..c8. */
  /* NOTE(review): FADD/FADDX/FMUL/LDF are macros defined outside this chunk; */
  /* FADDX presumably selects add or subtract per the conjugation flags - confirm. */
  /* Assumes a1..a8 and b1..b4 were preloaded by code before this chunk - confirm. */
  1000. .LL211:
  1001. prefetch [A1 + PREFETCHSIZE * SIZE], 1
  1002. FADD c5, t1, c5
  1003. FMUL a1, b1, t1
  1004. FADDX c6, t2, c6
  1005. FMUL a1, b2, t2
  1006. LDF [A1 + 0 * SIZE], a1
  1007. FADD c7, t3, c7
  1008. FMUL a2, b1, t3
  1009. FADD c8, t4, c8
  1010. FMUL a2, b2, t4
  1011. LDF [A1 + 1 * SIZE], a2
  1012. FADD c1, t1, c1
  1013. FMUL a3, b1, t1
  1014. FADDX c2, t2, c2
  1015. FMUL a3, b2, t2
  1016. LDF [A2 + 0 * SIZE], a3
  1017. FADD c3, t3, c3
  1018. FMUL a4, b1, t3
  1019. LDF [X1 + 0 * SIZE], b1
  1020. FADD c4, t4, c4
  1021. FMUL a4, b2, t4
  1022. LDF [A2 + 1 * SIZE], a4
  1023. FADD c5, t1, c5
  1024. LDF [X1 + 1 * SIZE], b2
  1025. FMUL a5, b3, t1
  1026. FADDX c6, t2, c6
  1027. FMUL a5, b4, t2
  1028. LDF [A1 + 2 * SIZE], a5
  1029. FADD c7, t3, c7
  /* Loop counter update is interleaved with the FP work: I -= 1 here, compared */
  /* against 0 a few instructions later, and tested by the bg at the loop end. */
  1030. add I, -1, I
  1031. FMUL a6, b3, t3
  1032. FADD c8, t4, c8
  1033. cmp I, 0
  1034. FMUL a6, b4, t4
  1035. LDF [A1 + 3 * SIZE], a6
  1036. FADD c1, t1, c1
  1037. FMUL a7, b3, t1
  1038. FADDX c2, t2, c2
  1039. FMUL a7, b4, t2
  1040. LDF [A2 + 2 * SIZE], a7
  1041. FADD c3, t3, c3
  1042. FMUL a8, b3, t3
  1043. LDF [X1 + 2 * SIZE], b3
  1044. FADD c4, t4, c4
  1045. FMUL a8, b4, t4
  1046. LDF [A2 + 3 * SIZE], a8
  /* Second half-pass: same pattern on the values just loaded, refilling from */
  /* offsets 4..7 of A1, A2 and X1. */
  1047. prefetch [A2 + PREFETCHSIZE * SIZE], 1
  1048. FADD c5, t1, c5
  1049. LDF [X1 + 3 * SIZE], b4
  1050. FMUL a1, b1, t1
  1051. FADDX c6, t2, c6
  1052. FMUL a1, b2, t2
  1053. LDF [A1 + 4 * SIZE], a1
  1054. FADD c7, t3, c7
  1055. FMUL a2, b1, t3
  1056. FADD c8, t4, c8
  1057. FMUL a2, b2, t4
  1058. LDF [A1 + 5 * SIZE], a2
  1059. FADD c1, t1, c1
  1060. FMUL a3, b1, t1
  1061. FADDX c2, t2, c2
  1062. FMUL a3, b2, t2
  1063. LDF [A2 + 4 * SIZE], a3
  1064. FADD c3, t3, c3
  1065. FMUL a4, b1, t3
  1066. LDF [X1 + 4 * SIZE], b1
  1067. FADD c4, t4, c4
  1068. FMUL a4, b2, t4
  1069. LDF [A2 + 5 * SIZE], a4
  1070. FADD c5, t1, c5
  1071. LDF [X1 + 5 * SIZE], b2
  1072. FMUL a5, b3, t1
  1073. FADDX c6, t2, c6
  1074. FMUL a5, b4, t2
  1075. LDF [A1 + 6 * SIZE], a5
  1076. FADD c7, t3, c7
  1077. FMUL a6, b3, t3
  1078. FADD c8, t4, c8
  1079. FMUL a6, b4, t4
  1080. LDF [A1 + 7 * SIZE], a6
  1081. add A1, 8 * SIZE, A1
  1082. FADD c1, t1, c1
  1083. FMUL a7, b3, t1
  1084. FADDX c2, t2, c2
  1085. FMUL a7, b4, t2
  1086. LDF [A2 + 6 * SIZE], a7
  1087. FADD c3, t3, c3
  1088. FMUL a8, b3, t3
  1089. LDF [X1 + 6 * SIZE], b3
  1090. FADD c4, t4, c4
  1091. add X1, 8 * SIZE, X1
  1092. FMUL a8, b4, t4
  1093. LDF [A2 + 7 * SIZE], a8
  1094. add A2, 8 * SIZE, A2
  /* Loop while I > 0. The load below sits in the branch delay slot; X1 has */
  /* already been advanced by 8*SIZE, so X1 - 1*SIZE is the old X1 + 7*SIZE. */
  1095. bg,pn %icc, .LL211
  1096. LDF [X1 - 1 * SIZE], b4
  /* .LL212: drain of the two-column main loop. */
  /* First half: same multiply/accumulate pattern as the loop body, loading the */
  /* last 4*SIZE from A1, A2 and X1 and advancing each pointer by 4*SIZE. */
  /* Second half: multiply/accumulate only, no further loads. */
  /* On exit t1..t4 still hold the final four products (A2 data times b3/b4); */
  /* they are accumulated by whichever block runs next (.LL216 or .LL219). */
  1097. .LL212:
  1098. FADD c5, t1, c5
  1099. FMUL a1, b1, t1
  1100. FADDX c6, t2, c6
  1101. FMUL a1, b2, t2
  1102. LDF [A1 + 0 * SIZE], a1
  1103. FADD c7, t3, c7
  1104. FMUL a2, b1, t3
  1105. FADD c8, t4, c8
  1106. FMUL a2, b2, t4
  1107. LDF [A1 + 1 * SIZE], a2
  1108. FADD c1, t1, c1
  1109. FMUL a3, b1, t1
  1110. FADDX c2, t2, c2
  1111. FMUL a3, b2, t2
  1112. LDF [A2 + 0 * SIZE], a3
  1113. FADD c3, t3, c3
  1114. FMUL a4, b1, t3
  1115. LDF [X1 + 0 * SIZE], b1
  1116. FADD c4, t4, c4
  1117. FMUL a4, b2, t4
  1118. LDF [A2 + 1 * SIZE], a4
  1119. FADD c5, t1, c5
  1120. LDF [X1 + 1 * SIZE], b2
  1121. FMUL a5, b3, t1
  1122. FADDX c6, t2, c6
  1123. FMUL a5, b4, t2
  1124. LDF [A1 + 2 * SIZE], a5
  1125. FADD c7, t3, c7
  1126. FMUL a6, b3, t3
  1127. FADD c8, t4, c8
  1128. FMUL a6, b4, t4
  1129. LDF [A1 + 3 * SIZE], a6
  1130. add A1, 4 * SIZE, A1
  1131. FADD c1, t1, c1
  1132. FMUL a7, b3, t1
  1133. FADDX c2, t2, c2
  1134. FMUL a7, b4, t2
  1135. LDF [A2 + 2 * SIZE], a7
  1136. FADD c3, t3, c3
  1137. FMUL a8, b3, t3
  1138. LDF [X1 + 2 * SIZE], b3
  1139. FADD c4, t4, c4
  1140. FMUL a8, b4, t4
  1141. LDF [A2 + 3 * SIZE], a8
  1142. add A2, 4 * SIZE, A2
  1143. FADD c5, t1, c5
  1144. LDF [X1 + 3 * SIZE], b4
  1145. add X1, 4 * SIZE, X1
  /* From here on only already-loaded registers are used. */
  1146. FMUL a1, b1, t1
  1147. FADDX c6, t2, c6
  1148. FMUL a1, b2, t2
  1149. FADD c7, t3, c7
  1150. FMUL a2, b1, t3
  1151. FADD c8, t4, c8
  1152. FMUL a2, b2, t4
  1153. FADD c1, t1, c1
  1154. FMUL a3, b1, t1
  1155. FADDX c2, t2, c2
  1156. FMUL a3, b2, t2
  1157. FADD c3, t3, c3
  1158. FMUL a4, b1, t3
  1159. FADD c4, t4, c4
  1160. FMUL a4, b2, t4
  1161. FADD c5, t1, c5
  1162. FMUL a5, b3, t1
  1163. FADDX c6, t2, c6
  1164. FMUL a5, b4, t2
  1165. FADD c7, t3, c7
  1166. FMUL a6, b3, t3
  1167. FADD c8, t4, c8
  1168. FMUL a6, b4, t4
  1169. FADD c1, t1, c1
  1170. FMUL a7, b3, t1
  1171. FADDX c2, t2, c2
  1172. FMUL a7, b4, t2
  1173. FADD c3, t3, c3
  1174. FMUL a8, b3, t3
  1175. FADD c4, t4, c4
  1176. FMUL a8, b4, t4
  /* .LL215: set up the two-column remainder loop. */
  /* I = MIN_M & 3; andcc also sets %icc, which the first ble tests (the LDF and */
  /* mov in between do not change it). If I == 0 control goes straight to .LL219. */
  /* b3 is loaded from ALPHA_R, and b4 from ALPHA_I in the branch delay slot, so */
  /* both are loaded on either path. Y2 keeps a copy of Y1 for the later stores. */
  1177. .LL215:
  1178. andcc MIN_M, 3, I
  1179. LDF ALPHA_R, b3
  1180. mov Y1, Y2
  1181. ble %icc, .LL219
  1182. LDF ALPHA_I, b4
  /* Preload one 2*SIZE step: a1,a2 from A1, a3,a4 from A2, b1,b2 from X1. */
  /* I is decremented and compared with 0 in between the loads. */
  1183. LDF [A1 + 0 * SIZE], a1
  1184. add I, -1, I
  1185. LDF [A1 + 1 * SIZE], a2
  1186. cmp I, 0
  1187. add A1, 2 * SIZE, A1
  1188. LDF [A2 + 0 * SIZE], a3
  1189. LDF [A2 + 1 * SIZE], a4
  1190. add A2, 2 * SIZE, A2
  1191. LDF [X1 + 0 * SIZE], b1
  1192. LDF [X1 + 1 * SIZE], b2
  /* If no further steps remain, skip the loop and drain in .LL217. */
  /* The X1 advance is in the delay slot and happens on both paths. */
  1193. ble %icc, .LL217
  1194. add X1, 2 * SIZE, X1
  /* .LL216: two-column remainder loop, one 2*SIZE step of A1, A2 and X1 per pass. */
  /* Same pipelining as the main loop: each FADD/FADDX consumes the product left */
  /* in t1..t4 by the previous step, then the FMUL refills that temporary. */
  /* A1 products (a1,a2) go to c1..c4; A2 products (a3,a4) go to c5..c8. */
  1195. .LL216:
  1196. FADD c5, t1, c5
  1197. FMUL a1, b1, t1
  1198. FADDX c6, t2, c6
  1199. FMUL a1, b2, t2
  1200. LDF [A1 + 0 * SIZE], a1
  1201. FADD c7, t3, c7
  1202. add I, -1, I
  1203. FMUL a2, b1, t3
  1204. FADD c8, t4, c8
  1205. cmp I, 0
  1206. FMUL a2, b2, t4
  1207. LDF [A1 + 1 * SIZE], a2
  1208. add A1, 2 * SIZE, A1
  1209. FADD c1, t1, c1
  1210. FMUL a3, b1, t1
  1211. FADDX c2, t2, c2
  1212. FMUL a3, b2, t2
  1213. LDF [A2 + 0 * SIZE], a3
  1214. FADD c3, t3, c3
  1215. FMUL a4, b1, t3
  1216. LDF [X1 + 0 * SIZE], b1
  1217. FADD c4, t4, c4
  1218. add X1, 2 * SIZE, X1
  1219. FMUL a4, b2, t4
  1220. LDF [A2 + 1 * SIZE], a4
  1221. add A2, 2 * SIZE, A2
  /* Loop while I > 0. Delay slot: X1 was already advanced by 2*SIZE, so */
  /* X1 - 1*SIZE is the second value of the step just consumed from X1. */
  1222. bg,pn %icc, .LL216
  1223. LDF [X1 - 1 * SIZE], b2
  /* .LL217: drain of the two-column remainder loop. */
  /* Processes the last preloaded a1..a4 / b1,b2 without issuing further loads. */
  /* The final four products are left in t1..t4 and are added in .LL219. */
  1224. .LL217:
  1225. FADD c5, t1, c5
  1226. FMUL a1, b1, t1
  1227. FADDX c6, t2, c6
  1228. FMUL a1, b2, t2
  1229. FADD c7, t3, c7
  1230. FMUL a2, b1, t3
  1231. FADD c8, t4, c8
  1232. FMUL a2, b2, t4
  1233. FADD c1, t1, c1
  1234. FMUL a3, b1, t1
  1235. FADDX c2, t2, c2
  1236. FMUL a3, b2, t2
  1237. FADD c3, t3, c3
  1238. FMUL a4, b1, t3
  1239. FADD c4, t4, c4
  1240. FMUL a4, b2, t4
  /* .LL219: finish the two-column case and update two y entries. */
  /* Fold the last pending products (t1..t4) into c5..c8 while loading the two */
  /* current y entries: (a1,a2) at Y1, then (a3,a4) at Y1 + INCY. Y1 ends up */
  /* advanced by 2*INCY; Y2 still points at the first entry for the stores. */
  1241. .LL219:
  1242. FADD c5, t1, c5
  1243. LDF [Y1 + 0 * SIZE], a1
  1244. FADDX c6, t2, c6
  1245. LDF [Y1 + 1 * SIZE] ,a2
  1246. add Y1, INCY, Y1
  1247. FADD c7, t3, c7
  1248. LDF [Y1 + 0 * SIZE], a3
  1249. FADD c8, t4, c8
  1250. LDF [Y1 + 1 * SIZE] ,a4
  1251. add Y1, INCY, Y1
  /* Combine the four partial sums of each column into one pair: */
  /* c1 = c1 -/+ c4 and c2 = c2 +/- c3 (likewise c5,c8 and c6,c7), with the */
  /* signs chosen by the CONJ / XCONJ preprocessor conditions below. */
  1252. #if (!defined(XCONJ) && !defined(CONJ)) || (defined(XCONJ) && defined(CONJ))
  1253. FSUB c1, c4, c1
  1254. FSUB c5, c8, c5
  1255. #else
  1256. FADD c1, c4, c1
  1257. FADD c5, c8, c5
  1258. #endif
  1259. #ifndef CONJ
  1260. FADD c2, c3, c2
  1261. FADD c6, c7, c6
  1262. #else
  1263. FSUB c2, c3, c2
  1264. FSUB c6, c7, c6
  1265. #endif
  /* Scale by alpha (b3 = ALPHA_R, b4 = ALPHA_I) as a complex multiply: */
  /*   c1 <- b3*c1 - b4*c2,  c2 <- b3*c2 + b4*c1   (old c1, c2 on the right) */
  /*   c5 <- b3*c5 - b4*c6,  c6 <- b3*c6 + b4*c5 */
  /* c3,c4,c7,c8 are reused as scratch for the intermediate products. */
  1266. FMUL b3, c1, c3
  1267. FMUL b4, c1, c4
  1268. FMUL b4, c2, c1
  1269. FMUL b3, c2, c2
  1270. FMUL b3, c5, c7
  1271. FMUL b4, c5, c8
  1272. FMUL b4, c6, c5
  1273. FMUL b3, c6, c6
  1274. FSUB c3, c1, c1
  1275. FADD c2, c4, c2
  1276. FSUB c7, c5, c5
  1277. FADD c6, c8, c6
  /* y += scaled result, then store both entries back through Y2. */
  1278. FADD a1, c1, a1
  1279. FADD a2, c2, a2
  1280. FADD a3, c5, a3
  1281. FADD a4, c6, a4
  1282. STF a1, [Y2 + 0 * SIZE]
  1283. STF a2, [Y2 + 1 * SIZE]
  1284. add Y2, INCY, Y2
  1285. STF a3, [Y2 + 0 * SIZE]
  1286. STF a4, [Y2 + 1 * SIZE]
  /* .LL300: decide whether a single leftover column must be processed. */
  /* J = N & 1 (andcc sets %icc); if it is zero, branch to .LL400. */
  /* NOTE(review): FCLR is a macro defined outside this chunk; FCLR(0) presumably */
  /* zeroes the register aliased by t1, which is then used as the zero source - */
  /* confirm. The FMOV (c1 = t1) is in the delay slot and runs on both paths. */
  1287. .LL300:
  1288. andcc N, 1, J
  1289. FCLR(0)
  1290. ble %icc, .LL400
  1291. FMOV t1, c1
  /* .LL310: set up the single-column pass. */
  /* I = MIN_M >> 2 is the main-loop pass count. c2..c4 and t2..t4 are */
  /* initialised by copying t1. A1 takes the current column pointer A, and A is */
  /* advanced by LDA. If I <= 0 the main loop is skipped (.LL315); X1 = XP is in */
  /* the delay slot and therefore set on both paths. */
  1292. .LL310:
  1293. sra MIN_M, 2, I
  1294. FMOV t1, c2
  1295. FMOV t1, c3
  1296. FMOV t1, c4
  1297. mov A, A1
  1298. FMOV t1, t2
  1299. add A, LDA, A
  1300. FMOV t1, t3
  1301. cmp I, 0
  1302. FMOV t1, t4
  1303. ble %icc, .LL315
  1304. mov XP, X1
  /* Preload the first 8*SIZE of the column into a1..a8 and of x into c9..c16. */
  1305. LDF [A1 + 0 * SIZE], a1
  1306. LDF [A1 + 1 * SIZE], a2
  1307. LDF [A1 + 2 * SIZE], a3
  1308. LDF [A1 + 3 * SIZE], a4
  1309. LDF [A1 + 4 * SIZE], a5
  1310. LDF [A1 + 5 * SIZE], a6
  1311. LDF [A1 + 6 * SIZE], a7
  1312. LDF [A1 + 7 * SIZE], a8
  1313. add A1, 8 * SIZE, A1
  1314. LDF [X1 + 0 * SIZE], c9
  1315. add I, -1, I
  1316. LDF [X1 + 1 * SIZE], c10
  1317. cmp I, 0
  1318. LDF [X1 + 2 * SIZE], c11
  1319. LDF [X1 + 3 * SIZE], c12
  1320. LDF [X1 + 4 * SIZE], c13
  1321. LDF [X1 + 5 * SIZE], c14
  1322. LDF [X1 + 6 * SIZE], c15
  1323. LDF [X1 + 7 * SIZE], c16
  /* With only one pass to do, go straight to the drain in .LL312. */
  /* The X1 advance is in the delay slot and happens on both paths. */
  1324. ble %icc, .LL312
  1325. add X1, 8 * SIZE, X1
  /* .LL311: software-pipelined main loop for a single column (A1). */
  /* Each pass consumes 8*SIZE from A1 and X1 in four steps, pairing (a1,a2) */
  /* with (c9,c10), (a3,a4) with (c11,c12), (a5,a6) with (c13,c14) and (a7,a8) */
  /* with (c15,c16). Each FADD/FADDX adds the product left in t1..t4 by the */
  /* previous step into c1..c4 before the following FMUL overwrites it. */
  /* The registers are refilled for the next pass as soon as they are consumed. */
  /* NOTE(review): FADDX is a macro defined outside this chunk - confirm its */
  /* add/subtract selection against its definition. */
  1326. .LL311:
  1327. prefetch [A1 + PREFETCHSIZE * SIZE], 1
  1328. FADD c1, t1, c1
  1329. FMUL a1, c9, t1
  1330. FADDX c2, t2, c2
  1331. FMUL a1, c10, t2
  1332. LDF [A1 + 0 * SIZE], a1
  1333. FADD c3, t3, c3
  1334. FMUL a2, c9, t3
  1335. LDF [X1 + 0 * SIZE], c9
  1336. FADD c4, t4, c4
  1337. FMUL a2, c10, t4
  1338. LDF [A1 + 1 * SIZE], a2
  1339. LDF [X1 + 1 * SIZE], c10
  1340. FADD c1, t1, c1
  1341. FMUL a3, c11, t1
  1342. FADDX c2, t2, c2
  1343. FMUL a3, c12, t2
  1344. LDF [A1 + 2 * SIZE], a3
  1345. FADD c3, t3, c3
  /* Loop counter update interleaved with the FP work; tested by the bg below. */
  1346. add I, -1, I
  1347. FMUL a4, c11, t3
  1348. LDF [X1 + 2 * SIZE], c11
  1349. FADD c4, t4, c4
  1350. cmp I, 0
  1351. FMUL a4, c12, t4
  1352. LDF [A1 + 3 * SIZE], a4
  1353. LDF [X1 + 3 * SIZE], c12
  1354. FADD c1, t1, c1
  1355. FMUL a5, c13, t1
  1356. FADDX c2, t2, c2
  1357. FMUL a5, c14, t2
  1358. LDF [A1 + 4 * SIZE], a5
  1359. FADD c3, t3, c3
  1360. FMUL a6, c13, t3
  1361. LDF [X1 + 4 * SIZE], c13
  1362. FADD c4, t4, c4
  1363. FMUL a6, c14, t4
  1364. LDF [A1 + 5 * SIZE], a6
  1365. LDF [X1 + 5 * SIZE], c14
  1366. FADD c1, t1, c1
  1367. FMUL a7, c15, t1
  1368. FADDX c2, t2, c2
  1369. FMUL a7, c16, t2
  1370. LDF [A1 + 6 * SIZE], a7
  1371. FADD c3, t3, c3
  1372. FMUL a8, c15, t3
  1373. LDF [X1 + 6 * SIZE], c15
  1374. FADD c4, t4, c4
  1375. add X1, 8 * SIZE, X1
  1376. FMUL a8, c16, t4
  1377. LDF [A1 + 7 * SIZE], a8
  1378. add A1, 8 * SIZE, A1
  /* Loop while I > 0. Delay slot: X1 was already advanced by 8*SIZE, so */
  /* X1 - 1*SIZE is the old X1 + 7*SIZE. */
  1379. bg,pn %icc, .LL311
  1380. LDF [X1 - 1 * SIZE], c16
  /* .LL312: drain of the single-column main loop. */
  /* Runs the four multiply/accumulate steps on the values already held in */
  /* a1..a8 and c9..c16 without any loads. The final four products remain in */
  /* t1..t4 and are accumulated by the block that runs next (.LL316 or .LL319). */
  1381. .LL312:
  1382. FADD c1, t1, c1
  1383. FMUL a1, c9, t1
  1384. FADDX c2, t2, c2
  1385. FMUL a1, c10, t2
  1386. FADD c3, t3, c3
  1387. FMUL a2, c9, t3
  1388. FADD c4, t4, c4
  1389. FMUL a2, c10, t4
  1390. FADD c1, t1, c1
  1391. FMUL a3, c11, t1
  1392. FADDX c2, t2, c2
  1393. FMUL a3, c12, t2
  1394. FADD c3, t3, c3
  1395. FMUL a4, c11, t3
  1396. FADD c4, t4, c4
  1397. FMUL a4, c12, t4
  1398. FADD c1, t1, c1
  1399. FMUL a5, c13, t1
  1400. FADDX c2, t2, c2
  1401. FMUL a5, c14, t2
  1402. FADD c3, t3, c3
  1403. FMUL a6, c13, t3
  1404. FADD c4, t4, c4
  1405. FMUL a6, c14, t4
  1406. FADD c1, t1, c1
  1407. FMUL a7, c15, t1
  1408. FADDX c2, t2, c2
  1409. FMUL a7, c16, t2
  1410. FADD c3, t3, c3
  1411. FMUL a8, c15, t3
  1412. FADD c4, t4, c4
  1413. FMUL a8, c16, t4
  /* .LL315: set up the single-column remainder loop. */
  /* I = MIN_M & 3 (andcc sets %icc). If I == 0, go straight to .LL319. */
  /* b3 is loaded from ALPHA_R, and b4 from ALPHA_I in the branch delay slot, so */
  /* both are loaded on either path. Y2 keeps a copy of Y1 for the later stores. */
  1414. .LL315:
  1415. andcc MIN_M, 3, I
  1416. LDF ALPHA_R, b3
  1417. mov Y1, Y2
  1418. ble %icc, .LL319
  1419. LDF ALPHA_I, b4
  /* Preload one 2*SIZE step: a1,a2 from A1 and b1,b2 from X1; I -= 1. */
  1420. LDF [A1 + 0 * SIZE], a1
  1421. add I, -1, I
  1422. LDF [A1 + 1 * SIZE], a2
  1423. add A1, 2 * SIZE, A1
  1424. LDF [X1 + 0 * SIZE], b1
  1425. cmp I, 0
  1426. LDF [X1 + 1 * SIZE], b2
  /* If no further steps remain, skip the loop and drain in .LL317. */
  /* The X1 advance is in the delay slot and happens on both paths. */
  1427. ble %icc, .LL317
  1428. add X1, 2 * SIZE, X1
  /* .LL316: single-column remainder loop, one 2*SIZE step of A1 and X1 per pass. */
  /* Each FADD/FADDX adds the product left in t1..t4 by the previous step into */
  /* c1..c4; the FMULs then compute the four products of (a1,a2) with (b1,b2). */
  1429. .LL316:
  1430. FADD c1, t1, c1
  1431. add I, -1, I
  1432. FMUL a1, b1, t1
  1433. FADDX c2, t2, c2
  1434. FMUL a1, b2, t2
  1435. LDF [A1 + 0 * SIZE], a1
  1436. FADD c3, t3, c3
  1437. cmp I, 0
  1438. FMUL a2, b1, t3
  1439. LDF [X1 + 0 * SIZE], b1
  1440. FADD c4, t4, c4
  1441. add X1, 2 * SIZE, X1
  1442. FMUL a2, b2, t4
  1443. LDF [A1 + 1 * SIZE], a2
  1444. add A1, 2 * SIZE, A1
  /* Loop while I > 0. Delay slot: X1 was already advanced by 2*SIZE, so */
  /* X1 - 1*SIZE is the second value of the step just consumed from X1. */
  1445. bg,pn %icc, .LL316
  1446. LDF [X1 - 1 * SIZE], b2
  /* .LL317: drain of the single-column remainder loop. */
  /* Accumulates the pending products and multiplies the last preloaded */
  /* (a1,a2) by (b1,b2); those four products stay in t1..t4 for .LL319. */
  1447. .LL317:
  1448. FADD c1, t1, c1
  1449. FMUL a1, b1, t1
  1450. FADDX c2, t2, c2
  1451. FMUL a1, b2, t2
  1452. FADD c3, t3, c3
  1453. FMUL a2, b1, t3
  1454. FADD c4, t4, c4
  1455. FMUL a2, b2, t4
  /* .LL319: finish the single-column case and update one y entry. */
  /* Fold the last pending products (t1..t4) into c1..c4 while loading the */
  /* current y entry (a1,a2) at Y1; Y1 is then advanced by INCY. */
  1456. .LL319:
  1457. FADD c1, t1, c1
  1458. LDF [Y1 + 0 * SIZE], a1
  1459. FADDX c2, t2, c2
  1460. LDF [Y1 + 1 * SIZE] ,a2
  1461. add Y1, INCY, Y1
  1462. FADD c3, t3, c3
  1463. FADD c4, t4, c4
  /* Combine the four partial sums into one pair: c1 = c1 -/+ c4 and */
  /* c2 = c2 +/- c3, with the signs chosen by the CONJ / XCONJ conditions. */
  1464. #if (!defined(XCONJ) && !defined(CONJ)) || (defined(XCONJ) && defined(CONJ))
  1465. FSUB c1, c4, c1
  1466. #else
  1467. FADD c1, c4, c1
  1468. #endif
  1469. #ifndef CONJ
  1470. FADD c2, c3, c2
  1471. #else
  1472. FSUB c2, c3, c2
  1473. #endif
  /* Scale by alpha (b3 = ALPHA_R, b4 = ALPHA_I) as a complex multiply: */
  /*   c1 <- b3*c1 - b4*c2,  c2 <- b3*c2 + b4*c1   (old c1, c2 on the right) */
  /* c3 and c4 are reused as scratch for the intermediate products. */
  1474. FMUL b3, c1, c3
  1475. FMUL b4, c1, c4
  1476. FMUL b4, c2, c1
  1477. FMUL b3, c2, c2
  1478. FSUB c3, c1, c1
  1479. FADD c2, c4, c2
  /* y += scaled result, stored back through Y2 (the pre-advance copy of Y1). */
  1480. FADD a1, c1, a1
  1481. FADD a2, c2, a2
  1482. STF a1, [Y2 + 0 * SIZE]
  1483. STF a2, [Y2 + 1 * SIZE]
  /* .LL400: advance to the next block of the outer loop. */
  /* IS += P; while IS < M (signed compare) branch back to .LL10, which lies */
  /* outside this chunk. The A += PNLDA update is in the branch delay slot, so */
  /* it executes whether or not the branch is taken. */
  /* NOTE(review): P and PNLDA are defined outside this chunk - presumably the */
  /* block size and the matching matrix pointer adjustment; confirm. */
  1484. .LL400:
  1485. mov P, I
  1486. add IS, I, IS
  1487. cmp IS, M
  1488. bl %icc, .LL10
  1489. add A, PNLDA, A
  /* .LL999: common exit. Returns to %i7 + 8; the clr in the delay slot sets */
  /* %o0 to 0. EPILOGUE is a macro defined outside this chunk. */
  1490. .LL999:
  1491. return %i7 + 8
  1492. clr %o0
  1493. EPILOGUE