zgemm_kernel_2x2_sse.S (35 kB)

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"

#define STACK 16
#define ARGS 0

#define STACK_M 4 + STACK + ARGS(%esi)
#define STACK_N 8 + STACK + ARGS(%esi)
#define STACK_K 12 + STACK + ARGS(%esi)
#define STACK_ALPHA_R 16 + STACK + ARGS(%esi)
#define STACK_ALPHA_I 20 + STACK + ARGS(%esi)
#define STACK_A 24 + STACK + ARGS(%esi)
#define STACK_B 28 + STACK + ARGS(%esi)
#define STACK_C 32 + STACK + ARGS(%esi)
#define STACK_LDC 36 + STACK + ARGS(%esi)
#define STACK_OFFT 40 + STACK + ARGS(%esi)

#define POSINV 0(%esp)
#define ALPHA_R 16(%esp)
#define ALPHA_I 32(%esp)
#define K 48(%esp)
#define N 52(%esp)
#define M 56(%esp)
#define A 60(%esp)
#define C 64(%esp)
#define J 68(%esp)
#define OLD_STACK 72(%esp)
#define OFFSET 76(%esp)
#define KK 80(%esp)
#define KKK 84(%esp)
#define BUFFER 128(%esp)

#define B %edi
#define LDC %ebp
#define AA %edx
#define BB %ecx

#define STACK_ALIGN 4096
#define STACK_OFFSET 1024
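
/*
 * Note: the defines above appear to describe the kernel's private
 * stack frame: POSINV holds the sign masks used for conjugation,
 * ALPHA_R/ALPHA_I hold the broadcast scaling factors, BUFFER (at
 * offset 128) receives the packed copy of B built below, and the
 * STACK_* names reach the caller's arguments through the old stack
 * pointer saved in %esi.
 */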
#ifdef ATHLON
#define PREFETCHSIZE 64
#define WPREFETCHSIZE 80
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#endif

#if defined(OPTERON) || defined(BARCELONA)
#define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#endif

#ifdef PENTIUM4
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE 168
#endif

#if defined(OPTERON) || !defined(HAVE_SSE2)
#define movsd movlps
#endif

#ifdef HAVE_SSE2
#define xorps pxor
#endif
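
/*
 * The KERNEL1..KERNEL8 macros below each perform one step of the
 * 2x2 complex micro-kernel: four multiply/accumulate pairs into
 * %xmm4..%xmm7 against the expanded B panel, loading the next A and
 * B operands as they go.  The (address) parameter bakes the offset
 * into each access, so the macros themselves do not advance AA/BB.
 * The two variants appear to differ only in prefetch scheduling.
 */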
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA)
#define KERNEL1(address) \
        mulps %xmm0, %xmm2; \
        addps %xmm2, %xmm4; \
        PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \
        movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        mulps %xmm0, %xmm2; \
        addps %xmm2, %xmm5; \
        movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        mulps %xmm0, %xmm2; \
        mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
        addps %xmm2, %xmm6; \
        movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        addps %xmm0, %xmm7; \
        movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL2(address) \
        mulps %xmm0, %xmm3; \
        addps %xmm3, %xmm4; \
        movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        mulps %xmm0, %xmm3; \
        addps %xmm3, %xmm5; \
        movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        mulps %xmm0, %xmm3; \
        mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
        addps %xmm3, %xmm6; \
        movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        addps %xmm0, %xmm7; \
        movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL3(address) \
        mulps %xmm0, %xmm2; \
        addps %xmm2, %xmm4; \
        movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        mulps %xmm0, %xmm2; \
        addps %xmm2, %xmm5; \
        movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        mulps %xmm0, %xmm2; \
        mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
        addps %xmm2, %xmm6; \
        movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        addps %xmm0, %xmm7; \
        movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL4(address) \
        mulps %xmm0, %xmm3; \
        addps %xmm3, %xmm4; \
        movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        mulps %xmm0, %xmm3; \
        addps %xmm3, %xmm5; \
        movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        mulps %xmm0, %xmm3; \
        mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
        addps %xmm3, %xmm6; \
        movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        addps %xmm0, %xmm7; \
        movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL5(address) \
        PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 1 * SIZE(AA); \
        mulps %xmm1, %xmm2; \
        addps %xmm2, %xmm4; \
        movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        mulps %xmm1, %xmm2; \
        addps %xmm2, %xmm5; \
        movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        mulps %xmm1, %xmm2; \
        mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
        addps %xmm2, %xmm6; \
        movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        addps %xmm1, %xmm7; \
        movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1

#define KERNEL6(address) \
        mulps %xmm1, %xmm3; \
        addps %xmm3, %xmm4; \
        movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        mulps %xmm1, %xmm3; \
        addps %xmm3, %xmm5; \
        movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        mulps %xmm1, %xmm3; \
        mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
        addps %xmm3, %xmm6; \
        movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        addps %xmm1, %xmm7; \
        movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1

#define KERNEL7(address) \
        mulps %xmm1, %xmm2; \
        addps %xmm2, %xmm4; \
        movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        mulps %xmm1, %xmm2; \
        addps %xmm2, %xmm5; \
        movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        mulps %xmm1, %xmm2; \
        mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
        addps %xmm2, %xmm6; \
        movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        addps %xmm1, %xmm7; \
        movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1

#define KERNEL8(address) \
        mulps %xmm1, %xmm3; \
        addps %xmm3, %xmm4; \
        movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        mulps %xmm1, %xmm3; \
        addps %xmm3, %xmm5; \
        movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        mulps %xmm1, %xmm3; \
        mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
        addps %xmm3, %xmm6; \
        movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        addps %xmm1, %xmm7; \
        movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1;
#endif

#ifdef PENTIUM4
#define KERNEL1(address) \
        mulps %xmm0, %xmm2; \
        addps %xmm2, %xmm4; \
        movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        mulps %xmm0, %xmm2; \
        PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \
        addps %xmm2, %xmm5; \
        movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        mulps %xmm0, %xmm2; \
        mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
        addps %xmm2, %xmm6; \
        movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        addps %xmm0, %xmm7; \
        movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL2(address) \
        mulps %xmm0, %xmm3; \
        addps %xmm3, %xmm4; \
        movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        mulps %xmm0, %xmm3; \
        addps %xmm3, %xmm5; \
        movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        mulps %xmm0, %xmm3; \
        mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
        addps %xmm3, %xmm6; \
        movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        addps %xmm0, %xmm7; \
        movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL3(address) \
        mulps %xmm0, %xmm2; \
        addps %xmm2, %xmm4; \
        movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        mulps %xmm0, %xmm2; \
        addps %xmm2, %xmm5; \
        movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        mulps %xmm0, %xmm2; \
        mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
        addps %xmm2, %xmm6; \
        movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        addps %xmm0, %xmm7; \
        movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL4(address) \
        mulps %xmm0, %xmm3; \
        addps %xmm3, %xmm4; \
        movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        mulps %xmm0, %xmm3; \
        addps %xmm3, %xmm5; \
        movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        mulps %xmm0, %xmm3; \
        mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
        addps %xmm3, %xmm6; \
        movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        addps %xmm0, %xmm7; \
        movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL5(address) \
        mulps %xmm1, %xmm2; \
        addps %xmm2, %xmm4; \
        movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        mulps %xmm1, %xmm2; \
        addps %xmm2, %xmm5; \
        movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        mulps %xmm1, %xmm2; \
        mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
        addps %xmm2, %xmm6; \
        movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        addps %xmm1, %xmm7; \
        movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1

#define KERNEL6(address) \
        mulps %xmm1, %xmm3; \
        addps %xmm3, %xmm4; \
        movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        mulps %xmm1, %xmm3; \
        addps %xmm3, %xmm5; \
        movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        mulps %xmm1, %xmm3; \
        mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
        addps %xmm3, %xmm6; \
        movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        addps %xmm1, %xmm7; \
        movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1

#define KERNEL7(address) \
        mulps %xmm1, %xmm2; \
        addps %xmm2, %xmm4; \
        movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        mulps %xmm1, %xmm2; \
        addps %xmm2, %xmm5; \
        movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        mulps %xmm1, %xmm2; \
        mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
        addps %xmm2, %xmm6; \
        movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
        addps %xmm1, %xmm7; \
        movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1

#define KERNEL8(address) \
        mulps %xmm1, %xmm3; \
        addps %xmm3, %xmm4; \
        movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        mulps %xmm1, %xmm3; \
        addps %xmm3, %xmm5; \
        movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        mulps %xmm1, %xmm3; \
        mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
        addps %xmm3, %xmm6; \
        movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
        addps %xmm1, %xmm7; \
        movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1
#endif
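
/*
 * Entry: save the callee-saved registers, then switch to a private
 * stack frame aligned to STACK_ALIGN (and biased by STACK_OFFSET),
 * presumably to give BUFFER a predictable cache/TLB footprint.
 */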
        PROLOGUE

        pushl %ebp
        pushl %edi
        pushl %esi
        pushl %ebx

        PROFCODE

        movl %esp, %esi # save old stack
        subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp
        andl $-STACK_ALIGN, %esp # align stack
        addl $STACK_OFFSET, %esp

        STACK_TOUCHING

        movl STACK_M, %ebx
        movl STACK_N, %eax
        movl STACK_K, %ecx
        movl STACK_A, %edx

        movl %ebx, M
        movl %eax, N
        movl %ecx, K
        movl %edx, A
        movl %esi, OLD_STACK

        movl STACK_B, %edi
        movl STACK_C, %ebx

#ifdef TRMMKERNEL
        movss STACK_OFFT, %xmm4
#endif

        movss STACK_ALPHA_R, %xmm0
        movss STACK_ALPHA_I, %xmm1

        xorps %xmm7, %xmm7
        cmpeqps %xmm7, %xmm7
        pslld $31, %xmm7 # Generate mask
        xorps %xmm2, %xmm2

        shufps $0, %xmm0, %xmm0
        movaps %xmm0, 0 + ALPHA_R

        movss %xmm1, 4 + ALPHA_I
        movss %xmm1, 12 + ALPHA_I
        xorps %xmm7, %xmm1
        movss %xmm1, 0 + ALPHA_I
        movss %xmm1, 8 + ALPHA_I

#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
    defined(TN) || defined(TT) || defined(TR) || defined(TC)
        movss %xmm7, 0 + POSINV
        movss %xmm2, 4 + POSINV
        movss %xmm7, 8 + POSINV
        movss %xmm2, 12 + POSINV
#else
        movss %xmm2, 0 + POSINV
        movss %xmm7, 4 + POSINV
        movss %xmm2, 8 + POSINV
        movss %xmm7, 12 + POSINV
#endif
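
/*
 * POSINV now holds the 0x80000000 sign-bit mask in alternating
 * lanes.  XORing it into the broadcast B values below flips the
 * sign of either the imaginary parts (the N and T cases) or the
 * real parts, which folds the conjugation mode into the packed
 * buffer.
 */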
        EMMS

        movl %ebx, C
        movl STACK_LDC, LDC

#ifdef TRMMKERNEL
        movss %xmm4, OFFSET
        movss %xmm4, KK
#ifndef LEFT
        negl KK
#endif
#endif

        sall $ZBASE_SHIFT, LDC

        movl %eax, J # j = n
        sarl $1, J
        jle .L100
        ALIGN_4

.L01:
#if defined(TRMMKERNEL) && defined(LEFT)
        movl OFFSET, %eax
        movl %eax, KK
#endif

/* Copying to Sub Buffer */
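/* Each complex scalar of B is broadcast to a full 4-lane vector and
   stored to BUFFER with the POSINV sign pattern already applied, so
   the inner loops can use aligned movaps loads with no shuffling. */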
        leal BUFFER, %ecx
        movaps POSINV, %xmm7

        movl K, %eax
        sarl $1, %eax
        jle .L03
        ALIGN_4

.L02:
        movss 0 * SIZE(B), %xmm0
        movss 1 * SIZE(B), %xmm1
        movss 2 * SIZE(B), %xmm2
        movss 3 * SIZE(B), %xmm3

        shufps $0, %xmm0, %xmm0
        shufps $0, %xmm1, %xmm1
        shufps $0, %xmm2, %xmm2
        shufps $0, %xmm3, %xmm3

#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
    defined(TN) || defined(TT) || defined(TR) || defined(TC)
        xorps %xmm7, %xmm1
        xorps %xmm7, %xmm3
#else
        xorps %xmm7, %xmm0
        xorps %xmm7, %xmm2
#endif

        movaps %xmm0, 0 * SIZE(BB)
        movaps %xmm1, 4 * SIZE(BB)
        movaps %xmm2, 8 * SIZE(BB)
        movaps %xmm3, 12 * SIZE(BB)

        movss 4 * SIZE(B), %xmm0
        movss 5 * SIZE(B), %xmm1
        movss 6 * SIZE(B), %xmm2
        movss 7 * SIZE(B), %xmm3

        shufps $0, %xmm0, %xmm0
        shufps $0, %xmm1, %xmm1
        shufps $0, %xmm2, %xmm2
        shufps $0, %xmm3, %xmm3

#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
    defined(TN) || defined(TT) || defined(TR) || defined(TC)
        xorps %xmm7, %xmm1
        xorps %xmm7, %xmm3
#else
        xorps %xmm7, %xmm0
        xorps %xmm7, %xmm2
#endif

        movaps %xmm0, 16 * SIZE(BB)
        movaps %xmm1, 20 * SIZE(BB)
        movaps %xmm2, 24 * SIZE(BB)
        movaps %xmm3, 28 * SIZE(BB)

#ifdef PENTIUM4
        prefetcht1 104 * SIZE(BB)
#endif

        addl $ 8 * SIZE, %edi
        addl $32 * SIZE, %ecx
        decl %eax
        jne .L02
        ALIGN_4

.L03:
        movl K, %eax
        andl $1, %eax
        BRANCH
        jle .L05
        ALIGN_4

.L04:
        movss 0 * SIZE(B), %xmm0
        movss 1 * SIZE(B), %xmm1
        movss 2 * SIZE(B), %xmm2
        movss 3 * SIZE(B), %xmm3

        shufps $0, %xmm0, %xmm0
        shufps $0, %xmm1, %xmm1
        shufps $0, %xmm2, %xmm2
        shufps $0, %xmm3, %xmm3

#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
    defined(TN) || defined(TT) || defined(TR) || defined(TC)
        xorps %xmm7, %xmm1
        xorps %xmm7, %xmm3
#else
        xorps %xmm7, %xmm0
        xorps %xmm7, %xmm2
#endif

        movaps %xmm0, 0 * SIZE(BB)
        movaps %xmm1, 4 * SIZE(BB)
        movaps %xmm2, 8 * SIZE(BB)
        movaps %xmm3, 12 * SIZE(BB)

        addl $ 4 * SIZE, %edi
        ALIGN_4

.L05:
        movl C, %esi
        movl A, %edx
        movl M, %ebx
        sarl $1, %ebx
        jle .L30
        ALIGN_4

.L10:
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

        leal BUFFER, BB # boffset1 = boffset
#else
        leal BUFFER, BB # boffset1 = boffset
        movl KK, %eax
        leal (, %eax, 8), %eax
        leal (AA, %eax, 2), AA
        leal (BB, %eax, 8), BB
#endif

        movaps 0 * SIZE(AA), %xmm0
        xorps %xmm4, %xmm4
        movaps 16 * SIZE(AA), %xmm1
        xorps %xmm5, %xmm5
        movaps 0 * SIZE(BB), %xmm2
        xorps %xmm6, %xmm6
        movaps 16 * SIZE(BB), %xmm3
        xorps %xmm7, %xmm7

#if defined(OPTERON) || defined(BARCELONA)
        prefetchw 4 * SIZE(%esi)
        prefetchw 4 * SIZE(%esi, LDC)
#endif

#ifdef PENTIUM4
        prefetchnta 4 * SIZE(%esi)
        prefetchnta 4 * SIZE(%esi, LDC)
#endif

#ifndef TRMMKERNEL
        movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movl K, %eax
        subl KK, %eax
        movl %eax, KKK
#else
        movl KK, %eax
#ifdef LEFT
        addl $2, %eax
#else
        addl $2, %eax
#endif
        movl %eax, KKK
#endif
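
/*
 * Main K loop, unrolled 8 ways.  This path converts the trip count
 * into a byte offset (sall $4) and runs up to eight inline KERNEL
 * groups, exiting early through .L12, which advances AA and BB past
 * whatever the addressed (non-incrementing) macros have consumed.
 * The disabled #else path is a plain counted loop over one group.
 */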
#if 1
        andl $-8, %eax
        sall $4, %eax
        je .L15

.L1X:
        KERNEL1(32 * 0)
        KERNEL2(32 * 0)
        KERNEL3(32 * 0)
        KERNEL4(32 * 0)
        KERNEL5(32 * 0)
        KERNEL6(32 * 0)
        KERNEL7(32 * 0)
        KERNEL8(32 * 0)
        cmpl $128 * 1, %eax
        jle .L12
        KERNEL1(32 * 1)
        KERNEL2(32 * 1)
        KERNEL3(32 * 1)
        KERNEL4(32 * 1)
        KERNEL5(32 * 1)
        KERNEL6(32 * 1)
        KERNEL7(32 * 1)
        KERNEL8(32 * 1)
        cmpl $128 * 2, %eax
        jle .L12
        KERNEL1(32 * 2)
        KERNEL2(32 * 2)
        KERNEL3(32 * 2)
        KERNEL4(32 * 2)
        KERNEL5(32 * 2)
        KERNEL6(32 * 2)
        KERNEL7(32 * 2)
        KERNEL8(32 * 2)
        cmpl $128 * 3, %eax
        jle .L12
        KERNEL1(32 * 3)
        KERNEL2(32 * 3)
        KERNEL3(32 * 3)
        KERNEL4(32 * 3)
        KERNEL5(32 * 3)
        KERNEL6(32 * 3)
        KERNEL7(32 * 3)
        KERNEL8(32 * 3)
        cmpl $128 * 4, %eax
        jle .L12
        KERNEL1(32 * 4)
        KERNEL2(32 * 4)
        KERNEL3(32 * 4)
        KERNEL4(32 * 4)
        KERNEL5(32 * 4)
        KERNEL6(32 * 4)
        KERNEL7(32 * 4)
        KERNEL8(32 * 4)
        cmpl $128 * 5, %eax
        jle .L12
        KERNEL1(32 * 5)
        KERNEL2(32 * 5)
        KERNEL3(32 * 5)
        KERNEL4(32 * 5)
        KERNEL5(32 * 5)
        KERNEL6(32 * 5)
        KERNEL7(32 * 5)
        KERNEL8(32 * 5)
        cmpl $128 * 6, %eax
        jle .L12
        KERNEL1(32 * 6)
        KERNEL2(32 * 6)
        KERNEL3(32 * 6)
        KERNEL4(32 * 6)
        KERNEL5(32 * 6)
        KERNEL6(32 * 6)
        KERNEL7(32 * 6)
        KERNEL8(32 * 6)
        cmpl $128 * 7, %eax
        jle .L12
        KERNEL1(32 * 7)
        KERNEL2(32 * 7)
        KERNEL3(32 * 7)
        KERNEL4(32 * 7)
        KERNEL5(32 * 7)
        KERNEL6(32 * 7)
        KERNEL7(32 * 7)
        KERNEL8(32 * 7)

        addl $128 * 8 * SIZE, BB
        addl $128 * 2 * SIZE, AA
        subl $128 * 8, %eax
        jg .L1X
        jmp .L15

.L12:
        leal (AA, %eax, 1), AA
        leal (BB, %eax, 4), BB
        ALIGN_4
#else
        sarl $3, %eax
        je .L15
        ALIGN_4

.L11:
        KERNEL1(32 * 0)
        KERNEL2(32 * 0)
        KERNEL3(32 * 0)
        KERNEL4(32 * 0)
        KERNEL5(32 * 0)
        KERNEL6(32 * 0)
        KERNEL7(32 * 0)
        KERNEL8(32 * 0)

        addl $ 32 * SIZE, AA
        addl $128 * SIZE, BB
        decl %eax
        jne .L11
        ALIGN_4
#endif

.L15:
#ifndef TRMMKERNEL
        movl K, %eax
#else
        movl KKK, %eax
#endif
        movaps ALPHA_R, %xmm1
        movaps ALPHA_I, %xmm3
        andl $7, %eax # if (k & 1)
        BRANCH
        je .L14
        ALIGN_4

.L13:
        mulps %xmm0, %xmm2
        addps %xmm2, %xmm4
        movaps 4 * SIZE(BB), %xmm2
        mulps %xmm0, %xmm2
        addps %xmm2, %xmm5
        movaps 8 * SIZE(BB), %xmm2
        mulps %xmm0, %xmm2
        mulps 12 * SIZE(BB), %xmm0
        addps %xmm2, %xmm6
        movaps 16 * SIZE(BB), %xmm2
        addps %xmm0, %xmm7
        movaps 4 * SIZE(AA), %xmm0

        addl $ 4 * SIZE, AA
        addl $16 * SIZE, BB
        decl %eax
        jg .L13
        ALIGN_4

.L14:
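/*
 * Combine the partial products: %xmm5/%xmm7 hold the cross terms,
 * so swapping their real/imaginary lanes (shufps $0xb1) and adding
 * or subtracting yields the complex products, which are then scaled
 * by alpha with the same swap-multiply-add pattern and accumulated
 * into C (unless this is a TRMM kernel, which overwrites C).
 */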
        shufps $0xb1, %xmm5, %xmm5
        shufps $0xb1, %xmm7, %xmm7

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
        subps %xmm5, %xmm4
        subps %xmm7, %xmm6
#else
        addps %xmm5, %xmm4
        addps %xmm7, %xmm6
#endif

        movaps %xmm4, %xmm5
        movaps %xmm6, %xmm7

        shufps $0xb1, %xmm4, %xmm4
        shufps $0xb1, %xmm6, %xmm6

        mulps %xmm1, %xmm5
        mulps %xmm3, %xmm4
        mulps %xmm1, %xmm7
        mulps %xmm3, %xmm6

        addps %xmm5, %xmm4
        addps %xmm7, %xmm6

#ifndef TRMMKERNEL
        shufps $0xe4, %xmm0, %xmm0
        movsd 0 * SIZE(%esi), %xmm0
        movhps 2 * SIZE(%esi), %xmm0
        shufps $0xe4, %xmm2, %xmm2
        movsd 0 * SIZE(%esi, LDC), %xmm2
        movhps 2 * SIZE(%esi, LDC), %xmm2

        addps %xmm0, %xmm4
        addps %xmm2, %xmm6
#endif

        movlps %xmm4, 0 * SIZE(%esi)
        movhps %xmm4, 2 * SIZE(%esi)
        movlps %xmm6, 0 * SIZE(%esi, LDC)
        movhps %xmm6, 2 * SIZE(%esi, LDC)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movl K, %eax
        subl KKK, %eax
        leal (,%eax, 8), %eax
        leal (AA, %eax, 2), AA
        leal (BB, %eax, 8), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
        addl $2, KK
#endif

        addl $4 * SIZE, %esi # coffset += 4
        decl %ebx # i --
        jg .L10
        ALIGN_4

.L30:
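/* If M is odd, handle the last single row against the current pair
   of columns, using movsd loads of one complex element at a time. */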
        movl M, %ebx
        andl $1, %ebx
        jle .L99
        ALIGN_4

.L40:
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

        leal BUFFER, BB # boffset1 = boffset
#else
        leal BUFFER, BB # boffset1 = boffset
        movl KK, %eax
        leal (, %eax, 8), %eax
        leal (AA, %eax, 1), AA
        leal (BB, %eax, 8), BB
#endif

#ifdef movsd
        xorps %xmm0, %xmm0
#endif
        movsd 0 * SIZE(AA), %xmm0
        xorps %xmm4, %xmm4
#ifdef movsd
        xorps %xmm1, %xmm1
#endif
        movsd 8 * SIZE(AA), %xmm1
        xorps %xmm5, %xmm5
        movaps 0 * SIZE(BB), %xmm2
        xorps %xmm6, %xmm6
        movaps 16 * SIZE(BB), %xmm3
        xorps %xmm7, %xmm7

#ifndef TRMMKERNEL
        movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movl K, %eax
        subl KK, %eax
        movl %eax, KKK
#else
        movl KK, %eax
#ifdef LEFT
        addl $1, %eax
#else
        addl $2, %eax
#endif
        movl %eax, KKK
#endif
        sarl $3, %eax
        je .L42
        ALIGN_4

.L41:
        mulps %xmm0, %xmm2
        prefetcht1 (PREFETCHSIZE + 0) * SIZE(AA)
        addps %xmm2, %xmm4
        movaps 4 * SIZE(BB), %xmm2
        mulps %xmm0, %xmm2
        addps %xmm2, %xmm5
        movaps 8 * SIZE(BB), %xmm2
        mulps %xmm0, %xmm2
        mulps 12 * SIZE(BB), %xmm0
        addps %xmm2, %xmm6
        movaps 32 * SIZE(BB), %xmm2
        addps %xmm0, %xmm7
        movsd 2 * SIZE(AA), %xmm0
        mulps %xmm0, %xmm3
        addps %xmm3, %xmm4
        movaps 20 * SIZE(BB), %xmm3
        mulps %xmm0, %xmm3
        addps %xmm3, %xmm5
        movaps 24 * SIZE(BB), %xmm3
        mulps %xmm0, %xmm3
        mulps 28 * SIZE(BB), %xmm0
        addps %xmm3, %xmm6
        movaps 48 * SIZE(BB), %xmm3
        addps %xmm0, %xmm7
        movsd 4 * SIZE(AA), %xmm0
        mulps %xmm0, %xmm2
        addps %xmm2, %xmm4
        movaps 36 * SIZE(BB), %xmm2
        mulps %xmm0, %xmm2
        addps %xmm2, %xmm5
        movaps 40 * SIZE(BB), %xmm2
        mulps %xmm0, %xmm2
        mulps 44 * SIZE(BB), %xmm0
        addps %xmm2, %xmm6
        movaps 64 * SIZE(BB), %xmm2
        addps %xmm0, %xmm7
        movsd 6 * SIZE(AA), %xmm0
        mulps %xmm0, %xmm3
        addps %xmm3, %xmm4
        movaps 52 * SIZE(BB), %xmm3
        mulps %xmm0, %xmm3
        addps %xmm3, %xmm5
        movaps 56 * SIZE(BB), %xmm3
        mulps %xmm0, %xmm3
        mulps 60 * SIZE(BB), %xmm0
        addps %xmm3, %xmm6
        movaps 80 * SIZE(BB), %xmm3
        addps %xmm0, %xmm7
        movsd 16 * SIZE(AA), %xmm0
        mulps %xmm1, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
        prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
        addps %xmm2, %xmm4
        movaps 68 * SIZE(BB), %xmm2
        mulps %xmm1, %xmm2
        addps %xmm2, %xmm5
        movaps 72 * SIZE(BB), %xmm2
        mulps %xmm1, %xmm2
        mulps 76 * SIZE(BB), %xmm1
        addps %xmm2, %xmm6
        movaps 96 * SIZE(BB), %xmm2
        addps %xmm1, %xmm7
        movsd 10 * SIZE(AA), %xmm1
        mulps %xmm1, %xmm3
        addps %xmm3, %xmm4
        movaps 84 * SIZE(BB), %xmm3
        mulps %xmm1, %xmm3
        addps %xmm3, %xmm5
        movaps 88 * SIZE(BB), %xmm3
        mulps %xmm1, %xmm3
        mulps 92 * SIZE(BB), %xmm1
        addps %xmm3, %xmm6
        movaps 112 * SIZE(BB), %xmm3
        addps %xmm1, %xmm7
        movsd 12 * SIZE(AA), %xmm1
        mulps %xmm1, %xmm2
        addps %xmm2, %xmm4
        movaps 100 * SIZE(BB), %xmm2
        mulps %xmm1, %xmm2
        addps %xmm2, %xmm5
        movaps 104 * SIZE(BB), %xmm2
        mulps %xmm1, %xmm2
        mulps 108 * SIZE(BB), %xmm1
        addps %xmm2, %xmm6
        movaps 128 * SIZE(BB), %xmm2
        addps %xmm1, %xmm7
        movsd 14 * SIZE(AA), %xmm1
        mulps %xmm1, %xmm3
        addps %xmm3, %xmm4
        movaps 116 * SIZE(BB), %xmm3
        mulps %xmm1, %xmm3
        addps %xmm3, %xmm5
        movaps 120 * SIZE(BB), %xmm3
        mulps %xmm1, %xmm3
        mulps 124 * SIZE(BB), %xmm1
        addps %xmm3, %xmm6
        movaps 144 * SIZE(BB), %xmm3
        addps %xmm1, %xmm7
        movsd 24 * SIZE(AA), %xmm1

        addl $ 16 * SIZE, AA
        addl $128 * SIZE, BB
        decl %eax
        jne .L41
        ALIGN_4

.L42:
#ifndef TRMMKERNEL
        movl K, %eax
#else
        movl KKK, %eax
#endif
        movaps ALPHA_R, %xmm1
        movaps ALPHA_I, %xmm3
        andl $7, %eax # if (k & 1)
        BRANCH
        je .L44
        ALIGN_4

.L43:
        mulps %xmm0, %xmm2
        addps %xmm2, %xmm4
        movaps 4 * SIZE(BB), %xmm2
        mulps %xmm0, %xmm2
        addps %xmm2, %xmm5
        movaps 8 * SIZE(BB), %xmm2
        mulps %xmm0, %xmm2
        mulps 12 * SIZE(BB), %xmm0
        addps %xmm2, %xmm6
        movaps 16 * SIZE(BB), %xmm2
        addps %xmm0, %xmm7
        movsd 2 * SIZE(AA), %xmm0

        addl $ 2 * SIZE, AA
        addl $16 * SIZE, BB
        decl %eax
        jg .L43
        ALIGN_4

.L44:
        shufps $0xb1, %xmm5, %xmm5
        shufps $0xb1, %xmm7, %xmm7

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
        subps %xmm5, %xmm4
        subps %xmm7, %xmm6
#else
        addps %xmm5, %xmm4
        addps %xmm7, %xmm6
#endif

        movaps %xmm4, %xmm5
        movaps %xmm6, %xmm7

        shufps $0xb1, %xmm4, %xmm4
        shufps $0xb1, %xmm6, %xmm6

        mulps %xmm1, %xmm5
        mulps %xmm3, %xmm4
        mulps %xmm1, %xmm7
        mulps %xmm3, %xmm6

        addps %xmm5, %xmm4
        addps %xmm7, %xmm6

#ifndef TRMMKERNEL
        shufps $0xe4, %xmm4, %xmm4
        shufps $0xe4, %xmm6, %xmm6

#ifdef movsd
        xorps %xmm0, %xmm0
#endif
        movsd 0 * SIZE(%esi), %xmm0
#ifdef movsd
        xorps %xmm2, %xmm2
#endif
        movsd 0 * SIZE(%esi, LDC), %xmm2

        addps %xmm0, %xmm4
        addps %xmm2, %xmm6
#endif

        movlps %xmm4, 0 * SIZE(%esi)
        movlps %xmm6, 0 * SIZE(%esi, LDC)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movl K, %eax
        subl KKK, %eax
        leal (,%eax, 8), %eax
        leal (AA, %eax, 1), AA
        leal (BB, %eax, 8), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
        addl $1, KK
#endif
        ALIGN_4

.L99:
#if defined(TRMMKERNEL) && !defined(LEFT)
        addl $2, KK
#endif

        leal (LDC, LDC), %eax
        addl %eax, C # c += 2 * ldc
        decl J # j --
        jg .L01
        ALIGN_4

.L100:
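/* Second pass: if N is odd, the remaining single column is handled
   here, with a two-row kernel at .L110 and a final 1x1 case at
   .L140. */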
        movl N, %eax
        andl $1, %eax
        jle .L999
        ALIGN_4

.L101:
#if defined(TRMMKERNEL) && defined(LEFT)
        movl OFFSET, %eax
        movl %eax, KK
#endif

/* Copying to Sub Buffer */
        leal BUFFER, %ecx
        movaps POSINV, %xmm7

        movl K, %eax
        sarl $2, %eax
        jle .L103
        ALIGN_4

.L102:
        movss 0 * SIZE(B), %xmm0
        movss 1 * SIZE(B), %xmm1
        movss 2 * SIZE(B), %xmm2
        movss 3 * SIZE(B), %xmm3

        shufps $0, %xmm0, %xmm0
        shufps $0, %xmm1, %xmm1
        shufps $0, %xmm2, %xmm2
        shufps $0, %xmm3, %xmm3

#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
    defined(TN) || defined(TT) || defined(TR) || defined(TC)
        xorps %xmm7, %xmm1
        xorps %xmm7, %xmm3
#else
        xorps %xmm7, %xmm0
        xorps %xmm7, %xmm2
#endif

        movaps %xmm0, 0 * SIZE(BB)
        movaps %xmm1, 4 * SIZE(BB)
        movaps %xmm2, 8 * SIZE(BB)
        movaps %xmm3, 12 * SIZE(BB)

        movss 4 * SIZE(B), %xmm0
        movss 5 * SIZE(B), %xmm1
        movss 6 * SIZE(B), %xmm2
        movss 7 * SIZE(B), %xmm3

        shufps $0, %xmm0, %xmm0
        shufps $0, %xmm1, %xmm1
        shufps $0, %xmm2, %xmm2
        shufps $0, %xmm3, %xmm3

#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
    defined(TN) || defined(TT) || defined(TR) || defined(TC)
        xorps %xmm7, %xmm1
        xorps %xmm7, %xmm3
#else
        xorps %xmm7, %xmm0
        xorps %xmm7, %xmm2
#endif

        movaps %xmm0, 16 * SIZE(BB)
        movaps %xmm1, 20 * SIZE(BB)
        movaps %xmm2, 24 * SIZE(BB)
        movaps %xmm3, 28 * SIZE(BB)

        prefetcht0 104 * SIZE(B)

        addl $ 8 * SIZE, B
        addl $32 * SIZE, BB
        decl %eax
        jne .L102
        ALIGN_4

.L103:
        movl K, %eax
        andl $3, %eax
        BRANCH
        jle .L105
        ALIGN_4

.L104:
        movss 0 * SIZE(B), %xmm0
        movss 1 * SIZE(B), %xmm1

        shufps $0, %xmm0, %xmm0
        shufps $0, %xmm1, %xmm1

#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
    defined(TN) || defined(TT) || defined(TR) || defined(TC)
        xorps %xmm7, %xmm1
#else
        xorps %xmm7, %xmm0
#endif

        movaps %xmm0, 0 * SIZE(BB)
        movaps %xmm1, 4 * SIZE(BB)

        addl $ 2 * SIZE, %edi
        addl $ 8 * SIZE, %ecx
        decl %eax
        jne .L104
        ALIGN_4

.L105:
        movl C, %esi
        movl A, AA
        movl M, %ebx
        sarl $1, %ebx
        jle .L130
        ALIGN_4

.L110:
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

        leal BUFFER, BB # boffset1 = boffset
#else
        leal BUFFER, BB # boffset1 = boffset
        movl KK, %eax
        leal (, %eax, 8), %eax
        leal (AA, %eax, 2), AA
        leal (BB, %eax, 4), BB
#endif

        xorps %xmm4, %xmm4
        xorps %xmm5, %xmm5
        xorps %xmm6, %xmm6
        xorps %xmm7, %xmm7

        movaps 0 * SIZE(AA), %xmm0
        movaps 16 * SIZE(AA), %xmm1
        movaps 0 * SIZE(BB), %xmm2
        movaps 16 * SIZE(BB), %xmm3

#if defined(OPTERON) || defined(BARCELONA)
        prefetchw 4 * SIZE(%esi)
#endif

#ifdef PENTIUM4
        prefetchnta 4 * SIZE(%esi)
#endif

#ifndef TRMMKERNEL
        movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movl K, %eax
        subl KK, %eax
        movl %eax, KKK
#else
        movl KK, %eax
#ifdef LEFT
        addl $2, %eax
#else
        addl $1, %eax
#endif
        movl %eax, KKK
#endif
        sarl $3, %eax
        je .L112
        ALIGN_4

.L111:
        mulps %xmm0, %xmm2
        addps %xmm2, %xmm4
        movaps 4 * SIZE(BB), %xmm2
        mulps %xmm0, %xmm2
        movaps 4 * SIZE(AA), %xmm0
        addps %xmm2, %xmm5
        movaps 8 * SIZE(BB), %xmm2
        mulps %xmm0, %xmm2
        addps %xmm2, %xmm6
        movaps 12 * SIZE(BB), %xmm2
        mulps %xmm0, %xmm2
        movaps 8 * SIZE(AA), %xmm0
        addps %xmm2, %xmm7
        movaps 32 * SIZE(BB), %xmm2
        mulps %xmm0, %xmm3
        addps %xmm3, %xmm4
        movaps 20 * SIZE(BB), %xmm3
        mulps %xmm0, %xmm3
        movaps 12 * SIZE(AA), %xmm0
        addps %xmm3, %xmm5
        movaps 24 * SIZE(BB), %xmm3
        mulps %xmm0, %xmm3
        addps %xmm3, %xmm6
        movaps 28 * SIZE(BB), %xmm3
        mulps %xmm0, %xmm3
        movaps 32 * SIZE(AA), %xmm0
        addps %xmm3, %xmm7
        movaps 48 * SIZE(BB), %xmm3
        mulps %xmm1, %xmm2
        addps %xmm2, %xmm4
        movaps 36 * SIZE(BB), %xmm2
        mulps %xmm1, %xmm2
        movaps 20 * SIZE(AA), %xmm1
        addps %xmm2, %xmm5
        movaps 40 * SIZE(BB), %xmm2
        mulps %xmm1, %xmm2
        addps %xmm2, %xmm6
        movaps 44 * SIZE(BB), %xmm2
        mulps %xmm1, %xmm2
        movaps 24 * SIZE(AA), %xmm1
        addps %xmm2, %xmm7
        movaps 64 * SIZE(BB), %xmm2
        mulps %xmm1, %xmm3
        addps %xmm3, %xmm4
        movaps 52 * SIZE(BB), %xmm3
        mulps %xmm1, %xmm3
        movaps 28 * SIZE(AA), %xmm1
        addps %xmm3, %xmm5
        movaps 56 * SIZE(BB), %xmm3
        mulps %xmm1, %xmm3
        addps %xmm3, %xmm6
        movaps 60 * SIZE(BB), %xmm3
        mulps %xmm1, %xmm3
        movaps 48 * SIZE(AA), %xmm1
        addps %xmm3, %xmm7
        movaps 80 * SIZE(BB), %xmm3

        addl $ 32 * SIZE, AA
        addl $ 64 * SIZE, BB
        decl %eax
        jne .L111
        ALIGN_4

.L112:
#ifndef TRMMKERNEL
        movl K, %eax
#else
        movl KKK, %eax
#endif
        movaps ALPHA_R, %xmm1
        movaps ALPHA_I, %xmm3
        andl $7, %eax # if (k & 1)
        BRANCH
        je .L114
        ALIGN_4

.L113:
        mulps %xmm0, %xmm2
        mulps 4 * SIZE(BB), %xmm0
        addps %xmm2, %xmm4
        movaps 8 * SIZE(BB), %xmm2
        addps %xmm0, %xmm5
        movaps 4 * SIZE(AA), %xmm0

        addl $ 4 * SIZE, AA
        addl $ 8 * SIZE, BB
        decl %eax
        jg .L113
        ALIGN_4

.L114:
        addps %xmm6, %xmm4
        addps %xmm7, %xmm5

        shufps $0xb1, %xmm5, %xmm5

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
        subps %xmm5, %xmm4
#else
        addps %xmm5, %xmm4
#endif

        movaps %xmm4, %xmm5
        shufps $0xb1, %xmm4, %xmm4

        mulps %xmm1, %xmm5
        mulps %xmm3, %xmm4

        addps %xmm5, %xmm4

#ifndef TRMMKERNEL
        shufps $0xe4, %xmm4, %xmm4
        movsd 0 * SIZE(%esi), %xmm0
        movhps 2 * SIZE(%esi), %xmm0

        addps %xmm0, %xmm4
#endif

        movlps %xmm4, 0 * SIZE(%esi)
        movhps %xmm4, 2 * SIZE(%esi)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movl K, %eax
        subl KKK, %eax
        leal (,%eax, 8), %eax
        leal (AA, %eax, 2), AA
        leal (BB, %eax, 4), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
        addl $2, KK
#endif

        addl $4 * SIZE, %esi # coffset += 4
        decl %ebx # i --
        jg .L110
        ALIGN_4

.L130:
        movl M, %ebx
        andl $1, %ebx
        jle .L999
        ALIGN_4

.L140:
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

        leal BUFFER, BB # boffset1 = boffset
#else
        leal BUFFER, BB # boffset1 = boffset
        movl KK, %eax
        leal (, %eax, 8), %eax
        leal (AA, %eax, 1), AA
        leal (BB, %eax, 4), BB
#endif

        movaps 0 * SIZE(AA), %xmm0
        xorps %xmm4, %xmm4
        movaps 8 * SIZE(AA), %xmm1
        xorps %xmm5, %xmm5
        movaps 0 * SIZE(BB), %xmm2
        xorps %xmm6, %xmm6
        movaps 16 * SIZE(BB), %xmm3
        xorps %xmm7, %xmm7

#ifndef TRMMKERNEL
        movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movl K, %eax
        subl KK, %eax
        movl %eax, KKK
#else
        movl KK, %eax
#ifdef LEFT
        addl $1, %eax
#else
        addl $1, %eax
#endif
        movl %eax, KKK
#endif
        sarl $3, %eax
        je .L142
        ALIGN_4

.L141:
        mulps %xmm0, %xmm2
        addps %xmm2, %xmm4
        movaps 4 * SIZE(BB), %xmm2
        mulps %xmm0, %xmm2
        movsd 2 * SIZE(AA), %xmm0
        addps %xmm2, %xmm5
        movaps 8 * SIZE(BB), %xmm2
        mulps %xmm0, %xmm2
        addps %xmm2, %xmm6
        movaps 12 * SIZE(BB), %xmm2
        mulps %xmm0, %xmm2
        movsd 4 * SIZE(AA), %xmm0
        addps %xmm2, %xmm7
        movaps 32 * SIZE(BB), %xmm2
        mulps %xmm0, %xmm3
        addps %xmm3, %xmm4
        movaps 20 * SIZE(BB), %xmm3
        mulps %xmm0, %xmm3
        movsd 6 * SIZE(AA), %xmm0
        addps %xmm3, %xmm5
        movaps 24 * SIZE(BB), %xmm3
        mulps %xmm0, %xmm3
        addps %xmm3, %xmm6
        movaps 28 * SIZE(BB), %xmm3
        mulps %xmm0, %xmm3
        movsd 16 * SIZE(AA), %xmm0
        addps %xmm3, %xmm7
        movaps 48 * SIZE(BB), %xmm3
        mulps %xmm1, %xmm2
        addps %xmm2, %xmm4
        movaps 36 * SIZE(BB), %xmm2
        mulps %xmm1, %xmm2
        movsd 10 * SIZE(AA), %xmm1
        addps %xmm2, %xmm5
        movaps 40 * SIZE(BB), %xmm2
        mulps %xmm1, %xmm2
        addps %xmm2, %xmm6
        movaps 44 * SIZE(BB), %xmm2
        mulps %xmm1, %xmm2
        movsd 12 * SIZE(AA), %xmm1
        addps %xmm2, %xmm7
        movaps 64 * SIZE(BB), %xmm2
        mulps %xmm1, %xmm3
        addps %xmm3, %xmm4
        movaps 52 * SIZE(BB), %xmm3
        mulps %xmm1, %xmm3
        movsd 14 * SIZE(AA), %xmm1
        addps %xmm3, %xmm5
        movaps 56 * SIZE(BB), %xmm3
        mulps %xmm1, %xmm3
        addps %xmm3, %xmm6
        movaps 60 * SIZE(BB), %xmm3
        mulps %xmm1, %xmm3
        movsd 24 * SIZE(AA), %xmm1
        addps %xmm3, %xmm7
        movaps 80 * SIZE(BB), %xmm3

        addl $ 16 * SIZE, AA
        addl $ 64 * SIZE, BB
        decl %eax
        jne .L141
        ALIGN_4

.L142:
#ifndef TRMMKERNEL
        movl K, %eax
#else
        movl KKK, %eax
#endif
        movaps ALPHA_R, %xmm1
        movaps ALPHA_I, %xmm3
        andl $7, %eax # if (k & 1)
        BRANCH
        je .L144
        ALIGN_4

.L143:
        mulps %xmm0, %xmm2
        mulps 4 * SIZE(BB), %xmm0
        addps %xmm2, %xmm4
        movaps 8 * SIZE(BB), %xmm2
        addps %xmm0, %xmm5
        movsd 2 * SIZE(AA), %xmm0

        addl $2 * SIZE, AA
        addl $8 * SIZE, BB
        decl %eax
        jg .L143
        ALIGN_4

.L144:
        addps %xmm6, %xmm4
        addps %xmm7, %xmm5

        shufps $0xb1, %xmm5, %xmm5

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
        subps %xmm5, %xmm4
#else
        addps %xmm5, %xmm4
#endif

        movaps %xmm4, %xmm5
        shufps $0xb1, %xmm4, %xmm4

        mulps %xmm1, %xmm5
        mulps %xmm3, %xmm4

        addps %xmm5, %xmm4

#ifndef TRMMKERNEL
        shufps $0xe4, %xmm4, %xmm4
#ifdef movsd
        xorps %xmm0, %xmm0
#endif
        movsd 0 * SIZE(%esi), %xmm0

        addps %xmm0, %xmm4
#endif

        movlps %xmm4, 0 * SIZE(%esi)
        ALIGN_4

.L999:
        EMMS

        movl OLD_STACK, %esp

        popl %ebx
        popl %esi
        popl %edi
        popl %ebp
        ret

        EPILOGUE