
cgemm_kernel_4x2_piledriver.S 52 kB

  1. /***************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /*********************************************************************
  28. *
  29. * 2014/06/28 Saar
  30. * BLASTEST : OK
  31. * CTEST : OK
  32. * TEST : OK
  33. *
  34. *
  35. * 2013/10/31 Saar
  36. *
  37. * Parameter:
  38. * UNROLL_M 4
  39. * UNROLL_N 2
  40. * CGEMM_P 768
  41. * CGEMM_Q 168
  42. * A_PR1 512
  43. * B_PR1 256
  44. *
  45. * Performance at m x n on AMD 8320 (ACML-Version: 5.3.1):
  46. *
  47. * 4608x4608 154.0 GFLOPS with 8 threads on 4 modules (ACML: 111.7 ) (BULLDOZER: 153.9 )
  48. * 4608x4608 148.3 GFLOPS with 4 threads on 4 modules (ACML: 96.0 ) (BULLDOZER: 143.2 )
  49. * 3456x3456 74.3 GFLOPS with 2 threads on 2 modules (ACML: 47.3 ) (BULLDOZER: 72.3 )
  50. * 3456x3456 37.3 GFLOPS with 1 thread on 1 module (ACML: 24.2 ) (BULLDOZER: 36.5 )
  51. *
  52. * Performance at m x n on AMD 6380 (ACML-Version: 5.3.1):
  53. *
  54. * 6912x6912 421.5 GFLOPS with 32 threads on 16 modules (ACML: 266.6 ) (BULLDOZER: 422.5 )
  55. * 6912x6912 407.0 GFLOPS with 16 threads on 16 modules (ACML: 271.5 ) (BULLDOZER: 404.7 )
  56. * 6912x6912 234.2 GFLOPS with 8 threads on 8 modules (ACML: 164.0 ) (BULLDOZER: 230.5 )
  57. * 4608x4608 123.1 GFLOPS with 4 threads on 4 modules (ACML: 87.9 ) (BULLDOZER: 120.9 )
  58. * 3456x3456 62.6 GFLOPS with 2 threads on 2 modules (ACML: 44.5 ) (BULLDOZER: 62.1 )
  59. * 3456x3456 31.8 GFLOPS with 1 thread on 1 module (ACML: 22.6 ) (BULLDOZER: 31.4 )
  60. *
  61. *********************************************************************/
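/*********************************************************************
*
* A short gloss on the parameters above, as used by this kernel and
* the level-3 driver: UNROLL_M x UNROLL_N = 4x2 is the register block
* computed per inner iteration, CGEMM_P / CGEMM_Q are the cache-
* blocking sizes the driver uses for the packed panels, and A_PR1 /
* B_PR1 are the byte offsets used by the prefetcht0 instructions in
* the kernel macros below.
*
* Rough sanity check of the figures above, assuming the conventional
* 8*M*N*K flop count for complex GEMM (4 multiplies + 4 adds per
* complex multiply-accumulate):
*
*   M = N = K = 4608  ->  8 * 4608^3 ~ 7.8e11 flops
*   7.8e11 flops / 154.0e9 flops/s  ~ 5.1 s per call
*
*********************************************************************/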
  62. #define ASSEMBLER
  63. #include "common.h"
  64. #define OLD_M %rdi
  65. #define OLD_N %rsi
  66. #define M %r13
  67. #define J %r14
  68. #define OLD_K %rdx
  69. #define A %rcx
  70. #define B %r8
  71. #define C %r9
  72. #define LDC %r10
  73. #define I %r11
  74. #define AO %rdi
  75. #define BO %rsi
  76. #define CO1 %r15
  77. #define K %r12
  78. #define BI %rbp
  79. #define SP %rbx
  80. #define BO1 %rdi
  81. #define BO2 %r15
  82. #ifndef WINDOWS_ABI
  83. #define STACKSIZE 96
  84. #else
  85. #define STACKSIZE 320
  86. #define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
  87. #define OLD_A 48 + STACKSIZE(%rsp)
  88. #define OLD_B 56 + STACKSIZE(%rsp)
  89. #define OLD_C 64 + STACKSIZE(%rsp)
  90. #define OLD_LDC 72 + STACKSIZE(%rsp)
  91. #define OLD_OFFSET 80 + STACKSIZE(%rsp)
  92. #endif
  93. #define L_BUFFER_SIZE 256*8*4
  94. #define Ndiv6 24(%rsp)
  95. #define Nmod6 32(%rsp)
  96. #define N 40(%rsp)
  97. #define ALPHA_R 48(%rsp)
  98. #define ALPHA_I 56(%rsp)
  99. #define OFFSET 64(%rsp)
  100. #define KK 72(%rsp)
  101. #define KKK 80(%rsp)
  102. #define BUFFER1 128(%rsp)
  103. #if defined(OS_WINDOWS)
  104. #if L_BUFFER_SIZE > 16384
  105. #define STACK_TOUCH \
  106. movl $0, 4096 * 4(%rsp);\
  107. movl $0, 4096 * 3(%rsp);\
  108. movl $0, 4096 * 2(%rsp);\
  109. movl $0, 4096 * 1(%rsp);
  110. #elif L_BUFFER_SIZE > 12288
  111. #define STACK_TOUCH \
  112. movl $0, 4096 * 3(%rsp);\
  113. movl $0, 4096 * 2(%rsp);\
  114. movl $0, 4096 * 1(%rsp);
  115. #elif L_BUFFER_SIZE > 8192
  116. #define STACK_TOUCH \
  117. movl $0, 4096 * 2(%rsp);\
  118. movl $0, 4096 * 1(%rsp);
  119. #elif L_BUFFER_SIZE > 4096
  120. #define STACK_TOUCH \
  121. movl $0, 4096 * 1(%rsp);
  122. #else
  123. #define STACK_TOUCH
  124. #endif
  125. #else
  126. #define STACK_TOUCH
  127. #endif
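/*
* STACK_TOUCH exists because of how Windows grows the stack: pages
* below the committed region are guarded one page at a time, so after
* %rsp is dropped by more than a page each new 4 KB page has to be
* touched, starting next to the already committed area. That is what
* the movl $0, n*4096(%rsp) stores above do (highest offset first).
* A minimal C sketch of the same idea, assuming rsp points at the
* new, lower stack top and pages is the number of pages to commit:
*
*   for (long off = pages * 4096; off >= 4096; off -= 4096)
*       ((volatile char *)rsp)[off] = 0;   // commit one page at a time
*
* With L_BUFFER_SIZE = 256*8*4 = 8192 only the single-store variant
* (L_BUFFER_SIZE > 4096) is actually selected here.
*/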
  128. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  129. #define VFMADD_R vfmaddps
  130. #define VFMADD_I vfmaddps
  131. #elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
  132. #define VFMADD_R vfnmaddps
  133. #define VFMADD_I vfmaddps
  134. #elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
  135. #define VFMADD_R vfmaddps
  136. #define VFMADD_I vfnmaddps
  137. #else
  138. #define VFMADD_R vfnmaddps
  139. #define VFMADD_I vfnmaddps
  140. #endif
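/*
* How the VFMADD_R / VFMADD_I selection above plays out (a sketch of
* the arithmetic, derived from the kernel macros below): A is stored
* with real and imaginary parts interleaved, and for every column of B
* the kernel broadcasts the real part (xmm4/xmm6) and the imaginary
* part (xmm5/xmm7) separately. VFMADD_R accumulates the products with
* the real part of B, VFMADD_I the products with the imaginary part,
* into two separate register sets. The epilogue then swaps the 32-bit
* halves of the second set (vshufps $0xb1) and combines the two with
* vaddsubps, giving per complex element
*
*   c_r += a_r*b_r - a_i*b_i
*   c_i += a_i*b_r + a_r*b_i
*
* Conjugating A and/or B only flips the sign of the terms containing
* a_i or b_i, which is what the vfmaddps / vfnmaddps combinations per
* transpose/conjugate case encode.
*/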
  141. #define A_PR1 512
  142. #define B_PR1 256
  143. #define KERNEL4x2_1(xx) \
  144. prefetcht0 A_PR1(AO,%rax,SIZE) ;\
  145. vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  146. vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\
  147. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  148. vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\
  149. VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
  150. vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\
  151. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  152. VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
  153. vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\
  154. VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
  155. VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\
  156. vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\
  157. VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
  158. VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\
  159. #define KERNEL4x2_2(xx) \
  160. vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  161. vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
  162. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  163. vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\
  164. VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
  165. vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
  166. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  167. VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
  168. vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\
  169. VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
  170. VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\
  171. vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\
  172. VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
  173. VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\
  174. #define KERNEL4x2_3(xx) \
  175. prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\
  176. vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  177. vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\
  178. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  179. vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\
  180. VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
  181. vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\
  182. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  183. VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
  184. vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\
  185. VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
  186. VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\
  187. vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\
  188. VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
  189. VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\
  190. #define KERNEL4x2_4(xx) \
  191. vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  192. vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\
  193. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  194. vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\
  195. VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
  196. vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\
  197. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  198. VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
  199. vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\
  200. VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
  201. VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\
  202. vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\
  203. VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
  204. VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\
  205. addq $16, BI ;\
  206. addq $32, %rax ;\
  207. #define KERNEL4x2_SUB(xx) \
  208. vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  209. vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\
  210. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  211. vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\
  212. VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
  213. vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\
  214. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  215. VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
  216. vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\
  217. VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
  218. VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\
  219. vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\
  220. VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
  221. VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\
  222. addq $4, BI ;\
  223. addq $8, %rax ;\
  224. /************************************************************************************************/
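/*
* Register layout of the 4x2 kernels above (as the loads and the
* .L2_19 stores suggest): xmm0/xmm1 hold four complex elements of the
* packed A panel, xmm4..xmm7 hold the broadcast real/imaginary parts
* of the two current B values. The eight accumulators are
*
*   xmm8 /xmm12 : A * re(b0)      xmm9 /xmm13 : A * im(b0)
*   xmm10/xmm14 : A * re(b1)      xmm11/xmm15 : A * im(b1)
*
* which the epilogue later folds into the two C columns at CO1 and
* CO1 + LDC.
*/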
  225. #define KERNEL2x2_1(xx) \
  226. prefetcht0 A_PR1(AO,%rax,SIZE) ;\
  227. vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  228. vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\
  229. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  230. vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\
  231. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  232. vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\
  233. VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
  234. vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\
  235. VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
  236. #define KERNEL2x2_2(xx) \
  237. vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  238. vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
  239. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  240. VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
  241. vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
  242. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  243. vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\
  244. VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
  245. vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\
  246. VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
  247. #define KERNEL2x2_3(xx) \
  248. vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  249. vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\
  250. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  251. vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\
  252. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  253. vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\
  254. VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
  255. vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\
  256. VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
  257. #define KERNEL2x2_4(xx) \
  258. vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  259. vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\
  260. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  261. vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\
  262. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  263. vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\
  264. VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
  265. vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\
  266. VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
  267. addq $16, BI ;\
  268. addq $16, %rax ;\
  269. #define KERNEL2x2_SUB(xx) \
  270. vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  271. vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\
  272. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  273. vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\
  274. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  275. vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\
  276. VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
  277. vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\
  278. VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
  279. addq $4, BI ;\
  280. addq $4, %rax ;\
  281. /************************************************************************************************/
  282. #define KERNEL1x2_1(xx) \
  283. vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  284. vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\
  285. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  286. vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\
  287. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  288. vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\
  289. VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
  290. vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\
  291. VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
  292. #define KERNEL1x2_2(xx) \
  293. vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  294. vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
  295. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  296. VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
  297. vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
  298. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  299. vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\
  300. VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
  301. vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\
  302. VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
  303. #define KERNEL1x2_3(xx) \
  304. vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  305. vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\
  306. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  307. vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\
  308. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  309. vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\
  310. VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
  311. vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\
  312. VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
  313. #define KERNEL1x2_4(xx) \
  314. vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  315. vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\
  316. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  317. vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\
  318. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  319. vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\
  320. VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
  321. vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\
  322. VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
  323. addq $16, BI ;\
  324. addq $8, %rax ;\
  325. #define KERNEL1x2_SUB(xx) \
  326. vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  327. vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\
  328. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  329. vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\
  330. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  331. vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\
  332. VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\
  333. vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\
  334. VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\
  335. addq $4, BI ;\
  336. addq $2, %rax ;\
  337. /************************************************************************************************/
  338. #define KERNEL4x1_1(xx) \
  339. prefetcht0 A_PR1(AO,%rax,SIZE) ;\
  340. vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  341. vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
  342. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  343. vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\
  344. VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
  345. vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
  346. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  347. VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
  348. #define KERNEL4x1_2(xx) \
  349. vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  350. vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\
  351. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  352. vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\
  353. VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
  354. vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\
  355. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  356. VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
  357. #define KERNEL4x1_3(xx) \
  358. prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\
  359. vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  360. vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\
  361. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  362. vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\
  363. VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
  364. vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\
  365. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  366. VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
  367. #define KERNEL4x1_4(xx) \
  368. vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  369. vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\
  370. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  371. vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\
  372. VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
  373. vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\
  374. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  375. VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
  376. addq $8, BI ;\
  377. addq $32, %rax ;\
  378. #define KERNEL4x1_SUB(xx) \
  379. vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  380. vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
  381. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  382. vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\
  383. VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\
  384. vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
  385. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  386. VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\
  387. addq $2, BI ;\
  388. addq $8, %rax ;\
  389. /************************************************************************************************/
  390. #define KERNEL2x1_1(xx) \
  391. prefetcht0 A_PR1(AO,%rax,SIZE) ;\
  392. vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  393. vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
  394. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  395. vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
  396. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  397. #define KERNEL2x1_2(xx) \
  398. vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  399. vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\
  400. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  401. vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\
  402. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  403. #define KERNEL2x1_3(xx) \
  404. vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  405. vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\
  406. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  407. vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\
  408. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  409. #define KERNEL2x1_4(xx) \
  410. vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  411. vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\
  412. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  413. vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\
  414. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  415. addq $8, BI ;\
  416. addq $16, %rax ;\
  417. #define KERNEL2x1_SUB(xx) \
  418. vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  419. vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
  420. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  421. vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
  422. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  423. addq $2, BI ;\
  424. addq $4, %rax ;\
  425. /************************************************************************************************/
  426. #define KERNEL1x1_1(xx) \
  427. vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  428. vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
  429. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  430. vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
  431. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  432. #define KERNEL1x1_2(xx) \
  433. vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  434. vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\
  435. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  436. vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\
  437. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  438. #define KERNEL1x1_3(xx) \
  439. vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  440. vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\
  441. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  442. vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\
  443. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  444. #define KERNEL1x1_4(xx) \
  445. vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  446. vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\
  447. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  448. vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\
  449. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  450. addq $8, BI ;\
  451. addq $8, %rax ;\
  452. #define KERNEL1x1_SUB(xx) \
  453. vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\
  454. vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
  455. VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\
  456. vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
  457. VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\
  458. addq $2, BI ;\
  459. addq $2, %rax ;\
  460. /************************************************************************************************/
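/*
* Overall flow of the routine below: the PROLOGUE saves the callee-
* saved registers (and, under WINDOWS_ABI, xmm6-xmm15), a local buffer
* on the stack (BUFFER1) receives a copy of the current panel of B,
* and the main work is split into two nearly identical halves:
* .L2_01 onward processes N two columns at a time with M blocked as
* 4 (.L2_11), 2 (.L2_21) and 1 (.L2_41); .L1_01 onward handles a
* single remaining column the same way. .L999 restores the saved
* state and returns.
*/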
  461. PROLOGUE
  462. PROFCODE
  463. subq $STACKSIZE, %rsp
  464. movq %rbx, (%rsp)
  465. movq %rbp, 8(%rsp)
  466. movq %r12, 16(%rsp)
  467. movq %r13, 24(%rsp)
  468. movq %r14, 32(%rsp)
  469. movq %r15, 40(%rsp)
  470. vzeroupper
  471. #ifdef WINDOWS_ABI
  472. movq %rdi, 48(%rsp)
  473. movq %rsi, 56(%rsp)
  474. vmovups %xmm6, 64(%rsp)
  475. vmovups %xmm7, 80(%rsp)
  476. vmovups %xmm8, 96(%rsp)
  477. vmovups %xmm9, 112(%rsp)
  478. vmovups %xmm10, 128(%rsp)
  479. vmovups %xmm11, 144(%rsp)
  480. vmovups %xmm12, 160(%rsp)
  481. vmovups %xmm13, 176(%rsp)
  482. vmovups %xmm14, 192(%rsp)
  483. vmovups %xmm15, 208(%rsp)
  484. movq ARG1, OLD_M
  485. movq ARG2, OLD_N
  486. movq ARG3, OLD_K
  487. movq OLD_A, A
  488. movq OLD_B, B
  489. movq OLD_C, C
  490. movq OLD_LDC, LDC
  491. #ifdef TRMMKERNEL
  492. movsd OLD_OFFSET, %xmm12
  493. #endif
  494. vmovaps %xmm3, %xmm0
  495. vmovsd OLD_ALPHA_I, %xmm1
  496. #else
  497. movq STACKSIZE + 8(%rsp), LDC
  498. #ifdef TRMMKERNEL
  499. movsd STACKSIZE + 16(%rsp), %xmm12
  500. #endif
  501. #endif
  502. movq %rsp, SP # save old stack
  503. subq $128 + L_BUFFER_SIZE, %rsp
  504. andq $-4096, %rsp # align stack
  505. STACK_TOUCH
  506. cmpq $0, OLD_M
  507. je .L999
  508. cmpq $0, OLD_N
  509. je .L999
  510. cmpq $0, OLD_K
  511. je .L999
  512. movq OLD_M, M
  513. movq OLD_N, N
  514. movq OLD_K, K
  515. vmovss %xmm0, ALPHA_R
  516. vmovss %xmm1, ALPHA_I
  517. salq $ZBASE_SHIFT, LDC
  518. movq N, %rax
  519. xorq %rdx, %rdx
  520. movq $2, %rdi
  521. divq %rdi // N / 2
  522. movq %rax, Ndiv6 // N / 2
  523. movq %rdx, Nmod6 // N % 2
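// Note: despite the names, Ndiv6 and Nmod6 hold N/2 and N%2 here; the
// divq by %rdi = 2 above splits N into the two-column main loop count
// and the single-column remainder handled from .L1_0 on.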
  524. #ifdef TRMMKERNEL
  525. vmovsd %xmm12, OFFSET
  526. vmovsd %xmm12, KK
  527. #ifndef LEFT
  528. negq KK
  529. #endif
  530. #endif
  531. .L2_0:
  532. movq Ndiv6, J
  533. cmpq $0, J
  534. je .L1_0
  535. ALIGN_4
  536. .L2_01:
  537. // copy to sub buffer
  538. movq B, BO1
  539. leaq BUFFER1, BO // first buffer to BO
  540. movq K, %rax
  541. ALIGN_4
  542. .L2_02b:
  543. vmovups (BO1), %xmm0
  544. vmovups %xmm0, (BO)
  545. addq $4*SIZE,BO1
  546. addq $4*SIZE,BO
  547. decq %rax
  548. jnz .L2_02b
  549. .L2_02c:
  550. movq BO1, B // next offset of B
  551. .L2_10:
  552. movq C, CO1
  553. leaq (C, LDC, 2), C // c += 2 * ldc
  554. #if defined(TRMMKERNEL) && defined(LEFT)
  555. movq OFFSET, %rax
  556. movq %rax, KK
  557. #endif
  558. movq A, AO // aoffset = a
  559. addq $16 * SIZE, AO
  560. movq M, I
  561. sarq $2, I // i = (m >> 2)
  562. je .L2_20
  563. ALIGN_4
  564. .L2_11:
  565. #if !defined(TRMMKERNEL) || \
  566. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  567. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  568. leaq BUFFER1, BO // first buffer to BO
  569. addq $8 * SIZE, BO
  570. #else
  571. movq KK, %rax
  572. leaq BUFFER1, BO // first buffer to BO
  573. addq $8 * SIZE, BO
  574. movq %rax, BI // Index for BO
  575. leaq (,BI,4), BI // BI = BI * 4 ; number of values
  576. leaq (BO, BI, SIZE), BO
  577. salq $3, %rax // rax = rax * 8 ; number of values
  578. leaq (AO, %rax, SIZE), AO
  579. #endif
  580. vzeroall
  581. #ifndef TRMMKERNEL
  582. movq K, %rax
  583. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  584. movq K, %rax
  585. subq KK, %rax
  586. movq %rax, KKK
  587. #else
  588. movq KK, %rax
  589. #ifdef LEFT
  590. addq $4, %rax // number of values in AO
  591. #else
  592. addq $2, %rax // number of values in BO
  593. #endif
  594. movq %rax, KKK
  595. #endif
  596. andq $-8, %rax // K = K - ( K % 8 )
  597. je .L2_16
  598. movq %rax, BI // Index for BO
  599. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  600. salq $3, %rax // rax = rax * 8 ; number of values
  601. leaq (AO, %rax, SIZE), AO
  602. leaq (BO, BI, SIZE), BO
  603. negq BI
  604. negq %rax
  605. ALIGN_4
  606. .L2_12:
  607. prefetcht0 B_PR1(BO,BI,SIZE)
  608. KERNEL4x2_1(xxx)
  609. KERNEL4x2_2(xxx)
  610. KERNEL4x2_3(xxx)
  611. KERNEL4x2_4(xxx)
  612. prefetcht0 B_PR1(BO,BI,SIZE)
  613. KERNEL4x2_1(xxx)
  614. KERNEL4x2_2(xxx)
  615. KERNEL4x2_3(xxx)
  616. KERNEL4x2_4(xxx)
  617. je .L2_16
  618. prefetcht0 B_PR1(BO,BI,SIZE)
  619. KERNEL4x2_1(xxx)
  620. KERNEL4x2_2(xxx)
  621. KERNEL4x2_3(xxx)
  622. KERNEL4x2_4(xxx)
  623. prefetcht0 B_PR1(BO,BI,SIZE)
  624. KERNEL4x2_1(xxx)
  625. KERNEL4x2_2(xxx)
  626. KERNEL4x2_3(xxx)
  627. KERNEL4x2_4(xxx)
  628. je .L2_16
  629. jmp .L2_12
  630. ALIGN_4
  631. .L2_16:
  632. #ifndef TRMMKERNEL
  633. movq K, %rax
  634. #else
  635. movq KKK, %rax
  636. #endif
  637. andq $7, %rax # if (k & 7)
  638. je .L2_19
  639. movq %rax, BI // Index for BO
  640. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  641. salq $3, %rax // rax = rax * 8 ; number of values
  642. leaq (AO, %rax, SIZE), AO
  643. leaq (BO, BI, SIZE), BO
  644. negq BI
  645. negq %rax
  646. ALIGN_4
  647. .L2_17:
  648. KERNEL4x2_SUB(xxx)
  649. jl .L2_17
  650. ALIGN_4
  651. .L2_19:
  652. vbroadcastss ALPHA_R, %xmm0
  653. vbroadcastss ALPHA_I, %xmm1
  654. // swap real and imaginary parts (adjacent 32-bit elements)
  655. vshufps $0xb1, %xmm9 , %xmm9, %xmm9
  656. vshufps $0xb1, %xmm11, %xmm11, %xmm11
  657. vshufps $0xb1, %xmm13, %xmm13, %xmm13
  658. vshufps $0xb1, %xmm15, %xmm15, %xmm15
  659. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  660. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  661. vaddsubps %xmm9, %xmm8 , %xmm8
  662. vaddsubps %xmm11,%xmm10, %xmm10
  663. vaddsubps %xmm13,%xmm12, %xmm12
  664. vaddsubps %xmm15,%xmm14, %xmm14
  665. vshufps $0xb1, %xmm8 , %xmm8, %xmm9
  666. vshufps $0xb1, %xmm10, %xmm10, %xmm11
  667. vshufps $0xb1, %xmm12, %xmm12, %xmm13
  668. vshufps $0xb1, %xmm14, %xmm14, %xmm15
  669. #else
  670. vaddsubps %xmm8, %xmm9 ,%xmm9
  671. vaddsubps %xmm10, %xmm11,%xmm11
  672. vaddsubps %xmm12, %xmm13,%xmm13
  673. vaddsubps %xmm14, %xmm15,%xmm15
  674. vmovaps %xmm9, %xmm8
  675. vmovaps %xmm11, %xmm10
  676. vmovaps %xmm13, %xmm12
  677. vmovaps %xmm15, %xmm14
  678. // swap real and imaginary parts (adjacent 32-bit elements)
  679. vshufps $0xb1, %xmm9 , %xmm9, %xmm9
  680. vshufps $0xb1, %xmm11, %xmm11, %xmm11
  681. vshufps $0xb1, %xmm13, %xmm13, %xmm13
  682. vshufps $0xb1, %xmm15, %xmm15, %xmm15
  683. #endif
  684. // multiply with ALPHA_R
  685. vmulps %xmm8 , %xmm0, %xmm8
  686. vmulps %xmm10, %xmm0, %xmm10
  687. vmulps %xmm12, %xmm0, %xmm12
  688. vmulps %xmm14, %xmm0, %xmm14
  689. // multiply with ALPHA_I
  690. vmulps %xmm9 , %xmm1, %xmm9
  691. vmulps %xmm11, %xmm1, %xmm11
  692. vmulps %xmm13, %xmm1, %xmm13
  693. vmulps %xmm15, %xmm1, %xmm15
  694. vaddsubps %xmm9, %xmm8 , %xmm8
  695. vaddsubps %xmm11,%xmm10, %xmm10
  696. vaddsubps %xmm13,%xmm12, %xmm12
  697. vaddsubps %xmm15,%xmm14, %xmm14
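// The block above multiplies the accumulators by alpha: xmm8/xmm10/
// xmm12/xmm14 held the interleaved (re,im) sums t, xmm9/xmm11/xmm13/
// xmm15 their swapped copies, and the two vmulps groups plus the
// vaddsubps produce alpha*t:
//   re = t_r*alpha_r - t_i*alpha_i , im = t_i*alpha_r + t_r*alpha_i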
  698. #ifndef TRMMKERNEL
  699. vaddps (CO1), %xmm8 , %xmm8
  700. vaddps 4 * SIZE(CO1), %xmm12, %xmm12
  701. vaddps (CO1, LDC), %xmm10, %xmm10
  702. vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14
  703. #endif
  704. vmovups %xmm8 , (CO1)
  705. vmovups %xmm12 , 4 * SIZE(CO1)
  706. vmovups %xmm10 , (CO1, LDC)
  707. vmovups %xmm14 , 4 * SIZE(CO1, LDC)
  708. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  709. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  710. movq K, %rax
  711. subq KKK, %rax
  712. movq %rax, BI // Index for BO
  713. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  714. leaq (BO, BI, SIZE), BO
  715. salq $3, %rax // rax = rax * 8 ; number of values
  716. leaq (AO, %rax, SIZE), AO
  717. #endif
  718. #if defined(TRMMKERNEL) && defined(LEFT)
  719. addq $4, KK
  720. #endif
  721. addq $8 * SIZE, CO1 # coffset += 8
  722. decq I # i --
  723. jg .L2_11
  724. ALIGN_4
  725. /**************************************************************************
  726. * Rest of M
  727. ***************************************************************************/
  728. .L2_20:
  729. testq $3, M
  730. jz .L2_60 // to next 2 lines of N
  731. testq $2, M
  732. jz .L2_40
  733. ALIGN_4
  734. .L2_21:
  735. #if !defined(TRMMKERNEL) || \
  736. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  737. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  738. leaq BUFFER1, BO // first buffer to BO
  739. addq $8 * SIZE, BO
  740. #else
  741. movq KK, %rax
  742. leaq BUFFER1, BO // first buffer to BO
  743. addq $8 * SIZE, BO
  744. movq %rax, BI // Index for BO
  745. leaq (,BI,4), BI // BI = BI * 4 ; number of values
  746. leaq (BO, BI, SIZE), BO
  747. salq $2, %rax // rax = rax * 4 ; number of values
  748. leaq (AO, %rax, SIZE), AO
  749. #endif
  750. vzeroall
  751. #ifndef TRMMKERNEL
  752. movq K, %rax
  753. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  754. movq K, %rax
  755. subq KK, %rax
  756. movq %rax, KKK
  757. #else
  758. movq KK, %rax
  759. #ifdef LEFT
  760. addq $2, %rax // number of values in AO
  761. #else
  762. addq $2, %rax // number of values in BO
  763. #endif
  764. movq %rax, KKK
  765. #endif
  766. andq $-8, %rax // K = K - ( K % 8 )
  767. je .L2_26
  768. movq %rax, BI // Index for BO
  769. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  770. salq $2, %rax // rax = rax * 4 ; number of values
  771. leaq (AO, %rax, SIZE), AO
  772. leaq (BO, BI, SIZE), BO
  773. negq BI
  774. negq %rax
  775. ALIGN_4
  776. .L2_22:
  777. prefetcht0 B_PR1(BO,BI,SIZE)
  778. KERNEL2x2_1(xxx)
  779. KERNEL2x2_2(xxx)
  780. KERNEL2x2_3(xxx)
  781. KERNEL2x2_4(xxx)
  782. prefetcht0 B_PR1(BO,BI,SIZE)
  783. KERNEL2x2_1(xxx)
  784. KERNEL2x2_2(xxx)
  785. KERNEL2x2_3(xxx)
  786. KERNEL2x2_4(xxx)
  787. je .L2_26
  788. prefetcht0 B_PR1(BO,BI,SIZE)
  789. KERNEL2x2_1(xxx)
  790. KERNEL2x2_2(xxx)
  791. KERNEL2x2_3(xxx)
  792. KERNEL2x2_4(xxx)
  793. prefetcht0 B_PR1(BO,BI,SIZE)
  794. KERNEL2x2_1(xxx)
  795. KERNEL2x2_2(xxx)
  796. KERNEL2x2_3(xxx)
  797. KERNEL2x2_4(xxx)
  798. je .L2_26
  799. jmp .L2_22
  800. ALIGN_4
  801. .L2_26:
  802. #ifndef TRMMKERNEL
  803. movq K, %rax
  804. #else
  805. movq KKK, %rax
  806. #endif
  807. andq $7, %rax # if (k & 7)
  808. je .L2_29
  809. movq %rax, BI // Index for BO
  810. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  811. salq $2, %rax // rax = rax * 4 ; number of values
  812. leaq (AO, %rax, SIZE), AO
  813. leaq (BO, BI, SIZE), BO
  814. negq BI
  815. negq %rax
  816. ALIGN_4
  817. .L2_27:
  818. KERNEL2x2_SUB(xxx)
  819. jl .L2_27
  820. ALIGN_4
  821. .L2_29:
  822. vbroadcastss ALPHA_R, %xmm0
  823. vbroadcastss ALPHA_I, %xmm1
  824. // swap real and imaginary parts (adjacent 32-bit elements)
  825. vshufps $0xb1, %xmm9 , %xmm9, %xmm9
  826. vshufps $0xb1, %xmm11, %xmm11, %xmm11
  827. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  828. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  829. vaddsubps %xmm9, %xmm8 , %xmm8
  830. vaddsubps %xmm11,%xmm10, %xmm10
  831. vshufps $0xb1, %xmm8 , %xmm8, %xmm9
  832. vshufps $0xb1, %xmm10, %xmm10, %xmm11
  833. #else
  834. vaddsubps %xmm8, %xmm9 ,%xmm9
  835. vaddsubps %xmm10, %xmm11,%xmm11
  836. vmovaps %xmm9, %xmm8
  837. vmovaps %xmm11, %xmm10
  838. // swap real and imaginary parts (adjacent 32-bit elements)
  839. vshufps $0xb1, %xmm9 , %xmm9, %xmm9
  840. vshufps $0xb1, %xmm11, %xmm11, %xmm11
  841. #endif
  842. // multiply with ALPHA_R
  843. vmulps %xmm8 , %xmm0, %xmm8
  844. vmulps %xmm10, %xmm0, %xmm10
  845. // multiply with ALPHA_I
  846. vmulps %xmm9 , %xmm1, %xmm9
  847. vmulps %xmm11, %xmm1, %xmm11
  848. vaddsubps %xmm9, %xmm8 , %xmm8
  849. vaddsubps %xmm11,%xmm10, %xmm10
  850. #ifndef TRMMKERNEL
  851. vaddps (CO1), %xmm8 , %xmm8
  852. vaddps (CO1, LDC), %xmm10, %xmm10
  853. #endif
  854. vmovups %xmm8 , (CO1)
  855. vmovups %xmm10 , (CO1, LDC)
  856. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  857. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  858. movq K, %rax
  859. subq KKK, %rax
  860. movq %rax, BI // Index for BO
  861. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  862. leaq (BO, BI, SIZE), BO
  863. salq $2, %rax // rax = rax * 4 ; number of values
  864. leaq (AO, %rax, SIZE), AO
  865. #endif
  866. #if defined(TRMMKERNEL) && defined(LEFT)
  867. addq $2, KK
  868. #endif
  869. addq $4 * SIZE, CO1 # coffset += 4
  870. ALIGN_4
  871. /**************************************************************************/
  872. .L2_40:
  873. testq $1, M
  874. jz .L2_60 // to next 2 lines of N
  875. ALIGN_4
  876. .L2_41:
  877. #if !defined(TRMMKERNEL) || \
  878. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  879. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  880. leaq BUFFER1, BO // first buffer to BO
  881. addq $8 * SIZE, BO
  882. #else
  883. movq KK, %rax
  884. leaq BUFFER1, BO // first buffer to BO
  885. addq $8 * SIZE, BO
  886. movq %rax, BI // Index for BO
  887. leaq (,BI,4), BI // BI = BI * 4 ; number of values
  888. leaq (BO, BI, SIZE), BO
  889. salq $1, %rax // rax = rax * 2 ; number of values
  890. leaq (AO, %rax, SIZE), AO
  891. #endif
  892. vzeroall
  893. #ifndef TRMMKERNEL
  894. movq K, %rax
  895. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  896. movq K, %rax
  897. subq KK, %rax
  898. movq %rax, KKK
  899. #else
  900. movq KK, %rax
  901. #ifdef LEFT
  902. addq $1, %rax // number of values in AO
  903. #else
  904. addq $2, %rax // number of values in BO
  905. #endif
  906. movq %rax, KKK
  907. #endif
  908. andq $-8, %rax // K = K - ( K % 8 )
  909. je .L2_46
  910. movq %rax, BI // Index for BO
  911. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  912. salq $1, %rax // rax = rax * 2 ; number of values
  913. leaq (AO, %rax, SIZE), AO
  914. leaq (BO, BI, SIZE), BO
  915. negq BI
  916. negq %rax
  917. ALIGN_4
  918. .L2_42:
  919. prefetcht0 B_PR1(BO,BI,SIZE)
  920. KERNEL1x2_1(xxx)
  921. KERNEL1x2_2(xxx)
  922. KERNEL1x2_3(xxx)
  923. KERNEL1x2_4(xxx)
  924. prefetcht0 B_PR1(BO,BI,SIZE)
  925. KERNEL1x2_1(xxx)
  926. KERNEL1x2_2(xxx)
  927. KERNEL1x2_3(xxx)
  928. KERNEL1x2_4(xxx)
  929. je .L2_46
  930. prefetcht0 B_PR1(BO,BI,SIZE)
  931. KERNEL1x2_1(xxx)
  932. KERNEL1x2_2(xxx)
  933. KERNEL1x2_3(xxx)
  934. KERNEL1x2_4(xxx)
  935. prefetcht0 B_PR1(BO,BI,SIZE)
  936. KERNEL1x2_1(xxx)
  937. KERNEL1x2_2(xxx)
  938. KERNEL1x2_3(xxx)
  939. KERNEL1x2_4(xxx)
  940. je .L2_46
  941. jmp .L2_42
  942. ALIGN_4
  943. .L2_46:
  944. #ifndef TRMMKERNEL
  945. movq K, %rax
  946. #else
  947. movq KKK, %rax
  948. #endif
  949. andq $7, %rax # if (k & 7)
  950. je .L2_49
  951. movq %rax, BI // Index for BO
  952. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  953. salq $1, %rax // rax = rax * 2 ; number of values
  954. leaq (AO, %rax, SIZE), AO
  955. leaq (BO, BI, SIZE), BO
  956. negq BI
  957. negq %rax
  958. ALIGN_4
  959. .L2_47:
  960. KERNEL1x2_SUB(xxx)
  961. jl .L2_47
  962. ALIGN_4
  963. .L2_49:
  964. vbroadcastss ALPHA_R, %xmm0
  965. vbroadcastss ALPHA_I, %xmm1
  966. // swap real and imaginary parts (adjacent 32-bit elements)
  967. vshufps $0xb1, %xmm9 , %xmm9, %xmm9
  968. vshufps $0xb1, %xmm11, %xmm11, %xmm11
  969. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  970. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  971. vaddsubps %xmm9, %xmm8 , %xmm8
  972. vaddsubps %xmm11,%xmm10, %xmm10
  973. vshufps $0xb1, %xmm8 , %xmm8, %xmm9
  974. vshufps $0xb1, %xmm10, %xmm10, %xmm11
  975. #else
  976. vaddsubps %xmm8, %xmm9 ,%xmm9
  977. vaddsubps %xmm10, %xmm11,%xmm11
  978. vmovaps %xmm9, %xmm8
  979. vmovaps %xmm11, %xmm10
  980. // swap real and imaginary parts (adjacent 32-bit elements)
  981. vshufps $0xb1, %xmm9 , %xmm9, %xmm9
  982. vshufps $0xb1, %xmm11, %xmm11, %xmm11
  983. #endif
  984. // multiply with ALPHA_R
  985. vmulps %xmm8 , %xmm0, %xmm8
  986. vmulps %xmm10, %xmm0, %xmm10
  987. // multiply with ALPHA_I
  988. vmulps %xmm9 , %xmm1, %xmm9
  989. vmulps %xmm11, %xmm1, %xmm11
  990. vaddsubps %xmm9, %xmm8 , %xmm8
  991. vaddsubps %xmm11,%xmm10, %xmm10
  992. #ifndef TRMMKERNEL
  993. vmovsd (CO1), %xmm14
  994. vaddps %xmm14, %xmm8 , %xmm8
  995. vmovsd (CO1, LDC), %xmm15
  996. vaddps %xmm15, %xmm10, %xmm10
  997. #endif
  998. vmovsd %xmm8 , (CO1)
  999. vmovsd %xmm10 , (CO1, LDC)
  1000. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1001. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1002. movq K, %rax
  1003. subq KKK, %rax
  1004. movq %rax, BI // Index for BO
  1005. leaq ( ,BI,4), BI // BI = BI * 4 ; number of values
  1006. leaq (BO, BI, SIZE), BO
  1007. salq $1, %rax // rax = rax * 2 ; number of values
  1008. leaq (AO, %rax, SIZE), AO
  1009. #endif
  1010. #if defined(TRMMKERNEL) && defined(LEFT)
  1011. addq $1, KK
  1012. #endif
  1013. addq $2 * SIZE, CO1 # coffset += 2
  1014. ALIGN_4
  1015. .L2_60:
  1016. #if defined(TRMMKERNEL) && !defined(LEFT)
  1017. addq $2, KK
  1018. #endif
  1019. decq J // j --
  1020. jg .L2_01 // next 2 lines of N
  1021. .L1_0:
  1022. /************************************************************************************************
  1023. * Loop for Nmod6 % 2 > 0
  1024. *************************************************************************************************/
  1025. movq Nmod6, J
  1026. andq $1, J // j % 2
  1027. je .L999
  1028. ALIGN_4
  1029. .L1_01:
  1030. // copy to sub buffer
  1031. movq B, BO1
  1032. leaq BUFFER1, BO // first buffer to BO
  1033. movq K, %rax
  1034. ALIGN_4
  1035. .L1_02b:
  1036. vmovsd (BO1), %xmm0
  1037. vmovsd %xmm0, (BO)
  1038. addq $2*SIZE,BO1
  1039. addq $2*SIZE,BO
  1040. decq %rax
  1041. jnz .L1_02b
  1042. .L1_02c:
  1043. movq BO1, B // next offset of B
  1044. .L1_10:
  1045. movq C, CO1
  1046. leaq (C, LDC, 1), C // c += 1 * ldc
  1047. #if defined(TRMMKERNEL) && defined(LEFT)
  1048. movq OFFSET, %rax
  1049. movq %rax, KK
  1050. #endif
  1051. movq A, AO // aoffset = a
  1052. addq $16 * SIZE, AO
  1053. movq M, I
  1054. sarq $2, I // i = (m >> 2)
  1055. je .L1_20
  1056. ALIGN_4
  1057. .L1_11:
  1058. #if !defined(TRMMKERNEL) || \
  1059. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1060. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1061. leaq BUFFER1, BO // first buffer to BO
  1062. addq $4 * SIZE, BO
  1063. #else
  1064. movq KK, %rax
  1065. leaq BUFFER1, BO // first buffer to BO
  1066. addq $4 * SIZE, BO
  1067. movq %rax, BI // Index for BO
  1068. leaq (,BI,2), BI // BI = BI * 2 ; number of values
  1069. leaq (BO, BI, SIZE), BO
  1070. salq $3, %rax // rax = rax * 8 ; number of values
  1071. leaq (AO, %rax, SIZE), AO
  1072. #endif
  1073. vzeroall
  1074. #ifndef TRMMKERNEL
  1075. movq K, %rax
  1076. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1077. movq K, %rax
  1078. subq KK, %rax
  1079. movq %rax, KKK
  1080. #else
  1081. movq KK, %rax
  1082. #ifdef LEFT
  1083. addq $4, %rax // number of values in AO
  1084. #else
  1085. addq $1, %rax // number of values in BO
  1086. #endif
  1087. movq %rax, KKK
  1088. #endif
  1089. andq $-8, %rax // K = K - ( K % 8 )
  1090. je .L1_16
  1091. movq %rax, BI // Index for BO
  1092. leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
  1093. salq $3, %rax // rax = rax * 8 ; number of values
  1094. leaq (AO, %rax, SIZE), AO
  1095. leaq (BO, BI, SIZE), BO
  1096. negq BI
  1097. negq %rax
  1098. ALIGN_4
  1099. .L1_12:
  1100. prefetcht0 B_PR1(BO,BI,SIZE)
  1101. KERNEL4x1_1(xxx)
  1102. KERNEL4x1_2(xxx)
  1103. KERNEL4x1_3(xxx)
  1104. KERNEL4x1_4(xxx)
  1105. KERNEL4x1_1(xxx)
  1106. KERNEL4x1_2(xxx)
  1107. KERNEL4x1_3(xxx)
  1108. KERNEL4x1_4(xxx)
  1109. je .L1_16
  1110. prefetcht0 B_PR1(BO,BI,SIZE)
  1111. KERNEL4x1_1(xxx)
  1112. KERNEL4x1_2(xxx)
  1113. KERNEL4x1_3(xxx)
  1114. KERNEL4x1_4(xxx)
  1115. KERNEL4x1_1(xxx)
  1116. KERNEL4x1_2(xxx)
  1117. KERNEL4x1_3(xxx)
  1118. KERNEL4x1_4(xxx)
  1119. je .L1_16
  1120. jmp .L1_12
  1121. ALIGN_4
  1122. .L1_16:
  1123. #ifndef TRMMKERNEL
  1124. movq K, %rax
  1125. #else
  1126. movq KKK, %rax
  1127. #endif
  1128. andq $7, %rax # if (k & 7)
  1129. je .L1_19
  1130. movq %rax, BI // Index for BO
  1131. leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
  1132. salq $3, %rax // rax = rax * 8 ; number of values
  1133. leaq (AO, %rax, SIZE), AO
  1134. leaq (BO, BI, SIZE), BO
  1135. negq BI
  1136. negq %rax
  1137. ALIGN_4
  1138. .L1_17:
  1139. KERNEL4x1_SUB(xxx)
  1140. jl .L1_17
  1141. ALIGN_4
  1142. .L1_19:
  1143. vbroadcastss ALPHA_R, %xmm0
  1144. vbroadcastss ALPHA_I, %xmm1
  1145. // swap real and imaginary parts (adjacent 32-bit elements)
  1146. vshufps $0xb1, %xmm9 , %xmm9, %xmm9
  1147. vshufps $0xb1, %xmm13, %xmm13, %xmm13
  1148. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1149. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1150. vaddsubps %xmm9, %xmm8 , %xmm8
  1151. vaddsubps %xmm13,%xmm12, %xmm12
  1152. vshufps $0xb1, %xmm8 , %xmm8, %xmm9
  1153. vshufps $0xb1, %xmm12, %xmm12, %xmm13
  1154. #else
  1155. vaddsubps %xmm8, %xmm9 ,%xmm9
  1156. vaddsubps %xmm12, %xmm13,%xmm13
  1157. vmovaps %xmm9, %xmm8
  1158. vmovaps %xmm13, %xmm12
  1159. // swap real and imaginary parts (adjacent 32-bit elements)
  1160. vshufps $0xb1, %xmm9 , %xmm9, %xmm9
  1161. vshufps $0xb1, %xmm13, %xmm13, %xmm13
  1162. #endif
  1163. // multiply with ALPHA_R
  1164. vmulps %xmm8 , %xmm0, %xmm8
  1165. vmulps %xmm12, %xmm0, %xmm12
  1166. // multiply with ALPHA_I
  1167. vmulps %xmm9 , %xmm1, %xmm9
  1168. vmulps %xmm13, %xmm1, %xmm13
  1169. vaddsubps %xmm9, %xmm8 , %xmm8
  1170. vaddsubps %xmm13,%xmm12, %xmm12
  1171. #ifndef TRMMKERNEL
  1172. vaddps (CO1), %xmm8 , %xmm8
  1173. vaddps 4 * SIZE(CO1), %xmm12, %xmm12
  1174. #endif
  1175. vmovups %xmm8 , (CO1)
  1176. vmovups %xmm12 , 4 * SIZE(CO1)
  1177. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1178. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1179. movq K, %rax
  1180. subq KKK, %rax
  1181. movq %rax, BI // Index for BO
  1182. leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
  1183. leaq (BO, BI, SIZE), BO
  1184. salq $3, %rax // rax = rax * 8 ; number of values
  1185. leaq (AO, %rax, SIZE), AO
  1186. #endif
  1187. #if defined(TRMMKERNEL) && defined(LEFT)
  1188. addq $4, KK
  1189. #endif
  1190. addq $8 * SIZE, CO1 # coffset += 8
  1191. decq I # i --
  1192. jg .L1_11
  1193. ALIGN_4
  1194. /**************************************************************************
  1195. * Rest of M
  1196. ***************************************************************************/
  1197. .L1_20:
  1198. testq $3, M
  1199. jz .L999
  1200. testq $2, M
  1201. jz .L1_40
  1202. ALIGN_4
  1203. .L1_21:
  1204. #if !defined(TRMMKERNEL) || \
  1205. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1206. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1207. leaq BUFFER1, BO // first buffer to BO
  1208. addq $4 * SIZE, BO
  1209. #else
  1210. movq KK, %rax
  1211. leaq BUFFER1, BO // first buffer to BO
  1212. addq $4 * SIZE, BO
  1213. movq %rax, BI // Index for BO
  1214. leaq (,BI,2), BI // BI = BI * 2 ; number of values
  1215. leaq (BO, BI, SIZE), BO
  1216. salq $2, %rax // rax = rax * 4 ; number of values
  1217. leaq (AO, %rax, SIZE), AO
  1218. #endif
  1219. vzeroall
  1220. #ifndef TRMMKERNEL
  1221. movq K, %rax
  1222. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1223. movq K, %rax
  1224. subq KK, %rax
  1225. movq %rax, KKK
  1226. #else
  1227. movq KK, %rax
  1228. #ifdef LEFT
  1229. addq $2, %rax // number of values in AO
  1230. #else
  1231. addq $1, %rax // number of values in BO
  1232. #endif
  1233. movq %rax, KKK
  1234. #endif
  1235. andq $-8, %rax // K = K - ( K % 8 )
  1236. je .L1_26
  1237. movq %rax, BI // Index for BO
  1238. leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
  1239. salq $2, %rax // rax = rax * 4 ; number of values
  1240. leaq (AO, %rax, SIZE), AO
  1241. leaq (BO, BI, SIZE), BO
  1242. negq BI
  1243. negq %rax
  1244. ALIGN_4
  1245. .L1_22:
  1246. prefetcht0 B_PR1(BO,BI,SIZE)
  1247. KERNEL2x1_1(xxx)
  1248. KERNEL2x1_2(xxx)
  1249. KERNEL2x1_3(xxx)
  1250. KERNEL2x1_4(xxx)
  1251. KERNEL2x1_1(xxx)
  1252. KERNEL2x1_2(xxx)
  1253. KERNEL2x1_3(xxx)
  1254. KERNEL2x1_4(xxx)
  1255. je .L1_26
  1256. prefetcht0 B_PR1(BO,BI,SIZE)
  1257. KERNEL2x1_1(xxx)
  1258. KERNEL2x1_2(xxx)
  1259. KERNEL2x1_3(xxx)
  1260. KERNEL2x1_4(xxx)
  1261. KERNEL2x1_1(xxx)
  1262. KERNEL2x1_2(xxx)
  1263. KERNEL2x1_3(xxx)
  1264. KERNEL2x1_4(xxx)
  1265. je .L1_26
  1266. jmp .L1_22
  1267. ALIGN_4
  1268. .L1_26:
  1269. #ifndef TRMMKERNEL
  1270. movq K, %rax
  1271. #else
  1272. movq KKK, %rax
  1273. #endif
  1274. andq $7, %rax # if (k & 7)
  1275. je .L1_29
  1276. movq %rax, BI // Index for BO
  1277. leaq ( ,BI,2), BI // BI = BI * 2; number of values
  1278. salq $2, %rax // rax = rax * 4 ; number of values
  1279. leaq (AO, %rax, SIZE), AO
  1280. leaq (BO, BI, SIZE), BO
  1281. negq BI
  1282. negq %rax
  1283. ALIGN_4
  1284. .L1_27:
  1285. KERNEL2x1_SUB(xxx)
  1286. jl .L1_27
  1287. ALIGN_4
  1288. .L1_29:
  1289. vbroadcastss ALPHA_R, %xmm0
  1290. vbroadcastss ALPHA_I, %xmm1
  1291. // swap real and imaginary parts (adjacent 32-bit elements)
  1292. vshufps $0xb1, %xmm9 , %xmm9, %xmm9
  1293. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1294. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1295. vaddsubps %xmm9, %xmm8 , %xmm8
  1296. vshufps $0xb1, %xmm8 , %xmm8, %xmm9
  1297. #else
  1298. vaddsubps %xmm8, %xmm9 ,%xmm9
  1299. vmovaps %xmm9, %xmm8
  1300. // swap real and imaginary parts (adjacent 32-bit elements)
  1301. vshufps $0xb1, %xmm9 , %xmm9, %xmm9
  1302. #endif
  1303. // multiply with ALPHA_R
  1304. vmulps %xmm8 , %xmm0, %xmm8
  1305. // multiply with ALPHA_I
  1306. vmulps %xmm9 , %xmm1, %xmm9
  1307. vaddsubps %xmm9, %xmm8 , %xmm8
  1308. #ifndef TRMMKERNEL
  1309. vaddps (CO1), %xmm8 , %xmm8
  1310. #endif
  1311. vmovups %xmm8 , (CO1)
  1312. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1313. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1314. movq K, %rax
  1315. subq KKK, %rax
  1316. movq %rax, BI // Index for BO
  1317. leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
  1318. leaq (BO, BI, SIZE), BO
  1319. salq $2, %rax // rax = rax * 4 ; number of values
  1320. leaq (AO, %rax, SIZE), AO
  1321. #endif
  1322. #if defined(TRMMKERNEL) && defined(LEFT)
  1323. addq $2, KK
  1324. #endif
  1325. addq $4 * SIZE, CO1 # coffset += 4
  1326. ALIGN_4
  1327. /**************************************************************************/
  1328. .L1_40:
  1329. testq $1, M
  1330. jz .L999 // to next 2 lines of N
  1331. ALIGN_4
  1332. .L1_41:
  1333. #if !defined(TRMMKERNEL) || \
  1334. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1335. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1336. leaq BUFFER1, BO // first buffer to BO
  1337. addq $4 * SIZE, BO
  1338. #else
  1339. movq KK, %rax
  1340. leaq BUFFER1, BO // first buffer to BO
  1341. addq $4 * SIZE, BO
  1342. movq %rax, BI // Index for BO
  1343. leaq (,BI,2), BI // BI = BI * 2 ; number of values
  1344. leaq (BO, BI, SIZE), BO
  1345. salq $1, %rax // rax = rax * 2 ; number of values
  1346. leaq (AO, %rax, SIZE), AO
  1347. #endif
  1348. vzeroall
  1349. #ifndef TRMMKERNEL
  1350. movq K, %rax
  1351. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1352. movq K, %rax
  1353. subq KK, %rax
  1354. movq %rax, KKK
  1355. #else
  1356. movq KK, %rax
  1357. #ifdef LEFT
  1358. addq $1, %rax // number of values in AO
  1359. #else
  1360. addq $1, %rax // number of values in BO
  1361. #endif
  1362. movq %rax, KKK
  1363. #endif
  1364. andq $-8, %rax // K = K - ( K % 8 )
  1365. je .L1_46
  1366. movq %rax, BI // Index for BO
  1367. leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
  1368. salq $1, %rax // rax = rax * 2 ; number of values
  1369. leaq (AO, %rax, SIZE), AO
  1370. leaq (BO, BI, SIZE), BO
  1371. negq BI
  1372. negq %rax
  1373. ALIGN_4
  1374. .L1_42:
  1375. prefetcht0 B_PR1(BO,BI,SIZE)
  1376. KERNEL1x1_1(xxx)
  1377. KERNEL1x1_2(xxx)
  1378. KERNEL1x1_3(xxx)
  1379. KERNEL1x1_4(xxx)
  1380. KERNEL1x1_1(xxx)
  1381. KERNEL1x1_2(xxx)
  1382. KERNEL1x1_3(xxx)
  1383. KERNEL1x1_4(xxx)
  1384. je .L1_46
  1385. prefetcht0 B_PR1(BO,BI,SIZE)
  1386. KERNEL1x1_1(xxx)
  1387. KERNEL1x1_2(xxx)
  1388. KERNEL1x1_3(xxx)
  1389. KERNEL1x1_4(xxx)
  1390. KERNEL1x1_1(xxx)
  1391. KERNEL1x1_2(xxx)
  1392. KERNEL1x1_3(xxx)
  1393. KERNEL1x1_4(xxx)
  1394. je .L1_46
  1395. jmp .L1_42
  1396. ALIGN_4
  1397. .L1_46:
  1398. #ifndef TRMMKERNEL
  1399. movq K, %rax
  1400. #else
  1401. movq KKK, %rax
  1402. #endif
  1403. andq $7, %rax # if (k & 7)
  1404. je .L1_49
  1405. movq %rax, BI // Index for BO
  1406. leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
  1407. salq $1, %rax // rax = rax * 2 ; number of values
  1408. leaq (AO, %rax, SIZE), AO
  1409. leaq (BO, BI, SIZE), BO
  1410. negq BI
  1411. negq %rax
  1412. ALIGN_4
  1413. .L1_47:
  1414. KERNEL1x1_SUB(xxx)
  1415. jl .L1_47
  1416. ALIGN_4
  1417. .L1_49:
  1418. vbroadcastss ALPHA_R, %xmm0
  1419. vbroadcastss ALPHA_I, %xmm1
  1420. // swap real and imaginary parts (adjacent 32-bit elements)
  1421. vshufps $0xb1, %xmm9 , %xmm9, %xmm9
  1422. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
  1423. defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1424. vaddsubps %xmm9, %xmm8 , %xmm8
  1425. vshufps $0xb1, %xmm8 , %xmm8, %xmm9
  1426. #else
  1427. vaddsubps %xmm8, %xmm9 ,%xmm9
  1428. vmovaps %xmm9, %xmm8
  1429. // swap real and imaginary parts (adjacent 32-bit elements)
  1430. vshufps $0xb1, %xmm9 , %xmm9, %xmm9
  1431. #endif
  1432. // multiply with ALPHA_R
  1433. vmulps %xmm8 , %xmm0, %xmm8
  1434. // multiply with ALPHA_I
  1435. vmulps %xmm9 , %xmm1, %xmm9
  1436. vaddsubps %xmm9, %xmm8 , %xmm8
  1437. #ifndef TRMMKERNEL
  1438. vmovsd (CO1), %xmm14
  1439. vaddps %xmm14, %xmm8 , %xmm8
  1440. #endif
  1441. vmovsd %xmm8 , (CO1)
  1442. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1443. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1444. movq K, %rax
  1445. subq KKK, %rax
  1446. movq %rax, BI // Index for BO
  1447. leaq ( ,BI,2), BI // BI = BI * 2 ; number of values
  1448. leaq (BO, BI, SIZE), BO
  1449. salq $1, %rax // rax = rax * 2 ; number of values
  1450. leaq (AO, %rax, SIZE), AO
  1451. #endif
  1452. #if defined(TRMMKERNEL) && defined(LEFT)
  1453. addq $1, KK
  1454. #endif
  1455. addq $2 * SIZE, CO1 # coffset += 2
  1456. ALIGN_4
  1457. .L999:
  1458. vzeroupper
  1459. movq SP, %rsp
  1460. movq (%rsp), %rbx
  1461. movq 8(%rsp), %rbp
  1462. movq 16(%rsp), %r12
  1463. movq 24(%rsp), %r13
  1464. movq 32(%rsp), %r14
  1465. movq 40(%rsp), %r15
  1466. #ifdef WINDOWS_ABI
  1467. movq 48(%rsp), %rdi
  1468. movq 56(%rsp), %rsi
  1469. vmovups 64(%rsp), %xmm6
  1470. vmovups 80(%rsp), %xmm7
  1471. vmovups 96(%rsp), %xmm8
  1472. vmovups 112(%rsp), %xmm9
  1473. vmovups 128(%rsp), %xmm10
  1474. vmovups 144(%rsp), %xmm11
  1475. vmovups 160(%rsp), %xmm12
  1476. vmovups 176(%rsp), %xmm13
  1477. vmovups 192(%rsp), %xmm14
  1478. vmovups 208(%rsp), %xmm15
  1479. #endif
  1480. addq $STACKSIZE, %rsp
  1481. ret
  1482. EPILOGUE