You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

sgemm_tcopy_logic_16_power8.S 5.3 kB

9 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324
  1. /***************************************************************************
  2. Copyright (c) 2013-2016, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. /**************************************************************************************
  28. * 2016/04/21 Werner Saar (wernsaar@googlemail.com)
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. * LAPACK-TEST : OK
  33. **************************************************************************************/
  /* ==== Pass with four source pointers (A0..A3) ======================== */
  /* I = M >> 2 is the number of iterations of this pass; when I <= 0 the  */
  /* whole pass is skipped.                                                */
  /* Names such as A, B, M, N, LDA, SIZE, M16, PREA, PREB, B8, B4, B2, B1  */
  /* and the COPY_RxC macros are defined outside this chunk.               */
  /* NOTE(review): srawi. is the 32-bit (word) algebraic shift, while the  */
  /* N-based counters below use the 64-bit sradi. - confirm that M always  */
  /* fits in 32 bits, or whether sradi. was intended here.                 */
  34. srawi. I, M, 2
  35. ble SCOPYT_L2_BEGIN
  /* Top of one iteration: A0..A3 are four pointers spaced LDA apart,      */
  /* starting at A; A itself is then moved past them for the next round.   */
  /* Presumably LDA is already a byte stride - TODO confirm with caller.   */
  36. SCOPYT_L4_BEGIN:
  37. mr A0, A
  38. add A1, A0, LDA
  39. add A2, A1, LDA
  40. add A3, A2, LDA
  41. add A, A3, LDA
  /* B16 remembers where this iteration's 16-wide output starts; B is      */
  /* advanced by 64*SIZE (4 x 16 elements, assuming SIZE is the element    */
  /* size in bytes - TODO confirm).                                        */
  42. mr B16, B
  43. addi B, B, 64*SIZE
  /* J = N >> 4 counts the 16-wide steps; none to do when J <= 0.          */
  44. sradi. J, N, 4
  45. ble SCOPYT_L4x8_BEGIN
  46. mr BO, B16
  /* 16-wide loop, unrolled twice. The first half issues cache-touch hints */
  /* for the destination (dcbtst at BO+M16 and BO+PREB) and for the four   */
  /* sources (dcbt at A0..A3 + PREA).                                      */
  47. SCOPYT_L4x16_LOOP:
  48. dcbtst BO, M16
  49. dcbtst BO, PREB
  50. dcbt A0, PREA
  51. dcbt A1, PREA
  52. dcbt A2, PREA
  53. dcbt A3, PREA
  /* COPY_4x16 is a macro defined elsewhere; presumably it copies a 4x16   */
  /* tile from A0..A3 to BO - confirm against the macro file.              */
  54. COPY_4x16
  55. addi A0, A0, 16*SIZE
  56. addi A1, A1, 16*SIZE
  57. addi A2, A2, 16*SIZE
  58. addi A3, A3, 16*SIZE
  /* The output pointer moves by M16 per 16-wide step (M16 is set outside  */
  /* this chunk).                                                          */
  59. add BO, BO, M16
  60. addic. J, J, -1
  /* Leave in the middle of the unrolled body when the count is used up.   */
  61. ble SCOPYT_L4x8_BEGIN
  /* Second unrolled half: same work, but only the destination hints are   */
  /* repeated; the dcbt hints on A0..A3 are not.                           */
  62. dcbtst BO, M16
  63. dcbtst BO, PREB
  64. COPY_4x16
  65. addi A0, A0, 16*SIZE
  66. addi A1, A1, 16*SIZE
  67. addi A2, A2, 16*SIZE
  68. addi A3, A3, 16*SIZE
  69. add BO, BO, M16
  70. addic. J, J, -1
  71. bgt SCOPYT_L4x16_LOOP
  /* Remainder handling: one optional step each for bits 8, 4, 2, 1 of N.  */
  /* andi. produces a non-negative value, so the following ble is taken    */
  /* exactly when the tested bit is clear. Each step writes through its    */
  /* own output cursor (B8, B4, B2, B1), which is then advanced by         */
  /* 4 * width elements.                                                   */
  72. SCOPYT_L4x8_BEGIN:
  73. andi. T1, N, 8
  74. ble SCOPYT_L4x4_BEGIN
  75. mr BO, B8
  76. COPY_4x8
  77. addi A0, A0, 8*SIZE
  78. addi A1, A1, 8*SIZE
  79. addi A2, A2, 8*SIZE
  80. addi A3, A3, 8*SIZE
  81. addi B8, B8, 32*SIZE
  /* Width-4 remainder step (bit 2 of N), output cursor B4.                */
  82. SCOPYT_L4x4_BEGIN:
  83. andi. T1, N, 4
  84. ble SCOPYT_L4x2_BEGIN
  85. mr BO, B4
  86. COPY_4x4
  87. addi A0, A0, 4*SIZE
  88. addi A1, A1, 4*SIZE
  89. addi A2, A2, 4*SIZE
  90. addi A3, A3, 4*SIZE
  91. addi B4, B4, 16*SIZE
  /* Width-2 remainder step (bit 1 of N), output cursor B2.                */
  92. SCOPYT_L4x2_BEGIN:
  93. andi. T1, N, 2
  94. ble SCOPYT_L4x1_BEGIN
  95. mr BO, B2
  96. COPY_4x2
  97. addi A0, A0, 2*SIZE
  98. addi A1, A1, 2*SIZE
  99. addi A2, A2, 2*SIZE
  100. addi A3, A3, 2*SIZE
  101. addi B2, B2, 8*SIZE
  /* Width-1 remainder step (bit 0 of N), output cursor B1.                */
  102. SCOPYT_L4x1_BEGIN:
  103. andi. T1, N, 1
  104. ble SCOPYT_L4_END
  105. mr BO, B1
  106. COPY_4x1
  107. addi A0, A0, 1*SIZE
  108. addi A1, A1, 1*SIZE
  109. addi A2, A2, 1*SIZE
  110. addi A3, A3, 1*SIZE
  111. addi B1, B1, 4*SIZE
  /* End of one iteration: decrement I and repeat while I > 0.             */
  112. SCOPYT_L4_END:
  113. addic. I, I, -1
  114. bgt SCOPYT_L4_BEGIN
  /* ==== Pass with two source pointers (A0, A1) ========================= */
  /* Runs at most once, only when bit 1 of M is set. andi. produces a      */
  /* non-negative value, so each ble after an andi. is taken exactly when  */
  /* the tested bit is clear.                                              */
  /* COPY_2xN macros and the B8/B4/B2/B1 cursors are defined outside this  */
  /* chunk; presumably each macro copies a 2xN tile from A0/A1 to BO -     */
  /* confirm against the macro file.                                       */
  115. SCOPYT_L2_BEGIN:
  116. andi. T1, M, 2
  117. ble SCOPYT_L1_BEGIN
  /* A0 and A1 are spaced LDA apart; A is moved past both of them.         */
  118. mr A0, A
  119. add A1, A0, LDA
  120. add A, A1, LDA
  /* B16 keeps the start of the 16-wide output; B advances by 32*SIZE      */
  /* (2 x 16 elements, assuming SIZE is the element size - TODO confirm).  */
  121. mr B16, B
  122. addi B, B, 32*SIZE
  /* J = N >> 4 counts the 16-wide steps; none to do when J <= 0.          */
  123. sradi. J, N, 4
  124. ble SCOPYT_L2x8_BEGIN
  125. mr BO, B16
  /* 16-wide loop: not unrolled and without cache-touch hints. BO moves by */
  /* M16 per step.                                                         */
  126. SCOPYT_L2x16_LOOP:
  127. COPY_2x16
  128. addi A0, A0, 16*SIZE
  129. addi A1, A1, 16*SIZE
  130. add BO, BO, M16
  131. addic. J, J, -1
  132. bgt SCOPYT_L2x16_LOOP
  /* Remainder steps for bits 8, 4, 2, 1 of N. Each writes through its own */
  /* cursor, which is then advanced by 2 * width elements.                 */
  133. SCOPYT_L2x8_BEGIN:
  134. andi. T1, N, 8
  135. ble SCOPYT_L2x4_BEGIN
  136. mr BO, B8
  137. COPY_2x8
  138. addi A0, A0, 8*SIZE
  139. addi A1, A1, 8*SIZE
  140. addi B8, B8, 16*SIZE
  /* Width-4 remainder step, output cursor B4.                             */
  141. SCOPYT_L2x4_BEGIN:
  142. andi. T1, N, 4
  143. ble SCOPYT_L2x2_BEGIN
  144. mr BO, B4
  145. COPY_2x4
  146. addi A0, A0, 4*SIZE
  147. addi A1, A1, 4*SIZE
  148. addi B4, B4, 8*SIZE
  /* Width-2 remainder step, output cursor B2.                             */
  149. SCOPYT_L2x2_BEGIN:
  150. andi. T1, N, 2
  151. ble SCOPYT_L2x1_BEGIN
  152. mr BO, B2
  153. COPY_2x2
  154. addi A0, A0, 2*SIZE
  155. addi A1, A1, 2*SIZE
  156. addi B2, B2, 4*SIZE
  /* Width-1 remainder step, output cursor B1.                             */
  157. SCOPYT_L2x1_BEGIN:
  158. andi. T1, N, 1
  159. ble SCOPYT_L2_END
  160. mr BO, B1
  161. COPY_2x1
  162. addi A0, A0, 1*SIZE
  163. addi A1, A1, 1*SIZE
  164. addi B1, B1, 2*SIZE
  /* No loop back: execution falls through to whatever follows this label. */
  165. SCOPYT_L2_END:
  /* ==== Pass with a single source pointer (A0) ========================= */
  /* Runs at most once, only when bit 0 of M is set; otherwise control     */
  /* goes to L999, a label defined outside this chunk. andi. produces a    */
  /* non-negative value, so each ble after an andi. is taken exactly when  */
  /* the tested bit is clear.                                              */
  /* COPY_1xN macros and the B8/B4/B2/B1 cursors are defined outside this  */
  /* chunk; presumably each macro copies N elements from A0 to BO -        */
  /* confirm against the macro file.                                       */
  166. SCOPYT_L1_BEGIN:
  167. andi. T1, M, 1
  168. ble L999
  /* A0 takes the current source pointer; A is moved on by LDA.            */
  169. mr A0, A
  170. add A, A0, LDA
  /* B16 keeps the start of the 16-wide output; B advances by 16*SIZE.     */
  171. mr B16, B
  172. addi B, B, 16*SIZE
  /* J = N >> 4 counts the 16-wide steps; none to do when J <= 0.          */
  173. sradi. J, N, 4
  174. ble SCOPYT_L1x8_BEGIN
  175. mr BO, B16
  /* 16-wide loop: not unrolled and without cache-touch hints. BO moves by */
  /* M16 per step.                                                         */
  176. SCOPYT_L1x16_LOOP:
  177. COPY_1x16
  178. addi A0, A0, 16*SIZE
  179. add BO, BO, M16
  180. addic. J, J, -1
  181. bgt SCOPYT_L1x16_LOOP
  /* Remainder steps for bits 8, 4, 2, 1 of N. Each writes through its own */
  /* cursor, which is then advanced by the step width in elements.         */
  182. SCOPYT_L1x8_BEGIN:
  183. andi. T1, N, 8
  184. ble SCOPYT_L1x4_BEGIN
  185. mr BO, B8
  186. COPY_1x8
  187. addi A0, A0, 8*SIZE
  188. addi B8, B8, 8*SIZE
  /* Width-4 remainder step, output cursor B4.                             */
  189. SCOPYT_L1x4_BEGIN:
  190. andi. T1, N, 4
  191. ble SCOPYT_L1x2_BEGIN
  192. mr BO, B4
  193. COPY_1x4
  194. addi A0, A0, 4*SIZE
  195. addi B4, B4, 4*SIZE
  /* Width-2 remainder step, output cursor B2.                             */
  196. SCOPYT_L1x2_BEGIN:
  197. andi. T1, N, 2
  198. ble SCOPYT_L1x1_BEGIN
  199. mr BO, B2
  200. COPY_1x2
  201. addi A0, A0, 2*SIZE
  202. addi B2, B2, 2*SIZE
  /* Width-1 remainder step, output cursor B1.                             */
  203. SCOPYT_L1x1_BEGIN:
  204. andi. T1, N, 1
  205. ble SCOPYT_L1_END
  206. mr BO, B1
  207. COPY_1x1
  208. addi A0, A0, 1*SIZE
  209. addi B1, B1, 1*SIZE
  /* End of the visible logic; what follows this label is not shown here.  */
  210. SCOPYT_L1_END: