You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zhemm3m_ucopy_8.c 12 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #include <stdio.h>
  39. #include "common.h"
  /* REAL_PART / IMAGE_PART take a pair of values (a, b). */
  /* Without USE_ALPHA they simply select a or b, respectively. */
  40. #ifndef USE_ALPHA
  41. #define REAL_PART(a, b) (a)
  42. #define IMAGE_PART(a, b) (b)
  43. #else
  /* With USE_ALPHA the pair is combined with alpha_r and alpha_i, which must */
  /* be in scope wherever these macros are expanded. */
  44. #define REAL_PART(a, b) (alpha_r * (a) + alpha_i * (b))
  45. #define IMAGE_PART(a, b) (alpha_i * (a) - alpha_r * (b))
  46. #endif
  /* CMULT reduces the pair (a, b) to a single value: */
  /*   REAL_ONLY  -> REAL_PART(a, b) */
  /*   IMAGE_ONLY -> IMAGE_PART(a, b) */
  /*   otherwise  -> REAL_PART(a, b) + IMAGE_PART(a, b) */
  47. #if defined(REAL_ONLY)
  48. #define CMULT(a, b) (REAL_PART(a, b))
  49. #elif defined(IMAGE_ONLY)
  50. #define CMULT(a, b) (IMAGE_PART(a, b))
  51. #else
  52. #define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b))
  53. #endif
  54. int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY,
  55. #ifdef USE_ALPHA
  56. FLOAT alpha_r, FLOAT alpha_i,
  57. #endif
  58. FLOAT *b){
  59. BLASLONG i, js, offset;
  60. FLOAT data01, data02, data03, data04, data05, data06, data07, data08;
  61. FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8;
  62. lda *= 2;
  63. js = (n >> 3);
  64. while (js > 0){
  65. offset = posX - posY;
  66. if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
  67. if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda;
  68. if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda;
  69. if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda;
  70. if (offset > -4) ao5 = a + posY * 2 + (posX + 4) * lda; else ao5 = a + (posX + 4) * 2 + posY * lda;
  71. if (offset > -5) ao6 = a + posY * 2 + (posX + 5) * lda; else ao6 = a + (posX + 5) * 2 + posY * lda;
  72. if (offset > -6) ao7 = a + posY * 2 + (posX + 6) * lda; else ao7 = a + (posX + 6) * 2 + posY * lda;
  73. if (offset > -7) ao8 = a + posY * 2 + (posX + 7) * lda; else ao8 = a + (posX + 7) * 2 + posY * lda;
  74. i = m;
  75. while (i > 0) {
  76. if (offset > 0) {
  77. data01 = CMULT(*(ao1 + 0), -*(ao1 + 1));
  78. data02 = CMULT(*(ao2 + 0), -*(ao2 + 1));
  79. data03 = CMULT(*(ao3 + 0), -*(ao3 + 1));
  80. data04 = CMULT(*(ao4 + 0), -*(ao4 + 1));
  81. data05 = CMULT(*(ao5 + 0), -*(ao5 + 1));
  82. data06 = CMULT(*(ao6 + 0), -*(ao6 + 1));
  83. data07 = CMULT(*(ao7 + 0), -*(ao7 + 1));
  84. data08 = CMULT(*(ao8 + 0), -*(ao8 + 1));
  85. } else
  86. if (offset < -7) {
  87. data01 = CMULT(*(ao1 + 0), *(ao1 + 1));
  88. data02 = CMULT(*(ao2 + 0), *(ao2 + 1));
  89. data03 = CMULT(*(ao3 + 0), *(ao3 + 1));
  90. data04 = CMULT(*(ao4 + 0), *(ao4 + 1));
  91. data05 = CMULT(*(ao5 + 0), *(ao5 + 1));
  92. data06 = CMULT(*(ao6 + 0), *(ao6 + 1));
  93. data07 = CMULT(*(ao7 + 0), *(ao7 + 1));
  94. data08 = CMULT(*(ao8 + 0), *(ao8 + 1));
  95. } else {
  96. switch (offset) {
  97. case 0 :
  98. data01 = CMULT(*(ao1 + 0), ZERO);
  99. data02 = CMULT(*(ao2 + 0), -*(ao2 + 1));
  100. data03 = CMULT(*(ao3 + 0), -*(ao3 + 1));
  101. data04 = CMULT(*(ao4 + 0), -*(ao4 + 1));
  102. data05 = CMULT(*(ao5 + 0), -*(ao5 + 1));
  103. data06 = CMULT(*(ao6 + 0), -*(ao6 + 1));
  104. data07 = CMULT(*(ao7 + 0), -*(ao7 + 1));
  105. data08 = CMULT(*(ao8 + 0), -*(ao8 + 1));
  106. break;
  107. case -1 :
  108. data01 = CMULT(*(ao1 + 0), *(ao1 + 1));
  109. data02 = CMULT(*(ao2 + 0), ZERO);
  110. data03 = CMULT(*(ao3 + 0), -*(ao3 + 1));
  111. data04 = CMULT(*(ao4 + 0), -*(ao4 + 1));
  112. data05 = CMULT(*(ao5 + 0), -*(ao5 + 1));
  113. data06 = CMULT(*(ao6 + 0), -*(ao6 + 1));
  114. data07 = CMULT(*(ao7 + 0), -*(ao7 + 1));
  115. data08 = CMULT(*(ao8 + 0), -*(ao8 + 1));
  116. break;
  117. case -2 :
  118. data01 = CMULT(*(ao1 + 0), *(ao1 + 1));
  119. data02 = CMULT(*(ao2 + 0), *(ao2 + 1));
  120. data03 = CMULT(*(ao3 + 0), ZERO);
  121. data04 = CMULT(*(ao4 + 0), -*(ao4 + 1));
  122. data05 = CMULT(*(ao5 + 0), -*(ao5 + 1));
  123. data06 = CMULT(*(ao6 + 0), -*(ao6 + 1));
  124. data07 = CMULT(*(ao7 + 0), -*(ao7 + 1));
  125. data08 = CMULT(*(ao8 + 0), -*(ao8 + 1));
  126. break;
  127. case -3 :
  128. data01 = CMULT(*(ao1 + 0), *(ao1 + 1));
  129. data02 = CMULT(*(ao2 + 0), *(ao2 + 1));
  130. data03 = CMULT(*(ao3 + 0), *(ao3 + 1));
  131. data04 = CMULT(*(ao4 + 0), ZERO);
  132. data05 = CMULT(*(ao5 + 0), -*(ao5 + 1));
  133. data06 = CMULT(*(ao6 + 0), -*(ao6 + 1));
  134. data07 = CMULT(*(ao7 + 0), -*(ao7 + 1));
  135. data08 = CMULT(*(ao8 + 0), -*(ao8 + 1));
  136. break;
  137. case -4 :
  138. data01 = CMULT(*(ao1 + 0), *(ao1 + 1));
  139. data02 = CMULT(*(ao2 + 0), *(ao2 + 1));
  140. data03 = CMULT(*(ao3 + 0), *(ao3 + 1));
  141. data04 = CMULT(*(ao4 + 0), *(ao4 + 1));
  142. data05 = CMULT(*(ao5 + 0), ZERO);
  143. data06 = CMULT(*(ao6 + 0), -*(ao6 + 1));
  144. data07 = CMULT(*(ao7 + 0), -*(ao7 + 1));
  145. data08 = CMULT(*(ao8 + 0), -*(ao8 + 1));
  146. break;
  147. case -5 :
  148. data01 = CMULT(*(ao1 + 0), *(ao1 + 1));
  149. data02 = CMULT(*(ao2 + 0), *(ao2 + 1));
  150. data03 = CMULT(*(ao3 + 0), *(ao3 + 1));
  151. data04 = CMULT(*(ao4 + 0), *(ao4 + 1));
  152. data05 = CMULT(*(ao5 + 0), *(ao5 + 1));
  153. data06 = CMULT(*(ao6 + 0), ZERO);
  154. data07 = CMULT(*(ao7 + 0), -*(ao7 + 1));
  155. data08 = CMULT(*(ao8 + 0), -*(ao8 + 1));
  156. break;
  157. case -6 :
  158. data01 = CMULT(*(ao1 + 0), *(ao1 + 1));
  159. data02 = CMULT(*(ao2 + 0), *(ao2 + 1));
  160. data03 = CMULT(*(ao3 + 0), *(ao3 + 1));
  161. data04 = CMULT(*(ao4 + 0), *(ao4 + 1));
  162. data05 = CMULT(*(ao5 + 0), *(ao5 + 1));
  163. data06 = CMULT(*(ao6 + 0), *(ao6 + 1));
  164. data07 = CMULT(*(ao7 + 0), ZERO);
  165. data08 = CMULT(*(ao8 + 0), -*(ao8 + 1));
  166. break;
  167. case -7 :
  168. data01 = CMULT(*(ao1 + 0), *(ao1 + 1));
  169. data02 = CMULT(*(ao2 + 0), *(ao2 + 1));
  170. data03 = CMULT(*(ao3 + 0), *(ao3 + 1));
  171. data04 = CMULT(*(ao4 + 0), *(ao4 + 1));
  172. data05 = CMULT(*(ao5 + 0), *(ao5 + 1));
  173. data06 = CMULT(*(ao6 + 0), *(ao6 + 1));
  174. data07 = CMULT(*(ao7 + 0), *(ao7 + 1));
  175. data08 = CMULT(*(ao8 + 0), ZERO);
  176. break;
  177. }
  178. }
  179. if (offset > 0) ao1 += 2; else ao1 += lda;
  180. if (offset > -1) ao2 += 2; else ao2 += lda;
  181. if (offset > -2) ao3 += 2; else ao3 += lda;
  182. if (offset > -3) ao4 += 2; else ao4 += lda;
  183. if (offset > -4) ao5 += 2; else ao5 += lda;
  184. if (offset > -5) ao6 += 2; else ao6 += lda;
  185. if (offset > -6) ao7 += 2; else ao7 += lda;
  186. if (offset > -7) ao8 += 2; else ao8 += lda;
  187. b[ 0] = data01;
  188. b[ 1] = data02;
  189. b[ 2] = data03;
  190. b[ 3] = data04;
  191. b[ 4] = data05;
  192. b[ 5] = data06;
  193. b[ 6] = data07;
  194. b[ 7] = data08;
  195. b += 8;
  196. offset --;
  197. i --;
  198. }
  199. posX += 8;
  200. js --;
  201. }
  202. if (n & 4) {
  203. offset = posX - posY;
  204. if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
  205. if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda;
  206. if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda;
  207. if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda;
  208. i = m;
  209. while (i > 0) {
  210. if (offset > 0) {
  211. data01 = CMULT(*(ao1 + 0), -*(ao1 + 1));
  212. data02 = CMULT(*(ao2 + 0), -*(ao2 + 1));
  213. data03 = CMULT(*(ao3 + 0), -*(ao3 + 1));
  214. data04 = CMULT(*(ao4 + 0), -*(ao4 + 1));
  215. } else
  216. if (offset < -3) {
  217. data01 = CMULT(*(ao1 + 0), *(ao1 + 1));
  218. data02 = CMULT(*(ao2 + 0), *(ao2 + 1));
  219. data03 = CMULT(*(ao3 + 0), *(ao3 + 1));
  220. data04 = CMULT(*(ao4 + 0), *(ao4 + 1));
  221. } else {
  222. switch (offset) {
  223. case 0 :
  224. data01 = CMULT(*(ao1 + 0), ZERO);
  225. data02 = CMULT(*(ao2 + 0), -*(ao2 + 1));
  226. data03 = CMULT(*(ao3 + 0), -*(ao3 + 1));
  227. data04 = CMULT(*(ao4 + 0), -*(ao4 + 1));
  228. break;
  229. case -1 :
  230. data01 = CMULT(*(ao1 + 0), *(ao1 + 1));
  231. data02 = CMULT(*(ao2 + 0), ZERO);
  232. data03 = CMULT(*(ao3 + 0), -*(ao3 + 1));
  233. data04 = CMULT(*(ao4 + 0), -*(ao4 + 1));
  234. break;
  235. case -2 :
  236. data01 = CMULT(*(ao1 + 0), *(ao1 + 1));
  237. data02 = CMULT(*(ao2 + 0), *(ao2 + 1));
  238. data03 = CMULT(*(ao3 + 0), ZERO);
  239. data04 = CMULT(*(ao4 + 0), -*(ao4 + 1));
  240. break;
  241. case -3 :
  242. data01 = CMULT(*(ao1 + 0), *(ao1 + 1));
  243. data02 = CMULT(*(ao2 + 0), *(ao2 + 1));
  244. data03 = CMULT(*(ao3 + 0), *(ao3 + 1));
  245. data04 = CMULT(*(ao4 + 0), ZERO);
  246. break;
  247. }
  248. }
  249. if (offset > 0) ao1 += 2; else ao1 += lda;
  250. if (offset > -1) ao2 += 2; else ao2 += lda;
  251. if (offset > -2) ao3 += 2; else ao3 += lda;
  252. if (offset > -3) ao4 += 2; else ao4 += lda;
  253. b[ 0] = data01;
  254. b[ 1] = data02;
  255. b[ 2] = data03;
  256. b[ 3] = data04;
  257. b += 4;
  258. offset --;
  259. i --;
  260. }
  261. posX += 4;
  262. }
  263. if (n & 2) {
  264. offset = posX - posY;
  265. if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
  266. if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda;
  267. i = m;
  268. while (i > 0) {
  269. if (offset > 0) {
  270. data01 = CMULT(*(ao1 + 0), -*(ao1 + 1));
  271. data02 = CMULT(*(ao2 + 0), -*(ao2 + 1));
  272. } else
  273. if (offset < -1) {
  274. data01 = CMULT(*(ao1 + 0), *(ao1 + 1));
  275. data02 = CMULT(*(ao2 + 0), *(ao2 + 1));
  276. } else {
  277. switch (offset) {
  278. case 0 :
  279. data01 = CMULT(*(ao1 + 0), ZERO);
  280. data02 = CMULT(*(ao2 + 0), -*(ao2 + 1));
  281. break;
  282. case -1 :
  283. data01 = CMULT(*(ao1 + 0), *(ao1 + 1));
  284. data02 = CMULT(*(ao2 + 0), ZERO);
  285. break;
  286. }
  287. }
  288. if (offset > 0) ao1 += 2; else ao1 += lda;
  289. if (offset > -1) ao2 += 2; else ao2 += lda;
  290. b[ 0] = data01;
  291. b[ 1] = data02;
  292. b += 2;
  293. offset --;
  294. i --;
  295. }
  296. posX += 2;
  297. }
  298. if (n & 1) {
  299. offset = posX - posY;
  300. if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
  301. i = m;
  302. while (i > 0) {
  303. if (offset > 0) {
  304. data01 = CMULT(*(ao1 + 0), -*(ao1 + 1));
  305. } else
  306. if (offset < 0) {
  307. data01 = CMULT(*(ao1 + 0), *(ao1 + 1));
  308. } else {
  309. data01 = CMULT(*(ao1 + 0), ZERO);
  310. }
  311. if (offset > 0) ao1 += 2; else ao1 += lda;
  312. b[ 0] = data01;
  313. b ++;
  314. offset --;
  315. i --;
  316. }
  317. }
  318. return 0;
  319. }