You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

trmm_ltcopy_6.c 16 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #include <stdio.h>
  39. #include "common.h"
  40. int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
  41. BLASLONG i, js, ii;
  42. BLASLONG X;
  43. FLOAT data01, data02, data05, data06;
  44. FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6;
  45. js = (n / 6);
  46. if (js > 0){
  47. do {
  48. X = posX;
  49. if (posX <= posY) {
  50. ao1 = a + posY + (posX + 0) * lda;
  51. ao2 = a + posY + (posX + 1) * lda;
  52. ao3 = a + posY + (posX + 2) * lda;
  53. ao4 = a + posY + (posX + 3) * lda;
  54. ao5 = a + posY + (posX + 4) * lda;
  55. ao6 = a + posY + (posX + 5) * lda;
  56. } else {
  57. ao1 = a + posX + (posY + 0) * lda;
  58. ao2 = a + posX + (posY + 1) * lda;
  59. ao3 = a + posX + (posY + 2) * lda;
  60. ao4 = a + posX + (posY + 3) * lda;
  61. ao5 = a + posX + (posY + 4) * lda;
  62. ao6 = a + posX + (posY + 5) * lda;
  63. }
  64. i = (m / 6);
  65. if (i > 0) {
  66. do {
  67. if (X > posY) {
  68. ao1 += 6;
  69. ao2 += 6;
  70. ao3 += 6;
  71. ao4 += 6;
  72. ao5 += 6;
  73. ao6 += 6;
  74. b += 36;
  75. } else if (X < posY) {
  76. for (ii = 0; ii < 6; ii++){
  77. b[ 0] = *(ao1 + 0);
  78. b[ 1] = *(ao1 + 1);
  79. b[ 2] = *(ao1 + 2);
  80. b[ 3] = *(ao1 + 3);
  81. b[ 4] = *(ao1 + 4);
  82. b[ 5] = *(ao1 + 5);
  83. ao1 += lda;
  84. b += 6;
  85. }
  86. ao2 += 6 * lda;
  87. ao3 += 6 * lda;
  88. ao4 += 6 * lda;
  89. ao5 += 6 * lda;
  90. ao6 += 6 * lda;
  91. } else {
  92. #ifdef UNIT
  93. b[ 0] = ONE;
  94. #else
  95. b[ 0] = *(ao1 + 0);
  96. #endif
  97. b[ 1] = *(ao1 + 1);
  98. b[ 2] = *(ao1 + 2);
  99. b[ 3] = *(ao1 + 3);
  100. b[ 4] = *(ao1 + 4);
  101. b[ 5] = *(ao1 + 5);
  102. b[ 6] = ZERO;
  103. #ifdef UNIT
  104. b[ 7] = ONE;
  105. #else
  106. b[ 7] = *(ao2 + 1);
  107. #endif
  108. b[ 8] = *(ao2 + 2);
  109. b[ 9] = *(ao2 + 3);
  110. b[10] = *(ao2 + 4);
  111. b[11] = *(ao2 + 5);
  112. b[12] = ZERO;
  113. b[13] = ZERO;
  114. #ifdef UNIT
  115. b[14] = ONE;
  116. #else
  117. b[14] = *(ao3 + 2);
  118. #endif
  119. b[15] = *(ao3 + 3);
  120. b[16] = *(ao3 + 4);
  121. b[17] = *(ao3 + 5);
  122. b[18] = ZERO;
  123. b[19] = ZERO;
  124. b[20] = ZERO;
  125. #ifdef UNIT
  126. b[21] = ONE;
  127. #else
  128. b[21] = *(ao4 + 3);
  129. #endif
  130. b[22] = *(ao4 + 4);
  131. b[23] = *(ao4 + 5);
  132. b[24] = ZERO;
  133. b[25] = ZERO;
  134. b[26] = ZERO;
  135. b[27] = ZERO;
  136. #ifdef UNIT
  137. b[28] = ONE;
  138. #else
  139. b[28] = *(ao5 + 4);
  140. #endif
  141. b[29] = *(ao5 + 5);
  142. b[30] = ZERO;
  143. b[31] = ZERO;
  144. b[32] = ZERO;
  145. b[33] = ZERO;
  146. b[34] = ZERO;
  147. #ifdef UNIT
  148. b[35] = ONE;
  149. #else
  150. b[35] = *(ao6 + 5);
  151. #endif
  152. ao1 += 6;
  153. ao2 += 6;
  154. ao3 += 6;
  155. ao4 += 6;
  156. ao5 += 6;
  157. ao6 += 6;
  158. b += 36;
  159. }
  160. X += 6;
  161. i --;
  162. } while (i > 0);
  163. }
  164. i = (m % 6);
  165. if (i) {
  166. if (X > posY) {
  167. b += 6 * i;
  168. } else if (X < posY) {
  169. for (ii = 0; ii < i; ii++){
  170. b[ 0] = *(ao1 + 0);
  171. b[ 1] = *(ao1 + 1);
  172. b[ 2] = *(ao1 + 2);
  173. b[ 3] = *(ao1 + 3);
  174. b[ 4] = *(ao1 + 4);
  175. b[ 5] = *(ao1 + 5);
  176. ao1 += lda;
  177. ao2 += lda;
  178. ao3 += lda;
  179. ao4 += lda;
  180. ao5 += lda;
  181. ao6 += lda;
  182. b += 6;
  183. }
  184. } else {
  185. #ifdef UNIT
  186. b[ 0] = ONE;
  187. #else
  188. b[ 0] = *(ao1 + 0);
  189. #endif
  190. b[ 1] = *(ao1 + 1);
  191. b[ 2] = *(ao1 + 2);
  192. b[ 3] = *(ao1 + 3);
  193. b[ 4] = *(ao1 + 4);
  194. b[ 5] = *(ao1 + 5);
  195. b += 6;
  196. if (i >= 2) {
  197. b[ 0] = ZERO;
  198. #ifdef UNIT
  199. b[ 1] = ONE;
  200. #else
  201. b[ 1] = *(ao2 + 1);
  202. #endif
  203. b[ 2] = *(ao2 + 2);
  204. b[ 3] = *(ao2 + 3);
  205. b[ 4] = *(ao2 + 4);
  206. b[ 5] = *(ao2 + 5);
  207. b += 6;
  208. }
  209. if (i >= 3) {
  210. b[ 0] = ZERO;
  211. b[ 1] = ZERO;
  212. #ifdef UNIT
  213. b[ 2] = ONE;
  214. #else
  215. b[ 2] = *(ao3 + 2);
  216. #endif
  217. b[ 3] = *(ao3 + 3);
  218. b[ 4] = *(ao3 + 4);
  219. b[ 5] = *(ao3 + 5);
  220. b += 6;
  221. }
  222. if (i >= 4) {
  223. b[ 0] = ZERO;
  224. b[ 1] = ZERO;
  225. b[ 2] = ZERO;
  226. #ifdef UNIT
  227. b[ 3] = ONE;
  228. #else
  229. b[ 3] = *(ao4 + 3);
  230. #endif
  231. b[ 4] = *(ao4 + 4);
  232. b[ 5] = *(ao4 + 5);
  233. b += 6;
  234. }
  235. if (i >= 5) {
  236. b[ 0] = ZERO;
  237. b[ 1] = ZERO;
  238. b[ 2] = ZERO;
  239. b[ 3] = ZERO;
  240. #ifdef UNIT
  241. b[ 4] = ONE;
  242. #else
  243. b[ 4] = *(ao5 + 4);
  244. #endif
  245. b[ 5] = *(ao5 + 5);
  246. b += 6;
  247. }
  248. }
  249. }
  250. posY += 6;
  251. js --;
  252. } while (js > 0);
  253. } /* End of main loop */
  254. if ((n % 6) & 4){
  255. X = posX;
  256. if (posX <= posY) {
  257. ao1 = a + posY + (posX + 0) * lda;
  258. ao2 = a + posY + (posX + 1) * lda;
  259. ao3 = a + posY + (posX + 2) * lda;
  260. ao4 = a + posY + (posX + 3) * lda;
  261. } else {
  262. ao1 = a + posX + (posY + 0) * lda;
  263. ao2 = a + posX + (posY + 1) * lda;
  264. ao3 = a + posX + (posY + 2) * lda;
  265. ao4 = a + posX + (posY + 3) * lda;
  266. }
  267. i = (m >> 1);
  268. if (i > 0) {
  269. do {
  270. if (X > posY) {
  271. ao1 += 2;
  272. ao2 += 2;
  273. ao3 += 2;
  274. ao4 += 2;
  275. b += 8;
  276. } else if (X < posY) {
  277. for (ii = 0; ii < 2; ii++){
  278. b[ 0] = *(ao1 + 0);
  279. b[ 1] = *(ao1 + 1);
  280. b[ 2] = *(ao1 + 2);
  281. b[ 3] = *(ao1 + 3);
  282. ao1 += lda;
  283. b += 4;
  284. }
  285. ao2 += 2 * lda;
  286. ao3 += 2 * lda;
  287. ao4 += 2 * lda;
  288. } else {
  289. #ifdef UNIT
  290. b[ 0] = ONE;
  291. #else
  292. b[ 0] = *(ao1 + 0);
  293. #endif
  294. b[ 1] = *(ao1 + 1);
  295. b[ 2] = *(ao1 + 2);
  296. b[ 3] = *(ao1 + 3);
  297. b[ 4] = ZERO;
  298. #ifdef UNIT
  299. b[ 5] = ONE;
  300. #else
  301. b[ 5] = *(ao2 + 1);
  302. #endif
  303. b[ 6] = *(ao2 + 2);
  304. b[ 7] = *(ao2 + 3);
  305. b[ 8] = ZERO;
  306. b[ 9] = ZERO;
  307. #ifdef UNIT
  308. b[ 10] = ONE;
  309. #else
  310. b[ 10] = *(ao3 + 2);
  311. #endif
  312. b[ 11] = *(ao3 + 3);
  313. b[ 12] = ZERO;
  314. b[ 13] = ZERO;
  315. b[ 14] = ZERO;
  316. #ifdef UNIT
  317. b[ 15] = ONE;
  318. #else
  319. b[ 15] = *(ao4 + 3);
  320. #endif
  321. ao1 += 4;
  322. ao2 += 4;
  323. ao3 += 4;
  324. ao4 += 4;
  325. b += 16;
  326. X += 4;
  327. i -= 2;
  328. continue;
  329. }
  330. X += 2;
  331. i --;
  332. } while (i > 0);
  333. }
  334. i = (m & 1);
  335. if (i > 0) {
  336. if (X > posY) {
  337. /* ao1 += i;
  338. ao2 += i;
  339. ao3 += i;
  340. ao4 += i; */
  341. b += 4 * i;
  342. } else if (X < posY) {
  343. for (ii = 0; ii < i; ii++){
  344. b[ 0] = *(ao1 + 0);
  345. b[ 1] = *(ao1 + 1);
  346. b[ 2] = *(ao1 + 2);
  347. b[ 3] = *(ao1 + 3);
  348. // ao1 += lda;
  349. // ao2 += lda;
  350. // ao3 += lda;
  351. // ao4 += lda;
  352. b += 4;
  353. }
  354. } else {
  355. #ifdef UNIT
  356. b[ 0] = ONE;
  357. #else
  358. b[ 0] = *(ao1 + 0);
  359. #endif
  360. b[ 1] = *(ao1 + 1);
  361. b[ 2] = *(ao1 + 2);
  362. b[ 3] = *(ao1 + 3);
  363. b += 4;
  364. }
  365. }
  366. posY += 4;
  367. }
  368. if ((n % 6) & 2){
  369. X = posX;
  370. if (posX <= posY) {
  371. ao1 = a + posY + (posX + 0) * lda;
  372. ao2 = a + posY + (posX + 1) * lda;
  373. } else {
  374. ao1 = a + posX + (posY + 0) * lda;
  375. ao2 = a + posX + (posY + 1) * lda;
  376. }
  377. i = (m >> 1);
  378. if (i > 0) {
  379. do {
  380. if (X > posY) {
  381. ao1 += 2;
  382. ao2 += 2;
  383. b += 4;
  384. } else if (X < posY) {
  385. data01 = *(ao1 + 0);
  386. data02 = *(ao1 + 1);
  387. data05 = *(ao2 + 0);
  388. data06 = *(ao2 + 1);
  389. b[ 0] = data01;
  390. b[ 1] = data02;
  391. b[ 2] = data05;
  392. b[ 3] = data06;
  393. ao1 += 2 * lda;
  394. ao2 += 2 * lda;
  395. b += 4;
  396. } else {
  397. #ifdef UNIT
  398. data02 = *(ao1 + 1);
  399. b[ 0] = ONE;
  400. b[ 1] = data02;
  401. b[ 2] = ZERO;
  402. b[ 3] = ONE;
  403. #else
  404. data01 = *(ao1 + 0);
  405. data02 = *(ao1 + 1);
  406. data06 = *(ao2 + 1);
  407. b[ 0] = data01;
  408. b[ 1] = data02;
  409. b[ 2] = ZERO;
  410. b[ 3] = data06;
  411. #endif
  412. ao1 += 2;
  413. ao2 += 2;
  414. b += 4;
  415. }
  416. X += 2;
  417. i --;
  418. } while (i > 0);
  419. }
  420. i = (m & 1);
  421. if (i) {
  422. if (X > posY) {
  423. ao1 += 1;
  424. ao2 += 1;
  425. b += 2;
  426. } else if (X < posY) {
  427. data01 = *(ao1 + 0);
  428. data02 = *(ao1 + 1);
  429. b[ 0] = data01;
  430. b[ 1] = data02;
  431. ao1 += lda;
  432. b += 2;
  433. } else {
  434. #ifdef UNIT
  435. data02 = *(ao1 + 1);
  436. b[ 0] = ONE;
  437. b[ 1] = data02;
  438. #else
  439. data01 = *(ao1 + 0);
  440. data02 = *(ao1 + 1);
  441. b[ 0] = data01;
  442. b[ 1] = data02;
  443. #endif
  444. b += 2;
  445. }
  446. }
  447. posY += 2;
  448. }
  449. if ((n % 6) & 1){
  450. X = posX;
  451. if (posX <= posY) {
  452. ao1 = a + posY + (posX + 0) * lda;
  453. } else {
  454. ao1 = a + posX + (posY + 0) * lda;
  455. }
  456. i = m;
  457. if (i > 0) {
  458. do {
  459. if (X > posY) {
  460. b += 1;
  461. ao1 += 1;
  462. } else if (X < posY) {
  463. data01 = *(ao1 + 0);
  464. b[ 0] = data01;
  465. ao1 += lda;
  466. b += 1;
  467. } else {
  468. #ifdef UNIT
  469. b[ 0] = ONE;
  470. #else
  471. data01 = *(ao1 + 0);
  472. b[ 0] = data01;
  473. #endif
  474. ao1 += 1;
  475. b += 1;
  476. }
  477. X ++;
  478. i --;
  479. } while (i > 0);
  480. }
  481. posY += 1;
  482. }
  483. return 0;
  484. }