You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

symcopy.h 30 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. /* This implementation is completely wrong. I'll rewrite this */
  39. #ifndef SYMCOPY_H
  40. #define SYMCOPY_H
  41. #if !defined(XDOUBLE) || !defined(QUAD_PRECISION)
  /*
   * Expand the lower triangle of a column-major matrix into a dense
   * symmetric m-by-m matrix.
   *
   *   m   - order of the matrix; nothing is read or written when m <= 0
   *   a   - source; only a[i + j*lda] with i >= j is read
   *   lda - column stride of a, in elements
   *   b   - destination with column stride m; all m*m elements are written
   *
   * Elements are addressed by index from the column base, so no pointer
   * beyond the matrix storage is ever formed.
   */
  static inline void SYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
    BLASLONG row, col;

    for (col = 0; col < m; col++){
      const FLOAT *src = a + col * lda;   /* column `col` of a */
      FLOAT *dst = b + col * m;           /* column `col` of b */

      for (row = col; row < m; row++){
        FLOAT v = src[row];

        dst[row] = v;                     /* b(row, col) */
        b[col + row * m] = v;             /* mirrored entry b(col, row) */
      }
    }
  }
  /*
   * Expand the upper triangle of a column-major matrix into a dense
   * symmetric m-by-m matrix.
   *
   *   m   - order of the matrix; nothing is read or written when m <= 0
   *   a   - source; only a[i + j*lda] with i <= j is read
   *   lda - column stride of a, in elements
   *   b   - destination with column stride m; all m*m elements are written
   *
   * Elements are addressed by index from the column base, so no pointer
   * beyond the matrix storage is ever formed.
   */
  static inline void SYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
    BLASLONG row, col;

    for (col = 0; col < m; col++){
      const FLOAT *src = a + col * lda;   /* column `col` of a */
      FLOAT *dst = b + col * m;           /* column `col` of b */

      for (row = 0; row <= col; row++){
        FLOAT v = src[row];

        dst[row] = v;                     /* b(row, col) */
        b[col + row * m] = v;             /* mirrored entry b(col, row) */
      }
    }
  }
  /*
   * Complex variant: expand the lower triangle of a column-major matrix of
   * interleaved (re, im) pairs into a dense complex symmetric m-by-m matrix.
   * No conjugation is applied.
   *
   *   m   - order of the matrix; nothing is read or written when m <= 0
   *   a   - source; only complex elements (i, j) with i >= j are read
   *   lda - column stride of a, in complex elements (2*lda FLOATs)
   *   b   - destination with column stride m complex elements; all
   *         2*m*m FLOATs are written
   *
   * Elements are addressed by index from the column base, so no pointer
   * beyond the matrix storage is ever formed.
   */
  static inline void ZSYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
    BLASLONG row, col;

    for (col = 0; col < m; col++){
      const FLOAT *src = a + 2 * col * lda;       /* column `col` of a */

      for (row = col; row < m; row++){
        FLOAT re = src[2 * row + 0];
        FLOAT im = src[2 * row + 1];
        FLOAT *below = b + 2 * (row + col * m);   /* b(row, col) */
        FLOAT *above = b + 2 * (col + row * m);   /* b(col, row) */

        below[0] = re;
        below[1] = im;
        above[0] = re;
        above[1] = im;
      }
    }
  }
  /*
   * Complex variant: expand the upper triangle of a column-major matrix of
   * interleaved (re, im) pairs into a dense complex symmetric m-by-m matrix.
   * No conjugation is applied.
   *
   *   m   - order of the matrix; nothing is read or written when m <= 0
   *   a   - source; only complex elements (i, j) with i <= j are read
   *   lda - column stride of a, in complex elements (2*lda FLOATs)
   *   b   - destination with column stride m complex elements; all
   *         2*m*m FLOATs are written
   *
   * Elements are addressed by index from the column base, so no pointer
   * beyond the matrix storage is ever formed.
   */
  static inline void ZSYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
    BLASLONG row, col;

    for (col = 0; col < m; col++){
      const FLOAT *src = a + 2 * col * lda;       /* column `col` of a */

      for (row = 0; row <= col; row++){
        FLOAT re = src[2 * row + 0];
        FLOAT im = src[2 * row + 1];
        FLOAT *above = b + 2 * (row + col * m);   /* b(row, col) */
        FLOAT *below = b + 2 * (col + row * m);   /* b(col, row) */

        above[0] = re;
        above[1] = im;
        below[0] = re;
        below[1] = im;
      }
    }
  }
  /*
   * Expand the lower triangle of a column-major complex matrix (interleaved
   * re, im) into a dense Hermitian m-by-m matrix.
   *
   * Below the diagonal the source value is stored unchanged; the mirrored
   * entry above the diagonal receives its complex conjugate.  Diagonal
   * entries get the source real part and an imaginary part of exactly 0;
   * the source's diagonal imaginary part is never read.
   *
   *   m   - order of the matrix; nothing is read or written when m <= 0
   *   a   - source; only complex elements (i, j) with i >= j are read
   *   lda - column stride of a, in complex elements (2*lda FLOATs)
   *   b   - destination with column stride m complex elements; all
   *         2*m*m FLOATs are written
   *
   * Elements are addressed by index from the column base, so no pointer
   * beyond the matrix storage is ever formed.
   */
  static inline void ZHEMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
    BLASLONG row, col;

    for (col = 0; col < m; col++){
      const FLOAT *src = a + 2 * col * lda;       /* column `col` of a */
      FLOAT *diag = b + 2 * (col + col * m);      /* b(col, col) */

      diag[0] = src[2 * col];
      diag[1] = 0.;

      for (row = col + 1; row < m; row++){
        FLOAT re = src[2 * row + 0];
        FLOAT im = src[2 * row + 1];
        FLOAT *below = b + 2 * (row + col * m);   /* b(row, col) */
        FLOAT *above = b + 2 * (col + row * m);   /* b(col, row) */

        below[0] = re;
        below[1] = im;
        above[0] = re;
        above[1] = -im;
      }
    }
  }
  /*
   * Expand the upper triangle of a column-major complex matrix (interleaved
   * re, im) into a dense Hermitian m-by-m matrix.
   *
   * Above the diagonal the source value is stored unchanged; the mirrored
   * entry below the diagonal receives its complex conjugate.  Diagonal
   * entries get the source real part and an imaginary part of exactly 0;
   * the source's diagonal imaginary part is never read.
   *
   *   m   - order of the matrix; nothing is read or written when m <= 0
   *   a   - source; only complex elements (i, j) with i <= j are read
   *   lda - column stride of a, in complex elements (2*lda FLOATs)
   *   b   - destination with column stride m complex elements; all
   *         2*m*m FLOATs are written
   *
   * Elements are addressed by index from the column base, so no pointer
   * beyond the matrix storage is ever formed.
   */
  static inline void ZHEMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
    BLASLONG row, col;

    for (col = 0; col < m; col++){
      const FLOAT *src = a + 2 * col * lda;       /* column `col` of a */
      FLOAT *diag = b + 2 * (col + col * m);      /* b(col, col) */

      for (row = 0; row < col; row++){
        FLOAT re = src[2 * row + 0];
        FLOAT im = src[2 * row + 1];
        FLOAT *above = b + 2 * (row + col * m);   /* b(row, col) */
        FLOAT *below = b + 2 * (col + row * m);   /* b(col, row) */

        above[0] = re;
        above[1] = im;
        below[0] = re;
        below[1] = -im;
      }

      diag[0] = src[2 * col];
      diag[1] = 0.;
    }
  }
  /*
   * Expand the lower triangle of a column-major complex matrix (interleaved
   * re, im) into a dense Hermitian m-by-m matrix, conjugating the stored
   * triangle.
   *
   * Below the diagonal the destination receives the complex conjugate of
   * the source value; the mirrored entry above the diagonal receives the
   * source value unchanged.  Diagonal entries get the source real part and
   * an imaginary part of exactly 0; the source's diagonal imaginary part is
   * never read.
   *
   *   m   - order of the matrix; nothing is read or written when m <= 0
   *   a   - source; only complex elements (i, j) with i >= j are read
   *   lda - column stride of a, in complex elements (2*lda FLOATs)
   *   b   - destination with column stride m complex elements; all
   *         2*m*m FLOATs are written
   *
   * Elements are addressed by index from the column base, so no pointer
   * beyond the matrix storage is ever formed.
   */
  static inline void ZHEMCOPY_M(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
    BLASLONG row, col;

    for (col = 0; col < m; col++){
      const FLOAT *src = a + 2 * col * lda;       /* column `col` of a */
      FLOAT *diag = b + 2 * (col + col * m);      /* b(col, col) */

      diag[0] = src[2 * col];
      diag[1] = 0.;

      for (row = col + 1; row < m; row++){
        FLOAT re = src[2 * row + 0];
        FLOAT im = src[2 * row + 1];
        FLOAT *below = b + 2 * (row + col * m);   /* b(row, col) */
        FLOAT *above = b + 2 * (col + row * m);   /* b(col, row) */

        below[0] = re;
        below[1] = -im;
        above[0] = re;
        above[1] = im;
      }
    }
  }
  /*
   * Expand the upper triangle of a column-major complex matrix (interleaved
   * re, im) into a dense Hermitian m-by-m matrix, conjugating the stored
   * triangle.
   *
   * Above the diagonal the destination receives the complex conjugate of
   * the source value; the mirrored entry below the diagonal receives the
   * source value unchanged.  Diagonal entries get the source real part and
   * an imaginary part of exactly 0; the source's diagonal imaginary part is
   * never read.
   *
   *   m   - order of the matrix; nothing is read or written when m <= 0
   *   a   - source; only complex elements (i, j) with i <= j are read
   *   lda - column stride of a, in complex elements (2*lda FLOATs)
   *   b   - destination with column stride m complex elements; all
   *         2*m*m FLOATs are written
   *
   * Elements are addressed by index from the column base, so no pointer
   * beyond the matrix storage is ever formed.
   */
  static inline void ZHEMCOPY_V(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
    BLASLONG row, col;

    for (col = 0; col < m; col++){
      const FLOAT *src = a + 2 * col * lda;       /* column `col` of a */
      FLOAT *diag = b + 2 * (col + col * m);      /* b(col, col) */

      for (row = 0; row < col; row++){
        FLOAT re = src[2 * row + 0];
        FLOAT im = src[2 * row + 1];
        FLOAT *above = b + 2 * (row + col * m);   /* b(row, col) */
        FLOAT *below = b + 2 * (col + row * m);   /* b(col, row) */

        above[0] = re;
        above[1] = -im;
        below[0] = re;
        below[1] = im;
      }

      diag[0] = src[2 * col];
      diag[1] = 0.;
    }
  }
  /*
   * Copy the lower triangle of a column-major matrix into a dense m-by-m
   * matrix, mirroring every element across the diagonal.  The result is
   * symmetric, so it equals its own transpose.
   *
   *   m   - order of the matrix; nothing is read or written when m <= 0
   *   a   - source; only a[i + j*lda] with i >= j is read
   *   lda - column stride of a, in elements
   *   b   - destination with column stride m; all m*m elements are written
   *
   * Elements are addressed by index from the column base, so no pointer
   * beyond the matrix storage is ever formed.
   */
  static inline void TRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
    BLASLONG row, col;

    for (col = 0; col < m; col++){
      const FLOAT *src = a + col * lda;   /* column `col` of a */
      FLOAT *dst = b + col * m;           /* column `col` of b */

      for (row = col; row < m; row++){
        FLOAT v = src[row];

        dst[row] = v;                     /* b(row, col) */
        b[col + row * m] = v;             /* mirrored entry b(col, row) */
      }
    }
  }
  /*
   * Copy the lower triangle of a column-major matrix into a dense m-by-m
   * matrix, mirroring every element across the diagonal.  The result is
   * symmetric, so it equals its own transpose.
   *
   *   m   - order of the matrix; nothing is read or written when m <= 0
   *   a   - source; only a[i + j*lda] with i >= j is read
   *   lda - column stride of a, in elements
   *   b   - destination with column stride m; all m*m elements are written
   *
   * Elements are addressed by index from the column base, so no pointer
   * beyond the matrix storage is ever formed.
   */
  static inline void TRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
    BLASLONG row, col;

    for (col = 0; col < m; col++){
      const FLOAT *src = a + col * lda;   /* column `col` of a */
      FLOAT *dst = b + col * m;           /* column `col` of b */

      for (row = col; row < m; row++){
        FLOAT v = src[row];

        dst[row] = v;                     /* b(row, col) */
        b[col + row * m] = v;             /* mirrored entry b(col, row) */
      }
    }
  }
  /*
   * Copy the upper triangle of a column-major matrix into a dense m-by-m
   * matrix, mirroring every element across the diagonal.  The result is
   * symmetric, so it equals its own transpose.
   *
   *   m   - order of the matrix; nothing is read or written when m <= 0
   *   a   - source; only a[i + j*lda] with i <= j is read
   *   lda - column stride of a, in elements
   *   b   - destination with column stride m; all m*m elements are written
   *
   * Elements are addressed by index from the column base, so no pointer
   * beyond the matrix storage is ever formed.
   */
  static inline void TRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
    BLASLONG row, col;

    for (col = 0; col < m; col++){
      const FLOAT *src = a + col * lda;   /* column `col` of a */
      FLOAT *dst = b + col * m;           /* column `col` of b */

      for (row = 0; row <= col; row++){
        FLOAT v = src[row];

        dst[row] = v;                     /* b(row, col) */
        b[col + row * m] = v;             /* mirrored entry b(col, row) */
      }
    }
  }
  /*
   * Copy the upper triangle of a column-major matrix into a dense m-by-m
   * matrix, mirroring every element across the diagonal.  The result is
   * symmetric, so it equals its own transpose.
   *
   *   m   - order of the matrix; nothing is read or written when m <= 0
   *   a   - source; only a[i + j*lda] with i <= j is read
   *   lda - column stride of a, in elements
   *   b   - destination with column stride m; all m*m elements are written
   *
   * Elements are addressed by index from the column base, so no pointer
   * beyond the matrix storage is ever formed.
   */
  static inline void TRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
    BLASLONG row, col;

    for (col = 0; col < m; col++){
      const FLOAT *src = a + col * lda;   /* column `col` of a */
      FLOAT *dst = b + col * m;           /* column `col` of b */

      for (row = 0; row <= col; row++){
        FLOAT v = src[row];

        dst[row] = v;                     /* b(row, col) */
        b[col + row * m] = v;             /* mirrored entry b(col, row) */
      }
    }
  }
  /*
   * Complex variant: copy the lower triangle of a column-major matrix of
   * interleaved (re, im) pairs into a dense m-by-m matrix, mirroring every
   * element across the diagonal without conjugation.
   *
   *   m   - order of the matrix; nothing is read or written when m <= 0
   *   a   - source; only complex elements (i, j) with i >= j are read
   *   lda - column stride of a, in complex elements (2*lda FLOATs)
   *   b   - destination with column stride m complex elements; all
   *         2*m*m FLOATs are written
   *
   * Elements are addressed by index from the column base, so no pointer
   * beyond the matrix storage is ever formed.
   */
  static inline void ZTRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
    BLASLONG row, col;

    for (col = 0; col < m; col++){
      const FLOAT *src = a + 2 * col * lda;       /* column `col` of a */

      for (row = col; row < m; row++){
        FLOAT re = src[2 * row + 0];
        FLOAT im = src[2 * row + 1];
        FLOAT *below = b + 2 * (row + col * m);   /* b(row, col) */
        FLOAT *above = b + 2 * (col + row * m);   /* b(col, row) */

        below[0] = re;
        below[1] = im;
        above[0] = re;
        above[1] = im;
      }
    }
  }
  /*
   * Complex variant: copy the lower triangle of a column-major matrix of
   * interleaved (re, im) pairs into a dense m-by-m matrix, mirroring every
   * element across the diagonal without conjugation.
   *
   *   m   - order of the matrix; nothing is read or written when m <= 0
   *   a   - source; only complex elements (i, j) with i >= j are read
   *   lda - column stride of a, in complex elements (2*lda FLOATs)
   *   b   - destination with column stride m complex elements; all
   *         2*m*m FLOATs are written
   *
   * Elements are addressed by index from the column base, so no pointer
   * beyond the matrix storage is ever formed.
   */
  static inline void ZTRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
    BLASLONG row, col;

    for (col = 0; col < m; col++){
      const FLOAT *src = a + 2 * col * lda;       /* column `col` of a */

      for (row = col; row < m; row++){
        FLOAT re = src[2 * row + 0];
        FLOAT im = src[2 * row + 1];
        FLOAT *below = b + 2 * (row + col * m);   /* b(row, col) */
        FLOAT *above = b + 2 * (col + row * m);   /* b(col, row) */

        below[0] = re;
        below[1] = im;
        above[0] = re;
        above[1] = im;
      }
    }
  }
  /*
   * Complex variant: copy the upper triangle of a column-major matrix of
   * interleaved (re, im) pairs into a dense m-by-m matrix, mirroring every
   * element across the diagonal without conjugation.
   *
   *   m   - order of the matrix; nothing is read or written when m <= 0
   *   a   - source; only complex elements (i, j) with i <= j are read
   *   lda - column stride of a, in complex elements (2*lda FLOATs)
   *   b   - destination with column stride m complex elements; all
   *         2*m*m FLOATs are written
   *
   * Elements are addressed by index from the column base, so no pointer
   * beyond the matrix storage is ever formed.
   */
  static inline void ZTRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
    BLASLONG row, col;

    for (col = 0; col < m; col++){
      const FLOAT *src = a + 2 * col * lda;       /* column `col` of a */

      for (row = 0; row <= col; row++){
        FLOAT re = src[2 * row + 0];
        FLOAT im = src[2 * row + 1];
        FLOAT *above = b + 2 * (row + col * m);   /* b(row, col) */
        FLOAT *below = b + 2 * (col + row * m);   /* b(col, row) */

        above[0] = re;
        above[1] = im;
        below[0] = re;
        below[1] = im;
      }
    }
  }
  /*
   * Complex variant: copy the upper triangle of a column-major matrix of
   * interleaved (re, im) pairs into a dense m-by-m matrix, mirroring every
   * element across the diagonal without conjugation.
   *
   *   m   - order of the matrix; nothing is read or written when m <= 0
   *   a   - source; only complex elements (i, j) with i <= j are read
   *   lda - column stride of a, in complex elements (2*lda FLOATs)
   *   b   - destination with column stride m complex elements; all
   *         2*m*m FLOATs are written
   *
   * Elements are addressed by index from the column base, so no pointer
   * beyond the matrix storage is ever formed.
   */
  static inline void ZTRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
    BLASLONG row, col;

    for (col = 0; col < m; col++){
      const FLOAT *src = a + 2 * col * lda;       /* column `col` of a */

      for (row = 0; row <= col; row++){
        FLOAT re = src[2 * row + 0];
        FLOAT im = src[2 * row + 1];
        FLOAT *above = b + 2 * (row + col * m);   /* b(row, col) */
        FLOAT *below = b + 2 * (col + row * m);   /* b(col, row) */

        above[0] = re;
        above[1] = im;
        below[0] = re;
        below[1] = im;
      }
    }
  }
  1417. #endif
  1418. #endif