You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

symcopy.h 30 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. /* This implementation is completely wrong. I'll rewrite this */
  39. #ifndef SYMCOPY_H
  40. #define SYMCOPY_H
  41. #if !defined(XDOUBLE) || !defined(QUAD_PRECISION)
  42. static inline void SYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  43. BLASLONG is, js;
  44. FLOAT *aa1, *aa2;
  45. FLOAT *b1, *b2;
  46. FLOAT *bb1, *bb2;
  47. FLOAT *cc1, *cc2;
  48. FLOAT a11, a12;
  49. FLOAT a21, a22;
  50. b1 = b;
  51. b2 = b;
  52. for (js = 0; js < m; js += 2){
  53. aa1 = a + 0 * lda;
  54. aa2 = a + 1 * lda;
  55. a += 2 * lda + 2;
  56. bb1 = b1 + 0 * m;
  57. bb2 = b1 + 1 * m;
  58. b1 += 2 * m + 2;
  59. cc1 = b2 + 0 * m;
  60. cc2 = b2 + 1 * m;
  61. b2 += 2 * m + 2;
  62. if (m - js >= 2){
  63. a11 = *(aa1 + 0);
  64. a21 = *(aa1 + 1);
  65. a22 = *(aa2 + 1);
  66. *(bb1 + 0) = a11;
  67. *(bb1 + 1) = a21;
  68. *(bb2 + 0) = a21;
  69. *(bb2 + 1) = a22;
  70. aa1 += 2;
  71. aa2 += 2;
  72. bb1 += 2;
  73. bb2 += 2;
  74. cc1 += 2 * m;
  75. cc2 += 2 * m;
  76. is = ((m - js - 2) >> 1);
  77. while (is > 0){
  78. a11 = *(aa1 + 0);
  79. a21 = *(aa1 + 1);
  80. a12 = *(aa2 + 0);
  81. a22 = *(aa2 + 1);
  82. aa1 += 2;
  83. aa2 += 2;
  84. *(bb1 + 0) = a11;
  85. *(bb1 + 1) = a21;
  86. *(bb2 + 0) = a12;
  87. *(bb2 + 1) = a22;
  88. *(cc1 + 0) = a11;
  89. *(cc1 + 1) = a12;
  90. *(cc2 + 0) = a21;
  91. *(cc2 + 1) = a22;
  92. bb1 += 2;
  93. bb2 += 2;
  94. cc1 += 2 * m;
  95. cc2 += 2 * m;
  96. is --;
  97. }
  98. is = ((m - js - 2) & 1);
  99. if (is == 1){
  100. a11 = *(aa1 + 0);
  101. a12 = *(aa2 + 0);
  102. *(bb1 + 0) = a11;
  103. *(bb2 + 0) = a12;
  104. *(cc1 + 0) = a11;
  105. *(cc1 + 1) = a12;
  106. }
  107. }
  108. if (m - js == 1){
  109. a11 = *(aa1 + 0);
  110. *(bb1 + 0) = a11;
  111. }
  112. }
  113. }
  114. static inline void SYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  115. BLASLONG is, js;
  116. FLOAT *aa1, *aa2;
  117. FLOAT *b1, *b2;
  118. FLOAT *bb1, *bb2;
  119. FLOAT *cc1, *cc2;
  120. FLOAT a11, a12;
  121. FLOAT a21, a22;
  122. b1 = b;
  123. b2 = b;
  124. for (js = 0; js < m; js += 2){
  125. aa1 = a + 0 * lda;
  126. aa2 = a + 1 * lda;
  127. a += 2 * lda;
  128. bb1 = b1 + 0 * m;
  129. bb2 = b1 + 1 * m;
  130. b1 += 2 * m;
  131. cc1 = b2 + 0 * m;
  132. cc2 = b2 + 1 * m;
  133. b2 += 2;
  134. if (m - js >= 2){
  135. for (is = 0; is < js; is += 2){
  136. a11 = *(aa1 + 0);
  137. a21 = *(aa1 + 1);
  138. a12 = *(aa2 + 0);
  139. a22 = *(aa2 + 1);
  140. aa1 += 2;
  141. aa2 += 2;
  142. *(bb1 + 0) = a11;
  143. *(bb1 + 1) = a21;
  144. *(bb2 + 0) = a12;
  145. *(bb2 + 1) = a22;
  146. *(cc1 + 0) = a11;
  147. *(cc1 + 1) = a12;
  148. *(cc2 + 0) = a21;
  149. *(cc2 + 1) = a22;
  150. bb1 += 2;
  151. bb2 += 2;
  152. cc1 += 2 * m;
  153. cc2 += 2 * m;
  154. }
  155. a11 = *(aa1 + 0);
  156. a12 = *(aa2 + 0);
  157. a22 = *(aa2 + 1);
  158. *(bb1 + 0) = a11;
  159. *(bb1 + 1) = a12;
  160. *(bb2 + 0) = a12;
  161. *(bb2 + 1) = a22;
  162. }
  163. if (m - js == 1){
  164. for (is = 0; is < js; is += 2){
  165. a11 = *(aa1 + 0);
  166. a21 = *(aa1 + 1);
  167. aa1 += 2;
  168. *(bb1 + 0) = a11;
  169. *(bb1 + 1) = a21;
  170. *(cc1 + 0) = a11;
  171. *(cc2 + 0) = a21;
  172. bb1 += 2;
  173. cc1 += 2 * m;
  174. cc2 += 2 * m;
  175. }
  176. a11 = *(aa1 + 0);
  177. *(bb1 + 0) = a11;
  178. }
  179. }
  180. }
  181. static inline void ZSYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  182. BLASLONG is, js;
  183. FLOAT *aa1, *aa2;
  184. FLOAT *b1, *b2;
  185. FLOAT *bb1, *bb2;
  186. FLOAT *cc1, *cc2;
  187. FLOAT a11, a21, a31, a41;
  188. FLOAT a12, a22, a32, a42;
  189. b1 = b;
  190. b2 = b;
  191. lda *= 2;
  192. for (js = 0; js < m; js += 2){
  193. aa1 = a + 0 * lda;
  194. aa2 = a + 1 * lda;
  195. a += 2 * lda + 4;
  196. bb1 = b1 + 0 * m;
  197. bb2 = b1 + 2 * m;
  198. b1 += 4 * m + 4;
  199. cc1 = b2 + 0 * m;
  200. cc2 = b2 + 2 * m;
  201. b2 += 4 * m + 4;
  202. if (m - js >= 2){
  203. a11 = *(aa1 + 0);
  204. a21 = *(aa1 + 1);
  205. a31 = *(aa1 + 2);
  206. a41 = *(aa1 + 3);
  207. a12 = *(aa2 + 2);
  208. a22 = *(aa2 + 3);
  209. *(bb1 + 0) = a11;
  210. *(bb1 + 1) = a21;
  211. *(bb1 + 2) = a31;
  212. *(bb1 + 3) = a41;
  213. *(bb2 + 0) = a31;
  214. *(bb2 + 1) = a41;
  215. *(bb2 + 2) = a12;
  216. *(bb2 + 3) = a22;
  217. aa1 += 4;
  218. aa2 += 4;
  219. bb1 += 4;
  220. bb2 += 4;
  221. cc1 += 4 * m;
  222. cc2 += 4 * m;
  223. is = ((m - js - 2) >> 1);
  224. while (is > 0){
  225. a11 = *(aa1 + 0);
  226. a21 = *(aa1 + 1);
  227. a31 = *(aa1 + 2);
  228. a41 = *(aa1 + 3);
  229. a12 = *(aa2 + 0);
  230. a22 = *(aa2 + 1);
  231. a32 = *(aa2 + 2);
  232. a42 = *(aa2 + 3);
  233. aa1 += 4;
  234. aa2 += 4;
  235. *(bb1 + 0) = a11;
  236. *(bb1 + 1) = a21;
  237. *(bb1 + 2) = a31;
  238. *(bb1 + 3) = a41;
  239. *(bb2 + 0) = a12;
  240. *(bb2 + 1) = a22;
  241. *(bb2 + 2) = a32;
  242. *(bb2 + 3) = a42;
  243. *(cc1 + 0) = a11;
  244. *(cc1 + 1) = a21;
  245. *(cc1 + 2) = a12;
  246. *(cc1 + 3) = a22;
  247. *(cc2 + 0) = a31;
  248. *(cc2 + 1) = a41;
  249. *(cc2 + 2) = a32;
  250. *(cc2 + 3) = a42;
  251. bb1 += 4;
  252. bb2 += 4;
  253. cc1 += 4 * m;
  254. cc2 += 4 * m;
  255. is --;
  256. }
  257. if (m & 1){
  258. a11 = *(aa1 + 0);
  259. a21 = *(aa1 + 1);
  260. a12 = *(aa2 + 0);
  261. a22 = *(aa2 + 1);
  262. *(bb1 + 0) = a11;
  263. *(bb1 + 1) = a21;
  264. *(bb2 + 0) = a12;
  265. *(bb2 + 1) = a22;
  266. *(cc1 + 0) = a11;
  267. *(cc1 + 1) = a21;
  268. *(cc1 + 2) = a12;
  269. *(cc1 + 3) = a22;
  270. }
  271. }
  272. if (m - js == 1){
  273. a11 = *(aa1 + 0);
  274. a21 = *(aa1 + 1);
  275. *(bb1 + 0) = a11;
  276. *(bb1 + 1) = a21;
  277. }
  278. }
  279. }
  280. static inline void ZSYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  281. BLASLONG is, js;
  282. FLOAT *aa1, *aa2;
  283. FLOAT *b1, *b2;
  284. FLOAT *bb1, *bb2;
  285. FLOAT *cc1, *cc2;
  286. FLOAT a11, a21, a31, a41;
  287. FLOAT a12, a22, a32, a42;
  288. b1 = b;
  289. b2 = b;
  290. lda *= 2;
  291. for (js = 0; js < m; js += 2){
  292. aa1 = a + 0 * lda;
  293. aa2 = a + 1 * lda;
  294. a += 2 * lda;
  295. bb1 = b1 + 0 * m;
  296. bb2 = b1 + 2 * m;
  297. b1 += 4 * m;
  298. cc1 = b2 + 0 * m;
  299. cc2 = b2 + 2 * m;
  300. b2 += 4;
  301. if (m - js >= 2){
  302. for (is = 0; is < js; is += 2){
  303. a11 = *(aa1 + 0);
  304. a21 = *(aa1 + 1);
  305. a31 = *(aa1 + 2);
  306. a41 = *(aa1 + 3);
  307. a12 = *(aa2 + 0);
  308. a22 = *(aa2 + 1);
  309. a32 = *(aa2 + 2);
  310. a42 = *(aa2 + 3);
  311. aa1 += 4;
  312. aa2 += 4;
  313. *(bb1 + 0) = a11;
  314. *(bb1 + 1) = a21;
  315. *(bb1 + 2) = a31;
  316. *(bb1 + 3) = a41;
  317. *(bb2 + 0) = a12;
  318. *(bb2 + 1) = a22;
  319. *(bb2 + 2) = a32;
  320. *(bb2 + 3) = a42;
  321. *(cc1 + 0) = a11;
  322. *(cc1 + 1) = a21;
  323. *(cc1 + 2) = a12;
  324. *(cc1 + 3) = a22;
  325. *(cc2 + 0) = a31;
  326. *(cc2 + 1) = a41;
  327. *(cc2 + 2) = a32;
  328. *(cc2 + 3) = a42;
  329. bb1 += 4;
  330. bb2 += 4;
  331. cc1 += 4 * m;
  332. cc2 += 4 * m;
  333. }
  334. a11 = *(aa1 + 0);
  335. a21 = *(aa1 + 1);
  336. a12 = *(aa2 + 0);
  337. a22 = *(aa2 + 1);
  338. a32 = *(aa2 + 2);
  339. a42 = *(aa2 + 3);
  340. *(bb1 + 0) = a11;
  341. *(bb1 + 1) = a21;
  342. *(bb1 + 2) = a12;
  343. *(bb1 + 3) = a22;
  344. *(bb2 + 0) = a12;
  345. *(bb2 + 1) = a22;
  346. *(bb2 + 2) = a32;
  347. *(bb2 + 3) = a42;
  348. }
  349. if (m - js == 1){
  350. for (is = 0; is < js; is += 2){
  351. a11 = *(aa1 + 0);
  352. a21 = *(aa1 + 1);
  353. a31 = *(aa1 + 2);
  354. a41 = *(aa1 + 3);
  355. aa1 += 4;
  356. *(bb1 + 0) = a11;
  357. *(bb1 + 1) = a21;
  358. *(bb1 + 2) = a31;
  359. *(bb1 + 3) = a41;
  360. *(cc1 + 0) = a11;
  361. *(cc1 + 1) = a21;
  362. *(cc2 + 0) = a31;
  363. *(cc2 + 1) = a41;
  364. bb1 += 4;
  365. cc1 += 4 * m;
  366. cc2 += 4 * m;
  367. }
  368. a11 = *(aa1 + 0);
  369. a21 = *(aa1 + 1);
  370. *(bb1 + 0) = a11;
  371. *(bb1 + 1) = a21;
  372. }
  373. }
  374. }
  375. static inline void ZHEMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  376. BLASLONG is, js;
  377. FLOAT *aa1, *aa2;
  378. FLOAT *b1, *b2;
  379. FLOAT *bb1, *bb2;
  380. FLOAT *cc1, *cc2;
  381. FLOAT a11, a21, a31, a41;
  382. FLOAT a12, a22, a32, a42;
  383. b1 = b;
  384. b2 = b;
  385. lda *= 2;
  386. for (js = 0; js < m; js += 2){
  387. aa1 = a + 0 * lda;
  388. aa2 = a + 1 * lda;
  389. a += 2 * lda + 4;
  390. bb1 = b1 + 0 * m;
  391. bb2 = b1 + 2 * m;
  392. b1 += 4 * m + 4;
  393. cc1 = b2 + 0 * m;
  394. cc2 = b2 + 2 * m;
  395. b2 += 4 * m + 4;
  396. if (m - js >= 2){
  397. a11 = *(aa1 + 0);
  398. a31 = *(aa1 + 2);
  399. a41 = *(aa1 + 3);
  400. a12 = *(aa2 + 2);
  401. *(bb1 + 0) = a11;
  402. *(bb1 + 1) = 0.;
  403. *(bb1 + 2) = a31;
  404. *(bb1 + 3) = a41;
  405. *(bb2 + 0) = a31;
  406. *(bb2 + 1) = -a41;
  407. *(bb2 + 2) = a12;
  408. *(bb2 + 3) = 0.;
  409. aa1 += 4;
  410. aa2 += 4;
  411. bb1 += 4;
  412. bb2 += 4;
  413. cc1 += 4 * m;
  414. cc2 += 4 * m;
  415. is = ((m - js - 2) >> 1);
  416. while (is > 0){
  417. a11 = *(aa1 + 0);
  418. a21 = *(aa1 + 1);
  419. a31 = *(aa1 + 2);
  420. a41 = *(aa1 + 3);
  421. a12 = *(aa2 + 0);
  422. a22 = *(aa2 + 1);
  423. a32 = *(aa2 + 2);
  424. a42 = *(aa2 + 3);
  425. aa1 += 4;
  426. aa2 += 4;
  427. *(bb1 + 0) = a11;
  428. *(bb1 + 1) = a21;
  429. *(bb1 + 2) = a31;
  430. *(bb1 + 3) = a41;
  431. *(bb2 + 0) = a12;
  432. *(bb2 + 1) = a22;
  433. *(bb2 + 2) = a32;
  434. *(bb2 + 3) = a42;
  435. *(cc1 + 0) = a11;
  436. *(cc1 + 1) = -a21;
  437. *(cc1 + 2) = a12;
  438. *(cc1 + 3) = -a22;
  439. *(cc2 + 0) = a31;
  440. *(cc2 + 1) = -a41;
  441. *(cc2 + 2) = a32;
  442. *(cc2 + 3) = -a42;
  443. bb1 += 4;
  444. bb2 += 4;
  445. cc1 += 4 * m;
  446. cc2 += 4 * m;
  447. is --;
  448. }
  449. if (m & 1){
  450. a11 = *(aa1 + 0);
  451. a21 = *(aa1 + 1);
  452. a12 = *(aa2 + 0);
  453. a22 = *(aa2 + 1);
  454. *(bb1 + 0) = a11;
  455. *(bb1 + 1) = a21;
  456. *(bb2 + 0) = a12;
  457. *(bb2 + 1) = a22;
  458. *(cc1 + 0) = a11;
  459. *(cc1 + 1) = -a21;
  460. *(cc1 + 2) = a12;
  461. *(cc1 + 3) = -a22;
  462. }
  463. }
  464. if (m - js == 1){
  465. a11 = *(aa1 + 0);
  466. *(bb1 + 0) = a11;
  467. *(bb1 + 1) = 0.;
  468. }
  469. }
  470. }
  471. static inline void ZHEMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  472. BLASLONG is, js;
  473. FLOAT *aa1, *aa2;
  474. FLOAT *b1, *b2;
  475. FLOAT *bb1, *bb2;
  476. FLOAT *cc1, *cc2;
  477. FLOAT a11, a21, a31, a41;
  478. FLOAT a12, a22, a32, a42;
  479. b1 = b;
  480. b2 = b;
  481. lda *= 2;
  482. for (js = 0; js < m; js += 2){
  483. aa1 = a + 0 * lda;
  484. aa2 = a + 1 * lda;
  485. a += 2 * lda;
  486. bb1 = b1 + 0 * m;
  487. bb2 = b1 + 2 * m;
  488. b1 += 4 * m;
  489. cc1 = b2 + 0 * m;
  490. cc2 = b2 + 2 * m;
  491. b2 += 4;
  492. if (m - js >= 2){
  493. for (is = 0; is < js; is += 2){
  494. a11 = *(aa1 + 0);
  495. a21 = *(aa1 + 1);
  496. a31 = *(aa1 + 2);
  497. a41 = *(aa1 + 3);
  498. a12 = *(aa2 + 0);
  499. a22 = *(aa2 + 1);
  500. a32 = *(aa2 + 2);
  501. a42 = *(aa2 + 3);
  502. aa1 += 4;
  503. aa2 += 4;
  504. *(bb1 + 0) = a11;
  505. *(bb1 + 1) = a21;
  506. *(bb1 + 2) = a31;
  507. *(bb1 + 3) = a41;
  508. *(bb2 + 0) = a12;
  509. *(bb2 + 1) = a22;
  510. *(bb2 + 2) = a32;
  511. *(bb2 + 3) = a42;
  512. *(cc1 + 0) = a11;
  513. *(cc1 + 1) = -a21;
  514. *(cc1 + 2) = a12;
  515. *(cc1 + 3) = -a22;
  516. *(cc2 + 0) = a31;
  517. *(cc2 + 1) = -a41;
  518. *(cc2 + 2) = a32;
  519. *(cc2 + 3) = -a42;
  520. bb1 += 4;
  521. bb2 += 4;
  522. cc1 += 4 * m;
  523. cc2 += 4 * m;
  524. }
  525. a11 = *(aa1 + 0);
  526. a12 = *(aa2 + 0);
  527. a22 = *(aa2 + 1);
  528. a32 = *(aa2 + 2);
  529. *(bb1 + 0) = a11;
  530. *(bb1 + 1) = 0.;
  531. *(bb1 + 2) = a12;
  532. *(bb1 + 3) = -a22;
  533. *(bb2 + 0) = a12;
  534. *(bb2 + 1) = a22;
  535. *(bb2 + 2) = a32;
  536. *(bb2 + 3) = 0.;
  537. }
  538. if (m - js == 1){
  539. for (is = 0; is < js; is += 2){
  540. a11 = *(aa1 + 0);
  541. a21 = *(aa1 + 1);
  542. a31 = *(aa1 + 2);
  543. a41 = *(aa1 + 3);
  544. aa1 += 4;
  545. *(bb1 + 0) = a11;
  546. *(bb1 + 1) = a21;
  547. *(bb1 + 2) = a31;
  548. *(bb1 + 3) = a41;
  549. *(cc1 + 0) = a11;
  550. *(cc1 + 1) = -a21;
  551. *(cc2 + 0) = a31;
  552. *(cc2 + 1) = -a41;
  553. bb1 += 4;
  554. cc1 += 4 * m;
  555. cc2 += 4 * m;
  556. }
  557. a11 = *(aa1 + 0);
  558. *(bb1 + 0) = a11;
  559. *(bb1 + 1) = 0.;
  560. }
  561. }
  562. }
  563. static inline void ZHEMCOPY_M(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  564. BLASLONG is, js;
  565. FLOAT *aa1, *aa2;
  566. FLOAT *b1, *b2;
  567. FLOAT *bb1, *bb2;
  568. FLOAT *cc1, *cc2;
  569. FLOAT a11, a21, a31, a41;
  570. FLOAT a12, a22, a32, a42;
  571. b1 = b;
  572. b2 = b;
  573. lda *= 2;
  574. for (js = 0; js < m; js += 2){
  575. aa1 = a + 0 * lda;
  576. aa2 = a + 1 * lda;
  577. a += 2 * lda + 4;
  578. bb1 = b1 + 0 * m;
  579. bb2 = b1 + 2 * m;
  580. b1 += 4 * m + 4;
  581. cc1 = b2 + 0 * m;
  582. cc2 = b2 + 2 * m;
  583. b2 += 4 * m + 4;
  584. if (m - js >= 2){
  585. a11 = *(aa1 + 0);
  586. a31 = *(aa1 + 2);
  587. a41 = *(aa1 + 3);
  588. a12 = *(aa2 + 2);
  589. *(bb1 + 0) = a11;
  590. *(bb1 + 1) = 0.;
  591. *(bb1 + 2) = a31;
  592. *(bb1 + 3) = -a41;
  593. *(bb2 + 0) = a31;
  594. *(bb2 + 1) = a41;
  595. *(bb2 + 2) = a12;
  596. *(bb2 + 3) = 0.;
  597. aa1 += 4;
  598. aa2 += 4;
  599. bb1 += 4;
  600. bb2 += 4;
  601. cc1 += 4 * m;
  602. cc2 += 4 * m;
  603. is = ((m - js - 2) >> 1);
  604. while (is > 0){
  605. a11 = *(aa1 + 0);
  606. a21 = *(aa1 + 1);
  607. a31 = *(aa1 + 2);
  608. a41 = *(aa1 + 3);
  609. a12 = *(aa2 + 0);
  610. a22 = *(aa2 + 1);
  611. a32 = *(aa2 + 2);
  612. a42 = *(aa2 + 3);
  613. aa1 += 4;
  614. aa2 += 4;
  615. *(bb1 + 0) = a11;
  616. *(bb1 + 1) = -a21;
  617. *(bb1 + 2) = a31;
  618. *(bb1 + 3) = -a41;
  619. *(bb2 + 0) = a12;
  620. *(bb2 + 1) = -a22;
  621. *(bb2 + 2) = a32;
  622. *(bb2 + 3) = -a42;
  623. *(cc1 + 0) = a11;
  624. *(cc1 + 1) = a21;
  625. *(cc1 + 2) = a12;
  626. *(cc1 + 3) = a22;
  627. *(cc2 + 0) = a31;
  628. *(cc2 + 1) = a41;
  629. *(cc2 + 2) = a32;
  630. *(cc2 + 3) = a42;
  631. bb1 += 4;
  632. bb2 += 4;
  633. cc1 += 4 * m;
  634. cc2 += 4 * m;
  635. is --;
  636. }
  637. if (m & 1){
  638. a11 = *(aa1 + 0);
  639. a21 = *(aa1 + 1);
  640. a12 = *(aa2 + 0);
  641. a22 = *(aa2 + 1);
  642. *(bb1 + 0) = a11;
  643. *(bb1 + 1) = -a21;
  644. *(bb2 + 0) = a12;
  645. *(bb2 + 1) = -a22;
  646. *(cc1 + 0) = a11;
  647. *(cc1 + 1) = a21;
  648. *(cc1 + 2) = a12;
  649. *(cc1 + 3) = a22;
  650. }
  651. }
  652. if (m - js == 1){
  653. a11 = *(aa1 + 0);
  654. *(bb1 + 0) = a11;
  655. *(bb1 + 1) = 0.;
  656. }
  657. }
  658. }
  659. static inline void ZHEMCOPY_V(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  660. BLASLONG is, js;
  661. FLOAT *aa1, *aa2;
  662. FLOAT *b1, *b2;
  663. FLOAT *bb1, *bb2;
  664. FLOAT *cc1, *cc2;
  665. FLOAT a11, a21, a31, a41;
  666. FLOAT a12, a22, a32, a42;
  667. b1 = b;
  668. b2 = b;
  669. lda *= 2;
  670. for (js = 0; js < m; js += 2){
  671. aa1 = a + 0 * lda;
  672. aa2 = a + 1 * lda;
  673. a += 2 * lda;
  674. bb1 = b1 + 0 * m;
  675. bb2 = b1 + 2 * m;
  676. b1 += 4 * m;
  677. cc1 = b2 + 0 * m;
  678. cc2 = b2 + 2 * m;
  679. b2 += 4;
  680. if (m - js >= 2){
  681. for (is = 0; is < js; is += 2){
  682. a11 = *(aa1 + 0);
  683. a21 = *(aa1 + 1);
  684. a31 = *(aa1 + 2);
  685. a41 = *(aa1 + 3);
  686. a12 = *(aa2 + 0);
  687. a22 = *(aa2 + 1);
  688. a32 = *(aa2 + 2);
  689. a42 = *(aa2 + 3);
  690. aa1 += 4;
  691. aa2 += 4;
  692. *(bb1 + 0) = a11;
  693. *(bb1 + 1) = -a21;
  694. *(bb1 + 2) = a31;
  695. *(bb1 + 3) = -a41;
  696. *(bb2 + 0) = a12;
  697. *(bb2 + 1) = -a22;
  698. *(bb2 + 2) = a32;
  699. *(bb2 + 3) = -a42;
  700. *(cc1 + 0) = a11;
  701. *(cc1 + 1) = a21;
  702. *(cc1 + 2) = a12;
  703. *(cc1 + 3) = a22;
  704. *(cc2 + 0) = a31;
  705. *(cc2 + 1) = a41;
  706. *(cc2 + 2) = a32;
  707. *(cc2 + 3) = a42;
  708. bb1 += 4;
  709. bb2 += 4;
  710. cc1 += 4 * m;
  711. cc2 += 4 * m;
  712. }
  713. a11 = *(aa1 + 0);
  714. a12 = *(aa2 + 0);
  715. a22 = *(aa2 + 1);
  716. a32 = *(aa2 + 2);
  717. *(bb1 + 0) = a11;
  718. *(bb1 + 1) = 0.;
  719. *(bb1 + 2) = a12;
  720. *(bb1 + 3) = a22;
  721. *(bb2 + 0) = a12;
  722. *(bb2 + 1) = -a22;
  723. *(bb2 + 2) = a32;
  724. *(bb2 + 3) = 0.;
  725. }
  726. if (m - js == 1){
  727. for (is = 0; is < js; is += 2){
  728. a11 = *(aa1 + 0);
  729. a21 = *(aa1 + 1);
  730. a31 = *(aa1 + 2);
  731. a41 = *(aa1 + 3);
  732. aa1 += 4;
  733. *(bb1 + 0) = a11;
  734. *(bb1 + 1) = -a21;
  735. *(bb1 + 2) = a31;
  736. *(bb1 + 3) = -a41;
  737. *(cc1 + 0) = a11;
  738. *(cc1 + 1) = a21;
  739. *(cc2 + 0) = a31;
  740. *(cc2 + 1) = a41;
  741. bb1 += 4;
  742. cc1 += 4 * m;
  743. cc2 += 4 * m;
  744. }
  745. a11 = *(aa1 + 0);
  746. *(bb1 + 0) = a11;
  747. *(bb1 + 1) = 0.;
  748. }
  749. }
  750. }
  751. static inline void TRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  752. BLASLONG is, js;
  753. FLOAT *aa1, *aa2;
  754. FLOAT *b1, *b2;
  755. FLOAT *bb1, *bb2;
  756. FLOAT *cc1, *cc2;
  757. FLOAT a11, a12;
  758. FLOAT a21, a22;
  759. b1 = b;
  760. b2 = b;
  761. for (js = 0; js < m; js += 2){
  762. aa1 = a + 0 * lda;
  763. aa2 = a + 1 * lda;
  764. a += 2 * lda + 2;
  765. bb1 = b1 + 0 * m;
  766. bb2 = b1 + 1 * m;
  767. b1 += 2 * m + 2;
  768. cc1 = b2 + 0 * m;
  769. cc2 = b2 + 1 * m;
  770. b2 += 2 * m + 2;
  771. if (m - js >= 2){
  772. a11 = *(aa1 + 0);
  773. a21 = *(aa1 + 1);
  774. a22 = *(aa2 + 1);
  775. *(bb1 + 0) = a11;
  776. *(bb1 + 1) = a21;
  777. *(bb2 + 0) = a21;
  778. *(bb2 + 1) = a22;
  779. aa1 += 2;
  780. aa2 += 2;
  781. bb1 += 2;
  782. bb2 += 2;
  783. cc1 += 2 * m;
  784. cc2 += 2 * m;
  785. is = ((m - js - 2) >> 1);
  786. while (is > 0){
  787. a11 = *(aa1 + 0);
  788. a21 = *(aa1 + 1);
  789. a12 = *(aa2 + 0);
  790. a22 = *(aa2 + 1);
  791. aa1 += 2;
  792. aa2 += 2;
  793. *(bb1 + 0) = a11;
  794. *(bb1 + 1) = a21;
  795. *(bb2 + 0) = a12;
  796. *(bb2 + 1) = a22;
  797. *(cc1 + 0) = a11;
  798. *(cc1 + 1) = a12;
  799. *(cc2 + 0) = a21;
  800. *(cc2 + 1) = a22;
  801. bb1 += 2;
  802. bb2 += 2;
  803. cc1 += 2 * m;
  804. cc2 += 2 * m;
  805. is --;
  806. }
  807. is = ((m - js - 2) & 1);
  808. if (is == 1){
  809. a11 = *(aa1 + 0);
  810. a12 = *(aa2 + 0);
  811. *(bb1 + 0) = a11;
  812. *(bb2 + 0) = a12;
  813. *(cc1 + 0) = a11;
  814. *(cc1 + 1) = a12;
  815. }
  816. }
  817. if (m - js == 1){
  818. a11 = *(aa1 + 0);
  819. *(bb1 + 0) = a11;
  820. }
  821. }
  822. }
  823. static inline void TRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  824. BLASLONG is, js;
  825. FLOAT *aa1, *aa2;
  826. FLOAT *b1, *b2;
  827. FLOAT *bb1, *bb2;
  828. FLOAT *cc1, *cc2;
  829. FLOAT a11, a12;
  830. FLOAT a21, a22;
  831. b1 = b;
  832. b2 = b;
  833. for (js = 0; js < m; js += 2){
  834. aa1 = a + 0 * lda;
  835. aa2 = a + 1 * lda;
  836. a += 2 * lda + 2;
  837. bb1 = b1 + 0 * m;
  838. bb2 = b1 + 1 * m;
  839. b1 += 2 * m + 2;
  840. cc1 = b2 + 0 * m;
  841. cc2 = b2 + 1 * m;
  842. b2 += 2 * m + 2;
  843. if (m - js >= 2){
  844. a11 = *(aa1 + 0);
  845. a21 = *(aa1 + 1);
  846. a22 = *(aa2 + 1);
  847. *(bb1 + 0) = a11;
  848. *(bb1 + 1) = a21;
  849. *(bb2 + 0) = a21;
  850. *(bb2 + 1) = a22;
  851. aa1 += 2;
  852. aa2 += 2;
  853. bb1 += 2;
  854. bb2 += 2;
  855. cc1 += 2 * m;
  856. cc2 += 2 * m;
  857. is = ((m - js - 2) >> 1);
  858. while (is > 0){
  859. a11 = *(aa1 + 0);
  860. a21 = *(aa1 + 1);
  861. a12 = *(aa2 + 0);
  862. a22 = *(aa2 + 1);
  863. aa1 += 2;
  864. aa2 += 2;
  865. *(bb1 + 0) = a11;
  866. *(bb1 + 1) = a21;
  867. *(bb2 + 0) = a12;
  868. *(bb2 + 1) = a22;
  869. *(cc1 + 0) = a11;
  870. *(cc1 + 1) = a12;
  871. *(cc2 + 0) = a21;
  872. *(cc2 + 1) = a22;
  873. bb1 += 2;
  874. bb2 += 2;
  875. cc1 += 2 * m;
  876. cc2 += 2 * m;
  877. is --;
  878. }
  879. is = ((m - js - 2) & 1);
  880. if (is == 1){
  881. a11 = *(aa1 + 0);
  882. a12 = *(aa2 + 0);
  883. *(bb1 + 0) = a11;
  884. *(bb2 + 0) = a12;
  885. *(cc1 + 0) = a11;
  886. *(cc1 + 1) = a12;
  887. }
  888. }
  889. if (m - js == 1){
  890. a11 = *(aa1 + 0);
  891. *(bb1 + 0) = a11;
  892. }
  893. }
  894. }
  895. static inline void TRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  896. BLASLONG is, js;
  897. FLOAT *aa1, *aa2;
  898. FLOAT *b1, *b2;
  899. FLOAT *bb1, *bb2;
  900. FLOAT *cc1, *cc2;
  901. FLOAT a11, a12;
  902. FLOAT a21, a22;
  903. b1 = b;
  904. b2 = b;
  905. for (js = 0; js < m; js += 2){
  906. aa1 = a + 0 * lda;
  907. aa2 = a + 1 * lda;
  908. a += 2 * lda;
  909. bb1 = b1 + 0 * m;
  910. bb2 = b1 + 1 * m;
  911. b1 += 2 * m;
  912. cc1 = b2 + 0 * m;
  913. cc2 = b2 + 1 * m;
  914. b2 += 2;
  915. if (m - js >= 2){
  916. for (is = 0; is < js; is += 2){
  917. a11 = *(aa1 + 0);
  918. a21 = *(aa1 + 1);
  919. a12 = *(aa2 + 0);
  920. a22 = *(aa2 + 1);
  921. aa1 += 2;
  922. aa2 += 2;
  923. *(bb1 + 0) = a11;
  924. *(bb1 + 1) = a21;
  925. *(bb2 + 0) = a12;
  926. *(bb2 + 1) = a22;
  927. *(cc1 + 0) = a11;
  928. *(cc1 + 1) = a12;
  929. *(cc2 + 0) = a21;
  930. *(cc2 + 1) = a22;
  931. bb1 += 2;
  932. bb2 += 2;
  933. cc1 += 2 * m;
  934. cc2 += 2 * m;
  935. }
  936. a11 = *(aa1 + 0);
  937. a12 = *(aa2 + 0);
  938. a22 = *(aa2 + 1);
  939. *(bb1 + 0) = a11;
  940. *(bb1 + 1) = a12;
  941. *(bb2 + 0) = a12;
  942. *(bb2 + 1) = a22;
  943. }
  944. if (m - js == 1){
  945. for (is = 0; is < js; is += 2){
  946. a11 = *(aa1 + 0);
  947. a21 = *(aa1 + 1);
  948. aa1 += 2;
  949. *(bb1 + 0) = a11;
  950. *(bb1 + 1) = a21;
  951. *(cc1 + 0) = a11;
  952. *(cc2 + 0) = a21;
  953. bb1 += 2;
  954. cc1 += 2 * m;
  955. cc2 += 2 * m;
  956. }
  957. a11 = *(aa1 + 0);
  958. *(bb1 + 0) = a11;
  959. }
  960. }
  961. }
  962. static inline void TRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  963. BLASLONG is, js;
  964. FLOAT *aa1, *aa2;
  965. FLOAT *b1, *b2;
  966. FLOAT *bb1, *bb2;
  967. FLOAT *cc1, *cc2;
  968. FLOAT a11, a12;
  969. FLOAT a21, a22;
  970. b1 = b;
  971. b2 = b;
  972. for (js = 0; js < m; js += 2){
  973. aa1 = a + 0 * lda;
  974. aa2 = a + 1 * lda;
  975. a += 2 * lda;
  976. bb1 = b1 + 0 * m;
  977. bb2 = b1 + 1 * m;
  978. b1 += 2 * m;
  979. cc1 = b2 + 0 * m;
  980. cc2 = b2 + 1 * m;
  981. b2 += 2;
  982. if (m - js >= 2){
  983. for (is = 0; is < js; is += 2){
  984. a11 = *(aa1 + 0);
  985. a21 = *(aa1 + 1);
  986. a12 = *(aa2 + 0);
  987. a22 = *(aa2 + 1);
  988. aa1 += 2;
  989. aa2 += 2;
  990. *(bb1 + 0) = a11;
  991. *(bb1 + 1) = a21;
  992. *(bb2 + 0) = a12;
  993. *(bb2 + 1) = a22;
  994. *(cc1 + 0) = a11;
  995. *(cc1 + 1) = a12;
  996. *(cc2 + 0) = a21;
  997. *(cc2 + 1) = a22;
  998. bb1 += 2;
  999. bb2 += 2;
  1000. cc1 += 2 * m;
  1001. cc2 += 2 * m;
  1002. }
  1003. a11 = *(aa1 + 0);
  1004. a12 = *(aa2 + 0);
  1005. a22 = *(aa2 + 1);
  1006. *(bb1 + 0) = a11;
  1007. *(bb1 + 1) = a12;
  1008. *(bb2 + 0) = a12;
  1009. *(bb2 + 1) = a22;
  1010. }
  1011. if (m - js == 1){
  1012. for (is = 0; is < js; is += 2){
  1013. a11 = *(aa1 + 0);
  1014. a21 = *(aa1 + 1);
  1015. aa1 += 2;
  1016. *(bb1 + 0) = a11;
  1017. *(bb1 + 1) = a21;
  1018. *(cc1 + 0) = a11;
  1019. *(cc2 + 0) = a21;
  1020. bb1 += 2;
  1021. cc1 += 2 * m;
  1022. cc2 += 2 * m;
  1023. }
  1024. a11 = *(aa1 + 0);
  1025. *(bb1 + 0) = a11;
  1026. }
  1027. }
  1028. }
  1029. static inline void ZTRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  1030. BLASLONG is, js;
  1031. FLOAT *aa1, *aa2;
  1032. FLOAT *b1, *b2;
  1033. FLOAT *bb1, *bb2;
  1034. FLOAT *cc1, *cc2;
  1035. FLOAT a11, a21, a31, a41;
  1036. FLOAT a12, a22, a32, a42;
  1037. b1 = b;
  1038. b2 = b;
  1039. lda *= 2;
  1040. for (js = 0; js < m; js += 2){
  1041. aa1 = a + 0 * lda;
  1042. aa2 = a + 1 * lda;
  1043. a += 2 * lda + 4;
  1044. bb1 = b1 + 0 * m;
  1045. bb2 = b1 + 2 * m;
  1046. b1 += 4 * m + 4;
  1047. cc1 = b2 + 0 * m;
  1048. cc2 = b2 + 2 * m;
  1049. b2 += 4 * m + 4;
  1050. if (m - js >= 2){
  1051. a11 = *(aa1 + 0);
  1052. a21 = *(aa1 + 1);
  1053. a31 = *(aa1 + 2);
  1054. a41 = *(aa1 + 3);
  1055. a12 = *(aa2 + 2);
  1056. a22 = *(aa2 + 3);
  1057. *(bb1 + 0) = a11;
  1058. *(bb1 + 1) = a21;
  1059. *(bb1 + 2) = a31;
  1060. *(bb1 + 3) = a41;
  1061. *(bb2 + 0) = a31;
  1062. *(bb2 + 1) = a41;
  1063. *(bb2 + 2) = a12;
  1064. *(bb2 + 3) = a22;
  1065. aa1 += 4;
  1066. aa2 += 4;
  1067. bb1 += 4;
  1068. bb2 += 4;
  1069. cc1 += 4 * m;
  1070. cc2 += 4 * m;
  1071. is = ((m - js - 2) >> 1);
  1072. while (is > 0){
  1073. a11 = *(aa1 + 0);
  1074. a21 = *(aa1 + 1);
  1075. a31 = *(aa1 + 2);
  1076. a41 = *(aa1 + 3);
  1077. a12 = *(aa2 + 0);
  1078. a22 = *(aa2 + 1);
  1079. a32 = *(aa2 + 2);
  1080. a42 = *(aa2 + 3);
  1081. aa1 += 4;
  1082. aa2 += 4;
  1083. *(bb1 + 0) = a11;
  1084. *(bb1 + 1) = a21;
  1085. *(bb1 + 2) = a31;
  1086. *(bb1 + 3) = a41;
  1087. *(bb2 + 0) = a12;
  1088. *(bb2 + 1) = a22;
  1089. *(bb2 + 2) = a32;
  1090. *(bb2 + 3) = a42;
  1091. *(cc1 + 0) = a11;
  1092. *(cc1 + 1) = a21;
  1093. *(cc1 + 2) = a12;
  1094. *(cc1 + 3) = a22;
  1095. *(cc2 + 0) = a31;
  1096. *(cc2 + 1) = a41;
  1097. *(cc2 + 2) = a32;
  1098. *(cc2 + 3) = a42;
  1099. bb1 += 4;
  1100. bb2 += 4;
  1101. cc1 += 4 * m;
  1102. cc2 += 4 * m;
  1103. is --;
  1104. }
  1105. if (m & 1){
  1106. a11 = *(aa1 + 0);
  1107. a21 = *(aa1 + 1);
  1108. a12 = *(aa2 + 0);
  1109. a22 = *(aa2 + 1);
  1110. *(bb1 + 0) = a11;
  1111. *(bb1 + 1) = a21;
  1112. *(bb2 + 0) = a12;
  1113. *(bb2 + 1) = a22;
  1114. *(cc1 + 0) = a11;
  1115. *(cc1 + 1) = a21;
  1116. *(cc1 + 2) = a12;
  1117. *(cc1 + 3) = a22;
  1118. }
  1119. }
  1120. if (m - js == 1){
  1121. a11 = *(aa1 + 0);
  1122. a21 = *(aa1 + 1);
  1123. *(bb1 + 0) = a11;
  1124. *(bb1 + 1) = a21;
  1125. }
  1126. }
  1127. }
  1128. static inline void ZTRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  1129. BLASLONG is, js;
  1130. FLOAT *aa1, *aa2;
  1131. FLOAT *b1, *b2;
  1132. FLOAT *bb1, *bb2;
  1133. FLOAT *cc1, *cc2;
  1134. FLOAT a11, a21, a31, a41;
  1135. FLOAT a12, a22, a32, a42;
  1136. b1 = b;
  1137. b2 = b;
  1138. lda *= 2;
  1139. for (js = 0; js < m; js += 2){
  1140. aa1 = a + 0 * lda;
  1141. aa2 = a + 1 * lda;
  1142. a += 2 * lda + 4;
  1143. bb1 = b1 + 0 * m;
  1144. bb2 = b1 + 2 * m;
  1145. b1 += 4 * m + 4;
  1146. cc1 = b2 + 0 * m;
  1147. cc2 = b2 + 2 * m;
  1148. b2 += 4 * m + 4;
  1149. if (m - js >= 2){
  1150. a11 = *(aa1 + 0);
  1151. a21 = *(aa1 + 1);
  1152. a31 = *(aa1 + 2);
  1153. a41 = *(aa1 + 3);
  1154. a12 = *(aa2 + 2);
  1155. a22 = *(aa2 + 3);
  1156. *(bb1 + 0) = a11;
  1157. *(bb1 + 1) = a21;
  1158. *(bb1 + 2) = a31;
  1159. *(bb1 + 3) = a41;
  1160. *(bb2 + 0) = a31;
  1161. *(bb2 + 1) = a41;
  1162. *(bb2 + 2) = a12;
  1163. *(bb2 + 3) = a22;
  1164. aa1 += 4;
  1165. aa2 += 4;
  1166. bb1 += 4;
  1167. bb2 += 4;
  1168. cc1 += 4 * m;
  1169. cc2 += 4 * m;
  1170. is = ((m - js - 2) >> 1);
  1171. while (is > 0){
  1172. a11 = *(aa1 + 0);
  1173. a21 = *(aa1 + 1);
  1174. a31 = *(aa1 + 2);
  1175. a41 = *(aa1 + 3);
  1176. a12 = *(aa2 + 0);
  1177. a22 = *(aa2 + 1);
  1178. a32 = *(aa2 + 2);
  1179. a42 = *(aa2 + 3);
  1180. aa1 += 4;
  1181. aa2 += 4;
  1182. *(bb1 + 0) = a11;
  1183. *(bb1 + 1) = a21;
  1184. *(bb1 + 2) = a31;
  1185. *(bb1 + 3) = a41;
  1186. *(bb2 + 0) = a12;
  1187. *(bb2 + 1) = a22;
  1188. *(bb2 + 2) = a32;
  1189. *(bb2 + 3) = a42;
  1190. *(cc1 + 0) = a11;
  1191. *(cc1 + 1) = a21;
  1192. *(cc1 + 2) = a12;
  1193. *(cc1 + 3) = a22;
  1194. *(cc2 + 0) = a31;
  1195. *(cc2 + 1) = a41;
  1196. *(cc2 + 2) = a32;
  1197. *(cc2 + 3) = a42;
  1198. bb1 += 4;
  1199. bb2 += 4;
  1200. cc1 += 4 * m;
  1201. cc2 += 4 * m;
  1202. is --;
  1203. }
  1204. if (m & 1){
  1205. a11 = *(aa1 + 0);
  1206. a21 = *(aa1 + 1);
  1207. a12 = *(aa2 + 0);
  1208. a22 = *(aa2 + 1);
  1209. *(bb1 + 0) = a11;
  1210. *(bb1 + 1) = a21;
  1211. *(bb2 + 0) = a12;
  1212. *(bb2 + 1) = a22;
  1213. *(cc1 + 0) = a11;
  1214. *(cc1 + 1) = a21;
  1215. *(cc1 + 2) = a12;
  1216. *(cc1 + 3) = a22;
  1217. }
  1218. }
  1219. if (m - js == 1){
  1220. a11 = *(aa1 + 0);
  1221. a21 = *(aa1 + 1);
  1222. *(bb1 + 0) = a11;
  1223. *(bb1 + 1) = a21;
  1224. }
  1225. }
  1226. }
  1227. static inline void ZTRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  1228. BLASLONG is, js;
  1229. FLOAT *aa1, *aa2;
  1230. FLOAT *b1, *b2;
  1231. FLOAT *bb1, *bb2;
  1232. FLOAT *cc1, *cc2;
  1233. FLOAT a11, a21, a31, a41;
  1234. FLOAT a12, a22, a32, a42;
  1235. b1 = b;
  1236. b2 = b;
  1237. lda *= 2;
  1238. for (js = 0; js < m; js += 2){
  1239. aa1 = a + 0 * lda;
  1240. aa2 = a + 1 * lda;
  1241. a += 2 * lda;
  1242. bb1 = b1 + 0 * m;
  1243. bb2 = b1 + 2 * m;
  1244. b1 += 4 * m;
  1245. cc1 = b2 + 0 * m;
  1246. cc2 = b2 + 2 * m;
  1247. b2 += 4;
  1248. if (m - js >= 2){
  1249. for (is = 0; is < js; is += 2){
  1250. a11 = *(aa1 + 0);
  1251. a21 = *(aa1 + 1);
  1252. a31 = *(aa1 + 2);
  1253. a41 = *(aa1 + 3);
  1254. a12 = *(aa2 + 0);
  1255. a22 = *(aa2 + 1);
  1256. a32 = *(aa2 + 2);
  1257. a42 = *(aa2 + 3);
  1258. aa1 += 4;
  1259. aa2 += 4;
  1260. *(bb1 + 0) = a11;
  1261. *(bb1 + 1) = a21;
  1262. *(bb1 + 2) = a31;
  1263. *(bb1 + 3) = a41;
  1264. *(bb2 + 0) = a12;
  1265. *(bb2 + 1) = a22;
  1266. *(bb2 + 2) = a32;
  1267. *(bb2 + 3) = a42;
  1268. *(cc1 + 0) = a11;
  1269. *(cc1 + 1) = a21;
  1270. *(cc1 + 2) = a12;
  1271. *(cc1 + 3) = a22;
  1272. *(cc2 + 0) = a31;
  1273. *(cc2 + 1) = a41;
  1274. *(cc2 + 2) = a32;
  1275. *(cc2 + 3) = a42;
  1276. bb1 += 4;
  1277. bb2 += 4;
  1278. cc1 += 4 * m;
  1279. cc2 += 4 * m;
  1280. }
  1281. a11 = *(aa1 + 0);
  1282. a21 = *(aa1 + 1);
  1283. a12 = *(aa2 + 0);
  1284. a22 = *(aa2 + 1);
  1285. a32 = *(aa2 + 2);
  1286. a42 = *(aa2 + 3);
  1287. *(bb1 + 0) = a11;
  1288. *(bb1 + 1) = a21;
  1289. *(bb1 + 2) = a12;
  1290. *(bb1 + 3) = a22;
  1291. *(bb2 + 0) = a12;
  1292. *(bb2 + 1) = a22;
  1293. *(bb2 + 2) = a32;
  1294. *(bb2 + 3) = a42;
  1295. }
  1296. if (m - js == 1){
  1297. for (is = 0; is < js; is += 2){
  1298. a11 = *(aa1 + 0);
  1299. a21 = *(aa1 + 1);
  1300. a31 = *(aa1 + 2);
  1301. a41 = *(aa1 + 3);
  1302. aa1 += 4;
  1303. *(bb1 + 0) = a11;
  1304. *(bb1 + 1) = a21;
  1305. *(bb1 + 2) = a31;
  1306. *(bb1 + 3) = a41;
  1307. *(cc1 + 0) = a11;
  1308. *(cc1 + 1) = a21;
  1309. *(cc2 + 0) = a31;
  1310. *(cc2 + 1) = a41;
  1311. bb1 += 4;
  1312. cc1 += 4 * m;
  1313. cc2 += 4 * m;
  1314. }
  1315. a11 = *(aa1 + 0);
  1316. a21 = *(aa1 + 1);
  1317. *(bb1 + 0) = a11;
  1318. *(bb1 + 1) = a21;
  1319. }
  1320. }
  1321. }
  1322. static inline void ZTRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){
  1323. BLASLONG is, js;
  1324. FLOAT *aa1, *aa2;
  1325. FLOAT *b1, *b2;
  1326. FLOAT *bb1, *bb2;
  1327. FLOAT *cc1, *cc2;
  1328. FLOAT a11, a21, a31, a41;
  1329. FLOAT a12, a22, a32, a42;
  1330. b1 = b;
  1331. b2 = b;
  1332. lda *= 2;
  1333. for (js = 0; js < m; js += 2){
  1334. aa1 = a + 0 * lda;
  1335. aa2 = a + 1 * lda;
  1336. a += 2 * lda;
  1337. bb1 = b1 + 0 * m;
  1338. bb2 = b1 + 2 * m;
  1339. b1 += 4 * m;
  1340. cc1 = b2 + 0 * m;
  1341. cc2 = b2 + 2 * m;
  1342. b2 += 4;
  1343. if (m - js >= 2){
  1344. for (is = 0; is < js; is += 2){
  1345. a11 = *(aa1 + 0);
  1346. a21 = *(aa1 + 1);
  1347. a31 = *(aa1 + 2);
  1348. a41 = *(aa1 + 3);
  1349. a12 = *(aa2 + 0);
  1350. a22 = *(aa2 + 1);
  1351. a32 = *(aa2 + 2);
  1352. a42 = *(aa2 + 3);
  1353. aa1 += 4;
  1354. aa2 += 4;
  1355. *(bb1 + 0) = a11;
  1356. *(bb1 + 1) = a21;
  1357. *(bb1 + 2) = a31;
  1358. *(bb1 + 3) = a41;
  1359. *(bb2 + 0) = a12;
  1360. *(bb2 + 1) = a22;
  1361. *(bb2 + 2) = a32;
  1362. *(bb2 + 3) = a42;
  1363. *(cc1 + 0) = a11;
  1364. *(cc1 + 1) = a21;
  1365. *(cc1 + 2) = a12;
  1366. *(cc1 + 3) = a22;
  1367. *(cc2 + 0) = a31;
  1368. *(cc2 + 1) = a41;
  1369. *(cc2 + 2) = a32;
  1370. *(cc2 + 3) = a42;
  1371. bb1 += 4;
  1372. bb2 += 4;
  1373. cc1 += 4 * m;
  1374. cc2 += 4 * m;
  1375. }
  1376. a11 = *(aa1 + 0);
  1377. a21 = *(aa1 + 1);
  1378. a12 = *(aa2 + 0);
  1379. a22 = *(aa2 + 1);
  1380. a32 = *(aa2 + 2);
  1381. a42 = *(aa2 + 3);
  1382. *(bb1 + 0) = a11;
  1383. *(bb1 + 1) = a21;
  1384. *(bb1 + 2) = a12;
  1385. *(bb1 + 3) = a22;
  1386. *(bb2 + 0) = a12;
  1387. *(bb2 + 1) = a22;
  1388. *(bb2 + 2) = a32;
  1389. *(bb2 + 3) = a42;
  1390. }
  1391. if (m - js == 1){
  1392. for (is = 0; is < js; is += 2){
  1393. a11 = *(aa1 + 0);
  1394. a21 = *(aa1 + 1);
  1395. a31 = *(aa1 + 2);
  1396. a41 = *(aa1 + 3);
  1397. aa1 += 4;
  1398. *(bb1 + 0) = a11;
  1399. *(bb1 + 1) = a21;
  1400. *(bb1 + 2) = a31;
  1401. *(bb1 + 3) = a41;
  1402. *(cc1 + 0) = a11;
  1403. *(cc1 + 1) = a21;
  1404. *(cc2 + 0) = a31;
  1405. *(cc2 + 1) = a41;
  1406. bb1 += 4;
  1407. cc1 += 4 * m;
  1408. cc2 += 4 * m;
  1409. }
  1410. a11 = *(aa1 + 0);
  1411. a21 = *(aa1 + 1);
  1412. *(bb1 + 0) = a11;
  1413. *(bb1 + 1) = a21;
  1414. }
  1415. }
  1416. }
  1417. #endif
  1418. #endif