You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

data_loader.py 14 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383
  1. import os
  2. import numpy as np
  3. import pandas as pd
  4. import mindspore.dataset as ds
  5. import mindspore.dataset.transforms.c_transforms as C
  6. import mindspore.dataset.transforms.py_transforms as P
  7. import mindspore.dataset.vision.c_transforms as CV
  8. import mindspore.dataset.transforms.vision.py_transforms as VP
  9. from mindspore import Tensor
  10. from utils.tools import StandardScaler
  11. from utils.timefeatures import time_features
  12. import warnings
  13. warnings.filterwarnings('ignore')
  class Dataset_ETT_hour:
      """Random-access sliding-window dataset over an ETT CSV file (default ``ETTh1.csv``).

      The file is split into train/val/test ranges by fixed row counts and each
      item is a tuple ``(seq_x, seq_y, seq_x_mark, seq_y_mark)`` cut from the
      selected range.

      The class deliberately has no base class: the file never imports a
      ``Dataset`` name, and only ``__getitem__`` / ``__len__`` are implemented.
      NOTE(review): presumably instances are wrapped by
      ``mindspore.dataset.GeneratorDataset`` -- confirm against the caller.

      Args:
          root_path: Directory that contains ``data_path``.
          flag: One of ``'train'``, ``'val'``, ``'test'``; selects the split.
          size: ``[seq_len, label_len, pred_len]``; defaults to
              ``[24*4*4, 24*4, 24*4]`` when ``None``.
          features: ``'M'`` or ``'MS'`` to use every column after the first,
              ``'S'`` to use only ``target``.
          data_path: CSV file name relative to ``root_path``.
          target: Name of the target column.
          scale: If true, fit the scaler on the train range and transform
              the whole series.
          inverse: If true, ``data_y`` keeps the unscaled values.
          timeenc: Forwarded to ``time_features``.
          freq: Forwarded to ``time_features``.
          cols: Accepted but not used by this class.
      """

      def __init__(self, root_path, flag='train', size=None,
                   features='S', data_path='ETTh1.csv',
                   target='OT', scale=True, inverse=False, timeenc=0, freq='h', cols=None):
          # size is [seq_len, label_len, pred_len]
          if size is None:
              self.seq_len = 24 * 4 * 4
              self.label_len = 24 * 4
              self.pred_len = 24 * 4
          else:
              self.seq_len = size[0]
              self.label_len = size[1]
              self.pred_len = size[2]

          assert flag in ['train', 'test', 'val']
          type_map = {'train': 0, 'val': 1, 'test': 2}
          self.set_type = type_map[flag]

          self.features = features
          self.target = target
          self.scale = scale
          self.inverse = inverse
          self.timeenc = timeenc
          self.freq = freq

          self.root_path = root_path
          self.data_path = data_path
          self.__read_data__()

      def __read_data__(self):
          """Load the CSV and populate ``data_x``, ``data_y`` and ``data_stamp``.

          Raises:
              ValueError: If ``self.features`` is not ``'M'``, ``'MS'`` or ``'S'``.
          """
          self.scaler = StandardScaler()
          df_raw = pd.read_csv(os.path.join(self.root_path, self.data_path))

          # Fixed row counts per split (12 / 4 / 4 blocks of 30*24 rows).
          train_rows = 12 * 30 * 24
          val_rows = 4 * 30 * 24
          test_rows = 4 * 30 * 24
          # val and test ranges start seq_len rows before their boundary
          # (presumably so their first window has a full input history).
          border1s = [0, train_rows - self.seq_len, train_rows + val_rows - self.seq_len]
          border2s = [train_rows, train_rows + val_rows, train_rows + val_rows + test_rows]
          border1 = border1s[self.set_type]
          border2 = border2s[self.set_type]

          if self.features in ('M', 'MS'):
              # Every column except the first one.
              df_data = df_raw[df_raw.columns[1:]]
          elif self.features == 'S':
              df_data = df_raw[[self.target]]
          else:
              raise ValueError(
                  f"features must be 'M', 'MS' or 'S', got {self.features!r}")

          if self.scale:
              # Scaler statistics come from the train range only.
              train_data = df_data[border1s[0]:border2s[0]]
              self.scaler.fit(train_data.values)
              data = self.scaler.transform(df_data.values)
          else:
              data = df_data.values

          # Copy the slice so the column assignment below does not write
          # into a view of df_raw.
          df_stamp = df_raw[['date']][border1:border2].copy()
          df_stamp['date'] = pd.to_datetime(df_stamp.date)
          data_stamp = time_features(df_stamp, timeenc=self.timeenc, freq=self.freq)

          self.data_x = data[border1:border2]
          if self.inverse:
              # Targets stay in the original (unscaled) units.
              self.data_y = df_data.values[border1:border2]
          else:
              self.data_y = data[border1:border2]
          self.data_stamp = data_stamp

      def __getitem__(self, index):
          """Return ``(seq_x, seq_y, seq_x_mark, seq_y_mark)`` for window ``index``.

          ``seq_x`` covers ``seq_len`` rows starting at ``index``; ``seq_y``
          covers the last ``label_len`` rows of that window followed by the
          next ``pred_len`` rows.
          """
          s_begin = index
          s_end = s_begin + self.seq_len
          r_begin = s_end - self.label_len
          r_end = r_begin + self.label_len + self.pred_len

          seq_x = self.data_x[s_begin:s_end]
          if self.inverse:
              # Label part from data_x, prediction part from data_y.
              seq_y = np.concatenate(
                  [self.data_x[r_begin:r_begin + self.label_len],
                   self.data_y[r_begin + self.label_len:r_end]], 0)
          else:
              seq_y = self.data_y[r_begin:r_end]
          seq_x_mark = self.data_stamp[s_begin:s_end]
          seq_y_mark = self.data_stamp[r_begin:r_end]

          return seq_x, seq_y, seq_x_mark, seq_y_mark

      def __len__(self):
          """Return the number of windows that fit in the selected range."""
          return len(self.data_x) - self.seq_len - self.pred_len + 1

      def inverse_transform(self, data):
          """Map ``data`` back through the fitted scaler."""
          return self.scaler.inverse_transform(data)
  class Dataset_ETT_minute:
      """Random-access sliding-window dataset over an ETT CSV file (default ``ETTm1.csv``).

      The file is split into train/val/test ranges by fixed row counts and each
      item is a tuple ``(seq_x, seq_y, seq_x_mark, seq_y_mark)`` cut from the
      selected range.

      The class deliberately has no base class: the file never imports a
      ``Dataset`` name, and only ``__getitem__`` / ``__len__`` are implemented.
      NOTE(review): presumably instances are wrapped by
      ``mindspore.dataset.GeneratorDataset`` -- confirm against the caller.

      Args:
          root_path: Directory that contains ``data_path``.
          flag: One of ``'train'``, ``'val'``, ``'test'``; selects the split.
          size: ``[seq_len, label_len, pred_len]``; defaults to
              ``[24*4*4, 24*4, 24*4]`` when ``None``.
          features: ``'M'`` or ``'MS'`` to use every column after the first,
              ``'S'`` to use only ``target``.
          data_path: CSV file name relative to ``root_path``.
          target: Name of the target column.
          scale: If true, fit the scaler on the train range and transform
              the whole series.
          inverse: If true, ``data_y`` keeps the unscaled values.
          timeenc: Forwarded to ``time_features``.
          freq: Forwarded to ``time_features``.
          cols: Accepted but not used by this class.
      """

      def __init__(self, root_path, flag='train', size=None,
                   features='S', data_path='ETTm1.csv',
                   target='OT', scale=True, inverse=False, timeenc=0, freq='t', cols=None):
          # size is [seq_len, label_len, pred_len]
          if size is None:
              self.seq_len = 24 * 4 * 4
              self.label_len = 24 * 4
              self.pred_len = 24 * 4
          else:
              self.seq_len = size[0]
              self.label_len = size[1]
              self.pred_len = size[2]

          assert flag in ['train', 'test', 'val']
          type_map = {'train': 0, 'val': 1, 'test': 2}
          self.set_type = type_map[flag]

          self.features = features
          self.target = target
          self.scale = scale
          self.inverse = inverse
          self.timeenc = timeenc
          self.freq = freq

          self.root_path = root_path
          self.data_path = data_path
          self.__read_data__()

      def __read_data__(self):
          """Load the CSV and populate ``data_x``, ``data_y`` and ``data_stamp``.

          Raises:
              ValueError: If ``self.features`` is not ``'M'``, ``'MS'`` or ``'S'``.
          """
          self.scaler = StandardScaler()
          df_raw = pd.read_csv(os.path.join(self.root_path, self.data_path))

          # Fixed row counts per split (12 / 4 / 4 blocks of 30*24*4 rows).
          train_rows = 12 * 30 * 24 * 4
          val_rows = 4 * 30 * 24 * 4
          test_rows = 4 * 30 * 24 * 4
          # val and test ranges start seq_len rows before their boundary
          # (presumably so their first window has a full input history).
          border1s = [0, train_rows - self.seq_len, train_rows + val_rows - self.seq_len]
          border2s = [train_rows, train_rows + val_rows, train_rows + val_rows + test_rows]
          border1 = border1s[self.set_type]
          border2 = border2s[self.set_type]

          if self.features in ('M', 'MS'):
              # Every column except the first one.
              df_data = df_raw[df_raw.columns[1:]]
          elif self.features == 'S':
              df_data = df_raw[[self.target]]
          else:
              raise ValueError(
                  f"features must be 'M', 'MS' or 'S', got {self.features!r}")

          if self.scale:
              # Scaler statistics come from the train range only.
              train_data = df_data[border1s[0]:border2s[0]]
              self.scaler.fit(train_data.values)
              data = self.scaler.transform(df_data.values)
          else:
              data = df_data.values

          # Copy the slice so the column assignment below does not write
          # into a view of df_raw.
          df_stamp = df_raw[['date']][border1:border2].copy()
          df_stamp['date'] = pd.to_datetime(df_stamp.date)
          data_stamp = time_features(df_stamp, timeenc=self.timeenc, freq=self.freq)

          self.data_x = data[border1:border2]
          if self.inverse:
              # Targets stay in the original (unscaled) units.
              self.data_y = df_data.values[border1:border2]
          else:
              self.data_y = data[border1:border2]
          self.data_stamp = data_stamp

      def __getitem__(self, index):
          """Return ``(seq_x, seq_y, seq_x_mark, seq_y_mark)`` for window ``index``.

          ``seq_x`` covers ``seq_len`` rows starting at ``index``; ``seq_y``
          covers the last ``label_len`` rows of that window followed by the
          next ``pred_len`` rows.
          """
          s_begin = index
          s_end = s_begin + self.seq_len
          r_begin = s_end - self.label_len
          r_end = r_begin + self.label_len + self.pred_len

          seq_x = self.data_x[s_begin:s_end]
          if self.inverse:
              # Label part from data_x, prediction part from data_y.
              seq_y = np.concatenate(
                  [self.data_x[r_begin:r_begin + self.label_len],
                   self.data_y[r_begin + self.label_len:r_end]], 0)
          else:
              seq_y = self.data_y[r_begin:r_end]
          seq_x_mark = self.data_stamp[s_begin:s_end]
          seq_y_mark = self.data_stamp[r_begin:r_end]

          return seq_x, seq_y, seq_x_mark, seq_y_mark

      def __len__(self):
          """Return the number of windows that fit in the selected range."""
          return len(self.data_x) - self.seq_len - self.pred_len + 1

      def inverse_transform(self, data):
          """Map ``data`` back through the fitted scaler."""
          return self.scaler.inverse_transform(data)
  class Dataset_Custom:
      """Random-access sliding-window dataset over a CSV file with a proportional split.

      The file is split 70% train / 20% test with the remainder as validation,
      and each item is a tuple ``(seq_x, seq_y, seq_x_mark, seq_y_mark)`` cut
      from the selected range. The CSV must contain a ``date`` column and the
      ``target`` column.

      The class deliberately has no base class: the file never imports a
      ``Dataset`` name, and only ``__getitem__`` / ``__len__`` are implemented.
      NOTE(review): presumably instances are wrapped by
      ``mindspore.dataset.GeneratorDataset`` -- confirm against the caller.

      Args:
          root_path: Directory that contains ``data_path``.
          flag: One of ``'train'``, ``'val'``, ``'test'``; selects the split.
          size: ``[seq_len, label_len, pred_len]``; defaults to
              ``[24*4*4, 24*4, 24*4]`` when ``None``.
          features: ``'M'`` or ``'MS'`` to use every selected column,
              ``'S'`` to use only ``target``.
          data_path: CSV file name relative to ``root_path``.
          target: Name of the target column.
          scale: If true, fit the scaler on the train range and transform
              the whole series.
          inverse: If true, ``data_y`` keeps the unscaled values.
          timeenc: Forwarded to ``time_features``.
          freq: Forwarded to ``time_features``.
          cols: Optional list of feature column names to keep; when empty or
              ``None`` every CSV column is used.
      """

      def __init__(self, root_path, flag='train', size=None,
                   features='S', data_path='ETTh1.csv',
                   target='OT', scale=True, inverse=False, timeenc=0, freq='h', cols=None):
          # size is [seq_len, label_len, pred_len]
          if size is None:
              self.seq_len = 24 * 4 * 4
              self.label_len = 24 * 4
              self.pred_len = 24 * 4
          else:
              self.seq_len = size[0]
              self.label_len = size[1]
              self.pred_len = size[2]

          assert flag in ['train', 'test', 'val']
          type_map = {'train': 0, 'val': 1, 'test': 2}
          self.set_type = type_map[flag]

          self.features = features
          self.target = target
          self.scale = scale
          self.inverse = inverse
          self.timeenc = timeenc
          self.freq = freq
          self.cols = cols

          self.root_path = root_path
          self.data_path = data_path
          self.__read_data__()

      def __read_data__(self):
          """Load the CSV and populate ``data_x``, ``data_y`` and ``data_stamp``.

          Raises:
              ValueError: If ``self.features`` is not ``'M'``, ``'MS'`` or ``'S'``.
          """
          self.scaler = StandardScaler()
          df_raw = pd.read_csv(os.path.join(self.root_path, self.data_path))

          # Reorder columns to ['date', ...other features, target].
          # 'date' and the target are filtered out of the feature list so a
          # caller-supplied `cols` may include or omit them without breaking
          # the reordering.
          candidates = self.cols if self.cols else df_raw.columns
          cols = [c for c in candidates if c not in (self.target, 'date')]
          df_raw = df_raw[['date'] + cols + [self.target]]

          num_train = int(len(df_raw) * 0.7)
          num_test = int(len(df_raw) * 0.2)
          num_vali = len(df_raw) - num_train - num_test
          # val and test ranges start seq_len rows before their boundary
          # (presumably so their first window has a full input history).
          border1s = [0, num_train - self.seq_len, len(df_raw) - num_test - self.seq_len]
          border2s = [num_train, num_train + num_vali, len(df_raw)]
          border1 = border1s[self.set_type]
          border2 = border2s[self.set_type]

          if self.features in ('M', 'MS'):
              # Every column except 'date'.
              df_data = df_raw[df_raw.columns[1:]]
          elif self.features == 'S':
              df_data = df_raw[[self.target]]
          else:
              raise ValueError(
                  f"features must be 'M', 'MS' or 'S', got {self.features!r}")

          if self.scale:
              # Scaler statistics come from the train range only.
              train_data = df_data[border1s[0]:border2s[0]]
              self.scaler.fit(train_data.values)
              data = self.scaler.transform(df_data.values)
          else:
              data = df_data.values

          # Copy the slice so the column assignment below does not write
          # into a view of df_raw.
          df_stamp = df_raw[['date']][border1:border2].copy()
          df_stamp['date'] = pd.to_datetime(df_stamp.date)
          data_stamp = time_features(df_stamp, timeenc=self.timeenc, freq=self.freq)

          self.data_x = data[border1:border2]
          if self.inverse:
              # Targets stay in the original (unscaled) units.
              self.data_y = df_data.values[border1:border2]
          else:
              self.data_y = data[border1:border2]
          self.data_stamp = data_stamp

      def __getitem__(self, index):
          """Return ``(seq_x, seq_y, seq_x_mark, seq_y_mark)`` for window ``index``.

          ``seq_x`` covers ``seq_len`` rows starting at ``index``; ``seq_y``
          covers the last ``label_len`` rows of that window followed by the
          next ``pred_len`` rows.
          """
          s_begin = index
          s_end = s_begin + self.seq_len
          r_begin = s_end - self.label_len
          r_end = r_begin + self.label_len + self.pred_len

          seq_x = self.data_x[s_begin:s_end]
          if self.inverse:
              # Label part from data_x, prediction part from data_y.
              seq_y = np.concatenate(
                  [self.data_x[r_begin:r_begin + self.label_len],
                   self.data_y[r_begin + self.label_len:r_end]], 0)
          else:
              seq_y = self.data_y[r_begin:r_end]
          seq_x_mark = self.data_stamp[s_begin:s_end]
          seq_y_mark = self.data_stamp[r_begin:r_end]

          return seq_x, seq_y, seq_x_mark, seq_y_mark

      def __len__(self):
          """Return the number of windows that fit in the selected range."""
          return len(self.data_x) - self.seq_len - self.pred_len + 1

      def inverse_transform(self, data):
          """Map ``data`` back through the fitted scaler."""
          return self.scaler.inverse_transform(data)
  class Dataset_Pred:
      """Random-access dataset exposing the last ``seq_len`` rows of a CSV for prediction.

      Each item is a tuple ``(seq_x, seq_y, seq_x_mark, seq_y_mark)``.
      ``seq_y`` holds only the ``label_len`` known rows; the time marks are
      extended by ``pred_len`` future timestamps generated with ``freq``.
      The CSV must contain a ``date`` column and the ``target`` column.

      The class deliberately has no base class: the file never imports a
      ``Dataset`` name, and only ``__getitem__`` / ``__len__`` are implemented.
      NOTE(review): presumably instances are wrapped by
      ``mindspore.dataset.GeneratorDataset`` -- confirm against the caller.

      Args:
          root_path: Directory that contains ``data_path``.
          flag: Must be ``'pred'``.
          size: ``[seq_len, label_len, pred_len]``; defaults to
              ``[24*4*4, 24*4, 24*4]`` when ``None``.
          features: ``'M'`` or ``'MS'`` to use every selected column,
              ``'S'`` to use only ``target``.
          data_path: CSV file name relative to ``root_path``.
          target: Name of the target column.
          scale: If true, fit the scaler on the whole series and transform it.
          inverse: If true, ``data_y`` keeps the unscaled values.
          timeenc: Forwarded to ``time_features``.
          freq: Frequency string for ``pd.date_range``; its last character is
              forwarded to ``time_features``.
          cols: Optional list of feature column names to keep; when empty or
              ``None`` every CSV column is used.
      """

      def __init__(self, root_path, flag='pred', size=None,
                   features='S', data_path='ETTh1.csv',
                   target='OT', scale=True, inverse=False, timeenc=0, freq='15min', cols=None):
          # size is [seq_len, label_len, pred_len]
          if size is None:
              self.seq_len = 24 * 4 * 4
              self.label_len = 24 * 4
              self.pred_len = 24 * 4
          else:
              self.seq_len = size[0]
              self.label_len = size[1]
              self.pred_len = size[2]

          assert flag in ['pred']

          self.features = features
          self.target = target
          self.scale = scale
          self.inverse = inverse
          self.timeenc = timeenc
          self.freq = freq
          self.cols = cols

          self.root_path = root_path
          self.data_path = data_path
          self.__read_data__()

      def __read_data__(self):
          """Load the CSV and populate ``data_x``, ``data_y`` and ``data_stamp``.

          Raises:
              ValueError: If ``self.features`` is not ``'M'``, ``'MS'`` or ``'S'``.
          """
          self.scaler = StandardScaler()
          df_raw = pd.read_csv(os.path.join(self.root_path, self.data_path))

          # Reorder columns to ['date', ...other features, target].
          # 'date' and the target are filtered out of the feature list so a
          # caller-supplied `cols` may include or omit them without breaking
          # the reordering.
          candidates = self.cols if self.cols else df_raw.columns
          cols = [c for c in candidates if c not in (self.target, 'date')]
          df_raw = df_raw[['date'] + cols + [self.target]]

          # Only the trailing seq_len rows are exposed.
          border1 = len(df_raw) - self.seq_len
          border2 = len(df_raw)

          if self.features in ('M', 'MS'):
              # Every column except 'date'.
              df_data = df_raw[df_raw.columns[1:]]
          elif self.features == 'S':
              df_data = df_raw[[self.target]]
          else:
              raise ValueError(
                  f"features must be 'M', 'MS' or 'S', got {self.features!r}")

          if self.scale:
              # Unlike a train/val/test split, the scaler sees the whole series.
              self.scaler.fit(df_data.values)
              data = self.scaler.transform(df_data.values)
          else:
              data = df_data.values

          # Copy the slice so the column assignment below does not write
          # into a view of df_raw.
          tmp_stamp = df_raw[['date']][border1:border2].copy()
          tmp_stamp['date'] = pd.to_datetime(tmp_stamp.date)
          # pred_len + 1 periods starting at the last known timestamp; the
          # first one duplicates that timestamp and is dropped below.
          pred_dates = pd.date_range(tmp_stamp.date.values[-1],
                                     periods=self.pred_len + 1, freq=self.freq)

          df_stamp = pd.DataFrame(columns=['date'])
          df_stamp.date = list(tmp_stamp.date.values) + list(pred_dates[1:])
          data_stamp = time_features(df_stamp, timeenc=self.timeenc, freq=self.freq[-1:])

          self.data_x = data[border1:border2]
          if self.inverse:
              # Targets stay in the original (unscaled) units.
              self.data_y = df_data.values[border1:border2]
          else:
              self.data_y = data[border1:border2]
          self.data_stamp = data_stamp

      def __getitem__(self, index):
          """Return ``(seq_x, seq_y, seq_x_mark, seq_y_mark)`` for window ``index``.

          ``seq_y`` contains only the ``label_len`` rows that end the input
          window, while ``seq_y_mark`` also spans the following ``pred_len``
          timestamps.
          """
          s_begin = index
          s_end = s_begin + self.seq_len
          r_begin = s_end - self.label_len
          r_end = r_begin + self.label_len + self.pred_len

          seq_x = self.data_x[s_begin:s_end]
          if self.inverse:
              seq_y = self.data_x[r_begin:r_begin + self.label_len]
          else:
              seq_y = self.data_y[r_begin:r_begin + self.label_len]
          seq_x_mark = self.data_stamp[s_begin:s_end]
          seq_y_mark = self.data_stamp[r_begin:r_end]

          return seq_x, seq_y, seq_x_mark, seq_y_mark

      def __len__(self):
          """Return the number of input windows that fit in ``data_x``."""
          return len(self.data_x) - self.seq_len + 1

      def inverse_transform(self, data):
          """Map ``data`` back through the fitted scaler."""
          return self.scaler.inverse_transform(data)

基于MindSpore的多模态股票价格预测系统研究 Informer,LSTM,RNN