You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

data_loader.py 14 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380
  1. import os
  2. import numpy as np
  3. import pandas as pd
  4. import torch
  5. from torch.utils.data import Dataset, DataLoader
  6. # from sklearn.preprocessing import StandardScaler
  7. from utils.tools import StandardScaler
  8. from utils.timefeatures import time_features
  9. import warnings
  10. warnings.filterwarnings('ignore')
  11. class Dataset_ETT_hour(Dataset):
  12. def __init__(self, root_path, flag='train', size=None,
  13. features='S', data_path='ETTh1.csv',
  14. target='OT', scale=True, inverse=False, timeenc=0, freq='h', cols=None):
  15. # size [seq_len, label_len, pred_len]
  16. # info
  17. if size == None:
  18. self.seq_len = 24*4*4
  19. self.label_len = 24*4
  20. self.pred_len = 24*4
  21. else:
  22. self.seq_len = size[0]
  23. self.label_len = size[1]
  24. self.pred_len = size[2]
  25. # init
  26. assert flag in ['train', 'test', 'val']
  27. type_map = {'train':0, 'val':1, 'test':2}
  28. self.set_type = type_map[flag]
  29. self.features = features
  30. self.target = target
  31. self.scale = scale
  32. self.inverse = inverse
  33. self.timeenc = timeenc
  34. self.freq = freq
  35. self.root_path = root_path
  36. self.data_path = data_path
  37. self.__read_data__()
  38. def __read_data__(self):
  39. self.scaler = StandardScaler()
  40. df_raw = pd.read_csv(os.path.join(self.root_path,
  41. self.data_path))
  42. border1s = [0, 12*30*24 - self.seq_len, 12*30*24+4*30*24 - self.seq_len]
  43. border2s = [12*30*24, 12*30*24+4*30*24, 12*30*24+8*30*24]
  44. border1 = border1s[self.set_type]
  45. border2 = border2s[self.set_type]
  46. if self.features=='M' or self.features=='MS':
  47. cols_data = df_raw.columns[1:]
  48. df_data = df_raw[cols_data]
  49. elif self.features=='S':
  50. df_data = df_raw[[self.target]]
  51. if self.scale:
  52. train_data = df_data[border1s[0]:border2s[0]]
  53. self.scaler.fit(train_data.values)
  54. data = self.scaler.transform(df_data.values)
  55. else:
  56. data = df_data.values
  57. df_stamp = df_raw[['date']][border1:border2]
  58. df_stamp['date'] = pd.to_datetime(df_stamp.date)
  59. data_stamp = time_features(df_stamp, timeenc=self.timeenc, freq=self.freq)
  60. self.data_x = data[border1:border2]
  61. if self.inverse:
  62. self.data_y = df_data.values[border1:border2]
  63. else:
  64. self.data_y = data[border1:border2]
  65. self.data_stamp = data_stamp
  66. def __getitem__(self, index):
  67. s_begin = index
  68. s_end = s_begin + self.seq_len
  69. r_begin = s_end - self.label_len
  70. r_end = r_begin + self.label_len + self.pred_len
  71. seq_x = self.data_x[s_begin:s_end]
  72. if self.inverse:
  73. seq_y = np.concatenate([self.data_x[r_begin:r_begin+self.label_len], self.data_y[r_begin+self.label_len:r_end]], 0)
  74. else:
  75. seq_y = self.data_y[r_begin:r_end]
  76. seq_x_mark = self.data_stamp[s_begin:s_end]
  77. seq_y_mark = self.data_stamp[r_begin:r_end]
  78. return seq_x, seq_y, seq_x_mark, seq_y_mark
  79. def __len__(self):
  80. return len(self.data_x) - self.seq_len- self.pred_len + 1
  81. def inverse_transform(self, data):
  82. return self.scaler.inverse_transform(data)
  83. class Dataset_ETT_minute(Dataset):
  84. def __init__(self, root_path, flag='train', size=None,
  85. features='S', data_path='ETTm1.csv',
  86. target='OT', scale=True, inverse=False, timeenc=0, freq='t', cols=None):
  87. # size [seq_len, label_len, pred_len]
  88. # info
  89. if size == None:
  90. self.seq_len = 24*4*4
  91. self.label_len = 24*4
  92. self.pred_len = 24*4
  93. else:
  94. self.seq_len = size[0]
  95. self.label_len = size[1]
  96. self.pred_len = size[2]
  97. # init
  98. assert flag in ['train', 'test', 'val']
  99. type_map = {'train':0, 'val':1, 'test':2}
  100. self.set_type = type_map[flag]
  101. self.features = features
  102. self.target = target
  103. self.scale = scale
  104. self.inverse = inverse
  105. self.timeenc = timeenc
  106. self.freq = freq
  107. self.root_path = root_path
  108. self.data_path = data_path
  109. self.__read_data__()
  110. def __read_data__(self):
  111. self.scaler = StandardScaler()
  112. df_raw = pd.read_csv(os.path.join(self.root_path,
  113. self.data_path))
  114. border1s = [0, 12*30*24*4 - self.seq_len, 12*30*24*4+4*30*24*4 - self.seq_len]
  115. border2s = [12*30*24*4, 12*30*24*4+4*30*24*4, 12*30*24*4+8*30*24*4]
  116. border1 = border1s[self.set_type]
  117. border2 = border2s[self.set_type]
  118. if self.features=='M' or self.features=='MS':
  119. cols_data = df_raw.columns[1:]
  120. df_data = df_raw[cols_data]
  121. elif self.features=='S':
  122. df_data = df_raw[[self.target]]
  123. if self.scale:
  124. train_data = df_data[border1s[0]:border2s[0]]
  125. self.scaler.fit(train_data.values)
  126. data = self.scaler.transform(df_data.values)
  127. else:
  128. data = df_data.values
  129. df_stamp = df_raw[['date']][border1:border2]
  130. df_stamp['date'] = pd.to_datetime(df_stamp.date)
  131. data_stamp = time_features(df_stamp, timeenc=self.timeenc, freq=self.freq)
  132. self.data_x = data[border1:border2]
  133. if self.inverse:
  134. self.data_y = df_data.values[border1:border2]
  135. else:
  136. self.data_y = data[border1:border2]
  137. self.data_stamp = data_stamp
  138. def __getitem__(self, index):
  139. s_begin = index
  140. s_end = s_begin + self.seq_len
  141. r_begin = s_end - self.label_len
  142. r_end = r_begin + self.label_len + self.pred_len
  143. seq_x = self.data_x[s_begin:s_end]
  144. if self.inverse:
  145. seq_y = np.concatenate([self.data_x[r_begin:r_begin+self.label_len], self.data_y[r_begin+self.label_len:r_end]], 0)
  146. else:
  147. seq_y = self.data_y[r_begin:r_end]
  148. seq_x_mark = self.data_stamp[s_begin:s_end]
  149. seq_y_mark = self.data_stamp[r_begin:r_end]
  150. return seq_x, seq_y, seq_x_mark, seq_y_mark
  151. def __len__(self):
  152. return len(self.data_x) - self.seq_len - self.pred_len + 1
  153. def inverse_transform(self, data):
  154. return self.scaler.inverse_transform(data)
  155. class Dataset_Custom(Dataset):
  156. def __init__(self, root_path, flag='train', size=None,
  157. features='S', data_path='ETTh1.csv',
  158. target='OT', scale=True, inverse=False, timeenc=0, freq='h', cols=None):
  159. # size [seq_len, label_len, pred_len]
  160. # info
  161. if size == None:
  162. self.seq_len = 24*4*4
  163. self.label_len = 24*4
  164. self.pred_len = 24*4
  165. else:
  166. self.seq_len = size[0]
  167. self.label_len = size[1]
  168. self.pred_len = size[2]
  169. # init
  170. assert flag in ['train', 'test', 'val']
  171. type_map = {'train':0, 'val':1, 'test':2}
  172. self.set_type = type_map[flag]
  173. self.features = features
  174. self.target = target
  175. self.scale = scale
  176. self.inverse = inverse
  177. self.timeenc = timeenc
  178. self.freq = freq
  179. self.cols=cols
  180. self.root_path = root_path
  181. self.data_path = data_path
  182. self.__read_data__()
  183. def __read_data__(self):
  184. self.scaler = StandardScaler()
  185. df_raw = pd.read_csv(os.path.join(self.root_path,
  186. self.data_path))
  187. '''
  188. df_raw.columns: ['date', ...(other features), target feature]
  189. '''
  190. # cols = list(df_raw.columns);
  191. if self.cols:
  192. cols=self.cols.copy()
  193. cols.remove(self.target)
  194. else:
  195. cols = list(df_raw.columns); cols.remove(self.target); cols.remove('date')
  196. df_raw = df_raw[['date']+cols+[self.target]]
  197. num_train = int(len(df_raw)*0.7)
  198. num_test = int(len(df_raw)*0.2)
  199. num_vali = len(df_raw) - num_train - num_test
  200. border1s = [0, num_train-self.seq_len, len(df_raw)-num_test-self.seq_len]
  201. border2s = [num_train, num_train+num_vali, len(df_raw)]
  202. # start and end of train, val, test data
  203. border1 = border1s[self.set_type]
  204. border2 = border2s[self.set_type]
  205. if self.features=='M' or self.features=='MS':
  206. cols_data = df_raw.columns[1:]
  207. df_data = df_raw[cols_data]
  208. elif self.features=='S':
  209. df_data = df_raw[[self.target]]
  210. if self.scale:
  211. train_data = df_data[border1s[0]:border2s[0]]
  212. self.scaler.fit(train_data.values)
  213. data = self.scaler.transform(df_data.values)
  214. else:
  215. data = df_data.values
  216. df_stamp = df_raw[['date']][border1:border2]
  217. df_stamp['date'] = pd.to_datetime(df_stamp.date)
  218. data_stamp = time_features(df_stamp, timeenc=self.timeenc, freq=self.freq)
  219. self.data_x = data[border1:border2]
  220. if self.inverse:
  221. self.data_y = df_data.values[border1:border2]
  222. else:
  223. self.data_y = data[border1:border2]
  224. self.data_stamp = data_stamp
  225. def __getitem__(self, index):
  226. s_begin = index
  227. s_end = s_begin + self.seq_len
  228. r_begin = s_end - self.label_len
  229. r_end = r_begin + self.label_len + self.pred_len
  230. seq_x = self.data_x[s_begin:s_end]
  231. if self.inverse:
  232. seq_y = np.concatenate([self.data_x[r_begin:r_begin+self.label_len], self.data_y[r_begin+self.label_len:r_end]], 0)
  233. else:
  234. seq_y = self.data_y[r_begin:r_end]
  235. seq_x_mark = self.data_stamp[s_begin:s_end]
  236. seq_y_mark = self.data_stamp[r_begin:r_end]
  237. return seq_x, seq_y, seq_x_mark, seq_y_mark
  238. def __len__(self):
  239. return len(self.data_x) - self.seq_len- self.pred_len + 1
  240. def inverse_transform(self, data):
  241. return self.scaler.inverse_transform(data)
  242. class Dataset_Pred(Dataset):
  243. def __init__(self, root_path, flag='pred', size=None,
  244. features='S', data_path='ETTh1.csv',
  245. target='OT', scale=True, inverse=False, timeenc=0, freq='15min', cols=None):
  246. # size [seq_len, label_len, pred_len]
  247. # info
  248. if size == None:
  249. self.seq_len = 24*4*4
  250. self.label_len = 24*4
  251. self.pred_len = 24*4
  252. else:
  253. self.seq_len = size[0]
  254. self.label_len = size[1]
  255. self.pred_len = size[2]
  256. # init
  257. assert flag in ['pred']
  258. self.features = features
  259. self.target = target
  260. self.scale = scale
  261. self.inverse = inverse
  262. self.timeenc = timeenc
  263. self.freq = freq
  264. self.cols=cols
  265. self.root_path = root_path
  266. self.data_path = data_path
  267. self.__read_data__()
  268. def __read_data__(self):
  269. self.scaler = StandardScaler()
  270. df_raw = pd.read_csv(os.path.join(self.root_path,
  271. self.data_path))
  272. '''
  273. df_raw.columns: ['date', ...(other features), target feature]
  274. '''
  275. if self.cols:
  276. cols=self.cols.copy()
  277. cols.remove(self.target)
  278. else:
  279. cols = list(df_raw.columns); cols.remove(self.target); cols.remove('date')
  280. df_raw = df_raw[['date']+cols+[self.target]]
  281. border1 = len(df_raw)-self.seq_len
  282. border2 = len(df_raw)
  283. if self.features=='M' or self.features=='MS':
  284. cols_data = df_raw.columns[1:]
  285. df_data = df_raw[cols_data]
  286. elif self.features=='S':
  287. df_data = df_raw[[self.target]]
  288. if self.scale:
  289. self.scaler.fit(df_data.values)
  290. data = self.scaler.transform(df_data.values)
  291. else:
  292. data = df_data.values
  293. tmp_stamp = df_raw[['date']][border1:border2]
  294. tmp_stamp['date'] = pd.to_datetime(tmp_stamp.date)
  295. pred_dates = pd.date_range(tmp_stamp.date.values[-1], periods=self.pred_len+1, freq=self.freq)
  296. df_stamp = pd.DataFrame(columns = ['date'])
  297. df_stamp.date = list(tmp_stamp.date.values) + list(pred_dates[1:])
  298. data_stamp = time_features(df_stamp, timeenc=self.timeenc, freq=self.freq[-1:])
  299. self.data_x = data[border1:border2]
  300. if self.inverse:
  301. self.data_y = df_data.values[border1:border2]
  302. else:
  303. self.data_y = data[border1:border2]
  304. self.data_stamp = data_stamp
  305. def __getitem__(self, index):
  306. s_begin = index
  307. s_end = s_begin + self.seq_len
  308. r_begin = s_end - self.label_len
  309. r_end = r_begin + self.label_len + self.pred_len
  310. seq_x = self.data_x[s_begin:s_end]
  311. if self.inverse:
  312. seq_y = self.data_x[r_begin:r_begin+self.label_len]
  313. else:
  314. seq_y = self.data_y[r_begin:r_begin+self.label_len]
  315. seq_x_mark = self.data_stamp[s_begin:s_end]
  316. seq_y_mark = self.data_stamp[r_begin:r_end]
  317. return seq_x, seq_y, seq_x_mark, seq_y_mark
  318. def __len__(self):
  319. return len(self.data_x) - self.seq_len + 1
  320. def inverse_transform(self, data):
  321. return self.scaler.inverse_transform(data)

基于 MindSpore 的多模态股票价格预测系统研究（Informer、LSTM、RNN）