
load_data.py
import os
import numpy as np


###########################################################################
# criteo
###########################################################################
def download_criteo(path):
    import tarfile
    import pandas as pd
    from six.moves import urllib
    if not os.path.exists(path):
        os.makedirs(path)
    assert os.path.isdir(path), 'Please provide a directory path.'
    # this source may be invalid, please use other valid sources.
    origin = (
        'https://s3-eu-west-1.amazonaws.com/kaggle-display-advertising-challenge-dataset/dac.tar.gz'
    )
    print('Downloading data from %s' % origin)
    dataset = os.path.join(path, 'criteo.tar.gz')
    urllib.request.urlretrieve(origin, dataset)
    print("Extracting criteo archive...")
    with tarfile.open(dataset) as f:
        f.extractall(path=path)
    print("Creating local files...")
    # save csv file
    df = pd.read_csv(os.path.join(path, "train.txt"), sep='\t', header=None)
    df.columns = ['label'] + ["I" + str(i) for i in range(1, 14)] + \
        ["C" + str(i) for i in range(14, 40)]
    df.to_csv(os.path.join(path, "train.csv"), index=False)
    print('Csv file saved.')
    # save numpy arrays
    target_path = [os.path.join(path, filename) for filename in [
        'train_dense_feats.npy', 'train_sparse_feats.npy', 'train_labels.npy',
        'test_dense_feats.npy', 'test_sparse_feats.npy', 'test_labels.npy']]
    dense_feats = [col for col in df.columns if col.startswith('I')]
    sparse_feats = [col for col in df.columns if col.startswith('C')]
    # convert to numpy arrays so positional indexing with the permutation below works
    labels = np.array(df['label'])
    dense_feats = np.array(process_dense_feats(df, dense_feats))
    sparse_feats = np.array(process_sparse_feats(df, sparse_feats))
    num_data = dense_feats.shape[0]
    perm = np.random.permutation(num_data)
    # split data in 2 parts: 90% train, 10% test
    test_num = num_data // 10
    processed_data = [
        dense_feats[perm[:-test_num]],   # train dense
        sparse_feats[perm[:-test_num]],  # train sparse
        labels[perm[:-test_num]],        # train labels
        dense_feats[perm[-test_num:]],   # test dense
        sparse_feats[perm[-test_num:]],  # test sparse
        labels[perm[-test_num:]],        # test labels
    ]
    print('Array shapes:')
    for i in range(len(processed_data)):
        print(os.path.split(target_path[i])[-1].split('.')[0],
              processed_data[i].shape)
        np.save(target_path[i], processed_data[i])
    print('Numpy arrays saved.')


def process_dense_feats(data, feats):
    d = data.copy()
    d = d[feats].fillna(0.0)
    for f in feats:
        d[f] = d[f].apply(lambda x: np.log(x + 1) if x > -1 else -1)
    return d


def process_sparse_feats(data, feats):
    from sklearn.preprocessing import LabelEncoder
    # process to embeddings.
    d = data.copy()
    d = d[feats].fillna("-1")
    for f in feats:
        label_encoder = LabelEncoder()
        d[f] = label_encoder.fit_transform(d[f])
    feature_cnt = 0
    for f in feats:
        d[f] += feature_cnt
        feature_cnt += d[f].nunique()
    return d
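

# Note on process_sparse_feats: after each column is passed through LabelEncoder,
# a running offset (feature_cnt) is added so that every categorical column occupies
# a disjoint integer range and the whole sparse part can share one embedding table.
# A minimal illustration with hypothetical cardinalities:
#   C14 encodes to {0, 1, 2}      -> stays      {0, 1, 2}
#   C15 encodes to {0, 1}         -> shifted to {3, 4}
#   C16 encodes to {0, 1, 2, 3}   -> shifted to {5, 6, 7, 8}
# so the shared embedding table needs sum of the per-column nunique() rows in total.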


def process_head_criteo_data(path=os.path.join(os.path.split(os.path.abspath(__file__))[0], '../datasets/criteo'), nrows=20000, return_val=True):
    import pandas as pd
    csv_path = os.path.join(path, "train.csv")
    if not os.path.exists(csv_path):
        download_criteo(path)
    df = pd.read_csv(csv_path, nrows=nrows, header=0)
    dense_feats = [col for col in df.columns if col.startswith('I')]
    sparse_feats = [col for col in df.columns if col.startswith('C')]
    labels = np.array(df['label']).reshape(-1, 1)
    dense_feats = np.array(process_dense_feats(df, dense_feats))
    sparse_feats = np.array(process_sparse_feats(
        df, sparse_feats)).astype(np.int32)
    if return_val:
        test_num = nrows // 10
        train_dense = dense_feats[:-test_num]
        train_sparse = sparse_feats[:-test_num]
        train_label = labels[:-test_num]
        validate_dense = dense_feats[-test_num:]
        validate_sparse = sparse_feats[-test_num:]
        validate_label = labels[-test_num:]
        return (train_dense, validate_dense), (train_sparse, validate_sparse), (train_label, validate_label)
    else:
        return dense_feats, sparse_feats, labels


def process_sampled_criteo_data(path=os.path.join(os.path.split(os.path.abspath(__file__))[0], '../datasets/criteo')):
    # all data should be available! no checking.
    processed_data = [np.load(os.path.join(path, filename))
                      for filename in ['sampled_dense_feats.npy', 'sampled_sparse_feats.npy', 'sampled_labels.npy']]
    return tuple(processed_data)


def process_all_criteo_data(path=os.path.join(os.path.split(os.path.abspath(__file__))[0], '../datasets/criteo'), return_val=True):
    file_paths = [os.path.join(path, filename) for filename in [
        'train_dense_feats.npy', 'test_dense_feats.npy', 'train_sparse_feats.npy',
        'test_sparse_feats.npy', 'train_labels.npy', 'test_labels.npy']]
    if not all([os.path.exists(p) for p in file_paths]):
        download_criteo(path)
    files = [np.load(filename) for filename in file_paths]
    if return_val:
        return (files[0], files[1]), (files[2], files[3]), (files[4], files[5])
    else:
        return files[0], files[2], files[4]
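

# Example usage (a minimal sketch; it assumes the processed .npy files already exist
# under ../datasets/criteo relative to this file, or that the download source above
# is still reachable so that download_criteo can create them):
#   (train_dense, test_dense), (train_sparse, test_sparse), (train_y, test_y) = \
#       process_all_criteo_data(return_val=True)
#   dense, sparse, labels = process_head_criteo_data(nrows=20000, return_val=False)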


###########################################################################
# adult
###########################################################################
def maybe_download(train_data, test_data):
    """If the adult data "train.csv" and "test.csv" are not in your directory,
    download them.
    """
    import pandas as pd
    COLUMNS = ["age", "workclass", "fnlwgt", "education", "education_num",
               "marital_status", "occupation", "relationship", "race", "gender",
               "capital_gain", "capital_loss", "hours_per_week", "native_country",
               "income_bracket"]
    if not os.path.exists(train_data):
        print("downloading training data...")
        df_train = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
                               names=COLUMNS, skipinitialspace=True)
    else:
        df_train = pd.read_csv(train_data)
    if not os.path.exists(test_data):
        print("downloading testing data...")
        df_test = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test",
                              names=COLUMNS, skipinitialspace=True, skiprows=1)
    else:
        df_test = pd.read_csv(test_data)
    return df_train, df_test


def cross_columns(x_cols):
    """Simple helper to build the crossed columns in a pandas dataframe.
    """
    crossed_columns = dict()
    colnames = ['_'.join(x_c) for x_c in x_cols]
    for cname, x_c in zip(colnames, x_cols):
        crossed_columns[cname] = x_c
    return crossed_columns
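

# For example, cross_columns((['education', 'occupation'],)) returns
# {'education_occupation': ['education', 'occupation']}: the crossed-column name
# mapped back to the source columns it is built from.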


def val2idx(df, cols):
    """Helper to index categorical columns before embeddings.
    """
    val_types = dict()
    for c in cols:
        val_types[c] = df[c].unique()
    val_to_idx = dict()
    for k, v in val_types.items():
        val_to_idx[k] = {o: i for i, o in enumerate(val_types[k])}
    for k, v in val_to_idx.items():
        df[k] = df[k].apply(lambda x: v[x])
    unique_vals = dict()
    for c in cols:
        unique_vals[c] = df[c].nunique()
    return df, unique_vals
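

# For example, if df['race'] contains ['White', 'Black', 'White'], val2idx(df, ['race'])
# rewrites the column as [0, 1, 0] and returns unique_vals == {'race': 2}, i.e. the
# vocabulary size needed for that column's embedding.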


def onehot(x):
    from sklearn.preprocessing import OneHotEncoder
    return np.array(OneHotEncoder().fit_transform(x).todense())


def wide(df_train, df_test, wide_cols, x_cols, target):
    import pandas as pd
    print('Processing wide data')
    df_train['IS_TRAIN'] = 1
    df_test['IS_TRAIN'] = 0
    df_wide = pd.concat([df_train, df_test])
    crossed_columns_d = cross_columns(x_cols)
    categorical_columns = list(
        df_wide.select_dtypes(include=['object']).columns)
    wide_cols += list(crossed_columns_d.keys())
    for k, v in crossed_columns_d.items():
        df_wide[k] = df_wide[v].apply(lambda x: '-'.join(x), axis=1)
    df_wide = df_wide[wide_cols + [target] + ['IS_TRAIN']]
    dummy_cols = [
        c for c in wide_cols if c in categorical_columns + list(crossed_columns_d.keys())]
    df_wide = pd.get_dummies(df_wide, columns=[x for x in dummy_cols])
    train = df_wide[df_wide.IS_TRAIN == 1].drop('IS_TRAIN', axis=1)
    test = df_wide[df_wide.IS_TRAIN == 0].drop('IS_TRAIN', axis=1)
    assert all(train.columns == test.columns)
    cols = [c for c in train.columns if c != target]
    X_train = train[cols].values
    y_train = train[target].values.reshape(-1, 1)
    X_test = test[cols].values
    y_test = test[target].values.reshape(-1, 1)
    return X_train, y_train, X_test, y_test


def load_adult_data(return_val=True):
    import pandas as pd
    from sklearn.preprocessing import StandardScaler
    df_train, df_test = maybe_download("train.csv", "test.csv")
    df_train['income_label'] = (
        df_train["income_bracket"].apply(lambda x: ">50K" in x)).astype(int)
    df_test['income_label'] = (
        df_test["income_bracket"].apply(lambda x: ">50K" in x)).astype(int)
    age_groups = [0, 25, 65, 90]
    age_labels = range(len(age_groups) - 1)
    df_train['age_group'] = pd.cut(
        df_train['age'], age_groups, labels=age_labels)
    df_test['age_group'] = pd.cut(
        df_test['age'], age_groups, labels=age_labels)
    # columns for wide model
    wide_cols = ['workclass', 'education', 'marital_status', 'occupation',
                 'relationship', 'race', 'gender', 'native_country', 'age_group']
    x_cols = (['education', 'occupation'], ['native_country', 'occupation'])
    # columns for deep model
    embedding_cols = ['workclass', 'education', 'marital_status', 'occupation',
                      'relationship', 'race', 'gender', 'native_country']
    cont_cols = ['age', 'capital_gain', 'capital_loss', 'hours_per_week']
    target = 'income_label'
    x_train_wide, y_train_wide, x_test_wide, y_test_wide = wide(
        df_train, df_test, wide_cols, x_cols, target)
    x_train_wide = np.array(x_train_wide).astype(np.float32)
    x_test_wide = np.array(x_test_wide).astype(np.float32)
    print('Processing deep data')
    df_train['IS_TRAIN'] = 1
    df_test['IS_TRAIN'] = 0
    df_deep = pd.concat([df_train, df_test])
    deep_cols = embedding_cols + cont_cols
    df_deep = df_deep[deep_cols + [target, 'IS_TRAIN']]
    # fit the scaler on the training rows only, then transform all rows positionally
    scaler = StandardScaler().fit(df_train[cont_cols])
    df_deep[cont_cols] = scaler.transform(df_deep[cont_cols])
    df_deep, unique_vals = val2idx(df_deep, embedding_cols)
    train = df_deep[df_deep.IS_TRAIN == 1].drop('IS_TRAIN', axis=1)
    test = df_deep[df_deep.IS_TRAIN == 0].drop('IS_TRAIN', axis=1)
    x_train_deep = np.array([train[c] for c in deep_cols]).astype(np.float32)
    y_train = np.array(train[target].values).reshape(-1, 1).astype(np.int32)
    x_test_deep = np.array([test[c] for c in deep_cols]).astype(np.float32)
    y_test = np.array(test[target].values).reshape(-1, 1).astype(np.int32)
    x_train_deep = np.transpose(x_train_deep)
    x_test_deep = np.transpose(x_test_deep)
    y_train = onehot(y_train)
    y_test = onehot(y_test)
    if return_val:
        return x_train_deep, x_train_wide, y_train, x_test_deep, x_test_wide, y_test
    else:
        return x_train_deep, x_train_wide, y_train
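

# Example usage (a minimal sketch; on the first call the UCI adult data is downloaded
# if train.csv / test.csv are not present in the working directory):
#   x_train_deep, x_train_wide, y_train, x_test_deep, x_test_wide, y_test = \
#       load_adult_data(return_val=True)
#   # x_*_deep: embedding indices plus standardized continuous columns, one row per sample
#   # x_*_wide: one-hot encoded base and crossed categorical columns
#   # y_*: one-hot encoded labels with shape (num_samples, 2)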


###########################################################################
# avazu
###########################################################################
def process_avazu(path=os.path.join(os.path.split(os.path.abspath(__file__))[0], '../datasets/avazu')):
    import pandas as pd
    # please download in advance from https://www.kaggle.com/c/avazu-ctr-prediction/data
    train_file = os.path.join(path, 'train.csv')
    # test_file = os.path.join(path, 'test.csv')  # useless, no labels
    df_train = pd.read_csv(train_file)
    sparse_feats = process_sparse_feats(df_train, df_train.columns[2:])
    # the embedding num for each feature:
    # [240, 7, 7, 4737, 7745, 26, 8552, 559, 36, 2686408, 6729486, 8251, 5, 4, 2626, 8, 9, 435, 4, 68, 172, 60]
    # sum: 9449445
    np.save(os.path.join(path, 'sparse.npy'), sparse_feats)
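

# Example usage (a minimal sketch; assumes train.csv from the Kaggle Avazu competition
# has already been placed under ../datasets/avazu relative to this file):
#   process_avazu()  # writes the label-encoded sparse features to sparse.npy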


if __name__ == '__main__':
    download_criteo(os.path.join(os.path.split(
        os.path.abspath(__file__))[0], '../datasets/criteo'))

Distributed deep learning system
