load_data.py

import numpy as np
import six.moves.cPickle as pickle
import gzip
import os


def load_mnist_data(dataset):
    """Load the MNIST dataset, downloading it first if necessary.

    Code adapted from http://deeplearning.net/tutorial/code/logistic_sgd.py

    :type dataset: string
    :param dataset: the path to the dataset (here MNIST)
    """
    # Download the MNIST dataset if it is not present.
    data_dir, data_file = os.path.split(dataset)
    if data_dir == "" and not os.path.isfile(dataset):
        # Check if the dataset is in the same directory as this script.
        new_path = os.path.join(
            os.path.split(__file__)[0],
            dataset
        )
        if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz':
            dataset = new_path

    if (not os.path.isfile(dataset)) and data_file == 'mnist.pkl.gz':
        from six.moves import urllib
        origin = (
            'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz'
        )
        print('Downloading data from %s' % origin)
        urllib.request.urlretrieve(origin, dataset)

    # Load the dataset.
    with gzip.open(dataset, 'rb') as f:
        try:
            # Python 3: the pickle was written by Python 2, so decode
            # byte strings as latin1.
            train_set, valid_set, test_set = pickle.load(f, encoding='latin1')
        except TypeError:
            # Python 2: cPickle.load() takes no encoding argument.
            train_set, valid_set, test_set = pickle.load(f)

    # train_set, valid_set, test_set format: tuple(input, target).
    # input is a 2-D numpy.ndarray (a matrix) of np.float32 in which each
    # row is one example. target is a 1-D numpy.ndarray (a vector) of
    # np.int64 with as many entries as input has rows; entry i is the
    # label of the example in row i of input.
    return train_set, valid_set, test_set
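
# Illustrative usage sketch (not part of the original file). Assuming
# 'mnist.pkl.gz' is next to this script (or gets downloaded by the call),
# each split unpacks into an (inputs, labels) pair:
#
#   train_set, valid_set, test_set = load_mnist_data('mnist.pkl.gz')
#   train_x, train_y = train_set   # train_x: (50000, 784) float32,
#                                  # train_y: (50000,) int64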


def convert_to_one_hot(vals, max_val=0):
    """Helper method to convert a label vector to a one-hot matrix."""
    if max_val == 0:
        # Infer the number of classes from the largest label.
        max_val = vals.max() + 1
    one_hot_vals = np.zeros((vals.size, max_val))
    one_hot_vals[np.arange(vals.size), vals] = 1
    return one_hot_vals
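
# Illustrative example (not part of the original file):
#
#   convert_to_one_hot(np.array([0, 2, 1]), max_val=3)
#   -> array([[1., 0., 0.],
#             [0., 0., 1.],
#             [0., 1., 0.]])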


###########################################################################
# adult
###########################################################################


def maybe_download(train_data, test_data):
    """Download the adult data if "train.csv" and "test.csv" are not
    already in the working directory.
    """
    import pandas as pd
    COLUMNS = ["age", "workclass", "fnlwgt", "education", "education_num",
               "marital_status", "occupation", "relationship", "race", "gender",
               "capital_gain", "capital_loss", "hours_per_week", "native_country",
               "income_bracket"]
    if not os.path.exists(train_data):
        print("downloading training data...")
        df_train = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
                               names=COLUMNS, skipinitialspace=True)
    else:
        df_train = pd.read_csv(train_data)
    if not os.path.exists(test_data):
        print("downloading testing data...")
        # skiprows=1: the first line of adult.test is a comment, not data.
        df_test = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test",
                              names=COLUMNS, skipinitialspace=True, skiprows=1)
    else:
        df_test = pd.read_csv(test_data)
    return df_train, df_test


def cross_columns(x_cols):
    """Simple helper to build the crossed columns in a pandas dataframe."""
    crossed_columns = dict()
    colnames = ['_'.join(x_c) for x_c in x_cols]
    for cname, x_c in zip(colnames, x_cols):
        crossed_columns[cname] = x_c
    return crossed_columns
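
# Illustrative example (not part of the original file):
#
#   cross_columns([['education', 'occupation']])
#   -> {'education_occupation': ['education', 'occupation']}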


def val2idx(df, cols):
    """Helper to index categorical columns before embeddings."""
    # Collect the distinct values of each column...
    val_types = dict()
    for c in cols:
        val_types[c] = df[c].unique()
    # ...and map each distinct value to an integer index.
    val_to_idx = dict()
    for k, v in val_types.items():
        val_to_idx[k] = {o: i for i, o in enumerate(val_types[k])}
    # Replace the raw values with their indices.
    for k, v in val_to_idx.items():
        df[k] = df[k].apply(lambda x: v[x])
    # Record the vocabulary size of each column (for embedding tables).
    unique_vals = dict()
    for c in cols:
        unique_vals[c] = df[c].nunique()
    return df, unique_vals
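
# Illustrative example (not part of the original file): for a dataframe
# column 'c' with values ['a', 'b', 'a'], val2idx(df, ['c']) rewrites the
# column to [0, 1, 0] and returns unique_vals == {'c': 2}.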


def onehot(x):
    from sklearn.preprocessing import OneHotEncoder
    # fit_transform returns a sparse matrix; densify it for the caller.
    return np.array(OneHotEncoder().fit_transform(x).todense())


def wide(df_train, df_test, wide_cols, x_cols, target):
    import pandas as pd
    print('Processing wide data')
    # Concatenate train and test so the dummy columns come out identical,
    # keeping a flag to split the two parts apart again afterwards.
    df_train['IS_TRAIN'] = 1
    df_test['IS_TRAIN'] = 0
    df_wide = pd.concat([df_train, df_test])
    crossed_columns_d = cross_columns(x_cols)
    categorical_columns = list(
        df_wide.select_dtypes(include=['object']).columns)
    wide_cols += list(crossed_columns_d.keys())
    # Build each crossed column by joining its component values.
    for k, v in crossed_columns_d.items():
        df_wide[k] = df_wide[v].apply(lambda x: '-'.join(x), axis=1)
    df_wide = df_wide[wide_cols + [target] + ['IS_TRAIN']]
    dummy_cols = [
        c for c in wide_cols
        if c in categorical_columns + list(crossed_columns_d.keys())]
    df_wide = pd.get_dummies(df_wide, columns=dummy_cols)
    train = df_wide[df_wide.IS_TRAIN == 1].drop('IS_TRAIN', axis=1)
    test = df_wide[df_wide.IS_TRAIN == 0].drop('IS_TRAIN', axis=1)
    assert all(train.columns == test.columns)
    cols = [c for c in train.columns if c != target]
    X_train = train[cols].values
    y_train = train[target].values.reshape(-1, 1)
    X_test = test[cols].values
    y_test = test[target].values.reshape(-1, 1)
    return X_train, y_train, X_test, y_test
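
# Note (not part of the original file): because the dummies are built on
# the concatenated frame, X_train and X_test share one column layout, and
# y_train/y_test are (n, 1) column vectors of 0/1 labels.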


def load_adult_data(return_val=True):
    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    df_train, df_test = maybe_download("train.csv", "test.csv")
    # Binarize the label: 1 if income is above 50K, 0 otherwise.
    df_train['income_label'] = (
        df_train["income_bracket"].apply(lambda x: ">50K" in x)).astype(int)
    df_test['income_label'] = (
        df_test["income_bracket"].apply(lambda x: ">50K" in x)).astype(int)
    # Bucket ages into three coarse groups.
    age_groups = [0, 25, 65, 90]
    age_labels = range(len(age_groups) - 1)
    df_train['age_group'] = pd.cut(
        df_train['age'], age_groups, labels=age_labels)
    df_test['age_group'] = pd.cut(
        df_test['age'], age_groups, labels=age_labels)

    # Columns for the wide model.
    wide_cols = ['workclass', 'education', 'marital_status', 'occupation',
                 'relationship', 'race', 'gender', 'native_country', 'age_group']
    x_cols = (['education', 'occupation'], ['native_country', 'occupation'])
    # Columns for the deep model.
    embedding_cols = ['workclass', 'education', 'marital_status', 'occupation',
                      'relationship', 'race', 'gender', 'native_country']
    cont_cols = ['age', 'capital_gain', 'capital_loss', 'hours_per_week']
    target = 'income_label'

    x_train_wide, y_train_wide, x_test_wide, y_test_wide = wide(
        df_train, df_test, wide_cols, x_cols, target)
    x_train_wide = np.array(x_train_wide).astype(np.float32)
    x_test_wide = np.array(x_test_wide).astype(np.float32)

    print('Processing deep data')
    df_train['IS_TRAIN'] = 1
    df_test['IS_TRAIN'] = 0
    df_deep = pd.concat([df_train, df_test])
    deep_cols = embedding_cols + cont_cols
    df_deep = df_deep[deep_cols + [target, 'IS_TRAIN']]
    # Standardize the continuous columns. Fit the scaler on the training
    # rows only, then transform the whole concatenated frame so the test
    # rows are scaled with the training statistics.
    scaler = StandardScaler().fit(df_train[cont_cols])
    df_deep[cont_cols] = scaler.transform(df_deep[cont_cols])
    df_deep, unique_vals = val2idx(df_deep, embedding_cols)
    train = df_deep[df_deep.IS_TRAIN == 1].drop('IS_TRAIN', axis=1)
    test = df_deep[df_deep.IS_TRAIN == 0].drop('IS_TRAIN', axis=1)
    x_train_deep = np.array([train[c] for c in deep_cols]).astype(np.float32)
    y_train = np.array(train[target].values).reshape(-1, 1).astype(np.int32)
    x_test_deep = np.array([test[c] for c in deep_cols]).astype(np.float32)
    y_test = np.array(test[target].values).reshape(-1, 1).astype(np.int32)
    # Stacking per-column arrays gives (n_cols, n_rows); transpose to the
    # usual (n_rows, n_cols) layout.
    x_train_deep = np.transpose(x_train_deep)
    x_test_deep = np.transpose(x_test_deep)
    y_train = onehot(y_train)
    y_test = onehot(y_test)
    if return_val:
        return x_train_deep, x_train_wide, y_train, x_test_deep, x_test_wide, y_test
    else:
        return x_train_deep, x_train_wide, y_train
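

# Minimal smoke-test sketch (not part of the original file). It assumes
# network access on the first run, since maybe_download fetches the UCI
# adult CSVs; it simply prints the shapes of the returned arrays.
if __name__ == '__main__':
    x_train_deep, x_train_wide, y_train, x_test_deep, x_test_wide, y_test = \
        load_adult_data(return_val=True)
    print('deep:', x_train_deep.shape, 'wide:', x_train_wide.shape,
          'labels:', y_train.shape)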