import numpy as np
import six.moves.cPickle as pickle
import gzip
import os


def load_mnist_data(dataset):
    """Load the MNIST dataset.

    Code adapted from http://deeplearning.net/tutorial/code/logistic_sgd.py

    :type dataset: string
    :param dataset: the path to the dataset (here MNIST)
    """
    # Download the MNIST dataset if it is not present.
    data_dir, data_file = os.path.split(dataset)
    if data_dir == "" and not os.path.isfile(dataset):
        # Check if the dataset is in the same directory as this file.
        new_path = os.path.join(
            os.path.split(__file__)[0],
            dataset
        )
        if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz':
            dataset = new_path

    if (not os.path.isfile(dataset)) and data_file == 'mnist.pkl.gz':
        from six.moves import urllib
        origin = (
            'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz'
        )
        print('Downloading data from %s' % origin)
        urllib.request.urlretrieve(origin, dataset)

    # Load the dataset.
    with gzip.open(dataset, 'rb') as f:
        try:
            train_set, valid_set, test_set = pickle.load(f, encoding='latin1')
        except TypeError:
            # Python 2's cPickle.load does not accept the `encoding` argument.
            train_set, valid_set, test_set = pickle.load(f)
    # Each of train_set, valid_set, test_set is a tuple (input, target):
    # input is a 2-D numpy.ndarray of np.float32 with one example per row;
    # target is a 1-D numpy.ndarray of np.int64 with as many entries as
    # input has rows, where entry i is the label of example i.
    return train_set, valid_set, test_set
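

# A minimal usage sketch (not called anywhere in this module): load the
# standard MNIST pickle and one-hot encode the training labels. Assumes
# 'mnist.pkl.gz' is present next to this file or can be downloaded above.
def _demo_load_mnist():
    train_set, valid_set, test_set = load_mnist_data('mnist.pkl.gz')
    train_x, train_y = train_set  # (50000, 784) float32, (50000,) int64
    train_y_one_hot = convert_to_one_hot(train_y, max_val=10)  # (50000, 10)
    return train_x, train_y_one_hot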


def convert_to_one_hot(vals, max_val=0):
    """Helper method to convert a label array to a one-hot array."""
    if max_val == 0:
        max_val = vals.max() + 1
    one_hot_vals = np.zeros((vals.size, max_val))
    one_hot_vals[np.arange(vals.size), vals] = 1
    return one_hot_vals
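

# A quick illustrative check of convert_to_one_hot on a toy label vector;
# this demo is a sketch only and is not used by the loaders below.
def _demo_convert_to_one_hot():
    labels = np.array([0, 2, 1, 2])
    one_hot = convert_to_one_hot(labels)  # shape (4, 3)
    # [[1, 0, 0],
    #  [0, 0, 1],
    #  [0, 1, 0],
    #  [0, 0, 1]]
    return one_hot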

###########################################################################
# adult
###########################################################################


def maybe_download(train_data, test_data):
    """Download the adult data files to `train_data` and `test_data` if
    they are not already in the working directory.
    """
    import pandas as pd

    COLUMNS = ["age", "workclass", "fnlwgt", "education", "education_num",
               "marital_status", "occupation", "relationship", "race", "gender",
               "capital_gain", "capital_loss", "hours_per_week", "native_country",
               "income_bracket"]

    if not os.path.exists(train_data):
        print("downloading training data...")
        df_train = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
                               names=COLUMNS, skipinitialspace=True)
        # Cache locally so later runs can skip the download.
        df_train.to_csv(train_data, index=False)
    else:
        df_train = pd.read_csv(train_data)

    if not os.path.exists(test_data):
        print("downloading testing data...")
        df_test = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test",
                              names=COLUMNS, skipinitialspace=True, skiprows=1)
        df_test.to_csv(test_data, index=False)
    else:
        df_test = pd.read_csv(test_data)

    return df_train, df_test


def cross_columns(x_cols):
    """Simple helper to build the crossed columns for a pandas dataframe."""
    crossed_columns = dict()
    colnames = ['_'.join(x_c) for x_c in x_cols]
    for cname, x_c in zip(colnames, x_cols):
        crossed_columns[cname] = x_c
    return crossed_columns
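

# Minimal sketch of cross_columns: each cross is keyed by its column names
# joined with '_' and maps back to the original column list.
def _demo_cross_columns():
    crossed = cross_columns((['education', 'occupation'],
                             ['native_country', 'occupation']))
    # {'education_occupation': ['education', 'occupation'],
    #  'native_country_occupation': ['native_country', 'occupation']}
    return crossed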


def val2idx(df, cols):
    """Helper to index categorical columns before building embeddings."""
    val_types = dict()
    for c in cols:
        val_types[c] = df[c].unique()

    val_to_idx = dict()
    for k, v in val_types.items():
        val_to_idx[k] = {o: i for i, o in enumerate(v)}

    for k, v in val_to_idx.items():
        df[k] = df[k].apply(lambda x: v[x])

    unique_vals = dict()
    for c in cols:
        unique_vals[c] = df[c].nunique()

    return df, unique_vals
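

# Minimal sketch of val2idx on a hypothetical toy DataFrame: categorical
# values are replaced by integer indices in order of first appearance.
def _demo_val2idx():
    import pandas as pd
    df = pd.DataFrame({'color': ['red', 'blue', 'red', 'green']})
    df, unique_vals = val2idx(df, ['color'])
    # df['color'] -> [0, 1, 0, 2]; unique_vals -> {'color': 3}
    return df, unique_vals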


def onehot(x):
    from sklearn.preprocessing import OneHotEncoder
    # fit_transform returns a sparse matrix; densify it to a numpy array.
    return OneHotEncoder().fit_transform(x).toarray()


def wide(df_train, df_test, wide_cols, x_cols, target):
    import pandas as pd
    print('Processing wide data')
    df_train['IS_TRAIN'] = 1
    df_test['IS_TRAIN'] = 0
    df_wide = pd.concat([df_train, df_test])

    crossed_columns_d = cross_columns(x_cols)
    categorical_columns = list(
        df_wide.select_dtypes(include=['object']).columns)

    # Copy so the caller's list is not mutated in place.
    wide_cols = wide_cols + list(crossed_columns_d.keys())

    for k, v in crossed_columns_d.items():
        df_wide[k] = df_wide[v].apply(lambda x: '-'.join(x), axis=1)

    df_wide = df_wide[wide_cols + [target] + ['IS_TRAIN']]

    dummy_cols = [
        c for c in wide_cols if c in categorical_columns + list(crossed_columns_d.keys())]
    df_wide = pd.get_dummies(df_wide, columns=dummy_cols)

    train = df_wide[df_wide.IS_TRAIN == 1].drop('IS_TRAIN', axis=1)
    test = df_wide[df_wide.IS_TRAIN == 0].drop('IS_TRAIN', axis=1)
    assert all(train.columns == test.columns)

    cols = [c for c in train.columns if c != target]
    X_train = train[cols].values
    y_train = train[target].values.reshape(-1, 1)
    X_test = test[cols].values
    y_test = test[target].values.reshape(-1, 1)
    return X_train, y_train, X_test, y_test
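

# Minimal sketch of the wide pipeline on hypothetical toy frames. Because
# get_dummies runs on the concatenated frame, train and test end up with
# identical one-hot columns even when a crossed value (here 'a-y') appears
# in only one split.
def _demo_wide():
    import pandas as pd
    df_tr = pd.DataFrame({'education': ['a', 'b'], 'occupation': ['x', 'y'],
                          'label': [0, 1]})
    df_te = pd.DataFrame({'education': ['a'], 'occupation': ['y'],
                          'label': [0]})
    return wide(df_tr, df_te, ['education', 'occupation'],
                (['education', 'occupation'],), 'label')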


def load_adult_data(return_val=True):
    import pandas as pd
    df_train, df_test = maybe_download("train.csv", "test.csv")

    df_train['income_label'] = (
        df_train["income_bracket"].apply(lambda x: ">50K" in x)).astype(int)
    df_test['income_label'] = (
        df_test["income_bracket"].apply(lambda x: ">50K" in x)).astype(int)

    age_groups = [0, 25, 65, 90]
    age_labels = range(len(age_groups) - 1)
    df_train['age_group'] = pd.cut(
        df_train['age'], age_groups, labels=age_labels)
    df_test['age_group'] = pd.cut(
        df_test['age'], age_groups, labels=age_labels)

    # columns for the wide model
    wide_cols = ['workclass', 'education', 'marital_status', 'occupation',
                 'relationship', 'race', 'gender', 'native_country', 'age_group']
    x_cols = (['education', 'occupation'], ['native_country', 'occupation'])

    # columns for the deep model
    embedding_cols = ['workclass', 'education', 'marital_status', 'occupation',
                      'relationship', 'race', 'gender', 'native_country']
    cont_cols = ['age', 'capital_gain', 'capital_loss', 'hours_per_week']

    target = 'income_label'

    x_train_wide, y_train_wide, x_test_wide, y_test_wide = wide(
        df_train, df_test, wide_cols, x_cols, target)
    x_train_wide = np.array(x_train_wide).astype(np.float32)
    x_test_wide = np.array(x_test_wide).astype(np.float32)

    print('Processing deep data')
    df_train['IS_TRAIN'] = 1
    df_test['IS_TRAIN'] = 0
    df_deep = pd.concat([df_train, df_test])

    deep_cols = embedding_cols + cont_cols
    df_deep = df_deep[deep_cols + [target, 'IS_TRAIN']]
    from sklearn.preprocessing import StandardScaler
    # Fit the scaler on the training rows only, then standardize the
    # continuous columns of the combined frame with those statistics.
    scaler = StandardScaler().fit(df_train[cont_cols])
    df_deep[cont_cols] = scaler.transform(df_deep[cont_cols])
    df_deep, unique_vals = val2idx(df_deep, embedding_cols)

    train = df_deep[df_deep.IS_TRAIN == 1].drop('IS_TRAIN', axis=1)
    test = df_deep[df_deep.IS_TRAIN == 0].drop('IS_TRAIN', axis=1)

    x_train_deep = train[deep_cols].values.astype(np.float32)
    y_train = train[target].values.reshape(-1, 1).astype(np.int32)
    x_test_deep = test[deep_cols].values.astype(np.float32)
    y_test = test[target].values.reshape(-1, 1).astype(np.int32)

    y_train = onehot(y_train)
    y_test = onehot(y_test)

    if return_val:
        return x_train_deep, x_train_wide, y_train, x_test_deep, x_test_wide, y_test
    else:
        return x_train_deep, x_train_wide, y_train
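

# Minimal end-to-end sketch; guarded so importing this module has no side
# effects. The first run needs network access to fetch the adult data.
if __name__ == '__main__':
    x_tr_deep, x_tr_wide, y_tr, x_te_deep, x_te_wide, y_te = load_adult_data()
    print('deep features:', x_tr_deep.shape)   # (n_train, 12)
    print('wide features:', x_tr_wide.shape)
    print('one-hot labels:', y_tr.shape)       # (n_train, 2)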