import numpy as np import six.moves.cPickle as pickle import gzip import os def load_mnist_data(dataset): """ Load the dataset Code adapted from http://deeplearning.net/tutorial/code/logistic_sgd.py :type dataset: string :param dataset: the path to the dataset (here MNIST) """ # Download the MNIST dataset if it is not present data_dir, data_file = os.path.split(dataset) if data_dir == "" and not os.path.isfile(dataset): # Check if dataset is in the data directory. new_path = os.path.join( os.path.split(__file__)[0], dataset ) if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz': dataset = new_path if (not os.path.isfile(dataset)) and data_file == 'mnist.pkl.gz': from six.moves import urllib origin = ( 'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz' ) print('Downloading data from %s' % origin) urllib.request.urlretrieve(origin, dataset) # Load the dataset with gzip.open(dataset, 'rb') as f: try: train_set, valid_set, test_set = pickle.load(f, encoding='latin1') except: train_set, valid_set, test_set = pickle.load(f) # train_set, valid_set, test_set format: tuple(input, target) # input is a numpy.ndarray of 2 dimensions (a matrix), np.float32 # where each row corresponds to an example. target is a # numpy.ndarray of 1 dimension (vector), np.int64 that has the same length # as the number of rows in the input. It should give the target # to the example with the same index in the input. return train_set, valid_set, test_set def convert_to_one_hot(vals, max_val=0): """Helper method to convert label array to one-hot array.""" if max_val == 0: max_val = vals.max() + 1 one_hot_vals = np.zeros((vals.size, max_val)) one_hot_vals[np.arange(vals.size), vals] = 1 return one_hot_vals ########################################################################### # adult ########################################################################### def maybe_download(train_data, test_data): import pandas as pd """if adult data "train.csv" and "test.csv" are not in your directory, download them. """ COLUMNS = ["age", "workclass", "fnlwgt", "education", "education_num", "marital_status", "occupation", "relationship", "race", "gender", "capital_gain", "capital_loss", "hours_per_week", "native_country", "income_bracket"] if not os.path.exists(train_data): print("downloading training data...") df_train = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", names=COLUMNS, skipinitialspace=True) else: df_train = pd.read_csv("train.csv") if not os.path.exists(test_data): print("downloading testing data...") df_test = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test", names=COLUMNS, skipinitialspace=True, skiprows=1) else: df_test = pd.read_csv("test.csv") return df_train, df_test def cross_columns(x_cols): """simple helper to build the crossed columns in a pandas dataframe """ crossed_columns = dict() colnames = ['_'.join(x_c) for x_c in x_cols] for cname, x_c in zip(colnames, x_cols): crossed_columns[cname] = x_c return crossed_columns def val2idx(df, cols): """helper to index categorical columns before embeddings. """ val_types = dict() for c in cols: val_types[c] = df[c].unique() val_to_idx = dict() for k, v in val_types.items(): val_to_idx[k] = {o: i for i, o in enumerate(val_types[k])} for k, v in val_to_idx.items(): df[k] = df[k].apply(lambda x: v[x]) unique_vals = dict() for c in cols: unique_vals[c] = df[c].nunique() return df, unique_vals def onehot(x): from sklearn.preprocessing import OneHotEncoder return np.array(OneHotEncoder().fit_transform(x).todense()) def wide(df_train, df_test, wide_cols, x_cols, target): import pandas as pd print('Processing wide data') df_train['IS_TRAIN'] = 1 df_test['IS_TRAIN'] = 0 df_wide = pd.concat([df_train, df_test]) crossed_columns_d = cross_columns(x_cols) categorical_columns = list( df_wide.select_dtypes(include=['object']).columns) wide_cols += list(crossed_columns_d.keys()) for k, v in crossed_columns_d.items(): df_wide[k] = df_wide[v].apply(lambda x: '-'.join(x), axis=1) df_wide = df_wide[wide_cols + [target] + ['IS_TRAIN']] dummy_cols = [ c for c in wide_cols if c in categorical_columns + list(crossed_columns_d.keys())] df_wide = pd.get_dummies(df_wide, columns=[x for x in dummy_cols]) train = df_wide[df_wide.IS_TRAIN == 1].drop('IS_TRAIN', axis=1) test = df_wide[df_wide.IS_TRAIN == 0].drop('IS_TRAIN', axis=1) assert all(train.columns == test.columns) cols = [c for c in train.columns if c != target] X_train = train[cols].values y_train = train[target].values.reshape(-1, 1) X_test = test[cols].values y_test = test[target].values.reshape(-1, 1) return X_train, y_train, X_test, y_test def load_adult_data(return_val=True): import pandas as pd df_train, df_test = maybe_download("train.csv", "test.csv") df_train['income_label'] = ( df_train["income_bracket"].apply(lambda x: ">50K" in x)).astype(int) df_test['income_label'] = ( df_test["income_bracket"].apply(lambda x: ">50K" in x)).astype(int) age_groups = [0, 25, 65, 90] age_labels = range(len(age_groups) - 1) df_train['age_group'] = pd.cut( df_train['age'], age_groups, labels=age_labels) df_test['age_group'] = pd.cut( df_test['age'], age_groups, labels=age_labels) # columns for wide model wide_cols = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'gender', 'native_country', 'age_group'] x_cols = (['education', 'occupation'], ['native_country', 'occupation']) # columns for deep model embedding_cols = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'gender', 'native_country'] cont_cols = ['age', 'capital_gain', 'capital_loss', 'hours_per_week'] target = 'income_label' x_train_wide, y_train_wide, x_test_wide, y_test_wide = wide( df_train, df_test, wide_cols, x_cols, target) x_train_wide = np.array(x_train_wide).astype(np.float32) x_test_wide = np.array(x_test_wide).astype(np.float32) print('Processing deep data') df_train['IS_TRAIN'] = 1 df_test['IS_TRAIN'] = 0 df_deep = pd.concat([df_train, df_test]) deep_cols = embedding_cols + cont_cols df_deep = df_deep[deep_cols + [target, 'IS_TRAIN']] from sklearn.preprocessing import StandardScaler scaler = StandardScaler() df_deep[cont_cols] = pd.DataFrame(scaler.fit_transform(df_train[cont_cols]), columns=cont_cols) df_deep, unique_vals = val2idx(df_deep, embedding_cols) train = df_deep[df_deep.IS_TRAIN == 1].drop('IS_TRAIN', axis=1) test = df_deep[df_deep.IS_TRAIN == 0].drop('IS_TRAIN', axis=1) x_train_deep = np.array([train[c] for c in deep_cols]).astype(np.float32) y_train = np.array(train[target].values).reshape(-1, 1).astype(np.int32) x_test_deep = np.array([test[c] for c in deep_cols]).astype(np.float32) y_test = np.array(test[target].values).reshape(-1, 1).astype(np.int32) x_train_deep = np.transpose(x_train_deep) x_test_deep = np.transpose(x_test_deep) y_train = onehot(y_train) y_test = onehot(y_test) if return_val: return x_train_deep, x_train_wide, y_train, x_test_deep, x_test_wide, y_test else: return x_train_deep, x_train_wide, y_train