import numpy as np

# openml is only needed by load_and_preprocess_dataset. The import is guarded
# so that the NumPy-only split_dataset stays importable where openml is not
# installed; the loader raises ImportError itself in that case.
try:
    import openml
except ImportError:
    openml = None


# Function to load and preprocess the dataset
def load_and_preprocess_dataset(dataset_id):
    """Fetch an OpenML dataset and return its features and targets as arrays.

    The dataset is retrieved with ``openml.datasets.get_dataset`` (data
    downloaded, qualities and feature metadata skipped) and split into
    features and target using the dataset's ``default_target_attribute``.

    Preprocessing applied:
      * every boolean feature column is cast to ``int``;
      * the target is replaced by its categorical codes (``y.cat.codes``),
        so the target column must have a categorical dtype.

    Args:
        dataset_id: Identifier passed through to
            ``openml.datasets.get_dataset``.

    Returns:
        A pair ``(X, y)`` produced by calling ``to_numpy()`` on the feature
        table and on the integer-coded target.

    Raises:
        ImportError: If the ``openml`` package is not installed.
    """
    if openml is None:
        raise ImportError(
            "The 'openml' package is required by load_and_preprocess_dataset"
        )

    dataset = openml.datasets.get_dataset(
        dataset_id,
        download_data=True,
        download_qualities=False,
        download_features_meta_data=False,
    )
    # get_data returns four values; only features and target are used here.
    X, y, _, _ = dataset.get_data(target=dataset.default_target_attribute)

    # Convert data types
    for col in X.select_dtypes(include="bool").columns:
        X[col] = X[col].astype(int)
    y = y.cat.codes.astype(int)

    return X.to_numpy(), y.to_numpy()


# Function to split data (one shot)
def split_dataset(X, y, test_size=0.3, *, random_state=None):
    """Split data into one labeled sample per class, unlabeled and test sets.

    For every distinct value in ``y`` the indices of that class are shuffled
    and then divided as::

        1 : int((1 - test_size) * (n - 1)) : remainder

    i.e. one labeled sample, then the unlabeled pool, then the test samples,
    where ``n`` is the number of samples of that class. The unlabeled count
    is truncated with ``int``, so any rounding remainder goes to the test
    set.

    Args:
        X: Feature array, indexed along its first axis.
        y: Label array aligned with ``X``.
        test_size: Fraction in ``[0, 1]`` of each class's non-labeled
            samples assigned to the test set.
        random_state: Optional seed (or ``numpy.random.Generator``) for a
            reproducible shuffle. When ``None`` the global ``np.random``
            state is used, so ``np.random.seed`` still controls the split.

    Returns:
        ``(X_label, y_label, X_unlabel, y_unlabel, X_test, y_test)``.

    Raises:
        ValueError: If ``test_size`` is outside ``[0, 1]``.
    """
    # An out-of-range fraction makes the unlabeled count negative (or larger
    # than the class), which silently shifts the slices below and can place
    # the labeled sample in the test set.
    if not 0.0 <= test_size <= 1.0:
        raise ValueError(
            f"test_size must be between 0 and 1, got {test_size!r}"
        )

    if random_state is None:
        shuffle = np.random.shuffle
    else:
        shuffle = np.random.default_rng(random_state).shuffle

    # For every class: 1 : (1-test_size)*(len-1) : test_size*(len-1)
    label_indices, unlabel_indices, test_indices = [], [], []
    for class_label in np.unique(y):
        idxs = np.where(y == class_label)[0]
        shuffle(idxs)
        n_train_unlabel = int((1 - test_size) * (len(idxs) - 1))
        unlabel_end = 1 + n_train_unlabel
        label_indices.append(idxs[0])
        unlabel_indices.extend(idxs[1:unlabel_end])
        test_indices.extend(idxs[unlabel_end:])

    # Explicit integer index arrays keep empty selections well-typed.
    label_indices = np.asarray(label_indices, dtype=np.intp)
    unlabel_indices = np.asarray(unlabel_indices, dtype=np.intp)
    test_indices = np.asarray(test_indices, dtype=np.intp)

    X_label, y_label = X[label_indices], y[label_indices]
    X_unlabel, y_unlabel = X[unlabel_indices], y[unlabel_indices]
    X_test, y_test = X[test_indices], y[test_indices]
    return X_label, y_label, X_unlabel, y_unlabel, X_test, y_test