You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

get_dataset.py 1.3 kB

1234567891011121314151617181920212223242526272829
  1. import numpy as np
  2. import openml
  3. # Function to load and preprocess the dataset
  def load_and_preprocess_dataset(dataset_id):
      """Download an OpenML dataset and return it as NumPy arrays.

      The dataset's default target attribute is used as the label column.
      Boolean feature columns are cast to integers and the target is encoded
      as integer category codes.

      Args:
          dataset_id: Identifier passed straight to
              ``openml.datasets.get_dataset``.

      Returns:
          A ``(X, y)`` tuple: ``X`` is the feature table converted with
          ``to_numpy()`` and ``y`` is a 1-D array of integer class codes.

      Raises:
          ValueError: If the dataset yields no target column.
      """
      dataset = openml.datasets.get_dataset(
          dataset_id,
          download_data=True,
          download_qualities=False,
          download_features_meta_data=False,
      )
      # get_data returns a 4-tuple; only the features and target are needed.
      # NOTE(review): the code below assumes pandas objects are returned
      # (it relies on select_dtypes / astype) -- confirm for the installed
      # openml version.
      X, y, _, _ = dataset.get_data(target=dataset.default_target_attribute)
      if y is None:
          raise ValueError(
              f"Dataset {dataset_id!r} has no default target attribute; "
              "cannot build labels."
          )

      # Convert data types: booleans become 0/1 integers.
      for col in X.select_dtypes(include='bool').columns:
          X[col] = X[col].astype(int)

      # Coerce to categorical first so that targets which are not already of
      # category dtype (e.g. strings or plain numbers) do not fail on `.cat`.
      # For targets that are already categorical this is a no-op, so the
      # resulting codes are unchanged.
      y = y.astype('category').cat.codes.astype(int)

      return X.to_numpy(), y.to_numpy()
  13. # Function to split data (one shot)
  def split_dataset(X, y, test_size=0.3, random_state=None):
      """Split data into one labeled sample per class, unlabeled and test sets.

      For every class with ``n`` samples the (shuffled) indices are divided as
      ``1 : int((1 - test_size) * (n - 1)) : remainder``, i.e. one labeled
      sample, then the unlabeled training samples, then the test samples.

      Args:
          X: NumPy array of features, indexed along its first axis.
          y: 1-D NumPy array of class labels aligned with ``X``.
          test_size: Fraction (between 0 and 1 inclusive) of each class's
              non-labeled samples that goes to the test set.
          random_state: ``None`` to shuffle with NumPy's global random state
              (the historical behaviour), an ``int`` seed, or an existing
              ``np.random.RandomState`` / ``np.random.Generator``.

      Returns:
          ``(X_label, y_label, X_unlabel, y_unlabel, X_test, y_test)``.

      Raises:
          ValueError: If ``test_size`` is outside ``[0, 1]``.
      """
      # Out-of-range fractions make the slice bounds below negative or too
      # large, which can silently put the labeled sample into the test set.
      if not 0 <= test_size <= 1:
          raise ValueError(
              f"test_size must be between 0 and 1, got {test_size!r}"
          )

      if random_state is None:
          rng = np.random  # module-level functions use the global state
      elif isinstance(random_state, (np.random.RandomState, np.random.Generator)):
          rng = random_state
      else:
          rng = np.random.RandomState(random_state)

      label_indices, unlabel_indices, test_indices = [], [], []
      for class_label in np.unique(y):
          idxs = np.where(y == class_label)[0]
          rng.shuffle(idxs)
          n_train_unlabel = int((1 - test_size) * (len(idxs) - 1))
          split_at = 1 + n_train_unlabel
          label_indices.append(idxs[0])
          unlabel_indices.extend(idxs[1:split_at])
          test_indices.extend(idxs[split_at:])

      # Explicit integer arrays keep indexing valid even when a partition
      # ends up empty.
      label_indices = np.asarray(label_indices, dtype=np.intp)
      unlabel_indices = np.asarray(unlabel_indices, dtype=np.intp)
      test_indices = np.asarray(test_indices, dtype=np.intp)

      X_label, y_label = X[label_indices], y[label_indices]
      X_unlabel, y_unlabel = X[unlabel_indices], y[unlabel_indices]
      X_test, y_test = X[test_indices], y[test_indices]
      return X_label, y_label, X_unlabel, y_unlabel, X_test, y_test

An efficient Python toolkit for Abductive Learning (ABL), a novel paradigm that integrates machine learning and logical reasoning in a unified framework.