diff --git a/examples/dataset_m5_workflow/main.py b/examples/dataset_m5_workflow/main.py index 74338b9..763669d 100644 --- a/examples/dataset_m5_workflow/main.py +++ b/examples/dataset_m5_workflow/main.py @@ -176,6 +176,8 @@ class M5DatasetWorkflow: mixture_id = " ".join([learnware.id for learnware in mixture_learnware_list]) print(f"mixture_score: {mixture_score}, mixture_learnware: {mixture_id}") + if not mixture_learnware_list: + mixture_learnware_list = [single_learnware_list[0]] reuse_job_selector = JobSelectorReuser(learnware_list=mixture_learnware_list, use_herding=False) job_selector_predict_y = reuse_job_selector.predict(user_data=test_x) diff --git a/examples/dataset_pfs_workflow/main.py b/examples/dataset_pfs_workflow/main.py index 356cdbc..784c383 100644 --- a/examples/dataset_pfs_workflow/main.py +++ b/examples/dataset_pfs_workflow/main.py @@ -173,6 +173,8 @@ class PFSDatasetWorkflow: mixture_id = " ".join([learnware.id for learnware in mixture_learnware_list]) print(f"mixture_score: {mixture_score}, mixture_learnware: {mixture_id}") + if not mixture_learnware_list: + mixture_learnware_list = [single_learnware_list[0]] reuse_job_selector = JobSelectorReuser(learnware_list=mixture_learnware_list, use_herding=False) job_selector_predict_y = reuse_job_selector.predict(user_data=test_x) diff --git a/examples/dataset_text_workflow/example_files/example_init.py b/examples/dataset_text_workflow/example_files/example_init.py index f15d335..1772a19 100644 --- a/examples/dataset_text_workflow/example_files/example_init.py +++ b/examples/dataset_text_workflow/example_files/example_init.py @@ -1,54 +1,29 @@ import os -import joblib +import pickle + import numpy as np + from learnware.model import BaseModel -import torch -from torchtext.models import RobertaClassificationHead, XLMR_BASE_ENCODER -import torchtext.functional as F -import torchtext.transforms as T -from torch.hub import load_state_dict_from_url class Model(BaseModel): def __init__(self): - 
super().__init__(input_shape=None, output_shape=(2,)) + super(Model, self).__init__(input_shape=(1,), output_shape=(1,)) dir_path = os.path.dirname(os.path.abspath(__file__)) - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - num_classes = 2 - input_dim = 768 - classifier_head = RobertaClassificationHead(num_classes=num_classes, input_dim=input_dim) - self.model = XLMR_BASE_ENCODER.get_model(head=classifier_head).to(self.device) - self.model.load_state_dict(torch.load(os.path.join(dir_path, "model.pth"), map_location=torch.device("cpu"))) + modelv_path = os.path.join(dir_path, "modelv.pth") + with open(modelv_path, "rb") as f: + self.modelv = pickle.load(f) + + modell_path = os.path.join(dir_path, "modell.pth") + with open(modell_path, "rb") as f: + self.modell = pickle.load(f) def fit(self, X: np.ndarray, y: np.ndarray): pass def predict(self, X: np.ndarray) -> np.ndarray: - X = sentence_preprocess(X) - X = F.to_tensor(X, padding_value=1).to(self.device) - return self.model(X) + return self.modell.predict(self.modelv.transform(X)) def finetune(self, X: np.ndarray, y: np.ndarray): pass - - -def sentence_preprocess(x_datapipe): - padding_idx = 1 - bos_idx = 0 - eos_idx = 2 - max_seq_len = 256 - xlmr_vocab_path = r"https://download.pytorch.org/models/text/xlmr.vocab.pt" - xlmr_spm_model_path = r"https://download.pytorch.org/models/text/xlmr.sentencepiece.bpe.model" - - text_transform = T.Sequential( - T.SentencePieceTokenizer(xlmr_spm_model_path), - T.VocabTransform(load_state_dict_from_url(xlmr_vocab_path)), - T.Truncate(max_seq_len - 2), - T.AddToken(token=bos_idx, begin=True), - T.AddToken(token=eos_idx, begin=False), - ) - - x_datapipe = [text_transform(x) for x in x_datapipe] - # x_datapipe = x_datapipe.map(text_transform) - return x_datapipe diff --git a/examples/dataset_text_workflow/example_files/example_yaml.yaml b/examples/dataset_text_workflow/example_files/example_yaml.yaml index f9817c7..d29f7dd 100644 --- 
a/examples/dataset_text_workflow/example_files/example_yaml.yaml +++ b/examples/dataset_text_workflow/example_files/example_yaml.yaml @@ -1,8 +1,8 @@ model: class_name: Model - kwargs: {} + kwargs: { } stat_specifications: - module_path: learnware.specification class_name: RKMETextSpecification file_name: rkme.json - kwargs: {} \ No newline at end of file + kwargs: { } \ No newline at end of file diff --git a/examples/dataset_text_workflow/example_files/requirements.txt b/examples/dataset_text_workflow/example_files/requirements.txt index 8d26293..da986b1 100644 --- a/examples/dataset_text_workflow/example_files/requirements.txt +++ b/examples/dataset_text_workflow/example_files/requirements.txt @@ -1,3 +1,3 @@ -torch==2.0.1 -torchdata -torchtext \ No newline at end of file +numpy +lightgbm +scikit-learn \ No newline at end of file diff --git a/examples/dataset_text_workflow/get_data.py b/examples/dataset_text_workflow/get_data.py index 770fd80..a55e8d7 100644 --- a/examples/dataset_text_workflow/get_data.py +++ b/examples/dataset_text_workflow/get_data.py @@ -1,14 +1,16 @@ -from torchtext.datasets import SST2 +import os +import pandas as pd -def get_sst2(data_root="./data"): - train_datapipe = SST2(root="./data", split="train") - X_train = [x[0] for x in train_datapipe] - y_train = [x[1] for x in train_datapipe] +def get_data(data_root="./data"): + dtrain = pd.read_csv(os.path.join(data_root, "train.csv")) + dtest = pd.read_csv(os.path.join(data_root, "test.csv")) - dev_datapipe = SST2(root="./data", split="dev") - - X_test = [x[0] for x in dev_datapipe] - y_test = [x[1] for x in dev_datapipe] - return X_train, y_train, X_test, y_test + # returned X(DataFrame), y(Series) + return ( + dtrain[["discourse_text", "discourse_type"]], + dtrain["discourse_effectiveness"], + dtest[["discourse_text", "discourse_type"]], + dtest["discourse_effectiveness"], + ) diff --git a/examples/dataset_text_workflow/main.py b/examples/dataset_text_workflow/main.py index 
179bf53..72de04e 100644 --- a/examples/dataset_text_workflow/main.py +++ b/examples/dataset_text_workflow/main.py @@ -1,30 +1,28 @@ -import numpy as np -import torch -from get_data import get_sst2 import os -import random -from utils import generate_uploader, generate_user, TextDataLoader, train, eval_prediction -from learnware.learnware import Learnware -from learnware.reuse import JobSelectorReuser, AveragingReuser, EnsemblePruningReuser -import time +import fire import pickle +import time +import zipfile +from shutil import copyfile, rmtree -from learnware.market import instantiate_learnware_market, BaseUserInfo -from learnware.specification import RKMETextSpecification -from learnware.logger import get_module_logger +import numpy as np -from shutil import copyfile, rmtree -import zipfile +import learnware.specification as specification +from get_data import get_data +from learnware.logger import get_module_logger +from learnware.market import instantiate_learnware_market, BaseUserInfo +from learnware.reuse import JobSelectorReuser, AveragingReuser, EnsemblePruningReuser +from utils import generate_uploader, generate_user, TextDataLoader, train, eval_prediction logger = get_module_logger("text_test", level="INFO") origin_data_root = "./data/origin_data" processed_data_root = "./data/processed_data" tmp_dir = "./data/tmp" learnware_pool_dir = "./data/learnware_pool" -dataset = "sst2" -n_uploaders = 10 -n_users = 5 -n_classes = 2 +dataset = "ae" # argumentative essays +n_uploaders = 7 +n_users = 7 +n_classes = 3 data_root = os.path.join(origin_data_root, dataset) data_save_root = os.path.join(processed_data_root, dataset) user_save_root = os.path.join(data_save_root, "user") @@ -36,18 +34,17 @@ os.makedirs(uploader_save_root, exist_ok=True) os.makedirs(model_save_root, exist_ok=True) output_description = { - "Dimension": 2, + "Dimension": 1, "Description": { - "0": "the probability of being negative", - "1": "the probability of being positive", + "0": "classify as 
0(ineffective), 1(effective), or 2(adequate).", }, } semantic_specs = [ { "Data": {"Values": ["Text"], "Type": "Class"}, "Task": {"Values": ["Classification"], "Type": "Class"}, - "Library": {"Values": ["PyTorch"], "Type": "Class"}, - "Scenario": {"Values": ["Business"], "Type": "Tag"}, + "Library": {"Values": ["Scikit-learn"], "Type": "Class"}, + "Scenario": {"Values": ["Education"], "Type": "Tag"}, "Description": {"Values": "", "Type": "String"}, "Name": {"Values": "learnware_1", "Type": "String"}, "Output": output_description, @@ -57,176 +54,220 @@ semantic_specs = [ user_semantic = { "Data": {"Values": ["Text"], "Type": "Class"}, "Task": {"Values": ["Classification"], "Type": "Class"}, - "Library": {"Values": ["PyTorch"], "Type": "Class"}, - "Scenario": {"Values": ["Business"], "Type": "Tag"}, + "Library": {"Values": ["Scikit-learn"], "Type": "Class"}, + "Scenario": {"Values": ["Education"], "Type": "Tag"}, "Description": {"Values": "", "Type": "String"}, "Name": {"Values": "", "Type": "String"}, "Output": output_description, } -def prepare_data(): - if dataset == "sst2": - X_train, y_train, X_test, y_test = get_sst2(data_root) - else: - return - generate_uploader(X_train, y_train, n_uploaders=n_uploaders, data_save_root=uploader_save_root) - generate_user(X_test, y_test, n_users=n_users, data_save_root=user_save_root) - - -def prepare_model(): - dataloader = TextDataLoader(data_save_root, train=True) - for i in range(n_uploaders): - logger.info("Train on uploader: %d" % (i)) - X, y = dataloader.get_idx_data(i) - model = train(X, y, out_classes=n_classes) - model_save_path = os.path.join(model_save_root, "uploader_%d.pth" % (i)) - torch.save(model.state_dict(), model_save_path) - logger.info("Model saved to '%s'" % (model_save_path)) - - -def prepare_learnware(data_path, model_path, init_file_path, yaml_path, env_file_path, save_root, zip_name): - os.makedirs(save_root, exist_ok=True) - tmp_spec_path = os.path.join(save_root, "rkme.json") - tmp_model_path = 
os.path.join(save_root, "model.pth") - tmp_yaml_path = os.path.join(save_root, "learnware.yaml") - tmp_init_path = os.path.join(save_root, "__init__.py") - tmp_env_path = os.path.join(save_root, "requirements.txt") - - with open(data_path, "rb") as f: - X = pickle.load(f) - semantic_spec = semantic_specs[0] - - st = time.time() - user_spec = RKMETextSpecification() - user_spec.generate_stat_spec_from_data(X=X) - ed = time.time() - logger.info("Stat spec generated in %.3f s" % (ed - st)) - user_spec.save(tmp_spec_path) - copyfile(model_path, tmp_model_path) - copyfile(yaml_path, tmp_yaml_path) - copyfile(init_file_path, tmp_init_path) - copyfile(env_file_path, tmp_env_path) - zip_file_name = os.path.join(learnware_pool_dir, "%s.zip" % (zip_name)) - with zipfile.ZipFile(zip_file_name, "w", compression=zipfile.ZIP_DEFLATED) as zip_obj: - zip_obj.write(tmp_spec_path, "rkme.json") - zip_obj.write(tmp_model_path, "model.pth") - zip_obj.write(tmp_yaml_path, "learnware.yaml") - zip_obj.write(tmp_init_path, "__init__.py") - zip_obj.write(tmp_env_path, "requirements.txt") - rmtree(save_root) - logger.info("New Learnware Saved to %s" % (zip_file_name)) - return zip_file_name - - -def prepare_market(): - text_market = instantiate_learnware_market(market_id="sst2", rebuild=True) - try: - rmtree(learnware_pool_dir) - except: - pass - os.makedirs(learnware_pool_dir, exist_ok=True) - for i in range(n_uploaders): - data_path = os.path.join(uploader_save_root, "uploader_%d_X.pkl" % (i)) - model_path = os.path.join(model_save_root, "uploader_%d.pth" % (i)) - init_file_path = "./example_files/example_init.py" - yaml_file_path = "./example_files/example_yaml.yaml" - env_file_path = "./example_files/requirements.txt" - new_learnware_path = prepare_learnware( - data_path, model_path, init_file_path, yaml_file_path, env_file_path, tmp_dir, "%s_%d" % (dataset, i) - ) +class TextDatasetWorkflow: + def _init_text_dataset(self): + self._prepare_data() + self._prepare_model() + + def 
_prepare_data(self): + X_train, y_train, X_test, y_test = get_data(data_root) + + generate_uploader(X_train, y_train, n_uploaders=n_uploaders, data_save_root=uploader_save_root) + generate_user(X_test, y_test, n_users=n_users, data_save_root=user_save_root) + + + def _prepare_model(self): + dataloader = TextDataLoader(data_save_root, train=True) + for i in range(n_uploaders): + logger.info("Train on uploader: %d" % (i)) + X, y = dataloader.get_idx_data(i) + vectorizer, lgbm = train(X, y, out_classes=n_classes) + + modelv_save_path = os.path.join(model_save_root, "uploader_v_%d.pth" % (i)) + modell_save_path = os.path.join(model_save_root, "uploader_l_%d.pth" % (i)) + + with open(modelv_save_path, "wb") as f: + pickle.dump(vectorizer, f) + + with open(modell_save_path, "wb") as f: + pickle.dump(lgbm, f) + + logger.info("Model saved to '%s' and '%s'" % (modelv_save_path, modell_save_path)) + + + def _prepare_learnware(self, + data_path, modelv_path, modell_path, init_file_path, yaml_path, env_file_path, save_root, zip_name + ): + os.makedirs(save_root, exist_ok=True) + tmp_spec_path = os.path.join(save_root, "rkme.json") + + tmp_modelv_path = os.path.join(save_root, "modelv.pth") + tmp_modell_path = os.path.join(save_root, "modell.pth") + + tmp_yaml_path = os.path.join(save_root, "learnware.yaml") + tmp_init_path = os.path.join(save_root, "__init__.py") + tmp_env_path = os.path.join(save_root, "requirements.txt") + + with open(data_path, "rb") as f: + X = pickle.load(f) semantic_spec = semantic_specs[0] - semantic_spec["Name"]["Values"] = "learnware_%d" % (i) - semantic_spec["Description"]["Values"] = "test_learnware_number_%d" % (i) - text_market.add_learnware(new_learnware_path, semantic_spec) - - logger.info("Total Item: %d" % (len(text_market))) - - -def test_search(gamma=0.1, load_market=True): - if load_market: - text_market = instantiate_learnware_market(market_id="sst2") - else: - prepare_market() - text_market = instantiate_learnware_market(market_id="sst2") 
- logger.info("Number of items in the market: %d" % len(text_market)) - - select_list = [] - avg_list = [] - improve_list = [] - job_selector_score_list = [] - ensemble_score_list = [] - pruning_score_list = [] - for i in range(n_users): - user_data_path = os.path.join(user_save_root, "user_%d_X.pkl" % (i)) - user_label_path = os.path.join(user_save_root, "user_%d_y.pkl" % (i)) - with open(user_data_path, "rb") as f: - user_data = pickle.load(f) - with open(user_label_path, "rb") as f: - user_label = pickle.load(f) - - user_stat_spec = RKMETextSpecification() - user_stat_spec.generate_stat_spec_from_data(X=user_data) - user_info = BaseUserInfo(semantic_spec=user_semantic, stat_info={"RKMETextSpecification": user_stat_spec}) - logger.info("Searching Market for user: %d" % (i)) - sorted_score_list, single_learnware_list, mixture_score, mixture_learnware_list = text_market.search_learnware( - user_info + + st = time.time() + + user_spec = specification.RKMETextSpecification() + + user_spec.generate_stat_spec_from_data(X=X) + ed = time.time() + logger.info("Stat spec generated in %.3f s" % (ed - st)) + user_spec.save(tmp_spec_path) + + copyfile(modelv_path, tmp_modelv_path) + copyfile(modell_path, tmp_modell_path) + + copyfile(yaml_path, tmp_yaml_path) + copyfile(init_file_path, tmp_init_path) + copyfile(env_file_path, tmp_env_path) + zip_file_name = os.path.join(learnware_pool_dir, "%s.zip" % (zip_name)) + with zipfile.ZipFile(zip_file_name, "w", compression=zipfile.ZIP_DEFLATED) as zip_obj: + zip_obj.write(tmp_spec_path, "rkme.json") + + zip_obj.write(tmp_modelv_path, "modelv.pth") + zip_obj.write(tmp_modell_path, "modell.pth") + + zip_obj.write(tmp_yaml_path, "learnware.yaml") + zip_obj.write(tmp_init_path, "__init__.py") + zip_obj.write(tmp_env_path, "requirements.txt") + rmtree(save_root) + logger.info("New Learnware Saved to %s" % (zip_file_name)) + return zip_file_name + + + def prepare_market(self, regenerate_flag=False): + if regenerate_flag: + 
self._init_text_dataset() + text_market = instantiate_learnware_market(market_id="ae", rebuild=True) + try: + rmtree(learnware_pool_dir) + except: + pass + os.makedirs(learnware_pool_dir, exist_ok=True) + for i in range(n_uploaders): + data_path = os.path.join(uploader_save_root, "uploader_%d_X.pkl" % (i)) + + modelv_path = os.path.join(model_save_root, "uploader_v_%d.pth" % (i)) + modell_path = os.path.join(model_save_root, "uploader_l_%d.pth" % (i)) + + init_file_path = "./example_files/example_init.py" + yaml_file_path = "./example_files/example_yaml.yaml" + env_file_path = "./example_files/requirements.txt" + new_learnware_path = self._prepare_learnware( + data_path, + modelv_path, + modell_path, + init_file_path, + yaml_file_path, + env_file_path, + tmp_dir, + "%s_%d" % (dataset, i), + ) + semantic_spec = semantic_specs[0] + semantic_spec["Name"]["Values"] = "learnware_%d" % (i) + semantic_spec["Description"]["Values"] = "test_learnware_number_%d" % (i) + text_market.add_learnware(new_learnware_path, semantic_spec) + + logger.info("Total Item: %d" % (len(text_market))) + + + def test(self, regenerate_flag=False): + self.prepare_market(regenerate_flag) + text_market = instantiate_learnware_market(market_id="ae") + print("Total Item: %d" % len(text_market)) + + select_list = [] + avg_list = [] + improve_list = [] + job_selector_score_list = [] + ensemble_score_list = [] + pruning_score_list = [] + for i in range(n_users): + user_data_path = os.path.join(user_save_root, "user_%d_X.pkl" % (i)) + user_label_path = os.path.join(user_save_root, "user_%d_y.pkl" % (i)) + with open(user_data_path, "rb") as f: + user_data = pickle.load(f) + with open(user_label_path, "rb") as f: + user_label = pickle.load(f) + + user_stat_spec = specification.RKMETextSpecification() + user_stat_spec.generate_stat_spec_from_data(X=user_data) + user_info = BaseUserInfo(semantic_spec=user_semantic, stat_info={"RKMETextSpecification": user_stat_spec}) + logger.info("Searching Market for 
user: %d" % (i)) + sorted_score_list, single_learnware_list, mixture_score, mixture_learnware_list = text_market.search_learnware( + user_info + ) + + print(f"search result of user{i}:") + print( + f"single model num: {len(sorted_score_list)}, max_score: {sorted_score_list[0]}, min_score: {sorted_score_list[-1]}" + ) + + l = len(sorted_score_list) + acc_list = [] + for idx in range(l): + learnware = single_learnware_list[idx] + score = sorted_score_list[idx] + pred_y = learnware.predict(user_data) + acc = eval_prediction(pred_y, user_label) + acc_list.append(acc) + print( + f"Top1-score: {sorted_score_list[0]}, learnware_id: {single_learnware_list[0].id}, acc: {acc_list[0]}" + ) + + mixture_id = " ".join([learnware.id for learnware in mixture_learnware_list]) + print(f"mixture_score: {mixture_score}, mixture_learnware: {mixture_id}") + if not mixture_learnware_list: + mixture_learnware_list = [single_learnware_list[0]] + + # test reuse (job selector) + reuse_baseline = JobSelectorReuser(learnware_list=mixture_learnware_list, herding_num=100) + reuse_predict = reuse_baseline.predict(user_data=user_data) + reuse_score = eval_prediction(reuse_predict, user_label) + job_selector_score_list.append(reuse_score) + print(f"mixture reuse loss(job selector): {reuse_score}") + + # test reuse (ensemble) + reuse_ensemble = AveragingReuser(learnware_list=mixture_learnware_list, mode="vote_by_label") + ensemble_predict_y = reuse_ensemble.predict(user_data=user_data) + ensemble_score = eval_prediction(ensemble_predict_y, user_label) + ensemble_score_list.append(ensemble_score) + print(f"mixture reuse accuracy (ensemble): {ensemble_score}") + + # test reuse (ensemblePruning) + reuse_pruning = EnsemblePruningReuser(learnware_list=mixture_learnware_list) + pruning_predict_y = reuse_pruning.predict(user_data=user_data) + pruning_score = eval_prediction(pruning_predict_y, user_label) + pruning_score_list.append(pruning_score) + print(f"mixture reuse accuracy (ensemble Pruning): 
{pruning_score}\n") + + select_list.append(acc_list[0]) + avg_list.append(np.mean(acc_list)) + improve_list.append((acc_list[0] - np.mean(acc_list)) / np.mean(acc_list)) + + logger.info( + "Accuracy of selected learnware: %.3f +/- %.3f, Average performance: %.3f +/- %.3f" + % (np.mean(select_list), np.std(select_list), np.mean(avg_list), np.std(avg_list)) + ) + logger.info("Average performance improvement: %.3f" % (np.mean(improve_list))) + logger.info( + "Average Job Selector Reuse Performance: %.3f +/- %.3f" + % (np.mean(job_selector_score_list), np.std(job_selector_score_list)) + ) + logger.info( + "Averaging Ensemble Reuse Performance: %.3f +/- %.3f" + % (np.mean(ensemble_score_list), np.std(ensemble_score_list)) + ) + logger.info( + "Selective Ensemble Reuse Performance: %.3f +/- %.3f" + % (np.mean(pruning_score_list), np.std(pruning_score_list)) ) - l = len(sorted_score_list) - acc_list = [] - for idx in range(l): - learnware = single_learnware_list[idx] - score = sorted_score_list[idx] - pred_y = learnware.predict(user_data) - acc = eval_prediction(pred_y, user_label) - acc_list.append(acc) - logger.info("search rank: %d, score: %.3f, learnware_id: %s, acc: %.3f" % (idx, score, learnware.id, acc)) - - # test reuse (job selector) - reuse_baseline = JobSelectorReuser(learnware_list=mixture_learnware_list, herding_num=100) - reuse_predict = reuse_baseline.predict(user_data=user_data) - reuse_score = eval_prediction(reuse_predict, user_label) - job_selector_score_list.append(reuse_score) - print(f"mixture reuse loss(job selector): {reuse_score}") - - # test reuse (ensemble) - reuse_ensemble = AveragingReuser(learnware_list=mixture_learnware_list, mode="vote_by_label") - ensemble_predict_y = reuse_ensemble.predict(user_data=user_data) - ensemble_score = eval_prediction(ensemble_predict_y, user_label) - ensemble_score_list.append(ensemble_score) - print(f"mixture reuse accuracy (ensemble): {ensemble_score}") - - # test reuse (ensemblePruning) - reuse_pruning = 
EnsemblePruningReuser(learnware_list=mixture_learnware_list) - pruning_predict_y = reuse_pruning.predict(user_data=user_data) - pruning_score = eval_prediction(pruning_predict_y, user_label) - pruning_score_list.append(pruning_score) - print(f"mixture reuse accuracy (ensemble Pruning): {pruning_score}\n") - - select_list.append(acc_list[0]) - avg_list.append(np.mean(acc_list)) - improve_list.append((acc_list[0] - np.mean(acc_list)) / np.mean(acc_list)) - - logger.info( - "Accuracy of selected learnware: %.3f +/- %.3f, Average performance: %.3f +/- %.3f" - % (np.mean(select_list), np.std(select_list), np.mean(avg_list), np.std(avg_list)) - ) - logger.info("Average performance improvement: %.3f" % (np.mean(improve_list))) - logger.info( - "Average Job Selector Reuse Performance: %.3f +/- %.3f" - % (np.mean(job_selector_score_list), np.std(job_selector_score_list)) - ) - logger.info( - "Averaging Ensemble Reuse Performance: %.3f +/- %.3f" - % (np.mean(ensemble_score_list), np.std(ensemble_score_list)) - ) - logger.info( - "Selective Ensemble Reuse Performance: %.3f +/- %.3f" - % (np.mean(pruning_score_list), np.std(pruning_score_list)) - ) if __name__ == "__main__": - prepare_data() - prepare_model() - test_search(load_market=False) + fire.Fire(TextDatasetWorkflow) diff --git a/examples/dataset_text_workflow/utils.py b/examples/dataset_text_workflow/utils.py index c438d08..247f706 100644 --- a/examples/dataset_text_workflow/utils.py +++ b/examples/dataset_text_workflow/utils.py @@ -1,19 +1,10 @@ import os -import numpy as np -import random -import math - -import torch -import torch.nn as nn -import torch.optim as optim - import pickle -import torchtext.transforms as T -from torch.hub import load_state_dict_from_url -from torchtext.models import RobertaClassificationHead, XLMR_BASE_ENCODER -import torchtext.functional as F -from torch.optim import AdamW -from torch.utils.data import DataLoader + +import numpy as np +import pandas as pd +from lightgbm import 
LGBMClassifier +from sklearn.feature_extraction.text import TfidfVectorizer class TextDataLoader: @@ -43,20 +34,25 @@ class TextDataLoader: return X, y -def generate_uploader(data_x, data_y, n_uploaders=50, data_save_root=None): +def generate_uploader(data_x: pd.Series, data_y: pd.Series, n_uploaders=50, data_save_root=None): if data_save_root is None: return os.makedirs(data_save_root, exist_ok=True) - n = len(data_x) + + types = data_x["discourse_type"].unique() + for i in range(n_uploaders): - selected_X = data_x[i * (n // n_uploaders) : (i + 1) * (n // n_uploaders)] - selected_y = data_y[i * (n // n_uploaders) : (i + 1) * (n // n_uploaders)] + indices = data_x["discourse_type"] == types[i] + selected_X = data_x[indices]["discourse_text"].to_list() + selected_y = data_y[indices].to_list() + X_save_dir = os.path.join(data_save_root, "uploader_%d_X.pkl" % (i)) y_save_dir = os.path.join(data_save_root, "uploader_%d_y.pkl" % (i)) with open(X_save_dir, "wb") as f: pickle.dump(selected_X, f) with open(y_save_dir, "wb") as f: pickle.dump(selected_y, f) + print("Saving to %s" % (X_save_dir)) @@ -64,114 +60,36 @@ def generate_user(data_x, data_y, n_users=50, data_save_root=None): if data_save_root is None: return os.makedirs(data_save_root, exist_ok=True) - n = len(data_x) + + types = data_x["discourse_type"].unique() + for i in range(n_users): - selected_X = data_x[i * (n // n_users) : (i + 1) * (n // n_users)] - selected_y = data_y[i * (n // n_users) : (i + 1) * (n // n_users)] + indices = data_x["discourse_type"] == types[i] + selected_X = data_x[indices]["discourse_text"].to_list() + selected_y = data_y[indices].to_list() + X_save_dir = os.path.join(data_save_root, "user_%d_X.pkl" % (i)) y_save_dir = os.path.join(data_save_root, "user_%d_y.pkl" % (i)) with open(X_save_dir, "wb") as f: pickle.dump(selected_X, f) with open(y_save_dir, "wb") as f: pickle.dump(selected_y, f) - print("Saving to %s" % (X_save_dir)) - - -def sentence_preprocess(x_datapipe): - padding_idx = 
1 - bos_idx = 0 - eos_idx = 2 - max_seq_len = 256 - xlmr_vocab_path = r"https://download.pytorch.org/models/text/xlmr.vocab.pt" - xlmr_spm_model_path = r"https://download.pytorch.org/models/text/xlmr.sentencepiece.bpe.model" - - text_transform = T.Sequential( - T.SentencePieceTokenizer(xlmr_spm_model_path), - T.VocabTransform(load_state_dict_from_url(xlmr_vocab_path)), - T.Truncate(max_seq_len - 2), - T.AddToken(token=bos_idx, begin=True), - T.AddToken(token=eos_idx, begin=False), - ) - x_datapipe = [text_transform(x) for x in x_datapipe] - # x_datapipe = x_datapipe.map(text_transform) - return x_datapipe - - -def train_step(model, criteria, optim, input, target): - output = model(input) - loss = criteria(output, target) - optim.zero_grad() - loss.backward() - optim.step() - - -def eval_step(model, criteria, input, target): - output = model(input) - loss = criteria(output, target).item() - return float(loss), (output.argmax(1) == target).type(torch.float).sum().item() - - -def evaluate(model, criteria, dev_dataloader): - model.eval() - total_loss = 0 - correct_predictions = 0 - total_predictions = 0 - counter = 0 - with torch.no_grad(): - for batch in dev_dataloader: - input = F.to_tensor(batch["token_ids"], padding_value=1).to(DEVICE) - target = torch.tensor(batch["target"]).to(DEVICE) - loss, predictions = eval_step(model, criteria, input, target) - total_loss += loss - correct_predictions += predictions - total_predictions += len(target) - counter += 1 - - return total_loss / counter, correct_predictions / total_predictions - - -DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") + print("Saving to %s" % (X_save_dir)) # Train Uploaders' models -def train(X, y, out_classes, epochs=35, batch_size=128): - # print(X.shape, y.shape) - from torchdata.datapipes.iter import IterableWrapper - - X = sentence_preprocess(X) - data_size = len(X) - train_datapipe = list(zip(X, y)) - train_datapipe = IterableWrapper(train_datapipe) - train_datapipe = 
train_datapipe.batch(batch_size) - train_datapipe = train_datapipe.rows2columnar(["token_ids", "target"]) - train_dataloader = DataLoader(train_datapipe, batch_size=None) - - num_classes = 2 - input_dim = 768 - classifier_head = RobertaClassificationHead(num_classes=num_classes, input_dim=input_dim) - model = XLMR_BASE_ENCODER.get_model(head=classifier_head) - learning_rate = 1e-5 - optim = AdamW(model.parameters(), lr=learning_rate) - criteria = nn.CrossEntropyLoss() +def train(X, y, out_classes): + vectorizer = TfidfVectorizer(stop_words="english") + X_tfidf = vectorizer.fit_transform(X) - model.to(DEVICE) + lgbm = LGBMClassifier(boosting_type="dart", n_estimators=500, num_leaves=21) + lgbm.fit(X_tfidf, y) - num_epochs = 10 - - for e in range(num_epochs): - for batch in train_dataloader: - input = F.to_tensor(batch["token_ids"], padding_value=1).to(DEVICE) - target = torch.tensor(batch["target"]).to(DEVICE) - train_step(model, criteria, optim, input, target) - - loss, accuracy = evaluate(model, criteria, train_dataloader) - print("Epoch = [{}], loss = [{}], accuracy = [{}]".format(e, loss, accuracy)) - return model + return vectorizer, lgbm def eval_prediction(pred_y, target_y): - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if not isinstance(pred_y, np.ndarray): pred_y = pred_y.detach().cpu().numpy() if len(pred_y.shape) == 1: @@ -179,9 +97,8 @@ def eval_prediction(pred_y, target_y): else: predicted = np.argmax(pred_y, 1) annos = np.array(target_y) - # print(predicted, annos) - # annos = target_y + total = predicted.shape[0] correct = (predicted == annos).sum().item() - criterion = nn.CrossEntropyLoss() + return correct / total diff --git a/examples/dataset_text_workflow2/example_files/example_init.py b/examples/dataset_text_workflow2/example_files/example_init.py deleted file mode 100644 index 1772a19..0000000 --- a/examples/dataset_text_workflow2/example_files/example_init.py +++ /dev/null @@ -1,29 +0,0 @@ -import os -import pickle - 
-import numpy as np - -from learnware.model import BaseModel - - -class Model(BaseModel): - def __init__(self): - super(Model, self).__init__(input_shape=(1,), output_shape=(1,)) - dir_path = os.path.dirname(os.path.abspath(__file__)) - - modelv_path = os.path.join(dir_path, "modelv.pth") - with open(modelv_path, "rb") as f: - self.modelv = pickle.load(f) - - modell_path = os.path.join(dir_path, "modell.pth") - with open(modell_path, "rb") as f: - self.modell = pickle.load(f) - - def fit(self, X: np.ndarray, y: np.ndarray): - pass - - def predict(self, X: np.ndarray) -> np.ndarray: - return self.modell.predict(self.modelv.transform(X)) - - def finetune(self, X: np.ndarray, y: np.ndarray): - pass diff --git a/examples/dataset_text_workflow2/example_files/example_yaml.yaml b/examples/dataset_text_workflow2/example_files/example_yaml.yaml deleted file mode 100644 index d29f7dd..0000000 --- a/examples/dataset_text_workflow2/example_files/example_yaml.yaml +++ /dev/null @@ -1,8 +0,0 @@ -model: - class_name: Model - kwargs: { } -stat_specifications: - - module_path: learnware.specification - class_name: RKMETextSpecification - file_name: rkme.json - kwargs: { } \ No newline at end of file diff --git a/examples/dataset_text_workflow2/example_files/requirements.txt b/examples/dataset_text_workflow2/example_files/requirements.txt deleted file mode 100644 index da986b1..0000000 --- a/examples/dataset_text_workflow2/example_files/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -numpy -pickle -lightgbm -scikit-learn \ No newline at end of file diff --git a/examples/dataset_text_workflow2/get_data.py b/examples/dataset_text_workflow2/get_data.py deleted file mode 100644 index a55e8d7..0000000 --- a/examples/dataset_text_workflow2/get_data.py +++ /dev/null @@ -1,16 +0,0 @@ -import os - -import pandas as pd - - -def get_data(data_root="./data"): - dtrain = pd.read_csv(os.path.join(data_root, "train.csv")) - dtest = pd.read_csv(os.path.join(data_root, "test.csv")) - - # returned 
X(DataFrame), y(Series) - return ( - dtrain[["discourse_text", "discourse_type"]], - dtrain["discourse_effectiveness"], - dtest[["discourse_text", "discourse_type"]], - dtest["discourse_effectiveness"], - ) diff --git a/examples/dataset_text_workflow2/main.py b/examples/dataset_text_workflow2/main.py deleted file mode 100644 index f1b7dc1..0000000 --- a/examples/dataset_text_workflow2/main.py +++ /dev/null @@ -1,257 +0,0 @@ -import os -import pickle -import time -import zipfile -from shutil import copyfile, rmtree - -import numpy as np - -import learnware.specification as specification -from get_data import get_data -from learnware.logger import get_module_logger -from learnware.market import instantiate_learnware_market, BaseUserInfo -from learnware.reuse import JobSelectorReuser, AveragingReuser, EnsemblePruningReuser -from utils import generate_uploader, generate_user, TextDataLoader, train, eval_prediction - -logger = get_module_logger("text_test", level="INFO") -origin_data_root = "./data/origin_data" -processed_data_root = "./data/processed_data" -tmp_dir = "./data/tmp" -learnware_pool_dir = "./data/learnware_pool" -dataset = "ae" # argumentative essays -n_uploaders = 7 -n_users = 7 -n_classes = 3 -data_root = os.path.join(origin_data_root, dataset) -data_save_root = os.path.join(processed_data_root, dataset) -user_save_root = os.path.join(data_save_root, "user") -uploader_save_root = os.path.join(data_save_root, "uploader") -model_save_root = os.path.join(data_save_root, "uploader_model") -os.makedirs(data_root, exist_ok=True) -os.makedirs(user_save_root, exist_ok=True) -os.makedirs(uploader_save_root, exist_ok=True) -os.makedirs(model_save_root, exist_ok=True) - -output_description = { - "Dimension": 1, - "Description": { - "0": "classify as 0(ineffective), 1(effective), or 2(adequate).", - }, -} -semantic_specs = [ - { - "Data": {"Values": ["Text"], "Type": "Class"}, - "Task": {"Values": ["Classification"], "Type": "Class"}, - "Library": {"Values": 
["Scikit-learn"], "Type": "Class"}, - "Scenario": {"Values": ["Education"], "Type": "Tag"}, - "Description": {"Values": "", "Type": "String"}, - "Name": {"Values": "learnware_1", "Type": "String"}, - "Output": output_description, - } -] - -user_semantic = { - "Data": {"Values": ["Text"], "Type": "Class"}, - "Task": {"Values": ["Classification"], "Type": "Class"}, - "Library": {"Values": ["Scikit-learn"], "Type": "Class"}, - "Scenario": {"Values": ["Education"], "Type": "Tag"}, - "Description": {"Values": "", "Type": "String"}, - "Name": {"Values": "", "Type": "String"}, - "Output": output_description, -} - - -def prepare_data(): - X_train, y_train, X_test, y_test = get_data(data_root) - - generate_uploader(X_train, y_train, n_uploaders=n_uploaders, data_save_root=uploader_save_root) - generate_user(X_test, y_test, n_users=n_users, data_save_root=user_save_root) - - -def prepare_model(): - dataloader = TextDataLoader(data_save_root, train=True) - for i in range(n_uploaders): - logger.info("Train on uploader: %d" % (i)) - X, y = dataloader.get_idx_data(i) - vectorizer, lgbm = train(X, y, out_classes=n_classes) - - modelv_save_path = os.path.join(model_save_root, "uploader_v_%d.pth" % (i)) - modell_save_path = os.path.join(model_save_root, "uploader_l_%d.pth" % (i)) - - with open(modelv_save_path, "wb") as f: - pickle.dump(vectorizer, f) - - with open(modell_save_path, "wb") as f: - pickle.dump(lgbm, f) - - logger.info("Model saved to '%s' and '%s'" % (modelv_save_path, modell_save_path)) - - -def prepare_learnware( - data_path, modelv_path, modell_path, init_file_path, yaml_path, env_file_path, save_root, zip_name -): - os.makedirs(save_root, exist_ok=True) - tmp_spec_path = os.path.join(save_root, "rkme.json") - - tmp_modelv_path = os.path.join(save_root, "modelv.pth") - tmp_modell_path = os.path.join(save_root, "modell.pth") - - tmp_yaml_path = os.path.join(save_root, "learnware.yaml") - tmp_init_path = os.path.join(save_root, "__init__.py") - tmp_env_path = 
os.path.join(save_root, "requirements.txt") - - with open(data_path, "rb") as f: - X = pickle.load(f) - semantic_spec = semantic_specs[0] - - st = time.time() - - user_spec = specification.RKMETextSpecification() - - user_spec.generate_stat_spec_from_data(X=X) - ed = time.time() - logger.info("Stat spec generated in %.3f s" % (ed - st)) - user_spec.save(tmp_spec_path) - - copyfile(modelv_path, tmp_modelv_path) - copyfile(modell_path, tmp_modell_path) - - copyfile(yaml_path, tmp_yaml_path) - copyfile(init_file_path, tmp_init_path) - copyfile(env_file_path, tmp_env_path) - zip_file_name = os.path.join(learnware_pool_dir, "%s.zip" % (zip_name)) - with zipfile.ZipFile(zip_file_name, "w", compression=zipfile.ZIP_DEFLATED) as zip_obj: - zip_obj.write(tmp_spec_path, "rkme.json") - - zip_obj.write(tmp_modelv_path, "modelv.pth") - zip_obj.write(tmp_modell_path, "modell.pth") - - zip_obj.write(tmp_yaml_path, "learnware.yaml") - zip_obj.write(tmp_init_path, "__init__.py") - zip_obj.write(tmp_env_path, "requirements.txt") - rmtree(save_root) - logger.info("New Learnware Saved to %s" % (zip_file_name)) - return zip_file_name - - -def prepare_market(): - text_market = instantiate_learnware_market(market_id="ae", rebuild=True) - try: - rmtree(learnware_pool_dir) - except: - pass - os.makedirs(learnware_pool_dir, exist_ok=True) - for i in range(n_uploaders): - data_path = os.path.join(uploader_save_root, "uploader_%d_X.pkl" % (i)) - - modelv_path = os.path.join(model_save_root, "uploader_v_%d.pth" % (i)) - modell_path = os.path.join(model_save_root, "uploader_l_%d.pth" % (i)) - - init_file_path = "./example_files/example_init.py" - yaml_file_path = "./example_files/example_yaml.yaml" - env_file_path = "./example_files/requirements.txt" - new_learnware_path = prepare_learnware( - data_path, - modelv_path, - modell_path, - init_file_path, - yaml_file_path, - env_file_path, - tmp_dir, - "%s_%d" % (dataset, i), - ) - semantic_spec = semantic_specs[0] - semantic_spec["Name"]["Values"] 
= "learnware_%d" % (i) - semantic_spec["Description"]["Values"] = "test_learnware_number_%d" % (i) - text_market.add_learnware(new_learnware_path, semantic_spec) - - logger.info("Total Item: %d" % (len(text_market))) - - -def test_search(load_market=True): - if load_market: - text_market = instantiate_learnware_market(market_id="ae") - else: - prepare_market() - text_market = instantiate_learnware_market(market_id="ae") - logger.info("Number of items in the market: %d" % len(text_market)) - - select_list = [] - avg_list = [] - improve_list = [] - job_selector_score_list = [] - ensemble_score_list = [] - pruning_score_list = [] - for i in range(n_users): - user_data_path = os.path.join(user_save_root, "user_%d_X.pkl" % (i)) - user_label_path = os.path.join(user_save_root, "user_%d_y.pkl" % (i)) - with open(user_data_path, "rb") as f: - user_data = pickle.load(f) - with open(user_label_path, "rb") as f: - user_label = pickle.load(f) - - user_stat_spec = specification.RKMETextSpecification() - user_stat_spec.generate_stat_spec_from_data(X=user_data) - user_info = BaseUserInfo(semantic_spec=user_semantic, stat_info={"RKMETextSpecification": user_stat_spec}) - logger.info("Searching Market for user: %d" % (i)) - sorted_score_list, single_learnware_list, mixture_score, mixture_learnware_list = text_market.search_learnware( - user_info - ) - l = len(sorted_score_list) - acc_list = [] - for idx in range(l): - learnware = single_learnware_list[idx] - score = sorted_score_list[idx] - pred_y = learnware.predict(user_data) - acc = eval_prediction(pred_y, user_label) - acc_list.append(acc) - logger.info("search rank: %d, score: %.3f, learnware_id: %s, acc: %.3f" % (idx, score, learnware.id, acc)) - - # test reuse (job selector) - reuse_baseline = JobSelectorReuser(learnware_list=mixture_learnware_list, herding_num=100) - reuse_predict = reuse_baseline.predict(user_data=user_data) - reuse_score = eval_prediction(reuse_predict, user_label) - 
job_selector_score_list.append(reuse_score) - print(f"mixture reuse loss(job selector): {reuse_score}") - - # test reuse (ensemble) - reuse_ensemble = AveragingReuser(learnware_list=mixture_learnware_list, mode="vote_by_label") - ensemble_predict_y = reuse_ensemble.predict(user_data=user_data) - ensemble_score = eval_prediction(ensemble_predict_y, user_label) - ensemble_score_list.append(ensemble_score) - print(f"mixture reuse accuracy (ensemble): {ensemble_score}") - - # test reuse (ensemblePruning) - reuse_pruning = EnsemblePruningReuser(learnware_list=mixture_learnware_list) - pruning_predict_y = reuse_pruning.predict(user_data=user_data) - pruning_score = eval_prediction(pruning_predict_y, user_label) - pruning_score_list.append(pruning_score) - print(f"mixture reuse accuracy (ensemble Pruning): {pruning_score}\n") - - select_list.append(acc_list[0]) - avg_list.append(np.mean(acc_list)) - improve_list.append((acc_list[0] - np.mean(acc_list)) / np.mean(acc_list)) - - logger.info( - "Accuracy of selected learnware: %.3f +/- %.3f, Average performance: %.3f +/- %.3f" - % (np.mean(select_list), np.std(select_list), np.mean(avg_list), np.std(avg_list)) - ) - logger.info("Average performance improvement: %.3f" % (np.mean(improve_list))) - logger.info( - "Average Job Selector Reuse Performance: %.3f +/- %.3f" - % (np.mean(job_selector_score_list), np.std(job_selector_score_list)) - ) - logger.info( - "Averaging Ensemble Reuse Performance: %.3f +/- %.3f" - % (np.mean(ensemble_score_list), np.std(ensemble_score_list)) - ) - logger.info( - "Selective Ensemble Reuse Performance: %.3f +/- %.3f" - % (np.mean(pruning_score_list), np.std(pruning_score_list)) - ) - - -if __name__ == "__main__": - prepare_data() - prepare_model() - test_search(load_market=False) diff --git a/examples/dataset_text_workflow2/utils.py b/examples/dataset_text_workflow2/utils.py deleted file mode 100644 index 247f706..0000000 --- a/examples/dataset_text_workflow2/utils.py +++ /dev/null @@ -1,104 +0,0 
@@ -import os -import pickle - -import numpy as np -import pandas as pd -from lightgbm import LGBMClassifier -from sklearn.feature_extraction.text import TfidfVectorizer - - -class TextDataLoader: - def __init__(self, data_root, train: bool = True): - self.data_root = data_root - self.train = train - - def get_idx_data(self, idx=0): - if self.train: - X_path = os.path.join(self.data_root, "uploader", "uploader_%d_X.pkl" % (idx)) - y_path = os.path.join(self.data_root, "uploader", "uploader_%d_y.pkl" % (idx)) - if not (os.path.exists(X_path) and os.path.exists(y_path)): - raise Exception("Index Error") - with open(X_path, "rb") as f: - X = pickle.load(f) - with open(y_path, "rb") as f: - y = pickle.load(f) - else: - X_path = os.path.join(self.data_root, "user", "user_%d_X.pkl" % (idx)) - y_path = os.path.join(self.data_root, "user", "user_%d_y.pkl" % (idx)) - if not (os.path.exists(X_path) and os.path.exists(y_path)): - raise Exception("Index Error") - with open(X_path, "rb") as f: - X = pickle.load(f) - with open(y_path, "rb") as f: - y = pickle.load(f) - return X, y - - -def generate_uploader(data_x: pd.Series, data_y: pd.Series, n_uploaders=50, data_save_root=None): - if data_save_root is None: - return - os.makedirs(data_save_root, exist_ok=True) - - types = data_x["discourse_type"].unique() - - for i in range(n_uploaders): - indices = data_x["discourse_type"] == types[i] - selected_X = data_x[indices]["discourse_text"].to_list() - selected_y = data_y[indices].to_list() - - X_save_dir = os.path.join(data_save_root, "uploader_%d_X.pkl" % (i)) - y_save_dir = os.path.join(data_save_root, "uploader_%d_y.pkl" % (i)) - with open(X_save_dir, "wb") as f: - pickle.dump(selected_X, f) - with open(y_save_dir, "wb") as f: - pickle.dump(selected_y, f) - - print("Saving to %s" % (X_save_dir)) - - -def generate_user(data_x, data_y, n_users=50, data_save_root=None): - if data_save_root is None: - return - os.makedirs(data_save_root, exist_ok=True) - - types = 
data_x["discourse_type"].unique() - - for i in range(n_users): - indices = data_x["discourse_type"] == types[i] - selected_X = data_x[indices]["discourse_text"].to_list() - selected_y = data_y[indices].to_list() - - X_save_dir = os.path.join(data_save_root, "user_%d_X.pkl" % (i)) - y_save_dir = os.path.join(data_save_root, "user_%d_y.pkl" % (i)) - with open(X_save_dir, "wb") as f: - pickle.dump(selected_X, f) - with open(y_save_dir, "wb") as f: - pickle.dump(selected_y, f) - - print("Saving to %s" % (X_save_dir)) - - -# Train Uploaders' models -def train(X, y, out_classes): - vectorizer = TfidfVectorizer(stop_words="english") - X_tfidf = vectorizer.fit_transform(X) - - lgbm = LGBMClassifier(boosting_type="dart", n_estimators=500, num_leaves=21) - lgbm.fit(X_tfidf, y) - - return vectorizer, lgbm - - -def eval_prediction(pred_y, target_y): - if not isinstance(pred_y, np.ndarray): - pred_y = pred_y.detach().cpu().numpy() - if len(pred_y.shape) == 1: - predicted = np.array(pred_y) - else: - predicted = np.argmax(pred_y, 1) - annos = np.array(target_y) - - total = predicted.shape[0] - correct = (predicted == annos).sum().item() - - return correct / total diff --git a/learnware/market/easy/searcher.py b/learnware/market/easy/searcher.py index 63a47eb..d55cc6a 100644 --- a/learnware/market/easy/searcher.py +++ b/learnware/market/easy/searcher.py @@ -599,7 +599,7 @@ class EasyStatSearcher(BaseSearcher): merge_score_list = self._convert_dist_to_score(sorted_dist_list + [mixture_dist]) sorted_score_list = merge_score_list[:-1] mixture_score = merge_score_list[-1] - if int(mixture_score * 100) == int(sorted_score_list[0] * 100): + if len(mixture_learnware_list) == 1 or int(mixture_score * 100) == int(sorted_score_list[0] * 100): mixture_score = None mixture_learnware_list = [] logger.info( diff --git a/learnware/specification/regular/table/__init__.py b/learnware/specification/regular/table/__init__.py index 443648a..a380ea3 100644 --- 
a/learnware/specification/regular/table/__init__.py +++ b/learnware/specification/regular/table/__init__.py @@ -1,14 +1,27 @@ +from .utils import is_fast_pytorch_kmeans_available + from ....utils import is_torch_available from ....logger import get_module_logger logger = get_module_logger("regular_table_spec") -if not is_torch_available(verbose=False): +if not is_torch_available(verbose=False) or not is_fast_pytorch_kmeans_available(verbose=False): RKMETableSpecification = None RKMEStatSpecification = None rkme_solve_qp = None + uninstall_packages = [ + value + for flag, value in zip( + [ + is_torch_available(verbose=False), + is_fast_pytorch_kmeans_available(verbose=False), + ], + ["torch", "fast_pytorch_kmeans"], + ) + if flag is False + ] logger.warning( - "RKMETableSpecification, RKMEStatSpecification and rkme_solve_qp are skipped because torch is not installed!" + f"RKMETableSpecification, RKMEStatSpecification and rkme_solve_qp are skipped because {uninstall_packages} is not installed!" ) else: from .rkme import RKMETableSpecification, RKMEStatSpecification, rkme_solve_qp diff --git a/learnware/specification/regular/table/rkme.py b/learnware/specification/regular/table/rkme.py index f7c850c..68cfc1f 100644 --- a/learnware/specification/regular/table/rkme.py +++ b/learnware/specification/regular/table/rkme.py @@ -1,17 +1,15 @@ from __future__ import annotations import os -import copy import torch import json import codecs -import random +import scipy import numpy as np from qpsolvers import solve_qp, Problem, solve_problem from collections import Counter -from typing import Tuple, Any, List, Union, Dict -import scipy -from sklearn.cluster import MiniBatchKMeans +from typing import Any, Union +from fast_pytorch_kmeans import KMeans from ..base import RegularStatSpecification from ....logger import get_module_logger @@ -140,11 +138,14 @@ class RKMETableSpecification(RegularStatSpecification): K : int Size of the construced reduced set. 
""" - X = X.astype("float32") - kmeans = MiniBatchKMeans(n_clusters=K, max_iter=100, verbose=False, n_init="auto") + if isinstance(X, np.ndarray): + X = X.astype("float32") + X = torch.from_numpy(X) + + X = X.to(self._device) + kmeans = KMeans(n_clusters=K, mode='euclidean', max_iter=100, verbose=0) kmeans.fit(X) - center = torch.from_numpy(kmeans.cluster_centers_).double() - self.z = center + self.z = kmeans.centroids.double() def _update_beta(self, X: Any, nonnegative_beta: bool = True): """Fix Z and update beta using its closed-form solution. diff --git a/learnware/specification/regular/table/utils.py b/learnware/specification/regular/table/utils.py new file mode 100644 index 0000000..3243b72 --- /dev/null +++ b/learnware/specification/regular/table/utils.py @@ -0,0 +1,15 @@ +from ....logger import get_module_logger + +logger = get_module_logger("regular_table_spec_utils") + + +def is_fast_pytorch_kmeans_available(verbose=False): + try: + import fast_pytorch_kmeans + except ModuleNotFoundError as err: + if verbose is True: + logger.warning( + "ModuleNotFoundError: fast_pytorch_kmeans is not installed, please install fast_pytorch_kmeans!" 
+ ) + return False + return True diff --git a/learnware/specification/regular/text/__init__.py b/learnware/specification/regular/text/__init__.py index d679589..eda4208 100644 --- a/learnware/specification/regular/text/__init__.py +++ b/learnware/specification/regular/text/__init__.py @@ -1,11 +1,16 @@ from .utils import is_sentence_transformers_available +from ..table.utils import is_fast_pytorch_kmeans_available from ....utils import is_torch_available from ....logger import get_module_logger logger = get_module_logger("regular_text_spec") -if not is_sentence_transformers_available(verbose=False) or not is_torch_available(verbose=False): +if ( + not is_sentence_transformers_available(verbose=False) + or not is_torch_available(verbose=False) + or not is_fast_pytorch_kmeans_available(verbose=False) +): RKMETextSpecification = None uninstall_packages = [ value @@ -13,8 +18,9 @@ if not is_sentence_transformers_available(verbose=False) or not is_torch_availab [ is_sentence_transformers_available(verbose=False), is_torch_available(verbose=False), + is_fast_pytorch_kmeans_available(verbose=False), ], - ["sentence_transformers", "torch"], + ["sentence_transformers", "torch", "fast_pytorch_kmeans"], ) if flag is False ] diff --git a/setup.py b/setup.py index a75507d..1992bdc 100644 --- a/setup.py +++ b/setup.py @@ -89,10 +89,11 @@ DEV_REQUIRED = [ FULL_REQUIRED = [ # The default full requirements for learnware package - "torch==2.1.0", - "torchvision==0.16.0", + "torch==2.0.1", + "torchvision==0.15.2", "torch-optimizer>=0.3.0", - "sentence_transformers>=2.2.2", + "sentence_transformers==2.2.2", + "fast_pytorch_kmeans==0.2.0.1", ] # In MACOS, the lightgbm package should be installed with brew.