| @@ -176,6 +176,8 @@ class M5DatasetWorkflow: | |||
| mixture_id = " ".join([learnware.id for learnware in mixture_learnware_list]) | |||
| print(f"mixture_score: {mixture_score}, mixture_learnware: {mixture_id}") | |||
| if not mixture_learnware_list: | |||
| mixture_learnware_list = [single_learnware_list[0]] | |||
| reuse_job_selector = JobSelectorReuser(learnware_list=mixture_learnware_list, use_herding=False) | |||
| job_selector_predict_y = reuse_job_selector.predict(user_data=test_x) | |||
| @@ -173,6 +173,8 @@ class PFSDatasetWorkflow: | |||
| mixture_id = " ".join([learnware.id for learnware in mixture_learnware_list]) | |||
| print(f"mixture_score: {mixture_score}, mixture_learnware: {mixture_id}") | |||
| if not mixture_learnware_list: | |||
| mixture_learnware_list = [single_learnware_list[0]] | |||
| reuse_job_selector = JobSelectorReuser(learnware_list=mixture_learnware_list, use_herding=False) | |||
| job_selector_predict_y = reuse_job_selector.predict(user_data=test_x) | |||
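The two hunks above add the same guard to the M5 and PFS workflows (a matching one appears in the text workflow below): when the search returns an empty mixture, the top-ranked single learnware is reused instead. This pairs with the `EasyStatSearcher` change near the end of this diff, which now clears the mixture whenever it would contain a single learnware or would score no better than the best single one. A minimal sketch of the pattern, assuming the names used in the hunks above (`test_x` stands for the user's unlabeled data):

```python
from learnware.reuse import JobSelectorReuser

# If the market reports no useful mixture, fall back to the best single learnware.
if not mixture_learnware_list:
    mixture_learnware_list = [single_learnware_list[0]]

reuse_job_selector = JobSelectorReuser(learnware_list=mixture_learnware_list, use_herding=False)
job_selector_predict_y = reuse_job_selector.predict(user_data=test_x)
```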
| @@ -1,54 +1,29 @@ | |||
| import os | |||
| import joblib | |||
| import pickle | |||
| import numpy as np | |||
| from learnware.model import BaseModel | |||
| import torch | |||
| from torchtext.models import RobertaClassificationHead, XLMR_BASE_ENCODER | |||
| import torchtext.functional as F | |||
| import torchtext.transforms as T | |||
| from torch.hub import load_state_dict_from_url | |||
| class Model(BaseModel): | |||
| def __init__(self): | |||
| super().__init__(input_shape=None, output_shape=(2,)) | |||
| super(Model, self).__init__(input_shape=(1,), output_shape=(1,)) | |||
| dir_path = os.path.dirname(os.path.abspath(__file__)) | |||
| self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") | |||
| num_classes = 2 | |||
| input_dim = 768 | |||
| classifier_head = RobertaClassificationHead(num_classes=num_classes, input_dim=input_dim) | |||
| self.model = XLMR_BASE_ENCODER.get_model(head=classifier_head).to(self.device) | |||
| self.model.load_state_dict(torch.load(os.path.join(dir_path, "model.pth"), map_location=torch.device("cpu"))) | |||
| modelv_path = os.path.join(dir_path, "modelv.pth") | |||
| with open(modelv_path, "rb") as f: | |||
| self.modelv = pickle.load(f) | |||
| modell_path = os.path.join(dir_path, "modell.pth") | |||
| with open(modell_path, "rb") as f: | |||
| self.modell = pickle.load(f) | |||
| def fit(self, X: np.ndarray, y: np.ndarray): | |||
| pass | |||
| def predict(self, X: np.ndarray) -> np.ndarray: | |||
| X = sentence_preprocess(X) | |||
| X = F.to_tensor(X, padding_value=1).to(self.device) | |||
| return self.model(X) | |||
| return self.modell.predict(self.modelv.transform(X)) | |||
| def finetune(self, X: np.ndarray, y: np.ndarray): | |||
| pass | |||
| def sentence_preprocess(x_datapipe): | |||
| padding_idx = 1 | |||
| bos_idx = 0 | |||
| eos_idx = 2 | |||
| max_seq_len = 256 | |||
| xlmr_vocab_path = r"https://download.pytorch.org/models/text/xlmr.vocab.pt" | |||
| xlmr_spm_model_path = r"https://download.pytorch.org/models/text/xlmr.sentencepiece.bpe.model" | |||
| text_transform = T.Sequential( | |||
| T.SentencePieceTokenizer(xlmr_spm_model_path), | |||
| T.VocabTransform(load_state_dict_from_url(xlmr_vocab_path)), | |||
| T.Truncate(max_seq_len - 2), | |||
| T.AddToken(token=bos_idx, begin=True), | |||
| T.AddToken(token=eos_idx, begin=False), | |||
| ) | |||
| x_datapipe = [text_transform(x) for x in x_datapipe] | |||
| # x_datapipe = x_datapipe.map(text_transform) | |||
| return x_datapipe | |||
| @@ -1,8 +1,8 @@ | |||
| model: | |||
| class_name: Model | |||
| kwargs: {} | |||
| kwargs: { } | |||
| stat_specifications: | |||
| - module_path: learnware.specification | |||
| class_name: RKMETextSpecification | |||
| file_name: rkme.json | |||
| kwargs: {} | |||
| kwargs: { } | |||
| @@ -1,3 +1,4 @@ | |||
| torch==2.0.1 | |||
| torchdata | |||
| torchtext | |||
| numpy | |||
| pickle | |||
| lightgbm | |||
| scikit-learn | |||
| @@ -1,14 +1,16 @@ | |||
| from torchtext.datasets import SST2 | |||
| import os | |||
| import pandas as pd | |||
| def get_sst2(data_root="./data"): | |||
| train_datapipe = SST2(root="./data", split="train") | |||
| X_train = [x[0] for x in train_datapipe] | |||
| y_train = [x[1] for x in train_datapipe] | |||
| def get_data(data_root="./data"): | |||
| dtrain = pd.read_csv(os.path.join(data_root, "train.csv")) | |||
| dtest = pd.read_csv(os.path.join(data_root, "test.csv")) | |||
| dev_datapipe = SST2(root="./data", split="dev") | |||
| X_test = [x[0] for x in dev_datapipe] | |||
| y_test = [x[1] for x in dev_datapipe] | |||
| return X_train, y_train, X_test, y_test | |||
| # returned X(DataFrame), y(Series) | |||
| return ( | |||
| dtrain[["discourse_text", "discourse_type"]], | |||
| dtrain["discourse_effectiveness"], | |||
| dtest[["discourse_text", "discourse_type"]], | |||
| dtest["discourse_effectiveness"], | |||
| ) | |||
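The new `get_data` returns the argumentative-essays frames with `discourse_text` and `discourse_type` as features and `discourse_effectiveness` as the label; `generate_uploader` and `generate_user` (see `utils.py` below) then carve one subset per discourse type. A minimal sketch of that flow on an invented two-row frame (paths and values are illustrative only, not taken from the dataset):

```python
import pandas as pd

# Invented toy frame standing in for data/origin_data/ae/train.csv.
toy = pd.DataFrame(
    {
        "discourse_text": ["Schools should ban phones.", "Because phones distract students."],
        "discourse_type": ["Claim", "Evidence"],
        "discourse_effectiveness": ["Effective", "Adequate"],
    }
)

X, y = toy[["discourse_text", "discourse_type"]], toy["discourse_effectiveness"]

# generate_uploader / generate_user carve one subset per discourse type:
for i, t in enumerate(X["discourse_type"].unique()):
    idx = X["discourse_type"] == t
    print(i, t, X[idx]["discourse_text"].to_list(), y[idx].to_list())
```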
| @@ -1,30 +1,28 @@ | |||
| import numpy as np | |||
| import torch | |||
| from get_data import get_sst2 | |||
| import os | |||
| import random | |||
| from utils import generate_uploader, generate_user, TextDataLoader, train, eval_prediction | |||
| from learnware.learnware import Learnware | |||
| from learnware.reuse import JobSelectorReuser, AveragingReuser, EnsemblePruningReuser | |||
| import time | |||
| import fire | |||
| import pickle | |||
| import time | |||
| import zipfile | |||
| from shutil import copyfile, rmtree | |||
| from learnware.market import instantiate_learnware_market, BaseUserInfo | |||
| from learnware.specification import RKMETextSpecification | |||
| from learnware.logger import get_module_logger | |||
| import numpy as np | |||
| from shutil import copyfile, rmtree | |||
| import zipfile | |||
| import learnware.specification as specification | |||
| from get_data import get_data | |||
| from learnware.logger import get_module_logger | |||
| from learnware.market import instantiate_learnware_market, BaseUserInfo | |||
| from learnware.reuse import JobSelectorReuser, AveragingReuser, EnsemblePruningReuser | |||
| from utils import generate_uploader, generate_user, TextDataLoader, train, eval_prediction | |||
| logger = get_module_logger("text_test", level="INFO") | |||
| origin_data_root = "./data/origin_data" | |||
| processed_data_root = "./data/processed_data" | |||
| tmp_dir = "./data/tmp" | |||
| learnware_pool_dir = "./data/learnware_pool" | |||
| dataset = "sst2" | |||
| n_uploaders = 10 | |||
| n_users = 5 | |||
| n_classes = 2 | |||
| dataset = "ae" # argumentative essays | |||
| n_uploaders = 7 | |||
| n_users = 7 | |||
| n_classes = 3 | |||
| data_root = os.path.join(origin_data_root, dataset) | |||
| data_save_root = os.path.join(processed_data_root, dataset) | |||
| user_save_root = os.path.join(data_save_root, "user") | |||
| @@ -36,18 +34,17 @@ os.makedirs(uploader_save_root, exist_ok=True) | |||
| os.makedirs(model_save_root, exist_ok=True) | |||
| output_description = { | |||
| "Dimension": 2, | |||
| "Dimension": 1, | |||
| "Description": { | |||
| "0": "the probability of being negative", | |||
| "1": "the probability of being positive", | |||
| "0": "classify as 0(ineffective), 1(effective), or 2(adequate).", | |||
| }, | |||
| } | |||
| semantic_specs = [ | |||
| { | |||
| "Data": {"Values": ["Text"], "Type": "Class"}, | |||
| "Task": {"Values": ["Classification"], "Type": "Class"}, | |||
| "Library": {"Values": ["PyTorch"], "Type": "Class"}, | |||
| "Scenario": {"Values": ["Business"], "Type": "Tag"}, | |||
| "Library": {"Values": ["Scikit-learn"], "Type": "Class"}, | |||
| "Scenario": {"Values": ["Education"], "Type": "Tag"}, | |||
| "Description": {"Values": "", "Type": "String"}, | |||
| "Name": {"Values": "learnware_1", "Type": "String"}, | |||
| "Output": output_description, | |||
| @@ -57,176 +54,220 @@ semantic_specs = [ | |||
| user_semantic = { | |||
| "Data": {"Values": ["Text"], "Type": "Class"}, | |||
| "Task": {"Values": ["Classification"], "Type": "Class"}, | |||
| "Library": {"Values": ["PyTorch"], "Type": "Class"}, | |||
| "Scenario": {"Values": ["Business"], "Type": "Tag"}, | |||
| "Library": {"Values": ["Scikit-learn"], "Type": "Class"}, | |||
| "Scenario": {"Values": ["Education"], "Type": "Tag"}, | |||
| "Description": {"Values": "", "Type": "String"}, | |||
| "Name": {"Values": "", "Type": "String"}, | |||
| "Output": output_description, | |||
| } | |||
| def prepare_data(): | |||
| if dataset == "sst2": | |||
| X_train, y_train, X_test, y_test = get_sst2(data_root) | |||
| else: | |||
| return | |||
| generate_uploader(X_train, y_train, n_uploaders=n_uploaders, data_save_root=uploader_save_root) | |||
| generate_user(X_test, y_test, n_users=n_users, data_save_root=user_save_root) | |||
| def prepare_model(): | |||
| dataloader = TextDataLoader(data_save_root, train=True) | |||
| for i in range(n_uploaders): | |||
| logger.info("Train on uploader: %d" % (i)) | |||
| X, y = dataloader.get_idx_data(i) | |||
| model = train(X, y, out_classes=n_classes) | |||
| model_save_path = os.path.join(model_save_root, "uploader_%d.pth" % (i)) | |||
| torch.save(model.state_dict(), model_save_path) | |||
| logger.info("Model saved to '%s'" % (model_save_path)) | |||
| def prepare_learnware(data_path, model_path, init_file_path, yaml_path, env_file_path, save_root, zip_name): | |||
| os.makedirs(save_root, exist_ok=True) | |||
| tmp_spec_path = os.path.join(save_root, "rkme.json") | |||
| tmp_model_path = os.path.join(save_root, "model.pth") | |||
| tmp_yaml_path = os.path.join(save_root, "learnware.yaml") | |||
| tmp_init_path = os.path.join(save_root, "__init__.py") | |||
| tmp_env_path = os.path.join(save_root, "requirements.txt") | |||
| with open(data_path, "rb") as f: | |||
| X = pickle.load(f) | |||
| semantic_spec = semantic_specs[0] | |||
| st = time.time() | |||
| user_spec = RKMETextSpecification() | |||
| user_spec.generate_stat_spec_from_data(X=X) | |||
| ed = time.time() | |||
| logger.info("Stat spec generated in %.3f s" % (ed - st)) | |||
| user_spec.save(tmp_spec_path) | |||
| copyfile(model_path, tmp_model_path) | |||
| copyfile(yaml_path, tmp_yaml_path) | |||
| copyfile(init_file_path, tmp_init_path) | |||
| copyfile(env_file_path, tmp_env_path) | |||
| zip_file_name = os.path.join(learnware_pool_dir, "%s.zip" % (zip_name)) | |||
| with zipfile.ZipFile(zip_file_name, "w", compression=zipfile.ZIP_DEFLATED) as zip_obj: | |||
| zip_obj.write(tmp_spec_path, "rkme.json") | |||
| zip_obj.write(tmp_model_path, "model.pth") | |||
| zip_obj.write(tmp_yaml_path, "learnware.yaml") | |||
| zip_obj.write(tmp_init_path, "__init__.py") | |||
| zip_obj.write(tmp_env_path, "requirements.txt") | |||
| rmtree(save_root) | |||
| logger.info("New Learnware Saved to %s" % (zip_file_name)) | |||
| return zip_file_name | |||
| def prepare_market(): | |||
| text_market = instantiate_learnware_market(market_id="sst2", rebuild=True) | |||
| try: | |||
| rmtree(learnware_pool_dir) | |||
| except: | |||
| pass | |||
| os.makedirs(learnware_pool_dir, exist_ok=True) | |||
| for i in range(n_uploaders): | |||
| data_path = os.path.join(uploader_save_root, "uploader_%d_X.pkl" % (i)) | |||
| model_path = os.path.join(model_save_root, "uploader_%d.pth" % (i)) | |||
| init_file_path = "./example_files/example_init.py" | |||
| yaml_file_path = "./example_files/example_yaml.yaml" | |||
| env_file_path = "./example_files/requirements.txt" | |||
| new_learnware_path = prepare_learnware( | |||
| data_path, model_path, init_file_path, yaml_file_path, env_file_path, tmp_dir, "%s_%d" % (dataset, i) | |||
| ) | |||
| class TextDatasetWorkflow: | |||
| def _init_text_dataset(self): | |||
| self._prepare_data() | |||
| self._prepare_model() | |||
| def _prepare_data(self): | |||
| X_train, y_train, X_test, y_test = get_data(data_root) | |||
| generate_uploader(X_train, y_train, n_uploaders=n_uploaders, data_save_root=uploader_save_root) | |||
| generate_user(X_test, y_test, n_users=n_users, data_save_root=user_save_root) | |||
| def _prepare_model(self): | |||
| dataloader = TextDataLoader(data_save_root, train=True) | |||
| for i in range(n_uploaders): | |||
| logger.info("Train on uploader: %d" % (i)) | |||
| X, y = dataloader.get_idx_data(i) | |||
| vectorizer, lgbm = train(X, y, out_classes=n_classes) | |||
| modelv_save_path = os.path.join(model_save_root, "uploader_v_%d.pth" % (i)) | |||
| modell_save_path = os.path.join(model_save_root, "uploader_l_%d.pth" % (i)) | |||
| with open(modelv_save_path, "wb") as f: | |||
| pickle.dump(vectorizer, f) | |||
| with open(modell_save_path, "wb") as f: | |||
| pickle.dump(lgbm, f) | |||
| logger.info("Model saved to '%s' and '%s'" % (modelv_save_path, modell_save_path)) | |||
| def _prepare_learnware( | |||
| self, data_path, modelv_path, modell_path, init_file_path, yaml_path, env_file_path, save_root, zip_name | |||
| ): | |||
| os.makedirs(save_root, exist_ok=True) | |||
| tmp_spec_path = os.path.join(save_root, "rkme.json") | |||
| tmp_modelv_path = os.path.join(save_root, "modelv.pth") | |||
| tmp_modell_path = os.path.join(save_root, "modell.pth") | |||
| tmp_yaml_path = os.path.join(save_root, "learnware.yaml") | |||
| tmp_init_path = os.path.join(save_root, "__init__.py") | |||
| tmp_env_path = os.path.join(save_root, "requirements.txt") | |||
| with open(data_path, "rb") as f: | |||
| X = pickle.load(f) | |||
| semantic_spec = semantic_specs[0] | |||
| semantic_spec["Name"]["Values"] = "learnware_%d" % (i) | |||
| semantic_spec["Description"]["Values"] = "test_learnware_number_%d" % (i) | |||
| text_market.add_learnware(new_learnware_path, semantic_spec) | |||
| logger.info("Total Item: %d" % (len(text_market))) | |||
| def test_search(gamma=0.1, load_market=True): | |||
| if load_market: | |||
| text_market = instantiate_learnware_market(market_id="sst2") | |||
| else: | |||
| prepare_market() | |||
| text_market = instantiate_learnware_market(market_id="sst2") | |||
| logger.info("Number of items in the market: %d" % len(text_market)) | |||
| select_list = [] | |||
| avg_list = [] | |||
| improve_list = [] | |||
| job_selector_score_list = [] | |||
| ensemble_score_list = [] | |||
| pruning_score_list = [] | |||
| for i in range(n_users): | |||
| user_data_path = os.path.join(user_save_root, "user_%d_X.pkl" % (i)) | |||
| user_label_path = os.path.join(user_save_root, "user_%d_y.pkl" % (i)) | |||
| with open(user_data_path, "rb") as f: | |||
| user_data = pickle.load(f) | |||
| with open(user_label_path, "rb") as f: | |||
| user_label = pickle.load(f) | |||
| user_stat_spec = RKMETextSpecification() | |||
| user_stat_spec.generate_stat_spec_from_data(X=user_data) | |||
| user_info = BaseUserInfo(semantic_spec=user_semantic, stat_info={"RKMETextSpecification": user_stat_spec}) | |||
| logger.info("Searching Market for user: %d" % (i)) | |||
| sorted_score_list, single_learnware_list, mixture_score, mixture_learnware_list = text_market.search_learnware( | |||
| user_info | |||
| st = time.time() | |||
| user_spec = specification.RKMETextSpecification() | |||
| user_spec.generate_stat_spec_from_data(X=X) | |||
| ed = time.time() | |||
| logger.info("Stat spec generated in %.3f s" % (ed - st)) | |||
| user_spec.save(tmp_spec_path) | |||
| copyfile(modelv_path, tmp_modelv_path) | |||
| copyfile(modell_path, tmp_modell_path) | |||
| copyfile(yaml_path, tmp_yaml_path) | |||
| copyfile(init_file_path, tmp_init_path) | |||
| copyfile(env_file_path, tmp_env_path) | |||
| zip_file_name = os.path.join(learnware_pool_dir, "%s.zip" % (zip_name)) | |||
| with zipfile.ZipFile(zip_file_name, "w", compression=zipfile.ZIP_DEFLATED) as zip_obj: | |||
| zip_obj.write(tmp_spec_path, "rkme.json") | |||
| zip_obj.write(tmp_modelv_path, "modelv.pth") | |||
| zip_obj.write(tmp_modell_path, "modell.pth") | |||
| zip_obj.write(tmp_yaml_path, "learnware.yaml") | |||
| zip_obj.write(tmp_init_path, "__init__.py") | |||
| zip_obj.write(tmp_env_path, "requirements.txt") | |||
| rmtree(save_root) | |||
| logger.info("New Learnware Saved to %s" % (zip_file_name)) | |||
| return zip_file_name | |||
| def prepare_market(self, regenerate_flag=False): | |||
| if regenerate_flag: | |||
| self._init_text_dataset() | |||
| text_market = instantiate_learnware_market(market_id="ae", rebuild=True) | |||
| try: | |||
| rmtree(learnware_pool_dir) | |||
| except: | |||
| pass | |||
| os.makedirs(learnware_pool_dir, exist_ok=True) | |||
| for i in range(n_uploaders): | |||
| data_path = os.path.join(uploader_save_root, "uploader_%d_X.pkl" % (i)) | |||
| modelv_path = os.path.join(model_save_root, "uploader_v_%d.pth" % (i)) | |||
| modell_path = os.path.join(model_save_root, "uploader_l_%d.pth" % (i)) | |||
| init_file_path = "./example_files/example_init.py" | |||
| yaml_file_path = "./example_files/example_yaml.yaml" | |||
| env_file_path = "./example_files/requirements.txt" | |||
| new_learnware_path = self._prepare_learnware( | |||
| data_path, | |||
| modelv_path, | |||
| modell_path, | |||
| init_file_path, | |||
| yaml_file_path, | |||
| env_file_path, | |||
| tmp_dir, | |||
| "%s_%d" % (dataset, i), | |||
| ) | |||
| semantic_spec = semantic_specs[0] | |||
| semantic_spec["Name"]["Values"] = "learnware_%d" % (i) | |||
| semantic_spec["Description"]["Values"] = "test_learnware_number_%d" % (i) | |||
| text_market.add_learnware(new_learnware_path, semantic_spec) | |||
| logger.info("Total Item: %d" % (len(text_market))) | |||
| def test(self, regenerate_flag=False): | |||
| self.prepare_market(regenerate_flag) | |||
| text_market = instantiate_learnware_market(market_id="ae") | |||
| print("Total Item: %d" % len(text_market)) | |||
| select_list = [] | |||
| avg_list = [] | |||
| improve_list = [] | |||
| job_selector_score_list = [] | |||
| ensemble_score_list = [] | |||
| pruning_score_list = [] | |||
| for i in range(n_users): | |||
| user_data_path = os.path.join(user_save_root, "user_%d_X.pkl" % (i)) | |||
| user_label_path = os.path.join(user_save_root, "user_%d_y.pkl" % (i)) | |||
| with open(user_data_path, "rb") as f: | |||
| user_data = pickle.load(f) | |||
| with open(user_label_path, "rb") as f: | |||
| user_label = pickle.load(f) | |||
| user_stat_spec = specification.RKMETextSpecification() | |||
| user_stat_spec.generate_stat_spec_from_data(X=user_data) | |||
| user_info = BaseUserInfo(semantic_spec=user_semantic, stat_info={"RKMETextSpecification": user_stat_spec}) | |||
| logger.info("Searching Market for user: %d" % (i)) | |||
| sorted_score_list, single_learnware_list, mixture_score, mixture_learnware_list = text_market.search_learnware( | |||
| user_info | |||
| ) | |||
| print(f"search result of user{i}:") | |||
| print( | |||
| f"single model num: {len(sorted_score_list)}, max_score: {sorted_score_list[0]}, min_score: {sorted_score_list[-1]}" | |||
| ) | |||
| l = len(sorted_score_list) | |||
| acc_list = [] | |||
| for idx in range(l): | |||
| learnware = single_learnware_list[idx] | |||
| score = sorted_score_list[idx] | |||
| pred_y = learnware.predict(user_data) | |||
| acc = eval_prediction(pred_y, user_label) | |||
| acc_list.append(acc) | |||
| print( | |||
| f"Top1-score: {sorted_score_list[0]}, learnware_id: {single_learnware_list[0].id}, acc: {acc_list[0]}" | |||
| ) | |||
| mixture_id = " ".join([learnware.id for learnware in mixture_learnware_list]) | |||
| print(f"mixture_score: {mixture_score}, mixture_learnware: {mixture_id}") | |||
| if not mixture_learnware_list: | |||
| mixture_learnware_list = [single_learnware_list[0]] | |||
| # test reuse (job selector) | |||
| reuse_baseline = JobSelectorReuser(learnware_list=mixture_learnware_list, herding_num=100) | |||
| reuse_predict = reuse_baseline.predict(user_data=user_data) | |||
| reuse_score = eval_prediction(reuse_predict, user_label) | |||
| job_selector_score_list.append(reuse_score) | |||
| print(f"mixture reuse loss(job selector): {reuse_score}") | |||
| # test reuse (ensemble) | |||
| reuse_ensemble = AveragingReuser(learnware_list=mixture_learnware_list, mode="vote_by_label") | |||
| ensemble_predict_y = reuse_ensemble.predict(user_data=user_data) | |||
| ensemble_score = eval_prediction(ensemble_predict_y, user_label) | |||
| ensemble_score_list.append(ensemble_score) | |||
| print(f"mixture reuse accuracy (ensemble): {ensemble_score}") | |||
| # test reuse (ensemblePruning) | |||
| reuse_pruning = EnsemblePruningReuser(learnware_list=mixture_learnware_list) | |||
| pruning_predict_y = reuse_pruning.predict(user_data=user_data) | |||
| pruning_score = eval_prediction(pruning_predict_y, user_label) | |||
| pruning_score_list.append(pruning_score) | |||
| print(f"mixture reuse accuracy (ensemble Pruning): {pruning_score}\n") | |||
| select_list.append(acc_list[0]) | |||
| avg_list.append(np.mean(acc_list)) | |||
| improve_list.append((acc_list[0] - np.mean(acc_list)) / np.mean(acc_list)) | |||
| logger.info( | |||
| "Accuracy of selected learnware: %.3f +/- %.3f, Average performance: %.3f +/- %.3f" | |||
| % (np.mean(select_list), np.std(select_list), np.mean(avg_list), np.std(avg_list)) | |||
| ) | |||
| logger.info("Average performance improvement: %.3f" % (np.mean(improve_list))) | |||
| logger.info( | |||
| "Average Job Selector Reuse Performance: %.3f +/- %.3f" | |||
| % (np.mean(job_selector_score_list), np.std(job_selector_score_list)) | |||
| ) | |||
| logger.info( | |||
| "Averaging Ensemble Reuse Performance: %.3f +/- %.3f" | |||
| % (np.mean(ensemble_score_list), np.std(ensemble_score_list)) | |||
| ) | |||
| logger.info( | |||
| "Selective Ensemble Reuse Performance: %.3f +/- %.3f" | |||
| % (np.mean(pruning_score_list), np.std(pruning_score_list)) | |||
| ) | |||
| l = len(sorted_score_list) | |||
| acc_list = [] | |||
| for idx in range(l): | |||
| learnware = single_learnware_list[idx] | |||
| score = sorted_score_list[idx] | |||
| pred_y = learnware.predict(user_data) | |||
| acc = eval_prediction(pred_y, user_label) | |||
| acc_list.append(acc) | |||
| logger.info("search rank: %d, score: %.3f, learnware_id: %s, acc: %.3f" % (idx, score, learnware.id, acc)) | |||
| # test reuse (job selector) | |||
| reuse_baseline = JobSelectorReuser(learnware_list=mixture_learnware_list, herding_num=100) | |||
| reuse_predict = reuse_baseline.predict(user_data=user_data) | |||
| reuse_score = eval_prediction(reuse_predict, user_label) | |||
| job_selector_score_list.append(reuse_score) | |||
| print(f"mixture reuse loss(job selector): {reuse_score}") | |||
| # test reuse (ensemble) | |||
| reuse_ensemble = AveragingReuser(learnware_list=mixture_learnware_list, mode="vote_by_label") | |||
| ensemble_predict_y = reuse_ensemble.predict(user_data=user_data) | |||
| ensemble_score = eval_prediction(ensemble_predict_y, user_label) | |||
| ensemble_score_list.append(ensemble_score) | |||
| print(f"mixture reuse accuracy (ensemble): {ensemble_score}") | |||
| # test reuse (ensemblePruning) | |||
| reuse_pruning = EnsemblePruningReuser(learnware_list=mixture_learnware_list) | |||
| pruning_predict_y = reuse_pruning.predict(user_data=user_data) | |||
| pruning_score = eval_prediction(pruning_predict_y, user_label) | |||
| pruning_score_list.append(pruning_score) | |||
| print(f"mixture reuse accuracy (ensemble Pruning): {pruning_score}\n") | |||
| select_list.append(acc_list[0]) | |||
| avg_list.append(np.mean(acc_list)) | |||
| improve_list.append((acc_list[0] - np.mean(acc_list)) / np.mean(acc_list)) | |||
| logger.info( | |||
| "Accuracy of selected learnware: %.3f +/- %.3f, Average performance: %.3f +/- %.3f" | |||
| % (np.mean(select_list), np.std(select_list), np.mean(avg_list), np.std(avg_list)) | |||
| ) | |||
| logger.info("Average performance improvement: %.3f" % (np.mean(improve_list))) | |||
| logger.info( | |||
| "Average Job Selector Reuse Performance: %.3f +/- %.3f" | |||
| % (np.mean(job_selector_score_list), np.std(job_selector_score_list)) | |||
| ) | |||
| logger.info( | |||
| "Averaging Ensemble Reuse Performance: %.3f +/- %.3f" | |||
| % (np.mean(ensemble_score_list), np.std(ensemble_score_list)) | |||
| ) | |||
| logger.info( | |||
| "Selective Ensemble Reuse Performance: %.3f +/- %.3f" | |||
| % (np.mean(pruning_score_list), np.std(pruning_score_list)) | |||
| ) | |||
| if __name__ == "__main__": | |||
| prepare_data() | |||
| prepare_model() | |||
| test_search(load_market=False) | |||
| fire.Fire(TextDatasetWorkflow) | |||
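With `fire.Fire(TextDatasetWorkflow)` as the entry point, the old `prepare_data()` / `prepare_model()` / `test_search()` calls are replaced by a CLI dispatch onto the class's methods. A hypothetical invocation; the script and module names below are assumed, not given in the diff:

```python
# Assumed script name "workflow.py"; fire maps the first CLI argument to a method
# of TextDatasetWorkflow and the flags to its keyword arguments.
#
#   python workflow.py test                          # reuse the existing "ae" market
#   python workflow.py test --regenerate_flag=True   # rebuild data, models and market

# Equivalent programmatic use:
from workflow import TextDatasetWorkflow  # hypothetical module name

TextDatasetWorkflow().test(regenerate_flag=False)
```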
| @@ -1,19 +1,10 @@ | |||
| import os | |||
| import numpy as np | |||
| import random | |||
| import math | |||
| import torch | |||
| import torch.nn as nn | |||
| import torch.optim as optim | |||
| import pickle | |||
| import torchtext.transforms as T | |||
| from torch.hub import load_state_dict_from_url | |||
| from torchtext.models import RobertaClassificationHead, XLMR_BASE_ENCODER | |||
| import torchtext.functional as F | |||
| from torch.optim import AdamW | |||
| from torch.utils.data import DataLoader | |||
| import numpy as np | |||
| import pandas as pd | |||
| from lightgbm import LGBMClassifier | |||
| from sklearn.feature_extraction.text import TfidfVectorizer | |||
| class TextDataLoader: | |||
| @@ -43,20 +34,25 @@ class TextDataLoader: | |||
| return X, y | |||
| def generate_uploader(data_x, data_y, n_uploaders=50, data_save_root=None): | |||
| def generate_uploader(data_x: pd.Series, data_y: pd.Series, n_uploaders=50, data_save_root=None): | |||
| if data_save_root is None: | |||
| return | |||
| os.makedirs(data_save_root, exist_ok=True) | |||
| n = len(data_x) | |||
| types = data_x["discourse_type"].unique() | |||
| for i in range(n_uploaders): | |||
| selected_X = data_x[i * (n // n_uploaders) : (i + 1) * (n // n_uploaders)] | |||
| selected_y = data_y[i * (n // n_uploaders) : (i + 1) * (n // n_uploaders)] | |||
| indices = data_x["discourse_type"] == types[i] | |||
| selected_X = data_x[indices]["discourse_text"].to_list() | |||
| selected_y = data_y[indices].to_list() | |||
| X_save_dir = os.path.join(data_save_root, "uploader_%d_X.pkl" % (i)) | |||
| y_save_dir = os.path.join(data_save_root, "uploader_%d_y.pkl" % (i)) | |||
| with open(X_save_dir, "wb") as f: | |||
| pickle.dump(selected_X, f) | |||
| with open(y_save_dir, "wb") as f: | |||
| pickle.dump(selected_y, f) | |||
| print("Saving to %s" % (X_save_dir)) | |||
| @@ -64,114 +60,36 @@ def generate_user(data_x, data_y, n_users=50, data_save_root=None): | |||
| if data_save_root is None: | |||
| return | |||
| os.makedirs(data_save_root, exist_ok=True) | |||
| n = len(data_x) | |||
| types = data_x["discourse_type"].unique() | |||
| for i in range(n_users): | |||
| selected_X = data_x[i * (n // n_users) : (i + 1) * (n // n_users)] | |||
| selected_y = data_y[i * (n // n_users) : (i + 1) * (n // n_users)] | |||
| indices = data_x["discourse_type"] == types[i] | |||
| selected_X = data_x[indices]["discourse_text"].to_list() | |||
| selected_y = data_y[indices].to_list() | |||
| X_save_dir = os.path.join(data_save_root, "user_%d_X.pkl" % (i)) | |||
| y_save_dir = os.path.join(data_save_root, "user_%d_y.pkl" % (i)) | |||
| with open(X_save_dir, "wb") as f: | |||
| pickle.dump(selected_X, f) | |||
| with open(y_save_dir, "wb") as f: | |||
| pickle.dump(selected_y, f) | |||
| print("Saving to %s" % (X_save_dir)) | |||
| def sentence_preprocess(x_datapipe): | |||
| padding_idx = 1 | |||
| bos_idx = 0 | |||
| eos_idx = 2 | |||
| max_seq_len = 256 | |||
| xlmr_vocab_path = r"https://download.pytorch.org/models/text/xlmr.vocab.pt" | |||
| xlmr_spm_model_path = r"https://download.pytorch.org/models/text/xlmr.sentencepiece.bpe.model" | |||
| text_transform = T.Sequential( | |||
| T.SentencePieceTokenizer(xlmr_spm_model_path), | |||
| T.VocabTransform(load_state_dict_from_url(xlmr_vocab_path)), | |||
| T.Truncate(max_seq_len - 2), | |||
| T.AddToken(token=bos_idx, begin=True), | |||
| T.AddToken(token=eos_idx, begin=False), | |||
| ) | |||
| x_datapipe = [text_transform(x) for x in x_datapipe] | |||
| # x_datapipe = x_datapipe.map(text_transform) | |||
| return x_datapipe | |||
| def train_step(model, criteria, optim, input, target): | |||
| output = model(input) | |||
| loss = criteria(output, target) | |||
| optim.zero_grad() | |||
| loss.backward() | |||
| optim.step() | |||
| def eval_step(model, criteria, input, target): | |||
| output = model(input) | |||
| loss = criteria(output, target).item() | |||
| return float(loss), (output.argmax(1) == target).type(torch.float).sum().item() | |||
| def evaluate(model, criteria, dev_dataloader): | |||
| model.eval() | |||
| total_loss = 0 | |||
| correct_predictions = 0 | |||
| total_predictions = 0 | |||
| counter = 0 | |||
| with torch.no_grad(): | |||
| for batch in dev_dataloader: | |||
| input = F.to_tensor(batch["token_ids"], padding_value=1).to(DEVICE) | |||
| target = torch.tensor(batch["target"]).to(DEVICE) | |||
| loss, predictions = eval_step(model, criteria, input, target) | |||
| total_loss += loss | |||
| correct_predictions += predictions | |||
| total_predictions += len(target) | |||
| counter += 1 | |||
| return total_loss / counter, correct_predictions / total_predictions | |||
| DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") | |||
| print("Saving to %s" % (X_save_dir)) | |||
| # Train Uploaders' models | |||
| def train(X, y, out_classes, epochs=35, batch_size=128): | |||
| # print(X.shape, y.shape) | |||
| from torchdata.datapipes.iter import IterableWrapper | |||
| X = sentence_preprocess(X) | |||
| data_size = len(X) | |||
| train_datapipe = list(zip(X, y)) | |||
| train_datapipe = IterableWrapper(train_datapipe) | |||
| train_datapipe = train_datapipe.batch(batch_size) | |||
| train_datapipe = train_datapipe.rows2columnar(["token_ids", "target"]) | |||
| train_dataloader = DataLoader(train_datapipe, batch_size=None) | |||
| num_classes = 2 | |||
| input_dim = 768 | |||
| classifier_head = RobertaClassificationHead(num_classes=num_classes, input_dim=input_dim) | |||
| model = XLMR_BASE_ENCODER.get_model(head=classifier_head) | |||
| learning_rate = 1e-5 | |||
| optim = AdamW(model.parameters(), lr=learning_rate) | |||
| criteria = nn.CrossEntropyLoss() | |||
| def train(X, y, out_classes): | |||
| vectorizer = TfidfVectorizer(stop_words="english") | |||
| X_tfidf = vectorizer.fit_transform(X) | |||
| model.to(DEVICE) | |||
| lgbm = LGBMClassifier(boosting_type="dart", n_estimators=500, num_leaves=21) | |||
| lgbm.fit(X_tfidf, y) | |||
| num_epochs = 10 | |||
| for e in range(num_epochs): | |||
| for batch in train_dataloader: | |||
| input = F.to_tensor(batch["token_ids"], padding_value=1).to(DEVICE) | |||
| target = torch.tensor(batch["target"]).to(DEVICE) | |||
| train_step(model, criteria, optim, input, target) | |||
| loss, accuracy = evaluate(model, criteria, train_dataloader) | |||
| print("Epoch = [{}], loss = [{}], accuracy = [{}]".format(e, loss, accuracy)) | |||
| return model | |||
| return vectorizer, lgbm | |||
| def eval_prediction(pred_y, target_y): | |||
| device = torch.device("cuda" if torch.cuda.is_available() else "cpu") | |||
| if not isinstance(pred_y, np.ndarray): | |||
| pred_y = pred_y.detach().cpu().numpy() | |||
| if len(pred_y.shape) == 1: | |||
| @@ -179,9 +97,8 @@ def eval_prediction(pred_y, target_y): | |||
| else: | |||
| predicted = np.argmax(pred_y, 1) | |||
| annos = np.array(target_y) | |||
| # print(predicted, annos) | |||
| # annos = target_y | |||
| total = predicted.shape[0] | |||
| correct = (predicted == annos).sum().item() | |||
| criterion = nn.CrossEntropyLoss() | |||
| return correct / total | |||
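Each uploader model is now a two-part artifact: a fitted `TfidfVectorizer` (pickled as `modelv.pth`) and an `LGBMClassifier` (pickled as `modell.pth`), and the learnware's `predict` in `example_init.py` chains them as `modell.predict(modelv.transform(X))`. A minimal end-to-end sketch of that round trip on invented sentences, with the hyperparameters copied from `train()` above:

```python
from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

# Invented toy corpus; in the workflow, X and y come from one uploader's pickles.
X = ["phones distract students", "uniforms build community", "homework helps retention", "tests cause stress"]
y = [0, 1, 1, 0]

vectorizer = TfidfVectorizer(stop_words="english")
X_tfidf = vectorizer.fit_transform(X)

lgbm = LGBMClassifier(boosting_type="dart", n_estimators=500, num_leaves=21)
lgbm.fit(X_tfidf, y)

# Mirrors Model.predict in example_init.py: vectorize first, then classify.
print(lgbm.predict(vectorizer.transform(["phones in class are distracting"])))
```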
| @@ -1,29 +0,0 @@ | |||
| import os | |||
| import pickle | |||
| import numpy as np | |||
| from learnware.model import BaseModel | |||
| class Model(BaseModel): | |||
| def __init__(self): | |||
| super(Model, self).__init__(input_shape=(1,), output_shape=(1,)) | |||
| dir_path = os.path.dirname(os.path.abspath(__file__)) | |||
| modelv_path = os.path.join(dir_path, "modelv.pth") | |||
| with open(modelv_path, "rb") as f: | |||
| self.modelv = pickle.load(f) | |||
| modell_path = os.path.join(dir_path, "modell.pth") | |||
| with open(modell_path, "rb") as f: | |||
| self.modell = pickle.load(f) | |||
| def fit(self, X: np.ndarray, y: np.ndarray): | |||
| pass | |||
| def predict(self, X: np.ndarray) -> np.ndarray: | |||
| return self.modell.predict(self.modelv.transform(X)) | |||
| def finetune(self, X: np.ndarray, y: np.ndarray): | |||
| pass | |||
| @@ -1,8 +0,0 @@ | |||
| model: | |||
| class_name: Model | |||
| kwargs: { } | |||
| stat_specifications: | |||
| - module_path: learnware.specification | |||
| class_name: RKMETextSpecification | |||
| file_name: rkme.json | |||
| kwargs: { } | |||
| @@ -1,4 +0,0 @@ | |||
| numpy | |||
| pickle | |||
| lightgbm | |||
| scikit-learn | |||
| @@ -1,16 +0,0 @@ | |||
| import os | |||
| import pandas as pd | |||
| def get_data(data_root="./data"): | |||
| dtrain = pd.read_csv(os.path.join(data_root, "train.csv")) | |||
| dtest = pd.read_csv(os.path.join(data_root, "test.csv")) | |||
| # returned X(DataFrame), y(Series) | |||
| return ( | |||
| dtrain[["discourse_text", "discourse_type"]], | |||
| dtrain["discourse_effectiveness"], | |||
| dtest[["discourse_text", "discourse_type"]], | |||
| dtest["discourse_effectiveness"], | |||
| ) | |||
| @@ -1,257 +0,0 @@ | |||
| import os | |||
| import pickle | |||
| import time | |||
| import zipfile | |||
| from shutil import copyfile, rmtree | |||
| import numpy as np | |||
| import learnware.specification as specification | |||
| from get_data import get_data | |||
| from learnware.logger import get_module_logger | |||
| from learnware.market import instantiate_learnware_market, BaseUserInfo | |||
| from learnware.reuse import JobSelectorReuser, AveragingReuser, EnsemblePruningReuser | |||
| from utils import generate_uploader, generate_user, TextDataLoader, train, eval_prediction | |||
| logger = get_module_logger("text_test", level="INFO") | |||
| origin_data_root = "./data/origin_data" | |||
| processed_data_root = "./data/processed_data" | |||
| tmp_dir = "./data/tmp" | |||
| learnware_pool_dir = "./data/learnware_pool" | |||
| dataset = "ae" # argumentative essays | |||
| n_uploaders = 7 | |||
| n_users = 7 | |||
| n_classes = 3 | |||
| data_root = os.path.join(origin_data_root, dataset) | |||
| data_save_root = os.path.join(processed_data_root, dataset) | |||
| user_save_root = os.path.join(data_save_root, "user") | |||
| uploader_save_root = os.path.join(data_save_root, "uploader") | |||
| model_save_root = os.path.join(data_save_root, "uploader_model") | |||
| os.makedirs(data_root, exist_ok=True) | |||
| os.makedirs(user_save_root, exist_ok=True) | |||
| os.makedirs(uploader_save_root, exist_ok=True) | |||
| os.makedirs(model_save_root, exist_ok=True) | |||
| output_description = { | |||
| "Dimension": 1, | |||
| "Description": { | |||
| "0": "classify as 0(ineffective), 1(effective), or 2(adequate).", | |||
| }, | |||
| } | |||
| semantic_specs = [ | |||
| { | |||
| "Data": {"Values": ["Text"], "Type": "Class"}, | |||
| "Task": {"Values": ["Classification"], "Type": "Class"}, | |||
| "Library": {"Values": ["Scikit-learn"], "Type": "Class"}, | |||
| "Scenario": {"Values": ["Education"], "Type": "Tag"}, | |||
| "Description": {"Values": "", "Type": "String"}, | |||
| "Name": {"Values": "learnware_1", "Type": "String"}, | |||
| "Output": output_description, | |||
| } | |||
| ] | |||
| user_semantic = { | |||
| "Data": {"Values": ["Text"], "Type": "Class"}, | |||
| "Task": {"Values": ["Classification"], "Type": "Class"}, | |||
| "Library": {"Values": ["Scikit-learn"], "Type": "Class"}, | |||
| "Scenario": {"Values": ["Education"], "Type": "Tag"}, | |||
| "Description": {"Values": "", "Type": "String"}, | |||
| "Name": {"Values": "", "Type": "String"}, | |||
| "Output": output_description, | |||
| } | |||
| def prepare_data(): | |||
| X_train, y_train, X_test, y_test = get_data(data_root) | |||
| generate_uploader(X_train, y_train, n_uploaders=n_uploaders, data_save_root=uploader_save_root) | |||
| generate_user(X_test, y_test, n_users=n_users, data_save_root=user_save_root) | |||
| def prepare_model(): | |||
| dataloader = TextDataLoader(data_save_root, train=True) | |||
| for i in range(n_uploaders): | |||
| logger.info("Train on uploader: %d" % (i)) | |||
| X, y = dataloader.get_idx_data(i) | |||
| vectorizer, lgbm = train(X, y, out_classes=n_classes) | |||
| modelv_save_path = os.path.join(model_save_root, "uploader_v_%d.pth" % (i)) | |||
| modell_save_path = os.path.join(model_save_root, "uploader_l_%d.pth" % (i)) | |||
| with open(modelv_save_path, "wb") as f: | |||
| pickle.dump(vectorizer, f) | |||
| with open(modell_save_path, "wb") as f: | |||
| pickle.dump(lgbm, f) | |||
| logger.info("Model saved to '%s' and '%s'" % (modelv_save_path, modell_save_path)) | |||
| def prepare_learnware( | |||
| data_path, modelv_path, modell_path, init_file_path, yaml_path, env_file_path, save_root, zip_name | |||
| ): | |||
| os.makedirs(save_root, exist_ok=True) | |||
| tmp_spec_path = os.path.join(save_root, "rkme.json") | |||
| tmp_modelv_path = os.path.join(save_root, "modelv.pth") | |||
| tmp_modell_path = os.path.join(save_root, "modell.pth") | |||
| tmp_yaml_path = os.path.join(save_root, "learnware.yaml") | |||
| tmp_init_path = os.path.join(save_root, "__init__.py") | |||
| tmp_env_path = os.path.join(save_root, "requirements.txt") | |||
| with open(data_path, "rb") as f: | |||
| X = pickle.load(f) | |||
| semantic_spec = semantic_specs[0] | |||
| st = time.time() | |||
| user_spec = specification.RKMETextSpecification() | |||
| user_spec.generate_stat_spec_from_data(X=X) | |||
| ed = time.time() | |||
| logger.info("Stat spec generated in %.3f s" % (ed - st)) | |||
| user_spec.save(tmp_spec_path) | |||
| copyfile(modelv_path, tmp_modelv_path) | |||
| copyfile(modell_path, tmp_modell_path) | |||
| copyfile(yaml_path, tmp_yaml_path) | |||
| copyfile(init_file_path, tmp_init_path) | |||
| copyfile(env_file_path, tmp_env_path) | |||
| zip_file_name = os.path.join(learnware_pool_dir, "%s.zip" % (zip_name)) | |||
| with zipfile.ZipFile(zip_file_name, "w", compression=zipfile.ZIP_DEFLATED) as zip_obj: | |||
| zip_obj.write(tmp_spec_path, "rkme.json") | |||
| zip_obj.write(tmp_modelv_path, "modelv.pth") | |||
| zip_obj.write(tmp_modell_path, "modell.pth") | |||
| zip_obj.write(tmp_yaml_path, "learnware.yaml") | |||
| zip_obj.write(tmp_init_path, "__init__.py") | |||
| zip_obj.write(tmp_env_path, "requirements.txt") | |||
| rmtree(save_root) | |||
| logger.info("New Learnware Saved to %s" % (zip_file_name)) | |||
| return zip_file_name | |||
| def prepare_market(): | |||
| text_market = instantiate_learnware_market(market_id="ae", rebuild=True) | |||
| try: | |||
| rmtree(learnware_pool_dir) | |||
| except: | |||
| pass | |||
| os.makedirs(learnware_pool_dir, exist_ok=True) | |||
| for i in range(n_uploaders): | |||
| data_path = os.path.join(uploader_save_root, "uploader_%d_X.pkl" % (i)) | |||
| modelv_path = os.path.join(model_save_root, "uploader_v_%d.pth" % (i)) | |||
| modell_path = os.path.join(model_save_root, "uploader_l_%d.pth" % (i)) | |||
| init_file_path = "./example_files/example_init.py" | |||
| yaml_file_path = "./example_files/example_yaml.yaml" | |||
| env_file_path = "./example_files/requirements.txt" | |||
| new_learnware_path = prepare_learnware( | |||
| data_path, | |||
| modelv_path, | |||
| modell_path, | |||
| init_file_path, | |||
| yaml_file_path, | |||
| env_file_path, | |||
| tmp_dir, | |||
| "%s_%d" % (dataset, i), | |||
| ) | |||
| semantic_spec = semantic_specs[0] | |||
| semantic_spec["Name"]["Values"] = "learnware_%d" % (i) | |||
| semantic_spec["Description"]["Values"] = "test_learnware_number_%d" % (i) | |||
| text_market.add_learnware(new_learnware_path, semantic_spec) | |||
| logger.info("Total Item: %d" % (len(text_market))) | |||
| def test_search(load_market=True): | |||
| if load_market: | |||
| text_market = instantiate_learnware_market(market_id="ae") | |||
| else: | |||
| prepare_market() | |||
| text_market = instantiate_learnware_market(market_id="ae") | |||
| logger.info("Number of items in the market: %d" % len(text_market)) | |||
| select_list = [] | |||
| avg_list = [] | |||
| improve_list = [] | |||
| job_selector_score_list = [] | |||
| ensemble_score_list = [] | |||
| pruning_score_list = [] | |||
| for i in range(n_users): | |||
| user_data_path = os.path.join(user_save_root, "user_%d_X.pkl" % (i)) | |||
| user_label_path = os.path.join(user_save_root, "user_%d_y.pkl" % (i)) | |||
| with open(user_data_path, "rb") as f: | |||
| user_data = pickle.load(f) | |||
| with open(user_label_path, "rb") as f: | |||
| user_label = pickle.load(f) | |||
| user_stat_spec = specification.RKMETextSpecification() | |||
| user_stat_spec.generate_stat_spec_from_data(X=user_data) | |||
| user_info = BaseUserInfo(semantic_spec=user_semantic, stat_info={"RKMETextSpecification": user_stat_spec}) | |||
| logger.info("Searching Market for user: %d" % (i)) | |||
| sorted_score_list, single_learnware_list, mixture_score, mixture_learnware_list = text_market.search_learnware( | |||
| user_info | |||
| ) | |||
| l = len(sorted_score_list) | |||
| acc_list = [] | |||
| for idx in range(l): | |||
| learnware = single_learnware_list[idx] | |||
| score = sorted_score_list[idx] | |||
| pred_y = learnware.predict(user_data) | |||
| acc = eval_prediction(pred_y, user_label) | |||
| acc_list.append(acc) | |||
| logger.info("search rank: %d, score: %.3f, learnware_id: %s, acc: %.3f" % (idx, score, learnware.id, acc)) | |||
| # test reuse (job selector) | |||
| reuse_baseline = JobSelectorReuser(learnware_list=mixture_learnware_list, herding_num=100) | |||
| reuse_predict = reuse_baseline.predict(user_data=user_data) | |||
| reuse_score = eval_prediction(reuse_predict, user_label) | |||
| job_selector_score_list.append(reuse_score) | |||
| print(f"mixture reuse loss(job selector): {reuse_score}") | |||
| # test reuse (ensemble) | |||
| reuse_ensemble = AveragingReuser(learnware_list=mixture_learnware_list, mode="vote_by_label") | |||
| ensemble_predict_y = reuse_ensemble.predict(user_data=user_data) | |||
| ensemble_score = eval_prediction(ensemble_predict_y, user_label) | |||
| ensemble_score_list.append(ensemble_score) | |||
| print(f"mixture reuse accuracy (ensemble): {ensemble_score}") | |||
| # test reuse (ensemblePruning) | |||
| reuse_pruning = EnsemblePruningReuser(learnware_list=mixture_learnware_list) | |||
| pruning_predict_y = reuse_pruning.predict(user_data=user_data) | |||
| pruning_score = eval_prediction(pruning_predict_y, user_label) | |||
| pruning_score_list.append(pruning_score) | |||
| print(f"mixture reuse accuracy (ensemble Pruning): {pruning_score}\n") | |||
| select_list.append(acc_list[0]) | |||
| avg_list.append(np.mean(acc_list)) | |||
| improve_list.append((acc_list[0] - np.mean(acc_list)) / np.mean(acc_list)) | |||
| logger.info( | |||
| "Accuracy of selected learnware: %.3f +/- %.3f, Average performance: %.3f +/- %.3f" | |||
| % (np.mean(select_list), np.std(select_list), np.mean(avg_list), np.std(avg_list)) | |||
| ) | |||
| logger.info("Average performance improvement: %.3f" % (np.mean(improve_list))) | |||
| logger.info( | |||
| "Average Job Selector Reuse Performance: %.3f +/- %.3f" | |||
| % (np.mean(job_selector_score_list), np.std(job_selector_score_list)) | |||
| ) | |||
| logger.info( | |||
| "Averaging Ensemble Reuse Performance: %.3f +/- %.3f" | |||
| % (np.mean(ensemble_score_list), np.std(ensemble_score_list)) | |||
| ) | |||
| logger.info( | |||
| "Selective Ensemble Reuse Performance: %.3f +/- %.3f" | |||
| % (np.mean(pruning_score_list), np.std(pruning_score_list)) | |||
| ) | |||
| if __name__ == "__main__": | |||
| prepare_data() | |||
| prepare_model() | |||
| test_search(load_market=False) | |||
| @@ -1,104 +0,0 @@ | |||
| import os | |||
| import pickle | |||
| import numpy as np | |||
| import pandas as pd | |||
| from lightgbm import LGBMClassifier | |||
| from sklearn.feature_extraction.text import TfidfVectorizer | |||
| class TextDataLoader: | |||
| def __init__(self, data_root, train: bool = True): | |||
| self.data_root = data_root | |||
| self.train = train | |||
| def get_idx_data(self, idx=0): | |||
| if self.train: | |||
| X_path = os.path.join(self.data_root, "uploader", "uploader_%d_X.pkl" % (idx)) | |||
| y_path = os.path.join(self.data_root, "uploader", "uploader_%d_y.pkl" % (idx)) | |||
| if not (os.path.exists(X_path) and os.path.exists(y_path)): | |||
| raise Exception("Index Error") | |||
| with open(X_path, "rb") as f: | |||
| X = pickle.load(f) | |||
| with open(y_path, "rb") as f: | |||
| y = pickle.load(f) | |||
| else: | |||
| X_path = os.path.join(self.data_root, "user", "user_%d_X.pkl" % (idx)) | |||
| y_path = os.path.join(self.data_root, "user", "user_%d_y.pkl" % (idx)) | |||
| if not (os.path.exists(X_path) and os.path.exists(y_path)): | |||
| raise Exception("Index Error") | |||
| with open(X_path, "rb") as f: | |||
| X = pickle.load(f) | |||
| with open(y_path, "rb") as f: | |||
| y = pickle.load(f) | |||
| return X, y | |||
| def generate_uploader(data_x: pd.Series, data_y: pd.Series, n_uploaders=50, data_save_root=None): | |||
| if data_save_root is None: | |||
| return | |||
| os.makedirs(data_save_root, exist_ok=True) | |||
| types = data_x["discourse_type"].unique() | |||
| for i in range(n_uploaders): | |||
| indices = data_x["discourse_type"] == types[i] | |||
| selected_X = data_x[indices]["discourse_text"].to_list() | |||
| selected_y = data_y[indices].to_list() | |||
| X_save_dir = os.path.join(data_save_root, "uploader_%d_X.pkl" % (i)) | |||
| y_save_dir = os.path.join(data_save_root, "uploader_%d_y.pkl" % (i)) | |||
| with open(X_save_dir, "wb") as f: | |||
| pickle.dump(selected_X, f) | |||
| with open(y_save_dir, "wb") as f: | |||
| pickle.dump(selected_y, f) | |||
| print("Saving to %s" % (X_save_dir)) | |||
| def generate_user(data_x, data_y, n_users=50, data_save_root=None): | |||
| if data_save_root is None: | |||
| return | |||
| os.makedirs(data_save_root, exist_ok=True) | |||
| types = data_x["discourse_type"].unique() | |||
| for i in range(n_users): | |||
| indices = data_x["discourse_type"] == types[i] | |||
| selected_X = data_x[indices]["discourse_text"].to_list() | |||
| selected_y = data_y[indices].to_list() | |||
| X_save_dir = os.path.join(data_save_root, "user_%d_X.pkl" % (i)) | |||
| y_save_dir = os.path.join(data_save_root, "user_%d_y.pkl" % (i)) | |||
| with open(X_save_dir, "wb") as f: | |||
| pickle.dump(selected_X, f) | |||
| with open(y_save_dir, "wb") as f: | |||
| pickle.dump(selected_y, f) | |||
| print("Saving to %s" % (X_save_dir)) | |||
| # Train Uploaders' models | |||
| def train(X, y, out_classes): | |||
| vectorizer = TfidfVectorizer(stop_words="english") | |||
| X_tfidf = vectorizer.fit_transform(X) | |||
| lgbm = LGBMClassifier(boosting_type="dart", n_estimators=500, num_leaves=21) | |||
| lgbm.fit(X_tfidf, y) | |||
| return vectorizer, lgbm | |||
| def eval_prediction(pred_y, target_y): | |||
| if not isinstance(pred_y, np.ndarray): | |||
| pred_y = pred_y.detach().cpu().numpy() | |||
| if len(pred_y.shape) == 1: | |||
| predicted = np.array(pred_y) | |||
| else: | |||
| predicted = np.argmax(pred_y, 1) | |||
| annos = np.array(target_y) | |||
| total = predicted.shape[0] | |||
| correct = (predicted == annos).sum().item() | |||
| return correct / total | |||
| @@ -599,7 +599,7 @@ class EasyStatSearcher(BaseSearcher): | |||
| merge_score_list = self._convert_dist_to_score(sorted_dist_list + [mixture_dist]) | |||
| sorted_score_list = merge_score_list[:-1] | |||
| mixture_score = merge_score_list[-1] | |||
| if int(mixture_score * 100) == int(sorted_score_list[0] * 100): | |||
| if len(mixture_learnware_list) == 1 or int(mixture_score * 100) == int(sorted_score_list[0] * 100): | |||
| mixture_score = None | |||
| mixture_learnware_list = [] | |||
| logger.info( | |||
| @@ -1,14 +1,27 @@ | |||
| from .utils import is_fast_pytorch_kmeans_available | |||
| from ....utils import is_torch_available | |||
| from ....logger import get_module_logger | |||
| logger = get_module_logger("regular_table_spec") | |||
| if not is_torch_available(verbose=False): | |||
| if not is_torch_available(verbose=False) or not is_fast_pytorch_kmeans_available(verbose=False): | |||
| RKMETableSpecification = None | |||
| RKMEStatSpecification = None | |||
| rkme_solve_qp = None | |||
| uninstall_packages = [ | |||
| value | |||
| for flag, value in zip( | |||
| [ | |||
| is_torch_available(verbose=False), | |||
| is_fast_pytorch_kmeans_available(verbose=False), | |||
| ], | |||
| ["torch", "fast_pytorch_kmeans"], | |||
| ) | |||
| if flag is False | |||
| ] | |||
| logger.warning( | |||
| "RKMETableSpecification, RKMEStatSpecification and rkme_solve_qp are skipped because torch is not installed!" | |||
| f"RKMETableSpecification, RKMEStatSpecification and rkme_solve_qp are skipped because {uninstall_packages} is not installed!" | |||
| ) | |||
| else: | |||
| from .rkme import RKMETableSpecification, RKMEStatSpecification, rkme_solve_qp | |||
| @@ -1,17 +1,15 @@ | |||
| from __future__ import annotations | |||
| import os | |||
| import copy | |||
| import torch | |||
| import json | |||
| import codecs | |||
| import random | |||
| import scipy | |||
| import numpy as np | |||
| from qpsolvers import solve_qp, Problem, solve_problem | |||
| from collections import Counter | |||
| from typing import Tuple, Any, List, Union, Dict | |||
| import scipy | |||
| from sklearn.cluster import MiniBatchKMeans | |||
| from typing import Any, Union | |||
| from fast_pytorch_kmeans import KMeans | |||
| from ..base import RegularStatSpecification | |||
| from ....logger import get_module_logger | |||
| @@ -140,11 +138,14 @@ class RKMETableSpecification(RegularStatSpecification): | |||
| K : int | |||
| Size of the constructed reduced set. | |||
| """ | |||
| X = X.astype("float32") | |||
| kmeans = MiniBatchKMeans(n_clusters=K, max_iter=100, verbose=False, n_init="auto") | |||
| if isinstance(X, np.ndarray): | |||
| X = X.astype("float32") | |||
| X = torch.from_numpy(X) | |||
| X = X.to(self._device) | |||
| kmeans = KMeans(n_clusters=K, mode='euclidean', max_iter=100, verbose=0) | |||
| kmeans.fit(X) | |||
| center = torch.from_numpy(kmeans.cluster_centers_).double() | |||
| self.z = center | |||
| self.z = kmeans.centroids.double() | |||
| def _update_beta(self, X: Any, nonnegative_beta: bool = True): | |||
| """Fix Z and update beta using its closed-form solution. | |||
| @@ -0,0 +1,15 @@ | |||
| from ....logger import get_module_logger | |||
| logger = get_module_logger("regular_table_spec_utils") | |||
| def is_fast_pytorch_kmeans_available(verbose=False): | |||
| try: | |||
| import fast_pytorch_kmeans | |||
| except ModuleNotFoundError as err: | |||
| if verbose is True: | |||
| logger.warning( | |||
| "ModuleNotFoundError: fast_pytorch_kmeans is not installed, please install fast_pytorch_kmeans!" | |||
| ) | |||
| return False | |||
| return True | |||
| @@ -1,11 +1,16 @@ | |||
| from .utils import is_sentence_transformers_available | |||
| from ..table.utils import is_fast_pytorch_kmeans_available | |||
| from ....utils import is_torch_available | |||
| from ....logger import get_module_logger | |||
| logger = get_module_logger("regular_text_spec") | |||
| if not is_sentence_transformers_available(verbose=False) or not is_torch_available(verbose=False): | |||
| if ( | |||
| not is_sentence_transformers_available(verbose=False) | |||
| or not is_torch_available(verbose=False) | |||
| or not is_fast_pytorch_kmeans_available(verbose=False) | |||
| ): | |||
| RKMETextSpecification = None | |||
| uninstall_packages = [ | |||
| value | |||
| @@ -13,8 +18,9 @@ if not is_sentence_transformers_available(verbose=False) or not is_torch_availab | |||
| [ | |||
| is_sentence_transformers_available(verbose=False), | |||
| is_torch_available(verbose=False), | |||
| is_fast_pytorch_kmeans_available(verbose=False), | |||
| ], | |||
| ["sentence_transformers", "torch"], | |||
| ["sentence_transformers", "torch", "fast_pytorch_kmeans"], | |||
| ) | |||
| if flag is False | |||
| ] | |||
| @@ -89,10 +89,11 @@ DEV_REQUIRED = [ | |||
| FULL_REQUIRED = [ | |||
| # The default full requirements for learnware package | |||
| "torch==2.1.0", | |||
| "torchvision==0.16.0", | |||
| "torch==2.0.1", | |||
| "torchvision==0.15.2", | |||
| "torch-optimizer>=0.3.0", | |||
| "sentence_transformers>=2.2.2", | |||
| "sentence_transformers==2.2.2", | |||
| "fast_pytorch_kmeans==0.2.0.1", | |||
| ] | |||
| # In MACOS, the lightgbm package should be installed with brew. | |||