import os
import pickle
import time
import zipfile
from shutil import copyfile, rmtree

import fire
import numpy as np

import learnware.specification as specification
from learnware.logger import get_module_logger
from learnware.market import BaseUserInfo, instantiate_learnware_market
from learnware.reuse import AveragingReuser, EnsemblePruningReuser, JobSelectorReuser

from get_data import get_data
from utils import TextDataLoader, eval_prediction, generate_uploader, generate_user, train
-
- logger = get_module_logger("text_test", level="INFO")
- origin_data_root = "./data/origin_data"
- processed_data_root = "./data/processed_data"
- tmp_dir = "./data/tmp"
- learnware_pool_dir = "./data/learnware_pool"
- dataset = "ae" # argumentative essays
- n_uploaders = 7
- n_users = 7
- n_classes = 3
- data_root = os.path.join(origin_data_root, dataset)
- data_save_root = os.path.join(processed_data_root, dataset)
- user_save_root = os.path.join(data_save_root, "user")
- uploader_save_root = os.path.join(data_save_root, "uploader")
- model_save_root = os.path.join(data_save_root, "uploader_model")
- os.makedirs(data_root, exist_ok=True)
- os.makedirs(user_save_root, exist_ok=True)
- os.makedirs(uploader_save_root, exist_ok=True)
- os.makedirs(model_save_root, exist_ok=True)
-
- output_description = {
- "Dimension": 1,
- "Description": {
- "0": "classify as 0(ineffective), 1(effective), or 2(adequate).",
- },
- }
- semantic_specs = [
- {
- "Data": {"Values": ["Text"], "Type": "Class"},
- "Task": {"Values": ["Classification"], "Type": "Class"},
- "Library": {"Values": ["Scikit-learn"], "Type": "Class"},
- "Scenario": {"Values": ["Education"], "Type": "Tag"},
- "Description": {"Values": "", "Type": "String"},
- "Name": {"Values": "learnware_1", "Type": "String"},
- "Output": output_description,
- }
- ]
-
- user_semantic = {
- "Data": {"Values": ["Text"], "Type": "Class"},
- "Task": {"Values": ["Classification"], "Type": "Class"},
- "Library": {"Values": ["Scikit-learn"], "Type": "Class"},
- "Scenario": {"Values": ["Education"], "Type": "Tag"},
- "Description": {"Values": "", "Type": "String"},
- "Name": {"Values": "", "Type": "String"},
- "Output": output_description,
- }
-
-
- class TextDatasetWorkflow:
- def _init_text_dataset(self):
- self._prepare_data()
- self._prepare_model()
-
- def _prepare_data(self):
- X_train, y_train, X_test, y_test = get_data(data_root)
-
- generate_uploader(X_train, y_train, n_uploaders=n_uploaders, data_save_root=uploader_save_root)
- generate_user(X_test, y_test, n_users=n_users, data_save_root=user_save_root)
-
-
- def _prepare_model(self):
- dataloader = TextDataLoader(data_save_root, train=True)
- for i in range(n_uploaders):
- logger.info("Train on uploader: %d" % (i))
- X, y = dataloader.get_idx_data(i)
- vectorizer, lgbm = train(X, y, out_classes=n_classes)
-
- modelv_save_path = os.path.join(model_save_root, "uploader_v_%d.pth" % (i))
- modell_save_path = os.path.join(model_save_root, "uploader_l_%d.pth" % (i))
-
- with open(modelv_save_path, "wb") as f:
- pickle.dump(vectorizer, f)
-
- with open(modell_save_path, "wb") as f:
- pickle.dump(lgbm, f)
-
- logger.info("Model saved to '%s' and '%s'" % (modelv_save_path, modell_save_path))
-
-
- def _prepare_learnware(self,
- data_path, modelv_path, modell_path, init_file_path, yaml_path, env_file_path, save_root, zip_name
- ):
- os.makedirs(save_root, exist_ok=True)
- tmp_spec_path = os.path.join(save_root, "rkme.json")
-
- tmp_modelv_path = os.path.join(save_root, "modelv.pth")
- tmp_modell_path = os.path.join(save_root, "modell.pth")
-
- tmp_yaml_path = os.path.join(save_root, "learnware.yaml")
- tmp_init_path = os.path.join(save_root, "__init__.py")
- tmp_env_path = os.path.join(save_root, "requirements.txt")
-
- with open(data_path, "rb") as f:
- X = pickle.load(f)
- semantic_spec = semantic_specs[0]
-
- st = time.time()
-
- user_spec = specification.RKMETextSpecification()
-
- user_spec.generate_stat_spec_from_data(X=X)
- ed = time.time()
- logger.info("Stat spec generated in %.3f s" % (ed - st))
- user_spec.save(tmp_spec_path)
-
- copyfile(modelv_path, tmp_modelv_path)
- copyfile(modell_path, tmp_modell_path)
-
- copyfile(yaml_path, tmp_yaml_path)
- copyfile(init_file_path, tmp_init_path)
- copyfile(env_file_path, tmp_env_path)
- zip_file_name = os.path.join(learnware_pool_dir, "%s.zip" % (zip_name))
- with zipfile.ZipFile(zip_file_name, "w", compression=zipfile.ZIP_DEFLATED) as zip_obj:
- zip_obj.write(tmp_spec_path, "rkme.json")
-
- zip_obj.write(tmp_modelv_path, "modelv.pth")
- zip_obj.write(tmp_modell_path, "modell.pth")
-
- zip_obj.write(tmp_yaml_path, "learnware.yaml")
- zip_obj.write(tmp_init_path, "__init__.py")
- zip_obj.write(tmp_env_path, "requirements.txt")
- rmtree(save_root)
- logger.info("New Learnware Saved to %s" % (zip_file_name))
- return zip_file_name
-
-
- def prepare_market(self, regenerate_flag=False):
- if regenerate_flag:
- self._init_text_dataset()
- text_market = instantiate_learnware_market(market_id="ae", rebuild=True)
- try:
- rmtree(learnware_pool_dir)
- except:
- pass
- os.makedirs(learnware_pool_dir, exist_ok=True)
- for i in range(n_uploaders):
- data_path = os.path.join(uploader_save_root, "uploader_%d_X.pkl" % (i))
-
- modelv_path = os.path.join(model_save_root, "uploader_v_%d.pth" % (i))
- modell_path = os.path.join(model_save_root, "uploader_l_%d.pth" % (i))
-
- init_file_path = "./example_files/example_init.py"
- yaml_file_path = "./example_files/example_yaml.yaml"
- env_file_path = "./example_files/requirements.txt"
- new_learnware_path = self._prepare_learnware(
- data_path,
- modelv_path,
- modell_path,
- init_file_path,
- yaml_file_path,
- env_file_path,
- tmp_dir,
- "%s_%d" % (dataset, i),
- )
- semantic_spec = semantic_specs[0]
- semantic_spec["Name"]["Values"] = "learnware_%d" % (i)
- semantic_spec["Description"]["Values"] = "test_learnware_number_%d" % (i)
- text_market.add_learnware(new_learnware_path, semantic_spec)
-
- logger.info("Total Item: %d" % (len(text_market)))
-
-
- def test(self, regenerate_flag=False):
- self.prepare_market(regenerate_flag)
- text_market = instantiate_learnware_market(market_id="ae")
- print("Total Item: %d" % len(text_market))
-
- select_list = []
- avg_list = []
- improve_list = []
- job_selector_score_list = []
- ensemble_score_list = []
- pruning_score_list = []
- for i in range(n_users):
- user_data_path = os.path.join(user_save_root, "user_%d_X.pkl" % (i))
- user_label_path = os.path.join(user_save_root, "user_%d_y.pkl" % (i))
- with open(user_data_path, "rb") as f:
- user_data = pickle.load(f)
- with open(user_label_path, "rb") as f:
- user_label = pickle.load(f)
-
- user_stat_spec = specification.RKMETextSpecification()
- user_stat_spec.generate_stat_spec_from_data(X=user_data)
- user_info = BaseUserInfo(semantic_spec=user_semantic, stat_info={"RKMETextSpecification": user_stat_spec})
- logger.info("Searching Market for user: %d" % (i))
-
- search_result = text_market.search_learnware(user_info)
- single_result = search_result.get_single_results()
- multiple_result = search_result.get_multiple_results()
-
- print(f"search result of user{i}:")
- print(
- f"single model num: {len(single_result)}, max_score: {single_result[0].score}, min_score: {single_result[-1].score}"
- )
-
- l = len(single_result)
- acc_list = []
- for idx in range(l):
- learnware = single_result[idx].learnware
- score = single_result[idx].score
- pred_y = learnware.predict(user_data)
- acc = eval_prediction(pred_y, user_label)
- acc_list.append(acc)
- print(
- f"Top1-score: {single_result[0].score}, learnware_id: {single_result[0].learnware.id}, acc: {acc_list[0]}"
- )
-
- if len(multiple_result) > 0:
- mixture_id = " ".join([learnware.id for learnware in multiple_result[0].learnwares])
- print(f"mixture_score: {multiple_result[0].score}, mixture_learnware: {mixture_id}")
- mixture_learnware_list = multiple_result[0].learnwares
- else:
- mixture_learnware_list = [single_result[0].learnware]
-
- # test reuse (job selector)
- reuse_baseline = JobSelectorReuser(learnware_list=mixture_learnware_list, herding_num=100)
- reuse_predict = reuse_baseline.predict(user_data=user_data)
- reuse_score = eval_prediction(reuse_predict, user_label)
- job_selector_score_list.append(reuse_score)
- print(f"mixture reuse loss(job selector): {reuse_score}")
-
- # test reuse (ensemble)
- reuse_ensemble = AveragingReuser(learnware_list=mixture_learnware_list, mode="vote_by_label")
- ensemble_predict_y = reuse_ensemble.predict(user_data=user_data)
- ensemble_score = eval_prediction(ensemble_predict_y, user_label)
- ensemble_score_list.append(ensemble_score)
- print(f"mixture reuse accuracy (ensemble): {ensemble_score}")
-
- # test reuse (ensemblePruning)
- reuse_pruning = EnsemblePruningReuser(learnware_list=mixture_learnware_list)
- pruning_predict_y = reuse_pruning.predict(user_data=user_data)
- pruning_score = eval_prediction(pruning_predict_y, user_label)
- pruning_score_list.append(pruning_score)
- print(f"mixture reuse accuracy (ensemble Pruning): {pruning_score}\n")
-
- select_list.append(acc_list[0])
- avg_list.append(np.mean(acc_list))
- improve_list.append((acc_list[0] - np.mean(acc_list)) / np.mean(acc_list))
-
- logger.info(
- "Accuracy of selected learnware: %.3f +/- %.3f, Average performance: %.3f +/- %.3f"
- % (np.mean(select_list), np.std(select_list), np.mean(avg_list), np.std(avg_list))
- )
- logger.info("Average performance improvement: %.3f" % (np.mean(improve_list)))
- logger.info(
- "Average Job Selector Reuse Performance: %.3f +/- %.3f"
- % (np.mean(job_selector_score_list), np.std(job_selector_score_list))
- )
- logger.info(
- "Averaging Ensemble Reuse Performance: %.3f +/- %.3f"
- % (np.mean(ensemble_score_list), np.std(ensemble_score_list))
- )
- logger.info(
- "Selective Ensemble Reuse Performance: %.3f +/- %.3f"
- % (np.mean(pruning_score_list), np.std(pruning_score_list))
- )
-
-
- if __name__ == "__main__":
- fire.Fire(TextDatasetWorkflow)