|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209 |
- import os
- import fire
- import time
- import zipfile
- import numpy as np
- from tqdm import tqdm
- from shutil import copyfile, rmtree
-
- import learnware
- from learnware.market import instantiate_learnware_market, BaseUserInfo
- from learnware.reuse import JobSelectorReuser, AveragingReuser
- from learnware.specification import generate_rkme_table_spec
- from m5 import DataLoader
- from learnware.logger import get_module_logger
-
- logger = get_module_logger("m5_test", level="INFO")
-
-
- output_description = {
- "Dimension": 1,
- "Description": {},
- }
-
- input_description = {
- "Dimension": 82,
- "Description": {},
- }
-
- semantic_specs = [
- {
- "Data": {"Values": ["Table"], "Type": "Class"},
- "Task": {"Values": ["Regression"], "Type": "Class"},
- "Library": {"Values": ["Scikit-learn"], "Type": "Class"},
- "Scenario": {"Values": ["Business"], "Type": "Tag"},
- "Description": {"Values": "", "Type": "String"},
- "Name": {"Values": "learnware_1", "Type": "String"},
- "Input": input_description,
- "Output": output_description,
- }
- ]
-
- user_semantic = {
- "Data": {"Values": ["Table"], "Type": "Class"},
- "Task": {"Values": ["Regression"], "Type": "Class"},
- "Library": {"Values": ["Scikit-learn"], "Type": "Class"},
- "Scenario": {"Values": ["Business"], "Type": "Tag"},
- "Description": {"Values": "", "Type": "String"},
- "Name": {"Values": "", "Type": "String"},
- "Input": input_description,
- "Output": output_description,
- }
-
-
- class M5DatasetWorkflow:
- def _init_m5_dataset(self):
- m5 = DataLoader()
- m5.regenerate_data()
-
- algo_list = ["ridge", "lgb"]
- for algo in algo_list:
- m5.set_algo(algo)
- m5.retrain_models()
-
- def _init_learnware_market(self):
- """initialize learnware market"""
- # database_ops.clear_learnware_table()
- learnware.init()
-
- easy_market = instantiate_learnware_market(name="easy", rebuild=True)
- print("Total Item:", len(easy_market))
-
- zip_path_list = []
- curr_root = os.path.dirname(os.path.abspath(__file__))
- curr_root = os.path.join(curr_root, "learnware_pool")
- for zip_path in os.listdir(curr_root):
- zip_path_list.append(os.path.join(curr_root, zip_path))
-
- for idx, zip_path in enumerate(zip_path_list):
- semantic_spec = semantic_specs[0]
- semantic_spec["Name"]["Values"] = "learnware_%d" % (idx)
- semantic_spec["Description"]["Values"] = "test_learnware_number_%d" % (idx)
- easy_market.add_learnware(zip_path, semantic_spec)
-
- print("Total Item:", len(easy_market))
-
- def prepare_learnware(self, regenerate_flag=False):
- if regenerate_flag:
- self._init_m5_dataset()
-
- m5 = DataLoader()
- idx_list = m5.get_idx_list()
- algo_list = ["lgb"] # algo_list = ["ridge", "lgb"]
-
- curr_root = os.path.dirname(os.path.abspath(__file__))
- curr_root = os.path.join(curr_root, "learnware_pool")
- os.makedirs(curr_root, exist_ok=True)
-
- for idx in tqdm(idx_list):
- train_x, train_y, test_x, test_y = m5.get_idx_data(idx)
- st = time.time()
- spec = generate_rkme_table_spec(X=train_x, gamma=0.1, cuda_idx=0)
- ed = time.time()
- logger.info("Stat spec generated in %.3f s" % (ed - st))
-
- for algo in algo_list:
- m5.set_algo(algo)
- dir_path = os.path.join(curr_root, f"{algo}_{idx}")
- os.makedirs(dir_path, exist_ok=True)
-
- spec_path = os.path.join(dir_path, "rkme.json")
- spec.save(spec_path)
-
- model_path = m5.get_model_path(idx)
- model_file = os.path.join(dir_path, "model.out")
- copyfile(model_path, model_file)
-
- init_file = os.path.join(dir_path, "__init__.py")
- copyfile("example_init.py", init_file)
-
- yaml_file = os.path.join(dir_path, "learnware.yaml")
- copyfile("example.yaml", yaml_file)
-
- zip_file = dir_path + ".zip"
- with zipfile.ZipFile(zip_file, "w") as zip_obj:
- for foldername, subfolders, filenames in os.walk(dir_path):
- for filename in filenames:
- file_path = os.path.join(foldername, filename)
- zip_info = zipfile.ZipInfo(filename)
- zip_info.compress_type = zipfile.ZIP_STORED
- with open(file_path, "rb") as file:
- zip_obj.writestr(zip_info, file.read())
-
- rmtree(dir_path)
-
- def test(self, regenerate_flag=False):
- self.prepare_learnware(regenerate_flag)
- self._init_learnware_market()
-
- easy_market = instantiate_learnware_market(name="easy")
- print("Total Item:", len(easy_market))
-
- m5 = DataLoader()
- idx_list = m5.get_idx_list()
- os.makedirs("./user_spec", exist_ok=True)
- single_score_list = []
- random_score_list = []
- job_selector_score_list = []
- ensemble_score_list = []
- improve_list = []
-
- for idx in idx_list:
- train_x, train_y, test_x, test_y = m5.get_idx_data(idx)
- user_spec = generate_rkme_table_spec(X=test_x, gamma=0.1, cuda_idx=0)
- user_spec_path = f"./user_spec/user_{idx}.json"
- user_spec.save(user_spec_path)
-
- user_info = BaseUserInfo(semantic_spec=user_semantic, stat_info={"RKMETableSpecification": user_spec})
- search_result = easy_market.search_learnware(user_info)
- single_result = search_result.get_single_results()
- multiple_result = search_result.get_multiple_results()
-
- print(f"search result of user{idx}:")
- print(
- f"single model num: {len(single_result)}, max_score: {single_result[0].score}, min_score: {single_result[-1].score}"
- )
- loss_list = []
- for single_item in single_result:
- pred_y = single_item.learnware.predict(test_x)
- loss_list.append(m5.score(test_y, pred_y))
- print(
- f"Top1-score: {single_result[0].score}, learnware_id: {single_result[0].learnware.id}, loss: {loss_list[0]}"
- )
-
- if len(multiple_result) > 0:
- mixture_id = " ".join([learnware.id for learnware in multiple_result[0].learnwares])
- print(f"mixture_score: {multiple_result[0].score}, mixture_learnware: {mixture_id}")
- mixture_learnware_list = multiple_result[0].learnwares
- else:
- mixture_learnware_list = [single_result[0].learnware]
-
- reuse_job_selector = JobSelectorReuser(learnware_list=mixture_learnware_list, use_herding=False)
- job_selector_predict_y = reuse_job_selector.predict(user_data=test_x)
- job_selector_score = m5.score(test_y, job_selector_predict_y)
- print(f"mixture reuse loss (job selector): {job_selector_score}")
-
- reuse_ensemble = AveragingReuser(learnware_list=mixture_learnware_list, mode="vote_by_prob")
- ensemble_predict_y = reuse_ensemble.predict(user_data=test_x)
- ensemble_score = m5.score(test_y, ensemble_predict_y)
- print(f"mixture reuse loss (ensemble): {ensemble_score}\n")
-
- single_score_list.append(loss_list[0])
- random_score_list.append(np.mean(loss_list))
- job_selector_score_list.append(job_selector_score)
- ensemble_score_list.append(ensemble_score)
- improve_list.append((np.mean(loss_list) - loss_list[0]) / np.mean(loss_list))
-
- logger.info("Single search score %.3f +/- %.3f" % (np.mean(single_score_list), np.std(single_score_list)))
- logger.info("Random search score: %.3f +/- %.3f" % (np.mean(random_score_list), np.std(random_score_list)))
- logger.info("Average score improvement: %.3f" % (np.mean(improve_list)))
- logger.info(
- "Job selector score: %.3f +/- %.3f" % (np.mean(job_selector_score_list), np.std(job_selector_score_list))
- )
- logger.info(
- "Average ensemble score: %.3f +/- %.3f" % (np.mean(ensemble_score_list), np.std(ensemble_score_list))
- )
-
-
- if __name__ == "__main__":
- fire.Fire(M5DatasetWorkflow)
|