Browse Source

Merge branch 'main' of https://github.com/Learnware-LAMDA/Learnware into bxdd-temp1

tags/v0.3.2
bxdd 2 years ago
parent
commit
1c91a856fa
20 changed files with 342 additions and 784 deletions
  1. +2
    -0
      examples/dataset_m5_workflow/main.py
  2. +2
    -0
      examples/dataset_pfs_workflow/main.py
  3. +12
    -37
      examples/dataset_text_workflow/example_files/example_init.py
  4. +2
    -2
      examples/dataset_text_workflow/example_files/example_yaml.yaml
  5. +4
    -3
      examples/dataset_text_workflow/example_files/requirements.txt
  6. +12
    -10
      examples/dataset_text_workflow/get_data.py
  7. +225
    -184
      examples/dataset_text_workflow/main.py
  8. +30
    -113
      examples/dataset_text_workflow/utils.py
  9. +0
    -29
      examples/dataset_text_workflow2/example_files/example_init.py
  10. +0
    -8
      examples/dataset_text_workflow2/example_files/example_yaml.yaml
  11. +0
    -4
      examples/dataset_text_workflow2/example_files/requirements.txt
  12. +0
    -16
      examples/dataset_text_workflow2/get_data.py
  13. +0
    -257
      examples/dataset_text_workflow2/main.py
  14. +0
    -104
      examples/dataset_text_workflow2/utils.py
  15. +1
    -1
      learnware/market/easy/searcher.py
  16. +15
    -2
      learnware/specification/regular/table/__init__.py
  17. +10
    -9
      learnware/specification/regular/table/rkme.py
  18. +15
    -0
      learnware/specification/regular/table/utils.py
  19. +8
    -2
      learnware/specification/regular/text/__init__.py
  20. +4
    -3
      setup.py

+ 2
- 0
examples/dataset_m5_workflow/main.py View File

@@ -176,6 +176,8 @@ class M5DatasetWorkflow:

mixture_id = " ".join([learnware.id for learnware in mixture_learnware_list])
print(f"mixture_score: {mixture_score}, mixture_learnware: {mixture_id}")
if not mixture_learnware_list:
mixture_learnware_list = [single_learnware_list[0]]

reuse_job_selector = JobSelectorReuser(learnware_list=mixture_learnware_list, use_herding=False)
job_selector_predict_y = reuse_job_selector.predict(user_data=test_x)


+ 2
- 0
examples/dataset_pfs_workflow/main.py View File

@@ -173,6 +173,8 @@ class PFSDatasetWorkflow:

mixture_id = " ".join([learnware.id for learnware in mixture_learnware_list])
print(f"mixture_score: {mixture_score}, mixture_learnware: {mixture_id}")
if not mixture_learnware_list:
mixture_learnware_list = [single_learnware_list[0]]

reuse_job_selector = JobSelectorReuser(learnware_list=mixture_learnware_list, use_herding=False)
job_selector_predict_y = reuse_job_selector.predict(user_data=test_x)


+ 12
- 37
examples/dataset_text_workflow/example_files/example_init.py View File

@@ -1,54 +1,29 @@
import os
import joblib
import pickle

import numpy as np

from learnware.model import BaseModel
import torch
from torchtext.models import RobertaClassificationHead, XLMR_BASE_ENCODER
import torchtext.functional as F
import torchtext.transforms as T
from torch.hub import load_state_dict_from_url


class Model(BaseModel):
def __init__(self):
super().__init__(input_shape=None, output_shape=(2,))
super(Model, self).__init__(input_shape=(1,), output_shape=(1,))
dir_path = os.path.dirname(os.path.abspath(__file__))
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

num_classes = 2
input_dim = 768
classifier_head = RobertaClassificationHead(num_classes=num_classes, input_dim=input_dim)
self.model = XLMR_BASE_ENCODER.get_model(head=classifier_head).to(self.device)
self.model.load_state_dict(torch.load(os.path.join(dir_path, "model.pth"), map_location=torch.device("cpu")))
modelv_path = os.path.join(dir_path, "modelv.pth")
with open(modelv_path, "rb") as f:
self.modelv = pickle.load(f)

modell_path = os.path.join(dir_path, "modell.pth")
with open(modell_path, "rb") as f:
self.modell = pickle.load(f)

def fit(self, X: np.ndarray, y: np.ndarray):
pass

def predict(self, X: np.ndarray) -> np.ndarray:
X = sentence_preprocess(X)
X = F.to_tensor(X, padding_value=1).to(self.device)
return self.model(X)
return self.modell.predict(self.modelv.transform(X))

def finetune(self, X: np.ndarray, y: np.ndarray):
pass


def sentence_preprocess(x_datapipe):
padding_idx = 1
bos_idx = 0
eos_idx = 2
max_seq_len = 256
xlmr_vocab_path = r"https://download.pytorch.org/models/text/xlmr.vocab.pt"
xlmr_spm_model_path = r"https://download.pytorch.org/models/text/xlmr.sentencepiece.bpe.model"

text_transform = T.Sequential(
T.SentencePieceTokenizer(xlmr_spm_model_path),
T.VocabTransform(load_state_dict_from_url(xlmr_vocab_path)),
T.Truncate(max_seq_len - 2),
T.AddToken(token=bos_idx, begin=True),
T.AddToken(token=eos_idx, begin=False),
)

x_datapipe = [text_transform(x) for x in x_datapipe]
# x_datapipe = x_datapipe.map(text_transform)
return x_datapipe

+ 2
- 2
examples/dataset_text_workflow/example_files/example_yaml.yaml View File

@@ -1,8 +1,8 @@
model:
class_name: Model
kwargs: {}
kwargs: { }
stat_specifications:
- module_path: learnware.specification
class_name: RKMETextSpecification
file_name: rkme.json
kwargs: {}
kwargs: { }

+ 4
- 3
examples/dataset_text_workflow/example_files/requirements.txt View File

@@ -1,3 +1,4 @@
torch==2.0.1
torchdata
torchtext
numpy
pickle
lightgbm
scikit-learn

+ 12
- 10
examples/dataset_text_workflow/get_data.py View File

@@ -1,14 +1,16 @@
from torchtext.datasets import SST2
import os

import pandas as pd

def get_sst2(data_root="./data"):
train_datapipe = SST2(root="./data", split="train")

X_train = [x[0] for x in train_datapipe]
y_train = [x[1] for x in train_datapipe]
def get_data(data_root="./data"):
dtrain = pd.read_csv(os.path.join(data_root, "train.csv"))
dtest = pd.read_csv(os.path.join(data_root, "test.csv"))

dev_datapipe = SST2(root="./data", split="dev")

X_test = [x[0] for x in dev_datapipe]
y_test = [x[1] for x in dev_datapipe]
return X_train, y_train, X_test, y_test
# returned X(DataFrame), y(Series)
return (
dtrain[["discourse_text", "discourse_type"]],
dtrain["discourse_effectiveness"],
dtest[["discourse_text", "discourse_type"]],
dtest["discourse_effectiveness"],
)

+ 225
- 184
examples/dataset_text_workflow/main.py View File

@@ -1,30 +1,28 @@
import numpy as np
import torch
from get_data import get_sst2
import os
import random
from utils import generate_uploader, generate_user, TextDataLoader, train, eval_prediction
from learnware.learnware import Learnware
from learnware.reuse import JobSelectorReuser, AveragingReuser, EnsemblePruningReuser
import time
import fire
import pickle
import time
import zipfile
from shutil import copyfile, rmtree

from learnware.market import instantiate_learnware_market, BaseUserInfo
from learnware.specification import RKMETextSpecification
from learnware.logger import get_module_logger
import numpy as np

from shutil import copyfile, rmtree
import zipfile
import learnware.specification as specification
from get_data import get_data
from learnware.logger import get_module_logger
from learnware.market import instantiate_learnware_market, BaseUserInfo
from learnware.reuse import JobSelectorReuser, AveragingReuser, EnsemblePruningReuser
from utils import generate_uploader, generate_user, TextDataLoader, train, eval_prediction

logger = get_module_logger("text_test", level="INFO")
origin_data_root = "./data/origin_data"
processed_data_root = "./data/processed_data"
tmp_dir = "./data/tmp"
learnware_pool_dir = "./data/learnware_pool"
dataset = "sst2"
n_uploaders = 10
n_users = 5
n_classes = 2
dataset = "ae" # argumentative essays
n_uploaders = 7
n_users = 7
n_classes = 3
data_root = os.path.join(origin_data_root, dataset)
data_save_root = os.path.join(processed_data_root, dataset)
user_save_root = os.path.join(data_save_root, "user")
@@ -36,18 +34,17 @@ os.makedirs(uploader_save_root, exist_ok=True)
os.makedirs(model_save_root, exist_ok=True)

output_description = {
"Dimension": 2,
"Dimension": 1,
"Description": {
"0": "the probability of being negative",
"1": "the probability of being positive",
"0": "classify as 0(ineffective), 1(effective), or 2(adequate).",
},
}
semantic_specs = [
{
"Data": {"Values": ["Text"], "Type": "Class"},
"Task": {"Values": ["Classification"], "Type": "Class"},
"Library": {"Values": ["PyTorch"], "Type": "Class"},
"Scenario": {"Values": ["Business"], "Type": "Tag"},
"Library": {"Values": ["Scikit-learn"], "Type": "Class"},
"Scenario": {"Values": ["Education"], "Type": "Tag"},
"Description": {"Values": "", "Type": "String"},
"Name": {"Values": "learnware_1", "Type": "String"},
"Output": output_description,
@@ -57,176 +54,220 @@ semantic_specs = [
user_semantic = {
"Data": {"Values": ["Text"], "Type": "Class"},
"Task": {"Values": ["Classification"], "Type": "Class"},
"Library": {"Values": ["PyTorch"], "Type": "Class"},
"Scenario": {"Values": ["Business"], "Type": "Tag"},
"Library": {"Values": ["Scikit-learn"], "Type": "Class"},
"Scenario": {"Values": ["Education"], "Type": "Tag"},
"Description": {"Values": "", "Type": "String"},
"Name": {"Values": "", "Type": "String"},
"Output": output_description,
}


def prepare_data():
if dataset == "sst2":
X_train, y_train, X_test, y_test = get_sst2(data_root)
else:
return
generate_uploader(X_train, y_train, n_uploaders=n_uploaders, data_save_root=uploader_save_root)
generate_user(X_test, y_test, n_users=n_users, data_save_root=user_save_root)


def prepare_model():
dataloader = TextDataLoader(data_save_root, train=True)
for i in range(n_uploaders):
logger.info("Train on uploader: %d" % (i))
X, y = dataloader.get_idx_data(i)
model = train(X, y, out_classes=n_classes)
model_save_path = os.path.join(model_save_root, "uploader_%d.pth" % (i))
torch.save(model.state_dict(), model_save_path)
logger.info("Model saved to '%s'" % (model_save_path))


def prepare_learnware(data_path, model_path, init_file_path, yaml_path, env_file_path, save_root, zip_name):
os.makedirs(save_root, exist_ok=True)
tmp_spec_path = os.path.join(save_root, "rkme.json")
tmp_model_path = os.path.join(save_root, "model.pth")
tmp_yaml_path = os.path.join(save_root, "learnware.yaml")
tmp_init_path = os.path.join(save_root, "__init__.py")
tmp_env_path = os.path.join(save_root, "requirements.txt")

with open(data_path, "rb") as f:
X = pickle.load(f)
semantic_spec = semantic_specs[0]

st = time.time()
user_spec = RKMETextSpecification()
user_spec.generate_stat_spec_from_data(X=X)
ed = time.time()
logger.info("Stat spec generated in %.3f s" % (ed - st))
user_spec.save(tmp_spec_path)
copyfile(model_path, tmp_model_path)
copyfile(yaml_path, tmp_yaml_path)
copyfile(init_file_path, tmp_init_path)
copyfile(env_file_path, tmp_env_path)
zip_file_name = os.path.join(learnware_pool_dir, "%s.zip" % (zip_name))
with zipfile.ZipFile(zip_file_name, "w", compression=zipfile.ZIP_DEFLATED) as zip_obj:
zip_obj.write(tmp_spec_path, "rkme.json")
zip_obj.write(tmp_model_path, "model.pth")
zip_obj.write(tmp_yaml_path, "learnware.yaml")
zip_obj.write(tmp_init_path, "__init__.py")
zip_obj.write(tmp_env_path, "requirements.txt")
rmtree(save_root)
logger.info("New Learnware Saved to %s" % (zip_file_name))
return zip_file_name


def prepare_market():
text_market = instantiate_learnware_market(market_id="sst2", rebuild=True)
try:
rmtree(learnware_pool_dir)
except:
pass
os.makedirs(learnware_pool_dir, exist_ok=True)
for i in range(n_uploaders):
data_path = os.path.join(uploader_save_root, "uploader_%d_X.pkl" % (i))
model_path = os.path.join(model_save_root, "uploader_%d.pth" % (i))
init_file_path = "./example_files/example_init.py"
yaml_file_path = "./example_files/example_yaml.yaml"
env_file_path = "./example_files/requirements.txt"
new_learnware_path = prepare_learnware(
data_path, model_path, init_file_path, yaml_file_path, env_file_path, tmp_dir, "%s_%d" % (dataset, i)
)
class TextDatasetWorkflow:
def _init_text_dataset(self):
self._prepare_data()
self._prepare_model()

def _prepare_data(self):
X_train, y_train, X_test, y_test = get_data(data_root)

generate_uploader(X_train, y_train, n_uploaders=n_uploaders, data_save_root=uploader_save_root)
generate_user(X_test, y_test, n_users=n_users, data_save_root=user_save_root)


def _prepare_model(self):
dataloader = TextDataLoader(data_save_root, train=True)
for i in range(n_uploaders):
logger.info("Train on uploader: %d" % (i))
X, y = dataloader.get_idx_data(i)
vectorizer, lgbm = train(X, y, out_classes=n_classes)

modelv_save_path = os.path.join(model_save_root, "uploader_v_%d.pth" % (i))
modell_save_path = os.path.join(model_save_root, "uploader_l_%d.pth" % (i))

with open(modelv_save_path, "wb") as f:
pickle.dump(vectorizer, f)

with open(modell_save_path, "wb") as f:
pickle.dump(lgbm, f)

logger.info("Model saved to '%s' and '%s'" % (modelv_save_path, modell_save_path))


def _prepare_learnware(self,
data_path, modelv_path, modell_path, init_file_path, yaml_path, env_file_path, save_root, zip_name
):
os.makedirs(save_root, exist_ok=True)
tmp_spec_path = os.path.join(save_root, "rkme.json")

tmp_modelv_path = os.path.join(save_root, "modelv.pth")
tmp_modell_path = os.path.join(save_root, "modell.pth")

tmp_yaml_path = os.path.join(save_root, "learnware.yaml")
tmp_init_path = os.path.join(save_root, "__init__.py")
tmp_env_path = os.path.join(save_root, "requirements.txt")

with open(data_path, "rb") as f:
X = pickle.load(f)
semantic_spec = semantic_specs[0]
semantic_spec["Name"]["Values"] = "learnware_%d" % (i)
semantic_spec["Description"]["Values"] = "test_learnware_number_%d" % (i)
text_market.add_learnware(new_learnware_path, semantic_spec)

logger.info("Total Item: %d" % (len(text_market)))


def test_search(gamma=0.1, load_market=True):
if load_market:
text_market = instantiate_learnware_market(market_id="sst2")
else:
prepare_market()
text_market = instantiate_learnware_market(market_id="sst2")
logger.info("Number of items in the market: %d" % len(text_market))

select_list = []
avg_list = []
improve_list = []
job_selector_score_list = []
ensemble_score_list = []
pruning_score_list = []
for i in range(n_users):
user_data_path = os.path.join(user_save_root, "user_%d_X.pkl" % (i))
user_label_path = os.path.join(user_save_root, "user_%d_y.pkl" % (i))
with open(user_data_path, "rb") as f:
user_data = pickle.load(f)
with open(user_label_path, "rb") as f:
user_label = pickle.load(f)

user_stat_spec = RKMETextSpecification()
user_stat_spec.generate_stat_spec_from_data(X=user_data)
user_info = BaseUserInfo(semantic_spec=user_semantic, stat_info={"RKMETextSpecification": user_stat_spec})
logger.info("Searching Market for user: %d" % (i))
sorted_score_list, single_learnware_list, mixture_score, mixture_learnware_list = text_market.search_learnware(
user_info

st = time.time()

user_spec = specification.RKMETextSpecification()

user_spec.generate_stat_spec_from_data(X=X)
ed = time.time()
logger.info("Stat spec generated in %.3f s" % (ed - st))
user_spec.save(tmp_spec_path)

copyfile(modelv_path, tmp_modelv_path)
copyfile(modell_path, tmp_modell_path)

copyfile(yaml_path, tmp_yaml_path)
copyfile(init_file_path, tmp_init_path)
copyfile(env_file_path, tmp_env_path)
zip_file_name = os.path.join(learnware_pool_dir, "%s.zip" % (zip_name))
with zipfile.ZipFile(zip_file_name, "w", compression=zipfile.ZIP_DEFLATED) as zip_obj:
zip_obj.write(tmp_spec_path, "rkme.json")

zip_obj.write(tmp_modelv_path, "modelv.pth")
zip_obj.write(tmp_modell_path, "modell.pth")

zip_obj.write(tmp_yaml_path, "learnware.yaml")
zip_obj.write(tmp_init_path, "__init__.py")
zip_obj.write(tmp_env_path, "requirements.txt")
rmtree(save_root)
logger.info("New Learnware Saved to %s" % (zip_file_name))
return zip_file_name


def prepare_market(self, regenerate_flag=False):
if regenerate_flag:
self._init_text_dataset()
text_market = instantiate_learnware_market(market_id="ae", rebuild=True)
try:
rmtree(learnware_pool_dir)
except:
pass
os.makedirs(learnware_pool_dir, exist_ok=True)
for i in range(n_uploaders):
data_path = os.path.join(uploader_save_root, "uploader_%d_X.pkl" % (i))

modelv_path = os.path.join(model_save_root, "uploader_v_%d.pth" % (i))
modell_path = os.path.join(model_save_root, "uploader_l_%d.pth" % (i))

init_file_path = "./example_files/example_init.py"
yaml_file_path = "./example_files/example_yaml.yaml"
env_file_path = "./example_files/requirements.txt"
new_learnware_path = self._prepare_learnware(
data_path,
modelv_path,
modell_path,
init_file_path,
yaml_file_path,
env_file_path,
tmp_dir,
"%s_%d" % (dataset, i),
)
semantic_spec = semantic_specs[0]
semantic_spec["Name"]["Values"] = "learnware_%d" % (i)
semantic_spec["Description"]["Values"] = "test_learnware_number_%d" % (i)
text_market.add_learnware(new_learnware_path, semantic_spec)

logger.info("Total Item: %d" % (len(text_market)))


def test(self, regenerate_flag=False):
self.prepare_market(regenerate_flag)
text_market = instantiate_learnware_market(market_id="ae")
print("Total Item: %d" % len(text_market))

select_list = []
avg_list = []
improve_list = []
job_selector_score_list = []
ensemble_score_list = []
pruning_score_list = []
for i in range(n_users):
user_data_path = os.path.join(user_save_root, "user_%d_X.pkl" % (i))
user_label_path = os.path.join(user_save_root, "user_%d_y.pkl" % (i))
with open(user_data_path, "rb") as f:
user_data = pickle.load(f)
with open(user_label_path, "rb") as f:
user_label = pickle.load(f)

user_stat_spec = specification.RKMETextSpecification()
user_stat_spec.generate_stat_spec_from_data(X=user_data)
user_info = BaseUserInfo(semantic_spec=user_semantic, stat_info={"RKMETextSpecification": user_stat_spec})
logger.info("Searching Market for user: %d" % (i))
sorted_score_list, single_learnware_list, mixture_score, mixture_learnware_list = text_market.search_learnware(
user_info
)

print(f"search result of user{i}:")
print(
f"single model num: {len(sorted_score_list)}, max_score: {sorted_score_list[0]}, min_score: {sorted_score_list[-1]}"
)

l = len(sorted_score_list)
acc_list = []
for idx in range(l):
learnware = single_learnware_list[idx]
score = sorted_score_list[idx]
pred_y = learnware.predict(user_data)
acc = eval_prediction(pred_y, user_label)
acc_list.append(acc)
print(
f"Top1-score: {sorted_score_list[0]}, learnware_id: {single_learnware_list[0].id}, acc: {acc_list[0]}"
)
mixture_id = " ".join([learnware.id for learnware in mixture_learnware_list])
print(f"mixture_score: {mixture_score}, mixture_learnware: {mixture_id}")
if not mixture_learnware_list:
mixture_learnware_list = [single_learnware_list[0]]

# test reuse (job selector)
reuse_baseline = JobSelectorReuser(learnware_list=mixture_learnware_list, herding_num=100)
reuse_predict = reuse_baseline.predict(user_data=user_data)
reuse_score = eval_prediction(reuse_predict, user_label)
job_selector_score_list.append(reuse_score)
print(f"mixture reuse loss(job selector): {reuse_score}")

# test reuse (ensemble)
reuse_ensemble = AveragingReuser(learnware_list=mixture_learnware_list, mode="vote_by_label")
ensemble_predict_y = reuse_ensemble.predict(user_data=user_data)
ensemble_score = eval_prediction(ensemble_predict_y, user_label)
ensemble_score_list.append(ensemble_score)
print(f"mixture reuse accuracy (ensemble): {ensemble_score}")

# test reuse (ensemblePruning)
reuse_pruning = EnsemblePruningReuser(learnware_list=mixture_learnware_list)
pruning_predict_y = reuse_pruning.predict(user_data=user_data)
pruning_score = eval_prediction(pruning_predict_y, user_label)
pruning_score_list.append(pruning_score)
print(f"mixture reuse accuracy (ensemble Pruning): {pruning_score}\n")

select_list.append(acc_list[0])
avg_list.append(np.mean(acc_list))
improve_list.append((acc_list[0] - np.mean(acc_list)) / np.mean(acc_list))

logger.info(
"Accuracy of selected learnware: %.3f +/- %.3f, Average performance: %.3f +/- %.3f"
% (np.mean(select_list), np.std(select_list), np.mean(avg_list), np.std(avg_list))
)
logger.info("Average performance improvement: %.3f" % (np.mean(improve_list)))
logger.info(
"Average Job Selector Reuse Performance: %.3f +/- %.3f"
% (np.mean(job_selector_score_list), np.std(job_selector_score_list))
)
logger.info(
"Averaging Ensemble Reuse Performance: %.3f +/- %.3f"
% (np.mean(ensemble_score_list), np.std(ensemble_score_list))
)
logger.info(
"Selective Ensemble Reuse Performance: %.3f +/- %.3f"
% (np.mean(pruning_score_list), np.std(pruning_score_list))
)
l = len(sorted_score_list)
acc_list = []
for idx in range(l):
learnware = single_learnware_list[idx]
score = sorted_score_list[idx]
pred_y = learnware.predict(user_data)
acc = eval_prediction(pred_y, user_label)
acc_list.append(acc)
logger.info("search rank: %d, score: %.3f, learnware_id: %s, acc: %.3f" % (idx, score, learnware.id, acc))

# test reuse (job selector)
reuse_baseline = JobSelectorReuser(learnware_list=mixture_learnware_list, herding_num=100)
reuse_predict = reuse_baseline.predict(user_data=user_data)
reuse_score = eval_prediction(reuse_predict, user_label)
job_selector_score_list.append(reuse_score)
print(f"mixture reuse loss(job selector): {reuse_score}")

# test reuse (ensemble)
reuse_ensemble = AveragingReuser(learnware_list=mixture_learnware_list, mode="vote_by_label")
ensemble_predict_y = reuse_ensemble.predict(user_data=user_data)
ensemble_score = eval_prediction(ensemble_predict_y, user_label)
ensemble_score_list.append(ensemble_score)
print(f"mixture reuse accuracy (ensemble): {ensemble_score}")

# test reuse (ensemblePruning)
reuse_pruning = EnsemblePruningReuser(learnware_list=mixture_learnware_list)
pruning_predict_y = reuse_pruning.predict(user_data=user_data)
pruning_score = eval_prediction(pruning_predict_y, user_label)
pruning_score_list.append(pruning_score)
print(f"mixture reuse accuracy (ensemble Pruning): {pruning_score}\n")

select_list.append(acc_list[0])
avg_list.append(np.mean(acc_list))
improve_list.append((acc_list[0] - np.mean(acc_list)) / np.mean(acc_list))

logger.info(
"Accuracy of selected learnware: %.3f +/- %.3f, Average performance: %.3f +/- %.3f"
% (np.mean(select_list), np.std(select_list), np.mean(avg_list), np.std(avg_list))
)
logger.info("Average performance improvement: %.3f" % (np.mean(improve_list)))
logger.info(
"Average Job Selector Reuse Performance: %.3f +/- %.3f"
% (np.mean(job_selector_score_list), np.std(job_selector_score_list))
)
logger.info(
"Averaging Ensemble Reuse Performance: %.3f +/- %.3f"
% (np.mean(ensemble_score_list), np.std(ensemble_score_list))
)
logger.info(
"Selective Ensemble Reuse Performance: %.3f +/- %.3f"
% (np.mean(pruning_score_list), np.std(pruning_score_list))
)


if __name__ == "__main__":
prepare_data()
prepare_model()
test_search(load_market=False)
fire.Fire(TextDatasetWorkflow)

+ 30
- 113
examples/dataset_text_workflow/utils.py View File

@@ -1,19 +1,10 @@
import os
import numpy as np
import random
import math

import torch
import torch.nn as nn
import torch.optim as optim

import pickle
import torchtext.transforms as T
from torch.hub import load_state_dict_from_url
from torchtext.models import RobertaClassificationHead, XLMR_BASE_ENCODER
import torchtext.functional as F
from torch.optim import AdamW
from torch.utils.data import DataLoader

import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import TfidfVectorizer


class TextDataLoader:
@@ -43,20 +34,25 @@ class TextDataLoader:
return X, y


def generate_uploader(data_x, data_y, n_uploaders=50, data_save_root=None):
def generate_uploader(data_x: pd.Series, data_y: pd.Series, n_uploaders=50, data_save_root=None):
if data_save_root is None:
return
os.makedirs(data_save_root, exist_ok=True)
n = len(data_x)

types = data_x["discourse_type"].unique()

for i in range(n_uploaders):
selected_X = data_x[i * (n // n_uploaders) : (i + 1) * (n // n_uploaders)]
selected_y = data_y[i * (n // n_uploaders) : (i + 1) * (n // n_uploaders)]
indices = data_x["discourse_type"] == types[i]
selected_X = data_x[indices]["discourse_text"].to_list()
selected_y = data_y[indices].to_list()

X_save_dir = os.path.join(data_save_root, "uploader_%d_X.pkl" % (i))
y_save_dir = os.path.join(data_save_root, "uploader_%d_y.pkl" % (i))
with open(X_save_dir, "wb") as f:
pickle.dump(selected_X, f)
with open(y_save_dir, "wb") as f:
pickle.dump(selected_y, f)

print("Saving to %s" % (X_save_dir))


@@ -64,114 +60,36 @@ def generate_user(data_x, data_y, n_users=50, data_save_root=None):
if data_save_root is None:
return
os.makedirs(data_save_root, exist_ok=True)
n = len(data_x)

types = data_x["discourse_type"].unique()

for i in range(n_users):
selected_X = data_x[i * (n // n_users) : (i + 1) * (n // n_users)]
selected_y = data_y[i * (n // n_users) : (i + 1) * (n // n_users)]
indices = data_x["discourse_type"] == types[i]
selected_X = data_x[indices]["discourse_text"].to_list()
selected_y = data_y[indices].to_list()

X_save_dir = os.path.join(data_save_root, "user_%d_X.pkl" % (i))
y_save_dir = os.path.join(data_save_root, "user_%d_y.pkl" % (i))
with open(X_save_dir, "wb") as f:
pickle.dump(selected_X, f)
with open(y_save_dir, "wb") as f:
pickle.dump(selected_y, f)
print("Saving to %s" % (X_save_dir))


def sentence_preprocess(x_datapipe):
padding_idx = 1
bos_idx = 0
eos_idx = 2
max_seq_len = 256
xlmr_vocab_path = r"https://download.pytorch.org/models/text/xlmr.vocab.pt"
xlmr_spm_model_path = r"https://download.pytorch.org/models/text/xlmr.sentencepiece.bpe.model"

text_transform = T.Sequential(
T.SentencePieceTokenizer(xlmr_spm_model_path),
T.VocabTransform(load_state_dict_from_url(xlmr_vocab_path)),
T.Truncate(max_seq_len - 2),
T.AddToken(token=bos_idx, begin=True),
T.AddToken(token=eos_idx, begin=False),
)

x_datapipe = [text_transform(x) for x in x_datapipe]
# x_datapipe = x_datapipe.map(text_transform)
return x_datapipe


def train_step(model, criteria, optim, input, target):
output = model(input)
loss = criteria(output, target)
optim.zero_grad()
loss.backward()
optim.step()


def eval_step(model, criteria, input, target):
output = model(input)
loss = criteria(output, target).item()
return float(loss), (output.argmax(1) == target).type(torch.float).sum().item()


def evaluate(model, criteria, dev_dataloader):
model.eval()
total_loss = 0
correct_predictions = 0
total_predictions = 0
counter = 0
with torch.no_grad():
for batch in dev_dataloader:
input = F.to_tensor(batch["token_ids"], padding_value=1).to(DEVICE)
target = torch.tensor(batch["target"]).to(DEVICE)
loss, predictions = eval_step(model, criteria, input, target)
total_loss += loss
correct_predictions += predictions
total_predictions += len(target)
counter += 1

return total_loss / counter, correct_predictions / total_predictions


DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Saving to %s" % (X_save_dir))


# Train Uploaders' models
def train(X, y, out_classes, epochs=35, batch_size=128):
# print(X.shape, y.shape)
from torchdata.datapipes.iter import IterableWrapper

X = sentence_preprocess(X)
data_size = len(X)
train_datapipe = list(zip(X, y))
train_datapipe = IterableWrapper(train_datapipe)
train_datapipe = train_datapipe.batch(batch_size)
train_datapipe = train_datapipe.rows2columnar(["token_ids", "target"])
train_dataloader = DataLoader(train_datapipe, batch_size=None)

num_classes = 2
input_dim = 768
classifier_head = RobertaClassificationHead(num_classes=num_classes, input_dim=input_dim)
model = XLMR_BASE_ENCODER.get_model(head=classifier_head)
learning_rate = 1e-5
optim = AdamW(model.parameters(), lr=learning_rate)
criteria = nn.CrossEntropyLoss()
def train(X, y, out_classes):
vectorizer = TfidfVectorizer(stop_words="english")
X_tfidf = vectorizer.fit_transform(X)

model.to(DEVICE)
lgbm = LGBMClassifier(boosting_type="dart", n_estimators=500, num_leaves=21)
lgbm.fit(X_tfidf, y)

num_epochs = 10

for e in range(num_epochs):
for batch in train_dataloader:
input = F.to_tensor(batch["token_ids"], padding_value=1).to(DEVICE)
target = torch.tensor(batch["target"]).to(DEVICE)
train_step(model, criteria, optim, input, target)

loss, accuracy = evaluate(model, criteria, train_dataloader)
print("Epoch = [{}], loss = [{}], accuracy = [{}]".format(e, loss, accuracy))
return model
return vectorizer, lgbm


def eval_prediction(pred_y, target_y):
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if not isinstance(pred_y, np.ndarray):
pred_y = pred_y.detach().cpu().numpy()
if len(pred_y.shape) == 1:
@@ -179,9 +97,8 @@ def eval_prediction(pred_y, target_y):
else:
predicted = np.argmax(pred_y, 1)
annos = np.array(target_y)
# print(predicted, annos)
# annos = target_y

total = predicted.shape[0]
correct = (predicted == annos).sum().item()
criterion = nn.CrossEntropyLoss()
return correct / total

+ 0
- 29
examples/dataset_text_workflow2/example_files/example_init.py View File

@@ -1,29 +0,0 @@
import os
import pickle

import numpy as np

from learnware.model import BaseModel


class Model(BaseModel):
def __init__(self):
super(Model, self).__init__(input_shape=(1,), output_shape=(1,))
dir_path = os.path.dirname(os.path.abspath(__file__))

modelv_path = os.path.join(dir_path, "modelv.pth")
with open(modelv_path, "rb") as f:
self.modelv = pickle.load(f)

modell_path = os.path.join(dir_path, "modell.pth")
with open(modell_path, "rb") as f:
self.modell = pickle.load(f)

def fit(self, X: np.ndarray, y: np.ndarray):
pass

def predict(self, X: np.ndarray) -> np.ndarray:
return self.modell.predict(self.modelv.transform(X))

def finetune(self, X: np.ndarray, y: np.ndarray):
pass

+ 0
- 8
examples/dataset_text_workflow2/example_files/example_yaml.yaml View File

@@ -1,8 +0,0 @@
model:
class_name: Model
kwargs: { }
stat_specifications:
- module_path: learnware.specification
class_name: RKMETextSpecification
file_name: rkme.json
kwargs: { }

+ 0
- 4
examples/dataset_text_workflow2/example_files/requirements.txt View File

@@ -1,4 +0,0 @@
numpy
pickle
lightgbm
scikit-learn

+ 0
- 16
examples/dataset_text_workflow2/get_data.py View File

@@ -1,16 +0,0 @@
import os

import pandas as pd


def get_data(data_root="./data"):
dtrain = pd.read_csv(os.path.join(data_root, "train.csv"))
dtest = pd.read_csv(os.path.join(data_root, "test.csv"))

# returned X(DataFrame), y(Series)
return (
dtrain[["discourse_text", "discourse_type"]],
dtrain["discourse_effectiveness"],
dtest[["discourse_text", "discourse_type"]],
dtest["discourse_effectiveness"],
)

+ 0
- 257
examples/dataset_text_workflow2/main.py View File

@@ -1,257 +0,0 @@
import os
import pickle
import time
import zipfile
from shutil import copyfile, rmtree

import numpy as np

import learnware.specification as specification
from get_data import get_data
from learnware.logger import get_module_logger
from learnware.market import instantiate_learnware_market, BaseUserInfo
from learnware.reuse import JobSelectorReuser, AveragingReuser, EnsemblePruningReuser
from utils import generate_uploader, generate_user, TextDataLoader, train, eval_prediction

logger = get_module_logger("text_test", level="INFO")
origin_data_root = "./data/origin_data"
processed_data_root = "./data/processed_data"
tmp_dir = "./data/tmp"
learnware_pool_dir = "./data/learnware_pool"
dataset = "ae" # argumentative essays
n_uploaders = 7
n_users = 7
n_classes = 3
data_root = os.path.join(origin_data_root, dataset)
data_save_root = os.path.join(processed_data_root, dataset)
user_save_root = os.path.join(data_save_root, "user")
uploader_save_root = os.path.join(data_save_root, "uploader")
model_save_root = os.path.join(data_save_root, "uploader_model")
os.makedirs(data_root, exist_ok=True)
os.makedirs(user_save_root, exist_ok=True)
os.makedirs(uploader_save_root, exist_ok=True)
os.makedirs(model_save_root, exist_ok=True)

output_description = {
"Dimension": 1,
"Description": {
"0": "classify as 0(ineffective), 1(effective), or 2(adequate).",
},
}
semantic_specs = [
{
"Data": {"Values": ["Text"], "Type": "Class"},
"Task": {"Values": ["Classification"], "Type": "Class"},
"Library": {"Values": ["Scikit-learn"], "Type": "Class"},
"Scenario": {"Values": ["Education"], "Type": "Tag"},
"Description": {"Values": "", "Type": "String"},
"Name": {"Values": "learnware_1", "Type": "String"},
"Output": output_description,
}
]

user_semantic = {
"Data": {"Values": ["Text"], "Type": "Class"},
"Task": {"Values": ["Classification"], "Type": "Class"},
"Library": {"Values": ["Scikit-learn"], "Type": "Class"},
"Scenario": {"Values": ["Education"], "Type": "Tag"},
"Description": {"Values": "", "Type": "String"},
"Name": {"Values": "", "Type": "String"},
"Output": output_description,
}


def prepare_data():
    """Fetch the dataset and split it into per-uploader train shards and per-user test shards."""
    train_X, train_y, test_X, test_y = get_data(data_root)

    # Training portion feeds the simulated uploaders, test portion the simulated users.
    generate_uploader(train_X, train_y, n_uploaders=n_uploaders, data_save_root=uploader_save_root)
    generate_user(test_X, test_y, n_users=n_users, data_save_root=user_save_root)


def prepare_model():
    """Train one (TF-IDF vectorizer, classifier) pair per uploader and pickle both to disk."""
    loader = TextDataLoader(data_save_root, train=True)
    for uploader_idx in range(n_uploaders):
        logger.info("Train on uploader: %d" % (uploader_idx))
        X, y = loader.get_idx_data(uploader_idx)
        vectorizer, lgbm = train(X, y, out_classes=n_classes)

        # Two artifacts per uploader: the fitted vectorizer (v) and the classifier (l).
        vec_path = os.path.join(model_save_root, "uploader_v_%d.pth" % (uploader_idx))
        clf_path = os.path.join(model_save_root, "uploader_l_%d.pth" % (uploader_idx))

        for path, obj in ((vec_path, vectorizer), (clf_path, lgbm)):
            with open(path, "wb") as f:
                pickle.dump(obj, f)

        logger.info("Model saved to '%s' and '%s'" % (vec_path, clf_path))


def prepare_learnware(
    data_path, modelv_path, modell_path, init_file_path, yaml_path, env_file_path, save_root, zip_name
):
    """Stage one learnware's files under save_root, zip them, and return the zip path.

    The zip contains the generated RKME text spec plus the two model pickles
    and the learnware boilerplate (yaml / __init__.py / requirements.txt).
    The staging directory is removed after the zip is written.
    """
    os.makedirs(save_root, exist_ok=True)

    # archive name -> staged file path inside save_root (insertion order = zip order)
    staged = {
        "rkme.json": os.path.join(save_root, "rkme.json"),
        "modelv.pth": os.path.join(save_root, "modelv.pth"),
        "modell.pth": os.path.join(save_root, "modell.pth"),
        "learnware.yaml": os.path.join(save_root, "learnware.yaml"),
        "__init__.py": os.path.join(save_root, "__init__.py"),
        "requirements.txt": os.path.join(save_root, "requirements.txt"),
    }

    with open(data_path, "rb") as f:
        X = pickle.load(f)
    semantic_spec = semantic_specs[0]

    # Generate the statistical specification from the uploader's raw text data.
    st = time.time()
    user_spec = specification.RKMETextSpecification()
    user_spec.generate_stat_spec_from_data(X=X)
    ed = time.time()
    logger.info("Stat spec generated in %.3f s" % (ed - st))
    user_spec.save(staged["rkme.json"])

    # Copy the remaining artifacts into the staging directory.
    for src, arcname in (
        (modelv_path, "modelv.pth"),
        (modell_path, "modell.pth"),
        (yaml_path, "learnware.yaml"),
        (init_file_path, "__init__.py"),
        (env_file_path, "requirements.txt"),
    ):
        copyfile(src, staged[arcname])

    zip_file_name = os.path.join(learnware_pool_dir, "%s.zip" % (zip_name))
    with zipfile.ZipFile(zip_file_name, "w", compression=zipfile.ZIP_DEFLATED) as zip_obj:
        for arcname, path in staged.items():
            zip_obj.write(path, arcname)

    rmtree(save_root)
    logger.info("New Learnware Saved to %s" % (zip_file_name))
    return zip_file_name


def prepare_market():
    """Rebuild the market: package each uploader model as a learnware zip and add it.

    Fixes over the previous version:
    - ``rmtree`` is called with ``ignore_errors=True`` instead of a bare
      ``except:`` that silently swallowed every exception type.
    - The shared module-level ``semantic_specs[0]`` template is deep-copied
      per learnware instead of being mutated in place across iterations.
    """
    import copy

    text_market = instantiate_learnware_market(market_id="ae", rebuild=True)
    # Start from an empty zip pool; ignore_errors covers the first run where
    # the directory does not exist yet.
    rmtree(learnware_pool_dir, ignore_errors=True)
    os.makedirs(learnware_pool_dir, exist_ok=True)
    for i in range(n_uploaders):
        data_path = os.path.join(uploader_save_root, "uploader_%d_X.pkl" % (i))

        modelv_path = os.path.join(model_save_root, "uploader_v_%d.pth" % (i))
        modell_path = os.path.join(model_save_root, "uploader_l_%d.pth" % (i))

        init_file_path = "./example_files/example_init.py"
        yaml_file_path = "./example_files/example_yaml.yaml"
        env_file_path = "./example_files/requirements.txt"
        new_learnware_path = prepare_learnware(
            data_path,
            modelv_path,
            modell_path,
            init_file_path,
            yaml_file_path,
            env_file_path,
            tmp_dir,
            "%s_%d" % (dataset, i),
        )
        # Deep copy so per-learnware Name/Description edits never leak into
        # the shared template used by other callers.
        semantic_spec = copy.deepcopy(semantic_specs[0])
        semantic_spec["Name"]["Values"] = "learnware_%d" % (i)
        semantic_spec["Description"]["Values"] = "test_learnware_number_%d" % (i)
        text_market.add_learnware(new_learnware_path, semantic_spec)

    logger.info("Total Item: %d" % (len(text_market)))


def test_search(load_market=True):
    """Search the market for every user task and evaluate single and mixture reuse.

    Parameters
    ----------
    load_market : bool
        If True, load the previously built market; otherwise rebuild it
        via prepare_market() first.

    Fix over the previous version: the searcher can now return an EMPTY
    mixture list (when the best single learnware already matches), which
    would leave the reuse baselines below with no model. We fall back to
    the top-ranked single learnware in that case, mirroring the m5 workflow.
    """
    if load_market:
        text_market = instantiate_learnware_market(market_id="ae")
    else:
        prepare_market()
        text_market = instantiate_learnware_market(market_id="ae")
    logger.info("Number of items in the market: %d" % len(text_market))

    select_list = []  # accuracy of the top-ranked single learnware, per user
    avg_list = []  # mean accuracy over all returned learnwares, per user
    improve_list = []  # relative improvement of the top learnware over the mean
    job_selector_score_list = []
    ensemble_score_list = []
    pruning_score_list = []
    for i in range(n_users):
        user_data_path = os.path.join(user_save_root, "user_%d_X.pkl" % (i))
        user_label_path = os.path.join(user_save_root, "user_%d_y.pkl" % (i))
        with open(user_data_path, "rb") as f:
            user_data = pickle.load(f)
        with open(user_label_path, "rb") as f:
            user_label = pickle.load(f)

        # Build the user's statistical spec and query the market with it.
        user_stat_spec = specification.RKMETextSpecification()
        user_stat_spec.generate_stat_spec_from_data(X=user_data)
        user_info = BaseUserInfo(semantic_spec=user_semantic, stat_info={"RKMETextSpecification": user_stat_spec})
        logger.info("Searching Market for user: %d" % (i))
        sorted_score_list, single_learnware_list, mixture_score, mixture_learnware_list = text_market.search_learnware(
            user_info
        )

        # Evaluate every returned single learnware on the user's labeled data.
        acc_list = []
        for idx, (score, learnware) in enumerate(zip(sorted_score_list, single_learnware_list)):
            pred_y = learnware.predict(user_data)
            acc = eval_prediction(pred_y, user_label)
            acc_list.append(acc)
            logger.info("search rank: %d, score: %.3f, learnware_id: %s, acc: %.3f" % (idx, score, learnware.id, acc))

        # Guard: the searcher returns [] when no mixture beats the best single
        # learnware; keep the reuse baselines runnable with the top learnware.
        if not mixture_learnware_list:
            mixture_learnware_list = [single_learnware_list[0]]

        # test reuse (job selector)
        reuse_baseline = JobSelectorReuser(learnware_list=mixture_learnware_list, herding_num=100)
        reuse_predict = reuse_baseline.predict(user_data=user_data)
        reuse_score = eval_prediction(reuse_predict, user_label)
        job_selector_score_list.append(reuse_score)
        print(f"mixture reuse loss(job selector): {reuse_score}")

        # test reuse (ensemble)
        reuse_ensemble = AveragingReuser(learnware_list=mixture_learnware_list, mode="vote_by_label")
        ensemble_predict_y = reuse_ensemble.predict(user_data=user_data)
        ensemble_score = eval_prediction(ensemble_predict_y, user_label)
        ensemble_score_list.append(ensemble_score)
        print(f"mixture reuse accuracy (ensemble): {ensemble_score}")

        # test reuse (ensemblePruning)
        reuse_pruning = EnsemblePruningReuser(learnware_list=mixture_learnware_list)
        pruning_predict_y = reuse_pruning.predict(user_data=user_data)
        pruning_score = eval_prediction(pruning_predict_y, user_label)
        pruning_score_list.append(pruning_score)
        print(f"mixture reuse accuracy (ensemble Pruning): {pruning_score}\n")

        select_list.append(acc_list[0])
        avg_list.append(np.mean(acc_list))
        improve_list.append((acc_list[0] - np.mean(acc_list)) / np.mean(acc_list))

    logger.info(
        "Accuracy of selected learnware: %.3f +/- %.3f, Average performance: %.3f +/- %.3f"
        % (np.mean(select_list), np.std(select_list), np.mean(avg_list), np.std(avg_list))
    )
    logger.info("Average performance improvement: %.3f" % (np.mean(improve_list)))
    logger.info(
        "Average Job Selector Reuse Performance: %.3f +/- %.3f"
        % (np.mean(job_selector_score_list), np.std(job_selector_score_list))
    )
    logger.info(
        "Averaging Ensemble Reuse Performance: %.3f +/- %.3f"
        % (np.mean(ensemble_score_list), np.std(ensemble_score_list))
    )
    logger.info(
        "Selective Ensemble Reuse Performance: %.3f +/- %.3f"
        % (np.mean(pruning_score_list), np.std(pruning_score_list))
    )


if __name__ == "__main__":
prepare_data()
prepare_model()
test_search(load_market=False)

+ 0
- 104
examples/dataset_text_workflow2/utils.py View File

@@ -1,104 +0,0 @@
import os
import pickle

import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import TfidfVectorizer


class TextDataLoader:
def __init__(self, data_root, train: bool = True):
self.data_root = data_root
self.train = train

def get_idx_data(self, idx=0):
if self.train:
X_path = os.path.join(self.data_root, "uploader", "uploader_%d_X.pkl" % (idx))
y_path = os.path.join(self.data_root, "uploader", "uploader_%d_y.pkl" % (idx))
if not (os.path.exists(X_path) and os.path.exists(y_path)):
raise Exception("Index Error")
with open(X_path, "rb") as f:
X = pickle.load(f)
with open(y_path, "rb") as f:
y = pickle.load(f)
else:
X_path = os.path.join(self.data_root, "user", "user_%d_X.pkl" % (idx))
y_path = os.path.join(self.data_root, "user", "user_%d_y.pkl" % (idx))
if not (os.path.exists(X_path) and os.path.exists(y_path)):
raise Exception("Index Error")
with open(X_path, "rb") as f:
X = pickle.load(f)
with open(y_path, "rb") as f:
y = pickle.load(f)
return X, y


def generate_uploader(data_x: pd.Series, data_y: pd.Series, n_uploaders=50, data_save_root=None):
if data_save_root is None:
return
os.makedirs(data_save_root, exist_ok=True)

types = data_x["discourse_type"].unique()

for i in range(n_uploaders):
indices = data_x["discourse_type"] == types[i]
selected_X = data_x[indices]["discourse_text"].to_list()
selected_y = data_y[indices].to_list()

X_save_dir = os.path.join(data_save_root, "uploader_%d_X.pkl" % (i))
y_save_dir = os.path.join(data_save_root, "uploader_%d_y.pkl" % (i))
with open(X_save_dir, "wb") as f:
pickle.dump(selected_X, f)
with open(y_save_dir, "wb") as f:
pickle.dump(selected_y, f)

print("Saving to %s" % (X_save_dir))


def generate_user(data_x, data_y, n_users=50, data_save_root=None):
if data_save_root is None:
return
os.makedirs(data_save_root, exist_ok=True)

types = data_x["discourse_type"].unique()

for i in range(n_users):
indices = data_x["discourse_type"] == types[i]
selected_X = data_x[indices]["discourse_text"].to_list()
selected_y = data_y[indices].to_list()

X_save_dir = os.path.join(data_save_root, "user_%d_X.pkl" % (i))
y_save_dir = os.path.join(data_save_root, "user_%d_y.pkl" % (i))
with open(X_save_dir, "wb") as f:
pickle.dump(selected_X, f)
with open(y_save_dir, "wb") as f:
pickle.dump(selected_y, f)

print("Saving to %s" % (X_save_dir))


# Train Uploaders' models
def train(X, y, out_classes):
vectorizer = TfidfVectorizer(stop_words="english")
X_tfidf = vectorizer.fit_transform(X)

lgbm = LGBMClassifier(boosting_type="dart", n_estimators=500, num_leaves=21)
lgbm.fit(X_tfidf, y)

return vectorizer, lgbm


def eval_prediction(pred_y, target_y):
if not isinstance(pred_y, np.ndarray):
pred_y = pred_y.detach().cpu().numpy()
if len(pred_y.shape) == 1:
predicted = np.array(pred_y)
else:
predicted = np.argmax(pred_y, 1)
annos = np.array(target_y)

total = predicted.shape[0]
correct = (predicted == annos).sum().item()

return correct / total

+ 1
- 1
learnware/market/easy/searcher.py View File

@@ -599,7 +599,7 @@ class EasyStatSearcher(BaseSearcher):
merge_score_list = self._convert_dist_to_score(sorted_dist_list + [mixture_dist])
sorted_score_list = merge_score_list[:-1]
mixture_score = merge_score_list[-1]
if int(mixture_score * 100) == int(sorted_score_list[0] * 100):
if len(mixture_learnware_list) == 1 or int(mixture_score * 100) == int(sorted_score_list[0] * 100):
mixture_score = None
mixture_learnware_list = []
logger.info(


+ 15
- 2
learnware/specification/regular/table/__init__.py View File

@@ -1,14 +1,27 @@
from .utils import is_fast_pytorch_kmeans_available

from ....utils import is_torch_available
from ....logger import get_module_logger

logger = get_module_logger("regular_table_spec")

if not is_torch_available(verbose=False):
if not is_torch_available(verbose=False) or not is_fast_pytorch_kmeans_available(verbose=False):
RKMETableSpecification = None
RKMEStatSpecification = None
rkme_solve_qp = None
uninstall_packages = [
value
for flag, value in zip(
[
is_torch_available(verbose=False),
is_fast_pytorch_kmeans_available(verbose=False),
],
["torch", "fast_pytorch_kmeans"],
)
if flag is False
]
logger.warning(
"RKMETableSpecification, RKMEStatSpecification and rkme_solve_qp are skipped because torch is not installed!"
f"RKMETableSpecification, RKMEStatSpecification and rkme_solve_qp are skipped because {uninstall_packages} is not installed!"
)
else:
from .rkme import RKMETableSpecification, RKMEStatSpecification, rkme_solve_qp

+ 10
- 9
learnware/specification/regular/table/rkme.py View File

@@ -1,17 +1,15 @@
from __future__ import annotations

import os
import copy
import torch
import json
import codecs
import random
import scipy
import numpy as np
from qpsolvers import solve_qp, Problem, solve_problem
from collections import Counter
from typing import Tuple, Any, List, Union, Dict
import scipy
from sklearn.cluster import MiniBatchKMeans
from typing import Any, Union
from fast_pytorch_kmeans import KMeans

from ..base import RegularStatSpecification
from ....logger import get_module_logger
@@ -140,11 +138,14 @@ class RKMETableSpecification(RegularStatSpecification):
K : int
Size of the construced reduced set.
"""
X = X.astype("float32")
kmeans = MiniBatchKMeans(n_clusters=K, max_iter=100, verbose=False, n_init="auto")
if isinstance(X, np.ndarray):
X = X.astype("float32")
X = torch.from_numpy(X)
X = X.to(self._device)
kmeans = KMeans(n_clusters=K, mode='euclidean', max_iter=100, verbose=0)
kmeans.fit(X)
center = torch.from_numpy(kmeans.cluster_centers_).double()
self.z = center
self.z = kmeans.centroids.double()

def _update_beta(self, X: Any, nonnegative_beta: bool = True):
"""Fix Z and update beta using its closed-form solution.


+ 15
- 0
learnware/specification/regular/table/utils.py View File

@@ -0,0 +1,15 @@
from ....logger import get_module_logger

logger = get_module_logger("regular_table_spec_utils")


def is_fast_pytorch_kmeans_available(verbose=False):
try:
import fast_pytorch_kmeans
except ModuleNotFoundError as err:
if verbose is True:
logger.warning(
"ModuleNotFoundError: fast_pytorch_kmeans is not installed, please install fast_pytorch_kmeans!"
)
return False
return True

+ 8
- 2
learnware/specification/regular/text/__init__.py View File

@@ -1,11 +1,16 @@
from .utils import is_sentence_transformers_available
from ..table.utils import is_fast_pytorch_kmeans_available

from ....utils import is_torch_available
from ....logger import get_module_logger

logger = get_module_logger("regular_text_spec")

if not is_sentence_transformers_available(verbose=False) or not is_torch_available(verbose=False):
if (
not is_sentence_transformers_available(verbose=False)
or not is_torch_available(verbose=False)
or not is_fast_pytorch_kmeans_available(verbose=False)
):
RKMETextSpecification = None
uninstall_packages = [
value
@@ -13,8 +18,9 @@ if not is_sentence_transformers_available(verbose=False) or not is_torch_availab
[
is_sentence_transformers_available(verbose=False),
is_torch_available(verbose=False),
is_fast_pytorch_kmeans_available(verbose=False),
],
["sentence_transformers", "torch"],
["sentence_transformers", "torch", "fast_pytorch_kmeans"],
)
if flag is False
]


+ 4
- 3
setup.py View File

@@ -89,10 +89,11 @@ DEV_REQUIRED = [

FULL_REQUIRED = [
# The default full requirements for learnware package
"torch==2.1.0",
"torchvision==0.16.0",
"torch==2.0.1",
"torchvision==0.15.2",
"torch-optimizer>=0.3.0",
"sentence_transformers>=2.2.2",
"sentence_transformers==2.2.2",
"fast_pytorch_kmeans==0.2.0.1",
]

# In MACOS, the lightgbm package should be installed with brew.


Loading…
Cancel
Save