# Copyright 2021 The KubeEdge Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

import numpy as np
import pandas as pd
import xgboost
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split

# Select Sedna's scikit-learn backend wrapper for this estimator.
os.environ['BACKEND_TYPE'] = 'SKLEARN'

# Schema of the thermal-comfort CSV. The misspelling in
# "Cooling startegy_building level" is kept on purpose: it mirrors the
# column header in the dataset this example ships with.
DATACONF = {
    "ATTRIBUTES": ["Season", "Cooling startegy_building level"],
    "LABEL": "Thermal preference",
}


def feature_process(df: pd.DataFrame):
    """Clean one CSV chunk: drop unused columns and coerce value types."""
    if "City" in df.columns:
        df.drop(["City"], axis=1, inplace=True)
    for feature in df.columns:
        if feature == "Season":
            # "Season" is a string category; skip the numeric cast.
            continue
        df[feature] = df[feature].apply(lambda x: float(x) if x else 0.0)
    # Cast the label to an integer class id; missing values default to 1.
    df[DATACONF["LABEL"]] = df[DATACONF["LABEL"]].apply(
        lambda x: int(float(x)) if x else 1)
    return df


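# A minimal sketch of what feature_process yields (toy row invented for
# illustration, not taken from the real dataset):
#
#   df = pd.DataFrame({"City": ["Berlin"], "Season": ["Summer"],
#                      "Thermal preference": ["1.0"]})
#   feature_process(df)
#   # -> "City" dropped, "Season" kept as a string,
#   #    "Thermal preference" coerced to the integer class 1

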
class Estimator:
    def __init__(self):
        """Model init: a multiclass booster for 3 thermal-preference classes."""
        self.model = xgboost.XGBClassifier(
            learning_rate=0.1,
            n_estimators=600,
            max_depth=2,
            min_child_weight=1,
            gamma=0,
            subsample=0.8,
            colsample_bytree=0.8,
            objective="multi:softmax",
            num_class=3,
            nthread=4,
            seed=27)

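    # NOTE (version assumption): passing ``callbacks`` to fit() follows the
    # xgboost 1.x sklearn API; in xgboost >= 2.0 callbacks are supplied to
    # the XGBClassifier constructor instead.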
    def train(self, train_data, valid_data=None,
              save_best=True,
              metric_name="mlogloss",
              early_stopping_rounds=100):
        """Fit the model with early stopping; return averaged eval metrics."""
        es = [
            xgboost.callback.EarlyStopping(
                metric_name=metric_name,
                rounds=early_stopping_rounds,
                save_best=save_best
            )
        ]
        x, y = train_data.x, train_data.y
        if valid_data:
            x1, y1 = valid_data.x, valid_data.y
        else:
            # No explicit validation set: hold out 10% of the training data.
            x, x1, y, y1 = train_test_split(
                x, y, test_size=0.1, random_state=42)
        self.model.fit(x, y, eval_set=[(x1, y1)], callbacks=es)
        # Flatten evals_result() -- {eval set: {metric: [per-round values]}} --
        # into a single mean value per metric name.
        d = {}
        for k, v in self.model.evals_result().items():
            for k1, v1 in v.items():
                d.setdefault(k1, []).append(np.mean(v1))
        for k, v in d.items():
            d[k] = np.mean(v)
        return d

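    # Example call (sketch; ``train_data``/``valid_data`` are Sedna data
    # objects exposing .x and .y):
    #   metrics = Estimator().train(train_data, valid_data=valid_data)
    #   # metrics looks like {"mlogloss": 0.41} (value illustrative)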
    def predict(self, datas, **kwargs):
        """Model inference: return the predicted class for each sample."""
        return self.model.predict(datas)

    def predict_proba(self, datas, **kwargs):
        """Return the per-class probability for each sample."""
        return self.model.predict_proba(datas)

    def evaluate(self, test_data, **kwargs):
        """Model evaluate: micro-averaged precision (equals accuracy here)."""
        y_pred = self.predict(test_data.x)
        return precision_score(test_data.y, y_pred, average="micro")

    def load(self, model_url):
        """Restore a previously saved booster from model_url."""
        self.model.load_model(model_url)
        return self

    def save(self, model_path=None):
        """Persist the trained booster to model_path in XGBoost's own format."""
        return self.model.save_model(model_path)


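# Local smoke test: parse the CSVs named by Sedna's BaseConfig and run a
# single train/evaluate pass. In a full Sedna job this Estimator class is
# typically handed to the lifelong-learning API rather than driven directly.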
if __name__ == '__main__':
    from sedna.datasources import CSVDataParse
    from sedna.common.config import BaseConfig

    # Dataset locations are injected through Sedna's environment-driven config.
    train_dataset_url = BaseConfig.train_dataset_url
    train_data = CSVDataParse(data_type="train", func=feature_process)
    train_data.parse(train_dataset_url, label=DATACONF["LABEL"])

    test_dataset_url = BaseConfig.test_dataset_url
    valid_data = CSVDataParse(data_type="valid", func=feature_process)
    valid_data.parse(test_dataset_url, label=DATACONF["LABEL"])

    model = Estimator()
    print(model.train(train_data))
    print(model.evaluate(test_data=valid_data))
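
    # Hypothetical round trip (the file name is an assumption for
    # illustration, not part of the original example): persist the trained
    # booster, reload it into a fresh Estimator, and re-score it.
    model.save("./thermal_comfort.model")
    restored = Estimator().load("./thermal_comfort.model")
    print(restored.evaluate(test_data=valid_data))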