# kubeedge/sedna

# Copyright 2021 The KubeEdge Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import pandas as pd
import numpy as np
import xgboost
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score

# Exported before the sedna imports in the __main__ section run.
# NOTE(review): presumably sedna reads BACKEND_TYPE to select its ML backend
# -- confirm against the sedna configuration code.
os.environ['BACKEND_TYPE'] = 'SKLEARN'

# Column names of the dataset. "LABEL" is passed as the label column when the
# CSV files are parsed in the __main__ section.
# NOTE(review): "ATTRIBUTES" is presumably consumed by importers of this
# module, and the "startegy" spelling presumably mirrors the dataset's own
# column header -- verify both before changing anything here.
DATACONF = {
    "ATTRIBUTES": ["Season", "Cooling startegy_building level"],
    "LABEL": "Thermal preference",
}


def _is_missing(value) -> bool:
    """Return True when *value* is None, NaN/NA, or a blank string."""
    if isinstance(value, str):
        return not value.strip()
    return bool(pd.isna(value))


def feature_process(df: pd.DataFrame):
    """Clean a raw data frame in place and return it.

    Steps, in order:

    * drop the "City" column when it is present;
    * convert every column except "Season" and the label to ``float``,
      replacing falsy cells (``None``, ``""``, ``0``) with ``0.0``;
    * convert the "Thermal preference" label to ``int``, replacing missing
      cells (``None``, NaN/NA, blank strings) with the default class ``1``.

    The label is converted from its raw values rather than after the generic
    float pass. Testing the already-converted value for truthiness cannot
    tell a genuine class ``0`` from a missing cell (both are ``0.0`` by
    then), which silently rewrote every class-0 label to ``1``; it also let
    NaN through to ``int()``, which raises.

    Args:
        df: frame to clean; it is modified in place.

    Returns:
        The same ``df`` object, after cleaning.

    Raises:
        KeyError: if the "Thermal preference" column is absent.
        ValueError: if a non-missing cell cannot be parsed as a number.
    """
    label = "Thermal preference"
    if "City" in df.columns:
        df.drop(["City"], axis=1, inplace=True)
    for feature in df.columns:
        # "Season" is left untouched; the label has its own rule below.
        if feature in ("Season", label):
            continue
        df[feature] = df[feature].apply(lambda x: float(x) if x else 0.0)
    df[label] = df[label].apply(
        lambda x: 1 if _is_missing(x) else int(float(x)))
    return df


class Estimator:
    """Thin wrapper around an ``xgboost.XGBClassifier``.

    Exposes train / predict / predict_proba / evaluate / load / save on top
    of a classifier created with fixed hyper-parameters.
    """

    def __init__(self):
        """Create the underlying classifier with its fixed hyper-parameters."""
        hyper_params = dict(
            learning_rate=0.1,
            n_estimators=600,
            max_depth=2,
            min_child_weight=1,
            gamma=0,
            subsample=0.8,
            colsample_bytree=0.8,
            objective="multi:softmax",
            num_class=3,
            nthread=4,
            seed=27,
        )
        self.model = xgboost.XGBClassifier(**hyper_params)

    def train(self, train_data, valid_data=None,
              save_best=True,
              metric_name="mlogloss",
              early_stopping_rounds=100
              ):
        """Fit the classifier with early stopping and summarise the metrics.

        Args:
            train_data: object exposing the training features as ``.x`` and
                the labels as ``.y``.
            valid_data: optional object with ``.x`` / ``.y`` used as the
                evaluation set. When it is falsy, 10% of the training data
                is held out instead (``random_state=42``).
            save_best: forwarded to ``xgboost.callback.EarlyStopping``.
            metric_name: forwarded to ``xgboost.callback.EarlyStopping``.
            early_stopping_rounds: forwarded as ``rounds`` to
                ``xgboost.callback.EarlyStopping``.

        Returns:
            dict mapping each metric name to the mean, across evaluation
            sets, of that metric's mean over all recorded rounds.
        """
        early_stop = xgboost.callback.EarlyStopping(
            metric_name=metric_name,
            rounds=early_stopping_rounds,
            save_best=save_best,
        )
        features, labels = train_data.x, train_data.y
        if valid_data:
            eval_features, eval_labels = valid_data.x, valid_data.y
        else:
            # No evaluation data supplied: carve it out of the training data.
            features, eval_features, labels, eval_labels = train_test_split(
                features, labels, test_size=0.1, random_state=42)
        fitted = self.model.fit(
            features, labels,
            eval_set=[(eval_features, eval_labels)],
            callbacks=[early_stop],
        )
        # First average each metric trace per evaluation set, then average
        # those per-set means for each metric name.
        per_metric = {}
        for scores_by_metric in fitted.evals_result().values():
            for metric, trace in scores_by_metric.items():
                per_metric.setdefault(metric, []).append(np.mean(trace))
        return {name: np.mean(means) for name, means in per_metric.items()}

    def predict(self, datas, **kwargs):
        """Return the underlying model's predictions for ``datas``.

        Extra keyword arguments are accepted and ignored.
        """
        predictions = self.model.predict(datas)
        return predictions

    def predict_proba(self, datas, **kwargs):
        """Return the underlying model's ``predict_proba`` output for ``datas``.

        Extra keyword arguments are accepted and ignored.
        """
        probabilities = self.model.predict_proba(datas)
        return probabilities

    def evaluate(self, test_data, **kwargs):
        """Return the micro-averaged precision on ``test_data``.

        Predictions for ``test_data.x`` are scored against ``test_data.y``.
        """
        predicted = self.predict(test_data.x)
        return precision_score(test_data.y, predicted, average="micro")

    def load(self, model_url):
        """Load model state from ``model_url`` and return this estimator."""
        self.model.load_model(model_url)
        return self

    def save(self, model_path=None):
        """Save the model via ``save_model(model_path)`` and return its result."""
        outcome = self.model.save_model(model_path)
        return outcome


if __name__ == '__main__':
    # Script entry point: parse the train and test CSV files, fit the
    # estimator on the training data, then print the training metrics and
    # the precision measured on the test file.
    from sedna.datasources import CSVDataParse
    from sedna.common.config import BaseConfig

    label_column = DATACONF["LABEL"]

    train_data = CSVDataParse(data_type="train", func=feature_process)
    train_data.parse(BaseConfig.train_dataset_url, label=label_column)

    # The test file is only used for the final evaluation; it is not passed
    # to train(), which therefore holds out part of the training data.
    valid_data = CSVDataParse(data_type="valid", func=feature_process)
    valid_data.parse(BaseConfig.test_dataset_url, label=label_column)

    model = Estimator()
    train_metrics = model.train(train_data)
    print(train_metrics)
    print(model.evaluate(test_data=valid_data))