
Add lib code for incremental learning feature

Signed-off-by: khalid-davis <huangqinkai1@huawei.com>
Author: khalid-davis, committed by llhuii, 4 years ago
Commit: 6072a66a11 (tags/v0.1.0)
9 changed files with 210 additions and 7 deletions
  1. build/worker/base_images/tensorflow/tensorflow-2.3.Dockerfile (+13, -0)
  2. examples/surface_defect_detection/training_worker/inference.py (+2, -2)
  3. examples/surface_defect_detection/training_worker/train.py (+1, -1)
  4. lib/neptune/__init__.py (+1, -1)
  5. lib/neptune/common/constant.py (+1, -0)
  6. lib/neptune/dataset/dataset.py (+15, -3)
  7. lib/neptune/incremental_learning/__init__.py (+1, -0)
  8. lib/neptune/incremental_learning/incremental_learning.py (+174, -0)
  9. lib/requirements.txt (+2, -0)

build/worker/base_images/tensorflow/tensorflow-2.3.Dockerfile (+13, -0)

@@ -0,0 +1,13 @@
+FROM tensorflow/tensorflow:2.3.0
+
+RUN apt update \
+  && apt install -y libgl1-mesa-glx
+COPY ./lib/requirements.txt /home
+RUN pip install -r /home/requirements.txt
+
+ENV PYTHONPATH "/home/lib"
+
+WORKDIR /home/work
+COPY ./lib /home/lib
+
+ENTRYPOINT ["python"]
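
How the image is meant to be used: the entrypoint is plain `python` and the copied lib is on PYTHONPATH, so a worker container passes its script path as the command. A minimal, purely illustrative sketch (worker.py and the run command are assumptions for illustration, not part of this commit):

# worker.py -- illustrative sketch; would run as:
#   docker run <image> /home/work/worker.py
# The Dockerfile sets PYTHONPATH=/home/lib, so the copied lib imports cleanly.
import neptune

# load_train_dataset is re-exported from lib/neptune/dataset/dataset.py;
# the dataset location is resolved via the environment-driven BaseConfig.
train_data = neptune.load_train_dataset(data_format="txt", with_image=True)
print(f"loaded {len(train_data)} training samples")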

examples/surface_defect_detection/training_worker/inference.py (+2, -2)

@@ -2,13 +2,13 @@ import logging
 
 import numpy as np
 
-from neptune.ml_model import load_model
+import neptune.ml_model
 from neptune.ml_model import load_model
 
 LOG = logging.getLogger(__name__)
 
 if __name__ == '__main__':
-    valid_data = neptune.load_test_dataset(data_format="txt")
+    valid_data = neptune.load_test_dataset(data_format="txt", with_image=True)
 
     x_valid = np.array([tup[0] for tup in valid_data])
     y_valid = np.array([tup[1] for tup in valid_data])


examples/surface_defect_detection/training_worker/train.py (+1, -1)

@@ -8,7 +8,7 @@ from network import GlobalModelInspectionCNN
 
 def main():
     # load dataset.
-    train_data = neptune.load_train_dataset(data_format="txt")
+    train_data = neptune.load_train_dataset(data_format="txt", with_image=True)
 
     x = np.array([tup[0] for tup in train_data])
     y = np.array([tup[1] for tup in train_data])


lib/neptune/__init__.py (+1, -1)

@@ -1,6 +1,6 @@
 import logging
 
-from . import joint_inference, federated_learning
+from . import joint_inference, federated_learning, incremental_learning
 from .context import context
 from .dataset.dataset import load_train_dataset, load_test_dataset



lib/neptune/common/constant.py (+1, -0)

@@ -18,6 +18,7 @@ class Framework(Enum):
 class K8sResourceKind(Enum):
     JOINT_INFERENCE_SERVICE = "jointinferenceservice"
     FEDERATED_LEARNING_JOB = "federatedlearningjob"
+    INCREMENTAL_JOB = "incrementallearningjob"
 
 
 class K8sResourceKindStatus(Enum):


lib/neptune/dataset/dataset.py (+15, -3)

@@ -21,9 +21,12 @@ def _load_dataset(dataset_url, format, **kwargs):
         LOG.warning(f'dataset_url is None, please check the url.')
         return None
     if format == 'txt':
-        LOG.info("dataset format is txt, now loading txt from "
-                 f"[{dataset_url}]")
-        return _load_txt_dataset(dataset_url)
+        LOG.info(
+            f"dataset format is txt, now loading txt from [{dataset_url}]")
+        if kwargs.get('with_image'):
+            return _load_txt_dataset_with_image(dataset_url)
+        else:
+            return _load_txt_dataset(dataset_url)


def load_train_dataset(data_format, **kwargs):
@@ -45,6 +48,15 @@ def load_test_dataset(data_format, **kwargs):
 
 
 def _load_txt_dataset(dataset_url):
+    LOG.info(f'dataset_url is {dataset_url}, now reading dataset_url')
     root_path = BaseConfig.data_path_prefix
     with open(dataset_url) as f:
         lines = f.readlines()
     new_lines = [root_path + os.path.sep + l for l in lines]
     return new_lines
+
+
+def _load_txt_dataset_with_image(dataset_url):
+    import keras.preprocessing.image as img_preprocessing
+    root_path = os.path.dirname(dataset_url)
+    img_data = []
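
The rendered view cuts off inside the new helper. A plausible completion, purely a sketch (it assumes each line of the index file is "<relative image path> <label>" and reuses the module's existing os import and the keras alias from the hunk above; the real commit may differ):

def _load_txt_dataset_with_image(dataset_url):
    import keras.preprocessing.image as img_preprocessing
    root_path = os.path.dirname(dataset_url)
    img_data = []
    with open(dataset_url) as f:
        for line in f:
            # assumed index format: "<relative image path> <label>"
            img_path, label = line.strip().split()
            img = img_preprocessing.load_img(os.path.join(root_path, img_path))
            # pair the decoded image array with its label, matching the
            # (tup[0], tup[1]) access pattern in the example workers
            img_data.append((img_preprocessing.img_to_array(img), label))
    return img_data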


lib/neptune/incremental_learning/__init__.py (+1, -0)

@@ -0,0 +1 @@
+from .incremental_learning import *

lib/neptune/incremental_learning/incremental_learning.py (+174, -0)

@@ -0,0 +1,174 @@
+import logging
+
+import os
+import tensorflow as tf
+
+import neptune
+from neptune.common.config import BaseConfig
+from neptune.common.constant import K8sResourceKindStatus, K8sResourceKind
+from neptune.common.utils import clean_folder, remove_path_prefix
+from neptune.hard_example_mining import CrossEntropyFilter, IBTFilter, \
+    ThresholdFilter
+from neptune.joint_inference import TSLittleModel
+from neptune.lc_client import LCClient
+
+LOG = logging.getLogger(__name__)
+
+
+class IncrementalConfig(BaseConfig):
+    def __init__(self):
+        BaseConfig.__init__(self)
+        self.model_urls = os.getenv("MODEL_URLS")
+        self.base_model_url = os.getenv("BASE_MODEL_URL")
+
+
+def train(model, train_data, epochs, batch_size, class_names, input_shape,
+          obj_threshold, nms_threshold):
+    """The train endpoint of incremental learning.
+
+    :param model: the train model
+    :param train_data: the data used for training
+    :param epochs: the number of epochs for training the model
+    :param batch_size: the number of samples in one training batch
+    :param class_names: the class names of the dataset
+    :param input_shape: the input shape of the model
+    :param obj_threshold: the confidence threshold for detected objects
+    :param nms_threshold: the IoU threshold for non-maximum suppression
+    """
+    il_config = IncrementalConfig()
+
+    clean_folder(il_config.model_url)
+    model.train(train_data, [])  # validation data is empty.
+    tf.reset_default_graph()
+    model.save_model_pb()
+
+    ckpt_model_url = remove_path_prefix(il_config.model_url,
+                                        il_config.data_path_prefix)
+    pb_model_url = remove_path_prefix(
+        os.path.join(il_config.model_url, 'model.pb'),
+        il_config.data_path_prefix)
+
+    # TODO: check whether removing the metrics affects the LC
+    ckpt_result = {
+        "format": "ckpt",
+        "url": ckpt_model_url,
+    }
+
+    pb_result = {
+        "format": "pb",
+        "url": pb_model_url,
+    }
+
+    results = [ckpt_result, pb_result]
+
+    message = {
+        "name": il_config.worker_name,
+        "namespace": il_config.namespace,
+        "ownerName": il_config.job_name,
+        "ownerKind": K8sResourceKind.INCREMENTAL_JOB.value,
+        "kind": "train",
+        "status": K8sResourceKindStatus.COMPLETED.value,
+        "results": results
+    }
+    LCClient.send(il_config.worker_name, message)
+
+
+def evaluate(model, test_data, class_names, input_shape):
+    """The evaluation endpoint of the incremental job.
+
+    :param model: the model used for evaluation
+    :param test_data: the data used for evaluation
+    :param class_names: the class names of the dataset
+    :param input_shape: the input shape of the model
+    """
+    il_config = IncrementalConfig()
+
+    results = []
+    for model_url in il_config.model_urls.split(';'):
+        precision, recall, all_precisions, all_recalls = model(
+            model_path=model_url,
+            test_dataset=test_data,
+            class_names=class_names,
+            input_shape=input_shape)
+
+        result = {
+            "format": "pb",
+            "url": remove_path_prefix(model_url, il_config.data_path_prefix),
+            "metrics": {
+                "recall": recall,
+                "precision": precision
+            }
+        }
+        results.append(result)
+
+    message = {
+        "name": il_config.worker_name,
+        "namespace": il_config.namespace,
+        "ownerName": il_config.job_name,
+        "ownerKind": K8sResourceKind.INCREMENTAL_JOB.value,
+        "kind": "eval",
+        "status": K8sResourceKindStatus.COMPLETED.value,
+        "results": results
+    }
+
+    LCClient.send(il_config.worker_name, message)
+
+
+class TSModel(TSLittleModel):
+    def __init__(self, preprocess=None, postprocess=None, input_shape=(0, 0),
+                 create_input_feed=None, create_output_fetch=None):
+        TSLittleModel.__init__(self, preprocess, postprocess, input_shape,
+                               create_input_feed, create_output_fetch)
+
+
+class InferenceResult:
+    def __init__(self, is_hard_example, infer_result):
+        self.is_hard_example = is_hard_example
+        self.infer_result = infer_result
+
+
+class Inference:
+    def __init__(self, model: TSModel, hard_example_mining_algorithm=None):
+        if hard_example_mining_algorithm is None:
+            hem_name = BaseConfig.hem_name
+
+            if hem_name == "IBT":
+                threshold_box = float(neptune.context.get_hem_parameters(
+                    "threshold_box", 0.8
+                ))
+                threshold_img = float(neptune.context.get_hem_parameters(
+                    "threshold_img", 0.8
+                ))
+                hard_example_mining_algorithm = IBTFilter(threshold_img,
+                                                          threshold_box)
+            elif hem_name == "CrossEntropy":
+                threshold_cross_entropy = float(
+                    neptune.context.get_hem_parameters(
+                        "threshold_cross_entropy", 0.5
+                    )
+                )
+                hard_example_mining_algorithm = CrossEntropyFilter(
+                    threshold_cross_entropy)
+            else:
+                hard_example_mining_algorithm = ThresholdFilter()
+        self.hard_example_mining_algorithm = hard_example_mining_algorithm
+        self.model = model
+
+    def inference(self, img_data) -> InferenceResult:
+        result = self.model.inference(img_data)
+        bboxes = deal_infer_rsl(result)
+        is_hard_example = self.hard_example_mining_algorithm.hard_judge(bboxes)
+        if is_hard_example:
+            return InferenceResult(True, result)
+        else:
+            return InferenceResult(False, result)
+
+
+def deal_infer_rsl(model_output):
+    # reorder each box's coordinates (swap x/y) and append score and class,
+    # producing the flat box format the hard-example filters expect
+    all_classes, all_scores, all_bboxes = model_output
+    bboxes = []
+    for c, s, bbox in zip(all_classes, all_scores, all_bboxes):
+        bbox[0], bbox[1], bbox[2], bbox[3] = bbox[1], bbox[0], bbox[3], bbox[2]
+        bboxes.append(bbox.tolist() + [s, c])
+
+    return bboxes
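
Taken together, the new module wires hard-example mining into the inference path. A hedged usage sketch of the Inference wrapper (the model construction arguments and the img_data frame below are placeholders, not defined by this commit; TSModel's real preprocess/postprocess and feed/fetch builders depend on the actual model graph):

# Sketch only: how a worker might drive the new Inference wrapper.
from neptune.incremental_learning import Inference, InferenceResult, TSModel

model = TSModel()  # placeholder: real callers pass preprocess/postprocess,
                   # input_shape, create_input_feed, create_output_fetch

# With no algorithm passed, Inference picks the filter from the HEM config:
# "IBT", "CrossEntropy", or the default ThresholdFilter.
inferer = Inference(model)

result: InferenceResult = inferer.inference(img_data)  # img_data: one frame
if result.is_hard_example:
    # hard examples would be kept to feed the next incremental training round
    print("hard example:", result.infer_result)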

lib/requirements.txt (+2, -0)

@@ -4,3 +4,5 @@ opencv-python==4.4.0.44
 websockets==8.1
 Pillow==8.0.1
 requests==2.24.0
+tqdm==4.56.0
+matplotlib==3.3.3
