scisharp
/
TensorFlow.NET

﻿using NumSharp;
using System;
using System.Collections.Generic;
using System.Text;
using Tensorflow;
using static Tensorflow.Binding;

namespace TensorFlowNET.Examples.ImageProcessing.YOLO
{
    public class YOLOv3
    {
        Config cfg;
        Tensor trainable;
        Tensor input_data;
        Dictionary<int, string> classes;
        int num_class;
        NDArray strides;
        NDArray anchors;
        int anchor_per_scale;
        float iou_loss_thresh;
        string upsample_method;
        Tensor conv_lbbox;
        Tensor conv_mbbox;
        Tensor conv_sbbox;
        Tensor pred_sbbox;
        Tensor pred_mbbox;
        Tensor pred_lbbox;

        public YOLOv3(Config cfg_, Tensor input_data_, Tensor trainable_)
        {
            cfg = cfg_;
            input_data = input_data_;
            trainable = trainable_;
            classes = Utils.read_class_names(cfg.YOLO.CLASSES);
            num_class = len(classes);
            strides = np.array(cfg.YOLO.STRIDES);
            anchors = Utils.get_anchors(cfg.YOLO.ANCHORS);
            anchor_per_scale = cfg.YOLO.ANCHOR_PER_SCALE;
            iou_loss_thresh = cfg.YOLO.IOU_LOSS_THRESH;
            upsample_method = cfg.YOLO.UPSAMPLE_METHOD;

            (conv_lbbox, conv_mbbox, conv_sbbox) = __build_nework(input_data);

            tf_with(tf.variable_scope("pred_sbbox"), scope =>
            {
                pred_sbbox = decode(conv_sbbox, anchors[0], strides[0]);
            });

            tf_with(tf.variable_scope("pred_mbbox"), scope =>
            {
                pred_mbbox = decode(conv_mbbox, anchors[1], strides[1]);
            });

            tf_with(tf.variable_scope("pred_lbbox"), scope =>
            {
                pred_lbbox = decode(conv_lbbox, anchors[2], strides[2]);
            });
        }

        private (Tensor, Tensor, Tensor) __build_nework(Tensor input_data)
        {
            Tensor route_1, route_2;
            (route_1, route_2, input_data) = backbone.darknet53(input_data, trainable);
            input_data = common.convolutional(input_data, new[] { 1, 1, 1024, 512 }, trainable, "conv52");
            input_data = common.convolutional(input_data, new[] { 3, 3, 512, 1024 }, trainable, "conv53");
            input_data = common.convolutional(input_data, new[] { 1, 1, 1024, 512 }, trainable, "conv54");
            input_data = common.convolutional(input_data, new[] { 3, 3, 512, 1024 }, trainable, "conv55");
            input_data = common.convolutional(input_data, new[] { 1, 1, 1024, 512 }, trainable, "conv56");

            var conv_lobj_branch = common.convolutional(input_data, new[] { 3, 3, 512, 1024 }, trainable, name: "conv_lobj_branch");
            var conv_lbbox = common.convolutional(conv_lobj_branch, new[] { 1, 1, 1024, 3 * (num_class + 5) },
                                          trainable: trainable, name: "conv_lbbox", activate: false, bn: false);

            input_data = common.convolutional(input_data, new[] { 1, 1, 512, 256 }, trainable, "conv57");
            input_data = common.upsample(input_data, name: "upsample0", method: upsample_method);

            tf_with(tf.variable_scope("route_1"), delegate
            {
                input_data = tf.concat(new[] { input_data, route_2 }, axis: -1);
            });

            input_data = common.convolutional(input_data, new[] { 1, 1, 768, 256 }, trainable, "conv58");
            input_data = common.convolutional(input_data, new[] { 3, 3, 256, 512 }, trainable, "conv59");
            input_data = common.convolutional(input_data, new[] { 1, 1, 512, 256 }, trainable, "conv60");
            input_data = common.convolutional(input_data, new[] { 3, 3, 256, 512 }, trainable, "conv61");
            input_data = common.convolutional(input_data, new[] { 1, 1, 512, 256 }, trainable, "conv62");

            var conv_mobj_branch = common.convolutional(input_data, new[] { 3, 3, 256, 512 }, trainable, name: "conv_mobj_branch");
            conv_mbbox = common.convolutional(conv_mobj_branch, new[] { 1, 1, 512, 3 * (num_class + 5) },
                                          trainable: trainable, name: "conv_mbbox", activate: false, bn: false);

            input_data = common.convolutional(input_data, new[] { 1, 1, 256, 128 }, trainable, "conv63");
            input_data = common.upsample(input_data, name: "upsample1", method: upsample_method);

            tf_with(tf.variable_scope("route_2"), delegate
            {
                input_data = tf.concat(new[] { input_data, route_1 }, axis: -1);
            });

            input_data = common.convolutional(input_data, new[] { 1, 1, 384, 128 }, trainable, "conv64");
            input_data = common.convolutional(input_data, new[] { 3, 3, 128, 256 }, trainable, "conv65");
            input_data = common.convolutional(input_data, new[] { 1, 1, 256, 128 }, trainable, "conv66");
            input_data = common.convolutional(input_data, new[] { 3, 3, 128, 256 }, trainable, "conv67");
            input_data = common.convolutional(input_data, new[] { 1, 1, 256, 128 }, trainable, "conv68");

            var conv_sobj_branch = common.convolutional(input_data, new[] { 3, 3, 128, 256 }, trainable, name: "conv_sobj_branch");
            conv_sbbox = common.convolutional(conv_sobj_branch, new[] { 1, 1, 256, 3 * (num_class + 5) },
                                          trainable: trainable, name: "conv_sbbox", activate: false, bn: false);
            
            return (conv_lbbox, conv_mbbox, conv_sbbox);
        }

        private Tensor decode(Tensor conv_output, NDArray anchors, int stride)
        {
            var conv_shape = tf.shape(conv_output);
            var batch_size = conv_shape[0];
            var output_size = conv_shape[1];
            anchor_per_scale = len(anchors);

            conv_output = tf.reshape(conv_output, new object[] { batch_size, output_size, output_size, anchor_per_scale, 5 + num_class });

            var conv_raw_dxdy = conv_output[":", ":", ":", ":", "0:2"];
            var conv_raw_dwdh = conv_output[":", ":", ":", ":", "2:4"];
            var conv_raw_conf = conv_output[":", ":", ":", ":", "4:5"];
            var conv_raw_prob = conv_output[":", ":", ":", ":", "5:"];

            var y = tf.tile(tf.range(output_size, dtype: tf.int32)[":", tf.newaxis], new object[] { 1, output_size });
            var x = tf.tile(tf.range(output_size, dtype: tf.int32)[tf.newaxis, ":"], new object[] { output_size, 1 });

            var xy_grid = tf.concat(new[] { x[":", ":", tf.newaxis], y[":", ":", tf.newaxis] }, axis: -1);
            xy_grid = tf.tile(xy_grid[tf.newaxis, ":", ":", tf.newaxis, ":"], new object[] { batch_size, 1, 1, anchor_per_scale, 1 });
            xy_grid = tf.cast(xy_grid, tf.float32);

            var pred_xy = (tf.sigmoid(conv_raw_dxdy) + xy_grid) * stride;
            var pred_wh = (tf.exp(conv_raw_dwdh) * anchors) * stride;
            var pred_xywh = tf.concat(new[] { pred_xy, pred_wh }, axis: -1);

            var pred_conf = tf.sigmoid(conv_raw_conf);
            var pred_prob = tf.sigmoid(conv_raw_prob);

            return tf.concat(new[] { pred_xywh, pred_conf, pred_prob }, axis: -1);
        }

        public (Tensor, Tensor, Tensor) compute_loss(Tensor label_sbbox, Tensor label_mbbox, Tensor label_lbbox,
            Tensor true_sbbox, Tensor true_mbbox, Tensor true_lbbox)
        {
            Tensor giou_loss = null, conf_loss = null, prob_loss = null;
            (Tensor, Tensor, Tensor) loss_sbbox = (null, null, null);
            (Tensor, Tensor, Tensor) loss_mbbox = (null, null, null);
            (Tensor, Tensor, Tensor) loss_lbbox = (null, null, null);

            tf_with(tf.name_scope("smaller_box_loss"), delegate
            {
                loss_sbbox = loss_layer(conv_sbbox, pred_sbbox, label_sbbox, true_sbbox,
                                         anchors: anchors[0], stride: strides[0]);
            });

            tf_with(tf.name_scope("medium_box_loss"), delegate
            {
                loss_mbbox = loss_layer(conv_mbbox, pred_mbbox, label_mbbox, true_mbbox,
                                         anchors: anchors[1], stride: strides[1]);
            });

            tf_with(tf.name_scope("bigger_box_loss"), delegate
            {
                loss_lbbox = loss_layer(conv_lbbox, pred_lbbox, label_lbbox, true_lbbox,
                                         anchors: anchors[2], stride: strides[2]);
            });

            tf_with(tf.name_scope("giou_loss"), delegate
            {
                giou_loss = loss_sbbox.Item1 + loss_mbbox.Item1 + loss_lbbox.Item1;
            });

            tf_with(tf.name_scope("conf_loss"), delegate
            {
                conf_loss = loss_sbbox.Item2 + loss_mbbox.Item2 + loss_lbbox.Item2;
            });

            tf_with(tf.name_scope("prob_loss"), delegate
            {
                prob_loss = loss_sbbox.Item3 + loss_mbbox.Item3 + loss_lbbox.Item3;
            });

            return (giou_loss, conf_loss, prob_loss);
        }

        public (Tensor, Tensor, Tensor) loss_layer(Tensor conv, Tensor pred, Tensor label, Tensor bboxes, NDArray anchors, int stride)
        {
            var conv_shape = tf.shape(conv);
            var batch_size = conv_shape[0];
            var output_size = conv_shape[1];
            var input_size = stride * output_size;
            conv = tf.reshape(conv, new object[] {batch_size, output_size, output_size,
                                 anchor_per_scale, 5 + num_class });
            var conv_raw_conf = conv[":", ":", ":", ":", "4:5"];
            var conv_raw_prob = conv[":", ":", ":", ":", "5:"];

            var pred_xywh = pred[":", ":", ":", ":", "0:4"];
            var pred_conf = pred[":", ":", ":", ":", "4:5"];

            var label_xywh = label[":", ":", ":", ":", "0:4"];
            var respond_bbox = label[":", ":", ":", ":", "4:5"];
            var label_prob = label[":", ":", ":", ":", "5:"];

            var giou = tf.expand_dims(bbox_giou(pred_xywh, label_xywh), axis: -1);
            input_size = tf.cast(input_size, tf.float32);

            var bbox_loss_scale = 2.0 - 1.0 * label_xywh[":", ":", ":", ":", "2:3"] * label_xywh[":", ":", ":", ":", "3:4"] / (tf.sqrt(input_size));
            var giou_loss = respond_bbox * bbox_loss_scale * (1 - giou);

            var iou = bbox_iou(pred_xywh[":", ":", ":", ":", tf.newaxis, ":"], bboxes[":", tf.newaxis, tf.newaxis, tf.newaxis, ":", ":"]);
            var max_iou = tf.expand_dims(tf.reduce_max(iou, axis: new[] { -1 }), axis: -1);

            var respond_bgd = (1.0 - respond_bbox) * tf.cast(max_iou < iou_loss_thresh, tf.float32);

            var conf_focal = focal(respond_bbox, pred_conf);

            var conf_loss = conf_focal * (
                respond_bbox * tf.nn.sigmoid_cross_entropy_with_logits(labels: respond_bbox, logits: conv_raw_conf) +
                respond_bgd * tf.nn.sigmoid_cross_entropy_with_logits(labels: respond_bbox, logits: conv_raw_conf));

            var prob_loss = respond_bbox * tf.nn.sigmoid_cross_entropy_with_logits(labels: label_prob, logits: conv_raw_prob);

            giou_loss = tf.reduce_mean(tf.reduce_sum(giou_loss, axis: new[] { 1, 2, 3, 4 }));
            conf_loss = tf.reduce_mean(tf.reduce_sum(conf_loss, axis: new[] { 1, 2, 3, 4 }));
            prob_loss = tf.reduce_mean(tf.reduce_sum(prob_loss, axis: new[] { 1, 2, 3, 4 }));

            return (giou_loss, conf_loss, prob_loss);
        }

        public Tensor focal(Tensor target, Tensor actual, int alpha = 1, int gamma = 2)
        {
            var focal_loss = alpha * tf.pow(tf.abs(target - actual), gamma);
            return focal_loss;
        }

        public Tensor bbox_giou(Tensor boxes1, Tensor boxes2)
        {
            boxes1 = tf.concat(new[] { boxes1["...", ":2"] - boxes1["...", "2:"] * 0.5,
                            boxes1["...", ":2"] + boxes1["...", "2:"] * 0.5}, axis: -1);
            boxes2 = tf.concat(new[] { boxes2["...", ":2"] - boxes2["...", "2:"] * 0.5,
                            boxes2["...", ":2"] + boxes2["...", "2:"] * 0.5}, axis: -1);

            boxes1 = tf.concat(new[] { tf.minimum(boxes1["...", ":2"], boxes1["...", "2:"]),
                            tf.maximum(boxes1["...", ":2"], boxes1["...", "2:"])}, axis: -1);
            boxes2 = tf.concat(new[] { tf.minimum(boxes2["...", ":2"], boxes2["...", "2:"]),
                            tf.maximum(boxes2["...", ":2"], boxes2["...", "2:"])}, axis: -1);

            var boxes1_area = (boxes1["...", "2"] - boxes1["...", "0"]) * (boxes1["...", "3"] - boxes1["...", "1"]);
            var boxes2_area = (boxes2["...", "2"] - boxes2["...", "0"]) * (boxes2["...", "3"] - boxes2["...", "1"]);

            var left_up = tf.maximum(boxes1["...", ":2"], boxes2["...", ":2"]);
            var right_down = tf.minimum(boxes1["...", "2:"], boxes2["...", "2:"]);

            var inter_section = tf.maximum(right_down - left_up, 0.0f);
            var inter_area = inter_section["...", "0"] * inter_section["...", "1"];
            var union_area = boxes1_area + boxes2_area - inter_area;
            var iou = inter_area / union_area;

            var enclose_left_up = tf.minimum(boxes1["...", ":2"], boxes2["...", ":2"]);
            var enclose_right_down = tf.maximum(boxes1["...", "2:"], boxes2["...", "2:"]);
            var enclose = tf.maximum(enclose_right_down - enclose_left_up, 0.0);
            var enclose_area = enclose["...", "0"] * enclose["...", "1"];
            var giou = iou - 1.0 * (enclose_area - union_area) / enclose_area;

            return giou;
        }

        public Tensor bbox_iou(Tensor boxes1, Tensor boxes2)
        {
            var boxes1_area = boxes1["...", "2"] * boxes1["...", "3"];
            var boxes2_area = boxes2["...", "2"] * boxes2["...", "3"];

            boxes1 = tf.concat(new[] { boxes1["...", ":2"] - boxes1["...", "2:"] * 0.5,
                            boxes1["...", ":2"] + boxes1["...", "2:"] * 0.5}, axis: -1);
            boxes2 = tf.concat(new[] { boxes2["...", ":2"] - boxes2["...", "2:"] * 0.5,
                            boxes2["...", ":2"] + boxes2["...", "2:"] * 0.5}, axis: -1);

            var left_up = tf.maximum(boxes1["...", ":2"], boxes2["...", ":2"]);
            var right_down = tf.minimum(boxes1["...", "2:"], boxes2["...", "2:"]);

            var inter_section = tf.maximum(right_down - left_up, 0.0);
            var inter_area = inter_section["...", "0"] * inter_section["...", "1"];
            var union_area = boxes1_area + boxes2_area - inter_area;
            var iou = 1.0 * inter_area / union_area;

            return iou;
        }
    }
}