@@ -0,0 +1,451 @@
#!/bin/bash

# Developers can run `hack/local-up.sh` to set up a local environment:
# 1. a local k8s cluster with a master node.
# 2. a kubeedge node.
# 3. our gm/lc.

# Based on the kubeedge local-up script, which builds a local k8s cluster
# and kubeedge, our local-up script installs our package locally
# for development and for preparing e2e tests.

# It does:
# 1. build the gm/lc/worker images.
# 2. download the kubeedge source code and run its local-up script.
# 3. prepare our k8s env.
# 4. configure and start gm.
# 5. start lc.
# 6. add cleanup.

# For cleanup, our cleanups must run before the kubeedge cleanup.
# Otherwise lc cleanup (via kubectl delete) gets stuck and lc keeps running.
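
# Typical usage from the repo root:
#   bash hack/local-up.sh
# Set NO_CLEANUP=true (defined below) to skip cleanup on exit and keep the
# environment around for inspection.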

set -o errexit
set -o nounset
set -o pipefail

NEPTUNE_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd -P)"
cd "$NEPTUNE_ROOT"

NO_CLEANUP=${NO_CLEANUP:-false}

IMAGE_REPO=localhost/edgeai-neptune/neptune
IMAGE_TAG=localup

# local k8s cluster name for local-up-kubeedge.sh
CLUSTER_NAME=neptune
MASTER_NODENAME=${CLUSTER_NAME}-control-plane
EDGE_NODENAME=edge-node
NAMESPACE=neptune

KUBEEDGE_VERSION=master
TMP_DIR="$(realpath local-up-tmp)"

GM_BIND_PORT=9000
LC_BIND_PORT=9100

arch() {
    local arch=$(uname -m)
    case "$arch" in
        x86_64) arch=amd64;;
        *);;
    esac
    echo "$arch"
}
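
# e.g. on an x86_64 host `arch` echoes "amd64"; other machine types
# (e.g. aarch64) are echoed unchanged.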

download_and_extract_kubeedge() {
    [ -d kubeedge ] && return

    local version=${1:-$KUBEEDGE_VERSION}

    # the master branch doesn't work with git clone --depth 1
    git clone -b $version https://github.com/kubeedge/kubeedge
    return

    # the archive file doesn't work since local-up-kubeedge.sh depends on git tags:
    # https://github.com/kubeedge/kubeedge/archive/${version}.tar.gz
}

get_kubeedge_pid() {
    ps -e -o pid,comm,args |
        grep -F "$TMP_DIR" |
        # match the executable name and print the pid
        awk -v bin="${1:-edgecore}" 'NF=$2==bin'
}
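
# In the awk filter above, `NF=$2==bin` assigns 1 to NF when the command name
# matches (truncating the record to its first field, the pid) and 0 otherwise;
# the assignment value doubles as the pattern, so only matching pids print.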

localup_kubeedge() {
    pushd $TMP_DIR >/dev/null
    download_and_extract_kubeedge
    # Without setsid, hitting Ctrl-C would terminate edgecore/cloudcore
    # before our cleanup gets called.
    # But we need cloudcore/edgecore alive to clean up our containers (mainly lc),
    # so run local-up-kubeedge.sh in a new session.
    setsid bash -c "
        cd kubeedge

        # don't use ENABLE_DAEMON=true since it has an incomplete-cleanup problem.
        TIMEOUT=90 CLUSTER_NAME=$CLUSTER_NAME ENABLE_DAEMON=false
        source hack/local-up-kubeedge.sh
    " &
    KUBEEDGE_ROOT_PID=$!
    add_cleanup '
        # for the case where the kube-proxy container on the local machine
        # is not cleaned up.
        kubectl delete ds -n kube-system kube-proxy

        echo "found kubeedge pid, kill it: $KUBEEDGE_ROOT_PID"
        for((i=0;i<60;i++)); do
            ((i%15==0)) && kill "$KUBEEDGE_ROOT_PID"
            kill -0 "$KUBEEDGE_ROOT_PID" || break
            echo "waiting for $KUBEEDGE_ROOT_PID to exit"
            sleep 1
        done
        # Sometimes cloudcore/edgecore cannot be stopped (the one kill command
        # in local-up-kubeedge.sh is not enough),
        # so to ensure this cleanup we kill them manually.
        for bin in cloudcore edgecore; do
            pid=$(get_kubeedge_pid $bin)
            if [ -n "$pid" ]; then
                echo "found $bin: $pid, kill it"
                kill $pid
            fi
        done
    '

    # wait for the ${MASTER_NODENAME} container to be running
    while ! docker ps --filter=name=${MASTER_NODENAME} | grep -q ${MASTER_NODENAME}; do
        # errexit if the kubeedge local-up process has exited
        kill -0 "$KUBEEDGE_ROOT_PID"
        sleep 3
    done

    # wait for edgecore to come up
    while [ -z "$(get_kubeedge_pid edgecore)" ]; do
        # errexit if the kubeedge local-up process has exited
        kill -0 "$KUBEEDGE_ROOT_PID"
        sleep 3
    done

    local parent=$$
    {
        # health check for the kubeedge local-up process:
        # if it dies, we die too.
        while true; do
            if ! kill -0 "$KUBEEDGE_ROOT_PID"; then
                kill -INT $parent
                break
            fi
            sleep 1
        done
    }&
    popd
}
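
# After localup_kubeedge returns, cloudcore/edgecore are running from under
# $TMP_DIR; they can be inspected by hand with e.g. `get_kubeedge_pid cloudcore`.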

build_component_image() {
    local bin
    for bin; do
        echo "building $bin image"
        make -C "${NEPTUNE_ROOT}" ${bin}image IMAGE_REPO=$IMAGE_REPO IMAGE_TAG=$IMAGE_TAG
        eval ${bin^^}_IMAGE="'${IMAGE_REPO}/${bin}:${IMAGE_TAG}'"
    done
    # no cleanup for images
}
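
# ${bin^^} upper-cases the component name, so `build_component_image gm lc`
# defines GM_IMAGE=localhost/edgeai-neptune/neptune/gm:localup and the
# corresponding LC_IMAGE, which the manifests below consume.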

build_worker_base_images() {
    echo "building worker base images"
    # build the tensorflow-1.15 base image
    WORKER_TF1_IMAGE=$IMAGE_REPO/worker-tensorflow:1.15
    docker build -f build/worker/base_images/tensorflow/tensorflow-1.15.Dockerfile -t $WORKER_TF1_IMAGE .

    WORKER_IMAGE_HUB="'tensorflow:1.15': $WORKER_TF1_IMAGE"
    # add more base images here when needed
}
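
# WORKER_IMAGE_HUB holds a YAML mapping entry ('<framework>:<version>': image)
# that is spliced into the imageHub section of the gm config below.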

load_images_to_master() {
    local image
    for image in $GM_IMAGE; do
        # just use kind's docker-image loading instead of ctr:
        # docker save $image | docker exec -i $MASTER_NODENAME ctr --namespace k8s.io image import -
        kind load --name $CLUSTER_NAME docker-image $image
    done
}
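
# kind nodes use their own containerd image store, so locally built docker
# images must be loaded into the cluster explicitly; `kind load docker-image`
# wraps the save/import dance shown in the comment above.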

prepare_k8s_env() {
    kind get kubeconfig --name $CLUSTER_NAME > $TMP_DIR/kubeconfig
    export KUBECONFIG=$(realpath $TMP_DIR/kubeconfig)
    # prepare our k8s environment:
    # create our crds, including dataset, model, joint-inference etc.
    kubectl apply -f build/crds/neptune/

    # gm and lc will be created in this namespace
    kubectl create namespace $NAMESPACE

    # create the cluster role for gm
    kubectl apply -f build/gm/rbac/

    add_cleanup "
        kubectl delete -f build/crds/neptune/
        kubectl delete namespace $NAMESPACE --timeout=5s
    "
    load_images_to_master
}
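
# Sanity check: `kubectl get crds` should now list the neptune CRDs
# (dataset, model, joint-inference, etc.).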

start_gm() {
    # configure gm and start it as a pod

    pushd $TMP_DIR >/dev/null

    local gm_node_name=${MASTER_NODENAME}
    local gm_pod_name=gm-pod

    # prepare the gm config
    cat > gmconfig <<EOF
kubeConfig: ""
namespace: ""
imageHub:
  $WORKER_IMAGE_HUB
websocket:
  port: $GM_BIND_PORT
localController:
  server: http://localhost:$LC_BIND_PORT
EOF

    add_cleanup "kubectl delete cm config -n $NAMESPACE"

    # create a configmap from the gm config
    kubectl create -n $NAMESPACE configmap config --from-file=gmconfig

    add_cleanup "
        kubectl delete deployment gm -n $NAMESPACE
        kubectl delete service gm -n $NAMESPACE
    "

    # start gm as a pod pinned to the specified node.
    # TODO: serve gm via a regular k8s service, which kubeedge can't support yet.
    kubectl create -f - <<EOF
apiVersion: v1
kind: Service
metadata:
  name: gm
  namespace: neptune
spec:
  selector:
    app: gm
  type: NodePort
  ports:
    - protocol: TCP
      port: $GM_BIND_PORT
      targetPort: $GM_BIND_PORT
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: gm
  labels:
    app: gm
  namespace: neptune
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gm
  template:
    metadata:
      labels:
        app: gm
    spec:
      nodeName: $gm_node_name
      serviceAccountName: neptune
      containers:
        - name: gm
          image: $GM_IMAGE
          command: ["neptune-gm", "--config", "/config/gmconfig", "-v2"]
          resources:
            requests:
              memory: 32Mi
              cpu: 100m
            limits:
              memory: 128Mi
          volumeMounts:
            - name: config
              mountPath: /config
      volumes:
        - name: config
          configMap:
            name: config
EOF

    local gm_ip=$(kubectl get node $gm_node_name -o jsonpath='{ .status.addresses[?(@.type=="InternalIP")].address }')
    local gm_port=$(kubectl -n $NAMESPACE get svc gm -ojsonpath='{.spec.ports[0].nodePort}')

    GM_ADDRESS=$gm_ip:$gm_port

    add_debug_info "See GM status: kubectl get deploy -n $NAMESPACE gm"
    popd
}
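
# GM_ADDRESS is <node internal ip>:<allocated NodePort>; lc on the edge node
# reaches gm directly at this address rather than through a service name
# (see the TODO above).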

start_lc() {
    local lc_ds_name=lc

    add_cleanup "
        # give a timeout in case edgecore has exited unexpectedly
        kubectl delete --timeout=5s ds lc -n neptune

        # if edgecore exited unexpectedly, we need to clean up lc manually
        [ -z \"\$(get_kubeedge_pid edgecore)\" ] && {
            # TODO: find a better way to do this
            echo 'try to stop lc and its pause container on the edge node manually'
            docker stop \$(
                docker ps |
                # find the lc and pause container ids.
                # kubeedge/k8s container name rule:
                #   pod:   k8s_${lc_ds_name}_{pod_name}_${NAMESPACE}_{pod_uid}_
                #   pause: k8s_POD_{pod_name}_${NAMESPACE}_{pause_uid}_
                # where pod_name is ${lc_ds_name}-[a-z0-9]{5}
                grep 'k8s_.*_${lc_ds_name}-[a-z0-9]*_${NAMESPACE}_' |
                awk NF=1
            ) 2>/dev/null
        }
    "

    # start lc as a daemonset
    kubectl create -f- <<EOF
apiVersion: apps/v1
kind: DaemonSet
metadata:
  labels:
    k8s-app: neptune-lc
  name: $lc_ds_name
  namespace: $NAMESPACE
spec:
  selector:
    matchLabels:
      k8s-app: $lc_ds_name
  template:
    metadata:
      labels:
        k8s-app: $lc_ds_name
    spec:
      nodeSelector:
        # only schedule to the edge node
        node-role.kubernetes.io/edge: ""
      containers:
        - name: $lc_ds_name
          image: $LC_IMAGE
          env:
            - name: GM_ADDRESS
              value: $GM_ADDRESS
            - name: BIND_PORT
              value: "$LC_BIND_PORT"
            - name: NODENAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: ROOTFS_MOUNT_DIR
              # the value of ROOTFS_MOUNT_DIR is the same as the volume mount path
              value: /rootfs
          resources:
            requests:
              memory: 32Mi
              cpu: 100m
            limits:
              memory: 128Mi
          volumeMounts:
            - name: localcontroller
              mountPath: /rootfs
      volumes:
        - name: localcontroller
          hostPath:
            path: /
      hostNetwork: true
EOF
    add_debug_info "See LC status: kubectl get ds -n $NAMESPACE $lc_ds_name"
}
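
# Note on the cleanup above: `awk NF=1` truncates each matching `docker ps`
# line to its first column, the container id that `docker stop` expects.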

declare -a CLEANUP_CMDS=()
add_cleanup() {
    CLEANUP_CMDS+=("$@")
}

cleanup() {
    if [[ "${NO_CLEANUP}" = true ]]; then
        echo "No cleanup..."
        return
    fi

    set +o errexit

    echo "Cleaning up neptune..."

    local idx=${#CLEANUP_CMDS[@]} cmd
    # call the cleanup commands in reverse order
    for((;--idx>=0;)); do
        cmd=${CLEANUP_CMDS[idx]}
        echo "calling $cmd:"
        eval "$cmd"
    done

    set -o errexit
}
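
# Cleanups run LIFO, mirroring setup order. For example:
#   add_cleanup 'echo teardown A'   # registered first, runs last
#   add_cleanup 'echo teardown B'   # registered last, runs first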

check_healthy() {
    # TODO
    true
}

debug_infos=""
add_debug_info() {
    debug_infos+="$@
"
}

check_prerequisites() {
    # TODO
    true
}

NO_COLOR='\033[0m'
RED='\033[0;31m'
GREEN='\033[0;32m'
green_text() {
    echo -ne "$GREEN$@$NO_COLOR"
}

red_text() {
    echo -ne "$RED$@$NO_COLOR"
}

trap cleanup EXIT

cleanup

mkdir -p "$TMP_DIR"
add_cleanup 'rm -rf "$TMP_DIR"'

build_component_image gm lc
build_worker_base_images

check_prerequisites

localup_kubeedge

prepare_k8s_env

start_gm
start_lc

echo "Local Neptune cluster is $(green_text running).
Currently the local-up script only supports foreground running.
Press $(red_text Ctrl-C) to shut it down!

You can use it with: kind export kubeconfig --name ${CLUSTER_NAME}

$debug_infos
"

while check_healthy; do sleep 5; done