You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-'), and can be up to 35 characters long.

interface.py 13 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338
  1. # Copyright 2021 The KubeEdge Authors.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import os
  15. import six
  16. import logging
  17. from urllib.parse import urlparse
  18. import cv2
  19. import numpy as np
  20. from tqdm import tqdm
  21. import tensorflow as tf
  22. from data_gen import DataGen
  23. from validate_utils import validate
  24. from yolo3_multiscale import Yolo3
  25. from yolo3_multiscale import YoloConfig
  26. os.environ['BACKEND_TYPE'] = 'TENSORFLOW'
  27. os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
  28. s3_url = os.getenv("S3_ENDPOINT_URL", "http://s3.amazonaws.com")
  29. if not (s3_url.startswith("http://") or s3_url.startswith("https://")):
  30. _url = f"https://{s3_url}"
  31. s3_url = urlparse(s3_url)
  32. s3_use_ssl = s3_url.scheme == 'https' if s3_url.scheme else True
  33. os.environ["AWS_ACCESS_KEY_ID"] = os.getenv("ACCESS_KEY_ID")
  34. os.environ["AWS_SECRET_ACCESS_KEY"] = os.getenv("SECRET_ACCESS_KEY")
  35. os.environ["S3_ENDPOINT"] = s3_url.netloc
  36. os.environ["S3_USE_HTTPS"] = "1" if s3_use_ssl else "0"
  37. LOG = logging.getLogger(__name__)
  38. flags = tf.flags.FLAGS
  39. def preprocess(image, input_shape):
  40. """Preprocess functions in edge model inference"""
  41. # resize image with unchanged aspect ratio using padding by opencv
  42. h, w, _ = image.shape
  43. input_h, input_w = input_shape
  44. scale = min(float(input_w) / float(w), float(input_h) / float(h))
  45. nw = int(w * scale)
  46. nh = int(h * scale)
  47. image = cv2.resize(image, (nw, nh))
  48. new_image = np.zeros((input_h, input_w, 3), np.float32)
  49. new_image.fill(128)
  50. bh, bw, _ = new_image.shape
  51. new_image[
  52. int((bh - nh) / 2):(nh + int((bh - nh) / 2)),
  53. int((bw - nw) / 2):(nw + int((bw - nw) / 2)), :
  54. ] = image
  55. new_image /= 255.
  56. new_image = np.expand_dims(new_image, 0) # Add batch dimension.
  57. return new_image
  58. def create_input_feed(sess, new_image, img_data):
  59. """Create input feed for edge model inference"""
  60. input_feed = {}
  61. input_img_data = sess.graph.get_tensor_by_name('images:0')
  62. input_feed[input_img_data] = new_image
  63. input_img_shape = sess.graph.get_tensor_by_name('shapes:0')
  64. input_feed[input_img_shape] = [img_data.shape[0], img_data.shape[1]]
  65. return input_feed
  66. def create_output_fetch(sess):
  67. """Create output fetch for edge model inference"""
  68. output_classes = sess.graph.get_tensor_by_name('output/classes:0')
  69. output_scores = sess.graph.get_tensor_by_name('output/scores:0')
  70. output_boxes = sess.graph.get_tensor_by_name('output/boxes:0')
  71. output_fetch = [output_classes, output_scores, output_boxes]
  72. return output_fetch
  73. class Estimator:
  74. def __init__(self, **kwargs):
  75. """
  76. initialize logging configuration
  77. """
  78. sess_config = tf.ConfigProto(allow_soft_placement=True)
  79. self.graph = tf.Graph()
  80. self.session = tf.compat.v1.Session(
  81. config=sess_config, graph=self.graph)
def train(self, train_data, valid_data=None, **kwargs):
    """Train the YOLOv3 model on *train_data*.

    Resumes from the latest checkpoint in the model directory when one
    exists, runs `epochs` (kwarg, falling back to `flags.max_epochs`)
    epochs, checkpoints after every epoch, and returns the mean batch
    loss as {"loss": float}.  `valid_data` is accepted but unused here.
    """
    yolo_config = YoloConfig()
    data_gen = DataGen(yolo_config, train_data.x)
    # Caller-supplied epoch count overrides the command-line flag.
    max_epochs = int(kwargs.get("epochs", flags.max_epochs))
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        model = Yolo3(sess, True, yolo_config)
        # Resume from the newest checkpoint if the model dir already exists;
        # otherwise create it for the checkpoints written below.
        if os.path.exists(model.model_dir):
            saver = tf.train.Saver()
            latest_ckpt = tf.train.latest_checkpoint(model.model_dir)
            if latest_ckpt:
                LOG.info(f"latest_ckpt={latest_ckpt}")
                saver.restore(sess, latest_ckpt)
        else:
            os.makedirs(model.model_dir)
        steps_per_epoch = int(
            round(
                data_gen.train_data_size /
                data_gen.batch_size))
        total = steps_per_epoch * max_epochs
        loss = []
        with tqdm(desc='Train: ', total=total) as pbar:
            for epoch in range(max_epochs):
                LOG.info('Epoch %d...' % epoch)
                # Get a batch and make a step.
                for step in range(steps_per_epoch):
                    batch_data = data_gen.next_batch_train()
                    if not batch_data:
                        # Generator may yield an empty batch; skip it.
                        continue
                    batch_loss = model.step(sess, batch_data, True)
                    pbar.set_description(
                        'Train, input_shape=(%d, %d), loss=%.4f' %
                        (batch_data['input_shape'][0],
                         batch_data['input_shape'][1], batch_loss))
                    pbar.update()
                    loss.append(batch_loss)
                # Checkpoint once per epoch; weights only (no meta graph).
                LOG.info(
                    "Saving model, global_step: %d" %
                    model.global_step.eval())
                checkpoint_path = os.path.join(
                    model.model_dir,
                    "yolo3-epoch%03d.ckpt" % epoch)
                model.saver.save(
                    sess,
                    checkpoint_path,
                    global_step=model.global_step,
                    write_meta_graph=False)
    return {"loss": float(np.mean(loss))}
  134. def evaluate(
  135. self,
  136. valid_data,
  137. model_path="",
  138. class_names="",
  139. input_shape=(
  140. 352,
  141. 640),
  142. **kwargs):
  143. """
  144. validate
  145. """
  146. precision, recall, all_precisions, all_recalls = (
  147. validate(model_path=model_path,
  148. test_dataset=valid_data.x,
  149. class_names=class_names,
  150. input_shape=input_shape)
  151. )
  152. return {
  153. "recall": recall, "precision": precision
  154. }
  155. def avg_checkpoints(self):
  156. """
  157. Average the last N checkpoints in the model_dir.
  158. """
  159. LOG.info("average checkpoints start .......")
  160. with self.session.as_default() as sess:
  161. yolo_config = YoloConfig()
  162. model = Yolo3(sess, False, yolo_config)
  163. model_dir = model.model_dir
  164. num_last_checkpoints = 5
  165. global_step = model.global_step.eval()
  166. global_step_name = model.global_step.name.split(":")[0]
  167. checkpoint_state = tf.train.get_checkpoint_state(model_dir)
  168. if not checkpoint_state:
  169. logging.info(
  170. "# No checkpoint file found in directory: %s" %
  171. model_dir)
  172. return None
  173. # Checkpoints are ordered from oldest to newest.
  174. checkpoints = (
  175. checkpoint_state.all_model_checkpoint_paths[
  176. - num_last_checkpoints:]
  177. )
  178. if len(checkpoints) < num_last_checkpoints:
  179. logging.info(
  180. "# Skipping averaging checkpoints because"
  181. " not enough checkpoints is avaliable.")
  182. return None
  183. avg_model_dir = os.path.join(model_dir, "avg_checkpoints")
  184. if not tf.gfile.Exists(avg_model_dir):
  185. logging.info(
  186. "# Creating new directory %s "
  187. "for saving averaged checkpoints." %
  188. avg_model_dir)
  189. tf.gfile.MakeDirs(avg_model_dir)
  190. logging.info("# Reading and averaging "
  191. "variables in checkpoints:")
  192. var_list = tf.contrib.framework.list_variables(checkpoints[0])
  193. var_values, var_dtypes = {}, {}
  194. for (name, shape) in var_list:
  195. if name != global_step_name:
  196. var_values[name] = np.zeros(shape)
  197. for checkpoint in checkpoints:
  198. logging.info(" %s" % checkpoint)
  199. reader = tf.contrib.framework.load_checkpoint(checkpoint)
  200. for name in var_values:
  201. tensor = reader.get_tensor(name)
  202. var_dtypes[name] = tensor.dtype
  203. var_values[name] += tensor
  204. for name in var_values:
  205. var_values[name] /= len(checkpoints)
  206. # Build a graph with same variables in
  207. # the checkpoints, and save the averaged
  208. # variables into the avg_model_dir.
  209. with tf.Graph().as_default():
  210. tf_vars = [
  211. tf.get_variable(
  212. v,
  213. shape=var_values[v].shape,
  214. dtype=var_dtypes[name]) for v in var_values]
  215. placeholders = [
  216. tf.placeholder(
  217. v.dtype,
  218. shape=v.shape) for v in tf_vars]
  219. assign_ops = [
  220. tf.assign(
  221. v,
  222. p) for (
  223. v,
  224. p) in zip(
  225. tf_vars,
  226. placeholders)]
  227. global_step_var = tf.Variable(
  228. global_step, name=global_step_name, trainable=False)
  229. saver = tf.train.Saver(tf.global_variables())
  230. with tf.Session() as sess:
  231. sess.run(tf.global_variables_initializer())
  232. for p, assign_op, (name, value) in zip(
  233. placeholders, assign_ops,
  234. six.iteritems(var_values)):
  235. sess.run(assign_op, {p: value})
  236. # Use the built saver to save the averaged checkpoint.
  237. # Only keep 1 checkpoint and the best checkpoint will
  238. # be moved to avg_best_metric_dir.
  239. saver.save(
  240. sess, os.path.join(
  241. avg_model_dir, "translate.ckpt"))
  242. logging.info("average checkpoints end .......")
  243. def predict(self, data, input_shape=None, **kwargs):
  244. img_data_np = np.array(data)
  245. with self.session.as_default():
  246. new_image = preprocess(img_data_np, input_shape)
  247. input_feed = create_input_feed(
  248. self.session, new_image, img_data_np)
  249. output_fetch = create_output_fetch(self.session)
  250. output = self.session.run(output_fetch, input_feed)
  251. return output
  252. def load(self, model_url):
  253. with self.session.as_default():
  254. with self.session.graph.as_default():
  255. with tf.gfile.FastGFile(model_url, 'rb') as handle:
  256. LOG.info(f"Load model {model_url}, "
  257. f"ParseFromString start .......")
  258. graph_def = tf.GraphDef()
  259. graph_def.ParseFromString(handle.read())
  260. LOG.info("ParseFromString end .......")
  261. tf.import_graph_def(graph_def, name='')
  262. LOG.info("Import_graph_def end .......")
  263. LOG.info("Import model from pb end .......")
  264. def save(self, model_path=None):
  265. """
  266. save model as a single pb file from checkpoint
  267. """
  268. model_dir = ""
  269. model_name = "model.pb"
  270. if model_path:
  271. model_dir, model_name = os.path.split(model_path)
  272. logging.info("save model as .pb start .......")
  273. tf.reset_default_graph()
  274. config = tf.ConfigProto(allow_soft_placement=True)
  275. config.gpu_options.allow_growth = True
  276. with tf.Session(config=config) as sess:
  277. yolo_config = YoloConfig()
  278. model = Yolo3(sess, False, yolo_config)
  279. if not (model_dir and os.path.isdir(model_dir)):
  280. model_dir = model.model_dir
  281. input_graph_def = sess.graph.as_graph_def()
  282. output_tensors = [model.boxes, model.scores, model.classes]
  283. output_tensors = [t.op.name for t in output_tensors]
  284. graph = tf.graph_util.convert_variables_to_constants(
  285. sess, input_graph_def, output_tensors)
  286. tf.train.write_graph(graph, model_dir, model_name, False)
  287. logging.info("save model as .pb end .......")