#!/usr/bin/env python
# -*- coding:utf-8 -*-

"""
Copyright 2020 Tianshu AI Platform. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
=============================================================
"""
import json
import os
import re
import six
import numpy as np
from typing import Tuple
# import requests  # used to fetch texts over HTTP when the NFS share is not mounted
import oneflow as flow
import oneflow.typing as tp
import logging

logging.basicConfig(format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s',
                    level=logging.DEBUG)

BATCH_SIZE = 16

current_dir = os.path.dirname(os.path.abspath(__file__))
# "label.names" maps each class index to a human-readable label name, one name per line.
label_to_name_file = current_dir + os.sep + "label.names"
label_2_name = []
with open(label_to_name_file, 'r') as f:
    label_2_name = f.readlines()


class TextCNN:
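    """Convolutional text classifier: an embedding lookup followed by parallel
    convolutions with kernel sizes `ksize_list`, max-over-time pooling, and two
    dense layers producing `n_classes` logits.
    """
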
    def __init__(self, emb_sz, emb_dim, ksize_list, n_filters_list, n_classes, dropout):
        self.initializer = flow.random_normal_initializer(stddev=0.1)
        self.emb_sz = emb_sz
        self.emb_dim = emb_dim
        self.ksize_list = ksize_list
        self.n_filters_list = n_filters_list
        self.n_classes = n_classes
        self.dropout = dropout
        self.total_n_filters = sum(self.n_filters_list)

    def get_logits(self, inputs, is_train):
        emb_weight = flow.get_variable(
            'embedding-weight',
            shape=(self.emb_sz, self.emb_dim),
            dtype=flow.float32,
            trainable=is_train,
            reuse=False,
            initializer=self.initializer,
        )
        data = flow.gather(emb_weight, inputs, axis=0)  # (B, L) -> (B, L, emb_dim)
        data = flow.transpose(data, [0, 2, 1])  # BLH -> BHL
        data = flow.reshape(data, list(data.shape) + [1])  # NCHW: (B, emb_dim, L, 1)
        seq_length = data.shape[2]
        pooled_list = []
        for i in range(len(self.n_filters_list)):
            ksz = self.ksize_list[i]
            n_filters = self.n_filters_list[i]
            # Convolve over the time axis only, then max-pool over the remaining time steps.
            conv = flow.layers.conv2d(data, n_filters, [ksz, 1], data_format="NCHW",
                                      kernel_initializer=self.initializer, name='conv-{}'.format(i))  # NCHW
            # conv = flow.layers.layer_norm(conv, name='ln-{}'.format(i))
            conv = flow.nn.relu(conv)
            pooled = flow.nn.max_pool2d(conv, [seq_length - ksz + 1, 1], strides=1, padding='VALID',
                                        data_format="NCHW")
            pooled_list.append(pooled)
        pooled = flow.concat(pooled_list, 3)
        pooled = flow.reshape(pooled, [-1, self.total_n_filters])

        if is_train:
            pooled = flow.nn.dropout(pooled, rate=self.dropout)

        pooled = flow.layers.dense(pooled, self.total_n_filters, use_bias=True,
                                   kernel_initializer=self.initializer, name='dense-1')
        pooled = flow.nn.relu(pooled)
        logits = flow.layers.dense(pooled, self.n_classes, use_bias=True,
                                   kernel_initializer=self.initializer, name='dense-2')
        return logits


def get_eval_config():
    config = flow.function_config()
    config.default_data_type(flow.float)
    return config


def pad_sequences(sequences, maxlen=None, dtype='int32',
                  padding='pre', truncating='pre', value=0.):
- """Pads sequences to the same length.
-
- This function transforms a list of
- `num_samples` sequences (lists of integers)
- into a 2D Numpy array of shape `(num_samples, num_timesteps)`.
- `num_timesteps` is either the `maxlen` argument if provided,
- or the length of the longest sequence otherwise.
-
- Sequences that are shorter than `num_timesteps`
- are padded with `value` at the beginning or the end
- if padding='post.
-
- Sequences longer than `num_timesteps` are truncated
- so that they fit the desired length.
- The position where padding or truncation happens is determined by
- the arguments `padding` and `truncating`, respectively.
-
- Pre-padding is the default.
-
- # Arguments
- sequences: List of lists, where each element is a sequence.
- maxlen: Int, maximum length of all sequences.
- dtype: Type of the output sequences.
- To pad sequences with variable length strings, you can use `object`.
- padding: String, 'pre' or 'post':
- pad either before or after each sequence.
- truncating: String, 'pre' or 'post':
- remove values from sequences larger than
- `maxlen`, either at the beginning or at the end of the sequences.
- value: Float or String, padding value.
-
- # Returns
- x: Numpy array with shape `(len(sequences), maxlen)`
-
- # Raises
- ValueError: In case of invalid values for `truncating` or `padding`,
- or in case of invalid shape for a `sequences` entry.
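
    # Example
        A small illustrative call (post-padding to length 4 with the default
        value of 0):

        >>> pad_sequences([[1, 2, 3], [4, 5]], maxlen=4, padding='post')
        array([[1, 2, 3, 0],
               [4, 5, 0, 0]], dtype=int32)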
- """
    if not hasattr(sequences, '__len__'):
        raise ValueError('`sequences` must be iterable.')
    num_samples = len(sequences)

    lengths = []
    sample_shape = ()
    flag = True

    # take the sample shape from the first non empty sequence
    # checking for consistency in the main loop below.

    for x in sequences:
        try:
            lengths.append(len(x))
            if flag and len(x):
                sample_shape = np.asarray(x).shape[1:]
                flag = False
        except TypeError:
            raise ValueError('`sequences` must be a list of iterables. '
                             'Found non-iterable: ' + str(x))

    if maxlen is None:
        maxlen = np.max(lengths)

    is_dtype_str = np.issubdtype(dtype, np.str_) or np.issubdtype(dtype, np.unicode_)
    if isinstance(value, six.string_types) and dtype != object and not is_dtype_str:
        raise ValueError("`dtype` {} is not compatible with `value`'s type: {}\n"
                         "You should set `dtype=object` for variable length strings."
                         .format(dtype, type(value)))

    x = np.full((num_samples, maxlen) + sample_shape, value, dtype=dtype)
    for idx, s in enumerate(sequences):
        if not len(s):
            continue  # empty list/array was found
        if truncating == 'pre':
            trunc = s[-maxlen:]
        elif truncating == 'post':
            trunc = s[:maxlen]
        else:
            raise ValueError('Truncating type "%s" '
                             'not understood' % truncating)

        # check `trunc` has expected shape
        trunc = np.asarray(trunc, dtype=dtype)
        if trunc.shape[1:] != sample_shape:
            raise ValueError('Shape of sample %s of sequence at position %s '
                             'is different from expected shape %s' %
                             (trunc.shape[1:], idx, sample_shape))

        if padding == 'post':
            x[idx, :len(trunc)] = trunc
        elif padding == 'pre':
            x[idx, -len(trunc):] = trunc
        else:
            raise ValueError('Padding type "%s" not understood' % padding)
    return x


@flow.global_function('predict', get_eval_config())
def predict_job(text: tp.Numpy.Placeholder((BATCH_SIZE, 150), dtype=flow.int32),
                ) -> Tuple[tp.Numpy, tp.Numpy]:
    with flow.scope.placement("gpu", "0:0"):
        # Build the TextCNN inference graph and return, per sample, the predicted
        # label index and the softmax class probabilities.
        model = TextCNN(50000, 100, ksize_list=[2, 3, 4, 5], n_filters_list=[100] * 4, n_classes=2, dropout=0.5)
        logits = model.get_logits(text, is_train=False)
        logits = flow.nn.softmax(logits)
        label = flow.math.argmax(logits)
        return label, logits


class TextCNNClassifier:
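    """Loads the pretrained TextCNN checkpoint and the IMDB word index, then
    classifies review text files in batches of BATCH_SIZE through `predict_job`.
    """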

    def __init__(self):
        model_load_dir = current_dir + os.sep + "model/textcnn_imdb_of_best_model/"
        word_index_dir = current_dir + os.sep + "model/imdb_word_index/imdb_word_index.json"

        checkpoint = flow.train.CheckPoint()
        checkpoint.init()
        checkpoint.load(model_load_dir)

        with open(word_index_dir) as f:
            word_index = json.load(f)
        # Shift every index by 2 so that 0, 1 and 2 are free for the special
        # <PAD>, <START> and <UNK> tokens.
        word_index = {k: (v + 2) for k, v in word_index.items()}
        word_index["<PAD>"] = 0
        word_index["<START>"] = 1
        word_index["<UNK>"] = 2
        self.word_index = word_index

    def inference(self, text_path_list, id_list, label_list):
        logging.info("infer")
        logging.info(label_list)
        classifications = []
        batch_text = []
        for i, text_path in enumerate(text_path_list):
            with open(text_path, "r") as text_file:
                text = text_file.read()
            """
            # When the NFS share is not mounted, fetch the text from MinIO over HTTP for testing:
            url = "http://10.5.29.100:9000/" + text_path
            print(url)
            text = requests.get(url).text  # .encode('utf-8').decode('utf-8')
            """
            # Keep only letters and apostrophes, lower-case the words, prepend the
            # <START> token, and map every word to its vocabulary id (<UNK> if unseen).
            text = re.sub("[^a-zA-Z']", " ", text)
            text = list(map(lambda x: x.lower(), text.split()))
            text.insert(0, "<START>")
            batch_text.append(
                list(map(lambda x: self.word_index[x] if x in self.word_index else self.word_index["<UNK>"], text))
            )

            # Run the model once a full batch of BATCH_SIZE texts has been collected.
            # Note: `predict_job` expects a fixed (BATCH_SIZE, 150) input, so any trailing
            # texts beyond the last full batch are not classified.
            if i % BATCH_SIZE == BATCH_SIZE - 1:
                text = pad_sequences(batch_text, value=self.word_index["<PAD>"], padding='post', maxlen=150)
                text = np.array(text, dtype=np.int32)
                label, logits = predict_job(text)
                label = label.tolist()
                logits = logits.tolist()
                for k in range(BATCH_SIZE):
                    temp = {}
                    temp['annotation'] = json.dumps([])
                    temp['id'] = id_list[i - BATCH_SIZE + 1 + k]
                    if label[k] in label_list:
                        temp['annotation'] = json.dumps([{'category_id': label_2_name[label[k]].rstrip('\n'),
                                                          'score': round(logits[k][label[k]], 4)}])
                    classifications.append(temp)
                batch_text = []
        return classifications
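

# Illustrative usage sketch; the file paths, ids and label ids below are placeholders,
# not values taken from the project:
#
#     classifier = TextCNNClassifier()
#     paths = ["reviews/%d.txt" % i for i in range(BATCH_SIZE)]
#     results = classifier.inference(paths, id_list=list(range(BATCH_SIZE)), label_list=[0, 1])
#     print(results)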