You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

test_nlp.py 1.3 kB

12345678910111213141516171819202122232425262728293031323334353637
  1. # Copyright (c) Alibaba, Inc. and its affiliates.
  2. import unittest
  3. from modelscope.preprocessors import build_preprocessor
  4. from modelscope.utils.constant import Fields, InputFields
  5. from modelscope.utils.logger import get_logger
  6. logger = get_logger()  # module-level logger from modelscope.utils.logger; not used by the visible test code
  7. class NLPPreprocessorTest(unittest.TestCase):
  8. def test_tokenize(self):
  9. cfg = dict(type='Tokenize', tokenizer_name='bert-base-cased')
  10. preprocessor = build_preprocessor(cfg, Fields.nlp)
  11. input = {
  12. InputFields.text:
  13. 'Do not meddle in the affairs of wizards, '
  14. 'for they are subtle and quick to anger.'
  15. }
  16. output = preprocessor(input)
  17. self.assertTrue(InputFields.text in output)
  18. self.assertEqual(output['input_ids'], [
  19. 101, 2091, 1136, 1143, 13002, 1107, 1103, 5707, 1104, 16678, 1116,
  20. 117, 1111, 1152, 1132, 11515, 1105, 3613, 1106, 4470, 119, 102
  21. ])
  22. self.assertEqual(
  23. output['token_type_ids'],
  24. [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
  25. self.assertEqual(
  26. output['attention_mask'],
  27. [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
  28. if __name__ == '__main__':
  29. unittest.main()