From 54d219e90b0e6fafa5b34550ee476a16697d85f1 Mon Sep 17 00:00:00 2001 From: pangda Date: Sun, 4 Dec 2022 15:53:32 +0800 Subject: [PATCH] [to #42322933] add UT for NER&EL models Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10897188 --- .../test_named_entity_recognition.py | 274 +++++++++++++++++- tests/pipelines/test_sentence_embedding.py | 16 + tests/pipelines/test_text_ranking.py | 19 ++ 3 files changed, 295 insertions(+), 14 deletions(-) diff --git a/tests/pipelines/test_named_entity_recognition.py b/tests/pipelines/test_named_entity_recognition.py index c4bcdfec..abc6634a 100644 --- a/tests/pipelines/test_named_entity_recognition.py +++ b/tests/pipelines/test_named_entity_recognition.py @@ -15,24 +15,260 @@ from modelscope.utils.test_utils import test_level class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): + language_examples = { + 'zh': + '新华社北京二月十一日电(记者唐虹)', + 'en': + 'Italy recalled Marcello Cuttitta', + 'ru': + 'важным традиционным промыслом является производство пальмового масла .', + 'fr': + 'fer à souder électronique', + 'es': + 'el primer avistamiento por europeos de esta zona fue en 1606 , ' + 'en la expedición española mandada por luis váez de torres .', + 'nl': + 'in het vorige seizoen promoveerden sc cambuur , dat kampioen werd en go ahead eagles via de play offs .', + 'tr': + 'köyün pırasa kavurması ve içi yağlama ve akıtma adındaki hamur işleri meşhurdur . ; çörek ekmeği ; ' + 'diye adlandırdıkları mayasız ekmeği unutmamaklazım .', + 'ko': + '국립진주박물관은 1984년 11월 2일 개관하였으며 한국 전통목조탑을 석조 건물로 형상화한 것으로 건축가 김수근 선생의 대표적 작품이다 .', + 'fa': + 'ﺞﻤﻋیﺕ ﺍیﻥ ﺎﺴﺗﺎﻧ ۳۰ ﻩﺯﺍﺭ ﻦﻓﺭ ﺎﺴﺗ ﻭ ﻢﻧﺎﺒﻋ ﻢﻬﻣی ﺍﺯ ﺲﻧگ ﺂﻬﻧ ﺩﺍﺭﺩ .', + 'de': + 'die szene beinhaltete lenny baker und christopher walken .', + 'hi': + '१४९२ में एक चार्टर के आधार पर, उसके पिता ने उसे वाडोविस के उत्तराधिकारी के रूप में छोड़ दिया।', + 'bn': + 'যদিও গির্জার সবসময় রাজকীয় পিউ থাকত, তবে গির্জায় রাজকীয়ভাবে এটিই ছিল প্রথম দেখা।', + 'multi': + '新华社北京二月十一日电(记者唐虹)', + } + + all_modelcards_info = [ + { + 'model_id': + 'damo/nlp_raner_named-entity-recognition_chinese-base-news', + 'language': 'zh' + }, + { + 'model_id': + 'damo/nlp_raner_named-entity-recognition_chinese-base-social_media', + 'language': 'zh' + }, + { + 'model_id': + 'damo/nlp_raner_named-entity-recognition_chinese-base-generic', + 'language': 'zh' + }, + { + 'model_id': + 'damo/nlp_raner_named-entity-recognition_chinese-base-resume', + 'language': 'zh' + }, + { + 'model_id': 'damo/nlp_lstm_named-entity-recognition_chinese-news', + 'language': 'zh' + }, + { + 'model_id': + 'damo/nlp_lstm_named-entity-recognition_chinese-social_media', + 'language': 'zh' + }, + { + 'model_id': + 'damo/nlp_lstm_named-entity-recognition_chinese-generic', + 'language': 'zh' + }, + { + 'model_id': + 'damo/nlp_lstm_named-entity-recognition_chinese-resume', + 'language': 'zh' + }, + { + 'model_id': + 'damo/nlp_raner_named-entity-recognition_chinese-base-book', + 'language': 'zh' + }, + { + 'model_id': + 'damo/nlp_raner_named-entity-recognition_chinese-base-finance', + 'language': 'zh' + }, + { + 'model_id': + 'damo/nlp_raner_named-entity-recognition_chinese-base-game', + 'language': 'zh' + }, + { + 'model_id': + 'damo/nlp_raner_named-entity-recognition_chinese-base-bank', + 'language': 'zh' + }, + { + 'model_id': + 'damo/nlp_raner_named-entity-recognition_chinese-base-literature', + 'language': 'zh' + }, + { + 'model_id': + 'damo/nlp_raner_named-entity-recognition_chinese-base-cmeee', + 'language': 'zh' + }, + { + 'model_id': + 'damo/nlp_raner_named-entity-recognition_english-large-news', + 'language': 'en' + }, + { + 'model_id': + 'damo/nlp_raner_named-entity-recognition_english-large-social_media', + 'language': 'en' + }, + { + 'model_id': + 'damo/nlp_raner_named-entity-recognition_english-large-literature', + 'language': 'en' + }, + { + 'model_id': + 'damo/nlp_raner_named-entity-recognition_english-large-politics', + 'language': 'en' + }, + { + 'model_id': + 'damo/nlp_raner_named-entity-recognition_english-large-music', + 'language': 'en' + }, + { + 'model_id': + 'damo/nlp_raner_named-entity-recognition_english-large-science', + 'language': 'en' + }, + { + 'model_id': + 'damo/nlp_raner_named-entity-recognition_english-large-ai', + 'language': 'en' + }, + { + 'model_id': + 'damo/nlp_raner_named-entity-recognition_english-large-wiki', + 'language': 'en' + }, + { + 'model_id': + 'damo/nlp_raner_named-entity-recognition_chinese-large-generic', + 'language': 'zh' + }, + { + 'model_id': + 'damo/nlp_raner_named-entity-recognition_english-large-generic', + 'language': 'en' + }, + { + 'model_id': + 'damo/nlp_raner_named-entity-recognition_multilingual-large-generic', + 'language': 'multi' + }, + { + 'model_id': + 'damo/nlp_raner_named-entity-recognition_russian-large-generic', + 'language': 'ru' + }, + { + 'model_id': + 'damo/nlp_raner_named-entity-recognition_spanish-large-generic', + 'language': 'es' + }, + { + 'model_id': + 'damo/nlp_raner_named-entity-recognition_dutch-large-generic', + 'language': 'nl' + }, + { + 'model_id': + 'damo/nlp_raner_named-entity-recognition_turkish-large-generic', + 'language': 'tr' + }, + { + 'model_id': + 'damo/nlp_raner_named-entity-recognition_korean-large-generic', + 'language': 'ko' + }, + { + 'model_id': + 'damo/nlp_raner_named-entity-recognition_farsi-large-generic', + 'language': 'fa' + }, + { + 'model_id': + 'damo/nlp_raner_named-entity-recognition_german-large-generic', + 'language': 'de' + }, + { + 'model_id': + 'damo/nlp_raner_named-entity-recognition_hindi-large-generic', + 'language': 'hi' + }, + { + 'model_id': + 'damo/nlp_raner_named-entity-recognition_bangla-large-generic', + 'language': 'bn' + }, + { + 'model_id': + 'damo/nlp_raner_named-entity-recognition_chinese-base-ecom', + 'language': 'zh' + }, + { + 'model_id': + 'damo/nlp_raner_named-entity-recognition_chinese-base-ecom-50cls', + 'language': 'zh' + }, + { + 'model_id': + 'damo/nlp_raner_named-entity-recognition_english-large-ecom', + 'language': 'en' + }, + { + 'model_id': + 'damo/nlp_raner_named-entity-recognition_russian-large-ecom', + 'language': 'ru' + }, + { + 'model_id': + 'damo/nlp_raner_named-entity-recognition_french-large-ecom', + 'language': 'fr' + }, + { + 'model_id': + 'damo/nlp_raner_named-entity-recognition_spanish-large-ecom', + 'language': 'es' + }, + { + 'model_id': + 'damo/nlp_structbert_keyphrase-extraction_base-icassp2023-mug-track4-baseline', + 'language': 'zh' + }, + ] def setUp(self) -> None: self.task = Tasks.named_entity_recognition self.model_id = 'damo/nlp_raner_named-entity-recognition_chinese-base-news' - - english_model_id = 'damo/nlp_raner_named-entity-recognition_english-large-ecom' - chinese_model_id = 'damo/nlp_raner_named-entity-recognition_chinese-large-generic' - tcrf_model_id = 'damo/nlp_raner_named-entity-recognition_chinese-base-news' - lcrf_model_id = 'damo/nlp_lstm_named-entity-recognition_chinese-news' - addr_model_id = 'damo/nlp_structbert_address-parsing_chinese_base' - lstm_model_id = 'damo/nlp_lstm_named-entity-recognition_chinese-generic' - sentence = '这与温岭市新河镇的一个神秘的传说有关。' - sentence_en = 'pizza shovel' - sentence_zh = '他 继 续 与 貝 塞 斯 達 遊 戲 工 作 室 在 接 下 来 辐 射 4 游 戏 。' - addr = '浙江省杭州市余杭区文一西路969号亲橙里' - addr1 = '浙江省西湖区灵隐隧道' - addr2 = '内蒙古自治区巴彦淖尔市' - ecom = '欧美单 秋季女装时尚百搭休闲修身 亚麻混纺短款 外套西装' + self.english_model_id = 'damo/nlp_raner_named-entity-recognition_english-large-ecom' + self.chinese_model_id = 'damo/nlp_raner_named-entity-recognition_chinese-large-generic' + self.tcrf_model_id = 'damo/nlp_raner_named-entity-recognition_chinese-base-news' + self.lcrf_model_id = 'damo/nlp_lstm_named-entity-recognition_chinese-news' + self.addr_model_id = 'damo/nlp_structbert_address-parsing_chinese_base' + self.lstm_model_id = 'damo/nlp_lstm_named-entity-recognition_chinese-generic' + self.sentence = '这与温岭市新河镇的一个神秘的传说有关。' + self.sentence_en = 'pizza shovel' + self.sentence_zh = '他 继 续 与 貝 塞 斯 達 遊 戲 工 作 室 在 接 下 来 辐 射 4 游 戏 。' + self.addr = '浙江省杭州市余杭区文一西路969号亲橙里' + self.addr1 = '浙江省西湖区灵隐隧道' + self.addr2 = '内蒙古自治区巴彦淖尔市' + self.ecom = '欧美单 秋季女装时尚百搭休闲修身 亚麻混纺短款 外套西装' @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_tcrf_by_direct_model_download(self): @@ -222,6 +458,16 @@ class NamedEntityRecognitionTest(unittest.TestCase, DemoCompatibilityCheck): pipeline_ins = pipeline(task=Tasks.named_entity_recognition) print(pipeline_ins(input=self.sentence)) + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_with_all_modelcards(self): + for item in self.all_modelcards_info: + model_id = item['model_id'] + sentence = self.language_examples[item['language']] + with self.subTest(model_id=model_id): + pipeline_ins = pipeline(Tasks.named_entity_recognition, + model_id) + print(pipeline_ins(input=sentence)) + @unittest.skip('demo compatibility test is only enabled on a needed-basis') def test_demo_compatibility(self): self.compatibility_check() diff --git a/tests/pipelines/test_sentence_embedding.py b/tests/pipelines/test_sentence_embedding.py index 4132f965..35b00976 100644 --- a/tests/pipelines/test_sentence_embedding.py +++ b/tests/pipelines/test_sentence_embedding.py @@ -36,6 +36,16 @@ class SentenceEmbeddingTest(unittest.TestCase): 'sentences_to_compare': [] } + el_model_id = 'damo/nlp_bert_entity-embedding_chinese-base' + el_inputs = { + 'source_sentence': ['宋小宝小品《美人鱼》, [ENT_S] 大鹏 [ENT_E] 上演生死离别,关键时刻美人鱼登场'], + 'sentences_to_compare': [ + '董成鹏; 类型: Person; 别名: Da Peng, 大鹏;', + '超级飞侠; 类型: Work; 别名: 超飞, 출동!슈퍼윙스, Super Wings;', + '王源; 类型: Person; 别名: Roy;', + ] + } + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_by_direct_model_download(self): cache_path = snapshot_download(self.model_id) @@ -77,6 +87,12 @@ class SentenceEmbeddingTest(unittest.TestCase): pipeline_ins = pipeline(task=Tasks.sentence_embedding) print(pipeline_ins(input=self.inputs)) + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_with_el_model(self): + pipeline_ins = pipeline( + task=Tasks.sentence_embedding, model=self.el_model_id) + print(pipeline_ins(input=self.el_inputs)) + if __name__ == '__main__': unittest.main() diff --git a/tests/pipelines/test_text_ranking.py b/tests/pipelines/test_text_ranking.py index 01f1887f..3329faad 100644 --- a/tests/pipelines/test_text_ranking.py +++ b/tests/pipelines/test_text_ranking.py @@ -28,6 +28,19 @@ class TextRankingTest(unittest.TestCase): ] } + el_model_id = 'damo/nlp_bert_entity-matching_chinese-base' + el_inputs = { + 'source_sentence': ['我是猫》([日]夏目漱石)【摘要 [ENT_S] 书评 [ENT_E] 试读】'], + 'sentences_to_compare': [ + '书评; 类型: Other; 别名: Book review; 三元组: 书评 # 外文名 # Book review $ 书评 # 摘要 # ' + '书评,即评论并介绍书籍的文章,是以“书”为对象,实事求是的、有见识的分析书籍的形式和内容,探求创作的思想性、学术性、知识性和艺术性,从而在作者、读者和出版商之间构建信息交流的渠道。 $ 书评 # 定义 # ' + '评论并介绍书籍的文章 $ 书评 # 中文名 # 书评 $ 书评 # 义项描述 # 书评 $ 书评 # 类型 # 应用写作的一种重要文体 $ 书评 # 标签 # 文学作品、文化、出版物、小说、书籍 $', + '摘要; 类型: Other; 别名: 摘, abstract, 书评; 三元组: 摘要 # 读音 # zhāi yào $ 摘要 # 外文名 # abstract $ 摘要 # 摘要 # ' + '摘要又称概要、内容提要,意思是摘录要点或摘录下来的要点。 $ 摘要 # 词目 # 摘要 $ 摘要 # 词性 # 动词,名词 $ 摘要 # 中文名 # 摘要 $ 摘要 # 别称 # 概要、内容提要 $ 摘要 ' + '# 义项描述 # 摘要 $ 摘要 # 标签 # 文化、文学家、行业人物、法律术语、小说 $', + ] + } + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_run_by_direct_model_download(self): for model_id in self.models: @@ -62,6 +75,12 @@ class TextRankingTest(unittest.TestCase): pipeline_ins = pipeline(task=Tasks.text_ranking) print(pipeline_ins(input=self.inputs)) + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') + def test_run_with_el_model(self): + pipeline_ins = pipeline( + task=Tasks.text_ranking, model=self.el_model_id) + print(pipeline_ins(input=self.el_inputs)) + if __name__ == '__main__': unittest.main()