@@ -27,7 +27,6 @@ pipeline { | |||||
} | } | ||||
stage('Package Testing') { | stage('Package Testing') { | ||||
steps { | steps { | ||||
sh 'python -m spacy download en' | |||||
sh 'pip install fitlog' | sh 'pip install fitlog' | ||||
sh 'pytest ./tests --html=test_results.html --self-contained-html' | sh 'pytest ./tests --html=test_results.html --self-contained-html' | ||||
} | } | ||||
@@ -13,7 +13,7 @@ install: | |||||
- pip install pytest-cov | - pip install pytest-cov | ||||
# command to run tests | # command to run tests | ||||
script: | script: | ||||
- python -m spacy download en | |||||
# - python -m spacy download en | |||||
- pytest --cov=fastNLP tests/ | - pytest --cov=fastNLP tests/ | ||||
after_success: | after_success: | ||||
@@ -46,10 +46,8 @@ | |||||
}, | }, | ||||
{ | { | ||||
"cell_type": "code", | "cell_type": "code", | ||||
"execution_count": 1, | |||||
"metadata": { | |||||
"collapsed": true | |||||
}, | |||||
"execution_count": null, | |||||
"metadata": {}, | |||||
"outputs": [], | "outputs": [], | ||||
"source": [ | "source": [ | ||||
"from fastNLP.io import ChnSentiCorpLoader\n", | "from fastNLP.io import ChnSentiCorpLoader\n", | ||||
@@ -68,22 +66,9 @@ | |||||
}, | }, | ||||
{ | { | ||||
"cell_type": "code", | "cell_type": "code", | ||||
"execution_count": 2, | |||||
"execution_count": null, | |||||
"metadata": {}, | "metadata": {}, | ||||
"outputs": [ | |||||
{ | |||||
"name": "stdout", | |||||
"output_type": "stream", | |||||
"text": [ | |||||
"In total 3 datasets:\n", | |||||
"\tdev has 1200 instances.\n", | |||||
"\ttrain has 9600 instances.\n", | |||||
"\ttest has 1200 instances.\n", | |||||
"In total 0 vocabs:\n", | |||||
"\n" | |||||
] | |||||
} | |||||
], | |||||
"outputs": [], | |||||
"source": [ | "source": [ | ||||
"print(data_bundle)" | "print(data_bundle)" | ||||
] | ] | ||||
@@ -97,20 +82,9 @@ | |||||
}, | }, | ||||
{ | { | ||||
"cell_type": "code", | "cell_type": "code", | ||||
"execution_count": 6, | |||||
"execution_count": null, | |||||
"metadata": {}, | "metadata": {}, | ||||
"outputs": [ | |||||
{ | |||||
"name": "stdout", | |||||
"output_type": "stream", | |||||
"text": [ | |||||
"DataSet({'raw_chars': 选择珠江花园的原因就是方便,有电动扶梯直接到达海边,周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般,但还算整洁。 泳池在大堂的屋顶,因此很小,不过女儿倒是喜欢。 包的早餐是西式的,还算丰富。 服务吗,一般 type=str,\n", | |||||
"'target': 1 type=str},\n", | |||||
"{'raw_chars': 15.4寸笔记本的键盘确实爽,基本跟台式机差不多了,蛮喜欢数字小键盘,输数字特方便,样子也很美观,做工也相当不错 type=str,\n", | |||||
"'target': 1 type=str})\n" | |||||
] | |||||
} | |||||
], | |||||
"outputs": [], | |||||
"source": [ | "source": [ | ||||
"print(data_bundle.get_dataset('train')[:2]) # 查看Train集前两个sample" | "print(data_bundle.get_dataset('train')[:2]) # 查看Train集前两个sample" | ||||
] | ] | ||||
@@ -127,10 +101,8 @@ | |||||
}, | }, | ||||
{ | { | ||||
"cell_type": "code", | "cell_type": "code", | ||||
"execution_count": 3, | |||||
"metadata": { | |||||
"collapsed": true | |||||
}, | |||||
"execution_count": null, | |||||
"metadata": {}, | |||||
"outputs": [], | "outputs": [], | ||||
"source": [ | "source": [ | ||||
"from fastNLP.io import ChnSentiCorpPipe\n", | "from fastNLP.io import ChnSentiCorpPipe\n", | ||||
@@ -141,24 +113,9 @@ | |||||
}, | }, | ||||
{ | { | ||||
"cell_type": "code", | "cell_type": "code", | ||||
"execution_count": 4, | |||||
"execution_count": null, | |||||
"metadata": {}, | "metadata": {}, | ||||
"outputs": [ | |||||
{ | |||||
"name": "stdout", | |||||
"output_type": "stream", | |||||
"text": [ | |||||
"In total 3 datasets:\n", | |||||
"\tdev has 1200 instances.\n", | |||||
"\ttrain has 9600 instances.\n", | |||||
"\ttest has 1200 instances.\n", | |||||
"In total 2 vocabs:\n", | |||||
"\tchars has 4409 entries.\n", | |||||
"\ttarget has 2 entries.\n", | |||||
"\n" | |||||
] | |||||
} | |||||
], | |||||
"outputs": [], | |||||
"source": [ | "source": [ | ||||
"print(data_bundle) # 打印data_bundle,查看其变化" | "print(data_bundle) # 打印data_bundle,查看其变化" | ||||
] | ] | ||||
@@ -172,24 +129,9 @@ | |||||
}, | }, | ||||
{ | { | ||||
"cell_type": "code", | "cell_type": "code", | ||||
"execution_count": 5, | |||||
"execution_count": null, | |||||
"metadata": {}, | "metadata": {}, | ||||
"outputs": [ | |||||
{ | |||||
"name": "stdout", | |||||
"output_type": "stream", | |||||
"text": [ | |||||
"DataSet({'raw_chars': 选择珠江花园的原因就是方便,有电动扶梯直接到达海边,周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般,但还算整洁。 泳池在大堂的屋顶,因此很小,不过女儿倒是喜欢。 包的早餐是西式的,还算丰富。 服务吗,一般 type=str,\n", | |||||
"'target': 1 type=int,\n", | |||||
"'chars': [338, 464, 1400, 784, 468, 739, 3, 289, 151, 21, 5, 88, 143, 2, 9, 81, 134, 2573, 766, 233, 196, 23, 536, 342, 297, 2, 405, 698, 132, 281, 74, 744, 1048, 74, 420, 387, 74, 412, 433, 74, 2021, 180, 8, 219, 1929, 213, 4, 34, 31, 96, 363, 8, 230, 2, 66, 18, 229, 331, 768, 4, 11, 1094, 479, 17, 35, 593, 3, 1126, 967, 2, 151, 245, 12, 44, 2, 6, 52, 260, 263, 635, 5, 152, 162, 4, 11, 336, 3, 154, 132, 5, 236, 443, 3, 2, 18, 229, 761, 700, 4, 11, 48, 59, 653, 2, 8, 230] type=list,\n", | |||||
"'seq_len': 106 type=int},\n", | |||||
"{'raw_chars': 15.4寸笔记本的键盘确实爽,基本跟台式机差不多了,蛮喜欢数字小键盘,输数字特方便,样子也很美观,做工也相当不错 type=str,\n", | |||||
"'target': 1 type=int,\n", | |||||
"'chars': [50, 133, 20, 135, 945, 520, 343, 24, 3, 301, 176, 350, 86, 785, 2, 456, 24, 461, 163, 443, 128, 109, 6, 47, 7, 2, 916, 152, 162, 524, 296, 44, 301, 176, 2, 1384, 524, 296, 259, 88, 143, 2, 92, 67, 26, 12, 277, 269, 2, 188, 223, 26, 228, 83, 6, 63] type=list,\n", | |||||
"'seq_len': 56 type=int})\n" | |||||
] | |||||
} | |||||
], | |||||
"outputs": [], | |||||
"source": [ | "source": [ | ||||
"print(data_bundle.get_dataset('train')[:2])" | "print(data_bundle.get_dataset('train')[:2])" | ||||
] | ] | ||||
@@ -203,17 +145,9 @@ | |||||
}, | }, | ||||
{ | { | ||||
"cell_type": "code", | "cell_type": "code", | ||||
"execution_count": 6, | |||||
"execution_count": null, | |||||
"metadata": {}, | "metadata": {}, | ||||
"outputs": [ | |||||
{ | |||||
"name": "stdout", | |||||
"output_type": "stream", | |||||
"text": [ | |||||
"Vocabulary(['选', '择', '珠', '江', '花']...)\n" | |||||
] | |||||
} | |||||
], | |||||
"outputs": [], | |||||
"source": [ | "source": [ | ||||
"char_vocab = data_bundle.get_vocab('chars')\n", | "char_vocab = data_bundle.get_vocab('chars')\n", | ||||
"print(char_vocab)" | "print(char_vocab)" | ||||
@@ -228,18 +162,9 @@ | |||||
}, | }, | ||||
{ | { | ||||
"cell_type": "code", | "cell_type": "code", | ||||
"execution_count": 7, | |||||
"execution_count": null, | |||||
"metadata": {}, | "metadata": {}, | ||||
"outputs": [ | |||||
{ | |||||
"name": "stdout", | |||||
"output_type": "stream", | |||||
"text": [ | |||||
"'选'的index是338\n", | |||||
"index:338对应的汉字是选\n" | |||||
] | |||||
} | |||||
], | |||||
"outputs": [], | |||||
"source": [ | "source": [ | ||||
"index = char_vocab.to_index('选')\n", | "index = char_vocab.to_index('选')\n", | ||||
"print(\"'选'的index是{}\".format(index)) # 这个值与上面打印出来的第一个instance的chars的第一个index是一致的\n", | "print(\"'选'的index是{}\".format(index)) # 这个值与上面打印出来的第一个instance的chars的第一个index是一致的\n", | ||||
@@ -256,17 +181,9 @@ | |||||
}, | }, | ||||
{ | { | ||||
"cell_type": "code", | "cell_type": "code", | ||||
"execution_count": 8, | |||||
"execution_count": null, | |||||
"metadata": {}, | "metadata": {}, | ||||
"outputs": [ | |||||
{ | |||||
"name": "stdout", | |||||
"output_type": "stream", | |||||
"text": [ | |||||
"Found 4321 out of 4409 words in the pre-training embedding.\n" | |||||
] | |||||
} | |||||
], | |||||
"outputs": [], | |||||
"source": [ | "source": [ | ||||
"from fastNLP.embeddings import StaticEmbedding\n", | "from fastNLP.embeddings import StaticEmbedding\n", | ||||
"\n", | "\n", | ||||
@@ -283,10 +200,8 @@ | |||||
}, | }, | ||||
{ | { | ||||
"cell_type": "code", | "cell_type": "code", | ||||
"execution_count": 9, | |||||
"metadata": { | |||||
"collapsed": true | |||||
}, | |||||
"execution_count": null, | |||||
"metadata": {}, | |||||
"outputs": [], | "outputs": [], | ||||
"source": [ | "source": [ | ||||
"from torch import nn\n", | "from torch import nn\n", | ||||
@@ -329,288 +244,9 @@ | |||||
}, | }, | ||||
{ | { | ||||
"cell_type": "code", | "cell_type": "code", | ||||
"execution_count": 10, | |||||
"execution_count": null, | |||||
"metadata": {}, | "metadata": {}, | ||||
"outputs": [ | |||||
{ | |||||
"name": "stdout", | |||||
"output_type": "stream", | |||||
"text": [ | |||||
"input fields after batch(if batch size is 2):\n", | |||||
"\ttarget: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n", | |||||
"\tchars: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 106]) \n", | |||||
"\tseq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n", | |||||
"target fields after batch(if batch size is 2):\n", | |||||
"\ttarget: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n", | |||||
"\tseq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n", | |||||
"\n", | |||||
"Evaluate data in 0.01 seconds!\n", | |||||
"training epochs started 2019-09-03-23-57-10\n" | |||||
] | |||||
}, | |||||
{ | |||||
"data": { | |||||
"text/plain": [ | |||||
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=3000), HTML(value='')), layout=Layout(display…" | |||||
] | |||||
}, | |||||
"metadata": {}, | |||||
"output_type": "display_data" | |||||
}, | |||||
{ | |||||
"data": { | |||||
"text/plain": [ | |||||
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…" | |||||
] | |||||
}, | |||||
"metadata": {}, | |||||
"output_type": "display_data" | |||||
}, | |||||
{ | |||||
"name": "stdout", | |||||
"output_type": "stream", | |||||
"text": [ | |||||
"\r", | |||||
"Evaluate data in 0.43 seconds!\n", | |||||
"\r", | |||||
"Evaluation on dev at Epoch 1/10. Step:300/3000: \n", | |||||
"\r", | |||||
"AccuracyMetric: acc=0.81\n", | |||||
"\n" | |||||
] | |||||
}, | |||||
{ | |||||
"data": { | |||||
"text/plain": [ | |||||
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…" | |||||
] | |||||
}, | |||||
"metadata": {}, | |||||
"output_type": "display_data" | |||||
}, | |||||
{ | |||||
"name": "stdout", | |||||
"output_type": "stream", | |||||
"text": [ | |||||
"\r", | |||||
"Evaluate data in 0.44 seconds!\n", | |||||
"\r", | |||||
"Evaluation on dev at Epoch 2/10. Step:600/3000: \n", | |||||
"\r", | |||||
"AccuracyMetric: acc=0.8675\n", | |||||
"\n" | |||||
] | |||||
}, | |||||
{ | |||||
"data": { | |||||
"text/plain": [ | |||||
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…" | |||||
] | |||||
}, | |||||
"metadata": {}, | |||||
"output_type": "display_data" | |||||
}, | |||||
{ | |||||
"name": "stdout", | |||||
"output_type": "stream", | |||||
"text": [ | |||||
"\r", | |||||
"Evaluate data in 0.44 seconds!\n", | |||||
"\r", | |||||
"Evaluation on dev at Epoch 3/10. Step:900/3000: \n", | |||||
"\r", | |||||
"AccuracyMetric: acc=0.878333\n", | |||||
"\n" | |||||
] | |||||
}, | |||||
{ | |||||
"data": { | |||||
"text/plain": [ | |||||
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…" | |||||
] | |||||
}, | |||||
"metadata": {}, | |||||
"output_type": "display_data" | |||||
}, | |||||
{ | |||||
"name": "stdout", | |||||
"output_type": "stream", | |||||
"text": [ | |||||
"\r", | |||||
"Evaluate data in 0.43 seconds!\n", | |||||
"\r", | |||||
"Evaluation on dev at Epoch 4/10. Step:1200/3000: \n", | |||||
"\r", | |||||
"AccuracyMetric: acc=0.873333\n", | |||||
"\n" | |||||
] | |||||
}, | |||||
{ | |||||
"data": { | |||||
"text/plain": [ | |||||
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…" | |||||
] | |||||
}, | |||||
"metadata": {}, | |||||
"output_type": "display_data" | |||||
}, | |||||
{ | |||||
"name": "stdout", | |||||
"output_type": "stream", | |||||
"text": [ | |||||
"\r", | |||||
"Evaluate data in 0.44 seconds!\n", | |||||
"\r", | |||||
"Evaluation on dev at Epoch 5/10. Step:1500/3000: \n", | |||||
"\r", | |||||
"AccuracyMetric: acc=0.878333\n", | |||||
"\n" | |||||
] | |||||
}, | |||||
{ | |||||
"data": { | |||||
"text/plain": [ | |||||
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…" | |||||
] | |||||
}, | |||||
"metadata": {}, | |||||
"output_type": "display_data" | |||||
}, | |||||
{ | |||||
"name": "stdout", | |||||
"output_type": "stream", | |||||
"text": [ | |||||
"\r", | |||||
"Evaluate data in 0.42 seconds!\n", | |||||
"\r", | |||||
"Evaluation on dev at Epoch 6/10. Step:1800/3000: \n", | |||||
"\r", | |||||
"AccuracyMetric: acc=0.895833\n", | |||||
"\n" | |||||
] | |||||
}, | |||||
{ | |||||
"data": { | |||||
"text/plain": [ | |||||
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…" | |||||
] | |||||
}, | |||||
"metadata": {}, | |||||
"output_type": "display_data" | |||||
}, | |||||
{ | |||||
"name": "stdout", | |||||
"output_type": "stream", | |||||
"text": [ | |||||
"\r", | |||||
"Evaluate data in 0.44 seconds!\n", | |||||
"\r", | |||||
"Evaluation on dev at Epoch 7/10. Step:2100/3000: \n", | |||||
"\r", | |||||
"AccuracyMetric: acc=0.8975\n", | |||||
"\n" | |||||
] | |||||
}, | |||||
{ | |||||
"data": { | |||||
"text/plain": [ | |||||
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…" | |||||
] | |||||
}, | |||||
"metadata": {}, | |||||
"output_type": "display_data" | |||||
}, | |||||
{ | |||||
"name": "stdout", | |||||
"output_type": "stream", | |||||
"text": [ | |||||
"\r", | |||||
"Evaluate data in 0.43 seconds!\n", | |||||
"\r", | |||||
"Evaluation on dev at Epoch 8/10. Step:2400/3000: \n", | |||||
"\r", | |||||
"AccuracyMetric: acc=0.894167\n", | |||||
"\n" | |||||
] | |||||
}, | |||||
{ | |||||
"data": { | |||||
"text/plain": [ | |||||
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…" | |||||
] | |||||
}, | |||||
"metadata": {}, | |||||
"output_type": "display_data" | |||||
}, | |||||
{ | |||||
"name": "stdout", | |||||
"output_type": "stream", | |||||
"text": [ | |||||
"\r", | |||||
"Evaluate data in 0.48 seconds!\n", | |||||
"\r", | |||||
"Evaluation on dev at Epoch 9/10. Step:2700/3000: \n", | |||||
"\r", | |||||
"AccuracyMetric: acc=0.8875\n", | |||||
"\n" | |||||
] | |||||
}, | |||||
{ | |||||
"data": { | |||||
"text/plain": [ | |||||
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…" | |||||
] | |||||
}, | |||||
"metadata": {}, | |||||
"output_type": "display_data" | |||||
}, | |||||
{ | |||||
"name": "stdout", | |||||
"output_type": "stream", | |||||
"text": [ | |||||
"\r", | |||||
"Evaluate data in 0.43 seconds!\n", | |||||
"\r", | |||||
"Evaluation on dev at Epoch 10/10. Step:3000/3000: \n", | |||||
"\r", | |||||
"AccuracyMetric: acc=0.895833\n", | |||||
"\n", | |||||
"\r\n", | |||||
"In Epoch:7/Step:2100, got best dev performance:\n", | |||||
"AccuracyMetric: acc=0.8975\n", | |||||
"Reloaded the best model.\n" | |||||
] | |||||
}, | |||||
{ | |||||
"data": { | |||||
"text/plain": [ | |||||
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=19), HTML(value='')), layout=Layout(display='…" | |||||
] | |||||
}, | |||||
"metadata": {}, | |||||
"output_type": "display_data" | |||||
}, | |||||
{ | |||||
"name": "stdout", | |||||
"output_type": "stream", | |||||
"text": [ | |||||
"\r", | |||||
"Evaluate data in 0.34 seconds!\n", | |||||
"[tester] \n", | |||||
"AccuracyMetric: acc=0.8975\n" | |||||
] | |||||
}, | |||||
{ | |||||
"data": { | |||||
"text/plain": [ | |||||
"{'AccuracyMetric': {'acc': 0.8975}}" | |||||
] | |||||
}, | |||||
"execution_count": 10, | |||||
"metadata": {}, | |||||
"output_type": "execute_result" | |||||
} | |||||
], | |||||
"outputs": [], | |||||
"source": [ | "source": [ | ||||
"from fastNLP import Trainer\n", | "from fastNLP import Trainer\n", | ||||
"from fastNLP import CrossEntropyLoss\n", | "from fastNLP import CrossEntropyLoss\n", | ||||
@@ -643,139 +279,9 @@ | |||||
}, | }, | ||||
{ | { | ||||
"cell_type": "code", | "cell_type": "code", | ||||
"execution_count": 12, | |||||
"execution_count": null, | |||||
"metadata": {}, | "metadata": {}, | ||||
"outputs": [ | |||||
{ | |||||
"name": "stdout", | |||||
"output_type": "stream", | |||||
"text": [ | |||||
"loading vocabulary file /home/yh/.fastNLP/embedding/bert-chinese-wwm/vocab.txt\n", | |||||
"Load pre-trained BERT parameters from file /home/yh/.fastNLP/embedding/bert-chinese-wwm/chinese_wwm_pytorch.bin.\n", | |||||
"Start to generating word pieces for word.\n", | |||||
"Found(Or segment into word pieces) 4286 words out of 4409.\n", | |||||
"input fields after batch(if batch size is 2):\n", | |||||
"\ttarget: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n", | |||||
"\tchars: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 106]) \n", | |||||
"\tseq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n", | |||||
"target fields after batch(if batch size is 2):\n", | |||||
"\ttarget: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n", | |||||
"\tseq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n", | |||||
"\n", | |||||
"Evaluate data in 0.05 seconds!\n", | |||||
"training epochs started 2019-09-04-00-02-37\n" | |||||
] | |||||
}, | |||||
{ | |||||
"data": { | |||||
"text/plain": [ | |||||
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=3600), HTML(value='')), layout=Layout(display…" | |||||
] | |||||
}, | |||||
"metadata": {}, | |||||
"output_type": "display_data" | |||||
}, | |||||
{ | |||||
"data": { | |||||
"text/plain": [ | |||||
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=150), HTML(value='')), layout=Layout(display=…" | |||||
] | |||||
}, | |||||
"metadata": {}, | |||||
"output_type": "display_data" | |||||
}, | |||||
{ | |||||
"name": "stdout", | |||||
"output_type": "stream", | |||||
"text": [ | |||||
"\r", | |||||
"Evaluate data in 15.89 seconds!\n", | |||||
"\r", | |||||
"Evaluation on dev at Epoch 1/3. Step:1200/3600: \n", | |||||
"\r", | |||||
"AccuracyMetric: acc=0.9\n", | |||||
"\n" | |||||
] | |||||
}, | |||||
{ | |||||
"data": { | |||||
"text/plain": [ | |||||
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=150), HTML(value='')), layout=Layout(display=…" | |||||
] | |||||
}, | |||||
"metadata": {}, | |||||
"output_type": "display_data" | |||||
}, | |||||
{ | |||||
"name": "stdout", | |||||
"output_type": "stream", | |||||
"text": [ | |||||
"\r", | |||||
"Evaluate data in 15.92 seconds!\n", | |||||
"\r", | |||||
"Evaluation on dev at Epoch 2/3. Step:2400/3600: \n", | |||||
"\r", | |||||
"AccuracyMetric: acc=0.904167\n", | |||||
"\n" | |||||
] | |||||
}, | |||||
{ | |||||
"data": { | |||||
"text/plain": [ | |||||
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=150), HTML(value='')), layout=Layout(display=…" | |||||
] | |||||
}, | |||||
"metadata": {}, | |||||
"output_type": "display_data" | |||||
}, | |||||
{ | |||||
"name": "stdout", | |||||
"output_type": "stream", | |||||
"text": [ | |||||
"\r", | |||||
"Evaluate data in 15.91 seconds!\n", | |||||
"\r", | |||||
"Evaluation on dev at Epoch 3/3. Step:3600/3600: \n", | |||||
"\r", | |||||
"AccuracyMetric: acc=0.918333\n", | |||||
"\n", | |||||
"\r\n", | |||||
"In Epoch:3/Step:3600, got best dev performance:\n", | |||||
"AccuracyMetric: acc=0.918333\n", | |||||
"Reloaded the best model.\n", | |||||
"Performance on test is:\n" | |||||
] | |||||
}, | |||||
{ | |||||
"data": { | |||||
"text/plain": [ | |||||
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=19), HTML(value='')), layout=Layout(display='…" | |||||
] | |||||
}, | |||||
"metadata": {}, | |||||
"output_type": "display_data" | |||||
}, | |||||
{ | |||||
"name": "stdout", | |||||
"output_type": "stream", | |||||
"text": [ | |||||
"\r", | |||||
"Evaluate data in 29.24 seconds!\n", | |||||
"[tester] \n", | |||||
"AccuracyMetric: acc=0.919167\n" | |||||
] | |||||
}, | |||||
{ | |||||
"data": { | |||||
"text/plain": [ | |||||
"{'AccuracyMetric': {'acc': 0.919167}}" | |||||
] | |||||
}, | |||||
"execution_count": 12, | |||||
"metadata": {}, | |||||
"output_type": "execute_result" | |||||
} | |||||
], | |||||
"outputs": [], | |||||
"source": [ | "source": [ | ||||
"# 只需要切换一下Embedding即可\n", | "# 只需要切换一下Embedding即可\n", | ||||
"from fastNLP.embeddings import BertEmbedding\n", | "from fastNLP.embeddings import BertEmbedding\n", | ||||
@@ -840,9 +346,7 @@ | |||||
{ | { | ||||
"cell_type": "code", | "cell_type": "code", | ||||
"execution_count": null, | "execution_count": null, | ||||
"metadata": { | |||||
"collapsed": true | |||||
}, | |||||
"metadata": {}, | |||||
"outputs": [], | "outputs": [], | ||||
"source": [ | "source": [ | ||||
"from fastNLP.io import ChnSentiCorpLoader\n", | "from fastNLP.io import ChnSentiCorpLoader\n", | ||||
@@ -861,9 +365,7 @@ | |||||
{ | { | ||||
"cell_type": "code", | "cell_type": "code", | ||||
"execution_count": null, | "execution_count": null, | ||||
"metadata": { | |||||
"collapsed": true | |||||
}, | |||||
"metadata": {}, | |||||
"outputs": [], | "outputs": [], | ||||
"source": [ | "source": [ | ||||
"import os\n", | "import os\n", | ||||
@@ -912,15 +414,14 @@ | |||||
{ | { | ||||
"cell_type": "code", | "cell_type": "code", | ||||
"execution_count": null, | "execution_count": null, | ||||
"metadata": { | |||||
"collapsed": true | |||||
}, | |||||
"metadata": {}, | |||||
"outputs": [], | "outputs": [], | ||||
"source": [ | "source": [ | ||||
"from fastHan import FastHan\n", | "from fastHan import FastHan\n", | ||||
"from fastNLP import Vocabulary\n", | "from fastNLP import Vocabulary\n", | ||||
"\n", | "\n", | ||||
"model=FastHan()\n", | "model=FastHan()\n", | ||||
"# model.set_device('cuda')\n", | |||||
"\n", | "\n", | ||||
"# 定义分词处理操作\n", | "# 定义分词处理操作\n", | ||||
"def word_seg(ins):\n", | "def word_seg(ins):\n", | ||||
@@ -933,6 +434,8 @@ | |||||
" # apply函数将对内部的instance依次执行word_seg操作,并把其返回值放入到raw_words这个field\n", | " # apply函数将对内部的instance依次执行word_seg操作,并把其返回值放入到raw_words这个field\n", | ||||
" ds.apply(word_seg, new_field_name='raw_words')\n", | " ds.apply(word_seg, new_field_name='raw_words')\n", | ||||
" # 除了apply函数,fastNLP还支持apply_field, apply_more(可同时创建多个field)等操作\n", | " # 除了apply函数,fastNLP还支持apply_field, apply_more(可同时创建多个field)等操作\n", | ||||
" # 同时我们增加一个seq_len的field\n", | |||||
" ds.add_seq_len('raw_words')\n", | |||||
"\n", | "\n", | ||||
"vocab = Vocabulary()\n", | "vocab = Vocabulary()\n", | ||||
"\n", | "\n", | ||||
@@ -961,11 +464,14 @@ | |||||
"# 我们把words和target分别设置为input和target,这样它们才会在训练循环中被取出并自动padding, 有关这部分更多的内容参考\n", | "# 我们把words和target分别设置为input和target,这样它们才会在训练循环中被取出并自动padding, 有关这部分更多的内容参考\n", | ||||
"# http://www.fastnlp.top/docs/fastNLP/tutorials/tutorial_6_datasetiter.html\n", | "# http://www.fastnlp.top/docs/fastNLP/tutorials/tutorial_6_datasetiter.html\n", | ||||
"data_bundle.set_target('target')\n", | "data_bundle.set_target('target')\n", | ||||
"data_bundle.set_input('words') # DataSet也有这两个接口\n", | |||||
"data_bundle.set_input('words', 'seq_len') # DataSet也有这两个接口\n", | |||||
"# 如果某些field,您希望它被设置为target或者input,但是不希望fastNLP自动padding或需要使用特定的padding方式,请参考\n", | "# 如果某些field,您希望它被设置为target或者input,但是不希望fastNLP自动padding或需要使用特定的padding方式,请参考\n", | ||||
"# http://www.fastnlp.top/docs/fastNLP/fastNLP.core.dataset.html\n", | "# http://www.fastnlp.top/docs/fastNLP/fastNLP.core.dataset.html\n", | ||||
"\n", | "\n", | ||||
"print(data_bundle.get_dataset('train')[:2]) # 我们可以看一下当前dataset的内容" | |||||
"print(data_bundle.get_dataset('train')[:2]) # 我们可以看一下当前dataset的内容\n", | |||||
"\n", | |||||
"# 由于之后需要使用之前定义的BiLSTMMaxPoolCls模型,所以需要将words这个field修改为chars(因为该模型的forward接受chars参数)\n", | |||||
"data_bundle.rename_field('words', 'chars')" | |||||
] | ] | ||||
}, | }, | ||||
{ | { | ||||
@@ -985,9 +491,7 @@ | |||||
{ | { | ||||
"cell_type": "code", | "cell_type": "code", | ||||
"execution_count": null, | "execution_count": null, | ||||
"metadata": { | |||||
"collapsed": true | |||||
}, | |||||
"metadata": {}, | |||||
"outputs": [], | "outputs": [], | ||||
"source": [ | "source": [ | ||||
"from fastNLP.embeddings import StaticEmbedding\n", | "from fastNLP.embeddings import StaticEmbedding\n", | ||||
@@ -999,11 +503,14 @@ | |||||
{ | { | ||||
"cell_type": "code", | "cell_type": "code", | ||||
"execution_count": null, | "execution_count": null, | ||||
"metadata": { | |||||
"collapsed": true | |||||
}, | |||||
"metadata": {}, | |||||
"outputs": [], | "outputs": [], | ||||
"source": [ | "source": [ | ||||
"from fastNLP import Trainer\n", | |||||
"from fastNLP import CrossEntropyLoss\n", | |||||
"from torch.optim import Adam\n", | |||||
"from fastNLP import AccuracyMetric\n", | |||||
"\n", | |||||
"# 初始化模型\n", | "# 初始化模型\n", | ||||
"model = BiLSTMMaxPoolCls(word2vec_embed, len(data_bundle.get_vocab('target')))\n", | "model = BiLSTMMaxPoolCls(word2vec_embed, len(data_bundle.get_vocab('target')))\n", | ||||
"\n", | "\n", | ||||
@@ -1024,6 +531,13 @@ | |||||
"tester = Tester(data=data_bundle.get_dataset('test'), model=model, metrics=metric, batch_size=64, device=device)\n", | "tester = Tester(data=data_bundle.get_dataset('test'), model=model, metrics=metric, batch_size=64, device=device)\n", | ||||
"tester.test()" | "tester.test()" | ||||
] | ] | ||||
}, | |||||
{ | |||||
"cell_type": "code", | |||||
"execution_count": null, | |||||
"metadata": {}, | |||||
"outputs": [], | |||||
"source": [] | |||||
} | } | ||||
], | ], | ||||
"metadata": { | "metadata": { | ||||
@@ -1042,7 +556,7 @@ | |||||
"name": "python", | "name": "python", | ||||
"nbconvert_exporter": "python", | "nbconvert_exporter": "python", | ||||
"pygments_lexer": "ipython3", | "pygments_lexer": "ipython3", | ||||
"version": "3.6.10" | |||||
"version": "3.6.8" | |||||
} | } | ||||
}, | }, | ||||
"nbformat": 4, | "nbformat": 4, | ||||
@@ -447,6 +447,7 @@ PS: 基于词进行文本分类 | |||||
from fastNLP import Vocabulary | from fastNLP import Vocabulary | ||||
model=FastHan() | model=FastHan() | ||||
# model.set_device('cuda') # 可以取消注释这一行以使用GPU加速 | |||||
# 定义分词处理操作 | # 定义分词处理操作 | ||||
def word_seg(ins): | def word_seg(ins): | ||||
@@ -459,6 +460,8 @@ PS: 基于词进行文本分类 | |||||
# apply函数将对内部的instance依次执行word_seg操作,并把其返回值放入到raw_words这个field | # apply函数将对内部的instance依次执行word_seg操作,并把其返回值放入到raw_words这个field | ||||
ds.apply(word_seg, new_field_name='raw_words') | ds.apply(word_seg, new_field_name='raw_words') | ||||
# 除了apply函数,fastNLP还支持apply_field, apply_more(可同时创建多个field)等操作 | # 除了apply函数,fastNLP还支持apply_field, apply_more(可同时创建多个field)等操作 | ||||
# 同时我们增加一个seq_len的field | |||||
ds.add_seq_len('raw_words') | |||||
vocab = Vocabulary() | vocab = Vocabulary() | ||||
@@ -500,11 +503,14 @@ PS: 基于词进行文本分类 | |||||
# | 0 | 15.4寸笔记本的键盘... | ['15.4', '寸', '笔... | [71, 72, 73, 74, ... | | # | 0 | 15.4寸笔记本的键盘... | ['15.4', '寸', '笔... | [71, 72, 73, 74, ... | | ||||
# +--------+-----------------------+-----------------------+----------------------+ | # +--------+-----------------------+-----------------------+----------------------+ | ||||
# 由于之后需要使用之前定义的BiLSTMMaxPoolCls模型,所以需要将words这个field修改为chars | |||||
data_bundle.rename_field('words', 'chars') | |||||
我们可以打印一下vocab看一下当前的词表内容 | 我们可以打印一下vocab看一下当前的词表内容 | ||||
.. code-block:: python | .. code-block:: python | ||||
print(data_bundle.get_vocab('words')) | |||||
print(data_bundle.get_vocab('chars')) | |||||
# Vocabulary([选择, 珠江, 花园, 的, 原因]...) | # Vocabulary([选择, 珠江, 花园, 的, 原因]...) | ||||
(3) 选择预训练词向量 | (3) 选择预训练词向量 | ||||
@@ -520,7 +526,7 @@ PS: 基于词进行文本分类 | |||||
from fastNLP.embeddings import StaticEmbedding | from fastNLP.embeddings import StaticEmbedding | ||||
word2vec_embed = StaticEmbedding(data_bundle.get_vocab('words'), model_dir_or_name='/path/to/Tencent_AILab_ChineseEmbedding.txt') | |||||
word2vec_embed = StaticEmbedding(data_bundle.get_vocab('chars'), model_dir_or_name='/path/to/Tencent_AILab_ChineseEmbedding.txt') | |||||
再之后的模型定义与训练过程与上面是一致的,这里就不再赘述了。 | 再之后的模型定义与训练过程与上面是一致的,这里就不再赘述了。 | ||||
@@ -531,11 +531,11 @@ class DataSet(object): | |||||
| pad_value | 0 | | | | pad_value | 0 | | | ||||
+-------------+-------+-------+ | +-------------+-------+-------+ | ||||
:param field_names: DataSet中field的名称 | |||||
:param is_input: field是否为input | |||||
:param is_target: field是否为target | |||||
:param ignore_type: 是否忽略该field的type, 一般仅在该field至少为input或target时才有意义 | |||||
:param pad_value: 该field的pad的值,仅在该field为input或target时有意义 | |||||
str field_names: DataSet中field的名称 | |||||
bool is_input: field是否为input | |||||
bool is_target: field是否为target | |||||
bool ignore_type: 是否忽略该field的type, 一般仅在该field至少为input或target时才有意义 | |||||
int pad_value: 该field的pad的值,仅在该field为input或target时有意义 | |||||
:return: | :return: | ||||
""" | """ | ||||
if len(self.field_arrays)>0: | if len(self.field_arrays)>0: | ||||
@@ -1146,3 +1146,40 @@ class DataSet(object): | |||||
def _collate_batch(self, ins_list): | def _collate_batch(self, ins_list): | ||||
return self.collater.collate_batch(ins_list) | return self.collater.collate_batch(ins_list) | ||||
def concat(self, dataset, inplace=True, field_mapping=None):
    """
    Concatenate ``dataset`` onto the current dataset to form one larger dataset.

    Every field of the current dataset must be available in ``dataset`` (after
    ``field_mapping`` has been applied); otherwise an error is raised. Fields that
    only exist in ``dataset`` are ignored. The result is either the current dataset
    itself or a deep copy of it, so its input/target settings and collate_fn are
    those of the current dataset.

    :param DataSet dataset: the dataset to append to the current one
    :param bool inplace: if True, extend the current dataset directly; otherwise
        extend and return a deep copy, leaving the current dataset untouched
    :param dict field_mapping: used when field names differ between the two
        datasets. Keys are field names in ``dataset``; values are the names they
        correspond to in the current dataset. Entries whose key is not a field of
        ``dataset`` are ignored.
    :return: DataSet
    :raises AssertionError: if ``dataset`` is not a DataSet
    :raises RuntimeError: if ``dataset`` does not provide every field of the
        current dataset
    """
    if not isinstance(dataset, DataSet):
        # Raised explicitly rather than via ``assert`` so the check is not stripped
        # under ``python -O``; the exception type is kept for backward compatibility.
        raise AssertionError("Can only concat two datasets.")

    other_field_names = list(dataset.get_field_names())
    mapping = field_mapping if field_mapping is not None else {}

    # source_of[name in this dataset] -> name of the field to read from ``dataset``.
    # Unmapped fields map to themselves; mapped fields override them afterwards, so
    # an explicit mapping wins over a same-named field in ``dataset``.
    source_of = {fn: fn for fn in other_field_names if fn not in mapping}
    present_in_other = set(other_field_names)
    for other_fn, this_fn in mapping.items():
        # Skip stale mapping entries: they must not redirect a lookup to a field
        # that ``dataset`` does not actually contain.
        if other_fn in present_in_other:
            source_of[this_fn] = other_fn

    own_field_names = list(dict.fromkeys(self.get_field_names()))
    fn_not_seen = [fn for fn in own_field_names if fn not in source_of]
    if fn_not_seen:
        raise RuntimeError(f"The following fields are not provided in the dataset:{fn_not_seen}")

    # Copy all incoming columns before touching the target, so a failure while
    # reading ``dataset`` cannot leave the target with columns of unequal length.
    incoming = {fn: deepcopy(dataset.get_field(source_of[fn]).content) for fn in own_field_names}

    ds = self if inplace else deepcopy(self)
    for fn, new_content in incoming.items():
        ds.get_field(fn).content.extend(new_content)

    return ds
@@ -13,6 +13,7 @@ import torch | |||||
from torch import nn as nn | from torch import nn as nn | ||||
from .embedding import TokenEmbedding | from .embedding import TokenEmbedding | ||||
from .utils import _check_vocab_has_same_index | |||||
class StackEmbedding(TokenEmbedding): | class StackEmbedding(TokenEmbedding): | ||||
@@ -44,8 +45,9 @@ class StackEmbedding(TokenEmbedding): | |||||
vocabs.append(embed.get_word_vocab()) | vocabs.append(embed.get_word_vocab()) | ||||
_vocab = vocabs[0] | _vocab = vocabs[0] | ||||
for vocab in vocabs[1:]: | for vocab in vocabs[1:]: | ||||
assert vocab == _vocab, "All embeddings in StackEmbedding should use the same word vocabulary." | |||||
if _vocab!=vocab: | |||||
_check_vocab_has_same_index(_vocab, vocab) | |||||
super(StackEmbedding, self).__init__(_vocab, word_dropout=word_dropout, dropout=dropout) | super(StackEmbedding, self).__init__(_vocab, word_dropout=word_dropout, dropout=dropout) | ||||
assert isinstance(embeds, list) | assert isinstance(embeds, list) | ||||
for embed in embeds: | for embed in embeds: | ||||
@@ -60,6 +62,7 @@ class StackEmbedding(TokenEmbedding): | |||||
:return: | :return: | ||||
""" | """ | ||||
assert isinstance(embed, TokenEmbedding) | assert isinstance(embed, TokenEmbedding) | ||||
_check_vocab_has_same_index(self.get_word_vocab(), embed.get_word_vocab()) | |||||
self._embed_size += embed.embed_size | self._embed_size += embed.embed_size | ||||
self.embeds.append(embed) | self.embeds.append(embed) | ||||
return self | return self | ||||
@@ -81,7 +81,7 @@ class StaticEmbedding(TokenEmbedding): | |||||
init_method=None, lower=False, dropout=0, word_dropout=0, normalize=False, min_freq=1, **kwargs): | init_method=None, lower=False, dropout=0, word_dropout=0, normalize=False, min_freq=1, **kwargs): | ||||
r""" | r""" | ||||
:param vocab: Vocabulary. 若该项为None则会读取所有的embedding。 | |||||
:param Vocabulary vocab: 词表. StaticEmbedding只会加载包含在词表中的词的词向量,在预训练向量中没找到的使用随机初始化 | |||||
:param model_dir_or_name: 可以有两种方式调用预训练好的static embedding:第一种是传入embedding文件夹(文件夹下应该只有一个 | :param model_dir_or_name: 可以有两种方式调用预训练好的static embedding:第一种是传入embedding文件夹(文件夹下应该只有一个 | ||||
以.txt作为后缀的文件)或文件路径;第二种是传入embedding的名称,第二种情况将自动查看缓存中是否存在该模型,没有的话将自动下载。 | 以.txt作为后缀的文件)或文件路径;第二种是传入embedding的名称,第二种情况将自动查看缓存中是否存在该模型,没有的话将自动下载。 | ||||
如果输入为None则使用embedding_dim的维度随机初始化一个embedding。 | 如果输入为None则使用embedding_dim的维度随机初始化一个embedding。 | ||||
@@ -89,3 +89,16 @@ def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None): | |||||
return torch.FloatTensor(sinusoid_table) | return torch.FloatTensor(sinusoid_table) | ||||
def _check_vocab_has_same_index(vocab, other_vocab):
    """
    Check that every word of ``vocab`` is mapped to the same index by ``other_vocab``.

    Nothing is checked when the two vocabularies do not compare as different (``!=``).

    :param Vocabulary vocab: vocabulary whose ``(word, index)`` pairs are iterated
    :param Vocabulary other_vocab: vocabulary queried through ``to_index`` for each word of ``vocab``
    :return: None
    :raises AssertionError: if a word of ``vocab`` has a different index in ``other_vocab``
    """
    if other_vocab != vocab:
        for word, word_ix in vocab:
            other_word_idx = other_vocab.to_index(word)
            if other_word_idx != word_ix:
                # Raised explicitly instead of via ``assert`` so the check still runs under ``python -O``.
                raise AssertionError(f"Word {word} has different index in vocabs, {word_ix} Vs. {other_word_idx}.")
@@ -34,56 +34,3 @@ class NaiveClassifier(BaseModel): | |||||
def predict(self, x): | def predict(self, x): | ||||
return {"predict": torch.sigmoid(self.mlp(x)) > 0.5} | return {"predict": torch.sigmoid(self.mlp(x)) > 0.5} | ||||
class NaiveClassifier2(BaseModel):
    r"""
    A simple classifier example that can be used in all kinds of tests.
    """

    def __init__(self, in_feature_dim, out_feature_dim):
        super().__init__()
        layer_sizes = [in_feature_dim, in_feature_dim, out_feature_dim]
        self.mlp = MLP(layer_sizes)

    def forward(self, x):
        # Raw MLP output, returned under the "predict" key.
        mlp_out = self.mlp(x)
        return {"predict": mlp_out}

    def predict(self, x):
        # Sigmoid of the MLP output, thresholded at 0.5.
        activated = torch.sigmoid(self.mlp(x))
        return {"predict": activated > 0.5}
class NaiveClassifier3(BaseModel):
    r"""
    A simple classifier example that can be used in all kinds of tests.

    Both ``forward`` and ``predict`` are decorated with ``torch.cuda.amp.autocast()``.
    """

    def __init__(self, in_feature_dim, out_feature_dim):
        super().__init__()
        layer_sizes = [in_feature_dim, in_feature_dim, out_feature_dim]
        self.mlp = MLP(layer_sizes)

    @torch.cuda.amp.autocast()
    def forward(self, x):
        # Raw MLP output, returned under the "predict" key.
        mlp_out = self.mlp(x)
        return {"predict": mlp_out}

    @torch.cuda.amp.autocast()
    def predict(self, x):
        # Sigmoid of the MLP output, thresholded at 0.5.
        activated = torch.sigmoid(self.mlp(x))
        return {"predict": activated > 0.5}
class NaiveClassifier4(BaseModel):
    r"""
    A simple classifier example that can be used in all kinds of tests.

    Both ``forward`` and ``predict`` run their computation inside a ``torch.cuda.amp.autocast()`` context.
    """

    def __init__(self, in_feature_dim, out_feature_dim):
        super().__init__()
        layer_sizes = [in_feature_dim, in_feature_dim, out_feature_dim]
        self.mlp = MLP(layer_sizes)

    def forward(self, x):
        with torch.cuda.amp.autocast():
            result = {"predict": self.mlp(x)}
        return result

    def predict(self, x):
        with torch.cuda.amp.autocast():
            result = {"predict": torch.sigmoid(self.mlp(x)) > 0.5}
        return result
@@ -464,6 +464,24 @@ class BertModel(nn.Module): | |||||
logger.info('DistilBert has NOT pooler, will use hidden states of [CLS] token as pooled output.') | logger.info('DistilBert has NOT pooler, will use hidden states of [CLS] token as pooled output.') | ||||
self.apply(self.init_bert_weights) | self.apply(self.init_bert_weights) | ||||
@property
def dtype(self):
    """
    :obj:`torch.dtype`: The dtype of the module, read from its first parameter (all module parameters are assumed
    to share the same dtype).
    """
    first_param = next(self.parameters(), None)
    if first_param is not None:
        return first_param.dtype

    # No parameter was yielded. For nn.DataParallel compatibility in PyTorch 1.5, fall back to the
    # first plain tensor attribute found in a module's ``__dict__``.
    def _tensor_attributes(module: nn.Module):
        return [(name, value) for name, value in module.__dict__.items() if torch.is_tensor(value)]

    members = self._named_members(get_members_fn=_tensor_attributes)
    _, first_tensor = next(members)
    return first_tensor.dtype
def init_bert_weights(self, module): | def init_bert_weights(self, module): | ||||
r""" Initialize the weights. | r""" Initialize the weights. | ||||
""" | """ | ||||
@@ -477,7 +495,8 @@ class BertModel(nn.Module): | |||||
if isinstance(module, nn.Linear) and module.bias is not None: | if isinstance(module, nn.Linear) and module.bias is not None: | ||||
module.bias.data.zero_() | module.bias.data.zero_() | ||||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True): | |||||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True, | |||||
position_ids=None): | |||||
""" | """ | ||||
:param torch.LongTensor input_ids: bsz x max_len的输入id | :param torch.LongTensor input_ids: bsz x max_len的输入id | ||||
@@ -485,6 +504,7 @@ class BertModel(nn.Module): | |||||
:param attention_mask: 需要attend的为1,不需要为0 | :param attention_mask: 需要attend的为1,不需要为0 | ||||
:param bool output_all_encoded_layers: 是否输出所有层,默认输出token embedding(包含bpe, position以及type embedding) | :param bool output_all_encoded_layers: 是否输出所有层,默认输出token embedding(包含bpe, position以及type embedding) | ||||
及每一层的hidden states。如果为False,只输出最后一层的结果 | 及每一层的hidden states。如果为False,只输出最后一层的结果 | ||||
:param torch.LongTensor position_ids: bsz x max_len, position的id | |||||
:return: encode_layers: 如果output_all_encoded_layers为True,返回list(共num_layers+1个元素),每个元素为 | :return: encode_layers: 如果output_all_encoded_layers为True,返回list(共num_layers+1个元素),每个元素为 | ||||
bsz x max_len x hidden_size否则返回bsz x max_len x hidden_size的tensor; | bsz x max_len x hidden_size否则返回bsz x max_len x hidden_size的tensor; | ||||
pooled_output: bsz x hidden_size为cls的表示,可以用于句子的分类 | pooled_output: bsz x hidden_size为cls的表示,可以用于句子的分类 | ||||
@@ -506,10 +526,12 @@ class BertModel(nn.Module): | |||||
# positions we want to attend and -10000.0 for masked positions. | # positions we want to attend and -10000.0 for masked positions. | ||||
# Since we are adding it to the raw scores before the softmax, this is | # Since we are adding it to the raw scores before the softmax, this is | ||||
# effectively the same as removing these entirely. | # effectively the same as removing these entirely. | ||||
extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility | |||||
# this will cause an issue with DataParallel: https://github.com/pytorch/pytorch/issues/40457#issuecomment-648396469 | |||||
# extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility | |||||
extended_attention_mask = extended_attention_mask.to(self.dtype) | |||||
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 | extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 | ||||
embedding_output = self.embeddings(input_ids, token_type_ids) | |||||
embedding_output = self.embeddings(input_ids, token_type_ids=token_type_ids, position_ids=position_ids) | |||||
encoded_layers = self.encoder(embedding_output, | encoded_layers = self.encoder(embedding_output, | ||||
extended_attention_mask, | extended_attention_mask, | ||||
output_all_encoded_layers=output_all_encoded_layers) | output_all_encoded_layers=output_all_encoded_layers) | ||||
@@ -787,6 +787,24 @@ class GPT2Model(GPT2PreTrainedModel): | |||||
for layer, heads in heads_to_prune.items(): | for layer, heads in heads_to_prune.items(): | ||||
self.h[layer].attn.prune_heads(heads) | self.h[layer].attn.prune_heads(heads) | ||||
@property
def dtype(self):
    """
    :obj:`torch.dtype`: The dtype of the module, read from its first parameter (all module parameters are assumed
    to share the same dtype).
    """
    first_param = next(self.parameters(), None)
    if first_param is not None:
        return first_param.dtype

    # No parameter was yielded. For nn.DataParallel compatibility in PyTorch 1.5, fall back to the
    # first plain tensor attribute found in a module's ``__dict__``.
    def _tensor_attributes(module: nn.Module):
        return [(name, value) for name, value in module.__dict__.items() if torch.is_tensor(value)]

    members = self._named_members(get_members_fn=_tensor_attributes)
    _, first_tensor = next(members)
    return first_tensor.dtype
def forward(self, input_ids, state=None, attention_mask=None, token_type_ids=None, position_ids=None, | def forward(self, input_ids, state=None, attention_mask=None, token_type_ids=None, position_ids=None, | ||||
head_mask=None, output_attentions=True): | head_mask=None, output_attentions=True): | ||||
""" | """ | ||||
@@ -834,7 +852,9 @@ class GPT2Model(GPT2PreTrainedModel): | |||||
# positions we want to attend and -10000.0 for masked positions. | # positions we want to attend and -10000.0 for masked positions. | ||||
# Since we are adding it to the raw scores before the softmax, this is | # Since we are adding it to the raw scores before the softmax, this is | ||||
# effectively the same as removing these entirely. | # effectively the same as removing these entirely. | ||||
attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility | |||||
# this will cause an issue with DataParallel: https://github.com/pytorch/pytorch/issues/40457#issuecomment-648396469 | |||||
# attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility | |||||
attention_mask = attention_mask.to(self.dtype) | |||||
attention_mask = (1.0 - attention_mask) * -10000.0 | attention_mask = (1.0 - attention_mask) * -10000.0 | ||||
# attention_mask = attention_mask.masked_fill(attention_mask.eq(0), -10000.0) | # attention_mask = attention_mask.masked_fill(attention_mask.eq(0), -10000.0) | ||||
@@ -39,7 +39,7 @@ class RobertaEmbeddings(BertEmbeddings): | |||||
config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx | config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx | ||||
) | ) | ||||
def forward(self, input_ids, token_type_ids, words_embeddings=None): | |||||
def forward(self, input_ids, token_type_ids, words_embeddings=None, **kwargs): | |||||
position_ids = self.create_position_ids_from_input_ids(input_ids) | position_ids = self.create_position_ids_from_input_ids(input_ids) | ||||
return super().forward( | return super().forward( | ||||
@@ -3,6 +3,5 @@ torch>=1.0.0 | |||||
tqdm>=4.28.1 | tqdm>=4.28.1 | ||||
prettytable>=0.7.2 | prettytable>=0.7.2 | ||||
requests | requests | ||||
spacy | |||||
prettytable>=0.7.2 | prettytable>=0.7.2 | ||||
regex!=2019.12.17 | regex!=2019.12.17 |
@@ -268,6 +268,57 @@ class TestDataSetMethods(unittest.TestCase): | |||||
with self.assertRaises(RuntimeError) as RE: | with self.assertRaises(RuntimeError) as RE: | ||||
ds.add_field('test', []) | ds.add_field('test', []) | ||||
def test_concat(self):
    """
    Check that two datasets can be concatenated correctly.
    """
    def build(x_row, y_row, x_name="x", y_name="y", **extra_fields):
        # Ten instances per dataset; every instance gets its own fresh list objects.
        fields = {
            x_name: [list(x_row) for _ in range(10)],
            y_name: [list(y_row) for _ in range(10)],
        }
        fields.update(extra_fields)
        return DataSet(fields)

    ds1 = build([1, 2, 3, 4], [5, 6])
    ds2 = build([4, 3, 2, 1], [6, 5])
    ds3 = ds1.concat(ds2)

    self.assertEqual(len(ds3), 20)

    self.assertListEqual(ds1[9]['x'], [1, 2, 3, 4])
    self.assertListEqual(ds1[10]['x'], [4, 3, 2, 1])

    ds2[0]['x'][0] = 100
    self.assertEqual(ds3[10]['x'][0], 4)  # the copied field is not affected
    ds3[10]['x'][0] = -100
    self.assertEqual(ds2[0]['x'][0], 100)  # the field that was copied from is not affected

    # test inplace
    ds1 = build([1, 2, 3, 4], [5, 6])
    ds2 = build([4, 3, 2, 1], [6, 5])
    ds3 = ds1.concat(ds2, inplace=True)

    ds2[0]['x'][0] = 100
    self.assertEqual(ds3[10]['x'][0], 4)  # the copied field is not affected
    ds3[10]['x'][0] = -100
    self.assertEqual(ds2[0]['x'][0], 100)  # the field that was copied from is not affected

    ds3[0]['x'][0] = 100
    self.assertEqual(ds1[0]['x'][0], 100)  # the original dataset's field is changed

    # test field mapping
    name_map = {'X': 'x', 'Y': 'y'}
    ds1 = build([1, 2, 3, 4], [5, 6])
    ds2 = build([4, 3, 2, 1], [6, 5], x_name="X", y_name="Y")
    ds3 = ds1.concat(ds2, field_mapping=name_map)
    self.assertEqual(len(ds3), 20)

    # test that extra fields are ignored
    ds1 = build([1, 2, 3, 4], [5, 6])
    ds2 = build([4, 3, 2, 1], [6, 5], x_name="X", y_name="Y", Z=[0] * 10)
    ds3 = ds1.concat(ds2, field_mapping={'X': 'x', 'Y': 'y'})

    # test that an error is raised
    ds1 = build([1, 2, 3, 4], [5, 6])
    ds2 = DataSet({"X": [[4, 3, 2, 1] for _ in range(10)]})
    with self.assertRaises(RuntimeError):
        ds3 = ds1.concat(ds2, field_mapping={'X': 'x'})
class TestDataSetIter(unittest.TestCase): | class TestDataSetIter(unittest.TestCase): | ||||
def test__repr__(self): | def test__repr__(self): | ||||
@@ -14,8 +14,12 @@ from fastNLP import CrossEntropyLoss | |||||
from fastNLP import AccuracyMetric | from fastNLP import AccuracyMetric | ||||
from fastNLP import SGD | from fastNLP import SGD | ||||
from fastNLP import Trainer | from fastNLP import Trainer | ||||
from fastNLP.models.base_model import NaiveClassifier, NaiveClassifier2, NaiveClassifier3, NaiveClassifier4 | |||||
from fastNLP.models.base_model import NaiveClassifier | |||||
from fastNLP import TorchLoaderIter | from fastNLP import TorchLoaderIter | ||||
from fastNLP.models import BaseModel | |||||
from fastNLP.modules import MLP | |||||
from pkg_resources import parse_version | |||||
def prepare_fake_dataset(): | def prepare_fake_dataset(): | ||||
@@ -577,6 +581,22 @@ class TrainerTestGround(unittest.TestCase): | |||||
""" | """ | ||||
class NaiveClassifier2(BaseModel):
    r"""
    A simple classifier example that can be used in all kinds of tests.
    """

    def __init__(self, in_feature_dim, out_feature_dim):
        super().__init__()
        layer_sizes = [in_feature_dim, in_feature_dim, out_feature_dim]
        self.mlp = MLP(layer_sizes)

    def forward(self, x):
        # Raw MLP output, returned under the "predict" key.
        mlp_out = self.mlp(x)
        return {"predict": mlp_out}

    def predict(self, x):
        # Sigmoid of the MLP output, thresholded at 0.5.
        activated = torch.sigmoid(self.mlp(x))
        return {"predict": activated > 0.5}
class Fp16TrainerTest(unittest.TestCase): | class Fp16TrainerTest(unittest.TestCase): | ||||
def test_raise_error(self): | def test_raise_error(self): | ||||
data_set = prepare_fake_dataset() | data_set = prepare_fake_dataset() | ||||
@@ -605,7 +625,7 @@ class Fp16TrainerTest(unittest.TestCase): | |||||
metrics=AccuracyMetric(pred="predict", target="y"), validate_every=-1, save_path=None, | metrics=AccuracyMetric(pred="predict", target="y"), validate_every=-1, save_path=None, | ||||
use_tqdm=True, check_code_level=2, fp16=True, device=torch.device('cpu')) | use_tqdm=True, check_code_level=2, fp16=True, device=torch.device('cpu')) | ||||
@unittest.skipIf(torch.cuda.is_available()==False, "Skip when no cuda device detch") | |||||
@unittest.skipIf(torch.cuda.is_available()==False or parse_version(torch.__version__) < parse_version('1.6'), "Skip when no cuda device detch") | |||||
def test_run_fp16(self): | def test_run_fp16(self): | ||||
data_set = prepare_fake_dataset() | data_set = prepare_fake_dataset() | ||||
data_set.set_input("x", flag=True) | data_set.set_input("x", flag=True) | ||||
@@ -627,7 +647,7 @@ class Fp16TrainerTest(unittest.TestCase): | |||||
use_tqdm=True, check_code_level=2, fp16=True, device=0, test_use_fp16=False) | use_tqdm=True, check_code_level=2, fp16=True, device=0, test_use_fp16=False) | ||||
trainer.train(load_best_model=False) | trainer.train(load_best_model=False) | ||||
@unittest.skipIf(torch.cuda.device_count()<2, "Skip when lower than 1 gpus.") | |||||
@unittest.skipIf(torch.cuda.device_count()<2 or parse_version(torch.__version__) < parse_version('1.6'), "Skip when lower than 1 gpus.") | |||||
def test_run_data_parallel(self): | def test_run_data_parallel(self): | ||||
data_set = prepare_fake_dataset() | data_set = prepare_fake_dataset() | ||||
data_set.set_input("x", flag=True) | data_set.set_input("x", flag=True) | ||||
@@ -635,6 +655,21 @@ class Fp16TrainerTest(unittest.TestCase): | |||||
train_set, dev_set = data_set.split(0.3) | train_set, dev_set = data_set.split(0.3) | ||||
class NaiveClassifier2(BaseModel): | |||||
r""" | |||||
一个简单的分类器例子,可用于各种测试 | |||||
""" | |||||
def __init__(self, in_feature_dim, out_feature_dim): | |||||
super(NaiveClassifier2, self).__init__() | |||||
self.mlp = MLP([in_feature_dim, in_feature_dim, out_feature_dim]) | |||||
def forward(self, x): | |||||
return {"predict": self.mlp(x)} | |||||
def predict(self, x): | |||||
return {"predict": torch.sigmoid(self.mlp(x)) > 0.5} | |||||
model = NaiveClassifier2(2, 1) | model = NaiveClassifier2(2, 1) | ||||
with self.assertRaises(RuntimeError): | with self.assertRaises(RuntimeError): | ||||
trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCEWithLogits(pred="predict", target="y"), | trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCEWithLogits(pred="predict", target="y"), | ||||
@@ -643,12 +678,46 @@ class Fp16TrainerTest(unittest.TestCase): | |||||
use_tqdm=True, check_code_level=2, fp16=True, device=[0, 1]) | use_tqdm=True, check_code_level=2, fp16=True, device=[0, 1]) | ||||
with self.assertRaises(RuntimeError): | with self.assertRaises(RuntimeError): | ||||
class NaiveClassifier3(BaseModel): | |||||
r""" | |||||
一个简单的分类器例子,可用于各种测试 | |||||
""" | |||||
def __init__(self, in_feature_dim, out_feature_dim): | |||||
super(NaiveClassifier3, self).__init__() | |||||
self.mlp = MLP([in_feature_dim, in_feature_dim, out_feature_dim]) | |||||
@torch.cuda.amp.autocast() | |||||
def forward(self, x): | |||||
return {"predict": self.mlp(x)} | |||||
@torch.cuda.amp.autocast() | |||||
def predict(self, x): | |||||
return {"predict": torch.sigmoid(self.mlp(x)) > 0.5} | |||||
model = NaiveClassifier3(2, 1) | model = NaiveClassifier3(2, 1) | ||||
trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCEWithLogits(pred="predict", target="y"), | trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCEWithLogits(pred="predict", target="y"), | ||||
batch_size=32, n_epochs=10, print_every=50, dev_data=dev_set, | batch_size=32, n_epochs=10, print_every=50, dev_data=dev_set, | ||||
metrics=AccuracyMetric(pred="predict", target="y"), validate_every=-1, save_path=None, | metrics=AccuracyMetric(pred="predict", target="y"), validate_every=-1, save_path=None, | ||||
use_tqdm=True, check_code_level=2, fp16=True, device=[0, 1], test_use_fp16=True) | use_tqdm=True, check_code_level=2, fp16=True, device=[0, 1], test_use_fp16=True) | ||||
class NaiveClassifier4(BaseModel): | |||||
r""" | |||||
一个简单的分类器例子,可用于各种测试 | |||||
""" | |||||
def __init__(self, in_feature_dim, out_feature_dim): | |||||
super(NaiveClassifier4, self).__init__() | |||||
self.mlp = MLP([in_feature_dim, in_feature_dim, out_feature_dim]) | |||||
def forward(self, x): | |||||
with torch.cuda.amp.autocast(): | |||||
return {"predict": self.mlp(x)} | |||||
def predict(self, x): | |||||
with torch.cuda.amp.autocast(): | |||||
return {"predict": torch.sigmoid(self.mlp(x)) > 0.5} | |||||
model = NaiveClassifier4(2, 1) | model = NaiveClassifier4(2, 1) | ||||
trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCEWithLogits(pred="predict", target="y"), | trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCEWithLogits(pred="predict", target="y"), | ||||
batch_size=32, n_epochs=10, print_every=50, dev_data=dev_set, | batch_size=32, n_epochs=10, print_every=50, dev_data=dev_set, | ||||
@@ -31,29 +31,33 @@ class TestDownload(unittest.TestCase): | |||||
class TestBertEmbedding(unittest.TestCase): | class TestBertEmbedding(unittest.TestCase): | ||||
def test_bert_embedding_1(self): | def test_bert_embedding_1(self): | ||||
vocab = Vocabulary().add_word_lst("this is a test . [SEP] NotInBERT".split()) | |||||
embed = BertEmbedding(vocab, model_dir_or_name='tests/data_for_tests/embedding/small_bert', word_dropout=0.1) | |||||
requires_grad = embed.requires_grad | |||||
embed.requires_grad = not requires_grad | |||||
embed.train() | |||||
words = torch.LongTensor([[2, 3, 4, 0]]) | |||||
result = embed(words) | |||||
self.assertEqual(result.size(), (1, 4, 16)) | |||||
embed = BertEmbedding(vocab, model_dir_or_name='tests/data_for_tests/embedding/small_bert', word_dropout=0.1) | |||||
embed.eval() | |||||
words = torch.LongTensor([[2, 3, 4, 0]]) | |||||
result = embed(words) | |||||
self.assertEqual(result.size(), (1, 4, 16)) | |||||
# 自动截断而不报错 | |||||
embed = BertEmbedding(vocab, model_dir_or_name='tests/data_for_tests/embedding/small_bert', word_dropout=0.1, | |||||
auto_truncate=True) | |||||
words = torch.LongTensor([[2, 3, 4, 1]*10, | |||||
[2, 3]+[0]*38]) | |||||
result = embed(words) | |||||
self.assertEqual(result.size(), (2, 40, 16)) | |||||
for pool_method in ['first', 'last', 'max', 'avg']: | |||||
with self.subTest(pool_method=pool_method): | |||||
vocab = Vocabulary().add_word_lst("this is a test . [SEP] NotInBERT".split()) | |||||
embed = BertEmbedding(vocab, model_dir_or_name='tests/data_for_tests/embedding/small_bert', word_dropout=0.1, | |||||
pool_method=pool_method) | |||||
requires_grad = embed.requires_grad | |||||
embed.requires_grad = not requires_grad | |||||
embed.train() | |||||
words = torch.LongTensor([[2, 3, 4, 0]]) | |||||
result = embed(words) | |||||
self.assertEqual(result.size(), (1, 4, 16)) | |||||
embed = BertEmbedding(vocab, model_dir_or_name='tests/data_for_tests/embedding/small_bert', word_dropout=0.1, | |||||
pool_method=pool_method) | |||||
embed.eval() | |||||
words = torch.LongTensor([[2, 3, 4, 0]]) | |||||
result = embed(words) | |||||
self.assertEqual(result.size(), (1, 4, 16)) | |||||
# 自动截断而不报错 | |||||
embed = BertEmbedding(vocab, model_dir_or_name='tests/data_for_tests/embedding/small_bert', word_dropout=0.1, | |||||
auto_truncate=True, pool_method=pool_method) | |||||
words = torch.LongTensor([[2, 3, 4, 1]*10, | |||||
[2, 3]+[0]*38]) | |||||
result = embed(words) | |||||
self.assertEqual(result.size(), (2, 40, 16)) | |||||
def test_save_load(self): | def test_save_load(self): | ||||
bert_save_test = 'bert_save_test' | bert_save_test = 'bert_save_test' | ||||
@@ -18,3 +18,16 @@ class TestCharEmbed(unittest.TestCase): | |||||
y = embed(x) | y = embed(x) | ||||
self.assertEqual(tuple(y.size()), (2, 3, 130)) | self.assertEqual(tuple(y.size()), (2, 3, 130)) | ||||
def test_case_2(self):
    # Embeddings can be stacked as long as their vocabularies share the same indices
    instances = [Instance(words=['hello', 'world']), Instance(words=['hello', 'Jack'])]
    ds = DataSet(instances)
    vocab1 = Vocabulary().from_dataset(ds, field_name='words')
    vocab2 = Vocabulary().from_dataset(ds, field_name='words')
    self.assertEqual(len(vocab1), 5)

    cnn_embed = CNNCharEmbedding(vocab1, embed_size=60)
    lstm_embed = LSTMCharEmbedding(vocab2, embed_size=70)
    stacked = StackEmbedding([cnn_embed, lstm_embed])

    word_ids = torch.LongTensor([[2, 1, 0], [4, 3, 4]])
    output = stacked(word_ids)
    self.assertEqual(tuple(output.size()), (2, 3, 130))
@@ -74,6 +74,7 @@ class TestRunMatchingPipe(unittest.TestCase): | |||||
name, vocabs = y | name, vocabs = y | ||||
self.assertEqual(x + 1 if name == 'words' else x, len(vocabs)) | self.assertEqual(x + 1 if name == 'words' else x, len(vocabs)) | ||||
@unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") | |||||
def test_spacy(self): | def test_spacy(self): | ||||
data_set_dict = { | data_set_dict = { | ||||
'Quora': ('tests/data_for_tests/io/Quora', QuoraPipe, QuoraBertPipe, (2, 2, 2), (93, 2)), | 'Quora': ('tests/data_for_tests/io/Quora', QuoraPipe, QuoraBertPipe, (2, 2, 2), (93, 2)), | ||||