diff --git a/.gitignore b/.gitignore index 2b2b2b35..17f7654f 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,5 @@ caches .fitlog logs/ .fitconfig + +docs/build diff --git a/.travis.yml b/.travis.yml index 210d158a..0770d4e7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,11 +4,13 @@ python: # command to install dependencies install: - pip install --quiet -r requirements.txt + - pip install --quiet fitlog - pip install pytest>=3.6 - pip install pytest-cov # command to run tests script: - - pytest --cov=./ test/ + - python -m spacy download en + - pytest --cov=fastNLP test/ after_success: - bash <(curl -s https://codecov.io/bash) diff --git a/README.md b/README.md index 476c129f..f1d17144 100644 --- a/README.md +++ b/README.md @@ -6,11 +6,12 @@ ![Hex.pm](https://img.shields.io/hexpm/l/plug.svg) [![Documentation Status](https://readthedocs.org/projects/fastnlp/badge/?version=latest)](http://fastnlp.readthedocs.io/?badge=latest) -fastNLP is a lightweight NLP processing suite. You can use it to quickly complete tasks such as sequence labeling ([NER](reproduction/seqence_labelling/ner), POS-Tagging, etc.), Chinese word segmentation, [text classification](reproduction/text_classification), [Matching](reproduction/matching), [coreference resolution](reproduction/coreference_resolution) and [summarization](reproduction/Summarization), or use it to build complex network models for research. Its features include: +fastNLP is a lightweight NLP toolkit. You can use it to quickly complete tasks such as sequence labeling ([NER](reproduction/sequence_labelling/ner), POS-Tagging, etc.), Chinese word segmentation, [text classification](reproduction/text_classification), [Matching](reproduction/matching), [coreference resolution](reproduction/coreference_resolution) and [summarization](reproduction/Summarization), or use it to quickly build complex network models for research. Its features include: -- A unified tabular data container that keeps data preprocessing clear and concise, with built-in DataSet Loaders for many datasets that remove the need for preprocessing code; +- A unified tabular data container that keeps data preprocessing clear and concise, with built-in Loaders and Pipes for many datasets that remove the need for preprocessing code; - Various training and testing components, such as the Trainer, the Tester, and assorted evaluation metrics; - Convenient NLP utilities, such as preprocessing and embedding loading (including ELMo and BERT), intermediate-data caching, and more; +- Automatic downloading of some [datasets and pretrained models](https://docs.qq.com/sheet/DVnpkTnF6VW9UeXdh?c=A1A0A0) - Detailed Chinese [documentation](https://fastnlp.readthedocs.io/) and [tutorials](https://fastnlp.readthedocs.io/zh/latest/user/tutorials.html); - Many advanced modules such as Variational LSTM, Transformer and CRF; - Ready-to-use model implementations for sequence labeling, Chinese word segmentation, text classification, Matching, coreference resolution, summarization and more; see the [reproduction](reproduction) section for details; @@ -27,6 +28,7 @@ fastNLP depends on the following packages: + nltk>=3.4.1 + requests + spacy ++ prettytable>=0.7.2 The installation of torch may depend on your operating system and CUDA version; please refer to the [PyTorch website](https://pytorch.org/). Once the dependencies are installed, you can install fastNLP from the command line with @@ -36,24 +38,30 @@ pip install fastNLP 
python -m spacy download en ``` -The version of fastNLP currently installed via pip is 0.4.1; many features there are out of date, so refer to the master branch for the latest content. -fastNLP 0.5.0 will be released soon; please stay tuned. - ## fastNLP Tutorials +### Quick start + - [0. Quick start](https://fastnlp.readthedocs.io/zh/latest/user/quickstart.html) + +### Detailed tutorials + - [1. Preprocessing text with DataSet](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_1_data_preprocess.html) -- [2. Loading datasets with DataSetLoader](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_2_load_dataset.html) +- [2. Converting text to indices with Vocabulary](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_2_vocabulary.html) - [3. Turning text into vectors with the Embedding module](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_3_embedding.html) -- [4. Building a text classifier I: quick training and testing with Trainer and Tester](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_4_loss_optimizer.html) -- [5. Building a text classifier II: a custom training loop with DataSetIter](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_5_datasetiter.html) -- [6. Building a sequence labeling model quickly](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_6_seq_labeling.html) -- [7. Building custom models quickly with Modules and Models](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_7_modules_models.html) -- [8. Evaluating your model quickly with Metric](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_8_metrics.html) -- [9. Customizing your training process with Callback](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_9_callback.html) -- [10. Using fitlog to assist research with fastNLP](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_10_fitlog.html) +- [4. Loading and processing datasets with Loader and Pipe](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_4_load_dataset.html) +- [5. Building a text classifier I: quick training and testing with Trainer and Tester](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_5_loss_optimizer.html) +- [6. Building a text classifier II: a custom training loop with DataSetIter](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_6_datasetiter.html) +- [7. Evaluating your model quickly with Metric](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_7_metrics.html) +- [8. Building custom models quickly with Modules and Models](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_8_modules_models.html) +- [9. Building a sequence labeling model quickly](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_9_seq_labeling.html) +- [10. Customizing your training process with Callback](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_10_callback.html) + +### Extended tutorials +- [Extend-1. Various ways to use BertEmbedding](https://fastnlp.readthedocs.io/zh/latest/tutorials/extend_1_bert_embedding.html) +- [Extend-2. Using fitlog to assist research with fastNLP](https://fastnlp.readthedocs.io/zh/latest/tutorials/extend_2_fitlog.html) ## Built-in components @@ -79,19 +87,19 @@ fastNLP provides several different kinds of embeddings in the embeddings module: static embedd encoder encodes the input into a vector with representational capacity - embedding, RNN, CNN, transformer + Embedding, RNN, CNN, Transformer, ... decoder decodes a vector carrying some representation into the required output form - MLP, CRF + MLP, CRF, ... ## Project structure - +![](./docs/source/figures/workflow.png) The overall workflow of fastNLP is shown in the figure above, and the project structure is as follows: @@ -118,11 +126,10 @@ The overall workflow of fastNLP is shown in the figure above, and the project structure is as follows: fastNLP.io - implements reading and writing, including data loading and model I/O + implements reading and writing, including data loading and preprocessing, model I/O, and automatic downloading of data and models -
*In memory of @FengZiYjun. May his soul rest in peace. We will miss you very very much!* diff --git a/docs/Makefile b/docs/Makefile index 2b4de2d8..b41beb44 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -14,13 +14,13 @@ help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) apidoc: - $(SPHINXAPIDOC) -efM -o source ../$(SPHINXPROJ) + $(SPHINXAPIDOC) -efM -o source ../$(SPHINXPROJ) && python3 format.py server: cd build/html && python -m http.server dev: - rm -rf build/html && make html && make server + rm -rf build && make html && make server .PHONY: help Makefile diff --git a/docs/README.md b/docs/README.md index 15dcccda..2bb6953c 100644 --- a/docs/README.md +++ b/docs/README.md @@ -32,7 +32,6 @@ Serving HTTP on 0.0.0.0 port 8000 (http://0.0.0.0:8000/) ... We list the reStructuredText syntax commonly used in the fastNLP documentation [here](./source/user/example.rst) (view the page in Raw mode); reading it is a quick way to get started. Most of fastNLP's documentation is written inside the code and extracted by the Sphinx tool, -You can also refer to this [unfinished article](./source/user/docs_in_code.rst) for the conventions of writing documentation inside the code. ## Documentation maintainers diff --git a/docs/count.py b/docs/count.py new file mode 100644 index 00000000..0830c7cc --- /dev/null +++ b/docs/count.py @@ -0,0 +1,158 @@ +import inspect +import os +import sys + + +def _colored_string(string: str, color: str or int) -> str: + """Display a colored string in the terminal. + :param string: the text to display in the terminal + :param color: the color of the text, as a name or an ANSI code + :return: + """ + if isinstance(color, str): + color = { + "black": 30, "Black": 30, "BLACK": 30, + "red": 31, "Red": 31, "RED": 31, + "green": 32, "Green": 32, "GREEN": 32, + "yellow": 33, "Yellow": 33, "YELLOW": 33, + "blue": 34, "Blue": 34, "BLUE": 34, + "purple": 35, "Purple": 35, "PURPLE": 35, + "cyan": 36, "Cyan": 36, "CYAN": 36, + "white": 37, "White": 37, "WHITE": 37 + }[color] + return "\033[%dm%s\033[0m" % (color, string) + + +def gr(string, flag): + if flag: + return _colored_string(string, "green") + else: + return _colored_string(string, "red") + + +def find_all_modules(): + modules = {} + children = {} + to_doc = set() + root = '../fastNLP' + for path, dirs, files in os.walk(root): + for file in files: + if file.endswith('.py'): + name = ".".join(path.split('/')[1:]) + if file.split('.')[0] != "__init__": + name = name + '.' + file.split('.')[0] + __import__(name) + m = sys.modules[name] + modules[name] = m + try: + m.__all__ + except AttributeError: + print(name, "__all__ missing") + continue + if m.__doc__ is None: + print(name, "__doc__ missing") + continue + if "undocumented" not in m.__doc__: + to_doc.add(name) + for module in to_doc: + t = ".".join(module.split('.')[:-1]) + if t in to_doc: + if t not in children: + children[t] = set() + children[t].add(module) + for m in children: + children[m] = sorted(children[m]) + return modules, to_doc, children + + +def create_rst_file(modules, name, children): + m = modules[name] + with open("./source/" + name + ".rst", "w") as fout: + t = "=" * len(name) + fout.write(name + "\n") + fout.write(t + "\n") + fout.write("\n") + fout.write(".. automodule:: " + name + "\n") + if name != "fastNLP.core" and len(m.__all__) > 0: + fout.write(" :members: " + ", ".join(m.__all__) + "\n") + short = name[len("fastNLP."):] + if not (short.startswith('models') or short.startswith('modules') or short.startswith('embeddings')): + fout.write(" :inherited-members:\n") + fout.write("\n") + if name in children: + fout.write("子模块\n------\n\n.. toctree::\n :maxdepth: 1\n\n") + for module in children[name]: + fout.write(" " + module + "\n") + + +def check_file(m, name): + names = name.split('.') + test_name = "test." 
+ ".".join(names[1:-1]) + ".test_" + names[-1] + try: + __import__(test_name) + tm = sys.modules[test_name] + except ModuleNotFoundError: + tm = None + tested = tm is not None + funcs = {} + classes = {} + for item, obj in inspect.getmembers(m): + if inspect.isclass(obj) and obj.__module__ == name and not obj.__name__.startswith('_'): + this = (obj.__doc__ is not None, tested and obj.__name__ in dir(tm), {}) + for i in dir(obj): + func = getattr(obj, i) + if inspect.isfunction(func) and not i.startswith('_'): + this[2][i] = (func.__doc__ is not None, False) + classes[obj.__name__] = this + if inspect.isfunction(obj) and obj.__module__ == name and not obj.__name__.startswith('_'): + this = (obj.__doc__ is not None, tested and obj.__name__ in dir(tm)) # docs + funcs[obj.__name__] = this + return funcs, classes + + +def check_files(modules, out=None): + for name in sorted(modules.keys()): + print(name, file=out) + funcs, classes = check_file(modules[name], name) + if out is None: + for f in funcs: + print("%-30s \t %s \t %s" % (f, gr("文档", funcs[f][0]), gr("测试", funcs[f][1]))) + for c in classes: + print("%-30s \t %s \t %s" % (c, gr("文档", classes[c][0]), gr("测试", classes[c][1]))) + methods = classes[c][2] + for f in methods: + print(" %-28s \t %s" % (f, gr("文档", methods[f][0]))) + else: + for f in funcs: + if not funcs[f][0]: + print("缺少文档 %s" % (f), file=out) + if not funcs[f][1]: + print("缺少测试 %s" % (f), file=out) + for c in classes: + if not classes[c][0]: + print("缺少文档 %s" % (c), file=out) + if not classes[c][1]: + print("缺少测试 %s" % (c), file=out) + methods = classes[c][2] + for f in methods: + if not methods[f][0]: + print("缺少文档 %s" % (c + "." + f), file=out) + print(file=out) + + +def main(): + sys.path.append("..") + print(_colored_string('Getting modules...', "Blue")) + modules, to_doc, children = find_all_modules() + print(_colored_string('Done!', "Green")) + print(_colored_string('Creating rst files...', "Blue")) + for name in to_doc: + create_rst_file(modules, name, children) + print(_colored_string('Done!', "Green")) + print(_colored_string('Checking all files...', "Blue")) + check_files(modules, out=open("results.txt", "w")) + print(_colored_string('Done!', "Green")) + + +if __name__ == "__main__": + main() diff --git a/docs/source/conf.py b/docs/source/conf.py index 2e10bc89..4ef815f5 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -24,9 +24,9 @@ copyright = '2018, xpqiu' author = 'xpqiu' # The short X.Y version -version = '0.4.5' +version = '0.5.0' # The full version, including alpha/beta/rc tags -release = '0.4.5' +release = '0.5.0' # -- General configuration --------------------------------------------------- @@ -48,12 +48,14 @@ extensions = [ autodoc_default_options = { 'member-order': 'bysource', 'special-members': '__init__', - 'undoc-members': True, + 'undoc-members': False, } +autoclass_content = "class" + # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] - +# template_bridge # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # @@ -113,7 +115,7 @@ html_static_path = ['_static'] # -- Options for HTMLHelp output --------------------------------------------- # Output file base name for HTML help builder. 
-htmlhelp_basename = 'fastNLPdoc' +htmlhelp_basename = 'fastNLP doc' # -- Options for LaTeX output ------------------------------------------------ @@ -166,10 +168,12 @@ texinfo_documents = [ # -- Extension configuration ------------------------------------------------- def maybe_skip_member(app, what, name, obj, skip, options): - if name.startswith("_"): - return True if obj.__doc__ is None: return True + if name == "__init__": + return False + if name.startswith("_"): + return True return False diff --git a/docs/source/fastNLP.core.batch.rst b/docs/source/fastNLP.core.batch.rst index 03008b52..50ad6fed 100644 --- a/docs/source/fastNLP.core.batch.rst +++ b/docs/source/fastNLP.core.batch.rst @@ -2,6 +2,6 @@ fastNLP.core.batch ================== .. automodule:: fastNLP.core.batch - :members: - :undoc-members: - :show-inheritance: + :members: BatchIter, DataSetIter, TorchLoaderIter + :inherited-members: + diff --git a/docs/source/fastNLP.core.callback.rst b/docs/source/fastNLP.core.callback.rst index 74a7825d..5a508e03 100644 --- a/docs/source/fastNLP.core.callback.rst +++ b/docs/source/fastNLP.core.callback.rst @@ -2,6 +2,6 @@ fastNLP.core.callback ===================== .. automodule:: fastNLP.core.callback - :members: - :undoc-members: - :show-inheritance: + :members: Callback, GradientClipCallback, EarlyStopCallback, FitlogCallback, EvaluateCallback, LRScheduler, ControlC, LRFinder, TensorboardCallback, WarmupCallback, SaveModelCallback, CallbackException, EarlyStopError + :inherited-members: + diff --git a/docs/source/fastNLP.core.const.rst b/docs/source/fastNLP.core.const.rst index 330a8883..82a1992e 100644 --- a/docs/source/fastNLP.core.const.rst +++ b/docs/source/fastNLP.core.const.rst @@ -2,6 +2,6 @@ fastNLP.core.const ================== .. automodule:: fastNLP.core.const - :members: - :undoc-members: - :show-inheritance: + :members: Const + :inherited-members: + diff --git a/docs/source/fastNLP.core.dataset.rst b/docs/source/fastNLP.core.dataset.rst index 1ad94bb6..e13d7f1c 100644 --- a/docs/source/fastNLP.core.dataset.rst +++ b/docs/source/fastNLP.core.dataset.rst @@ -2,6 +2,6 @@ fastNLP.core.dataset ==================== .. automodule:: fastNLP.core.dataset - :members: - :undoc-members: - :show-inheritance: + :members: DataSet + :inherited-members: + diff --git a/docs/source/fastNLP.core.field.rst b/docs/source/fastNLP.core.field.rst index 7fc099c9..73dad8af 100644 --- a/docs/source/fastNLP.core.field.rst +++ b/docs/source/fastNLP.core.field.rst @@ -2,6 +2,6 @@ fastNLP.core.field ================== .. automodule:: fastNLP.core.field - :members: - :undoc-members: - :show-inheritance: + :members: Padder, AutoPadder, EngChar2DPadder + :inherited-members: + diff --git a/docs/source/fastNLP.core.instance.rst b/docs/source/fastNLP.core.instance.rst index 6e496ac1..010567b9 100644 --- a/docs/source/fastNLP.core.instance.rst +++ b/docs/source/fastNLP.core.instance.rst @@ -2,6 +2,6 @@ fastNLP.core.instance ===================== .. automodule:: fastNLP.core.instance - :members: - :undoc-members: - :show-inheritance: + :members: Instance + :inherited-members: + diff --git a/docs/source/fastNLP.core.losses.rst b/docs/source/fastNLP.core.losses.rst index 8e63dfa1..daf246f8 100644 --- a/docs/source/fastNLP.core.losses.rst +++ b/docs/source/fastNLP.core.losses.rst @@ -2,6 +2,6 @@ fastNLP.core.losses =================== .. 
automodule:: fastNLP.core.losses - :members: - :undoc-members: - :show-inheritance: + :members: LossBase, LossFunc, LossInForward, CrossEntropyLoss, BCELoss, L1Loss, NLLLoss + :inherited-members: + diff --git a/docs/source/fastNLP.core.metrics.rst b/docs/source/fastNLP.core.metrics.rst index d3b87bb8..96748a78 100644 --- a/docs/source/fastNLP.core.metrics.rst +++ b/docs/source/fastNLP.core.metrics.rst @@ -2,6 +2,6 @@ fastNLP.core.metrics ==================== .. automodule:: fastNLP.core.metrics - :members: - :undoc-members: - :show-inheritance: + :members: MetricBase, AccuracyMetric, SpanFPreRecMetric, ExtractiveQAMetric + :inherited-members: + diff --git a/docs/source/fastNLP.core.optimizer.rst b/docs/source/fastNLP.core.optimizer.rst index c80be53f..44e45c4f 100644 --- a/docs/source/fastNLP.core.optimizer.rst +++ b/docs/source/fastNLP.core.optimizer.rst @@ -2,6 +2,6 @@ fastNLP.core.optimizer ====================== .. automodule:: fastNLP.core.optimizer - :members: - :undoc-members: - :show-inheritance: + :members: Optimizer, SGD, Adam, AdamW + :inherited-members: + diff --git a/docs/source/fastNLP.core.rst b/docs/source/fastNLP.core.rst index cacc6622..15fe29d5 100644 --- a/docs/source/fastNLP.core.rst +++ b/docs/source/fastNLP.core.rst @@ -2,12 +2,9 @@ fastNLP.core ============ .. automodule:: fastNLP.core - :members: - :undoc-members: - :show-inheritance: 子模块 ----------- +------ .. toctree:: :maxdepth: 1 diff --git a/docs/source/fastNLP.core.sampler.rst b/docs/source/fastNLP.core.sampler.rst index 0110f0c0..56291894 100644 --- a/docs/source/fastNLP.core.sampler.rst +++ b/docs/source/fastNLP.core.sampler.rst @@ -2,6 +2,6 @@ fastNLP.core.sampler ==================== .. automodule:: fastNLP.core.sampler - :members: - :undoc-members: - :show-inheritance: + :members: Sampler, BucketSampler, SequentialSampler, RandomSampler + :inherited-members: + diff --git a/docs/source/fastNLP.core.tester.rst b/docs/source/fastNLP.core.tester.rst index 4d71a27b..90ec2a88 100644 --- a/docs/source/fastNLP.core.tester.rst +++ b/docs/source/fastNLP.core.tester.rst @@ -2,6 +2,6 @@ fastNLP.core.tester =================== .. automodule:: fastNLP.core.tester - :members: - :undoc-members: - :show-inheritance: + :members: Tester + :inherited-members: + diff --git a/docs/source/fastNLP.core.trainer.rst b/docs/source/fastNLP.core.trainer.rst index 60bf2d5b..92c08718 100644 --- a/docs/source/fastNLP.core.trainer.rst +++ b/docs/source/fastNLP.core.trainer.rst @@ -2,6 +2,6 @@ fastNLP.core.trainer ==================== .. automodule:: fastNLP.core.trainer - :members: - :undoc-members: - :show-inheritance: + :members: Trainer + :inherited-members: + diff --git a/docs/source/fastNLP.core.utils.rst b/docs/source/fastNLP.core.utils.rst index 3f80b4e8..027a43e9 100644 --- a/docs/source/fastNLP.core.utils.rst +++ b/docs/source/fastNLP.core.utils.rst @@ -2,6 +2,6 @@ fastNLP.core.utils ================== .. automodule:: fastNLP.core.utils - :members: - :undoc-members: - :show-inheritance: + :members: cache_results, seq_len_to_mask, get_seq_len + :inherited-members: + diff --git a/docs/source/fastNLP.core.vocabulary.rst b/docs/source/fastNLP.core.vocabulary.rst index ba9598b9..ac07a8c6 100644 --- a/docs/source/fastNLP.core.vocabulary.rst +++ b/docs/source/fastNLP.core.vocabulary.rst @@ -2,6 +2,6 @@ fastNLP.core.vocabulary ======================= .. 
automodule:: fastNLP.core.vocabulary - :members: - :undoc-members: - :show-inheritance: + :members: Vocabulary, VocabularyOption + :inherited-members: + diff --git a/docs/source/fastNLP.embeddings.bert_embedding.rst b/docs/source/fastNLP.embeddings.bert_embedding.rst index 24ceff1c..1b59dc35 100644 --- a/docs/source/fastNLP.embeddings.bert_embedding.rst +++ b/docs/source/fastNLP.embeddings.bert_embedding.rst @@ -1,7 +1,6 @@ -fastNLP.embeddings.bert\_embedding -================================== +fastNLP.embeddings.bert_embedding +================================= .. automodule:: fastNLP.embeddings.bert_embedding - :members: - :undoc-members: - :show-inheritance: + :members: BertEmbedding, BertWordPieceEncoder + diff --git a/docs/source/fastNLP.embeddings.char_embedding.rst b/docs/source/fastNLP.embeddings.char_embedding.rst index 501089d8..bc8d64f9 100644 --- a/docs/source/fastNLP.embeddings.char_embedding.rst +++ b/docs/source/fastNLP.embeddings.char_embedding.rst @@ -1,7 +1,6 @@ -fastNLP.embeddings.char\_embedding -================================== +fastNLP.embeddings.char_embedding +================================= .. automodule:: fastNLP.embeddings.char_embedding - :members: - :undoc-members: - :show-inheritance: + :members: CNNCharEmbedding, LSTMCharEmbedding + diff --git a/docs/source/fastNLP.embeddings.contextual_embedding.rst b/docs/source/fastNLP.embeddings.contextual_embedding.rst new file mode 100644 index 00000000..74e5f5be --- /dev/null +++ b/docs/source/fastNLP.embeddings.contextual_embedding.rst @@ -0,0 +1,6 @@ +fastNLP.embeddings.contextual_embedding +======================================= + +.. automodule:: fastNLP.embeddings.contextual_embedding + :members: ContextualEmbedding + diff --git a/docs/source/fastNLP.embeddings.elmo_embedding.rst b/docs/source/fastNLP.embeddings.elmo_embedding.rst index 76669ee3..b8c6d41c 100644 --- a/docs/source/fastNLP.embeddings.elmo_embedding.rst +++ b/docs/source/fastNLP.embeddings.elmo_embedding.rst @@ -1,7 +1,6 @@ -fastNLP.embeddings.elmo\_embedding -================================== +fastNLP.embeddings.elmo_embedding +================================= .. automodule:: fastNLP.embeddings.elmo_embedding - :members: - :undoc-members: - :show-inheritance: + :members: ElmoEmbedding + diff --git a/docs/source/fastNLP.embeddings.embedding.rst b/docs/source/fastNLP.embeddings.embedding.rst index 5960d2cd..6793446b 100644 --- a/docs/source/fastNLP.embeddings.embedding.rst +++ b/docs/source/fastNLP.embeddings.embedding.rst @@ -2,6 +2,5 @@ fastNLP.embeddings.embedding ============================ .. automodule:: fastNLP.embeddings.embedding - :members: - :undoc-members: - :show-inheritance: + :members: Embedding, TokenEmbedding + diff --git a/docs/source/fastNLP.embeddings.rst b/docs/source/fastNLP.embeddings.rst index 6b168906..f4f4a3e0 100644 --- a/docs/source/fastNLP.embeddings.rst +++ b/docs/source/fastNLP.embeddings.rst @@ -2,18 +2,17 @@ fastNLP.embeddings ================== .. automodule:: fastNLP.embeddings - :members: - :undoc-members: - :show-inheritance: + :members: Embedding, TokenEmbedding, StaticEmbedding, ElmoEmbedding, BertEmbedding, BertWordPieceEncoder, StackEmbedding, LSTMCharEmbedding, CNNCharEmbedding, get_embeddings 子模块 ----------- +------ .. 
toctree:: :maxdepth: 1 fastNLP.embeddings.bert_embedding fastNLP.embeddings.char_embedding + fastNLP.embeddings.contextual_embedding fastNLP.embeddings.elmo_embedding fastNLP.embeddings.embedding fastNLP.embeddings.stack_embedding diff --git a/docs/source/fastNLP.embeddings.stack_embedding.rst b/docs/source/fastNLP.embeddings.stack_embedding.rst index 4d2115f7..a07d1ef5 100644 --- a/docs/source/fastNLP.embeddings.stack_embedding.rst +++ b/docs/source/fastNLP.embeddings.stack_embedding.rst @@ -1,7 +1,6 @@ -fastNLP.embeddings.stack\_embedding -=================================== +fastNLP.embeddings.stack_embedding +================================== .. automodule:: fastNLP.embeddings.stack_embedding - :members: - :undoc-members: - :show-inheritance: + :members: StackEmbedding + diff --git a/docs/source/fastNLP.embeddings.static_embedding.rst b/docs/source/fastNLP.embeddings.static_embedding.rst index e46de81a..219ce0e5 100644 --- a/docs/source/fastNLP.embeddings.static_embedding.rst +++ b/docs/source/fastNLP.embeddings.static_embedding.rst @@ -1,7 +1,6 @@ -fastNLP.embeddings.static\_embedding -==================================== +fastNLP.embeddings.static_embedding +=================================== .. automodule:: fastNLP.embeddings.static_embedding - :members: - :undoc-members: - :show-inheritance: + :members: StaticEmbedding + diff --git a/docs/source/fastNLP.embeddings.utils.rst b/docs/source/fastNLP.embeddings.utils.rst index 263bfbd6..077487c1 100644 --- a/docs/source/fastNLP.embeddings.utils.rst +++ b/docs/source/fastNLP.embeddings.utils.rst @@ -2,6 +2,5 @@ fastNLP.embeddings.utils ======================== .. automodule:: fastNLP.embeddings.utils - :members: - :undoc-members: - :show-inheritance: + :members: get_embeddings + diff --git a/docs/source/fastNLP.io.base_loader.rst b/docs/source/fastNLP.io.base_loader.rst deleted file mode 100644 index 057867f4..00000000 --- a/docs/source/fastNLP.io.base_loader.rst +++ /dev/null @@ -1,7 +0,0 @@ -fastNLP.io.base\_loader -======================= - -.. automodule:: fastNLP.io.base_loader - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/fastNLP.io.data_bundle.rst b/docs/source/fastNLP.io.data_bundle.rst new file mode 100644 index 00000000..71a921f1 --- /dev/null +++ b/docs/source/fastNLP.io.data_bundle.rst @@ -0,0 +1,7 @@ +fastNLP.io.data_bundle +====================== + +.. automodule:: fastNLP.io.data_bundle + :members: DataBundle + :inherited-members: + diff --git a/docs/source/fastNLP.io.data_loader.rst b/docs/source/fastNLP.io.data_loader.rst deleted file mode 100644 index 8f990102..00000000 --- a/docs/source/fastNLP.io.data_loader.rst +++ /dev/null @@ -1,7 +0,0 @@ -fastNLP.io.data\_loader -========================== - -.. automodule:: fastNLP.io.data_loader - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file diff --git a/docs/source/fastNLP.io.dataset_loader.rst b/docs/source/fastNLP.io.dataset_loader.rst deleted file mode 100644 index e7990714..00000000 --- a/docs/source/fastNLP.io.dataset_loader.rst +++ /dev/null @@ -1,7 +0,0 @@ -fastNLP.io.dataset\_loader -========================== - -.. 
automodule:: fastNLP.io.dataset_loader - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/fastNLP.io.embed_loader.rst b/docs/source/fastNLP.io.embed_loader.rst index 69e1f7ff..581f5c1b 100644 --- a/docs/source/fastNLP.io.embed_loader.rst +++ b/docs/source/fastNLP.io.embed_loader.rst @@ -1,7 +1,7 @@ -fastNLP.io.embed\_loader -======================== +fastNLP.io.embed_loader +======================= .. automodule:: fastNLP.io.embed_loader - :members: - :undoc-members: - :show-inheritance: + :members: EmbedLoader, EmbeddingOption + :inherited-members: + diff --git a/docs/source/fastNLP.io.file_utils.rst b/docs/source/fastNLP.io.file_utils.rst new file mode 100644 index 00000000..0815e068 --- /dev/null +++ b/docs/source/fastNLP.io.file_utils.rst @@ -0,0 +1,7 @@ +fastNLP.io.file_utils +===================== + +.. automodule:: fastNLP.io.file_utils + :members: cached_path, get_filepath, get_cache_path, split_filename_suffix, get_from_cache + :inherited-members: + diff --git a/docs/source/fastNLP.io.loader.rst b/docs/source/fastNLP.io.loader.rst new file mode 100644 index 00000000..c6d0dc55 --- /dev/null +++ b/docs/source/fastNLP.io.loader.rst @@ -0,0 +1,7 @@ +fastNLP.io.loader +================= + +.. automodule:: fastNLP.io.loader + :members: Loader, YelpLoader, YelpFullLoader, YelpPolarityLoader, IMDBLoader, SSTLoader, SST2Loader, ChnSentiCorpLoader, THUCNewsLoader, WeiboSenti100kLoader, ConllLoader, Conll2003Loader, Conll2003NERLoader, OntoNotesNERLoader, CTBLoader, MsraNERLoader, PeopleDailyNERLoader, WeiboNERLoader, CSVLoader, JsonLoader, CWSLoader, MNLILoader, QuoraLoader, SNLILoader, QNLILoader, RTELoader, CNXNLILoader, BQCorpusLoader, LCQMCLoader, CoReferenceLoader + :inherited-members: + diff --git a/docs/source/fastNLP.io.model_io.rst b/docs/source/fastNLP.io.model_io.rst index 537ce752..183122b1 100644 --- a/docs/source/fastNLP.io.model_io.rst +++ b/docs/source/fastNLP.io.model_io.rst @@ -1,7 +1,7 @@ -fastNLP.io.model\_io -==================== +fastNLP.io.model_io +=================== .. automodule:: fastNLP.io.model_io - :members: - :undoc-members: - :show-inheritance: + :members: ModelLoader, ModelSaver + :inherited-members: + diff --git a/docs/source/fastNLP.io.pipe.rst b/docs/source/fastNLP.io.pipe.rst new file mode 100644 index 00000000..178d35a9 --- /dev/null +++ b/docs/source/fastNLP.io.pipe.rst @@ -0,0 +1,7 @@ +fastNLP.io.pipe +=============== + +.. automodule:: fastNLP.io.pipe + :members: Pipe, CWSPipe, YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, ChnSentiCorpPipe, THUCNewsPipe, WeiboSenti100kPipe, Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, WeiboNERPipe, PeopleDailyPipe, Conll2003Pipe, MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, CNXNLIBertPipe, BQCorpusBertPipe, LCQMCBertPipe, MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe, LCQMCPipe, CNXNLIPipe, BQCorpusPipe, RenamePipe, GranularizePipe, MachingTruncatePipe, CoReferencePipe + :inherited-members: + diff --git a/docs/source/fastNLP.io.rst b/docs/source/fastNLP.io.rst index a97ed67d..54373df4 100644 --- a/docs/source/fastNLP.io.rst +++ b/docs/source/fastNLP.io.rst @@ -2,18 +2,19 @@ fastNLP.io ========== .. 
automodule:: fastNLP.io - :members: - :undoc-members: - :show-inheritance: + :members: DataBundle, EmbedLoader, Loader, YelpLoader, YelpFullLoader, YelpPolarityLoader, IMDBLoader, SSTLoader, SST2Loader, ChnSentiCorpLoader, THUCNewsLoader, WeiboSenti100kLoader, ConllLoader, Conll2003Loader, Conll2003NERLoader, OntoNotesNERLoader, CTBLoader, MsraNERLoader, WeiboNERLoader, PeopleDailyNERLoader, CSVLoader, JsonLoader, CWSLoader, MNLILoader, QuoraLoader, SNLILoader, QNLILoader, RTELoader, CNXNLILoader, BQCorpusLoader, LCQMCLoader, Pipe, YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, ChnSentiCorpPipe, THUCNewsPipe, WeiboSenti100kPipe, Conll2003Pipe, Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, PeopleDailyPipe, WeiboNERPipe, CWSPipe, MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe, ModelLoader, ModelSaver + :inherited-members: 子模块 ----------- +------ .. toctree:: :maxdepth: 1 - fastNLP.io.base_loader + fastNLP.io.data_bundle fastNLP.io.embed_loader - fastNLP.io.dataset_loader - fastNLP.io.data_loader + fastNLP.io.file_utils + fastNLP.io.loader fastNLP.io.model_io + fastNLP.io.pipe + fastNLP.io.utils diff --git a/docs/source/fastNLP.io.utils.rst b/docs/source/fastNLP.io.utils.rst new file mode 100644 index 00000000..3bff3c45 --- /dev/null +++ b/docs/source/fastNLP.io.utils.rst @@ -0,0 +1,7 @@ +fastNLP.io.utils +================ + +.. automodule:: fastNLP.io.utils + :members: check_loader_paths + :inherited-members: + diff --git a/docs/source/fastNLP.models.bert.rst b/docs/source/fastNLP.models.bert.rst new file mode 100644 index 00000000..b0c813f9 --- /dev/null +++ b/docs/source/fastNLP.models.bert.rst @@ -0,0 +1,6 @@ +fastNLP.models.bert +=================== + +.. automodule:: fastNLP.models.bert + :members: BertForSequenceClassification, BertForSentenceMatching, BertForMultipleChoice, BertForTokenClassification, BertForQuestionAnswering + diff --git a/docs/source/fastNLP.models.biaffine_parser.rst b/docs/source/fastNLP.models.biaffine_parser.rst index f19504e8..395638fe 100644 --- a/docs/source/fastNLP.models.biaffine_parser.rst +++ b/docs/source/fastNLP.models.biaffine_parser.rst @@ -1,7 +1,6 @@ -fastNLP.models.biaffine\_parser -=============================== +fastNLP.models.biaffine_parser +============================== .. automodule:: fastNLP.models.biaffine_parser - :members: - :undoc-members: - :show-inheritance: + :members: BiaffineParser, GraphParser + diff --git a/docs/source/fastNLP.models.cnn_text_classification.rst b/docs/source/fastNLP.models.cnn_text_classification.rst index eacf6916..e9ed7ee1 100644 --- a/docs/source/fastNLP.models.cnn_text_classification.rst +++ b/docs/source/fastNLP.models.cnn_text_classification.rst @@ -1,7 +1,6 @@ -fastNLP.models.cnn\_text\_classification -======================================== +fastNLP.models.cnn_text_classification +====================================== .. automodule:: fastNLP.models.cnn_text_classification - :members: - :undoc-members: - :show-inheritance: + :members: CNNText + diff --git a/docs/source/fastNLP.models.rst b/docs/source/fastNLP.models.rst index 2ea546e2..21cf41a7 100644 --- a/docs/source/fastNLP.models.rst +++ b/docs/source/fastNLP.models.rst @@ -2,16 +2,15 @@ fastNLP.models ============== .. 
automodule:: fastNLP.models - :members: - :undoc-members: - :show-inheritance: + :members: CNNText, SeqLabeling, AdvSeqLabel, ESIM, StarTransEnc, STSeqLabel, STNLICls, STSeqCls, BiaffineParser, GraphParser, BertForSequenceClassification, BertForSentenceMatching, BertForMultipleChoice, BertForTokenClassification, BertForQuestionAnswering 子模块 ----------- +------ .. toctree:: :maxdepth: 1 + fastNLP.models.bert fastNLP.models.biaffine_parser fastNLP.models.cnn_text_classification fastNLP.models.sequence_labeling diff --git a/docs/source/fastNLP.models.sequence_labeling.rst b/docs/source/fastNLP.models.sequence_labeling.rst index 85e28f06..dcd1300e 100644 --- a/docs/source/fastNLP.models.sequence_labeling.rst +++ b/docs/source/fastNLP.models.sequence_labeling.rst @@ -1,7 +1,6 @@ -fastNLP.models.sequence\_labeling -================================= +fastNLP.models.sequence_labeling +================================ .. automodule:: fastNLP.models.sequence_labeling - :members: - :undoc-members: - :show-inheritance: + :members: SeqLabeling, AdvSeqLabel, BiLSTMCRF + diff --git a/docs/source/fastNLP.models.snli.rst b/docs/source/fastNLP.models.snli.rst index 3b9b555c..eed02139 100644 --- a/docs/source/fastNLP.models.snli.rst +++ b/docs/source/fastNLP.models.snli.rst @@ -2,6 +2,5 @@ fastNLP.models.snli =================== .. automodule:: fastNLP.models.snli - :members: - :undoc-members: - :show-inheritance: + :members: ESIM + diff --git a/docs/source/fastNLP.models.star_transformer.rst b/docs/source/fastNLP.models.star_transformer.rst index 69d5c5b2..80ab5b33 100644 --- a/docs/source/fastNLP.models.star_transformer.rst +++ b/docs/source/fastNLP.models.star_transformer.rst @@ -1,7 +1,6 @@ -fastNLP.models.star\_transformer -================================ +fastNLP.models.star_transformer +=============================== .. automodule:: fastNLP.models.star_transformer - :members: - :undoc-members: - :show-inheritance: + :members: StarTransEnc, STNLICls, STSeqCls, STSeqLabel + diff --git a/docs/source/fastNLP.modules.decoder.rst b/docs/source/fastNLP.modules.decoder.rst index ecc2adbd..de6e0d9d 100644 --- a/docs/source/fastNLP.modules.decoder.rst +++ b/docs/source/fastNLP.modules.decoder.rst @@ -2,7 +2,5 @@ fastNLP.modules.decoder ======================= .. automodule:: fastNLP.modules.decoder - :members: - :undoc-members: - :show-inheritance: + :members: MLP, ConditionalRandomField, viterbi_decode, allowed_transitions diff --git a/docs/source/fastNLP.modules.encoder.rst b/docs/source/fastNLP.modules.encoder.rst index 0562f12d..a402cb67 100644 --- a/docs/source/fastNLP.modules.encoder.rst +++ b/docs/source/fastNLP.modules.encoder.rst @@ -2,6 +2,5 @@ fastNLP.modules.encoder ======================= .. automodule:: fastNLP.modules.encoder - :members: - :undoc-members: - :show-inheritance: + :members: ConvolutionCharEncoder, LSTMCharEncoder, ConvMaxpool, LSTM, StarTransformer, TransformerEncoder, VarRNN, VarLSTM, VarGRU, MaxPool, MaxPoolWithMask, KMaxPool, AvgPool, AvgPoolWithMask, MultiHeadAttention, BiAttention, SelfAttention + diff --git a/docs/source/fastNLP.modules.rst b/docs/source/fastNLP.modules.rst index 646ef2d3..9c44e461 100644 --- a/docs/source/fastNLP.modules.rst +++ b/docs/source/fastNLP.modules.rst @@ -2,16 +2,14 @@ fastNLP.modules =============== .. 
automodule:: fastNLP.modules - :members: - :undoc-members: - :show-inheritance: + :members: ConvolutionCharEncoder, LSTMCharEncoder, ConvMaxpool, LSTM, StarTransformer, TransformerEncoder, VarRNN, VarLSTM, VarGRU, MaxPool, MaxPoolWithMask, KMaxPool, AvgPool, AvgPoolWithMask, MultiHeadAttention, MLP, ConditionalRandomField, viterbi_decode, allowed_transitions, TimestepDropout 子模块 ------------ +------ .. toctree:: - :titlesonly: :maxdepth: 1 fastNLP.modules.decoder - fastNLP.modules.encoder \ No newline at end of file + fastNLP.modules.encoder + fastNLP.modules.utils diff --git a/docs/source/fastNLP.modules.utils.rst b/docs/source/fastNLP.modules.utils.rst new file mode 100644 index 00000000..101a0f45 --- /dev/null +++ b/docs/source/fastNLP.modules.utils.rst @@ -0,0 +1,6 @@ +fastNLP.modules.utils +===================== + +.. automodule:: fastNLP.modules.utils + :members: initial_parameter, summary + diff --git a/docs/source/fastNLP.rst b/docs/source/fastNLP.rst index 0057a184..097ad0b2 100644 --- a/docs/source/fastNLP.rst +++ b/docs/source/fastNLP.rst @@ -1,13 +1,12 @@ -API 文档 -=============== +fastNLP +======= .. automodule:: fastNLP - :members: - :undoc-members: - :show-inheritance: + :members: Instance, FieldArray, DataSetIter, BatchIter, TorchLoaderIter, Vocabulary, DataSet, Const, Trainer, Tester, Callback, GradientClipCallback, EarlyStopCallback, FitlogCallback, EvaluateCallback, LRScheduler, ControlC, LRFinder, TensorboardCallback, WarmupCallback, SaveModelCallback, CallbackException, EarlyStopError, Padder, AutoPadder, EngChar2DPadder, AccuracyMetric, SpanFPreRecMetric, ExtractiveQAMetric, Optimizer, SGD, Adam, AdamW, Sampler, SequentialSampler, BucketSampler, RandomSampler, LossFunc, CrossEntropyLoss, L1Loss, BCELoss, NLLLoss, LossInForward, cache_results, logger + :inherited-members: -内部模块 ------------ +子模块 +------ .. toctree:: :maxdepth: 1 diff --git a/docs/source/figures/workflow.png b/docs/source/figures/workflow.png index d8e4e455..6a0ebd04 100644 Binary files a/docs/source/figures/workflow.png and b/docs/source/figures/workflow.png differ diff --git a/docs/source/modules.rst b/docs/source/modules.rst index 9ca3c7f3..e9a92cb7 100644 --- a/docs/source/modules.rst +++ b/docs/source/modules.rst @@ -2,7 +2,6 @@ fastNLP ======= .. toctree:: - :titlesonly: :maxdepth: 4 fastNLP diff --git a/docs/source/quickstart/cn_cls_example.png b/docs/source/quickstart/cn_cls_example.png new file mode 100644 index 00000000..5055bb02 Binary files /dev/null and b/docs/source/quickstart/cn_cls_example.png differ diff --git a/docs/source/quickstart/文本分类.rst b/docs/source/quickstart/文本分类.rst new file mode 100644 index 00000000..65ef39c9 --- /dev/null +++ b/docs/source/quickstart/文本分类.rst @@ -0,0 +1,368 @@ +文本分类(Text classification) +============================= + +文本分类任务是将一句话或一段话划分到某个具体的类别。比如垃圾邮件识别,文本情绪分类等。 + +.. code-block:: text + + 1, 商务大床房,房间很大,床有2M宽,整体感觉经济实惠不错! + +其中开头的1是只这条评论的标签,表示是正面的情绪。我们将使用到的数据可以通过 `此链接 `_ +下载并解压,当然也可以通过fastNLP自动下载该数据。 + +数据中的内容如下图所示。接下来,我们将用fastNLP在这个数据上训练一个分类网络。 + +.. figure:: ./cn_cls_example.png + :alt: jupyter + + jupyter + +步骤 +---- + +一共有以下的几个步骤: + +1. `读取数据 <#id4>`_ + +2. `预处理数据 <#id5>`_ + +3. `选择预训练词向量 <#id6>`_ + +4. `创建模型 <#id7>`_ + +5. `训练模型 <#id8>`_ + +(1) 读取数据 +~~~~~~~~~~~~~~~~~~~~ + +fastNLP提供多种数据的自动下载与自动加载功能,对于这里我们要用到的数据,我们可以用 :class:`~fastNLP.io.Loader` 自动下载并加载该数据。 +更多有关Loader的使用可以参考 :mod:`~fastNLP.io.loader` + +.. 
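code-block:: python
+
+    # 如果已经通过上面的链接手动下载并解压了数据,也可以让Loader直接从本地目录读取
+    # (示意代码,'path/to/chn_senti_corp' 为假设的解压路径)
+    from fastNLP.io import ChnSentiCorpLoader
+
+    loader = ChnSentiCorpLoader()
+    data_bundle = loader.load('path/to/chn_senti_corp')  # 从该目录读取train/dev/test,返回DataBundle
+
+这里我们采用自动下载的方式:
+
+.. 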
code-block:: python + + from fastNLP.io import ChnSentiCorpLoader + + loader = ChnSentiCorpLoader() # 初始化一个中文情感分类的loader + data_dir = loader.download() # 这一行代码将自动下载数据到默认的缓存地址, 并将该地址返回 + data_bundle = loader.load(data_dir) # 这一行代码将从{data_dir}处读取数据至DataBundle + + +DataBundle的相关介绍,可以参考 :class:`~fastNLP.io.DataBundle` 。我们可以打印该data\_bundle的基本信息。 + +.. code-block:: python + + print(data_bundle) + + +.. code-block:: text + + In total 3 datasets: + dev has 1200 instances. + train has 9600 instances. + test has 1200 instances. + In total 0 vocabs: + + + +可以看出,该data\_bundle中一个含有三个 :class:`~fastNLP.DataSet` 。通过下面的代码,我们可以查看DataSet的基本情况 + +.. code-block:: python + + print(data_bundle.get_dataset('train')[:2]) # 查看Train集前两个sample + + +.. code-block:: text + + +-----------------------------+--------+ + | raw_chars | target | + +-----------------------------+--------+ + | 选择珠江花园的原因就是方... | 1 | + | 15.4寸笔记本的键盘确实爽... | 1 | + +-----------------------------+--------+ + +(2) 预处理数据 +~~~~~~~~~~~~~~~~~~~~ + +在NLP任务中,预处理一般包括: + +(a) 将一整句话切分成汉字或者词; + +(b) 将文本转换为index + +fastNLP中也提供了多种数据集的处理类,这里我们直接使用fastNLP的ChnSentiCorpPipe。更多关于Pipe的说明可以参考 :mod:`~fastNLP.io.pipe` 。 + +.. code-block:: python + + from fastNLP.io import ChnSentiCorpPipe + + pipe = ChnSentiCorpPipe() + data_bundle = pipe.process(data_bundle) # 所有的Pipe都实现了process()方法,且输入输出都为DataBundle类型 + + print(data_bundle) # 打印data_bundle,查看其变化 + + +.. code-block:: text + + In total 3 datasets: + dev has 1200 instances. + train has 9600 instances. + test has 1200 instances. + In total 2 vocabs: + chars has 4409 entries. + target has 2 entries. + + + +可以看到除了之前已经包含的3个 :class:`~fastNLP.DataSet` ,还新增了两个 :class:`~fastNLP.Vocabulary` 。我们可以打印DataSet中的内容 + +.. code-block:: python + + print(data_bundle.get_dataset('train')[:2]) + + +.. code-block:: text + + +-----------------+--------+-----------------+---------+ + | raw_chars | target | chars | seq_len | + +-----------------+--------+-----------------+---------+ + | 选择珠江花园... | 0 | [338, 464, 1... | 106 | + | 15.4寸笔记本... | 0 | [50, 133, 20... | 56 | + +-----------------+--------+-----------------+---------+ + + +新增了一列为数字列表的chars,以及变为数字的target列。可以看出这两列的名称和刚好与data\_bundle中两个Vocabulary的名称是一致的,我们可以打印一下Vocabulary看一下里面的内容。 + +.. code-block:: python + + char_vocab = data_bundle.get_vocab('chars') + print(char_vocab) + + +.. code-block:: text + + Vocabulary(['选', '择', '珠', '江', '花']...) + + +Vocabulary是一个记录着词语与index之间映射关系的类,比如 + +.. code-block:: python + + index = char_vocab.to_index('选') + print("'选'的index是{}".format(index)) # 这个值与上面打印出来的第一个instance的chars的第一个index是一致的 + print("index:{}对应的汉字是{}".format(index, char_vocab.to_word(index))) + + +.. code-block:: text + + '选'的index是338 + index:338对应的汉字是选 + + +(3) 选择预训练词向量 +~~~~~~~~~~~~~~~~~~~~ + +由于Word2vec, Glove, Elmo, +Bert等预训练模型可以增强模型的性能,所以在训练具体任务前,选择合适的预训练词向量非常重要。 +在fastNLP中我们提供了多种Embedding使得加载这些预训练模型的过程变得更加便捷。 +这里我们先给出一个使用word2vec的中文汉字预训练的示例,之后再给出一个使用Bert的文本分类。 +这里使用的预训练词向量为'cn-fastnlp-100d',fastNLP将自动下载该embedding至本地缓存, +fastNLP支持使用名字指定的Embedding以及相关说明可以参见 :mod:`fastNLP.embeddings` + +.. code-block:: python + + from fastNLP.embeddings import StaticEmbedding + + word2vec_embed = StaticEmbedding(char_vocab, model_dir_or_name='cn-char-fastnlp-100d') + + +.. code-block:: text + + Found 4321 out of 4409 compound in the pre-training embedding. + +(4) 创建模型 +~~~~~~~~~~~~ + +.. 
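code-block:: python
+
+    # 在搭建模型之前,可以先确认embedding的输出形状(示意代码,句子内容为任意示例)
+    import torch
+
+    chars = torch.LongTensor([[char_vocab.to_index(c) for c in "商务大床房"]])  # shape: [1, 5]
+    print(word2vec_embed(chars).size())  # 预期为 torch.Size([1, 5, 100]),100为该预训练向量的维度
+
+确认embedding能够把index序列映射为对应的向量之后,下面定义一个BiLSTM加max pooling的分类模型:
+
+.. 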
code-block:: python + + from torch import nn + from fastNLP.modules import LSTM + import torch + + # 定义模型 + class BiLSTMMaxPoolCls(nn.Module): + def __init__(self, embed, num_classes, hidden_size=400, num_layers=1, dropout=0.3): + super().__init__() + self.embed = embed + + self.lstm = LSTM(self.embed.embedding_dim, hidden_size=hidden_size//2, num_layers=num_layers, + batch_first=True, bidirectional=True) + self.dropout_layer = nn.Dropout(dropout) + self.fc = nn.Linear(hidden_size, num_classes) + + def forward(self, chars, seq_len): # 这里的名称必须和DataSet中相应的field对应,比如之前我们DataSet中有chars,这里就必须为chars + # chars:[batch_size, max_len] + # seq_len: [batch_size, ] + chars = self.embed(chars) + outputs, _ = self.lstm(chars, seq_len) + outputs = self.dropout_layer(outputs) + outputs, _ = torch.max(outputs, dim=1) + outputs = self.fc(outputs) + + return {'pred':outputs} # [batch_size,], 返回值必须是dict类型,且预测值的key建议设为pred + + # 初始化模型 + model = BiLSTMMaxPoolCls(word2vec_embed, len(data_bundle.get_vocab('target'))) + +(5) 训练模型 +~~~~~~~~~~~~ + +fastNLP提供了Trainer对象来组织训练过程,包括完成loss计算(所以在初始化Trainer的时候需要指定loss类型),梯度更新(所以在初始化Trainer的时候需要提供优化器optimizer)以及在验证集上的性能验证(所以在初始化时需要提供一个Metric) + +.. code-block:: python + + from fastNLP import Trainer + from fastNLP import CrossEntropyLoss + from torch.optim import Adam + from fastNLP import AccuracyMetric + + loss = CrossEntropyLoss() + optimizer = Adam(model.parameters(), lr=0.001) + metric = AccuracyMetric() + device = 0 if torch.cuda.is_available() else 'cpu' # 如果有gpu的话在gpu上运行,训练速度会更快 + + trainer = Trainer(train_data=data_bundle.get_dataset('train'), model=model, loss=loss, + optimizer=optimizer, batch_size=32, dev_data=data_bundle.get_dataset('dev'), + metrics=metric, device=device) + trainer.train() # 开始训练,训练完成之后默认会加载在dev上表现最好的模型 + + # 在测试集上测试一下模型的性能 + from fastNLP import Tester + print("Performance on test is:") + tester = Tester(data=data_bundle.get_dataset('test'), model=model, metrics=metric, batch_size=64, device=device) + tester.test() + + +.. code-block:: text + + input fields after batch(if batch size is 2): + target: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) + chars: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 106]) + seq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) + target fields after batch(if batch size is 2): + target: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) + seq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) + + Evaluate data in 0.01 seconds! + training epochs started 2019-09-03-23-57-10 + + Evaluate data in 0.43 seconds! + Evaluation on dev at Epoch 1/10. Step:300/3000: + AccuracyMetric: acc=0.81 + + Evaluate data in 0.44 seconds! + Evaluation on dev at Epoch 2/10. Step:600/3000: + AccuracyMetric: acc=0.8675 + + Evaluate data in 0.44 seconds! + Evaluation on dev at Epoch 3/10. Step:900/3000: + AccuracyMetric: acc=0.878333 + + .... + + Evaluate data in 0.48 seconds! + Evaluation on dev at Epoch 9/10. Step:2700/3000: + AccuracyMetric: acc=0.8875 + + Evaluate data in 0.43 seconds! + Evaluation on dev at Epoch 10/10. Step:3000/3000: + AccuracyMetric: acc=0.895833 + + In Epoch:7/Step:2100, got best dev performance: + AccuracyMetric: acc=0.8975 + Reloaded the best model. + + Evaluate data in 0.34 seconds! + [tester] + AccuracyMetric: acc=0.8975 + + {'AccuracyMetric': {'acc': 0.8975}} + + + +使用Bert进行文本分类 +~~~~~~~~~~~~~~~~~~~~ + +.. 
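code-block:: python
+
+    # 在切换到Bert之前,先演示如何用上一节训练好的模型对新句子进行预测
+    # (示意代码,沿用上文的model、char_vocab与data_bundle;句子为任意示例)
+    import torch
+
+    sent = "房间很干净,服务也不错!"
+    chars = torch.LongTensor([[char_vocab.to_index(c) for c in sent]])
+    seq_len = torch.LongTensor([chars.size(1)])
+
+    model.cpu().eval()
+    with torch.no_grad():
+        pred = model(chars, seq_len)['pred'].argmax(dim=-1).item()
+    print(data_bundle.get_vocab('target').to_word(pred))  # 输出预测的标签
+
+下面把模型中的Embedding切换为Bert:
+
+.. 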
code-block:: python + + # 只需要切换一下Embedding即可 + from fastNLP.embeddings import BertEmbedding + + # 这里为了演示一下效果,所以默认Bert不更新权重 + bert_embed = BertEmbedding(char_vocab, model_dir_or_name='cn', auto_truncate=True, requires_grad=False) + model = BiLSTMMaxPoolCls(bert_embed, len(data_bundle.get_vocab('target')), ) + + + import torch + from fastNLP import Trainer + from fastNLP import CrossEntropyLoss + from torch.optim import Adam + from fastNLP import AccuracyMetric + + loss = CrossEntropyLoss() + optimizer = Adam(model.parameters(), lr=2e-5) + metric = AccuracyMetric() + device = 0 if torch.cuda.is_available() else 'cpu' # 如果有gpu的话在gpu上运行,训练速度会更快 + + trainer = Trainer(train_data=data_bundle.get_dataset('train'), model=model, loss=loss, + optimizer=optimizer, batch_size=16, dev_data=data_bundle.get_dataset('test'), + metrics=metric, device=device, n_epochs=3) + trainer.train() # 开始训练,训练完成之后默认会加载在dev上表现最好的模型 + + # 在测试集上测试一下模型的性能 + from fastNLP import Tester + print("Performance on test is:") + tester = Tester(data=data_bundle.get_dataset('test'), model=model, metrics=metric, batch_size=64, device=device) + tester.test() + + +.. code-block:: text + + loading vocabulary file ~/.fastNLP/embedding/bert-chinese-wwm/vocab.txt + Load pre-trained BERT parameters from file ~/.fastNLP/embedding/bert-chinese-wwm/chinese_wwm_pytorch.bin. + Start to generating word pieces for word. + Found(Or segment into word pieces) 4286 words out of 4409. + input fields after batch(if batch size is 2): + target: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) + chars: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 106]) + seq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) + target fields after batch(if batch size is 2): + target: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) + seq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) + + Evaluate data in 0.05 seconds! + training epochs started 2019-09-04-00-02-37 + + Evaluate data in 15.89 seconds! + Evaluation on dev at Epoch 1/3. Step:1200/3600: + AccuracyMetric: acc=0.9 + + Evaluate data in 15.92 seconds! + Evaluation on dev at Epoch 2/3. Step:2400/3600: + AccuracyMetric: acc=0.904167 + + Evaluate data in 15.91 seconds! + Evaluation on dev at Epoch 3/3. Step:3600/3600: + AccuracyMetric: acc=0.918333 + + In Epoch:3/Step:3600, got best dev performance: + AccuracyMetric: acc=0.918333 + Reloaded the best model. + Performance on test is: + + Evaluate data in 29.24 seconds! + [tester] + AccuracyMetric: acc=0.919167 + + {'AccuracyMetric': {'acc': 0.919167}} + + diff --git a/docs/source/tutorials/extend_1_bert_embedding.rst b/docs/source/tutorials/extend_1_bert_embedding.rst new file mode 100644 index 00000000..1960b107 --- /dev/null +++ b/docs/source/tutorials/extend_1_bert_embedding.rst @@ -0,0 +1,220 @@ +============================== +BertEmbedding的各种用法 +============================== + +Bert自从在 `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding `_ +中被提出后,因其性能卓越受到了极大的关注,在这里我们展示一下在fastNLP中如何使用Bert进行各类任务。其中中文Bert我们使用的模型的权重来自于 +`中文Bert预训练 `_ 。 + +为了方便大家的使用,fastNLP提供了预训练的Embedding权重及数据集的自动下载,支持自动下载的Embedding和数据集见 +`数据集 `_ 。或您可从 :doc:`/tutorials/tutorial_3_embedding` 与 +:doc:`/tutorials/tutorial_4_load_dataset` 了解更多相关信息。 + +---------------------------------- +中文任务 +---------------------------------- +下面我们将介绍通过使用Bert来进行文本分类, 中文命名实体识别, 文本匹配, 中文问答。 + +1. 使用Bert进行文本分类 +---------------------------------- +文本分类是指给定一段文字,判定其所属的类别。例如下面的文本情感分类 + +.. 
code-block:: text + + 1, 商务大床房,房间很大,床有2M宽,整体感觉经济实惠不错! + +这里我们使用fastNLP提供自动下载的微博分类进行测试 + +.. code-block:: python + + from fastNLP.io import WeiboSenti100kPipe + + data_bundle =WeiboSenti100kPipe().process_from_file() + data_bundle.rename_field('chars', 'words') + + # 载入BertEmbedding + from fastNLP.embeddings import BertEmbedding + + embed = BertEmbedding(data_bundle.get_vocab('words'), model_dir_or_name='cn-wwm', include_cls_sep=True) + + # 载入模型 + from fastNLP.models import BertForSequenceClassification + + model = BertForSequenceClassification(embed, len(data_bundle.get_vocab('target'))) + + # 训练模型 + from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric, Adam + + trainer = Trainer(data_bundle.get_dataset('train'), model, + optimizer=Adam(model_params=model.parameters(), lr=2e-5), + loss=CrossEntropyLoss(), device=0, + batch_size=8, dev_data=data_bundle.get_dataset('dev'), + metrics=AccuracyMetric(), n_epochs=2, print_every=1) + trainer.train() + + # 测试结果 + from fastNLP import Tester + + tester = Tester(data_bundle.get_dataset('test'), model, batch_size=128, metrics=AccuracyMetric()) + tester.test() + +输出结果:: + + In Epoch:1/Step:12499, got best dev performance: + AccuracyMetric: acc=0.9838 + Reloaded the best model. + Evaluate data in 63.84 seconds! + [tester] + AccuracyMetric: acc=0.9815 + + +2. 使用Bert进行命名实体识别 +---------------------------------- +命名实体识别是给定一句话,标记出其中的实体。一般序列标注的任务都使用conll格式,conll格式是至一行中通过制表符分隔不同的内容,使用空行分隔 +两句话,例如下面的例子 + +.. code-block:: text + + 中 B-ORG + 共 I-ORG + 中 I-ORG + 央 I-ORG + 致 O + 中 B-ORG + 国 I-ORG + 致 I-ORG + 公 I-ORG + 党 I-ORG + 十 I-ORG + 一 I-ORG + 大 I-ORG + 的 O + 贺 O + 词 O + +这部分内容请参考 :doc:`快速实现序列标注模型 ` + + +3. 使用Bert进行文本匹配 +---------------------------------- +文本匹配任务是指给定两句话判断他们的关系。比如,给定两句话判断前一句是否和后一句具有因果关系或是否是矛盾关系;或者给定两句话判断两句话是否 +具有相同的意思。这里我们使用 + +.. code-block:: python + + data_bundle = CNXNLIBertPipe().process_from_file(paths) + data_bundle.rename_field('chars', 'words') + print(data_bundle) + + # 载入BertEmbedding + from fastNLP.embeddings import BertEmbedding + + embed = BertEmbedding(data_bundle.get_vocab('words'), model_dir_or_name='cn-wwm', include_cls_sep=True) + + # 载入模型 + from fastNLP.models import BertForSentenceMatching + + model = BertForSentenceMatching(embed, len(data_bundle.get_vocab('target'))) + + # 训练模型 + from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric, Adam + from fastNLP.core.optimizer import AdamW + from fastNLP.core.callback import WarmupCallback + + callbacks = [WarmupCallback(warmup=0.1, schedule='linear'), ] + + trainer = Trainer(data_bundle.get_dataset('train'), model, + optimizer=AdamW(params=model.parameters(), lr=4e-5), + loss=CrossEntropyLoss(), device=0, + batch_size=8, dev_data=data_bundle.get_dataset('dev'), + metrics=AccuracyMetric(), n_epochs=5, print_every=1, + update_every=8, callbacks=callbacks) + trainer.train() + + from fastNLP import Tester + tester = Tester(data_bundle.get_dataset('test'), model, batch_size=8, metrics=AccuracyMetric()) + tester.test() + +运行结果:: + + In Epoch:3/Step:73632, got best dev performance: + AccuracyMetric: acc=0.781928 + Reloaded the best model. + Evaluate data in 18.54 seconds! + [tester] + AccuracyMetric: acc=0.783633 + + +4. 
使用Bert进行中文问答 +---------------------------------- +问答任务是给定一段内容,以及一个问题,需要从这段内容中找到答案。 +例如:: + + "context": "锣鼓经是大陆传统器乐及戏曲里面常用的打击乐记谱方法,以中文字的声音模拟敲击乐的声音,纪录打击乐的各种不同的演奏方法。常 + 用的节奏型称为「锣鼓点」。而锣鼓是戏曲节奏的支柱,除了加强演员身段动作的节奏感,也作为音乐的引子和尾声,提示音乐的板式和速度,以及 + 作为唱腔和念白的伴奏,令诗句的韵律更加抑扬顿锉,段落分明。锣鼓的运用有约定俗成的程式,依照角色行当的身份、性格、情绪以及环境,配合 + 相应的锣鼓点。锣鼓亦可以模仿大自然的音响效果,如雷电、波浪等等。戏曲锣鼓所运用的敲击乐器主要分为鼓、锣、钹和板四类型:鼓类包括有单 + 皮鼓(板鼓)、大鼓、大堂鼓(唐鼓)、小堂鼓、怀鼓、花盆鼓等;锣类有大锣、小锣(手锣)、钲锣、筛锣、马锣、镗锣、云锣;钹类有铙钹、大 + 钹、小钹、水钹、齐钹、镲钹、铰子、碰钟等;打拍子用的檀板、木鱼、梆子等。因为京剧的锣鼓通常由四位乐师负责,又称为四大件,领奏的师 + 傅称为:「鼓佬」,其职责有如西方乐队的指挥,负责控制速度以及利用各种手势提示乐师演奏不同的锣鼓点。粤剧吸收了部份京剧的锣鼓,但以木鱼 + 和沙的代替了京剧的板和鼓,作为打拍子的主要乐器。以下是京剧、昆剧和粤剧锣鼓中乐器对应的口诀用字:", + "question": "锣鼓经是什么?", + "answers": [ + { + "text": "大陆传统器乐及戏曲里面常用的打击乐记谱方法", + "answer_start": 4 + }, + { + "text": "大陆传统器乐及戏曲里面常用的打击乐记谱方法", + "answer_start": 4 + }, + { + "text": "大陆传统器乐及戏曲里面常用的打击乐记谱方法", + "answer_start": 4 + } + ] + +您可以通过以下的代码训练 `CMRC2018 `_ + +.. code-block:: python + + from fastNLP.embeddings import BertEmbedding + from fastNLP.models import BertForQuestionAnswering + from fastNLP.core.losses import CMRC2018Loss + from fastNLP.core.metrics import CMRC2018Metric + from fastNLP.io.pipe.qa import CMRC2018BertPipe + from fastNLP import Trainer, BucketSampler + from fastNLP import WarmupCallback, GradientClipCallback + from fastNLP.core.optimizer import AdamW + + + data_bundle = CMRC2018BertPipe().process_from_file() + data_bundle.rename_field('chars', 'words') + + print(data_bundle) + + embed = BertEmbedding(data_bundle.get_vocab('words'), model_dir_or_name='cn', requires_grad=True, include_cls_sep=False, auto_truncate=True, + dropout=0.5, word_dropout=0.01) + model = BertForQuestionAnswering(embed) + loss = CMRC2018Loss() + metric = CMRC2018Metric() + + wm_callback = WarmupCallback(schedule='linear') + gc_callback = GradientClipCallback(clip_value=1, clip_type='norm') + callbacks = [wm_callback, gc_callback] + + optimizer = AdamW(model.parameters(), lr=5e-5) + + trainer = Trainer(data_bundle.get_dataset('train'), model, loss=loss, optimizer=optimizer, + sampler=BucketSampler(seq_len_field_name='context_len'), + dev_data=data_bundle.get_dataset('dev'), metrics=metric, + callbacks=callbacks, device=0, batch_size=6, num_workers=2, n_epochs=2, print_every=1, + test_use_tqdm=False, update_every=10) + trainer.train(load_best_model=False) + +训练结果(和论文中报道的基本一致):: + + In Epoch:2/Step:1692, got best dev performance: + CMRC2018Metric: f1=85.61, em=66.08 + + diff --git a/docs/source/tutorials/tutorial_10_fitlog.rst b/docs/source/tutorials/extend_2_fitlog.rst similarity index 100% rename from docs/source/tutorials/tutorial_10_fitlog.rst rename to docs/source/tutorials/extend_2_fitlog.rst diff --git a/docs/source/tutorials/tutorial_10_callback.rst b/docs/source/tutorials/tutorial_10_callback.rst new file mode 100644 index 00000000..4a51fdd9 --- /dev/null +++ b/docs/source/tutorials/tutorial_10_callback.rst @@ -0,0 +1,132 @@ +=================================================== +使用 Callback 自定义你的训练过程 +=================================================== + +- `什么是Callback`_ +- `使用 Callback`_ +- `fastNLP 中的 Callback`_ +- `自定义 Callback`_ + + +什么是Callback +--------------------- + +:class:`~fastNLP.core.callback.Callback` 是与 :class:`~fastNLP.core.trainer.Trainer` 紧密结合的模块,利用 Callback 可以在 :class:`~fastNLP.core.trainer.Trainer` 训练时,加入自定义的操作,比如梯度裁剪,学习率调节,测试模型的性能等。定义的 Callback 会在训练的特定阶段被调用。 + +fastNLP 中提供了很多常用的 :class:`~fastNLP.core.callback.Callback` ,开箱即用。 + + +使用 Callback +--------------------- + +使用 Callback 很简单,将需要的 callback 按 list 存储,以对应参数 ``callbacks`` 传入对应的 Trainer。Trainer 
在训练时就会自动执行这些 Callback 指定的操作了。 + + +.. code-block:: python + + from fastNLP import (Callback, EarlyStopCallback, + Trainer, CrossEntropyLoss, AccuracyMetric) + from fastNLP.models import CNNText + import torch.cuda + + # prepare data + def get_data(): + from fastNLP.io import ChnSentiCorpPipe as pipe + data = pipe().process_from_file() + print(data) + data.rename_field('chars', 'words') + train_data = data.get_dataset('train') + dev_data = data.get_dataset('dev') + test_data = data.get_dataset('test') + vocab = data.get_vocab('words') + tgt_vocab = data.get_vocab('target') + return train_data, dev_data, test_data, vocab, tgt_vocab + + # prepare model + train_data, dev_data, _, vocab, tgt_vocab = get_data() + device = 'cuda:0' if torch.cuda.is_available() else 'cpu' + model = CNNText((len(vocab),50), num_classes=len(tgt_vocab)) + + # define callback + callbacks=[EarlyStopCallback(5)] + + # pass callbacks to Trainer + def train_with_callback(cb_list): + trainer = Trainer( + device=device, + n_epochs=3, + model=model, + train_data=train_data, + dev_data=dev_data, + loss=CrossEntropyLoss(), + metrics=AccuracyMetric(), + callbacks=cb_list, + check_code_level=-1 + ) + trainer.train() + + train_with_callback(callbacks) + + + +fastNLP 中的 Callback +--------------------- + +fastNLP 中提供了很多常用的 Callback,如梯度裁剪,训练时早停和测试验证集,fitlog 等等。具体 Callback 请参考 :mod:`fastNLP.core.callback` + +.. code-block:: python + + from fastNLP import EarlyStopCallback, GradientClipCallback, EvaluateCallback + callbacks = [ + EarlyStopCallback(5), + GradientClipCallback(clip_value=5, clip_type='value'), + EvaluateCallback(dev_data) + ] + + train_with_callback(callbacks) + +自定义 Callback +--------------------- + +这里我们以一个简单的 Callback作为例子,它的作用是打印每一个 Epoch 平均训练 loss。 + +1. 创建 Callback + + 要自定义 Callback,我们要实现一个类,继承 :class:`~fastNLP.core.callback.Callback` 。这里我们定义 ``MyCallBack`` ,继承 fastNLP.Callback 。 + +2. 指定 Callback 调用的阶段 + + Callback 中所有以 `on_` 开头的类方法会在 Trainer 的训练中在特定阶段调用。 如 on_train_begin() 会在训练开始时被调用,on_epoch_end() + 会在每个 epoch 结束时调用。 具体有哪些类方法,参见 :class:`~fastNLP.core.callback.Callback` 文档。这里, MyCallBack 在求得loss时调用 on_backward_begin() 记录 + 当前 loss,在每一个 epoch 结束时调用 on_epoch_end() ,求当前 epoch 平均loss并输出。 + +3. 使用 Callback 的属性访问 Trainer 的内部信息 + + 为了方便使用,可以使用 :class:`~fastNLP.core.callback.Callback` 的属性,访问 :class:`~fastNLP.core.trainer.Trainer` 中的对应信息,如 optimizer, epoch, n_epochs,分别对应训练时的优化器, + 当前 epoch 数,和总 epoch 数。 具体可访问的属性,参见 :class:`~fastNLP.core.callback.Callback` 。这里, MyCallBack 为了求平均 loss ,需要知道当前 epoch 的总步 + 数,可以通过 self.step 属性得到当前训练了多少步。 + +.. 
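code-block:: python
+
+    # 先看一个只含空实现的骨架(示意代码),标出上面三步中提到的几个 on_* 钩子,
+    # 完整的钩子列表请以 Callback 的文档为准
+    from fastNLP import Callback
+
+    class CallbackSkeleton(Callback):
+        def on_train_begin(self):           # 训练开始时调用一次
+            pass
+
+        def on_backward_begin(self, loss):  # 每个batch得到loss、反向传播之前调用
+            pass
+
+        def on_epoch_end(self):             # 每个epoch结束时调用
+            pass
+
+按照这个骨架,下面给出完整的 MyCallBack 实现:
+
+.. 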
code-block:: python + + from fastNLP import Callback + from fastNLP import logger + + class MyCallBack(Callback): + """Print average loss in each epoch""" + def __init__(self): + super().__init__() + self.total_loss = 0 + self.start_step = 0 + + def on_backward_begin(self, loss): + self.total_loss += loss.item() + + def on_epoch_end(self): + n_steps = self.step - self.start_step + avg_loss = self.total_loss / n_steps + logger.info('Avg loss at epoch %d, %.6f', self.epoch, avg_loss) + self.start_step = self.step + + callbacks = [MyCallBack()] + train_with_callback(callbacks) + diff --git a/docs/source/tutorials/tutorial_1_data_preprocess.rst b/docs/source/tutorials/tutorial_1_data_preprocess.rst index 0ec63f87..005f23f1 100644 --- a/docs/source/tutorials/tutorial_1_data_preprocess.rst +++ b/docs/source/tutorials/tutorial_1_data_preprocess.rst @@ -1,24 +1,22 @@ ============================== -使用DataSet预处理文本 +fastNLP中的DataSet ============================== -:class:`~fastNLP.DataSet` 是fastNLP中用于承载数据的容器。可以将DataSet看做是一个表格, -每一行是一个sample (在fastNLP中被称为 :mod:`~fastNLP.core.instance` ), -每一列是一个feature (在fastNLP中称为 :mod:`~fastNLP.core.field` )。 +:class:`~fastNLP.DataSet` 是fastNLP用于承载数据的类,一般训练集、验证集和测试集会被加载为三个单独的 :class:`~fastNLP.DataSet` 对象。 + +:class:`~fastNLP.DataSet` 中的数据组织形式类似一个表格,比如下面 :class:`~fastNLP.DataSet` 一共有3列,列在fastNLP中被称为field。 .. csv-table:: - :header: "sentence", "words", "seq_len" + :header: "raw_chars", "chars", "seq_len" - "This is the first instance .", "[This, is, the, first, instance, .]", 6 - "Second instance .", "[Second, instance, .]", 3 + "历任公司副总经理、总工程师,", "[历 任 公 司 副 总 经 理 、 总 工 程 师 ,]", 6 "Third instance .", "[Third, instance, .]", 3 "...", "[...]", "..." -上面是一个样例数据中 DataSet 的存储结构。其中它的每一行是一个 :class:`~fastNLP.Instance` 对象; 每一列是一个 :class:`~fastNLP.FieldArray` 对象。 - +每一行是一个instance (在fastNLP中被称为 :mod:`~fastNLP.core.Instance` ), +每一列是一个field (在fastNLP中称为 :mod:`~fastNLP.core.FieldArray` )。 ------------------------------ -数据集构建和删除 +DataSet构建和删除 ----------------------------- 我们使用传入字典的方式构建一个数据集,这是 :class:`~fastNLP.DataSet` 初始化的最基础的方式 @@ -26,11 +24,23 @@ .. code-block:: python from fastNLP import DataSet - data = {'sentence':["This is the first instance .", "Second instance .", "Third instance ."], + data = {'raw_words':["This is the first instance .", "Second instance .", "Third instance ."], 'words': [['this', 'is', 'the', 'first', 'instance', '.'], ['Second', 'instance', '.'], ['Third', 'instance', '.']], 'seq_len': [6, 3, 3]} dataset = DataSet(data) # 传入的dict的每个key的value应该为具有相同长度的list + print(dataset) + +输出为:: + + +------------------------------+------------------------------------------------+---------+ + | raw_words | words | seq_len | + +------------------------------+------------------------------------------------+---------+ + | This is the first instance . | ['this', 'is', 'the', 'first', 'instance', ... | 6 | + | Second instance . | ['Second', 'instance', '.'] | 3 | + | Third instance . 
| ['Third', 'instance', '.'] | 3 | + +------------------------------+------------------------------------------------+---------+ + 我们还可以使用 :func:`~fastNLP.DataSet.append` 方法向数据集内增加数据 @@ -39,7 +49,7 @@ from fastNLP import DataSet from fastNLP import Instance dataset = DataSet() - instance = Instance(sentence="This is the first instance", + instance = Instance(raw_words="This is the first instance", words=['this', 'is', 'the', 'first', 'instance', '.'], seq_len=6) dataset.append(instance) @@ -52,10 +62,10 @@ from fastNLP import DataSet from fastNLP import Instance dataset = DataSet([ - Instance(sentence="This is the first instance", + Instance(raw_words="This is the first instance", words=['this', 'is', 'the', 'first', 'instance', '.'], seq_len=6), - Instance(sentence="Second instance .", + Instance(raw_words="Second instance .", words=['Second', 'instance', '.'], seq_len=3) ]) @@ -82,7 +92,7 @@ FastNLP 同样提供了多种删除数据的方法 :func:`~fastNLP.DataSet.drop` # 删除名为'a'的field dataset.delete_field('a') ------------------------------ + 简单的数据预处理 ----------------------------- @@ -106,51 +116,41 @@ FastNLP 同样提供了多种删除数据的方法 :func:`~fastNLP.DataSet.drop` .. code-block:: python from fastNLP import DataSet - data = {'sentence':["This is the first instance .", "Second instance .", "Third instance ."]} + data = {'raw_words':["This is the first instance .", "Second instance .", "Third instance ."]} dataset = DataSet(data) # 将句子分成单词形式, 详见DataSet.apply()方法 - dataset.apply(lambda ins: ins['sentence'].split(), new_field_name='words') + dataset.apply(lambda ins: ins['raw_words'].split(), new_field_name='words') # 或使用DataSet.apply_field() - dataset.apply_field(lambda sent:sent.split(), field_name='sentence', new_field_name='words') + dataset.apply_field(lambda sent:sent.split(), field_name='raw_words', new_field_name='words') # 除了匿名函数,也可以定义函数传递进去 def get_words(instance): - sentence = instance['sentence'] + sentence = instance['raw_words'] words = sentence.split() return words dataset.apply(get_words, new_field_name='words') -除了手动处理数据集之外,你还可以使用 fastNLP 提供的各种 :class:`~fastNLP.io.base_loader.DataSetLoader` 来进行数据处理。 -详细请参考这篇教程 :doc:`使用DataSetLoader加载数据集 ` 。 +除了手动处理数据集之外,你还可以使用 fastNLP 提供的各种 :class:`~fastNLP.io.Loader` 和 :class:`~fastNLP.io.Pipe` 来进行数据处理。 +详细请参考这篇教程 :doc:`使用Loader和Pipe处理数据 ` 。 ------------------------------ -DataSet与pad + +fastNLP中field的命名习惯 ----------------------------- -在fastNLP里,pad是与一个 :mod:`~fastNLP.core.field` 绑定的。即不同的 :mod:`~fastNLP.core.field` 可以使用不同的pad方式,比如在英文任务中word需要的pad和 -character的pad方式往往是不同的。fastNLP是通过一个叫做 :class:`~fastNLP.Padder` 的子类来完成的。 -默认情况下,所有field使用 :class:`~fastNLP.AutoPadder` -。可以通过使用以下方式设置Padder(如果将padder设置为None,则该field不会进行pad操作)。 -大多数情况下直接使用 :class:`~fastNLP.AutoPadder` 就可以了。 -如果 :class:`~fastNLP.AutoPadder` 或 :class:`~fastNLP.EngChar2DPadder` 无法满足需求, -也可以自己写一个 :class:`~fastNLP.Padder` 。 +在英文任务中,fastNLP常用的field名称有: -.. 
code-block:: python + - **raw_words**: 表示的是原始的str。例如"This is a demo sentence ."。存在多个raw_words的情况,例如matching任务,它们会被定义为raw_words0, raw_words1。但在conll格式下,raw_words列也可能为["This", "is", "a", "demo", "sentence", "."]的形式。 + - **words**: 表示的是已经tokenize后的词语。例如["This", "is", "a", "demo", "sentence"], 但由于str并不能直接被神经网络所使用,所以words中的内容往往被转换为int,如[3, 10, 4, 2, 7, ...]等。多列words的情况,会被命名为words0, words1 + - **target**: 表示目标值。分类场景下,只有一个值;序列标注场景下是一个序列。 + - **seq_len**: 一般用于表示words列的长度 - from fastNLP import DataSet - from fastNLP import EngChar2DPadder - import random - dataset = DataSet() - max_chars, max_words, sent_num = 5, 10, 20 - contents = [[ - [random.randint(1, 27) for _ in range(random.randint(1, max_chars))] - for _ in range(random.randint(1, max_words)) - ] for _ in range(sent_num)] - # 初始化时传入 - dataset.add_field('chars', contents, padder=EngChar2DPadder()) - # 直接设置 - dataset.set_padder('chars', EngChar2DPadder()) - # 也可以设置pad的value - dataset.set_pad_val('chars', -1) +在中文任务中,fastNLP常用的field名称有: + + - **raw_words**: 如果原始汉字序列中已经包含了词语的边界,则该列称为raw_words。如"上海 浦东 开发 与 法制 建设 同步"。 + - **words**: 表示单独的汉字词语序列。例如["上海", "", "浦东", "开发", "与", "法制", "建设", ...]或[2, 3, 4, ...] + - **raw_chars**: 表示的是原始的连续汉字序列。例如"这是一个示例。" + - **chars**: 表示已经切分为单独的汉字的序列。例如["这", "是", "一", "个", "示", "例", "。"]。但由于神经网络不能识别汉字,所以一般该列会被转为int形式,如[3, 4, 5, 6, ...]。 + - **target**: 表示目标值。分类场景下,只有一个值;序列标注场景下是一个序列 + - **seq_len**: 表示输入序列的长度 diff --git a/docs/source/tutorials/tutorial_2_load_dataset.rst b/docs/source/tutorials/tutorial_2_load_dataset.rst deleted file mode 100644 index 4fa4a84d..00000000 --- a/docs/source/tutorials/tutorial_2_load_dataset.rst +++ /dev/null @@ -1,224 +0,0 @@ -================================= -使用DataSetLoader加载数据集 -================================= - -这一部分是一个关于如何加载数据集的教程 - -教程目录: - - - `Part I: 数据集容器`_ - - `Part II: 数据集的使用方式`_ - - `Part III: 不同数据类型的DataSetLoader`_ - - `Part IV: DataSetLoader举例`_ - - `Part V: fastNLP封装好的数据集加载器`_ - - ----------------------------- -Part I: 数据集容器 ----------------------------- - -在fastNLP中,我们使用 :class:`~fastNLP.io.base_loader.DataBundle` 来存储数据集信息。 -:class:`~fastNLP.io.base_loader.DataBundle` 类包含了两个重要内容: `datasets` 和 `vocabs` 。 - -`datasets` 是一个 `key` 为数据集名称(如 `train` , `dev` ,和 `test` 等), `value` 为 :class:`~fastNLP.DataSet` 的字典。 - -`vocabs` 是一个 `key` 为词表名称(如 :attr:`fastNLP.Const.INPUT` 表示输入文本的词表名称, :attr:`fastNLP.Const.TARGET` 表示目标 -的真实标签词表的名称,等等), `value` 为词表内容( :class:`~fastNLP.Vocabulary` )的字典。 - ----------------------------- -Part II: 数据集的使用方式 ----------------------------- - -在fastNLP中,我们采用 :class:`~fastNLP.io.base_loader.DataSetLoader` 来作为加载数据集的基类。 -:class:`~fastNLP.io.base_loader.DataSetLoader` 定义了各种DataSetLoader所需的API接口,开发者应该继承它实现各种的DataSetLoader。 -在各种数据集的DataSetLoader当中,至少应该编写如下内容: - - - _load 函数:从一个数据文件中读取数据到一个 :class:`~fastNLP.DataSet` - - load 函数(可以使用基类的方法):从一个或多个数据文件中读取数据到一个或多个 :class:`~fastNLP.DataSet` - - process 函数:一个或多个从数据文件中读取数据,并处理成可以训练的 :class:`~fastNLP.io.DataBundle` - - **\*process函数中可以调用load函数或_load函数** - -DataSetLoader的_load或者load函数返回的 :class:`~fastNLP.DataSet` 当中,内容为数据集的文本信息,process函数返回的 -:class:`~fastNLP.io.DataBundle` 当中, `datasets` 的内容为已经index好的、可以直接被 :class:`~fastNLP.Trainer` -接受的内容。 - --------------------------------------------------------- -Part III: 不同数据类型的DataSetLoader --------------------------------------------------------- - -:class:`~fastNLP.io.dataset_loader.CSVLoader` - 读取CSV类型的数据集文件。例子如下: - - .. 
code-block:: python - - data_set_loader = CSVLoader( - headers=('words', 'target'), sep='\t' - ) - # 表示将CSV文件中每一行的第一项填入'words' field,第二项填入'target' field。 - # 其中每两项之间由'\t'分割开来 - - data_set = data_set_loader._load('path/to/your/file') - - 数据集内容样例如下 :: - - But it does not leave you with much . 1 - You could hate it for the same reason . 1 - The performances are an absolute joy . 4 - - -:class:`~fastNLP.io.dataset_loader.JsonLoader` - 读取Json类型的数据集文件,数据必须按行存储,每行是一个包含各类属性的Json对象。例子如下: - - .. code-block:: python - - data_set_loader = JsonLoader( - fields={'sentence1': 'words1', 'sentence2': 'words2', 'gold_label': 'target'} - ) - # 表示将Json对象中'sentence1'、'sentence2'和'gold_label'对应的值赋给'words1'、'words2'、'target'这三个fields - - data_set = data_set_loader._load('path/to/your/file') - - 数据集内容样例如下 :: - - {"annotator_labels": ["neutral"], "captionID": "3416050480.jpg#4", "gold_label": "neutral", "pairID": "3416050480.jpg#4r1n", "sentence1": "A person on a horse jumps over a broken down airplane.", "sentence1_binary_parse": "( ( ( A person ) ( on ( a horse ) ) ) ( ( jumps ( over ( a ( broken ( down airplane ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN on) (NP (DT a) (NN horse)))) (VP (VBZ jumps) (PP (IN over) (NP (DT a) (JJ broken) (JJ down) (NN airplane)))) (. .)))", "sentence2": "A person is training his horse for a competition.", "sentence2_binary_parse": "( ( A person ) ( ( is ( ( training ( his horse ) ) ( for ( a competition ) ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) (VP (VBG training) (NP (PRP$ his) (NN horse)) (PP (IN for) (NP (DT a) (NN competition))))) (. .)))"} - {"annotator_labels": ["contradiction"], "captionID": "3416050480.jpg#4", "gold_label": "contradiction", "pairID": "3416050480.jpg#4r1c", "sentence1": "A person on a horse jumps over a broken down airplane.", "sentence1_binary_parse": "( ( ( A person ) ( on ( a horse ) ) ) ( ( jumps ( over ( a ( broken ( down airplane ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN on) (NP (DT a) (NN horse)))) (VP (VBZ jumps) (PP (IN over) (NP (DT a) (JJ broken) (JJ down) (NN airplane)))) (. .)))", "sentence2": "A person is at a diner, ordering an omelette.", "sentence2_binary_parse": "( ( A person ) ( ( ( ( is ( at ( a diner ) ) ) , ) ( ordering ( an omelette ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) (PP (IN at) (NP (DT a) (NN diner))) (, ,) (S (VP (VBG ordering) (NP (DT an) (NN omelette))))) (. .)))"} - {"annotator_labels": ["entailment"], "captionID": "3416050480.jpg#4", "gold_label": "entailment", "pairID": "3416050480.jpg#4r1e", "sentence1": "A person on a horse jumps over a broken down airplane.", "sentence1_binary_parse": "( ( ( A person ) ( on ( a horse ) ) ) ( ( jumps ( over ( a ( broken ( down airplane ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN on) (NP (DT a) (NN horse)))) (VP (VBZ jumps) (PP (IN over) (NP (DT a) (JJ broken) (JJ down) (NN airplane)))) (. .)))", "sentence2": "A person is outdoors, on a horse.", "sentence2_binary_parse": "( ( A person ) ( ( ( ( is outdoors ) , ) ( on ( a horse ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) (ADVP (RB outdoors)) (, ,) (PP (IN on) (NP (DT a) (NN horse)))) (. 
.)))"} - ------------------------------------------- -Part IV: DataSetLoader举例 ------------------------------------------- - -以Matching任务为例子: - - :class:`~fastNLP.io.data_loader.MatchingLoader` - 我们在fastNLP当中封装了一个Matching任务数据集的数据加载类: :class:`~fastNLP.io.data_loader.MatchingLoader` . - - 在MatchingLoader类当中我们封装了一个对数据集中的文本内容进行进一步的预处理的函数: - :meth:`~fastNLP.io.data_loader.MatchingLoader.process` - 这个函数具有各种预处理option,如: - - 是否将文本转成全小写 - - 是否需要序列长度信息,需要什么类型的序列长度信息 - - 是否需要用BertTokenizer来获取序列的WordPiece信息 - - 等等 - - 具体内容参见 :meth:`fastNLP.io.MatchingLoader.process` 。 - - :class:`~fastNLP.io.data_loader.SNLILoader` - 一个关于SNLI数据集的DataSetLoader。SNLI数据集来自 - `SNLI Data Set `_ . - - 在 :class:`~fastNLP.io.data_loader.SNLILoader` 的 :meth:`~fastNLP.io.data_loader.SNLILoader._load` - 函数中,我们用以下代码将数据集内容从文本文件读入内存: - - .. code-block:: python - - data = SNLILoader().process( - paths='path/to/snli/data', to_lower=False, seq_len_type='seq_len', - get_index=True, concat=False, - ) - print(data) - - 输出的内容是:: - - In total 3 datasets: - train has 549367 instances. - dev has 9842 instances. - test has 9824 instances. - In total 2 vocabs: - words has 43154 entries. - target has 3 entries. - - - 这里的data是一个 :class:`~fastNLP.io.base_loader.DataBundle` ,取 ``datasets`` 字典里的内容即可直接传入 - :class:`~fastNLP.Trainer` 或者 :class:`~fastNLP.Tester` 进行训练或者测试。 - - :class:`~fastNLP.io.data_loader.IMDBLoader` - 以IMDB数据集为例,在 :class:`~fastNLP.io.data_loader.IMDBLoader` 的 :meth:`~fastNLP.io.data_loader.IMDBLoader._load` - 函数中,我们用以下代码将数据集内容从文本文件读入内存: - - .. code-block:: python - - data = IMDBLoader().process( - paths={'train': 'path/to/train/file', 'test': 'path/to/test/file'} - ) - print(data) - - 输出的内容是:: - - In total 3 datasets: - train has 22500 instances. - test has 25000 instances. - dev has 2500 instances. - In total 2 vocabs: - words has 82846 entries. - target has 2 entries. 
- - - 这里的将原来的train集按9:1的比例分成了训练集和验证集。 - - ------------------------------------------- -Part V: fastNLP封装好的数据集加载器 ------------------------------------------- - -fastNLP封装好的数据集加载器可以适用于多种类型的任务: - - - `文本分类任务`_ - - `序列标注任务`_ - - `Matching任务`_ - - -文本分类任务 -------------------- - -========================== ================================================================== -数据集名称 数据集加载器 --------------------------- ------------------------------------------------------------------ -IMDb :class:`~fastNLP.io.data_loader.IMDBLoader` --------------------------- ------------------------------------------------------------------ -SST :class:`~fastNLP.io.data_loader.SSTLoader` --------------------------- ------------------------------------------------------------------ -SST-2 :class:`~fastNLP.io.data_loader.SST2Loader` --------------------------- ------------------------------------------------------------------ -Yelp Polarity :class:`~fastNLP.io.data_loader.YelpLoader` --------------------------- ------------------------------------------------------------------ -Yelp Full :class:`~fastNLP.io.data_loader.YelpLoader` --------------------------- ------------------------------------------------------------------ -MTL16 :class:`~fastNLP.io.data_loader.MTL16Loader` -========================== ================================================================== - - - -序列标注任务 -------------------- - -========================== ================================================================== -数据集名称 数据集加载器 --------------------------- ------------------------------------------------------------------ -Conll :class:`~fastNLP.io.data_loader.ConllLoader` --------------------------- ------------------------------------------------------------------ -Conll2003 :class:`~fastNLP.io.data_loader.Conll2003Loader` --------------------------- ------------------------------------------------------------------ -人民日报数据集 :class:`~fastNLP.io.data_loader.PeopleDailyCorpusLoader` -========================== ================================================================== - - - -Matching任务 -------------------- - -========================== ================================================================== -数据集名称 数据集加载器 --------------------------- ------------------------------------------------------------------ -SNLI :class:`~fastNLP.io.data_loader.SNLILoader` --------------------------- ------------------------------------------------------------------ -MultiNLI :class:`~fastNLP.io.data_loader.MNLILoader` --------------------------- ------------------------------------------------------------------ -QNLI :class:`~fastNLP.io.data_loader.QNLILoader` --------------------------- ------------------------------------------------------------------ -RTE :class:`~fastNLP.io.data_loader.RTELoader` --------------------------- ------------------------------------------------------------------ -Quora Pair Dataset :class:`~fastNLP.io.data_loader.QuoraLoader` -========================== ================================================================== - diff --git a/docs/source/tutorials/tutorial_2_vocabulary.rst b/docs/source/tutorials/tutorial_2_vocabulary.rst new file mode 100644 index 00000000..0b26a419 --- /dev/null +++ b/docs/source/tutorials/tutorial_2_vocabulary.rst @@ -0,0 +1,129 @@ +============================== +fastNLP中的Vocabulary +============================== + +:class:`~fastNLP.Vocabulary` 是包含字或词与index关系的类,用于将文本转换为index。 + + +构建Vocabulary +----------------------------- + +.. 
code-block:: python + + from fastNLP import Vocabulary + + vocab = Vocabulary() + vocab.add_word_lst(['复', '旦', '大', '学']) # 加入新的字 + vocab.add_word('上海') # `上海`会作为一个整体 + vocab.to_index('复') # 应该会为3 + vocab.to_index('我') # 会输出1,Vocabulary中默认pad的index为0, unk(没有找到的词)的index为1 + + # 在构建target的Vocabulary时,词表中应该用不上pad和unk,可以通过以下的初始化 + vocab = Vocabulary(unknown=None, padding=None) + vocab.add_word_lst(['positive', 'negative']) + vocab.to_index('positive') # 输出0 + vocab.to_index('neutral') # 会报错,因为没有unk这种情况 + +除了通过以上的方式建立词表,Vocabulary还可以通过使用下面的函数直从 :class:`~fastNLP.DataSet` 中的某一列建立词表以及将该列转换为index + +.. code-block:: python + + from fastNLP import Vocabulary + from fastNLP import DataSet + + dataset = DataSet({'chars': [ + ['今', '天', '天', '气', '很', '好', '。'], + ['被', '这', '部', '电', '影', '浪', '费', '了', '两', '个', '小', '时', '。'] + ], + 'target': ['neutral', 'negative'] + }) + + vocab = Vocabulary() + vocab.from_dataset(dataset, field_name='chars') + vocab.index_dataset(dataset, field_name='chars') + + target_vocab = Vocabulary(padding=None, unknown=None) + target_vocab.from_dataset(dataset, field_name='target') + target_vocab.index_dataset(dataset, field_name='target') + print(dataset) + +输出内容为:: + + +---------------------------------------------------+--------+ + | chars | target | + +---------------------------------------------------+--------+ + | [4, 2, 2, 5, 6, 7, 3] | 0 | + | [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 3] | 1 | + +---------------------------------------------------+--------+ + + +一些使用tips +----------------------------- + +在通过使用from_dataset()函数在DataSet上建立词表时,将测试集和验证集放入参数no_create_entry_dataset中,如下所示 + +.. code-block:: python + + from fastNLP import Vocabulary + from fastNLP import DataSet + + tr_data = DataSet({'chars': [ + ['今', '天', '心', '情', '很', '好', '。'], + ['被', '这', '部', '电', '影', '浪', '费', '了', '两', '个', '小', '时', '。'] + ], + 'target': ['positive', 'negative'] + }) + dev_data = DataSet({'chars': [ + ['住', '宿', '条', '件', '还', '不', '错'], + ['糟', '糕', '的', '天', '气', ',', '无', '法', '出', '行', '。'] + ], + 'target': ['positive', 'negative'] + }) + + vocab = Vocabulary() + # 将验证集或者测试集在建立词表是放入no_create_entry_dataset这个参数中。 + vocab.from_dataset(tr_data, field_name='chars', no_create_entry_dataset=[dev_data]) + + + :class:`~fastNLP.Vocabulary` 中的 `no_create_entry` , 建议在添加来自于测试集和验证集的词的时候将该参数置为True, 或将验证集和测试集 +传入 `no_create_entry_dataset` 参数。它们的意义是在接下来的模型会使用pretrain的embedding(包括glove, word2vec, elmo与bert)且会finetune的 +情况下,如果仅使用来自于train的数据建立vocabulary,会导致只出现在test与dev中的词语无法充分利用到来自于预训练embedding的信息(因为他们 +会被认为是unk),所以在建立词表的时候将test与dev考虑进来会使得最终的结果更好。通过与fastNLP中的各种Embedding配合使用,会有如下的效果, +如果一个词出现在了train中,但是没在预训练模型中,embedding会为随机初始化,且它单独的一个vector,如果finetune embedding的话, +这个词在更新之后可能会有更好的表示; 而如果这个词仅出现在了dev或test中,那么就不能为它们单独建立vector,而应该让它指向unk这个vector的 +值(当unk的值更新时,这个词也使用的是更新之后的vector)。所以被认为是no_create_entry的token,将首先从预训练的词表中寻找它的表示,如 +果找到了,就使用该表示; 如果没有找到,则认为该词的表示应该为unk的表示。 + +下面我们结合部分 :class:`~fastNLP.embeddings.StaticEmbedding` 的例子来说明下该值造成的影响,如果您对 + :class:`~fastNLP.embeddings.StaticEmbedding` 不太了解,您可以先参考 :doc:`使用Embedding模块将文本转成向量 ` 部分再来阅读该部分 + +.. 
code-block:: python + + import torch + from fastNLP.embeddings import StaticEmbedding + from fastNLP import Vocabulary + + vocab = Vocabulary() + vocab.add_word('train') + vocab.add_word('only_in_train') # 仅在train出现,但肯定在预训练词表中不存在 + vocab.add_word('test', no_create_entry=True) # 该词只在dev或test中出现 + vocab.add_word('only_in_test', no_create_entry=True) # 这个词在预训练的词表中找不到 + + embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d') + print(embed(torch.LongTensor([vocab.to_index('train')]))) + print(embed(torch.LongTensor([vocab.to_index('only_in_train')]))) + print(embed(torch.LongTensor([vocab.to_index('test')]))) + print(embed(torch.LongTensor([vocab.to_index('only_in_test')]))) + print(embed(torch.LongTensor([vocab.unknown_idx]))) + +输出结果(只截取了部分vector):: + + tensor([[ 0.9497, 0.3433, 0.8450, -0.8852, ...]], grad_fn=) # train,en-glove-6b-50d,找到了该词 + tensor([[ 0.0540, -0.0557, -0.0514, -0.1688, ...]], grad_fn=) # only_in_train,en-glove-6b-50d,使用了随机初始化 + tensor([[ 0.1318, -0.2552, -0.0679, 0.2619, ...]], grad_fn=) # test,在en-glove-6b-50d中找到了这个词 + tensor([[0., 0., 0., 0., 0., ...]], grad_fn=) # only_in_test, en-glove-6b-50d中找不到这个词,使用unk的vector + tensor([[0., 0., 0., 0., 0., ...]], grad_fn=) # unk,使用zero初始化 + +首先train和test都能够从预训练中找到对应的vector,所以它们是各自的vector表示; only_in_train在预训练中找不到,StaticEmbedding为它 +新建了一个entry,所以它有一个单独的vector; 而only_in_test在预训练中找不到改词,因此被指向了unk的值(fastNLP用零向量初始化unk),与最后一行unk的 +表示相同。 \ No newline at end of file diff --git a/docs/source/tutorials/tutorial_3_embedding.rst b/docs/source/tutorials/tutorial_3_embedding.rst index 489b43b4..95c0105e 100644 --- a/docs/source/tutorials/tutorial_3_embedding.rst +++ b/docs/source/tutorials/tutorial_3_embedding.rst @@ -7,208 +7,447 @@ 教程目录: - `Part I: embedding介绍`_ - - `Part II: 使用随机初始化的embedding`_ - - `Part III: 使用预训练的静态embedding`_ - - `Part IV: 使用预训练的Contextual Embedding(ELMo & BERT)`_ - - `Part V: 使用character-level的embedding`_ - - `Part VI: 叠加使用多个embedding`_ + - `Part II: 使用预训练的静态embedding`_ + - `Part III: 使用随机初始化的embedding`_ + - `Part IV: ELMo Embedding`_ + - `Part V: Bert Embedding`_ + - `Part VI: 使用character-level的embedding`_ + - `Part VII: 叠加使用多个embedding`_ + - `Part VIII: Embedding的其它说明`_ + - `Part IX: StaticEmbedding的使用建议`_ - ---------------------------------------- Part I: embedding介绍 --------------------------------------- -与torch.nn.Embedding类似,fastNLP的embedding接受的输入是一个被index好的序列,输出的内容是这个序列的embedding结果。 +Embedding是一种词嵌入技术,可以将字或者词转换为实向量。目前使用较多的预训练词嵌入有word2vec, fasttext, glove, character embedding, +elmo以及bert。 +但使用这些词嵌入方式的时候都需要做一些加载上的处理,比如预训练的word2vec, fasttext以及glove都有着超过几十万个词语的表示,但一般任务大概 +只会用到其中的几万个词,如果直接加载所有的词汇,会导致内存占用变大以及训练速度变慢,需要从预训练文件中抽取本次实验的用到的词汇;而对于英文的 +elmo和character embedding, 需要将word拆分成character才能使用;Bert的使用更是涉及到了Byte pair encoding(BPE)相关的内容。为了方便 +大家的使用,fastNLP通过 :class:`~fastNLP.Vocabulary` 统一了不同embedding的使用。下面我们将讲述一些例子来说明一下 -fastNLP的embedding包括了预训练embedding和随机初始化embedding。 +Part II: 使用预训练的静态embedding --------------------------------------- -Part II: 使用随机初始化的embedding ---------------------------------------- -使用随机初始化的embedding参见 :class:`~fastNLP.embeddings.embedding.Embedding` 。 +在fastNLP中,加载预训练的word2vec, glove以及fasttext都使用的是 :class:`~fastNLP.embeddings.StaticEmbedding` 。另外,为了方便大家的 +使用,fastNLP提供了多种静态词向量的自动下载并缓存(默认缓存到~/.fastNLP/embeddings文件夹下)的功能,支持自动下载的预训练向量可以在 +`下载文档 `_ 查看。 + +.. 
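code-block:: python
+
+    # 如果不希望缓存到默认目录,可以在使用Embedding之前通过环境变量FASTNLP_CACHE_DIR自定义缓存位置
+    # (示意代码,'/data/fastnlp_cache' 为假设的目录)
+    import os
+    os.environ['FASTNLP_CACHE_DIR'] = '/data/fastnlp_cache'
+
+下面是加载预训练静态词向量的基本用法:
+
+.. 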
code-block:: python + + import torch + from fastNLP.embeddings import StaticEmbedding + from fastNLP import Vocabulary + + vocab = Vocabulary() + vocab.add_word_lst("this is a demo .".split()) + + embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d') + + words = torch.LongTensor([[vocab.to_index(word) for word in "this is a demo .".split()]]) # 将文本转为index + print(embed(words).size()) # StaticEmbedding的使用和pytorch的nn.Embedding是类似的 + +输出为:: + + torch.Size([1, 5, 50]) + +fastNLP的StaticEmbedding在初始化之后,就和pytorch中的Embedding是类似的了。 :class:`~fastNLP.embeddings.StaticEmbedding` 的初始化 +主要是从model_dir_or_name提供的词向量中抽取出 :class:`~fastNLP.Vocabulary` 中词语的vector。 + +除了可以通过使用预先提供的Embedding, :class:`~fastNLP.embeddings.StaticEmbedding` 也支持加载本地的预训练词向量,glove, word2vec以及 +fasttext格式的。通过将model_dir_or_name修改为本地的embedding文件路径,即可使用本地的embedding。 + + +Part III: 使用随机初始化的embedding +--------------------------------------- -可以传入词表大小和embedding维度: +有时候需要使用随机初始化的Embedding,也可以通过使用 :class:`~fastNLP.embeddings.StaticEmbedding` 获得。只需要将model_dir_or_name +置为None,且传入embedding_dim,如下例所示 .. code-block:: python - embed = Embedding(10000, 50) + from fastNLP.embeddings import StaticEmbedding + from fastNLP import Vocabulary -也可以传入一个初始化的参数矩阵: + vocab = Vocabulary() + vocab.add_word_lst("this is a demo .".split()) + + embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=30) + + words = torch.LongTensor([[vocab.to_index(word) for word in "this is a demo .".split()]]) + print(embed(words).size()) + +输出为:: + + torch.Size([1, 5, 30]) + + + +Part IV: ELMo Embedding +----------------------------------------------------------- + +在fastNLP中,我们提供了ELMo和BERT的embedding: :class:`~fastNLP.embeddings.ElmoEmbedding` +和 :class:`~fastNLP.embeddings.BertEmbedding` 。可自动下载的ElmoEmbedding可以 +从 `下载文档 `_ 找到。 + +与静态embedding类似,ELMo的使用方法如下: .. code-block:: python - embed = Embedding(init_embed) + from fastNLP.embeddings import ElmoEmbedding + from fastNLP import Vocabulary -其中的init_embed可以是torch.FloatTensor、torch.nn.Embedding或者numpy.ndarray。 + vocab = Vocabulary() + vocab.add_word_lst("this is a demo .".split()) + embed = ElmoEmbedding(vocab, model_dir_or_name='en-small', requires_grad=False) + words = torch.LongTensor([[vocab.to_index(word) for word in "this is a demo .".split()]]) + print(embed(words).size()) ---------------------------------------- -Part III: 使用预训练的静态embedding ---------------------------------------- +输出为:: -在使用预训练的embedding之前,需要根据数据集的内容构建一个词表 :class:`~fastNLP.core.vocabulary.Vocabulary` ,在 -预训练embedding类初始化的时候需要将这个词表作为参数传入。 + torch.Size([1, 5, 256]) -在fastNLP中,我们提供了 :class:`~fastNLP.embeddings.StaticEmbedding` 这一个类。 -通过 :class:`~fastNLP.embeddings.StaticEmbedding` 可以加载预训练好的静态 -Embedding,例子如下: +也可以输出多层的ELMo结果,fastNLP将在不同层的结果在最后一维上拼接,下面的代码需要在上面的代码执行结束之后执行 .. code-block:: python - embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50', requires_grad=True) + embed = ElmoEmbedding(vocab, model_dir_or_name='en-small', requires_grad=False, layers='1,2') + print(embed(words).size()) -vocab为根据数据集构建的词表,model_dir_or_name可以是一个路径,也可以是embedding模型的名称: +输出为:: - 1 如果传入的是路径,那么fastNLP将会根据该路径来读取预训练的权重文件并将embedding加载进来(glove - 和word2vec类型的权重文件都支持) + torch.Size([1, 5, 512]) - 2 如果传入的是模型名称,那么fastNLP将会根据名称查找embedding模型,如果在cache目录下找到模型则会 - 自动加载;如果找不到则会自动下载。可以通过环境变量 ``FASTNLP_CACHE_DIR`` 来自定义cache目录,如:: +另外,根据 `Deep contextualized word representations `_ ,不同层之间使用可学习的权重可以使得ELMo的效果更好,在fastNLP中可以通过以下的初始化 +实现3层输出的结果通过可学习的权重进行加法融合。 - $ FASTNLP_CACHE_DIR=~/fastnlp_cache_dir python your_python_file.py +.. 
code-block:: python -这个命令表示fastNLP将会在 `~/fastnlp_cache_dir` 这个目录下寻找模型,找不到则会自动将模型下载到这个目录 + embed = ElmoEmbedding(vocab, model_dir_or_name='en-small', requires_grad=True, layers='mix') + print(embed(words).size()) # 三层输出按照权重element-wise的加起来 -目前支持的静态embedding模型有: +输出为:: - ========================== ================================ - 模型名称 模型 - -------------------------- -------------------------------- - en glove.840B.300d - -------------------------- -------------------------------- - en-glove-840d-300 glove.840B.300d - -------------------------- -------------------------------- - en-glove-6b-50 glove.6B.50d - -------------------------- -------------------------------- - en-word2vec-300 谷歌word2vec 300维 - -------------------------- -------------------------------- - en-fasttext 英文fasttext 300维 - -------------------------- -------------------------------- - cn 腾讯中文词向量 200维 - -------------------------- -------------------------------- - cn-fasttext 中文fasttext 300维 - ========================== ================================ + torch.Size([1, 5, 256]) ------------------------------------------------------------ -Part IV: 使用预训练的Contextual Embedding(ELMo & BERT) +Part V: Bert Embedding ----------------------------------------------------------- -在fastNLP中,我们提供了ELMo和BERT的embedding: :class:`~fastNLP.embeddings.ElmoEmbedding` -和 :class:`~fastNLP.embeddings.BertEmbedding` 。 +虽然Bert并不算严格意义上的Embedding,但通过将Bert封装成Embedding的形式将极大减轻使用的复杂程度。可自动下载的Bert Embedding可以 +从 `下载文档 `_ 找到。我们将使用下面的例子讲述一下 +BertEmbedding的使用 -与静态embedding类似,ELMo的使用方法如下: +.. code-block:: python + + from fastNLP.embeddings import BertEmbedding + from fastNLP import Vocabulary + + vocab = Vocabulary() + vocab.add_word_lst("this is a demo .".split()) + + embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased') + words = torch.LongTensor([[vocab.to_index(word) for word in "this is a demo .".split()]]) + print(embed(words).size()) + +输出为:: + + torch.Size([1, 5, 768]) + +可以通过申明使用指定层数的output也可以使用多层的output,下面的代码需要在上面的代码执行结束之后执行 .. code-block:: python - embed = ElmoEmbedding(vocab, model_dir_or_name='small', requires_grad=False) - -目前支持的ElmoEmbedding模型有: - - ========================== ================================ - 模型名称 模型 - -------------------------- -------------------------------- - small allennlp ELMo的small - -------------------------- -------------------------------- - medium allennlp ELMo的medium - -------------------------- -------------------------------- - original allennlp ELMo的original - -------------------------- -------------------------------- - 5.5b-original allennlp ELMo的5.5B original - ========================== ================================ - -BERT-embedding的使用方法如下: - -.. 
code-block:: python - - embed = BertEmbedding( - vocab, model_dir_or_name='en-base-cased', requires_grad=False, layers='4,-2,-1' - ) - -其中layers变量表示需要取哪几层的encode结果。 - -目前支持的BertEmbedding模型有: - - ========================== ==================================== - 模型名称 模型 - -------------------------- ------------------------------------ - en bert-base-cased - -------------------------- ------------------------------------ - en-base-uncased bert-base-uncased - -------------------------- ------------------------------------ - en-base-cased bert-base-cased - -------------------------- ------------------------------------ - en-large-uncased bert-large-uncased - -------------------------- ------------------------------------ - en-large-cased bert-large-cased - -------------------------- ------------------------------------ - -------------------------- ------------------------------------ - en-large-cased-wwm bert-large-cased-whole-word-mask - -------------------------- ------------------------------------ - en-large-uncased-wwm bert-large-uncased-whole-word-mask - -------------------------- ------------------------------------ - en-base-cased-mrpc bert-base-cased-finetuned-mrpc - -------------------------- ------------------------------------ - -------------------------- ------------------------------------ - multilingual bert-base-multilingual-cased - -------------------------- ------------------------------------ - multilingual-base-uncased bert-base-multilingual-uncased - -------------------------- ------------------------------------ - multilingual-base-cased bert-base-multilingual-cased - ========================== ==================================== + # 使用后面两层的输出 + embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='10,11') + print(embed(words).size()) # 结果将是在最后一维做拼接 ------------------------------------------------------ -Part V: 使用character-level的embedding +输出为:: + + torch.Size([1, 5, 1536]) + +在Bert中还存在两个特殊的字符[CLS]和[SEP],默认情况下这两个字符是自动加入并且在计算结束之后会自动删除,以使得输入的序列长度和输出的序列 +长度是一致的,但是有些分类的情况,必须需要使用[CLS]的表示,这种情况可以通过在初始化时申明一下需要保留[CLS]的表示,如下例所示 + +.. code-block:: python + + embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', include_cls_sep=True) + print(embed(words).size()) # 结果将在序列维度上增加2 + # 取出句子的cls表示 + cls_reps = embed(words)[:, 0] # shape: [batch_size, 768] + +输出为:: + + torch.Size([1, 7, 768]) + +在英文Bert模型中,一个英文单词可能会被切分为多个subword,例如"fairness"会被拆分为 ``["fair", "##ness"]`` ,这样一个word对应的将有两个输出, +:class:`~fastNLP.embeddings.BertEmbedding` 会使用pooling方法将一个word的subword的表示合并成一个vector,通过pool_method可以控制 +该pooling方法,支持的有"first"(即使用fair的表示作为fairness的表示), "last"(使用##ness的表示作为fairness的表示), "max"(对fair和 +##ness在每一维上做max),"avg"(对fair和##ness每一维做average)。 + +.. code-block:: python + + embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', pool_method='max') + print(embed(words).size()) + +输出为:: + + torch.Size([1, 5, 768]) + +另外,根据 `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding `_ , +Bert在针对具有两句话的任务时(如matching,Q&A任务),句子之间通过[SEP]拼接起来,前一句话的token embedding为0, +后一句话的token embedding为1。BertEmbedding能够自动识别句子中间的[SEP]来正确设置对应的token_type_id的。 + +.. code-block:: python + + vocab = Vocabulary() + vocab.add_word_lst("this is a demo . [SEP] another sentence .".split()) + + embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', pool_method='max') + words = torch.LongTensor([[vocab.to_index(word) for word in "this is a demo . 
[SEP] another sentence .".split()]]) + print(embed(words).size()) + +输出为:: + + torch.Size([1, 9, 768]) + +在多个[SEP]的情况下,将会使token_type_id不断0,1循环。比如"first sentence [SEP] second sentence [SEP] third sentence", 它们的 +token_type_id将是[0, 0, 0, 1, 1, 1, 0, 0]。但请注意[SEP]一定要大写的,不能是[sep],否则无法识别。 + +更多 :class:`~fastNLP.embedding.BertEmbedding` 的使用,请参考 :doc:`/tutorials/extend_1_bert_embedding` + + +Part VI: 使用character-level的embedding ----------------------------------------------------- -除了预训练的embedding以外,fastNLP还提供了CharEmbedding: :class:`~fastNLP.embeddings.CNNCharEmbedding` 和 -:class:`~fastNLP.embeddings.LSTMCharEmbedding` 。 +除了预训练的embedding以外,fastNLP还提供了两种Character Embedding: :class:`~fastNLP.embeddings.CNNCharEmbedding` 和 +:class:`~fastNLP.embeddings.LSTMCharEmbedding` 。一般在使用character embedding时,需要在预处理的时候将word拆分成character,这 +会使得预处理过程变得非常繁琐。在fastNLP中,使用character embedding也只需要传入 :class:`~fastNLP.Vocabulary` 即可,而且该 +Vocabulary与其它Embedding使用的Vocabulary是一致的,下面我们看两个例子。 CNNCharEmbedding的使用例子如下: .. code-block:: python - embed = CNNCharEmbedding(vocab, embed_size=100, char_emb_size=50) + from fastNLP.embeddings import CNNCharEmbedding + from fastNLP import Vocabulary + + vocab = Vocabulary() + vocab.add_word_lst("this is a demo .".split()) + + # character的embedding维度大小为50,返回的embedding结果维度大小为64。 + embed = CNNCharEmbedding(vocab, embed_size=64, char_emb_size=50) + words = torch.LongTensor([[vocab.to_index(word) for word in "this is a demo .".split()]]) + print(embed(words).size()) -这表示这个CNNCharEmbedding当中character的embedding维度大小为50,返回的embedding结果维度大小为100。 +输出为:: + + torch.Size([1, 5, 64]) 与CNNCharEmbedding类似,LSTMCharEmbedding的使用例子如下: .. code-block:: python - embed = LSTMCharEmbedding(vocab, embed_size=100, char_emb_size=50) + from fastNLP.embeddings import LSTMCharEmbeddding + from fastNLP import Vocabulary -这表示这个LSTMCharEmbedding当中character的embedding维度大小为50,返回的embedding结果维度大小为100。 + vocab = Vocabulary() + vocab.add_word_lst("this is a demo .".split()) + # character的embedding维度大小为50,返回的embedding结果维度大小为64。 + embed = LSTMCharEmbeddding(vocab, embed_size=64, char_emb_size=50) + words = torch.LongTensor([[vocab.to_index(word) for word in "this is a demo .".split()]]) + print(embed(words).size()) +输出为:: + torch.Size([1, 5, 64]) + + +Part VII: 叠加使用多个embedding ----------------------------------------------------- -Part VI: 叠加使用多个embedding ------------------------------------------------------ -在fastNLP中,我们使用 :class:`~fastNLP.embeddings.StackEmbedding` 来叠加多个embedding +单独使用Character Embedding往往效果并不是很好,需要同时结合word embedding。在fastNLP中可以通过 :class:`~fastNLP.embeddings.StackEmbedding` +来叠加embedding,具体的例子如下所示 + +.. 
code-block:: python + + from fastNLP.embeddings import StaticEmbedding, StackEmbedding, CNNCharEmbedding + from fastNLP import Vocabulary + + vocab = Vocabulary() + vocab.add_word_lst("this is a demo .".split()) + + word_embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d') + char_embed = CNNCharEmbedding(vocab, embed_size=64, char_emb_size=50) + embed = StackEmbedding([word_embed, char_embed]) + + words = torch.LongTensor([[vocab.to_index(word) for word in "this is a demo .".split()]]) + print(embed(words).size()) # 输出embedding的维度为50+64=114 + +输出为:: + + torch.Size([1, 5, 114]) -例子如下: +:class:`~fastNLP.embeddings.StaticEmbedding` , :class:`~fastNLP.embeddings.ElmoEmbedding` , +:class:`~fastNLP.embeddings.CNNCharEmbedding` , :class:`~fastNLP.embeddings.BertEmbedding` 等都可以互相拼接。 +:class:`~fastNLP.embeddings.StackEmbedding` 的使用也是和其它Embedding是一致的,即输出index返回对应的表示。但能够拼接起来的Embedding +必须使用同样的 :class:`~fastNLP.Vocabulary` ,因为只有使用同样的 :class:`~fastNLP.Vocabulary` 才能保证同一个index指向的是同一个词或字 + + + +Part VIII: Embedding的其它说明 +----------------------------------------------------------- + +(1) 获取各种Embedding的dimension .. code-block:: python - embed_1 = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50', requires_grad=True) - embed_2 = StaticEmbedding(vocab, model_dir_or_name='en-word2vec-300', requires_grad=True) + from fastNLP.embeddings import * - stack_embed = StackEmbedding([embed_1, embed_2]) + vocab = Vocabulary() + vocab.add_word_lst("this is a demo .".split()) -StackEmbedding会把多个embedding的结果拼接起来,如上面例子的stack_embed返回的embedding维度为350维。 + static_embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d') + print(static_embed.embedding_dim) # 50 + char_embed = CNNCharEmbedding(vocab, embed_size=30) + print(char_embed.embedding_dim) # 30 + elmo_embed_1 = ElmoEmbedding(vocab, model_dir_or_name='en-small', layers='2') + print(elmo_embed_1.embedding_dim) # 256 + elmo_embed_2 = ElmoEmbedding(vocab, model_dir_or_name='en-small', layers='1,2') + print(elmo_embed_2.embedding_dim) # 512 + bert_embed_1 = BertEmbedding(vocab, layers='-1', model_dir_or_name='en-base-cased') + print(bert_embed_1.embedding_dim) # 768 + bert_embed_2 = BertEmbedding(vocab, layers='2,-1', model_dir_or_name='en-base-cased') + print(bert_embed_2.embedding_dim) # 1536 + stack_embed = StackEmbedding([static_embed, char_embed]) + print(stack_embed.embedding_dim) # 80 -除此以外,还可以把静态embedding跟上下文相关的embedding拼接起来: +(2) 设置Embedding的权重是否更新 .. 
code-block:: python - elmo_embedding = ElmoEmbedding(vocab, model_dir_or_name='medium', layers='0,1,2', requires_grad=False) - glove_embedding = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50', requires_grad=True) + from fastNLP.embeddings import * + + vocab = Vocabulary() + vocab.add_word_lst("this is a demo .".split()) + + embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', requires_grad=True) # 初始化时设定为需要更新 + embed.requires_grad = False # 修改BertEmbedding的权重为不更新 + +(3) 各种Embedding中word_dropout与dropout的说明 + +fastNLP中所有的Embedding都支持传入word_dropout和dropout参数,word_dropout指示的是以多大概率将输入的word置为unk的index,这样既可以 +是的unk得到训练,也可以有一定的regularize效果; dropout参数是在获取到word的表示之后,以多大概率将一些维度的表示置为0。 + +如果使用 :class:`~fastNLP.embeddings.StackEmbedding` 且需要用到word_dropout,建议将word_dropout设置在 :class:`~fastNLP.embeddings.StackEmbedding` 上。 + + + +Part IX: StaticEmbedding的使用建议 +----------------------------------------------------------- + +在英文的命名实体识别(NER)任务中,由 `Named Entity Recognition with Bidirectional LSTM-CNNs `_ 指出,同时使用cnn character embedding和word embedding +会使得NER的效果有比较大的提升。正如你在上节中看到的那样,fastNLP支持将 :class:`~fastNLP.embeddings.CNNCharEmbedding` +与 :class:`~fastNLP.embeddings.StaticEmbedding` 拼成一个 :class:`~fastNLP.embeddings.StackEmbedding` 。如果通过这种方式使用,需要 +在预处理文本时,不要将词汇小写化(因为Character Embedding需要利用词语中的大小写信息)且不要将出现频次低于某个阈值的word设置为unk(因为 +Character embedding需要利用字形信息);但 :class:`~fastNLP.embeddings.StaticEmbedding` 使用的某些预训练词嵌入的词汇表中只有小写的词 +语, 且某些低频词并未在预训练中出现需要被剔除。即(1) character embedding需要保留大小写,而预训练词向量不需要保留大小写。(2) +character embedding需要保留所有的字形, 而static embedding需要设置一个最低阈值以学到更好的表示。 + +(1) fastNLP如何解决关于大小写的问题 + +fastNLP通过在 :class:`~fastNLP.embeddings.StaticEmbedding` 增加了一个lower参数解决该问题。如下面的例子所示 + +.. code-block:: python + + from fastNLP.embeddings import StaticEmbedding + from fastNLP import Vocabulary + + vocab = Vocabulary().add_word_lst("The the a A".split()) + # 下面用随机的StaticEmbedding演示,但与使用预训练词向量时效果是一致的 + embed = StaticEmbedding(vocab, model_name_or_dir=None, embedding_dim=5) + print(embed(torch.LongTensor([vocab.to_index('The')]))) + print(embed(torch.LongTensor([vocab.to_index('the')]))) + +输出为:: + + tensor([[-0.4685, 0.4572, 0.5159, -0.2618, -0.6871]], grad_fn=) + tensor([[ 0.2615, 0.1490, -0.2491, 0.4009, -0.3842]], grad_fn=) + +可以看到"The"与"the"的vector是不一致的。但如果我们在初始化 :class:`~fastNLP.embeddings.StaticEmbedding` 将lower设置为True,效果将 +如下所示 + +.. code-block:: python + + from fastNLP.embeddings import StaticEmbedding + from fastNLP import Vocabulary + + vocab = Vocabulary().add_word_lst("The the a A".split()) + # 下面用随机的StaticEmbedding演示,但与使用预训练时效果是一致的 + embed = StaticEmbedding(vocab, model_name_or_dir=None, embedding_dim=5, lower=True) + print(embed(torch.LongTensor([vocab.to_index('The')]))) + print(embed(torch.LongTensor([vocab.to_index('the')]))) + +输出为:: + + tensor([[-0.2237, 0.6825, -0.3459, -0.1795, 0.7516]], grad_fn=) + tensor([[-0.2237, 0.6825, -0.3459, -0.1795, 0.7516]], grad_fn=) + +可以看到"The"与"the"的vector是一致的。他们实际上也是引用的同一个vector。通过将lower设置为True,可以在 :class:`~fastNLP.embeddings.StaticEmbedding` +实现类似具备相同小写结果的词语引用同一个vector。 + +(2) fastNLP如何解决min_freq的问题 + +fastNLP通过在 :class:`~fastNLP.embeddings.StaticEmbedding` 增加了一个min_freq参数解决该问题。如下面的例子所示 + +.. 
code-block:: python + + from fastNLP.embeddings import StaticEmbedding + from fastNLP import Vocabulary + + vocab = Vocabulary().add_word_lst("the the the a".split()) + # 下面用随机的StaticEmbedding演示,但与使用预训练时效果是一致的 + embed = StaticEmbedding(vocab, model_name_or_dir=None, embedding_dim=5, min_freq=2) + print(embed(torch.LongTensor([vocab.to_index('the')]))) + print(embed(torch.LongTensor([vocab.to_index('a')]))) + print(embed(torch.LongTensor([vocab.unknown_idx]))) + +输出为:: + + tensor([[ 0.0454, 0.3375, 0.6758, -0.2026, -0.4715]], grad_fn=) + tensor([[-0.7602, 0.0149, 0.2733, 0.3974, 0.7371]], grad_fn=) + tensor([[-0.7602, 0.0149, 0.2733, 0.3974, 0.7371]], grad_fn=) + +其中最后一行为unknown值的vector,可以看到a的vector表示与unknown是一样的,这是由于a的频次低于了2,所以被指向了unknown的表示;而the由于 +词频超过了2次,所以它是单独的表示。 + +在计算min_freq时,也会考虑到lower的作用,比如 + +.. code-block:: python + + from fastNLP.embeddings import StaticEmbedding + from fastNLP import Vocabulary + + vocab = Vocabulary().add_word_lst("the the the a A".split()) + # 下面用随机的StaticEmbedding演示,但与使用预训练时效果是一致的 + embed = StaticEmbedding(vocab, model_name_or_dir=None, embedding_dim=5, min_freq=2, lower=True) + print(embed(torch.LongTensor([vocab.to_index('the')]))) + print(embed(torch.LongTensor([vocab.to_index('a')]))) + print(embed(torch.LongTensor([vocab.to_index('A')]))) + print(embed(torch.LongTensor([vocab.unknown_idx]))) + +输出为:: + + tensor([[-0.7453, -0.5542, 0.5039, 0.6195, -0.4723]], grad_fn=) # the + tensor([[ 0.0170, -0.0995, -0.5743, -0.2469, -0.2095]], grad_fn=) # a + tensor([[ 0.0170, -0.0995, -0.5743, -0.2469, -0.2095]], grad_fn=) # A + tensor([[ 0.6707, -0.5786, -0.6967, 0.0111, 0.1209]], grad_fn=) # unk - stack_embed = StackEmbedding([elmo_embedding, glove_embedding]) +可以看到a不再和最后一行的unknown共享一个表示了,这是由于a与A都算入了a的词频,且A的表示也是a的表示。 diff --git a/docs/source/tutorials/tutorial_4_load_dataset.rst b/docs/source/tutorials/tutorial_4_load_dataset.rst new file mode 100644 index 00000000..a93ae8d5 --- /dev/null +++ b/docs/source/tutorials/tutorial_4_load_dataset.rst @@ -0,0 +1,210 @@ +======================================= +使用Loader和Pipe加载并处理数据集 +======================================= + +这一部分是关于如何加载数据集的教程 + +教程目录: + + - `Part I: 数据集容器DataBundle`_ + - `Part II: 加载的各种数据集的Loader`_ + - `Part III: 使用Pipe对数据集进行预处理`_ + - `Part IV: fastNLP封装好的Loader和Pipe`_ + - `Part V: 不同格式类型的基础Loader`_ + + +Part I: 数据集容器DataBundle +------------------------------------ + +而由于对于同一个任务,训练集,验证集和测试集会共用同一个词表以及具有相同的目标值,所以在fastNLP中我们使用了 :class:`~fastNLP.io.DataBundle` +来承载同一个任务的多个数据集 :class:`~fastNLP.DataSet` 以及它们的词表 :class:`~fastNLP.Vocabulary` 。下面会有例子介绍 :class:`~fastNLP.io.DataBundle` +的相关使用。 + +:class:`~fastNLP.io.DataBundle` 在fastNLP中主要在各个 :class:`~fastNLP.io.Loader` 和 :class:`~fastNLP.io.Pipe` 中被使用。 +下面我们先介绍一下 :class:`~fastNLP.io.Loader` 和 :class:`~fastNLP.io.Pipe` 。 + +Part II: 加载的各种数据集的Loader +------------------------------------- + +在fastNLP中,所有的 :class:`~fastNLP.io.Loader` 都可以通过其文档判断其支持读取的数据格式,以及读取之后返回的 :class:`~fastNLP.DataSet` 的格式, +例如 :class:`~fastNLP.io.ChnSentiCorpLoader` 。 + + - **download()** 函数:自动将该数据集下载到缓存地址,默认缓存地址为~/.fastNLP/datasets/。由于版权等原因,不是所有的Loader都实现了该方法。该方法会返回下载后文件所处的缓存地址。 + - **_load()** 函数:从一个数据文件中读取数据,返回一个 :class:`~fastNLP.DataSet` 。返回的DataSet的格式可从Loader文档判断。 + - **load()** 函数:从文件或者文件夹中读取数据为 :class:`~fastNLP.DataSet` 并将它们组装成 :class:`~fastNLP.io.DataBundle`。支持接受的参数类型有以下的几种 + + - None, 将尝试读取自动缓存的数据,仅支持提供了自动下载数据的Loader + - 文件夹路径, 默认将尝试在该文件夹下匹配文件名中含有 `train` , `test` , `dev` 的文件,如果有多个文件含有相同的关键字,将无法通过该方式读取 + - dict, 例如{'train':"/path/to/tr.conll", 'dev':"/to/validate.conll", 
"test":"/to/te.conll"}。 + +.. code-block:: python + + from fastNLP.io import CWSLoader + + loader = CWSLoader(dataset_name='pku') + data_bundle = loader.load() + print(data_bundle) + +输出内容为:: + + In total 3 datasets: + dev has 1831 instances. + train has 17223 instances. + test has 1944 instances. + +这里表示一共有3个数据集。其中: + + - 3个数据集的名称分别为train、dev、test,分别有17223、1831、1944个instance + +也可以取出DataSet,并打印DataSet中的具体内容 + +.. code-block:: python + + tr_data = data_bundle.get_dataset('train') + print(tr_data[:2]) + +输出为:: + + +--------------------------------------------------------------------------------------+ + | raw_words | + +--------------------------------------------------------------------------------------+ + | 迈向 充满 希望 的 新 世纪 —— 一九九八年 新年 讲话 ( 附 图片 1 张 ) | + | 中共中央 总书记 、 国家 主席 江 泽民 | + +--------------------------------------------------------------------------------------+ + +Part III: 使用Pipe对数据集进行预处理 +------------------------------------------ +通过 :class:`~fastNLP.io.Loader` 可以将文本数据读入,但并不能直接被神经网络使用,还需要进行一定的预处理。 + +在fastNLP中,我们使用 :class:`~fastNLP.io.Pipe` 的子类作为数据预处理的类, :class:`~fastNLP.io.Loader` 和 :class:`~fastNLP.io.Pipe` 一般具备一一对应的关系,该关系可以从其名称判断, +例如 :class:`~fastNLP.io.CWSLoader` 与 :class:`~fastNLP.io.CWSPipe` 是一一对应的。一般情况下Pipe处理包含以下的几个过程,(1)将raw_words或 +raw_chars进行tokenize以切分成不同的词或字; (2) 再建立词或字的 :class:`~fastNLP.Vocabulary` , 并将词或字转换为index; (3)将target +列建立词表并将target列转为index; + +所有的Pipe都可通过其文档查看该Pipe支持处理的 :class:`~fastNLP.DataSet` 以及返回的 :class:`~fastNLP.io.DataBundle` 中的Vocabulary的情况; +如 :class:`~fastNLP.io.OntoNotesNERPipe` + +各种数据集的Pipe当中,都包含了以下的两个函数: + + - process() 函数:对输入的 :class:`~fastNLP.io.DataBundle` 进行处理, 然后返回处理之后的 :class:`~fastNLP.io.DataBundle` 。process函数的文档中包含了该Pipe支持处理的DataSet的格式。 + - process_from_file() 函数:输入数据集所在文件夹,使用对应的Loader读取数据(所以该函数支持的参数类型是由于其对应的Loader的load函数决定的),然后调用相对应的process函数对数据进行预处理。相当于是把Load和process放在一个函数中执行。 + +接着上面 :class:`~fastNLP.io.CWSLoader` 的例子,我们展示一下 :class:`~fastNLP.io.CWSPipe` 的功能: + +.. code-block:: python + + from fastNLP.io import CWSPipe + + data_bundle = CWSPipe().process(data_bundle) + print(data_bundle) + +输出内容为:: + + In total 3 datasets: + dev has 1831 instances. + train has 17223 instances. + test has 1944 instances. + In total 2 vocabs: + chars has 4777 entries. + target has 4 entries. + +表示一共有3个数据集和2个词表。其中: + + - 3个数据集的名称分别为train、dev、test,分别有17223、1831、1944个instance + - 2个词表分别为chars词表与target词表。其中chars词表为句子文本所构建的词表,一共有4777个不同的字;target词表为目标标签所构建的词表,一共有4种标签。 + +相较于之前CWSLoader读取的DataBundle,新增了两个Vocabulary。 我们可以打印一下处理之后的DataSet + +.. code-block:: python + + tr_data = data_bundle.get_dataset('train') + print(tr_data[:2]) + +输出为:: + + +---------------------------------------------------+------------------------------------+------------------------------------+---------+ + | raw_words | chars | target | seq_len | + +---------------------------------------------------+------------------------------------+------------------------------------+---------+ + | 迈向 充满 希望 的 新 世纪 —— 一九九八年... | [1224, 178, 674, 544, 573, 435,... | [0, 1, 0, 1, 0, 1, 2, 2, 0, 1, ... | 29 | + | 中共中央 总书记 、 国家 主席 江 泽民 | [11, 212, 11, 335, 124, 256, 10... | [0, 3, 3, 1, 0, 3, 1, 2, 0, 1, ... | 15 | + +---------------------------------------------------+------------------------------------+------------------------------------+---------+ + +可以看到有两列为int的field: chars和target。这两列的名称同时也是DataBundle中的Vocabulary的名称。可以通过下列的代码获取并查看Vocabulary的 +信息 + +.. code-block:: python + + vocab = data_bundle.get_vocab('target') + print(vocab) + +输出为:: + + Vocabulary(['B', 'E', 'S', 'M']...) 
+ + +Part IV: fastNLP封装好的Loader和Pipe +------------------------------------------ + +fastNLP封装了多种任务/数据集的 :class:`~fastNLP.io.Loader` 和 :class:`~fastNLP.io.Pipe` 并提供自动下载功能,具体参见文档 +`数据集 `_ + + +Part V: 不同格式类型的基础Loader +-------------------------------------------------------- + +除了上面提到的针对具体任务的Loader,我们还提供了CSV格式和JSON格式的Loader + +:class:`~fastNLP.io.loader.CSVLoader` 读取CSV类型的数据集文件。例子如下: + + .. code-block:: python + + from fastNLP.io.loader import CSVLoader + data_set_loader = CSVLoader( + headers=('raw_words', 'target'), sep='\t' + ) + # 表示将CSV文件中每一行的第一项将填入'raw_words' field,第二项填入'target' field。 + # 其中项之间由'\t'分割开来 + + data_set = data_set_loader._load('path/to/your/file') + + 文件内容样例如下 :: + + But it does not leave you with much . 1 + You could hate it for the same reason . 1 + The performances are an absolute joy . 4 + + 读取之后的DataSet具有以下的field + + .. csv-table:: + :header: raw_words, target + + "But it does not leave you with much .", "1" + "You could hate it for the same reason .", "1" + "The performances are an absolute joy .", "4" + +:class:`~fastNLP.io.JsonLoader` 读取Json类型的数据集文件,数据必须按行存储,每行是一个包含各类属性的Json对象。例子如下: + + .. code-block:: python + + from fastNLP.io.loader import JsonLoader + oader = JsonLoader( + fields={'sentence1': 'raw_words1', 'sentence2': 'raw_words2', 'gold_label': 'target'} + ) + # 表示将Json对象中'sentence1'、'sentence2'和'gold_label'对应的值赋给'raw_words1'、'raw_words2'、'target'这三个fields + + data_set = loader._load('path/to/your/file') + + 数据集内容样例如下 :: + + {"annotator_labels": ["neutral"], "captionID": "3416050480.jpg#4", "gold_label": "neutral", "pairID": "3416050480.jpg#4r1n", "sentence1": "A person on a horse jumps over a broken down airplane.", "sentence1_binary_parse": "( ( ( A person ) ( on ( a horse ) ) ) ( ( jumps ( over ( a ( broken ( down airplane ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN on) (NP (DT a) (NN horse)))) (VP (VBZ jumps) (PP (IN over) (NP (DT a) (JJ broken) (JJ down) (NN airplane)))) (. .)))", "sentence2": "A person is training his horse for a competition.", "sentence2_binary_parse": "( ( A person ) ( ( is ( ( training ( his horse ) ) ( for ( a competition ) ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) (VP (VBG training) (NP (PRP$ his) (NN horse)) (PP (IN for) (NP (DT a) (NN competition))))) (. .)))"} + {"annotator_labels": ["contradiction"], "captionID": "3416050480.jpg#4", "gold_label": "contradiction", "pairID": "3416050480.jpg#4r1c", "sentence1": "A person on a horse jumps over a broken down airplane.", "sentence1_binary_parse": "( ( ( A person ) ( on ( a horse ) ) ) ( ( jumps ( over ( a ( broken ( down airplane ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN on) (NP (DT a) (NN horse)))) (VP (VBZ jumps) (PP (IN over) (NP (DT a) (JJ broken) (JJ down) (NN airplane)))) (. .)))", "sentence2": "A person is at a diner, ordering an omelette.", "sentence2_binary_parse": "( ( A person ) ( ( ( ( is ( at ( a diner ) ) ) , ) ( ordering ( an omelette ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) (PP (IN at) (NP (DT a) (NN diner))) (, ,) (S (VP (VBG ordering) (NP (DT an) (NN omelette))))) (. .)))"} + {"annotator_labels": ["entailment"], "captionID": "3416050480.jpg#4", "gold_label": "entailment", "pairID": "3416050480.jpg#4r1e", "sentence1": "A person on a horse jumps over a broken down airplane.", "sentence1_binary_parse": "( ( ( A person ) ( on ( a horse ) ) ) ( ( jumps ( over ( a ( broken ( down airplane ) ) ) ) ) . 
) )", "sentence1_parse": "(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN on) (NP (DT a) (NN horse)))) (VP (VBZ jumps) (PP (IN over) (NP (DT a) (JJ broken) (JJ down) (NN airplane)))) (. .)))", "sentence2": "A person is outdoors, on a horse.", "sentence2_binary_parse": "( ( A person ) ( ( ( ( is outdoors ) , ) ( on ( a horse ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) (ADVP (RB outdoors)) (, ,) (PP (IN on) (NP (DT a) (NN horse)))) (. .)))"} + + 读取之后的DataSet具有以下的field + + .. csv-table:: + :header: raw_words0, raw_words1, target + + "A person on a horse jumps over a broken down airplane.", "A person is training his horse for a competition.", "neutral" + "A person on a horse jumps over a broken down airplane.", "A person is at a diner, ordering an omelette.", "contradiction" + "A person on a horse jumps over a broken down airplane.", "A person is outdoors, on a horse.", "entailment" diff --git a/docs/source/tutorials/tutorial_4_loss_optimizer.rst b/docs/source/tutorials/tutorial_4_loss_optimizer.rst deleted file mode 100644 index a6e1730a..00000000 --- a/docs/source/tutorials/tutorial_4_loss_optimizer.rst +++ /dev/null @@ -1,267 +0,0 @@ -============================================================================== -动手实现一个文本分类器I-使用Trainer和Tester快速训练和测试 -============================================================================== - -我们使用和 :doc:`/user/quickstart` 中一样的任务来进行详细的介绍。给出一段评价性文字,预测其情感倾向是积极(label=1)、 -消极(label=0)还是中性(label=2),使用 :class:`~fastNLP.Trainer` 和 :class:`~fastNLP.Tester` 来进行快速训练和测试。 - --------------- -数据处理 --------------- - -数据读入 - 我们可以使用 fastNLP :mod:`fastNLP.io` 模块中的 :class:`~fastNLP.io.SSTLoader` 类,轻松地读取SST数据集(数据来源:https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip)。 - 这里的 dataset 是 fastNLP 中 :class:`~fastNLP.DataSet` 类的对象。 - - .. code-block:: python - - from fastNLP.io import SSTLoader - - loader = SSTLoader() - #这里的all.txt是下载好数据后train.txt、dev.txt、test.txt的组合 - dataset = loader.load("./trainDevTestTrees_PTB/trees/all.txt") - print(dataset[0]) - - 输出数据如下:: - - {'words': ['It', "'s", 'a', 'lovely', 'film', 'with', 'lovely', 'performances', 'by', 'Buy', 'and', 'Accorsi', '.'] type=list, - 'target': positive type=str} - - 除了读取数据外,fastNLP 还提供了读取其它文件类型的 Loader 类、读取 Embedding的 Loader 等。详见 :doc:`/fastNLP.io` 。 - - -数据处理 - 我们使用 :class:`~fastNLP.DataSet` 类的 :meth:`~fastNLP.DataSet.apply` 方法将 ``target`` :mod:`~fastNLP.core.field` 转化为整数。 - - .. code-block:: python - - def label_to_int(x): - if x['target']=="positive": - return 1 - elif x['target']=="negative": - return 0 - else: - return 2 - - # 将label转为整数 - dataset.apply(lambda x: label_to_int(x), new_field_name='target') - - ``words`` 和 ``target`` 已经足够用于 :class:`~fastNLP.models.CNNText` 的训练了,但我们从其文档 - :class:`~fastNLP.models.CNNText` 中看到,在 :meth:`~fastNLP.models.CNNText.forward` 的时候,还可以传入可选参数 ``seq_len`` 。 - 所以,我们再使用 :meth:`~fastNLP.DataSet.apply_field` 方法增加一个名为 ``seq_len`` 的 :mod:`~fastNLP.core.field` 。 - - .. code-block:: python - - # 增加长度信息 - dataset.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len') - - 观察可知: :meth:`~fastNLP.DataSet.apply_field` 与 :meth:`~fastNLP.DataSet.apply` 类似, - 但所传入的 `lambda` 函数是针对一个 :class:`~fastNLP.Instance` 中的一个 :mod:`~fastNLP.core.field` 的; - 而 :meth:`~fastNLP.DataSet.apply` 所传入的 `lambda` 函数是针对整个 :class:`~fastNLP.Instance` 的。 - - .. 
note:: - `lambda` 函数即匿名函数,是 Python 的重要特性。 ``lambda x: len(x)`` 和下面的这个函数的作用相同:: - - def func_lambda(x): - return len(x) - - 你也可以编写复杂的函数做为 :meth:`~fastNLP.DataSet.apply_field` 与 :meth:`~fastNLP.DataSet.apply` 的参数 - -Vocabulary 的使用 - 我们再用 :class:`~fastNLP.Vocabulary` 类来统计数据中出现的单词,并使用 :meth:`~fastNLP.Vocabulary.index_dataset` - 将单词序列转化为训练可用的数字序列。 - - .. code-block:: python - - from fastNLP import Vocabulary - - # 使用Vocabulary类统计单词,并将单词序列转化为数字序列 - vocab = Vocabulary(min_freq=2).from_dataset(dataset, field_name='words') - vocab.index_dataset(dataset, field_name='words',new_field_name='words') - print(dataset[0]) - - 输出数据如下:: - - {'words': [27, 9, 6, 913, 16, 18, 913, 124, 31, 5715, 5, 1, 2] type=list, - 'target': 1 type=int, - 'seq_len': 13 type=int} - - ---------------------- -使用内置模型训练 ---------------------- - -内置模型的输入输出命名 - fastNLP内置了一些完整的神经网络模型,详见 :doc:`/fastNLP.models` , 我们使用其中的 :class:`~fastNLP.models.CNNText` 模型进行训练。 - 为了使用内置的 :class:`~fastNLP.models.CNNText`,我们必须修改 :class:`~fastNLP.DataSet` 中 :mod:`~fastNLP.core.field` 的名称。 - 在这个例子中模型输入 (forward方法的参数) 为 ``words`` 和 ``seq_len`` ; 预测输出为 ``pred`` ;标准答案为 ``target`` 。 - 具体的命名规范可以参考 :doc:`/fastNLP.core.const` 。 - - 如果不想查看文档,您也可以使用 :class:`~fastNLP.Const` 类进行命名。下面的代码展示了给 :class:`~fastNLP.DataSet` 中 - :mod:`~fastNLP.core.field` 改名的 :meth:`~fastNLP.DataSet.rename_field` 方法,以及 :class:`~fastNLP.Const` 类的使用方法。 - - .. code-block:: python - - from fastNLP import Const - - dataset.rename_field('words', Const.INPUT) - dataset.rename_field('seq_len', Const.INPUT_LEN) - dataset.rename_field('target', Const.TARGET) - - print(Const.INPUT) - print(Const.INPUT_LEN) - print(Const.TARGET) - print(Const.OUTPUT) - - 输出结果为:: - - words - seq_len - target - pred - - 在给 :class:`~fastNLP.DataSet` 中 :mod:`~fastNLP.core.field` 改名后,我们还需要设置训练所需的输入和目标,这里使用的是 - :meth:`~fastNLP.DataSet.set_input` 和 :meth:`~fastNLP.DataSet.set_target` 两个函数。 - - .. code-block:: python - - #使用dataset的 set_input 和 set_target函数,告诉模型dataset中那些数据是输入,那些数据是标签(目标输出) - dataset.set_input(Const.INPUT, Const.INPUT_LEN) - dataset.set_target(Const.TARGET) - -数据集分割 - 除了修改 :mod:`~fastNLP.core.field` 之外,我们还可以对 :class:`~fastNLP.DataSet` 进行分割,以供训练、开发和测试使用。 - 下面这段代码展示了 :meth:`~fastNLP.DataSet.split` 的使用方法 - - .. code-block:: python - - train_dev_data, test_data = dataset.split(0.1) - train_data, dev_data = train_dev_data.split(0.1) - print(len(train_data), len(dev_data), len(test_data)) - - 输出结果为:: - - 9603 1067 1185 - -评价指标 - 训练模型需要提供一个评价指标。这里使用准确率做为评价指标。参数的 `命名规则` 跟上面类似。 - ``pred`` 参数对应的是模型的 forward 方法返回的 dict 中的一个 key 的名字。 - ``target`` 参数对应的是 :class:`~fastNLP.DataSet` 中作为标签的 :mod:`~fastNLP.core.field` 的名字。 - - .. code-block:: python - - from fastNLP import AccuracyMetric - - # metrics=AccuracyMetric() 在本例中与下面这行代码等价 - metrics=AccuracyMetric(pred=Const.OUTPUT, target=Const.TARGET) - -损失函数 - 训练模型需要提供一个损失函数 - ,fastNLP中提供了直接可以导入使用的四种loss,分别为: - * :class:`~fastNLP.CrossEntropyLoss`:包装了torch.nn.functional.cross_entropy()函数,返回交叉熵损失(可以运用于多分类场景) - * :class:`~fastNLP.BCELoss`:包装了torch.nn.functional.binary_cross_entropy()函数,返回二分类的交叉熵 - * :class:`~fastNLP.L1Loss`:包装了torch.nn.functional.l1_loss()函数,返回L1 损失 - * :class:`~fastNLP.NLLLoss`:包装了torch.nn.functional.nll_loss()函数,返回负对数似然损失 - - 下面提供了一个在分类问题中常用的交叉熵损失。注意它的 **初始化参数** 。 - ``pred`` 参数对应的是模型的 forward 方法返回的 dict 中的一个 key 的名字。 - ``target`` 参数对应的是 :class:`~fastNLP.DataSet` 中作为标签的 :mod:`~fastNLP.core.field` 的名字。 - 这里我们用 :class:`~fastNLP.Const` 来辅助命名,如果你自己编写模型中 forward 方法的返回值或 - 数据集中 :mod:`~fastNLP.core.field` 的名字与本例不同, 你可以把 ``pred`` 参数和 ``target`` 参数设定符合自己代码的值。 - - .. 
code-block:: python - - from fastNLP import CrossEntropyLoss - - # loss = CrossEntropyLoss() 在本例中与下面这行代码等价 - loss = CrossEntropyLoss(pred=Const.OUTPUT, target=Const.TARGET) - -优化器 - 定义模型运行的时候使用的优化器,可以使用fastNLP包装好的优化器: - - * :class:`~fastNLP.SGD` :包装了torch.optim.SGD优化器 - * :class:`~fastNLP.Adam` :包装了torch.optim.Adam优化器 - - 也可以直接使用torch.optim.Optimizer中的优化器,并在实例化 :class:`~fastNLP.Trainer` 类的时候传入优化器实参 - - .. code-block:: python - - import torch.optim as optim - from fastNLP import Adam - - #使用 torch.optim 定义优化器 - optimizer_1=optim.RMSprop(model_cnn.parameters(), lr=0.01, alpha=0.99, eps=1e-08, weight_decay=0, momentum=0, centered=False) - #使用fastNLP中包装的 Adam 定义优化器 - optimizer_2=Adam(lr=4e-3, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, model_params=model_cnn.parameters()) - -快速训练 - 现在我们可以导入 fastNLP 内置的文本分类模型 :class:`~fastNLP.models.CNNText` ,并使用 :class:`~fastNLP.Trainer` 进行训练, - 除了使用 :class:`~fastNLP.Trainer`进行训练,我们也可以通过使用 :class:`~fastNLP.DataSetIter` 来编写自己的训练过程,具体见 :doc:`/tutorials/tutorial_5_datasetiter` - - .. code-block:: python - - from fastNLP.models import CNNText - - #词嵌入的维度、训练的轮数和batch size - EMBED_DIM = 100 - N_EPOCHS = 10 - BATCH_SIZE = 16 - - #使用CNNText的时候第一个参数输入一个tuple,作为模型定义embedding的参数 - #还可以传入 kernel_nums, kernel_sizes, padding, dropout的自定义值 - model_cnn = CNNText((len(vocab),EMBED_DIM), num_classes=3, padding=2, dropout=0.1) - - #如果在定义trainer的时候没有传入optimizer参数,模型默认的优化器为torch.optim.Adam且learning rate为lr=4e-3 - #这里只使用了optimizer_1作为优化器输入,感兴趣可以尝试optimizer_2或者其他优化器作为输入 - #这里只使用了loss作为损失函数输入,感兴趣可以尝试其他损失函数输入 - trainer = Trainer(model=model_cnn, train_data=train_data, dev_data=dev_data, loss=loss, metrics=metrics, - optimizer=optimizer_1,n_epochs=N_EPOCHS, batch_size=BATCH_SIZE) - trainer.train() - - 训练过程的输出如下:: - - input fields after batch(if batch size is 2): - words: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 40]) - seq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) - target fields after batch(if batch size is 2): - target: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) - - training epochs started 2019-07-08-15-44-48 - Evaluation at Epoch 1/10. Step:601/6010. AccuracyMetric: acc=0.59044 - - Evaluation at Epoch 2/10. Step:1202/6010. AccuracyMetric: acc=0.599813 - - Evaluation at Epoch 3/10. Step:1803/6010. AccuracyMetric: acc=0.508903 - - Evaluation at Epoch 4/10. Step:2404/6010. AccuracyMetric: acc=0.596064 - - Evaluation at Epoch 5/10. Step:3005/6010. AccuracyMetric: acc=0.47985 - - Evaluation at Epoch 6/10. Step:3606/6010. AccuracyMetric: acc=0.589503 - - Evaluation at Epoch 7/10. Step:4207/6010. AccuracyMetric: acc=0.311153 - - Evaluation at Epoch 8/10. Step:4808/6010. AccuracyMetric: acc=0.549203 - - Evaluation at Epoch 9/10. Step:5409/6010. AccuracyMetric: acc=0.581068 - - Evaluation at Epoch 10/10. Step:6010/6010. AccuracyMetric: acc=0.523899 - - - In Epoch:2/Step:1202, got best dev performance:AccuracyMetric: acc=0.599813 - Reloaded the best model. - -快速测试 - 与 :class:`~fastNLP.Trainer` 对应,fastNLP 也提供了 :class:`~fastNLP.Tester` 用于快速测试,用法如下 - - .. 
code-block:: python - - from fastNLP import Tester - - tester = Tester(test_data, model_cnn, metrics=AccuracyMetric()) - tester.test() - - 训练过程输出如下:: - - [tester] - AccuracyMetric: acc=0.565401 diff --git a/docs/source/tutorials/tutorial_5_datasetiter.rst b/docs/source/tutorials/tutorial_5_datasetiter.rst deleted file mode 100644 index 23d26deb..00000000 --- a/docs/source/tutorials/tutorial_5_datasetiter.rst +++ /dev/null @@ -1,250 +0,0 @@ -============================================================================== -动手实现一个文本分类器II-使用DataSetIter实现自定义训练过程 -============================================================================== - -我们使用和 :doc:`/user/quickstart` 中一样的任务来进行详细的介绍。给出一段评价性文字,预测其情感倾向是积极(label=1)、 -消极(label=0)还是中性(label=2),使用 :class:`~fastNLP.DataSetIter` 类来编写自己的训练过程。 -自己编写训练过程之前的内容与 :doc:`/tutorials/tutorial_4_loss_optimizer` 中的完全一样,如已经阅读过可以跳过。 - --------------- -数据处理 --------------- - -数据读入 - 我们可以使用 fastNLP :mod:`fastNLP.io` 模块中的 :class:`~fastNLP.io.SSTLoader` 类,轻松地读取SST数据集(数据来源:https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip)。 - 这里的 dataset 是 fastNLP 中 :class:`~fastNLP.DataSet` 类的对象。 - - .. code-block:: python - - from fastNLP.io import SSTLoader - - loader = SSTLoader() - #这里的all.txt是下载好数据后train.txt、dev.txt、test.txt的组合 - dataset = loader.load("./trainDevTestTrees_PTB/trees/all.txt") - print(dataset[0]) - - 输出数据如下:: - - {'words': ['It', "'s", 'a', 'lovely', 'film', 'with', 'lovely', 'performances', 'by', 'Buy', 'and', 'Accorsi', '.'] type=list, - 'target': positive type=str} - - 除了读取数据外,fastNLP 还提供了读取其它文件类型的 Loader 类、读取 Embedding的 Loader 等。详见 :doc:`/fastNLP.io` 。 - - -数据处理 - 我们使用 :class:`~fastNLP.DataSet` 类的 :meth:`~fastNLP.DataSet.apply` 方法将 ``target`` :mod:`~fastNLP.core.field` 转化为整数。 - - .. code-block:: python - - def label_to_int(x): - if x['target']=="positive": - return 1 - elif x['target']=="negative": - return 0 - else: - return 2 - - # 将label转为整数 - dataset.apply(lambda x: label_to_int(x), new_field_name='target') - - ``words`` 和 ``target`` 已经足够用于 :class:`~fastNLP.models.CNNText` 的训练了,但我们从其文档 - :class:`~fastNLP.models.CNNText` 中看到,在 :meth:`~fastNLP.models.CNNText.forward` 的时候,还可以传入可选参数 ``seq_len`` 。 - 所以,我们再使用 :meth:`~fastNLP.DataSet.apply_field` 方法增加一个名为 ``seq_len`` 的 :mod:`~fastNLP.core.field` 。 - - .. code-block:: python - - # 增加长度信息 - dataset.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len') - - 观察可知: :meth:`~fastNLP.DataSet.apply_field` 与 :meth:`~fastNLP.DataSet.apply` 类似, - 但所传入的 `lambda` 函数是针对一个 :class:`~fastNLP.Instance` 中的一个 :mod:`~fastNLP.core.field` 的; - 而 :meth:`~fastNLP.DataSet.apply` 所传入的 `lambda` 函数是针对整个 :class:`~fastNLP.Instance` 的。 - - .. note:: - `lambda` 函数即匿名函数,是 Python 的重要特性。 ``lambda x: len(x)`` 和下面的这个函数的作用相同:: - - def func_lambda(x): - return len(x) - - 你也可以编写复杂的函数做为 :meth:`~fastNLP.DataSet.apply_field` 与 :meth:`~fastNLP.DataSet.apply` 的参数 - -Vocabulary 的使用 - 我们再用 :class:`~fastNLP.Vocabulary` 类来统计数据中出现的单词,并使用 :meth:`~fastNLP.Vocabulary.index_dataset` - 将单词序列转化为训练可用的数字序列。 - - .. 
code-block:: python - - from fastNLP import Vocabulary - - # 使用Vocabulary类统计单词,并将单词序列转化为数字序列 - vocab = Vocabulary(min_freq=2).from_dataset(dataset, field_name='words') - vocab.index_dataset(dataset, field_name='words',new_field_name='words') - print(dataset[0]) - - 输出数据如下:: - - {'words': [27, 9, 6, 913, 16, 18, 913, 124, 31, 5715, 5, 1, 2] type=list, - 'target': 1 type=int, - 'seq_len': 13 type=int} - - ---------------------- -使用内置模型训练 ---------------------- - -内置模型的输入输出命名 - fastNLP内置了一些完整的神经网络模型,详见 :doc:`/fastNLP.models` , 我们使用其中的 :class:`~fastNLP.models.CNNText` 模型进行训练。 - 为了使用内置的 :class:`~fastNLP.models.CNNText`,我们必须修改 :class:`~fastNLP.DataSet` 中 :mod:`~fastNLP.core.field` 的名称。 - 在这个例子中模型输入 (forward方法的参数) 为 ``words`` 和 ``seq_len`` ; 预测输出为 ``pred`` ;标准答案为 ``target`` 。 - 具体的命名规范可以参考 :doc:`/fastNLP.core.const` 。 - - 如果不想查看文档,您也可以使用 :class:`~fastNLP.Const` 类进行命名。下面的代码展示了给 :class:`~fastNLP.DataSet` 中 - :mod:`~fastNLP.core.field` 改名的 :meth:`~fastNLP.DataSet.rename_field` 方法,以及 :class:`~fastNLP.Const` 类的使用方法。 - - .. code-block:: python - - from fastNLP import Const - - dataset.rename_field('words', Const.INPUT) - dataset.rename_field('seq_len', Const.INPUT_LEN) - dataset.rename_field('target', Const.TARGET) - - print(Const.INPUT) - print(Const.INPUT_LEN) - print(Const.TARGET) - print(Const.OUTPUT) - - 输出结果为:: - - words - seq_len - target - pred - - 在给 :class:`~fastNLP.DataSet` 中 :mod:`~fastNLP.core.field` 改名后,我们还需要设置训练所需的输入和目标,这里使用的是 - :meth:`~fastNLP.DataSet.set_input` 和 :meth:`~fastNLP.DataSet.set_target` 两个函数。 - - .. code-block:: python - - #使用dataset的 set_input 和 set_target函数,告诉模型dataset中那些数据是输入,那些数据是标签(目标输出) - dataset.set_input(Const.INPUT, Const.INPUT_LEN) - dataset.set_target(Const.TARGET) - -数据集分割 - 除了修改 :mod:`~fastNLP.core.field` 之外,我们还可以对 :class:`~fastNLP.DataSet` 进行分割,以供训练、开发和测试使用。 - 下面这段代码展示了 :meth:`~fastNLP.DataSet.split` 的使用方法 - - .. code-block:: python - - train_dev_data, test_data = dataset.split(0.1) - train_data, dev_data = train_dev_data.split(0.1) - print(len(train_data), len(dev_data), len(test_data)) - - 输出结果为:: - - 9603 1067 1185 - -评价指标 - 训练模型需要提供一个评价指标。这里使用准确率做为评价指标。参数的 `命名规则` 跟上面类似。 - ``pred`` 参数对应的是模型的 forward 方法返回的 dict 中的一个 key 的名字。 - ``target`` 参数对应的是 :class:`~fastNLP.DataSet` 中作为标签的 :mod:`~fastNLP.core.field` 的名字。 - - .. code-block:: python - - from fastNLP import AccuracyMetric - - # metrics=AccuracyMetric() 在本例中与下面这行代码等价 - metrics=AccuracyMetric(pred=Const.OUTPUT, target=Const.TARGET) - - --------------------------- -自己编写训练过程 --------------------------- - 如果你想用类似 PyTorch 的使用方法,自己编写训练过程,你可以参考下面这段代码。 - 其中使用了 fastNLP 提供的 :class:`~fastNLP.DataSetIter` 来获得小批量训练的小批量数据, - 使用 :class:`~fastNLP.BucketSampler` 做为 :class:`~fastNLP.DataSetIter` 的参数来选择采样的方式。 - -DataSetIter - fastNLP定义的 :class:`~fastNLP.DataSetIter` 类,用于定义一个batch,并实现batch的多种功能,在初始化时传入的参数有: - - * dataset: :class:`~fastNLP.DataSet` 对象, 数据集 - * batch_size: 取出的batch大小 - * sampler: 规定使用的 :class:`~fastNLP.Sampler` 若为 None, 使用 :class:`~fastNLP.RandomSampler` (Default: None) - * as_numpy: 若为 True, 输出batch为 `numpy.array`. 否则为 `torch.Tensor` (Default: False) - * prefetch: 若为 True使用多进程预先取出下一batch. (Default: False) - -sampler - fastNLP 实现的采样器有: - - * :class:`~fastNLP.BucketSampler` 可以随机地取出长度相似的元素 【初始化参数: num_buckets:bucket的数量; batch_size:batch大小; seq_len_field_name:dataset中对应序列长度的 :mod:`~fastNLP.core.field` 的名字】 - * SequentialSampler: 顺序取出元素的采样器【无初始化参数】 - * RandomSampler:随机化取元素的采样器【无初始化参数】 - - 以下代码使用BucketSampler作为 :class:`~fastNLP.DataSetIter` 初始化的输入,运用 :class:`~fastNLP.DataSetIter` 自己写训练程序 - - .. 
code-block:: python - - from fastNLP import BucketSampler - from fastNLP import DataSetIter - from fastNLP.models import CNNText - from fastNLP import Tester - import torch - import time - - embed_dim = 100 - model = CNNText((len(vocab),embed_dim), num_classes=3, padding=2, dropout=0.1) - - def train(epoch, data, devdata): - optimizer = torch.optim.Adam(model.parameters(), lr=0.001) - lossfunc = torch.nn.CrossEntropyLoss() - batch_size = 32 - - # 定义一个Batch,传入DataSet,规定batch_size和去batch的规则。 - # 顺序(Sequential),随机(Random),相似长度组成一个batch(Bucket) - train_sampler = BucketSampler(batch_size=batch_size, seq_len_field_name='seq_len') - train_batch = DataSetIter(batch_size=batch_size, dataset=data, sampler=train_sampler) - - start_time = time.time() - print("-"*5+"start training"+"-"*5) - for i in range(epoch): - loss_list = [] - for batch_x, batch_y in train_batch: - optimizer.zero_grad() - output = model(batch_x['words']) - loss = lossfunc(output['pred'], batch_y['target']) - loss.backward() - optimizer.step() - loss_list.append(loss.item()) - - #这里verbose如果为0,在调用Tester对象的test()函数时不输出任何信息,返回评估信息; 如果为1,打印出验证结果,返回评估信息 - #在调用过Tester对象的test()函数后,调用其_format_eval_results(res)函数,结构化输出验证结果 - tester_tmp = Tester(devdata, model, metrics=AccuracyMetric(), verbose=0) - res=tester_tmp.test() - - print('Epoch {:d} Avg Loss: {:.2f}'.format(i, sum(loss_list) / len(loss_list)),end=" ") - print(tester._format_eval_results(res),end=" ") - print('{:d}ms'.format(round((time.time()-start_time)*1000))) - loss_list.clear() - - train(10, train_data, dev_data) - #使用tester进行快速测试 - tester = Tester(test_data, model, metrics=AccuracyMetric()) - tester.test() - - 这段代码的输出如下:: - - -----start training----- - Epoch 0 Avg Loss: 1.09 AccuracyMetric: acc=0.480787 58989ms - Epoch 1 Avg Loss: 1.00 AccuracyMetric: acc=0.500469 118348ms - Epoch 2 Avg Loss: 0.93 AccuracyMetric: acc=0.536082 176220ms - Epoch 3 Avg Loss: 0.87 AccuracyMetric: acc=0.556701 236032ms - Epoch 4 Avg Loss: 0.78 AccuracyMetric: acc=0.562324 294351ms - Epoch 5 Avg Loss: 0.69 AccuracyMetric: acc=0.58388 353673ms - Epoch 6 Avg Loss: 0.60 AccuracyMetric: acc=0.574508 412106ms - Epoch 7 Avg Loss: 0.51 AccuracyMetric: acc=0.589503 471097ms - Epoch 8 Avg Loss: 0.44 AccuracyMetric: acc=0.581068 529174ms - Epoch 9 Avg Loss: 0.39 AccuracyMetric: acc=0.572634 586216ms - [tester] - AccuracyMetric: acc=0.527426 - - diff --git a/docs/source/tutorials/tutorial_5_loss_optimizer.rst b/docs/source/tutorials/tutorial_5_loss_optimizer.rst new file mode 100644 index 00000000..081fed2e --- /dev/null +++ b/docs/source/tutorials/tutorial_5_loss_optimizer.rst @@ -0,0 +1,237 @@ +============================================================================== +动手实现一个文本分类器I-使用Trainer和Tester快速训练和测试 +============================================================================== + +我们使用和 :doc:`/user/quickstart` 中一样的任务来进行详细的介绍。给出一段评价性文字,预测其情感倾向是积极的(label=0)、 +还是消极的(label=1),使用 :class:`~fastNLP.Trainer` 和 :class:`~fastNLP.Tester` 来进行快速训练和测试。 + +数据读入和处理 +----------------- + +数据读入 + 我们可以使用 fastNLP :mod:`fastNLP.io` 模块中的 :class:`~fastNLP.io.SST2Pipe` 类,轻松地读取以及预处理SST2数据集。:class:`~fastNLP.io.SST2Pipe` 对象的 + :meth:`~fastNLP.io.SST2Pipe.process_from_file` 方法能够对读入的SST2数据集进行数据的预处理,方法的参数为paths, 指要处理的文件所在目录,如果paths为None,则会自动下载数 据集,函数默认paths值为None。 + 此函数返回一个 :class:`~fastNLP.io.DataBundle`,包含SST2数据集的训练集、测试集、验证集以及source端和target端的字典。其训练、测试、验证数据集含有四个 :mod:`~fastNLP.core.field` : + + * raw_words: 原source句子 + * target: 标签值 + * words: index之后的raw_words + * seq_len: 句子长度 + + 读入数据代码如下: + + .. 
code-block:: python + + from fastNLP.io import SST2Pipe + + pipe = SST2Pipe() + databundle = pipe.process_from_file() + vocab = databundle.get_vocab('words') + print(databundle) + print(databundle.get_dataset('train')[0]) + print(databundle.get_vocab('words')) + + + 输出数据如下:: + + In total 3 datasets: + test has 1821 instances. + train has 67349 instances. + dev has 872 instances. + In total 2 vocabs: + words has 16293 entries. + target has 2 entries. + + +-------------------------------------------+--------+--------------------------------------+---------+ + | raw_words | target | words | seq_len | + +-------------------------------------------+--------+--------------------------------------+---------+ + | hide new secretions from the parental ... | 1 | [4111, 98, 12010, 38, 2, 6844, 9042] | 7 | + +-------------------------------------------+--------+--------------------------------------+---------+ + + Vocabulary(['hide', 'new', 'secretions', 'from', 'the']...) + + 除了可以对数据进行读入的Pipe类,fastNLP还提供了读入和下载数据的Loader类,不同数据集的Pipe和Loader及其用法详见 :doc:` ` 。 + +数据集分割 + 由于SST2数据集的测试集并不带有标签数值,故我们分割出一部分训练集作为测试集。下面这段代码展示了 :meth:`~fastNLP.DataSet.split` 的使用方法 + + .. code-block:: python + + train_data = databundle.get_dataset('train') + train_data, test_data = train_data.split(0.015) + dev_data = databundle.get_dataset('dev') + print(len(train_data),len(dev_data),len(test_data)) + + 输出结果为:: + + 66339 872 1010 + +数据集 :meth:`~fastNLP.DataSet.set_input` 和 :meth:`~fastNLP.DataSet.set_target` 函数 + :class:`~fastNLP.io.SST2Pipe` 类的 :meth:`~fastNLP.io.SST2Pipe.process_from_file` 方法在预处理过程中还将训练、测试、验证 + 集的 `words` 、`seq_len` :mod:`~fastNLP.core.field` 设定为input,同时将 `target` :mod:`~fastNLP.core.field` 设定 + 为target。我们可以通过 :class:`~fastNLP.core.Dataset` 类的 :meth:`~fastNLP.core.Dataset.print_field_meta` 方法查看各个 + :mod:`~fastNLP.core.field` 的设定情况,代码如下: + + .. code-block:: python + + train_data.print_field_meta() + + 输出结果为:: + + +-------------+-----------+--------+-------+---------+ + | field_names | raw_words | target | words | seq_len | + +-------------+-----------+--------+-------+---------+ + | is_input | False | False | True | True | + | is_target | False | True | False | False | + | ignore_type | | False | False | False | + | pad_value | | 0 | 0 | 0 | + +-------------+-----------+--------+-------+---------+ + + 其中is_input和is_target分别表示是否为input和target。ignore_type为true时指使用 :class:`~fastNLP.DataSetIter` 取出batch数 + 据时fastNLP不会进行自动padding,pad_value指对应 :mod:`~fastNLP.core.field` padding所用的值,这两者只有 + 当 :mod:`~fastNLP.core.field` 设定为input或者target的时候才有存在的意义。 + + is_input为true的 :mod:`~fastNLP.core.field` 在 :class:`~fastNLP.DataSetIter` 迭代取出的batch_x 中,而is_target为true + 的 :mod:`~fastNLP.core.field` 在:class:`~fastNLP.DataSetIter` 迭代取出的 batch_y 中。 + 具体分析见 :doc:`使用DataSetIter实现自定义训练过程 ` 。 + +使用内置模型训练 +--------------------- +模型定义和初始化 + 我们可以导入 fastNLP 内置的文本分类模型 :class:`~fastNLP.models.CNNText` 来对模型进行定义,代码如下: + + .. 
code-block:: python + + from fastNLP.models import CNNText + + #词嵌入的维度 + EMBED_DIM = 100 + + #使用CNNText的时候第一个参数输入一个tuple,作为模型定义embedding的参数 + #还可以传入 kernel_nums, kernel_sizes, padding, dropout的自定义值 + model_cnn = CNNText((len(vocab),EMBED_DIM), num_classes=2, dropout=0.1) + + 使用fastNLP快速搭建自己的模型详见 :doc:`` 。 + +评价指标 + 训练模型需要提供一个评价指标。这里使用准确率做为评价指标。 + + * ``pred`` 参数对应的是模型的 forward 方法返回的 dict 中的一个 key 的名字。 + * ``target`` 参数对应的是 :class:`~fastNLP.DataSet` 中作为标签的 :mod:`~fastNLP.core.field` 的名字。 + + 这里我们用 :class:`~fastNLP.Const` 来辅助命名,如果你自己编写模型中 forward 方法的返回值或 + 数据集中 :mod:`~fastNLP.core.field` 的名字与本例不同, 你可以把 ``pred`` 参数和 ``target`` 参数设定符合自己代码的值。代码如下: + + .. code-block:: python + + from fastNLP import AccuracyMetric + from fastNLP import Const + + # metrics=AccuracyMetric() 在本例中与下面这行代码等价 + metrics=AccuracyMetric(pred=Const.OUTPUT, target=Const.TARGET) + + +损失函数 + 训练模型需要提供一个损失函数 + ,fastNLP中提供了直接可以导入使用的四种loss,分别为: + + * :class:`~fastNLP.CrossEntropyLoss`:包装了torch.nn.functional.cross_entropy()函数,返回交叉熵损失(可以运用于多分类场景) + * :class:`~fastNLP.BCELoss`:包装了torch.nn.functional.binary_cross_entropy()函数,返回二分类的交叉熵 + * :class:`~fastNLP.L1Loss`:包装了torch.nn.functional.l1_loss()函数,返回L1 损失 + * :class:`~fastNLP.NLLLoss`:包装了torch.nn.functional.nll_loss()函数,返回负对数似然损失 + + 下面提供了一个在分类问题中常用的交叉熵损失。注意它的 **初始化参数** 。 + + * ``pred`` 参数对应的是模型的 forward 方法返回的 dict 中的一个 key 的名字。 + * ``target`` 参数对应的是 :class:`~fastNLP.DataSet` 中作为标签的 :mod:`~fastNLP.core.field` 的名字。 + + 这里我们用 :class:`~fastNLP.Const` 来辅助命名,如果你自己编写模型中 forward 方法的返回值或 + 数据集中 :mod:`~fastNLP.core.field` 的名字与本例不同, 你可以把 ``pred`` 参数和 ``target`` 参数设定符合自己代码的值。 + + .. code-block:: python + + from fastNLP import CrossEntropyLoss + + # loss = CrossEntropyLoss() 在本例中与下面这行代码等价 + loss = CrossEntropyLoss(pred=Const.OUTPUT, target=Const.TARGET) + + 除了使用fastNLP已经包装好的了损失函数,也可以通过fastNLP中的LossFunc类来构建自己的损失函数,方法如下: + + .. code-block:: python + + # 这表示构建了一个损失函数类,由func计算损失函数,其中将从模型返回值或者DataSet的target=True的field + # 当中找到一个参数名为`pred`的参数传入func一个参数名为`input`的参数;找到一个参数名为`label`的参数 + # 传入func作为一个名为`target`的参数 + #下面自己构建了一个交叉熵函数,和之后直接使用fastNLP中的交叉熵函数是一个效果 + import torch + from fastNLP import LossFunc + func = torch.nn.functional.cross_entropy + loss_func = LossFunc(func, input=Const.OUTPUT, target=Const.TARGET) + +优化器 + 定义模型运行的时候使用的优化器,可以直接使用torch.optim.Optimizer中的优化器,并在实例化 :class:`~fastNLP.Trainer` 类的时候传入优化器实参 + + .. code-block:: python + + import torch.optim as optim + + #使用 torch.optim 定义优化器 + optimizer=optim.RMSprop(model_cnn.parameters(), lr=0.01, alpha=0.99, eps=1e-08, weight_decay=0, momentum=0, centered=False) + +快速训练 + 现在我们对上面定义的模型使用 :class:`~fastNLP.Trainer` 进行训练。 + 除了使用 :class:`~fastNLP.Trainer`进行训练,我们也可以通过使用 :class:`~fastNLP.DataSetIter` 来编写自己的训练过程,具体见 :doc:`/tutorials/tutorial_6_datasetiter` + + .. 
code-block:: python + + from fastNLP import Trainer + + #训练的轮数和batch size + N_EPOCHS = 10 + BATCH_SIZE = 16 + + #如果在定义trainer的时候没有传入optimizer参数,模型默认的优化器为torch.optim.Adam且learning rate为lr=4e-3 + #这里只使用了loss作为损失函数输入,感兴趣可以尝试其他损失函数(如之前自定义的loss_func)作为输入 + trainer = Trainer(model=model_cnn, train_data=train_data, dev_data=dev_data, loss=loss, metrics=metrics, + optimizer=optimizer,n_epochs=N_EPOCHS, batch_size=BATCH_SIZE) + trainer.train() + + 训练过程的输出如下:: + + input fields after batch(if batch size is 2): + words: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 16]) + seq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) + target fields after batch(if batch size is 2): + target: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) + + training epochs started 2019-09-17-14-29-00 + + Evaluate data in 0.11 seconds! + Evaluation on dev at Epoch 1/10. Step:4147/41470: + AccuracyMetric: acc=0.762615 + + ... + + Evaluate data in 0.2 seconds! + Evaluation on dev at Epoch 10/10. Step:41470/41470: + AccuracyMetric: acc=0.769495 + + In Epoch:2/Step:8294, got best dev performance: + AccuracyMetric: acc=0.800459 + Reloaded the best model. + +快速测试 + 与 :class:`~fastNLP.Trainer` 对应,fastNLP 也提供了 :class:`~fastNLP.Tester` 用于快速测试,用法如下 + + .. code-block:: python + + from fastNLP import Tester + + tester = Tester(test_data, model_cnn, metrics=AccuracyMetric()) + tester.test() + + 训练过程输出如下:: + + Evaluate data in 0.19 seconds! + [tester] + AccuracyMetric: acc=0.889109 diff --git a/docs/source/tutorials/tutorial_6_datasetiter.rst b/docs/source/tutorials/tutorial_6_datasetiter.rst new file mode 100644 index 00000000..40d1ade6 --- /dev/null +++ b/docs/source/tutorials/tutorial_6_datasetiter.rst @@ -0,0 +1,413 @@ +============================================================================== +动手实现一个文本分类器II-使用DataSetIter实现自定义训练过程 +============================================================================== + +我们使用和 :doc:`/user/quickstart` 中一样的任务来进行详细的介绍。给出一段评价性文字,预测其情感倾向是积极的(label=0)、 +还是消极的(label=1),使用 :class:`~fastNLP.DataSetIter` 类来编写自己的训练过程。 +DataSetIter初探之前的内容与 :doc:`/tutorials/tutorial_5_loss_optimizer` 中的完全一样,如已经阅读过可以跳过。 + + +数据读入和预处理 +-------------------- + +数据读入 + 我们可以使用 fastNLP :mod:`fastNLP.io` 模块中的 :class:`~fastNLP.io.SST2Pipe` 类,轻松地读取以及预处理SST2数据集。:class:`~fastNLP.io.SST2Pipe` 对象的 + :meth:`~fastNLP.io.SST2Pipe.process_from_file` 方法能够对读入的SST2数据集进行数据的预处理,方法的参数为paths, 指要处理的文件所在目录,如果paths为None,则会自动下载数 据集,函数默认paths值为None。 + 此函数返回一个 :class:`~fastNLP.io.DataBundle`,包含SST2数据集的训练集、测试集、验证集以及source端和target端的字典。其训练、测试、验证数据集含有四个 :mod:`~fastNLP.core.field` : + + * raw_words: 原source句子 + * target: 标签值 + * words: index之后的raw_words + * seq_len: 句子长度 + + 读入数据代码如下: + + .. code-block:: python + + from fastNLP.io import SST2Pipe + + pipe = SST2Pipe() + databundle = pipe.process_from_file() + vocab = databundle.vocabs['words'] + print(databundle) + print(databundle.datasets['train'][0]) + print(databundle.vocabs['words']) + + + 输出数据如下:: + + In total 3 datasets: + test has 1821 instances. + train has 67349 instances. + dev has 872 instances. + In total 2 vocabs: + words has 16293 entries. + target has 2 entries. + + +-------------------------------------------+--------+--------------------------------------+---------+ + | raw_words | target | words | seq_len | + +-------------------------------------------+--------+--------------------------------------+---------+ + | hide new secretions from the parental ... 
| 1 | [4111, 98, 12010, 38, 2, 6844, 9042] | 7 | + +-------------------------------------------+--------+--------------------------------------+---------+ + + Vocabulary(['hide', 'new', 'secretions', 'from', 'the']...) + + 除了可以对数据进行读入的Pipe类,fastNLP还提供了读入和下载数据的Loader类,不同数据集的Pipe和Loader及其用法详见 :doc:`/tutorials/tutorial_4_load_dataset` 。 + +数据集分割 + 由于SST2数据集的测试集并不带有标签数值,故我们分割出一部分训练集作为测试集。下面这段代码展示了 :meth:`~fastNLP.DataSet.split` 的使用方法 + + .. code-block:: python + + train_data = databundle.get_dataset('train') + train_data, test_data = train_data.split(0.015) + dev_data = databundle.get_dataset('dev') + print(len(train_data),len(dev_data),len(test_data)) + + 输出结果为:: + + 66339 872 1010 + +数据集 :meth:`~fastNLP.DataSet.set_input` 和 :meth:`~fastNLP.DataSet.set_target` 函数 + :class:`~fastNLP.io.SST2Pipe` 类的 :meth:`~fastNLP.io.SST2Pipe.process_from_file` 方法在预处理过程中还将训练、测试、验证集 + 的 `words` 、`seq_len` :mod:`~fastNLP.core.field` 设定为input,同时将`target` :mod:`~fastNLP.core.field` 设定为target。 + 我们可以通过 :class:`~fastNLP.core.Dataset` 类的 :meth:`~fastNLP.core.Dataset.print_field_meta` 方法查看各个 + :mod:`~fastNLP.core.field` 的设定情况,代码如下: + + .. code-block:: python + + train_data.print_field_meta() + + 输出结果为:: + + +-------------+-----------+--------+-------+---------+ + | field_names | raw_words | target | words | seq_len | + +-------------+-----------+--------+-------+---------+ + | is_input | False | False | True | True | + | is_target | False | True | False | False | + | ignore_type | | False | False | False | + | pad_value | | 0 | 0 | 0 | + +-------------+-----------+--------+-------+---------+ + + 其中is_input和is_target分别表示是否为input和target。ignore_type为true时指使用 :class:`~fastNLP.DataSetIter` 取出batch数 + 据时fastNLP不会进行自动padding,pad_value指对应 :mod:`~fastNLP.core.field` padding所用的值,这两者只有当 + :mod:`~fastNLP.core.field` 设定为input或者target的时候才有存在的意义。 + + is_input为true的 :mod:`~fastNLP.core.field` 在 :class:`~fastNLP.DataSetIter` 迭代取出的 batch_x 中, + 而 is_target为true的 :mod:`~fastNLP.core.field` 在 :class:`~fastNLP.DataSetIter` 迭代取出的 batch_y 中。 + 具体分析见下面DataSetIter的介绍过程。 + + +评价指标 + 训练模型需要提供一个评价指标。这里使用准确率做为评价指标。 + + * ``pred`` 参数对应的是模型的 forward 方法返回的 dict 中的一个 key 的名字。 + * ``target`` 参数对应的是 :class:`~fastNLP.DataSet` 中作为标签的 :mod:`~fastNLP.core.field` 的名字。 + + 这里我们用 :class:`~fastNLP.Const` 来辅助命名,如果你自己编写模型中 forward 方法的返回值或 + 数据集中 :mod:`~fastNLP.core.field` 的名字与本例不同, 你可以把 ``pred`` 参数和 ``target`` 参数设定符合自己代码的值。代码如下: + + .. code-block:: python + + from fastNLP import AccuracyMetric + from fastNLP import Const + + # metrics=AccuracyMetric() 在本例中与下面这行代码等价 + metrics=AccuracyMetric(pred=Const.OUTPUT, target=Const.TARGET) + + +DataSetIter初探 +-------------------------- + +DataSetIter + fastNLP定义的 :class:`~fastNLP.DataSetIter` 类,用于定义一个batch,并实现batch的多种功能,在初始化时传入的参数有: + + * dataset: :class:`~fastNLP.DataSet` 对象, 数据集 + * batch_size: 取出的batch大小 + * sampler: 规定使用的 :class:`~fastNLP.Sampler` 若为 None, 使用 :class:`~fastNLP.RandomSampler` (Default: None) + * as_numpy: 若为 True, 输出batch为 `numpy.array`. 否则为 `torch.Tensor` (Default: False) + * prefetch: 若为 True使用多进程预先取出下一batch. 
(Default: False) + +sampler + fastNLP 实现的采样器有: + + * :class:`~fastNLP.BucketSampler` 可以随机地取出长度相似的元素 【初始化参数: num_buckets:bucket的数量; batch_size:batch大小; seq_len_field_name:dataset中对应序列长度的 :mod:`~fastNLP.core.field` 的名字】 + * SequentialSampler: 顺序取出元素的采样器【无初始化参数】 + * RandomSampler:随机化取元素的采样器【无初始化参数】 + +Padder + 在fastNLP里,pad是与一个 :mod:`~fastNLP.core.field` 绑定的。即不同的 :mod:`~fastNLP.core.field` 可以使用不同的pad方式,比如在英文任务中word需要的pad和 + character的pad方式往往是不同的。fastNLP是通过一个叫做 :class:`~fastNLP.Padder` 的子类来完成的。 + 默认情况下,所有field使用 :class:`~fastNLP.AutoPadder` + 。大多数情况下直接使用 :class:`~fastNLP.AutoPadder` 就可以了。 + 如果 :class:`~fastNLP.AutoPadder` 或 :class:`~fastNLP.EngChar2DPadder` 无法满足需求, + 也可以自己写一个 :class:`~fastNLP.Padder` 。 + +DataSetIter自动padding + 以下代码展示了DataSetIter的简单使用: + + .. code-block:: python + + from fastNLP import BucketSampler + from fastNLP import DataSetIter + + tmp_data = dev_data[:10] + # 定义一个Batch,传入DataSet,规定batch_size和去batch的规则。 + # 顺序(Sequential),随机(Random),相似长度组成一个batch(Bucket) + sampler = BucketSampler(batch_size=2, seq_len_field_name='seq_len') + batch = DataSetIter(batch_size=2, dataset=tmp_data, sampler=sampler) + for batch_x, batch_y in batch: + print("batch_x: ",batch_x) + print("batch_y: ", batch_y) + + 输出结果如下:: + + batch_x: {'words': tensor([[ 4, 278, 686, 18, 7], + [15619, 3205, 5, 1676, 0]]), 'seq_len': tensor([5, 4])} + batch_y: {'target': tensor([1, 1])} + batch_x: {'words': tensor([[ 44, 753, 328, 181, 10, 15622, 16, 71, 8905, 9, + 1218, 7, 0, 0, 0, 0, 0, 0, 0, 0], + [ 880, 97, 8, 1027, 12, 8068, 11, 13624, 8, 15620, + 4, 674, 663, 15, 4, 1155, 241, 640, 418, 7]]), 'seq_len': tensor([12, 20])} + batch_y: {'target': tensor([1, 0])} + batch_x: {'words': tensor([[ 1046, 11114, 16, 105, 5, 4, 177, 1825, 1705, 3, + 2, 18, 11, 4, 1019, 433, 144, 32, 246, 309, + 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0], + [ 13, 831, 7747, 175, 3, 46, 6, 84, 5753, 15, + 2178, 15, 62, 56, 407, 85, 1010, 4974, 26, 17, + 13786, 3, 534, 3688, 15624, 38, 376, 8, 15625, 8, + 1324, 4399, 7]]), 'seq_len': tensor([21, 33])} + batch_y: {'target': tensor([0, 1])} + batch_x: {'words': tensor([[ 14, 10, 438, 31, 78, 3, 78, 438, 7], + [ 14, 10, 4, 312, 5, 155, 1419, 610, 7]]), 'seq_len': tensor([9, 9])} + batch_y: {'target': tensor([1, 0])} + batch_x: {'words': tensor([[ 24, 96, 27, 45, 8, 337, 37, 240, 8, 2134, + 2, 18, 10, 15623, 1422, 6, 60, 5, 388, 7], + [ 2, 156, 3, 4427, 3, 240, 3, 740, 5, 1137, + 40, 42, 2428, 737, 2, 649, 10, 15621, 2286, 7]]), 'seq_len': tensor([20, 20])} + batch_y: {'target': tensor([0, 0])} + + 可以看到那些设定为input的 :mod:`~fastNLP.core.field` 都出现在batch_x中,而设定为target的 :mod:`~fastNLP.core.field` 则出现在batch_y中。同时对于同一个batch_x中的两个数 据,长度偏短的那个会被自动padding到和长度偏长的句子长度一致,默认的padding值为0。 + +Dataset改变padding值 + 可以通过 :meth:`~fastNLP.core.Dataset.set_pad_val` 方法修改默认的pad值,代码如下: + + .. 
code-block:: python + + tmp_data.set_pad_val('words',-1) + batch = DataSetIter(batch_size=2, dataset=tmp_data, sampler=sampler) + for batch_x, batch_y in batch: + print("batch_x: ",batch_x) + print("batch_y: ", batch_y) + + 输出结果如下:: + + batch_x: {'words': tensor([[15619, 3205, 5, 1676, -1], + [ 4, 278, 686, 18, 7]]), 'seq_len': tensor([4, 5])} + batch_y: {'target': tensor([1, 1])} + batch_x: {'words': tensor([[ 1046, 11114, 16, 105, 5, 4, 177, 1825, 1705, 3, + 2, 18, 11, 4, 1019, 433, 144, 32, 246, 309, + 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1], + [ 13, 831, 7747, 175, 3, 46, 6, 84, 5753, 15, + 2178, 15, 62, 56, 407, 85, 1010, 4974, 26, 17, + 13786, 3, 534, 3688, 15624, 38, 376, 8, 15625, 8, + 1324, 4399, 7]]), 'seq_len': tensor([21, 33])} + batch_y: {'target': tensor([0, 1])} + batch_x: {'words': tensor([[ 14, 10, 4, 312, 5, 155, 1419, 610, 7], + [ 14, 10, 438, 31, 78, 3, 78, 438, 7]]), 'seq_len': tensor([9, 9])} + batch_y: {'target': tensor([0, 1])} + batch_x: {'words': tensor([[ 2, 156, 3, 4427, 3, 240, 3, 740, 5, 1137, + 40, 42, 2428, 737, 2, 649, 10, 15621, 2286, 7], + [ 24, 96, 27, 45, 8, 337, 37, 240, 8, 2134, + 2, 18, 10, 15623, 1422, 6, 60, 5, 388, 7]]), 'seq_len': tensor([20, 20])} + batch_y: {'target': tensor([0, 0])} + batch_x: {'words': tensor([[ 44, 753, 328, 181, 10, 15622, 16, 71, 8905, 9, + 1218, 7, -1, -1, -1, -1, -1, -1, -1, -1], + [ 880, 97, 8, 1027, 12, 8068, 11, 13624, 8, 15620, + 4, 674, 663, 15, 4, 1155, 241, 640, 418, 7]]), 'seq_len': tensor([12, 20])} + batch_y: {'target': tensor([1, 0])} + + 可以看到使用了-1进行padding。 + +Dataset个性化padding + 如果我们希望对某一些 :mod:`~fastNLP.core.field` 进行个性化padding,可以自己构造Padder类,并使用 :meth:`~fastNLP.core.Dataset.set_padder` 函数修改padder来实现。下面通 过构造一个将数据padding到固定长度的padder进行展示: + + .. code-block:: python + + from fastNLP.core.field import Padder + import numpy as np + class FixLengthPadder(Padder): + def __init__(self, pad_val=0, length=None): + super().__init__(pad_val=pad_val) + self.length = length + assert self.length is not None, "Creating FixLengthPadder with no specific length!" + + def __call__(self, contents, field_name, field_ele_dtype, dim): + #计算当前contents中的最大长度 + max_len = max(map(len, contents)) + #如果当前contents中的最大长度大于指定的padder length的话就报错 + assert max_len <= self.length, "Fixed padder length smaller than actual length! 
with length {}".format(max_len) + array = np.full((len(contents), self.length), self.pad_val, dtype=field_ele_dtype) + for i, content_i in enumerate(contents): + array[i, :len(content_i)] = content_i + return array + + #设定FixLengthPadder的固定长度为40 + tmp_padder = FixLengthPadder(pad_val=0,length=40) + #利用dataset的set_padder函数设定words field的padder + tmp_data.set_padder('words',tmp_padder) + batch = DataSetIter(batch_size=2, dataset=tmp_data, sampler=sampler) + for batch_x, batch_y in batch: + print("batch_x: ",batch_x) + print("batch_y: ", batch_y) + + 输出结果如下:: + + batch_x: {'words': tensor([[ 4, 278, 686, 18, 7, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [15619, 3205, 5, 1676, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'seq_len': tensor([5, 4])} + batch_y: {'target': tensor([1, 1])} + batch_x: {'words': tensor([[ 2, 156, 3, 4427, 3, 240, 3, 740, 5, 1137, + 40, 42, 2428, 737, 2, 649, 10, 15621, 2286, 7, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [ 24, 96, 27, 45, 8, 337, 37, 240, 8, 2134, + 2, 18, 10, 15623, 1422, 6, 60, 5, 388, 7, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'seq_len': tensor([20, 20])} + batch_y: {'target': tensor([0, 0])} + batch_x: {'words': tensor([[ 13, 831, 7747, 175, 3, 46, 6, 84, 5753, 15, + 2178, 15, 62, 56, 407, 85, 1010, 4974, 26, 17, + 13786, 3, 534, 3688, 15624, 38, 376, 8, 15625, 8, + 1324, 4399, 7, 0, 0, 0, 0, 0, 0, 0], + [ 1046, 11114, 16, 105, 5, 4, 177, 1825, 1705, 3, + 2, 18, 11, 4, 1019, 433, 144, 32, 246, 309, + 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'seq_len': tensor([33, 21])} + batch_y: {'target': tensor([1, 0])} + batch_x: {'words': tensor([[ 14, 10, 4, 312, 5, 155, 1419, 610, 7, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0], + [ 14, 10, 438, 31, 78, 3, 78, 438, 7, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0]]), 'seq_len': tensor([9, 9])} + batch_y: {'target': tensor([0, 1])} + batch_x: {'words': tensor([[ 44, 753, 328, 181, 10, 15622, 16, 71, 8905, 9, + 1218, 7, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [ 880, 97, 8, 1027, 12, 8068, 11, 13624, 8, 15620, + 4, 674, 663, 15, 4, 1155, 241, 640, 418, 7, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'seq_len': tensor([12, 20])} + batch_y: {'target': tensor([1, 0])} + + 在这里所有的`words`都被pad成了长度为40的list。 + + +使用DataSetIter自己编写训练过程 +------------------------------------ + 如果你想用类似 PyTorch 的使用方法,自己编写训练过程,可以参考下面这段代码。 + 其中使用了 fastNLP 提供的 :class:`~fastNLP.DataSetIter` 来获得小批量训练的小批量数据, + 使用 :class:`~fastNLP.BucketSampler` 做为 :class:`~fastNLP.DataSetIter` 的参数来选择采样的方式。 + + 以下代码使用BucketSampler作为 :class:`~fastNLP.DataSetIter` 初始化的输入,运用 :class:`~fastNLP.DataSetIter` 自己写训练程序 + + .. 
code-block:: python + + from fastNLP import BucketSampler + from fastNLP import DataSetIter + from fastNLP.models import CNNText + from fastNLP import Tester + import torch + import time + + embed_dim = 100 + model = CNNText((len(vocab),embed_dim), num_classes=2, dropout=0.1) + + def train(epoch, data, devdata): + optimizer = torch.optim.Adam(model.parameters(), lr=0.001) + lossfunc = torch.nn.CrossEntropyLoss() + batch_size = 32 + + # 定义一个Batch,传入DataSet,规定batch_size和去batch的规则。 + # 顺序(Sequential),随机(Random),相似长度组成一个batch(Bucket) + train_sampler = BucketSampler(batch_size=batch_size, seq_len_field_name='seq_len') + train_batch = DataSetIter(batch_size=batch_size, dataset=data, sampler=train_sampler) + + start_time = time.time() + print("-"*5+"start training"+"-"*5) + for i in range(epoch): + loss_list = [] + for batch_x, batch_y in train_batch: + optimizer.zero_grad() + output = model(batch_x['words']) + loss = lossfunc(output['pred'], batch_y['target']) + loss.backward() + optimizer.step() + loss_list.append(loss.item()) + + #这里verbose如果为0,在调用Tester对象的test()函数时不输出任何信息,返回评估信息; 如果为1,打印出验证结果,返回评估信息 + #在调用过Tester对象的test()函数后,调用其_format_eval_results(res)函数,结构化输出验证结果 + tester_tmp = Tester(devdata, model, metrics=AccuracyMetric(), verbose=0) + res=tester_tmp.test() + + print('Epoch {:d} Avg Loss: {:.2f}'.format(i, sum(loss_list) / len(loss_list)),end=" ") + print(tester_tmp._format_eval_results(res),end=" ") + print('{:d}ms'.format(round((time.time()-start_time)*1000))) + loss_list.clear() + + train(10, train_data, dev_data) + #使用tester进行快速测试 + tester = Tester(test_data, model, metrics=AccuracyMetric()) + tester.test() + + 这段代码的输出如下:: + + -----start training----- + + Evaluate data in 0.2 seconds! + Epoch 0 Avg Loss: 0.33 AccuracyMetric: acc=0.825688 48895ms + + Evaluate data in 0.19 seconds! + Epoch 1 Avg Loss: 0.16 AccuracyMetric: acc=0.829128 102081ms + + Evaluate data in 0.18 seconds! + Epoch 2 Avg Loss: 0.10 AccuracyMetric: acc=0.822248 152853ms + + Evaluate data in 0.17 seconds! + Epoch 3 Avg Loss: 0.08 AccuracyMetric: acc=0.821101 200184ms + + Evaluate data in 0.17 seconds! + Epoch 4 Avg Loss: 0.06 AccuracyMetric: acc=0.827982 253097ms + + Evaluate data in 0.27 seconds! + Epoch 5 Avg Loss: 0.05 AccuracyMetric: acc=0.806193 303883ms + + Evaluate data in 0.26 seconds! + Epoch 6 Avg Loss: 0.04 AccuracyMetric: acc=0.803899 392315ms + + Evaluate data in 0.36 seconds! + Epoch 7 Avg Loss: 0.04 AccuracyMetric: acc=0.802752 527211ms + + Evaluate data in 0.15 seconds! + Epoch 8 Avg Loss: 0.03 AccuracyMetric: acc=0.809633 661533ms + + Evaluate data in 0.31 seconds! + Epoch 9 Avg Loss: 0.03 AccuracyMetric: acc=0.797018 812232ms + + Evaluate data in 0.25 seconds! + [tester] + AccuracyMetric: acc=0.917822 + + + diff --git a/docs/source/tutorials/tutorial_6_seq_labeling.rst b/docs/source/tutorials/tutorial_6_seq_labeling.rst deleted file mode 100644 index 09a53cdc..00000000 --- a/docs/source/tutorials/tutorial_6_seq_labeling.rst +++ /dev/null @@ -1,114 +0,0 @@ -===================== -快速实现序列标注模型 -===================== - -这一部分的内容主要展示如何使用fastNLP 实现序列标注任务。你可以使用fastNLP的各个组件快捷,方便地完成序列标注任务,达到出色的效果。 -在阅读这篇Tutorial前,希望你已经熟悉了fastNLP的基础使用,包括基本数据结构以及数据预处理,embedding的嵌入等,希望你对之前的教程有更进一步的掌握。 -我们将对CoNLL-03的英文数据集进行处理,展示如何完成命名实体标注任务整个训练的过程。 - -载入数据 -=================================== -fastNLP可以方便地载入各种类型的数据。同时,针对常见的数据集,我们已经预先实现了载入方法,其中包含CoNLL-03数据集。 -在设计dataloader时,以DataSetLoader为基类,可以改写并应用于其他数据集的载入。 - -.. 
code-block:: python - - class Conll2003DataLoader(DataSetLoader): - def __init__(self, task:str='ner', encoding_type:str='bioes'): - assert task in ('ner', 'pos', 'chunk') - index = {'ner':3, 'pos':1, 'chunk':2}[task] - #ConllLoader是fastNLP内置的类 - self._loader = ConllLoader(headers=['raw_words', 'target'], indexes=[0, index]) - self._tag_converters = None - if task in ('ner', 'chunk'): - #iob和iob2bioes会对tag进行统一,标准化 - self._tag_converters = [iob2] - if encoding_type == 'bioes': - self._tag_converters.append(iob2bioes) - - def load(self, path: str): - dataset = self._loader.load(path) - def convert_tag_schema(tags): - for converter in self._tag_converters: - tags = converter(tags) - return tags - if self._tag_converters: - #使用apply实现convert_tag_schema函数,实际上也支持匿名函数 - dataset.apply_field(convert_tag_schema, field_name=Const.TARGET, new_field_name=Const.TARGET) - return dataset - -输出数据格式如: - - {'raw_words': ['on', 'Friday', ':'] type=list, - 'target': ['O', 'O', 'O'] type=list}, - - -数据处理 ----------------------------- -我们进一步处理数据。将数据和词表封装在 :class:`~fastNLP.DataBundle` 类中。data是DataBundle的实例。 -我们输入模型的数据包括char embedding,以及word embedding。在数据处理部分,我们尝试完成词表的构建。 -使用fastNLP中的Vocabulary类来构建词表。 - -.. code-block:: python - - word_vocab = Vocabulary(min_freq=2) - word_vocab.from_dataset(data.datasets['train'], field_name=Const.INPUT) - word_vocab.index_dataset(*data.datasets.values(),field_name=Const.INPUT, new_field_name=Const.INPUT) - -处理后的data对象内部为: - - dataset - vocabs - dataset保存了train和test中的数据,并保存为dataset类型 - vocab保存了words,raw-words以及target的词表。 - -模型构建 --------------------------------- -我们使用CNN-BILSTM-CRF模型完成这一任务。在网络构建方面,fastNLP的网络定义继承pytorch的 :class:`nn.Module` 类。 -自己可以按照pytorch的方式定义网络。需要注意的是命名。fastNLP的标准命名位于 :class:`~fastNLP.Const` 类。 - -模型的训练 -首先实例化模型,导入所需的char embedding以及word embedding。Embedding的载入可以参考教程。 -也可以查看 :mod:`~fastNLP.modules.encoder.embedding` 使用所需的embedding 载入方法。 -fastNLP将模型的训练过程封装在了 :class:`~fastnlp.trainer` 类中。 -根据不同的任务调整trainer中的参数即可。通常,一个trainer实例需要有:指定的训练数据集,模型,优化器,loss函数,评测指标,以及指定训练的epoch数,batch size等参数。 - -.. code-block:: python - - #实例化模型 - model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=200, num_layers=1, tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type) - #定义优化器 - optimizer = Adam(model.parameters(), lr=0.005) - #定义评估指标 - Metrics=SpanFPreRecMetric(tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type) - #实例化trainer - trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer, dev_data=data.datasets['test'], batch_size=10, metrics=Metrics,callbacks=callbacks, n_epochs=100) - #开始训练 - trainer.train() - -训练中会保存最优的参数配置。 -训练的结果如下: - -.. code-block:: python - - Evaluation on DataSet test: - SpanFPreRecMetric: f=0.727661, pre=0.732293, rec=0.723088 - Evaluation at Epoch 1/100. Step:1405/140500. SpanFPreRecMetric: f=0.727661, pre=0.732293, rec=0.723088 - - Evaluation on DataSet test: - SpanFPreRecMetric: f=0.784307, pre=0.779371, rec=0.789306 - Evaluation at Epoch 2/100. Step:2810/140500. SpanFPreRecMetric: f=0.784307, pre=0.779371, rec=0.789306 - - Evaluation on DataSet test: - SpanFPreRecMetric: f=0.810068, pre=0.811003, rec=0.809136 - Evaluation at Epoch 3/100. Step:4215/140500. SpanFPreRecMetric: f=0.810068, pre=0.811003, rec=0.809136 - - Evaluation on DataSet test: - SpanFPreRecMetric: f=0.829592, pre=0.84153, rec=0.817989 - Evaluation at Epoch 4/100. Step:5620/140500. 
SpanFPreRecMetric: f=0.829592, pre=0.84153, rec=0.817989 - - Evaluation on DataSet test: - SpanFPreRecMetric: f=0.828789, pre=0.837096, rec=0.820644 - Evaluation at Epoch 5/100. Step:7025/140500. SpanFPreRecMetric: f=0.828789, pre=0.837096, rec=0.820644 - - diff --git a/docs/source/tutorials/tutorial_8_metrics.rst b/docs/source/tutorials/tutorial_7_metrics.rst similarity index 100% rename from docs/source/tutorials/tutorial_8_metrics.rst rename to docs/source/tutorials/tutorial_7_metrics.rst diff --git a/docs/source/tutorials/tutorial_7_modules_models.rst b/docs/source/tutorials/tutorial_8_modules_models.rst similarity index 99% rename from docs/source/tutorials/tutorial_7_modules_models.rst rename to docs/source/tutorials/tutorial_8_modules_models.rst index 680d75fd..0b26e0bd 100644 --- a/docs/source/tutorials/tutorial_7_modules_models.rst +++ b/docs/source/tutorials/tutorial_8_modules_models.rst @@ -6,7 +6,6 @@ 下面我们会分三节介绍编写构建模型的具体方法。 ----------------------- 使用 models 中的模型 ---------------------- @@ -81,8 +80,9 @@ FastNLP 中内置的 models 如下表所示,您可以点击具体的名称查 :class:`~fastNLP.models.STNLICls` ,用于自然语言推断 (NLI) 的 Star-Transformer 模型 :class:`~fastNLP.models.STSeqCls` , 用于分类任务的 Star-Transformer 模型 :class:`~fastNLP.models.BiaffineParser` , Biaffine 依存句法分析网络的实现 + :class:`~fastNLP.models.BiLSTMCRF`, 使用BiLSTM与CRF进行序列标注 + ----------------------------- 使用 nn.torch 编写模型 ---------------------------- @@ -137,7 +137,7 @@ FastNLP 完全支持使用 pyTorch 编写的模型,但与 pyTorch 中编写模 (dropout): Dropout(p=0.5) ) ----------------------------- + 使用 modules 编写模型 ---------------------------- diff --git a/docs/source/tutorials/tutorial_9_callback.rst b/docs/source/tutorials/tutorial_9_callback.rst deleted file mode 100644 index 8e2742bb..00000000 --- a/docs/source/tutorials/tutorial_9_callback.rst +++ /dev/null @@ -1,67 +0,0 @@ -=================================================== -使用Callback自定义你的训练过程 -=================================================== - -在训练时,我们常常要使用trick来提高模型的性能(如调节学习率),或者要打印训练中的信息。 -这里我们提供Callback类,在Trainer中插入代码,完成一些自定义的操作。 - -我们使用和 :doc:`/user/quickstart` 中一样的任务来进行详细的介绍。 -给出一段评价性文字,预测其情感倾向是积极(label=1)、消极(label=0)还是中性(label=2),使用 :class:`~fastNLP.Trainer` 和 :class:`~fastNLP.Tester` 来进行快速训练和测试。 -关于数据处理,Loss和Optimizer的选择可以看其他教程,这里仅在训练时加入学习率衰减。 - ---------------------- -Callback的构建和使用 ---------------------- - -创建Callback - 我们可以继承fastNLP :class:`~fastNLP.Callback` 类来定义自己的Callback。 - 这里我们实现一个让学习率线性衰减的Callback。 - - .. code-block:: python - - import fastNLP - - class LRDecay(fastNLP.Callback): - def __init__(self): - super(MyCallback, self).__init__() - self.base_lrs = [] - self.delta = [] - - def on_train_begin(self): - # 初始化,仅训练开始时调用 - self.base_lrs = [pg['lr'] for pg in self.optimizer.param_groups] - self.delta = [float(lr) / self.n_epochs for lr in self.base_lrs] - - def on_epoch_end(self): - # 每个epoch结束时,更新学习率 - ep = self.epoch - lrs = [lr - d * ep for lr, d in zip(self.base_lrs, self.delta)] - self.change_lr(lrs) - - def change_lr(self, lrs): - for pg, lr in zip(self.optimizer.param_groups, lrs): - pg['lr'] = lr - - 这里,:class:`~fastNLP.Callback` 中所有以 ``on_`` 开头的类方法会在 :class:`~fastNLP.Trainer` 的训练中在特定时间调用。 - 如 on_train_begin() 会在训练开始时被调用,on_epoch_end() 会在每个 epoch 结束时调用。 - 具体有哪些类方法,参见文档 :class:`~fastNLP.Callback` 。 - - 另外,为了使用方便,可以在 :class:`~fastNLP.Callback` 内部访问 :class:`~fastNLP.Trainer` 中的属性,如 optimizer, epoch, step,分别对应训练时的优化器,当前epoch数,和当前的总step数。 - 具体可访问的属性,参见文档 :class:`~fastNLP.Callback` 。 - -使用Callback - 在定义好 :class:`~fastNLP.Callback` 之后,就能将它传入Trainer的 ``callbacks`` 参数,在实际训练时使用。 - - .. 
code-block:: python - - """ - 数据预处理,模型定义等等 - """ - - trainer = fastNLP.Trainer( - model=model, train_data=train_data, dev_data=dev_data, - optimizer=optimizer, metrics=metrics, - batch_size=10, n_epochs=100, - callbacks=[LRDecay()]) - - trainer.train() diff --git a/docs/source/tutorials/tutorial_9_seq_labeling.rst b/docs/source/tutorials/tutorial_9_seq_labeling.rst new file mode 100644 index 00000000..60bc1440 --- /dev/null +++ b/docs/source/tutorials/tutorial_9_seq_labeling.rst @@ -0,0 +1,187 @@ +===================== +快速实现序列标注模型 +===================== + +这一部分的内容主要展示如何使用fastNLP实现序列标注任务。您可以使用fastNLP的各个组件快捷,方便地完成序列标注任务,达到出色的效果。 +在阅读这篇Tutorial前,希望您已经熟悉了fastNLP的基础使用,尤其是数据的载入以及模型的构建,通过这个小任务的能让您进一步熟悉fastNLP的使用。 + +命名实体识别(name entity recognition, NER) +------------------------------------------ + +命名实体识别任务是从文本中抽取出具有特殊意义或者指代性非常强的实体,通常包括人名、地名、机构名和时间等。 +如下面的例子中 + + 我来自复旦大学。 + +其中“复旦大学”就是一个机构名,命名实体识别就是要从中识别出“复旦大学”这四个字是一个整体,且属于机构名这个类别。这个问题在实际做的时候会被 +转换为序列标注问题 + + 针对"我来自复旦大学"这句话,我们的预测目标将是[O, O, O, B-ORG, I-ORG, I-ORG, I-ORG],其中O表示out,即不是一个实体,B-ORG是ORG( + organization的缩写)这个类别的开头(Begin),I-ORG是ORG类别的中间(Inside)。 + +在本tutorial中我们将通过fastNLP尝试写出一个能够执行以上任务的模型。 + +载入数据 +------------------------------------------ +fastNLP的数据载入主要是由Loader与Pipe两个基类衔接完成的,您可以通过 :doc:`使用Loader和Pipe处理数据 ` +了解如何使用fastNLP提供的数据加载函数。下面我们以微博命名实体任务来演示一下在fastNLP进行序列标注任务。 + +.. code-block:: python + + from fastNLP.io import WeiboNERPipe + data_bundle = WeiboNERPipe().process_from_file() + print(data_bundle.get_dataset('train')[:2]) + +打印的数据如下 :: + + +-------------------------------------------------+------------------------------------------+------------------------------------------+---------+ + | raw_chars | target | chars | seq_len | + +-------------------------------------------------+------------------------------------------+------------------------------------------+---------+ + | ['一', '节', '课', '的', '时', '间', '真', '... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, ... | [8, 211, 775, 3, 49, 245, 89, 26, 101... | 16 | + | ['回', '复', '支', '持', ',', '赞', '成', '... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | [116, 480, 127, 109, 2, 446, 134, 2, ... | 59 | + +-------------------------------------------------+------------------------------------------+------------------------------------------+---------+ + + +模型构建 +-------------------------------- + +首先选择需要使用的Embedding类型。关于Embedding的相关说明可以参见 :doc:`使用Embedding模块将文本转成向量 ` 。 +在这里我们使用通过word2vec预训练的中文汉字embedding。 + +.. code-block:: python + + from fastNLP.embeddings import StaticEmbedding + + embed = StaticEmbedding(vocab=data_bundle.get_vocab('chars'), model_dir_or_name='cn-char-fastnlp-100d') + +选择好Embedding之后,我们可以使用fastNLP中自带的 :class:`fastNLP.models.BiLSTMCRF` 作为模型。 + +.. code-block:: python + + from fastNLP.models import BiLSTMCRF + + data_bundle.rename_field('chars', 'words') # 这是由于BiLSTMCRF模型的forward函数接受的words,而不是chars,所以需要把这一列重新命名 + model = BiLSTMCRF(embed=embed, num_classes=len(data_bundle.get_vocab('target')), num_layers=1, hidden_size=200, dropout=0.5, + target_vocab=data_bundle.get_vocab('target')) + +下面我们选择用来评估模型的metric,以及优化用到的优化函数。 + +.. code-block:: python + + from fastNLP import SpanFPreRecMetric + from torch.optim import Adam + from fastNLP import LossInForward + + metric = SpanFPreRecMetric(tag_vocab=data_bundle.get_vocab('target')) + optimizer = Adam(model.parameters(), lr=1e-2) + loss = LossInForward() + +使用Trainer进行训练 + +.. 
code-block:: python + + from fastNLP import Trainer + import torch + + device= 0 if torch.cuda.is_available() else 'cpu' + trainer = Trainer(data_bundle.get_dataset('train'), model, loss=loss, optimizer=optimizer, + dev_data=data_bundle.get_dataset('dev'), metrics=metric, device=device) + trainer.train() + +训练过程输出为:: + + input fields after batch(if batch size is 2): + target: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 26]) + seq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) + words: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 26]) + target fields after batch(if batch size is 2): + target: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 26]) + seq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) + + training epochs started 2019-09-25-10-43-09 + Evaluate data in 0.62 seconds! + Evaluation on dev at Epoch 1/10. Step:43/430: + SpanFPreRecMetric: f=0.070352, pre=0.100962, rec=0.053985 + + ... + + Evaluate data in 0.61 seconds! + Evaluation on dev at Epoch 10/10. Step:430/430: + SpanFPreRecMetric: f=0.51223, pre=0.581699, rec=0.457584 + + + In Epoch:7/Step:301, got best dev performance: + SpanFPreRecMetric: f=0.515528, pre=0.65098, rec=0.426735 + Reloaded the best model. + +训练结束之后过,可以通过 :class:`~fastNLP.Tester` 测试其在测试集上的性能 + +.. code-block::python + + from fastNLP import Tester + + tester = Tester(data_bundle.get_dataset('test'), model, metrics=metric) + tester.test() + +输出为:: + + [tester] + SpanFPreRecMetric: f=0.482399, pre=0.530086, rec=0.442584 + + +使用更强的Bert做序列标注 +-------------------------------- + +在fastNLP使用Bert进行任务,您只需要切换为 :class:`fastNLP.embeddings.BertEmbedding` 即可。 + +.. code-block:: python + + from fastNLP.io import WeiboNERPipe + data_bundle = WeiboNERPipe().process_from_file() + data_bundle.rename_field('chars', 'words') + + from fastNLP.embeddings import BertEmbedding + embed = BertEmbedding(vocab=data_bundle.get_vocab('words'), model_dir_or_name='cn') + model = BiLSTMCRF(embed=embed, num_classes=len(data_bundle.get_vocab('target')), num_layers=1, hidden_size=200, dropout=0.5, + target_vocab=data_bundle.get_vocab('target')) + + from fastNLP import SpanFPreRecMetric + from torch import Adam + from fastNLP import LossInForward + metric = SpanFPreRecMetric(tag_vocab=data_bundle.get_vocab('target')) + optimizer = Adam(model.parameters(), lr=2e-5) + loss = LossInForward() + + from fastNLP import Trainer + import torch + device= 0 if torch.cuda.is_available() else 'cpu' + trainer = Trainer(data_bundle.get_dataset('train'), model, loss=loss, optimizer=optimizer, batch_size=12, + dev_data=data_bundle.get_dataset('dev'), metrics=metric, device=device) + trainer.train() + + from fastNLP import Tester + tester = Tester(data_bundle.get_dataset('test'), model, metrics=metric) + tester.test() + +输出为:: + + training epochs started 2019-09-25-07-15-43 + Evaluate data in 2.02 seconds! + Evaluation on dev at Epoch 1/10. Step:113/1130: + SpanFPreRecMetric: f=0.0, pre=0.0, rec=0.0 + + ... + + Evaluate data in 2.17 seconds! + Evaluation on dev at Epoch 10/10. Step:1130/1130: + SpanFPreRecMetric: f=0.647332, pre=0.589852, rec=0.717224 + + In Epoch:6/Step:678, got best dev performance: + SpanFPreRecMetric: f=0.669963, pre=0.645238, rec=0.696658 + Reloaded the best model. + + Evaluate data in 1.82 seconds! 
+ [tester] + SpanFPreRecMetric: f=0.641774, pre=0.626424, rec=0.657895 + +可以看出通过使用Bert,效果有明显的提升,从48.2提升到了64.1。 \ No newline at end of file diff --git a/docs/source/user/docs_in_code.rst b/docs/source/user/docs_in_code.rst deleted file mode 100644 index a0b9576f..00000000 --- a/docs/source/user/docs_in_code.rst +++ /dev/null @@ -1,3 +0,0 @@ -=============== -在代码中写文档 -=============== \ No newline at end of file diff --git a/docs/source/user/installation.rst b/docs/source/user/installation.rst index 42ea402c..b4156f6a 100644 --- a/docs/source/user/installation.rst +++ b/docs/source/user/installation.rst @@ -13,8 +13,9 @@ fastNLP 依赖如下包:: nltk>=3.4.1 requests spacy + prettytable>=0.7.2 -其中torch的安装可能与操作系统及 CUDA 的版本相关,请参见 `PyTorch 官网 `_ 。 +其中torch的安装可能与操作系统及 CUDA 的版本相关,请参见 `PyTorch 官网 `_ 。 在依赖包安装完成的情况,您可以在命令行执行如下指令完成安装 .. code:: shell diff --git a/docs/source/user/quickstart.rst b/docs/source/user/quickstart.rst index b92645b0..24809001 100644 --- a/docs/source/user/quickstart.rst +++ b/docs/source/user/quickstart.rst @@ -2,123 +2,13 @@ 快速入门 =============== -这是一个简单的分类任务 (数据来源 `kaggle `_ )。 -给出一段文字,预测它的标签是0~4中的哪一个。 +如果你想用 fastNLP 来快速地解决某类自然语言处理问题,你可以参考以下教程之一 -我们可以使用 fastNLP 中 io 模块中的 :class:`~fastNLP.io.CSVLoader` 类,轻松地从 csv 文件读取我们的数据。 +.. toctree:: + :maxdepth: 1 -.. code-block:: python + /quickstart/文本分类 - from fastNLP.io import CSVLoader - loader = CSVLoader(headers=('raw_sentence', 'label'), sep='\t') - dataset = loader.load("./sample_data/tutorial_sample_dataset.csv") +这些教程是简单地介绍了使用 fastNLP 的流程,更多的教程分析见 :doc:`/user/tutorials` -此时的 `dataset[0]` 的值如下,可以看到,数据集中的每个数据包含 ``raw_sentence`` 和 ``label`` 两个字段,他们的类型都是 ``str``:: - - {'raw_sentence': A series of escapades demonstrating the adage that what is good for the - goose is also good for the gander , some of which occasionally amuses but none of which - amounts to much of a story . type=str, - 'label': 1 type=str} - - -我们使用 :class:`~fastNLP.DataSet` 类的 :meth:`~fastNLP.DataSet.apply` 方法将 ``raw_sentence`` 中字母变成小写,并将句子分词。 - -.. code-block:: python - - dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='sentence') - dataset.apply(lambda x: x['sentence'].split(), new_field_name='words', is_input=True) - -然后我们再用 :class:`~fastNLP.Vocabulary` 类来统计数据中出现的单词,并将单词序列转化为训练可用的数字序列。 - -.. code-block:: python - - from fastNLP import Vocabulary - vocab = Vocabulary(min_freq=2).from_dataset(dataset, field_name='words') - vocab.index_dataset(dataset, field_name='words',new_field_name='words') - -同时,我们也将原来 str 类型的标签转化为数字,并设置为训练中的标准答案 ``target`` - -.. code-block:: python - - dataset.apply(lambda x: int(x['label']), new_field_name='target', is_target=True) - -现在我们可以导入 fastNLP 内置的文本分类模型 :class:`~fastNLP.models.CNNText` , - - -.. code-block:: python - - from fastNLP.models import CNNText - model = CNNText((len(vocab),50), num_classes=5, dropout=0.1) - -:class:`~fastNLP.models.CNNText` 的网络结构如下:: - - CNNText( - (embed): Embedding( - 177, 50 - (dropout): Dropout(p=0.0) - ) - (conv_pool): ConvMaxpool( - (convs): ModuleList( - (0): Conv1d(50, 3, kernel_size=(3,), stride=(1,), padding=(2,)) - (1): Conv1d(50, 4, kernel_size=(4,), stride=(1,), padding=(2,)) - (2): Conv1d(50, 5, kernel_size=(5,), stride=(1,), padding=(2,)) - ) - ) - (dropout): Dropout(p=0.1) - (fc): Linear(in_features=12, out_features=5, bias=True) - ) - -下面我们用 :class:`~fastNLP.DataSet` 类的 :meth:`~fastNLP.DataSet.split` 方法将数据集划分为 ``train_data`` 和 ``dev_data`` -两个部分,分别用于训练和验证 - -.. 
code-block:: python - - train_data, dev_data = dataset.split(0.2) - -最后我们用 fastNLP 的 :class:`~fastNLP.Trainer` 进行训练,训练的过程中需要传入模型 ``model`` ,训练数据集 ``train_data`` , -验证数据集 ``dev_data`` ,损失函数 ``loss`` 和衡量标准 ``metrics`` 。 -其中损失函数使用的是 fastNLP 提供的 :class:`~fastNLP.CrossEntropyLoss` 损失函数; -衡量标准使用的是 fastNLP 提供的 :class:`~fastNLP.AccuracyMetric` 正确率指标。 - -.. code-block:: python - - from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric - - trainer = Trainer(model=model, train_data=train_data, dev_data=dev_data, - loss=CrossEntropyLoss(), metrics=AccuracyMetric()) - trainer.train() - -训练过程的输出如下:: - - input fields after batch(if batch size is 2): - words: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 26]) - target fields after batch(if batch size is 2): - target: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) - - training epochs started 2019-05-09-10-59-39 - Evaluation at Epoch 1/10. Step:2/20. AccuracyMetric: acc=0.333333 - - Evaluation at Epoch 2/10. Step:4/20. AccuracyMetric: acc=0.533333 - - Evaluation at Epoch 3/10. Step:6/20. AccuracyMetric: acc=0.533333 - - Evaluation at Epoch 4/10. Step:8/20. AccuracyMetric: acc=0.533333 - - Evaluation at Epoch 5/10. Step:10/20. AccuracyMetric: acc=0.6 - - Evaluation at Epoch 6/10. Step:12/20. AccuracyMetric: acc=0.8 - - Evaluation at Epoch 7/10. Step:14/20. AccuracyMetric: acc=0.8 - - Evaluation at Epoch 8/10. Step:16/20. AccuracyMetric: acc=0.733333 - - Evaluation at Epoch 9/10. Step:18/20. AccuracyMetric: acc=0.733333 - - Evaluation at Epoch 10/10. Step:20/20. AccuracyMetric: acc=0.733333 - - - In Epoch:6/Step:12, got best dev performance:AccuracyMetric: acc=0.8 - Reloaded the best model. - -这份教程只是简单地介绍了使用 fastNLP 工作的流程,更多的教程分析见 :doc:`/user/tutorials` diff --git a/docs/source/user/tutorials.rst b/docs/source/user/tutorials.rst index 196f9c29..6d239e32 100644 --- a/docs/source/user/tutorials.rst +++ b/docs/source/user/tutorials.rst @@ -1,4 +1,4 @@ -======================== +======================== fastNLP 详细使用教程 ======================== @@ -8,13 +8,18 @@ fastNLP 详细使用教程 :maxdepth: 1 使用DataSet预处理文本 - 使用DataSetLoader加载数据集 + 使用Vocabulary转换文本与index 使用Embedding模块将文本转成向量 - 动手实现一个文本分类器I-使用Trainer和Tester快速训练和测试 - 动手实现一个文本分类器II-使用DataSetIter实现自定义训练过程 - 快速实现序列标注模型 - 使用Modules和Models快速搭建自定义模型 - 使用Metric快速评测你的模型 - 使用Callback自定义你的训练过程 - 使用fitlog 辅助 fastNLP 进行科研 + 使用Loader和Pipe加载并处理数据集 + 动手实现一个文本分类器I-使用Trainer和Tester快速训练和测试 + 动手实现一个文本分类器II-使用DataSetIter实现自定义训练过程 + 使用Metric快速评测你的模型 + 使用Modules和Models快速搭建自定义模型 + 快速实现序列标注模型 + 使用Callback自定义你的训练过程 + +.. 
toctree:: + :maxdepth: 1 + 拓展阅读1:BertEmbedding的各种用法 + 拓展阅读2:使用fitlog 辅助 fastNLP 进行科研 diff --git a/fastNLP/__init__.py b/fastNLP/__init__.py index ec192568..2ca2c427 100644 --- a/fastNLP/__init__.py +++ b/fastNLP/__init__.py @@ -2,22 +2,22 @@ fastNLP 由 :mod:`~fastNLP.core` 、 :mod:`~fastNLP.io` 、:mod:`~fastNLP.embeddings` 、 :mod:`~fastNLP.modules`、 :mod:`~fastNLP.models` 等子模块组成,你可以查看每个模块的文档。 -- :mod:`~fastNLP.core` 是fastNLP 的核心模块,包括 DataSet、 Trainer、 Tester 等组件。详见文档 :doc:`/fastNLP.core` -- :mod:`~fastNLP.io` 是实现输入输出的模块,包括了数据集的读取,模型的存取等功能。详见文档 :doc:`/fastNLP.io` -- :mod:`~fastNLP.embeddings` 提供用于构建复杂网络模型所需的各种embedding。详见文档 :doc:`/fastNLP.embeddings` -- :mod:`~fastNLP.modules` 包含了用于搭建神经网络模型的诸多组件,可以帮助用户快速搭建自己所需的网络。详见文档 :doc:`/fastNLP.modules` -- :mod:`~fastNLP.models` 包含了一些使用 fastNLP 实现的完整网络模型,包括 :class:`~fastNLP.models.CNNText` 、 :class:`~fastNLP.models.SeqLabeling` 等常见模型。详见文档 :doc:`fastNLP.models` +- :mod:`~fastNLP.core` 是fastNLP 的核心模块,包括 DataSet、 Trainer、 Tester 等组件。详见文档 :mod:`fastNLP.core` +- :mod:`~fastNLP.io` 是实现输入输出的模块,包括了数据集的读取,模型的存取等功能。详见文档 :mod:`fastNLP.io` +- :mod:`~fastNLP.embeddings` 提供用于构建复杂网络模型所需的各种embedding。详见文档 :mod:`fastNLP.embeddings` +- :mod:`~fastNLP.modules` 包含了用于搭建神经网络模型的诸多组件,可以帮助用户快速搭建自己所需的网络。详见文档 :mod:`fastNLP.modules` +- :mod:`~fastNLP.models` 包含了一些使用 fastNLP 实现的完整网络模型,包括 :class:`~fastNLP.models.CNNText` 、 :class:`~fastNLP.models.SeqLabeling` 等常见模型。详见文档 :mod:`fastNLP.models` fastNLP 中最常用的组件可以直接从 fastNLP 包中 import ,他们的文档如下: """ __all__ = [ "Instance", "FieldArray", - + "DataSetIter", "BatchIter", "TorchLoaderIter", - + "Vocabulary", "DataSet", "Const", @@ -28,9 +28,16 @@ __all__ = [ "Callback", "GradientClipCallback", "EarlyStopCallback", - "TensorboardCallback", + "FitlogCallback", + "EvaluateCallback", "LRScheduler", "ControlC", + "LRFinder", + "TensorboardCallback", + "WarmupCallback", + 'SaveModelCallback', + "CallbackException", + "EarlyStopError", "Padder", "AutoPadder", @@ -43,6 +50,7 @@ __all__ = [ "Optimizer", "SGD", "Adam", + "AdamW", "Sampler", "SequentialSampler", @@ -51,16 +59,24 @@ __all__ = [ "LossFunc", "CrossEntropyLoss", - "L1Loss", "BCELoss", + "L1Loss", + "BCELoss", "NLLLoss", "LossInForward", - "cache_results" + "cache_results", + + 'logger' ] __version__ = '0.4.5' -from .core import * +import sys + +from . import embeddings from . import models from . import modules -from . import embeddings -from .io import data_loader +from .core import * +from .doc_utils import doc_process +from .io import loader, pipe + +doc_process(sys.modules[__name__]) diff --git a/fastNLP/core/__init__.py b/fastNLP/core/__init__.py index c9f51123..f8e9c995 100644 --- a/fastNLP/core/__init__.py +++ b/fastNLP/core/__init__.py @@ -8,23 +8,86 @@ core 模块里实现了 fastNLP 的核心框架,常用的功能都可以从 fa # 从 core 模块的子模块 batch 中 import DataSetIter from fastNLP.core.batch import DataSetIter -对于常用的功能,你只需要在 :doc:`fastNLP` 中查看即可。如果想了解各个子模块的具体作用,您可以在下面找到每个子模块的具体文档。 +对于常用的功能,你只需要在 :mod:`fastNLP` 中查看即可。如果想了解各个子模块的具体作用,您可以在下面找到每个子模块的具体文档。 -.. 
todo:: - 介绍core 的子模块的分工,好像必要性不大 - """ +__all__ = [ + "DataSet", + + "Instance", + + "FieldArray", + "Padder", + "AutoPadder", + "EngChar2DPadder", + + "Vocabulary", + + "DataSetIter", + "BatchIter", + "TorchLoaderIter", + + "Const", + + "Tester", + "Trainer", + + "cache_results", + "seq_len_to_mask", + "get_seq_len", + "logger", + + "Callback", + "GradientClipCallback", + "EarlyStopCallback", + "FitlogCallback", + "EvaluateCallback", + "LRScheduler", + "ControlC", + "LRFinder", + "TensorboardCallback", + "WarmupCallback", + 'SaveModelCallback', + "CallbackException", + "EarlyStopError", + + "LossFunc", + "CrossEntropyLoss", + "L1Loss", + "BCELoss", + "NLLLoss", + "LossInForward", + "CMRC2018Loss", + + "AccuracyMetric", + "SpanFPreRecMetric", + "CMRC2018Metric", + + "Optimizer", + "SGD", + "Adam", + "AdamW", + + "SequentialSampler", + "BucketSampler", + "RandomSampler", + "Sampler", +] + +from ._logger import logger from .batch import DataSetIter, BatchIter, TorchLoaderIter -from .callback import Callback, GradientClipCallback, EarlyStopCallback, TensorboardCallback, LRScheduler, ControlC +from .callback import Callback, GradientClipCallback, EarlyStopCallback, FitlogCallback, EvaluateCallback, \ + LRScheduler, ControlC, LRFinder, TensorboardCallback, WarmupCallback, SaveModelCallback, CallbackException, \ + EarlyStopError from .const import Const from .dataset import DataSet from .field import FieldArray, Padder, AutoPadder, EngChar2DPadder from .instance import Instance -from .losses import LossFunc, CrossEntropyLoss, L1Loss, BCELoss, NLLLoss, LossInForward -from .metrics import AccuracyMetric, SpanFPreRecMetric, ExtractiveQAMetric -from .optimizer import Optimizer, SGD, Adam +from .losses import LossFunc, CrossEntropyLoss, L1Loss, BCELoss, NLLLoss, LossInForward, CMRC2018Loss +from .metrics import AccuracyMetric, SpanFPreRecMetric, CMRC2018Metric +from .optimizer import Optimizer, SGD, Adam, AdamW from .sampler import SequentialSampler, BucketSampler, RandomSampler, Sampler from .tester import Tester from .trainer import Trainer -from .utils import cache_results, seq_len_to_mask +from .utils import cache_results, seq_len_to_mask, get_seq_len from .vocabulary import Vocabulary diff --git a/fastNLP/core/_logger.py b/fastNLP/core/_logger.py new file mode 100644 index 00000000..7198cfbd --- /dev/null +++ b/fastNLP/core/_logger.py @@ -0,0 +1,155 @@ +"""undocumented""" + +__all__ = [ + 'logger', +] + +import logging +import logging.config +import os +import sys +import warnings + +ROOT_NAME = 'fastNLP' + +try: + import fitlog +except ImportError: + fitlog = None +try: + from tqdm.auto import tqdm +except ImportError: + tqdm = None + +if tqdm is not None: + class TqdmLoggingHandler(logging.Handler): + def __init__(self, level=logging.INFO): + super().__init__(level) + + def emit(self, record): + try: + msg = self.format(record) + tqdm.write(msg) + self.flush() + except (KeyboardInterrupt, SystemExit): + raise + except: + self.handleError(record) +else: + class TqdmLoggingHandler(logging.StreamHandler): + def __init__(self, level=logging.INFO): + super().__init__(sys.stdout) + self.setLevel(level) + + +def _get_level(level): + if isinstance(level, int): + pass + else: + level = level.lower() + level = {'info': logging.INFO, 'debug': logging.DEBUG, + 'warn': logging.WARN, 'warning': logging.WARN, + 'error': logging.ERROR}[level] + return level + + +def _add_file_handler(logger, path, level='INFO'): + for h in logger.handlers: + if isinstance(h, logging.FileHandler): + if os.path.abspath(path) == 
h.baseFilename: + # file path already added + return + + # File Handler + if os.path.exists(path): + assert os.path.isfile(path) + warnings.warn('log already exists in {}'.format(path)) + dirname = os.path.abspath(os.path.dirname(path)) + os.makedirs(dirname, exist_ok=True) + + file_handler = logging.FileHandler(path, mode='a') + file_handler.setLevel(_get_level(level)) + file_formatter = logging.Formatter(fmt='%(asctime)s - %(module)s - [%(levelname)s] - %(message)s', + datefmt='%Y/%m/%d %H:%M:%S') + file_handler.setFormatter(file_formatter) + logger.addHandler(file_handler) + + +def _set_stdout_handler(logger, stdout='tqdm', level='INFO'): + level = _get_level(level) + if stdout not in ['none', 'plain', 'tqdm']: + raise ValueError('stdout must in one of {}'.format(['none', 'plain', 'tqdm'])) + # make sure to initialize logger only once + stream_handler = None + for i, h in enumerate(logger.handlers): + if isinstance(h, (logging.StreamHandler, TqdmLoggingHandler)): + stream_handler = h + break + if stream_handler is not None: + logger.removeHandler(stream_handler) + + # Stream Handler + if stdout == 'plain': + stream_handler = logging.StreamHandler(sys.stdout) + elif stdout == 'tqdm': + stream_handler = TqdmLoggingHandler(level) + else: + stream_handler = None + + if stream_handler is not None: + stream_formatter = logging.Formatter('%(message)s') + stream_handler.setLevel(level) + stream_handler.setFormatter(stream_formatter) + logger.addHandler(stream_handler) + + +class FastNLPLogger(logging.getLoggerClass()): + def __init__(self, name): + super().__init__(name) + + def add_file(self, path='./log.txt', level='INFO'): + """add log output file and level""" + _add_file_handler(self, path, level) + + def set_stdout(self, stdout='tqdm', level='INFO'): + """set stdout format and level""" + _set_stdout_handler(self, stdout, level) + + +logging.setLoggerClass(FastNLPLogger) + + +# print(logging.getLoggerClass()) +# print(logging.getLogger()) + +def _init_logger(path=None, stdout='tqdm', level='INFO'): + """initialize logger""" + level = _get_level(level) + + # logger = logging.getLogger() + logger = logging.getLogger(ROOT_NAME) + logger.propagate = False + logger.setLevel(level) + + _set_stdout_handler(logger, stdout, level) + + # File Handler + if path is not None: + _add_file_handler(logger, path, level) + + return logger + + +def _get_logger(name=None, level='INFO'): + level = _get_level(level) + if name is None: + name = ROOT_NAME + assert isinstance(name, str) + if not name.startswith(ROOT_NAME): + name = '{}.{}'.format(ROOT_NAME, name) + logger = logging.getLogger(name) + logger.setLevel(level) + return logger + + +logger = _init_logger(path=None) diff --git a/fastNLP/core/_parallel_utils.py b/fastNLP/core/_parallel_utils.py index 4a7757d3..ce745820 100644 --- a/fastNLP/core/_parallel_utils.py +++ b/fastNLP/core/_parallel_utils.py @@ -1,10 +1,14 @@ +"""undocumented""" + +__all__ = [] import threading + import torch +from torch import nn from torch.nn.parallel.parallel_apply import get_a_var - -from torch.nn.parallel.scatter_gather import scatter_kwargs, gather from torch.nn.parallel.replicate import replicate +from torch.nn.parallel.scatter_gather import scatter_kwargs, gather def parallel_apply(modules, func_name, inputs, kwargs_tup=None, devices=None): @@ -26,11 +30,11 @@ def parallel_apply(modules, func_name, inputs, kwargs_tup=None, devices=None): assert len(modules) == len(devices) else: devices = [None] * len(modules) - + lock = threading.Lock() results = {} grad_enabled = 
torch.is_grad_enabled() - + def _worker(i, module, input, kwargs, device=None): torch.set_grad_enabled(grad_enabled) if device is None: @@ -46,20 +50,20 @@ def parallel_apply(modules, func_name, inputs, kwargs_tup=None, devices=None): except Exception as e: with lock: results[i] = e - + if len(modules) > 1: threads = [threading.Thread(target=_worker, args=(i, module, input, kwargs, device)) for i, (module, input, kwargs, device) in enumerate(zip(modules, inputs, kwargs_tup, devices))] - + for thread in threads: thread.start() for thread in threads: thread.join() else: _worker(0, modules[0], inputs[0], kwargs_tup[0], devices[0]) - + outputs = [] for i in range(len(inputs)): output = results[i] @@ -78,6 +82,7 @@ def _data_parallel_wrapper(func_name, device_ids, output_device): :param output_device: nn.DataParallel中的output_device :return: """ + def wrapper(network, *inputs, **kwargs): inputs, kwargs = scatter_kwargs(inputs, kwargs, device_ids, dim=0) if len(device_ids) == 1: @@ -85,4 +90,18 @@ def _data_parallel_wrapper(func_name, device_ids, output_device): replicas = replicate(network, device_ids[:len(inputs)]) outputs = parallel_apply(replicas, func_name, inputs, kwargs, device_ids[:len(replicas)]) return gather(outputs, output_device) + return wrapper + + +def _model_contains_inner_module(model): + """ + + :param nn.Module model: 模型文件,判断是否内部包含model.module, 多用于check模型是否是nn.DataParallel, + nn.parallel.DistributedDataParallel。主要是在做形参匹配的时候需要使用最内部的model的function。 + :return: bool + """ + if isinstance(model, nn.Module): + if isinstance(model, (nn.DataParallel, nn.parallel.DistributedDataParallel)): + return True + return False diff --git a/fastNLP/core/batch.py b/fastNLP/core/batch.py index 64c5f48e..f2e34c52 100644 --- a/fastNLP/core/batch.py +++ b/fastNLP/core/batch.py @@ -9,14 +9,15 @@ __all__ = [ ] import atexit +from numbers import Number import numpy as np import torch import torch.utils.data -from numbers import Number -from .sampler import SequentialSampler +from ._logger import logger from .dataset import DataSet +from .sampler import SequentialSampler _python_is_exit = False @@ -48,6 +49,11 @@ class DataSetGetter: return len(self.dataset) def collate_fn(self, batch: list): + """ + + :param batch: [[idx1, x_dict1, y_dict1], [idx2, x_dict2, y_dict2], [xx, xx, xx]] + :return: + """ # TODO 支持在DataSet中定义collate_fn,因为有时候可能需要不同的field之间融合,比如BERT的场景 batch_x = {n:[] for n in self.inputs.keys()} batch_y = {n:[] for n in self.targets.keys()} @@ -70,7 +76,7 @@ class DataSetGetter: try: data, flag = _to_tensor(data, f.dtype) except TypeError as e: - print(f"Field {n} cannot be converted to torch.tensor.") + logger.error(f"Field {n} cannot be converted to torch.tensor.") raise e batch_dict[n] = data return batch_dict @@ -93,37 +99,68 @@ class DataSetGetter: class SamplerAdapter(torch.utils.data.Sampler): def __init__(self, sampler, dataset): + super().__init__(dataset) self.sampler = sampler self.dataset = dataset + def __len__(self): + return len(self.dataset) + def __iter__(self): return iter(self.sampler(self.dataset)) class BatchIter: - def __init__(self): - self.dataiter = None - self.num_batches = None + def __init__(self, dataset, batch_size=1, sampler=None, + num_workers=0, pin_memory=False, drop_last=False, + timeout=0, worker_init_fn=None, collate_fn=None): + if not isinstance(sampler, torch.utils.data.Sampler): + self.sampler = SamplerAdapter(sampler=sampler or SequentialSampler(), dataset=dataset) + else: + self.sampler = sampler + if collate_fn is None: + # pytoch <= 1.1 
中不能设置collate_fn=None + self.dataiter = torch.utils.data.DataLoader( + dataset=dataset, batch_size=batch_size, sampler=self.sampler, + num_workers=num_workers, + pin_memory=pin_memory, drop_last=drop_last, + timeout=timeout, worker_init_fn=worker_init_fn) + else: + self.dataiter = torch.utils.data.DataLoader( + dataset=dataset, batch_size=batch_size, sampler=self.sampler, + collate_fn=collate_fn, num_workers=num_workers, + pin_memory=pin_memory, drop_last=drop_last, + timeout=timeout, worker_init_fn=worker_init_fn) + + # 以sampler的数量为准,因为DistributedSampler的时候每个进程上并不是所有的数据都用上了 + self.num_batches = self.get_num_batches(len(self.dataiter.sampler), batch_size, drop_last) + self.batch_size = batch_size self.cur_batch_indices = None - self.batch_size = None def init_iter(self): pass @staticmethod def get_num_batches(num_samples, batch_size, drop_last): + """ + 计算batch的数量。 + + :param int num_samples: + :param int batch_size: + :param bool drop_last: 如果最后一个batch没有batch_size这么多,是否就丢掉。 + :return: + """ num_batches = num_samples // batch_size if not drop_last and (num_samples % batch_size > 0): num_batches += 1 return num_batches - def __iter__(self): - self.init_iter() - for indices, batch_x, batch_y in self.dataiter: - self.cur_batch_indices = indices - yield batch_x, batch_y - def get_batch_indices(self): + """ + 获取当前已经输出的batch的index。 + + :return: + """ return self.cur_batch_indices def __len__(self): @@ -136,8 +173,6 @@ class BatchIter: class DataSetIter(BatchIter): """ - 别名::class:`fastNLP.DataSetIter` :class:`fastNLP.core.batch.DataSetIter` - DataSetIter 用于从 `DataSet` 中按一定的顺序, 依次按 ``batch_size`` 的大小将数据取出, 组成 `x` 和 `y`:: @@ -146,60 +181,94 @@ class DataSetIter(BatchIter): for batch_x, batch_y in batch: # do stuff ... - :param dataset: :class:`~fastNLP.DataSet` 对象, 数据集 - :param int batch_size: 取出的batch大小 - :param sampler: 规定使用的 :class:`~fastNLP.Sampler` 方式. 若为 ``None`` , 使用 :class:`~fastNLP.SequentialSampler`. - - Default: ``None`` - :param bool as_numpy: 若为 ``True`` , 输出batch为 numpy.array. 否则为 :class:`torch.Tensor`. - - Default: ``False`` - :param int num_workers: 使用多少个进程来预处理数据 - :param bool pin_memory: 是否将产生的tensor使用pin memory, 可能会加快速度。 - :param bool drop_last: 如果最后一个batch没有batch_size这么多sample,就扔掉最后一个 - :param timeout: - :param worker_init_fn: 在每个worker启动时调用该函数,会传入一个值,该值是worker的index。 """ def __init__(self, dataset, batch_size=1, sampler=None, as_numpy=False, num_workers=0, pin_memory=False, drop_last=False, - timeout=0, worker_init_fn=None): - super().__init__() + timeout=0, worker_init_fn=None, collate_fn=None): + """ + + :param dataset: :class:`~fastNLP.DataSet` 对象, 数据集 + :param int batch_size: 取出的batch大小 + :param sampler: 规定使用的 :class:`~fastNLP.Sampler` 方式. 若为 ``None`` , 使用 :class:`~fastNLP.SequentialSampler`. + + Default: ``None`` + :param bool as_numpy: 若为 ``True`` , 输出batch为 numpy.array. 否则为 :class:`torch.Tensor`. 
+ + Default: ``False`` + :param int num_workers: 使用多少个进程来预处理数据 + :param bool pin_memory: 是否将产生的tensor使用pin memory, 可能会加快速度。 + :param bool drop_last: 如果最后一个batch没有batch_size这么多sample,就扔掉最后一个 + :param timeout: 生成一个batch的timeout值 + :param worker_init_fn: 在每个worker启动时调用该函数,会传入一个值,该值是worker的index。 + :param collate_fn: 用于将样本组合成batch的函数 + """ assert isinstance(dataset, DataSet) - sampler = SamplerAdapter(sampler=sampler or SequentialSampler(), dataset=dataset) dataset = DataSetGetter(dataset, as_numpy) - collate_fn = dataset.collate_fn if hasattr(dataset, 'collate_fn') else None - self.dataiter = torch.utils.data.DataLoader( + collate_fn = dataset.collate_fn if collate_fn is None else collate_fn + super().__init__( dataset=dataset, batch_size=batch_size, sampler=sampler, - collate_fn=collate_fn, num_workers=num_workers, - pin_memory=pin_memory, drop_last=drop_last, - timeout=timeout, worker_init_fn=worker_init_fn) - self.num_batches = self.get_num_batches(len(dataset), batch_size, drop_last) - self.batch_size = batch_size - - -class TorchLoaderIter(BatchIter): - def __init__(self, dataset): - super().__init__() - assert isinstance(dataset, torch.utils.data.DataLoader) - self.dataiter = dataset - self.num_batches = self.get_num_batches(len(dataset), dataset.batch_size, dataset.drop_last) - self.batch_size = dataset.batch_size + num_workers=num_workers, pin_memory=pin_memory, + drop_last=drop_last, timeout=timeout, worker_init_fn=worker_init_fn, + collate_fn=collate_fn + ) + def __iter__(self): + self.init_iter() + for indices, batch_x, batch_y in self.dataiter: + self.cur_batch_indices = indices + yield batch_x, batch_y -class OnlineDataGettter: - # TODO - pass +class TorchLoaderIter(BatchIter): + """ + 与DataSetIter类似,但用于pytorch的DataSet对象。 + 通过使用TorchLoaderIter封装pytorch的DataSet,然后将其传入到Trainer中。 -class OnlineDataIter(BatchIter): - # TODO - def __init__(self, dataset, batch_size=1, buffer_size=10000, sampler=None, as_numpy=False, + """ + def __init__(self, dataset, batch_size=1, sampler=None, num_workers=0, pin_memory=False, drop_last=False, - timeout=0, worker_init_fn=None, **kwargs): - super().__init__() + timeout=0, worker_init_fn=None, collate_fn=None): + """ + + :param dataset: :class:`~fastNLP.DataSet` 对象, 数据集 + :param int batch_size: 取出的batch大小 + :param sampler: 规定使用的 :class:`~fastNLP.Sampler` 方式. 若为 ``None`` , 使用 :class:`~fastNLP.SequentialSampler`. + + Default: ``None`` + :param int num_workers: 使用多少个进程来预处理数据 + :param bool pin_memory: 是否将产生的tensor使用pin memory, 可能会加快速度。 + :param bool drop_last: 如果最后一个batch没有batch_size这么多sample,就扔掉最后一个 + :param timeout: 生成一个batch的timeout值 + :param worker_init_fn: 在每个worker启动时调用该函数,会传入一个值,该值是worker的index。 + :param collate_fn: 用于将样本组合成batch的函数""" + assert len(dataset) > 0 + ins = dataset[0] + assert len(ins) == 2 and \ + isinstance(ins[0], dict) and \ + isinstance(ins[1], dict), 'DataSet should return two dict, as X and Y' + + super().__init__( + dataset=dataset, batch_size=batch_size, sampler=sampler, + num_workers=num_workers, pin_memory=pin_memory, + drop_last=drop_last, timeout=timeout, worker_init_fn=worker_init_fn, + collate_fn=collate_fn + ) + + def __iter__(self): + self.init_iter() + for batch_x, batch_y in self.dataiter: + self.cur_batch_indices = None + yield batch_x, batch_y def _to_tensor(batch, field_dtype): + """ + + :param batch: np.array() + :param field_dtype: 数据类型 + :return: batch, flag. 
如果传入的数据支持转为tensor,返回的batch就是tensor,且flag为True;如果传入的数据不支持转为tensor, + 返回的batch就是原来的数据,且flag为False + """ try: if field_dtype is not None and isinstance(field_dtype, type)\ and issubclass(field_dtype, Number) \ diff --git a/fastNLP/core/callback.py b/fastNLP/core/callback.py index 6f855397..ad417340 100644 --- a/fastNLP/core/callback.py +++ b/fastNLP/core/callback.py @@ -4,7 +4,7 @@ callback模块实现了 fastNLP 中的许多 callback 类,用于增强 :class: 虽然Trainer本身已经集成了一些功能,但仍然不足以囊括训练过程中可能需要到的功能, 比如负采样,learning rate decay 和 early stop等。 为了解决这个问题,fastNLP引入了callback的机制,:class:`~fastNLP.Callback` 是一种在Trainer训练过程中特定阶段会运行的函数集合。 -关于 :class:`~fastNLP.Trainer` 的详细文档,请参见 :doc:`trainer 模块` +关于 :class:`~fastNLP.Trainer` 的详细文档,请参见 :mod:`trainer 模块` 我们将 :meth:`~fastNLP.Trainer.train` 这个函数内部分为以下的阶段,在对应阶段会触发相应的调用:: @@ -51,22 +51,28 @@ callback模块实现了 fastNLP 中的许多 callback 类,用于增强 :class: """ __all__ = [ "Callback", + "GradientClipCallback", "EarlyStopCallback", - "TensorboardCallback", "FitlogCallback", + "EvaluateCallback", "LRScheduler", "ControlC", + "LRFinder", + "TensorboardCallback", + "WarmupCallback", + "SaveModelCallback", "CallbackException", "EarlyStopError" ] import os +import sys +from copy import deepcopy import torch -from copy import deepcopy -import sys + from .utils import _save_model try: @@ -76,23 +82,27 @@ try: except: tensorboardX_flag = False -from ..io.model_io import ModelSaver, ModelLoader from .dataset import DataSet from .tester import Tester +from ._logger import logger +from .utils import _check_fp16 try: import fitlog except: pass +try: + from apex import amp +except: + amp = None + class Callback(object): """ - 别名::class:`fastNLP.Callback` :class:`fastNLP.core.callback.Callback` - Callback是fastNLP中被设计用于增强 :class:`~fastNLP.Trainer` 的类。 如果Callback被传递给了 Trainer , 则 Trainer 会在对应的阶段调用Callback的函数, - 具体调用时机可以通过 :doc:`trainer 模块` 查看。 + 具体调用时机可以通过 :mod:`trainer 模块` 查看。 这是Callback的基类,所有的callback必须继承自这个类 """ @@ -100,7 +110,8 @@ class Callback(object): def __init__(self): super(Callback, self).__init__() self._trainer = None # 在Trainer内部被重新赋值 - + self._disabled = False + @property def trainer(self): """ @@ -158,7 +169,19 @@ class Callback(object): def batch_per_epoch(self): """每个epoch一共有多少个batch,只有在on_epoch_begin之后才能调用该属性。""" return self._trainer.batch_per_epoch - + + @property + def is_master(self): + return self._trainer.is_master + + @property + def disabled(self): + return self._disabled + + @property + def logger(self): + return getattr(self._trainer, 'logger', logger) + def on_train_begin(self): """ 在Train过程开始之前调用。 @@ -281,6 +304,8 @@ def _transfer(func): def wrapper(manager, *arg): returns = [] for callback in manager.callbacks: + if callback.disabled: + continue returns.append(getattr(callback, func.__name__)(*arg)) return returns @@ -288,31 +313,39 @@ def _transfer(func): class CallbackManager(Callback): + """ + 内部使用的Callback管理类 + """ def __init__(self, env, callbacks=None): """ - 内部使用的Callback管理类 :param dict env: The key is the name of the Trainer attribute(str). The value is the attribute itself. :param List[Callback] callbacks: """ super(CallbackManager, self).__init__() # set attribute of trainer environment - + self._env = env self.callbacks = [] - if callbacks is not None: - if isinstance(callbacks, list): - if all([isinstance(cb, Callback) for cb in callbacks]) is True: - self.callbacks.extend(callbacks) - else: - obj = [not isinstance(cb, Callback) for cb in callbacks][0] - raise TypeError(f"Expect sub-classes of Callback. 
Got {type(obj)}") + if callbacks: + self.callbacks = self.prepare_callbacks(callbacks) + + def prepare_callbacks(self, callbacks): + if not callbacks: + return [] + if isinstance(callbacks, list): + if all([isinstance(cb, Callback) for cb in callbacks]) is True: + pass else: - raise TypeError(f"Expect callbacks in CallbackManager(callbacks) to be list. Got {type(callbacks)}.") - - for env_name, env_val in env.items(): - for callback in self.callbacks: + obj = [not isinstance(cb, Callback) for cb in callbacks][0] + raise TypeError(f"Expect sub-classes of Callback. Got {type(obj)}") + else: + raise TypeError(f"Expect callbacks in CallbackManager(callbacks) to be list. Got {type(callbacks)}.") + + for env_name, env_val in self._env.items(): + for callback in callbacks: setattr(callback, '_' + env_name, env_val) # Callback.trainer - + return callbacks + @_transfer def on_train_begin(self): pass @@ -352,6 +385,10 @@ class CallbackManager(Callback): @_transfer def on_valid_end(self, eval_result, metric_key, optimizer, is_better_eval): pass + + @_transfer + def on_validation(self): + pass @_transfer def on_epoch_end(self): @@ -366,28 +403,53 @@ class CallbackManager(Callback): pass +class DistCallbackManager(CallbackManager): + def __init__(self, env, callbacks_all=None, callbacks_master=None): + super(DistCallbackManager, self).__init__(env) + assert 'trainer' in env + self._trainer = env['trainer'] + self.callbacks_master = [] + self.callbacks_all = [] + self.add_callback(callbacks_all, master=False) + self.add_callback(callbacks_master, master=True) + + def patch_callback(self, callbacks, disabled): + if not callbacks: + return + if not isinstance(callbacks, (list, tuple)): + callbacks = [callbacks] + for cb in callbacks: + cb._disabled = disabled + + def add_callback(self, cb, master=False): + if master: + self.patch_callback(cb, not self.is_master) + self.callbacks_master += self.prepare_callbacks(cb) + else: + self.callbacks_all += self.prepare_callbacks(cb) + self.callbacks = self.callbacks_all + self.callbacks_master + + class GradientClipCallback(Callback): """ - 别名::class:`fastNLP.GradientClipCallback` :class:`fastNLP.core.callback.GradientClipCallback` - 每次backward前,将parameter的gradient clip到某个范围。 - - :param None,torch.Tensor,List[torch.Tensor] parameters: 一般通过model.parameters()获得。 - 如果为None则默认对Trainer的model中所有参数进行clip - :param float clip_value: 将gradient 限制到[-clip_value, clip_value]。clip_value应该为正数 - :param str clip_type: 支持'norm', 'value' - 两种:: - - 1 'norm', 将gradient的norm rescale到[-clip_value, clip_value] - - 2 'value', 将gradient限制在[-clip_value, clip_value], - 小于-clip_value的gradient被赋值为-clip_value; - 大于clip_value的gradient被赋值为clip_value. - """ def __init__(self, parameters=None, clip_value=1, clip_type='norm'): + """ + :param None,torch.Tensor,List[torch.Tensor] parameters: 一般通过model.parameters()获得。 + 如果为None则默认对Trainer的model中所有参数进行clip + :param float clip_value: 将gradient 限制到[-clip_value, clip_value]。clip_value应该为正数 + :param str clip_type: 支持'norm', 'value' + 两种:: + + 1 'norm', 将gradient的norm rescale到[-clip_value, clip_value] + + 2 'value', 将gradient限制在[-clip_value, clip_value], + 小于-clip_value的gradient被赋值为-clip_value; + 大于clip_value的gradient被赋值为clip_value. 
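+
+        A minimal sketch of how this callback is typically passed to a Trainer (illustrative
+        only; ``model`` and ``train_data`` are assumed to be defined elsewhere)::
+
+            from fastNLP import Trainer, CrossEntropyLoss
+
+            callback = GradientClipCallback(clip_value=5, clip_type='value')
+            trainer = Trainer(train_data, model, loss=CrossEntropyLoss(),
+                              callbacks=[callback])
+            trainer.train()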
+ """ super().__init__() from torch import nn @@ -403,21 +465,25 @@ class GradientClipCallback(Callback): def on_backward_end(self): if self.step%self.update_every==0: if self.parameters is None: - self.clip_fun(self.model.parameters(), self.clip_value) + if getattr(self.trainer, 'fp16', ''): + _check_fp16() + self.clip_fun(amp.master_params(self.optimizer), self.clip_value) + else: + self.clip_fun(self.model.parameters(), self.clip_value) else: self.clip_fun(self.parameters, self.clip_value) class EarlyStopCallback(Callback): """ - 别名::class:`fastNLP.EarlyStopCallback` :class:`fastNLP.core.callback.EarlyStopCallback` - - 多少个epoch没有变好就停止训练,相关类 :class:`EarlyStopError` - - :param int patience: epoch的数量 + 多少个epoch没有变好就停止训练,相关类 :class:`~fastNLP.core.callback.EarlyStopError` """ def __init__(self, patience): + """ + + :param int patience: epoch的数量 + """ super(EarlyStopCallback, self).__init__() self.patience = patience self.wait = 0 @@ -434,52 +500,54 @@ class EarlyStopCallback(Callback): def on_exception(self, exception): if isinstance(exception, EarlyStopError): - print("Early Stopping triggered in epoch {}!".format(self.epoch)) + logger.info("Early Stopping triggered in epoch {}!".format(self.epoch)) else: raise exception # 抛出陌生Error class FitlogCallback(Callback): """ - 别名: :class:`fastNLP.FitlogCallback` :class:`fastNLP.core.callback.FitlogCallback` - 该callback可将loss和progress写入到fitlog中; 如果Trainer有dev的数据,将自动把dev的结果写入到log中; 同时还支持传入 - 一个(或多个)test数据集进行测试(只有在trainer具有dev时才能使用),每次在dev上evaluate之后会在这些数据集上验证一下。 - 并将验证结果写入到fitlog中。这些数据集的结果是根据dev上最好的结果报道的,即如果dev在第3个epoch取得了最佳,则 - fitlog中记录的关于这些数据集的结果就是来自第三个epoch的结果。 - - :param ~fastNLP.DataSet,Dict[~fastNLP.DataSet] data: 传入DataSet对象,会使用多个Trainer中的metric对数据进行验证。如果需要传入多个 - DataSet请通过dict的方式传入,dict的key将作为对应dataset的name传递给fitlog。若tester不为None时,data需要通过 - dict的方式传入。如果仅传入DataSet, 则被命名为test - :param ~fastNLP.Tester tester: Tester对象,将在on_valid_end时调用。tester中的DataSet会被称为为`test` - :param int log_loss_every: 多少个step记录一次loss(记录的是这几个batch的loss平均值),如果数据集较大建议将该值设置得 - 大一些,不然会导致log文件巨大。默认为0, 即不要记录loss。 - :param int verbose: 是否在终端打印evaluation的结果,0不打印。 - :param bool log_exception: fitlog是否记录发生的exception信息 + 一个(或多个)test数据集进行测试(只有在trainer具有dev时才能使用),每次在dev上evaluate之后会在这些数据集上验证一下。 + 并将验证结果写入到fitlog中。这些数据集的结果是根据dev上最好的结果报道的,即如果dev在第3个epoch取得了最佳,则 + fitlog中记录的关于这些数据集的结果就是来自第三个epoch的结果。 """ def __init__(self, data=None, tester=None, log_loss_every=0, verbose=0, log_exception=False): + """ + + :param ~fastNLP.DataSet,Dict[~fastNLP.DataSet] data: 传入DataSet对象,会使用多个Trainer中的metric对数据进行验证。如果需要 + 传入多个DataSet请通过dict的方式传入,dict的key将作为对应dataset的name传递给fitlog。data的结果的名称以'data'开头。 + :param ~fastNLP.Tester,Dict[~fastNLP.Tester] tester: Tester对象,将在on_valid_end时调用。tester的结果的名称以'tester'开头 + :param int log_loss_every: 多少个step记录一次loss(记录的是这几个batch的loss平均值),如果数据集较大建议将该值设置得 + 大一些,不然会导致log文件巨大。默认为0, 即不要记录loss。 + :param int verbose: 是否在终端打印evaluation的结果,0不打印。 + :param bool log_exception: fitlog是否记录发生的exception信息 + """ super().__init__() self.datasets = {} self.testers = {} self._log_exception = log_exception assert isinstance(log_loss_every, int) and log_loss_every>=0 if tester is not None: - assert isinstance(tester, Tester), "Only fastNLP.Tester allowed." - assert isinstance(data, dict) or data is None, "If tester is not None, only dict[DataSet] allowed for data." - if data is not None: - assert 'test' not in data, "Cannot use `test` as DataSet key, when tester is passed." 
- setattr(tester, 'verbose', 0) - self.testers['test'] = tester - + if isinstance(tester, dict): + for name, test in tester.items(): + if not isinstance(test, Tester): + raise TypeError(f"{name} in tester is not a valid fastNLP.Tester.") + self.testers['tester-' + name] = test + if isinstance(tester, Tester): + self.testers['tester-test'] = tester + for tester in self.testers.values(): + setattr(tester, 'verbose', 0) + if isinstance(data, dict): for key, value in data.items(): assert isinstance(value, DataSet), f"Only DataSet object is allowed, not {type(value)}." for key, value in data.items(): - self.datasets[key] = value + self.datasets['data-' + key] = value elif isinstance(data, DataSet): - self.datasets['test'] = data - else: + self.datasets['data-test'] = data + elif data is not None: raise TypeError("data receives dict[DataSet] or DataSet object.") self.verbose = verbose @@ -492,8 +560,11 @@ class FitlogCallback(Callback): if len(self.datasets) > 0: for key, data in self.datasets.items(): - tester = Tester(data=data, model=self.model, batch_size=self.batch_size, metrics=self.trainer.metrics, - verbose=0) + tester = Tester(data=data, model=self.model, + batch_size=self.trainer.kwargs.get('dev_batch_size', self.batch_size), + metrics=self.trainer.metrics, + verbose=0, + use_tqdm=self.trainer.test_use_tqdm) self.testers[key] = tester fitlog.add_progress(total_steps=self.n_steps) @@ -516,7 +587,7 @@ class FitlogCallback(Callback): try: eval_result = tester.test() if self.verbose != 0: - self.pbar.write("Evaluation on DataSet {}:".format(key)) + self.pbar.write("FitlogCallback evaluation on {}:".format(key)) self.pbar.write(tester._format_eval_results(eval_result)) fitlog.add_metric(eval_result, name=key, step=self.step, epoch=self.epoch) if better_result: @@ -533,17 +604,75 @@ class FitlogCallback(Callback): fitlog.add_other(repr(exception), name='except_info') -class LRScheduler(Callback): +class EvaluateCallback(Callback): + """ + 通过使用该Callback可以使得Trainer在evaluate dev之外还可以evaluate其它数据集,比如测试集。每一次验证dev之前都会先验证EvaluateCallback + 中的数据。 """ - 别名::class:`fastNLP.LRScheduler` :class:`fastNLP.core.callback.LRScheduler` - 对PyTorch LR Scheduler的包装以使得其可以被Trainer所使用 + def __init__(self, data=None, tester=None): + """ + :param ~fastNLP.DataSet,Dict[~fastNLP.DataSet] data: 传入DataSet对象,会使用Trainer中的metric对数据进行验证。如果需要传入多个 + DataSet请通过dict的方式传入。 + :param ~fastNLP.Tester,Dict[~fastNLP.DataSet] tester: Tester对象, 通过使用Tester对象,可以使得验证的metric与Trainer中 + 的metric不一样。 + """ + super().__init__() + self.datasets = {} + self.testers = {} + if tester is not None: + if isinstance(tester, dict): + for name, test in tester.items(): + if not isinstance(test, Tester): + raise TypeError(f"{name} in tester is not a valid fastNLP.Tester.") + self.testers['tester-' + name] = test + if isinstance(tester, Tester): + self.testers['tester-test'] = tester + for tester in self.testers.values(): + setattr(tester, 'verbose', 0) + + if isinstance(data, dict): + for key, value in data.items(): + assert isinstance(value, DataSet), f"Only DataSet object is allowed, not {type(value)}." 
+ for key, value in data.items(): + self.datasets['data-' + key] = value + elif isinstance(data, DataSet): + self.datasets['data-test'] = data + elif data is not None: + raise TypeError("data receives dict[DataSet] or DataSet object.") + + def on_train_begin(self): + if len(self.datasets) > 0 and self.trainer.dev_data is None: + raise RuntimeError("Trainer has no dev data, you cannot pass extra DataSet to do evaluation.") + + if len(self.datasets) > 0: + for key, data in self.datasets.items(): + tester = Tester(data=data, model=self.model, + batch_size=self.trainer.kwargs.get('dev_batch_size', self.batch_size), + metrics=self.trainer.metrics, verbose=0, + use_tqdm=self.trainer.test_use_tqdm) + self.testers[key] = tester - :param torch.optim.lr_scheduler._LRScheduler lr_scheduler: PyTorch的lr_scheduler + def on_valid_end(self, eval_result, metric_key, optimizer, better_result): + if len(self.testers) > 0: + for key, tester in self.testers.items(): + try: + eval_result = tester.test() + self.logger.info("EvaluateCallback evaluation on {}:".format(key)) + self.logger.info(tester._format_eval_results(eval_result)) + except Exception: + self.logger.error("Exception happens when evaluate on DataSet named `{}`.".format(key)) + + +class LRScheduler(Callback): + """ + 对PyTorch LR Scheduler的包装以使得其可以被Trainer所使用 """ def __init__(self, lr_scheduler): - + """ + :param torch.optim.lr_scheduler._LRScheduler lr_scheduler: PyTorch的lr_scheduler + """ super(LRScheduler, self).__init__() import torch.optim if isinstance(lr_scheduler, torch.optim.lr_scheduler._LRScheduler): @@ -557,13 +686,13 @@ class LRScheduler(Callback): class ControlC(Callback): """ - 别名::class:`fastNLP.ControlC` :class:`fastNLP.core.callback.ControlC` - - :param bool quit_all: 若为True,则检测到control+C 直接退出程序;否则只退出Trainer + 检测到 control+C 时的反馈 """ def __init__(self, quit_all): - + """ + :param bool quit_all: 若为True,则检测到control+C 直接退出程序;否则只退出Trainer + """ super(ControlC, self).__init__() if type(quit_all) != bool: raise ValueError("In KeyBoardInterrupt, quit_all arguemnt must be a bool.") @@ -581,12 +710,14 @@ class ControlC(Callback): class SmoothValue(object): + """work for LRFinder""" + def __init__(self, beta: float): self.beta, self.n, self.mov_avg = beta, 0, 0 self.smooth = None def add_value(self, val: float) -> None: - "Add `val` to calculate updated smoothed value." 
+ """Add `val` to calculate updated smoothed value.""" self.n += 1 self.mov_avg = self.beta * self.mov_avg + (1 - self.beta) * val self.smooth = self.mov_avg / (1 - self.beta ** self.n) @@ -594,16 +725,15 @@ class SmoothValue(object): class LRFinder(Callback): """ - 别名::class:`fastNLP.LRFinder` :class:`fastNLP.core.callback.LRFinder` - 用第一个 epoch 找最佳的学习率,从第二个epoch开始应用它 - - :param float start_lr: 学习率下界 - :param float end_lr: 学习率上界 """ def __init__(self, start_lr=1e-6, end_lr=10): + """ + :param float start_lr: 学习率下界 + :param float end_lr: 学习率上界 + """ super(LRFinder, self).__init__() self.start_lr, self.end_lr = start_lr, end_lr @@ -614,8 +744,7 @@ class LRFinder(Callback): self.smooth_value = SmoothValue(0.8) self.opt = None self.find = None - self.loader = ModelLoader() - + @property def lr_gen(self): scale = (self.end_lr - self.start_lr) / self.batch_per_epoch @@ -630,7 +759,7 @@ class LRFinder(Callback): self.opt = self.trainer.optimizer # pytorch optimizer self.opt.param_groups[0]["lr"] = self.start_lr # save model - ModelSaver("tmp").save_pytorch(self.trainer.model, param_only=True) + torch.save(self.model.state_dict(), 'tmp') self.find = True def on_backward_begin(self, loss): @@ -659,14 +788,14 @@ class LRFinder(Callback): self.opt.param_groups[0]["lr"] = self.best_lr self.find = False # reset model - ModelLoader().load_pytorch(self.trainer.model, "tmp") + states = torch.load('tmp') + self.model.load_state_dict(states) + os.remove('tmp') self.pbar.write("Model reset. \nFind best lr={}".format(self.best_lr)) class TensorboardCallback(Callback): """ - 别名::class:`fastNLP.TensorboardCallback` :class:`fastNLP.core.callback.TensorboardCallback` - 接受以下一个或多个字符串作为参数: - "model" - "loss" @@ -674,7 +803,7 @@ class TensorboardCallback(Callback): .. warning:: fastNLP 已停止对此功能的维护,请等待 fastNLP 兼容 PyTorch1.1 的下一个版本。 - 或者使用和 fastNLP 高度配合的 fitlog(参见 :doc:`/tutorials/tutorial_10_fitlog` )。 + 或者使用和 fastNLP 高度配合的 fitlog(参见 :doc:`/tutorials/tutorial_11_fitlog` )。 """ @@ -741,14 +870,17 @@ class TensorboardCallback(Callback): class WarmupCallback(Callback): """ - 按一定的周期调节Learning rate的大小。 - - :param int,float warmup: 如果warmup为int,则在该step之前,learning rate根据schedule的策略变化; 如果warmup为float, - 如0.1, 则前10%的step是按照schedule策略调整learning rate。 - :param str schedule: 以哪种方式调整。linear: 前warmup的step上升到指定的learning rate(从Trainer中的optimizer处获取的), 后 - warmup的step下降到0; constant前warmup的step上升到指定learning rate,后面的step保持learning rate. + learning rate按照一定的速率从0上升到设置的learning rate。 """ def __init__(self, warmup=0.1, schedule='constant'): + """ + + :param int,float warmup: 如果warmup为int,则在该step之前,learning rate根据schedule的策略变化; 如果warmup为float, + 如0.1, 则前10%的step是按照schedule策略调整learning rate。 + :param str schedule: 以哪种方式调整。 + linear: 前warmup的step上升到指定的learning rate(从Trainer中的optimizer处获取的), 后warmup的step下降到0; + constant前warmup的step上升到指定learning rate,后面的step保持learning rate. + """ super().__init__() self.warmup = max(warmup, 0.) 
@@ -790,23 +922,26 @@ class WarmupCallback(Callback): class SaveModelCallback(Callback): """ 由于Trainer在训练过程中只会保存最佳的模型, 该callback可实现多种方式的结果存储。 - 会根据训练开始的时间戳在save_dir下建立文件夹,再在文件夹下存放多个模型 - -save_dir - -2019-07-03-15-06-36 - -epoch:0_step:20_{metric_key}:{evaluate_performance}.pt # metric是给定的metric_key, evaluate_performance是性能 - -epoch:1_step:40_{metric_key}:{evaluate_performance}.pt - -2019-07-03-15-10-00 - -epoch:0_step:20_{metric_key}:{evaluate_performance}.pt # metric是给定的metric_key, evaluate_perfomance是性能 - :param str save_dir: 将模型存放在哪个目录下,会在该目录下创建以时间戳命名的目录,并存放模型 - :param int top: 保存dev表现top多少模型。-1为保存所有模型。 - :param bool only_param: 是否只保存模型d饿权重。 - :param save_on_exception: 发生exception时,是否保存一份发生exception的模型。模型名称为epoch:x_step:x_Exception:{exception_name}. + 会根据训练开始的时间戳在save_dir下建立文件夹,再在文件夹下存放多个模型:: + + -save_dir + -2019-07-03-15-06-36 + -epoch:0_step:20_{metric_key}:{evaluate_performance}.pt # metric是给定的metric_key, evaluate_performance是性能 + -epoch:1_step:40_{metric_key}:{evaluate_performance}.pt + -2019-07-03-15-10-00 + -epoch:0_step:20_{metric_key}:{evaluate_performance}.pt # metric是给定的metric_key, evaluate_perfomance是性能 """ def __init__(self, save_dir, top=3, only_param=False, save_on_exception=False): + """ + + :param str save_dir: 将模型存放在哪个目录下,会在该目录下创建以时间戳命名的目录,并存放模型。如果save_dir不存在将自动创建 + :param int top: 保存dev表现top多少模型。-1为保存所有模型。 + :param bool only_param: 是否只保存模型的权重。 + :param save_on_exception: 发生exception时,是否保存一份发生exception的模型。模型名称为epoch:x_step:x_Exception:{exception_name}. + """ super().__init__() - if not os.path.isdir(save_dir): - raise IsADirectoryError("{} is not a directory.".format(save_dir)) + os.makedirs(save_dir, exist_ok=True) self.save_dir = save_dir if top < 0: self.top = sys.maxsize @@ -844,35 +979,37 @@ class SaveModelCallback(Callback): return save_pair, delete_pair def _save_this_model(self, metric_value): - name = "epoch:{}_step:{}_{}:{:.6f}.pt".format(self.epoch, self.step, self.trainer.metric_key, metric_value) + name = "epoch-{}_step-{}_{}-{:.6f}.pt".format(self.epoch, self.step, self.trainer.metric_key, metric_value) save_pair, delete_pair = self._insert_into_ordered_save_models((metric_value, name)) if save_pair: try: _save_model(self.model, model_name=name, save_dir=self.save_dir, only_param=self.only_param) except Exception as e: - print(f"The following exception:{e} happens when save model to {self.save_dir}.") + logger.error(f"The following exception:{e} happens when save model to {self.save_dir}.") if delete_pair: try: delete_model_path = os.path.join(self.save_dir, delete_pair[1]) if os.path.exists(delete_model_path): os.remove(delete_model_path) except Exception as e: - print(f"Fail to delete model {name} at {self.save_dir} caused by exception:{e}.") + logger.error(f"Fail to delete model {name} at {self.save_dir} caused by exception:{e}.") def on_exception(self, exception): if self.save_on_exception: - name = "epoch:{}_step:{}_Exception:{}.pt".format(self.epoch, self.step, exception.__class__.__name__) + name = "epoch-{}_step-{}_Exception-{}.pt".format(self.epoch, self.step, exception.__class__.__name__) _save_model(self.model, model_name=name, save_dir=self.save_dir, only_param=self.only_param) class CallbackException(BaseException): """ 当需要通过callback跳出训练的时候可以通过抛出CallbackException并在on_exception中捕获这个值。 - - :param str msg: Exception的信息。 """ def __init__(self, msg): + """ + + :param str msg: Exception的信息。 + """ super(CallbackException, self).__init__(msg) @@ -884,3 +1021,79 @@ class EarlyStopError(CallbackException): def __init__(self, msg): super(EarlyStopError, 
self).__init__(msg) + + +class EchoCallback(Callback): + """ + 用于测试分布式训练 + + """ + def __init__(self, name, out=sys.stdout): + super(EchoCallback, self).__init__() + self.name = name + self.out = out # deprecated + + def __getattribute__(self, item): + if item.startswith('on_'): + logger.info('{}.{} has been called at pid: {}'.format(self.name, item, os.getpid())) + return super(EchoCallback, self).__getattribute__(item) + + +class _TesterCallback(Callback): + def __init__(self, data, model, metrics, metric_key=None, batch_size=16, num_workers=None): + super(_TesterCallback, self).__init__() + if hasattr(model, 'module'): + # for data parallel model + model = model.module + self.tester = Tester(data, model, + metrics=metrics, batch_size=batch_size, + num_workers=num_workers, verbose=0) + if metric_key is not None: + self.metric_key, self.increase_better = self._parse_metric_key(metric_key) + else: + self.metric_key = None + self.increase_better = True + self.score = None + + def on_valid_begin(self): + cur_score = self.tester.test() + eval_str = "Evaluation at Epoch {}/{}. Step:{}/{}. - {}".format( + self.epoch, self.n_epochs, self.step, self.n_steps, + self.tester._format_eval_results(cur_score)) + self.logger.info(eval_str) + is_better = self.compare_better(cur_score) + if is_better: + self.score = cur_score + return cur_score, is_better + + @staticmethod + def _get_score(metric_dict, key): + for metric in metric_dict.items(): + if key in metric: + return metric[key] + return None + + @staticmethod + def _parse_metric_key(metric_key): + # parse metric_key + # increase_better is True. It means the exp result gets better if the indicator increases. + # It is true by default. + increase_better = False if metric_key[0] == "-" else True + metric_key = metric_key[1:] if metric_key[0] == "+" or metric_key[0] == "-" else metric_key + return metric_key, increase_better + + def compare_better(self, a): + if self.score is None: + return True + if self.metric_key is None: + metric_key = list(list(self.score.values())[0].keys())[0] + self.metric_key, self.increase_better = self._parse_metric_key(metric_key) + k = self.metric_key + score = self._get_score(self.score, k) + new_score = self._get_score(a, k) + if score is None or new_score is None: + return False + if self.increase_better: + return score <= new_score + else: + return score >= new_score diff --git a/fastNLP/core/const.py b/fastNLP/core/const.py index 89ff51a2..9bcea2d6 100644 --- a/fastNLP/core/const.py +++ b/fastNLP/core/const.py @@ -1,3 +1,12 @@ +r""" +fastNLP包当中的field命名均符合一定的规范,该规范由fastNLP.Const类进行定义。 +""" + +__all__ = [ + "Const" +] + + class Const: """ fastNLP中field命名常量。 @@ -7,12 +16,14 @@ class Const: 具体列表:: - INPUT 模型的序列输入 words(复数words1, words2) - CHAR_INPUT 模型character输入 chars(复数chars1, chars2) - INPUT_LEN 序列长度 seq_len(复数seq_len1,seq_len2) - OUTPUT 模型输出 pred(复数pred1, pred2) - TARGET 真实目标 target(复数target1,target2) - LOSS 损失函数 loss (复数loss1,loss2) + INPUT 模型的序列输入 words(具有多列words时,依次使用words1, words2, ) + CHAR_INPUT 模型character输入 chars(具有多列chars时,依次使用chars1, chars2) + INPUT_LEN 序列长度 seq_len(具有多列seq_len时,依次使用seq_len1,seq_len2) + OUTPUT 模型输出 pred(具有多列pred时,依次使用pred1, pred2) + TARGET 真实目标 target(具有多列target时,依次使用target1,target2) + LOSS 损失函数 loss (具有多列loss时,依次使用loss1,loss2) + RAW_WORD 原文的词 raw_words (具有多列raw_words时,依次使用raw_words1, raw_words2) + RAW_CHAR 原文的字 raw_chars (具有多列raw_chars时,依次使用raw_chars1, raw_chars2) """ INPUT = 'words' @@ -21,37 +32,51 @@ class Const: OUTPUT = 'pred' TARGET = 'target' LOSS = 'loss' - + RAW_WORD = 'raw_words' + 
RAW_CHAR = 'raw_chars' + @staticmethod def INPUTS(i): """得到第 i 个 ``INPUT`` 的命名""" i = int(i) + 1 return Const.INPUT + str(i) - + @staticmethod def CHAR_INPUTS(i): """得到第 i 个 ``CHAR_INPUT`` 的命名""" i = int(i) + 1 return Const.CHAR_INPUT + str(i) - + + @staticmethod + def RAW_WORDS(i): + """得到第 i 个 ``RAW_WORDS`` 的命名""" + i = int(i) + 1 + return Const.RAW_WORD + str(i) + + @staticmethod + def RAW_CHARS(i): + """得到第 i 个 ``RAW_CHARS`` 的命名""" + i = int(i) + 1 + return Const.RAW_CHAR + str(i) + @staticmethod def INPUT_LENS(i): """得到第 i 个 ``INPUT_LEN`` 的命名""" i = int(i) + 1 return Const.INPUT_LEN + str(i) - + @staticmethod def OUTPUTS(i): """得到第 i 个 ``OUTPUT`` 的命名""" i = int(i) + 1 return Const.OUTPUT + str(i) - + @staticmethod def TARGETS(i): """得到第 i 个 ``TARGET`` 的命名""" i = int(i) + 1 return Const.TARGET + str(i) - + @staticmethod def LOSSES(i): """得到第 i 个 ``LOSS`` 的命名""" diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index 7b7fa87a..53e9bb4c 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -86,7 +86,7 @@ dataset.append(Instance(sentence=sent, label=label)) .. note:: - 直接读取特定数据集的数据请参考 :doc:`/tutorials/tutorial_2_load_dataset` + 直接读取特定数据集的数据请参考 :doc:`/tutorials/tutorial_4_load_dataset` 2.2 对DataSet中的内容处理 -------------------------------------- @@ -288,29 +288,33 @@ __all__ = [ ] import _pickle as pickle -import warnings +from copy import deepcopy import numpy as np +from prettytable import PrettyTable +from ._logger import logger +from .const import Const +from .field import AppendToTargetOrInputException from .field import AutoPadder from .field import FieldArray +from .field import SetInputOrTargetException from .instance import Instance from .utils import _get_func_signature -from .field import AppendToTargetOrInputException -from .field import SetInputOrTargetException +from .utils import pretty_table_printer + class DataSet(object): """ - 别名::class:`fastNLP.DataSet` :class:`fastNLP.core.dataset.DataSet` - - fastNLP的数据容器,详细的使用方法见文档 :doc:`fastNLP.core.dataset` - - :param data: 如果为dict类型,则每个key的value应该为等长的list; 如果为list, - 每个元素应该为具有相同field的 :class:`~fastNLP.Instance` 。 - + fastNLP的数据容器,详细的使用方法见文档 :mod:`fastNLP.core.dataset` """ - + def __init__(self, data=None): + """ + + :param data: 如果为dict类型,则每个key的value应该为等长的list; 如果为list, + 每个元素应该为具有相同field的 :class:`~fastNLP.Instance` 。 + """ self.field_arrays = {} if data is not None: if isinstance(data, dict): @@ -324,41 +328,48 @@ class DataSet(object): for ins in data: assert isinstance(ins, Instance), "Must be Instance type, not {}.".format(type(ins)) self.append(ins) - + else: raise ValueError("data only be dict or list type.") - + def __contains__(self, item): return item in self.field_arrays - + def __iter__(self): def iter_func(): for idx in range(len(self)): yield self[idx] - + return iter_func() - + def _inner_iter(self): class Iter_ptr: def __init__(self, dataset, idx): self.dataset = dataset self.idx = idx - + def __getitem__(self, item): assert item in self.dataset.field_arrays, "no such field:{} in Instance {}".format(item, self.dataset[ self.idx]) assert self.idx < len(self.dataset.field_arrays[item]), "index:{} out of range".format(self.idx) return self.dataset.field_arrays[item][self.idx] - + + def __setitem__(self, key, value): + raise TypeError("You cannot modify value directly.") + + def items(self): + ins = self.dataset[self.idx] + return ins.items() + def __repr__(self): return self.dataset[self.idx].__repr__() - + def inner_iter_func(): for idx in range(len(self)): yield Iter_ptr(self, idx) - + 
return inner_iter_func() - + def __getitem__(self, idx): """给定int的index,返回一个Instance; 给定slice,返回包含这个slice内容的新的DataSet。 @@ -391,20 +402,20 @@ class DataSet(object): return dataset else: raise KeyError("Unrecognized type {} for idx in __getitem__ method".format(type(idx))) - + def __getattr__(self, item): # Not tested. Don't use !! if item == "field_arrays": raise AttributeError if isinstance(item, str) and item in self.field_arrays: return self.field_arrays[item] - + def __setstate__(self, state): self.__dict__ = state - + def __getstate__(self): return self.__dict__ - + def __len__(self): """Fetch the length of the dataset. @@ -414,16 +425,66 @@ class DataSet(object): return 0 field = iter(self.field_arrays.values()).__next__() return len(field) - - def __inner_repr__(self): - if len(self) < 20: - return ",\n".join([ins.__repr__() for ins in self]) - else: - return self[:5].__inner_repr__() + "\n...\n" + self[-5:].__inner_repr__() - + def __repr__(self): - return "DataSet(" + self.__inner_repr__() + ")" - + return str(pretty_table_printer(self)) + + def print_field_meta(self): + """ + 输出当前field的meta信息, 形似下列的输出:: + + +-------------+-------+-------+ + | field_names | x | y | + +=============+=======+=======+ + | is_input | True | False | + | is_target | False | False | + | ignore_type | False | | + | pad_value | 0 | | + +-------------+-------+-------+ + + :param field_names: DataSet中field的名称 + :param is_input: field是否为input + :param is_target: field是否为target + :param ignore_type: 是否忽略该field的type, 一般仅在该field至少为input或target时才有意义 + :param pad_value: 该field的pad的值,仅在该field为input或target时有意义 + :return: + """ + if len(self.field_arrays)>0: + field_names = ['field_names'] + is_inputs = ['is_input'] + is_targets = ['is_target'] + pad_values = ['pad_value'] + ignore_types = ['ignore_type'] + + for name, field_array in self.field_arrays.items(): + field_names.append(name) + if field_array.is_input: + is_inputs.append(True) + else: + is_inputs.append(False) + if field_array.is_target: + is_targets.append(True) + else: + is_targets.append(False) + + if (field_array.is_input or field_array.is_target) and field_array.padder is not None: + pad_values.append(field_array.padder.get_pad_val()) + else: + pad_values.append(' ') + + if field_array._ignore_type: + ignore_types.append(True) + elif field_array.is_input or field_array.is_target: + ignore_types.append(False) + else: + ignore_types.append(' ') + table = PrettyTable(field_names=field_names) + fields = [is_inputs, is_targets, ignore_types, pad_values] + for field in fields: + table.add_row(field) + logger.info(table) + return table + def append(self, instance): """ 将一个instance对象append到DataSet后面。 @@ -446,9 +507,9 @@ class DataSet(object): try: self.field_arrays[name].append(field) except AppendToTargetOrInputException as e: - print(f"Cannot append to field:{name}.") + logger.error(f"Cannot append to field:{name}.") raise e - + def add_fieldarray(self, field_name, fieldarray): """ 将fieldarray添加到DataSet中. @@ -463,7 +524,7 @@ class DataSet(object): raise RuntimeError(f"The field to add must have the same size as dataset. 
" f"Dataset size {len(self)} != field size {len(fieldarray)}") self.field_arrays[field_name] = fieldarray - + def add_field(self, field_name, fields, padder=AutoPadder(), is_input=False, is_target=False, ignore_type=False): """ 新增一个field @@ -475,19 +536,19 @@ class DataSet(object): :param bool is_target: 新加入的field是否是target :param bool ignore_type: 是否忽略对新加入的field的类型检查 """ - + if len(self.field_arrays) != 0: if len(self) != len(fields): raise RuntimeError(f"The field to add must have the same size as dataset. " f"Dataset size {len(self)} != field size {len(fields)}") self.field_arrays[field_name] = FieldArray(field_name, fields, is_target=is_target, is_input=is_input, padder=padder, ignore_type=ignore_type) - + def delete_instance(self, index): """ 删除第index个instance - :param int index: 需要删除的instance的index,从0开始 + :param int index: 需要删除的instance的index,序号从0开始。 """ assert isinstance(index, int), "Only integer supported." if len(self) <= index: @@ -497,7 +558,8 @@ class DataSet(object): else: for field in self.field_arrays.values(): field.pop(index) - + return self + def delete_field(self, field_name): """ 删除名为field_name的field @@ -505,7 +567,22 @@ class DataSet(object): :param str field_name: 需要删除的field的名称. """ self.field_arrays.pop(field_name) - + return self + + def copy_field(self, field_name, new_field_name): + """ + 深度copy名为field_name的field到new_field_name + + :param str field_name: 需要copy的field。 + :param str new_field_name: copy生成的field名称 + :return: self + """ + if not self.has_field(field_name): + raise KeyError(f"Field:{field_name} not found in DataSet.") + fieldarray = deepcopy(self.get_field(field_name)) + self.add_fieldarray(field_name=new_field_name, fieldarray=fieldarray) + return self + def has_field(self, field_name): """ 判断DataSet中是否有名为field_name这个field @@ -516,7 +593,7 @@ class DataSet(object): if isinstance(field_name, str): return field_name in self.field_arrays return False - + def get_field(self, field_name): """ 获取field_name这个field @@ -527,7 +604,7 @@ class DataSet(object): if field_name not in self.field_arrays: raise KeyError("Field name {} not found in DataSet".format(field_name)) return self.field_arrays[field_name] - + def get_all_fields(self): """ 返回一个dict,key为field_name, value为对应的 :class:`~fastNLP.FieldArray` @@ -535,7 +612,7 @@ class DataSet(object): :return dict: 返回如上所述的字典 """ return self.field_arrays - + def get_field_names(self) -> list: """ 返回一个list,包含所有 field 的名字 @@ -543,7 +620,7 @@ class DataSet(object): :return list: 返回如上所述的列表 """ return sorted(self.field_arrays.keys()) - + def get_length(self): """ 获取DataSet的元素数量 @@ -551,22 +628,22 @@ class DataSet(object): :return: int: DataSet中Instance的个数。 """ return len(self) - - def rename_field(self, old_name, new_name): + + def rename_field(self, field_name, new_field_name): """ 将某个field重新命名. 
- :param str old_name: 原来的field名称。 - :param str new_name: 修改为new_name。 + :param str field_name: 原来的field名称。 + :param str new_field_name: 修改为new_name。 """ - if old_name in self.field_arrays: - self.field_arrays[new_name] = self.field_arrays.pop(old_name) - self.field_arrays[new_name].name = new_name + if field_name in self.field_arrays: + self.field_arrays[new_field_name] = self.field_arrays.pop(field_name) + self.field_arrays[new_field_name].name = new_field_name else: - raise KeyError("DataSet has no field named {}.".format(old_name)) + raise KeyError("DataSet has no field named {}.".format(field_name)) return self - - def set_target(self, *field_names, flag=True): + + def set_target(self, *field_names, flag=True, use_1st_ins_infer_dim_type=True): """ 将field_names的field设置为target @@ -577,19 +654,23 @@ class DataSet(object): :param str field_names: field的名称 :param bool flag: 将field_name的target状态设置为flag + :param bool use_1st_ins_infer_dim_type: 如果为True,将不会check该列是否所有数据都是同样的维度,同样的类型。将直接使用第一 + 行的数据进行类型和维度推断本列的数据的类型和维度。 """ assert isinstance(flag, bool), "Only bool type supported." for name in field_names: if name in self.field_arrays: try: + self.field_arrays[name]._use_1st_ins_infer_dim_type = bool(use_1st_ins_infer_dim_type) self.field_arrays[name].is_target = flag except SetInputOrTargetException as e: - print(f"Cannot set field:{name} as target.") + logger.error(f"Cannot set field:{name} as target.") raise e else: raise KeyError("{} is not a valid field name.".format(name)) - - def set_input(self, *field_names, flag=True): + return self + + def set_input(self, *field_names, flag=True, use_1st_ins_infer_dim_type=True): """ 将field_names的field设置为input:: @@ -598,17 +679,21 @@ class DataSet(object): :param str field_names: field的名称 :param bool flag: 将field_name的input状态设置为flag + :param bool use_1st_ins_infer_dim_type: 如果为True,将不会check该列是否所有数据都是同样的维度,同样的类型。将直接使用第一 + 行的数据进行类型和维度推断本列的数据的类型和维度。 """ for name in field_names: if name in self.field_arrays: try: + self.field_arrays[name]._use_1st_ins_infer_dim_type = bool(use_1st_ins_infer_dim_type) self.field_arrays[name].is_input = flag except SetInputOrTargetException as e: - print(f"Cannot set field:{name} as input, exception happens at the {e.index} value.") + logger.error(f"Cannot set field:{name} as input, exception happens at the {e.index} value.") raise e else: raise KeyError("{} is not a valid field name.".format(name)) - + return self + def set_ignore_type(self, *field_names, flag=True): """ 将field设置为忽略类型状态。当某个field被设置了ignore_type, 则在被设置为target或者input时将不进行类型检查, @@ -624,7 +709,8 @@ class DataSet(object): self.field_arrays[name].ignore_type = flag else: raise KeyError("{} is not a valid field name.".format(name)) - + return self + def set_padder(self, field_name, padder): """ 为field_name设置padder:: @@ -639,7 +725,8 @@ class DataSet(object): if field_name not in self.field_arrays: raise KeyError("There is no field named {}.".format(field_name)) self.field_arrays[field_name].set_padder(padder) - + return self + def set_pad_val(self, field_name, pad_val): """ 为某个field设置对应的pad_val. 
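Several DataSet setters in the hunks above now return ``self``, so configuration calls can be chained. A minimal usage sketch (field names and values are invented for illustration, not taken from the patch)::

    from fastNLP import DataSet

    ds = DataSet({'words': [['a', 'b', 'c'], ['d', 'e']], 'target': [0, 1]})
    # set_input / set_target now return the DataSet itself, so the calls chain;
    # use_1st_ins_infer_dim_type only inspects the first row when inferring type/dim
    ds.set_input('words', use_1st_ins_infer_dim_type=True).set_target('target')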
@@ -650,7 +737,8 @@ class DataSet(object): if field_name not in self.field_arrays: raise KeyError("There is no field named {}.".format(field_name)) self.field_arrays[field_name].set_pad_val(pad_val) - + return self + def get_input_name(self): """ 返回所有is_input被设置为True的field名称 @@ -658,7 +746,7 @@ class DataSet(object): :return list: 里面的元素为被设置为input的field名称 """ return [name for name, field in self.field_arrays.items() if field.is_input] - + def get_target_name(self): """ 返回所有is_target被设置为True的field名称 @@ -666,7 +754,7 @@ class DataSet(object): :return list: 里面的元素为被设置为target的field名称 """ return [name for name, field in self.field_arrays.items() if field.is_target] - + def apply_field(self, func, field_name, new_field_name=None, **kwargs): """ 将DataSet中的每个instance中的名为 `field_name` 的field传给func,并获取它的返回值。 @@ -695,16 +783,16 @@ class DataSet(object): results.append(func(ins[field_name])) except Exception as e: if idx != -1: - print("Exception happens at the `{}`th instance.".format(idx)) + logger.error("Exception happens at the `{}`th(from 1) instance.".format(idx + 1)) raise e if not (new_field_name is None) and len(list(filter(lambda x: x is not None, results))) == 0: # all None raise ValueError("{} always return None.".format(_get_func_signature(func=func))) - + if new_field_name is not None: self._add_apply_field(results, new_field_name, kwargs) - + return results - + def _add_apply_field(self, results, new_field_name, kwargs): """ 将results作为加入到新的field中,field名称为new_field_name @@ -736,7 +824,7 @@ class DataSet(object): self.add_field(field_name=new_field_name, fields=results, is_input=extra_param.get("is_input", None), is_target=extra_param.get("is_target", None), ignore_type=extra_param.get("ignore_type", False)) - + def apply(self, func, new_field_name=None, **kwargs): """ 将DataSet中每个instance传入到func中,并获取它的返回值. @@ -760,20 +848,21 @@ class DataSet(object): results = [] for idx, ins in enumerate(self._inner_iter()): results.append(func(ins)) - except Exception as e: + except BaseException as e: if idx != -1: - print("Exception happens at the `{}`th instance.".format(idx)) + logger.error("Exception happens at the `{}`th instance.".format(idx)) raise e + # results = [func(ins) for ins in self._inner_iter()] if not (new_field_name is None) and len(list(filter(lambda x: x is not None, results))) == 0: # all None raise ValueError("{} always return None.".format(_get_func_signature(func=func))) - + if new_field_name is not None: self._add_apply_field(results, new_field_name, kwargs) - + return results - def add_seq_len(self, field_name:str, new_field_name='seq_len'): + def add_seq_len(self, field_name: str, new_field_name=Const.INPUT_LEN): """ 将使用len()直接对field_name中每个元素作用,将其结果作为seqence length, 并放入seq_len这个field。 @@ -810,7 +899,7 @@ class DataSet(object): return dataset else: return DataSet() - + def split(self, ratio, shuffle=True): """ 将DataSet按照ratio的比例拆分,返回两个DataSet @@ -836,51 +925,9 @@ class DataSet(object): for field_name in self.field_arrays: train_set.field_arrays[field_name].to(self.field_arrays[field_name]) dev_set.field_arrays[field_name].to(self.field_arrays[field_name]) - + return train_set, dev_set - - @classmethod - def read_csv(cls, csv_path, headers=None, sep=",", dropna=True): - r""" - .. 
warning:: - 此方法会在下个版本移除,请使用 :class:`fastNLP.io.CSVLoader` - - 从csv_path路径下以csv的格式读取数据。 - :param str csv_path: 从哪里读取csv文件 - :param list[str] headers: 如果为None,则使用csv文件的第一行作为header; 如果传入list(str), 则元素的个数必须 - 与csv文件中每行的元素个数相同。 - :param str sep: 分割符 - :param bool dropna: 是否忽略与header数量不一致行。 - :return: 读取后的 :class:`~fastNLP.读取后的DataSet`。 - """ - warnings.warn('DataSet.read_csv is deprecated, use CSVLoader instead', - category=DeprecationWarning) - with open(csv_path, "r", encoding='utf-8') as f: - start_idx = 0 - if headers is None: - headers = f.readline().rstrip('\r\n') - headers = headers.split(sep) - start_idx += 1 - else: - assert isinstance(headers, (list, tuple)), "headers should be list or tuple, not {}.".format( - type(headers)) - _dict = {} - for col in headers: - _dict[col] = [] - for line_idx, line in enumerate(f, start_idx): - contents = line.rstrip('\r\n').split(sep) - if len(contents) != len(headers): - if dropna: - continue - else: - # TODO change error type - raise ValueError("Line {} has {} parts, while header has {} parts." \ - .format(line_idx, len(contents), len(headers))) - for header, content in zip(headers, contents): - _dict[header].append(content) - return cls(_dict) - def save(self, path): """ 保存DataSet. @@ -889,7 +936,7 @@ class DataSet(object): """ with open(path, 'wb') as f: pickle.dump(self, f) - + @staticmethod def load(path): r""" diff --git a/fastNLP/core/dist_trainer.py b/fastNLP/core/dist_trainer.py new file mode 100644 index 00000000..15c5eda5 --- /dev/null +++ b/fastNLP/core/dist_trainer.py @@ -0,0 +1,447 @@ +"""undocumented +正在开发中的分布式训练代码 +""" +import logging +import os +import time +from datetime import datetime + +import torch +import torch.cuda +import torch.distributed as dist +import torch.optim +from pkg_resources import parse_version +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.utils.data.distributed import DistributedSampler +from tqdm import tqdm + +from ._logger import logger +from .batch import DataSetIter, BatchIter +from .callback import DistCallbackManager, CallbackException +from .callback import _TesterCallback +from .dataset import DataSet +from .losses import _prepare_losser +from .optimizer import Optimizer +from .utils import _build_args +from .utils import _check_fp16 +from .utils import _get_func_signature +from .utils import _move_dict_value_to_device + +try: + from apex import amp +except: + amp = None + +__all__ = [ + 'get_local_rank', + 'DistTrainer', +] + +def get_local_rank(): + """ + 返回当前进程的 local rank, 0 到 N-1 ,N为当前分布式总进程数 + """ + if 'LOCAL_RANK' in os.environ: + return int(os.environ['LOCAL_RANK']) + from argparse import ArgumentParser + parser = ArgumentParser() + parser.add_argument('--local_rank', type=int) + args, _ = parser.parse_known_args() + if 'local_rank' in args and args.local_rank: + os.environ['LOCAL_RANK'] = str(args.local_rank) # for multiple calls for this function + return args.local_rank + raise RuntimeError('Please use "python -m torch.distributed.launch --nproc_per_node=N train_script.py') + + +class DistTrainer(): + """ + 分布式的 Trainer,支持分布式训练和混合精度的训练。具体实现原理请阅读 pytorch 官方文档。 + + Note: 使用分布式 Trainer 时会同时有多个进程执行训练代码。因此将单进程的训练代码改为多进程之前, + 请仔细检查,确保训练代码中的同步和互斥操作能正确执行(如模型保持,打印日志等) + """ + def __init__(self, train_data, model, optimizer=None, loss=None, + callbacks_all=None, callbacks_master=None, + batch_size_per_gpu=8, n_epochs=1, + num_workers=1, drop_last=False, + dev_data=None, metrics=None, metric_key=None, + update_every=1, print_every=10, validate_every=-1, + save_every=-1, 
save_path=None, device='auto', + fp16='', backend=None, init_method=None, use_tqdm=True): + """ + + :param train_data: 训练集, :class:`~fastNLP.DataSet` 类型。 + :param nn.modules model: 待训练的模型 + :param optimizer: `torch.optim.Optimizer` 优化器。如果为None,则Trainer使用默认的Adam(model.parameters(), lr=4e-3)这个优化器 + :param loss: 使用的 :class:`~fastNLP.core.losses.LossBase` 对象。当为None时,默认使用 :class:`~fastNLP.LossInForward` + :param list callbacks_all: 用于在train过程中起调节作用的回调函数,作用于所有训练进程中。 + 可使用的callback参见 :mod:`callback模块 ` + :param list callbacks_master: 用于在train过程中起调节作用的回调函数,只作用于其中一个进程( Master 进程)。 + 可使用的callback参见 :mod:`callback模块 ` + :param int batch_size_per_gpu: 训练时,每个进程的 batch 大小。 + :param int n_epochs: 需要优化迭代多少次。 + :param num_workers: int, 有多少个线程来进行数据pad处理。 + :param drop_last: 如果最后一个batch没有正好为batch_size这么多数据,就扔掉最后一个batch + :param dev_data: 用于做验证的DataSet, :class:`~fastNLP.DataSet` 类型。 + :param metrics: 验证的评估函数。可以只使用一个 :class:`Metric` , + 也可以使用多个 :class:`Metric` ,通过列表传入。 + 如验证时取得了更好的验证结果(如果有多个Metric,以列表中第一个Metric为准),且save_path不为None, + 则保存当前模型。Metric种类详见 :mod:`metrics模块 ` 。仅在传入dev_data时有效。 + :param str,None metric_key: :class:`Metric` 有时会有多个指标, + 比如 :class:`~fastNLP.core.metrics.SpanFPreRecMetric` 中包含了'f', 'pre', 'rec'。此时需 + 要指定以哪个指标为准。另外有些指标是越小效果越好,比如语言模型的困惑度,这种情况下,在key前面增加一个'-'来表 + 明验证时,值越小越好(比如: "-ppl")。仅在传入dev_data时有效。 + :param update_every: int, 多少步更新一次梯度。用于希望累计梯度的场景,比如需要128的batch_size, 但是直接设为128 + 会导致内存不足,通过设置batch_size=32, update_every=4达到目的。当optimizer为None时,该参数无效。 + :param int print_every: 多少次反向传播更新tqdm显示的loss; 如果use_tqdm=False, 则多少次反向传播打印loss。 + :param int validate_every: 多少个step在验证集上验证一次; 如果为-1,则每个epoch结束验证一次。仅在传入dev_data时有效。 + :param int save_every: 多少个step保存一次模型,如果为-1,则每个epoch结束保存一次。仅在传入save_path时有效。 + :param str,None save_path: 将模型保存路径,如果路径不存在,将自动创建文件夹。如果为None,则不保存模型。如果dev_data为None,则保存 + 最后一次迭代的模型。保存的时候不仅保存了参数,还保存了模型结构。即便使用DataParallel,这里也只保存模型。 + :param str device: 指定 device,可以是 gpu,cpu 或 auto + :param str fp16: 指定半精度训练的优化等级,可为 O1,O2 或 O3,若为空字符串则不使用半精度。 + :param backend: 指定分布式的backend,详情参考 pytorch 文档 + :param init_method 指定分布式的初始化方法,详情参考 pytorch 文档 + :param bool use_tqdm: 是否使用tqdm来显示训练进度; 如果为False,则将loss打印在终端中。 + """ + assert device in ['auto', 'cuda', 'cpu'], "Please set correct device in [auto', 'cuda', 'cpu']" + if device == 'auto': + device = 'cuda' if torch.cuda.is_available() else 'cpu' + if backend is None: + backend = 'nccl' if device == 'cuda' else 'gloo' + + # init distributed + if device == 'cuda': + torch.cuda.set_device(get_local_rank()) + self.device = torch.device("cuda", get_local_rank()) + else: + self.device = torch.device(device) + + dist.init_process_group(backend=backend, init_method=init_method) + self.world_size = dist.get_world_size() + self.rank = dist.get_rank() # unique id for each process + + self.model = model + self.train_data = train_data + self.batch_size_per_gpu = int(batch_size_per_gpu) + self.n_epochs = int(n_epochs) + self.num_data_workers = int(num_workers) + self.drop_last = drop_last + self.update_every = int(update_every) + self.print_every = int(print_every) + self.validate_every = int(validate_every) + self.save_every = int(save_every) + self.save_path = save_path + self.losser = _prepare_losser(loss) + self.fp16 = fp16 + self.init_method = init_method + self.backend = backend + self.local_rank = get_local_rank() + self._forward_func = model.forward + self.callback_manager = DistCallbackManager( + env={"trainer": self}, callbacks_all=callbacks_all, + callbacks_master=callbacks_master) + self.test_manager = DistCallbackManager(env={'trainer': self}) + 
self.metric_key = metric_key + self.use_tqdm = use_tqdm + + model.to(self.device) + optimizer = self._get_optimizer(optimizer) + + # init fp16, must before DataParallel init + if len(self.fp16): + assert isinstance(self.fp16, str), "Please set Apex AMP optimization level selected in ['O0', 'O1', 'O2', 'O3']" + _check_fp16() + assert device == 'cuda', "Amp requires cuda device" + model, optimizer = amp.initialize(model, optimizer, opt_level=self.fp16) + + # init DataParallel + if parse_version(torch.__version__)>=parse_version('1.1'): + self.model = DDP(model, device_ids=[self.local_rank], + output_device=self.local_rank, find_unused_parameters=True) + else: + self.model = DDP(model, device_ids=[self.local_rank], + output_device=self.local_rank) + + self.optimizer = optimizer + self.sampler = DistributedSampler(self.train_data) + self.data_iterator = self._get_data_iter(self.train_data) + self.batch_size = self.world_size * self.batch_size_per_gpu + self.n_steps = self._get_n_steps() + + # for evaluation, only run eval on master proc + if dev_data and metrics: + cb = _TesterCallback( + dev_data, model, metrics, + batch_size=batch_size_per_gpu, num_workers=num_workers) + self.test_manager.add_callback([cb], master=False) + + # Setup logging + dist.barrier() + self.start_time = datetime.now().strftime('%m_%d_%Y-%H_%M') + if self.save_path: + self.cp_save_path = os.path.join(self.save_path, 'checkpoints') + else: + self.cp_save_path = None + + # use INFO in the master, WARN for others + logger.setLevel(logging.INFO if self.is_master else logging.WARNING) + self.logger = logger + self.logger.info("Setup Distributed Trainer") + self.logger.warning("Process pid: {}, rank: {}, local rank: {}, device: {}, fp16: {}".format( + os.getpid(), self.rank, self.local_rank, self.device, self.fp16 if self.fp16 else False)) + self.logger.info("Num of processes: {}".format(self.world_size)) + self.logger.info("Use device: {}".format(device)) + self.logger.info("Training with fp16: {}, optimization level: {}".format( + len(self.fp16) > 0, self.fp16 if self.fp16 else None)) + + def _get_n_steps(self): + batch_size = self.world_size * self.batch_size_per_gpu + return (len(self.train_data) // batch_size + int( + len(self.train_data) % batch_size != 0)) * int(self.drop_last == 0) * self.n_epochs + + def _get_data_iter(self, dataset): + if isinstance(dataset, DataSet): + return DataSetIter( + dataset=dataset, batch_size=self.batch_size_per_gpu, + num_workers=self.num_data_workers, sampler=self.sampler, + drop_last=self.drop_last + ) + elif isinstance(dataset, BatchIter): + return dataset + else: + raise TypeError("train_data type {} not support".format(type(dataset))) + + def _get_optimizer(self, optimizer): + if isinstance(optimizer, torch.optim.Optimizer): + return optimizer + elif isinstance(optimizer, Optimizer): + return optimizer.construct_from_pytorch(self.model.parameters()) + elif optimizer is None: + return torch.optim.Adam(self.model.parameters(), lr=4e-3) + else: + raise TypeError("optimizer can only be torch.optim.Optimizer type, not {}.".format(type(optimizer))) + + @property + def is_master(self): + """是否是主进程""" + return self.rank == 0 + + def train(self, load_best_model=True, on_exception='auto'): + """ + 使用该函数使Trainer开始训练。 + + :param str on_exception: 在训练过程遭遇exception,并被 :py:class:Callback 的on_exception()处理后,是否继续抛出异常。 + 支持'ignore','raise', 'auto': 'ignore'将捕获异常,写在Trainer.train()后面的代码将继续运行; 'raise'将异常抛出; + 'auto'将ignore以下两种Exception: CallbackException与KeyboardInterrupt, raise其它exception. 
+ :return dict: 返回一个字典类型的数据, + 内含以下内容:: + + seconds: float, 表示训练时长 + 以下三个内容只有在提供了dev_data的情况下会有。 + best_eval: Dict of Dict, 表示evaluation的结果。第一层的key为Metric的名称, + 第二层的key为具体的Metric + best_epoch: int,在第几个epoch取得的最佳值 + best_step: int, 在第几个step(batch)更新取得的最佳值 + + """ + try: + self.logger.info("###### Training epochs started ######") + self.logger.info('Total epochs: %d'% self.n_epochs) + self.logger.info('Total steps: %d'% self.n_steps) + self.logger.info('Num instances per GPU %d'% self.batch_size_per_gpu) + self.logger.info('Total batch_size: %d'% self.batch_size_per_gpu * dist.get_world_size()) + self.logger.info('Total num of samples: %d'% len(self.train_data)) + self.logger.info("Num of callbacks for all workers: {}".format( + len(self.callback_manager.callbacks_all))) + self.logger.info("Num of callbacks for master workers: {}".format( + len(self.callback_manager.callbacks_master))) + self.logger.info("Callbacks for all workers: {}".format( + [repr(cb) for cb in self.callback_manager.callbacks_all])) + self.logger.info("Callbacks for master workers: {}".format( + [repr(cb) for cb in self.callback_manager.callbacks_master])) + + start_time = time.time() + results = {} + if self.n_epochs <= 0: + self.logger.info("Training epoch is {}, nothing was done.".format(self.n_epochs)) + results['seconds'] = 0. + return results + + try: + self.callback_manager.on_train_begin() + self._train() + self.callback_manager.on_train_end() + + except BaseException as e: + self.callback_manager.on_exception(e) + if on_exception == 'auto': + if not isinstance(e, (CallbackException, KeyboardInterrupt)): + raise e + else: + self.logger.info('Catch {}, ignored.'.format(e.__class__.__name__)) + elif on_exception == 'raise': + raise e + + results['seconds'] = round(time.time() - start_time, 2) + self.logger.info("###### Train finished ######") + self.logger.info('Total train time: {} seconds.'. format(results['seconds'])) + if load_best_model and self.cp_save_path and len(self.test_manager.callbacks): + self.load_check_point('best') + finally: + pass + dist.barrier() + return results + + def _train(self): + if not self.use_tqdm: + from .utils import _pseudo_tqdm as inner_tqdm + else: + inner_tqdm = tqdm + + self.step = 0 + self.epoch = 0 + self.pbar = inner_tqdm(total=self.n_steps, postfix='loss:{0:<6.5f}', + leave=False, dynamic_ncols=True, disable=not self.is_master) + pbar = self.pbar + avg_loss = 0 + data_iterator = self.data_iterator + self.model.zero_grad() + for epoch in range(1, self.n_epochs + 1): + self.epoch = epoch + pbar.set_description_str(desc="Epoch {}/{}".format(epoch, self.n_epochs)) + # early stopping + self.callback_manager.on_epoch_begin() + for batch_x, batch_y in data_iterator: + self.model.train() + self.step += 1 + _move_dict_value_to_device(batch_x, batch_y, device=self.device) + indices = data_iterator.get_batch_indices() + # negative sampling; replace unknown; re-weight batch_y + self.callback_manager.on_batch_begin(batch_x, batch_y, indices) + prediction = self._data_forward(self.model, batch_x) + + # edit prediction + self.callback_manager.on_loss_begin(batch_y, prediction) + loss = self._compute_loss(prediction, batch_y) + avg_loss += loss.item() + + # Is loss NaN or inf? 
requires_grad = False + self.callback_manager.on_backward_begin(loss) + + if self.fp16: + with amp.scale_loss(loss, self.optimizer) as scale_loss: + scale_loss.backward() + else: + loss.backward() + + self.callback_manager.on_backward_end() + + self._update() + self.callback_manager.on_step_end() + + if self.step % self.print_every == 0: + avg_loss = float(avg_loss) / self.print_every + print_output = "loss:{:<6.5f}".format(avg_loss) + pbar.update(self.print_every) + pbar.set_postfix_str(print_output) + avg_loss = 0 + + self.callback_manager.on_batch_end() + + if (self.validate_every > 0 and self.step % self.validate_every == 0): + self._do_validation() + + if self.cp_save_path and \ + self.save_every > 0 and \ + self.step % self.save_every == 0: + self.save_check_point() + + # ================= mini-batch end ==================== # + if self.validate_every < 0: + self._do_validation() + + if self.save_every < 0 and self.cp_save_path: + self.save_check_point() + # lr decay; early stopping + self.callback_manager.on_epoch_end() + # =============== epochs end =================== # + pbar.close() + self.pbar = None + # ============ tqdm end ============== # + + def _update(self): + """Perform weight update on a model. + + """ + if self.step % self.update_every == 0: + self.optimizer.step() + self.model.zero_grad() + + def _data_forward(self, network, x): + x = _build_args(self._forward_func, **x) + y = network(**x) + if not isinstance(y, dict): + raise TypeError( + f"The return value of {_get_func_signature(self._forward_func)} should be dict, got {type(y)}.") + return y + + def _compute_loss(self, predict, truth): + """Compute loss given prediction and ground truth. + + :param predict: prediction dict, produced by model.forward + :param truth: ground truth dict, produced by batch_y + :return: a scalar + """ + loss = self.losser(predict, truth) + if self.update_every > 1: + loss = loss / self.update_every + if loss.dim() > 0: + loss = loss.mean() + return loss + + def save_check_point(self, name=None, only_params=False): + """保存当前模型""" + # only master save models + if self.is_master: + if name is None: + name = 'checkpoint-{}.bin'.format(self.step) + os.makedirs(self.cp_save_path, exist_ok=True) + path = os.path.join(self.cp_save_path, name) + self.logger.info("Save checkpoint to {}".format(path)) + model_to_save = self.model.module + if only_params: + model_to_save = model_to_save.state_dict() + torch.save(model_to_save, path) + + def load_check_point(self, name): + path = os.path.join(self.cp_save_path, name) + self.logger.info('reload best model from %s', path) + model_load = torch.load(path, map_location='cpu') + if not isinstance(model_load, dict): + model_load = model_load.state_dict() + self.model.module.load_state_dict(model_load) + + def _do_validation(self): + self.callback_manager.on_valid_begin() + # do evaluate on all nodes + eval_res = self.test_manager.on_valid_begin() + eval_res = list(filter(lambda x: x is not None, eval_res)) + if len(eval_res): + eval_res, is_better = list(zip(*eval_res)) + else: + eval_res, is_better = None, None + # save better model on master node + if self.is_master and is_better is not None and self.cp_save_path: + for i, better_flag in enumerate(is_better): + if better_flag: + # TODO to support multiple datasets to evaluate + self.save_check_point('best') + break + self.callback_manager.on_valid_end( + eval_res, self.metric_key, self.optimizer, is_better) + dist.barrier() + + def close(self): + """关闭Trainer,销毁进程""" + dist.destroy_process_group() diff 
--git a/fastNLP/core/field.py b/fastNLP/core/field.py index bba854f5..1835bafa 100644 --- a/fastNLP/core/field.py +++ b/fastNLP/core/field.py @@ -1,73 +1,91 @@ +""" +.. todo:: + doc +""" +__all__ = [ + "Padder", + "AutoPadder", + "EngChar2DPadder", +] -from numbers import Number -import torch -import numpy as np -from typing import Any from abc import abstractmethod -from copy import deepcopy from collections import Counter +from copy import deepcopy +from numbers import Number +from typing import Any + +import numpy as np +import torch + +from ._logger import logger +from .utils import _is_iterable + class SetInputOrTargetException(Exception): def __init__(self, msg, index=None, field_name=None): super().__init__(msg) self.msg = msg self.index = index # 标示在哪个数据遭遇到问题了 - self.field_name = field_name # 标示当前field的名称 + self.field_name = field_name # 标示当前field的名称 + class AppendToTargetOrInputException(Exception): def __init__(self, msg, index=None, field_name=None): super().__init__(msg) self.msg = msg self.index = index # 标示在哪个数据遭遇到问题了 - self.field_name = field_name # 标示当前field的名称 + self.field_name = field_name # 标示当前field的名称 + class FieldArray: - def __init__(self, name, content, is_target=False, is_input=False, padder=None, ignore_type=False): - if len(content)==0: + def __init__(self, name, content, is_target=False, is_input=False, padder=None, ignore_type=False, + use_1st_ins_infer_dim_type=True): + if len(content) == 0: raise RuntimeError("Empty fieldarray is not allowed.") _content = content try: _content = list(_content) except BaseException as e: - print(f"Cannot convert content(of type:{type(content)}) into list.") + logger.error(f"Cannot convert content(of type:{type(content)}) into list.") raise e self.name = name self.content = _content self._ignore_type = ignore_type # 根据input的情况设置input,target等 - self._cell_ndim = None # 多少维度 + self._cell_ndim = None # 多少维度, 如果value是1, dim为0; 如果value是[1, 2], dim=2 self.dtype = None # 最内层的element都是什么类型的 + self._use_1st_ins_infer_dim_type = bool(use_1st_ins_infer_dim_type) self._is_input = False self._is_target = False - + if is_input: self.is_input = is_input if is_target: self.is_target = is_target - + if padder is None: padder = AutoPadder(pad_val=0) else: assert isinstance(padder, Padder), "padder must be of type fastNLP.Padder." 
padder = deepcopy(padder) self.set_padder(padder) - + @property def ignore_type(self): return self._ignore_type - + @ignore_type.setter def ignore_type(self, value): if value: self._cell_ndim = None self.dtype = None self._ignore_type = value - + @property def is_input(self): return self._is_input - + @is_input.setter def is_input(self, value): """ @@ -77,16 +95,16 @@ class FieldArray: if value is True and \ self._is_target is False and \ self._ignore_type is False: - self._check_dtype_and_ndim() + self._check_dtype_and_ndim(only_check_1st_ins_dim_type=self._use_1st_ins_infer_dim_type) if value is False and self._is_target is False: self.dtype = None self._cell_ndim = None self._is_input = value - + @property def is_target(self): return self._is_target - + @is_target.setter def is_target(self, value): """ @@ -95,70 +113,82 @@ class FieldArray: if value is True and \ self._is_input is False and \ self._ignore_type is False: - self._check_dtype_and_ndim() + self._check_dtype_and_ndim(only_check_1st_ins_dim_type=self._use_1st_ins_infer_dim_type) if value is False and self._is_input is False: self.dtype = None self._cell_ndim = None self._is_target = value - - def _check_dtype_and_ndim(self): + + def _check_dtype_and_ndim(self, only_check_1st_ins_dim_type=True): """ 检查当前content所有的element是否是同一个类型,且是否每个元素具有相同的维度。通过的话,设置_cell_ndim与_ele_type属性;没有 通过将直接报错. + :param bool only_check_1st_ins_dim_type: 是否只检查第一个元素的type和dim :return: """ cell_0 = self.content[0] index = 0 try: type_0, dim_0 = _get_ele_type_and_dim(cell_0) - for cell in self.content[1:]: - index += 1 - type_i, dim_i = _get_ele_type_and_dim(cell) - if type_i!=type_0: - raise SetInputOrTargetException("Type:{} in index {} is different from the first element with type:{}." - ".".format(type_i, index, type_0)) - if dim_0!=dim_i: - raise SetInputOrTargetException("Dimension:{} in index {} is different from the first element with " - "dimension:{}.".format(dim_i, index, dim_0)) + if not only_check_1st_ins_dim_type: + for cell in self.content[1:]: + index += 1 + type_i, dim_i = _get_ele_type_and_dim(cell) + if type_i != type_0: + raise SetInputOrTargetException( + "Type:{} in index {} is different from the first element with type:{}." 
+ ".".format(type_i, index, type_0)) + if dim_0 != dim_i: + raise SetInputOrTargetException( + "Dimension:{} in index {} is different from the first element with " + "dimension:{}.".format(dim_i, index, dim_0)) self._cell_ndim = dim_0 self.dtype = type_0 except SetInputOrTargetException as e: e.index = index raise e - - def append(self, val:Any): + + def append(self, val: Any): """ :param val: 把该val append到fieldarray。 :return: """ - if (self._is_target or self._is_input) and self._ignore_type is False: + if (self._is_target or self._is_input) and self._ignore_type is False and not self._use_1st_ins_infer_dim_type: type_, dim_ = _get_ele_type_and_dim(val) - if self.dtype!=type_: + if self.dtype != type_: raise AppendToTargetOrInputException(f"Value(type:{type_}) are of different types with " f"previous values(type:{self.dtype}).") - if self._cell_ndim!=dim_: + if self._cell_ndim != dim_: raise AppendToTargetOrInputException(f"Value(dim:{dim_}) are of different dimensions with " f"previous values(dim:{self._cell_ndim}).") self.content.append(val) else: self.content.append(val) - + + def pop(self, index): + """ + 删除该field中index处的元素 + :param int index: 从0开始的数据下标。 + :return: + """ + self.content.pop(index) + def __getitem__(self, indices): return self.get(indices, pad=False) - + def __setitem__(self, idx, val): assert isinstance(idx, int) if (self._is_target or self._is_input) and self.ignore_type is False: # 需要检测类型 type_, dim_ = _get_ele_type_and_dim(val) - if self.dtype!=type_: + if self.dtype != type_: raise RuntimeError(f"Value(type:{type_}) are of different types with " - f"other values(type:{self.dtype}).") - if self._cell_ndim!=dim_: + f"other values(type:{self.dtype}).") + if self._cell_ndim != dim_: raise RuntimeError(f"Value(dim:{dim_}) are of different dimensions with " - f"previous values(dim:{self._cell_ndim}).") + f"previous values(dim:{self._cell_ndim}).") self.content[idx] = val - + def get(self, indices, pad=True): """ 根据给定的indices返回内容 @@ -171,16 +201,16 @@ class FieldArray: return self.content[indices] if self.is_input is False and self.is_target is False: raise RuntimeError("Please specify either is_input or is_target to True for {}".format(self.name)) - + contents = [self.content[i] for i in indices] if self.padder is None or pad is False: return np.array(contents) else: return self.pad(contents) - + def pad(self, contents): return self.padder(contents, field_name=self.name, field_ele_dtype=self.dtype, dim=self._cell_ndim) - + def set_padder(self, padder): """ 设置padder,在这个field进行pad的时候用这个padder进行pad,如果为None则不进行pad。 @@ -192,7 +222,7 @@ class FieldArray: self.padder = deepcopy(padder) else: self.padder = None - + def set_pad_val(self, pad_val): """ 修改padder的pad_val. @@ -202,7 +232,7 @@ class FieldArray: if self.padder is not None: self.padder.set_pad_val(pad_val) return self - + def __len__(self): """ Returns the size of FieldArray. @@ -210,7 +240,7 @@ class FieldArray: :return int length: """ return len(self.content) - + def to(self, other): """ 将other的属性复制给本FieldArray(other必须为FieldArray类型). 
@@ -220,15 +250,15 @@ class FieldArray: :return: :class:`~fastNLP.FieldArray` """ assert isinstance(other, FieldArray), "Only supports fastNLP.FieldArray type, not {}.".format(type(other)) - + self.ignore_type = other.ignore_type self.is_input = other.is_input self.is_target = other.is_target self.padder = other.padder - + return self - - def split(self, sep:str=None, inplace:bool=True): + + def split(self, sep: str = None, inplace: bool = True): """ 依次对自身的元素使用.split()方法,应该只有当本field的元素为str时,该方法才有用。将返回值 @@ -241,11 +271,11 @@ class FieldArray: try: new_contents.append(cell.split(sep)) except Exception as e: - print(f"Exception happens when process value in index {index}.") + logger.error(f"Exception happens when process value in index {index}.") raise e return self._after_process(new_contents, inplace=inplace) - - def int(self, inplace:bool=True): + + def int(self, inplace: bool = True): """ 将本field中的值调用int(cell). 支持field中内容为以下两种情况(1)['1', '2', ...](即field中每个值为str的), (2) [['1', '2', ..], ['3', ..], ...](即field中每个值为一个list,list中的值会被依次转换。) @@ -261,10 +291,10 @@ class FieldArray: else: new_contents.append(int(cell)) except Exception as e: - print(f"Exception happens when process value in index {index}.") - print(e) + logger.error(f"Exception happens when process value in index {index}.") + raise e return self._after_process(new_contents, inplace=inplace) - + def float(self, inplace=True): """ 将本field中的值调用float(cell). 支持field中内容为以下两种情况(1)['1', '2', ...](即field中每个值为str的), @@ -281,10 +311,10 @@ class FieldArray: else: new_contents.append(float(cell)) except Exception as e: - print(f"Exception happens when process value in index {index}.") + logger.error(f"Exception happens when process value in index {index}.") raise e return self._after_process(new_contents, inplace=inplace) - + def bool(self, inplace=True): """ 将本field中的值调用bool(cell). 支持field中内容为以下两种情况(1)['1', '2', ...](即field中每个值为str的), @@ -301,11 +331,11 @@ class FieldArray: else: new_contents.append(bool(cell)) except Exception as e: - print(f"Exception happens when process value in index {index}.") + logger.error(f"Exception happens when process value in index {index}.") raise e - + return self._after_process(new_contents, inplace=inplace) - + def lower(self, inplace=True): """ 将本field中的值调用cell.lower(). 支持field中内容为以下两种情况(1)['1', '2', ...](即field中每个值为str的), @@ -322,10 +352,10 @@ class FieldArray: else: new_contents.append(cell.lower()) except Exception as e: - print(f"Exception happens when process value in index {index}.") + logger.error(f"Exception happens when process value in index {index}.") raise e return self._after_process(new_contents, inplace=inplace) - + def upper(self, inplace=True): """ 将本field中的值调用cell.lower(). 
支持field中内容为以下两种情况(1)['1', '2', ...](即field中每个值为str的), @@ -342,10 +372,10 @@ class FieldArray: else: new_contents.append(cell.upper()) except Exception as e: - print(f"Exception happens when process value in index {index}.") + logger.error(f"Exception happens when process value in index {index}.") raise e return self._after_process(new_contents, inplace=inplace) - + def value_count(self): """ 返回该field下不同value的数量。多用于统计label数量 @@ -353,17 +383,18 @@ class FieldArray: :return: Counter, key是label,value是出现次数 """ count = Counter() - + def cum(cell): if _is_iterable(cell) and not isinstance(cell, str): for cell_ in cell: cum(cell_) else: count[cell] += 1 + for cell in self.content: cum(cell) return count - + def _after_process(self, new_contents, inplace): """ 当调用处理函数之后,决定是否要替换field。 @@ -378,14 +409,14 @@ class FieldArray: self.is_input = self.is_input self.is_target = self.is_input except SetInputOrTargetException as e: - print("The newly generated field cannot be set as input or target.") + logger.error("The newly generated field cannot be set as input or target.") raise e return self else: return new_contents -def _get_ele_type_and_dim(cell:Any, dim=0): +def _get_ele_type_and_dim(cell: Any, dim=0): """ 识别cell的类别与dimension的数量 @@ -401,13 +432,13 @@ def _get_ele_type_and_dim(cell:Any, dim=0): elif isinstance(cell, list): dim += 1 res = [_get_ele_type_and_dim(cell_i, dim) for cell_i in cell] - types = set([i for i,j in res]) - dims = set([j for i,j in res]) - if len(types)>1: + types = set([i for i, j in res]) + dims = set([j for i, j in res]) + if len(types) > 1: raise SetInputOrTargetException("Mixed types detected: {}.".format(list(types))) - elif len(types)==0: + elif len(types) == 0: raise SetInputOrTargetException("Empty value encountered.") - if len(dims)>1: + if len(dims) > 1: raise SetInputOrTargetException("Mixed dimension detected: {}.".format(list(dims))) return types.pop(), dims.pop() elif isinstance(cell, torch.Tensor): @@ -418,55 +449,47 @@ def _get_ele_type_and_dim(cell:Any, dim=0): # 否则需要继续往下iterate dim += 1 res = [_get_ele_type_and_dim(cell_i, dim) for cell_i in cell] - types = set([i for i,j in res]) - dims = set([j for i,j in res]) - if len(types)>1: + types = set([i for i, j in res]) + dims = set([j for i, j in res]) + if len(types) > 1: raise SetInputOrTargetException("Mixed types detected: {}.".format(list(types))) - elif len(types)==0: + elif len(types) == 0: raise SetInputOrTargetException("Empty value encountered.") - if len(dims)>1: + if len(dims) > 1: raise SetInputOrTargetException("Mixed dimension detected: {}.".format(list(dims))) return types.pop(), dims.pop() - else: # 包含tuple, set, dict以及其它的类型 + else: # 包含tuple, set, dict以及其它的类型 raise SetInputOrTargetException(f"Cannot process type:{type(cell)}.") -def _is_iterable(value): - # 检查是否是iterable的, duck typing - try: - iter(value) - return True - except BaseException as e: - return False - - class Padder: """ - 别名::class:`fastNLP.Padder` :class:`fastNLP.core.field.Padder` - 所有padder都需要继承这个类,并覆盖__call__方法。 用于对batch进行padding操作。传入的element是inplace的,即直接修改element可能导致数据变化,建议inplace修改之前deepcopy一份。 .. 
py:function:: __call__(self, contents, field_name, field_ele_dtype): + + """ + + def __init__(self, pad_val=0, **kwargs): + """ - 传入的是List内容。假设有以下的DataSet。 - :param List[Any] contents: 传入的element是inplace的,即直接修改element可能导致数据变化,建议inplace修改之前 deepcopy一份。 :param str, field_name: field的名称。 :param np.int64,np.float64,np.str,None, field_ele_dtype: 该field的内层元素的类型。如果该field的ignore_type为True,该这个值为None。 :return: np.array([padded_element]) - - """ - - def __init__(self, pad_val=0, **kwargs): + """ self.pad_val = pad_val - + def set_pad_val(self, pad_val): self.pad_val = pad_val + def get_pad_val(self): + return self.pad_val + @abstractmethod - def __call__(self, contents, field_name, field_ele_dtype, dim:int): + def __call__(self, contents, field_name, field_ele_dtype, dim: int): """ 传入的是List内容。假设有以下的DataSet。 @@ -512,8 +535,6 @@ class Padder: class AutoPadder(Padder): """ - 别名::class:`fastNLP.AutoPadder` :class:`fastNLP.core.field.AutoPadder` - 根据contents的数据自动判定是否需要做padding。 1 如果元素类型(元素类型是指field中最里层元素的数据类型, 可以通过FieldArray.dtype查看,比如['This', 'is', ...]的元素类 @@ -533,23 +554,24 @@ class AutoPadder(Padder): 3 其它情况不进行处理,返回一个np.array类型。 """ + def __init__(self, pad_val=0): super().__init__(pad_val=pad_val) - + def __call__(self, contents, field_name, field_ele_dtype, dim): if field_ele_dtype: - if dim>3: + if dim > 3: return np.array(contents) if isinstance(field_ele_dtype, type) and \ (issubclass(field_ele_dtype, np.number) or issubclass(field_ele_dtype, Number)): - if dim==0: + if dim == 0: array = np.array(contents, dtype=field_ele_dtype) - elif dim==1: + elif dim == 1: max_len = max(map(len, contents)) array = np.full((len(contents), max_len), self.pad_val, dtype=field_ele_dtype) for i, content_i in enumerate(contents): array[i, :len(content_i)] = content_i - elif dim==2: + elif dim == 2: max_len = max(map(len, contents)) max_word_len = max([max([len(content_ii) for content_ii in content_i]) for content_i in contents]) @@ -559,20 +581,21 @@ class AutoPadder(Padder): array[i, j, :len(content_ii)] = content_ii else: shape = np.shape(contents) - if len(shape)==4: # 说明各dimension是相同的大小 + if len(shape) == 4: # 说明各dimension是相同的大小 array = np.array(contents, dtype=field_ele_dtype) else: - raise RuntimeError(f"Field:{field_name} has 3 dimensions, every sample should have the same shape.") + raise RuntimeError( + f"Field:{field_name} has 3 dimensions, every sample should have the same shape.") return array elif str(field_ele_dtype).startswith('torch'): - if dim==0: + if dim == 0: tensor = torch.tensor(contents).to(field_ele_dtype) - elif dim==1: + elif dim == 1: max_len = max(map(len, contents)) tensor = torch.full((len(contents), max_len), fill_value=self.pad_val, dtype=field_ele_dtype) for i, content_i in enumerate(contents): - tensor[i, :len(content_i)] = torch.tensor(content_i) - elif dim==2: + tensor[i, :len(content_i)] = content_i.clone().detach() + elif dim == 2: max_len = max(map(len, contents)) max_word_len = max([max([len(content_ii) for content_ii in content_i]) for content_i in contents]) @@ -580,18 +603,21 @@ class AutoPadder(Padder): dtype=field_ele_dtype) for i, content_i in enumerate(contents): for j, content_ii in enumerate(content_i): - tensor[i, j, :len(content_ii)] = torch.tensor(content_ii) + tensor[i, j, :len(content_ii)] = content_ii.clone().detach() else: shapes = set([np.shape(content_i) for content_i in contents]) - if len(shapes)>1: - raise RuntimeError(f"Field:{field_name} has 3 dimensions, every sample should have the same shape.") + if len(shapes) > 1: + raise RuntimeError( + 
f"Field:{field_name} has 3 dimensions, every sample should have the same shape.") shape = shapes.pop() - if len(shape)==3: - tensor = torch.full([len(contents)]+list(shape), fill_value=self.pad_val, dtype=field_ele_dtype) + if len(shape) == 3: + tensor = torch.full([len(contents)] + list(shape), fill_value=self.pad_val, + dtype=field_ele_dtype) for i, content_i in enumerate(contents): - tensor[i] = torch.tensor(content_i, dtype=field_ele_dtype) + tensor[i] = content_i.clone().detach().to(field_ele_dtype) else: - raise RuntimeError(f"Field:{field_name} has 3 dimensions, every sample should have the same shape.") + raise RuntimeError( + f"Field:{field_name} has 3 dimensions, every sample should have the same shape.") return tensor else: return np.array(contents) # 不进行任何操作 @@ -601,8 +627,6 @@ class AutoPadder(Padder): class EngChar2DPadder(Padder): """ - 别名::class:`fastNLP.EngChar2DPadder` :class:`fastNLP.core.field.EngChar2DPadder` - 用于为英语执行character级别的2D padding操作。对应的field内容应该类似[['T', 'h', 'i', 's'], ['a'], ['d', 'e', 'm', 'o']], 但这个Padder只能处理index为int的情况。 @@ -622,7 +646,7 @@ class EngChar2DPadder(Padder): dataset.set_padder('chars', padder) # chars这个field的设置为了EnChar2DPadder """ - + def __init__(self, pad_val=0, pad_length=0): """ :param pad_val: int, pad的位置使用该index @@ -630,9 +654,9 @@ class EngChar2DPadder(Padder): 都pad或截取到该长度. """ super().__init__(pad_val=pad_val) - + self.pad_length = pad_length - + def __call__(self, contents, field_name, field_ele_dtype, dim): """ 期望输入类似于 @@ -651,7 +675,7 @@ class EngChar2DPadder(Padder): raise TypeError('dtype of Field:{} should be np.int64 or np.float64 to do 2D padding, get {}.'.format( field_name, field_ele_dtype )) - assert dim==2, f"Field:{field_name} has {dim}, EngChar2DPadder only supports input with 2 dimensions." + assert dim == 2, f"Field:{field_name} has {dim}, EngChar2DPadder only supports input with 2 dimensions." 
if self.pad_length < 1: max_char_length = max([max(len(char_lst) for char_lst in word_lst) for word_lst in contents]) else: @@ -659,12 +683,12 @@ class EngChar2DPadder(Padder): max_sent_length = max(len(word_lst) for word_lst in contents) batch_size = len(contents) dtype = type(contents[0][0][0]) - + padded_array = np.full((batch_size, max_sent_length, max_char_length), fill_value=self.pad_val, dtype=dtype) for b_idx, word_lst in enumerate(contents): for c_idx, char_lst in enumerate(word_lst): chars = char_lst[:max_char_length] padded_array[b_idx, c_idx, :len(chars)] = chars - + return padded_array diff --git a/fastNLP/core/instance.py b/fastNLP/core/instance.py index 5408522e..311c582a 100644 --- a/fastNLP/core/instance.py +++ b/fastNLP/core/instance.py @@ -1,17 +1,18 @@ """ instance 模块实现了Instance 类在fastNLP中对应sample。一个sample可以认为是一个Instance类型的对象。 -便于理解的例子可以参考文档 :doc:`fastNLP.core.dataset` 中的表格 +便于理解的例子可以参考文档 :mod:`fastNLP.core.dataset` 中的表格 """ + __all__ = [ "Instance" ] +from .utils import pretty_table_printer + class Instance(object): """ - 别名::class:`fastNLP.Instance` :class:`fastNLP.core.instance.Instance` - Instance是fastNLP中对应一个sample的类。每个sample在fastNLP中是一个Instance对象。 Instance一般与 :class:`~fastNLP.DataSet` 一起使用, Instance的初始化如下面的Example所示:: @@ -22,11 +23,11 @@ class Instance(object): >>>ins.add_field("field_3", [3, 3, 3]) >>>ins = Instance(**{'x1': 1, 'x2':np.zeros((3, 4))}) """ - + def __init__(self, **fields): - + self.fields = fields - + def add_field(self, field_name, field): """ 向Instance中增加一个field @@ -35,18 +36,26 @@ class Instance(object): :param Any field: 新增field的内容 """ self.fields[field_name] = field - + + def items(self): + """ + 返回一个迭代器,迭代器返回两个内容,第一个内容是field_name, 第二个内容是field_value + + :return: 一个迭代器 + """ + return self.fields.items() + + def __contains__(self, item): + return item in self.fields + def __getitem__(self, name): if name in self.fields: return self.fields[name] else: raise KeyError("{} not found".format(name)) - + def __setitem__(self, name, field): return self.add_field(name, field) - + def __repr__(self): - s = '\'' - return "{" + ",\n".join( - "\'" + field_name + "\': " + str(self.fields[field_name]) + \ - f" type={(str(type(self.fields[field_name]))).split(s)[1]}" for field_name in self.fields) + "}" + return str(pretty_table_printer(self)) diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py index 1f8923eb..2166734d 100644 --- a/fastNLP/core/losses.py +++ b/fastNLP/core/losses.py @@ -11,7 +11,10 @@ __all__ = [ "CrossEntropyLoss", "BCELoss", "L1Loss", - "NLLLoss" + "NLLLoss", + + "CMRC2018Loss" + ] import inspect @@ -20,7 +23,6 @@ from collections import defaultdict import torch import torch.nn.functional as F -from ..core.const import Const from .utils import _CheckError from .utils import _CheckRes from .utils import _build_args @@ -28,6 +30,7 @@ from .utils import _check_arg_dict_list from .utils import _check_function_or_method from .utils import _get_func_signature from .utils import seq_len_to_mask +from ..core.const import Const class LossBase(object): @@ -166,8 +169,6 @@ class LossBase(object): class LossFunc(LossBase): """ - 别名::class:`fastNLP.LossFunc` :class:`fastNLP.core.losses.LossFunc` - 提供给用户使用自定义损失函数的类 :param func: 用户自行定义的损失函数,应当为一个函数或者callable(func)为True的ojbect @@ -199,13 +200,15 @@ class LossFunc(LossBase): class CrossEntropyLoss(LossBase): """ - 别名::class:`fastNLP.CrossEntropyLoss` :class:`fastNLP.core.losses.CrossEntropyLoss` - 交叉熵损失函数 :param pred: 参数映射表中 `pred` 的映射关系,None表示映射关系为 `pred` -> `pred` :param target: 参数映射表中 `target` 
的映射关系,None表示映射关系为 `target` -> `target` - :param seq_len: 句子的长度, 长度之外的token不会计算loss。。 + :param seq_len: 句子的长度, 长度之外的token不会计算loss。 + :param int class_in_dim: 在序列标注的场景中,pred可能的shape为(batch_size, max_len, num_classes) + 或(batch_size, num_classes, max_len), CrossEntropyLoss需要知道哪一维是class的维度以计算loss。如果为-1,就根据pred的第 + 二维是否等于target的第二维来判断是否需要交换pred的第二维和第三维,因为target的第二维是length的维度,如果这一维度上和pred相等, + 那么pred可能第二维也是长度维(存在误判的可能,如果有误判的情况,请显式设置该值)。其它大于0的值则认为该维度是class的维度。 :param padding_idx: padding的index,在计算loss时将忽略target中标号为padding_idx的内容, 可以通过该值代替 传入seq_len. :param str reduction: 支持 `mean` ,`sum` 和 `none` . @@ -216,21 +219,25 @@ class CrossEntropyLoss(LossBase): """ - def __init__(self, pred=None, target=None, seq_len=None, padding_idx=-100, reduction='mean'): + def __init__(self, pred=None, target=None, seq_len=None, class_in_dim=-1, padding_idx=-100, reduction='mean'): super(CrossEntropyLoss, self).__init__() self._init_param_map(pred=pred, target=target, seq_len=seq_len) self.padding_idx = padding_idx assert reduction in ('mean', 'sum', 'none') self.reduction = reduction + self.class_in_dim = class_in_dim def get_loss(self, pred, target, seq_len=None): if pred.dim() > 2: - if pred.size(1) != target.size(1): - pred = pred.transpose(1, 2) + if self.class_in_dim == -1: + if pred.size(1) != target.size(1): # 有可能顺序替换了 + pred = pred.transpose(1, 2) + else: + pred = pred.transpose(-1, self.class_in_dim) pred = pred.reshape(-1, pred.size(-1)) target = target.reshape(-1) - if seq_len is not None: - mask = seq_len_to_mask(seq_len).reshape(-1).eq(0) + if seq_len is not None and target.dim() > 1: + mask = seq_len_to_mask(seq_len, max_len=target.size(1)).reshape(-1).eq(0) target = target.masked_fill(mask, self.padding_idx) return F.cross_entropy(input=pred, target=target, @@ -239,8 +246,6 @@ class CrossEntropyLoss(LossBase): class L1Loss(LossBase): """ - 别名::class:`fastNLP.L1Loss` :class:`fastNLP.core.losses.L1Loss` - L1损失函数 :param pred: 参数映射表中 `pred` 的映射关系,None表示映射关系为 `pred` -> `pred` @@ -261,8 +266,6 @@ class L1Loss(LossBase): class BCELoss(LossBase): """ - 别名::class:`fastNLP.BCELoss` :class:`fastNLP.core.losses.BCELoss` - 二分类交叉熵损失函数 :param pred: 参数映射表中 `pred` 的映射关系,None表示映射关系为 `pred` -> `pred` @@ -282,18 +285,18 @@ class BCELoss(LossBase): class NLLLoss(LossBase): """ - 别名::class:`fastNLP.NLLLoss` :class:`fastNLP.core.losses.NLLLoss` - 负对数似然损失函数 - - :param pred: 参数映射表中 `pred` 的映射关系,None表示映射关系为 `pred` -> `pred` - :param target: 参数映射表中 `target` 的映射关系,None表示映射关系为 `target` -> `target` - :param ignore_idx: ignore的index,在计算loss时将忽略target中标号为ignore_idx的内容, 可以通过该值代替 - 传入seq_len. - :param str reduction: 支持 `mean` ,`sum` 和 `none` . """ def __init__(self, pred=None, target=None, ignore_idx=-100, reduction='mean'): + """ + + :param pred: 参数映射表中 `pred` 的映射关系,None表示映射关系为 `pred` -> `pred` + :param target: 参数映射表中 `target` 的映射关系,None表示映射关系为 `target` -> `target` + :param ignore_idx: ignore的index,在计算loss时将忽略target中标号为ignore_idx的内容, 可以通过该值代替 + 传入seq_len. + :param str reduction: 支持 `mean` ,`sum` 和 `none` . 
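# Illustrative sketch (not part of the patch): how the new `class_in_dim` handling in
# `CrossEntropyLoss.get_loss` above is intended to behave for a 3-d `pred`, using
# hypothetical shapes. F.cross_entropy wants class scores in the last dimension once the
# batch and length dimensions are flattened, so the class dimension is moved there first.
import torch
import torch.nn.functional as F

batch_size, max_len, n_classes = 4, 7, 5
target = torch.randint(0, n_classes, (batch_size, max_len))

# Layout 1: pred is (batch_size, max_len, n_classes). The class dimension is already last
# and pred.size(1) == target.size(1), so the class_in_dim=-1 heuristic keeps pred as is.
pred = torch.randn(batch_size, max_len, n_classes)
loss = F.cross_entropy(pred.reshape(-1, n_classes), target.reshape(-1))

# Layout 2: pred is (batch_size, n_classes, max_len). Setting class_in_dim=1 (or relying on
# the heuristic, since pred.size(1) != target.size(1)) moves the class dimension to the end
# before flattening, which is what transpose(-1, class_in_dim) in get_loss does.
pred = torch.randn(batch_size, n_classes, max_len)
pred = pred.transpose(-1, 1)
loss = F.cross_entropy(pred.reshape(-1, n_classes), target.reshape(-1))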
+ """ super(NLLLoss, self).__init__() self._init_param_map(pred=pred, target=target) assert reduction in ('mean', 'sum', 'none') @@ -306,14 +309,14 @@ class NLLLoss(LossBase): class LossInForward(LossBase): """ - 别名::class:`fastNLP.LossInForward` :class:`fastNLP.core.losses.LossInForward` - 从forward()函数返回结果中获取loss - - :param str loss_key: 在forward函数中loss的键名,默认为loss """ def __init__(self, loss_key=Const.LOSS): + """ + + :param str loss_key: 在forward函数中loss的键名,默认为loss + """ super().__init__() if not isinstance(loss_key, str): raise TypeError(f"Only str allowed for loss_key, got {type(loss_key)}.") @@ -344,90 +347,52 @@ class LossInForward(LossBase): return loss -def _prepare_losser(losser): - if losser is None: - losser = LossInForward() - return losser - elif isinstance(losser, LossBase): - return losser - else: - raise TypeError(f"Type of loss should be `fastNLP.LossBase`, got {type(losser)}") - - -def squash(predict, truth, **kwargs): - """To reshape tensors in order to fit loss functions in PyTorch. - - :param predict: Tensor, model output - :param truth: Tensor, truth from dataset - :param kwargs: extra arguments - :return predict , truth: predict & truth after processing +class CMRC2018Loss(LossBase): """ - return predict.view(-1, predict.size()[-1]), truth.view(-1, ) - - -def unpad(predict, truth, **kwargs): - """To process padded sequence output to get true loss. + 用于计算CMRC2018中文问答任务。 - :param predict: Tensor, [batch_size , max_len , tag_size] - :param truth: Tensor, [batch_size , max_len] - :param kwargs: kwargs["lens"] is a list or LongTensor, with size [batch_size]. The i-th element is true lengths of i-th sequence. - - :return predict , truth: predict & truth after processing """ - if kwargs.get("lens") is None: - return predict, truth - lens = torch.LongTensor(kwargs["lens"]) - lens, idx = torch.sort(lens, descending=True) - predict = torch.nn.utils.rnn.pack_padded_sequence(predict[idx], lens, batch_first=True).data - truth = torch.nn.utils.rnn.pack_padded_sequence(truth[idx], lens, batch_first=True).data - return predict, truth - + def __init__(self, target_start=None, target_end=None, context_len=None, pred_start=None, pred_end=None, + reduction='mean'): + super().__init__() -def unpad_mask(predict, truth, **kwargs): - """To process padded sequence output to get true loss. + assert reduction in ('mean', 'sum') - :param predict: Tensor, [batch_size , max_len , tag_size] - :param truth: Tensor, [batch_size , max_len] - :param kwargs: kwargs["lens"] is a list or LongTensor, with size [batch_size]. The i-th element is true lengths of i-th sequence. + self._init_param_map(target_start=target_start, target_end=target_end, context_len=context_len, + pred_start=pred_start, pred_end=pred_end) + self.reduction = reduction - :return predict , truth: predict & truth after processing - """ - if kwargs.get("lens") is None: - return predict, truth - mas = make_mask(kwargs["lens"], truth.size()[1]) - return mask(predict, truth, mask=mas) + def get_loss(self, target_start, target_end, context_len, pred_start, pred_end): + """ + :param target_start: batch_size + :param target_end: batch_size + :param context_len: batch_size + :param pred_start: batch_size x max_len + :param pred_end: batch_size x max_len + :return: + """ + batch_size, max_len = pred_end.size() + mask = seq_len_to_mask(context_len, max_len).eq(0) -def mask(predict, truth, **kwargs): - """To select specific elements from Tensor. This method calls ``squash()``. 
+ pred_start = pred_start.masked_fill(mask, float('-inf')) + pred_end = pred_end.masked_fill(mask, float('-inf')) - :param predict: Tensor, [batch_size , max_len , tag_size] - :param truth: Tensor, [batch_size , max_len] - :param kwargs: extra arguments, kwargs["mask"]: ByteTensor, [batch_size , max_len], the mask Tensor. The position that is 1 will be selected. + start_loss = F.cross_entropy(pred_start, target_start, reduction='sum') + end_loss = F.cross_entropy(pred_end, target_end, reduction='sum') - :return predict , truth: predict & truth after processing - """ - if kwargs.get("mask") is None: - return predict, truth - mask = kwargs["mask"] - - predict, truth = squash(predict, truth) - mask = mask.view(-1, ) - - predict = torch.masked_select(predict.permute(1, 0), mask).view(predict.size()[-1], -1).permute(1, 0) - truth = torch.masked_select(truth, mask) - - return predict, truth + loss = start_loss + end_loss + if self.reduction == 'mean': + loss = loss / batch_size -def make_mask(lens, tar_len): - """To generate a mask over a sequence. + return loss/2 - :param lens: list or LongTensor, [batch_size] - :param tar_len: int - :return mask: ByteTensor - """ - lens = torch.LongTensor(lens) - mask = [torch.ge(lens, i + 1) for i in range(tar_len)] - mask = torch.stack(mask, 1) - return mask +def _prepare_losser(losser): + if losser is None: + losser = LossInForward() + return losser + elif isinstance(losser, LossBase): + return losser + else: + raise TypeError(f"Type of loss should be `fastNLP.LossBase`, got {type(losser)}") diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index f23eab91..6ef1aea5 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -6,11 +6,15 @@ __all__ = [ "MetricBase", "AccuracyMetric", "SpanFPreRecMetric", - "ExtractiveQAMetric" + "CMRC2018Metric" ] import inspect +import warnings +from abc import abstractmethod from collections import defaultdict +from typing import Union +import re import numpy as np import torch @@ -22,7 +26,6 @@ from .utils import _check_arg_dict_list from .utils import _get_func_signature from .utils import seq_len_to_mask from .vocabulary import Vocabulary -from abc import abstractmethod class MetricBase(object): @@ -114,10 +117,11 @@ class MetricBase(object): self.get_metric将统计当前的评价指标并返回评价结果, 返回值需要是一个dict, key是指标名称,value是指标的值 """ - + def __init__(self): self._param_map = {} # key is param in function, value is input param. self._checked = False + self._metric_name = self.__class__.__name__ @property def param_map(self): @@ -135,7 +139,25 @@ class MetricBase(object): @abstractmethod def get_metric(self, reset=True): raise NotImplemented - + + def set_metric_name(self, name: str): + """ + 设置metric的名称,默认是Metric的class name. + + :param str name: + :return: self + """ + self._metric_name = name + return self + + def get_metric_name(self): + """ + 返回metric的名称 + + :return: + """ + return self._metric_name + def _init_param_map(self, key_map=None, **kwargs): """检查key_map和其他参数map,并将这些映射关系添加到self._param_map @@ -168,7 +190,7 @@ class MetricBase(object): for value, key_set in value_counter.items(): if len(key_set) > 1: raise ValueError(f"Several parameters:{key_set} are provided with one output {value}.") - + # check consistence between signature and _param_map func_spect = inspect.getfullargspec(self.evaluate) func_args = [arg for arg in func_spect.args if arg != 'self'] @@ -177,7 +199,7 @@ class MetricBase(object): raise NameError( f"Parameter `{func_param}` is not in {_get_func_signature(self.evaluate)}. 
Please check the " f"initialization parameters, or change its signature.") - + def _fast_param_map(self, pred_dict, target_dict): """Only used as inner function. When the pred_dict, target is unequivocal. Don't need users to pass key_map. such as pred_dict has one element, target_dict has one element @@ -192,7 +214,7 @@ class MetricBase(object): fast_param['target'] = list(target_dict.values())[0] return fast_param return fast_param - + def __call__(self, pred_dict, target_dict): """ 这个方法会调用self.evaluate 方法. @@ -207,12 +229,12 @@ class MetricBase(object): :param target_dict: DataSet.batch_y里的键-值对所组成的dict(即is_target=True的fields的内容) :return: """ - + fast_param = self._fast_param_map(pred_dict, target_dict) if fast_param: self.evaluate(**fast_param) return - + if not self._checked: if not callable(self.evaluate): raise TypeError(f"{self.__class__.__name__}.evaluate has to be callable, not {type(self.evaluate)}.") @@ -222,14 +244,14 @@ class MetricBase(object): for func_arg, input_arg in self._param_map.items(): if func_arg not in func_args: raise NameError(f"`{func_arg}` not in {_get_func_signature(self.evaluate)}.") - + # 2. only part of the _param_map are passed, left are not for arg in func_args: if arg not in self._param_map: self._param_map[arg] = arg # This param does not need mapping. self._evaluate_args = func_args self._reverse_param_map = {input_arg: func_arg for func_arg, input_arg in self._param_map.items()} - + # need to wrap inputs in dict. mapped_pred_dict = {} mapped_target_dict = {} @@ -238,7 +260,7 @@ class MetricBase(object): mapped_pred_dict[mapped_arg] = pred_dict[input_arg] if input_arg in target_dict: mapped_target_dict[mapped_arg] = target_dict[input_arg] - + # missing if not self._checked: duplicated = [] @@ -253,47 +275,46 @@ class MetricBase(object): for idx, func_arg in enumerate(missing): # Don't delete `` in this information, nor add `` replaced_missing[idx] = f"{self._param_map[func_arg]}" + f"(assign to `{func_arg}` " \ - f"in `{self.__class__.__name__}`)" - + f"in `{self.__class__.__name__}`)" + check_res = _CheckRes(missing=replaced_missing, unused=check_res.unused, duplicated=duplicated, required=check_res.required, all_needed=check_res.all_needed, varargs=check_res.varargs) - + if check_res.missing or check_res.duplicated: raise _CheckError(check_res=check_res, func_signature=_get_func_signature(self.evaluate)) self._checked = True refined_args = _build_args(self.evaluate, **mapped_pred_dict, **mapped_target_dict) - + self.evaluate(**refined_args) - + return class AccuracyMetric(MetricBase): """ - - 别名::class:`fastNLP.AccuracyMetric` :class:`fastNLP.core.metrics.AccuracyMetric` - - 准确率Metric(其它的Metric参见 :doc:`fastNLP.core.metrics` ) - - :param pred: 参数映射表中 `pred` 的映射关系,None表示映射关系为 `pred` -> `pred` - :param target: 参数映射表中 `target` 的映射关系,None表示映射关系为 `target` -> `target` - :param seq_len: 参数映射表中 `seq_len` 的映射关系,None表示映射关系为 `seq_len` -> `seq_len` + 准确率Metric(其它的Metric参见 :mod:`fastNLP.core.metrics` ) """ - + def __init__(self, pred=None, target=None, seq_len=None): + """ + :param pred: 参数映射表中 `pred` 的映射关系,None表示映射关系为 `pred` -> `pred` + :param target: 参数映射表中 `target` 的映射关系,None表示映射关系为 `target` -> `target` + :param seq_len: 参数映射表中 `seq_len` 的映射关系,None表示映射关系为 `seq_len` -> `seq_len` + """ + super().__init__() - + self._init_param_map(pred=pred, target=target, seq_len=seq_len) - + self.total = 0 self.acc_count = 0 - + def evaluate(self, pred, target, seq_len=None): """ evaluate函数将针对一个批次的预测结果做评价指标的累计 @@ -313,25 +334,28 @@ class AccuracyMetric(MetricBase): if not 
isinstance(target, torch.Tensor): raise TypeError(f"`target` in {_get_func_signature(self.evaluate)} must be torch.Tensor," f"got {type(target)}.") - + if seq_len is not None and not isinstance(seq_len, torch.Tensor): raise TypeError(f"`seq_lens` in {_get_func_signature(self.evaluate)} must be torch.Tensor," f"got {type(seq_len)}.") - - if seq_len is not None: - masks = seq_len_to_mask(seq_len=seq_len) + + if seq_len is not None and target.dim() > 1: + max_len = target.size(1) + masks = seq_len_to_mask(seq_len=seq_len, max_len=max_len) else: masks = None - - if pred.size() == target.size(): + + if pred.dim() == target.dim(): pass - elif len(pred.size()) == len(target.size()) + 1: + elif pred.dim() == target.dim() + 1: pred = pred.argmax(dim=-1) + if seq_len is None and target.dim() > 1: + warnings.warn("You are not passing `seq_len` to exclude pad when calculate accuracy.") else: raise RuntimeError(f"In {_get_func_signature(self.evaluate)}, when pred have " f"size:{pred.size()}, target should have size: {pred.size()} or " f"{pred.size()[:-1]}, got {target.size()}.") - + target = target.to(pred) if masks is not None: self.acc_count += torch.sum(torch.eq(pred, target).masked_fill(masks.eq(0), 0)).item() @@ -339,7 +363,7 @@ class AccuracyMetric(MetricBase): else: self.acc_count += torch.sum(torch.eq(pred, target)).item() self.total += np.prod(list(pred.size())) - + def get_metric(self, reset=True): """ get_metric函数将根据evaluate函数累计的评价指标统计量来计算最终的评价结果. @@ -358,13 +382,14 @@ def _bmes_tag_to_spans(tags, ignore_labels=None): """ 给定一个tags的lis,比如['S-song', 'B-singer', 'M-singer', 'E-singer', 'S-moive', 'S-actor']。 返回[('song', (0, 1)), ('singer', (1, 4)), ('moive', (4, 5)), ('actor', (5, 6))] (左闭右开区间) + 也可以是单纯的['S', 'B', 'M', 'E', 'B', 'M', 'M',...]序列 :param tags: List[str], :param ignore_labels: List[str], 在该list中的label将被忽略 :return: List[Tuple[str, List[int, int]]]. [(label,[start, end])] """ ignore_labels = set(ignore_labels) if ignore_labels else set() - + spans = [] prev_bmes_tag = None for idx, tag in enumerate(tags): @@ -393,7 +418,7 @@ def _bmeso_tag_to_spans(tags, ignore_labels=None): :return: List[Tuple[str, List[int, int]]]. [(label,[start, end])] """ ignore_labels = set(ignore_labels) if ignore_labels else set() - + spans = [] prev_bmes_tag = None for idx, tag in enumerate(tags): @@ -455,7 +480,7 @@ def _bio_tag_to_spans(tags, ignore_labels=None): :return: List[Tuple[str, List[int, int]]]. 
[(label,[start, end])] """ ignore_labels = set(ignore_labels) if ignore_labels else set() - + spans = [] prev_bio_tag = None for idx, tag in enumerate(tags): @@ -473,10 +498,75 @@ def _bio_tag_to_spans(tags, ignore_labels=None): return [(span[0], (span[1][0], span[1][1] + 1)) for span in spans if span[0] not in ignore_labels] +def _get_encoding_type_from_tag_vocab(tag_vocab: Union[Vocabulary, dict]) -> str: + """ + 给定Vocabulary自动判断是哪种类型的encoding, 支持判断bmes, bioes, bmeso, bio + + :param tag_vocab: 支持传入tag Vocabulary; 或者传入形如{0:"O", 1:"B-tag1"},即index在前,tag在后的dict。 + :return: + """ + tag_set = set() + unk_token = '' + pad_token = '' + if isinstance(tag_vocab, Vocabulary): + unk_token = tag_vocab.unknown + pad_token = tag_vocab.padding + tag_vocab = tag_vocab.idx2word + for idx, tag in tag_vocab.items(): + if tag in (unk_token, pad_token): + continue + tag = tag[:1].lower() + tag_set.add(tag) + + bmes_tag_set = set('bmes') + if tag_set == bmes_tag_set: + return 'bmes' + bio_tag_set = set('bio') + if tag_set == bio_tag_set: + return 'bio' + bmeso_tag_set = set('bmeso') + if tag_set == bmeso_tag_set: + return 'bmeso' + bioes_tag_set = set('bioes') + if tag_set == bioes_tag_set: + return 'bioes' + raise RuntimeError("encoding_type cannot be inferred automatically. Only support " + "'bio', 'bmes', 'bmeso', 'bioes' type.") + + +def _check_tag_vocab_and_encoding_type(tag_vocab: Union[Vocabulary, dict], encoding_type: str): + """ + 检查vocab中的tag是否与encoding_type是匹配的 + + :param tag_vocab: 支持传入tag Vocabulary; 或者传入形如{0:"O", 1:"B-tag1"},即index在前,tag在后的dict。 + :param encoding_type: bio, bmes, bioes, bmeso + :return: + """ + tag_set = set() + unk_token = '' + pad_token = '' + if isinstance(tag_vocab, Vocabulary): + unk_token = tag_vocab.unknown + pad_token = tag_vocab.padding + tag_vocab = tag_vocab.idx2word + for idx, tag in tag_vocab.items(): + if tag in (unk_token, pad_token): + continue + tag = tag[:1].lower() + tag_set.add(tag) + + tags = encoding_type + for tag in tag_set: + assert tag in tags, f"{tag} is not a valid tag in encoding type:{encoding_type}. Please check your " \ + f"encoding_type." + tags = tags.replace(tag, '') # 删除该值 + if tags: # 如果不为空,说明出现了未使用的tag + warnings.warn(f"Tag:{tags} in encoding type:{encoding_type} is not presented in your Vocabulary. Check your " + "encoding_type.") + + class SpanFPreRecMetric(MetricBase): r""" - 别名::class:`fastNLP.SpanFPreRecMetric` :class:`fastNLP.core.metrics.SpanFPreRecMetric` - 在序列标注问题中,以span的方式计算F, pre, rec. 比如中文Part of speech中,会以character的方式进行标注,句子 `中国在亚洲` 对应的POS可能为(以BMES为例) ['B-NN', 'E-NN', 'S-DET', 'B-NN', 'E-NN']。该metric就是为类似情况下的F1计算。 @@ -499,34 +589,36 @@ class SpanFPreRecMetric(MetricBase): 'rec-label':xxx, ... } - - :param tag_vocab: 标签的 :class:`~fastNLP.Vocabulary` 。支持的标签为"B"(没有label);或"B-xxx"(xxx为某种label,比如POS中的NN), - 在解码时,会将相同xxx的认为是同一个label,比如['B-NN', 'E-NN']会被合并为一个'NN'. - :param str pred: 用该key在evaluate()时从传入dict中取出prediction数据。 为None,则使用 `pred` 取数据 - :param str target: 用该key在evaluate()时从传入dict中取出target数据。 为None,则使用 `target` 取数据 - :param str seq_len: 用该key在evaluate()时从传入dict中取出sequence length数据。为None,则使用 `seq_len` 取数据。 - :param str encoding_type: 目前支持bio, bmes, bmeso, bioes - :param list ignore_labels: str 组成的list. 这个list中的class不会被用于计算。例如在POS tagging时传入['NN'],则不会计算'NN'这 - 个label - :param bool only_gross: 是否只计算总的f1, precision, recall的值;如果为False,不仅返回总的f1, pre, rec, 还会返回每个 - label的f1, pre, rec - :param str f_type: `micro` 或 `macro` . 
`micro` :通过先计算总体的TP,FN和FP的数量,再计算f, precision, recall; `macro` : - 分布计算每个类别的f, precision, recall,然后做平均(各类别f的权重相同) - :param float beta: f_beta分数, :math:`f_{beta} = \frac{(1 + {beta}^{2})*(pre*rec)}{({beta}^{2}*pre + rec)}` . - 常用为beta=0.5, 1, 2. 若为0.5则精确率的权重高于召回率;若为1,则两者平等;若为2,则召回率权重高于精确率。 """ - - def __init__(self, tag_vocab, pred=None, target=None, seq_len=None, encoding_type='bio', ignore_labels=None, + + def __init__(self, tag_vocab, pred=None, target=None, seq_len=None, encoding_type=None, ignore_labels=None, only_gross=True, f_type='micro', beta=1): - - encoding_type = encoding_type.lower() - + r""" + + :param tag_vocab: 标签的 :class:`~fastNLP.Vocabulary` 。支持的标签为"B"(没有label);或"B-xxx"(xxx为某种label,比如POS中的NN), + 在解码时,会将相同xxx的认为是同一个label,比如['B-NN', 'E-NN']会被合并为一个'NN'. + :param str pred: 用该key在evaluate()时从传入dict中取出prediction数据。 为None,则使用 `pred` 取数据 + :param str target: 用该key在evaluate()时从传入dict中取出target数据。 为None,则使用 `target` 取数据 + :param str seq_len: 用该key在evaluate()时从传入dict中取出sequence length数据。为None,则使用 `seq_len` 取数据。 + :param str encoding_type: 目前支持bio, bmes, bmeso, bioes。默认为None,通过tag_vocab自动判断. + :param list ignore_labels: str 组成的list. 这个list中的class不会被用于计算。例如在POS tagging时传入['NN'],则不会计算'NN'个label + :param bool only_gross: 是否只计算总的f1, precision, recall的值;如果为False,不仅返回总的f1, pre, rec, 还会返回每个label的f1, pre, rec + :param str f_type: `micro` 或 `macro` . `micro` :通过先计算总体的TP,FN和FP的数量,再计算f, precision, recall; `macro` : 分布计算每个类别的f, precision, recall,然后做平均(各类别f的权重相同) + :param float beta: f_beta分数, :math:`f_{beta} = \frac{(1 + {beta}^{2})*(pre*rec)}{({beta}^{2}*pre + rec)}` . 常用为 `beta=0.5, 1, 2` 若为0.5则精确率的权重高于召回率;若为1,则两者平等;若为2,则召回率权重高于精确率。 + """ + if not isinstance(tag_vocab, Vocabulary): raise TypeError("tag_vocab can only be fastNLP.Vocabulary, not {}.".format(type(tag_vocab))) if f_type not in ('micro', 'macro'): raise ValueError("f_type only supports `micro` or `macro`', got {}.".format(f_type)) - - self.encoding_type = encoding_type + + if encoding_type: + encoding_type = encoding_type.lower() + _check_tag_vocab_and_encoding_type(tag_vocab, encoding_type) + self.encoding_type = encoding_type + else: + self.encoding_type = _get_encoding_type_from_tag_vocab(tag_vocab) + if self.encoding_type == 'bmes': self.tag_to_span_func = _bmes_tag_to_spans elif self.encoding_type == 'bio': @@ -536,23 +628,23 @@ class SpanFPreRecMetric(MetricBase): elif self.encoding_type == 'bioes': self.tag_to_span_func = _bioes_tag_to_spans else: - raise ValueError("Only support 'bio', 'bmes', 'bmeso' type.") - + raise ValueError("Only support 'bio', 'bmes', 'bmeso', 'bioes' type.") + self.ignore_labels = ignore_labels self.f_type = f_type self.beta = beta self.beta_square = self.beta ** 2 self.only_gross = only_gross - + super().__init__() self._init_param_map(pred=pred, target=target, seq_len=seq_len) - + self.tag_vocab = tag_vocab - + self._true_positives = defaultdict(int) self._false_positives = defaultdict(int) self._false_negatives = defaultdict(int) - + def evaluate(self, pred, target, seq_len): """evaluate函数将针对一个批次的预测结果做评价指标的累计 @@ -567,11 +659,11 @@ class SpanFPreRecMetric(MetricBase): if not isinstance(target, torch.Tensor): raise TypeError(f"`target` in {_get_func_signature(self.evaluate)} must be torch.Tensor," f"got {type(target)}.") - + if not isinstance(seq_len, torch.Tensor): raise TypeError(f"`seq_lens` in {_get_func_signature(self.evaluate)} must be torch.Tensor," f"got {type(seq_len)}.") - + if pred.size() == target.size() and len(target.size()) == 2: pass elif len(pred.size()) == len(target.size()) + 1 and 
len(target.size()) == 2: @@ -584,20 +676,20 @@ class SpanFPreRecMetric(MetricBase): raise RuntimeError(f"In {_get_func_signature(self.evaluate)}, when pred have " f"size:{pred.size()}, target should have size: {pred.size()} or " f"{pred.size()[:-1]}, got {target.size()}.") - + batch_size = pred.size(0) pred = pred.tolist() target = target.tolist() for i in range(batch_size): pred_tags = pred[i][:int(seq_len[i])] gold_tags = target[i][:int(seq_len[i])] - + pred_str_tags = [self.tag_vocab.to_word(tag) for tag in pred_tags] gold_str_tags = [self.tag_vocab.to_word(tag) for tag in gold_tags] - + pred_spans = self.tag_to_span_func(pred_str_tags, ignore_labels=self.ignore_labels) gold_spans = self.tag_to_span_func(gold_str_tags, ignore_labels=self.ignore_labels) - + for span in pred_spans: if span in gold_spans: self._true_positives[span[0]] += 1 @@ -606,7 +698,7 @@ class SpanFPreRecMetric(MetricBase): self._false_positives[span[0]] += 1 for span in gold_spans: self._false_negatives[span[0]] += 1 - + def get_metric(self, reset=True): """get_metric函数将根据evaluate函数累计的评价指标统计量来计算最终的评价结果.""" evaluate_result = {} @@ -624,7 +716,7 @@ class SpanFPreRecMetric(MetricBase): f, pre, rec = self._compute_f_pre_rec(tp, fn, fp) f_sum += f pre_sum += pre - rec_sum + rec + rec_sum += rec if not self.only_gross and tag != '': # tag!=''防止无tag的情况 f_key = 'f-{}'.format(tag) pre_key = 'pre-{}'.format(tag) @@ -632,12 +724,12 @@ class SpanFPreRecMetric(MetricBase): evaluate_result[f_key] = f evaluate_result[pre_key] = pre evaluate_result[rec_key] = rec - + if self.f_type == 'macro': evaluate_result['f'] = f_sum / len(tags) evaluate_result['pre'] = pre_sum / len(tags) evaluate_result['rec'] = rec_sum / len(tags) - + if self.f_type == 'micro': f, pre, rec = self._compute_f_pre_rec(sum(self._true_positives.values()), sum(self._false_negatives.values()), @@ -645,17 +737,17 @@ class SpanFPreRecMetric(MetricBase): evaluate_result['f'] = f evaluate_result['pre'] = pre evaluate_result['rec'] = rec - + if reset: self._true_positives = defaultdict(int) self._false_positives = defaultdict(int) self._false_negatives = defaultdict(int) - + for key, value in evaluate_result.items(): evaluate_result[key] = round(value, 6) - + return evaluate_result - + def _compute_f_pre_rec(self, tp, fn, fp): """ @@ -667,7 +759,7 @@ class SpanFPreRecMetric(MetricBase): pre = tp / (fp + tp + 1e-13) rec = tp / (fn + tp + 1e-13) f = (1 + self.beta_square) * pre * rec / (self.beta_square * pre + rec + 1e-13) - + return f, pre, rec @@ -736,169 +828,129 @@ def _pred_topk(y_prob, k=1): return y_pred_topk, y_prob_topk -class ExtractiveQAMetric(MetricBase): - r""" - 别名::class:`fastNLP.ExtractiveQAMetric` :class:`fastNLP.core.metrics.ExtractiveQAMetric` +class CMRC2018Metric(MetricBase): + def __init__(self, answers=None, raw_chars=None, context_len=None, pred_start=None, pred_end=None): + super().__init__() + self._init_param_map(answers=answers, raw_chars=raw_chars, context_len=context_len, pred_start=pred_start, + pred_end=pred_end) + self.em = 0 + self.total = 0 + self.f1 = 0 - 抽取式QA(如SQuAD)的metric. - - :param pred1: 参数映射表中 `pred1` 的映射关系,None表示映射关系为 `pred1` -> `pred1` - :param pred2: 参数映射表中 `pred2` 的映射关系,None表示映射关系为 `pred2` -> `pred2` - :param target1: 参数映射表中 `target1` 的映射关系,None表示映射关系为 `target1` -> `target1` - :param target2: 参数映射表中 `target2` 的映射关系,None表示映射关系为 `target2` -> `target2` - :param float beta: f_beta分数, :math:`f_{beta} = \frac{(1 + {beta}^{2})*(pre*rec)}{({beta}^{2}*pre + rec)}` . - 常用为beta=0.5, 1, 2. 
若为0.5则精确率的权重高于召回率;若为1,则两者平等;若为2,则召回率权重高于精确率。 - :param bool right_open: right_open为true表示start跟end指针指向一个左闭右开区间,为false表示指向一个左闭右闭区间。 - :param bool print_predict_stat: True则输出预测答案是否为空与正确答案是否为空的统计信息, False则不输出 - - """ - - def __init__(self, pred1=None, pred2=None, target1=None, target2=None, - beta=1, right_open=True, print_predict_stat=False): - - super(ExtractiveQAMetric, self).__init__() - - self._init_param_map(pred1=pred1, pred2=pred2, target1=target1, target2=target2) - - self.print_predict_stat = print_predict_stat - - self.no_ans_correct = 0 - self.no_ans_wrong = 0 - - self.has_ans_correct = 0 - self.has_ans_wrong = 0 - - self.has_ans_f = 0. - - self.no2no = 0 - self.no2yes = 0 - self.yes2no = 0 - self.yes2yes = 0 - - self.f_beta = beta - - self.right_open = right_open - - def evaluate(self, pred1, pred2, target1, target2): - """evaluate函数将针对一个批次的预测结果做评价指标的累计 + def evaluate(self, answers, raw_chars, context_len, pred_start, pred_end): + """ - :param pred1: [batch]或者[batch, seq_len], 预测答案开始的index, 如果SQuAD2.0中答案为空则为0 - :param pred2: [batch]或者[batch, seq_len] 预测答案结束的index, 如果SQuAD2.0中答案为空则为0(左闭右闭区间)或者1(左闭右开区间) - :param target1: [batch], 正确答案开始的index, 如果SQuAD2.0中答案为空则为0 - :param target2: [batch], 正确答案结束的index, 如果SQuAD2.0中答案为空则为0(左闭右闭区间)或者1(左闭右开区间) - :return: None + :param list[str] answers: 如[["答案1", "答案2", "答案3"], [...], ...] + :param list[str] raw_chars: [["这", "是", ...], [...]] + :param tensor context_len: context长度, batch_size + :param tensor pred_start: batch_size x length + :param tensor pred_end: batch_size x length + :return: """ - pred_start = pred1 - pred_end = pred2 - target_start = target1 - target_end = target2 - - if len(pred_start.size()) == 2: - start_inference = pred_start.max(dim=-1)[1].cpu().tolist() - else: - start_inference = pred_start.cpu().tolist() - if len(pred_end.size()) == 2: - end_inference = pred_end.max(dim=-1)[1].cpu().tolist() - else: - end_inference = pred_end.cpu().tolist() - - start, end = [], [] - max_len = pred_start.size(1) - t_start = target_start.cpu().tolist() - t_end = target_end.cpu().tolist() - - for s, e in zip(start_inference, end_inference): - start.append(min(s, e)) - end.append(max(s, e)) - for s, e, ts, te in zip(start, end, t_start, t_end): - if not self.right_open: - e += 1 - te += 1 - if ts == 0 and te == int(not self.right_open): - if s == 0 and e == int(not self.right_open): - self.no_ans_correct += 1 - self.no2no += 1 - else: - self.no_ans_wrong += 1 - self.no2yes += 1 - else: - if s == 0 and e == int(not self.right_open): - self.yes2no += 1 - else: - self.yes2yes += 1 - - if s == ts and e == te: - self.has_ans_correct += 1 - else: - self.has_ans_wrong += 1 - a = [0] * s + [1] * (e - s) + [0] * (max_len - e) - b = [0] * ts + [1] * (te - ts) + [0] * (max_len - te) - a, b = torch.tensor(a), torch.tensor(b) - - TP = int(torch.sum(a * b)) - pre = TP / int(torch.sum(a)) if int(torch.sum(a)) > 0 else 0 - rec = TP / int(torch.sum(b)) if int(torch.sum(b)) > 0 else 0 - - if pre + rec > 0: - f = (1 + (self.f_beta ** 2)) * pre * rec / ((self.f_beta ** 2) * pre + rec) - else: - f = 0 - self.has_ans_f += f - + batch_size, max_len = pred_start.size() + context_mask = seq_len_to_mask(context_len, max_len=max_len).eq(0) + pred_start.masked_fill_(context_mask, float('-inf')) + pred_end.masked_fill_(context_mask, float('-inf')) + max_pred_start, pred_start_index = pred_start.max(dim=-1, keepdim=True) # batch_size, + pred_start_mask = pred_start.eq(max_pred_start).cumsum(dim=-1).eq(0) # 只能预测这之后的值 + pred_end.masked_fill_(pred_start_mask, float('-inf')) + 
pred_end_index = pred_end.argmax(dim=-1) + 1 + pred_ans = [] + for index, (start, end) in enumerate(zip(pred_start_index.flatten().tolist(), pred_end_index.tolist())): + pred_ans.append(''.join(raw_chars[index][start:end])) + for answer, pred_an in zip(answers, pred_ans): + pred_an = pred_an.strip() + self.f1 += _calc_cmrc2018_f1_score(answer, pred_an) + self.total += 1 + self.em += _calc_cmrc2018_em_score(answer, pred_an) + def get_metric(self, reset=True): - """get_metric函数将根据evaluate函数累计的评价指标统计量来计算最终的评价结果.""" - evaluate_result = {} - - if self.no_ans_correct + self.no_ans_wrong + self.has_ans_correct + self.no_ans_wrong <= 0: - return evaluate_result - - evaluate_result['EM'] = 0 - evaluate_result[f'f_{self.f_beta}'] = 0 - - flag = 0 - - if self.no_ans_correct + self.no_ans_wrong > 0: - evaluate_result[f'noAns-f_{self.f_beta}'] = \ - round(100 * self.no_ans_correct / (self.no_ans_correct + self.no_ans_wrong), 3) - evaluate_result['noAns-EM'] = \ - round(100 * self.no_ans_correct / (self.no_ans_correct + self.no_ans_wrong), 3) - evaluate_result[f'f_{self.f_beta}'] += evaluate_result[f'noAns-f_{self.f_beta}'] - evaluate_result['EM'] += evaluate_result['noAns-EM'] - flag += 1 - - if self.has_ans_correct + self.has_ans_wrong > 0: - evaluate_result[f'hasAns-f_{self.f_beta}'] = \ - round(100 * self.has_ans_f / (self.has_ans_correct + self.has_ans_wrong), 3) - evaluate_result['hasAns-EM'] = \ - round(100 * self.has_ans_correct / (self.has_ans_correct + self.has_ans_wrong), 3) - evaluate_result[f'f_{self.f_beta}'] += evaluate_result[f'hasAns-f_{self.f_beta}'] - evaluate_result['EM'] += evaluate_result['hasAns-EM'] - flag += 1 - - if self.print_predict_stat: - evaluate_result['no2no'] = self.no2no - evaluate_result['no2yes'] = self.no2yes - evaluate_result['yes2no'] = self.yes2no - evaluate_result['yes2yes'] = self.yes2yes - - if flag <= 0: - return evaluate_result - - evaluate_result[f'f_{self.f_beta}'] = round(evaluate_result[f'f_{self.f_beta}'] / flag, 3) - evaluate_result['EM'] = round(evaluate_result['EM'] / flag, 3) - + eval_res = {'f1': round(self.f1 / self.total*100, 2), 'em': round(self.em / self.total*100, 2)} if reset: - self.no_ans_correct = 0 - self.no_ans_wrong = 0 - - self.has_ans_correct = 0 - self.has_ans_wrong = 0 - - self.has_ans_f = 0. 
- - self.no2no = 0 - self.no2yes = 0 - self.yes2no = 0 - self.yes2yes = 0 - - return evaluate_result + self.em = 0 + self.total = 0 + self.f1 = 0 + return eval_res + +# split Chinese +def _cn_segmentation(in_str, rm_punc=False): + in_str = str(in_str).lower().strip() + segs_out = [] + temp_str = "" + sp_char = {'-', ':', '_', '*', '^', '/', '\\', '~', '`', '+', '=', ',', '。', ':', '?', '!', '“', '”', ';', '’', '《', + '》', '……', '·', '、', '「', '」', '(', ')', '-', '~', '『', '』'} + for char in in_str: + if rm_punc and char in sp_char: + continue + if re.search(r'[\u4e00-\u9fa5]', char) or char in sp_char: + if temp_str != "": + ss = list(temp_str) + segs_out.extend(ss) + temp_str = "" + segs_out.append(char) + else: + temp_str += char + + # handling last part + if temp_str != "": + ss = list(temp_str) + segs_out.extend(ss) + + return segs_out + + +# remove punctuation +def _remove_punctuation(in_str): + in_str = str(in_str).lower().strip() + sp_char = ['-', ':', '_', '*', '^', '/', '\\', '~', '`', '+', '=', + ',', '。', ':', '?', '!', '“', '”', ';', '’', '《', '》', '……', '·', '、', + '「', '」', '(', ')', '-', '~', '『', '』'] + out_segs = [] + for char in in_str: + if char in sp_char: + continue + else: + out_segs.append(char) + return ''.join(out_segs) + + +# find longest common string +def _find_lcs(s1, s2): + m = [[0 for i in range(len(s2) + 1)] for j in range(len(s1) + 1)] + mmax = 0 + p = 0 + for i in range(len(s1)): + for j in range(len(s2)): + if s1[i] == s2[j]: + m[i + 1][j + 1] = m[i][j] + 1 + if m[i + 1][j + 1] > mmax: + mmax = m[i + 1][j + 1] + p = i + 1 + return s1[p - mmax:p], mmax + + +def _calc_cmrc2018_f1_score(answers, prediction): + f1_scores = [] + for ans in answers: + ans_segs = _cn_segmentation(ans, rm_punc=True) + prediction_segs = _cn_segmentation(prediction, rm_punc=True) + lcs, lcs_len = _find_lcs(ans_segs, prediction_segs) + if lcs_len == 0: + f1_scores.append(0) + continue + precision = 1.0 * lcs_len / len(prediction_segs) + recall = 1.0 * lcs_len / len(ans_segs) + f1 = (2 * precision * recall) / (precision + recall) + f1_scores.append(f1) + return max(f1_scores) + + +def _calc_cmrc2018_em_score(answers, prediction): + em = 0 + for ans in answers: + ans_ = _remove_punctuation(ans) + prediction_ = _remove_punctuation(prediction) + if ans_ == prediction_: + em = 1 + break + return em diff --git a/fastNLP/core/optimizer.py b/fastNLP/core/optimizer.py index 3036257c..4d76c24e 100644 --- a/fastNLP/core/optimizer.py +++ b/fastNLP/core/optimizer.py @@ -9,21 +9,23 @@ __all__ = [ "AdamW" ] -import torch import math + import torch from torch.optim.optimizer import Optimizer as TorchOptimizer class Optimizer(object): """ - 别名::class:`fastNLP.Optimizer` :class:`fastNLP.core.optimizer.Optimizer` - - :param model_params: a generator. E.g. ``model.parameters()`` for PyTorch models. - :param kwargs: additional parameters. + Optimizer """ def __init__(self, model_params, **kwargs): + """ + + :param model_params: a generator. E.g. ``model.parameters()`` for PyTorch models. + :param kwargs: additional parameters. 
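# Illustrative sketch (not part of the patch): a worked example of the character-level,
# LCS-based span F1 computed by the CMRC2018 helpers above (_cn_segmentation, _find_lcs,
# _calc_cmrc2018_f1_score), using hypothetical strings; punctuation is stripped before matching.
answer = "北京大学"           # one gold answer, 4 characters
prediction = "在北京大学里"    # predicted span, 6 characters

lcs_len = 4                                          # longest common substring is "北京大学"
precision = lcs_len / len(prediction)                # 4 / 6
recall = lcs_len / len(answer)                       # 4 / 4
f1 = 2 * precision * recall / (precision + recall)   # = 0.8
# _calc_cmrc2018_f1_score takes the maximum F1 over all gold answers for a question, and
# _calc_cmrc2018_em_score scores 1 only when the punctuation-stripped strings match exactly.
print(round(f1, 2))  # 0.8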
+ """ if model_params is not None and not hasattr(model_params, "__next__"): raise RuntimeError("model parameters should be a generator, rather than {}.".format(type(model_params))) self.model_params = model_params @@ -31,15 +33,18 @@ class Optimizer(object): def construct_from_pytorch(self, model_params): raise NotImplementedError - - def _get_require_grads_param(self, params): + + @staticmethod + def _get_require_grads_param(params): """ 将params中不需要gradient的删除 + :param iterable params: parameters :return: list(nn.Parameters) """ return [param for param in params if param.requires_grad] + class NullOptimizer(Optimizer): """ 当不希望Trainer更新optimizer时,传入本optimizer,但请确保通过callback的方式对参数进行了更新。 @@ -49,7 +54,7 @@ class NullOptimizer(Optimizer): super().__init__(None) def construct_from_pytorch(self, model_params): - pass + return self def __getattr__(self, item): def pass_func(*args, **kwargs): @@ -60,14 +65,15 @@ class NullOptimizer(Optimizer): class SGD(Optimizer): """ - 别名::class:`fastNLP.SGD` :class:`fastNLP.core.optimizer.SGD` - - :param float lr: learning rate. Default: 0.01 - :param float momentum: momentum. Default: 0 - :param model_params: a generator. E.g. ``model.parameters()`` for PyTorch models. + SGD """ def __init__(self, lr=0.001, momentum=0, model_params=None): + """ + :param float lr: learning rate. Default: 0.01 + :param float momentum: momentum. Default: 0 + :param model_params: a generator. E.g. ``model.parameters()`` for PyTorch models. + """ if not isinstance(lr, float): raise TypeError("learning rate has to be float.") super(SGD, self).__init__(model_params, lr=lr, momentum=momentum) @@ -82,14 +88,18 @@ class SGD(Optimizer): class Adam(Optimizer): """ - 别名::class:`fastNLP.Adam` :class:`fastNLP.core.optimizer.Adam` - - :param float lr: learning rate - :param float weight_decay: - :param model_params: a generator. E.g. ``model.parameters()`` for PyTorch models. + Adam """ def __init__(self, lr=0.001, weight_decay=0, betas=(0.9, 0.999), eps=1e-8, amsgrad=False, model_params=None): + """ + + :param float lr: learning rate + :param float weight_decay: + :param eps: + :param amsgrad: + :param model_params: a generator. E.g. ``model.parameters()`` for PyTorch models. + """ if not isinstance(lr, float): raise TypeError("learning rate has to be float.") super(Adam, self).__init__(model_params, lr=lr, betas=betas, eps=eps, amsgrad=amsgrad, @@ -105,9 +115,8 @@ class Adam(Optimizer): class AdamW(TorchOptimizer): r""" - 别名::class:`fastNLP.AdamW` :class:`fastNLP.core.optimizer.AdamW` - - 对AdamW的实现,该实现应该会在pytorch更高版本中出现,https://github.com/pytorch/pytorch/pull/21250。这里提前加入 + 对AdamW的实现,该实现在pytorch 1.2.0版本中已经出现,https://github.com/pytorch/pytorch/pull/21250。 + 这里加入以适配低版本的pytorch .. todo:: 翻译成中文 @@ -115,27 +124,28 @@ class AdamW(TorchOptimizer): The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_. The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_. 
- :param params (iterable): iterable of parameters to optimize or dicts defining - parameter groups - :param lr (float, optional): learning rate (default: 1e-3) - :param betas (Tuple[float, float], optional): coefficients used for computing - running averages of gradient and its square (default: (0.9, 0.99)) - :param eps (float, optional): term added to the denominator to improve - numerical stability (default: 1e-8) - :param weight_decay (float, optional): weight decay coefficient (default: 1e-2) - algorithm from the paper `On the Convergence of Adam and Beyond`_ - (default: False) - - .. _Adam\: A Method for Stochastic Optimization: - https://arxiv.org/abs/1412.6980 - .. _Decoupled Weight Decay Regularization: - https://arxiv.org/abs/1711.05101 - .. _On the Convergence of Adam and Beyond: - https://openreview.net/forum?id=ryQu7f-RZ + .. _Adam\: A Method for Stochastic Optimization: https://arxiv.org/abs/1412.6980 + + .. _Decoupled Weight Decay Regularization: https://arxiv.org/abs/1711.05101 + + .. _On the Convergence of Adam and Beyond: https://openreview.net/forum?id=ryQu7f-RZ """ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2, amsgrad=False): + """ + + :param params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + :param lr (float, optional): learning rate (default: 1e-3) + :param betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square (default: (0.9, 0.99)) + :param eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-8) + :param weight_decay (float, optional): weight decay coefficient (default: 1e-2) + algorithm from the paper `On the Convergence of Adam and Beyond`_ + (default: False) + """ if not 0.0 <= lr: raise ValueError("Invalid learning rate: {}".format(lr)) if not 0.0 <= eps: diff --git a/fastNLP/core/predictor.py b/fastNLP/core/predictor.py index 2d6a7380..e4112d5f 100644 --- a/fastNLP/core/predictor.py +++ b/fastNLP/core/predictor.py @@ -1,13 +1,15 @@ -""" - ..todo:: - 检查这个类是否需要 -""" +"""undocumented""" + +__all__ = [ + "Predictor" +] + from collections import defaultdict import torch -from . import DataSetIter from . import DataSet +from . import DataSetIter from . import SequentialSampler from .utils import _build_args, _move_dict_value_to_device, _get_model_device @@ -18,18 +20,20 @@ class Predictor(object): 与测试器(Tester)不同的是,predictor不关心模型性能的评价指标,只做inference。 这是一个fastNLP调用的高级模型包装器。它与Trainer、Tester不共享任何操作。 - - :param torch.nn.Module network: 用来完成预测任务的模型 """ - + def __init__(self, network): + """ + + :param torch.nn.Module network: 用来完成预测任务的模型 + """ if not isinstance(network, torch.nn.Module): raise ValueError( "Only fastNLP.models.BaseModel or torch.nn,Module is allowed, not {}".format(type(network))) self.network = network self.batch_size = 1 self.batch_output = [] - + def predict(self, data: DataSet, seq_len_field_name=None): """用已经训练好的模型进行inference. 
@@ -41,27 +45,27 @@ class Predictor(object): raise ValueError("Only Dataset class is allowed, not {}.".format(type(data))) if seq_len_field_name is not None and seq_len_field_name not in data.field_arrays: raise ValueError("Field name {} not found in DataSet {}.".format(seq_len_field_name, data)) - + prev_training = self.network.training self.network.eval() network_device = _get_model_device(self.network) batch_output = defaultdict(list) data_iterator = DataSetIter(data, batch_size=self.batch_size, sampler=SequentialSampler(), as_numpy=False) - + if hasattr(self.network, "predict"): predict_func = self.network.predict else: predict_func = self.network.forward - + with torch.no_grad(): for batch_x, _ in data_iterator: _move_dict_value_to_device(batch_x, _, device=network_device) refined_batch_x = _build_args(predict_func, **batch_x) prediction = predict_func(**refined_batch_x) - + if seq_len_field_name is not None: seq_lens = batch_x[seq_len_field_name].tolist() - + for key, value in prediction.items(): value = value.cpu().numpy() if len(value.shape) == 1 or (len(value.shape) == 2 and value.shape[1] == 1): @@ -74,6 +78,6 @@ class Predictor(object): batch_output[key].extend(tmp_batch) else: batch_output[key].append(value) - + self.network.train(prev_training) return batch_output diff --git a/fastNLP/core/sampler.py b/fastNLP/core/sampler.py index d8ba1ad1..6e025688 100644 --- a/fastNLP/core/sampler.py +++ b/fastNLP/core/sampler.py @@ -15,9 +15,6 @@ import numpy as np class Sampler(object): """ - 别名::class:`fastNLP.Sampler` :class:`fastNLP.core.sampler.Sampler` - - `Sampler` 类的基类. 规定以何种顺序取出data中的元素 子类必须实现 ``__call__`` 方法. 输入 `DataSet` 对象, 返回其中元素的下标序列 @@ -25,16 +22,14 @@ class Sampler(object): def __call__(self, data_set): """ - :param DataSet data_set: `DataSet` 对象, 需要Sample的数据 - :return result: list(int) 其中元素的下标序列, ``data_set`` 中元素会按 ``result`` 中顺序取出 - """ + :param DataSet data_set: `DataSet` 对象, 需要Sample的数据 + :return result: list(int) 其中元素的下标序列, ``data_set`` 中元素会按 ``result`` 中顺序取出 + """ raise NotImplementedError class SequentialSampler(Sampler): """ - 别名::class:`fastNLP.SequentialSampler` :class:`fastNLP.core.sampler.SequentialSampler` - 顺序取出元素的 `Sampler` """ @@ -45,8 +40,6 @@ class SequentialSampler(Sampler): class RandomSampler(Sampler): """ - 别名::class:`fastNLP.RandomSampler` :class:`fastNLP.core.sampler.RandomSampler` - 随机化取元素的 `Sampler` """ @@ -57,17 +50,17 @@ class RandomSampler(Sampler): class BucketSampler(Sampler): """ - 别名::class:`fastNLP.BucketSampler` :class:`fastNLP.core.sampler.BucketSampler` - 带Bucket的 `Random Sampler`. 可以随机地取出长度相似的元素 - - :param int num_buckets: bucket的数量 - :param int batch_size: batch的大小. 默认为None,Trainer在调用BucketSampler时,会将该值正确设置,如果是非Trainer场景使用,需 - 要显示传递该值 - :param str seq_len_field_name: 对应序列长度的 `field` 的名字 """ def __init__(self, num_buckets=10, batch_size=None, seq_len_field_name='seq_len'): + """ + + :param int num_buckets: bucket的数量 + :param int batch_size: batch的大小. 
默认为None,Trainer在调用BucketSampler时,会将该值正确设置,如果是非Trainer场景使用,需 + 要显示传递该值 + :param str seq_len_field_name: 对应序列长度的 `field` 的名字 + """ self.num_buckets = num_buckets self.batch_size = batch_size self.seq_len_field_name = seq_len_field_name diff --git a/fastNLP/core/tester.py b/fastNLP/core/tester.py index c1d270d1..e92eb422 100644 --- a/fastNLP/core/tester.py +++ b/fastNLP/core/tester.py @@ -27,14 +27,21 @@ tester模块实现了 fastNLP 所需的Tester类,能在提供数据、模型 tester = Tester(dataset, model, metrics=AccuracyMetric()) eval_results = tester.test() -这里Metric的映射规律是和 :class:`fastNLP.Trainer` 中一致的,具体使用请参考 :doc:`trainer 模块` 的1.3部分。 +这里Metric的映射规律是和 :class:`fastNLP.Trainer` 中一致的,具体使用请参考 :mod:`trainer 模块` 的1.3部分。 Tester在验证进行之前会调用model.eval()提示当前进入了evaluation阶段,即会关闭nn.Dropout()等,在验证结束之后会调用model.train()恢复到训练状态。 """ +import time + import torch import torch.nn as nn +try: + from tqdm.auto import tqdm +except: + from .utils import _pseudo_tqdm as tqdm + from .batch import BatchIter, DataSetIter from .dataset import DataSet from .metrics import _prepare_metrics @@ -47,7 +54,9 @@ from .utils import _get_func_signature from .utils import _get_model_device from .utils import _move_model_to_device from ._parallel_utils import _data_parallel_wrapper +from ._parallel_utils import _model_contains_inner_module from functools import partial +from ._logger import logger __all__ = [ "Tester" @@ -56,36 +65,35 @@ __all__ = [ class Tester(object): """ - 别名::class:`fastNLP.Tester` :class:`fastNLP.core.tester.Tester` - Tester是在提供数据,模型以及metric的情况下进行性能测试的类。需要传入模型,数据以及metric进行验证。 - - :param ~fastNLP.DataSet data: 需要测试的数据集 - :param torch.nn.module model: 使用的模型 - :param ~fastNLP.core.metrics.MetricBase,List[~fastNLP.core.metrics.MetricBase] metrics: 测试时使用的metrics - :param int batch_size: evaluation时使用的batch_size有多大。 - :param str,int,torch.device,list(int) device: 将模型load到哪个设备。默认为None,即Trainer不对模型 - 的计算位置进行管理。支持以下的输入: - - 1. str: ['cpu', 'cuda', 'cuda:0', 'cuda:1', ...] 依次为'cpu'中, 可见的第一个GPU中,可见的第一个GPU中,可见的第二个GPU中; - - 2. torch.device:将模型装载到torch.device上。 - - 3. int: 将使用device_id为该值的gpu进行训练 - - 4. list(int):如果多于1个device,将使用torch.nn.DataParallel包裹model, 并使用传入的device。 - - 5. None. 为None则不对模型进行任何处理,如果传入的model为torch.nn.DataParallel该值必须为None。 - - 如果模型是通过predict()进行预测的话,那么将不能使用多卡(DataParallel)进行验证,只会使用第一张卡上的模型。 - :param int verbose: 如果为0不输出任何信息; 如果为1,打印出验证结果。 """ - def __init__(self, data, model, metrics, batch_size=16, num_workers=0, device=None, verbose=1): - super(Tester, self).__init__() + def __init__(self, data, model, metrics, batch_size=16, num_workers=0, device=None, verbose=1, use_tqdm=True): + """ - if not isinstance(data, DataSet): - raise TypeError(f"The type of data must be `fastNLP.DataSet`, got `{type(data)}`.") + :param ~fastNLP.DataSet data: 需要测试的数据集 + :param torch.nn.module model: 使用的模型 + :param ~fastNLP.core.metrics.MetricBase,List[~fastNLP.core.metrics.MetricBase] metrics: 测试时使用的metrics + :param int batch_size: evaluation时使用的batch_size有多大。 + :param str,int,torch.device,list(int) device: 将模型load到哪个设备。默认为None,即Trainer不对模型 + 的计算位置进行管理。支持以下的输入: + + 1. str: ['cpu', 'cuda', 'cuda:0', 'cuda:1', ...] 依次为'cpu'中, 可见的第一个GPU中,可见的第一个GPU中,可见的第二个GPU中; + + 2. torch.device:将模型装载到torch.device上。 + + 3. int: 将使用device_id为该值的gpu进行训练 + + 4. list(int):如果多于1个device,将使用torch.nn.DataParallel包裹model, 并使用传入的device。 + + 5. None. 
为None则不对模型进行任何处理,如果传入的model为torch.nn.DataParallel该值必须为None。 + + 如果模型是通过predict()进行预测的话,那么将不能使用多卡(DataParallel)进行验证,只会使用第一张卡上的模型。 + :param int verbose: 如果为0不输出任何信息; 如果为1,打印出验证结果。 + :param bool use_tqdm: 是否使用tqdm来显示测试进度; 如果为False,则不会显示任何内容。 + """ + super(Tester, self).__init__() + if not isinstance(model, nn.Module): raise TypeError(f"The type of model must be `torch.nn.Module`, got `{type(model)}`.") @@ -95,6 +103,8 @@ class Tester(object): self._model = _move_model_to_device(model, device=device) self.batch_size = batch_size self.verbose = verbose + self.use_tqdm = use_tqdm + self.logger = logger if isinstance(data, DataSet): self.data_iterator = DataSetIter( @@ -106,19 +116,22 @@ class Tester(object): # check predict if (hasattr(self._model, 'predict') and callable(self._model.predict)) or \ - (isinstance(self._model, nn.DataParallel) and hasattr(self._model.module, 'predict') and - callable(self._model.module.predict)): + (_model_contains_inner_module(self._model) and hasattr(self._model.module, 'predict') and + callable(self._model.module.predict)): if isinstance(self._model, nn.DataParallel): self._predict_func_wrapper = partial(_data_parallel_wrapper('predict', self._model.device_ids, self._model.output_device), network=self._model.module) + self._predict_func = self._model.module.predict # 用于匹配参数 + elif isinstance(self._model, nn.parallel.DistributedDataParallel): self._predict_func = self._model.module.predict + self._predict_func_wrapper = self._model.module.predict # 用于调用 else: self._predict_func = self._model.predict self._predict_func_wrapper = self._model.predict else: - if isinstance(self._model, nn.DataParallel): + if _model_contains_inner_module(model): self._predict_func_wrapper = self._model.forward self._predict_func = self._model.module.forward else: @@ -126,10 +139,9 @@ class Tester(object): self._predict_func_wrapper = self._model.forward def test(self): - """开始进行验证,并返回验证结果。 + r"""开始进行验证,并返回验证结果。 - :return Dict[Dict] : dict的二层嵌套结构,dict的第一层是metric的名称; 第二层是这个metric的指标。 - 一个AccuracyMetric的例子为{'AccuracyMetric': {'acc': 1.0}}。 + :return Dict[Dict]: dict的二层嵌套结构,dict的第一层是metric的名称; 第二层是这个metric的指标。一个AccuracyMetric的例子为{'AccuracyMetric': {'acc': 1.0}}。 """ # turn on the testing mode; clean up the history self._model_device = _get_model_device(self._model) @@ -139,21 +151,39 @@ class Tester(object): eval_results = {} try: with torch.no_grad(): - for batch_x, batch_y in data_iterator: - _move_dict_value_to_device(batch_x, batch_y, device=self._model_device) - pred_dict = self._data_forward(self._predict_func, batch_x) - if not isinstance(pred_dict, dict): - raise TypeError(f"The return value of {_get_func_signature(self._predict_func)} " - f"must be `dict`, got {type(pred_dict)}.") + if not self.use_tqdm: + from .utils import _pseudo_tqdm as inner_tqdm + else: + inner_tqdm = tqdm + with inner_tqdm(total=len(data_iterator), leave=False, dynamic_ncols=True) as pbar: + pbar.set_description_str(desc="Test") + + start_time = time.time() + + for batch_x, batch_y in data_iterator: + _move_dict_value_to_device(batch_x, batch_y, device=self._model_device) + pred_dict = self._data_forward(self._predict_func, batch_x) + if not isinstance(pred_dict, dict): + raise TypeError(f"The return value of {_get_func_signature(self._predict_func)} " + f"must be `dict`, got {type(pred_dict)}.") + for metric in self.metrics: + metric(pred_dict, batch_y) + + if self.use_tqdm: + pbar.update() + for metric in self.metrics: - metric(pred_dict, batch_y) - for metric in self.metrics: - eval_result = 
metric.get_metric() - if not isinstance(eval_result, dict): - raise TypeError(f"The return value of {_get_func_signature(metric.get_metric)} must be " - f"`dict`, got {type(eval_result)}") - metric_name = metric.__class__.__name__ - eval_results[metric_name] = eval_result + eval_result = metric.get_metric() + if not isinstance(eval_result, dict): + raise TypeError(f"The return value of {_get_func_signature(metric.get_metric)} must be " + f"`dict`, got {type(eval_result)}") + metric_name = metric.get_metric_name() + eval_results[metric_name] = eval_result + pbar.close() + end_time = time.time() + test_str = f'Evaluate data in {round(end_time - start_time, 2)} seconds!' + if self.verbose >= 0: + self.logger.info(test_str) except _CheckError as e: prev_func_signature = _get_func_signature(self._predict_func) _check_loss_evaluate(prev_func_signature=prev_func_signature, func_signature=e.func_signature, @@ -161,7 +191,7 @@ class Tester(object): dataset=self.data, check_level=0) if self.verbose >= 1: - print("[tester] \n{}".format(self._format_eval_results(eval_results))) + logger.info("[tester] \n{}".format(self._format_eval_results(eval_results))) self._mode(network, is_test=False) return eval_results diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 671e2736..a39362e2 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -314,7 +314,7 @@ Example2.3 这里,我们通过继承 :class:`~fastNLP.Callback` 类定义了自己的 callback 的,并和内置的 :class:`~fastNLP.EarlyStopCallback` 一起传给了 :class:`~fastNLP.Trainer` ,增强了 :class:`~fastNLP.Trainer` 的功能 -fastNLP已经自带了很多callback函数供使用,可以参考 :doc:`fastNLP.core.callback` 。 +fastNLP已经自带了很多callback函数供使用,可以参考 :mod:`fastNLP.core.callback` 。 """ __all__ = [ @@ -336,7 +336,7 @@ except: import warnings from .batch import DataSetIter, BatchIter -from .callback import CallbackManager, CallbackException +from .callback import CallbackManager, CallbackException, Callback from .dataset import DataSet from .losses import _prepare_losser from .metrics import _prepare_metrics @@ -352,12 +352,11 @@ from .utils import _move_dict_value_to_device from .utils import _get_func_signature from .utils import _get_model_device from .utils import _move_model_to_device - +from ._parallel_utils import _model_contains_inner_module +from ._logger import logger class Trainer(object): """ - 别名::class:`fastNLP.Trainer` :class:`fastNLP.core.trainer.Trainer` - Trainer在fastNLP中用于组织单任务的训练过程,可以避免用户在不同训练任务中重复撰写 (1) epoch循环; (2) 将数据分成不同的Batch; @@ -365,88 +364,85 @@ class Trainer(object): (4) 每个epoch结束或一定step后进行验证集验证; (5) 保存获得更好验证性能的模型等。 - 详细的介绍参见 :doc:`fastNLP.core.trainer` - - :param train_data: 训练集, :class:`~fastNLP.DataSet` 类型。 - :param nn.modules model: 待训练的模型 - :param optimizer: `torch.optim.Optimizer` 优化器。如果为None,则Trainer使用默认的Adam(model.parameters(), lr=4e-3)这个优化器 - :param int batch_size: 训练和验证的时候的batch大小。 - :param loss: 使用的 :class:`~fastNLP.core.losses.LossBase` 对象。当为None时,默认使用 :class:`~fastNLP.LossInForward` - :param sampler: Batch数据生成的顺序, :class:`~fastNLP.Sampler` 类型。如果为None,默认使用 :class:`~fastNLP.RandomSampler` - :param drop_last: 如果最后一个batch没有正好为batch_size这么多数据,就扔掉最后一个batch - :param num_workers: int, 有多少个线程来进行数据pad处理。 - :param update_every: int, 多少步更新一次梯度。用于希望累计梯度的场景,比如需要128的batch_size, 但是直接设为128 - 会导致内存不足,通过设置batch_size=32, update_every=4达到目的。当optimizer为None时,该参数无效。 - :param int n_epochs: 需要优化迭代多少次。 - :param int print_every: 多少次反向传播更新tqdm显示的loss; 如果use_tqdm=False, 则多少次反向传播打印loss。 - :param dev_data: 用于做验证的DataSet, :class:`~fastNLP.DataSet` 类型。 - :param metrics: 验证的评估函数。可以只使用一个 
:class:`Metric` , - 也可以使用多个 :class:`Metric` ,通过列表传入。 - 如验证时取得了更好的验证结果(如果有多个Metric,以列表中第一个Metric为准),且save_path不为None, - 则保存当前模型。Metric种类详见 :doc:`metrics模块 ` 。仅在传入dev_data时有效。 - :param str,None metric_key: :class:`Metric` 有时会有多个指标, - 比如 :class:`~fastNLP.core.metrics.SpanFPreRecMetric` 中包含了'f', 'pre', 'rec'。此时需 - 要指定以哪个指标为准。另外有些指标是越小效果越好,比如语言模型的困惑度,这种情况下,在key前面增加一个'-'来表 - 明验证时,值越小越好(比如: "-ppl")。仅在传入dev_data时有效。 - :param int validate_every: 多少个step在验证集上验证一次; 如果为-1,则每个epoch结束验证一次。仅在传入dev_data时有效。 - :param str,None save_path: 将模型保存路径。如果为None,则不保存模型。如果dev_data为None,则保存最后一次迭代的模型。 - 保存的时候不仅保存了参数,还保存了模型结构。即便使用DataParallel,这里也只保存模型。 - :param bool use_tqdm: 是否使用tqdm来显示训练进度; 如果为False,则将loss打印在终端中。 - :param str,int,torch.device,list(int) device: 将模型load到哪个设备。默认为None,即Trainer不对模型 - 的计算位置进行管理。支持以下的输入: - - 1. str: ['cpu', 'cuda', 'cuda:0', 'cuda:1', ...] 依次为'cpu'中, 可见的第一个GPU中, 可见的第一个GPU中, - 可见的第二个GPU中; - - 2. torch.device:将模型装载到torch.device上。 - - 3. int: 将使用device_id为该值的gpu进行训练 - - 4. list(int):如果多于1个device,将使用torch.nn.DataParallel包裹model, 并使用传入的device。 - - 5. None. 为None则不对模型进行任何处理,如果传入的model为torch.nn.DataParallel该值必须为None。 - - 已知可能会出现的问题:Adagrad优化器可能无法正常使用这个参数,请手动管理模型位置。 - - :param list(callbacks) callbacks: 用于在train过程中起调节作用的回调函数。比如early stop,negative sampling等可以 - 通过callback机制实现。 可使用的callback参见 :doc:`callback模块 ` - :param int check_code_level: 模型检查等级. -1: 不进行检查; 0: 仅出现错误时停止; 1: 如果有field没有被使用, - 报告警告信息; 2: 有任何field没有被使用都报错. 检查的原理是通过使用很小的batch(默认2个sample)来运行代码,但是 - 这个过程理论上不会修改任何参数,只是会检查能否运行。但如果(1)模型中存在将batch_size写为某个固定值的情况; - (2)模型中存在累加前向计算次数的,可能会多计算1次。以上情况建议将check_code_level设置为-1。 + 详细的介绍参见 :mod:`fastNLP.core.trainer` """ def __init__(self, train_data, model, optimizer=None, loss=None, batch_size=32, sampler=None, drop_last=False, update_every=1, num_workers=0, n_epochs=10, print_every=5, dev_data=None, metrics=None, metric_key=None, - validate_every=-1, save_path=None, use_tqdm=True, device=None, prefetch=False, - callbacks=None, check_code_level=0): - if prefetch and num_workers==0: - num_workers = 1 - if prefetch: - warnings.warn("prefetch is deprecated, will be removed in version 0.5.0, please use num_workers instead.") - + validate_every=-1, save_path=None, use_tqdm=True, device=None, + callbacks=None, check_code_level=0, **kwargs): + """ + + :param train_data: 训练集, :class:`~fastNLP.DataSet` 类型。 + :param nn.modules model: 待训练的模型 + :param optimizer: `torch.optim.Optimizer` 优化器。如果为None,则Trainer使用默认的Adam(model.parameters(), lr=4e-3)这个优化器 + :param int batch_size: 训练和验证的时候的batch大小。 + :param loss: 使用的 :class:`~fastNLP.core.losses.LossBase` 对象。当为None时,默认使用 :class:`~fastNLP.LossInForward` + :param sampler: Batch数据生成的顺序, :class:`~fastNLP.Sampler` 类型。如果为None,默认使用 :class:`~fastNLP.RandomSampler` + :param drop_last: 如果最后一个batch没有正好为batch_size这么多数据,就扔掉最后一个batch + :param num_workers: int, 有多少个线程来进行数据pad处理。 + :param update_every: int, 多少步更新一次梯度。用于希望累计梯度的场景,比如需要128的batch_size, 但是直接设为128 + 会导致内存不足,通过设置batch_size=32, update_every=4达到目的。当optimizer为None时,该参数无效。 + :param int n_epochs: 需要优化迭代多少次。 + :param int print_every: 多少次反向传播更新tqdm显示的loss; 如果use_tqdm=False, 则多少次反向传播打印loss。 + :param dev_data: 用于做验证的DataSet, :class:`~fastNLP.DataSet` 类型。 + :param metrics: 验证的评估函数。可以只使用一个 :class:`Metric` , + 也可以使用多个 :class:`Metric` ,通过列表传入。 + 如验证时取得了更好的验证结果(如果有多个Metric,以列表中第一个Metric为准),且save_path不为None, + 则保存当前模型。Metric种类详见 :mod:`metrics模块 ` 。仅在传入dev_data时有效。 + :param str,None metric_key: :class:`Metric` 有时会有多个指标, + 比如 :class:`~fastNLP.core.metrics.SpanFPreRecMetric` 中包含了'f', 'pre', 'rec'。此时需 + 
要指定以哪个指标为准。另外有些指标是越小效果越好,比如语言模型的困惑度,这种情况下,在key前面增加一个'-'来表 + 明验证时,值越小越好(比如: "-ppl")。仅在传入dev_data时有效。 + :param int validate_every: 多少个step在验证集上验证一次; 如果为-1,则每个epoch结束验证一次。仅在传入dev_data时有效。 + :param str,None save_path: 将模型保存路径,如果路径不存在,将自动创建文件夹。如果为None,则不保存模型。如果dev_data为None,则保存 + 最后一次迭代的模型。保存的时候不仅保存了参数,还保存了模型结构。即便使用DataParallel,这里也只保存模型。 + :param bool use_tqdm: 是否使用tqdm来显示训练进度; 如果为False,则将loss打印在终端中。 + :param str,int,torch.device,list(int) device: 将模型load到哪个设备。默认为None,即Trainer不对模型 + 的计算位置进行管理。支持以下的输入: + + 1. str: ['cpu', 'cuda', 'cuda:0', 'cuda:1', ...] 依次为'cpu'中, 可见的第一个GPU中, 可见的第一个GPU中, + 可见的第二个GPU中; + + 2. torch.device:将模型装载到torch.device上。 + + 3. int: 将使用device_id为该值的gpu进行训练 + + 4. list(int):如果多于1个device,将使用torch.nn.DataParallel包裹model, 并使用传入的device。 + + 5. None. 为None则不对模型进行任何处理,如果传入的model为torch.nn.DataParallel该值必须为None。 + + 已知可能会出现的问题:Adagrad优化器可能无法正常使用这个参数,请手动管理模型位置。 + + :param list(callbacks) callbacks: 用于在train过程中起调节作用的回调函数。比如early stop,negative sampling等可以 + 通过callback机制实现。 可使用的callback参见 :mod:`callback模块 ` + :param int check_code_level: 模型检查等级. -1: 不进行检查; 0: 仅出现错误时停止; 1: 如果有field没有被使用, + 报告警告信息; 2: 有任何field没有被使用都报错. 检查的原理是通过使用很小的batch(默认2个sample)来运行代码,但是 + 这个过程理论上不会修改任何参数,只是会检查能否运行。但如果(1)模型中存在将batch_size写为某个固定值的情况; + (2)模型中存在累加前向计算次数的,可能会多计算1次。以上情况建议将check_code_level设置为-1。 + """ super(Trainer, self).__init__() if not isinstance(model, nn.Module): raise TypeError(f"The type of model must be torch.nn.Module, got {type(model)}.") - + # check metrics and dev_data if (not metrics) and dev_data is not None: raise ValueError("No metric for dev_data evaluation.") if metrics and (dev_data is None): raise ValueError("No dev_data for evaluations, pass dev_data or set metrics to None. ") - + # check update every assert update_every >= 1, "update_every must be no less than 1." self.update_every = int(update_every) - + # check save_path if not (save_path is None or isinstance(save_path, str)): raise ValueError("save_path can only be None or `str`.") # prepare evaluate metrics = _prepare_metrics(metrics) - + # parse metric_key # increase_better is True. It means the exp result gets better if the indicator increases. # It is true by default. 
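
A minimal usage sketch of the constructor documented in this hunk may help readers of the patch; it is not part of the diff itself. `train_data`, `dev_data` and `model` are placeholders, `dev_batch_size` and `test_use_tqdm` are the new `**kwargs` options handled later in this file, and the remaining arguments follow the documented signature::

    from fastNLP import Trainer, AccuracyMetric, CrossEntropyLoss

    # train_data / dev_data are fastNLP DataSet objects and model a torch.nn.Module
    # (all three are placeholders here, not defined in this patch).
    trainer = Trainer(train_data, model,
                      loss=CrossEntropyLoss(),
                      batch_size=32,
                      update_every=4,          # accumulate gradients: effective batch size of 128
                      dev_data=dev_data,
                      metrics=AccuracyMetric(),
                      metric_key='acc',        # prefix with '-' (e.g. '-ppl') when smaller is better
                      use_tqdm=True,
                      dev_batch_size=64,       # new kwarg: batch size used for validation
                      test_use_tqdm=False)     # new kwarg: silence tqdm during validation
    trainer.train()
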
@@ -458,30 +454,70 @@ class Trainer(object): self.metric_key = None # prepare loss losser = _prepare_losser(loss) - - # sampler check - if sampler is not None and not isinstance(sampler, Sampler): - raise ValueError("The type of sampler should be fastNLP.BaseSampler, got {}.".format(type(sampler))) - if sampler is None: - sampler = RandomSampler() - elif hasattr(sampler, 'set_batch_size'): - sampler.set_batch_size(batch_size) + if isinstance(train_data, BatchIter): + if sampler is not None: + warnings.warn("sampler is ignored when train_data is a BatchIter.") + if num_workers>0: + warnings.warn("num_workers is ignored when train_data is BatchIter.") + if drop_last: + warnings.warn("drop_last is ignored when train_data is BatchIter.") + + if isinstance(model, nn.parallel.DistributedDataParallel): # 如果是分布式的 + # device为None + if device is not None: + warnings.warn("device is ignored when model is nn.parallel.DistributedDataParallel.") + device = None + # Sampler要是分布式的 + if sampler is None: + sampler = torch.utils.data.DistributedSampler(train_data) + elif not isinstance(sampler, torch.utils.data.DistributedSampler): + raise TypeError("When using nn.parallel.DistributedDataParallel, " + "sampler must be None or torch.utils.data.DistributedSampler.") + # 不能保存模型 + if save_path: + raise RuntimeError("Saving model in Distributed situation is not allowed right now.") + else: + # sampler check + if sampler is not None and not isinstance(sampler, (Sampler, torch.utils.data.Sampler)): + raise ValueError(f"The type of sampler should be fastNLP.BaseSampler or pytorch's Sampler, got {type(sampler)}") + if sampler is None: + sampler = RandomSampler() + elif hasattr(sampler, 'set_batch_size'): + sampler.set_batch_size(batch_size) if isinstance(train_data, DataSet): self.data_iterator = DataSetIter( dataset=train_data, batch_size=batch_size, num_workers=num_workers, sampler=sampler, drop_last=drop_last) elif isinstance(train_data, BatchIter): self.data_iterator = train_data + train_data = train_data.dataset else: raise TypeError("train_data type {} not support".format(type(train_data))) - if check_code_level > -1 and isinstance(self.data_iterator, DataSetIter): - _check_code(dataset=train_data, model=model, losser=losser, metrics=metrics, dev_data=dev_data, - metric_key=self.metric_key, check_level=check_code_level, - batch_size=min(batch_size, DEFAULT_CHECK_BATCH_SIZE)) - # _check_code 是 fastNLP 帮助你检查代码是否正确的方法 。如果你在错误栈中看到这行注释,请认真检查你的代码 + model.train() self.model = _move_model_to_device(model, device=device) + if _model_contains_inner_module(self.model): + self._forward_func = self.model.module.forward + else: + self._forward_func = self.model.forward + if check_code_level > -1: + # _check_code 是 fastNLP 帮助你检查代码是否正确的方法 。如果你在错误栈中看到这行注释,请认真检查你的field名与模型的输入 + # 名是否匹配 + dev_dataset = dev_data + if isinstance(dev_data, BatchIter): + dev_dataset = None + warnings.warn("dev_data is of BatchIter type, ignore validation checking.") + check_batch_size = min(batch_size, DEFAULT_CHECK_BATCH_SIZE) + if isinstance(self.model, nn.DataParallel): + _num_devices = len(self.model.device_ids) + if batch_size//_num_devices>1: # 如果多卡是每个卡可以分多个数据的,则用每个卡给两个sample + check_batch_size = max(len(self.model.device_ids)*2, check_batch_size) + else: + check_batch_size = max(len(self.model.device_ids), check_batch_size) + _check_code(dataset=train_data, model=self.model, losser=losser, forward_func=self._forward_func, metrics=metrics, + dev_data=dev_dataset, metric_key=self.metric_key, check_level=check_code_level, + 
batch_size=check_batch_size) self.train_data = train_data self.dev_data = dev_data # If None, No validation. @@ -496,8 +532,7 @@ class Trainer(object): self.best_dev_epoch = None self.best_dev_step = None self.best_dev_perf = None - self.n_steps = (len(self.train_data) // self.batch_size + int( - len(self.train_data) % self.batch_size != 0)) * int(drop_last==0) * self.n_epochs + self.n_steps = len(self.data_iterator) * self.n_epochs if isinstance(optimizer, torch.optim.Optimizer): self.optimizer = optimizer @@ -507,22 +542,32 @@ class Trainer(object): self.optimizer = torch.optim.Adam(self.model.parameters(), lr=4e-3) else: raise TypeError("optimizer can only be torch.optim.Optimizer type, not {}.".format(type(optimizer))) - + + self.logger = logger + self.use_tqdm = use_tqdm + if 'test_use_tqdm' in kwargs: + self.test_use_tqdm = kwargs.get('test_use_tqdm') + else: + self.test_use_tqdm = self.use_tqdm self.pbar = None self.print_every = abs(self.print_every) - + self.kwargs = kwargs if self.dev_data is not None: self.tester = Tester(model=self.model, data=self.dev_data, metrics=self.metrics, - batch_size=self.batch_size, + batch_size=kwargs.get("dev_batch_size", self.batch_size), device=None, # 由上面的部分处理device - verbose=0) - + verbose=0, + use_tqdm=self.test_use_tqdm) + self.step = 0 self.start_time = None # start timestamp - + + if isinstance(callbacks, Callback): + callbacks = [callbacks] + self.callback_manager = CallbackManager(env={"trainer": self}, callbacks=callbacks) @@ -548,7 +593,7 @@ class Trainer(object): """ results = {} if self.n_epochs <= 0: - print(f"training epoch is {self.n_epochs}, nothing was done.") + self.logger.info(f"training epoch is {self.n_epochs}, nothing was done.") results['seconds'] = 0. return results try: @@ -557,8 +602,8 @@ class Trainer(object): self._load_best_model = load_best_model self.start_time = str(datetime.now().strftime('%Y-%m-%d-%H-%M-%S')) start_time = time.time() - print("training epochs started " + self.start_time, flush=True) - + self.logger.info("training epochs started " + self.start_time) + try: self.callback_manager.on_train_begin() self._train() @@ -571,11 +616,11 @@ class Trainer(object): raise e elif on_exception == 'raise': raise e - + if self.dev_data is not None and self.best_dev_perf is not None: - print( - "\nIn Epoch:{}/Step:{}, got best dev performance:".format(self.best_dev_epoch, self.best_dev_step) + - self.tester._format_eval_results(self.best_dev_perf), ) + self.logger.info( + "\nIn Epoch:{}/Step:{}, got best dev performance:".format(self.best_dev_epoch, self.best_dev_step)) + self.logger.info(self.tester._format_eval_results(self.best_dev_perf)) results['best_eval'] = self.best_dev_perf results['best_epoch'] = self.best_dev_epoch results['best_step'] = self.best_dev_step @@ -583,27 +628,23 @@ class Trainer(object): model_name = "best_" + "_".join([self.model.__class__.__name__, self.metric_key, self.start_time]) load_succeed = self._load_model(self.model, model_name) if load_succeed: - print("Reloaded the best model.") + self.logger.info("Reloaded the best model.") else: - print("Fail to reload best model.") + self.logger.info("Fail to reload best model.") finally: pass results['seconds'] = round(time.time() - start_time, 2) - + return results - + def _train(self): if not self.use_tqdm: - from fastNLP.core.utils import _pseudo_tqdm as inner_tqdm + from .utils import _pseudo_tqdm as inner_tqdm else: inner_tqdm = tqdm self.step = 0 self.epoch = 0 start = time.time() - if isinstance(self.model, nn.DataParallel): - 
self._forward_func = self.model.module.forward - else: - self._forward_func = self.model.forward with inner_tqdm(total=self.n_steps, postfix='loss:{0:<6.5f}', leave=False, dynamic_ncols=True) as pbar: self.pbar = pbar avg_loss = 0 @@ -621,21 +662,21 @@ class Trainer(object): # negative sampling; replace unknown; re-weight batch_y self.callback_manager.on_batch_begin(batch_x, batch_y, indices) prediction = self._data_forward(self.model, batch_x) - + # edit prediction self.callback_manager.on_loss_begin(batch_y, prediction) loss = self._compute_loss(prediction, batch_y).mean() avg_loss += loss.item() loss = loss / self.update_every - + # Is loss NaN or inf? requires_grad = False self.callback_manager.on_backward_begin(loss) self._grad_backward(loss) self.callback_manager.on_backward_end() - + self._update() self.callback_manager.on_step_end() - + if self.step % self.print_every == 0: avg_loss = float(avg_loss) / self.print_every if self.use_tqdm: @@ -649,36 +690,36 @@ class Trainer(object): pbar.set_postfix_str(print_output) avg_loss = 0 self.callback_manager.on_batch_end() - + if ((self.validate_every > 0 and self.step % self.validate_every == 0) or (self.validate_every < 0 and self.step % len(data_iterator) == 0)) \ and self.dev_data is not None: eval_res = self._do_validation(epoch=epoch, step=self.step) - eval_str = "Evaluation at Epoch {}/{}. Step:{}/{}. ".format(epoch, self.n_epochs, self.step, - self.n_steps) + \ - self.tester._format_eval_results(eval_res) - pbar.write(eval_str + '\n') - + eval_str = "Evaluation on dev at Epoch {}/{}. Step:{}/{}: ".format(epoch, self.n_epochs, self.step, + self.n_steps) + # pbar.write(eval_str + '\n') + self.logger.info(eval_str) + self.logger.info(self.tester._format_eval_results(eval_res)+'\n') # ================= mini-batch end ==================== # - + # lr decay; early stopping self.callback_manager.on_epoch_end() # =============== epochs end =================== # pbar.close() self.pbar = None # ============ tqdm end ============== # - + def _do_validation(self, epoch, step): self.callback_manager.on_valid_begin() res = self.tester.test() - + is_better_eval = False if self._better_eval_result(res): if self.save_path is not None: self._save_model(self.model, "best_" + "_".join([self.model.__class__.__name__, self.metric_key, self.start_time])) elif self._load_best_model: - self._best_model_states = {name: param.cpu().clone() for name, param in self.model.named_parameters()} + self._best_model_states = {name: param.cpu().clone() for name, param in self.model.state_dict().items()} self.best_dev_perf = res self.best_dev_epoch = epoch self.best_dev_step = step @@ -686,7 +727,7 @@ class Trainer(object): # get validation results; adjust optimizer self.callback_manager.on_valid_end(res, self.metric_key, self.optimizer, is_better_eval) return res - + def _mode(self, model, is_test=False): """Train mode or Test mode. This is for PyTorch currently. @@ -698,14 +739,14 @@ class Trainer(object): model.eval() else: model.train() - + def _update(self): """Perform weight update on a model. """ if self.step % self.update_every == 0: self.optimizer.step() - + def _data_forward(self, network, x): x = _build_args(self._forward_func, **x) y = network(**x) @@ -713,7 +754,7 @@ class Trainer(object): raise TypeError( f"The return value of {_get_func_signature(self._forward_func)} should be dict, got {type(y)}.") return y - + def _grad_backward(self, loss): """Compute gradient with link rules. 
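
The `update_every` bookkeeping in the surrounding hunks (the loss is divided by `update_every` in `_train`, `_grad_backward` clears gradients only at the start of each accumulation window, and `_update` calls `optimizer.step()` only at its end) is plain gradient accumulation. A self-contained sketch of the equivalent loop, using a toy model and random tensors in place of the real model and batches::

    import torch
    from torch import nn

    model = nn.Linear(10, 2)                        # toy stand-ins for the real model and data
    optimizer = torch.optim.Adam(model.parameters(), lr=4e-3)
    batches = [(torch.randn(32, 10), torch.randint(0, 2, (32,))) for _ in range(8)]

    update_every = 4
    for step, (x, y) in enumerate(batches, start=1):
        if (step - 1) % update_every == 0:          # start of an accumulation window: clear gradients
            optimizer.zero_grad()
        loss = nn.functional.cross_entropy(model(x), y) / update_every
        loss.backward()                             # gradients from the small batches add up
        if step % update_every == 0:                # end of the window: one optimizer step
            optimizer.step()
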
@@ -724,7 +765,7 @@ class Trainer(object): if (self.step-1) % self.update_every == 0: self.model.zero_grad() loss.backward() - + def _compute_loss(self, predict, truth): """Compute loss given prediction and ground truth. @@ -733,7 +774,7 @@ class Trainer(object): :return: a scalar """ return self.losser(predict, truth) - + def _save_model(self, model, model_name, only_param=False): """ 存储不含有显卡信息的state_dict或model :param model: @@ -745,7 +786,7 @@ class Trainer(object): model_path = os.path.join(self.save_path, model_name) if not os.path.exists(self.save_path): os.makedirs(self.save_path, exist_ok=True) - if isinstance(model, nn.DataParallel): + if _model_contains_inner_module(model): model = model.module if only_param: state_dict = model.state_dict() @@ -756,7 +797,7 @@ class Trainer(object): model.cpu() torch.save(model, model_path) model.to(self._model_device) - + def _load_model(self, model, model_name, only_param=False): # 返回bool值指示是否成功reload模型 if self.save_path is not None: @@ -765,7 +806,7 @@ class Trainer(object): states = torch.load(model_path) else: states = torch.load(model_path).state_dict() - if isinstance(model, nn.DataParallel): + if _model_contains_inner_module(model): model.module.load_state_dict(states) else: model.load_state_dict(states) @@ -774,7 +815,7 @@ class Trainer(object): else: return False return True - + def _better_eval_result(self, metrics): """Check if the current epoch yields better validation results. @@ -800,6 +841,10 @@ class Trainer(object): is_better = False return is_better + @property + def is_master(self): + """是否是主进程""" + return True DEFAULT_CHECK_BATCH_SIZE = 2 DEFAULT_CHECK_NUM_BATCH = 2 @@ -821,14 +866,15 @@ def _get_value_info(_dict): strs.append(_str) return strs + from numbers import Number from .batch import _to_tensor -def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_SIZE, - dev_data=None, metric_key=None, - check_level=0): + + +def _check_code(dataset, model, losser, metrics, forward_func, batch_size=DEFAULT_CHECK_BATCH_SIZE, + dev_data=None, metric_key=None, check_level=0): # check get_loss 方法 - model_devcie = _get_model_device(model=model) - + model_device = _get_model_device(model=model) def _iter(): start_idx = 0 while start_idx -1, "device can only be non-negative integer" assert torch.cuda.device_count() > device, "Only has {} gpus, cannot use device {}.".format( @@ -312,7 +273,7 @@ def _get_model_device(model): """ # TODO 这个函数存在一定的风险,因为同一个模型可能存在某些parameter不在显卡中,比如BertEmbedding. 或者跨显卡 assert isinstance(model, nn.Module) - + parameters = list(model.parameters()) if len(parameters) == 0: return None @@ -352,7 +313,6 @@ def _map_args(maps: dict, **kwargs): output.update({name: val}) for keys in maps.keys(): if keys not in output.keys(): - # TODO: add UNUSED warning. pass return output @@ -473,10 +433,10 @@ def _move_dict_value_to_device(*args, device: torch.device, non_blocking=False): """ if not torch.cuda.is_available(): return - + if not isinstance(device, torch.device): raise TypeError(f"device must be `torch.device`, got `{type(device)}`") - + for arg in args: if isinstance(arg, dict): for key, value in arg.items(): @@ -491,10 +451,10 @@ class _CheckError(Exception): _CheckError. Used in losses.LossBase, metrics.MetricBase. 
""" - + def __init__(self, check_res: _CheckRes, func_signature: str): errs = [f'Problems occurred when calling `{func_signature}`'] - + if check_res.varargs: errs.append(f"\tvarargs: {check_res.varargs}(Does not support pass positional arguments, please delete it)") if check_res.missing: @@ -503,9 +463,9 @@ class _CheckError(Exception): errs.append(f"\tduplicated param: {check_res.duplicated}") if check_res.unused: errs.append(f"\tunused param: {check_res.unused}") - + Exception.__init__(self, '\n'.join(errs)) - + self.check_res = check_res self.func_signature = func_signature @@ -525,7 +485,7 @@ def _check_loss_evaluate(prev_func_signature: str, func_signature: str, check_re # if check_res.varargs: # errs.append(f"\tvarargs: *{check_res.varargs}") # suggestions.append(f"Does not support pass positional arguments, please delete *{check_res.varargs}.") - + if check_res.unused: for _unused in check_res.unused: if _unused in target_dict: @@ -536,7 +496,7 @@ def _check_loss_evaluate(prev_func_signature: str, func_signature: str, check_re unuseds.append(f"\tunused field: {_unused_field}") if _unused_param: unuseds.append(f"\tunused param: {_unused_param}") # output from predict or forward - + module_name = func_signature.split('.')[0] if check_res.missing: errs.append(f"\tmissing param: {check_res.missing}") @@ -557,7 +517,7 @@ def _check_loss_evaluate(prev_func_signature: str, func_signature: str, check_re mapped_missing.append(_miss) else: unmapped_missing.append(_miss) - + for _miss in mapped_missing + unmapped_missing: if _miss in dataset: suggestions.append(f"Set `{_miss}` as target.") @@ -570,29 +530,17 @@ def _check_loss_evaluate(prev_func_signature: str, func_signature: str, check_re else: _tmp = f'Provide `{_miss}` in DataSet or output of {prev_func_signature}.' suggestions.append(_tmp) - # for _miss in unmapped_missing: - # if _miss in dataset: - # suggestions.append(f"Set `{_miss}` as target.") - # else: - # _tmp = '' - # if check_res.unused: - # _tmp = f"Specify your assignment for `{input_func_map.get(_miss, _miss)}` when initialize {module_name}." - # if _tmp: - # _tmp += f' Or provide `{_miss}` in DataSet or output of {prev_func_signature}.' - # else: - # _tmp = f'Provide `{_miss}` in output of {prev_func_signature} or DataSet.' - # suggestions.append(_tmp) - + if check_res.duplicated: errs.append(f"\tduplicated param: {check_res.duplicated}.") suggestions.append(f"Delete {check_res.duplicated} in the output of " f"{prev_func_signature} or do not set {check_res.duplicated} as targets. ") - + if len(errs) > 0: errs.extend(unuseds) elif check_level == STRICT_CHECK_LEVEL: errs.extend(unuseds) - + if len(errs) > 0: errs.insert(0, f'Problems occurred when calling {func_signature}') sugg_str = "" @@ -619,11 +567,11 @@ def _check_loss_evaluate(prev_func_signature: str, func_signature: str, check_re def _check_forward_error(forward_func, batch_x, dataset, check_level): check_res = _check_arg_dict_list(forward_func, batch_x) func_signature = _get_func_signature(forward_func) - + errs = [] suggestions = [] _unused = [] - + # if check_res.varargs: # errs.append(f"\tvarargs: {check_res.varargs}") # suggestions.append(f"Does not support pass positional arguments, please delete *{check_res.varargs}.") @@ -644,14 +592,14 @@ def _check_forward_error(forward_func, batch_x, dataset, check_level): # _tmp += f"Or you might find it in `unused field:`, you can use DataSet.rename_field() to " \ # f"rename the field in `unused field:`." 
suggestions.append(_tmp) - + if check_res.unused: _unused = [f"\tunused field: {check_res.unused}"] if len(errs) > 0: errs.extend(_unused) elif check_level == STRICT_CHECK_LEVEL: errs.extend(_unused) - + if len(errs) > 0: errs.insert(0, f'Problems occurred when calling {func_signature}') sugg_str = "" @@ -699,7 +647,7 @@ def seq_len_to_mask(seq_len, max_len=None): max_len = int(max_len) if max_len else int(seq_len.max()) broad_cast_seq_len = np.tile(np.arange(max_len), (len(seq_len), 1)) mask = broad_cast_seq_len < seq_len.reshape(-1, 1) - + elif isinstance(seq_len, torch.Tensor): assert seq_len.dim() == 1, f"seq_len can only have one dimension, got {seq_len.dim() == 1}." batch_size = seq_len.size(0) @@ -708,7 +656,7 @@ def seq_len_to_mask(seq_len, max_len=None): mask = broad_cast_seq_len.lt(seq_len.unsqueeze(1)) else: raise TypeError("Only support 1-d numpy.ndarray or 1-d torch.Tensor.") - + return mask @@ -716,25 +664,25 @@ class _pseudo_tqdm: """ 当无法引入tqdm,或者Trainer中设置use_tqdm为false的时候,用该方法打印数据 """ - + def __init__(self, **kwargs): - pass - + self.logger = logger + def write(self, info): - print(info) - + self.logger.info(info) + def set_postfix_str(self, info): - print(info) - + self.logger.info(info) + def __getattr__(self, item): def pass_func(*args, **kwargs): pass - + return pass_func - + def __enter__(self): return self - + def __exit__(self, exc_type, exc_val, exc_tb): del self @@ -788,3 +736,95 @@ def iob2bioes(tags: List[str]) -> List[str]: else: raise TypeError("Invalid IOB format.") return new_tags + + +def _is_iterable(value): + # 检查是否是iterable的, duck typing + try: + iter(value) + return True + except BaseException as e: + return False + + +def get_seq_len(words, pad_value=0): + """ + 给定batch_size x max_len的words矩阵,返回句子长度 + + :param words: batch_size x max_len + :return: (batch_size,) + """ + mask = words.ne(pad_value) + return mask.sum(dim=-1) + + +def pretty_table_printer(dataset_or_ins) -> PrettyTable: + """ + :param dataset_or_ins: 传入一个dataSet或者instance + ins = Instance(field_1=[1, 1, 1], field_2=[2, 2, 2], field_3=["a", "b", "c"]) + +-----------+-----------+-----------------+ + | field_1 | field_2 | field_3 | + +-----------+-----------+-----------------+ + | [1, 1, 1] | [2, 2, 2] | ['a', 'b', 'c'] | + +-----------+-----------+-----------------+ + :return: 以 pretty table的形式返回根据terminal大小进行自动截断 + """ + x = PrettyTable() + try: + sz = os.get_terminal_size() + column = sz.columns + row = sz.lines + except OSError: + column = 144 + row = 11 + + if type(dataset_or_ins).__name__ == "DataSet": + x.field_names = list(dataset_or_ins.field_arrays.keys()) + c_size = len(x.field_names) + for ins in dataset_or_ins: + x.add_row([sub_column(ins[k], column, c_size, k) for k in x.field_names]) + row -= 1 + if row < 0: + x.add_row(["..." 
for _ in range(c_size)]) + break + elif type(dataset_or_ins).__name__ == "Instance": + x.field_names = list(dataset_or_ins.fields.keys()) + c_size = len(x.field_names) + x.add_row([sub_column(dataset_or_ins[k], column, c_size, k) for k in x.field_names]) + + else: + raise Exception("only accept DataSet and Instance") + x.align = "l" + + return x + + +def sub_column(string: str, c: int, c_size: int, title: str) -> str: + """ + :param string: 要被截断的字符串 + :param c: 命令行列数 + :param c_size: instance或dataset field数 + :param title: 列名 + :return: 对一个过长的列进行截断的结果 + """ + avg = max(int(c / c_size / 2), len(title)) + string = str(string) + res = "" + counter = 0 + for char in string: + if ord(char) > 255: + counter += 2 + else: + counter += 1 + res += char + if counter > avg: + res = res + "..." + break + return res + + +def _check_fp16(): + if amp is None: + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") + if not torch.backends.cudnn.enabled: + raise RuntimeError("Amp requires cudnn backend to be enabled.") diff --git a/fastNLP/core/vocabulary.py b/fastNLP/core/vocabulary.py index 9ce59a8c..6d530eb6 100644 --- a/fastNLP/core/vocabulary.py +++ b/fastNLP/core/vocabulary.py @@ -1,14 +1,21 @@ +""" +.. todo:: + doc +""" + __all__ = [ "Vocabulary", "VocabularyOption", ] +from collections import Counter +from functools import partial from functools import wraps -from collections import Counter, defaultdict + +from ._logger import logger from .dataset import DataSet from .utils import Option -from functools import partial -import numpy as np +from .utils import _is_iterable class VocabularyOption(Option): @@ -32,7 +39,7 @@ def _check_build_vocab(func): @wraps(func) # to solve missing docstring def _wrapper(self, *args, **kwargs): - if self.word2idx is None or self.rebuild is True: + if self._word2idx is None or self.rebuild is True: self.build_vocab() return func(self, *args, **kwargs) @@ -49,8 +56,8 @@ def _check_build_status(func): if self.rebuild is False: self.rebuild = True if self.max_size is not None and len(self.word_count) >= self.max_size: - print("[Warning] Vocabulary has reached the max size {} when calling {} method. " - "Adding more words may cause unexpected behaviour of Vocabulary. ".format( + logger.info("[Warning] Vocabulary has reached the max size {} when calling {} method. " + "Adding more words may cause unexpected behaviour of Vocabulary. ".format( self.max_size, func.__name__)) return func(self, *args, **kwargs) @@ -59,8 +66,6 @@ def _check_build_status(func): class Vocabulary(object): """ - 别名::class:`fastNLP.Vocabulary` :class:`fastNLP.core.vocabulary.Vocabulary` - 用于构建, 存储和使用 `str` 到 `int` 的一一映射:: vocab = Vocabulary() @@ -68,32 +73,52 @@ class Vocabulary(object): vocab.update(word_list) vocab["word"] # str to int vocab.to_word(5) # int to str - - :param int max_size: `Vocabulary` 的最大大小, 即能存储词的最大数量 - 若为 ``None`` , 则不限制大小. Default: ``None`` - :param int min_freq: 能被记录下的词在文本中的最小出现频率, 应大于或等于 1. - 若小于该频率, 词语将被视为 `unknown`. 若为 ``None`` , 所有文本中的词都被记录. Default: ``None`` - :param str optional padding: padding的字符. 如果设置为 ``None`` , - 则vocabulary中不考虑padding, 也不计入词表大小,为 ``None`` 的情况多在为label建立Vocabulary的情况. - Default: '' - :param str optional unknown: unknown的字符,所有未被记录的词在转为 `int` 时将被视为unknown. - 如果设置为 ``None`` ,则vocabulary中不考虑unknow, 也不计入词表大小. - 为 ``None`` 的情况多在为label建立Vocabulary的情况. 
- Default: '' """ def __init__(self, max_size=None, min_freq=None, padding='', unknown=''): + """ + + :param int max_size: `Vocabulary` 的最大大小, 即能存储词的最大数量 + 若为 ``None`` , 则不限制大小. Default: ``None`` + :param int min_freq: 能被记录下的词在文本中的最小出现频率, 应大于或等于 1. + 若小于该频率, 词语将被视为 `unknown`. 若为 ``None`` , 所有文本中的词都被记录. Default: ``None`` + :param str optional padding: padding的字符. 如果设置为 ``None`` , + 则vocabulary中不考虑padding, 也不计入词表大小,为 ``None`` 的情况多在为label建立Vocabulary的情况. + Default: '' + :param str optional unknown: unknown的字符,所有未被记录的词在转为 `int` 时将被视为unknown. + 如果设置为 ``None`` ,则vocabulary中不考虑unknow, 也不计入词表大小. + 为 ``None`` 的情况多在为label建立Vocabulary的情况. + Default: '' + """ self.max_size = max_size self.min_freq = min_freq self.word_count = Counter() self.unknown = unknown self.padding = padding - self.word2idx = None - self.idx2word = None + self._word2idx = None + self._idx2word = None self.rebuild = True # 用于承载不需要单独创建entry的词语,具体见from_dataset()方法 self._no_create_word = Counter() - + + @property + @_check_build_vocab + def word2idx(self): + return self._word2idx + + @word2idx.setter + def word2idx(self, value): + self._word2idx = value + + @property + @_check_build_vocab + def idx2word(self): + return self._idx2word + + @idx2word.setter + def idx2word(self, value): + self._word2idx = value + @_check_build_status def update(self, word_lst, no_create_entry=False): """依次增加序列中词在词典中的出现频率 @@ -131,11 +156,11 @@ class Vocabulary(object): """ 在新加入word时,检查_no_create_word的设置。 - :param str, List[str] word: + :param str List[str] word: :param bool no_create_entry: :return: """ - if isinstance(word, str): + if isinstance(word, str) or not _is_iterable(word): word = [word] for w in word: if no_create_entry and self.word_count.get(w, 0) == self._no_create_word.get(w, 0): @@ -180,36 +205,36 @@ class Vocabulary(object): 但已经记录在词典中的词, 不会改变对应的 `int` """ - if self.word2idx is None: - self.word2idx = {} + if self._word2idx is None: + self._word2idx = {} if self.padding is not None: - self.word2idx[self.padding] = len(self.word2idx) + self._word2idx[self.padding] = len(self._word2idx) if self.unknown is not None: - self.word2idx[self.unknown] = len(self.word2idx) + self._word2idx[self.unknown] = len(self._word2idx) max_size = min(self.max_size, len(self.word_count)) if self.max_size else None words = self.word_count.most_common(max_size) if self.min_freq is not None: words = filter(lambda kv: kv[1] >= self.min_freq, words) - if self.word2idx is not None: - words = filter(lambda kv: kv[0] not in self.word2idx, words) - start_idx = len(self.word2idx) - self.word2idx.update({w: i + start_idx for i, (w, _) in enumerate(words)}) + if self._word2idx is not None: + words = filter(lambda kv: kv[0] not in self._word2idx, words) + start_idx = len(self._word2idx) + self._word2idx.update({w: i + start_idx for i, (w, _) in enumerate(words)}) self.build_reverse_vocab() self.rebuild = False return self - + def build_reverse_vocab(self): """ 基于 `word to index` dict, 构建 `index to word` dict. 
""" - self.idx2word = {i: w for w, i in self.word2idx.items()} + self._idx2word = {i: w for w, i in self._word2idx.items()} return self @_check_build_vocab def __len__(self): - return len(self.word2idx) + return len(self._word2idx) @_check_build_vocab def __contains__(self, item): @@ -219,7 +244,7 @@ class Vocabulary(object): :param item: the word :return: True or False """ - return item in self.word2idx + return item in self._word2idx def has_word(self, w): """ @@ -241,12 +266,12 @@ class Vocabulary(object): vocab[w] """ - if w in self.word2idx: - return self.word2idx[w] + if w in self._word2idx: + return self._word2idx[w] if self.unknown is not None: - return self.word2idx[self.unknown] + return self._word2idx[self.unknown] else: - raise ValueError("word {} not in vocabulary".format(w)) + raise ValueError("word `{}` not in vocabulary".format(w)) @_check_build_vocab def index_dataset(self, *datasets, field_name, new_field_name=None): @@ -257,37 +282,47 @@ class Vocabulary(object): vocab.index_dataset(train_data, dev_data, test_data, field_name='words') :param ~fastNLP.DataSet,List[~fastNLP.DataSet] datasets: 需要转index的一个或多个数据集 - :param str field_name: 需要转index的field, 若有多个 DataSet, 每个DataSet都必须有此 field. - 目前仅支持 ``str`` , ``List[str]`` , ``List[List[str]]`` - :param str new_field_name: 保存结果的field_name. 若为 ``None`` , 将覆盖原field. - Default: ``None`` + :param list,str field_name: 需要转index的field, 若有多个 DataSet, 每个DataSet都必须有此 field. + 目前支持 ``str`` , ``List[str]`` + :param list,str new_field_name: 保存结果的field_name. 若为 ``None`` , 将覆盖原field. + Default: ``None``. """ - def index_instance(ins): + def index_instance(field): """ 有几种情况, str, 1d-list, 2d-list :param ins: :return: """ - field = ins[field_name] - if isinstance(field, str): + if isinstance(field, str) or not _is_iterable(field): return self.to_index(field) - elif isinstance(field, list): - if not isinstance(field[0], list): + else: + if isinstance(field[0], str) or not _is_iterable(field[0]): return [self.to_index(w) for w in field] else: - if isinstance(field[0][0], list): + if not isinstance(field[0][0], str) and _is_iterable(field[0][0]): raise RuntimeError("Only support field with 2 dimensions.") return [[self.to_index(c) for c in w] for w in field] - if new_field_name is None: - new_field_name = field_name + new_field_name = new_field_name or field_name + + if type(new_field_name) == type(field_name): + if isinstance(new_field_name, list): + assert len(new_field_name) == len(field_name), "new_field_name should have same number elements with " \ + "field_name." + elif isinstance(new_field_name, str): + field_name = [field_name] + new_field_name = [new_field_name] + else: + raise TypeError("field_name and new_field_name can only be str or List[str].") + for idx, dataset in enumerate(datasets): if isinstance(dataset, DataSet): try: - dataset.apply(index_instance, new_field_name=new_field_name) + for f_n, n_f_n in zip(field_name, new_field_name): + dataset.apply_field(index_instance, field_name=f_n, new_field_name=n_f_n) except Exception as e: - print("When processing the `{}` dataset, the following error occurred.".format(idx)) + logger.info("When processing the `{}` dataset, the following error occurred.".format(idx)) raise e else: raise RuntimeError("Only DataSet type is allowed.") @@ -306,9 +341,8 @@ class Vocabulary(object): :param ~fastNLP.DataSet,List[~fastNLP.DataSet] datasets: 需要转index的一个或多个数据集 :param str,List[str] field_name: 可为 ``str`` 或 ``List[str]`` . - 构建词典所使用的 field(s), 支持一个或多个field - 若有多个 DataSet, 每个DataSet都必须有这些field. 
- 目前仅支持的field结构: ``str`` , ``List[str]`` , ``list[List[str]]`` + 构建词典所使用的 field(s), 支持一个或多个field,若有多个 DataSet, 每个DataSet都必须有这些field. 目前支持的field结构 + : ``str`` , ``List[str]`` :param no_create_entry_dataset: 可以传入DataSet, List[DataSet]或者None(默认),该选项用在接下来的模型会使用pretrain 的embedding(包括glove, word2vec, elmo与bert)且会finetune的情况。如果仅使用来自于train的数据建立vocabulary,会导致test与dev 中的数据无法充分利用到来自于预训练embedding的信息,所以在建立词表的时候将test与dev考虑进来会使得最终的结果更好。 @@ -326,14 +360,14 @@ class Vocabulary(object): def construct_vocab(ins, no_create_entry=False): for fn in field_name: field = ins[fn] - if isinstance(field, str): + if isinstance(field, str) or not _is_iterable(field): self.add_word(field, no_create_entry=no_create_entry) - elif isinstance(field, (list, np.ndarray)): - if not isinstance(field[0], (list, np.ndarray)): + else: + if isinstance(field[0], str) or not _is_iterable(field[0]): for word in field: self.add_word(word, no_create_entry=no_create_entry) else: - if isinstance(field[0][0], (list, np.ndarray)): + if not isinstance(field[0][0], str) and _is_iterable(field[0][0]): raise RuntimeError("Only support field with 2 dimensions.") for words in field: for word in words: @@ -343,8 +377,8 @@ class Vocabulary(object): if isinstance(dataset, DataSet): try: dataset.apply(construct_vocab) - except Exception as e: - print("When processing the `{}` dataset, the following error occurred.".format(idx)) + except BaseException as e: + logger.error("When processing the `{}` dataset, the following error occurred:".format(idx)) raise e else: raise TypeError("Only DataSet type is allowed.") @@ -370,7 +404,7 @@ class Vocabulary(object): def to_index(self, w): """ - 将词转为数字. 若词不再词典中被记录, 将视为 unknown, 若 ``unknown=None`` , 将抛出``ValueError``:: + 将词转为数字. 若词不再词典中被记录, 将视为 unknown, 若 ``unknown=None`` , 将抛出 ``ValueError`` :: index = vocab.to_index('abc') # equals to @@ -389,7 +423,7 @@ class Vocabulary(object): """ if self.unknown is None: return None - return self.word2idx[self.unknown] + return self._word2idx[self.unknown] @property @_check_build_vocab @@ -399,7 +433,7 @@ class Vocabulary(object): """ if self.padding is None: return None - return self.word2idx[self.padding] + return self._word2idx[self.padding] @_check_build_vocab def to_word(self, idx): @@ -409,7 +443,7 @@ class Vocabulary(object): :param int idx: the index :return str word: the word """ - return self.idx2word[idx] + return self._idx2word[idx] def clear(self): """ @@ -418,8 +452,8 @@ class Vocabulary(object): :return: """ self.word_count.clear() - self.word2idx = None - self.idx2word = None + self._word2idx = None + self._idx2word = None self.rebuild = True self._no_create_word.clear() return self @@ -430,8 +464,8 @@ class Vocabulary(object): """ len(self) # make sure vocab has been built state = self.__dict__.copy() - # no need to pickle idx2word as it can be constructed from word2idx - del state['idx2word'] + # no need to pickle _idx2word as it can be constructed from _word2idx + del state['_idx2word'] return state def __setstate__(self, state): @@ -446,5 +480,5 @@ class Vocabulary(object): @_check_build_vocab def __iter__(self): - for word, index in self.word2idx.items(): + for word, index in self._word2idx.items(): yield word, index diff --git a/fastNLP/doc_utils.py b/fastNLP/doc_utils.py new file mode 100644 index 00000000..d5412ff4 --- /dev/null +++ b/fastNLP/doc_utils.py @@ -0,0 +1,46 @@ +"""undocumented +用于辅助生成 fastNLP 文档的代码 +""" + +__all__ = [] + +import inspect +import sys + + +def doc_process(m): + for name, obj in inspect.getmembers(m): + if 
inspect.isclass(obj) or inspect.isfunction(obj): + if obj.__module__ != m.__name__: + if obj.__doc__ is None: + # print(name, obj.__doc__) + pass + else: + module_name = obj.__module__ + + # 识别并标注类和函数在不同层次中的位置 + + while 1: + defined_m = sys.modules[module_name] + if "undocumented" not in defined_m.__doc__ and name in defined_m.__all__: + obj.__doc__ = r"别名 :class:`" + m.__name__ + "." + name + "`" \ + + " :class:`" + module_name + "." + name + "`\n" + obj.__doc__ + break + module_name = ".".join(module_name.split('.')[:-1]) + if module_name == m.__name__: + # print(name, ": not found defined doc.") + break + + # 识别并标注基类,只有基类也在 fastNLP 中定义才显示 + + if inspect.isclass(obj): + for base in obj.__bases__: + if base.__module__.startswith("fastNLP"): + parts = base.__module__.split(".") + [] + module_name, i = "fastNLP", 1 + for i in range(len(parts) - 1): + defined_m = sys.modules[module_name] + if "undocumented" not in defined_m.__doc__ and name in defined_m.__all__: + obj.__doc__ = r"基类 :class:`" + defined_m.__name__ + "." + base.__name__ + "` \n\n" + obj.__doc__ + break + module_name += "." + parts[i + 1] diff --git a/fastNLP/embeddings/__init__.py b/fastNLP/embeddings/__init__.py index 2bfb2960..ea99154e 100644 --- a/fastNLP/embeddings/__init__.py +++ b/fastNLP/embeddings/__init__.py @@ -7,20 +7,25 @@ torch.FloatTensor。所有的embedding都可以使用 `self.num_embedding` 获 __all__ = [ "Embedding", + "TokenEmbedding", "StaticEmbedding", "ElmoEmbedding", "BertEmbedding", + "BertWordPieceEncoder", "StackEmbedding", "LSTMCharEmbedding", "CNNCharEmbedding", - "get_embeddings" + "get_embeddings", ] - -from .embedding import Embedding +from .embedding import Embedding, TokenEmbedding from .static_embedding import StaticEmbedding from .elmo_embedding import ElmoEmbedding -from .bert_embedding import BertEmbedding +from .bert_embedding import BertEmbedding, BertWordPieceEncoder from .char_embedding import CNNCharEmbedding, LSTMCharEmbedding from .stack_embedding import StackEmbedding -from .utils import get_embeddings \ No newline at end of file +from .utils import get_embeddings + +import sys +from ..doc_utils import doc_process +doc_process(sys.modules[__name__]) \ No newline at end of file diff --git a/fastNLP/embeddings/bert_embedding.py b/fastNLP/embeddings/bert_embedding.py index aa72898a..36670a0b 100644 --- a/fastNLP/embeddings/bert_embedding.py +++ b/fastNLP/embeddings/bert_embedding.py @@ -1,22 +1,30 @@ +""" +.. 
todo:: + doc +""" + +__all__ = [ + "BertEmbedding", + "BertWordPieceEncoder" +] -import os import collections +import warnings +from itertools import chain -from torch import nn -import torch import numpy as np -from itertools import chain +import torch +from torch import nn +from .contextual_embedding import ContextualEmbedding +from ..core import logger from ..core.vocabulary import Vocabulary -from ..io.file_utils import _get_base_url, cached_path, PRETRAINED_BERT_MODEL_DIR +from ..io.file_utils import PRETRAINED_BERT_MODEL_DIR from ..modules.encoder.bert import _WordPieceBertModel, BertModel, BertTokenizer -from .contextual_embedding import ContextualEmbedding class BertEmbedding(ContextualEmbedding): """ - 别名::class:`fastNLP.embeddings.BertEmbedding` :class:`fastNLP.embeddings.bert_embedding.BertEmbedding` - 使用BERT对words进行编码的Embedding。建议将输入的words长度限制在430以内,而不要使用512(根据预训练模型参数,可能有变化)。这是由于 预训练的bert模型长度限制为512个token,而因为输入的word是未进行word piece分割的(word piece的分割有BertEmbedding在输入word 时切分),在分割之后长度可能会超过最大长度限制。 @@ -27,53 +35,62 @@ class BertEmbedding(ContextualEmbedding): >>> import torch >>> from fastNLP import Vocabulary + >>> from fastNLP.embeddings import BertEmbedding >>> vocab = Vocabulary().add_word_lst("The whether is good .".split()) >>> embed = BertEmbedding(vocab, model_dir_or_name='en-base-uncased', requires_grad=False, layers='4,-2,-1') >>> words = torch.LongTensor([[vocab.to_index(word) for word in "The whether is good .".split()]]) >>> outputs = embed(words) >>> outputs.size() >>> # torch.Size([1, 5, 2304]) - - :param ~fastNLP.Vocabulary vocab: 词表 - :param str model_dir_or_name: 模型所在目录或者模型的名称。当传入模型所在目录时,目录中应该包含一个词表文件(以.txt作为后缀名), - 权重文件(以.bin作为文件后缀名), 配置文件(以.json作为后缀名)。 - :param str layers: 输出embedding表示来自于哪些层,不同层的结果按照layers中的顺序在最后一维concat起来。以','隔开层数,可以以负数 - 去索引倒数几层。 - :param str pool_method: 因为在bert中,每个word会被表示为多个word pieces, 当获取一个word的表示的时候,怎样从它的word pieces - 中计算得到它对应的表示。支持 ``last`` , ``first`` , ``avg`` , ``max``。 - :param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk也是一定的regularize。 - :param float dropout: 以多大的概率对embedding的表示进行Dropout。0.1即随机将10%的值置为0。 - :param bool include_cls_sep: bool,在bert计算句子的表示的时候,需要在前面加上[CLS]和[SEP], 是否在结果中保留这两个内容。 这样 - 会使得word embedding的结果比输入的结果长两个token。如果该值为True,则在使用 :class::StackEmbedding 可能会与其它类型的 - embedding长度不匹配。 - :param bool requires_grad: 是否需要gradient以更新Bert的权重。 """ - def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en-base-uncased', layers: str='-1', - pool_method: str='first', word_dropout=0, dropout=0, requires_grad: bool=False, - include_cls_sep: bool=False): + + def __init__(self, vocab: Vocabulary, model_dir_or_name: str = 'en-base-uncased', layers: str = '-1', + pool_method: str = 'first', word_dropout=0, dropout=0, include_cls_sep: bool = False, + pooled_cls=True, requires_grad: bool = True, auto_truncate: bool = False): + """ + + :param ~fastNLP.Vocabulary vocab: 词表 + :param str model_dir_or_name: 模型所在目录或者模型的名称。当传入模型所在目录时,目录中应该包含一个词表文件(以.txt作为后缀名), + 权重文件(以.bin作为文件后缀名), 配置文件(以.json作为后缀名)。 + :param str layers: 输出embedding表示来自于哪些层,不同层的结果按照layers中的顺序在最后一维concat起来。以','隔开层数,层的序号是 + 从0开始,可以以负数去索引倒数几层。 + :param str pool_method: 因为在bert中,每个word会被表示为多个word pieces, 当获取一个word的表示的时候,怎样从它的word pieces + 中计算得到它对应的表示。支持 ``last`` , ``first`` , ``avg`` , ``max``。 + :param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk也是一定的regularize。 + :param float dropout: 以多大的概率对embedding的表示进行Dropout。0.1即随机将10%的值置为0。 + :param bool include_cls_sep: bool,在bert计算句子的表示的时候,需要在前面加上[CLS]和[SEP], 是否在结果中保留这两个内容。 这样 + 会使得word embedding的结果比输入的结果长两个token。如果该值为True,则在使用 
:class::StackEmbedding 可能会与其它类型的 + embedding长度不匹配。 + :param bool pooled_cls: 返回的[CLS]是否使用预训练中的BertPool映射一下,仅在include_cls_sep时有效。如果下游任务只取[CLS]做预测, + 一般该值为True。 + :param bool requires_grad: 是否需要gradient以更新Bert的权重。 + :param bool auto_truncate: 当句子words拆分为word pieces长度超过bert最大允许长度(一般为512), 自动截掉拆分后的超过510个 + word pieces后的内容,并将第512个word piece置为[SEP]。超过长度的部分的encode结果直接全部置零。一般仅有只使用[CLS] + 来进行分类的任务将auto_truncate置为True。 + """ super(BertEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) - # 根据model_dir_or_name检查是否存在并下载 if model_dir_or_name.lower() in PRETRAINED_BERT_MODEL_DIR: - PRETRAIN_URL = _get_base_url('bert') - model_name = PRETRAINED_BERT_MODEL_DIR[model_dir_or_name] - model_url = PRETRAIN_URL + model_name - model_dir = cached_path(model_url) - # 检查是否存在 - elif os.path.isdir(os.path.expanduser(os.path.abspath(model_dir_or_name))): - model_dir = model_dir_or_name - else: - raise ValueError(f"Cannot recognize {model_dir_or_name}.") - - self.model = _WordBertModel(model_dir=model_dir, vocab=vocab, layers=layers, - pool_method=pool_method, include_cls_sep=include_cls_sep) - + if 'cn' in model_dir_or_name.lower() and pool_method not in ('first', 'last'): + logger.warning("For Chinese bert, pooled_method should choose from 'first', 'last' in order to achieve" + " faster speed.") + warnings.warn("For Chinese bert, pooled_method should choose from 'first', 'last' in order to achieve" + " faster speed.") + + self._word_sep_index = None + if '[SEP]' in vocab: + self._word_sep_index = vocab['[SEP]'] + + self.model = _WordBertModel(model_dir_or_name=model_dir_or_name, vocab=vocab, layers=layers, + pool_method=pool_method, include_cls_sep=include_cls_sep, + pooled_cls=pooled_cls, auto_truncate=auto_truncate, min_freq=2) + self.requires_grad = requires_grad - self._embed_size = len(self.model.layers)*self.model.encoder.hidden_size - + self._embed_size = len(self.model.layers) * self.model.encoder.hidden_size + def _delete_model_weights(self): del self.model - + def forward(self, words): """ 计算words的bert embedding表示。计算之前会在每句话的开始增加[CLS]在结束增加[SEP], 并根据include_cls_sep判断要不要 @@ -85,143 +102,178 @@ class BertEmbedding(ContextualEmbedding): words = self.drop_word(words) outputs = self._get_sent_reprs(words) if outputs is not None: - return self.dropout(words) + return self.dropout(outputs) outputs = self.model(words) outputs = torch.cat([*outputs], dim=-1) - + return self.dropout(outputs) - - @property - def requires_grad(self): + + def drop_word(self, words): """ - Embedding的参数是否允许优化。True: 所有参数运行优化; False: 所有参数不允许优化; None: 部分允许优化、部分不允许 - + 按照设定随机将words设置为unknown_index。 + + :param torch.LongTensor words: batch_size x max_len :return: """ - requires_grads = set([param.requires_grad for name, param in self.named_parameters() - if 'word_pieces_lengths' not in name]) - if len(requires_grads) == 1: - return requires_grads.pop() - else: - return None - - @requires_grad.setter - def requires_grad(self, value): - for name, param in self.named_parameters(): - if 'word_pieces_lengths' in name: # 这个不能加入到requires_grad中 - continue - param.requires_grad = value + if self.word_dropout > 0 and self.training: + with torch.no_grad(): + if self._word_sep_index: # 不能drop sep + sep_mask = words.eq(self._word_sep_index) + mask = torch.full_like(words, fill_value=self.word_dropout, dtype=torch.float, device=words.device) + mask = torch.bernoulli(mask).eq(1) # dropout_word越大,越多位置为1 + pad_mask = words.ne(0) + mask = pad_mask.__and__(mask) # pad的位置不为unk + words = words.masked_fill(mask, self._word_unk_index) + if 
self._word_sep_index: + words.masked_fill_(sep_mask, self._word_sep_index) + return words class BertWordPieceEncoder(nn.Module): """ 读取bert模型,读取之后调用index_dataset方法在dataset中生成word_pieces这一列。 - - :param str model_dir_or_name: 模型所在目录或者模型的名称。默认值为 ``en-base-uncased`` - :param str layers: 最终结果中的表示。以','隔开层数,可以以负数去索引倒数几层 - :param bool requires_grad: 是否需要gradient。 """ - def __init__(self, model_dir_or_name: str='en-base-uncased', layers: str='-1', - requires_grad: bool=False): + + def __init__(self, model_dir_or_name: str = 'en-base-uncased', layers: str = '-1', pooled_cls: bool = False, + word_dropout=0, dropout=0, requires_grad: bool = True): + """ + + :param str model_dir_or_name: 模型所在目录或者模型的名称。默认值为 ``en-base-uncased`` + :param str layers: 最终结果中的表示。以','隔开层数,可以以负数去索引倒数几层 + :param bool pooled_cls: 返回的句子开头的[CLS]是否使用预训练中的BertPool映射一下,仅在include_cls_sep时有效。如果下游任务只取 + [CLS]做预测,一般该值为True。 + :param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk也是一定的regularize。 + :param float dropout: 以多大的概率对embedding的表示进行Dropout。0.1即随机将10%的值置为0。 + :param bool requires_grad: 是否需要gradient。 + """ super().__init__() - PRETRAIN_URL = _get_base_url('bert') - - if model_dir_or_name in PRETRAINED_BERT_MODEL_DIR: - model_name = PRETRAINED_BERT_MODEL_DIR[model_dir_or_name] - model_url = PRETRAIN_URL + model_name - model_dir = cached_path(model_url) - # 检查是否存在 - elif os.path.isdir(model_dir_or_name): - model_dir = model_dir_or_name - else: - raise ValueError(f"Cannot recognize {model_dir_or_name}.") - - self.model = _WordPieceBertModel(model_dir=model_dir, layers=layers) + + self.model = _WordPieceBertModel(model_dir_or_name=model_dir_or_name, layers=layers, pooled_cls=pooled_cls) + self._sep_index = self.model._sep_index + self._wordpiece_pad_index = self.model._wordpiece_pad_index + self._wordpiece_unk_index = self.model._wordpiece_unknown_index self._embed_size = len(self.model.layers) * self.model.encoder.hidden_size self.requires_grad = requires_grad - - @property - def requires_grad(self): - """ - Embedding的参数是否允许优化。True: 所有参数运行优化; False: 所有参数不允许优化; None: 部分允许优化、部分不允许 - :return: - """ - requires_grads = set([param.requires_grad for name, param in self.named_parameters()]) - if len(requires_grads) == 1: - return requires_grads.pop() - else: - return None - - @requires_grad.setter - def requires_grad(self, value): - for name, param in self.named_parameters(): - param.requires_grad = value - + self.word_dropout = word_dropout + self.dropout_layer = nn.Dropout(dropout) + @property def embed_size(self): return self._embed_size - - def index_datasets(self, *datasets, field_name): + + @property + def embedding_dim(self): + return self._embed_size + + @property + def num_embedding(self): + return self.model.encoder.config.vocab_size + + def index_datasets(self, *datasets, field_name, add_cls_sep=True): """ - 使用bert的tokenizer新生成word_pieces列加入到datasets中,并将他们设置为input。如果首尾不是 - [CLS]与[SEP]会在首尾额外加入[CLS]与[SEP], 且将word_pieces这一列的pad value设置为了bert的pad value。 + 使用bert的tokenizer新生成word_pieces列加入到datasets中,并将他们设置为input,且将word_pieces这一列的pad value设置为了 + bert的pad value。 - :param datasets: DataSet对象 - :param field_name: 基于哪一列的内容生成word_pieces列。这一列中每个数据应该是List[str]的形式。 + :param ~fastNLP.DataSet datasets: DataSet对象 + :param str field_name: 基于哪一列的内容生成word_pieces列。这一列中每个数据应该是List[str]的形式。 + :param bool add_cls_sep: 如果首尾不是[CLS]与[SEP]会在首尾额外加入[CLS]与[SEP]。 :return: """ - self.model.index_dataset(*datasets, field_name=field_name) - + self.model.index_dataset(*datasets, field_name=field_name, add_cls_sep=add_cls_sep) + def forward(self, word_pieces, 
token_type_ids=None): """ 计算words的bert embedding表示。传入的words中应该自行包含[CLS]与[SEP]的tag。 :param words: batch_size x max_len - :param token_type_ids: batch_size x max_len, 用于区分前一句和后一句话 + :param token_type_ids: batch_size x max_len, 用于区分前一句和后一句话. 如果不传入,则自动生成(大部分情况,都不需要输入), + 第一个[SEP]及之前为0, 第二个[SEP]及到第一个[SEP]之间为1; 第三个[SEP]及到第二个[SEP]之间为0,依次往后推。 :return: torch.FloatTensor. batch_size x max_len x (768*len(self.layers)) """ + with torch.no_grad(): + sep_mask = word_pieces.eq(self._sep_index) # batch_size x max_len + if token_type_ids is None: + sep_mask_cumsum = sep_mask.flip(dims=[-1]).cumsum(dim=-1).flip(dims=[-1]) + token_type_ids = sep_mask_cumsum.fmod(2) + if token_type_ids[0, 0].item(): # 如果开头是奇数,则需要flip一下结果,因为需要保证开头为0 + token_type_ids = token_type_ids.eq(0).long() + + word_pieces = self.drop_word(word_pieces) outputs = self.model(word_pieces, token_type_ids) outputs = torch.cat([*outputs], dim=-1) + + return self.dropout_layer(outputs) + + def drop_word(self, words): + """ + 按照设定随机将words设置为unknown_index。 - return outputs + :param torch.LongTensor words: batch_size x max_len + :return: + """ + if self.word_dropout > 0 and self.training: + with torch.no_grad(): + if self._word_sep_index: # 不能drop sep + sep_mask = words.eq(self._wordpiece_unk_index) + mask = torch.full_like(words, fill_value=self.word_dropout, dtype=torch.float, device=words.device) + mask = torch.bernoulli(mask).eq(1) # dropout_word越大,越多位置为1 + pad_mask = words.ne(self._wordpiece_pad_index) + mask = pad_mask.__and__(mask) # pad的位置不为unk + words = words.masked_fill(mask, self._word_unk_index) + if self._word_sep_index: + words.masked_fill_(sep_mask, self._wordpiece_unk_index) + return words class _WordBertModel(nn.Module): - def __init__(self, model_dir:str, vocab:Vocabulary, layers:str='-1', pool_method:str='first', include_cls_sep:bool=False): + def __init__(self, model_dir_or_name: str, vocab: Vocabulary, layers: str = '-1', pool_method: str = 'first', + include_cls_sep: bool = False, pooled_cls: bool = False, auto_truncate: bool = False, min_freq=2): super().__init__() - - self.tokenzier = BertTokenizer.from_pretrained(model_dir) - self.encoder = BertModel.from_pretrained(model_dir) + + self.tokenzier = BertTokenizer.from_pretrained(model_dir_or_name) + self.encoder = BertModel.from_pretrained(model_dir_or_name) + self._max_position_embeddings = self.encoder.config.max_position_embeddings # 检查encoder_layer_number是否合理 encoder_layer_number = len(self.encoder.encoder.layer) self.layers = list(map(int, layers.split(','))) for layer in self.layers: - if layer<0: - assert -layer<=encoder_layer_number, f"The layer index:{layer} is out of scope for " \ - f"a bert model with {encoder_layer_number} layers." + if layer < 0: + assert -layer <= encoder_layer_number, f"The layer index:{layer} is out of scope for " \ + f"a bert model with {encoder_layer_number} layers." 
else: - assert layer= min_freq and not vocab._is_word_no_create_entry( + word): # 出现次数大于这个次数才新增 + word_piece_dict[word] = 1 # 新增一个值 continue for word_piece in word_pieces: word_piece_dict[word_piece] = 1 @@ -242,7 +294,7 @@ class _WordBertModel(nn.Module): new_word_piece_vocab[token] = len(new_word_piece_vocab) self.tokenzier._reinit_on_new_vocab(new_word_piece_vocab) self.encoder.embeddings.word_embeddings = embed - + word_to_wordpieces = [] word_pieces_lengths = [] for word, index in vocab: @@ -254,81 +306,126 @@ class _WordBertModel(nn.Module): word_pieces = self.tokenzier.convert_tokens_to_ids(word_pieces) word_to_wordpieces.append(word_pieces) word_pieces_lengths.append(len(word_pieces)) - print("Found(Or seg into word pieces) {} words out of {}.".format(found_count, len(vocab))) self._cls_index = self.tokenzier.vocab['[CLS]'] self._sep_index = self.tokenzier.vocab['[SEP]'] - self._pad_index = vocab.padding_idx + self._word_pad_index = vocab.padding_idx self._wordpiece_pad_index = self.tokenzier.vocab['[PAD]'] # 需要用于生成word_piece + logger.info("Found(Or segment into word pieces) {} words out of {}.".format(found_count, len(vocab))) self.word_to_wordpieces = np.array(word_to_wordpieces) - self.word_pieces_lengths = nn.Parameter(torch.LongTensor(word_pieces_lengths), requires_grad=False) - print("Successfully generate word pieces.") - + self.register_buffer('word_pieces_lengths', torch.LongTensor(word_pieces_lengths)) + logger.debug("Successfully generate word pieces.") + def forward(self, words): """ :param words: torch.LongTensor, batch_size x max_len :return: num_layers x batch_size x max_len x hidden_size或者num_layers x batch_size x (max_len+2) x hidden_size """ - batch_size, max_word_len = words.size() - seq_len = words.ne(self._pad_index).sum(dim=-1) - batch_word_pieces_length = self.word_pieces_lengths[words] # batch_size x max_len - word_pieces_lengths = batch_word_pieces_length.sum(dim=-1) - max_word_piece_length = word_pieces_lengths.max().item() - # +2是由于需要加入[CLS]与[SEP] - word_pieces = words.new_full((batch_size, max_word_piece_length+2), fill_value=self._wordpiece_pad_index) - word_pieces[:, 0].fill_(self._cls_index) - batch_indexes = torch.arange(batch_size).to(words) - word_pieces[batch_indexes, word_pieces_lengths+1] = self._sep_index - attn_masks = torch.zeros_like(word_pieces) - # 1. 获取words的word_pieces的id,以及对应的span范围 - word_indexes = words.tolist() - for i in range(batch_size): - word_pieces_i = list(chain(*self.word_to_wordpieces[word_indexes[i]])) - word_pieces[i, 1:len(word_pieces_i)+1] = torch.LongTensor(word_pieces_i) - attn_masks[i, :len(word_pieces_i)+2].fill_(1) - # TODO 截掉长度超过的部分。 + with torch.no_grad(): + batch_size, max_word_len = words.size() + word_mask = words.ne(self._word_pad_index) # 为1的地方有word + seq_len = word_mask.sum(dim=-1) + batch_word_pieces_length = self.word_pieces_lengths[words].masked_fill(word_mask.eq(0), + 0) # batch_size x max_len + word_pieces_lengths = batch_word_pieces_length.sum(dim=-1) # batch_size + word_piece_length = batch_word_pieces_length.sum(dim=-1).max().item() # 表示word piece的长度(包括padding) + if word_piece_length + 2 > self._max_position_embeddings: + if self.auto_truncate: + word_pieces_lengths = word_pieces_lengths.masked_fill( + word_pieces_lengths + 2 > self._max_position_embeddings, + self._max_position_embeddings - 2) + else: + raise RuntimeError( + "After split words into word pieces, the lengths of word pieces are longer than the " + f"maximum allowed sequence length:{self._max_position_embeddings} of bert. 
You can set " + f"`auto_truncate=True` for BertEmbedding to automatically truncate overlong input.") + + # +2是由于需要加入[CLS]与[SEP] + word_pieces = words.new_full((batch_size, min(word_piece_length + 2, self._max_position_embeddings)), + fill_value=self._wordpiece_pad_index) + attn_masks = torch.zeros_like(word_pieces) + # 1. 获取words的word_pieces的id,以及对应的span范围 + word_indexes = words.cpu().numpy() + for i in range(batch_size): + word_pieces_i = list(chain(*self.word_to_wordpieces[word_indexes[i, :seq_len[i]]])) + if self.auto_truncate and len(word_pieces_i) > self._max_position_embeddings - 2: + word_pieces_i = word_pieces_i[:self._max_position_embeddings - 2] + word_pieces[i, 1:word_pieces_lengths[i] + 1] = torch.LongTensor(word_pieces_i) + attn_masks[i, :word_pieces_lengths[i] + 2].fill_(1) + # 添加[cls]和[sep] + word_pieces[:, 0].fill_(self._cls_index) + batch_indexes = torch.arange(batch_size).to(words) + word_pieces[batch_indexes, word_pieces_lengths + 1] = self._sep_index + if self._has_sep_in_vocab: # 但[SEP]在vocab中出现应该才会需要token_ids + sep_mask = word_pieces.eq(self._sep_index).long() # batch_size x max_len + sep_mask_cumsum = sep_mask.flip(dims=[-1]).cumsum(dim=-1).flip(dims=[-1]) + token_type_ids = sep_mask_cumsum.fmod(2) + if token_type_ids[0, 0].item(): # 如果开头是奇数,则需要flip一下结果,因为需要保证开头为0 + token_type_ids = token_type_ids.eq(0).long() + else: + token_type_ids = torch.zeros_like(word_pieces) # 2. 获取hidden的结果,根据word_pieces进行对应的pool计算 # all_outputs: [batch_size x max_len x hidden_size, batch_size x max_len x hidden_size, ...] - bert_outputs, _ = self.encoder(word_pieces, token_type_ids=None, attention_mask=attn_masks, - output_all_encoded_layers=True) - # output_layers = [self.layers] # len(self.layers) x batch_size x max_word_piece_length x hidden_size - + bert_outputs, pooled_cls = self.encoder(word_pieces, token_type_ids=token_type_ids, attention_mask=attn_masks, + output_all_encoded_layers=True) + # output_layers = [self.layers] # len(self.layers) x batch_size x real_word_piece_length x hidden_size + if self.include_cls_sep: - outputs = bert_outputs[-1].new_zeros(len(self.layers), batch_size, max_word_len + 2, - bert_outputs[-1].size(-1)) s_shift = 1 + outputs = bert_outputs[-1].new_zeros(len(self.layers), batch_size, max_word_len + 2, + bert_outputs[-1].size(-1)) + else: + s_shift = 0 outputs = bert_outputs[-1].new_zeros(len(self.layers), batch_size, max_word_len, bert_outputs[-1].size(-1)) - s_shift = 0 batch_word_pieces_cum_length = batch_word_pieces_length.new_zeros(batch_size, max_word_len + 1) batch_word_pieces_cum_length[:, 1:] = batch_word_pieces_length.cumsum(dim=-1) # batch_size x max_len + + if self.pool_method == 'first': + batch_word_pieces_cum_length = batch_word_pieces_cum_length[:, :seq_len.max()] + batch_word_pieces_cum_length.masked_fill_(batch_word_pieces_cum_length.ge(word_piece_length), 0) + _batch_indexes = batch_indexes[:, None].expand((batch_size, batch_word_pieces_cum_length.size(1))) + elif self.pool_method == 'last': + batch_word_pieces_cum_length = batch_word_pieces_cum_length[:, 1:seq_len.max()+1] - 1 + batch_word_pieces_cum_length.masked_fill_(batch_word_pieces_cum_length.ge(word_piece_length), 0) + _batch_indexes = batch_indexes[:, None].expand((batch_size, batch_word_pieces_cum_length.size(1))) + for l_index, l in enumerate(self.layers): output_layer = bert_outputs[l] + real_word_piece_length = output_layer.size(1) - 2 + if word_piece_length > real_word_piece_length: # 如果实际上是截取出来的 + paddings = output_layer.new_zeros(batch_size, + word_piece_length - 
real_word_piece_length, + output_layer.size(2)) + output_layer = torch.cat((output_layer, paddings), dim=1).contiguous() # 从word_piece collapse到word的表示 truncate_output_layer = output_layer[:, 1:-1] # 删除[CLS]与[SEP] batch_size x len x hidden_size - outputs_seq_len = seq_len + s_shift if self.pool_method == 'first': - for i in range(batch_size): - i_word_pieces_cum_length = batch_word_pieces_cum_length[i, :seq_len[i]] # 每个word的start位置 - outputs[l_index, i, s_shift:outputs_seq_len[i]] = truncate_output_layer[i, i_word_pieces_cum_length] # num_layer x batch_size x len x hidden_size + tmp = truncate_output_layer[_batch_indexes, batch_word_pieces_cum_length] + tmp = tmp.masked_fill(word_mask[:, :batch_word_pieces_cum_length.size(1), None].eq(0), 0) + outputs[l_index, :, s_shift:batch_word_pieces_cum_length.size(1)+s_shift] = tmp + elif self.pool_method == 'last': - for i in range(batch_size): - i_word_pieces_cum_length = batch_word_pieces_cum_length[i, 1:seq_len[i]+1] - 1 # 每个word的end - outputs[l_index, i, s_shift:outputs_seq_len[i]] = truncate_output_layer[i, i_word_pieces_cum_length] + tmp = truncate_output_layer[_batch_indexes, batch_word_pieces_cum_length] + tmp = tmp.masked_fill(word_mask[:, :batch_word_pieces_cum_length.size(1), None].eq(0), 0) + outputs[l_index, :, s_shift:batch_word_pieces_cum_length.size(1)+s_shift] = tmp elif self.pool_method == 'max': for i in range(batch_size): for j in range(seq_len[i]): - start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j+1] - outputs[l_index, i, j+s_shift], _ = torch.max(truncate_output_layer[i, start:end], dim=-2) + start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j + 1] + outputs[l_index, i, j + s_shift], _ = torch.max(truncate_output_layer[i, start:end], dim=-2) else: for i in range(batch_size): for j in range(seq_len[i]): - start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j+1] - outputs[l_index, i, j+s_shift] = torch.mean(truncate_output_layer[i, start:end], dim=-2) + start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j + 1] + outputs[l_index, i, j + s_shift] = torch.mean(truncate_output_layer[i, start:end], dim=-2) if self.include_cls_sep: - outputs[l_index, :, 0] = output_layer[:, 0] - outputs[l_index, batch_indexes, seq_len+s_shift] = output_layer[batch_indexes, seq_len+s_shift] + if l in (len(bert_outputs) - 1, -1) and self.pooled_cls: + outputs[l_index, :, 0] = pooled_cls + else: + outputs[l_index, :, 0] = output_layer[:, 0] + outputs[l_index, batch_indexes, seq_len + s_shift] = output_layer[batch_indexes, seq_len + s_shift] + # 3. 
最终的embedding结果 return outputs - diff --git a/fastNLP/embeddings/char_embedding.py b/fastNLP/embeddings/char_embedding.py index b9e6659e..0624d07f 100644 --- a/fastNLP/embeddings/char_embedding.py +++ b/fastNLP/embeddings/char_embedding.py @@ -3,27 +3,36 @@ 词的index而不需要使用词语中的char的index来获取表达。 """ +__all__ = [ + "CNNCharEmbedding", + "LSTMCharEmbedding" +] + +from typing import List import torch import torch.nn as nn import torch.nn.functional as F -from typing import List -from ..modules.encoder.lstm import LSTM -from ..core.vocabulary import Vocabulary from .embedding import TokenEmbedding +from .static_embedding import StaticEmbedding from .utils import _construct_char_vocab_from_vocab +from .utils import get_embeddings +from ..core import logger +from ..core.vocabulary import Vocabulary +from ..modules.encoder.lstm import LSTM class CNNCharEmbedding(TokenEmbedding): """ - 别名::class:`fastNLP.embeddings.CNNCharEmbedding` :class:`fastNLP.embeddings.char_embedding.CNNCharEmbedding` - 使用CNN生成character embedding。CNN的结构为, embed(x) -> Dropout(x) -> CNN(x) -> activation(x) -> pool -> fc -> Dropout. 不同的kernel大小的fitler结果是concat起来然后通过一层fully connected layer, 然后输出word的表示。 Example:: + >>> import torch + >>> from fastNLP import Vocabulary + >>> from fastNLP.embeddings import CNNCharEmbedding >>> vocab = Vocabulary().add_word_lst("The whether is good .".split()) >>> embed = CNNCharEmbedding(vocab, embed_size=50) >>> words = torch.LongTensor([[vocab.to_index(word) for word in "The whether is good .".split()]]) @@ -31,27 +40,36 @@ class CNNCharEmbedding(TokenEmbedding): >>> outputs.size() >>> # torch.Size([1, 5,50]) - :param vocab: 词表 - :param embed_size: 该word embedding的大小,默认值为50. - :param char_emb_size: character的embed的大小。character是从vocab中生成的。默认值为50. - :param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk也是一定的regularize。 - :param float dropout: 以多大的概率drop分布式表示与char embedding的输出。 - :param filter_nums: filter的数量. 长度需要和kernels一致。默认值为[40, 30, 20]. - :param kernel_sizes: kernel的大小. 默认值为[5, 3, 1]. - :param pool_method: character的表示在合成一个表示时所使用的pool方法,支持'avg', 'max'. - :param activation: CNN之后使用的激活方法,支持'relu', 'sigmoid', 'tanh' 或者自定义函数. - :param min_char_freq: character的最少出现次数。默认值为2. """ - def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, word_dropout:float=0, - dropout:float=0.5, filter_nums: List[int]=(40, 30, 20), kernel_sizes: List[int]=(5, 3, 1), - pool_method: str='max', activation='relu', min_char_freq: int=2): + + def __init__(self, vocab: Vocabulary, embed_size: int = 50, char_emb_size: int = 50, word_dropout: float = 0, + dropout: float = 0, filter_nums: List[int] = (40, 30, 20), kernel_sizes: List[int] = (5, 3, 1), + pool_method: str = 'max', activation='relu', min_char_freq: int = 2, pre_train_char_embed: str = None, + requires_grad:bool=True, include_word_start_end:bool=True): + """ + + :param vocab: 词表 + :param embed_size: 该CNNCharEmbedding的输出维度大小,默认值为50. + :param char_emb_size: character的embed的维度。character是从vocab中生成的。默认值为50. + :param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk也是一定的regularize。 + :param float dropout: 以多大的概率drop分布式表示与char embedding的输出。 + :param filter_nums: filter的数量. 长度需要和kernels一致。默认值为[40, 30, 20]. + :param kernel_sizes: kernel的大小. 默认值为[5, 3, 1]. + :param pool_method: character的表示在合成一个表示时所使用的pool方法,支持'avg', 'max'. + :param activation: CNN之后使用的激活方法,支持'relu', 'sigmoid', 'tanh' 或者自定义函数. + :param min_char_freq: character的最少出现次数。默认值为2. 
+ :param pre_train_char_embed: 可以有两种方式调用预训练好的character embedding:第一种是传入embedding文件夹 + (文件夹下应该只有一个以.txt作为后缀的文件)或文件路径;第二种是传入embedding的名称,第二种情况将自动查看缓存中是否存在该模型, + 没有的话将自动下载。如果输入为None则使用embedding_dim的维度随机初始化一个embedding. + :param requires_grad: 是否更新权重 + :param include_word_start_end: 是否在每个word开始的character前和结束的character增加特殊标示符号; + """ super(CNNCharEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) - + for kernel in kernel_sizes: assert kernel % 2 == 1, "Only odd kernel is allowed." - + assert pool_method in ('max', 'avg') - self.dropout = nn.Dropout(dropout) self.pool_method = pool_method # activation function if isinstance(activation, str): @@ -68,31 +86,39 @@ class CNNCharEmbedding(TokenEmbedding): else: raise Exception( "Undefined activation function: choose from: [relu, tanh, sigmoid, or a callable function]") - - print("Start constructing character vocabulary.") + + logger.info("Start constructing character vocabulary.") # 建立char的词表 - self.char_vocab = _construct_char_vocab_from_vocab(vocab, min_freq=min_char_freq) + self.char_vocab = _construct_char_vocab_from_vocab(vocab, min_freq=min_char_freq, + include_word_start_end=include_word_start_end) self.char_pad_index = self.char_vocab.padding_idx - print(f"In total, there are {len(self.char_vocab)} distinct characters.") + logger.info(f"In total, there are {len(self.char_vocab)} distinct characters.") # 对vocab进行index max_word_len = max(map(lambda x: len(x[0]), vocab)) - self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab), max_word_len), - fill_value=self.char_pad_index, dtype=torch.long), - requires_grad=False) - self.word_lengths = nn.Parameter(torch.zeros(len(vocab)).long(), requires_grad=False) + if include_word_start_end: + max_word_len += 2 + self.register_buffer('words_to_chars_embedding', torch.full((len(vocab), max_word_len), + fill_value=self.char_pad_index, dtype=torch.long)) + self.register_buffer('word_lengths', torch.zeros(len(vocab)).long()) for word, index in vocab: # if index!=vocab.padding_idx: # 如果是pad的话,直接就为pad_value了。修改为不区分pad, 这样所有的也是同一个embed + if include_word_start_end: + word = [''] + list(word) + [''] self.words_to_chars_embedding[index, :len(word)] = \ torch.LongTensor([self.char_vocab.to_index(c) for c in word]) self.word_lengths[index] = len(word) - self.char_embedding = nn.Embedding(len(self.char_vocab), char_emb_size) - + # self.char_embedding = nn.Embedding(len(self.char_vocab), char_emb_size) + if pre_train_char_embed: + self.char_embedding = StaticEmbedding(self.char_vocab, model_dir_or_name=pre_train_char_embed) + else: + self.char_embedding = get_embeddings((len(self.char_vocab), char_emb_size)) + self.convs = nn.ModuleList([nn.Conv1d( char_emb_size, filter_nums[i], kernel_size=kernel_sizes[i], bias=True, padding=kernel_sizes[i] // 2) for i in range(len(kernel_sizes))]) self._embed_size = embed_size self.fc = nn.Linear(sum(filter_nums), embed_size) - self.init_param() + self.requires_grad = requires_grad def forward(self, words): """ @@ -104,14 +130,14 @@ class CNNCharEmbedding(TokenEmbedding): words = self.drop_word(words) batch_size, max_len = words.size() chars = self.words_to_chars_embedding[words] # batch_size x max_len x max_word_len - word_lengths = self.word_lengths[words] # batch_size x max_len + word_lengths = self.word_lengths[words] # batch_size x max_len max_word_len = word_lengths.max() chars = chars[:, :, :max_word_len] # 为1的地方为mask chars_masks = chars.eq(self.char_pad_index) # batch_size x max_len x max_word_len 如果为0, 说明是padding的位置了 chars = 
self.char_embedding(chars) # batch_size x max_len x max_word_len x embed_size chars = self.dropout(chars) - reshaped_chars = chars.reshape(batch_size*max_len, max_word_len, -1) + reshaped_chars = chars.reshape(batch_size * max_len, max_word_len, -1) reshaped_chars = reshaped_chars.transpose(1, 2) # B' x E x M conv_chars = [conv(reshaped_chars).transpose(1, 2).reshape(batch_size, max_len, max_word_len, -1) for conv in self.convs] @@ -119,54 +145,23 @@ class CNNCharEmbedding(TokenEmbedding): conv_chars = self.activation(conv_chars) if self.pool_method == 'max': conv_chars = conv_chars.masked_fill(chars_masks.unsqueeze(-1), float('-inf')) - chars, _ = torch.max(conv_chars, dim=-2) # batch_size x max_len x sum(filters) + chars, _ = torch.max(conv_chars, dim=-2) # batch_size x max_len x sum(filters) else: conv_chars = conv_chars.masked_fill(chars_masks.unsqueeze(-1), 0) - chars = torch.sum(conv_chars, dim=-2)/chars_masks.eq(0).sum(dim=-1, keepdim=True).float() + chars = torch.sum(conv_chars, dim=-2) / chars_masks.eq(0).sum(dim=-1, keepdim=True).float() chars = self.fc(chars) return self.dropout(chars) - @property - def requires_grad(self): - """ - Embedding的参数是否允许优化。True: 所有参数运行优化; False: 所有参数不允许优化; None: 部分允许优化、部分不允许 - :return: - """ - params = [] - for name, param in self.named_parameters(): - if 'words_to_chars_embedding' not in name and 'word_lengths' not in name: - params.append(param.requires_grad) - requires_grads = set(params) - if len(requires_grads) == 1: - return requires_grads.pop() - else: - return None - - @requires_grad.setter - def requires_grad(self, value): - for name, param in self.named_parameters(): - if 'words_to_chars_embedding' in name or 'word_lengths' in name: # 这个不能加入到requires_grad中 - continue - param.requires_grad = value - - def init_param(self): - for name, param in self.named_parameters(): - if 'words_to_chars_embedding' in name or 'word_lengths' in name: # 这个不能reset - continue - if param.data.dim()>1: - nn.init.xavier_uniform_(param, 1) - else: - nn.init.uniform_(param, -1, 1) - class LSTMCharEmbedding(TokenEmbedding): """ - 别名::class:`fastNLP.embeddings.LSTMCharEmbedding` :class:`fastNLP.embeddings.char_embedding.LSTMCharEmbedding` - 使用LSTM的方式对character进行encode. embed(x) -> Dropout(x) -> LSTM(x) -> activation(x) -> pool -> Dropout Example:: + >>> import torch + >>> from fastNLP import Vocabulary + >>> from fastNLP.embeddings import LSTMCharEmbedding >>> vocab = Vocabulary().add_word_lst("The whether is good .".split()) >>> embed = LSTMCharEmbedding(vocab, embed_size=50) >>> words = torch.LongTensor([[vocab.to_index(word) for word in "The whether is good .".split()]]) @@ -174,27 +169,36 @@ class LSTMCharEmbedding(TokenEmbedding): >>> outputs.size() >>> # torch.Size([1, 5,50]) - :param vocab: 词表 - :param embed_size: embedding的大小。默认值为50. - :param char_emb_size: character的embedding的大小。默认值为50. - :param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk也是一定的regularize。 - :param dropout: 以多大概率drop character embedding的输出以及最终的word的输出。 - :param hidden_size: LSTM的中间hidden的大小,如果为bidirectional的,hidden会除二,默认为50. - :param pool_method: 支持'max', 'avg'。 - :param activation: 激活函数,支持'relu', 'sigmoid', 'tanh', 或者自定义函数. - :param min_char_freq: character的最小出现次数。默认值为2. 
- :param bidirectional: 是否使用双向的LSTM进行encode。默认值为True。 """ - def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, word_dropout:float=0, - dropout:float=0.5, hidden_size=50,pool_method: str='max', activation='relu', min_char_freq: int=2, - bidirectional=True): - super(LSTMCharEmbedding, self).__init__(vocab) - + + def __init__(self, vocab: Vocabulary, embed_size: int = 50, char_emb_size: int = 50, word_dropout: float = 0, + dropout: float = 0, hidden_size=50, pool_method: str = 'max', activation='relu', + min_char_freq: int = 2, bidirectional=True, pre_train_char_embed: str = None, + requires_grad:bool=True, include_word_start_end:bool=True): + """ + + :param vocab: 词表 + :param embed_size: LSTMCharEmbedding的输出维度。默认值为50. + :param char_emb_size: character的embedding的维度。默认值为50. + :param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk也是一定的regularize。 + :param dropout: 以多大概率drop character embedding的输出以及最终的word的输出。 + :param hidden_size: LSTM的中间hidden的大小,如果为bidirectional的,hidden会除二,默认为50. + :param pool_method: 支持'max', 'avg'。 + :param activation: 激活函数,支持'relu', 'sigmoid', 'tanh', 或者自定义函数. + :param min_char_freq: character的最小出现次数。默认值为2. + :param bidirectional: 是否使用双向的LSTM进行encode。默认值为True。 + :param pre_train_char_embed: 可以有两种方式调用预训练好的character embedding:第一种是传入embedding文件夹 + (文件夹下应该只有一个以.txt作为后缀的文件)或文件路径;第二种是传入embedding的名称,第二种情况将自动查看缓存中是否存在该模型, + 没有的话将自动下载。如果输入为None则使用embedding_dim的维度随机初始化一个embedding. + :param requires_grad: 是否更新权重 + :param include_word_start_end: 是否在每个word开始的character前和结束的character增加特殊标示符号; + """ + super(LSTMCharEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) + assert hidden_size % 2 == 0, "Only even kernel is allowed." - + assert pool_method in ('max', 'avg') self.pool_method = pool_method - self.dropout = nn.Dropout(dropout) # activation function if isinstance(activation, str): if activation.lower() == 'relu': @@ -210,32 +214,40 @@ class LSTMCharEmbedding(TokenEmbedding): else: raise Exception( "Undefined activation function: choose from: [relu, tanh, sigmoid, or a callable function]") - - print("Start constructing character vocabulary.") + + logger.info("Start constructing character vocabulary.") # 建立char的词表 - self.char_vocab = _construct_char_vocab_from_vocab(vocab, min_freq=min_char_freq) + self.char_vocab = _construct_char_vocab_from_vocab(vocab, min_freq=min_char_freq, + include_word_start_end=include_word_start_end) self.char_pad_index = self.char_vocab.padding_idx - print(f"In total, there are {len(self.char_vocab)} distinct characters.") + logger.info(f"In total, there are {len(self.char_vocab)} distinct characters.") # 对vocab进行index - self.max_word_len = max(map(lambda x: len(x[0]), vocab)) - self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab), self.max_word_len), - fill_value=self.char_pad_index, dtype=torch.long), - requires_grad=False) - self.word_lengths = nn.Parameter(torch.zeros(len(vocab)).long(), requires_grad=False) + max_word_len = max(map(lambda x: len(x[0]), vocab)) + if include_word_start_end: + max_word_len += 2 + self.register_buffer('words_to_chars_embedding', torch.full((len(vocab), max_word_len), + fill_value=self.char_pad_index, dtype=torch.long)) + self.register_buffer('word_lengths', torch.zeros(len(vocab)).long()) for word, index in vocab: # if index!=vocab.padding_idx: # 如果是pad的话,直接就为pad_value了. 
修改为不区分pad与否 + if include_word_start_end: + word = [''] + list(word) + [''] self.words_to_chars_embedding[index, :len(word)] = \ torch.LongTensor([self.char_vocab.to_index(c) for c in word]) self.word_lengths[index] = len(word) - self.char_embedding = nn.Embedding(len(self.char_vocab), char_emb_size) - + if pre_train_char_embed: + self.char_embedding = StaticEmbedding(self.char_vocab, pre_train_char_embed) + else: + self.char_embedding = nn.Embedding(len(self.char_vocab), char_emb_size) + self.fc = nn.Linear(hidden_size, embed_size) hidden_size = hidden_size // 2 if bidirectional else hidden_size - + self.lstm = LSTM(char_emb_size, hidden_size, bidirectional=bidirectional, batch_first=True) self._embed_size = embed_size self.bidirectional = bidirectional - + self.requires_grad = requires_grad + def forward(self, words): """ 输入words的index后,生成对应的words的表示。 @@ -257,7 +269,7 @@ class LSTMCharEmbedding(TokenEmbedding): char_seq_len = chars_masks.eq(0).sum(dim=-1).reshape(batch_size * max_len) lstm_chars = self.lstm(reshaped_chars, char_seq_len)[0].reshape(batch_size, max_len, max_word_len, -1) # B x M x M x H - + lstm_chars = self.activation(lstm_chars) if self.pool_method == 'max': lstm_chars = lstm_chars.masked_fill(chars_masks.unsqueeze(-1), float('-inf')) @@ -265,31 +277,7 @@ class LSTMCharEmbedding(TokenEmbedding): else: lstm_chars = lstm_chars.masked_fill(chars_masks.unsqueeze(-1), 0) chars = torch.sum(lstm_chars, dim=-2) / chars_masks.eq(0).sum(dim=-1, keepdim=True).float() - + chars = self.fc(chars) - - return self.dropout(chars) - - @property - def requires_grad(self): - """ - Embedding的参数是否允许优化。True: 所有参数运行优化; False: 所有参数不允许优化; None: 部分允许优化、部分不允许 - :return: - """ - params = [] - for name, param in self.named_parameters(): - if 'words_to_chars_embedding' not in name and 'word_lengths' not in name: - params.append(param) - requires_grads = set(params) - if len(requires_grads) == 1: - return requires_grads.pop() - else: - return None - - @requires_grad.setter - def requires_grad(self, value): - for name, param in self.named_parameters(): - if 'words_to_chars_embedding' in name or 'word_lengths' in name: # 这个不能加入到requires_grad中 - continue - param.requires_grad = value + return self.dropout(chars) diff --git a/fastNLP/embeddings/contextual_embedding.py b/fastNLP/embeddings/contextual_embedding.py index 1831af4e..9910a44b 100644 --- a/fastNLP/embeddings/contextual_embedding.py +++ b/fastNLP/embeddings/contextual_embedding.py @@ -1,20 +1,30 @@ +""" +.. 
todo:: + doc +""" + +__all__ = [ + "ContextualEmbedding" +] from abc import abstractmethod + import torch -from ..core.vocabulary import Vocabulary -from ..core.dataset import DataSet +from .embedding import TokenEmbedding +from ..core import logger from ..core.batch import DataSetIter +from ..core.dataset import DataSet from ..core.sampler import SequentialSampler from ..core.utils import _move_model_to_device, _get_model_device -from .embedding import TokenEmbedding +from ..core.vocabulary import Vocabulary class ContextualEmbedding(TokenEmbedding): - def __init__(self, vocab: Vocabulary, word_dropout:float=0.0, dropout:float=0.0): + def __init__(self, vocab: Vocabulary, word_dropout: float = 0.0, dropout: float = 0.0): super(ContextualEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) - - def add_sentence_cache(self, *datasets, batch_size=32, device='cpu', delete_weights: bool=True): + + def add_sentence_cache(self, *datasets, batch_size=32, device='cpu', delete_weights: bool = True): """ 由于动态embedding生成比较耗时,所以可以把每句话embedding缓存下来,这样就不需要每次都运行生成过程。 @@ -29,14 +39,14 @@ class ContextualEmbedding(TokenEmbedding): assert isinstance(dataset, DataSet), "Only fastNLP.DataSet object is allowed." assert 'words' in dataset.get_input_name(), "`words` field has to be set as input." except Exception as e: - print(f"Exception happens at {index} dataset.") + logger.error(f"Exception happens at {index} dataset.") raise e - + sent_embeds = {} _move_model_to_device(self, device=device) device = _get_model_device(self) pad_index = self._word_vocab.padding_idx - print("Start to calculate sentence representations.") + logger.info("Start to calculate sentence representations.") with torch.no_grad(): for index, dataset in enumerate(datasets): try: @@ -51,18 +61,18 @@ class ContextualEmbedding(TokenEmbedding): word_embeds = self(words).detach().cpu().numpy() for b in range(words.size(0)): length = seq_len_from_behind[b] - if length==0: + if length == 0: sent_embeds[tuple(words_list[b][:seq_len[b]])] = word_embeds[b] else: sent_embeds[tuple(words_list[b][:seq_len[b]])] = word_embeds[b, :-length] except Exception as e: - print(f"Exception happens at {index} dataset.") + logger.error(f"Exception happens at {index} dataset.") raise e - print("Finish calculating sentence representations.") + logger.info("Finish calculating sentence representations.") self.sent_embeds = sent_embeds if delete_weights: self._delete_model_weights() - + def _get_sent_reprs(self, words): """ 获取sentence的表示,如果有缓存,则返回缓存的值; 没有缓存则返回None @@ -85,12 +95,12 @@ class ContextualEmbedding(TokenEmbedding): embeds[i, :len(embed)] = torch.FloatTensor(embed).to(words.device) return embeds return None - + @abstractmethod def _delete_model_weights(self): """删除计算表示的模型以节省资源""" raise NotImplementedError - + def remove_sentence_cache(self): """ 删除缓存的句子表示. 删除之后如果模型权重没有被删除,将开始使用动态计算权重。 diff --git a/fastNLP/embeddings/elmo_embedding.py b/fastNLP/embeddings/elmo_embedding.py index af94e8ec..f2d643f7 100644 --- a/fastNLP/embeddings/elmo_embedding.py +++ b/fastNLP/embeddings/elmo_embedding.py @@ -1,26 +1,36 @@ +""" +.. 
todo:: + doc +""" +__all__ = [ + "ElmoEmbedding" +] + +import codecs +import json import os import torch import torch.nn as nn import torch.nn.functional as F -import json -import codecs +from .contextual_embedding import ContextualEmbedding +from ..core import logger from ..core.vocabulary import Vocabulary -from ..io.file_utils import cached_path, _get_base_url, PRETRAINED_ELMO_MODEL_DIR +from ..io.file_utils import cached_path, _get_embedding_url, PRETRAINED_ELMO_MODEL_DIR from ..modules.encoder._elmo import ElmobiLm, ConvTokenEmbedder -from .contextual_embedding import ContextualEmbedding class ElmoEmbedding(ContextualEmbedding): """ - 别名::class:`fastNLP.embeddings.ElmoEmbedding` :class:`fastNLP.embeddings.elmo_embedding.ElmoEmbedding` - 使用ELMo的embedding。初始化之后,只需要传入words就可以得到对应的embedding。当前支持的使用名称初始化的模型有以下的这些(待补充) Example:: + >>> import torch + >>> from fastNLP import Vocabulary + >>> from fastNLP.embeddings import ElmoEmbedding >>> vocab = Vocabulary().add_word_lst("The whether is good .".split()) >>> # 使用不同层的concat的结果 >>> embed = ElmoEmbedding(vocab, model_dir_or_name='en', layers='1,2', requires_grad=False) @@ -33,37 +43,39 @@ class ElmoEmbedding(ContextualEmbedding): >>> embed = ElmoEmbedding(vocab, model_dir_or_name='en', layers='mix', requires_grad=False) >>> embed.set_mix_weights_requires_grad() # 使得weighted的权重是可以学习的,但ELMO的LSTM部分是不更新 - :param vocab: 词表 - :param model_dir_or_name: 可以有两种方式调用预训练好的ELMo embedding:第一种是传入ELMo所在文件夹,该文件夹下面应该有两个文件, - 其中一个是以json为后缀的配置文件,另一个是以pkl为后缀的权重文件;第二种是传入ELMo版本的名称,将自动查看缓存中是否存在该模型, - 没有的话将自动下载并缓存。 - :param layers: str, 指定返回的层数, 以,隔开不同的层。如果要返回第二层的结果'2', 返回后两层的结果'1,2'。不同的层的结果 - 按照这个顺序concat起来,默认为'2'。'mix'会使用可学习的权重结合不同层的表示(权重是否可训练与requires_grad保持一致, - 初始化权重对三层结果进行mean-pooling, 可以通过ElmoEmbedding.set_mix_weights_requires_grad()方法只将mix weights设置为可学习。) - :param requires_grad: bool, 该层是否需要gradient, 默认为False. - :param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk也是一定的regularize。 - :param float dropout: 以多大的概率对embedding的表示进行Dropout。0.1即随机将10%的值置为0。 - :param cache_word_reprs: 可以选择对word的表示进行cache; 设置为True的话,将在初始化的时候为每个word生成对应的embedding, - 并删除character encoder,之后将直接使用cache的embedding。默认为False。 """ - - def __init__(self, vocab: Vocabulary, model_dir_or_name: str = 'en', layers: str = '2', requires_grad: bool = False, + + def __init__(self, vocab: Vocabulary, model_dir_or_name: str = 'en', layers: str = '2', requires_grad: bool = True, word_dropout=0.0, dropout=0.0, cache_word_reprs: bool = False): + """ + + :param vocab: 词表 + :param model_dir_or_name: 可以有两种方式调用预训练好的ELMo embedding:第一种是传入ELMo所在文件夹,该文件夹下面应该有两个文件, + 其中一个是以json为后缀的配置文件,另一个是以pkl为后缀的权重文件;第二种是传入ELMo版本的名称,将自动查看缓存中是否存在该模型, + 没有的话将自动下载并缓存。 + :param layers: str, 指定返回的层数(从0开始), 以,隔开不同的层。如果要返回第二层的结果'2', 返回后两层的结果'1,2'。不同的层的结果 + 按照这个顺序concat起来,默认为'2'。'mix'会使用可学习的权重结合不同层的表示(权重是否可训练与requires_grad保持一致, + 初始化权重对三层结果进行mean-pooling, 可以通过ElmoEmbedding.set_mix_weights_requires_grad()方法只将mix weights设置为可学习。) + :param requires_grad: bool, 该层是否需要gradient, 默认为False. 
+ :param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk也是一定的regularize。 + :param float dropout: 以多大的概率对embedding的表示进行Dropout。0.1即随机将10%的值置为0。 + :param cache_word_reprs: 可以选择对word的表示进行cache; 设置为True的话,将在初始化的时候为每个word生成对应的embedding, + 并删除character encoder,之后将直接使用cache的embedding。默认为False。 + """ super(ElmoEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) - + # 根据model_dir_or_name检查是否存在并下载 if model_dir_or_name.lower() in PRETRAINED_ELMO_MODEL_DIR: - PRETRAIN_URL = _get_base_url('elmo') - model_name = PRETRAINED_ELMO_MODEL_DIR[model_dir_or_name] - model_url = PRETRAIN_URL + model_name - model_dir = cached_path(model_url) + model_url = _get_embedding_url('elmo', model_dir_or_name.lower()) + model_dir = cached_path(model_url, name='embedding') # 检查是否存在 - elif os.path.isdir(os.path.expanduser(os.path.abspath(model_dir_or_name))): + elif os.path.isdir(os.path.abspath(os.path.expanduser(model_dir_or_name))): model_dir = model_dir_or_name else: raise ValueError(f"Cannot recognize {model_dir_or_name}.") self.model = _ElmoModel(model_dir, vocab, cache_word_reprs=cache_word_reprs) - + num_layers = self.model.encoder.num_layers + if layers == 'mix': self.layer_weights = nn.Parameter(torch.zeros(self.model.config['lstm']['n_layers'] + 1), requires_grad=requires_grad) @@ -72,22 +84,22 @@ class ElmoEmbedding(ContextualEmbedding): self._embed_size = self.model.config['lstm']['projection_dim'] * 2 else: layers = list(map(int, layers.split(','))) - assert len(layers) > 0, "Must choose one output" + assert len(layers) > 0, "Must choose at least one output, but got None." for layer in layers: - assert 0 <= layer <= 2, "Layer index should be in range [0, 2]." + assert 0 <= layer <= num_layers, f"Layer index should be in range [0, {num_layers}], but got {layer}." 
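The assert just above validates the requested ELMo layer indices; the selected layers are then concatenated along the feature dimension, which is why the resulting embedding width scales with len(layers) times twice the LSTM projection dim. Below is a minimal, self-contained sketch of that selection-and-concat step; the tensor sizes and the select_and_concat helper are invented for illustration and are not fastNLP's _get_layer_outputs.

import torch

# Illustrative only: picking ELMo layers and concatenating them fixes the
# output width. Shapes and the helper name are assumptions for this example.
def select_and_concat(layer_outputs, layers):
    # layer_outputs: (num_layers, batch, seq_len, hidden), hidden = 2 * projection_dim
    if len(layers) == 1:
        return layer_outputs[layers[0]]
    return torch.cat([layer_outputs[l] for l in layers], dim=-1)

outputs = torch.randn(3, 2, 5, 8)                  # 3 layers, hidden = 8
print(select_and_concat(outputs, [1, 2]).shape)    # torch.Size([2, 5, 16])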
self.layers = layers self._get_outputs = self._get_layer_outputs self._embed_size = len(self.layers) * self.model.config['lstm']['projection_dim'] * 2 - + self.requires_grad = requires_grad - + def _get_mixed_outputs(self, outputs): # outputs: num_layers x batch_size x max_len x hidden_size # return: batch_size x max_len x hidden_size weights = F.softmax(self.layer_weights + 1 / len(outputs), dim=0).to(outputs) outputs = torch.einsum('l,lbij->bij', weights, outputs) return self.gamma.to(outputs) * outputs - + def set_mix_weights_requires_grad(self, flag=True): """ 当初始化ElmoEmbedding时layers被设置为mix时,可以通过调用该方法设置mix weights是否可训练。如果layers不是mix,调用 @@ -99,15 +111,15 @@ class ElmoEmbedding(ContextualEmbedding): if hasattr(self, 'layer_weights'): self.layer_weights.requires_grad = flag self.gamma.requires_grad = flag - + def _get_layer_outputs(self, outputs): if len(self.layers) == 1: outputs = outputs[self.layers[0]] else: outputs = torch.cat(tuple([*outputs[self.layers]]), dim=-1) - + return outputs - + def forward(self, words: torch.LongTensor): """ 计算words的elmo embedding表示。根据elmo文章中介绍的ELMO实际上是有2L+1层结果,但是为了让结果比较容易拆分,token的 @@ -124,33 +136,12 @@ class ElmoEmbedding(ContextualEmbedding): outputs = self.model(words) outputs = self._get_outputs(outputs) return self.dropout(outputs) - + def _delete_model_weights(self): for name in ['layers', 'model', 'layer_weights', 'gamma']: if hasattr(self, name): delattr(self, name) - @property - def requires_grad(self): - """ - Embedding的参数是否允许优化。True: 所有参数运行优化; False: 所有参数不允许优化; None: 部分允许优化、部分不允许 - - :return: - """ - requires_grads = set([param.requires_grad for name, param in self.named_parameters() - if 'words_to_chars_embedding' not in name and 'words_to_words' not in name]) - if len(requires_grads) == 1: - return requires_grads.pop() - else: - return None - - @requires_grad.setter - def requires_grad(self, value): - for name, param in self.named_parameters(): - if 'words_to_chars_embedding' in name or 'words_to_words' in name: # 这个不能加入到requires_grad中 - continue - param.requires_grad = value - class _ElmoModel(nn.Module): """ @@ -161,7 +152,7 @@ class _ElmoModel(nn.Module): (4) 设计一个保存token的embedding,允许缓存word的表示。 """ - + def __init__(self, model_dir: str, vocab: Vocabulary = None, cache_word_reprs: bool = False): super(_ElmoModel, self).__init__() self.model_dir = model_dir @@ -182,18 +173,18 @@ class _ElmoModel(nn.Module): raise Exception(f"Multiple config files(*.json) or weight files(*.hdf5) detected in {model_dir}.") elif config_count == 0 or weight_count == 0: raise Exception(f"No config file or weight file found in {model_dir}") - - config = json.load(open(os.path.join(model_dir, config_file), 'r')) + with open(os.path.join(model_dir, config_file), 'r') as config_f: + config = json.load(config_f) self.weight_file = os.path.join(model_dir, weight_file) self.config = config - + OOV_TAG = '' PAD_TAG = '' BOS_TAG = '' EOS_TAG = '' BOW_TAG = '' EOW_TAG = '' - + # For the model trained with character-based word encoder. char_lexicon = {} with codecs.open(os.path.join(model_dir, 'char.dic'), 'r', encoding='utf-8') as fpi: @@ -203,29 +194,29 @@ class _ElmoModel(nn.Module): tokens.insert(0, '\u3000') token, i = tokens char_lexicon[token] = int(i) - + # 做一些sanity check for special_word in [PAD_TAG, OOV_TAG, BOW_TAG, EOW_TAG]: assert special_word in char_lexicon, f"{special_word} not found in char.dic." 
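The 'mix' branch seen earlier in this file combines all ELMo layers with softmax-normalized learnable weights plus a global gamma, i.e. a scalar mix. The standalone sketch below mirrors the einsum used in _get_mixed_outputs but simplifies it (no 1/len(outputs) offset, placeholder sizes), so it is an illustration rather than a drop-in equivalent.

import torch
import torch.nn as nn
import torch.nn.functional as F

class ScalarMix(nn.Module):
    """Softmax-weighted sum over layer representations, scaled by gamma (sketch)."""
    def __init__(self, num_layers):
        super().__init__()
        self.layer_weights = nn.Parameter(torch.zeros(num_layers))
        self.gamma = nn.Parameter(torch.ones(1))

    def forward(self, layer_outputs):
        # layer_outputs: (num_layers, batch, seq_len, hidden)
        weights = F.softmax(self.layer_weights, dim=0)
        return self.gamma * torch.einsum('l,lbij->bij', weights, layer_outputs)

mix = ScalarMix(num_layers=3)
print(mix(torch.randn(3, 2, 5, 8)).shape)          # torch.Size([2, 5, 8])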
- + # 从vocab中构建char_vocab char_vocab = Vocabulary(unknown=OOV_TAG, padding=PAD_TAG) # 需要保证在里面 char_vocab.add_word_lst([BOW_TAG, EOW_TAG, BOS_TAG, EOS_TAG]) - + for word, index in vocab: char_vocab.add_word_lst(list(word)) - + self.bos_index, self.eos_index, self._pad_index = len(vocab), len(vocab) + 1, vocab.padding_idx # 根据char_lexicon调整, 多设置一位,是预留给word padding的(该位置的char表示为全0表示) char_emb_layer = nn.Embedding(len(char_vocab) + 1, int(config['char_cnn']['embedding']['dim']), padding_idx=len(char_vocab)) - + # 读入预训练权重 这里的elmo_model 包含char_cnn和 lstm 的 state_dict elmo_model = torch.load(os.path.join(self.model_dir, weight_file), map_location='cpu') - + char_embed_weights = elmo_model["char_cnn"]['char_emb_layer.weight'] - + found_char_count = 0 for char, index in char_vocab: # 调整character embedding if char in char_lexicon: @@ -234,15 +225,13 @@ class _ElmoModel(nn.Module): else: index_in_pre = char_lexicon[OOV_TAG] char_emb_layer.weight.data[index] = char_embed_weights[index_in_pre] - - print(f"{found_char_count} out of {len(char_vocab)} characters were found in pretrained elmo embedding.") + + logger.info(f"{found_char_count} out of {len(char_vocab)} characters were found in pretrained elmo embedding.") # 生成words到chars的映射 max_chars = config['char_cnn']['max_characters_per_token'] - - self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab) + 2, max_chars), + self.register_buffer('words_to_chars_embedding', torch.full((len(vocab) + 2, max_chars), fill_value=len(char_vocab), - dtype=torch.long), - requires_grad=False) + dtype=torch.long)) for word, index in list(iter(vocab)) + [(BOS_TAG, len(vocab)), (EOS_TAG, len(vocab) + 1)]: if len(word) + 2 > max_chars: word = word[:max_chars - 2] @@ -257,29 +246,29 @@ class _ElmoModel(nn.Module): char_vocab.to_index(EOW_TAG)] char_ids += [char_vocab.to_index(PAD_TAG)] * (max_chars - len(char_ids)) self.words_to_chars_embedding[index] = torch.LongTensor(char_ids) - + self.char_vocab = char_vocab - + self.token_embedder = ConvTokenEmbedder( config, self.weight_file, None, char_emb_layer) elmo_model["char_cnn"]['char_emb_layer.weight'] = char_emb_layer.weight self.token_embedder.load_state_dict(elmo_model["char_cnn"]) - + self.output_dim = config['lstm']['projection_dim'] - + # lstm encoder self.encoder = ElmobiLm(config) self.encoder.load_state_dict(elmo_model["lstm"]) - + if cache_word_reprs: if config['char_cnn']['embedding']['dim'] > 0: # 只有在使用了chars的情况下有用 - print("Start to generate cache word representations.") + logger.info("Start to generate cache word representations.") batch_size = 320 # bos eos word_size = self.words_to_chars_embedding.size(0) num_batches = word_size // batch_size + \ int(word_size % batch_size != 0) - + self.cached_word_embedding = nn.Embedding(word_size, config['lstm']['projection_dim']) with torch.no_grad(): @@ -290,12 +279,12 @@ class _ElmoModel(nn.Module): word_reprs = self.token_embedder(words.unsqueeze(1), chars).detach() # batch_size x 1 x config['encoder']['projection_dim'] self.cached_word_embedding.weight.data[words] = word_reprs.squeeze(1) - - print("Finish generating cached word representations. Going to delete the character encoder.") + + logger.info("Finish generating cached word representations. 
Going to delete the character encoder.") del self.token_embedder, self.words_to_chars_embedding else: - print("There is no need to cache word representations, since no character information is used.") - + logger.info("There is no need to cache word representations, since no character information is used.") + def forward(self, words): """ @@ -320,7 +309,7 @@ class _ElmoModel(nn.Module): else: chars = None token_embedding = self.token_embedder(expanded_words, chars) # batch_size x max_len x embed_dim - + encoder_output = self.encoder(token_embedding, seq_len) if encoder_output.size(2) < max_len + 2: num_layers, _, output_len, hidden_size = encoder_output.size() @@ -331,7 +320,7 @@ class _ElmoModel(nn.Module): token_embedding = token_embedding.masked_fill(mask, 0) token_embedding = torch.cat((token_embedding, token_embedding), dim=2).view(1, sz[1], sz[2], sz[3]) encoder_output = torch.cat((token_embedding, encoder_output), dim=0) - + # 删除, . 这里没有精确地删除,但应该也不会影响最后的结果了。 encoder_output = encoder_output[:, :, 1:-1] return encoder_output diff --git a/fastNLP/embeddings/embedding.py b/fastNLP/embeddings/embedding.py index 111bacd0..08921f33 100644 --- a/fastNLP/embeddings/embedding.py +++ b/fastNLP/embeddings/embedding.py @@ -3,83 +3,94 @@ """ +__all__ = [ + "Embedding", + "TokenEmbedding" +] -import torch.nn as nn from abc import abstractmethod + import torch +import torch.nn as nn from .utils import get_embeddings class Embedding(nn.Module): """ - 别名::class:`fastNLP.embeddings.Embedding` :class:`fastNLP.embeddings.embedding.Embedding` - 词向量嵌入,支持输入多种方式初始化. 可以通过self.num_embeddings获取词表大小; self.embedding_dim获取embedding的维度. Example:: >>> import numpy as np + >>> from fastNLP.embeddings import Embedding >>> init_embed = (2000, 100) >>> embed = Embedding(init_embed) # 随机初始化一个具有2000个词,每个词表示为100维的词向量 >>> init_embed = np.zeros((2000, 100)) >>> embed = Embedding(init_embed) # 使用numpy.ndarray的值作为初始化值初始化一个Embedding - :param tuple(int,int),torch.FloatTensor,nn.Embedding,numpy.ndarray init_embed: 支持传入Embedding的大小(传入tuple(int, int), - 第一个int为vocab_zie, 第二个int为embed_dim); 或传入Tensor, Embedding, numpy.ndarray等则直接使用该值初始化Embedding; - :param float word_dropout: 按照一定概率随机将word设置为unk_index,这样可以使得unk这个token得到足够的训练, 且会对网络有 - 一定的regularize的作用。设置该值时,必须同时设置unk_index - :param float dropout: 对Embedding的输出的dropout。 - :param int unk_index: drop word时替换为的index。fastNLP的Vocabulary的unk_index默认为1。 """ - + def __init__(self, init_embed, word_dropout=0, dropout=0.0, unk_index=None): - + """ + + :param tuple(int,int),torch.FloatTensor,nn.Embedding,numpy.ndarray init_embed: 支持传入Embedding的大小(传入tuple(int, int), + 第一个int为vocab_zie, 第二个int为embed_dim); 或传入Tensor, Embedding, numpy.ndarray等则直接使用该值初始化Embedding; + :param float word_dropout: 按照一定概率随机将word设置为unk_index,这样可以使得unk这个token得到足够的训练, 且会对网络有 + 一定的regularize的作用。设置该值时,必须同时设置unk_index + :param float dropout: 对Embedding的输出的dropout。 + :param int unk_index: drop word时替换为的index。fastNLP的Vocabulary的unk_index默认为1。 + """ super(Embedding, self).__init__() - + self.embed = get_embeddings(init_embed) self.dropout = nn.Dropout(dropout) if not isinstance(self.embed, TokenEmbedding): - self._embed_size = self.embed.weight.size(1) - if word_dropout>0 and not isinstance(unk_index, int): + if hasattr(self.embed, 'embed_size'): + self._embed_size = self.embed.embed_size + elif hasattr(self.embed, 'embedding_dim'): + self._embed_size = self.embed.embedding_dim + else: + self._embed_size = self.embed.weight.size(1) + if word_dropout > 0 and not isinstance(unk_index, int): raise ValueError("When drop word is set, 
you need to pass in the unk_index.") else: self._embed_size = self.embed.embed_size unk_index = self.embed.get_word_vocab().unknown_idx self.unk_index = unk_index self.word_dropout = word_dropout - + def forward(self, words): """ :param torch.LongTensor words: [batch, seq_len] :return: torch.Tensor : [batch, seq_len, embed_dim] """ - if self.word_dropout>0 and self.training: + if self.word_dropout > 0 and self.training: mask = torch.ones_like(words).float() * self.word_dropout - mask = torch.bernoulli(mask).byte() # dropout_word越大,越多位置为1 + mask = torch.bernoulli(mask).eq(1) # dropout_word越大,越多位置为1 words = words.masked_fill(mask, self.unk_index) words = self.embed(words) return self.dropout(words) - + @property - def num_embedding(self)->int: + def num_embedding(self) -> int: if isinstance(self.embed, nn.Embedding): return self.embed.weight.size(0) else: return self.embed.num_embedding - + def __len__(self): return len(self.embed) - + @property def embed_size(self) -> int: return self._embed_size - + @property def embedding_dim(self) -> int: return self._embed_size - + @property def requires_grad(self): """ @@ -90,14 +101,14 @@ class Embedding(nn.Module): return self.embed.weight.requires_grad else: return self.embed.requires_grad - + @requires_grad.setter def requires_grad(self, value): if not isinstance(self.embed, TokenEmbedding): self.embed.weight.requires_grad = value else: self.embed.requires_grad = value - + @property def size(self): if isinstance(self.embed, TokenEmbedding): @@ -107,6 +118,10 @@ class Embedding(nn.Module): class TokenEmbedding(nn.Module): + """ + fastNLP中各种Embedding的基类 + + """ def __init__(self, vocab, word_dropout=0.0, dropout=0.0): super(TokenEmbedding, self).__init__() if vocab.rebuild: @@ -114,12 +129,12 @@ class TokenEmbedding(nn.Module): assert vocab.padding is not None, "Vocabulary must have a padding entry." self._word_vocab = vocab self._word_pad_index = vocab.padding_idx - if word_dropout>0: + if word_dropout > 0: assert vocab.unknown is not None, "Vocabulary must have unknown entry when you want to drop a word." 
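Word dropout, as documented above, samples a Bernoulli mask at rate word_dropout, spares padding positions, and overwrites the remaining hits with the unknown index. A self-contained sketch of the idea follows; unk_index=1 matches the fastNLP default mentioned in the docstring, pad_index=0 and the toy batch are assumptions for the example.

import torch

def drop_words(words, word_dropout, unk_index=1, pad_index=0):
    # Randomly replace non-pad token ids with unk during training (sketch).
    with torch.no_grad():
        mask = torch.full_like(words, fill_value=word_dropout, dtype=torch.float)
        mask = torch.bernoulli(mask).eq(1)      # True where a token is dropped
        mask = mask & words.ne(pad_index)       # never drop padding positions
    return words.masked_fill(mask, unk_index)

words = torch.tensor([[2, 5, 7, 0, 0]])          # 0 is padding in this toy batch
print(drop_words(words, word_dropout=0.5))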
self.word_dropout = word_dropout self._word_unk_index = vocab.unknown_idx self.dropout_layer = nn.Dropout(dropout) - + def drop_word(self, words): """ 按照设定随机将words设置为unknown_index。 @@ -128,11 +143,13 @@ class TokenEmbedding(nn.Module): :return: """ if self.word_dropout > 0 and self.training: - mask = torch.ones_like(words).float() * self.word_dropout - mask = torch.bernoulli(mask).byte() # dropout_word越大,越多位置为1 + mask = torch.full_like(words, fill_value=self.word_dropout, dtype=torch.float, device=words.device) + mask = torch.bernoulli(mask).eq(1) # dropout_word越大,越多位置为1 + pad_mask = words.ne(self._word_pad_index) + mask = mask.__and__(pad_mask) words = words.masked_fill(mask, self._word_unk_index) return words - + def dropout(self, words): """ 对embedding后的word表示进行drop。 @@ -141,7 +158,7 @@ class TokenEmbedding(nn.Module): :return: """ return self.dropout_layer(words) - + @property def requires_grad(self): """ @@ -153,23 +170,23 @@ class TokenEmbedding(nn.Module): return requires_grads.pop() else: return None - + @requires_grad.setter def requires_grad(self, value): for param in self.parameters(): param.requires_grad = value - + def __len__(self): return len(self._word_vocab) - + @property def embed_size(self) -> int: return self._embed_size - + @property def embedding_dim(self) -> int: return self._embed_size - + @property def num_embedding(self) -> int: """ @@ -177,7 +194,7 @@ class TokenEmbedding(nn.Module): :return: """ return len(self._word_vocab) - + def get_word_vocab(self): """ 返回embedding的词典。 @@ -185,11 +202,11 @@ class TokenEmbedding(nn.Module): :return: Vocabulary """ return self._word_vocab - + @property def size(self): return torch.Size(self.num_embedding, self._embed_size) - + @abstractmethod def forward(self, words): raise NotImplementedError diff --git a/fastNLP/embeddings/stack_embedding.py b/fastNLP/embeddings/stack_embedding.py index 8091d598..21a06b5f 100644 --- a/fastNLP/embeddings/stack_embedding.py +++ b/fastNLP/embeddings/stack_embedding.py @@ -1,3 +1,12 @@ +""" +.. 
todo:: + doc +""" + +__all__ = [ + "StackEmbedding", +] + from typing import List import torch @@ -8,25 +17,27 @@ from .embedding import TokenEmbedding class StackEmbedding(TokenEmbedding): """ - 别名::class:`fastNLP.embeddings.StackEmbedding` :class:`fastNLP.embeddings.stack_embedding.StackEmbedding` - 支持将多个embedding集合成一个embedding。 Example:: >>> from fastNLP import Vocabulary - >>> from fastNLP.embeddings import StaticEmbedding + >>> from fastNLP.embeddings import StaticEmbedding, StackEmbedding >>> vocab = Vocabulary().add_word_lst("The whether is good .".split()) - >>> embed_1 = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50', requires_grad=True) + >>> embed_1 = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d', requires_grad=True) >>> embed_2 = StaticEmbedding(vocab, model_dir_or_name='en-word2vec-300', requires_grad=True) - - :param embeds: 一个由若干个TokenEmbedding组成的list,要求每一个TokenEmbedding的词表都保持一致 - :param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk也是一定的regularize。不同embedidng会在相同的位置 - 被设置为unknown。如果这里设置了dropout,则组成的embedding就不要再设置dropout了。 - :param float dropout: 以多大的概率对embedding的表示进行Dropout。0.1即随机将10%的值置为0。 + >>> embed = StackEmbedding([embed_1, embed_2]) """ + def __init__(self, embeds: List[TokenEmbedding], word_dropout=0, dropout=0): + """ + + :param embeds: 一个由若干个TokenEmbedding组成的list,要求每一个TokenEmbedding的词表都保持一致 + :param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk也是一定的regularize。不同embedidng会在相同的位置 + 被设置为unknown。如果这里设置了dropout,则组成的embedding就不要再设置dropout了。 + :param float dropout: 以多大的概率对embedding的表示进行Dropout。0.1即随机将10%的值置为0。 + """ vocabs = [] for embed in embeds: if hasattr(embed, 'get_word_vocab'): @@ -34,14 +45,14 @@ class StackEmbedding(TokenEmbedding): _vocab = vocabs[0] for vocab in vocabs[1:]: assert vocab == _vocab, "All embeddings in StackEmbedding should use the same word vocabulary." - + super(StackEmbedding, self).__init__(_vocab, word_dropout=word_dropout, dropout=dropout) assert isinstance(embeds, list) for embed in embeds: assert isinstance(embed, TokenEmbedding), "Only TokenEmbedding type is supported." 
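At its core StackEmbedding concatenates the per-word outputs of several embeddings built over the same vocabulary, so embed_size is the sum of the parts (and append/pop adjust it accordingly). A toy shape check, with plain nn.Embedding standing in for TokenEmbedding instances and made-up sizes:

import torch
import torch.nn as nn

# Two embeddings over the same (toy) vocabulary, concatenated per word.
embed_a = nn.Embedding(10, 50)
embed_b = nn.Embedding(10, 300)

words = torch.randint(0, 10, (2, 7))                          # batch_size x max_len
stacked = torch.cat([embed_a(words), embed_b(words)], dim=-1)
print(stacked.shape)                                          # torch.Size([2, 7, 350])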
self.embeds = nn.ModuleList(embeds) self._embed_size = sum([embed.embed_size for embed in self.embeds]) - + def append(self, embed: TokenEmbedding): """ 添加一个embedding到结尾。 @@ -49,36 +60,27 @@ class StackEmbedding(TokenEmbedding): :return: """ assert isinstance(embed, TokenEmbedding) + self._embed_size += embed.embed_size self.embeds.append(embed) - + return self + def pop(self): """ 弹出最后一个embed :return: """ - return self.embeds.pop() - + embed = self.embeds.pop() + self._embed_size -= embed.embed_size + return embed + @property def embed_size(self): - return self._embed_size - - @property - def requires_grad(self): """ - Embedding的参数是否允许优化。True: 所有参数运行优化; False: 所有参数不允许优化; None: 部分允许优化、部分不允许 + 该Embedding输出的vector的最后一维的维度。 :return: """ - requires_grads = set([embed.requires_grad for embed in self.embeds()]) - if len(requires_grads) == 1: - return requires_grads.pop() - else: - return None - - @requires_grad.setter - def requires_grad(self, value): - for embed in self.embeds(): - embed.requires_grad = value - + return self._embed_size + def forward(self, words): """ 得到多个embedding的结果,并把结果按照顺序concat起来。 @@ -91,4 +93,4 @@ class StackEmbedding(TokenEmbedding): for embed in self.embeds: outputs.append(embed(words)) outputs = self.dropout(torch.cat(outputs, dim=-1)) - return outputs \ No newline at end of file + return outputs diff --git a/fastNLP/embeddings/static_embedding.py b/fastNLP/embeddings/static_embedding.py index 94f7adb5..f519e705 100644 --- a/fastNLP/embeddings/static_embedding.py +++ b/fastNLP/embeddings/static_embedding.py @@ -1,31 +1,42 @@ +""" +.. todo:: + doc +""" +__all__ = [ + "StaticEmbedding" +] import os +import warnings +from collections import defaultdict +from copy import deepcopy +import numpy as np import torch import torch.nn as nn -import numpy as np -import warnings -from ..core.vocabulary import Vocabulary -from ..io.file_utils import PRETRAIN_STATIC_FILES, _get_base_url, cached_path from .embedding import TokenEmbedding +from ..core import logger +from ..core.vocabulary import Vocabulary +from ..io.file_utils import PRETRAIN_STATIC_FILES, _get_embedding_url, cached_path from ..modules.utils import _get_file_name_base_on_postfix + class StaticEmbedding(TokenEmbedding): """ - 别名::class:`fastNLP.embeddings.StaticEmbedding` :class:`fastNLP.embeddings.static_embedding.StaticEmbedding` - StaticEmbedding组件. 给定预训练embedding的名称或路径,根据vocab从embedding中抽取相应的数据(只会将出现在vocab中的词抽取出来, 如果没有找到,则会随机初始化一个值(但如果该word是被标记为no_create_entry的话,则不会单独创建一个值,而是会被指向unk的index))。 当前支持自动下载的预训练vector有以下的几种(待补充); Example:: - + + >>> from fastNLP import Vocabulary + >>> from fastNLP.embeddings import StaticEmbedding >>> vocab = Vocabulary().add_word_lst("The whether is good .".split()) - >>> embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-50') + >>> embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-50d') >>> vocab = Vocabulary().add_word_lst(["The", 'the', "THE"]) - >>> embed = StaticEmbedding(vocab, model_dir_or_name="en-glove-50", lower=True) + >>> embed = StaticEmbedding(vocab, model_dir_or_name="en-glove-50d", lower=True) >>> # "the", "The", "THE"它们共用一个vector,且将使用"the"在预训练词表中寻找它们的初始化表示。 >>> vocab = Vocabulary().add_word_lst(["The", "the", "THE"]) @@ -37,89 +48,129 @@ class StaticEmbedding(TokenEmbedding): [ 0.5773, 0.7251, -0.3104, 0.0777, 0.4849]]], grad_fn=) # 每种word的输出是一致的。 - :param vocab: Vocabulary. 
若该项为None则会读取所有的embedding。 - :param model_dir_or_name: 可以有两种方式调用预训练好的static embedding:第一种是传入embedding文件夹(文件夹下应该只有一个 - 以.txt作为后缀的文件)或文件路径;第二种是传入embedding的名称,第二种情况将自动查看缓存中是否存在该模型,没有的话将自动下载。 - 如果输入为None则使用embedding_dim的维度随机初始化一个embedding。 - :param int embedding_dim: 随机初始化的embedding的维度,仅在model_dir_or_name为None时有效。 - :param bool requires_grad: 是否需要gradient. 默认为True - :param callable init_method: 如何初始化没有找到的值。可以使用torch.nn.init.*中各种方法。调用该方法时传入一个tensor对象。 - :param bool lower: 是否将vocab中的词语小写后再和预训练的词表进行匹配。如果你的词表中包含大写的词语,或者就是需要单独 - 为大写的词语开辟一个vector表示,则将lower设置为False。 - :param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk也是一定的regularize。 - :param float dropout: 以多大的概率对embedding的表示进行Dropout。0.1即随机将10%的值置为0。 - :param bool normalize: 是否对vector进行normalize,使得每个vector的norm为1。 """ - def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en', embedding_dim=100, requires_grad: bool=True, - init_method=None, lower=False, dropout=0, word_dropout=0, normalize=False): + + def __init__(self, vocab: Vocabulary, model_dir_or_name: str = 'en', embedding_dim=-1, requires_grad: bool = True, + init_method=None, lower=False, dropout=0, word_dropout=0, normalize=False, min_freq=1, **kwargs): + """ + + :param vocab: Vocabulary. 若该项为None则会读取所有的embedding。 + :param model_dir_or_name: 可以有两种方式调用预训练好的static embedding:第一种是传入embedding文件夹(文件夹下应该只有一个 + 以.txt作为后缀的文件)或文件路径;第二种是传入embedding的名称,第二种情况将自动查看缓存中是否存在该模型,没有的话将自动下载。 + 如果输入为None则使用embedding_dim的维度随机初始化一个embedding。 + :param int embedding_dim: 随机初始化的embedding的维度,当该值为大于0的值时,将忽略model_dir_or_name。 + :param bool requires_grad: 是否需要gradient. 默认为True + :param callable init_method: 如何初始化没有找到的值。可以使用torch.nn.init.*中各种方法, 传入的方法应该接受一个tensor,并 + inplace地修改其值。 + :param bool lower: 是否将vocab中的词语小写后再和预训练的词表进行匹配。如果你的词表中包含大写的词语,或者就是需要单独 + 为大写的词语开辟一个vector表示,则将lower设置为False。 + :param float dropout: 以多大的概率对embedding的表示进行Dropout。0.1即随机将10%的值置为0。 + :param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk也是一定的regularize。 + :param bool normalize: 是否对vector进行normalize,使得每个vector的norm为1。 + :param int min_freq: Vocabulary词频数小于这个数量的word将被指向unk。 + :param dict kwarngs: only_train_min_freq, 仅对train中的词语使用min_freq筛选; only_norm_found_vector是否仅对在预训练中找到的词语使用normalize。 + """ super(StaticEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) - + if embedding_dim > 0: + model_dir_or_name = None + # 得到cache_path if model_dir_or_name is None: - assert embedding_dim>=1, "The dimension of embedding should be larger than 1." + assert embedding_dim >= 1, "The dimension of embedding should be larger than 1." 
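The lower=True behaviour described in the parameter list above boils down to an index remap: embedding rows are built for a lowered vocabulary, and a words_to_words LongTensor sends every original word index to its lowered row before the nn.Embedding lookup, so "The"/"the"/"THE" share one vector. The sketch below shows only that remapping idea with a hand-built toy vocabulary; it is not StaticEmbedding's actual construction code.

import torch
import torch.nn as nn

original = ['<pad>', '<unk>', 'The', 'the', 'THE', 'good']    # toy source vocab
lowered = ['<pad>', '<unk>', 'the', 'good']                   # rows that get vectors
lowered_index = {w: i for i, w in enumerate(lowered)}

# words_to_words[i] = embedding row for original word i (unk row 1 as fallback)
words_to_words = torch.tensor(
    [lowered_index.get(w, lowered_index.get(w.lower(), 1)) for w in original],
    dtype=torch.long)

embedding = nn.Embedding(len(lowered), 5, padding_idx=0)
words = torch.tensor([[2, 3, 4, 5]])                          # "The the THE good"
vectors = embedding(words_to_words[words])
print(vectors.shape)                                          # torch.Size([1, 4, 5])
print(torch.allclose(vectors[0, 0], vectors[0, 1]))           # True: shared "the" row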
embedding_dim = int(embedding_dim) model_path = None elif model_dir_or_name.lower() in PRETRAIN_STATIC_FILES: - PRETRAIN_URL = _get_base_url('static') - model_name = PRETRAIN_STATIC_FILES[model_dir_or_name] - model_url = PRETRAIN_URL + model_name - model_path = cached_path(model_url) + model_url = _get_embedding_url('static', model_dir_or_name.lower()) + model_path = cached_path(model_url, name='embedding') # 检查是否存在 - elif os.path.isfile(os.path.expanduser(os.path.abspath(model_dir_or_name))): - model_path = model_dir_or_name - elif os.path.isdir(os.path.expanduser(os.path.abspath(model_dir_or_name))): - model_path = _get_file_name_base_on_postfix(model_dir_or_name, '.txt') + elif os.path.isfile(os.path.abspath(os.path.expanduser(model_dir_or_name))): + model_path = os.path.abspath(os.path.expanduser(model_dir_or_name)) + elif os.path.isdir(os.path.abspath(os.path.expanduser(model_dir_or_name))): + model_path = _get_file_name_base_on_postfix(os.path.abspath(os.path.expanduser(model_dir_or_name)), '.txt') else: raise ValueError(f"Cannot recognize {model_dir_or_name}.") - + + # 根据min_freq缩小vocab + truncate_vocab = (vocab.min_freq is None and min_freq > 1) or (vocab.min_freq and vocab.min_freq < min_freq) + if truncate_vocab: + truncated_vocab = deepcopy(vocab) + truncated_vocab.min_freq = min_freq + truncated_vocab.word2idx = None + if lower: # 如果有lower,将大小写的的freq需要同时考虑到 + lowered_word_count = defaultdict(int) + for word, count in truncated_vocab.word_count.items(): + lowered_word_count[word.lower()] += count + for word in truncated_vocab.word_count.keys(): + word_count = truncated_vocab.word_count[word] + if lowered_word_count[word.lower()] >= min_freq and word_count < min_freq: + truncated_vocab.add_word_lst([word] * (min_freq - word_count), + no_create_entry=truncated_vocab._is_word_no_create_entry(word)) + + # 只限制在train里面的词语使用min_freq筛选 + if kwargs.get('only_train_min_freq', False) and model_dir_or_name is not None: + for word in truncated_vocab.word_count.keys(): + if truncated_vocab._is_word_no_create_entry(word) and truncated_vocab.word_count[word] < min_freq: + truncated_vocab.add_word_lst([word] * (min_freq - truncated_vocab.word_count[word]), + no_create_entry=True) + truncated_vocab.build_vocab() + truncated_words_to_words = torch.arange(len(vocab)).long() + for word, index in vocab: + truncated_words_to_words[index] = truncated_vocab.to_index(word) + logger.info(f"{len(vocab) - len(truncated_vocab)} out of {len(vocab)} words have frequency less than {min_freq}.") + vocab = truncated_vocab + + self.only_norm_found_vector = kwargs.get('only_norm_found_vector', False) # 读取embedding if lower: lowered_vocab = Vocabulary(padding=vocab.padding, unknown=vocab.unknown) for word, index in vocab: - if not vocab._is_word_no_create_entry(word): + if vocab._is_word_no_create_entry(word): + lowered_vocab.add_word(word.lower(), no_create_entry=True) + else: lowered_vocab.add_word(word.lower()) # 先加入需要创建entry的 - for word in vocab._no_create_word.keys(): # 不需要创建entry的 - if word in vocab: - lowered_word = word.lower() - if lowered_word not in lowered_vocab.word_count: - lowered_vocab.add_word(lowered_word) - lowered_vocab._no_create_word[lowered_word] += 1 - print(f"All word in vocab have been lowered. There are {len(vocab)} words, {len(lowered_vocab)} unique lowered " - f"words.") + logger.info(f"All word in the vocab have been lowered. 
There are {len(vocab)} words, {len(lowered_vocab)} " + f"unique lowered words.") if model_path: embedding = self._load_with_vocab(model_path, vocab=lowered_vocab, init_method=init_method) else: embedding = self._randomly_init_embed(len(vocab), embedding_dim, init_method) - # 需要适配一下 - if not hasattr(self, 'words_to_words'): - self.words_to_words = torch.arange(len(lowered_vocab, )).long() + self.register_buffer('words_to_words', torch.arange(len(vocab)).long()) if lowered_vocab.unknown: unknown_idx = lowered_vocab.unknown_idx else: unknown_idx = embedding.size(0) - 1 # 否则是最后一个为unknow - words_to_words = nn.Parameter(torch.full((len(vocab),), fill_value=unknown_idx).long(), - requires_grad=False) + self.register_buffer('words_to_words', torch.arange(len(vocab)).long()) + words_to_words = torch.full((len(vocab),), fill_value=unknown_idx).long() for word, index in vocab: if word not in lowered_vocab: word = word.lower() - if lowered_vocab._is_word_no_create_entry(word): # 如果不需要创建entry,已经默认unknown了 - continue + if word not in lowered_vocab and lowered_vocab._is_word_no_create_entry(word): + continue # 如果不需要创建entry,已经默认unknown了 words_to_words[index] = self.words_to_words[lowered_vocab.to_index(word)] - self.words_to_words = words_to_words + self.register_buffer('words_to_words', words_to_words) + self._word_unk_index = lowered_vocab.unknown_idx # 替换一下unknown的index else: if model_path: embedding = self._load_with_vocab(model_path, vocab=vocab, init_method=init_method) else: embedding = self._randomly_init_embed(len(vocab), embedding_dim, init_method) - if normalize: + self.register_buffer('words_to_words', torch.arange(len(vocab)).long()) + if not self.only_norm_found_vector and normalize: embedding /= (torch.norm(embedding, dim=1, keepdim=True) + 1e-12) + + if truncate_vocab: + for i in range(len(truncated_words_to_words)): + index_in_truncated_vocab = truncated_words_to_words[i] + truncated_words_to_words[i] = self.words_to_words[index_in_truncated_vocab] + del self.words_to_words + self.register_buffer('words_to_words', truncated_words_to_words) self.embedding = nn.Embedding(num_embeddings=embedding.shape[0], embedding_dim=embedding.shape[1], padding_idx=vocab.padding_idx, max_norm=None, norm_type=2, scale_grad_by_freq=False, sparse=False, _weight=embedding) self._embed_size = self.embedding.weight.size(1) self.requires_grad = requires_grad - + def _randomly_init_embed(self, num_embedding, embedding_dim, init_embed=None): """ @@ -129,35 +180,14 @@ class StaticEmbedding(TokenEmbedding): :return: torch.FloatTensor """ embed = torch.zeros(num_embedding, embedding_dim) - + if init_embed is None: - nn.init.uniform_(embed, -np.sqrt(3/embedding_dim), np.sqrt(3/embedding_dim)) + nn.init.uniform_(embed, -np.sqrt(3 / embedding_dim), np.sqrt(3 / embedding_dim)) else: init_embed(embed) - - return embed - - @property - def requires_grad(self): - """ - Embedding的参数是否允许优化。True: 所有参数运行优化; False: 所有参数不允许优化; None: 部分允许优化、部分不允许 - :return: - """ - requires_grads = set([param.requires_grad for name, param in self.named_parameters() - if 'words_to_words' not in name]) - if len(requires_grads) == 1: - return requires_grads.pop() - else: - return None - - @requires_grad.setter - def requires_grad(self, value): - for name, param in self.named_parameters(): - if 'words_to_words' in name: - continue - param.requires_grad = value - + return embed + def _load_with_vocab(self, embed_filepath, vocab, dtype=np.float32, padding='', unknown='', error='ignore', init_method=None): """ @@ -189,7 +219,12 @@ class 
StaticEmbedding(TokenEmbedding): dim = len(parts) - 1 f.seek(0) matrix = {} + if vocab.padding: + matrix[vocab.padding_idx] = torch.zeros(dim) + if vocab.unknown: + matrix[vocab.unknown_idx] = torch.zeros(dim) found_count = 0 + found_unknown = False for idx, line in enumerate(f, start_idx): try: parts = line.strip().split() @@ -200,46 +235,42 @@ class StaticEmbedding(TokenEmbedding): word = vocab.padding elif word == unknown and vocab.unknown is not None: word = vocab.unknown + found_unknown = True if word in vocab: index = vocab.to_index(word) matrix[index] = torch.from_numpy(np.fromstring(' '.join(nums), sep=' ', dtype=dtype, count=dim)) + if self.only_norm_found_vector: + matrix[index] = matrix[index] / np.linalg.norm(matrix[index]) found_count += 1 except Exception as e: if error == 'ignore': warnings.warn("Error occurred at the {} line.".format(idx)) else: - print("Error occurred at the {} line.".format(idx)) + logger.error("Error occurred at the {} line.".format(idx)) raise e - print("Found {} out of {} words in the pre-training embedding.".format(found_count, len(vocab))) + logger.info("Found {} out of {} words in the pre-training embedding.".format(found_count, len(vocab))) for word, index in vocab: if index not in matrix and not vocab._is_word_no_create_entry(word): - if vocab.unknown_idx in matrix: # 如果有unkonwn,用unknown初始化 + if found_unknown: # 如果有unkonwn,用unknown初始化 matrix[index] = matrix[vocab.unknown_idx] else: matrix[index] = None - + # matrix中代表是需要建立entry的词 vectors = self._randomly_init_embed(len(matrix), dim, init_method) - - if vocab._no_create_word_length>0: - if vocab.unknown is None: # 创建一个专门的unknown - unknown_idx = len(matrix) - vectors = torch.cat((vectors, torch.zeros(1, dim)), dim=0).contiguous() - else: - unknown_idx = vocab.unknown_idx - words_to_words = nn.Parameter(torch.full((len(vocab),), fill_value=unknown_idx).long(), - requires_grad=False) - for order, (index, vec) in enumerate(matrix.items()): - if vec is not None: - vectors[order] = vec - words_to_words[index] = order - self.words_to_words = words_to_words + + if vocab.unknown is None: # 创建一个专门的unknown + unknown_idx = len(matrix) + vectors = torch.cat((vectors, torch.zeros(1, dim)), dim=0).contiguous() else: - for index, vec in matrix.items(): - if vec is not None: - vectors[index] = vec - + unknown_idx = vocab.unknown_idx + self.register_buffer('words_to_words', torch.full((len(vocab), ), fill_value=unknown_idx).long()) + for index, (index_in_vocab, vec) in enumerate(matrix.items()): + if vec is not None: + vectors[index] = vec + self.words_to_words[index_in_vocab] = index + return vectors - + def forward(self, words): """ 传入words的index diff --git a/fastNLP/embeddings/utils.py b/fastNLP/embeddings/utils.py index b79f563c..942f8b02 100644 --- a/fastNLP/embeddings/utils.py +++ b/fastNLP/embeddings/utils.py @@ -1,24 +1,33 @@ +""" +.. todo:: + doc +""" import numpy as np import torch from torch import nn as nn from ..core.vocabulary import Vocabulary -__all__ = ['get_embeddings'] +__all__ = [ + 'get_embeddings' +] -def _construct_char_vocab_from_vocab(vocab:Vocabulary, min_freq:int=1): +def _construct_char_vocab_from_vocab(vocab: Vocabulary, min_freq: int = 1, include_word_start_end=True): """ 给定一个word的vocabulary生成character的vocabulary. 
:param vocab: 从vocab :param min_freq: + :param include_word_start_end: 是否需要包含特殊的 :return: """ char_vocab = Vocabulary(min_freq=min_freq) for word, index in vocab: if not vocab._is_word_no_create_entry(word): char_vocab.add_word_lst(list(word)) + if include_word_start_end: + char_vocab.add_word_lst(['', '']) return char_vocab @@ -31,13 +40,13 @@ def get_embeddings(init_embed): :param init_embed: 可以是 tuple:(num_embedings, embedding_dim), 即embedding的大小和每个词的维度;也可以传入 nn.Embedding 对象, 此时就以传入的对象作为embedding; 传入np.ndarray也行,将使用传入的ndarray作为作为Embedding初始化; 传入torch.Tensor, 将使用传入的值作为Embedding初始化。 - :return nn.Embedding embeddings: + :return nn.Embedding: embeddings """ if isinstance(init_embed, tuple): res = nn.Embedding( num_embeddings=init_embed[0], embedding_dim=init_embed[1]) - nn.init.uniform_(res.weight.data, a=-np.sqrt(3/res.weight.data.size(1)), - b=np.sqrt(3/res.weight.data.size(1))) + nn.init.uniform_(res.weight.data, a=-np.sqrt(3 / res.weight.data.size(1)), + b=np.sqrt(3 / res.weight.data.size(1))) elif isinstance(init_embed, nn.Module): res = init_embed elif isinstance(init_embed, torch.Tensor): @@ -48,4 +57,4 @@ def get_embeddings(init_embed): else: raise TypeError( 'invalid init_embed type: {}'.format((type(init_embed)))) - return res \ No newline at end of file + return res diff --git a/fastNLP/io/__init__.py b/fastNLP/io/__init__.py index cd0d3527..377597ea 100644 --- a/fastNLP/io/__init__.py +++ b/fastNLP/io/__init__.py @@ -1,47 +1,134 @@ """ 用于IO的模块, 具体包括: -1. 用于读入 embedding 的 :doc:`EmbedLoader ` 类, +1. 用于读入 embedding 的 :mod:`EmbedLoader ` 类, -2. 用于读入不同格式数据的 :doc:`DataSetLoader ` 类 +2. 用于读入不同格式数据的 :mod:`Loader ` 类 -3. 用于读入不同数据集并进行预处理的 :doc:`DataLoader ` 类 +3. 用于处理读入数据的 :mod:`Pipe ` 类 -4. 用于保存和载入模型的类, 参考 :doc:`model_io文档` +4. 用于保存和载入模型的类, 参考 :mod:`model_io模块 ` 这些类的使用方法如下: """ __all__ = [ + 'DataBundle', + 'EmbedLoader', + + 'Loader', + + 'YelpLoader', + 'YelpFullLoader', + 'YelpPolarityLoader', + 'IMDBLoader', + 'SSTLoader', + 'SST2Loader', + "ChnSentiCorpLoader", + "THUCNewsLoader", + "WeiboSenti100kLoader", + + 'ConllLoader', + 'Conll2003Loader', + 'Conll2003NERLoader', + 'OntoNotesNERLoader', + 'CTBLoader', + "MsraNERLoader", + "WeiboNERLoader", + "PeopleDailyNERLoader", 'CSVLoader', 'JsonLoader', - 'DataBundle', - 'DataSetLoader', + 'CWSLoader', - 'ConllLoader', - 'Conll2003Loader', - 'IMDBLoader', - 'MatchingLoader', - 'SNLILoader', 'MNLILoader', - 'MTL16Loader', - 'PeopleDailyCorpusLoader', - 'QNLILoader', - 'QuoraLoader', - 'RTELoader', - 'SSTLoader', - 'SST2Loader', - 'YelpLoader', + "QuoraLoader", + "SNLILoader", + "QNLILoader", + "RTELoader", + "CNXNLILoader", + "BQCorpusLoader", + "LCQMCLoader", + + "CMRC2018Loader", + + "Pipe", + + "YelpFullPipe", + "YelpPolarityPipe", + "SSTPipe", + "SST2Pipe", + "IMDBPipe", + "ChnSentiCorpPipe", + "THUCNewsPipe", + "WeiboSenti100kPipe", + + "Conll2003Pipe", + "Conll2003NERPipe", + "OntoNotesNERPipe", + "MsraNERPipe", + "PeopleDailyPipe", + "WeiboNERPipe", + + "CWSPipe", + + "Pipe", + "CWSPipe", + + "YelpFullPipe", + "YelpPolarityPipe", + "SSTPipe", + "SST2Pipe", + "IMDBPipe", + "ChnSentiCorpPipe", + "THUCNewsPipe", + "WeiboSenti100kPipe", + + "Conll2003NERPipe", + "OntoNotesNERPipe", + "MsraNERPipe", + "WeiboNERPipe", + "PeopleDailyPipe", + "Conll2003Pipe", + + "MatchingBertPipe", + "RTEBertPipe", + "SNLIBertPipe", + "QuoraBertPipe", + "QNLIBertPipe", + "MNLIBertPipe", + "CNXNLIBertPipe", + "BQCorpusBertPipe", + "LCQMCBertPipe", + "MatchingPipe", + "RTEPipe", + "SNLIPipe", + "QuoraPipe", + "QNLIPipe", + "MNLIPipe", + "LCQMCPipe", + 
"CNXNLIPipe", + "BQCorpusPipe", + "RenamePipe", + "GranularizePipe", + "MachingTruncatePipe", + + "CMRC2018BertPipe", + 'ModelLoader', 'ModelSaver', + ] +import sys + +from .data_bundle import DataBundle from .embed_loader import EmbedLoader -from .base_loader import DataBundle, DataSetLoader -from .dataset_loader import CSVLoader, JsonLoader +from .loader import * from .model_io import ModelLoader, ModelSaver +from .pipe import * +from ..doc_utils import doc_process -from .data_loader import * +doc_process(sys.modules[__name__]) \ No newline at end of file diff --git a/fastNLP/io/base_loader.py b/fastNLP/io/base_loader.py deleted file mode 100644 index 5d61c16a..00000000 --- a/fastNLP/io/base_loader.py +++ /dev/null @@ -1,220 +0,0 @@ -__all__ = [ - "BaseLoader", - 'DataBundle', - 'DataSetLoader', -] - -import _pickle as pickle -import os -from typing import Union, Dict -import os -from ..core.dataset import DataSet - - -class BaseLoader(object): - """ - 各个 Loader 的基类,提供了 API 的参考。 - - """ - - def __init__(self): - super(BaseLoader, self).__init__() - - @staticmethod - def load_lines(data_path): - """ - 按行读取,舍弃每行两侧空白字符,返回list of str - - :param data_path: 读取数据的路径 - """ - with open(data_path, "r", encoding="utf=8") as f: - text = f.readlines() - return [line.strip() for line in text] - - @classmethod - def load(cls, data_path): - """ - 先按行读取,去除一行两侧空白,再提取每行的字符。返回list of list of str - - :param data_path: - """ - with open(data_path, "r", encoding="utf-8") as f: - text = f.readlines() - return [[word for word in sent.strip()] for sent in text] - - @classmethod - def load_with_cache(cls, data_path, cache_path): - """缓存版的load - """ - if os.path.isfile(cache_path) and os.path.getmtime(data_path) < os.path.getmtime(cache_path): - with open(cache_path, 'rb') as f: - return pickle.load(f) - else: - obj = cls.load(data_path) - with open(cache_path, 'wb') as f: - pickle.dump(obj, f) - return obj - - -def _download_from_url(url, path): - try: - from tqdm.auto import tqdm - except: - from ..core.utils import _pseudo_tqdm as tqdm - import requests - - """Download file""" - r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, stream=True) - chunk_size = 16 * 1024 - total_size = int(r.headers.get('Content-length', 0)) - with open(path, "wb") as file, \ - tqdm(total=total_size, unit='B', unit_scale=1, desc=path.split('/')[-1]) as t: - for chunk in r.iter_content(chunk_size): - if chunk: - file.write(chunk) - t.update(len(chunk)) - - -def _uncompress(src, dst): - import zipfile - import gzip - import tarfile - import os - - def unzip(src, dst): - with zipfile.ZipFile(src, 'r') as f: - f.extractall(dst) - - def ungz(src, dst): - with gzip.open(src, 'rb') as f, open(dst, 'wb') as uf: - length = 16 * 1024 # 16KB - buf = f.read(length) - while buf: - uf.write(buf) - buf = f.read(length) - - def untar(src, dst): - with tarfile.open(src, 'r:gz') as f: - f.extractall(dst) - - fn, ext = os.path.splitext(src) - _, ext_2 = os.path.splitext(fn) - if ext == '.zip': - unzip(src, dst) - elif ext == '.gz' and ext_2 != '.tar': - ungz(src, dst) - elif (ext == '.gz' and ext_2 == '.tar') or ext_2 == '.tgz': - untar(src, dst) - else: - raise ValueError('unsupported file {}'.format(src)) - - -class DataBundle: - """ - 经过处理的数据信息,包括一系列数据集(比如:分开的训练集、验证集和测试集)以及各个field对应的vocabulary。 - - :param vocabs: 从名称(字符串)到 :class:`~fastNLP.Vocabulary` 类型的dict - :param datasets: 从名称(字符串)到 :class:`~fastNLP.DataSet` 类型的dict - """ - - def __init__(self, vocabs: dict = None, datasets: dict = None): - self.vocabs = vocabs or {} - self.datasets = 
datasets or {} - - def __repr__(self): - _str = 'In total {} datasets:\n'.format(len(self.datasets)) - for name, dataset in self.datasets.items(): - _str += '\t{} has {} instances.\n'.format(name, len(dataset)) - _str += 'In total {} vocabs:\n'.format(len(self.vocabs)) - for name, vocab in self.vocabs.items(): - _str += '\t{} has {} entries.\n'.format(name, len(vocab)) - return _str - - -class DataSetLoader: - """ - 别名::class:`fastNLP.io.DataSetLoader` :class:`fastNLP.io.dataset_loader.DataSetLoader` - - 定义了各种 DataSetLoader 所需的API 接口,开发者应该继承它实现各种的 DataSetLoader。 - - 开发者至少应该编写如下内容: - - - _load 函数:从一个数据文件中读取数据到一个 :class:`~fastNLP.DataSet` - - load 函数(可以使用基类的方法):从一个或多个数据文件中读取数据到一个或多个 :class:`~fastNLP.DataSet` - - process 函数:一个或多个从数据文件中读取数据,并处理成可以训练的一个或多个 :class:`~fastNLP.DataSet` - - **process 函数中可以 调用load 函数或 _load 函数** - - """ - URL = '' - DATA_DIR = '' - - ROOT_DIR = '.fastnlp/datasets/' - UNCOMPRESS = True - - def _download(self, url: str, pdir: str, uncompress=True) -> str: - """ - - 从 ``url`` 下载数据到 ``path``, 如果 ``uncompress`` 为 ``True`` ,自动解压。 - - :param url: 下载的网站 - :param pdir: 下载到的目录 - :param uncompress: 是否自动解压缩 - :return: 数据的存放路径 - """ - fn = os.path.basename(url) - path = os.path.join(pdir, fn) - """check data exists""" - if not os.path.exists(path): - os.makedirs(pdir, exist_ok=True) - _download_from_url(url, path) - if uncompress: - dst = os.path.join(pdir, 'data') - if not os.path.exists(dst): - _uncompress(path, dst) - return dst - return path - - def download(self): - return self._download( - self.URL, - os.path.join(self.ROOT_DIR, self.DATA_DIR), - uncompress=self.UNCOMPRESS) - - def load(self, paths: Union[str, Dict[str, str]]) -> Union[DataSet, Dict[str, DataSet]]: - """ - 从指定一个或多个路径中的文件中读取数据,返回一个或多个数据集 :class:`~fastNLP.DataSet` 。 - 如果处理多个路径,传入的 dict 中的 key 与返回的 dict 中的 key 保存一致。 - - :param Union[str, Dict[str, str]] paths: 文件路径 - :return: :class:`~fastNLP.DataSet` 类的对象或存储多个 :class:`~fastNLP.DataSet` 的字典 - """ - if isinstance(paths, str): - return self._load(paths) - return {name: self._load(path) for name, path in paths.items()} - - def _load(self, path: str) -> DataSet: - """从指定路径的文件中读取数据,返回 :class:`~fastNLP.DataSet` 类型的对象 - - :param str path: 文件路径 - :return: 一个 :class:`~fastNLP.DataSet` 类型的对象 - """ - raise NotImplementedError - - def process(self, paths: Union[str, Dict[str, str]], **options) -> DataBundle: - """ - 对于特定的任务和数据集,读取并处理数据,返回处理DataInfo类对象或字典。 - - 从指定一个或多个路径中的文件中读取数据,DataInfo对象中可以包含一个或多个数据集 。 - 如果处理多个路径,传入的 dict 的 key 与返回DataInfo中的 dict 中的 key 保存一致。 - - 返回的 :class:`DataBundle` 对象有如下属性: - - - vocabs: 由从数据集中获取的词表组成的字典,每个词表 - - datasets: 一个dict,包含一系列 :class:`~fastNLP.DataSet` 类型的对象。其中 field 的命名参考 :mod:`~fastNLP.core.const` - - :param paths: 原始数据读取的路径 - :param options: 根据不同的任务和数据集,设计自己的参数 - :return: 返回一个 DataBundle - """ - raise NotImplementedError diff --git a/fastNLP/io/config_io.py b/fastNLP/io/config_io.py deleted file mode 100644 index 4acdbb96..00000000 --- a/fastNLP/io/config_io.py +++ /dev/null @@ -1,311 +0,0 @@ -""" -用于读入和处理和保存 config 文件 - .. todo:: - 这个模块中的类可能被抛弃? 
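With DataSetLoader removed, its read-and-preprocess responsibilities are split between the Loader and Pipe families registered in fastNLP/io/__init__.py above. A sketch of the intended workflow, under the assumption that each XxxPipe.process consumes the DataBundle produced by the matching XxxLoader (the file paths are placeholders, as in the DataBundle example further down)::

    from fastNLP.io import IMDBLoader, IMDBPipe

    # the Loader only reads the raw files into a DataBundle of DataSets ...
    data_bundle = IMDBLoader().load({'train': '/path/to/train.txt',
                                     'dev': '/path/to/dev.txt'})

    # ... and the matching Pipe performs the task-specific preprocessing on that
    # bundle (tokenization, vocabulary building, index conversion)
    data_bundle = IMDBPipe().process(data_bundle)
    print(data_bundle)   # summary of the DataSets and Vocabularies it now holds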
-""" -__all__ = [ - "ConfigLoader", - "ConfigSection", - "ConfigSaver" -] - -import configparser -import json -import os - -from .base_loader import BaseLoader - - -class ConfigLoader(BaseLoader): - """ - 别名::class:`fastNLP.io.ConfigLoader` :class:`fastNLP.io.config_io.ConfigLoader` - - 读取配置文件的Loader - - :param str data_path: 配置文件的路径 - - """ - - def __init__(self, data_path=None): - super(ConfigLoader, self).__init__() - if data_path is not None: - self.config = self.parse(super(ConfigLoader, self).load(data_path)) - - @staticmethod - def parse(string): - raise NotImplementedError - - @staticmethod - def load_config(file_path, sections): - """ - 把配置文件的section 存入提供的 ``sections`` 中 - - :param str file_path: 配置文件的路径 - :param dict sections: 符合如下键值对组成的字典 `section_name(string)` : :class:`~fastNLP.io.ConfigSection` - - Example:: - - test_args = ConfigSection() - ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS_test": test_args}) - - """ - assert isinstance(sections, dict) - cfg = configparser.ConfigParser() - if not os.path.exists(file_path): - raise FileNotFoundError("config file {} not found. ".format(file_path)) - cfg.read(file_path) - for s in sections: - attr_list = [i for i in sections[s].__dict__.keys() if - not callable(getattr(sections[s], i)) and not i.startswith("__")] - if s not in cfg: - print('section %s not found in config file' % (s)) - continue - gen_sec = cfg[s] - for attr in gen_sec.keys(): - try: - val = json.loads(gen_sec[attr]) - # print(s, attr, val, type(val)) - if attr in attr_list: - assert type(val) == type(getattr(sections[s], attr)), \ - 'type not match, except %s but got %s' % \ - (type(getattr(sections[s], attr)), type(val)) - """ - if attr in attr_list then check its type and - update its value. - else add a new attr in sections[s] - """ - setattr(sections[s], attr, val) - except Exception as e: - print("cannot load attribute %s in section %s" - % (attr, s)) - pass - - -class ConfigSection(object): - """ - 别名::class:`fastNLP.io.ConfigSection` :class:`fastNLP.io.config_io.ConfigSection` - - ConfigSection是一个存储了一个section中所有键值对的数据结构,推荐使用此类的实例来配合 :meth:`ConfigLoader.load_config` 使用 - - """ - - def __init__(self): - super(ConfigSection, self).__init__() - - def __getitem__(self, key): - """ - :param key: str, the name of the attribute - :return attr: the value of this attribute - if key not in self.__dict__.keys(): - return self[key] - else: - raise AttributeError - """ - if key in self.__dict__.keys(): - return getattr(self, key) - raise AttributeError("do NOT have attribute %s" % key) - - def __setitem__(self, key, value): - """ - :param key: str, the name of the attribute - :param value: the value of this attribute - if key not in self.__dict__.keys(): - self[key] will be added - else: - self[key] will be updated - """ - if key in self.__dict__.keys(): - if not isinstance(value, type(getattr(self, key))): - raise AttributeError("attr %s except %s but got %s" % - (key, str(type(getattr(self, key))), str(type(value)))) - setattr(self, key, value) - - def __contains__(self, item): - """ - :param item: The key of item. - :return: True if the key in self.__dict__.keys() else False. - """ - return item in self.__dict__.keys() - - def __eq__(self, other): - """Overwrite the == operator - - :param other: Another ConfigSection() object which to be compared. - :return: True if value of each key in each ConfigSection() object are equal to the other, else False. 
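Looking back at the get_embeddings helper in fastNLP/embeddings/utils.py earlier in this patch, the two branches that are fully visible in that hunk can be sketched as follows: a size tuple builds a fresh nn.Embedding with uniform(-sqrt(3/dim), sqrt(3/dim)) weights, and an existing nn.Module is passed through untouched::

    import torch.nn as nn
    from fastNLP.embeddings.utils import get_embeddings

    # a (num_embeddings, embedding_dim) tuple builds a fresh, uniformly initialized nn.Embedding
    embed = get_embeddings((1000, 64))
    print(embed.weight.shape)            # torch.Size([1000, 64])

    # an already constructed nn.Module is returned unchanged
    pretrained = nn.Embedding(1000, 64)
    assert get_embeddings(pretrained) is pretrained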
- """ - for k in self.__dict__.keys(): - if k not in other.__dict__.keys(): - return False - if getattr(self, k) != getattr(self, k): - return False - - for k in other.__dict__.keys(): - if k not in self.__dict__.keys(): - return False - if getattr(self, k) != getattr(self, k): - return False - - return True - - def __ne__(self, other): - """Overwrite the != operator - - :param other: - :return: - """ - return not self.__eq__(other) - - @property - def data(self): - return self.__dict__ - - -class ConfigSaver(object): - """ - 别名::class:`fastNLP.io.ConfigSaver` :class:`fastNLP.io.config_io.ConfigSaver` - - ConfigSaver 是用来存储配置文件并解决相关冲突的类 - - :param str file_path: 配置文件的路径 - - """ - - def __init__(self, file_path): - self.file_path = file_path - if not os.path.exists(self.file_path): - raise FileNotFoundError("file {} NOT found!".__format__(self.file_path)) - - def _get_section(self, sect_name): - """ - This is the function to get the section with the section name. - - :param sect_name: The name of section what wants to load. - :return: The section. - """ - sect = ConfigSection() - ConfigLoader().load_config(self.file_path, {sect_name: sect}) - return sect - - def _read_section(self): - """ - This is the function to read sections from the config file. - - :return: sect_list, sect_key_list - sect_list: A list of ConfigSection(). - sect_key_list: A list of names in sect_list. - """ - sect_name = None - - sect_list = {} - sect_key_list = [] - - single_section = {} - single_section_key = [] - - with open(self.file_path, 'r') as f: - lines = f.readlines() - - for line in lines: - if line.startswith('[') and line.endswith(']\n'): - if sect_name is None: - pass - else: - sect_list[sect_name] = single_section, single_section_key - single_section = {} - single_section_key = [] - sect_key_list.append(sect_name) - sect_name = line[1: -2] - continue - - if line.startswith('#'): - single_section[line] = '#' - single_section_key.append(line) - continue - - if line.startswith('\n'): - single_section_key.append('\n') - continue - - if '=' not in line: - raise RuntimeError("can NOT load config file {}".__format__(self.file_path)) - - key = line.split('=', maxsplit=1)[0].strip() - value = line.split('=', maxsplit=1)[1].strip() + '\n' - single_section[key] = value - single_section_key.append(key) - - if sect_name is not None: - sect_list[sect_name] = single_section, single_section_key - sect_key_list.append(sect_name) - return sect_list, sect_key_list - - def _write_section(self, sect_list, sect_key_list): - """ - This is the function to write config file with section list and name list. - - :param sect_list: A list of ConfigSection() need to be writen into file. - :param sect_key_list: A list of name of sect_list. - :return: - """ - with open(self.file_path, 'w') as f: - for sect_key in sect_key_list: - single_section, single_section_key = sect_list[sect_key] - f.write('[' + sect_key + ']\n') - for key in single_section_key: - if key == '\n': - f.write('\n') - continue - if single_section[key] == '#': - f.write(key) - continue - f.write(key + ' = ' + single_section[key]) - f.write('\n') - - def save_config_file(self, section_name, section): - """ - 这个方法可以用来修改并保存配置文件中单独的一个 section - - :param str section_name: 需要保存的 section 的名字. 
- :param section: 你需要修改并保存的 section, :class:`~fastNLP.io.ConfigSaver` 类型 - """ - section_file = self._get_section(section_name) - if len(section_file.__dict__.keys()) == 0: # the section not in the file before - # append this section to config file - with open(self.file_path, 'a') as f: - f.write('[' + section_name + ']\n') - for k in section.__dict__.keys(): - f.write(k + ' = ') - if isinstance(section[k], str): - f.write('\"' + str(section[k]) + '\"\n\n') - else: - f.write(str(section[k]) + '\n\n') - else: - # the section exists - change_file = False - for k in section.__dict__.keys(): - if k not in section_file: - # find a new key in this section - change_file = True - break - if section_file[k] != section[k]: - change_file = True - break - if not change_file: - return - - sect_list, sect_key_list = self._read_section() - if section_name not in sect_key_list: - raise AttributeError() - - sect, sect_key = sect_list[section_name] - for k in section.__dict__.keys(): - if k not in sect_key: - if sect_key[-1] != '\n': - sect_key.append('\n') - sect_key.append(k) - sect[k] = str(section[k]) - if isinstance(section[k], str): - sect[k] = "\"" + sect[k] + "\"" - sect[k] = sect[k] + "\n" - sect_list[section_name] = sect, sect_key - self._write_section(sect_list, sect_key_list) diff --git a/fastNLP/io/data_bundle.py b/fastNLP/io/data_bundle.py new file mode 100644 index 00000000..ba275e61 --- /dev/null +++ b/fastNLP/io/data_bundle.py @@ -0,0 +1,347 @@ +""" +.. todo:: + doc +""" +__all__ = [ + 'DataBundle', +] + +from typing import Union + +from ..core.dataset import DataSet +from ..core.vocabulary import Vocabulary +from ..core._logger import logger + + +class DataBundle: + """ + 经过处理的数据信息,包括一系列数据集(比如:分开的训练集、验证集和测试集)以及各个field对应的vocabulary。该对象一般由fastNLP中各种 + Loader的load函数生成,可以通过以下的方法获取里面的内容 + + Example:: + + data_bundle = YelpLoader().load({'train':'/path/to/train', 'dev': '/path/to/dev'}) + train_vocabs = data_bundle.vocabs['train'] + train_data = data_bundle.datasets['train'] + dev_data = data_bundle.datasets['train'] + + """ + + def __init__(self, vocabs: dict = None, datasets: dict = None): + """ + + :param vocabs: 从名称(字符串)到 :class:`~fastNLP.Vocabulary` 类型的dict + :param datasets: 从名称(字符串)到 :class:`~fastNLP.DataSet` 类型的dict + """ + self.vocabs = vocabs or {} + self.datasets = datasets or {} + + def set_vocab(self, vocab, field_name): + """ + 向DataBunlde中增加vocab + + :param ~fastNLP.Vocabulary vocab: 词表 + :param str field_name: 这个vocab对应的field名称 + :return: self + """ + assert isinstance(vocab, Vocabulary), "Only fastNLP.Vocabulary supports." + self.vocabs[field_name] = vocab + return self + + def set_dataset(self, dataset, name: str): + """ + + :param ~fastNLP.DataSet dataset: 传递给DataBundle的DataSet + :param str name: dataset的名称 + :return: self + """ + assert isinstance(dataset, DataSet), "Only fastNLP.DataSet supports." + self.datasets[name] = dataset + return self + + def get_dataset(self, name: str) -> DataSet: + """ + 获取名为name的dataset + + :param str name: dataset的名称,一般为'train', 'dev', 'test' + :return: DataSet + """ + if name in self.datasets.keys(): + return self.datasets[name] + else: + error_msg = f'DataBundle do NOT have DataSet named {name}. ' \ + f'It should be one of {self.datasets.keys()}.' 
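Besides being produced by a Loader, a DataBundle can also be assembled by hand with the new setter/getter pairs; a minimal sketch, assuming the usual fastNLP DataSet and Vocabulary constructors::

    from fastNLP import DataSet, Vocabulary
    from fastNLP.io import DataBundle

    train_ds = DataSet({'raw_words': [['hello', 'world'], ['good', 'morning']],
                        'target': [1, 0]})
    vocab = Vocabulary().from_dataset(train_ds, field_name='raw_words')
    vocab.index_dataset(train_ds, field_name='raw_words', new_field_name='words')

    bundle = DataBundle()
    bundle.set_dataset(train_ds, name='train')      # only fastNLP.DataSet is accepted
    bundle.set_vocab(vocab, field_name='words')     # only fastNLP.Vocabulary is accepted

    assert bundle.get_dataset('train') is train_ds
    assert bundle.get_vocab('words') is vocab
    # bundle.get_dataset('test')   # would log the error message and raise KeyError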
+ logger.error(error_msg) + raise KeyError(error_msg) + + def delete_dataset(self, name: str): + """ + 删除名为name的DataSet + + :param str name: + :return: self + """ + self.datasets.pop(name, None) + return self + + def get_vocab(self, field_name: str) -> Vocabulary: + """ + 获取field名为field_name对应的vocab + + :param str field_name: 名称 + :return: Vocabulary + """ + if field_name in self.vocabs.keys(): + return self.vocabs[field_name] + else: + error_msg = f'DataBundle do NOT have Vocabulary named {field_name}. ' \ + f'It should be one of {self.vocabs.keys()}.' + logger.error(error_msg) + raise KeyError(error_msg) + + def delete_vocab(self, field_name: str): + """ + 删除vocab + :param str field_name: + :return: self + """ + self.vocabs.pop(field_name, None) + return self + + @property + def num_dataset(self): + return len(self.datasets) + + @property + def num_vocab(self): + return len(self.vocabs) + + def set_input(self, *field_names, flag=True, use_1st_ins_infer_dim_type=True, ignore_miss_dataset=True): + """ + 将field_names中的field设置为input, 对data_bundle中所有的dataset执行该操作:: + + data_bundle.set_input('words', 'seq_len') # 将words和seq_len这两个field的input属性设置为True + data_bundle.set_input('words', flag=False) # 将words这个field的input属性设置为False + + :param str field_names: field的名称 + :param bool flag: 将field_name的input状态设置为flag + :param bool use_1st_ins_infer_dim_type: 如果为True,将不会check该列是否所有数据都是同样的维度,同样的类型。将直接使用第一 + 行的数据进行类型和维度推断本列的数据的类型和维度。 + :param bool ignore_miss_dataset: 当某个field名称在某个dataset不存在时,如果为True,则直接忽略该DataSet; + 如果为False,则报错 + :return: self + """ + for field_name in field_names: + for name, dataset in self.datasets.items(): + if not ignore_miss_dataset and not dataset.has_field(field_name): + raise KeyError(f"Field:{field_name} was not found in DataSet:{name}") + if not dataset.has_field(field_name): + continue + else: + dataset.set_input(field_name, flag=flag, use_1st_ins_infer_dim_type=use_1st_ins_infer_dim_type) + return self + + def set_target(self, *field_names, flag=True, use_1st_ins_infer_dim_type=True, ignore_miss_dataset=True): + """ + 将field_names中的field设置为target, 对data_bundle中所有的dataset执行该操作:: + + data_bundle.set_target('target', 'seq_len') # 将words和target这两个field的input属性设置为True + data_bundle.set_target('target', flag=False) # 将target这个field的input属性设置为False + + :param str field_names: field的名称 + :param bool flag: 将field_name的target状态设置为flag + :param bool use_1st_ins_infer_dim_type: 如果为True,将不会check该列是否所有数据都是同样的维度,同样的类型。将直接使用第一 + 行的数据进行类型和维度推断本列的数据的类型和维度。 + :param bool ignore_miss_dataset: 当某个field名称在某个dataset不存在时,如果为True,则直接忽略该DataSet; + 如果为False,则报错 + :return: self + """ + for field_name in field_names: + for name, dataset in self.datasets.items(): + if not ignore_miss_dataset and not dataset.has_field(field_name): + raise KeyError(f"Field:{field_name} was not found in DataSet:{name}") + if not dataset.has_field(field_name): + continue + else: + dataset.set_target(field_name, flag=flag, use_1st_ins_infer_dim_type=use_1st_ins_infer_dim_type) + return self + + def set_pad_val(self, field_name, pad_val, ignore_miss_dataset=True): + """ + 将DataBundle中所有的DataSet中名为field_name的Field的padding值设置为pad_val. 
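Continuing the sketch above, set_input and set_target broadcast the flag over every DataSet held by the bundle, skipping datasets that lack the field unless ignore_miss_dataset=False::

    # flag fields as model input / training target in every DataSet of the bundle
    bundle.set_input('words')
    bundle.set_target('target')

    # a DataSet without the field is skipped by default; passing
    # ignore_miss_dataset=False would raise KeyError instead
    # bundle.set_input('chars', ignore_miss_dataset=False)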
+ + :param str field_name: + :param int pad_val: + :param bool ignore_miss_dataset: 当某个field名称在某个dataset不存在时,如果为True,则直接忽略该DataSet; + 如果为False,则报错 + :return: self + """ + for name, dataset in self.datasets.items(): + if dataset.has_field(field_name=field_name): + dataset.set_pad_val(field_name=field_name, pad_val=pad_val) + elif not ignore_miss_dataset: + raise KeyError(f"{field_name} not found DataSet:{name}.") + return self + + def set_ignore_type(self, *field_names, flag=True, ignore_miss_dataset=True): + """ + 将DataBundle中所有的DataSet中名为*field_names的Field的ignore_type设置为flag状态 + + :param str field_names: + :param bool flag: + :param bool ignore_miss_dataset: 当某个field名称在某个dataset不存在时,如果为True,则直接忽略该DataSet; + 如果为False,则报错 + :return: self + """ + for name, dataset in self.datasets.items(): + for field_name in field_names: + if dataset.has_field(field_name=field_name): + dataset.set_ignore_type(field_name, flag=flag) + elif not ignore_miss_dataset: + raise KeyError(f"{field_name} not found DataSet:{name}.") + return self + + def copy_field(self, field_name, new_field_name, ignore_miss_dataset=True): + """ + 将DataBundle中所有的DataSet中名为field_name的Field复制一份并命名为叫new_field_name. + + :param str field_name: + :param str new_field_name: + :param bool ignore_miss_dataset: 当某个field名称在某个dataset不存在时,如果为True,则直接忽略该DataSet; + 如果为False,则报错 + :return: self + """ + for name, dataset in self.datasets.items(): + if dataset.has_field(field_name=field_name): + dataset.copy_field(field_name=field_name, new_field_name=new_field_name) + elif not ignore_miss_dataset: + raise KeyError(f"{field_name} not found DataSet:{name}.") + return self + + def rename_field(self, field_name, new_field_name, ignore_miss_dataset=True, rename_vocab=True): + """ + 将DataBundle中所有DataSet中名为field_name的field重命名为new_field_name. + + :param str field_name: + :param str new_field_name: + :param bool ignore_miss_dataset: 当某个field名称在某个dataset不存在时,如果为True,则直接忽略该DataSet; + 如果为False,则报错 + :param bool rename_vocab: 如果该field同时也存在于vocabs中,会将该field的名称对应修改 + :return: self + """ + for name, dataset in self.datasets.items(): + if dataset.has_field(field_name=field_name): + dataset.rename_field(field_name=field_name, new_field_name=new_field_name) + elif not ignore_miss_dataset: + raise KeyError(f"{field_name} not found DataSet:{name}.") + if rename_vocab: + if field_name in self.vocabs: + self.vocabs[new_field_name] = self.vocabs.pop(field_name) + + return self + + def delete_field(self, field_name, ignore_miss_dataset=True, delete_vocab=True): + """ + 将DataBundle中所有DataSet中名为field_name的field删除掉. 
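The remaining field-level helpers follow the same broadcast pattern, and rename_field keeps the vocabs dict in sync when rename_vocab is left at True; continuing the sketch (apply_field, defined just below, is included for completeness)::

    bundle.set_pad_val('words', pad_val=0)                 # padding value for 'words' in every DataSet
    bundle.copy_field('words', new_field_name='word_ids')  # duplicate a field in every DataSet
    bundle.rename_field('words', 'tokens')                 # renames the field ...
    assert 'tokens' in bundle.vocabs                        # ... and the vocab registered under the same name

    # apply_field maps a function over one field of every DataSet
    bundle.apply_field(lambda tokens: len(tokens), field_name='tokens', new_field_name='seq_len')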
+ + :param str field_name: + :param bool ignore_miss_dataset: 当某个field名称在某个dataset不存在时,如果为True,则直接忽略该DataSet; + 如果为False,则报错 + :param bool delete_vocab: 如果该field也在vocabs中存在,将该值也一并删除 + :return: self + """ + for name, dataset in self.datasets.items(): + if dataset.has_field(field_name=field_name): + dataset.delete_field(field_name=field_name) + elif not ignore_miss_dataset: + raise KeyError(f"{field_name} not found DataSet:{name}.") + if delete_vocab: + if field_name in self.vocabs: + self.vocabs.pop(field_name) + return self + + def iter_datasets(self) -> Union[str, DataSet]: + """ + 迭代data_bundle中的DataSet + + Example:: + + for name, dataset in data_bundle.iter_datasets(): + pass + + :return: + """ + for name, dataset in self.datasets.items(): + yield name, dataset + + def iter_vocabs(self) -> Union[str, Vocabulary]: + """ + 迭代data_bundle中的DataSet + + Example: + + for field_name, vocab in data_bundle.iter_vocabs(): + pass + + :return: + """ + for field_name, vocab in self.vocabs.items(): + yield field_name, vocab + + def apply_field(self, func, field_name: str, new_field_name: str, ignore_miss_dataset=True, **kwargs): + """ + 对DataBundle中所有的dataset使用apply_field方法 + + :param callable func: input是instance中名为 `field_name` 的field的内容。 + :param str field_name: 传入func的是哪个field。 + :param str new_field_name: 将func返回的内容放入到 `new_field_name` 这个field中,如果名称与已有的field相同,则覆 + 盖之前的field。如果为None则不创建新的field。 + :param bool ignore_miss_dataset: 当某个field名称在某个dataset不存在时,如果为True,则直接忽略该DataSet; + 如果为False,则报错 + :param optional kwargs: 支持输入is_input,is_target,ignore_type + + 1. is_input: bool, 如果为True则将名为 `new_field_name` 的field设置为input + + 2. is_target: bool, 如果为True则将名为 `new_field_name` 的field设置为target + + 3. ignore_type: bool, 如果为True则将名为 `new_field_name` 的field的ignore_type设置为true, 忽略其类型 + """ + for name, dataset in self.datasets.items(): + if dataset.has_field(field_name=field_name): + dataset.apply_field(func=func, field_name=field_name, new_field_name=new_field_name, **kwargs) + elif not ignore_miss_dataset: + raise KeyError(f"{field_name} not found DataSet:{name}.") + return self + + def apply(self, func, new_field_name:str, **kwargs): + """ + 对DataBundle中所有的dataset使用apply方法 + + :param callable func: input是instance中名为 `field_name` 的field的内容。 + :param str new_field_name: 将func返回的内容放入到 `new_field_name` 这个field中,如果名称与已有的field相同,则覆 + 盖之前的field。如果为None则不创建新的field。 + :param optional kwargs: 支持输入is_input,is_target,ignore_type + + 1. is_input: bool, 如果为True则将名为 `new_field_name` 的field设置为input + + 2. is_target: bool, 如果为True则将名为 `new_field_name` 的field设置为target + + 3. 
ignore_type: bool, 如果为True则将名为 `new_field_name` 的field的ignore_type设置为true, 忽略其类型 + """ + for name, dataset in self.datasets.items(): + dataset.apply(func, new_field_name=new_field_name, **kwargs) + return self + + def __repr__(self): + _str = '' + if len(self.datasets): + _str += 'In total {} datasets:\n'.format(self.num_dataset) + for name, dataset in self.datasets.items(): + _str += '\t{} has {} instances.\n'.format(name, len(dataset)) + if len(self.vocabs): + _str += 'In total {} vocabs:\n'.format(self.num_vocab) + for name, vocab in self.vocabs.items(): + _str += '\t{} has {} entries.\n'.format(name, len(vocab)) + return _str + + diff --git a/fastNLP/io/data_loader/__init__.py b/fastNLP/io/data_loader/__init__.py deleted file mode 100644 index 5d6b08b0..00000000 --- a/fastNLP/io/data_loader/__init__.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -用于读数据集的模块, 可以读取文本分类、序列标注、Matching任务的数据集 - -这些模块的具体介绍如下,您可以通过阅读 :doc:`教程` 来进行了解。 -""" -__all__ = [ - 'ConllLoader', - 'Conll2003Loader', - 'IMDBLoader', - 'MatchingLoader', - 'SNLILoader', - 'MNLILoader', - 'MTL16Loader', - 'PeopleDailyCorpusLoader', - 'QNLILoader', - 'QuoraLoader', - 'RTELoader', - 'SSTLoader', - 'SST2Loader', - 'YelpLoader', -] - - -from .conll import ConllLoader, Conll2003Loader -from .imdb import IMDBLoader -from .matching import MatchingLoader -from .mnli import MNLILoader -from .mtl import MTL16Loader -from .people_daily import PeopleDailyCorpusLoader -from .qnli import QNLILoader -from .quora import QuoraLoader -from .rte import RTELoader -from .snli import SNLILoader -from .sst import SSTLoader, SST2Loader -from .yelp import YelpLoader diff --git a/fastNLP/io/data_loader/conll.py b/fastNLP/io/data_loader/conll.py deleted file mode 100644 index 9b2402a2..00000000 --- a/fastNLP/io/data_loader/conll.py +++ /dev/null @@ -1,73 +0,0 @@ - -from ...core.dataset import DataSet -from ...core.instance import Instance -from ..base_loader import DataSetLoader -from ..file_reader import _read_conll - - -class ConllLoader(DataSetLoader): - """ - 别名::class:`fastNLP.io.ConllLoader` :class:`fastNLP.io.data_loader.ConllLoader` - - 读取Conll格式的数据. 数据格式详见 http://conll.cemantix.org/2012/data.html. 
数据中以"-DOCSTART-"开头的行将被忽略,因为 - 该符号在conll 2003中被用为文档分割符。 - - 列号从0开始, 每列对应内容为:: - - Column Type - 0 Document ID - 1 Part number - 2 Word number - 3 Word itself - 4 Part-of-Speech - 5 Parse bit - 6 Predicate lemma - 7 Predicate Frameset ID - 8 Word sense - 9 Speaker/Author - 10 Named Entities - 11:N Predicate Arguments - N Coreference - - :param headers: 每一列数据的名称,需为List or Tuple of str。``header`` 与 ``indexes`` 一一对应 - :param indexes: 需要保留的数据列下标,从0开始。若为 ``None`` ,则所有列都保留。Default: ``None`` - :param dropna: 是否忽略非法数据,若 ``False`` ,遇到非法数据时抛出 ``ValueError`` 。Default: ``False`` - """ - - def __init__(self, headers, indexes=None, dropna=False): - super(ConllLoader, self).__init__() - if not isinstance(headers, (list, tuple)): - raise TypeError( - 'invalid headers: {}, should be list of strings'.format(headers)) - self.headers = headers - self.dropna = dropna - if indexes is None: - self.indexes = list(range(len(self.headers))) - else: - if len(indexes) != len(headers): - raise ValueError - self.indexes = indexes - - def _load(self, path): - ds = DataSet() - for idx, data in _read_conll(path, indexes=self.indexes, dropna=self.dropna): - ins = {h: data[i] for i, h in enumerate(self.headers)} - ds.append(Instance(**ins)) - return ds - - -class Conll2003Loader(ConllLoader): - """ - 别名::class:`fastNLP.io.Conll2003Loader` :class:`fastNLP.io.data_loader.Conll2003Loader` - - 读取Conll2003数据 - - 关于数据集的更多信息,参考: - https://sites.google.com/site/ermasoftware/getting-started/ne-tagging-conll2003-data - """ - - def __init__(self): - headers = [ - 'tokens', 'pos', 'chunks', 'ner', - ] - super(Conll2003Loader, self).__init__(headers=headers) diff --git a/fastNLP/io/data_loader/imdb.py b/fastNLP/io/data_loader/imdb.py deleted file mode 100644 index d3636cde..00000000 --- a/fastNLP/io/data_loader/imdb.py +++ /dev/null @@ -1,99 +0,0 @@ - -from typing import Union, Dict - -from ..embed_loader import EmbeddingOption, EmbedLoader -from ..base_loader import DataSetLoader, DataBundle -from ...core.vocabulary import VocabularyOption, Vocabulary -from ...core.dataset import DataSet -from ...core.instance import Instance -from ...core.const import Const - -from ..utils import get_tokenizer - - -class IMDBLoader(DataSetLoader): - """ - 别名::class:`fastNLP.io.IMDBLoader` :class:`fastNLP.io.data_loader.IMDBLoader` - - 读取IMDB数据集,DataSet包含以下fields: - - words: list(str), 需要分类的文本 - - target: str, 文本的标签 - - """ - - def __init__(self): - super(IMDBLoader, self).__init__() - self.tokenizer = get_tokenizer() - - def _load(self, path): - dataset = DataSet() - with open(path, 'r', encoding="utf-8") as f: - for line in f: - line = line.strip() - if not line: - continue - parts = line.split('\t') - target = parts[0] - words = self.tokenizer(parts[1].lower()) - dataset.append(Instance(words=words, target=target)) - - if len(dataset) == 0: - raise RuntimeError(f"{path} has no valid data.") - - return dataset - - def process(self, - paths: Union[str, Dict[str, str]], - src_vocab_opt: VocabularyOption = None, - tgt_vocab_opt: VocabularyOption = None, - char_level_op=False): - - datasets = {} - info = DataBundle() - for name, path in paths.items(): - dataset = self.load(path) - datasets[name] = dataset - - def wordtochar(words): - chars = [] - for word in words: - word = word.lower() - for char in word: - chars.append(char) - chars.append('') - chars.pop() - return chars - - if char_level_op: - for dataset in datasets.values(): - dataset.apply_field(wordtochar, field_name="words", new_field_name='chars') - - datasets["train"], datasets["dev"] = 
datasets["train"].split(0.1, shuffle=False) - - src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt) - src_vocab.from_dataset(datasets['train'], field_name='words') - - src_vocab.index_dataset(*datasets.values(), field_name='words') - - tgt_vocab = Vocabulary(unknown=None, padding=None) \ - if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt) - tgt_vocab.from_dataset(datasets['train'], field_name='target') - tgt_vocab.index_dataset(*datasets.values(), field_name='target') - - info.vocabs = { - Const.INPUT: src_vocab, - Const.TARGET: tgt_vocab - } - - info.datasets = datasets - - for name, dataset in info.datasets.items(): - dataset.set_input(Const.INPUT) - dataset.set_target(Const.TARGET) - - return info - - - diff --git a/fastNLP/io/data_loader/matching.py b/fastNLP/io/data_loader/matching.py deleted file mode 100644 index 481b5056..00000000 --- a/fastNLP/io/data_loader/matching.py +++ /dev/null @@ -1,248 +0,0 @@ -import os - -from typing import Union, Dict, List - -from ...core.const import Const -from ...core.vocabulary import Vocabulary -from ..base_loader import DataBundle, DataSetLoader -from ..file_utils import _get_base_url, cached_path, PRETRAINED_BERT_MODEL_DIR -from ...modules.encoder.bert import BertTokenizer - - -class MatchingLoader(DataSetLoader): - """ - 别名::class:`fastNLP.io.MatchingLoader` :class:`fastNLP.io.data_loader.MatchingLoader` - - 读取Matching任务的数据集 - - :param dict paths: key是数据集名称(如train、dev、test),value是对应的文件名 - """ - - def __init__(self, paths: dict=None): - self.paths = paths - - def _load(self, path): - """ - :param str path: 待读取数据集的路径名 - :return: fastNLP.DataSet ds: 返回一个DataSet对象,里面必须包含3个field:其中两个分别为两个句子 - 的原始字符串文本,第三个为标签 - """ - raise NotImplementedError - - def process(self, paths: Union[str, Dict[str, str]], dataset_name: str=None, - to_lower=False, seq_len_type: str=None, bert_tokenizer: str=None, - cut_text: int = None, get_index=True, auto_pad_length: int=None, - auto_pad_token: str='', set_input: Union[list, str, bool]=True, - set_target: Union[list, str, bool]=True, concat: Union[str, list, bool]=None, - extra_split: List[str]=None, ) -> DataBundle: - """ - :param paths: str或者Dict[str, str]。如果是str,则为数据集所在的文件夹或者是全路径文件名:如果是文件夹, - 则会从self.paths里面找对应的数据集名称与文件名。如果是Dict,则为数据集名称(如train、dev、test)和 - 对应的全路径文件名。 - :param str dataset_name: 如果在paths里传入的是一个数据集的全路径文件名,那么可以用dataset_name来定义 - 这个数据集的名字,如果不定义则默认为train。 - :param bool to_lower: 是否将文本自动转为小写。默认值为False。 - :param str seq_len_type: 提供的seq_len类型,支持 ``seq_len`` :提供一个数字作为句子长度; ``mask`` : - 提供一个0/1的mask矩阵作为句子长度; ``bert`` :提供segment_type_id(第一个句子为0,第二个句子为1)和 - attention mask矩阵(0/1的mask矩阵)。默认值为None,即不提供seq_len - :param str bert_tokenizer: bert tokenizer所使用的词表所在的文件夹路径 - :param int cut_text: 将长于cut_text的内容截掉。默认为None,即不截。 - :param bool get_index: 是否需要根据词表将文本转为index - :param int auto_pad_length: 是否需要将文本自动pad到一定长度(超过这个长度的文本将会被截掉),默认为不会自动pad - :param str auto_pad_token: 自动pad的内容 - :param set_input: 如果为True,则会自动将相关的field(名字里含有Const.INPUT的)设置为input,如果为False - 则不会将任何field设置为input。如果传入str或者List[str],则会根据传入的内容将相对应的field设置为input, - 于此同时其他field不会被设置为input。默认值为True。 - :param set_target: set_target将控制哪些field可以被设置为target,用法与set_input一致。默认值为True。 - :param concat: 是否需要将两个句子拼接起来。如果为False则不会拼接。如果为True则会在两个句子之间插入一个。 - 如果传入一个长度为4的list,则分别表示插在第一句开始前、第一句结束后、第二句开始前、第二句结束后的标识符。如果 - 传入字符串 ``bert`` ,则会采用bert的拼接方式,等价于['[CLS]', '[SEP]', '', '[SEP]']. 
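To make the marker scheme concrete: with the ``bert`` setting the four symbols are ['[CLS]', '[SEP]', '', '[SEP]'], and the empty marker disappears in the filtering step that follows the splice below::

    concat = ['[CLS]', '[SEP]', '', '[SEP]']            # the 'bert' concat mode
    words1 = "a man is playing".split()
    words2 = "a person plays".split()

    joined = [concat[0]] + words1 + [concat[1]] + [concat[2]] + words2 + [concat[3]]
    joined = [w for w in joined if len(w) > 0]          # drop the empty placeholder
    print(joined)
    # ['[CLS]', 'a', 'man', 'is', 'playing', '[SEP]', 'a', 'person', 'plays', '[SEP]']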
- :param extra_split: 额外的分隔符,即除了空格之外的用于分词的字符。 - :return: - """ - if isinstance(set_input, str): - set_input = [set_input] - if isinstance(set_target, str): - set_target = [set_target] - if isinstance(set_input, bool): - auto_set_input = set_input - else: - auto_set_input = False - if isinstance(set_target, bool): - auto_set_target = set_target - else: - auto_set_target = False - if isinstance(paths, str): - if os.path.isdir(paths): - path = {n: os.path.join(paths, self.paths[n]) for n in self.paths.keys()} - else: - path = {dataset_name if dataset_name is not None else 'train': paths} - else: - path = paths - - data_info = DataBundle() - for data_name in path.keys(): - data_info.datasets[data_name] = self._load(path[data_name]) - - for data_name, data_set in data_info.datasets.items(): - if auto_set_input: - data_set.set_input(Const.INPUTS(0), Const.INPUTS(1)) - if auto_set_target: - if Const.TARGET in data_set.get_field_names(): - data_set.set_target(Const.TARGET) - - if extra_split is not None: - for data_name, data_set in data_info.datasets.items(): - data_set.apply(lambda x: ' '.join(x[Const.INPUTS(0)]), new_field_name=Const.INPUTS(0)) - data_set.apply(lambda x: ' '.join(x[Const.INPUTS(1)]), new_field_name=Const.INPUTS(1)) - - for s in extra_split: - data_set.apply(lambda x: x[Const.INPUTS(0)].replace(s, ' ' + s + ' '), - new_field_name=Const.INPUTS(0)) - data_set.apply(lambda x: x[Const.INPUTS(0)].replace(s, ' ' + s + ' '), - new_field_name=Const.INPUTS(0)) - - _filt = lambda x: x - data_set.apply(lambda x: list(filter(_filt, x[Const.INPUTS(0)].split(' '))), - new_field_name=Const.INPUTS(0), is_input=auto_set_input) - data_set.apply(lambda x: list(filter(_filt, x[Const.INPUTS(1)].split(' '))), - new_field_name=Const.INPUTS(1), is_input=auto_set_input) - _filt = None - - if to_lower: - for data_name, data_set in data_info.datasets.items(): - data_set.apply(lambda x: [w.lower() for w in x[Const.INPUTS(0)]], new_field_name=Const.INPUTS(0), - is_input=auto_set_input) - data_set.apply(lambda x: [w.lower() for w in x[Const.INPUTS(1)]], new_field_name=Const.INPUTS(1), - is_input=auto_set_input) - - if bert_tokenizer is not None: - if bert_tokenizer.lower() in PRETRAINED_BERT_MODEL_DIR: - PRETRAIN_URL = _get_base_url('bert') - model_name = PRETRAINED_BERT_MODEL_DIR[bert_tokenizer] - model_url = PRETRAIN_URL + model_name - model_dir = cached_path(model_url) - # 检查是否存在 - elif os.path.isdir(bert_tokenizer): - model_dir = bert_tokenizer - else: - raise ValueError(f"Cannot recognize BERT tokenizer from {bert_tokenizer}.") - - words_vocab = Vocabulary(padding='[PAD]', unknown='[UNK]') - with open(os.path.join(model_dir, 'vocab.txt'), 'r') as f: - lines = f.readlines() - lines = [line.strip() for line in lines] - words_vocab.add_word_lst(lines) - words_vocab.build_vocab() - - tokenizer = BertTokenizer.from_pretrained(model_dir) - - for data_name, data_set in data_info.datasets.items(): - for fields in data_set.get_field_names(): - if Const.INPUT in fields: - data_set.apply(lambda x: tokenizer.tokenize(' '.join(x[fields])), new_field_name=fields, - is_input=auto_set_input) - - if isinstance(concat, bool): - concat = 'default' if concat else None - if concat is not None: - if isinstance(concat, str): - CONCAT_MAP = {'bert': ['[CLS]', '[SEP]', '', '[SEP]'], - 'default': ['', '', '', '']} - if concat.lower() in CONCAT_MAP: - concat = CONCAT_MAP[concat] - else: - concat = 4 * [concat] - assert len(concat) == 4, \ - f'Please choose a list with 4 symbols which at the beginning of first sentence ' \ - f'the 
end of first sentence, the begin of second sentence, and the end of second' \ - f'sentence. Your input is {concat}' - - for data_name, data_set in data_info.datasets.items(): - data_set.apply(lambda x: [concat[0]] + x[Const.INPUTS(0)] + [concat[1]] + [concat[2]] + - x[Const.INPUTS(1)] + [concat[3]], new_field_name=Const.INPUT) - data_set.apply(lambda x: [w for w in x[Const.INPUT] if len(w) > 0], new_field_name=Const.INPUT, - is_input=auto_set_input) - - if seq_len_type is not None: - if seq_len_type == 'seq_len': # - for data_name, data_set in data_info.datasets.items(): - for fields in data_set.get_field_names(): - if Const.INPUT in fields: - data_set.apply(lambda x: len(x[fields]), - new_field_name=fields.replace(Const.INPUT, Const.INPUT_LEN), - is_input=auto_set_input) - elif seq_len_type == 'mask': - for data_name, data_set in data_info.datasets.items(): - for fields in data_set.get_field_names(): - if Const.INPUT in fields: - data_set.apply(lambda x: [1] * len(x[fields]), - new_field_name=fields.replace(Const.INPUT, Const.INPUT_LEN), - is_input=auto_set_input) - elif seq_len_type == 'bert': - for data_name, data_set in data_info.datasets.items(): - if Const.INPUT not in data_set.get_field_names(): - raise KeyError(f'Field ``{Const.INPUT}`` not in {data_name} data set: ' - f'got {data_set.get_field_names()}') - data_set.apply(lambda x: [0] * (len(x[Const.INPUTS(0)]) + 2) + [1] * (len(x[Const.INPUTS(1)]) + 1), - new_field_name=Const.INPUT_LENS(0), is_input=auto_set_input) - data_set.apply(lambda x: [1] * len(x[Const.INPUT_LENS(0)]), - new_field_name=Const.INPUT_LENS(1), is_input=auto_set_input) - - if auto_pad_length is not None: - cut_text = min(auto_pad_length, cut_text if cut_text is not None else auto_pad_length) - - if cut_text is not None: - for data_name, data_set in data_info.datasets.items(): - for fields in data_set.get_field_names(): - if (Const.INPUT in fields) or ((Const.INPUT_LEN in fields) and (seq_len_type != 'seq_len')): - data_set.apply(lambda x: x[fields][: cut_text], new_field_name=fields, - is_input=auto_set_input) - - data_set_list = [d for n, d in data_info.datasets.items()] - assert len(data_set_list) > 0, f'There are NO data sets in data info!' 
- - if bert_tokenizer is None: - words_vocab = Vocabulary(padding=auto_pad_token) - words_vocab = words_vocab.from_dataset(*[d for n, d in data_info.datasets.items() if 'train' in n], - field_name=[n for n in data_set_list[0].get_field_names() - if (Const.INPUT in n)], - no_create_entry_dataset=[d for n, d in data_info.datasets.items() - if 'train' not in n]) - target_vocab = Vocabulary(padding=None, unknown=None) - target_vocab = target_vocab.from_dataset(*[d for n, d in data_info.datasets.items() if 'train' in n], - field_name=Const.TARGET) - data_info.vocabs = {Const.INPUT: words_vocab, Const.TARGET: target_vocab} - - if get_index: - for data_name, data_set in data_info.datasets.items(): - for fields in data_set.get_field_names(): - if Const.INPUT in fields: - data_set.apply(lambda x: [words_vocab.to_index(w) for w in x[fields]], new_field_name=fields, - is_input=auto_set_input) - - if Const.TARGET in data_set.get_field_names(): - data_set.apply(lambda x: target_vocab.to_index(x[Const.TARGET]), new_field_name=Const.TARGET, - is_input=auto_set_input, is_target=auto_set_target) - - if auto_pad_length is not None: - if seq_len_type == 'seq_len': - raise RuntimeError(f'the sequence will be padded with the length {auto_pad_length}, ' - f'so the seq_len_type cannot be `{seq_len_type}`!') - for data_name, data_set in data_info.datasets.items(): - for fields in data_set.get_field_names(): - if Const.INPUT in fields: - data_set.apply(lambda x: x[fields] + [words_vocab.to_index(words_vocab.padding)] * - (auto_pad_length - len(x[fields])), new_field_name=fields, - is_input=auto_set_input) - elif (Const.INPUT_LEN in fields) and (seq_len_type != 'seq_len'): - data_set.apply(lambda x: x[fields] + [0] * (auto_pad_length - len(x[fields])), - new_field_name=fields, is_input=auto_set_input) - - for data_name, data_set in data_info.datasets.items(): - if isinstance(set_input, list): - data_set.set_input(*[inputs for inputs in set_input if inputs in data_set.get_field_names()]) - if isinstance(set_target, list): - data_set.set_target(*[target for target in set_target if target in data_set.get_field_names()]) - - return data_info diff --git a/fastNLP/io/data_loader/mnli.py b/fastNLP/io/data_loader/mnli.py deleted file mode 100644 index 65863f3d..00000000 --- a/fastNLP/io/data_loader/mnli.py +++ /dev/null @@ -1,62 +0,0 @@ - -from ...core.const import Const - -from .matching import MatchingLoader -from ..dataset_loader import CSVLoader - - -class MNLILoader(MatchingLoader, CSVLoader): - """ - 别名::class:`fastNLP.io.MNLILoader` :class:`fastNLP.io.data_loader.MNLILoader` - - 读取MNLI数据集,读取的DataSet包含fields:: - - words1: list(str),第一句文本, premise - - words2: list(str), 第二句文本, hypothesis - - target: str, 真实标签 - - 数据来源: - """ - - def __init__(self, paths: dict=None): - paths = paths if paths is not None else { - 'train': 'train.tsv', - 'dev_matched': 'dev_matched.tsv', - 'dev_mismatched': 'dev_mismatched.tsv', - 'test_matched': 'test_matched.tsv', - 'test_mismatched': 'test_mismatched.tsv', - # 'test_0.9_matched': 'multinli_0.9_test_matched_unlabeled.txt', - # 'test_0.9_mismatched': 'multinli_0.9_test_mismatched_unlabeled.txt', - - # test_0.9_mathed与mismatched是MNLI0.9版本的(数据来源:kaggle) - } - MatchingLoader.__init__(self, paths=paths) - CSVLoader.__init__(self, sep='\t') - self.fields = { - 'sentence1_binary_parse': Const.INPUTS(0), - 'sentence2_binary_parse': Const.INPUTS(1), - 'gold_label': Const.TARGET, - } - - def _load(self, path): - ds = CSVLoader._load(self, path) - - for k, v in self.fields.items(): - if k in 
ds.get_field_names(): - ds.rename_field(k, v) - - if Const.TARGET in ds.get_field_names(): - if ds[0][Const.TARGET] == 'hidden': - ds.delete_field(Const.TARGET) - - parentheses_table = str.maketrans({'(': None, ')': None}) - - ds.apply(lambda ins: ins[Const.INPUTS(0)].translate(parentheses_table).strip().split(), - new_field_name=Const.INPUTS(0)) - ds.apply(lambda ins: ins[Const.INPUTS(1)].translate(parentheses_table).strip().split(), - new_field_name=Const.INPUTS(1)) - if Const.TARGET in ds.get_field_names(): - ds.drop(lambda x: x[Const.TARGET] == '-') - return ds diff --git a/fastNLP/io/data_loader/mtl.py b/fastNLP/io/data_loader/mtl.py deleted file mode 100644 index cbca413d..00000000 --- a/fastNLP/io/data_loader/mtl.py +++ /dev/null @@ -1,68 +0,0 @@ - -from typing import Union, Dict - -from ..base_loader import DataBundle -from ..dataset_loader import CSVLoader -from ...core.vocabulary import Vocabulary, VocabularyOption -from ...core.const import Const -from ..utils import check_dataloader_paths - - -class MTL16Loader(CSVLoader): - """ - 别名::class:`fastNLP.io.MTL16Loader` :class:`fastNLP.io.data_loader.MTL16Loader` - - 读取MTL16数据集,DataSet包含以下fields: - - words: list(str), 需要分类的文本 - - target: str, 文本的标签 - - 数据来源:https://pan.baidu.com/s/1c2L6vdA - - """ - - def __init__(self): - super(MTL16Loader, self).__init__(headers=(Const.TARGET, Const.INPUT), sep='\t') - - def _load(self, path): - dataset = super(MTL16Loader, self)._load(path) - dataset.apply(lambda x: x[Const.INPUT].lower().split(), new_field_name=Const.INPUT) - if len(dataset) == 0: - raise RuntimeError(f"{path} has no valid data.") - - return dataset - - def process(self, - paths: Union[str, Dict[str, str]], - src_vocab_opt: VocabularyOption = None, - tgt_vocab_opt: VocabularyOption = None,): - - paths = check_dataloader_paths(paths) - datasets = {} - info = DataBundle() - for name, path in paths.items(): - dataset = self.load(path) - datasets[name] = dataset - - src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt) - src_vocab.from_dataset(datasets['train'], field_name=Const.INPUT) - src_vocab.index_dataset(*datasets.values(), field_name=Const.INPUT) - - tgt_vocab = Vocabulary(unknown=None, padding=None) \ - if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt) - tgt_vocab.from_dataset(datasets['train'], field_name=Const.TARGET) - tgt_vocab.index_dataset(*datasets.values(), field_name=Const.TARGET) - - info.vocabs = { - Const.INPUT: src_vocab, - Const.TARGET: tgt_vocab - } - - info.datasets = datasets - - for name, dataset in info.datasets.items(): - dataset.set_input(Const.INPUT) - dataset.set_target(Const.TARGET) - - return info diff --git a/fastNLP/io/data_loader/people_daily.py b/fastNLP/io/data_loader/people_daily.py deleted file mode 100644 index 5efadb7d..00000000 --- a/fastNLP/io/data_loader/people_daily.py +++ /dev/null @@ -1,85 +0,0 @@ - -from ..base_loader import DataSetLoader -from ...core.dataset import DataSet -from ...core.instance import Instance -from ...core.const import Const - - -class PeopleDailyCorpusLoader(DataSetLoader): - """ - 别名::class:`fastNLP.io.PeopleDailyCorpusLoader` :class:`fastNLP.io.data_loader.PeopleDailyCorpusLoader` - - 读取人民日报数据集 - """ - - def __init__(self, pos=True, ner=True): - super(PeopleDailyCorpusLoader, self).__init__() - self.pos = pos - self.ner = ner - - def _load(self, data_path): - with open(data_path, "r", encoding="utf-8") as f: - sents = f.readlines() - examples = [] - for sent in sents: - if len(sent) <= 2: - continue - inside_ne = False - 
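The `_load()` shown earlier in this hunk recovers plain tokens from the `sentence*_binary_parse` columns by deleting parentheses and splitting on whitespace (SNLILoader further down in this diff does the same). A self-contained sketch of that cleanup on a made-up parse string:

```python
# Mirrors the binary-parse cleanup in MNLILoader._load / SNLILoader._load above
# (the parse string here is invented for illustration).
parse = "( ( The cat ) ( sat ( on ( the mat ) ) ) )"
parentheses_table = str.maketrans({'(': None, ')': None})
words = parse.translate(parentheses_table).strip().split()
print(words)  # ['The', 'cat', 'sat', 'on', 'the', 'mat']
```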
sent_pos_tag = [] - sent_words = [] - sent_ner = [] - words = sent.strip().split()[1:] - for word in words: - if "[" in word and "]" in word: - ner_tag = "U" - print(word) - elif "[" in word: - inside_ne = True - ner_tag = "B" - word = word[1:] - elif "]" in word: - ner_tag = "L" - word = word[:word.index("]")] - if inside_ne is True: - inside_ne = False - else: - raise RuntimeError("only ] appears!") - else: - if inside_ne is True: - ner_tag = "I" - else: - ner_tag = "O" - tmp = word.split("/") - token, pos = tmp[0], tmp[1] - sent_ner.append(ner_tag) - sent_pos_tag.append(pos) - sent_words.append(token) - example = [sent_words] - if self.pos is True: - example.append(sent_pos_tag) - if self.ner is True: - example.append(sent_ner) - examples.append(example) - return self.convert(examples) - - def convert(self, data): - """ - - :param data: python 内置对象 - :return: 一个 :class:`~fastNLP.DataSet` 类型的对象 - """ - data_set = DataSet() - for item in data: - sent_words = item[0] - if self.pos is True and self.ner is True: - instance = Instance( - words=sent_words, pos_tags=item[1], ner=item[2]) - elif self.pos is True: - instance = Instance(words=sent_words, pos_tags=item[1]) - elif self.ner is True: - instance = Instance(words=sent_words, ner=item[1]) - else: - instance = Instance(words=sent_words) - data_set.append(instance) - data_set.apply(lambda ins: len(ins[Const.INPUT]), new_field_name=Const.INPUT_LEN) - return data_set diff --git a/fastNLP/io/data_loader/qnli.py b/fastNLP/io/data_loader/qnli.py deleted file mode 100644 index 84b0f3d6..00000000 --- a/fastNLP/io/data_loader/qnli.py +++ /dev/null @@ -1,47 +0,0 @@ - -from ...core.const import Const - -from .matching import MatchingLoader -from ..dataset_loader import CSVLoader - - -class QNLILoader(MatchingLoader, CSVLoader): - """ - 别名::class:`fastNLP.io.QNLILoader` :class:`fastNLP.io.data_loader.QNLILoader` - - 读取QNLI数据集,读取的DataSet包含fields:: - - words1: list(str),第一句文本, premise - - words2: list(str), 第二句文本, hypothesis - - target: str, 真实标签 - - 数据来源: - """ - - def __init__(self, paths: dict=None): - paths = paths if paths is not None else { - 'train': 'train.tsv', - 'dev': 'dev.tsv', - 'test': 'test.tsv' # test set has not label - } - MatchingLoader.__init__(self, paths=paths) - self.fields = { - 'question': Const.INPUTS(0), - 'sentence': Const.INPUTS(1), - 'label': Const.TARGET, - } - CSVLoader.__init__(self, sep='\t') - - def _load(self, path): - ds = CSVLoader._load(self, path) - - for k, v in self.fields.items(): - if k in ds.get_field_names(): - ds.rename_field(k, v) - for fields in ds.get_all_fields(): - if Const.INPUT in fields: - ds.apply(lambda x: x[fields].strip().split(), new_field_name=fields) - - return ds diff --git a/fastNLP/io/data_loader/quora.py b/fastNLP/io/data_loader/quora.py deleted file mode 100644 index d0ee41ec..00000000 --- a/fastNLP/io/data_loader/quora.py +++ /dev/null @@ -1,34 +0,0 @@ - -from ...core.const import Const - -from .matching import MatchingLoader -from ..dataset_loader import CSVLoader - - -class QuoraLoader(MatchingLoader, CSVLoader): - """ - 别名::class:`fastNLP.io.QuoraLoader` :class:`fastNLP.io.data_loader.QuoraLoader` - - 读取MNLI数据集,读取的DataSet包含fields:: - - words1: list(str),第一句文本, premise - - words2: list(str), 第二句文本, hypothesis - - target: str, 真实标签 - - 数据来源: - """ - - def __init__(self, paths: dict=None): - paths = paths if paths is not None else { - 'train': 'train.tsv', - 'dev': 'dev.tsv', - 'test': 'test.tsv', - } - MatchingLoader.__init__(self, paths=paths) - CSVLoader.__init__(self, sep='\t', 
headers=(Const.TARGET, Const.INPUTS(0), Const.INPUTS(1), 'pairID')) - - def _load(self, path): - ds = CSVLoader._load(self, path) - return ds diff --git a/fastNLP/io/data_loader/rte.py b/fastNLP/io/data_loader/rte.py deleted file mode 100644 index f8c5e2fc..00000000 --- a/fastNLP/io/data_loader/rte.py +++ /dev/null @@ -1,47 +0,0 @@ - -from ...core.const import Const - -from .matching import MatchingLoader -from ..dataset_loader import CSVLoader - - -class RTELoader(MatchingLoader, CSVLoader): - """ - 别名::class:`fastNLP.io.RTELoader` :class:`fastNLP.io.data_loader.RTELoader` - - 读取RTE数据集,读取的DataSet包含fields:: - - words1: list(str),第一句文本, premise - - words2: list(str), 第二句文本, hypothesis - - target: str, 真实标签 - - 数据来源: - """ - - def __init__(self, paths: dict=None): - paths = paths if paths is not None else { - 'train': 'train.tsv', - 'dev': 'dev.tsv', - 'test': 'test.tsv' # test set has not label - } - MatchingLoader.__init__(self, paths=paths) - self.fields = { - 'sentence1': Const.INPUTS(0), - 'sentence2': Const.INPUTS(1), - 'label': Const.TARGET, - } - CSVLoader.__init__(self, sep='\t') - - def _load(self, path): - ds = CSVLoader._load(self, path) - - for k, v in self.fields.items(): - if k in ds.get_field_names(): - ds.rename_field(k, v) - for fields in ds.get_all_fields(): - if Const.INPUT in fields: - ds.apply(lambda x: x[fields].strip().split(), new_field_name=fields) - - return ds diff --git a/fastNLP/io/data_loader/snli.py b/fastNLP/io/data_loader/snli.py deleted file mode 100644 index 1db0ac5b..00000000 --- a/fastNLP/io/data_loader/snli.py +++ /dev/null @@ -1,46 +0,0 @@ - -from ...core.const import Const - -from .matching import MatchingLoader -from ..dataset_loader import JsonLoader - - -class SNLILoader(MatchingLoader, JsonLoader): - """ - 别名::class:`fastNLP.io.SNLILoader` :class:`fastNLP.io.data_loader.SNLILoader` - - 读取SNLI数据集,读取的DataSet包含fields:: - - words1: list(str),第一句文本, premise - - words2: list(str), 第二句文本, hypothesis - - target: str, 真实标签 - - 数据来源: https://nlp.stanford.edu/projects/snli/snli_1.0.zip - """ - - def __init__(self, paths: dict=None): - fields = { - 'sentence1_binary_parse': Const.INPUTS(0), - 'sentence2_binary_parse': Const.INPUTS(1), - 'gold_label': Const.TARGET, - } - paths = paths if paths is not None else { - 'train': 'snli_1.0_train.jsonl', - 'dev': 'snli_1.0_dev.jsonl', - 'test': 'snli_1.0_test.jsonl'} - MatchingLoader.__init__(self, paths=paths) - JsonLoader.__init__(self, fields=fields) - - def _load(self, path): - ds = JsonLoader._load(self, path) - - parentheses_table = str.maketrans({'(': None, ')': None}) - - ds.apply(lambda ins: ins[Const.INPUTS(0)].translate(parentheses_table).strip().split(), - new_field_name=Const.INPUTS(0)) - ds.apply(lambda ins: ins[Const.INPUTS(1)].translate(parentheses_table).strip().split(), - new_field_name=Const.INPUTS(1)) - ds.drop(lambda x: x[Const.TARGET] == '-') - return ds diff --git a/fastNLP/io/data_loader/sst.py b/fastNLP/io/data_loader/sst.py deleted file mode 100644 index 0d881e65..00000000 --- a/fastNLP/io/data_loader/sst.py +++ /dev/null @@ -1,177 +0,0 @@ - -from typing import Union, Dict -from nltk import Tree - -from ..base_loader import DataBundle, DataSetLoader -from ..dataset_loader import CSVLoader -from ...core.vocabulary import VocabularyOption, Vocabulary -from ...core.dataset import DataSet -from ...core.const import Const -from ...core.instance import Instance -from ..utils import check_dataloader_paths, get_tokenizer - - -class SSTLoader(DataSetLoader): - """ - 别名::class:`fastNLP.io.SSTLoader` 
:class:`fastNLP.io.data_loader.SSTLoader` - - 读取SST数据集, DataSet包含fields:: - - words: list(str) 需要分类的文本 - target: str 文本的标签 - - 数据来源: https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip - - :param subtree: 是否将数据展开为子树,扩充数据量. Default: ``False`` - :param fine_grained: 是否使用SST-5标准,若 ``False`` , 使用SST-2。Default: ``False`` - """ - - URL = 'https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip' - DATA_DIR = 'sst/' - - def __init__(self, subtree=False, fine_grained=False): - self.subtree = subtree - - tag_v = {'0': 'very negative', '1': 'negative', '2': 'neutral', - '3': 'positive', '4': 'very positive'} - if not fine_grained: - tag_v['0'] = tag_v['1'] - tag_v['4'] = tag_v['3'] - self.tag_v = tag_v - self.tokenizer = get_tokenizer() - - def _load(self, path): - """ - - :param str path: 存储数据的路径 - :return: 一个 :class:`~fastNLP.DataSet` 类型的对象 - """ - datalist = [] - with open(path, 'r', encoding='utf-8') as f: - datas = [] - for l in f: - datas.extend([(s, self.tag_v[t]) - for s, t in self._get_one(l, self.subtree)]) - ds = DataSet() - for words, tag in datas: - ds.append(Instance(words=words, target=tag)) - return ds - - def _get_one(self, data, subtree): - tree = Tree.fromstring(data) - if subtree: - return [(self.tokenizer(' '.join(t.leaves())), t.label()) for t in tree.subtrees() ] - return [(self.tokenizer(' '.join(tree.leaves())), tree.label())] - - def process(self, - paths, train_subtree=True, - src_vocab_op: VocabularyOption = None, - tgt_vocab_op: VocabularyOption = None,): - paths = check_dataloader_paths(paths) - input_name, target_name = 'words', 'target' - src_vocab = Vocabulary() if src_vocab_op is None else Vocabulary(**src_vocab_op) - tgt_vocab = Vocabulary(unknown=None, padding=None) \ - if tgt_vocab_op is None else Vocabulary(**tgt_vocab_op) - - info = DataBundle() - origin_subtree = self.subtree - self.subtree = train_subtree - info.datasets['train'] = self._load(paths['train']) - self.subtree = origin_subtree - for n, p in paths.items(): - if n != 'train': - info.datasets[n] = self._load(p) - - src_vocab.from_dataset( - info.datasets['train'], - field_name=input_name, - no_create_entry_dataset=[ds for n, ds in info.datasets.items() if n != 'train']) - tgt_vocab.from_dataset(info.datasets['train'], field_name=target_name) - - src_vocab.index_dataset( - *info.datasets.values(), - field_name=input_name, new_field_name=input_name) - tgt_vocab.index_dataset( - *info.datasets.values(), - field_name=target_name, new_field_name=target_name) - info.vocabs = { - input_name: src_vocab, - target_name: tgt_vocab - } - - return info - - -class SST2Loader(CSVLoader): - """ - 别名::class:`fastNLP.io.SST2Loader` :class:`fastNLP.io.data_loader.SST2Loader` - - 数据来源 SST: https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8 - """ - - def __init__(self): - super(SST2Loader, self).__init__(sep='\t') - self.tokenizer = get_tokenizer() - self.field = {'sentence': Const.INPUT, 'label': Const.TARGET} - - def _load(self, path: str) -> DataSet: - ds = super(SST2Loader, self)._load(path) - for k, v in self.field.items(): - if k in ds.get_field_names(): - ds.rename_field(k, v) - ds.apply(lambda x: self.tokenizer(x[Const.INPUT]), new_field_name=Const.INPUT) - print("all count:", len(ds)) - return ds - - def process(self, - paths: Union[str, Dict[str, str]], - src_vocab_opt: VocabularyOption = None, - tgt_vocab_opt: VocabularyOption = None, - char_level_op=False): - - paths = 
check_dataloader_paths(paths) - datasets = {} - info = DataBundle() - for name, path in paths.items(): - dataset = self.load(path) - datasets[name] = dataset - - def wordtochar(words): - chars = [] - for word in words: - word = word.lower() - for char in word: - chars.append(char) - chars.append('') - chars.pop() - return chars - - input_name, target_name = Const.INPUT, Const.TARGET - info.vocabs={} - - # 就分隔为char形式 - if char_level_op: - for dataset in datasets.values(): - dataset.apply_field(wordtochar, field_name=Const.INPUT, new_field_name=Const.CHAR_INPUT) - src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt) - src_vocab.from_dataset(datasets['train'], field_name=Const.INPUT) - src_vocab.index_dataset(*datasets.values(), field_name=Const.INPUT) - - tgt_vocab = Vocabulary(unknown=None, padding=None) \ - if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt) - tgt_vocab.from_dataset(datasets['train'], field_name=Const.TARGET) - tgt_vocab.index_dataset(*datasets.values(), field_name=Const.TARGET) - - info.vocabs = { - Const.INPUT: src_vocab, - Const.TARGET: tgt_vocab - } - - info.datasets = datasets - - for name, dataset in info.datasets.items(): - dataset.set_input(Const.INPUT) - dataset.set_target(Const.TARGET) - - return info - diff --git a/fastNLP/io/data_loader/yelp.py b/fastNLP/io/data_loader/yelp.py deleted file mode 100644 index 333fcab0..00000000 --- a/fastNLP/io/data_loader/yelp.py +++ /dev/null @@ -1,132 +0,0 @@ - -import csv -from typing import Iterable - -from ...core.const import Const -from ...core.dataset import DataSet -from ...core.instance import Instance -from ...core.vocabulary import VocabularyOption, Vocabulary -from ..base_loader import DataBundle, DataSetLoader -from typing import Union, Dict -from ..utils import check_dataloader_paths, get_tokenizer - - -class YelpLoader(DataSetLoader): - """ - 别名::class:`fastNLP.io.YelpLoader` :class:`fastNLP.io.data_loader.YelpLoader` - 读取Yelp_full/Yelp_polarity数据集, DataSet包含fields: - - words: list(str), 需要分类的文本 - - target: str, 文本的标签 - - chars:list(str),未index的字符列表 - - 数据集:yelp_full/yelp_polarity - - :param fine_grained: 是否使用SST-5标准,若 ``False`` , 使用SST-2。Default: ``False`` - :param lower: 是否需要自动转小写,默认为False。 - """ - - def __init__(self, fine_grained=False, lower=False): - super(YelpLoader, self).__init__() - tag_v = {'1.0': 'very negative', '2.0': 'negative', '3.0': 'neutral', - '4.0': 'positive', '5.0': 'very positive'} - if not fine_grained: - tag_v['1.0'] = tag_v['2.0'] - tag_v['5.0'] = tag_v['4.0'] - self.fine_grained = fine_grained - self.tag_v = tag_v - self.lower = lower - self.tokenizer = get_tokenizer() - - def _load(self, path): - ds = DataSet() - csv_reader = csv.reader(open(path, encoding='utf-8')) - all_count = 0 - real_count = 0 - for row in csv_reader: - all_count += 1 - if len(row) == 2: - target = self.tag_v[row[0] + ".0"] - words = clean_str(row[1], self.tokenizer, self.lower) - if len(words) != 0: - ds.append(Instance(words=words, target=target)) - real_count += 1 - print("all count:", all_count) - print("real count:", real_count) - return ds - - def process(self, paths: Union[str, Dict[str, str]], - train_ds: Iterable[str] = None, - src_vocab_op: VocabularyOption = None, - tgt_vocab_op: VocabularyOption = None, - char_level_op=False): - paths = check_dataloader_paths(paths) - info = DataBundle(datasets=self.load(paths)) - src_vocab = Vocabulary() if src_vocab_op is None else Vocabulary(**src_vocab_op) - tgt_vocab = Vocabulary(unknown=None, padding=None) \ - if tgt_vocab_op 
is None else Vocabulary(**tgt_vocab_op) - _train_ds = [info.datasets[name] - for name in train_ds] if train_ds else info.datasets.values() - - def wordtochar(words): - chars = [] - for word in words: - word = word.lower() - for char in word: - chars.append(char) - chars.append('') - chars.pop() - return chars - - input_name, target_name = Const.INPUT, Const.TARGET - info.vocabs = {} - # 就分隔为char形式 - if char_level_op: - for dataset in info.datasets.values(): - dataset.apply_field(wordtochar, field_name=Const.INPUT, new_field_name=Const.CHAR_INPUT) - else: - src_vocab.from_dataset(*_train_ds, field_name=input_name) - src_vocab.index_dataset(*info.datasets.values(), field_name=input_name, new_field_name=input_name) - info.vocabs[input_name] = src_vocab - - tgt_vocab.from_dataset(*_train_ds, field_name=target_name) - tgt_vocab.index_dataset( - *info.datasets.values(), - field_name=target_name, new_field_name=target_name) - - info.vocabs[target_name] = tgt_vocab - - info.datasets['train'], info.datasets['dev'] = info.datasets['train'].split(0.1, shuffle=False) - - for name, dataset in info.datasets.items(): - dataset.set_input(Const.INPUT) - dataset.set_target(Const.TARGET) - - return info - - -def clean_str(sentence, tokenizer, char_lower=False): - """ - heavily borrowed from github - https://github.com/LukeZhuang/Hierarchical-Attention-Network/blob/master/yelp-preprocess.ipynb - :param sentence: is a str - :return: - """ - if char_lower: - sentence = sentence.lower() - import re - nonalpnum = re.compile('[^0-9a-zA-Z?!\']+') - words = tokenizer(sentence) - words_collection = [] - for word in words: - if word in ['-lrb-', '-rrb-', '', '-r', '-l', 'b-']: - continue - tt = nonalpnum.split(word) - t = ''.join(tt) - if t != '': - words_collection.append(t) - - return words_collection - diff --git a/fastNLP/io/dataset_loader.py b/fastNLP/io/dataset_loader.py deleted file mode 100644 index ad6bbdc1..00000000 --- a/fastNLP/io/dataset_loader.py +++ /dev/null @@ -1,138 +0,0 @@ -""" -dataset_loader模块实现了许多 DataSetLoader, 用于读取不同格式的数据, 并返回 `DataSet` , -得到的 :class:`~fastNLP.DataSet` 对象可以直接传入 :class:`~fastNLP.Trainer` 和 :class:`~fastNLP.Tester`, 用于模型的训练和测试。 -以SNLI数据集为例:: - - loader = SNLILoader() - train_ds = loader.load('path/to/train') - dev_ds = loader.load('path/to/dev') - test_ds = loader.load('path/to/test') - - # ... do stuff - -为 fastNLP 提供 DataSetLoader 的开发者请参考 :class:`~fastNLP.io.DataSetLoader` 的介绍。 -""" -__all__ = [ - 'CSVLoader', - 'JsonLoader', -] - - -from ..core.dataset import DataSet -from ..core.instance import Instance -from .file_reader import _read_csv, _read_json -from .base_loader import DataSetLoader - - -class JsonLoader(DataSetLoader): - """ - 别名::class:`fastNLP.io.JsonLoader` :class:`fastNLP.io.dataset_loader.JsonLoader` - - 读取json格式数据.数据必须按行存储,每行是一个包含各类属性的json对象 - - :param dict fields: 需要读入的json属性名称, 和读入后在DataSet中存储的field_name - ``fields`` 的 `key` 必须是json对象的属性名. ``fields`` 的 `value` 为读入后在DataSet存储的 `field_name` , - `value` 也可为 ``None`` , 这时读入后的 `field_name` 与json对象对应属性同名 - ``fields`` 可为 ``None`` , 这时,json对象所有属性都保存在DataSet中. Default: ``None`` - :param bool dropna: 是否忽略非法数据,若 ``True`` 则忽略,若 ``False`` ,在遇到非法数据时,抛出 ``ValueError`` . 
- Default: ``False`` - """ - - def __init__(self, fields=None, dropna=False): - super(JsonLoader, self).__init__() - self.dropna = dropna - self.fields = None - self.fields_list = None - if fields: - self.fields = {} - for k, v in fields.items(): - self.fields[k] = k if v is None else v - self.fields_list = list(self.fields.keys()) - - def _load(self, path): - ds = DataSet() - for idx, d in _read_json(path, fields=self.fields_list, dropna=self.dropna): - if self.fields: - ins = {self.fields[k]: v for k, v in d.items()} - else: - ins = d - ds.append(Instance(**ins)) - return ds - - -class CSVLoader(DataSetLoader): - """ - 别名::class:`fastNLP.io.CSVLoader` :class:`fastNLP.io.dataset_loader.CSVLoader` - - 读取CSV格式的数据集。返回 ``DataSet`` - - :param List[str] headers: CSV文件的文件头.定义每一列的属性名称,即返回的DataSet中`field`的名称 - 若为 ``None`` ,则将读入文件的第一行视作 ``headers`` . Default: ``None`` - :param str sep: CSV文件中列与列之间的分隔符. Default: "," - :param bool dropna: 是否忽略非法数据,若 ``True`` 则忽略,若 ``False`` ,在遇到非法数据时,抛出 ``ValueError`` . - Default: ``False`` - """ - - def __init__(self, headers=None, sep=",", dropna=False): - self.headers = headers - self.sep = sep - self.dropna = dropna - - def _load(self, path): - ds = DataSet() - for idx, data in _read_csv(path, headers=self.headers, - sep=self.sep, dropna=self.dropna): - ds.append(Instance(**data)) - return ds - - -def _cut_long_sentence(sent, max_sample_length=200): - """ - 将长于max_sample_length的sentence截成多段,只会在有空格的地方发生截断。 - 所以截取的句子可能长于或者短于max_sample_length - - :param sent: str. - :param max_sample_length: int. - :return: list of str. - """ - sent_no_space = sent.replace(' ', '') - cutted_sentence = [] - if len(sent_no_space) > max_sample_length: - parts = sent.strip().split() - new_line = '' - length = 0 - for part in parts: - length += len(part) - new_line += part + ' ' - if length > max_sample_length: - new_line = new_line[:-1] - cutted_sentence.append(new_line) - length = 0 - new_line = '' - if new_line != '': - cutted_sentence.append(new_line[:-1]) - else: - cutted_sentence.append(sent) - return cutted_sentence - - -def _add_seg_tag(data): - """ - - :param data: list of ([word], [pos], [heads], [head_tags]) - :return: list of ([word], [pos]) - """ - - _processed = [] - for word_list, pos_list, _, _ in data: - new_sample = [] - for word, pos in zip(word_list, pos_list): - if len(word) == 1: - new_sample.append((word, 'S-' + pos)) - else: - new_sample.append((word[0], 'B-' + pos)) - for c in word[1:-1]: - new_sample.append((c, 'M-' + pos)) - new_sample.append((word[-1], 'E-' + pos)) - _processed.append(list(map(list, zip(*new_sample)))) - return _processed diff --git a/fastNLP/io/embed_loader.py b/fastNLP/io/embed_loader.py index 91a0919c..73a7a1de 100644 --- a/fastNLP/io/embed_loader.py +++ b/fastNLP/io/embed_loader.py @@ -1,16 +1,20 @@ +""" +.. 
todo:: + doc +""" __all__ = [ "EmbedLoader", "EmbeddingOption", ] +import logging import os import warnings import numpy as np -from ..core.vocabulary import Vocabulary -from .base_loader import BaseLoader from ..core.utils import Option +from ..core.vocabulary import Vocabulary class EmbeddingOption(Option): @@ -27,10 +31,8 @@ class EmbeddingOption(Option): ) -class EmbedLoader(BaseLoader): +class EmbedLoader: """ - 别名::class:`fastNLP.io.EmbedLoader` :class:`fastNLP.io.embed_loader.EmbedLoader` - 用于读取预训练的embedding, 读取结果可直接载入为模型参数。 """ @@ -79,9 +81,9 @@ class EmbedLoader(BaseLoader): word = ''.join(parts[:-dim]) nums = parts[-dim:] # 对齐unk与pad - if word==padding and vocab.padding is not None: + if word == padding and vocab.padding is not None: word = vocab.padding - elif word==unknown and vocab.unknown is not None: + elif word == unknown and vocab.unknown is not None: word = vocab.unknown if word in vocab: index = vocab.to_index(word) @@ -91,10 +93,10 @@ class EmbedLoader(BaseLoader): if error == 'ignore': warnings.warn("Error occurred at the {} line.".format(idx)) else: - print("Error occurred at the {} line.".format(idx)) + logging.error("Error occurred at the {} line.".format(idx)) raise e total_hits = sum(hit_flags) - print("Found {} out of {} words in the pre-training embedding.".format(total_hits, len(vocab))) + logging.info("Found {} out of {} words in the pre-training embedding.".format(total_hits, len(vocab))) if init_method is None: found_vectors = matrix[hit_flags] if len(found_vectors) != 0: @@ -157,7 +159,7 @@ class EmbedLoader(BaseLoader): warnings.warn("Error occurred at the {} line.".format(idx)) pass else: - print("Error occurred at the {} line.".format(idx)) + logging.error("Error occurred at the {} line.".format(idx)) raise e if dim == -1: raise RuntimeError("{} is an empty file.".format(embed_filepath)) @@ -166,7 +168,7 @@ class EmbedLoader(BaseLoader): index = vocab.to_index(key) matrix[index] = vec - if (unknown is not None and not found_unknown) or (padding is not None and not found_pad): + if ((unknown is not None) and (not found_unknown)) or ((padding is not None) and (not found_pad)): start_idx = 0 if padding is not None: start_idx += 1 @@ -175,9 +177,9 @@ class EmbedLoader(BaseLoader): mean = np.mean(matrix[start_idx:], axis=0, keepdims=True) std = np.std(matrix[start_idx:], axis=0, keepdims=True) - if (unknown is not None and not found_unknown): + if (unknown is not None) and (not found_unknown): matrix[start_idx - 1] = np.random.randn(1, dim).astype(dtype) * std + mean - if (padding is not None and not found_pad): + if (padding is not None) and (not found_pad): matrix[0] = np.random.randn(1, dim).astype(dtype) * std + mean if normalize: diff --git a/fastNLP/io/file_reader.py b/fastNLP/io/file_reader.py index 0ae0a319..f1c90284 100644 --- a/fastNLP/io/file_reader.py +++ b/fastNLP/io/file_reader.py @@ -1,7 +1,13 @@ -""" +"""undocumented 此模块用于给其它模块提供读取文件的函数,没有为用户提供 API """ + +__all__ = [] + import json +import csv + +from ..core import logger def _read_csv(path, encoding='utf-8', headers=None, sep=',', dropna=True): @@ -16,23 +22,28 @@ def _read_csv(path, encoding='utf-8', headers=None, sep=',', dropna=True): :if False, raise ValueError when reading invalid data. 
default: True :return: generator, every time yield (line number, csv item) """ - with open(path, 'r', encoding=encoding) as f: + with open(path, 'r', encoding=encoding) as csv_file: + f = csv.reader(csv_file, delimiter=sep) start_idx = 0 if headers is None: - headers = f.readline().rstrip('\r\n') - headers = headers.split(sep) + headers = next(f) start_idx += 1 elif not isinstance(headers, (list, tuple)): - raise TypeError("headers should be list or tuple, not {}." \ - .format(type(headers))) + raise TypeError("headers should be list or tuple, not {}." \ + .format(type(headers))) for line_idx, line in enumerate(f, start_idx): - contents = line.rstrip('\r\n').split(sep) + contents = line if len(contents) != len(headers): if dropna: continue else: - raise ValueError("Line {} has {} parts, while header has {} parts." \ - .format(line_idx, len(contents), len(headers))) + if "" in headers: + raise ValueError(("Line {} has {} parts, while header has {} parts.\n" + + "Please check the empty parts or unnecessary '{}'s in header.") + .format(line_idx, len(contents), len(headers), sep)) + else: + raise ValueError("Line {} has {} parts, while header has {} parts." \ + .format(line_idx, len(contents), len(headers))) _dict = {} for header, content in zip(headers, contents): _dict[header] = content @@ -81,6 +92,7 @@ def _read_conll(path, encoding='utf-8', indexes=None, dropna=True): :if False, raise ValueError when reading invalid data. default: True :return: generator, every time yield (line number, conll item) """ + def parse_conll(sample): sample = list(map(list, zip(*sample))) sample = [sample[i] for i in indexes] @@ -88,14 +100,15 @@ def _read_conll(path, encoding='utf-8', indexes=None, dropna=True): if len(f) <= 0: raise ValueError('empty field') return sample + with open(path, 'r', encoding=encoding) as f: sample = [] start = next(f).strip() - if '-DOCSTART-' not in start and start!='': + if start != '': sample.append(start.split()) for line_idx, line in enumerate(f, 1): line = line.strip() - if line=='': + if line == '': if len(sample): try: res = parse_conll(sample) @@ -103,13 +116,13 @@ def _read_conll(path, encoding='utf-8', indexes=None, dropna=True): yield line_idx, res except Exception as e: if dropna: + logger.warning('Invalid instance which ends at line: {} has been dropped.'.format(line_idx)) continue - raise ValueError('invalid instance ends at line: {}'.format(line_idx)) + raise ValueError('Invalid instance which ends at line: {}'.format(line_idx)) elif line.startswith('#'): continue else: - if not line.startswith('-DOCSTART-'): - sample.append(line.split()) + sample.append(line.split()) if len(sample) > 0: try: res = parse_conll(sample) @@ -117,5 +130,5 @@ def _read_conll(path, encoding='utf-8', indexes=None, dropna=True): except Exception as e: if dropna: return - print('invalid instance ends at line: {}'.format(line_idx)) + logger.error('invalid instance ends at line: {}'.format(line_idx)) raise e diff --git a/fastNLP/io/file_utils.py b/fastNLP/io/file_utils.py index cb762eb7..2c447e87 100644 --- a/fastNLP/io/file_utils.py +++ b/fastNLP/io/file_utils.py @@ -1,71 +1,180 @@ +""" +.. 
todo:: + doc +""" + +__all__ = [ + "cached_path", + "get_filepath", + "get_cache_path", + "split_filename_suffix", + "get_from_cache", +] import os +import re +import shutil +import tempfile from pathlib import Path from urllib.parse import urlparse -import re + import requests -import tempfile +from requests import HTTPError from tqdm import tqdm -import shutil -import hashlib +from ..core import logger PRETRAINED_BERT_MODEL_DIR = { - 'en': 'bert-base-cased-f89bfe08.zip', - 'en-base-uncased': 'bert-base-uncased-3413b23c.zip', - 'en-base-cased': 'bert-base-cased-f89bfe08.zip', - 'en-large-uncased': 'bert-large-uncased-20939f45.zip', - 'en-large-cased': 'bert-large-cased-e0cf90fc.zip', - - 'en-large-cased-wwm': 'bert-large-cased-wwm-a457f118.zip', - 'en-large-uncased-wwm': 'bert-large-uncased-wwm-92a50aeb.zip', - 'en-base-cased-mrpc': 'bert-base-cased-finetuned-mrpc-c7099855.zip', - - 'cn': 'bert-base-chinese-29d0a84a.zip', - 'cn-base': 'bert-base-chinese-29d0a84a.zip', - - 'multilingual': 'bert-base-multilingual-cased-1bd364ee.zip', - 'multilingual-base-uncased': 'bert-base-multilingual-uncased-f8730fe4.zip', - 'multilingual-base-cased': 'bert-base-multilingual-cased-1bd364ee.zip', + 'en': 'bert-base-cased.zip', + 'en-large-cased-wwm': 'bert-large-cased-wwm.zip', + 'en-large-uncased-wwm': 'bert-large-uncased-wwm.zip', + + 'en-large-uncased': 'bert-large-uncased.zip', + 'en-large-cased': 'bert-large-cased.zip', + + 'en-base-uncased': 'bert-base-uncased.zip', + 'en-base-cased': 'bert-base-cased.zip', + + 'en-base-cased-mrpc': 'bert-base-cased-finetuned-mrpc.zip', + + 'en-distilbert-base-uncased': 'distilbert-base-uncased.zip', + + 'multi-base-cased': 'bert-base-multilingual-cased.zip', + 'multi-base-uncased': 'bert-base-multilingual-uncased.zip', + + 'cn': 'bert-chinese-wwm.zip', + 'cn-base': 'bert-base-chinese.zip', + 'cn-wwm': 'bert-chinese-wwm.zip', + 'cn-wwm-ext': "bert-chinese-wwm-ext.zip" } PRETRAINED_ELMO_MODEL_DIR = { - 'en': 'elmo_en-d39843fe.tar.gz', - 'cn': 'elmo_cn-5e9b34e2.tar.gz' + 'en': 'elmo_en_Medium.zip', + 'en-small': "elmo_en_Small.zip", + 'en-original-5.5b': 'elmo_en_Original_5.5B.zip', + 'en-original': 'elmo_en_Original.zip', + 'en-medium': 'elmo_en_Medium.zip' } PRETRAIN_STATIC_FILES = { - 'en': 'glove.840B.300d-cc1ad5e1.tar.gz', - 'en-glove-840b-300': 'glove.840B.300d-cc1ad5e1.tar.gz', - 'en-glove-6b-50': "glove.6B.50d-a6028c70.tar.gz", - 'en-word2vec-300': "GoogleNews-vectors-negative300-be166d9d.tar.gz", - 'en-fasttext': "cc.en.300.vec-d53187b2.gz", - 'cn': "tencent_cn-dab24577.tar.gz", - 'cn-fasttext': "cc.zh.300.vec-d68a9bcf.gz", + 'en': 'glove.840B.300d.zip', + + 'en-glove-6b-50d': 'glove.6B.50d.zip', + 'en-glove-6b-100d': 'glove.6B.100d.zip', + 'en-glove-6b-200d': 'glove.6B.200d.zip', + 'en-glove-6b-300d': 'glove.6B.300d.zip', + 'en-glove-42b-300d': 'glove.42B.300d.zip', + 'en-glove-840b-300d': 'glove.840B.300d.zip', + 'en-glove-twitter-27b-25d': 'glove.twitter.27B.25d.zip', + 'en-glove-twitter-27b-50d': 'glove.twitter.27B.50d.zip', + 'en-glove-twitter-27b-100d': 'glove.twitter.27B.100d.zip', + 'en-glove-twitter-27b-200d': 'glove.twitter.27B.200d.zip', + + 'en-word2vec-300': "GoogleNews-vectors-negative300.txt.gz", + + 'en-fasttext-wiki': "wiki-news-300d-1M.vec.zip", + 'en-fasttext-crawl': "crawl-300d-2M.vec.zip", + + 'cn': "tencent_cn.zip", + 'cn-tencent': "tencent_cn.zip", + 'cn-fasttext': "cc.zh.300.vec.gz", + 'cn-sgns-literature-word': 'sgns.literature.word.txt.zip', + 'cn-char-fastnlp-100d': "cn_char_fastnlp_100d.zip", + 'cn-bi-fastnlp-100d': 
"cn_bi_fastnlp_100d.zip", + "cn-tri-fastnlp-100d": "cn_tri_fastnlp_100d.zip" } +DATASET_DIR = { + # Classification, English + 'aclImdb': "imdb.zip", + "yelp-review-full": "yelp_review_full.tar.gz", + "yelp-review-polarity": "yelp_review_polarity.tar.gz", + "sst-2": "SST-2.zip", + "sst": "SST.zip", + + # Classification, Chinese + "chn-senti-corp": "chn_senti_corp.zip", + "weibo-senti-100k": "WeiboSenti100k.zip", + "thuc-news": "THUCNews.zip", + + # Matching, English + "mnli": "MNLI.zip", + "snli": "SNLI.zip", + "qnli": "QNLI.zip", + "rte": "RTE.zip", + + # Matching, Chinese + "cn-xnli": "XNLI.zip", + + # Sequence Labeling, Chinese + "msra-ner": "MSRA_NER.zip", + "peopledaily": "peopledaily.zip", + "weibo-ner": "weibo_NER.zip", + + # Chinese Word Segmentation + "cws-pku": 'cws_pku.zip', + "cws-cityu": "cws_cityu.zip", + "cws-as": 'cws_as.zip', + "cws-msra": 'cws_msra.zip', + + # Summarization, English + "ext-cnndm": "ext-cnndm.zip", + + # Question & answer + "cmrc2018": "cmrc2018.zip" -def cached_path(url_or_filename: str, cache_dir: Path=None) -> Path: +} + +PRETRAIN_MAP = {'elmo': PRETRAINED_ELMO_MODEL_DIR, + "bert": PRETRAINED_BERT_MODEL_DIR, + "static": PRETRAIN_STATIC_FILES} + +# 用于扩展fastNLP的下载 +FASTNLP_EXTEND_DATASET_URL = 'fastnlp_dataset_url.txt' +FASTNLP_EXTEND_EMBEDDING_URL = {'elmo': 'fastnlp_elmo_url.txt', + 'bert':'fastnlp_bert_url.txt', + 'static': 'fastnlp_static_url.txt' +} + + +def cached_path(url_or_filename: str, cache_dir: str = None, name=None) -> Path: """ - 给定一个url或者文件名(可以是具体的文件名,也可以是文件),先在cache_dir下寻找该文件是否存在,如果不存在则去下载, 并 - 将文件放入到cache_dir中 + 给定一个url,尝试通过url中的解析出来的文件名字filename到{cache_dir}/{name}/{filename}下寻找这个文件, + + 1. 如果cache_dir=None, 则cache_dir=~/.fastNLP/; 否则cache_dir=cache_dir + 2. 如果name=None, 则没有中间的{name}这一层结构;否者中间结构就为{name} + + 如果有该文件,就直接返回路径 + + 如果没有该文件,则尝试用传入的url下载 + + 或者文件名(可以是具体的文件名,也可以是文件夹),先在cache_dir下寻找该文件是否存在,如果不存在则去下载, 并 + 将文件放入到cache_dir中. + + :param str url_or_filename: 文件的下载url或者文件名称。 + :param str cache_dir: 文件的缓存文件夹。如果为None,将使用"~/.fastNLP"这个默认路径 + :param str name: 中间一层的名称。如embedding, dataset + :return: """ if cache_dir is None: - dataset_cache = Path(get_defalt_path()) + data_cache = Path(get_cache_path()) else: - dataset_cache = cache_dir + data_cache = cache_dir + + if name: + data_cache = os.path.join(data_cache, name) parsed = urlparse(url_or_filename) if parsed.scheme in ("http", "https"): # URL, so get it from the cache (downloading if necessary) - return get_from_cache(url_or_filename, dataset_cache) - elif parsed.scheme == "" and Path(os.path.join(dataset_cache, url_or_filename)).exists(): + return get_from_cache(url_or_filename, Path(data_cache)) + elif parsed.scheme == "" and Path(os.path.join(data_cache, url_or_filename)).exists(): # File, and it exists. - return Path(url_or_filename) + return Path(os.path.join(data_cache, url_or_filename)) elif parsed.scheme == "": # File, but it doesn't exist. 
- raise FileNotFoundError("file {} not found".format(url_or_filename)) + raise FileNotFoundError("file {} not found in {}.".format(url_or_filename, data_cache)) else: # Something unknown raise ValueError( @@ -75,48 +184,143 @@ def cached_path(url_or_filename: str, cache_dir: Path=None) -> Path: def get_filepath(filepath): """ - 如果filepath中只有一个文件,则直接返回对应的全路径 - :param filepath: + 如果filepath为文件夹, + + 如果内含多个文件, 返回filepath + + 如果只有一个文件, 返回filepath + filename + + 如果filepath为文件 + + 返回filepath + + :param str filepath: 路径 :return: """ if os.path.isdir(filepath): files = os.listdir(filepath) - if len(files)==1: + if len(files) == 1: return os.path.join(filepath, files[0]) else: return filepath - return filepath + elif os.path.isfile(filepath): + return filepath + else: + raise FileNotFoundError(f"{filepath} is not a valid file or directory.") -def get_defalt_path(): +def get_cache_path(): """ - 获取默认的fastNLP存放路径, 如果将FASTNLP_CACHE_PATH设置在了环境变量中,将使用环境变量的值,使得不用每个用户都去下载。 + 获取fastNLP默认cache的存放路径, 如果将FASTNLP_CACHE_PATH设置在了环境变量中,将使用环境变量的值,使得不用每个用户都去下载。 - :return: + :return str: 存放路径 """ if 'FASTNLP_CACHE_DIR' in os.environ: fastnlp_cache_dir = os.environ.get('FASTNLP_CACHE_DIR') - if os.path.exists(fastnlp_cache_dir): + if os.path.isdir(fastnlp_cache_dir): return fastnlp_cache_dir - raise RuntimeError("Some errors happens on cache directory.") - else: - raise RuntimeError("There function is not available right now.") + else: + raise NotADirectoryError(f"{os.environ['FASTNLP_CACHE_DIR']} is not a directory.") fastnlp_cache_dir = os.path.expanduser(os.path.join("~", ".fastNLP")) return fastnlp_cache_dir def _get_base_url(name): + """ + 根据name返回下载的url地址。 + + :param str name: 支持dataset和embedding两种 + :return: + """ # 返回的URL结尾必须是/ - if 'FASTNLP_BASE_URL' in os.environ: - fastnlp_base_url = os.environ['FASTNLP_BASE_URL'] - return fastnlp_base_url - raise RuntimeError("There function is not available right now.") + environ_name = "FASTNLP_{}_URL".format(name.upper()) + + if environ_name in os.environ: + url = os.environ[environ_name] + if url.endswith('/'): + return url + else: + return url + '/' + else: + URLS = { + 'embedding': "http://212.129.155.247/embedding/", + "dataset": "http://212.129.155.247/dataset/" + } + if name.lower() not in URLS: + raise KeyError(f"{name} is not recognized.") + return URLS[name.lower()] + + +def _get_embedding_url(embed_type, name): + """ + 给定embedding类似和名称,返回下载url + + :param str embed_type: 支持static, bert, elmo。即embedding的类型 + :param str name: embedding的名称, 例如en, cn, based等 + :return: str, 下载的url地址 + """ + # 从扩展中寻找下载的url + _filename = FASTNLP_EXTEND_EMBEDDING_URL.get(embed_type, None) + if _filename: + url = _read_extend_url_file(_filename, name) + if url: + return url + embed_map = PRETRAIN_MAP.get(embed_type, None) + if embed_map: + filename = embed_map.get(name, None) + if filename: + url = _get_base_url('embedding') + filename + return url + raise KeyError("There is no {}. Only supports {}.".format(name, list(embed_map.keys()))) + else: + raise KeyError(f"There is no {embed_type}. 
Only supports bert, elmo, static") + +def _read_extend_url_file(filename, name)->str: + """ + filename中的内容使用制表符隔开,第一列是名称,第二列是下载的url地址 + + :param str filename: 在默认的路径下寻找file这个文件 + :param str name: 需要寻找的资源的名称 + :return: str or None + """ + cache_dir = get_cache_path() + filepath = os.path.join(cache_dir, filename) + if os.path.exists(filepath): + with open(filepath, 'r', encoding='utf-8') as f: + for line in f: + line = line.strip() + if line: + parts = line.split('\t') + if len(parts) == 2: + if name == parts[0]: + return parts[1] + return None + +def _get_dataset_url(name): + """ + 给定dataset的名称,返回下载url + + :param str name: 给定dataset的名称,比如imdb, sst-2等 + :return: str + """ + # 从扩展中寻找下载的url + url = _read_extend_url_file(FASTNLP_EXTEND_DATASET_URL, name) + if url: + return url + + filename = DATASET_DIR.get(name, None) + if filename: + url = _get_base_url('dataset') + filename + return url + else: + raise KeyError(f"There is no {name}.") def split_filename_suffix(filepath): """ - 给定filepath返回对应的name和suffix - :param filepath: + 给定filepath 返回对应的name和suffix. 如果后缀是多个点,仅支持.tar.gz类型 + + :param filepath: 文件路径 :return: filename, suffix """ filename = os.path.basename(filepath) @@ -127,21 +331,19 @@ def split_filename_suffix(filepath): def get_from_cache(url: str, cache_dir: Path = None) -> Path: """ - 尝试在cache_dir中寻找url定义的资源; 如果没有找到。则从url下载并将结果放在cache_dir下,缓存的名称由url的结果推断而来。 - 如果从url中下载的资源解压后有多个文件,则返回directory的路径; 如果只有一个资源,则返回具体的路径。 - + 尝试在cache_dir中寻找url定义的资源; 如果没有找到; 则从url下载并将结果放在cache_dir下,缓存的名称由url的结果推断而来。会将下载的 + 文件解压,将解压后的文件全部放在cache_dir文件夹中。 + + 如果从url中下载的资源解压后有多个文件,则返回目录的路径; 如果只有一个资源文件,则返回具体的路径。 + + :param url: 资源的 url + :param cache_dir: cache 目录 + :return: 路径 """ cache_dir.mkdir(parents=True, exist_ok=True) filename = re.sub(r".+/", "", url) dir_name, suffix = split_filename_suffix(filename) - sep_index = dir_name[::-1].index('-') - if sep_index<0: - check_sum = None - else: - check_sum = dir_name[-sep_index+1:] - sep_index = len(dir_name) if sep_index==-1 else -sep_index-1 - dir_name = dir_name[:sep_index] # 寻找与它名字匹配的内容, 而不关心后缀 match_dir_name = match_file(dir_name, cache_dir) @@ -154,11 +356,11 @@ def get_from_cache(url: str, cache_dir: Path = None) -> Path: return get_filepath(cache_path) # make HEAD request to check ETag TODO ETag可以用来判断资源是否已经更新了,之后需要加上 - response = requests.head(url, headers={"User-Agent": "fastNLP"}) - if response.status_code != 200: - raise IOError( - f"HEAD request failed for url {url} with status code {response.status_code}." - ) + # response = requests.head(url, headers={"User-Agent": "fastNLP"}) + # if response.status_code != 200: + # raise IOError( + # f"HEAD request failed for url {url} with status code {response.status_code}." + # ) # add ETag to filename if it exists # etag = response.headers.get("ETag") @@ -166,74 +368,77 @@ def get_from_cache(url: str, cache_dir: Path = None) -> Path: if not cache_path.exists(): # Download to temporary file, then copy to cache dir once finished. # Otherwise you get corrupt cache entries if the download gets interrupted. 
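The helpers in this file can be exercised end to end: `_get_embedding_url` / `_get_dataset_url` resolve a short name to a download URL (consulting the tab-separated extension files under the cache directory first), and `cached_path` downloads and unpacks the result into `~/.fastNLP/`. A hedged usage sketch; the custom corpus name and URL are hypothetical placeholders, and the GloVe call performs a real download on first use:

```python
# Hedged sketch of the file_utils helpers added/reworked in this diff.
import os
from fastNLP.io.file_utils import (cached_path, get_cache_path,
                                   _get_embedding_url, FASTNLP_EXTEND_DATASET_URL)

# 1) resolve a built-in embedding name and cache the archive locally
url = _get_embedding_url('static', 'en-glove-6b-50d')   # -> <base_url>/glove.6B.50d.zip
local_path = cached_path(url, name='embedding')         # downloaded once, reused afterwards

# 2) register an extra dataset URL through the extension file read by
#    _read_extend_url_file ('my-corpus' and the URL are placeholders)
cache_dir = get_cache_path()
os.makedirs(cache_dir, exist_ok=True)
with open(os.path.join(cache_dir, FASTNLP_EXTEND_DATASET_URL), 'a', encoding='utf-8') as f:
    f.write("my-corpus\thttps://example.com/my_corpus.zip\n")
# _get_dataset_url('my-corpus') now prefers this entry over DATASET_DIR.
```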
- fd, temp_filename = tempfile.mkstemp() - print("%s not found in cache, downloading to %s"%(url, temp_filename)) - # GET file object req = requests.get(url, stream=True, headers={"User-Agent": "fastNLP"}) - content_length = req.headers.get("Content-Length") - total = int(content_length) if content_length is not None else None - progress = tqdm(unit="B", total=total) - sha256 = hashlib.sha256() - with open(temp_filename, "wb") as temp_file: - for chunk in req.iter_content(chunk_size=1024): - if chunk: # filter out keep-alive new chunks - progress.update(len(chunk)) - temp_file.write(chunk) - sha256.update(chunk) - # check sum - digit = sha256.hexdigest()[:8] - if not check_sum: - assert digit == check_sum, "File corrupted when download." - progress.close() - print(f"Finish download from {url}.") - - # 开始解压 - delete_temp_dir = None - if suffix in ('.zip', '.tar.gz'): - uncompress_temp_dir = tempfile.mkdtemp() - delete_temp_dir = uncompress_temp_dir - print(f"Start to uncompress file to {uncompress_temp_dir}.") - if suffix == '.zip': - unzip_file(Path(temp_filename), Path(uncompress_temp_dir)) - else: - untar_gz_file(Path(temp_filename), Path(uncompress_temp_dir)) - filenames = os.listdir(uncompress_temp_dir) - if len(filenames)==1: - if os.path.isdir(os.path.join(uncompress_temp_dir, filenames[0])): - uncompress_temp_dir = os.path.join(uncompress_temp_dir, filenames[0]) - - cache_path.mkdir(parents=True, exist_ok=True) - print("Finish un-compressing file.") - else: - uncompress_temp_dir = temp_filename - cache_path = str(cache_path) + suffix - success = False - try: - # 复制到指定的位置 - print(f"Copy file to {cache_path}.") - if os.path.isdir(uncompress_temp_dir): - for filename in os.listdir(uncompress_temp_dir): - shutil.copyfile(os.path.join(uncompress_temp_dir, filename), cache_path/filename) - else: - shutil.copyfile(uncompress_temp_dir, cache_path) - success = True - except Exception as e: - print(e) - raise e - finally: - if not success: - if cache_path.exists(): - if cache_path.is_file(): - os.remove(cache_path) + if req.status_code == 200: + success = False + fd, temp_filename = tempfile.mkstemp() + uncompress_temp_dir = None + try: + content_length = req.headers.get("Content-Length") + total = int(content_length) if content_length is not None else None + progress = tqdm(unit="B", total=total, unit_scale=1) + logger.info("%s not found in cache, downloading to %s" % (url, temp_filename)) + + with open(temp_filename, "wb") as temp_file: + for chunk in req.iter_content(chunk_size=1024 * 16): + if chunk: # filter out keep-alive new chunks + progress.update(len(chunk)) + temp_file.write(chunk) + progress.close() + logger.info(f"Finish download from {url}") + + # 开始解压 + if suffix in ('.zip', '.tar.gz', '.gz'): + uncompress_temp_dir = tempfile.mkdtemp() + logger.debug(f"Start to uncompress file to {uncompress_temp_dir}") + if suffix == '.zip': + unzip_file(Path(temp_filename), Path(uncompress_temp_dir)) + elif suffix == '.gz': + ungzip_file(temp_filename, uncompress_temp_dir, dir_name) else: - shutil.rmtree(cache_path) - if delete_temp_dir: - shutil.rmtree(delete_temp_dir) - os.close(fd) - os.remove(temp_filename) - - return get_filepath(cache_path) + untar_gz_file(Path(temp_filename), Path(uncompress_temp_dir)) + filenames = os.listdir(uncompress_temp_dir) + if len(filenames) == 1: + if os.path.isdir(os.path.join(uncompress_temp_dir, filenames[0])): + uncompress_temp_dir = os.path.join(uncompress_temp_dir, filenames[0]) + + cache_path.mkdir(parents=True, exist_ok=True) + logger.debug("Finish 
un-compressing file.") + else: + uncompress_temp_dir = temp_filename + cache_path = str(cache_path) + suffix + + # 复制到指定的位置 + logger.info(f"Copy file to {cache_path}") + if os.path.isdir(uncompress_temp_dir): + for filename in os.listdir(uncompress_temp_dir): + if os.path.isdir(os.path.join(uncompress_temp_dir, filename)): + shutil.copytree(os.path.join(uncompress_temp_dir, filename), cache_path / filename) + else: + shutil.copyfile(os.path.join(uncompress_temp_dir, filename), cache_path / filename) + else: + shutil.copyfile(uncompress_temp_dir, cache_path) + success = True + except Exception as e: + logger.error(e) + raise e + finally: + if not success: + if cache_path.exists(): + if cache_path.is_file(): + os.remove(cache_path) + else: + shutil.rmtree(cache_path) + os.close(fd) + os.remove(temp_filename) + if os.path.isdir(uncompress_temp_dir): + shutil.rmtree(uncompress_temp_dir) + elif os.path.isfile(uncompress_temp_dir): + os.remove(uncompress_temp_dir) + return get_filepath(cache_path) + else: + raise HTTPError(f"Status code:{req.status_code}. Fail to download from {url}.") def unzip_file(file: Path, to: Path): @@ -245,55 +450,39 @@ def unzip_file(file: Path, to: Path): zipObj.extractall(to) -def untar_gz_file(file:Path, to:Path): +def untar_gz_file(file: Path, to: Path): import tarfile with tarfile.open(file, 'r:gz') as tar: tar.extractall(to) -def match_file(dir_name: str, cache_dir: str) -> str: +def ungzip_file(file: str, to: str, filename:str): + import gzip + + g_file = gzip.GzipFile(file) + with open(os.path.join(to, filename), 'wb+') as f: + f.write(g_file.read()) + g_file.close() + + +def match_file(dir_name: str, cache_dir: Path) -> str: """ - 匹配的原则是,在cache_dir下的文件: (1) 与dir_name完全一致; (2) 除了后缀以外和dir_name完全一致。 + 匹配的原则是: 在cache_dir下的文件与dir_name完全一致, 或除了后缀以外和dir_name完全一致。 如果找到了两个匹配的结果将报错. 如果找到了则返回匹配的文件的名称; 没有找到返回空字符串 :param dir_name: 需要匹配的名称 :param cache_dir: 在该目录下找匹配dir_name是否存在 - :return: str + :return str: 做为匹配结果的字符串 """ files = os.listdir(cache_dir) matched_filenames = [] for file_name in files: - if re.match(dir_name+'$', file_name) or re.match(dir_name+'\\..*', file_name): + if re.match(dir_name + '$', file_name) or re.match(dir_name + '\\..*', file_name): matched_filenames.append(file_name) - if len(matched_filenames)==0: + if len(matched_filenames) == 0: return '' - elif len(matched_filenames)==1: + elif len(matched_filenames) == 1: return matched_filenames[-1] else: raise RuntimeError(f"Duplicate matched files:{matched_filenames}, this should be caused by a bug.") - - -if __name__ == '__main__': - cache_dir = Path('caches') - cache_dir = None - # 需要对cache_dir进行测试 - base_url = 'http://0.0.0.0:8888/file/download' - # if True: - # for filename in os.listdir(cache_dir): - # if os.path.isdir(os.path.join(cache_dir, filename)): - # shutil.rmtree(os.path.join(cache_dir, filename)) - # else: - # os.remove(os.path.join(cache_dir, filename)) - # 1. 测试.txt文件 - print(cached_path(base_url + '/{}'.format('txt_test-bcb4fe65.txt'), cache_dir)) - # 2. 测试.zip文件(只有一个文件) - print(cached_path(base_url + '/{}'.format('zip_test-40966d39.zip'), cache_dir)) - # 3. 测试.zip文件(有多个文件) - print(cached_path(base_url + '/{}'.format('zip_pack_test-70c0b20d.zip'), cache_dir)) - # 4. 测试.tar.gz文件 - print(cached_path(base_url + '/{}'.format('tar_gz_test-3e2679cf.tar.gz'), cache_dir)) - # 5. 测试.tar.gz多个文件 - print(cached_path(base_url + '/{}'.format('tar_gz_pack_test-08dfdccd.tar.gz'), cache_dir)) - - # 6. 
测试.pkl文件 diff --git a/fastNLP/io/loader/__init__.py b/fastNLP/io/loader/__init__.py new file mode 100644 index 00000000..c50ce383 --- /dev/null +++ b/fastNLP/io/loader/__init__.py @@ -0,0 +1,99 @@ +""" +Loader用于读取数据,并将内容读取到 :class:`~fastNLP.DataSet` 或者 :class:`~fastNLP.io.DataBundle` 中。所有的Loader都支持以下的 +三个方法: ``__init__`` , ``_load`` , ``loads`` . 其中 ``__init__(...)`` 用于申明读取参数,以及说明该Loader支持的数据格式, +读取后 :class:`~fastNLP.DataSet` 中的 `field` ; ``_load(path)`` 方法传入文件路径读取单个文件,并返回 :class:`~fastNLP.DataSet` ; +``load(paths)`` 用于读取文件夹下的文件,并返回 :class:`~fastNLP.io.DataBundle` 类型的对象 , load()方法支持以下几种类型的参数: + +0.传入None + 将尝试自动下载数据集并缓存。但不是所有的数据都可以直接下载。 + +1.传入一个文件的 path + 返回的 `data_bundle` 包含一个名为 `train` 的 dataset ,可以通过 ``data_bundle.get_dataset('train')`` 获取 + +2.传入一个文件夹目录 + 将读取的是这个文件夹下文件名中包含 `train` , `test` , `dev` 的文件,其它文件会被忽略。假设某个目录下的文件为:: + + | + +-train.txt + +-dev.txt + +-test.txt + +-other.txt + + 在 Loader().load('/path/to/dir') 返回的 `data_bundle` 中可以用 ``data_bundle.get_dataset('train')`` , + ``data_bundle.get_dataset('dev')`` , + ``data_bundle.get_dataset('test')`` 获取对应的 `dataset` ,其中 `other.txt` 的内容会被忽略。假设某个目录下的文件为:: + + | + +-train.txt + +-dev.txt + + 在 Loader().load('/path/to/dir') 返回的 `data_bundle` 中可以用 ``data_bundle.get_dataset('train')`` , + ``data_bundle.get_dataset('dev')`` 获取对应的 dataset。 + +3.传入一个字典 + 字典的的 key 为 `dataset` 的名称,value 是该 `dataset` 的文件路径:: + + paths = {'train':'/path/to/train', 'dev': '/path/to/dev', 'test':'/path/to/test'} + + 在 Loader().load(paths) 返回的 `data_bundle` 中可以用 ``data_bundle.get_dataset('train')`` , ``data_bundle.get_dataset('dev')`` , + ``data_bundle.get_dataset('test')`` 来获取对应的 `dataset` + +fastNLP 目前提供了如下的 Loader + + + +""" + +__all__ = [ + 'Loader', + + 'YelpLoader', + 'YelpFullLoader', + 'YelpPolarityLoader', + 'IMDBLoader', + 'SSTLoader', + 'SST2Loader', + "ChnSentiCorpLoader", + "THUCNewsLoader", + "WeiboSenti100kLoader", + + 'ConllLoader', + 'Conll2003Loader', + 'Conll2003NERLoader', + 'OntoNotesNERLoader', + 'CTBLoader', + "MsraNERLoader", + "PeopleDailyNERLoader", + "WeiboNERLoader", + + 'CSVLoader', + 'JsonLoader', + + 'CWSLoader', + + 'MNLILoader', + "QuoraLoader", + "SNLILoader", + "QNLILoader", + "RTELoader", + "CNXNLILoader", + "BQCorpusLoader", + "LCQMCLoader", + + "CoReferenceLoader", + + "CMRC2018Loader" +] +from .classification import YelpLoader, YelpFullLoader, YelpPolarityLoader, IMDBLoader, SSTLoader, SST2Loader, \ + ChnSentiCorpLoader, THUCNewsLoader, WeiboSenti100kLoader +from .conll import ConllLoader, Conll2003Loader, Conll2003NERLoader, OntoNotesNERLoader, CTBLoader +from .conll import MsraNERLoader, PeopleDailyNERLoader, WeiboNERLoader +from .coreference import CoReferenceLoader +from .csv import CSVLoader +from .cws import CWSLoader +from .json import JsonLoader +from .loader import Loader +from .matching import MNLILoader, QuoraLoader, SNLILoader, QNLILoader, RTELoader, CNXNLILoader, BQCorpusLoader, \ + LCQMCLoader +from .qa import CMRC2018Loader + diff --git a/fastNLP/io/loader/classification.py b/fastNLP/io/loader/classification.py new file mode 100644 index 00000000..aee661c6 --- /dev/null +++ b/fastNLP/io/loader/classification.py @@ -0,0 +1,513 @@ +"""undocumented""" + +__all__ = [ + "YelpLoader", + "YelpFullLoader", + "YelpPolarityLoader", + "IMDBLoader", + "SSTLoader", + "SST2Loader", + "ChnSentiCorpLoader", + "THUCNewsLoader", + "WeiboSenti100kLoader" +] + +import glob +import os +import random +import shutil +import time +import warnings + +from .loader import Loader +from ...core.dataset import DataSet +from ...core.instance 
import Instance + + +class YelpLoader(Loader): + """ + 原始数据中内容应该为, 每一行为一个sample,第一个逗号之前为target,第一个逗号之后为文本内容。 + + Example:: + + "1","I got 'new' tires from the..." + "1","Don't waste your time..." + + 读取的DataSet将具备以下的数据结构 + + .. csv-table:: + :header: "raw_words", "target" + + "I got 'new' tires from them and... ", "1" + "Don't waste your time. We had two...", "1" + "...", "..." + + """ + + def __init__(self): + super(YelpLoader, self).__init__() + + def _load(self, path: str = None): + ds = DataSet() + with open(path, 'r', encoding='utf-8') as f: + for line in f: + line = line.strip() + sep_index = line.index(',') + target = line[:sep_index] + raw_words = line[sep_index + 1:] + if target.startswith("\""): + target = target[1:] + if target.endswith("\""): + target = target[:-1] + if raw_words.endswith("\""): + raw_words = raw_words[:-1] + if raw_words.startswith('"'): + raw_words = raw_words[1:] + raw_words = raw_words.replace('""', '"') # 替换双引号 + if raw_words: + ds.append(Instance(raw_words=raw_words, target=target)) + return ds + + +class YelpFullLoader(YelpLoader): + def download(self, dev_ratio: float = 0.1, re_download: bool = False): + """ + 自动下载数据集,如果你使用了这个数据集,请引用以下的文章 + + Xiang Zhang, Junbo Zhao, Yann LeCun. Character-level Convolutional Networks for Text Classification. Advances + in Neural Information Processing Systems 28 (NIPS 2015) + + 根据dev_ratio的值随机将train中的数据取出一部分作为dev数据。下载完成后在output_dir中有train.csv, test.csv, + dev.csv三个文件。 + + :param float dev_ratio: 如果路径中没有dev集,从train划分多少作为dev的数据. 如果为0,则不划分dev。 + :param bool re_download: 是否重新下载数据,以重新切分数据。 + :return: str, 数据集的目录地址 + """ + + dataset_name = 'yelp-review-full' + data_dir = self._get_dataset_path(dataset_name=dataset_name) + modify_time = 0 + for filepath in glob.glob(os.path.join(data_dir, '*')): + modify_time = os.stat(filepath).st_mtime + break + if time.time() - modify_time > 1 and re_download: # 通过这种比较丑陋的方式判断一下文件是否是才下载的 + shutil.rmtree(data_dir) + data_dir = self._get_dataset_path(dataset_name=dataset_name) + + if not os.path.exists(os.path.join(data_dir, 'dev.csv')): + if dev_ratio > 0: + assert 0 < dev_ratio < 1, "dev_ratio should be in range (0,1)." + try: + with open(os.path.join(data_dir, 'train.csv'), 'r', encoding='utf-8') as f, \ + open(os.path.join(data_dir, 'middle_file.csv'), 'w', encoding='utf-8') as f1, \ + open(os.path.join(data_dir, 'dev.csv'), 'w', encoding='utf-8') as f2: + for line in f: + if random.random() < dev_ratio: + f2.write(line) + else: + f1.write(line) + os.remove(os.path.join(data_dir, 'train.csv')) + os.renames(os.path.join(data_dir, 'middle_file.csv'), os.path.join(data_dir, 'train.csv')) + finally: + if os.path.exists(os.path.join(data_dir, 'middle_file.csv')): + os.remove(os.path.join(data_dir, 'middle_file.csv')) + + return data_dir + + +class YelpPolarityLoader(YelpLoader): + def download(self, dev_ratio: float = 0.1, re_download=False): + """ + 自动下载数据集,如果你使用了这个数据集,请引用以下的文章 + + Xiang Zhang, Junbo Zhao, Yann LeCun. Character-level Convolutional Networks for Text Classification. 
Advances + in Neural Information Processing Systems 28 (NIPS 2015) + + 根据dev_ratio的值随机将train中的数据取出一部分作为dev数据。下载完成后从train中切分dev_ratio这么多作为dev + + :param float dev_ratio: 如果路径中不存在dev.csv, 从train划分多少作为dev的数据。 如果为0,则不划分dev。 + :param bool re_download: 是否重新下载数据,以重新切分数据。 + :return: str, 数据集的目录地址 + """ + dataset_name = 'yelp-review-polarity' + data_dir = self._get_dataset_path(dataset_name=dataset_name) + modify_time = 0 + for filepath in glob.glob(os.path.join(data_dir, '*')): + modify_time = os.stat(filepath).st_mtime + break + if time.time() - modify_time > 1 and re_download: # 通过这种比较丑陋的方式判断一下文件是否是才下载的 + shutil.rmtree(data_dir) + data_dir = self._get_dataset_path(dataset_name=dataset_name) + + if not os.path.exists(os.path.join(data_dir, 'dev.csv')): + if dev_ratio > 0: + assert 0 < dev_ratio < 1, "dev_ratio should be in range (0,1)." + try: + with open(os.path.join(data_dir, 'train.csv'), 'r', encoding='utf-8') as f, \ + open(os.path.join(data_dir, 'middle_file.csv'), 'w', encoding='utf-8') as f1, \ + open(os.path.join(data_dir, 'dev.csv'), 'w', encoding='utf-8') as f2: + for line in f: + if random.random() < dev_ratio: + f2.write(line) + else: + f1.write(line) + os.remove(os.path.join(data_dir, 'train.csv')) + os.renames(os.path.join(data_dir, 'middle_file.csv'), os.path.join(data_dir, 'train.csv')) + finally: + if os.path.exists(os.path.join(data_dir, 'middle_file.csv')): + os.remove(os.path.join(data_dir, 'middle_file.csv')) + + return data_dir + + +class IMDBLoader(Loader): + """ + 原始数据中内容应该为, 每一行为一个sample,制表符之前为target,制表符之后为文本内容。 + + Example:: + + neg Alan Rickman & Emma... + neg I have seen this... + + IMDBLoader读取后的数据将具有以下两列内容: raw_words: str, 需要分类的文本; target: str, 文本的标签 + 读取的DataSet具备以下的结构: + + .. csv-table:: + :header: "raw_words", "target" + + "Alan Rickman & Emma... ", "neg" + "I have seen this... ", "neg" + "...", "..." + + """ + + def __init__(self): + super(IMDBLoader, self).__init__() + + def _load(self, path: str): + dataset = DataSet() + with open(path, 'r', encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line: + continue + parts = line.split('\t') + target = parts[0] + words = parts[1] + if words: + dataset.append(Instance(raw_words=words, target=target)) + + if len(dataset) == 0: + raise RuntimeError(f"{path} has no valid data.") + + return dataset + + def download(self, dev_ratio: float = 0.1, re_download=False): + """ + 自动下载数据集,如果你使用了这个数据集,请引用以下的文章 + + http://www.aclweb.org/anthology/P11-1015 + + 根据dev_ratio的值随机将train中的数据取出一部分作为dev数据。下载完成后从train中切分0.1作为dev + + :param float dev_ratio: 如果路径中没有dev.txt。从train划分多少作为dev的数据. 如果为0,则不划分dev + :param bool re_download: 是否重新下载数据,以重新切分数据。 + :return: str, 数据集的目录地址 + """ + dataset_name = 'aclImdb' + data_dir = self._get_dataset_path(dataset_name=dataset_name) + modify_time = 0 + for filepath in glob.glob(os.path.join(data_dir, '*')): + modify_time = os.stat(filepath).st_mtime + break + if time.time() - modify_time > 1 and re_download: # 通过这种比较丑陋的方式判断一下文件是否是才下载的 + shutil.rmtree(data_dir) + data_dir = self._get_dataset_path(dataset_name=dataset_name) + + if not os.path.exists(os.path.join(data_dir, 'dev.csv')): + if dev_ratio > 0: + assert 0 < dev_ratio < 1, "dev_ratio should be in range (0,1)." 
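As the hunk continues below, every `download()` in this file carves a dev set out of train by streaming it line by line and keeping each line with probability `dev_ratio`. A condensed standalone version of that split pattern (paths and file names are placeholders):

```python
# Condensed version of the train/dev split used by the download() methods
# in this file (file names are placeholders).
import os
import random

def split_dev(data_dir, dev_ratio=0.1, train_name='train.csv', dev_name='dev.csv'):
    """Move roughly dev_ratio of the lines of train_name into dev_name."""
    train_path = os.path.join(data_dir, train_name)
    tmp_path = os.path.join(data_dir, 'middle_file.csv')
    with open(train_path, 'r', encoding='utf-8') as src, \
            open(tmp_path, 'w', encoding='utf-8') as keep, \
            open(os.path.join(data_dir, dev_name), 'w', encoding='utf-8') as dev:
        for line in src:
            (dev if random.random() < dev_ratio else keep).write(line)
    os.remove(train_path)
    os.renames(tmp_path, train_path)
```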
+ try: + with open(os.path.join(data_dir, 'train.txt'), 'r', encoding='utf-8') as f, \ + open(os.path.join(data_dir, 'middle_file.txt'), 'w', encoding='utf-8') as f1, \ + open(os.path.join(data_dir, 'dev.txt'), 'w', encoding='utf-8') as f2: + for line in f: + if random.random() < dev_ratio: + f2.write(line) + else: + f1.write(line) + os.remove(os.path.join(data_dir, 'train.txt')) + os.renames(os.path.join(data_dir, 'middle_file.txt'), os.path.join(data_dir, 'train.txt')) + finally: + if os.path.exists(os.path.join(data_dir, 'middle_file.txt')): + os.remove(os.path.join(data_dir, 'middle_file.txt')) + + return data_dir + + +class SSTLoader(Loader): + """ + 原始数据中内容应该为: + + Example:: + + (2 (3 (3 Effective) (2 but)) (1 (1 too-tepid)... + (3 (3 (2 If) (3 (2 you) (3 (2 sometimes)... + + 读取之后的DataSet具有以下的结构 + + .. csv-table:: 下面是使用SSTLoader读取的DataSet所具备的field + :header: "raw_words" + + "(2 (3 (3 Effective) (2 but)) (1 (1 too-tepid)..." + "(3 (3 (2 If) (3 (2 you) (3 (2 sometimes) ..." + "..." + + raw_words列是str。 + + """ + + def __init__(self): + super().__init__() + + def _load(self, path: str): + """ + 从path读取SST文件 + + :param str path: 文件路径 + :return: DataSet + """ + ds = DataSet() + with open(path, 'r', encoding='utf-8') as f: + for line in f: + line = line.strip() + if line: + ds.append(Instance(raw_words=line)) + return ds + + def download(self): + """ + 自动下载数据集,如果你使用了这个数据集,请引用以下的文章 + + https://nlp.stanford.edu/~socherr/EMNLP2013_RNTN.pdf + + :return: str, 数据集的目录地址 + """ + output_dir = self._get_dataset_path(dataset_name='sst') + return output_dir + + +class SST2Loader(Loader): + """ + 原始数据中内容为:第一行为标题(具体内容会被忽略),之后一行为一个sample,第一个制表符之前被认为是句子,第一个制表符之后认为是label + + Example:: + + sentence label + it 's a charming and often affecting journey . 1 + unflinchingly bleak and desperate 0 + + 读取之后DataSet将如下所示 + + .. csv-table:: + :header: "raw_words", "target" + + "it 's a charming and often affecting journey .", "1" + "unflinchingly bleak and desperate", "0" + "..." + + test的DataSet没有target列。 + """ + + def __init__(self): + super().__init__() + + def _load(self, path: str): + """ + 从path读取SST2文件 + + :param str path: 数据路径 + :return: DataSet + """ + ds = DataSet() + + with open(path, 'r', encoding='utf-8') as f: + f.readline() # 跳过header + if 'test' in os.path.split(path)[1]: + warnings.warn("SST2's test file has no target.") + for line in f: + line = line.strip() + if line: + sep_index = line.index('\t') + raw_words = line[sep_index + 1:] + if raw_words: + ds.append(Instance(raw_words=raw_words)) + else: + for line in f: + line = line.strip() + if line: + raw_words = line[:-2] + target = line[-1] + if raw_words: + ds.append(Instance(raw_words=raw_words, target=target)) + return ds + + def download(self): + """ + 自动下载数据集,如果你使用了该数据集,请引用以下的文章 + + https://nlp.stanford.edu/pubs/SocherBauerManningNg_ACL2013.pdf + + :return: + """ + output_dir = self._get_dataset_path(dataset_name='sst-2') + return output_dir + + +class ChnSentiCorpLoader(Loader): + """ + 支持读取的数据的格式为,第一行为标题(具体内容会被忽略),之后一行为一个sample,第一个制表符之前被认为是label,第 + 一个制表符之后认为是句子 + + Example:: + + label text_a + 1 基金痛所有投资项目一样,必须先要有所了解... + 1 系统很好装,LED屏是不错,就是16比9的比例... + + 读取后的DataSet具有以下的field + + .. csv-table:: + :header: "raw_chars", "target" + + "基金痛所有投资项目一样,必须先要有所了解...", "1" + "系统很好装,LED屏是不错,就是16比9的比例...", "1" + "..." 
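+
+    Example (an illustrative usage sketch; it relies only on the ``load()`` / ``get_dataset()``
+    interface of the ``Loader`` base class described in this module)::
+
+        data_bundle = ChnSentiCorpLoader().load()       # with no path given, the data is downloaded and cached first
+        train_data = data_bundle.get_dataset('train')   # a DataSet with the 'raw_chars' and 'target' fields above
+        print(len(train_data))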
+ + """ + def __init__(self): + super().__init__() + + def _load(self, path:str): + """ + 从path中读取数据 + + :param path: + :return: + """ + ds = DataSet() + with open(path, 'r', encoding='utf-8') as f: + f.readline() + for line in f: + line = line.strip() + tab_index = line.index('\t') + if tab_index != -1: + target = line[:tab_index] + raw_chars = line[tab_index+1:] + if raw_chars: + ds.append(Instance(raw_chars=raw_chars, target=target)) + return ds + + def download(self) -> str: + """ + 自动下载数据,该数据取自https://github.com/pengming617/bert_classification/tree/master/data,在 + https://arxiv.org/pdf/1904.09223.pdf与https://arxiv.org/pdf/1906.08101.pdf有使用 + + :return: + """ + output_dir = self._get_dataset_path('chn-senti-corp') + return output_dir + + +class THUCNewsLoader(Loader): + """ + 数据集简介:document-level分类任务,新闻10分类 + 原始数据内容为:每行一个sample,第一个'\t'之前为target,第一个'\t'之后为raw_words + + Example:: + + 体育 调查-您如何评价热火客场胜绿军总分3-1夺赛点?... + + 读取后的Dataset将具有以下数据结构: + + .. csv-table:: + :header: "raw_words", "target" + + "调查-您如何评价热火客场胜绿军总分3-1夺赛点?...", "体育" + "...", "..." + + """ + + def __init__(self): + super(THUCNewsLoader, self).__init__() + + def _load(self, path: str = None): + ds = DataSet() + with open(path, 'r', encoding='utf-8') as f: + for line in f: + line = line.strip() + sep_index = line.index('\t') + raw_chars = line[sep_index + 1:] + target = line[:sep_index] + if raw_chars: + ds.append(Instance(raw_chars=raw_chars, target=target)) + return ds + + def download(self) -> str: + """ + 自动下载数据,该数据取自 + + http://thuctc.thunlp.org/#%E4%B8%AD%E6%96%87%E6%96%87%E6%9C%AC%E5%88%86%E7%B1%BB%E6%95%B0%E6%8D%AE%E9%9B%86THUCNews + + :return: + """ + output_dir = self._get_dataset_path('thuc-news') + return output_dir + + +class WeiboSenti100kLoader(Loader): + """ + 别名: + 数据集简介:微博sentiment classification,二分类 + + Example:: + + label text + 1 多谢小莲,好运满满[爱你] + 1 能在他乡遇老友真不赖,哈哈,珠儿,我也要用... + + 读取后的Dataset将具有以下数据结构: + + .. csv-table:: + :header: "raw_chars", "target" + + "多谢小莲,好运满满[爱你]", "1" + "能在他乡遇老友真不赖,哈哈,珠儿,我也要用...", "1" + "...", "..." 
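+
+    Example (illustrative only; ``download()`` and ``load()`` are the base ``Loader`` methods,
+    and the available split names depend on the files found in the cached directory)::
+
+        loader = WeiboSenti100kLoader()
+        data_dir = loader.download()            # fetches and caches 'weibo-senti-100k', returns its directory
+        data_bundle = loader.load(data_dir)     # builds a DataBundle from the train/dev/test files it detects
+        train_data = data_bundle.get_dataset('train')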
+ + """ + + def __init__(self): + super(WeiboSenti100kLoader, self).__init__() + + def _load(self, path: str = None): + ds = DataSet() + with open(path, 'r', encoding='utf-8') as f: + next(f) + for line in f: + line = line.strip() + target = line[0] + raw_chars = line[1:] + if raw_chars: + ds.append(Instance(raw_chars=raw_chars, target=target)) + return ds + + def download(self) -> str: + """ + 自动下载数据,该数据取自 https://github.com/SophonPlus/ChineseNlpCorpus/ + 在 https://arxiv.org/abs/1906.08101 有使用 + :return: + """ + output_dir = self._get_dataset_path('weibo-senti-100k') + return output_dir diff --git a/fastNLP/io/loader/conll.py b/fastNLP/io/loader/conll.py new file mode 100644 index 00000000..2e0bb038 --- /dev/null +++ b/fastNLP/io/loader/conll.py @@ -0,0 +1,541 @@ +"""undocumented""" + +__all__ = [ + "ConllLoader", + "Conll2003Loader", + "Conll2003NERLoader", + "OntoNotesNERLoader", + "CTBLoader", + "CNNERLoader", + "MsraNERLoader", + "WeiboNERLoader", + "PeopleDailyNERLoader" +] + +import glob +import os +import random +import shutil +import time + +from .loader import Loader +from ..file_reader import _read_conll +from ...core.const import Const +from ...core.dataset import DataSet +from ...core.instance import Instance + + +class ConllLoader(Loader): + """ + ConllLoader支持读取的数据格式: 以空行隔开两个sample,除了分割行,每一行用空格或者制表符隔开不同的元素。如下例所示: + + Example:: + + # 文件中的内容 + Nadim NNP B-NP B-PER + Ladki NNP I-NP I-PER + + AL-AIN NNP B-NP B-LOC + United NNP B-NP B-LOC + Arab NNP I-NP I-LOC + Emirates NNPS I-NP I-LOC + 1996-12-06 CD I-NP O + ... + + # 如果用以下的参数读取,返回的DataSet将包含raw_words和pos两个field, 这两个field的值分别取自于第0列与第1列 + dataset = ConllLoader(headers=['raw_words', 'pos'], indexes=[0, 1])._load('/path/to/train.conll') + # 如果用以下的参数读取,返回的DataSet将包含raw_words和ner两个field, 这两个field的值分别取自于第0列与第2列 + dataset = ConllLoader(headers=['raw_words', 'ner'], indexes=[0, 3])._load('/path/to/train.conll') + # 如果用以下的参数读取,返回的DataSet将包含raw_words, pos和ner三个field + dataset = ConllLoader(headers=['raw_words', 'pos', 'ner'], indexes=[0, 1, 3])._load('/path/to/train.conll') + + ConllLoader返回的DataSet的field由传入的headers确定。 + + 数据中以"-DOCSTART-"开头的行将被忽略,因为该符号在conll 2003中被用为文档分割符。 + + """ + + def __init__(self, headers, indexes=None, dropna=True): + """ + + :param list headers: 每一列数据的名称,需为List or Tuple of str。``header`` 与 ``indexes`` 一一对应 + :param list indexes: 需要保留的数据列下标,从0开始。若为 ``None`` ,则所有列都保留。Default: ``None`` + :param bool dropna: 是否忽略非法数据,若 ``False`` ,遇到非法数据时抛出 ``ValueError`` 。Default: ``True`` + """ + super(ConllLoader, self).__init__() + if not isinstance(headers, (list, tuple)): + raise TypeError( + 'invalid headers: {}, should be list of strings'.format(headers)) + self.headers = headers + self.dropna = dropna + if indexes is None: + self.indexes = list(range(len(self.headers))) + else: + if len(indexes) != len(headers): + raise ValueError + self.indexes = indexes + + def _load(self, path): + """ + 传入的一个文件路径,将该文件读入DataSet中,field由ConllLoader初始化时指定的headers决定。 + + :param str path: 文件的路径 + :return: DataSet + """ + ds = DataSet() + for idx, data in _read_conll(path, indexes=self.indexes, dropna=self.dropna): + ins = {h: data[i] for i, h in enumerate(self.headers)} + ds.append(Instance(**ins)) + return ds + + +class Conll2003Loader(ConllLoader): + """ + 用于读取conll2003任务的数据。数据的内容应该类似与以下的内容, 第一列为raw_words, 第二列为pos, 第三列为chunking,第四列为ner。 + + Example:: + + Nadim NNP B-NP B-PER + Ladki NNP I-NP I-PER + + AL-AIN NNP B-NP B-LOC + United NNP B-NP B-LOC + Arab NNP I-NP I-LOC + Emirates NNPS I-NP I-LOC + 1996-12-06 CD I-NP O + ... 
+ + 返回的DataSet的内容为 + + .. csv-table:: 下面是Conll2003Loader加载后数据具备的结构。 + :header: "raw_words", "pos", "chunk", "ner" + + "[Nadim, Ladki]", "[NNP, NNP]", "[B-NP, I-NP]", "[B-PER, I-PER]" + "[AL-AIN, United, Arab, ...]", "[NNP, NNP, NNP, ...]", "[B-NP, B-NP, I-NP, ...]", "[B-LOC, B-LOC, I-LOC, ...]" + "[...]", "[...]", "[...]", "[...]" + + """ + + def __init__(self): + headers = [ + 'raw_words', 'pos', 'chunk', 'ner', + ] + super(Conll2003Loader, self).__init__(headers=headers) + + def _load(self, path): + """ + 传入的一个文件路径,将该文件读入DataSet中,field由ConllLoader初始化时指定的headers决定。 + + :param str path: 文件的路径 + :return: DataSet + """ + ds = DataSet() + for idx, data in _read_conll(path, indexes=self.indexes, dropna=self.dropna): + doc_start = False + for i, h in enumerate(self.headers): + field = data[i] + if str(field[0]).startswith('-DOCSTART-'): + doc_start = True + break + if doc_start: + continue + ins = {h: data[i] for i, h in enumerate(self.headers)} + ds.append(Instance(**ins)) + return ds + + def download(self, output_dir=None): + raise RuntimeError("conll2003 cannot be downloaded automatically.") + + +class Conll2003NERLoader(ConllLoader): + """ + 用于读取conll2003任务的NER数据。每一行有4列内容,空行意味着隔开两个句子 + + 支持读取的内容如下 + Example:: + + Nadim NNP B-NP B-PER + Ladki NNP I-NP I-PER + + AL-AIN NNP B-NP B-LOC + United NNP B-NP B-LOC + Arab NNP I-NP I-LOC + Emirates NNPS I-NP I-LOC + 1996-12-06 CD I-NP O + ... + + 返回的DataSet的内容为 + + .. csv-table:: 下面是Conll2003Loader加载后数据具备的结构, target是BIO2编码 + :header: "raw_words", "target" + + "[Nadim, Ladki]", "[B-PER, I-PER]" + "[AL-AIN, United, Arab, ...]", "[B-LOC, B-LOC, I-LOC, ...]" + "[...]", "[...]" + + """ + + def __init__(self): + headers = [ + 'raw_words', 'target', + ] + super().__init__(headers=headers, indexes=[0, 3]) + + def _load(self, path): + """ + 传入的一个文件路径,将该文件读入DataSet中,field由ConllLoader初始化时指定的headers决定。 + + :param str path: 文件的路径 + :return: DataSet + """ + ds = DataSet() + for idx, data in _read_conll(path, indexes=self.indexes, dropna=self.dropna): + doc_start = False + for i, h in enumerate(self.headers): + field = data[i] + if str(field[0]).startswith('-DOCSTART-'): + doc_start = True + break + if doc_start: + continue + ins = {h: data[i] for i, h in enumerate(self.headers)} + ds.append(Instance(**ins)) + if len(ds) == 0: + raise RuntimeError("No data found {}.".format(path)) + return ds + + def download(self): + raise RuntimeError("conll2003 cannot be downloaded automatically.") + + +class OntoNotesNERLoader(ConllLoader): + """ + 用以读取OntoNotes的NER数据,同时也是Conll2012的NER任务数据。将OntoNote数据处理为conll格式的过程可以参考 + https://github.com/yhcc/OntoNotes-5.0-NER。OntoNoteNERLoader将取第4列和第11列的内容。 + + 读取的数据格式为: + + Example:: + + bc/msnbc/00/msnbc_0000 0 0 Hi UH (TOP(FRAG(INTJ*) - - - Dan_Abrams * - + bc/msnbc/00/msnbc_0000 0 1 everyone NN (NP*) - - - Dan_Abrams * - + ... + + 返回的DataSet的内容为 + + .. 
csv-table:: + :header: "raw_words", "target" + + "['Hi', 'everyone', '.']", "['O', 'O', 'O']" + "['first', 'up', 'on', 'the', 'docket'], "['O', 'O', 'O', 'O', 'O']" + "[...]", "[...]" + + """ + + def __init__(self): + super().__init__(headers=[Const.RAW_WORD, Const.TARGET], indexes=[3, 10]) + + def _load(self, path: str): + dataset = super()._load(path) + + def convert_to_bio(tags): + bio_tags = [] + flag = None + for tag in tags: + label = tag.strip("()*") + if '(' in tag: + bio_label = 'B-' + label + flag = label + elif flag: + bio_label = 'I-' + flag + else: + bio_label = 'O' + if ')' in tag: + flag = None + bio_tags.append(bio_label) + return bio_tags + + def convert_word(words): + converted_words = [] + for word in words: + word = word.replace('/.', '.') # 有些结尾的.是/.形式的 + if not word.startswith('-'): + converted_words.append(word) + continue + # 以下是由于这些符号被转义了,再转回来 + tfrs = {'-LRB-': '(', + '-RRB-': ')', + '-LSB-': '[', + '-RSB-': ']', + '-LCB-': '{', + '-RCB-': '}' + } + if word in tfrs: + converted_words.append(tfrs[word]) + else: + converted_words.append(word) + return converted_words + + dataset.apply_field(convert_word, field_name=Const.RAW_WORD, new_field_name=Const.RAW_WORD) + dataset.apply_field(convert_to_bio, field_name=Const.TARGET, new_field_name=Const.TARGET) + + return dataset + + def download(self): + raise RuntimeError("Ontonotes cannot be downloaded automatically, you can refer " + "https://github.com/yhcc/OntoNotes-5.0-NER to download and preprocess.") + + +class CTBLoader(Loader): + """ + 支持加载的数据应该具备以下格式, 其中第二列为词语,第四列为pos tag,第七列为依赖树的head,第八列为依赖树的label + + Example:: + + 1 印度 _ NR NR _ 3 nn _ _ + 2 海军 _ NN NN _ 3 nn _ _ + 3 参谋长 _ NN NN _ 5 nsubjpass _ _ + 4 被 _ SB SB _ 5 pass _ _ + 5 解职 _ VV VV _ 0 root _ _ + + 1 新华社 _ NR NR _ 7 dep _ _ + 2 新德里 _ NR NR _ 7 dep _ _ + 3 12月 _ NT NT _ 7 dep _ _ + ... + + 读取之后DataSet具备的格式为 + + .. csv-table:: + :header: "raw_words", "pos", "dep_head", "dep_label" + + "[印度, 海军, ...]", "[NR, NN, SB, ...]", "[3, 3, ...]", "[nn, nn, ...]" + "[新华社, 新德里, ...]", "[NR, NR, NT, ...]", "[7, 7, 7, ...]", "[dep, dep, dep, ...]" + "[...]", "[...]", "[...]", "[...]" + + """ + def __init__(self): + super().__init__() + headers = [ + 'raw_words', 'pos', 'dep_head', 'dep_label', + ] + indexes = [ + 1, 3, 6, 7, + ] + self.loader = ConllLoader(headers=headers, indexes=indexes) + + def _load(self, path: str): + dataset = self.loader._load(path) + return dataset + + def download(self): + """ + 由于版权限制,不能提供自动下载功能。可参考 + + https://catalog.ldc.upenn.edu/LDC2013T21 + + :return: + """ + raise RuntimeError("CTB cannot be downloaded automatically.") + + +class CNNERLoader(Loader): + def _load(self, path: str): + """ + 支持加载形如以下格式的内容,一行两列,以空格隔开两个sample + + Example:: + + 我 O + 们 O + 变 O + 而 O + 以 O + 书 O + 会 O + ... + + :param str path: 文件路径 + :return: DataSet,包含raw_words列和target列 + """ + ds = DataSet() + with open(path, 'r', encoding='utf-8') as f: + raw_chars = [] + target = [] + for line in f: + line = line.strip() + if line: + parts = line.split() + if len(parts) == 1: # 网上下载的数据有一些列少tag,默认补充O + parts.append('O') + raw_chars.append(parts[0]) + target.append(parts[1]) + else: + if raw_chars: + ds.append(Instance(raw_chars=raw_chars, target=target)) + raw_chars = [] + target = [] + return ds + + +class MsraNERLoader(CNNERLoader): + """ + 读取MSRA-NER数据,数据中的格式应该类似与下列的内容 + + Example:: + + 把 O + 欧 B-LOC + + 美 B-LOC + 、 O + + 港 B-LOC + 台 B-LOC + + 流 O + 行 O + + 的 O + + 食 O + + ... + + 读取后的DataSet包含以下的field + + .. 
csv-table:: + :header: "raw_chars", "target" + + "['把', '欧'] ", "['O', 'B-LOC']" + "['美', '、']", "['B-LOC', 'O']" + "[...]", "[...]" + + """ + + def __init__(self): + super().__init__() + + def download(self, dev_ratio: float = 0.1, re_download: bool = False) -> str: + """ + 自动下载MSAR-NER的数据,如果你使用该数据,请引用 Gina-Anne Levow, 2006, The Third International Chinese Language + Processing Bakeoff: Word Segmentation and Named Entity Recognition. + + 根据dev_ratio的值随机将train中的数据取出一部分作为dev数据。下载完成后在output_dir中有train.conll, test.conll, + dev.conll三个文件。 + + :param float dev_ratio: 如果路径中没有dev集,从train划分多少作为dev的数据. 如果为0,则不划分dev。 + :param bool re_download: 是否重新下载数据,以重新切分数据。 + :return: str, 数据集的目录地址 + :return: + """ + dataset_name = 'msra-ner' + data_dir = self._get_dataset_path(dataset_name=dataset_name) + modify_time = 0 + for filepath in glob.glob(os.path.join(data_dir, '*')): + modify_time = os.stat(filepath).st_mtime + break + if time.time() - modify_time > 1 and re_download: # 通过这种比较丑陋的方式判断一下文件是否是才下载的 + shutil.rmtree(data_dir) + data_dir = self._get_dataset_path(dataset_name=dataset_name) + + if not os.path.exists(os.path.join(data_dir, 'dev.conll')): + if dev_ratio > 0: + assert 0 < dev_ratio < 1, "dev_ratio should be in range (0,1)." + try: + with open(os.path.join(data_dir, 'train.conll'), 'r', encoding='utf-8') as f, \ + open(os.path.join(data_dir, 'middle_file.conll'), 'w', encoding='utf-8') as f1, \ + open(os.path.join(data_dir, 'dev.conll'), 'w', encoding='utf-8') as f2: + lines = [] # 一个sample包含很多行 + for line in f: + line = line.strip() + if line: + lines.append(line) + else: + if random.random() < dev_ratio: + f2.write('\n'.join(lines) + '\n\n') + else: + f1.write('\n'.join(lines) + '\n\n') + lines.clear() + os.remove(os.path.join(data_dir, 'train.conll')) + os.renames(os.path.join(data_dir, 'middle_file.conll'), os.path.join(data_dir, 'train.conll')) + finally: + if os.path.exists(os.path.join(data_dir, 'middle_file.conll')): + os.remove(os.path.join(data_dir, 'middle_file.conll')) + + return data_dir + + +class WeiboNERLoader(CNNERLoader): + """ + 读取WeiboNER数据,数据中的格式应该类似与下列的内容 + + Example:: + + 老 B-PER.NOM + 百 I-PER.NOM + 姓 I-PER.NOM + + 心 O + + ... + + 读取后的DataSet包含以下的field + + .. csv-table:: + + :header: "raw_chars", "target" + + "['老', '百', '姓']", "['B-PER.NOM', 'I-PER.NOM', 'I-PER.NOM']" + "['心']", "['O']" + "[...]", "[...]" + + """ + def __init__(self): + super().__init__() + + def download(self) -> str: + """ + 自动下载Weibo-NER的数据,如果你使用了该数据,请引用 Nanyun Peng and Mark Dredze, 2015, Named Entity Recognition for + Chinese Social Media with Jointly Trained Embeddings. + + :return: str + """ + dataset_name = 'weibo-ner' + data_dir = self._get_dataset_path(dataset_name=dataset_name) + + return data_dir + + +class PeopleDailyNERLoader(CNNERLoader): + """ + 支持加载的数据格式如下 + + Example:: + + 中 B-ORG + 共 I-ORG + 中 I-ORG + 央 I-ORG + + 致 O + 中 B-ORG + ... + + 读取后的DataSet包含以下的field + + .. 
csv-table:: target列是基于BIO的编码方式 + :header: "raw_chars", "target" + + "['中', '共', '中', '央']", "['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG']" + "[...]", "[...]" + + """ + + def __init__(self): + super().__init__() + + def download(self) -> str: + dataset_name = 'peopledaily' + data_dir = self._get_dataset_path(dataset_name=dataset_name) + + return data_dir diff --git a/fastNLP/io/loader/coreference.py b/fastNLP/io/loader/coreference.py new file mode 100644 index 00000000..d54610e4 --- /dev/null +++ b/fastNLP/io/loader/coreference.py @@ -0,0 +1,64 @@ +"""undocumented""" + +__all__ = [ + "CoReferenceLoader", +] + +from ...core.dataset import DataSet +from ..file_reader import _read_json +from ...core.instance import Instance +from ...core.const import Const +from .json import JsonLoader + + +class CoReferenceLoader(JsonLoader): + """ + 原始数据中内容应该为, 每一行为一个json对象,其中doc_key包含文章的种类信息,speakers包含每句话的说话者信息,cluster是指向现实中同一个事物的聚集,sentences是文本信息内容。 + + Example:: + + {"doc_key": "bc/cctv/00/cctv_0000_0", + "speakers": [["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"]], + "clusters": [[[70, 70], [485, 486], [500, 500], [73, 73], [55, 55], [153, 154], [366, 366]]], + "sentences": [["In", "the", "summer", "of", "2005", ",", "a", "picture", "that", "people", "have", "long", "been", "looking", "forward", "to", "started", "emerging", "with", "frequency", "in", "various", "major", "Hong", "Kong", "media", "."], ["With", "their", "unique", "charm", ",", "these", "well", "-", "known", "cartoon", "images", "once", "again", "caused", "Hong", "Kong", "to", "be", "a", "focus", "of", "worldwide", "attention", "."]] + } + + 读取预处理好的Conll2012数据,数据结构如下: + + .. csv-table:: + :header: "raw_words1", "raw_words2", "raw_words3", "raw_words4" + + "bc/cctv/00/cctv_0000_0", "[['Speaker#1', 'Speaker#1', 'Speaker#1...", "[[[70, 70], [485, 486], [500, 500], [7...", "[['In', 'the', 'summer', 'of', '2005',..." + "...", "...", "...", "..." 
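+
+    Example (illustrative; the data cannot be downloaded automatically, so the path below is a placeholder
+    for your own preprocessed Conll2012 jsonl file)::
+
+        loader = CoReferenceLoader()
+        data_bundle = loader.load({'train': '/path/to/coref/train.jsonl'})   # placeholder path
+        train_data = data_bundle.get_dataset('train')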
+ + """ + def __init__(self, fields=None, dropna=False): + super().__init__(fields, dropna) + self.fields = {"doc_key": Const.RAW_WORDS(0), "speakers": Const.RAW_WORDS(1), "clusters": Const.RAW_WORDS(2), + "sentences": Const.RAW_WORDS(3)} + + def _load(self, path): + """ + 加载数据 + :param path: 数据文件路径,文件为json + + :return: + """ + dataset = DataSet() + for idx, d in _read_json(path, fields=self.fields_list, dropna=self.dropna): + if self.fields: + ins = {self.fields[k]: v for k, v in d.items()} + else: + ins = d + dataset.append(Instance(**ins)) + return dataset + + def download(self): + """ + 由于版权限制,不能提供自动下载功能。可参考 + + https://www.aclweb.org/anthology/W12-4501 + + :return: + """ + raise RuntimeError("CoReference cannot be downloaded automatically.") diff --git a/fastNLP/io/loader/csv.py b/fastNLP/io/loader/csv.py new file mode 100644 index 00000000..6f35efbe --- /dev/null +++ b/fastNLP/io/loader/csv.py @@ -0,0 +1,39 @@ +"""undocumented""" + +__all__ = [ + "CSVLoader", +] + +from .loader import Loader +from ..file_reader import _read_csv +from ...core.dataset import DataSet +from ...core.instance import Instance + + +class CSVLoader(Loader): + """ + 读取CSV格式的数据集, 返回 ``DataSet`` 。 + + """ + + def __init__(self, headers=None, sep=",", dropna=False): + """ + + :param List[str] headers: CSV文件的文件头.定义每一列的属性名称,即返回的DataSet中`field`的名称 + 若为 ``None`` ,则将读入文件的第一行视作 ``headers`` . Default: ``None`` + :param str sep: CSV文件中列与列之间的分隔符. Default: "," + :param bool dropna: 是否忽略非法数据,若 ``True`` 则忽略,若 ``False`` ,在遇到非法数据时,抛出 ``ValueError`` . + Default: ``False`` + """ + super().__init__() + self.headers = headers + self.sep = sep + self.dropna = dropna + + def _load(self, path): + ds = DataSet() + for idx, data in _read_csv(path, headers=self.headers, + sep=self.sep, dropna=self.dropna): + ds.append(Instance(**data)) + return ds + diff --git a/fastNLP/io/loader/cws.py b/fastNLP/io/loader/cws.py new file mode 100644 index 00000000..887bb545 --- /dev/null +++ b/fastNLP/io/loader/cws.py @@ -0,0 +1,97 @@ +"""undocumented""" + +__all__ = [ + "CWSLoader" +] + +import glob +import os +import random +import shutil +import time + +from .loader import Loader +from ...core.dataset import DataSet +from ...core.instance import Instance + + +class CWSLoader(Loader): + """ + CWSLoader支持的数据格式为,一行一句话,不同词之间用空格隔开, 例如: + + Example:: + + 上海 浦东 开发 与 法制 建设 同步 + 新华社 上海 二月 十日 电 ( 记者 谢金虎 、 张持坚 ) + ... + + 该Loader读取后的DataSet具有如下的结构 + + .. csv-table:: + :header: "raw_words" + + "上海 浦东 开发 与 法制 建设 同步" + "新华社 上海 二月 十日 电 ( 记者 谢金虎 、 张持坚 )" + "..." + + """ + def __init__(self, dataset_name:str=None): + """ + + :param str dataset_name: data的名称,支持pku, msra, cityu(繁体), as(繁体), None + """ + super().__init__() + datanames = {'pku': 'cws-pku', 'msra':'cws-msra', 'as':'cws-as', 'cityu':'cws-cityu'} + if dataset_name in datanames: + self.dataset_name = datanames[dataset_name] + else: + self.dataset_name = None + + def _load(self, path:str): + ds = DataSet() + with open(path, 'r', encoding='utf-8') as f: + for line in f: + line = line.strip() + if line: + ds.append(Instance(raw_words=line)) + return ds + + def download(self, dev_ratio=0.1, re_download=False)->str: + """ + 如果你使用了该数据集,请引用以下的文章:Thomas Emerson, The Second International Chinese Word Segmentation Bakeoff, + 2005. 更多信息可以在http://sighan.cs.uchicago.edu/bakeoff2005/查看 + + :param float dev_ratio: 如果路径中没有dev集,从train划分多少作为dev的数据. 
如果为0,则不划分dev。 + :param bool re_download: 是否重新下载数据,以重新切分数据。 + :return: str + """ + if self.dataset_name is None: + return None + data_dir = self._get_dataset_path(dataset_name=self.dataset_name) + modify_time = 0 + for filepath in glob.glob(os.path.join(data_dir, '*')): + modify_time = os.stat(filepath).st_mtime + break + if time.time() - modify_time > 1 and re_download: # 通过这种比较丑陋的方式判断一下文件是否是才下载的 + shutil.rmtree(data_dir) + data_dir = self._get_dataset_path(dataset_name=self.dataset_name) + + if not os.path.exists(os.path.join(data_dir, 'dev.txt')): + if dev_ratio > 0: + assert 0 < dev_ratio < 1, "dev_ratio should be in range (0,1)." + try: + with open(os.path.join(data_dir, 'train.txt'), 'r', encoding='utf-8') as f, \ + open(os.path.join(data_dir, 'middle_file.txt'), 'w', encoding='utf-8') as f1, \ + open(os.path.join(data_dir, 'dev.txt'), 'w', encoding='utf-8') as f2: + for line in f: + if random.random() < dev_ratio: + f2.write(line) + else: + f1.write(line) + os.remove(os.path.join(data_dir, 'train.txt')) + os.renames(os.path.join(data_dir, 'middle_file.txt'), os.path.join(data_dir, 'train.txt')) + finally: + if os.path.exists(os.path.join(data_dir, 'middle_file.txt')): + os.remove(os.path.join(data_dir, 'middle_file.txt')) + + return data_dir diff --git a/fastNLP/io/loader/json.py b/fastNLP/io/loader/json.py new file mode 100644 index 00000000..012dee5a --- /dev/null +++ b/fastNLP/io/loader/json.py @@ -0,0 +1,46 @@ +"""undocumented""" + +__all__ = [ + "JsonLoader" +] + +from .loader import Loader +from ..file_reader import _read_json +from ...core.dataset import DataSet +from ...core.instance import Instance + + +class JsonLoader(Loader): + """ + 别名::class:`fastNLP.io.JsonLoader` :class:`fastNLP.io.loader.JsonLoader` + + 读取json格式数据.数据必须按行存储,每行是一个包含各类属性的json对象 + + :param dict fields: 需要读入的json属性名称, 和读入后在DataSet中存储的field_name + ``fields`` 的 `key` 必须是json对象的属性名. ``fields`` 的 `value` 为读入后在DataSet存储的 `field_name` , + `value` 也可为 ``None`` , 这时读入后的 `field_name` 与json对象对应属性同名 + ``fields`` 可为 ``None`` , 这时,json对象所有属性都保存在DataSet中. Default: ``None`` + :param bool dropna: 是否忽略非法数据,若 ``True`` 则忽略,若 ``False`` ,在遇到非法数据时,抛出 ``ValueError`` . + Default: ``False`` + """ + + def __init__(self, fields=None, dropna=False): + super(JsonLoader, self).__init__() + self.dropna = dropna + self.fields = None + self.fields_list = None + if fields: + self.fields = {} + for k, v in fields.items(): + self.fields[k] = k if v is None else v + self.fields_list = list(self.fields.keys()) + + def _load(self, path): + ds = DataSet() + for idx, d in _read_json(path, fields=self.fields_list, dropna=self.dropna): + if self.fields: + ins = {self.fields[k]: v for k, v in d.items()} + else: + ins = d + ds.append(Instance(**ins)) + return ds diff --git a/fastNLP/io/loader/loader.py b/fastNLP/io/loader/loader.py new file mode 100644 index 00000000..fa1128ed --- /dev/null +++ b/fastNLP/io/loader/loader.py @@ -0,0 +1,94 @@ +"""undocumented""" + +__all__ = [ + "Loader" +] + +from typing import Union, Dict + +from .. import DataBundle +from ..file_utils import _get_dataset_url, get_cache_path, cached_path +from ..utils import check_loader_paths +from ...core.dataset import DataSet + + +class Loader: + """ + 各种数据 Loader 的基类,提供了 API 的参考. 
+ Loader支持以下的三个函数 + + - download() 函数:自动将该数据集下载到缓存地址,默认缓存地址为~/.fastNLP/datasets/。由于版权等原因,不是所有的Loader都实现了该方法。该方法会返回下载后文件所处的缓存地址。 + - _load() 函数:从一个数据文件中读取数据,返回一个 :class:`~fastNLP.DataSet` 。返回的DataSet的内容可以通过每个Loader的文档判断出。 + - load() 函数:将文件分别读取为DataSet,然后将多个DataSet放入到一个DataBundle中并返回 + + """ + + def __init__(self): + pass + + def _load(self, path: str) -> DataSet: + """ + 给定一个路径,返回读取的DataSet。 + + :param str path: 路径 + :return: DataSet + """ + raise NotImplementedError + + def load(self, paths: Union[str, Dict[str, str]] = None) -> DataBundle: + r""" + 从指定一个或多个路径中的文件中读取数据,返回 :class:`~fastNLP.io.DataBundle` 。 + + :param Union[str, Dict[str, str]] paths: 支持以下的几种输入方式: + + 0.如果为None,则先查看本地是否有缓存,如果没有则自动下载并缓存。 + + 1.传入一个目录, 该目录下名称包含train的被认为是train,包含test的被认为是test,包含dev的被认为是dev,如果检测到多个文件名包含'train'、 'dev'、 'test'则会报错:: + + data_bundle = xxxLoader().load('/path/to/dir') # 返回的DataBundle中datasets根据目录下是否检测到train + # dev、 test等有所变化,可以通过以下的方式取出DataSet + tr_data = data_bundle.get_dataset('train') + te_data = data_bundle.get_dataset('test') # 如果目录下有文件包含test这个字段 + + 2.传入一个dict,比如train,dev,test不在同一个目录下,或者名称中不包含train, dev, test:: + + paths = {'train':"/path/to/tr.conll", 'dev':"/to/validate.conll", "test":"/to/te.conll"} + data_bundle = xxxLoader().load(paths) # 返回的DataBundle中的dataset中包含"train", "dev", "test" + dev_data = data_bundle.get_dataset('dev') + + 3.传入文件路径:: + + data_bundle = xxxLoader().load("/path/to/a/train.conll") # 返回DataBundle对象, datasets中仅包含'train' + tr_data = data_bundle.get_dataset('train') # 取出DataSet + + :return: 返回的 :class:`~fastNLP.io.DataBundle` + """ + if paths is None: + paths = self.download() + paths = check_loader_paths(paths) + datasets = {name: self._load(path) for name, path in paths.items()} + data_bundle = DataBundle(datasets=datasets) + return data_bundle + + def download(self) -> str: + """ + 自动下载该数据集 + + :return: 下载后解压目录 + """ + raise NotImplementedError(f"{self.__class__} cannot download data automatically.") + + @staticmethod + def _get_dataset_path(dataset_name): + """ + 传入dataset的名称,获取读取数据的目录。如果数据不存在,会尝试自动下载并缓存(如果支持的话) + + :param str dataset_name: 数据集的名称 + :return: str, 数据集的目录地址。直接到该目录下读取相应的数据即可。 + """ + + default_cache_path = get_cache_path() + url = _get_dataset_url(dataset_name) + output_dir = cached_path(url_or_filename=url, cache_dir=default_cache_path, name='dataset') + + return output_dir diff --git a/fastNLP/io/loader/matching.py b/fastNLP/io/loader/matching.py new file mode 100644 index 00000000..9c4c90d9 --- /dev/null +++ b/fastNLP/io/loader/matching.py @@ -0,0 +1,576 @@ +"""undocumented""" + +__all__ = [ + "MNLILoader", + "SNLILoader", + "QNLILoader", + "RTELoader", + "QuoraLoader", + "BQCorpusLoader", + "CNXNLILoader", + "LCQMCLoader" +] + +import os +import warnings +from typing import Union, Dict + +from .csv import CSVLoader +from .json import JsonLoader +from .loader import Loader +from .. import DataBundle +from ..utils import check_loader_paths +from ...core.const import Const +from ...core.dataset import DataSet +from ...core.instance import Instance + + +class MNLILoader(Loader): + """ + 读取的数据格式为: + + Example:: + + index promptID pairID genre sentence1_binary_parse sentence2_binary_parse sentence1_parse sentence2_parse sentence1 sentence2 label1 gold_label + 0 31193 31193n government ( ( Conceptually ( cream skimming ) ) ... + 1 101457 101457e telephone ( you ( ( know ( during ( ( ( the season ) and ) ( i guess ) ) )... + ... + + 读取MNLI任务的数据,读取之后的DataSet中包含以下的内容,words0是sentence1, words1是sentence2, target是gold_label, 测试集中没 + 有target列。 + + .. 
csv-table:: + :header: "raw_words1", "raw_words2", "target" + + "Conceptually cream ...", "Product and geography...", "neutral" + "you know during the ...", "You lose the things to the...", "entailment" + "...", "...", "..." + + """ + + def __init__(self): + super().__init__() + + def _load(self, path: str): + ds = DataSet() + with open(path, 'r', encoding='utf-8') as f: + f.readline() # 跳过header + if path.endswith("test_matched.tsv") or path.endswith('test_mismatched.tsv'): + warnings.warn("RTE's test file has no target.") + for line in f: + line = line.strip() + if line: + parts = line.split('\t') + raw_words1 = parts[8] + raw_words2 = parts[9] + if raw_words1 and raw_words2: + ds.append(Instance(raw_words1=raw_words1, raw_words2=raw_words2)) + else: + for line in f: + line = line.strip() + if line: + parts = line.split('\t') + raw_words1 = parts[8] + raw_words2 = parts[9] + target = parts[-1] + if raw_words1 and raw_words2 and target: + ds.append(Instance(raw_words1=raw_words1, raw_words2=raw_words2, target=target)) + return ds + + def load(self, paths: str = None): + """ + + :param str paths: 传入数据所在目录,会在该目录下寻找dev_matched.tsv, dev_mismatched.tsv, test_matched.tsv, + test_mismatched.tsv, train.tsv文件夹 + :return: DataBundle + """ + if paths: + paths = os.path.abspath(os.path.expanduser(paths)) + else: + paths = self.download() + if not os.path.isdir(paths): + raise NotADirectoryError(f"{paths} is not a valid directory.") + + files = {'dev_matched': "dev_matched.tsv", + "dev_mismatched": "dev_mismatched.tsv", + "test_matched": "test_matched.tsv", + "test_mismatched": "test_mismatched.tsv", + "train": 'train.tsv'} + + datasets = {} + for name, filename in files.items(): + filepath = os.path.join(paths, filename) + if not os.path.isfile(filepath): + if 'test' not in name: + raise FileNotFoundError(f"{name} not found in directory {filepath}.") + datasets[name] = self._load(filepath) + + data_bundle = DataBundle(datasets=datasets) + + return data_bundle + + def download(self): + """ + 如果你使用了这个数据,请引用 + + https://www.nyu.edu/projects/bowman/multinli/paper.pdf + :return: + """ + output_dir = self._get_dataset_path('mnli') + return output_dir + + +class SNLILoader(JsonLoader): + """ + 文件每一行是一个sample,每一行都为一个json对象,其数据格式为: + + Example:: + + {"annotator_labels": ["neutral", "entailment", "neutral", "neutral", "neutral"], "captionID": "4705552913.jpg#2", + "gold_label": "neutral", "pairID": "4705552913.jpg#2r1n", + "sentence1": "Two women are embracing while holding to go packages.", + "sentence1_binary_parse": "( ( Two women ) ( ( are ( embracing ( while ( holding ( to ( go packages ) ) ) ) ) ) . ) )", + "sentence1_parse": "(ROOT (S (NP (CD Two) (NNS women)) (VP (VBP are) (VP (VBG embracing) (SBAR (IN while) (S (NP (VBG holding)) (VP (TO to) (VP (VB go) (NP (NNS packages)))))))) (. .)))", + "sentence2": "The sisters are hugging goodbye while holding to go packages after just eating lunch.", + "sentence2_binary_parse": "( ( The sisters ) ( ( are ( ( hugging goodbye ) ( while ( holding ( to ( ( go packages ) ( after ( just ( eating lunch ) ) ) ) ) ) ) ) ) . ) )", + "sentence2_parse": "(ROOT (S (NP (DT The) (NNS sisters)) (VP (VBP are) (VP (VBG hugging) (NP (UH goodbye)) (PP (IN while) (S (VP (VBG holding) (S (VP (TO to) (VP (VB go) (NP (NNS packages)) (PP (IN after) (S (ADVP (RB just)) (VP (VBG eating) (NP (NN lunch))))))))))))) (. .)))" + } + + 读取之后的DataSet中的field情况为 + + .. 
csv-table:: 下面是使用SNLILoader加载的DataSet所具备的field + :header: "target", "raw_words1", "raw_words2", + + "neutral ", "Two women are embracing while holding..", "The sisters are hugging goodbye..." + "entailment", "Two women are embracing while holding...", "Two woman are holding packages." + "...", "...", "..." + + """ + + def __init__(self): + super().__init__(fields={ + 'sentence1': Const.RAW_WORDS(0), + 'sentence2': Const.RAW_WORDS(1), + 'gold_label': Const.TARGET, + }) + + def load(self, paths: Union[str, Dict[str, str]] = None) -> DataBundle: + """ + 从指定一个或多个路径中的文件中读取数据,返回 :class:`~fastNLP.io.DataBundle` 。 + + 读取的field根据Loader初始化时传入的field决定。 + + :param str paths: 传入一个目录, 将在该目录下寻找snli_1.0_train.jsonl, snli_1.0_dev.jsonl + 和snli_1.0_test.jsonl三个文件。 + + :return: 返回的 :class:`~fastNLP.io.DataBundle` + """ + _paths = {} + if paths is None: + paths = self.download() + if paths: + if os.path.isdir(paths): + if not os.path.isfile(os.path.join(paths, 'snli_1.0_train.jsonl')): + raise FileNotFoundError(f"snli_1.0_train.jsonl is not found in {paths}") + _paths['train'] = os.path.join(paths, 'snli_1.0_train.jsonl') + for filename in ['snli_1.0_dev.jsonl', 'snli_1.0_test.jsonl']: + filepath = os.path.join(paths, filename) + _paths[filename.split('_')[-1].split('.')[0]] = filepath + paths = _paths + else: + raise NotADirectoryError(f"{paths} is not a valid directory.") + + datasets = {name: self._load(path) for name, path in paths.items()} + data_bundle = DataBundle(datasets=datasets) + return data_bundle + + def download(self): + """ + 如果您的文章使用了这份数据,请引用 + + http://nlp.stanford.edu/pubs/snli_paper.pdf + + :return: str + """ + return self._get_dataset_path('snli') + + +class QNLILoader(JsonLoader): + """ + 第一行为标题(具体内容会被忽略),之后每一行是一个sample,由index、问题、句子和标签构成(以制表符分割),数据结构如下: + + Example:: + + index question sentence label + 0 What came into force after the new constitution was herald? As of that day, the new constitution heralding the Second Republic came into force. entailment + + QNLI数据集的Loader, + 加载的DataSet将具备以下的field, raw_words1是question, raw_words2是sentence, target是label + + .. csv-table:: + :header: "raw_words1", "raw_words2", "target" + + "What came into force after the new...", "As of that day...", "entailment" + "...","." + + test数据集没有target列 + + """ + + def __init__(self): + super().__init__() + + def _load(self, path): + ds = DataSet() + + with open(path, 'r', encoding='utf-8') as f: + f.readline() # 跳过header + if path.endswith("test.tsv"): + warnings.warn("QNLI's test file has no target.") + for line in f: + line = line.strip() + if line: + parts = line.split('\t') + raw_words1 = parts[1] + raw_words2 = parts[2] + if raw_words1 and raw_words2: + ds.append(Instance(raw_words1=raw_words1, raw_words2=raw_words2)) + else: + for line in f: + line = line.strip() + if line: + parts = line.split('\t') + raw_words1 = parts[1] + raw_words2 = parts[2] + target = parts[-1] + if raw_words1 and raw_words2 and target: + ds.append(Instance(raw_words1=raw_words1, raw_words2=raw_words2, target=target)) + return ds + + def download(self): + """ + 如果您的实验使用到了该数据,请引用 + + https://arxiv.org/pdf/1809.05053.pdf + + :return: + """ + return self._get_dataset_path('qnli') + + +class RTELoader(Loader): + """ + 第一行为标题(具体内容会被忽略),之后每一行是一个sample,由index、句子1、句子2和标签构成(以制表符分割),数据结构如下: + + Example:: + + index sentence1 sentence2 label + 0 Dana Reeve, the widow of the actor Christopher Reeve, has died of lung cancer at age 44, according to the Christopher Reeve Foundation. Christopher Reeve had an accident. 
not_entailment + + RTE数据的loader + 加载的DataSet将具备以下的field, raw_words1是sentence0,raw_words2是sentence1, target是label + + .. csv-table:: + :header: "raw_words1", "raw_words2", "target" + + "Dana Reeve, the widow of the actor...", "Christopher Reeve had an...", "not_entailment" + "...","..." + + test数据集没有target列 + """ + + def __init__(self): + super().__init__() + + def _load(self, path: str): + ds = DataSet() + + with open(path, 'r', encoding='utf-8') as f: + f.readline() # 跳过header + if path.endswith("test.tsv"): + warnings.warn("RTE's test file has no target.") + for line in f: + line = line.strip() + if line: + parts = line.split('\t') + raw_words1 = parts[1] + raw_words2 = parts[2] + if raw_words1 and raw_words2: + ds.append(Instance(raw_words1=raw_words1, raw_words2=raw_words2)) + else: + for line in f: + line = line.strip() + if line: + parts = line.split('\t') + raw_words1 = parts[1] + raw_words2 = parts[2] + target = parts[-1] + if raw_words1 and raw_words2 and target: + ds.append(Instance(raw_words1=raw_words1, raw_words2=raw_words2, target=target)) + return ds + + def download(self): + """ + 如果您的实验使用到了该数据,请引用GLUE Benchmark + + https://openreview.net/pdf?id=rJ4km2R5t7 + + :return: + """ + return self._get_dataset_path('rte') + + +class QuoraLoader(Loader): + """ + Quora matching任务的数据集Loader + + 支持读取的文件中的内容,应该有以下的形式, 以制表符分隔,且前三列的内容必须是:第一列是label,第二列和第三列是句子 + + Example:: + + 1 How do I get funding for my web based startup idea ? How do I get seed funding pre product ? 327970 + 0 Is honey a viable alternative to sugar for diabetics ? How would you compare the United States ' euthanasia laws to Denmark ? 90348 + ... + + 加载的DataSet将具备以下的field + + .. csv-table:: + :header: "raw_words1", "raw_words2", "target" + + "How do I get funding for my web based...", "How do I get seed funding...","1" + "Is honey a viable alternative ...", "How would you compare the United...","0" + "...","...","..." + + """ + + def __init__(self): + super().__init__() + + def _load(self, path: str): + ds = DataSet() + + with open(path, 'r', encoding='utf-8') as f: + for line in f: + line = line.strip() + if line: + parts = line.split('\t') + raw_words1 = parts[1] + raw_words2 = parts[2] + target = parts[0] + if raw_words1 and raw_words2 and target: + ds.append(Instance(raw_words1=raw_words1, raw_words2=raw_words2, target=target)) + return ds + + def download(self): + """ + 由于版权限制,不能提供自动下载功能。可参考 + + https://www.kaggle.com/c/quora-question-pairs/data + + :return: + """ + raise RuntimeError("Quora cannot be downloaded automatically.") + + +class CNXNLILoader(Loader): + """ + 数据集简介:中文句对NLI(本为multi-lingual的数据集,但是这里只取了中文的数据集)。原句子已被MOSES tokenizer处理,这里我们将其还原并重新按字tokenize + 原始数据数据为: + + Example:: + + premise hypo label + 我们 家里 有 一个 但 我 没 找到 我 可以 用 的 时间 我们 家里 有 一个 但 我 从来 没有 时间 使用 它 . entailment + + dev和test中的数据为csv或json格式,包括十多个field,这里只取与以上三个field中的数据 + 读取后的Dataset将具有以下数据结构: + + .. csv-table:: + :header: "raw_chars1", "raw_chars2", "target" + + "我们 家里 有 一个 但 我 没 找到 我 可以 用 的 时间", "我们 家里 有 一个 但 我 从来 没有 时间 使用 它 .", "0" + "...", "...", "..." 
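+
+    Example (illustrative; assumes the downloaded 'cn-xnli' directory contains a train split)::
+
+        data_bundle = CNXNLILoader().load()              # downloads and caches 'cn-xnli' when no path is given
+        train_data = data_bundle.get_dataset('train')    # fields: raw_chars1, raw_chars2, target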
+ + """ + + def __init__(self): + super(CNXNLILoader, self).__init__() + + def _load(self, path: str = None): + ds_all = DataSet() + with open(path, 'r', encoding='utf-8') as f: + head_name_list = f.readline().strip().split('\t') + sentence1_index = head_name_list.index('sentence1') + sentence2_index = head_name_list.index('sentence2') + gold_label_index = head_name_list.index('gold_label') + language_index = head_name_list.index(('language')) + + for line in f: + line = line.strip() + raw_instance = line.split('\t') + sentence1 = raw_instance[sentence1_index] + sentence2 = raw_instance[sentence2_index] + gold_label = raw_instance[gold_label_index] + language = raw_instance[language_index] + if sentence1: + ds_all.append(Instance(sentence1=sentence1, sentence2=sentence2, gold_label=gold_label, language=language)) + + ds_zh = DataSet() + for i in ds_all: + if i['language'] == 'zh': + ds_zh.append(Instance(raw_chars1=i['sentence1'], raw_chars2=i['sentence2'], target=i['gold_label'])) + + return ds_zh + + def _load_train(self, path: str = None): + ds = DataSet() + + with open(path, 'r', encoding='utf-8') as f: + next(f) + for line in f: + raw_instance = line.strip().split('\t') + premise = "".join(raw_instance[0].split())# 把已经分好词的premise和hypo强制还原为character segmentation + hypo = "".join(raw_instance[1].split()) + label = "".join(raw_instance[-1].split()) + if premise: + ds.append(Instance(premise=premise, hypo=hypo, label=label)) + + ds.rename_field('label', 'target') + ds.rename_field('premise', 'raw_chars1') + ds.rename_field('hypo', 'raw_chars2') + ds.apply(lambda i: "".join(i['raw_chars1'].split()), new_field_name='raw_chars1') + ds.apply(lambda i: "".join(i['raw_chars2'].split()), new_field_name='raw_chars2') + return ds + + def load(self, paths: Union[str, Dict[str, str]] = None) -> DataBundle: + if paths is None: + paths = self.download() + paths = check_loader_paths(paths) + datasets = {} + for name, path in paths.items(): + if name == 'train': + datasets[name] = self._load_train(path) + else: + datasets[name] = self._load(path) + + data_bundle = DataBundle(datasets=datasets) + return data_bundle + + def download(self) -> str: + """ + 自动下载数据,该数据取自 https://arxiv.org/abs/1809.05053 + 在 https://arxiv.org/pdf/1905.05526.pdf https://arxiv.org/pdf/1901.10125.pdf + https://arxiv.org/pdf/1809.05053.pdf 有使用 + :return: + """ + output_dir = self._get_dataset_path('cn-xnli') + return output_dir + + +class BQCorpusLoader(Loader): + """ + 别名: + 数据集简介:句子对二分类任务(判断是否具有相同的语义) + 原始数据结构为: + + Example:: + + sentence1,sentence2,label + 综合评分不足什么原因,综合评估的依据,0 + 什么时候我能使用微粒贷,你就赶快给我开通就行了,0 + + 读取后的Dataset将具有以下数据结构: + + .. csv-table:: + :header: "raw_chars1", "raw_chars2", "target" + + "综合评分不足什么原因", "综合评估的依据", "0" + "什么时候我能使用微粒贷", "你就赶快给我开通就行了", "0" + "...", "...", "..." 
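+
+    Example (illustrative; BQ Corpus must be obtained manually, so the file paths below are placeholders)::
+
+        paths = {'train': '/path/to/BQ/train.csv', 'dev': '/path/to/BQ/dev.csv'}   # placeholder paths
+        data_bundle = BQCorpusLoader().load(paths)
+        train_data = data_bundle.get_dataset('train')    # fields: raw_chars1, raw_chars2, target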
+ + """ + + def __init__(self): + super(BQCorpusLoader, self).__init__() + + def _load(self, path: str = None): + ds = DataSet() + with open(path, 'r', encoding='utf-8') as f: + next(f) + for line in f: + line = line.strip() + target = line[-1] + sep_index = line.index(',') + raw_chars1 = line[:sep_index] + raw_chars2 = line[sep_index + 1:] + + if raw_chars1: + ds.append(Instance(raw_chars1=raw_chars1, raw_chars2=raw_chars2, target=target)) + return ds + + def download(self): + """ + 由于版权限制,不能提供自动下载功能。可参考 + + https://github.com/ymcui/Chinese-BERT-wwm + + :return: + """ + raise RuntimeError("BQCorpus cannot be downloaded automatically.") + + +class LCQMCLoader(Loader): + r""" + 数据集简介:句对匹配(question matching) + + 原始数据为: + + Example:: + + 喜欢打篮球的男生喜欢什么样的女生 爱打篮球的男生喜欢什么样的女生 1 + 你帮我设计小说的封面吧 谁能帮我给小说设计个封面? 0 + + + 读取后的Dataset将具有以下的数据结构 + + .. csv-table:: + :header: "raw_chars1", "raw_chars2", "target" + + "喜欢打篮球的男生喜欢什么样的女生", "爱打篮球的男生喜欢什么样的女生", "1" + "你帮我设计小说的封面吧", "妇可以戴耳机听音乐吗?", "0" + "...", "...", "..." + + + """ + + def __init__(self): + super(LCQMCLoader, self).__init__() + + def _load(self, path: str = None): + ds = DataSet() + with open(path, 'r', encoding='utf-8') as f: + for line in f: + line = line.strip() + line_segments = line.split('\t') + assert len(line_segments) == 3 + + target = line_segments[-1] + + raw_chars1 = line_segments[0] + raw_chars2 = line_segments[1] + + if raw_chars1: + ds.append(Instance(raw_chars1=raw_chars1, raw_chars2=raw_chars2, target=target)) + return ds + + def download(self): + """ + 由于版权限制,不能提供自动下载功能。可参考 + + https://github.com/ymcui/Chinese-BERT-wwm + + :return: + """ + raise RuntimeError("LCQMC cannot be downloaded automatically.") + + diff --git a/fastNLP/io/loader/qa.py b/fastNLP/io/loader/qa.py new file mode 100644 index 00000000..782a2701 --- /dev/null +++ b/fastNLP/io/loader/qa.py @@ -0,0 +1,74 @@ +""" +该文件中的Loader主要用于读取问答式任务的数据 + +""" + + +from . import Loader +import json +from ...core import DataSet, Instance + +__all__ = ['CMRC2018Loader'] + + +class CMRC2018Loader(Loader): + """ + 请直接使用从fastNLP下载的数据进行处理。该数据集未提供测试集,测试需要通过上传到对应的系统进行评测 + + 读取之后训练集DataSet将具备以下的内容,每个问题的答案只有一个 + + .. csv-table:: + :header:"title", "context", "question", "answers", "answer_starts", "id" + + "范廷颂", "范廷颂枢机(,),圣名保禄·若瑟()...", "范廷颂是什么时候被任为主教的?", ["1963年"], ["30"], "TRAIN_186_QUERY_0" + "范廷颂", "范廷颂枢机(,),圣名保禄·若瑟()...", "1990年,范廷颂担任什么职务?", ["1990年被擢升为天..."], ["41"],"TRAIN_186_QUERY_1" + "...", "...", "...","...", ".", "..." + + 其中title是文本的标题,多条记录可能是相同的title;id是该问题的id,具备唯一性 + + 验证集DataSet将具备以下的内容,每个问题的答案可能有三个(有时候只是3个重复的答案) + + .. csv-table:: + :header:"title", "context", "question", "answers", "answer_starts", "id" + + "战国无双3", "《战国无双3》()是由光荣和ω-force开发...", "《战国无双3》是由哪两个公司合作开发的?", ["光荣和ω-force", "光荣和ω-force", "光荣和ω-force"], ["30", "30", "30"], "DEV_0_QUERY_0" + "战国无双3", "《战国无双3》()是由光荣和ω-force开发...", "男女主角亦有专属声优这一模式是由谁改编的?", ["村雨城", "村雨城", "任天堂游戏谜之村雨城"], ["226", "226", "219"], "DEV_0_QUERY_1" + "...", "...", "...","...", ".", "..." 
+ + 其中answer_starts是从0开始的index。例如"我来自a复旦大学?",其中"复"的开始index为4。另外"Russell评价说"中的说的index为9, 因为 + 英文和数字都直接按照character计量的。 + """ + def __init__(self): + super().__init__() + + def _load(self, path: str) -> DataSet: + with open(path, 'r', encoding='utf-8') as f: + data = json.load(f)['data'] + ds = DataSet() + for entry in data: + title = entry['title'] + para = entry['paragraphs'][0] + context = para['context'] + qas = para['qas'] + for qa in qas: + question = qa['question'] + ans = qa['answers'] + answers = [] + answer_starts = [] + id = qa['id'] + for an in ans: + answers.append(an['text']) + answer_starts.append(an['answer_start']) + ds.append(Instance(title=title, context=context, question=question, answers=answers, + answer_starts=answer_starts,id=id)) + return ds + + def download(self) -> str: + """ + 如果您使用了本数据,请引用A Span-Extraction Dataset for Chinese Machine Reading Comprehension. Yiming Cui, Ting Liu, etc. + + :return: + """ + output_dir = self._get_dataset_path('cmrc2018') + return output_dir + diff --git a/fastNLP/io/loader/summarization.py b/fastNLP/io/loader/summarization.py new file mode 100644 index 00000000..95b18af7 --- /dev/null +++ b/fastNLP/io/loader/summarization.py @@ -0,0 +1,63 @@ +"""undocumented""" + +__all__ = [ + "ExtCNNDMLoader" +] + +import os +from typing import Union, Dict + +from ..data_bundle import DataBundle +from ..utils import check_loader_paths +from .json import JsonLoader + + +class ExtCNNDMLoader(JsonLoader): + """ + 读取之后的DataSet中的field情况为 + + .. csv-table:: + :header: "text", "summary", "label", "publication" + + ["I got new tires from them and... ","..."], ["The new tires...","..."], [0, 1], "cnndm" + ["Don't waste your time. We had two...","..."], ["Time is precious","..."], [1], "cnndm" + ["..."], ["..."], [], "cnndm" + + """ + + def __init__(self, fields=None): + fields = fields or {"text": None, "summary": None, "label": None, "publication": None} + super(ExtCNNDMLoader, self).__init__(fields=fields) + + def load(self, paths: Union[str, Dict[str, str]] = None): + """ + 从指定一个或多个路径中的文件中读取数据,返回 :class:`~fastNLP.io.DataBundle` 。 + + 读取的field根据ExtCNNDMLoader初始化时传入的headers决定。 + + :param str paths: 传入一个目录, 将在该目录下寻找train.label.jsonl, dev.label.jsonl + test.label.jsonl三个文件(该目录还应该需要有一个名字为vocab的文件,在 :class:`~fastNLP.io.ExtCNNDMPipe` + 当中需要用到)。 + + :return: 返回 :class:`~fastNLP.io.DataBundle` + """ + if paths is None: + paths = self.download() + paths = check_loader_paths(paths) + if ('train' in paths) and ('test' not in paths): + paths['test'] = paths['train'] + paths.pop('train') + + datasets = {name: self._load(path) for name, path in paths.items()} + data_bundle = DataBundle(datasets=datasets) + return data_bundle + + def download(self): + """ + 如果你使用了这个数据,请引用 + + https://arxiv.org/pdf/1506.03340.pdf + :return: + """ + output_dir = self._get_dataset_path('ext-cnndm') + return output_dir diff --git a/fastNLP/io/model_io.py b/fastNLP/io/model_io.py index ffaa4ef5..9da921df 100644 --- a/fastNLP/io/model_io.py +++ b/fastNLP/io/model_io.py @@ -8,13 +8,9 @@ __all__ = [ import torch -from .base_loader import BaseLoader - -class ModelLoader(BaseLoader): +class ModelLoader: """ - 别名::class:`fastNLP.io.ModelLoader` :class:`fastNLP.io.model_io.ModelLoader` - 用于读取模型 """ @@ -43,8 +39,6 @@ class ModelLoader(BaseLoader): class ModelSaver(object): """ - 别名::class:`fastNLP.io.ModelSaver` :class:`fastNLP.io.model_io.ModelSaver` - 用于保存模型 Example:: diff --git a/fastNLP/io/pipe/__init__.py b/fastNLP/io/pipe/__init__.py new file mode 100644 index 00000000..aa2a59ca --- /dev/null 
+++ b/fastNLP/io/pipe/__init__.py @@ -0,0 +1,68 @@ +""" +Pipe用于处理通过 Loader 读取的数据,所有的 Pipe 都包含 ``process`` 和 ``process_from_file`` 两种方法。 +``process(data_bundle)`` 传入一个 :class:`~fastNLP.io.DataBundle` 类型的对象, 在传入的 `data_bundle` 上进行原位修改,并将其返回; +``process_from_file(paths)`` 传入的文件路径,返回一个 :class:`~fastNLP.io.DataBundle` 类型的对象。 +``process(data_bundle)`` 或者 ``process_from_file(paths)`` 的返回 `data_bundle` 中的 :class:`~fastNLP.DataSet` +一般都包含原文与转换为index的输入以及转换为index的target;除了 :class:`~fastNLP.DataSet` 之外, +`data_bundle` 还会包含将field转为index时所建立的词表。 + +""" +__all__ = [ + "Pipe", + + "CWSPipe", + + "YelpFullPipe", + "YelpPolarityPipe", + "SSTPipe", + "SST2Pipe", + "IMDBPipe", + "ChnSentiCorpPipe", + "THUCNewsPipe", + "WeiboSenti100kPipe", + + "Conll2003NERPipe", + "OntoNotesNERPipe", + "MsraNERPipe", + "WeiboNERPipe", + "PeopleDailyPipe", + "Conll2003Pipe", + + "MatchingBertPipe", + "RTEBertPipe", + "SNLIBertPipe", + "QuoraBertPipe", + "QNLIBertPipe", + "MNLIBertPipe", + "CNXNLIBertPipe", + "BQCorpusBertPipe", + "LCQMCBertPipe", + "MatchingPipe", + "RTEPipe", + "SNLIPipe", + "QuoraPipe", + "QNLIPipe", + "MNLIPipe", + "LCQMCPipe", + "CNXNLIPipe", + "BQCorpusPipe", + "RenamePipe", + "GranularizePipe", + "MachingTruncatePipe", + + "CoReferencePipe", + + "CMRC2018BertPipe" +] + +from .classification import YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, ChnSentiCorpPipe, THUCNewsPipe, \ + WeiboSenti100kPipe +from .conll import Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, WeiboNERPipe, PeopleDailyPipe +from .conll import Conll2003Pipe +from .coreference import CoReferencePipe +from .cws import CWSPipe +from .matching import MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, \ + MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe, CNXNLIBertPipe, CNXNLIPipe, BQCorpusBertPipe, \ + LCQMCPipe, BQCorpusPipe, LCQMCBertPipe, RenamePipe, GranularizePipe, MachingTruncatePipe +from .pipe import Pipe +from .qa import CMRC2018BertPipe diff --git a/fastNLP/io/pipe/classification.py b/fastNLP/io/pipe/classification.py new file mode 100644 index 00000000..ab31c9de --- /dev/null +++ b/fastNLP/io/pipe/classification.py @@ -0,0 +1,881 @@ +"""undocumented""" + +__all__ = [ + "YelpFullPipe", + "YelpPolarityPipe", + "SSTPipe", + "SST2Pipe", + 'IMDBPipe', + "ChnSentiCorpPipe", + "THUCNewsPipe", + "WeiboSenti100kPipe" +] + +import re +import warnings + +from nltk import Tree + +from .pipe import Pipe +from .utils import get_tokenizer, _indexize, _add_words_field, _drop_empty_instance, _add_chars_field +from ..data_bundle import DataBundle +from ..loader.classification import ChnSentiCorpLoader, THUCNewsLoader, WeiboSenti100kLoader +from ..loader.classification import IMDBLoader, YelpFullLoader, SSTLoader, SST2Loader, YelpPolarityLoader +from ...core._logger import logger +from ...core.const import Const +from ...core.dataset import DataSet +from ...core.instance import Instance +from ...core.vocabulary import Vocabulary + +nonalpnum = re.compile('[^0-9a-zA-Z?!\']+') + + +class _CLSPipe(Pipe): + """ + 分类问题的基类,负责对classification的数据进行tokenize操作。默认是对raw_words列操作,然后生成words列 + + """ + + def __init__(self, tokenizer: str = 'spacy', lang='en'): + + self.tokenizer = get_tokenizer(tokenizer, lang=lang) + + def _tokenize(self, data_bundle, field_name=Const.INPUT, new_field_name=None): + """ + 将DataBundle中的数据进行tokenize + + :param DataBundle data_bundle: + :param str field_name: + :param str new_field_name: + :return: 传入的DataBundle对象 + """ + new_field_name = new_field_name or 
field_name + for name, dataset in data_bundle.datasets.items(): + dataset.apply_field(self.tokenizer, field_name=field_name, new_field_name=new_field_name) + + return data_bundle + + def _granularize(self, data_bundle, tag_map): + """ + 该函数对data_bundle中'target'列中的内容进行转换。 + + :param data_bundle: + :param dict tag_map: 将target列中的tag做以下的映射,比如{"0":0, "1":0, "3":1, "4":1}, 则会删除target为"2"的instance, + 且将"1"认为是第0类。 + :return: 传入的data_bundle + """ + for name in list(data_bundle.datasets.keys()): + dataset = data_bundle.get_dataset(name) + dataset.apply_field(lambda target: tag_map.get(target, -100), field_name=Const.TARGET, + new_field_name=Const.TARGET) + dataset.drop(lambda ins: ins[Const.TARGET] == -100) + data_bundle.set_dataset(dataset, name) + return data_bundle + + +def _clean_str(words): + """ + heavily borrowed from github + https://github.com/LukeZhuang/Hierarchical-Attention-Network/blob/master/yelp-preprocess.ipynb + :param sentence: is a str + :return: + """ + words_collection = [] + for word in words: + if word in ['-lrb-', '-rrb-', '', '-r', '-l', 'b-']: + continue + tt = nonalpnum.split(word) + t = ''.join(tt) + if t != '': + words_collection.append(t) + + return words_collection + + +class YelpFullPipe(_CLSPipe): + """ + 处理YelpFull的数据, 处理之后DataSet中的内容如下 + + .. csv-table:: 下面是使用YelpFullPipe处理后的DataSet所具备的field + :header: "raw_words", "target", "words", "seq_len" + + "I got 'new' tires from them and within...", 0 ,"[7, 110, 22, 107, 22, 499, 59, 140, 3,...]", 160 + " Don't waste your time. We had two dif... ", 0, "[277, 17, 278, 38, 30, 112, 24, 85, 27...", 40 + "...", ., "[...]", . + + dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: + + +-------------+-----------+--------+-------+---------+ + | field_names | raw_words | target | words | seq_len | + +-------------+-----------+--------+-------+---------+ + | is_input | False | False | True | True | + | is_target | False | True | False | False | + | ignore_type | | False | False | False | + | pad_value | | 0 | 0 | 0 | + +-------------+-----------+--------+-------+---------+ + + """ + + def __init__(self, lower: bool = False, granularity=5, tokenizer: str = 'spacy'): + """ + + :param bool lower: 是否对输入进行小写化。 + :param int granularity: 支持2, 3, 5。若为2, 则认为是2分类问题,将1、2归为1类,4、5归为一类,丢掉2;若为3, 则有3分类问题,将 + 1、2归为1类,3归为1类,4、5归为1类;若为5, 则有5分类问题。 + :param str tokenizer: 使用哪种tokenize方式将数据切成单词。支持'spacy'和'raw'。raw使用空格作为切分。 + """ + super().__init__(tokenizer=tokenizer, lang='en') + self.lower = lower + assert granularity in (2, 3, 5), "granularity can only be 2,3,5." + self.granularity = granularity + + if granularity == 2: + self.tag_map = {"1": 0, "2": 0, "4": 1, "5": 1} + elif granularity == 3: + self.tag_map = {"1": 0, "2": 0, "3": 1, "4": 2, "5": 2} + else: + self.tag_map = {"1": 0, "2": 1, "3": 2, "4": 3, "5": 4} + + def _tokenize(self, data_bundle, field_name=Const.INPUT, new_field_name=None): + """ + 将DataBundle中的数据进行tokenize + + :param DataBundle data_bundle: + :param str field_name: + :param str new_field_name: + :return: 传入的DataBundle对象 + """ + new_field_name = new_field_name or field_name + for name, dataset in data_bundle.datasets.items(): + dataset.apply_field(self.tokenizer, field_name=field_name, new_field_name=new_field_name) + dataset.apply_field(_clean_str, field_name=field_name, new_field_name=new_field_name) + return data_bundle + + def process(self, data_bundle): + """ + 传入的DataSet应该具备如下的结构 + + .. csv-table:: + :header: "raw_words", "target" + + "I got 'new' tires from them and... ", "1" + "Don't waste your time. 
We had two...", "1" + "...", "..." + + :param data_bundle: + :return: + """ + + # 复制一列words + data_bundle = _add_words_field(data_bundle, lower=self.lower) + + # 进行tokenize + data_bundle = self._tokenize(data_bundle=data_bundle, field_name=Const.INPUT) + + # 根据granularity设置tag + data_bundle = self._granularize(data_bundle, tag_map=self.tag_map) + + # 删除空行 + data_bundle = _drop_empty_instance(data_bundle, field_name=Const.INPUT) + + # index + data_bundle = _indexize(data_bundle=data_bundle) + + for name, dataset in data_bundle.datasets.items(): + dataset.add_seq_len(Const.INPUT) + + data_bundle.set_input(Const.INPUT, Const.INPUT_LEN) + data_bundle.set_target(Const.TARGET) + + return data_bundle + + def process_from_file(self, paths=None): + """ + + :param paths: + :return: DataBundle + """ + data_bundle = YelpFullLoader().load(paths) + return self.process(data_bundle=data_bundle) + + +class YelpPolarityPipe(_CLSPipe): + """ + 处理YelpPolarity的数据, 处理之后DataSet中的内容如下 + + .. csv-table:: 下面是使用YelpFullPipe处理后的DataSet所具备的field + :header: "raw_words", "target", "words", "seq_len" + + "I got 'new' tires from them and within...", 0 ,"[7, 110, 22, 107, 22, 499, 59, 140, 3,...]", 160 + " Don't waste your time. We had two dif... ", 0, "[277, 17, 278, 38, 30, 112, 24, 85, 27...", 40 + "...", ., "[...]", . + + dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: + + +-------------+-----------+--------+-------+---------+ + | field_names | raw_words | target | words | seq_len | + +-------------+-----------+--------+-------+---------+ + | is_input | False | False | True | True | + | is_target | False | True | False | False | + | ignore_type | | False | False | False | + | pad_value | | 0 | 0 | 0 | + +-------------+-----------+--------+-------+---------+ + + """ + + def __init__(self, lower: bool = False, tokenizer: str = 'spacy'): + """ + + :param bool lower: 是否对输入进行小写化。 + :param str tokenizer: 使用哪种tokenize方式将数据切成单词。支持'spacy'和'raw'。raw使用空格作为切分。 + """ + super().__init__(tokenizer=tokenizer, lang='en') + self.lower = lower + + def process(self, data_bundle): + """ + 传入的DataSet应该具备如下的结构 + + .. csv-table:: + :header: "raw_words", "target" + + "I got 'new' tires from them and... ", "1" + "Don't waste your time. We had two...", "1" + "...", "..." + + :param data_bundle: + :return: + """ + # 复制一列words + data_bundle = _add_words_field(data_bundle, lower=self.lower) + + # 进行tokenize + data_bundle = self._tokenize(data_bundle=data_bundle, field_name=Const.INPUT) + # index + data_bundle = _indexize(data_bundle=data_bundle) + + for name, dataset in data_bundle.datasets.items(): + dataset.add_seq_len(Const.INPUT) + + data_bundle.set_input(Const.INPUT, Const.INPUT_LEN) + data_bundle.set_target(Const.TARGET) + + return data_bundle + + def process_from_file(self, paths=None): + """ + + :param str paths: + :return: DataBundle + """ + data_bundle = YelpPolarityLoader().load(paths) + return self.process(data_bundle=data_bundle) + + +class SSTPipe(_CLSPipe): + """ + 经过该Pipe之后,DataSet中具备的field如下所示 + + .. csv-table:: 下面是使用SSTPipe处理后的DataSet所具备的field + :header: "raw_words", "words", "target", "seq_len" + + "It 's a lovely film with lovely perfor...", 1, "[187, 6, 5, 132, 120, 70, 132, 188, 25...", 13 + "No one goes unindicted here , which is...", 0, "[191, 126, 192, 193, 194, 4, 195, 17, ...", 13 + "...", ., "[...]", . 
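+
+    A minimal usage sketch (passing no ``paths`` assumes the corpus can be downloaded and cached
+    automatically, which may not hold in every environment)::
+
+        from fastNLP.io.pipe import SSTPipe
+
+        pipe = SSTPipe(granularity=2, subtree=False, lower=True)
+        data_bundle = pipe.process_from_file()   # SSTLoader is used internally
+        print(data_bundle)                       # summary of datasets and vocabularies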
+ + dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: + + +-------------+-----------+--------+-------+---------+ + | field_names | raw_words | target | words | seq_len | + +-------------+-----------+--------+-------+---------+ + | is_input | False | False | True | True | + | is_target | False | True | False | False | + | ignore_type | | False | False | False | + | pad_value | | 0 | 0 | 0 | + +-------------+-----------+--------+-------+---------+ + + """ + + def __init__(self, subtree=False, train_subtree=True, lower=False, granularity=5, tokenizer='spacy'): + """ + + :param bool subtree: 是否将train, test, dev数据展开为子树,扩充数据量。 Default: ``False`` + :param bool train_subtree: 是否将train集通过子树扩展数据。 + :param bool lower: 是否对输入进行小写化。 + :param int granularity: 支持2, 3, 5。若为2, 则认为是2分类问题,将0、1归为1类,3、4归为一类,丢掉2;若为3, 则有3分类问题,将 + 0、1归为1类,2归为1类,3、4归为1类;若为5, 则有5分类问题。 + :param str tokenizer: 使用哪种tokenize方式将数据切成单词。支持'spacy'和'raw'。raw使用空格作为切分。 + """ + super().__init__(tokenizer=tokenizer, lang='en') + self.subtree = subtree + self.train_tree = train_subtree + self.lower = lower + assert granularity in (2, 3, 5), "granularity can only be 2,3,5." + self.granularity = granularity + + if granularity == 2: + self.tag_map = {"0": 0, "1": 0, "3": 1, "4": 1} + elif granularity == 3: + self.tag_map = {"0": 0, "1": 0, "2": 1, "3": 2, "4": 2} + else: + self.tag_map = {"0": 0, "1": 1, "2": 2, "3": 3, "4": 4} + + def process(self, data_bundle: DataBundle): + """ + 对DataBundle中的数据进行预处理。输入的DataSet应该至少拥有raw_words这一列,且内容类似与 + + .. csv-table:: 下面是使用SSTLoader读取的DataSet所具备的field + :header: "raw_words" + + "(2 (3 (3 Effective) (2 but)) (1 (1 too-tepid)..." + "(3 (3 (2 If) (3 (2 you) (3 (2 sometimes) ..." + "..." + + :param ~fastNLP.io.DataBundle data_bundle: 需要处理的DataBundle对象 + :return: + """ + # 先取出subtree + for name in list(data_bundle.datasets.keys()): + dataset = data_bundle.get_dataset(name) + ds = DataSet() + use_subtree = self.subtree or (name == 'train' and self.train_tree) + for ins in dataset: + raw_words = ins['raw_words'] + tree = Tree.fromstring(raw_words) + if use_subtree: + for t in tree.subtrees(): + raw_words = " ".join(t.leaves()) + instance = Instance(raw_words=raw_words, target=t.label()) + ds.append(instance) + else: + instance = Instance(raw_words=' '.join(tree.leaves()), target=tree.label()) + ds.append(instance) + data_bundle.set_dataset(ds, name) + + _add_words_field(data_bundle, lower=self.lower) + + # 进行tokenize + data_bundle = self._tokenize(data_bundle=data_bundle, field_name=Const.INPUT) + + # 根据granularity设置tag + data_bundle = self._granularize(data_bundle, tag_map=self.tag_map) + + # index + data_bundle = _indexize(data_bundle=data_bundle) + + for name, dataset in data_bundle.datasets.items(): + dataset.add_seq_len(Const.INPUT) + + data_bundle.set_input(Const.INPUT, Const.INPUT_LEN) + data_bundle.set_target(Const.TARGET) + + return data_bundle + + def process_from_file(self, paths=None): + data_bundle = SSTLoader().load(paths) + return self.process(data_bundle=data_bundle) + + +class SST2Pipe(_CLSPipe): + """ + 加载SST2的数据, 处理完成之后DataSet将拥有以下的field + + .. csv-table:: + :header: "raw_words", "target", "words", "seq_len" + + "it 's a charming and often affecting j... ", 1, "[19, 9, 6, 111, 5, 112, 113, 114, 3]", 9 + "unflinchingly bleak and desperate", 0, "[115, 116, 5, 117]", 4 + "...", "...", ., . 
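+
+    A minimal sketch of inspecting the processed result; ``'words'`` and ``'target'`` are the
+    vocabulary names this pipe registers, and automatic download via a ``None`` path is an
+    assumption::
+
+        from fastNLP.io.pipe import SST2Pipe
+
+        data_bundle = SST2Pipe(lower=True).process_from_file()
+        words_vocab = data_bundle.get_vocab('words')     # source-side Vocabulary
+        target_vocab = data_bundle.get_vocab('target')   # label Vocabulary
+        print(len(words_vocab), target_vocab.word2idx)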
+ + dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: + + +-------------+-----------+--------+-------+---------+ + | field_names | raw_words | target | words | seq_len | + +-------------+-----------+--------+-------+---------+ + | is_input | False | False | True | True | + | is_target | False | True | False | False | + | ignore_type | | False | False | False | + | pad_value | | 0 | 0 | 0 | + +-------------+-----------+--------+-------+---------+ + + """ + + def __init__(self, lower=False, tokenizer='spacy'): + """ + + :param bool lower: 是否对输入进行小写化。 + :param str tokenizer: 使用哪种tokenize方式将数据切成单词。支持'spacy'和'raw'。raw使用空格作为切分。 + """ + super().__init__(tokenizer=tokenizer, lang='en') + self.lower = lower + + def process(self, data_bundle: DataBundle): + """ + 可以处理的DataSet应该具备如下的结构 + + .. csv-table:: + :header: "raw_words", "target" + + "it 's a charming and often affecting...", "1" + "unflinchingly bleak and...", "0" + "..." + + :param data_bundle: + :return: + """ + _add_words_field(data_bundle, self.lower) + + data_bundle = self._tokenize(data_bundle=data_bundle) + + src_vocab = Vocabulary() + src_vocab.from_dataset(data_bundle.datasets['train'], field_name=Const.INPUT, + no_create_entry_dataset=[dataset for name, dataset in data_bundle.datasets.items() if + name != 'train']) + src_vocab.index_dataset(*data_bundle.datasets.values(), field_name=Const.INPUT) + + tgt_vocab = Vocabulary(unknown=None, padding=None) + tgt_vocab.from_dataset(*[ds for name, ds in data_bundle.iter_datasets() if 'train' in name], + field_name=Const.TARGET, + no_create_entry_dataset=[ds for name, ds in data_bundle.iter_datasets() + if ('train' not in name) and (ds.has_field(Const.TARGET))] + ) + if len(tgt_vocab._no_create_word) > 0: + warn_msg = f"There are {len(tgt_vocab._no_create_word)} target labels" \ + f" in {[name for name in data_bundle.datasets.keys() if 'train' not in name]} " \ + f"data set but not in train data set!." + warnings.warn(warn_msg) + logger.warning(warn_msg) + datasets = [] + for name, dataset in data_bundle.datasets.items(): + if dataset.has_field(Const.TARGET): + datasets.append(dataset) + tgt_vocab.index_dataset(*datasets, field_name=Const.TARGET) + + data_bundle.set_vocab(src_vocab, Const.INPUT) + data_bundle.set_vocab(tgt_vocab, Const.TARGET) + + for name, dataset in data_bundle.datasets.items(): + dataset.add_seq_len(Const.INPUT) + + data_bundle.set_input(Const.INPUT, Const.INPUT_LEN) + data_bundle.set_target(Const.TARGET) + + return data_bundle + + def process_from_file(self, paths=None): + """ + + :param str paths: 如果为None,则自动下载并缓存到fastNLP的缓存地址。 + :return: DataBundle + """ + data_bundle = SST2Loader().load(paths) + return self.process(data_bundle) + + +class IMDBPipe(_CLSPipe): + """ + 经过本Pipe处理后DataSet将如下 + + .. csv-table:: 输出DataSet的field + :header: "raw_words", "target", "words", "seq_len" + + "Bromwell High is a cartoon ... ", 0, "[3, 5, 6, 9, ...]", 20 + "Story of a man who has ...", 1, "[20, 43, 9, 10, ...]", 31 + "...", ., "[...]", . 
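+
+    ``process()`` can also be fed a hand-built :class:`~fastNLP.io.DataBundle`, as long as each
+    DataSet carries ``raw_words`` and ``target`` fields; the toy instances below are purely
+    illustrative::
+
+        from fastNLP import DataSet, Instance
+        from fastNLP.io import DataBundle
+        from fastNLP.io.pipe import IMDBPipe
+
+        ds = DataSet()
+        ds.append(Instance(raw_words="a small but moving film", target="pos"))
+        ds.append(Instance(raw_words="two hours I will never get back", target="neg"))
+        data_bundle = IMDBPipe(lower=True, tokenizer='raw').process(DataBundle(datasets={'train': ds}))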
+ + 其中raw_words为str类型,是原文; words是转换为index的输入; target是转换为index的目标值; + words列被设置为input; target列被设置为target。 + + dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: + + +-------------+-----------+--------+-------+---------+ + | field_names | raw_words | target | words | seq_len | + +-------------+-----------+--------+-------+---------+ + | is_input | False | False | True | True | + | is_target | False | True | False | False | + | ignore_type | | False | False | False | + | pad_value | | 0 | 0 | 0 | + +-------------+-----------+--------+-------+---------+ + + """ + + def __init__(self, lower: bool = False, tokenizer: str = 'spacy'): + """ + + :param bool lower: 是否将words列的数据小写。 + :param str tokenizer: 使用什么tokenizer来将句子切分为words. 支持spacy, raw两种。raw即使用空格拆分。 + """ + super().__init__(tokenizer=tokenizer, lang='en') + self.lower = lower + + def process(self, data_bundle: DataBundle): + """ + 期待的DataBunlde中输入的DataSet应该类似于如下,有两个field,raw_words和target,且均为str类型 + + .. csv-table:: 输入DataSet的field + :header: "raw_words", "target" + + "Bromwell High is a cartoon ... ", "pos" + "Story of a man who has ...", "neg" + "...", "..." + + :param DataBunlde data_bundle: 传入的DataBundle中的DataSet必须包含raw_words和target两个field,且raw_words列应该为str, + target列应该为str。 + :return: DataBundle + """ + + # 替换
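+        # the raw IMDB reviews contain HTML "<br />" line breaks; replace them with plain spaces below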
+ def replace_br(raw_words): +            raw_words = raw_words.replace("<br />
", ' ') + return raw_words + + for name, dataset in data_bundle.datasets.items(): + dataset.apply_field(replace_br, field_name=Const.RAW_WORD, new_field_name=Const.RAW_WORD) + + _add_words_field(data_bundle, lower=self.lower) + self._tokenize(data_bundle, field_name=Const.INPUT, new_field_name=Const.INPUT) + _indexize(data_bundle) + + for name, dataset in data_bundle.datasets.items(): + dataset.add_seq_len(Const.INPUT) + dataset.set_input(Const.INPUT, Const.INPUT_LEN) + dataset.set_target(Const.TARGET) + + return data_bundle + + def process_from_file(self, paths=None): + """ + + :param paths: 支持路径类型参见 :class:`fastNLP.io.loader.Loader` 的load函数。 + :return: DataBundle + """ + # 读取数据 + data_bundle = IMDBLoader().load(paths) + data_bundle = self.process(data_bundle) + + return data_bundle + + +class ChnSentiCorpPipe(Pipe): + """ + 处理之后的DataSet有以下的结构 + + .. csv-table:: + :header: "raw_chars", "target", "chars", "seq_len" + + "這間酒店環境和服務態度亦算不錯,但房間空間太小~~", 1, "[2, 3, 4, 5, ...]", 31 + "<荐书> 推荐所有喜欢<红楼>...", 1, "[10, 21, ....]", 25 + "..." + + 其中chars, seq_len是input,target是target + dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: + + +-------------+-----------+--------+-------+---------+ + | field_names | raw_chars | target | chars | seq_len | + +-------------+-----------+--------+-------+---------+ + | is_input | False | True | True | True | + | is_target | False | True | False | False | + | ignore_type | | False | False | False | + | pad_value | | 0 | 0 | 0 | + +-------------+-----------+--------+-------+---------+ + + """ + def __init__(self, bigrams=False, trigrams=False): + """ + + :param bool bigrams: 是否增加一列bigrams. bigrams的构成是['复', '旦', '大', '学', ...]->["复旦", "旦大", ...]。如果 + 设置为True,返回的DataSet将有一列名为bigrams, 且已经转换为了index并设置为input,对应的vocab可以通过 + data_bundle.get_vocab('bigrams')获取. + :param bool trigrams: 是否增加一列trigrams. trigrams的构成是 ['复', '旦', '大', '学', ...]->["复旦大", "旦大学", ...] + 。如果设置为True,返回的DataSet将有一列名为trigrams, 且已经转换为了index并设置为input,对应的vocab可以通过 + data_bundle.get_vocab('trigrams')获取. + """ + super().__init__() + + self.bigrams = bigrams + self.trigrams = trigrams + + def _tokenize(self, data_bundle): + """ + 将DataSet中的"复旦大学"拆分为["复", "旦", "大", "学"]. 未来可以通过扩展这个函数实现分词。 + + :param data_bundle: + :return: + """ + data_bundle.apply_field(list, field_name=Const.CHAR_INPUT, new_field_name=Const.CHAR_INPUT) + return data_bundle + + def process(self, data_bundle:DataBundle): + """ + 可以处理的DataSet应该具备以下的field + + .. csv-table:: + :header: "raw_chars", "target" + + "這間酒店環境和服務態度亦算不錯,但房間空間太小~~", "1" + "<荐书> 推荐所有喜欢<红楼>...", "1" + "..." 
+ + :param data_bundle: + :return: + """ + _add_chars_field(data_bundle, lower=False) + + data_bundle = self._tokenize(data_bundle) + + input_field_names = [Const.CHAR_INPUT] + if self.bigrams: + for name, dataset in data_bundle.iter_datasets(): + dataset.apply_field(lambda chars: [c1 + c2 for c1, c2 in zip(chars, chars[1:] + [''])], + field_name=Const.CHAR_INPUT, new_field_name='bigrams') + input_field_names.append('bigrams') + if self.trigrams: + for name, dataset in data_bundle.iter_datasets(): + dataset.apply_field(lambda chars: [c1 + c2 + c3 for c1, c2, c3 in + zip(chars, chars[1:] + [''], chars[2:] + [''] * 2)], + field_name=Const.CHAR_INPUT, new_field_name='trigrams') + input_field_names.append('trigrams') + + # index + _indexize(data_bundle, input_field_names, Const.TARGET) + + input_fields = [Const.TARGET, Const.INPUT_LEN] + input_field_names + target_fields = [Const.TARGET] + + for name, dataset in data_bundle.datasets.items(): + dataset.add_seq_len(Const.CHAR_INPUT) + + data_bundle.set_input(*input_fields) + data_bundle.set_target(*target_fields) + + return data_bundle + + def process_from_file(self, paths=None): + """ + + :param paths: 支持路径类型参见 :class:`fastNLP.io.loader.Loader` 的load函数。 + :return: DataBundle + """ + # 读取数据 + data_bundle = ChnSentiCorpLoader().load(paths) + data_bundle = self.process(data_bundle) + + return data_bundle + + +class THUCNewsPipe(_CLSPipe): + """ + 处理之后的DataSet有以下的结构 + + .. csv-table:: + :header: "raw_chars", "target", "chars", "seq_len" + + "马晓旭意外受伤让国奥警惕 无奈大雨格外青睐殷家军记者傅亚雨沈阳报道...", 0, "[409, 1197, 2146, 213, ...]", 746 + "..." + + 其中chars, seq_len是input,target是target + dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: + + +-------------+-----------+--------+-------+---------+ + | field_names | raw_chars | target | chars | seq_len | + +-------------+-----------+--------+-------+---------+ + | is_input | False | True | True | True | + | is_target | False | True | False | False | + | ignore_type | | False | False | False | + | pad_value | | 0 | 0 | 0 | + +-------------+-----------+--------+-------+---------+ + + :param bool bigrams: 是否增加一列bigrams. bigrams的构成是['复', '旦', '大', '学', ...]->["复旦", "旦大", ...]。如果 + 设置为True,返回的DataSet将有一列名为bigrams, 且已经转换为了index并设置为input,对应的vocab可以通过 + data_bundle.get_vocab('bigrams')获取. + :param bool trigrams: 是否增加一列trigrams. trigrams的构成是 ['复', '旦', '大', '学', ...]->["复旦大", "旦大学", ...] + 。如果设置为True,返回的DataSet将有一列名为trigrams, 且已经转换为了index并设置为input,对应的vocab可以通过 + data_bundle.get_vocab('trigrams')获取. + """ + + def __init__(self, bigrams=False, trigrams=False): + super().__init__() + + self.bigrams = bigrams + self.trigrams = trigrams + + def _chracter_split(self, sent): + return list(sent) + # return [w for w in sent] + + def _raw_split(self, sent): + return sent.split() + + def _tokenize(self, data_bundle, field_name=Const.INPUT, new_field_name=None): + new_field_name = new_field_name or field_name + for name, dataset in data_bundle.datasets.items(): + dataset.apply_field(self._chracter_split, field_name=field_name, new_field_name=new_field_name) + return data_bundle + + def process(self, data_bundle: DataBundle): + """ + 可处理的DataSet应具备如下的field + + .. csv-table:: + :header: "raw_words", "target" + + "马晓旭意外受伤让国奥警惕 无奈大雨格外青睐殷家军记者傅亚雨沈阳报道 ... ", "体育" + "...", "..." 
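+
+        A sketch of how the news-category labels are turned into indices below: ``_granularize``
+        looks each target up in ``tag_map`` and drops instances whose label is missing (the ``-100``
+        sentinel mirrors the base-class implementation; the map is truncated here)::
+
+            tag_map = {'体育': 0, '财经': 1, '房产': 2}
+            tag_map.get('体育', -100)   # -> 0
+            tag_map.get('彩票', -100)   # -> -100, the instance is dropped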
+ + :param data_bundle: + :return: + """ + # 根据granularity设置tag + tag_map = {'体育': 0, '财经': 1, '房产': 2, '家居': 3, '教育': 4, '科技': 5, '时尚': 6, '时政': 7, '游戏': 8, '娱乐': 9} + data_bundle = self._granularize(data_bundle=data_bundle, tag_map=tag_map) + + # clean,lower + + # CWS(tokenize) + data_bundle = self._tokenize(data_bundle=data_bundle, field_name='raw_chars', new_field_name='chars') + + input_field_names = [Const.CHAR_INPUT] + + # n-grams + if self.bigrams: + for name, dataset in data_bundle.iter_datasets(): + dataset.apply_field(lambda chars: [c1 + c2 for c1, c2 in zip(chars, chars[1:] + [''])], + field_name=Const.CHAR_INPUT, new_field_name='bigrams') + input_field_names.append('bigrams') + if self.trigrams: + for name, dataset in data_bundle.iter_datasets(): + dataset.apply_field(lambda chars: [c1 + c2 + c3 for c1, c2, c3 in + zip(chars, chars[1:] + [''], chars[2:] + [''] * 2)], + field_name=Const.CHAR_INPUT, new_field_name='trigrams') + input_field_names.append('trigrams') + + # index + data_bundle = _indexize(data_bundle=data_bundle, input_field_names=Const.CHAR_INPUT) + + # add length + for name, dataset in data_bundle.datasets.items(): + dataset.add_seq_len(field_name=Const.CHAR_INPUT, new_field_name=Const.INPUT_LEN) + + input_fields = [Const.TARGET, Const.INPUT_LEN] + input_field_names + target_fields = [Const.TARGET] + + data_bundle.set_input(*input_fields) + data_bundle.set_target(*target_fields) + + return data_bundle + + def process_from_file(self, paths=None): + """ + :param paths: 支持路径类型参见 :class:`fastNLP.io.loader.Loader` 的load函数。 + :return: DataBundle + """ + data_loader = THUCNewsLoader() # 此处需要实例化一个data_loader,否则传入load()的参数为None + data_bundle = data_loader.load(paths) + data_bundle = self.process(data_bundle) + return data_bundle + + +class WeiboSenti100kPipe(_CLSPipe): + """ + 处理之后的DataSet有以下的结构 + + .. csv-table:: + :header: "raw_chars", "target", "chars", "seq_len" + + "六一出生的?好讽刺…… //@祭春姬:他爸爸是外星人吧 //@面孔小高:现在的孩子都怎么了 [怒][怒][怒]", 0, "[0, 690, 18, ...]", 56 + "..." + + 其中chars, seq_len是input,target是target + dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: + + +-------------+-----------+--------+-------+---------+ + | field_names | raw_chars | target | chars | seq_len | + +-------------+-----------+--------+-------+---------+ + | is_input | False | True | True | True | + | is_target | False | True | False | False | + | ignore_type | | False | False | False | + | pad_value | | 0 | 0 | 0 | + +-------------+-----------+--------+-------+---------+ + + :param bool bigrams: 是否增加一列bigrams. bigrams的构成是['复', '旦', '大', '学', ...]->["复旦", "旦大", ...]。如果 + 设置为True,返回的DataSet将有一列名为bigrams, 且已经转换为了index并设置为input,对应的vocab可以通过 + data_bundle.get_vocab('bigrams')获取. + :param bool trigrams: 是否增加一列trigrams. trigrams的构成是 ['复', '旦', '大', '学', ...]->["复旦大", "旦大学", ...] + 。如果设置为True,返回的DataSet将有一列名为trigrams, 且已经转换为了index并设置为input,对应的vocab可以通过 + data_bundle.get_vocab('trigrams')获取. + """ + + def __init__(self, bigrams=False, trigrams=False): + super().__init__() + + self.bigrams = bigrams + self.trigrams = trigrams + + def _chracter_split(self, sent): + return list(sent) + + def _tokenize(self, data_bundle, field_name=Const.INPUT, new_field_name=None): + new_field_name = new_field_name or field_name + for name, dataset in data_bundle.datasets.items(): + dataset.apply_field(self._chracter_split, field_name=field_name, new_field_name=new_field_name) + return data_bundle + + + def process(self, data_bundle: DataBundle): + """ + 可处理的DataSet应具备以下的field + + .. 
csv-table:: + :header: "raw_chars", "target" + + "六一出生的?好讽刺…… //@祭春姬:他爸爸是外星人吧 //@面孔小高:现在的孩子都怎么了 [怒][怒][怒]", "0" + "...", "..." + + :param data_bundle: + :return: + """ + # clean,lower + + # CWS(tokenize) + data_bundle = self._tokenize(data_bundle=data_bundle, field_name='raw_chars', new_field_name='chars') + + input_field_names = [Const.CHAR_INPUT] + + # n-grams + if self.bigrams: + for name, dataset in data_bundle.iter_datasets(): + dataset.apply_field(lambda chars: [c1 + c2 for c1, c2 in zip(chars, chars[1:] + [''])], + field_name=Const.CHAR_INPUT, new_field_name='bigrams') + input_field_names.append('bigrams') + if self.trigrams: + for name, dataset in data_bundle.iter_datasets(): + dataset.apply_field(lambda chars: [c1 + c2 + c3 for c1, c2, c3 in + zip(chars, chars[1:] + [''], chars[2:] + [''] * 2)], + field_name=Const.CHAR_INPUT, new_field_name='trigrams') + input_field_names.append('trigrams') + + # index + data_bundle = _indexize(data_bundle=data_bundle, input_field_names='chars') + + # add length + for name, dataset in data_bundle.datasets.items(): + dataset.add_seq_len(field_name=Const.CHAR_INPUT, new_field_name=Const.INPUT_LEN) + + input_fields = [Const.TARGET, Const.INPUT_LEN] + input_field_names + target_fields = [Const.TARGET] + + data_bundle.set_input(*input_fields) + data_bundle.set_target(*target_fields) + + return data_bundle + + def process_from_file(self, paths=None): + """ + :param paths: 支持路径类型参见 :class:`fastNLP.io.loader.Loader` 的load函数。 + :return: DataBundle + """ + data_loader = WeiboSenti100kLoader() # 此处需要实例化一个data_loader,否则传入load()的参数为None + data_bundle = data_loader.load(paths) + data_bundle = self.process(data_bundle) + return data_bundle + diff --git a/fastNLP/io/pipe/conll.py b/fastNLP/io/pipe/conll.py new file mode 100644 index 00000000..918cff9f --- /dev/null +++ b/fastNLP/io/pipe/conll.py @@ -0,0 +1,430 @@ +"""undocumented""" + +__all__ = [ + "Conll2003NERPipe", + "Conll2003Pipe", + "OntoNotesNERPipe", + "MsraNERPipe", + "PeopleDailyPipe", + "WeiboNERPipe" +] + +from .pipe import Pipe +from .utils import _add_chars_field +from .utils import _indexize, _add_words_field +from .utils import iob2, iob2bioes +from .. import DataBundle +from ..loader.conll import Conll2003NERLoader, OntoNotesNERLoader +from ..loader.conll import PeopleDailyNERLoader, WeiboNERLoader, MsraNERLoader, ConllLoader +from ...core.const import Const +from ...core.vocabulary import Vocabulary + + +class _NERPipe(Pipe): + """ + NER任务的处理Pipe, 该Pipe会(1)复制raw_words列,并命名为words; (2)在words, target列建立词表 + (创建 :class:`fastNLP.Vocabulary` 对象,所以在返回的DataBundle中将有两个Vocabulary); (3)将words,target列根据相应的 + Vocabulary转换为index。 + + raw_words列为List[str], 是未转换的原始数据; words列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 + target。返回的DataSet中被设置为input有words, target, seq_len; 设置为target有target, seq_len。 + """ + + def __init__(self, encoding_type: str = 'bio', lower: bool = False): + """ + + :param: str encoding_type: target列使用什么类型的encoding方式,支持bioes, bio两种。 + :param bool lower: 是否将words小写化后再建立词表,绝大多数情况都不需要设置为True。 + """ + if encoding_type == 'bio': + self.convert_tag = iob2 + elif encoding_type == 'bioes': + self.convert_tag = lambda words: iob2bioes(iob2(words)) + else: + raise ValueError("encoding_type only supports `bio` and `bioes`.") + self.lower = lower + + def process(self, data_bundle: DataBundle) -> DataBundle: + """ + 支持的DataSet的field为 + + .. 
csv-table:: + :header: "raw_words", "target" + + "[Nadim, Ladki]", "[B-PER, I-PER]" + "[AL-AIN, United, Arab, ...]", "[B-LOC, B-LOC, I-LOC, ...]" + "[...]", "[...]" + + :param ~fastNLP.DataBundle data_bundle: 传入的DataBundle中的DataSet必须包含raw_words和ner两个field,且两个field的内容均为List[str]在传入DataBundle基础上原位修改。 + :return DataBundle: + """ + # 转换tag + for name, dataset in data_bundle.datasets.items(): + dataset.apply_field(self.convert_tag, field_name=Const.TARGET, new_field_name=Const.TARGET) + + _add_words_field(data_bundle, lower=self.lower) + + # index + _indexize(data_bundle) + + input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN] + target_fields = [Const.TARGET, Const.INPUT_LEN] + + for name, dataset in data_bundle.datasets.items(): + dataset.add_seq_len(Const.INPUT) + + data_bundle.set_input(*input_fields) + data_bundle.set_target(*target_fields) + + return data_bundle + + +class Conll2003NERPipe(_NERPipe): + """ + Conll2003的NER任务的处理Pipe, 该Pipe会(1)复制raw_words列,并命名为words; (2)在words, target列建立词表 + (创建 :class:`fastNLP.Vocabulary` 对象,所以在返回的DataBundle中将有两个Vocabulary); (3)将words,target列根据相应的 + Vocabulary转换为index。 + 经过该Pipe过后,DataSet中的内容如下所示 + + .. csv-table:: Following is a demo layout of DataSet returned by Conll2003Loader + :header: "raw_words", "target", "words", "seq_len" + + "[Nadim, Ladki]", "[1, 2]", "[2, 3]", 2 + "[AL-AIN, United, Arab, ...]", "[3, 4,...]", "[4, 5, 6,...]", 6 + "[...]", "[...]", "[...]", . + + raw_words列为List[str], 是未转换的原始数据; words列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 + target。返回的DataSet中被设置为input有words, target, seq_len; 设置为target有target。 + + dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: + + +-------------+-----------+--------+-------+---------+ + | field_names | raw_words | target | words | seq_len | + +-------------+-----------+--------+-------+---------+ + | is_input | False | True | True | True | + | is_target | False | True | False | True | + | ignore_type | | False | False | False | + | pad_value | | 0 | 0 | 0 | + +-------------+-----------+--------+-------+---------+ + + """ + + def process_from_file(self, paths) -> DataBundle: + """ + + :param paths: 支持路径类型参见 :class:`fastNLP.io.loader.ConllLoader` 的load函数。 + :return: DataBundle + """ + # 读取数据 + data_bundle = Conll2003NERLoader().load(paths) + data_bundle = self.process(data_bundle) + + return data_bundle + + +class Conll2003Pipe(Pipe): + """ + 经过该Pipe后,DataSet中的内容如下 + + .. csv-table:: + :header: "raw_words" , "pos", "chunk", "ner", "words", "seq_len" + + "[Nadim, Ladki]", "[0, 0]", "[1, 2]", "[1, 2]", "[2, 3]", 2 + "[AL-AIN, United, Arab, ...]", "[1, 2...]", "[3, 4...]", "[3, 4...]", "[4, 5, 6,...]", 6 + "[...]", "[...]", "[...]", "[...]", "[...]", . 
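+
+    A minimal sketch of what the ``bioes`` encoding options do to a tag column; ``iob2`` and
+    ``iob2bioes`` are the helpers imported from ``.utils``, and the example tags are illustrative::
+
+        from fastNLP.io.pipe.utils import iob2, iob2bioes
+
+        tags = ['B-PER', 'I-PER', 'O', 'B-LOC']
+        print(iob2bioes(iob2(tags)))   # -> ['B-PER', 'E-PER', 'O', 'S-LOC']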
+ + 其中words, seq_len是input; pos, chunk, ner, seq_len是target + dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: + + +-------------+-----------+-------+-------+-------+-------+---------+ + | field_names | raw_words | pos | chunk | ner | words | seq_len | + +-------------+-----------+-------+-------+-------+-------+---------+ + | is_input | False | False | False | False | True | True | + | is_target | False | True | True | True | False | True | + | ignore_type | | False | False | False | False | False | + | pad_value | | 0 | 0 | 0 | 0 | 0 | + +-------------+-----------+-------+-------+-------+-------+---------+ + + + """ + def __init__(self, chunk_encoding_type='bioes', ner_encoding_type='bioes', lower: bool = False): + """ + + :param str chunk_encoding_type: 支持bioes, bio。 + :param str ner_encoding_type: 支持bioes, bio。 + :param bool lower: 是否将words列小写化后再建立词表 + """ + if chunk_encoding_type == 'bio': + self.chunk_convert_tag = iob2 + elif chunk_encoding_type == 'bioes': + self.chunk_convert_tag = lambda tags: iob2bioes(iob2(tags)) + else: + raise ValueError("chunk_encoding_type only supports `bio` and `bioes`.") + if ner_encoding_type == 'bio': + self.ner_convert_tag = iob2 + elif ner_encoding_type == 'bioes': + self.ner_convert_tag = lambda tags: iob2bioes(iob2(tags)) + else: + raise ValueError("ner_encoding_type only supports `bio` and `bioes`.") + self.lower = lower + + def process(self, data_bundle) -> DataBundle: + """ + 输入的DataSet应该类似于如下的形式 + + .. csv-table:: + :header: "raw_words", "pos", "chunk", "ner" + + "[Nadim, Ladki]", "[NNP, NNP]", "[B-NP, I-NP]", "[B-PER, I-PER]" + "[AL-AIN, United, Arab, ...]", "[NNP, NNP...]", "[B-NP, B-NP, ...]", "[B-LOC, B-LOC,...]" + "[...]", "[...]", "[...]", "[...]", . + + :param data_bundle: + :return: 传入的DataBundle + """ + # 转换tag + for name, dataset in data_bundle.datasets.items(): + dataset.drop(lambda x: "-DOCSTART-" in x[Const.RAW_WORD]) + dataset.apply_field(self.chunk_convert_tag, field_name='chunk', new_field_name='chunk') + dataset.apply_field(self.ner_convert_tag, field_name='ner', new_field_name='ner') + + _add_words_field(data_bundle, lower=self.lower) + + # index + _indexize(data_bundle, input_field_names=Const.INPUT, target_field_names=['pos', 'ner']) + # chunk中存在一些tag只在dev中出现,没在train中 + tgt_vocab = Vocabulary(unknown=None, padding=None) + tgt_vocab.from_dataset(*data_bundle.datasets.values(), field_name='chunk') + tgt_vocab.index_dataset(*data_bundle.datasets.values(), field_name='chunk') + data_bundle.set_vocab(tgt_vocab, 'chunk') + + input_fields = [Const.INPUT, Const.INPUT_LEN] + target_fields = ['pos', 'ner', 'chunk', Const.INPUT_LEN] + + for name, dataset in data_bundle.datasets.items(): + dataset.add_seq_len(Const.INPUT) + + data_bundle.set_input(*input_fields) + data_bundle.set_target(*target_fields) + + return data_bundle + + def process_from_file(self, paths): + """ + + :param paths: + :return: + """ + data_bundle = ConllLoader(headers=['raw_words', 'pos', 'chunk', 'ner']).load(paths) + return self.process(data_bundle) + + +class OntoNotesNERPipe(_NERPipe): + """ + 处理OntoNotes的NER数据,处理之后DataSet中的field情况为 + + .. csv-table:: + :header: "raw_words", "target", "words", "seq_len" + + "[Nadim, Ladki]", "[1, 2]", "[2, 3]", 2 + "[AL-AIN, United, Arab, ...]", "[3, 4]", "[4, 5, 6,...]", 6 + "[...]", "[...]", "[...]", . 
+ + raw_words列为List[str], 是未转换的原始数据; words列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 + target。返回的DataSet中被设置为input有words, target, seq_len; 设置为target有target。 + + dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: + + +-------------+-----------+--------+-------+---------+ + | field_names | raw_words | target | words | seq_len | + +-------------+-----------+--------+-------+---------+ + | is_input | False | True | True | True | + | is_target | False | True | False | True | + | ignore_type | | False | False | False | + | pad_value | | 0 | 0 | 0 | + +-------------+-----------+--------+-------+---------+ + + """ + + def process_from_file(self, paths): + data_bundle = OntoNotesNERLoader().load(paths) + return self.process(data_bundle) + + +class _CNNERPipe(Pipe): + """ + 中文NER任务的处理Pipe, 该Pipe会(1)复制raw_chars列,并命名为chars; (2)在chars, target列建立词表 + (创建 :class:`fastNLP.Vocabulary` 对象,所以在返回的DataBundle中将有两个Vocabulary); (3)将chars,target列根据相应的 + Vocabulary转换为index。 + + raw_chars列为List[str], 是未转换的原始数据; chars列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 + target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target, seq_len。 + + """ + + def __init__(self, encoding_type: str = 'bio', bigrams=False, trigrams=False): + """ + + :param str encoding_type: target列使用什么类型的encoding方式,支持bioes, bio两种。 + :param bool bigrams: 是否增加一列bigrams. bigrams的构成是['复', '旦', '大', '学', ...]->["复旦", "旦大", ...]。如果 + 设置为True,返回的DataSet将有一列名为bigrams, 且已经转换为了index并设置为input,对应的vocab可以通过 + data_bundle.get_vocab('bigrams')获取. + :param bool trigrams: 是否增加一列trigrams. trigrams的构成是 ['复', '旦', '大', '学', ...]->["复旦大", "旦大学", ...] + 。如果设置为True,返回的DataSet将有一列名为trigrams, 且已经转换为了index并设置为input,对应的vocab可以通过 + data_bundle.get_vocab('trigrams')获取. + """ + if encoding_type == 'bio': + self.convert_tag = iob2 + elif encoding_type == 'bioes': + self.convert_tag = lambda words: iob2bioes(iob2(words)) + else: + raise ValueError("encoding_type only supports `bio` and `bioes`.") + + self.bigrams = bigrams + self.trigrams = trigrams + + def process(self, data_bundle: DataBundle) -> DataBundle: + """ + 支持的DataSet的field为 + + .. 
csv-table:: + :header: "raw_chars", "target" + + "[相, 比, 之, 下,...]", "[O, O, O, O, ...]" + "[青, 岛, 海, 牛, 队, 和, ...]", "[B-ORG, I-ORG, I-ORG, ...]" + "[...]", "[...]" + + raw_chars列为List[str], 是未转换的原始数据; chars列为List[int],是转换为index的输入数据; target列是List[int], + 是转换为index的target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target。 + + :param ~fastNLP.DataBundle data_bundle: 传入的DataBundle中的DataSet必须包含raw_words和ner两个field,且两个field的内容均为List[str]。在传入DataBundle基础上原位修改。 + :return: DataBundle + """ + # 转换tag + for name, dataset in data_bundle.datasets.items(): + dataset.apply_field(self.convert_tag, field_name=Const.TARGET, new_field_name=Const.TARGET) + + _add_chars_field(data_bundle, lower=False) + + input_field_names = [Const.CHAR_INPUT] + if self.bigrams: + for name, dataset in data_bundle.datasets.items(): + dataset.apply_field(lambda chars: [c1 + c2 for c1, c2 in zip(chars, chars[1:] + [''])], + field_name=Const.CHAR_INPUT, new_field_name='bigrams') + input_field_names.append('bigrams') + if self.trigrams: + for name, dataset in data_bundle.datasets.items(): + dataset.apply_field(lambda chars: [c1 + c2 + c3 for c1, c2, c3 in + zip(chars, chars[1:] + [''], chars[2:] + [''] * 2)], + field_name=Const.CHAR_INPUT, new_field_name='trigrams') + input_field_names.append('trigrams') + + # index + _indexize(data_bundle, input_field_names, Const.TARGET) + + input_fields = [Const.TARGET, Const.INPUT_LEN] + input_field_names + target_fields = [Const.TARGET, Const.INPUT_LEN] + + for name, dataset in data_bundle.datasets.items(): + dataset.add_seq_len(Const.CHAR_INPUT) + + data_bundle.set_input(*input_fields) + data_bundle.set_target(*target_fields) + + return data_bundle + + +class MsraNERPipe(_CNNERPipe): + """ + 处理MSRA-NER的数据,处理之后的DataSet的field情况为 + + .. csv-table:: + :header: "raw_chars", "target", "chars", "seq_len" + + "[相, 比, 之, 下,...]", "[0, 0, 0, 0, ...]", "[2, 3, 4, 5, ...]", 11 + "[青, 岛, 海, 牛, 队, 和, ...]", "[1, 2, 3, ...]", "[10, 21, ....]", 21 + "[...]", "[...]", "[...]", . + + raw_chars列为List[str], 是未转换的原始数据; chars列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 + target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target。 + + dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: + + +-------------+-----------+--------+-------+---------+ + | field_names | raw_chars | target | chars | seq_len | + +-------------+-----------+--------+-------+---------+ + | is_input | False | True | True | True | + | is_target | False | True | False | True | + | ignore_type | | False | False | False | + | pad_value | | 0 | 0 | 0 | + +-------------+-----------+--------+-------+---------+ + + """ + + def process_from_file(self, paths=None) -> DataBundle: + data_bundle = MsraNERLoader().load(paths) + return self.process(data_bundle) + + +class PeopleDailyPipe(_CNNERPipe): + """ + 处理people daily的ner的数据,处理之后的DataSet的field情况为 + + .. csv-table:: + :header: "raw_chars", "target", "chars", "seq_len" + + "[相, 比, 之, 下,...]", "[0, 0, 0, 0, ...]", "[2, 3, 4, 5, ...]", 11 + "[青, 岛, 海, 牛, 队, 和, ...]", "[1, 2, 3, ...]", "[10, 21, ....]", 21 + "[...]", "[...]", "[...]", . 
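+
+    A minimal usage sketch; automatic download when no path is given is an assumption, and
+    ``bigrams=True`` additionally produces a ``bigrams`` input field with its own vocabulary::
+
+        from fastNLP.io.pipe import PeopleDailyPipe
+
+        pipe = PeopleDailyPipe(encoding_type='bioes', bigrams=True)
+        data_bundle = pipe.process_from_file()
+        bigram_vocab = data_bundle.get_vocab('bigrams')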
+ + raw_chars列为List[str], 是未转换的原始数据; chars列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 + target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target。 + + dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: + + +-------------+-----------+--------+-------+---------+ + | field_names | raw_chars | target | chars | seq_len | + +-------------+-----------+--------+-------+---------+ + | is_input | False | True | True | True | + | is_target | False | True | False | True | + | ignore_type | | False | False | False | + | pad_value | | 0 | 0 | 0 | + +-------------+-----------+--------+-------+---------+ + + """ + + def process_from_file(self, paths=None) -> DataBundle: + data_bundle = PeopleDailyNERLoader().load(paths) + return self.process(data_bundle) + + +class WeiboNERPipe(_CNNERPipe): + """ + 处理weibo的ner的数据,处理之后的DataSet的field情况为 + + .. csv-table:: + :header: "raw_chars", "chars", "target", "seq_len" + + "['老', '百', '姓']", "[4, 3, 3]", "[38, 39, 40]", 3 + "['心']", "[0]", "[41]", 1 + "[...]", "[...]", "[...]", . + + raw_chars列为List[str], 是未转换的原始数据; chars列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 + target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target。 + + dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: + + +-------------+-----------+--------+-------+---------+ + | field_names | raw_chars | target | chars | seq_len | + +-------------+-----------+--------+-------+---------+ + | is_input | False | True | True | True | + | is_target | False | True | False | True | + | ignore_type | | False | False | False | + | pad_value | | 0 | 0 | 0 | + +-------------+-----------+--------+-------+---------+ + + """ + + def process_from_file(self, paths=None) -> DataBundle: + data_bundle = WeiboNERLoader().load(paths) + return self.process(data_bundle) diff --git a/fastNLP/io/pipe/coreference.py b/fastNLP/io/pipe/coreference.py new file mode 100644 index 00000000..0cf6c996 --- /dev/null +++ b/fastNLP/io/pipe/coreference.py @@ -0,0 +1,183 @@ +"""undocumented""" + +__all__ = [ + "CoReferencePipe" +] + +import collections + +import numpy as np + +from fastNLP.core.vocabulary import Vocabulary +from .pipe import Pipe +from ..data_bundle import DataBundle +from ..loader.coreference import CoReferenceLoader +from ...core.const import Const + + +class CoReferencePipe(Pipe): + """ + 对Coreference resolution问题进行处理,得到文章种类/说话者/字符级信息/序列长度。 + + 处理完成后数据包含文章类别、speaker信息、句子信息、句子对应的index、char、句子长度、target: + + .. csv-table:: + :header: "words1", "words2","words3","words4","chars","seq_len","target" + + "bc", "[[0,0],[1,1]]","[['I','am'],[]]","[[1,2],[]]","[[[1],[2,3]],[]]","[2,3]","[[[2,3],[6,7]],[[10,12],[20,22]]]" + "[...]", "[...]","[...]","[...]","[...]","[...]","[...]" + + dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: + + +-------------+-----------+--------+-------+---------+ + | field_names | raw_chars | target | chars | seq_len | + +-------------+-----------+--------+-------+---------+ + | is_input | False | True | True | True | + | is_target | False | True | False | True | + | ignore_type | | False | False | False | + | pad_value | | 0 | 0 | 0 | + +-------------+-----------+--------+-------+---------+ + + """ + + def __init__(self, config): + super().__init__() + self.config = config + + def process(self, data_bundle: DataBundle): + """ + 对load进来的数据进一步处理原始数据包含:raw_key,raw_speaker,raw_words,raw_clusters + + .. 
csv-table:: + :header: "raw_key", "raw_speaker","raw_words","raw_clusters" + + "bc/cctv/00/cctv_0000_0", "[[Speaker#1, Speaker#1],[]]","[['I','am'],[]]","[[[2,3],[6,7]],[[10,12],[20,22]]]" + "bc/cctv/00/cctv_0000_1", "[['Speaker#1', 'peaker#1'],[]]","[['He','is'],[]]","[[[2,3],[6,7]],[[10,12],[20,22]]]" + "[...]", "[...]","[...]","[...]" + + + :param data_bundle: + :return: + """ + genres = {g: i for i, g in enumerate(["bc", "bn", "mz", "nw", "pt", "tc", "wb"])} + vocab = Vocabulary().from_dataset(*data_bundle.datasets.values(), field_name= Const.RAW_WORDS(3)) + vocab.build_vocab() + word2id = vocab.word2idx + data_bundle.set_vocab(vocab, Const.INPUTS(0)) + if self.config.char_path: + char_dict = get_char_dict(self.config.char_path) + else: + char_set = set() + for i,w in enumerate(word2id): + if i < 2: + continue + for c in w: + char_set.add(c) + + char_dict = collections.defaultdict(int) + char_dict.update({c: i for i, c in enumerate(char_set)}) + + for name, ds in data_bundle.datasets.items(): + # genre + ds.apply(lambda x: genres[x[Const.RAW_WORDS(0)][:2]], new_field_name=Const.INPUTS(0)) + + # speaker_ids_np + ds.apply(lambda x: speaker2numpy(x[Const.RAW_WORDS(1)], self.config.max_sentences, is_train=name == 'train'), + new_field_name=Const.INPUTS(1)) + + # sentences + ds.rename_field(Const.RAW_WORDS(3),Const.INPUTS(2)) + + # doc_np + ds.apply(lambda x: doc2numpy(x[Const.INPUTS(2)], word2id, char_dict, max(self.config.filter), + self.config.max_sentences, is_train=name == 'train')[0], + new_field_name=Const.INPUTS(3)) + # char_index + ds.apply(lambda x: doc2numpy(x[Const.INPUTS(2)], word2id, char_dict, max(self.config.filter), + self.config.max_sentences, is_train=name == 'train')[1], + new_field_name=Const.CHAR_INPUT) + # seq len + ds.apply(lambda x: doc2numpy(x[Const.INPUTS(2)], word2id, char_dict, max(self.config.filter), + self.config.max_sentences, is_train=name == 'train')[2], + new_field_name=Const.INPUT_LEN) + + # clusters + ds.rename_field(Const.RAW_WORDS(2), Const.TARGET) + + ds.set_ignore_type(Const.TARGET) + ds.set_padder(Const.TARGET, None) + ds.set_input(Const.INPUTS(0), Const.INPUTS(1), Const.INPUTS(2), Const.INPUTS(3), Const.CHAR_INPUT, Const.INPUT_LEN) + ds.set_target(Const.TARGET) + + return data_bundle + + def process_from_file(self, paths): + bundle = CoReferenceLoader().load(paths) + return self.process(bundle) + + +# helper + +def doc2numpy(doc, word2id, chardict, max_filter, max_sentences, is_train): + docvec, char_index, length, max_len = _doc2vec(doc, word2id, chardict, max_filter, max_sentences, is_train) + assert max(length) == max_len + assert char_index.shape[0] == len(length) + assert char_index.shape[1] == max_len + doc_np = np.zeros((len(docvec), max_len), int) + for i in range(len(docvec)): + for j in range(len(docvec[i])): + doc_np[i][j] = docvec[i][j] + return doc_np, char_index, length + +def _doc2vec(doc,word2id,char_dict,max_filter,max_sentences,is_train): + max_len = 0 + max_word_length = 0 + docvex = [] + length = [] + if is_train: + sent_num = min(max_sentences,len(doc)) + else: + sent_num = len(doc) + + for i in range(sent_num): + sent = doc[i] + length.append(len(sent)) + if (len(sent) > max_len): + max_len = len(sent) + sent_vec =[] + for j,word in enumerate(sent): + if len(word)>max_word_length: + max_word_length = len(word) + if word in word2id: + sent_vec.append(word2id[word]) + else: + sent_vec.append(word2id["UNK"]) + docvex.append(sent_vec) + + char_index = np.zeros((sent_num, max_len, max_word_length),dtype=int) + for i in 
range(sent_num): + sent = doc[i] + for j,word in enumerate(sent): + char_index[i, j, :len(word)] = [char_dict[c] for c in word] + + return docvex,char_index,length,max_len + +def speaker2numpy(speakers_raw,max_sentences,is_train): + if is_train and len(speakers_raw)> max_sentences: + speakers_raw = speakers_raw[0:max_sentences] + speakers = flatten(speakers_raw) + speaker_dict = {s: i for i, s in enumerate(set(speakers))} + speaker_ids = np.array([speaker_dict[s] for s in speakers]) + return speaker_ids + +# 展平 +def flatten(l): + return [item for sublist in l for item in sublist] + +def get_char_dict(path): + vocab = [""] + with open(path) as f: + vocab.extend(c.strip() for c in f.readlines()) + char_dict = collections.defaultdict(int) + char_dict.update({c: i for i, c in enumerate(vocab)}) + return char_dict \ No newline at end of file diff --git a/fastNLP/io/pipe/cws.py b/fastNLP/io/pipe/cws.py new file mode 100644 index 00000000..a2f2e7a2 --- /dev/null +++ b/fastNLP/io/pipe/cws.py @@ -0,0 +1,278 @@ +"""undocumented""" + +__all__ = [ + "CWSPipe" +] + +import re +from itertools import chain + +from .pipe import Pipe +from .utils import _indexize +from .. import DataBundle +from ..loader import CWSLoader +from ...core.const import Const + + +def _word_lens_to_bmes(word_lens): + """ + + :param list word_lens: List[int], 每个词语的长度 + :return: List[str], BMES的序列 + """ + tags = [] + for word_len in word_lens: + if word_len == 1: + tags.append('S') + else: + tags.append('B') + tags.extend(['M'] * (word_len - 2)) + tags.append('E') + return tags + + +def _word_lens_to_segapp(word_lens): + """ + + :param list word_lens: List[int], 每个词语的长度 + :return: List[str], BMES的序列 + """ + tags = [] + for word_len in word_lens: + if word_len == 1: + tags.append('SEG') + else: + tags.extend(['APP'] * (word_len - 1)) + tags.append('SEG') + return tags + + +def _alpha_span_to_special_tag(span): + """ + 将span替换成特殊的字符 + + :param str span: + :return: + """ + if 'oo' == span.lower(): # speical case when represent 2OO8 + return span + if len(span) == 1: + return span + else: + return '' + + +def _find_and_replace_alpha_spans(line): + """ + 传入原始句子,替换其中的字母为特殊标记 + + :param str line:原始数据 + :return: str + """ + new_line = '' + pattern = '[a-zA-Z]+(?=[\u4e00-\u9fff ,%,.。!<-“])' + prev_end = 0 + for match in re.finditer(pattern, line): + start, end = match.span() + span = line[start:end] + new_line += line[prev_end:start] + _alpha_span_to_special_tag(span) + prev_end = end + new_line += line[prev_end:] + return new_line + + +def _digit_span_to_special_tag(span): + """ + + :param str span: 需要替换的str + :return: + """ + if span[0] == '0' and len(span) > 2: + return '' + decimal_point_count = 0 # one might have more than one decimal pointers + for idx, char in enumerate(span): + if char == '.' or char == '﹒' or char == '·': + decimal_point_count += 1 + if span[-1] == '.' or span[-1] == '﹒' or span[ + -1] == '·': # last digit being decimal point means this is not a number + if decimal_point_count == 1: + return span + else: + return '' + if decimal_point_count == 1: + return '' + elif decimal_point_count > 1: + return '' + else: + return '' + + +def _find_and_replace_digit_spans(line): + """ + only consider words start with number, contains '.', characters. + + If ends with space, will be processed + + If ends with Chinese character, will be processed + + If ends with or contains english char, not handled. 
+ + floats are replaced by + + otherwise unkdgt + """ + new_line = '' + pattern = '\d[\d\\.﹒·]*(?=[\u4e00-\u9fff ,%,。!<-“])' + prev_end = 0 + for match in re.finditer(pattern, line): + start, end = match.span() + span = line[start:end] + new_line += line[prev_end:start] + _digit_span_to_special_tag(span) + prev_end = end + new_line += line[prev_end:] + return new_line + + +class CWSPipe(Pipe): + """ + 对CWS数据进行预处理, 处理之后的数据,具备以下的结构 + + .. csv-table:: + :header: "raw_words", "chars", "target", "seq_len" + + "共同 创造 美好...", "[2, 3, 4...]", "[0, 2, 0, 2,...]", 13 + "2001年 新年 钟声...", "[8, 9, 9, 7, ...]", "[0, 1, 1, 1, 2...]", 20 + "...", "[...]","[...]", . + + dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: + + +-------------+-----------+-------+--------+---------+ + | field_names | raw_words | chars | target | seq_len | + +-------------+-----------+-------+--------+---------+ + | is_input | False | True | True | True | + | is_target | False | False | True | True | + | ignore_type | | False | False | False | + | pad_value | | 0 | 0 | 0 | + +-------------+-----------+-------+--------+---------+ + + """ + + def __init__(self, dataset_name=None, encoding_type='bmes', replace_num_alpha=True, bigrams=False, trigrams=False): + """ + + :param str,None dataset_name: 支持'pku', 'msra', 'cityu', 'as', None + :param str encoding_type: 可以选择'bmes', 'segapp'两种。"我 来自 复旦大学...", bmes的tag为[S, B, E, B, M, M, E...]; segapp + 的tag为[seg, app, seg, app, app, app, seg, ...] + :param bool replace_num_alpha: 是否将数字和字母用特殊字符替换。 + :param bool bigrams: 是否增加一列bigram. bigram的构成是['复', '旦', '大', '学', ...]->["复旦", "旦大", ...] + :param bool trigrams: 是否增加一列trigram. trigram的构成是 ['复', '旦', '大', '学', ...]->["复旦大", "旦大学", ...] + """ + if encoding_type == 'bmes': + self.word_lens_to_tags = _word_lens_to_bmes + else: + self.word_lens_to_tags = _word_lens_to_segapp + + self.dataset_name = dataset_name + self.bigrams = bigrams + self.trigrams = trigrams + self.replace_num_alpha = replace_num_alpha + + def _tokenize(self, data_bundle): + """ + 将data_bundle中的'chars'列切分成一个一个的word. + 例如输入是"共同 创造 美好.."->[[共, 同], [创, 造], [...], ] + + :param data_bundle: + :return: + """ + def split_word_into_chars(raw_chars): + words = raw_chars.split() + chars = [] + for word in words: + char = [] + subchar = [] + for c in word: + if c == '<': + subchar.append(c) + continue + if c == '>' and subchar[0] == '<': + char.append(''.join(subchar)) + subchar = [] + if subchar: + subchar.append(c) + else: + char.append(c) + char.extend(subchar) + chars.append(char) + return chars + + for name, dataset in data_bundle.datasets.items(): + dataset.apply_field(split_word_into_chars, field_name=Const.CHAR_INPUT, + new_field_name=Const.CHAR_INPUT) + return data_bundle + + def process(self, data_bundle: DataBundle) -> DataBundle: + """ + 可以处理的DataSet需要包含raw_words列 + + .. csv-table:: + :header: "raw_words" + + "上海 浦东 开发 与 法制 建设 同步" + "新华社 上海 二月 十日 电 ( 记者 谢金虎 、 张持坚 )" + "..." 
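+
+        A sketch of how the segmentation targets are derived below: every whitespace-separated word
+        is split into characters, and its length is mapped to BMES tags when ``encoding_type='bmes'``::
+
+            words = "共同 创造 美好".split()                    # ['共同', '创造', '美好']
+            # _word_lens_to_bmes(map(len, words)) -> ['B', 'E', 'B', 'E', 'B', 'E']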
+ + :param data_bundle: + :return: + """ + data_bundle.copy_field(Const.RAW_WORD, Const.CHAR_INPUT) + + if self.replace_num_alpha: + data_bundle.apply_field(_find_and_replace_alpha_spans, Const.CHAR_INPUT, Const.CHAR_INPUT) + data_bundle.apply_field(_find_and_replace_digit_spans, Const.CHAR_INPUT, Const.CHAR_INPUT) + + self._tokenize(data_bundle) + + for name, dataset in data_bundle.datasets.items(): + dataset.apply_field(lambda chars: self.word_lens_to_tags(map(len, chars)), field_name=Const.CHAR_INPUT, + new_field_name=Const.TARGET) + dataset.apply_field(lambda chars: list(chain(*chars)), field_name=Const.CHAR_INPUT, + new_field_name=Const.CHAR_INPUT) + input_field_names = [Const.CHAR_INPUT] + if self.bigrams: + for name, dataset in data_bundle.datasets.items(): + dataset.apply_field(lambda chars: [c1 + c2 for c1, c2 in zip(chars, chars[1:] + [''])], + field_name=Const.CHAR_INPUT, new_field_name='bigrams') + input_field_names.append('bigrams') + if self.trigrams: + for name, dataset in data_bundle.datasets.items(): + dataset.apply_field(lambda chars: [c1 + c2 + c3 for c1, c2, c3 in + zip(chars, chars[1:] + [''], chars[2:] + [''] * 2)], + field_name=Const.CHAR_INPUT, new_field_name='trigrams') + input_field_names.append('trigrams') + + _indexize(data_bundle, input_field_names, Const.TARGET) + + input_fields = [Const.TARGET, Const.INPUT_LEN] + input_field_names + target_fields = [Const.TARGET, Const.INPUT_LEN] + for name, dataset in data_bundle.datasets.items(): + dataset.add_seq_len(Const.CHAR_INPUT) + + data_bundle.set_input(*input_fields) + data_bundle.set_target(*target_fields) + + return data_bundle + + def process_from_file(self, paths=None) -> DataBundle: + """ + + :param str paths: + :return: + """ + if self.dataset_name is None and paths is None: + raise RuntimeError( + "You have to set `paths` when calling process_from_file() or `dataset_name `when initialization.") + if self.dataset_name is not None and paths is not None: + raise RuntimeError("You cannot specify `paths` and `dataset_name` simultaneously") + data_bundle = CWSLoader(self.dataset_name).load(paths) + return self.process(data_bundle) diff --git a/fastNLP/io/pipe/matching.py b/fastNLP/io/pipe/matching.py new file mode 100644 index 00000000..f58706fe --- /dev/null +++ b/fastNLP/io/pipe/matching.py @@ -0,0 +1,545 @@ +"""undocumented""" + +__all__ = [ + "MatchingBertPipe", + "RTEBertPipe", + "SNLIBertPipe", + "QuoraBertPipe", + "QNLIBertPipe", + "MNLIBertPipe", + "CNXNLIBertPipe", + "BQCorpusBertPipe", + "LCQMCBertPipe", + "MatchingPipe", + "RTEPipe", + "SNLIPipe", + "QuoraPipe", + "QNLIPipe", + "MNLIPipe", + "LCQMCPipe", + "CNXNLIPipe", + "BQCorpusPipe", + "RenamePipe", + "GranularizePipe", + "MachingTruncatePipe", +] + +import warnings + +from .pipe import Pipe +from .utils import get_tokenizer +from ..data_bundle import DataBundle +from ..loader.matching import SNLILoader, MNLILoader, QNLILoader, RTELoader, QuoraLoader, BQCorpusLoader, CNXNLILoader, \ + LCQMCLoader +from ...core._logger import logger +from ...core.const import Const +from ...core.vocabulary import Vocabulary + + +class MatchingBertPipe(Pipe): + """ + Matching任务的Bert pipe,输出的DataSet将包含以下的field + + .. csv-table:: + :header: "raw_words1", "raw_words2", "target", "words", "seq_len" + + "The new rights are...", "Everyone really likes..", 1, "[2, 3, 4, 5, ...]", 10 + "This site includes a...", "The Government Executive...", 0, "[11, 12, 13,...]", 5 + "...", "...", ., "[...]", . 
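+
+    A sketch of how the two sentences are joined by ``process`` (the token lists are illustrative)::
+
+        words0 = ['the', 'new', 'rights']       # tokenized raw_words1, i.e. the premise
+        words1 = ['everyone', 'likes', 'it']    # tokenized raw_words2, i.e. the hypothesis
+        words = words0 + ['[SEP]'] + words1     # the single `words` field fed to BERT-style models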
+ + words列是将raw_words1(即premise), raw_words2(即hypothesis)使用"[SEP]"链接起来转换为index的。 + words列被设置为input,target列被设置为target和input(设置为input以方便在forward函数中计算loss, + 如果不在forward函数中计算loss也不影响,fastNLP将根据forward函数的形参名进行传参). + + dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: + + +-------------+------------+------------+--------+-------+---------+ + | field_names | raw_words1 | raw_words2 | target | words | seq_len | + +-------------+------------+------------+--------+-------+---------+ + | is_input | False | False | False | True | True | + | is_target | False | False | True | False | False | + | ignore_type | | | False | False | False | + | pad_value | | | 0 | 0 | 0 | + +-------------+------------+------------+--------+-------+---------+ + + """ + + def __init__(self, lower=False, tokenizer: str = 'raw'): + """ + + :param bool lower: 是否将word小写化。 + :param str tokenizer: 使用什么tokenizer来将句子切分为words. 支持spacy, raw两种。raw即使用空格拆分。 + """ + super().__init__() + + self.lower = bool(lower) + self.tokenizer = get_tokenizer(tokenize_method=tokenizer) + + def _tokenize(self, data_bundle, field_names, new_field_names): + """ + + :param DataBundle data_bundle: DataBundle. + :param list field_names: List[str], 需要tokenize的field名称 + :param list new_field_names: List[str], tokenize之后field的名称,与field_names一一对应。 + :return: 输入的DataBundle对象 + """ + for name, dataset in data_bundle.datasets.items(): + for field_name, new_field_name in zip(field_names, new_field_names): + dataset.apply_field(lambda words: self.tokenizer(words), field_name=field_name, + new_field_name=new_field_name) + return data_bundle + + def process(self, data_bundle): + """ + 输入的data_bundle中的dataset需要具有以下结构: + + .. csv-table:: + :header: "raw_words1", "raw_words2", "target" + + "Dana Reeve, the widow of the actor...", "Christopher Reeve had an...", "not_entailment" + "...","..." 
+ + :param data_bundle: + :return: + """ + for dataset in data_bundle.datasets.values(): + if dataset.has_field(Const.TARGET): + dataset.drop(lambda x: x[Const.TARGET] == '-') + + for name, dataset in data_bundle.datasets.items(): + dataset.copy_field(Const.RAW_WORDS(0), Const.INPUTS(0), ) + dataset.copy_field(Const.RAW_WORDS(1), Const.INPUTS(1), ) + + if self.lower: + for name, dataset in data_bundle.datasets.items(): + dataset[Const.INPUTS(0)].lower() + dataset[Const.INPUTS(1)].lower() + + data_bundle = self._tokenize(data_bundle, [Const.INPUTS(0), Const.INPUTS(1)], + [Const.INPUTS(0), Const.INPUTS(1)]) + + # concat两个words + def concat(ins): + words0 = ins[Const.INPUTS(0)] + words1 = ins[Const.INPUTS(1)] + words = words0 + ['[SEP]'] + words1 + return words + + for name, dataset in data_bundle.datasets.items(): + dataset.apply(concat, new_field_name=Const.INPUT) + dataset.delete_field(Const.INPUTS(0)) + dataset.delete_field(Const.INPUTS(1)) + + word_vocab = Vocabulary() + word_vocab.from_dataset(*[dataset for name, dataset in data_bundle.datasets.items() if 'train' in name], + field_name=Const.INPUT, + no_create_entry_dataset=[dataset for name, dataset in data_bundle.datasets.items() if + 'train' not in name]) + word_vocab.index_dataset(*data_bundle.datasets.values(), field_name=Const.INPUT) + + target_vocab = Vocabulary(padding=None, unknown=None) + target_vocab.from_dataset(*[ds for name, ds in data_bundle.iter_datasets() if 'train' in name], + field_name=Const.TARGET, + no_create_entry_dataset=[ds for name, ds in data_bundle.iter_datasets() + if ('train' not in name) and (ds.has_field(Const.TARGET))] + ) + if len(target_vocab._no_create_word) > 0: + warn_msg = f"There are {len(target_vocab._no_create_word)} target labels" \ + f" in {[name for name in data_bundle.datasets.keys() if 'train' not in name]} " \ + f"data set but not in train data set!." + warnings.warn(warn_msg) + logger.warning(warn_msg) + + has_target_datasets = [dataset for name, dataset in data_bundle.datasets.items() if + dataset.has_field(Const.TARGET)] + target_vocab.index_dataset(*has_target_datasets, field_name=Const.TARGET) + + data_bundle.set_vocab(word_vocab, Const.INPUT) + data_bundle.set_vocab(target_vocab, Const.TARGET) + + input_fields = [Const.INPUT, Const.INPUT_LEN] + target_fields = [Const.TARGET] + + for name, dataset in data_bundle.datasets.items(): + dataset.add_seq_len(Const.INPUT) + dataset.set_input(*input_fields, flag=True) + for fields in target_fields: + if dataset.has_field(fields): + dataset.set_target(fields, flag=True) + + return data_bundle + + +class RTEBertPipe(MatchingBertPipe): + def process_from_file(self, paths=None): + data_bundle = RTELoader().load(paths) + return self.process(data_bundle) + + +class SNLIBertPipe(MatchingBertPipe): + def process_from_file(self, paths=None): + data_bundle = SNLILoader().load(paths) + return self.process(data_bundle) + + +class QuoraBertPipe(MatchingBertPipe): + def process_from_file(self, paths): + data_bundle = QuoraLoader().load(paths) + return self.process(data_bundle) + + +class QNLIBertPipe(MatchingBertPipe): + def process_from_file(self, paths=None): + data_bundle = QNLILoader().load(paths) + return self.process(data_bundle) + + +class MNLIBertPipe(MatchingBertPipe): + def process_from_file(self, paths=None): + data_bundle = MNLILoader().load(paths) + return self.process(data_bundle) + + +class MatchingPipe(Pipe): + """ + Matching任务的Pipe。输出的DataSet将包含以下的field + + .. 
csv-table:: + :header: "raw_words1", "raw_words2", "target", "words1", "words2", "seq_len1", "seq_len2" + + "The new rights are...", "Everyone really likes..", 1, "[2, 3, 4, 5, ...]", "[10, 20, 6]", 10, 13 + "This site includes a...", "The Government Executive...", 0, "[11, 12, 13,...]", "[2, 7, ...]", 6, 7 + "...", "...", ., "[...]", "[...]", ., . + + words1是premise,words2是hypothesis。其中words1,words2,seq_len1,seq_len2被设置为input;target被设置为target + 和input(设置为input以方便在forward函数中计算loss,如果不在forward函数中计算loss也不影响,fastNLP将根据forward函数 + 的形参名进行传参)。 + + dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: + + +-------------+------------+------------+--------+--------+--------+----------+----------+ + | field_names | raw_words1 | raw_words2 | target | words1 | words2 | seq_len1 | seq_len2 | + +-------------+------------+------------+--------+--------+--------+----------+----------+ + | is_input | False | False | False | True | True | True | True | + | is_target | False | False | True | False | False | False | False | + | ignore_type | | | False | False | False | False | False | + | pad_value | | | 0 | 0 | 0 | 0 | 0 | + +-------------+------------+------------+--------+--------+--------+----------+----------+ + + """ + + def __init__(self, lower=False, tokenizer: str = 'raw'): + """ + + :param bool lower: 是否将所有raw_words转为小写。 + :param str tokenizer: 将原始数据tokenize的方式。支持spacy, raw. spacy是使用spacy切分,raw就是用空格切分。 + """ + super().__init__() + + self.lower = bool(lower) + self.tokenizer = get_tokenizer(tokenize_method=tokenizer) + + def _tokenize(self, data_bundle, field_names, new_field_names): + """ + + :param ~fastNLP.DataBundle data_bundle: DataBundle. + :param list field_names: List[str], 需要tokenize的field名称 + :param list new_field_names: List[str], tokenize之后field的名称,与field_names一一对应。 + :return: 输入的DataBundle对象 + """ + for name, dataset in data_bundle.datasets.items(): + for field_name, new_field_name in zip(field_names, new_field_names): + dataset.apply_field(lambda words: self.tokenizer(words), field_name=field_name, + new_field_name=new_field_name) + return data_bundle + + def process(self, data_bundle): + """ + 接受的DataBundle中的DataSet应该具有以下的field, target列可以没有 + + .. csv-table:: + :header: "raw_words1", "raw_words2", "target" + + "The new rights are...", "Everyone really likes..", "entailment" + "This site includes a...", "The Government Executive...", "not_entailment" + "...", "..." 
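As a rough illustration of this contract, a hand-built two-instance DataBundle (the sentences and labels below are made up, and MatchingPipe is assumed importable from fastNLP.io.pipe.matching) can be pushed through process() directly:

```python
from fastNLP import DataSet
from fastNLP.io import DataBundle
from fastNLP.io.pipe.matching import MatchingPipe

ds = DataSet({
    'raw_words1': ["The new rights are nice enough", "This site includes a list"],
    'raw_words2': ["Everyone really likes the newest benefits", "The Government Executive articles"],
    'target': ["entailment", "not_entailment"],
})
bundle = DataBundle(datasets={'train': ds})

bundle = MatchingPipe(lower=True, tokenizer='raw').process(bundle)
print(bundle.get_dataset('train'))  # now carries words1/words2/seq_len1/seq_len2 and an indexed target
```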
+ + :param ~fastNLP.DataBundle data_bundle: 通过loader读取得到的data_bundle,里面包含了数据集的原始数据内容 + :return: data_bundle + """ + data_bundle = self._tokenize(data_bundle, [Const.RAW_WORDS(0), Const.RAW_WORDS(1)], + [Const.INPUTS(0), Const.INPUTS(1)]) + + for dataset in data_bundle.datasets.values(): + if dataset.has_field(Const.TARGET): + dataset.drop(lambda x: x[Const.TARGET] == '-') + + if self.lower: + for name, dataset in data_bundle.datasets.items(): + dataset[Const.INPUTS(0)].lower() + dataset[Const.INPUTS(1)].lower() + + word_vocab = Vocabulary() + word_vocab.from_dataset(*[dataset for name, dataset in data_bundle.datasets.items() if 'train' in name], + field_name=[Const.INPUTS(0), Const.INPUTS(1)], + no_create_entry_dataset=[dataset for name, dataset in data_bundle.datasets.items() if + 'train' not in name]) + word_vocab.index_dataset(*data_bundle.datasets.values(), field_name=[Const.INPUTS(0), Const.INPUTS(1)]) + + target_vocab = Vocabulary(padding=None, unknown=None) + target_vocab.from_dataset(*[ds for name, ds in data_bundle.iter_datasets() if 'train' in name], + field_name=Const.TARGET, + no_create_entry_dataset=[ds for name, ds in data_bundle.iter_datasets() + if ('train' not in name) and (ds.has_field(Const.TARGET))] + ) + if len(target_vocab._no_create_word) > 0: + warn_msg = f"There are {len(target_vocab._no_create_word)} target labels" \ + f" in {[name for name in data_bundle.datasets.keys() if 'train' not in name]} " \ + f"data set but not in train data set!." + warnings.warn(warn_msg) + logger.warning(warn_msg) + + has_target_datasets = [dataset for name, dataset in data_bundle.datasets.items() if + dataset.has_field(Const.TARGET)] + target_vocab.index_dataset(*has_target_datasets, field_name=Const.TARGET) + + data_bundle.set_vocab(word_vocab, Const.INPUTS(0)) + data_bundle.set_vocab(target_vocab, Const.TARGET) + + input_fields = [Const.INPUTS(0), Const.INPUTS(1), Const.INPUT_LENS(0), Const.INPUT_LENS(1)] + target_fields = [Const.TARGET] + + for name, dataset in data_bundle.datasets.items(): + dataset.add_seq_len(Const.INPUTS(0), Const.INPUT_LENS(0)) + dataset.add_seq_len(Const.INPUTS(1), Const.INPUT_LENS(1)) + dataset.set_input(*input_fields, flag=True) + for fields in target_fields: + if dataset.has_field(fields): + dataset.set_target(fields, flag=True) + + return data_bundle + + +class RTEPipe(MatchingPipe): + def process_from_file(self, paths=None): + data_bundle = RTELoader().load(paths) + return self.process(data_bundle) + + +class SNLIPipe(MatchingPipe): + def process_from_file(self, paths=None): + data_bundle = SNLILoader().load(paths) + return self.process(data_bundle) + + +class QuoraPipe(MatchingPipe): + def process_from_file(self, paths): + data_bundle = QuoraLoader().load(paths) + return self.process(data_bundle) + + +class QNLIPipe(MatchingPipe): + def process_from_file(self, paths=None): + data_bundle = QNLILoader().load(paths) + return self.process(data_bundle) + + +class MNLIPipe(MatchingPipe): + def process_from_file(self, paths=None): + data_bundle = MNLILoader().load(paths) + return self.process(data_bundle) + + +class LCQMCPipe(MatchingPipe): + def __init__(self, tokenizer='cn-char'): + super().__init__(tokenizer=tokenizer) + + def process_from_file(self, paths=None): + data_bundle = LCQMCLoader().load(paths) + data_bundle = RenamePipe().process(data_bundle) + data_bundle = self.process(data_bundle) + data_bundle = RenamePipe().process(data_bundle) + return data_bundle + + +class CNXNLIPipe(MatchingPipe): + def __init__(self, tokenizer='cn-char'): + 
super().__init__(tokenizer=tokenizer) + + def process_from_file(self, paths=None): + data_bundle = CNXNLILoader().load(paths) + data_bundle = GranularizePipe(task='XNLI').process(data_bundle) + data_bundle = RenamePipe().process(data_bundle) # 使中文数据的field + data_bundle = self.process(data_bundle) + data_bundle = RenamePipe().process(data_bundle) + return data_bundle + + +class BQCorpusPipe(MatchingPipe): + def __init__(self, tokenizer='cn-char'): + super().__init__(tokenizer=tokenizer) + + def process_from_file(self, paths=None): + data_bundle = BQCorpusLoader().load(paths) + data_bundle = RenamePipe().process(data_bundle) + data_bundle = self.process(data_bundle) + data_bundle = RenamePipe().process(data_bundle) + return data_bundle + + +class RenamePipe(Pipe): + def __init__(self, task='cn-nli'): + super().__init__() + self.task = task + + def process(self, data_bundle: DataBundle): # rename field name for Chinese Matching dataset + if (self.task == 'cn-nli'): + for name, dataset in data_bundle.datasets.items(): + if (dataset.has_field(Const.RAW_CHARS(0))): + dataset.rename_field(Const.RAW_CHARS(0), Const.RAW_WORDS(0)) # RAW_CHARS->RAW_WORDS + dataset.rename_field(Const.RAW_CHARS(1), Const.RAW_WORDS(1)) + elif (dataset.has_field(Const.INPUTS(0))): + dataset.rename_field(Const.INPUTS(0), Const.CHAR_INPUTS(0)) # WORDS->CHARS + dataset.rename_field(Const.INPUTS(1), Const.CHAR_INPUTS(1)) + dataset.rename_field(Const.RAW_WORDS(0), Const.RAW_CHARS(0)) + dataset.rename_field(Const.RAW_WORDS(1), Const.RAW_CHARS(1)) + else: + raise RuntimeError( + "field name of dataset is not qualified. It should have ether RAW_CHARS or WORDS") + elif (self.task == 'cn-nli-bert'): + for name, dataset in data_bundle.datasets.items(): + if (dataset.has_field(Const.RAW_CHARS(0))): + dataset.rename_field(Const.RAW_CHARS(0), Const.RAW_WORDS(0)) # RAW_CHARS->RAW_WORDS + dataset.rename_field(Const.RAW_CHARS(1), Const.RAW_WORDS(1)) + elif (dataset.has_field(Const.RAW_WORDS(0))): + dataset.rename_field(Const.RAW_WORDS(0), Const.RAW_CHARS(0)) + dataset.rename_field(Const.RAW_WORDS(1), Const.RAW_CHARS(1)) + dataset.rename_field(Const.INPUT, Const.CHAR_INPUT) + else: + raise RuntimeError( + "field name of dataset is not qualified. 
It should have either RAW_CHARS or RAW_WORDS" + ) + else: + raise RuntimeError( + "Only support task='cn-nli' or 'cn-nli-bert'" + ) + + return data_bundle + + +class GranularizePipe(Pipe): + def __init__(self, task=None): + super().__init__() + self.task = task + + def _granularize(self, data_bundle, tag_map): + """ + 该函数对data_bundle中'target'列中的内容进行转换。 + + :param data_bundle: + :param dict tag_map: 将target列中的tag做以下的映射,比如{"0":0, "1":0, "3":1, "4":1}, 则会删除target为"2"的instance, + 且将"1"认为是第0类。 + :return: 传入的data_bundle + """ + for name in list(data_bundle.datasets.keys()): + dataset = data_bundle.get_dataset(name) + dataset.apply_field(lambda target: tag_map.get(target, -100), field_name=Const.TARGET, + new_field_name=Const.TARGET) + dataset.drop(lambda ins: ins[Const.TARGET] == -100) + data_bundle.set_dataset(dataset, name) + return data_bundle + + def process(self, data_bundle: DataBundle): + task_tag_dict = { + 'XNLI': {'neutral': 0, 'entailment': 1, 'contradictory': 2, 'contradiction': 2} + } + if self.task in task_tag_dict: + data_bundle = self._granularize(data_bundle=data_bundle, tag_map=task_tag_dict[self.task]) + else: + raise RuntimeError(f"Only support {task_tag_dict.keys()} task_tag_map.") + return data_bundle + + +class MachingTruncatePipe(Pipe): # truncate sentence for bert, modify seq_len + def __init__(self): + super().__init__() + + def process(self, data_bundle: DataBundle): + # placeholder: no truncation is performed here; the bert truncation logic lives in TruncateBertPipe below + return data_bundle + + +class LCQMCBertPipe(MatchingBertPipe): + def __init__(self, tokenizer='cn-char'): + super().__init__(tokenizer=tokenizer) + + def process_from_file(self, paths=None): + data_bundle = LCQMCLoader().load(paths) + data_bundle = RenamePipe(task='cn-nli-bert').process(data_bundle) + data_bundle = self.process(data_bundle) + data_bundle = TruncateBertPipe(task='cn').process(data_bundle) + data_bundle = RenamePipe(task='cn-nli-bert').process(data_bundle) + return data_bundle + + +class BQCorpusBertPipe(MatchingBertPipe): + def __init__(self, tokenizer='cn-char'): + super().__init__(tokenizer=tokenizer) + + def process_from_file(self, paths=None): + data_bundle = BQCorpusLoader().load(paths) + data_bundle = RenamePipe(task='cn-nli-bert').process(data_bundle) + data_bundle = self.process(data_bundle) + data_bundle = TruncateBertPipe(task='cn').process(data_bundle) + data_bundle = RenamePipe(task='cn-nli-bert').process(data_bundle) + return data_bundle + + +class CNXNLIBertPipe(MatchingBertPipe): + def __init__(self, tokenizer='cn-char'): + super().__init__(tokenizer=tokenizer) + + def process_from_file(self, paths=None): + data_bundle = CNXNLILoader().load(paths) + data_bundle = GranularizePipe(task='XNLI').process(data_bundle) + data_bundle = RenamePipe(task='cn-nli-bert').process(data_bundle) + data_bundle = self.process(data_bundle) + data_bundle = TruncateBertPipe(task='cn').process(data_bundle) + data_bundle = RenamePipe(task='cn-nli-bert').process(data_bundle) + return data_bundle + + +class TruncateBertPipe(Pipe): + def __init__(self, task='cn'): + super().__init__() + self.task = task + + def _truncate(self, sentence_index:list, sep_index_vocab): + # 根据[SEP]在vocab中的index,找到[SEP]在dataset的field['words']中的index + sep_index_words = sentence_index.index(sep_index_vocab) + words_before_sep = sentence_index[:sep_index_words] + words_after_sep = sentence_index[sep_index_words:] # 注意此部分包括了[SEP] + if self.task == 'cn': + # 中文任务将Instance['words']中在[SEP]前后的文本分别截至长度不超过250 + words_before_sep = words_before_sep[:250] + words_after_sep = 
words_after_sep[:250] + elif self.task == 'en': + # 英文任务将Instance['words']中在[SEP]前后的文本分别截至长度不超过215 + words_before_sep = words_before_sep[:215] + words_after_sep = words_after_sep[:215] + else: + raise RuntimeError("Only support 'cn' or 'en' task.") + + return words_before_sep + words_after_sep + + def process(self, data_bundle: DataBundle) -> DataBundle: + for name in data_bundle.datasets.keys(): + dataset = data_bundle.get_dataset(name) + sep_index_vocab = data_bundle.get_vocab('words').to_index('[SEP]') + dataset.apply_field(lambda sent_index: self._truncate(sentence_index=sent_index, sep_index_vocab=sep_index_vocab), field_name='words', new_field_name='words') + + # truncate之后需要更新seq_len + dataset.add_seq_len(field_name='words') + return data_bundle + diff --git a/fastNLP/io/pipe/pipe.py b/fastNLP/io/pipe/pipe.py new file mode 100644 index 00000000..ab3c9120 --- /dev/null +++ b/fastNLP/io/pipe/pipe.py @@ -0,0 +1,41 @@ +"""undocumented""" + +__all__ = [ + "Pipe", +] + +from .. import DataBundle + + +class Pipe: + """ + Pipe是fastNLP中用于处理DataBundle的类,但实际是处理DataBundle中的DataSet。所有Pipe都会在其process()函数的文档中指出该Pipe可处理的DataSet应该具备怎样的格式;在Pipe + 文档中说明该Pipe返回后DataSet的格式以及其field的信息;以及新增的Vocabulary的信息。 + + 一般情况下Pipe处理包含以下的几个过程,(1)将raw_words或raw_chars进行tokenize以切分成不同的词或字; + (2) 再建立词或字的 :class:`~fastNLP.Vocabulary` , 并将词或字转换为index; (3)将target列建立词表并将target列转为index; + + Pipe中提供了两个方法 + + -process()函数,输入为DataBundle + -process_from_file()函数,输入为对应Loader的load函数可接受的类型。 + + """ + + def process(self, data_bundle: DataBundle) -> DataBundle: + """ + 对输入的DataBundle进行处理,然后返回该DataBundle。 + + :param ~fastNLP.DataBundle data_bundle: 需要处理的DataBundle对象 + :return: + """ + raise NotImplementedError + + def process_from_file(self, paths) -> DataBundle: + """ + 传入文件路径,生成处理好的DataBundle对象。paths支持的路径形式可以参考 ::meth:`fastNLP.io.Loader.load()` + + :param paths: + :return: DataBundle + """ + raise NotImplementedError diff --git a/fastNLP/io/pipe/qa.py b/fastNLP/io/pipe/qa.py new file mode 100644 index 00000000..ea989545 --- /dev/null +++ b/fastNLP/io/pipe/qa.py @@ -0,0 +1,142 @@ +""" +本文件中的Pipe主要用于处理问答任务的数据。 + +""" + + +from copy import deepcopy + +from .pipe import Pipe +from .. 
import DataBundle +from ..loader.qa import CMRC2018Loader +from .utils import get_tokenizer +from ...core import DataSet +from ...core import Vocabulary + +__all__ = ['CMRC2018BertPipe'] + + +def _concat_clip(data_bundle, tokenizer, max_len, concat_field_name='raw_chars'): + """ + 处理data_bundle中的DataSet,将context与question进行tokenize,然后使用[SEP]将两者连接起来。 + + 会新增field: context_len(int), raw_words(list[str]), target_start(int), target_end(int)其中target_start + 与target_end是与raw_chars等长的。其中target_start和target_end是前闭后闭的区间。 + + :param DataBundle data_bundle: 类似["a", "b", "[SEP]", "c", ] + :return: + """ + for name in list(data_bundle.datasets.keys()): + ds = data_bundle.get_dataset(name) + data_bundle.delete_dataset(name) + new_ds = DataSet() + for ins in ds: + new_ins = deepcopy(ins) + context = ins['context'] + question = ins['question'] + + cnt_lst = tokenizer(context) + q_lst = tokenizer(question) + + answer_start = -1 + + if len(cnt_lst) + len(q_lst) + 3 > max_len: # 预留开头的[CLS]和[SEP]和中间的[sep] + if 'answer_starts' in ins and 'answers' in ins: + answer_start = int(ins['answer_starts'][0]) + answer = ins['answers'][0] + answer_end = answer_start + len(answer) + if answer_end > max_len - 3 - len(q_lst): + span_start = answer_end + 3 + len(q_lst) - max_len + span_end = answer_end + else: + span_start = 0 + span_end = max_len - 3 - len(q_lst) + cnt_lst = cnt_lst[span_start:span_end] + answer_start = int(ins['answer_starts'][0]) + answer_start -= span_start + answer_end = answer_start + len(ins['answers'][0]) + else: + cnt_lst = cnt_lst[:max_len - len(q_lst) - 3] + else: + if 'answer_starts' in ins and 'answers' in ins: + answer_start = int(ins['answer_starts'][0]) + answer_end = answer_start + len(ins['answers'][0]) + + tokens = cnt_lst + ['[SEP]'] + q_lst + new_ins['context_len'] = len(cnt_lst) + new_ins[concat_field_name] = tokens + + if answer_start != -1: + new_ins['target_start'] = answer_start + new_ins['target_end'] = answer_end - 1 + + new_ds.append(new_ins) + data_bundle.set_dataset(new_ds, name) + + return data_bundle + + +class CMRC2018BertPipe(Pipe): + """ + 处理之后的DataSet将新增以下的field(传入的field仍然保留) + + .. csv-table:: + :header: "context_len", "raw_chars", "target_start", "target_end", "chars" + 492, ['范', '廷', '颂... ], 30, 34, [21, 25, ...] + 491, ['范', '廷', '颂... ], 41, 61, [21, 25, ...] + + ".", "...", "...","...", "..." + + raw_words列是context与question拼起来的结果,words是转为index的值, target_start当当前位置为答案的开头时为1,target_end当当前 + 位置为答案的结尾是为1;context_len指示的是words列中context的长度。 + + 其中各列的meta信息如下: + +-------------+-------------+-----------+--------------+------------+-------+---------+ + | field_names | context_len | raw_chars | target_start | target_end | chars | answers | + +-------------+-------------+-----------+--------------+------------+-------+---------| + | is_input | False | False | False | False | True | False | + | is_target | True | True | True | True | False | True | + | ignore_type | False | True | False | False | False | True | + | pad_value | 0 | 0 | 0 | 0 | 0 | 0 | + +-------------+-------------+-----------+--------------+------------+-------+---------+ + + """ + def __init__(self, max_len=510): + super().__init__() + self.max_len = max_len + + def process(self, data_bundle: DataBundle) -> DataBundle: + """ + 传入的DataSet应该具备以下的field + + .. 
csv-table:: + :header:"title", "context", "question", "answers", "answer_starts", "id" + + "范廷颂", "范廷颂枢机(,),圣名保禄·若瑟()...", "范廷颂是什么时候被任为主教的?", ["1963年"], ["30"], "TRAIN_186_QUERY_0" + "范廷颂", "范廷颂枢机(,),圣名保禄·若瑟()...", "1990年,范廷颂担任什么职务?", ["1990年被擢升为天..."], ["41"],"TRAIN_186_QUERY_1" + "...", "...", "...","...", ".", "..." + + :param data_bundle: + :return: + """ + _tokenizer = get_tokenizer('cn-char', lang='cn') + data_bundle = _concat_clip(data_bundle, tokenizer=_tokenizer, max_len=self.max_len, concat_field_name='raw_chars') + + src_vocab = Vocabulary() + src_vocab.from_dataset(*[ds for name, ds in data_bundle.iter_datasets() if 'train' in name], + field_name='raw_chars', + no_create_entry_dataset=[ds for name, ds in data_bundle.iter_datasets() + if 'train' not in name] + ) + src_vocab.index_dataset(*data_bundle.datasets.values(), field_name='raw_chars', new_field_name='chars') + data_bundle.set_vocab(src_vocab, 'chars') + + data_bundle.set_ignore_type('raw_chars', 'answers', flag=True) + data_bundle.set_input('chars') + data_bundle.set_target('raw_chars', 'answers', 'target_start', 'target_end', 'context_len') + + return data_bundle + + def process_from_file(self, paths=None) -> DataBundle: + data_bundle = CMRC2018Loader().load(paths) + return self.process(data_bundle) \ No newline at end of file diff --git a/fastNLP/io/pipe/summarization.py b/fastNLP/io/pipe/summarization.py new file mode 100644 index 00000000..64fa545d --- /dev/null +++ b/fastNLP/io/pipe/summarization.py @@ -0,0 +1,197 @@ +"""undocumented""" +import os +import numpy as np + +from .pipe import Pipe +from .utils import _drop_empty_instance +from ..loader.summarization import ExtCNNDMLoader +from ..data_bundle import DataBundle +from ...core.const import Const +from ...core.vocabulary import Vocabulary +from ...core._logger import logger + + +WORD_PAD = "[PAD]" +WORD_UNK = "[UNK]" +DOMAIN_UNK = "X" +TAG_UNK = "X" + + +class ExtCNNDMPipe(Pipe): + """ + 对CNN/Daily Mail数据进行适用于extractive summarization task的预处理,预处理之后的数据,具备以下结构: + + .. csv-table:: + :header: "text", "summary", "label", "publication", "text_wd", "words", "seq_len", "target" + + """ + def __init__(self, vocab_size, sent_max_len, doc_max_timesteps, vocab_path=None, domain=False): + """ + + :param vocab_size: int, 词表大小 + :param sent_max_len: int, 句子最大长度,不足的句子将padding,超出的将截断 + :param doc_max_timesteps: int, 文章最多句子个数,不足的将padding,超出的将截断 + :param vocab_path: str, 外部词表路径 + :param domain: bool, 是否需要建立domain词表 + """ + self.vocab_size = vocab_size + self.vocab_path = vocab_path + self.sent_max_len = sent_max_len + self.doc_max_timesteps = doc_max_timesteps + self.domain = domain + + def process(self, data_bundle: DataBundle): + """ + 传入的DataSet应该具备如下的结构 + + .. csv-table:: + :header: "text", "summary", "label", "publication" + + ["I got new tires from them and... ","..."], ["The new tires...","..."], [0, 1], "cnndm" + ["Don't waste your time. We had two...","..."], ["Time is precious","..."], [1], "cnndm" + ["..."], ["..."], [], "cnndm" + + :param data_bundle: + :return: 处理得到的数据包括 + .. csv-table:: + :header: "text_wd", "words", "seq_len", "target" + + [["I","got",..."."],...,["..."]], [[54,89,...,5],...,[9,43,..,0]], [1,1,...,0], [0,1,...,0] + [["Don't","waste",...,"."],...,["..."]], [[5234,653,...,5],...,[87,234,..,0]], [1,1,...,0], [1,1,...,0] + [[""],...,[""]], [[],...,[]], [], [] + """ + + if self.vocab_path is None: + error_msg = 'vocab file is not defined!' 
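+            # the pipe builds its Vocabulary from the external word list at self.vocab_path, so it must be provided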
+ logger.error(error_msg) + raise RuntimeError(error_msg) + data_bundle.apply(lambda x: _lower_text(x['text']), new_field_name='text') + data_bundle.apply(lambda x: _lower_text(x['summary']), new_field_name='summary') + data_bundle.apply(lambda x: _split_list(x['text']), new_field_name='text_wd') + data_bundle.apply(lambda x: _convert_label(x["label"], len(x["text"])), new_field_name=Const.TARGET) + + data_bundle.apply(lambda x: _pad_sent(x["text_wd"], self.sent_max_len), new_field_name=Const.INPUT) + # db.apply(lambda x: _token_mask(x["text_wd"], self.sent_max_len), new_field_name="pad_token_mask") + + # pad document + data_bundle.apply(lambda x: _pad_doc(x[Const.INPUT], self.sent_max_len, self.doc_max_timesteps), new_field_name=Const.INPUT) + data_bundle.apply(lambda x: _sent_mask(x[Const.INPUT], self.doc_max_timesteps), new_field_name=Const.INPUT_LEN) + data_bundle.apply(lambda x: _pad_label(x[Const.TARGET], self.doc_max_timesteps), new_field_name=Const.TARGET) + + data_bundle = _drop_empty_instance(data_bundle, "label") + + # set input and target + data_bundle.set_input(Const.INPUT, Const.INPUT_LEN) + data_bundle.set_target(Const.TARGET, Const.INPUT_LEN) + + # print("[INFO] Load existing vocab from %s!" % self.vocab_path) + word_list = [] + with open(self.vocab_path, 'r', encoding='utf8') as vocab_f: + cnt = 2 # pad and unk + for line in vocab_f: + pieces = line.split("\t") + word_list.append(pieces[0]) + cnt += 1 + if cnt > self.vocab_size: + break + vocabs = Vocabulary(max_size=self.vocab_size, padding=WORD_PAD, unknown=WORD_UNK) + vocabs.add_word_lst(word_list) + vocabs.build_vocab() + data_bundle.set_vocab(vocabs, "vocab") + + if self.domain is True: + domaindict = Vocabulary(padding=None, unknown=DOMAIN_UNK) + domaindict.from_dataset(data_bundle.get_dataset("train"), field_name="publication") + data_bundle.set_vocab(domaindict, "domain") + + return data_bundle + + def process_from_file(self, paths=None): + """ + :param paths: dict or string + :return: DataBundle + """ + loader = ExtCNNDMLoader() + if self.vocab_path is None: + if paths is None: + paths = loader.download() + if not os.path.isdir(paths): + error_msg = 'vocab file is not defined!' 
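+                # a downloaded directory is expected here, so that the bundled 'vocab' file can be joined below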
+ logger.error(error_msg) + raise RuntimeError(error_msg) + self.vocab_path = os.path.join(paths, 'vocab') + db = loader.load(paths=paths) + db = self.process(db) + for ds in db.datasets.values(): + db.get_vocab("vocab").index_dataset(ds, field_name=Const.INPUT, new_field_name=Const.INPUT) + + return db + + +def _lower_text(text_list): + return [text.lower() for text in text_list] + + +def _split_list(text_list): + return [text.split() for text in text_list] + + +def _convert_label(label, sent_len): + np_label = np.zeros(sent_len, dtype=int) + if label != []: + np_label[np.array(label)] = 1 + return np_label.tolist() + + +def _pad_sent(text_wd, sent_max_len): + pad_text_wd = [] + for sent_wd in text_wd: + if len(sent_wd) < sent_max_len: + pad_num = sent_max_len - len(sent_wd) + sent_wd.extend([WORD_PAD] * pad_num) + else: + sent_wd = sent_wd[:sent_max_len] + pad_text_wd.append(sent_wd) + return pad_text_wd + + +def _token_mask(text_wd, sent_max_len): + token_mask_list = [] + for sent_wd in text_wd: + token_num = len(sent_wd) + if token_num < sent_max_len: + mask = [1] * token_num + [0] * (sent_max_len - token_num) + else: + mask = [1] * sent_max_len + token_mask_list.append(mask) + return token_mask_list + + +def _pad_label(label, doc_max_timesteps): + text_len = len(label) + if text_len < doc_max_timesteps: + pad_label = label + [0] * (doc_max_timesteps - text_len) + else: + pad_label = label[:doc_max_timesteps] + return pad_label + + +def _pad_doc(text_wd, sent_max_len, doc_max_timesteps): + text_len = len(text_wd) + if text_len < doc_max_timesteps: + padding = [WORD_PAD] * sent_max_len + pad_text = text_wd + [padding] * (doc_max_timesteps - text_len) + else: + pad_text = text_wd[:doc_max_timesteps] + return pad_text + + +def _sent_mask(text_wd, doc_max_timesteps): + text_len = len(text_wd) + if text_len < doc_max_timesteps: + sent_mask = [1] * text_len + [0] * (doc_max_timesteps - text_len) + else: + sent_mask = [1] * doc_max_timesteps + return sent_mask + + diff --git a/fastNLP/io/pipe/utils.py b/fastNLP/io/pipe/utils.py new file mode 100644 index 00000000..d05ffe96 --- /dev/null +++ b/fastNLP/io/pipe/utils.py @@ -0,0 +1,200 @@ +"""undocumented""" + +__all__ = [ + "iob2", + "iob2bioes", + "get_tokenizer", +] + +from typing import List +import warnings + +from ...core.const import Const +from ...core.vocabulary import Vocabulary +from ...core._logger import logger + + +def iob2(tags: List[str]) -> List[str]: + """ + 检查数据是否是合法的IOB数据,如果是IOB1会被自动转换为IOB2。两种格式的区别见 + https://datascience.stackexchange.com/questions/37824/difference-between-iob-and-iob2-format + + :param tags: 需要转换的tags + """ + for i, tag in enumerate(tags): + if tag == "O": + continue + split = tag.split("-") + if len(split) != 2 or split[0] not in ["I", "B"]: + raise TypeError("The encoding schema is not a valid IOB type.") + if split[0] == "B": + continue + elif i == 0 or tags[i - 1] == "O": # conversion IOB1 to IOB2 + tags[i] = "B" + tag[1:] + elif tags[i - 1][1:] == tag[1:]: + continue + else: # conversion IOB1 to IOB2 + tags[i] = "B" + tag[1:] + return tags + + +def iob2bioes(tags: List[str]) -> List[str]: + """ + 将iob的tag转换为bioes编码 + :param tags: + :return: + """ + new_tags = [] + for i, tag in enumerate(tags): + if tag == 'O': + new_tags.append(tag) + else: + split = tag.split('-')[0] + if split == 'B': + if i + 1 != len(tags) and tags[i + 1].split('-')[0] == 'I': + new_tags.append(tag) + else: + new_tags.append(tag.replace('B-', 'S-')) + elif split == 'I': + if i + 1 < len(tags) and tags[i + 1].split('-')[0] == 'I': + 
new_tags.append(tag) + else: + new_tags.append(tag.replace('I-', 'E-')) + else: + raise TypeError("Invalid IOB format.") + return new_tags + + +def get_tokenizer(tokenize_method: str, lang='en'): + """ + + :param str tokenize_method: 获取tokenzier方法 + :param str lang: 语言,当前仅支持en + :return: 返回tokenize函数 + """ + tokenizer_dict = { + 'spacy': None, + 'raw': _raw_split, + 'cn-char': _cn_char_split, + } + if tokenize_method == 'spacy': + import spacy + spacy.prefer_gpu() + if lang != 'en': + raise RuntimeError("Spacy only supports en right right.") + en = spacy.load(lang) + tokenizer = lambda x: [w.text for w in en.tokenizer(x)] + elif tokenize_method in tokenizer_dict: + tokenizer = tokenizer_dict[tokenize_method] + else: + raise RuntimeError(f"Only support {tokenizer_dict.keys()} tokenizer.") + return tokenizer + + +def _cn_char_split(sent): + return [chars for chars in sent] + + +def _raw_split(sent): + return sent.split() + + +def _indexize(data_bundle, input_field_names=Const.INPUT, target_field_names=Const.TARGET): + """ + 在dataset中的field_name列建立词表,Const.TARGET列建立词表,并把词表加入到data_bundle中。 + + :param ~fastNLP.DataBundle data_bundle: + :param: str,list input_field_names: + :param: str,list target_field_names: 这一列的vocabulary没有unknown和padding + :return: + """ + if isinstance(input_field_names, str): + input_field_names = [input_field_names] + if isinstance(target_field_names, str): + target_field_names = [target_field_names] + for input_field_name in input_field_names: + src_vocab = Vocabulary() + src_vocab.from_dataset(*[ds for name, ds in data_bundle.iter_datasets() if 'train' in name], + field_name=input_field_name, + no_create_entry_dataset=[ds for name, ds in data_bundle.iter_datasets() + if ('train' not in name) and (ds.has_field(input_field_name))] + ) + src_vocab.index_dataset(*data_bundle.datasets.values(), field_name=input_field_name) + data_bundle.set_vocab(src_vocab, input_field_name) + + for target_field_name in target_field_names: + tgt_vocab = Vocabulary(unknown=None, padding=None) + tgt_vocab.from_dataset(*[ds for name, ds in data_bundle.iter_datasets() if 'train' in name], + field_name=target_field_name, + no_create_entry_dataset=[ds for name, ds in data_bundle.iter_datasets() + if ('train' not in name) and (ds.has_field(target_field_name))] + ) + if len(tgt_vocab._no_create_word) > 0: + warn_msg = f"There are {len(tgt_vocab._no_create_word)} `{target_field_name}` labels" \ + f" in {[name for name in data_bundle.datasets.keys() if 'train' not in name]} " \ + f"data set but not in train data set!.\n" \ + f"These label(s) are {tgt_vocab._no_create_word}" + warnings.warn(warn_msg) + logger.warning(warn_msg) + tgt_vocab.index_dataset(*data_bundle.datasets.values(), field_name=target_field_name) + data_bundle.set_vocab(tgt_vocab, target_field_name) + + return data_bundle + + +def _add_words_field(data_bundle, lower=False): + """ + 给data_bundle中的dataset中复制一列words. 并根据lower参数判断是否需要小写化 + + :param data_bundle: + :param bool lower:是否要小写化 + :return: 传入的DataBundle + """ + data_bundle.copy_field(field_name=Const.RAW_WORD, new_field_name=Const.INPUT, ignore_miss_dataset=True) + + if lower: + for name, dataset in data_bundle.datasets.items(): + dataset[Const.INPUT].lower() + return data_bundle + + +def _add_chars_field(data_bundle, lower=False): + """ + 给data_bundle中的dataset中复制一列chars. 
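A small self-contained check of the two conversions above (the IOB1 tag sequence is made up for illustration):

```python
from fastNLP.io.pipe.utils import iob2, iob2bioes

tags = ["I-PER", "I-PER", "O", "I-LOC"]   # IOB1 input
tags = iob2(tags)                         # -> ['B-PER', 'I-PER', 'O', 'B-LOC']
print(iob2bioes(tags))                    # -> ['B-PER', 'E-PER', 'O', 'S-LOC']
```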
并根据lower参数判断是否需要小写化 + + :param data_bundle: + :param bool lower:是否要小写化 + :return: 传入的DataBundle + """ + data_bundle.copy_field(field_name=Const.RAW_CHAR, new_field_name=Const.CHAR_INPUT, ignore_miss_dataset=True) + + if lower: + for name, dataset in data_bundle.datasets.items(): + dataset[Const.CHAR_INPUT].lower() + return data_bundle + + +def _drop_empty_instance(data_bundle, field_name): + """ + 删除data_bundle的DataSet中存在的某个field为空的情况 + + :param ~fastNLP.DataBundle data_bundle: + :param str field_name: 对哪个field进行检查,如果为None,则任意field为空都会删掉 + :return: 传入的DataBundle + """ + + def empty_instance(ins): + if field_name: + field_value = ins[field_name] + if field_value in ((), {}, [], ''): + return True + return False + for _, field_value in ins.items(): + if field_value in ((), {}, [], ''): + return True + return False + + for name, dataset in data_bundle.datasets.items(): + dataset.drop(empty_instance) + + return data_bundle diff --git a/fastNLP/io/utils.py b/fastNLP/io/utils.py index a7d2de85..496aee77 100644 --- a/fastNLP/io/utils.py +++ b/fastNLP/io/utils.py @@ -1,23 +1,37 @@ -import os +""" +.. todo:: + doc +""" + +__all__ = [ + "check_loader_paths" +] +import os +from pathlib import Path from typing import Union, Dict +from ..core import logger + -def check_dataloader_paths(paths:Union[str, Dict[str, str]])->Dict[str, str]: +def check_loader_paths(paths: Union[str, Dict[str, str]]) -> Dict[str, str]: """ - 检查传入dataloader的文件的合法性。如果为合法路径,将返回至少包含'train'这个key的dict。类似于下面的结果 - { - 'train': '/some/path/to/', # 一定包含,建词表应该在这上面建立,剩下的其它文件应该只需要处理并index。 - 'test': 'xxx' # 可能有,也可能没有 - ... - } - 如果paths为不合法的,将直接进行raise相应的错误 + 检查传入dataloader的文件的合法性。如果为合法路径,将返回至少包含'train'这个key的dict。类似于下面的结果:: - :param paths: 路径. 可以为一个文件路径(则认为该文件就是train的文件); 可以为一个文件目录,将在该目录下寻找train(文件名 + { + 'train': '/some/path/to/', # 一定包含,建词表应该在这上面建立,剩下的其它文件应该只需要处理并index。 + 'test': 'xxx' # 可能有,也可能没有 + ... + } + + 如果paths为不合法的,将直接进行raise相应的错误. 如果paths内不包含train也会报错。 + + :param str paths: 路径. 
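To make the accepted inputs of check_loader_paths concrete, a sketch of the three call patterns (the paths are hypothetical, and the function raises if they do not actually exist on disk):

```python
from fastNLP.io.utils import check_loader_paths

# a single file is treated as the training split
print(check_loader_paths('~/data/rte/train.tsv'))   # -> {'train': '/home/.../train.tsv'}

# a directory is scanned for file names containing train / dev / test
print(check_loader_paths('~/data/rte/'))            # -> {'train': ..., 'dev': ..., 'test': ...}

# an explicit dict is validated, with every value expanded to an absolute path
print(check_loader_paths({'train': '~/data/rte/train.tsv', 'dev': '~/data/rte/dev.tsv'}))
```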
可以为一个文件路径(则认为该文件就是train的文件); 可以为一个文件目录,将在该目录下寻找train(文件名 中包含train这个字段), test.txt, dev.txt; 可以为一个dict, 则key是用户自定义的某个文件的名称,value是这个文件的路径。 :return: """ - if isinstance(paths, str): + if isinstance(paths, (str, Path)): + paths = os.path.abspath(os.path.expanduser(paths)) if os.path.isfile(paths): return {'train': paths} elif os.path.isdir(paths): @@ -29,26 +43,32 @@ def check_dataloader_paths(paths:Union[str, Dict[str, str]])->Dict[str, str]: path_pair = ('train', filename) if 'dev' in filename: if path_pair: - raise Exception("File:{} in {} contains bot `{}` and `dev`.".format(filename, paths, path_pair[0])) + raise Exception( + "File:{} in {} contains bot `{}` and `dev`.".format(filename, paths, path_pair[0])) path_pair = ('dev', filename) if 'test' in filename: if path_pair: - raise Exception("File:{} in {} contains bot `{}` and `test`.".format(filename, paths, path_pair[0])) + raise Exception( + "File:{} in {} contains bot `{}` and `test`.".format(filename, paths, path_pair[0])) path_pair = ('test', filename) if path_pair: files[path_pair[0]] = os.path.join(paths, path_pair[1]) + if 'train' not in files: + raise KeyError(f"There is no train file in {paths}.") return files else: raise FileNotFoundError(f"{paths} is not a valid file path.") - + elif isinstance(paths, dict): if paths: if 'train' not in paths: raise KeyError("You have to include `train` in your dict.") for key, value in paths.items(): if isinstance(key, str) and isinstance(value, str): + value = os.path.abspath(os.path.expanduser(value)) if not os.path.isfile(value): raise TypeError(f"{value} is not a valid file.") + paths[key] = value else: raise TypeError("All keys and values in paths should be str.") return paths @@ -56,14 +76,3 @@ def check_dataloader_paths(paths:Union[str, Dict[str, str]])->Dict[str, str]: raise ValueError("Empty paths is not allowed.") else: raise TypeError(f"paths only supports str and dict. 
not {type(paths)}.") - -def get_tokenizer(): - try: - import spacy - spacy.prefer_gpu() - en = spacy.load('en') - print('use spacy tokenizer') - return lambda x: [w.text for w in en.tokenizer(x)] - except Exception as e: - print('use raw tokenizer') - return lambda x: x.split() diff --git a/fastNLP/models/__init__.py b/fastNLP/models/__init__.py index 14314049..ba499ac2 100644 --- a/fastNLP/models/__init__.py +++ b/fastNLP/models/__init__.py @@ -12,6 +12,7 @@ __all__ = [ "SeqLabeling", "AdvSeqLabel", + "BiLSTMCRF", "ESIM", @@ -21,14 +22,24 @@ __all__ = [ "STSeqCls", "BiaffineParser", - "GraphParser" + "GraphParser", + + "BertForSequenceClassification", + "BertForSentenceMatching", + "BertForMultipleChoice", + "BertForTokenClassification", + "BertForQuestionAnswering" ] from .base_model import BaseModel from .bert import BertForMultipleChoice, BertForQuestionAnswering, BertForSequenceClassification, \ - BertForTokenClassification + BertForTokenClassification, BertForSentenceMatching from .biaffine_parser import BiaffineParser, GraphParser from .cnn_text_classification import CNNText -from .sequence_labeling import SeqLabeling, AdvSeqLabel +from .sequence_labeling import SeqLabeling, AdvSeqLabel, BiLSTMCRF from .snli import ESIM from .star_transformer import StarTransEnc, STSeqCls, STNLICls, STSeqLabel + +import sys +from ..doc_utils import doc_process +doc_process(sys.modules[__name__]) \ No newline at end of file diff --git a/fastNLP/models/base_model.py b/fastNLP/models/base_model.py index 2646d580..f1896cb2 100644 --- a/fastNLP/models/base_model.py +++ b/fastNLP/models/base_model.py @@ -1,3 +1,7 @@ +"""undocumented""" + +__all__ = [] + import torch from ..modules.decoder.mlp import MLP @@ -18,6 +22,9 @@ class BaseModel(torch.nn.Module): class NaiveClassifier(BaseModel): + """ + 一个简单的分类器例子,可用于各种测试 + """ def __init__(self, in_feature_dim, out_feature_dim): super(NaiveClassifier, self).__init__() self.mlp = MLP([in_feature_dim, in_feature_dim, out_feature_dim]) diff --git a/fastNLP/models/bert.py b/fastNLP/models/bert.py index adecab60..4ee979d1 100644 --- a/fastNLP/models/bert.py +++ b/fastNLP/models/bert.py @@ -1,338 +1,259 @@ """ -bert.py is modified from huggingface/pytorch-pretrained-BERT, which is licensed under the Apache License 2.0. 
+fastNLP提供了BERT应用到五个下游任务的模型代码,可以直接调用。这五个任务分别为 + + - 文本分类任务: :class:`~fastNLP.models.BertForSequenceClassification` + - Matching任务: :class:`~fastNLP.models.BertForSentenceMatching` + - 多选任务: :class:`~fastNLP.models.BertForMultipleChoice` + - 序列标注任务: :class:`~fastNLP.models.BertForTokenClassification` + - 抽取式QA任务: :class:`~fastNLP.models.BertForQuestionAnswering` + +每一个模型必须要传入一个名字为 `embed` 的 :class:`fastNLP.embeddings.BertEmbedding` ,这个参数包含了 +:class:`fastNLP.modules.encoder.BertModel` ,是下游模型的编码器(encoder)。 + +除此以外,还需要传入一个数字,这个数字在不同下游任务模型上的意义如下:: + + 下游任务模型 参数名称 含义 + BertForSequenceClassification num_labels 文本分类类别数目,默认值为2 + BertForSentenceMatching num_labels Matching任务类别数目,默认值为2 + BertForMultipleChoice num_choices 多选任务选项数目,默认值为2 + BertForTokenClassification num_labels 序列标注标签数目,无默认值 + BertForQuestionAnswering num_labels 抽取式QA列数,默认值为2(即第一列为start_span, 第二列为end_span) + +最后还可以传入dropout的大小,默认值为0.1。 """ + +__all__ = [ + "BertForSequenceClassification", + "BertForSentenceMatching", + "BertForMultipleChoice", + "BertForTokenClassification", + "BertForQuestionAnswering" +] + +import warnings + import torch from torch import nn from .base_model import BaseModel +from ..core._logger import logger from ..core.const import Const -from ..modules.encoder import BertModel -from ..modules.encoder.bert import BertConfig +from ..embeddings import BertEmbedding class BertForSequenceClassification(BaseModel): - """BERT model for classification. - This module is composed of the BERT model with a linear layer on top of - the pooled output. - Params: - `config`: a BertConfig class instance with the configuration to build a new model. - `num_labels`: the number of classes for the classifier. Default = 2. - Inputs: - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] - with the word token indices in the vocabulary. Items in the batch should begin with the special "CLS" token. (see the tokens preprocessing logic in the scripts - `extract_features.py`, `run_classifier.py` and `run_squad.py`) - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token - types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to - a `sentence B` token (see BERT paper for more details). - `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices - selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max - input sequence length in the current batch. It's the mask that we typically use for attention when - a batch has varying length sentences. - `labels`: labels for the classification output: torch.LongTensor of shape [batch_size] - with indices selected in [0, ..., num_labels]. - Outputs: - if `labels` is not `None`: - Outputs the CrossEntropy classification loss of the output with the labels. - if `labels` is `None`: - Outputs the classification logits of shape [batch_size, num_labels]. 
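A construction sketch following the embed-plus-number pattern described in this docstring (the BertEmbedding arguments below are assumptions about its interface, and pretrained weights have to be available for the call to actually run):

```python
import torch
from fastNLP import Vocabulary
from fastNLP.core.const import Const
from fastNLP.embeddings import BertEmbedding
from fastNLP.models import BertForSequenceClassification

vocab = Vocabulary()
vocab.add_word_lst("this is just a tiny example .".split())
# model_dir_or_name / include_cls_sep follow the BertEmbedding interface; use a local
# checkpoint path if the weights cannot be downloaded automatically.
embed = BertEmbedding(vocab, model_dir_or_name='en-base-uncased', include_cls_sep=True)

model = BertForSequenceClassification(embed, num_labels=2, dropout=0.1)
words = torch.LongTensor([[vocab.to_index(w) for w in "this is just a tiny example .".split()]])
print(model(words)[Const.OUTPUT].shape)   # torch.Size([1, 2])
```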
- Example usage: - ```python - # Already been converted into WordPiece token ids - input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) - input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, - num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) - num_labels = 2 - model = BertForSequenceClassification(num_labels, config) - logits = model(input_ids, token_type_ids, input_mask) - ``` """ - def __init__(self, num_labels, config=None, bert_dir=None): + BERT model for classification. + + """ + def __init__(self, embed: BertEmbedding, num_labels: int=2, dropout=0.1): + """ + + :param fastNLP.embeddings.BertEmbedding embed: 下游模型的编码器(encoder). + :param int num_labels: 文本分类类别数目,默认值为2. + :param float dropout: dropout的大小,默认值为0.1. + """ super(BertForSequenceClassification, self).__init__() + self.num_labels = num_labels - if bert_dir is not None: - self.bert = BertModel.from_pretrained(bert_dir) - else: - if config is None: - config = BertConfig(30522) - self.bert = BertModel(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, num_labels) - - @classmethod - def from_pretrained(cls, num_labels, pretrained_model_dir): - config = BertConfig(pretrained_model_dir) - model = cls(num_labels=num_labels, config=config, bert_dir=pretrained_model_dir) - return model - - def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None): - _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) + self.bert = embed + self.dropout = nn.Dropout(p=dropout) + self.classifier = nn.Linear(self.bert.embedding_dim, num_labels) + + if not self.bert.model.include_cls_sep: + self.bert.model.include_cls_sep = True + warn_msg = "Bert for sequence classification excepts BertEmbedding `include_cls_sep` True, " \ + "but got False. FastNLP has changed it to True." + logger.warning(warn_msg) + warnings.warn(warn_msg) + + def forward(self, words): + """ + :param torch.LongTensor words: [batch_size, seq_len] + :return: { :attr:`fastNLP.Const.OUTPUT` : logits}: torch.Tensor [batch_size, num_labels] + """ + hidden = self.dropout(self.bert(words)) + cls_hidden = hidden[:, 0] + logits = self.classifier(cls_hidden) - if labels is not None: - loss_fct = nn.CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - return {Const.OUTPUT: logits, Const.LOSS: loss} - else: - return {Const.OUTPUT: logits} + return {Const.OUTPUT: logits} - def predict(self, input_ids, token_type_ids=None, attention_mask=None): - logits = self.forward(input_ids, token_type_ids, attention_mask) + def predict(self, words): + """ + :param torch.LongTensor words: [batch_size, seq_len] + :return: { :attr:`fastNLP.Const.OUTPUT` : logits}: torch.LongTensor [batch_size] + """ + logits = self.forward(words)[Const.OUTPUT] + return {Const.OUTPUT: torch.argmax(logits, dim=-1)} + + +class BertForSentenceMatching(BaseModel): + """ + BERT model for sentence matching. + + """ + def __init__(self, embed: BertEmbedding, num_labels: int=2, dropout=0.1): + """ + + :param fastNLP.embeddings.BertEmbedding embed: 下游模型的编码器(encoder). + :param int num_labels: Matching任务类别数目,默认值为2. + :param float dropout: dropout的大小,默认值为0.1. 
+ """ + super(BertForSentenceMatching, self).__init__() + self.num_labels = num_labels + self.bert = embed + self.dropout = nn.Dropout(p=dropout) + self.classifier = nn.Linear(self.bert.embedding_dim, num_labels) + + if not self.bert.model.include_cls_sep: + self.bert.model.include_cls_sep = True + warn_msg = "Bert for sentence matching excepts BertEmbedding `include_cls_sep` True, " \ + "but got False. FastNLP has changed it to True." + logger.warning(warn_msg) + warnings.warn(warn_msg) + + def forward(self, words): + """ + :param torch.LongTensor words: [batch_size, seq_len] + :return: { :attr:`fastNLP.Const.OUTPUT` : logits}: torch.Tensor [batch_size, num_labels] + """ + hidden = self.bert(words) + cls_hidden = self.dropout(hidden[:, 0]) + logits = self.classifier(cls_hidden) + + return {Const.OUTPUT: logits} + + def predict(self, words): + """ + :param torch.LongTensor words: [batch_size, seq_len] + :return: { :attr:`fastNLP.Const.OUTPUT` : logits}: torch.LongTensor [batch_size] + """ + logits = self.forward(words)[Const.OUTPUT] return {Const.OUTPUT: torch.argmax(logits, dim=-1)} class BertForMultipleChoice(BaseModel): - """BERT model for multiple choice tasks. - This module is composed of the BERT model with a linear layer on top of - the pooled output. - Params: - `config`: a BertConfig class instance with the configuration to build a new model. - `num_choices`: the number of classes for the classifier. Default = 2. - Inputs: - `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] - with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts - `extract_features.py`, `run_classifier.py` and `run_squad.py`) - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] - with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` - and type 1 corresponds to a `sentence B` token (see BERT paper for more details). - `attention_mask`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] with indices - selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max - input sequence length in the current batch. It's the mask that we typically use for attention when - a batch has varying length sentences. - `labels`: labels for the classification output: torch.LongTensor of shape [batch_size] - with indices selected in [0, ..., num_choices]. - Outputs: - if `labels` is not `None`: - Outputs the CrossEntropy classification loss of the output with the labels. - if `labels` is `None`: - Outputs the classification logits of shape [batch_size, num_labels]. - Example usage: - ```python - # Already been converted into WordPiece token ids - input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]], [[12, 16, 42], [14, 28, 57]]]) - input_mask = torch.LongTensor([[[1, 1, 1], [1, 1, 0]],[[1,1,0], [1, 0, 0]]]) - token_type_ids = torch.LongTensor([[[0, 0, 1], [0, 1, 0]],[[0, 1, 1], [0, 0, 1]]]) - config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, - num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) - num_choices = 2 - model = BertForMultipleChoice(num_choices, config, bert_dir) - logits = model(input_ids, token_type_ids, input_mask) - ``` """ - def __init__(self, num_choices, config=None, bert_dir=None): + BERT model for multiple choice. 
+ + """ + def __init__(self, embed: BertEmbedding, num_choices=2, dropout=0.1): + """ + + :param fastNLP.embeddings.BertEmbedding embed: 下游模型的编码器(encoder). + :param int num_choices: 多选任务选项数目,默认值为2. + :param float dropout: dropout的大小,默认值为0.1. + """ super(BertForMultipleChoice, self).__init__() + self.num_choices = num_choices - if bert_dir is not None: - self.bert = BertModel.from_pretrained(bert_dir) - else: - if config is None: - config = BertConfig(30522) - self.bert = BertModel(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, 1) - - @classmethod - def from_pretrained(cls, num_choices, pretrained_model_dir): - config = BertConfig(pretrained_model_dir) - model = cls(num_choices=num_choices, config=config, bert_dir=pretrained_model_dir) - return model - - def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None): - flat_input_ids = input_ids.view(-1, input_ids.size(-1)) - flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) - flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) - _, pooled_output = self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask, output_all_encoded_layers=False) - pooled_output = self.dropout(pooled_output) + self.bert = embed + self.dropout = nn.Dropout(p=dropout) + self.classifier = nn.Linear(self.bert.embedding_dim, 1) + + if not self.bert.model.include_cls_sep: + self.bert.model.include_cls_sep = True + warn_msg = "Bert for multiple choice excepts BertEmbedding `include_cls_sep` True, " \ + "but got False. FastNLP has changed it to True." + logger.warning(warn_msg) + warnings.warn(warn_msg) + + def forward(self, words): + """ + :param torch.LongTensor words: [batch_size, num_choices, seq_len] + :return: { :attr:`fastNLP.Const.OUTPUT` : logits}: torch.LongTensor [batch_size, num_choices] + """ + batch_size, num_choices, seq_len = words.size() + + input_ids = words.view(batch_size * num_choices, seq_len) + hidden = self.bert(input_ids) + pooled_output = self.dropout(hidden[:, 0]) logits = self.classifier(pooled_output) reshaped_logits = logits.view(-1, self.num_choices) - if labels is not None: - loss_fct = nn.CrossEntropyLoss() - loss = loss_fct(reshaped_logits, labels) - return {Const.OUTPUT: reshaped_logits, Const.LOSS: loss} - else: - return {Const.OUTPUT: reshaped_logits} + return {Const.OUTPUT: reshaped_logits} - def predict(self, input_ids, token_type_ids=None, attention_mask=None): - logits = self.forward(input_ids, token_type_ids, attention_mask)[Const.OUTPUT] + def predict(self, words): + """ + :param torch.LongTensor words: [batch_size, num_choices, seq_len] + :return: { :attr:`fastNLP.Const.OUTPUT` : logits}: torch.LongTensor [batch_size] + """ + logits = self.forward(words)[Const.OUTPUT] return {Const.OUTPUT: torch.argmax(logits, dim=-1)} class BertForTokenClassification(BaseModel): - """BERT model for token-level classification. - This module is composed of the BERT model with a linear layer on top of - the full hidden state of the last layer. - Params: - `config`: a BertConfig class instance with the configuration to build a new model. - `num_labels`: the number of classes for the classifier. Default = 2. 
- `bert_dir`: a dir which contains the bert parameters within file `pytorch_model.bin` - Inputs: - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] - with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts - `extract_features.py`, `run_classifier.py` and `run_squad.py`) - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token - types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to - a `sentence B` token (see BERT paper for more details). - `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices - selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max - input sequence length in the current batch. It's the mask that we typically use for attention when - a batch has varying length sentences. - `labels`: labels for the classification output: torch.LongTensor of shape [batch_size, sequence_length] - with indices selected in [0, ..., num_labels]. - Outputs: - if `labels` is not `None`: - Outputs the CrossEntropy classification loss of the output with the labels. - if `labels` is `None`: - Outputs the classification logits of shape [batch_size, sequence_length, num_labels]. - Example usage: - ```python - # Already been converted into WordPiece token ids - input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) - input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, - num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) - num_labels = 2 - bert_dir = 'your-bert-file-dir' - model = BertForTokenClassification(num_labels, config, bert_dir) - logits = model(input_ids, token_type_ids, input_mask) - ``` """ - def __init__(self, num_labels, config=None, bert_dir=None): + BERT model for token classification. + + """ + def __init__(self, embed: BertEmbedding, num_labels, dropout=0.1): + """ + + :param fastNLP.embeddings.BertEmbedding embed: 下游模型的编码器(encoder). + :param int num_labels: 序列标注标签数目,无默认值. + :param float dropout: dropout的大小,默认值为0.1. + """ super(BertForTokenClassification, self).__init__() + self.num_labels = num_labels - if bert_dir is not None: - self.bert = BertModel.from_pretrained(bert_dir) - else: - if config is None: - config = BertConfig(30522) - self.bert = BertModel(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, num_labels) - - @classmethod - def from_pretrained(cls, num_labels, pretrained_model_dir): - config = BertConfig(pretrained_model_dir) - model = cls(num_labels=num_labels, config=config, bert_dir=pretrained_model_dir) - return model - - def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None): - sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) + self.bert = embed + self.dropout = nn.Dropout(p=dropout) + self.classifier = nn.Linear(self.bert.embedding_dim, num_labels) + + if self.bert.model.include_cls_sep: + self.bert.model.include_cls_sep = False + warn_msg = "Bert for token classification excepts BertEmbedding `include_cls_sep` False, " \ + "but got True. FastNLP has changed it to False." 
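+            # token-level tasks need exactly one hidden vector per input token, so the embedding must not insert [CLS]/[SEP]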
+ logger.warning(warn_msg) + warnings.warn(warn_msg) + + def forward(self, words): + """ + :param torch.LongTensor words: [batch_size, seq_len] + :return: { :attr:`fastNLP.Const.OUTPUT` : logits}: torch.Tensor [batch_size, seq_len, num_labels] + """ + sequence_output = self.bert(words) # [batch_size, seq_len, embed_dim] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) - if labels is not None: - loss_fct = nn.CrossEntropyLoss() - # Only keep active parts of the loss - if attention_mask is not None: - active_loss = attention_mask.view(-1) == 1 - active_logits = logits.view(-1, self.num_labels)[active_loss] - active_labels = labels.view(-1)[active_loss] - loss = loss_fct(active_logits, active_labels) - else: - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - return {Const.OUTPUT: logits, Const.LOSS: loss} - else: - return {Const.OUTPUT: logits} - - def predict(self, input_ids, token_type_ids=None, attention_mask=None): - logits = self.forward(input_ids, token_type_ids, attention_mask)[Const.OUTPUT] + return {Const.OUTPUT: logits} + + def predict(self, words): + """ + :param torch.LongTensor words: [batch_size, seq_len] + :return: { :attr:`fastNLP.Const.OUTPUT` : logits}: torch.LongTensor [batch_size, seq_len] + """ + logits = self.forward(words)[Const.OUTPUT] return {Const.OUTPUT: torch.argmax(logits, dim=-1)} class BertForQuestionAnswering(BaseModel): - """BERT model for Question Answering (span extraction). - This module is composed of the BERT model with a linear layer on top of - the sequence output that computes start_logits and end_logits - Params: - `config`: a BertConfig class instance with the configuration to build a new model. - `bert_dir`: a dir which contains the bert parameters within file `pytorch_model.bin` - Inputs: - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] - with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts - `extract_features.py`, `run_classifier.py` and `run_squad.py`) - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token - types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to - a `sentence B` token (see BERT paper for more details). - `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices - selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max - input sequence length in the current batch. It's the mask that we typically use for attention when - a batch has varying length sentences. - `start_positions`: position of the first token for the labeled span: torch.LongTensor of shape [batch_size]. - Positions are clamped to the length of the sequence and position outside of the sequence are not taken - into account for computing the loss. - `end_positions`: position of the last token for the labeled span: torch.LongTensor of shape [batch_size]. - Positions are clamped to the length of the sequence and position outside of the sequence are not taken - into account for computing the loss. - Outputs: - if `start_positions` and `end_positions` are not `None`: - Outputs the total_loss which is the sum of the CrossEntropy loss for the start and end token positions. - if `start_positions` or `end_positions` is `None`: - Outputs a tuple of start_logits, end_logits which are the logits respectively for the start and end - position tokens of shape [batch_size, sequence_length]. 
- Example usage: - ```python - # Already been converted into WordPiece token ids - input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) - input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, - num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) - bert_dir = 'your-bert-file-dir' - model = BertForQuestionAnswering(config, bert_dir) - start_logits, end_logits = model(input_ids, token_type_ids, input_mask) - ``` """ - def __init__(self, config=None, bert_dir=None): + 用于做Q&A的Bert模型,如果是Squad2.0请将BertEmbedding的include_cls_sep设置为True,Squad1.0或CMRC则设置为False + + """ + def __init__(self, embed: BertEmbedding): + """ + + :param fastNLP.embeddings.BertEmbedding embed: 下游模型的编码器(encoder). + :param int num_labels: 抽取式QA列数,默认值为2(即第一列为start_span, 第二列为end_span). + """ super(BertForQuestionAnswering, self).__init__() - if bert_dir is not None: - self.bert = BertModel.from_pretrained(bert_dir) - else: - if config is None: - config = BertConfig(30522) - self.bert = BertModel(config) - # TODO check with Google if it's normal there is no dropout on the token classifier of SQuAD in the TF version - # self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.qa_outputs = nn.Linear(config.hidden_size, 2) - - @classmethod - def from_pretrained(cls, pretrained_model_dir): - config = BertConfig(pretrained_model_dir) - model = cls(config=config, bert_dir=pretrained_model_dir) - return model - - def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None, end_positions=None): - sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) - logits = self.qa_outputs(sequence_output) - start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) - end_logits = end_logits.squeeze(-1) - - if start_positions is not None and end_positions is not None: - # If we are on multi-GPU, split add a dimension - if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1) - if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1) - # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = start_logits.size(1) - start_positions.clamp_(0, ignored_index) - end_positions.clamp_(0, ignored_index) - - loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 - return {Const.OUTPUTS(0): start_logits, Const.OUTPUTS(1): end_logits, Const.LOSS: total_loss} - else: - return {Const.OUTPUTS(0): start_logits, Const.OUTPUTS(1): end_logits} - - def predict(self, input_ids, token_type_ids=None, attention_mask=None, **kwargs): - logits = self.forward(input_ids, token_type_ids, attention_mask) - start_logits = logits[Const.OUTPUTS(0)] - end_logits = logits[Const.OUTPUTS(1)] - return {Const.OUTPUTS(0): torch.argmax(start_logits, dim=-1), - Const.OUTPUTS(1): torch.argmax(end_logits, dim=-1)} + + self.bert = embed + self.qa_outputs = nn.Linear(self.bert.embedding_dim, 2) + + def forward(self, words): + """ + :param torch.LongTensor words: [batch_size, seq_len] + :return: 一个包含num_labels个logit的dict,每一个logit的形状都是[batch_size, seq_len + 2] + """ + sequence_output = self.bert(words) + logits = self.qa_outputs(sequence_output) # [batch_size, seq_len, num_labels] 
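+        # column 0 of the last dimension scores each position as the answer start, column 1 as the answer end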
+ + return {'pred_start': logits[:, :, 0], 'pred_end': logits[:, :, 1]} + + def predict(self, words): + return self.forward(words) diff --git a/fastNLP/models/biaffine_parser.py b/fastNLP/models/biaffine_parser.py index 29487864..45f8adb7 100644 --- a/fastNLP/models/biaffine_parser.py +++ b/fastNLP/models/biaffine_parser.py @@ -6,23 +6,23 @@ __all__ = [ "GraphParser" ] +from collections import defaultdict + import numpy as np import torch import torch.nn as nn import torch.nn.functional as F -from collections import defaultdict - +from .base_model import BaseModel from ..core.const import Const as C from ..core.losses import LossFunc from ..core.metrics import MetricBase +from ..core.utils import seq_len_to_mask +from ..embeddings.utils import get_embeddings from ..modules.dropout import TimestepDropout from ..modules.encoder.transformer import TransformerEncoder from ..modules.encoder.variational_rnn import VarLSTM from ..modules.utils import initial_parameter -from ..embeddings.utils import get_embeddings -from .base_model import BaseModel -from ..core.utils import seq_len_to_mask def _mst(scores): @@ -130,8 +130,6 @@ def _find_cycle(vertices, edges): class GraphParser(BaseModel): """ - 别名::class:`fastNLP.models.GraphParser` :class:`fastNLP.models.baffine_parser.GraphParser` - 基于图的parser base class, 支持贪婪解码和最大生成树解码 """ @@ -150,7 +148,7 @@ class GraphParser(BaseModel): """ _, seq_len, _ = arc_matrix.shape matrix = arc_matrix + torch.diag(arc_matrix.new(seq_len).fill_(-np.inf)) - flip_mask = (mask == 0).byte() + flip_mask = mask.eq(0) matrix.masked_fill_(flip_mask.unsqueeze(1), -np.inf) _, heads = torch.max(matrix, dim=2) if mask is not None: @@ -183,11 +181,14 @@ class ArcBiaffine(nn.Module): """ Biaffine Dependency Parser 的子模块, 用于构建预测边的图 - :param hidden_size: 输入的特征维度 - :param bias: 是否使用bias. Default: ``True`` """ def __init__(self, hidden_size, bias=True): + """ + + :param hidden_size: 输入的特征维度 + :param bias: 是否使用bias. Default: ``True`` + """ super(ArcBiaffine, self).__init__() self.U = nn.Parameter(torch.Tensor(hidden_size, hidden_size), requires_grad=True) self.has_bias = bias @@ -207,7 +208,7 @@ class ArcBiaffine(nn.Module): output = dep.matmul(self.U) output = output.bmm(head.transpose(-1, -2)) if self.has_bias: - output += head.matmul(self.bias).unsqueeze(1) + output = output + head.matmul(self.bias).unsqueeze(1) return output @@ -215,13 +216,16 @@ class LabelBilinear(nn.Module): """ Biaffine Dependency Parser 的子模块, 用于构建预测边类别的图 - :param in1_features: 输入的特征1维度 - :param in2_features: 输入的特征2维度 - :param num_label: 边类别的个数 - :param bias: 是否使用bias. Default: ``True`` """ def __init__(self, in1_features, in2_features, num_label, bias=True): + """ + + :param in1_features: 输入的特征1维度 + :param in2_features: 输入的特征2维度 + :param num_label: 边类别的个数 + :param bias: 是否使用bias. Default: ``True`` + """ super(LabelBilinear, self).__init__() self.bilinear = nn.Bilinear(in1_features, in2_features, num_label, bias=bias) self.lin = nn.Linear(in1_features + in2_features, num_label, bias=False) @@ -234,35 +238,19 @@ class LabelBilinear(nn.Module): :return output: [batch, seq_len, num_cls] 每个元素对应类别的概率图 """ output = self.bilinear(x1, x2) - output += self.lin(torch.cat([x1, x2], dim=2)) + output = output + self.lin(torch.cat([x1, x2], dim=2)) return output class BiaffineParser(GraphParser): """ - 别名::class:`fastNLP.models.BiaffineParser` :class:`fastNLP.models.baffine_parser.BiaffineParser` - Biaffine Dependency Parser 实现. 论文参考 `Deep Biaffine Attention for Neural Dependency Parsing (Dozat and Manning, 2016) `_ . 
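For the reworked `BertForQuestionAnswering` at the top of this hunk, a usage sketch may help: the model now wraps a `BertEmbedding` rather than loading BERT weights itself, its `forward` takes a single `words` tensor, and it returns `pred_start`/`pred_end` logits instead of `Const.OUTPUTS(i)` keys. This is illustration only, not part of the patch; the vocabulary, sentence, and `model_dir_or_name` value are placeholders, and the `BertEmbedding` constructor is assumed to keep its usual fastNLP signature.

```python
import torch
from fastNLP import Vocabulary
from fastNLP.embeddings import BertEmbedding
from fastNLP.models import BertForQuestionAnswering

vocab = Vocabulary()
vocab.add_word_lst("where was fastNLP developed ?".split())

# include_cls_sep=True for SQuAD 2.0-style data, False for SQuAD 1.0 / CMRC (see the docstring above)
embed = BertEmbedding(vocab, model_dir_or_name='en-base-uncased', include_cls_sep=True)
model = BertForQuestionAnswering(embed)

words = torch.LongTensor([[vocab.to_index(w) for w in "where was fastNLP developed ?".split()]])
out = model(words)
print(out['pred_start'].shape, out['pred_end'].shape)  # each [batch_size, seq_len + 2] per the new docstring
```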
- :param init_embed: 单词词典, 可以是 tuple, 包括(num_embedings, embedding_dim), 即 - embedding的大小和每个词的维度. 也可以传入 nn.Embedding 对象, - 此时就以传入的对象作为embedding - :param pos_vocab_size: part-of-speech 词典大小 - :param pos_emb_dim: part-of-speech 向量维度 - :param num_label: 边的类别个数 - :param rnn_layers: rnn encoder的层数 - :param rnn_hidden_size: rnn encoder 的隐状态维度 - :param arc_mlp_size: 边预测的MLP维度 - :param label_mlp_size: 类别预测的MLP维度 - :param dropout: dropout概率. - :param encoder: encoder类别, 可选 ('lstm', 'var-lstm', 'transformer'). Default: lstm - :param use_greedy_infer: 是否在inference时使用贪心算法. - 若 ``False`` , 使用更加精确但相对缓慢的MST算法. Default: ``False`` """ def __init__(self, - init_embed, + embed, pos_vocab_size, pos_emb_dim, num_label, @@ -273,10 +261,27 @@ class BiaffineParser(GraphParser): dropout=0.3, encoder='lstm', use_greedy_infer=False): + """ + + :param embed: 单词词典, 可以是 tuple, 包括(num_embedings, embedding_dim), 即 + embedding的大小和每个词的维度. 也可以传入 nn.Embedding 对象, + 此时就以传入的对象作为embedding + :param pos_vocab_size: part-of-speech 词典大小 + :param pos_emb_dim: part-of-speech 向量维度 + :param num_label: 边的类别个数 + :param rnn_layers: rnn encoder的层数 + :param rnn_hidden_size: rnn encoder 的隐状态维度 + :param arc_mlp_size: 边预测的MLP维度 + :param label_mlp_size: 类别预测的MLP维度 + :param dropout: dropout概率. + :param encoder: encoder类别, 可选 ('lstm', 'var-lstm', 'transformer'). Default: lstm + :param use_greedy_infer: 是否在inference时使用贪心算法. + 若 ``False`` , 使用更加精确但相对缓慢的MST算法. Default: ``False`` + """ super(BiaffineParser, self).__init__() rnn_out_size = 2 * rnn_hidden_size word_hid_dim = pos_hid_dim = rnn_hidden_size - self.word_embedding = get_embeddings(init_embed) + self.word_embedding = get_embeddings(embed) word_emb_dim = self.word_embedding.embedding_dim self.pos_embedding = nn.Embedding(num_embeddings=pos_vocab_size, embedding_dim=pos_emb_dim) self.word_fc = nn.Linear(word_emb_dim, word_hid_dim) @@ -363,7 +368,7 @@ class BiaffineParser(GraphParser): # print('forward {} {}'.format(batch_size, seq_len)) # get sequence mask - mask = seq_len_to_mask(seq_len).long() + mask = seq_len_to_mask(seq_len, max_len=length).long() word = self.word_embedding(words1) # [N,L] -> [N,L,C_0] pos = self.pos_embedding(words2) # [N,L] -> [N,L,C_1] @@ -435,10 +440,10 @@ class BiaffineParser(GraphParser): """ batch_size, length, _ = pred1.shape - mask = seq_len_to_mask(seq_len) + mask = seq_len_to_mask(seq_len, max_len=length) flip_mask = (mask == 0) _arc_pred = pred1.clone() - _arc_pred.masked_fill_(flip_mask.unsqueeze(1), -float('inf')) + _arc_pred = _arc_pred.masked_fill(flip_mask.unsqueeze(1), -float('inf')) arc_logits = F.log_softmax(_arc_pred, dim=2) label_logits = F.log_softmax(pred2, dim=2) batch_index = torch.arange(batch_size, device=arc_logits.device, dtype=torch.long).unsqueeze(1) @@ -446,9 +451,8 @@ class BiaffineParser(GraphParser): arc_loss = arc_logits[batch_index, child_index, target1] label_loss = label_logits[batch_index, child_index, target2] - byte_mask = flip_mask.byte() - arc_loss.masked_fill_(byte_mask, 0) - label_loss.masked_fill_(byte_mask, 0) + arc_loss = arc_loss.masked_fill(flip_mask, 0) + label_loss = label_loss.masked_fill(flip_mask, 0) arc_nll = -arc_loss.mean() label_nll = -label_loss.mean() return arc_nll + label_nll @@ -476,21 +480,22 @@ class BiaffineParser(GraphParser): class ParserLoss(LossFunc): """ - 别名::class:`fastNLP.models.ParserLoss` :class:`fastNLP.models.baffine_parser.ParserLoss` - 计算parser的loss - :param pred1: [batch_size, seq_len, seq_len] 边预测logits - :param pred2: [batch_size, seq_len, num_label] label预测logits - :param target1: 
[batch_size, seq_len] 真实边的标注 - :param target2: [batch_size, seq_len] 真实类别的标注 - :param seq_len: [batch_size, seq_len] 真实目标的长度 - :return loss: scalar """ def __init__(self, pred1=None, pred2=None, target1=None, target2=None, seq_len=None): + """ + + :param pred1: [batch_size, seq_len, seq_len] 边预测logits + :param pred2: [batch_size, seq_len, num_label] label预测logits + :param target1: [batch_size, seq_len] 真实边的标注 + :param target2: [batch_size, seq_len] 真实类别的标注 + :param seq_len: [batch_size, seq_len] 真实目标的长度 + :return loss: scalar + """ super(ParserLoss, self).__init__(BiaffineParser.loss, pred1=pred1, pred2=pred2, @@ -501,24 +506,24 @@ class ParserLoss(LossFunc): class ParserMetric(MetricBase): """ - 别名::class:`fastNLP.models.ParserMetric` :class:`fastNLP.models.baffine_parser.ParserMetric` - 评估parser的性能 - :param pred1: 边预测logits - :param pred2: label预测logits - :param target1: 真实边的标注 - :param target2: 真实类别的标注 - :param seq_len: 序列长度 - :return dict: 评估结果:: - - UAS: 不带label时, 边预测的准确率 - LAS: 同时预测边和label的准确率 """ def __init__(self, pred1=None, pred2=None, target1=None, target2=None, seq_len=None): + """ + :param pred1: 边预测logits + :param pred2: label预测logits + :param target1: 真实边的标注 + :param target2: 真实类别的标注 + :param seq_len: 序列长度 + :return dict: 评估结果:: + + UAS: 不带label时, 边预测的准确率 + LAS: 同时预测边和label的准确率 + """ super().__init__() self._init_param_map(pred1=pred1, pred2=pred2, target1=target1, target2=target2, diff --git a/fastNLP/models/cnn_text_classification.py b/fastNLP/models/cnn_text_classification.py index e00a0697..863c4941 100644 --- a/fastNLP/models/cnn_text_classification.py +++ b/fastNLP/models/cnn_text_classification.py @@ -1,3 +1,8 @@ +""" +.. todo:: + doc +""" + __all__ = [ "CNNText" ] @@ -7,34 +12,35 @@ import torch.nn as nn from ..core.const import Const as C from ..core.utils import seq_len_to_mask -from ..modules import encoder from ..embeddings import embedding +from ..modules import encoder class CNNText(torch.nn.Module): """ - 别名::class:`fastNLP.models.CNNText` :class:`fastNLP.models.cnn_text_classification.CNNText` - 使用CNN进行文本分类的模型 'Yoon Kim. 2014. Convolution Neural Networks for Sentence Classification.' 
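The masking edits in `biaffine_parser.py` above follow a pattern repeated throughout this patch: `seq_len_to_mask` now receives an explicit `max_len` so the mask always matches the padded logits, `mask.eq(0)` replaces `.byte()` comparisons, and in-place `masked_fill_` becomes out-of-place `masked_fill`, leaving the original tensor and its autograd history untouched. A minimal sketch with invented shapes:

```python
import torch
import torch.nn.functional as F
from fastNLP.core.utils import seq_len_to_mask

seq_len = torch.LongTensor([3, 5])
arc_pred = torch.randn(2, 6, 6)  # padded arc scores, batch=2, max_len=6

mask = seq_len_to_mask(seq_len, max_len=arc_pred.size(1))  # always [2, 6], even if no sample reaches length 6
flip_mask = mask.eq(0)                                     # padding positions, no .byte() cast needed

# out-of-place: arc_pred itself is left unchanged, unlike masked_fill_
masked = arc_pred.masked_fill(flip_mask.unsqueeze(1), -float('inf'))
arc_logits = F.log_softmax(masked, dim=2)
```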
- :param tuple(int,int),torch.FloatTensor,nn.Embedding,numpy.ndarray init_embed: Embedding的大小(传入tuple(int, int), - 第一个int为vocab_zie, 第二个int为embed_dim); 如果为Tensor, Embedding, ndarray等则直接使用该值初始化Embedding - :param int num_classes: 一共有多少类 - :param int,tuple(int) out_channels: 输出channel的数量。如果为list,则需要与kernel_sizes的数量保持一致 - :param int,tuple(int) kernel_sizes: 输出channel的kernel大小。 - :param float dropout: Dropout的大小 """ - def __init__(self, init_embed, + def __init__(self, embed, num_classes, kernel_nums=(30, 40, 50), kernel_sizes=(1, 3, 5), dropout=0.5): + """ + + :param tuple(int,int),torch.FloatTensor,nn.Embedding,numpy.ndarray embed: Embedding的大小(传入tuple(int, int), + 第一个int为vocab_zie, 第二个int为embed_dim); 如果为Tensor, Embedding, ndarray等则直接使用该值初始化Embedding + :param int num_classes: 一共有多少类 + :param int,tuple(int) out_channels: 输出channel的数量。如果为list,则需要与kernel_sizes的数量保持一致 + :param int,tuple(int) kernel_sizes: 输出channel的kernel大小。 + :param float dropout: Dropout的大小 + """ super(CNNText, self).__init__() # no support for pre-trained embedding currently - self.embed = embedding.Embedding(init_embed) + self.embed = embedding.Embedding(embed) self.conv_pool = encoder.ConvMaxpool( in_channels=self.embed.embedding_dim, out_channels=kernel_nums, diff --git a/fastNLP/models/enas_controller.py b/fastNLP/models/enas_controller.py deleted file mode 100644 index e83c6b51..00000000 --- a/fastNLP/models/enas_controller.py +++ /dev/null @@ -1,223 +0,0 @@ -# Code Modified from https://github.com/carpedm20/ENAS-pytorch -"""A module with NAS controller-related code.""" -import collections -import os - -import torch -import torch.nn.functional as F - -from . import enas_utils as utils -from .enas_utils import Node - - -def _construct_dags(prev_nodes, activations, func_names, num_blocks): - """Constructs a set of DAGs based on the actions, i.e., previous nodes and - activation functions, sampled from the controller/policy pi. - - Args: - prev_nodes: Previous node actions from the policy. - activations: Activations sampled from the policy. - func_names: Mapping from activation function names to functions. - num_blocks: Number of blocks in the target RNN cell. - - Returns: - A list of DAGs defined by the inputs. - - RNN cell DAGs are represented in the following way: - - 1. Each element (node) in a DAG is a list of `Node`s. - - 2. The `Node`s in the list dag[i] correspond to the subsequent nodes - that take the output from node i as their own input. - - 3. dag[-1] is the node that takes input from x^{(t)} and h^{(t - 1)}. - dag[-1] always feeds dag[0]. - dag[-1] acts as if `w_xc`, `w_hc`, `w_xh` and `w_hh` are its - weights. - - 4. dag[N - 1] is the node that produces the hidden state passed to - the next timestep. dag[N - 1] is also always a leaf node, and therefore - is always averaged with the other leaf nodes and fed to the output - decoder. - """ - dags = [] - for nodes, func_ids in zip(prev_nodes, activations): - dag = collections.defaultdict(list) - - # add first node - dag[-1] = [Node(0, func_names[func_ids[0]])] - dag[-2] = [Node(0, func_names[func_ids[0]])] - - # add following nodes - for jdx, (idx, func_id) in enumerate(zip(nodes, func_ids[1:])): - dag[utils.to_item(idx)].append(Node(jdx + 1, func_names[func_id])) - - leaf_nodes = set(range(num_blocks)) - dag.keys() - - # merge with avg - for idx in leaf_nodes: - dag[idx] = [Node(num_blocks, 'avg')] - - # This is actually y^{(t)}. h^{(t)} is node N - 1 in - # the graph, where N Is the number of nodes. I.e., h^{(t)} takes - # only one other node as its input. 
- # last h[t] node - last_node = Node(num_blocks + 1, 'h[t]') - dag[num_blocks] = [last_node] - dags.append(dag) - - return dags - - -class Controller(torch.nn.Module): - """Based on - https://github.com/pytorch/examples/blob/master/word_language_model/model.py - - RL controllers do not necessarily have much to do with - language models. - - Base the controller RNN on the GRU from: - https://github.com/ikostrikov/pytorch-a2c-ppo-acktr/blob/master/model.py - """ - def __init__(self, num_blocks=4, controller_hid=100, cuda=False): - torch.nn.Module.__init__(self) - - # `num_tokens` here is just the activation function - # for every even step, - self.shared_rnn_activations = ['tanh', 'ReLU', 'identity', 'sigmoid'] - self.num_tokens = [len(self.shared_rnn_activations)] - self.controller_hid = controller_hid - self.use_cuda = cuda - self.num_blocks = num_blocks - for idx in range(num_blocks): - self.num_tokens += [idx + 1, len(self.shared_rnn_activations)] - self.func_names = self.shared_rnn_activations - - num_total_tokens = sum(self.num_tokens) - - self.encoder = torch.nn.Embedding(num_total_tokens, - controller_hid) - self.lstm = torch.nn.LSTMCell(controller_hid, controller_hid) - - # Perhaps these weights in the decoder should be - # shared? At least for the activation functions, which all have the - # same size. - self.decoders = [] - for idx, size in enumerate(self.num_tokens): - decoder = torch.nn.Linear(controller_hid, size) - self.decoders.append(decoder) - - self._decoders = torch.nn.ModuleList(self.decoders) - - self.reset_parameters() - self.static_init_hidden = utils.keydefaultdict(self.init_hidden) - - def _get_default_hidden(key): - return utils.get_variable( - torch.zeros(key, self.controller_hid), - self.use_cuda, - requires_grad=False) - - self.static_inputs = utils.keydefaultdict(_get_default_hidden) - - def reset_parameters(self): - init_range = 0.1 - for param in self.parameters(): - param.data.uniform_(-init_range, init_range) - for decoder in self.decoders: - decoder.bias.data.fill_(0) - - def forward(self, # pylint:disable=arguments-differ - inputs, - hidden, - block_idx, - is_embed): - if not is_embed: - embed = self.encoder(inputs) - else: - embed = inputs - - hx, cx = self.lstm(embed, hidden) - logits = self.decoders[block_idx](hx) - - logits /= 5.0 - - # # exploration - # if self.args.mode == 'train': - # logits = (2.5 * F.tanh(logits)) - - return logits, (hx, cx) - - def sample(self, batch_size=1, with_details=False, save_dir=None): - """Samples a set of `args.num_blocks` many computational nodes from the - controller, where each node is made up of an activation function, and - each node except the last also includes a previous node. - """ - if batch_size < 1: - raise Exception(f'Wrong batch_size: {batch_size} < 1') - - # [B, L, H] - inputs = self.static_inputs[batch_size] - hidden = self.static_init_hidden[batch_size] - - activations = [] - entropies = [] - log_probs = [] - prev_nodes = [] - # The RNN controller alternately outputs an activation, - # followed by a previous node, for each block except the last one, - # which only gets an activation function. The last node is the output - # node, and its previous node is the average of all leaf nodes. - for block_idx in range(2*(self.num_blocks - 1) + 1): - logits, hidden = self.forward(inputs, - hidden, - block_idx, - is_embed=(block_idx == 0)) - - probs = F.softmax(logits, dim=-1) - log_prob = F.log_softmax(logits, dim=-1) - # .mean() for entropy? 
- entropy = -(log_prob * probs).sum(1, keepdim=False) - - action = probs.multinomial(num_samples=1).data - selected_log_prob = log_prob.gather( - 1, utils.get_variable(action, requires_grad=False)) - - # why the [:, 0] here? Should it be .squeeze(), or - # .view()? Same below with `action`. - entropies.append(entropy) - log_probs.append(selected_log_prob[:, 0]) - - # 0: function, 1: previous node - mode = block_idx % 2 - inputs = utils.get_variable( - action[:, 0] + sum(self.num_tokens[:mode]), - requires_grad=False) - - if mode == 0: - activations.append(action[:, 0]) - elif mode == 1: - prev_nodes.append(action[:, 0]) - - prev_nodes = torch.stack(prev_nodes).transpose(0, 1) - activations = torch.stack(activations).transpose(0, 1) - - dags = _construct_dags(prev_nodes, - activations, - self.func_names, - self.num_blocks) - - if save_dir is not None: - for idx, dag in enumerate(dags): - utils.draw_network(dag, - os.path.join(save_dir, f'graph{idx}.png')) - - if with_details: - return dags, torch.cat(log_probs), torch.cat(entropies) - - return dags - - def init_hidden(self, batch_size): - zeros = torch.zeros(batch_size, self.controller_hid) - return (utils.get_variable(zeros, self.use_cuda, requires_grad=False), - utils.get_variable(zeros.clone(), self.use_cuda, requires_grad=False)) diff --git a/fastNLP/models/enas_model.py b/fastNLP/models/enas_model.py deleted file mode 100644 index b6b683c0..00000000 --- a/fastNLP/models/enas_model.py +++ /dev/null @@ -1,390 +0,0 @@ -""" -Module containing the shared RNN model. -Code Modified from https://github.com/carpedm20/ENAS-pytorch -""" -import collections - -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch.autograd import Variable - -from . import enas_utils as utils -from .base_model import BaseModel - - -def _get_dropped_weights(w_raw, dropout_p, is_training): - """Drops out weights to implement DropConnect. - - Args: - w_raw: Full, pre-dropout, weights to be dropped out. - dropout_p: Proportion of weights to drop out. - is_training: True iff _shared_ model is training. - - Returns: - The dropped weights. - - Why does torch.nn.functional.dropout() return: - 1. `torch.autograd.Variable()` on the training loop - 2. `torch.nn.Parameter()` on the controller or eval loop, when - training = False... - - Even though the call to `_setweights` in the Smerity repo's - `weight_drop.py` does not have this behaviour, and `F.dropout` always - returns `torch.autograd.Variable` there, even when `training=False`? - - The above TODO is the reason for the hacky check for `torch.nn.Parameter`. - """ - dropped_w = F.dropout(w_raw, p=dropout_p, training=is_training) - - if isinstance(dropped_w, torch.nn.Parameter): - dropped_w = dropped_w.clone() - - return dropped_w - - -class EmbeddingDropout(torch.nn.Embedding): - """Class for dropping out embeddings by zero'ing out parameters in the - embedding matrix. - - This is equivalent to dropping out particular words, e.g., in the sentence - 'the quick brown fox jumps over the lazy dog', dropping out 'the' would - lead to the sentence '### quick brown fox jumps over ### lazy dog' (in the - embedding vector space). - - See 'A Theoretically Grounded Application of Dropout in Recurrent Neural - Networks', (Gal and Ghahramani, 2016). - """ - - def __init__(self, - num_embeddings, - embedding_dim, - max_norm=None, - norm_type=2, - scale_grad_by_freq=False, - sparse=False, - dropout=0.1, - scale=None): - """Embedding constructor. - - Args: - dropout: Dropout probability. 
- scale: Used to scale parameters of embedding weight matrix that are - not dropped out. Note that this is _in addition_ to the - `1/(1 - dropout)` scaling. - - See `torch.nn.Embedding` for remaining arguments. - """ - torch.nn.Embedding.__init__(self, - num_embeddings=num_embeddings, - embedding_dim=embedding_dim, - max_norm=max_norm, - norm_type=norm_type, - scale_grad_by_freq=scale_grad_by_freq, - sparse=sparse) - self.dropout = dropout - assert (dropout >= 0.0) and (dropout < 1.0), ('Dropout must be >= 0.0 ' - 'and < 1.0') - self.scale = scale - - def forward(self, inputs): # pylint:disable=arguments-differ - """Embeds `inputs` with the dropped out embedding weight matrix.""" - if self.training: - dropout = self.dropout - else: - dropout = 0 - - if dropout: - mask = self.weight.data.new(self.weight.size(0), 1) - mask.bernoulli_(1 - dropout) - mask = mask.expand_as(self.weight) - mask = mask / (1 - dropout) - masked_weight = self.weight * Variable(mask) - else: - masked_weight = self.weight - if self.scale and self.scale != 1: - masked_weight = masked_weight * self.scale - - return F.embedding(inputs, - masked_weight, - max_norm=self.max_norm, - norm_type=self.norm_type, - scale_grad_by_freq=self.scale_grad_by_freq, - sparse=self.sparse) - - -class LockedDropout(nn.Module): - # code from https://github.com/salesforce/awd-lstm-lm/blob/master/locked_dropout.py - def __init__(self): - super().__init__() - - def forward(self, x, dropout=0.5): - if not self.training or not dropout: - return x - m = x.data.new(1, x.size(1), x.size(2)).bernoulli_(1 - dropout) - mask = Variable(m, requires_grad=False) / (1 - dropout) - mask = mask.expand_as(x) - return mask * x - - -class ENASModel(BaseModel): - """Shared RNN model.""" - - def __init__(self, embed_num, num_classes, num_blocks=4, cuda=False, shared_hid=1000, shared_embed=1000): - super(ENASModel, self).__init__() - - self.use_cuda = cuda - - self.shared_hid = shared_hid - self.num_blocks = num_blocks - self.decoder = nn.Linear(self.shared_hid, num_classes) - self.encoder = EmbeddingDropout(embed_num, - shared_embed, - dropout=0.1) - self.lockdrop = LockedDropout() - self.dag = None - - # Tie weights - # self.decoder.weight = self.encoder.weight - - # Since W^{x, c} and W^{h, c} are always summed, there - # is no point duplicating their bias offset parameter. Likewise for - # W^{x, h} and W^{h, h}. - self.w_xc = nn.Linear(shared_embed, self.shared_hid) - self.w_xh = nn.Linear(shared_embed, self.shared_hid) - - # The raw weights are stored here because the hidden-to-hidden weights - # are weight dropped on the forward pass. 
- self.w_hc_raw = torch.nn.Parameter( - torch.Tensor(self.shared_hid, self.shared_hid)) - self.w_hh_raw = torch.nn.Parameter( - torch.Tensor(self.shared_hid, self.shared_hid)) - self.w_hc = None - self.w_hh = None - - self.w_h = collections.defaultdict(dict) - self.w_c = collections.defaultdict(dict) - - for idx in range(self.num_blocks): - for jdx in range(idx + 1, self.num_blocks): - self.w_h[idx][jdx] = nn.Linear(self.shared_hid, - self.shared_hid, - bias=False) - self.w_c[idx][jdx] = nn.Linear(self.shared_hid, - self.shared_hid, - bias=False) - - self._w_h = nn.ModuleList([self.w_h[idx][jdx] - for idx in self.w_h - for jdx in self.w_h[idx]]) - self._w_c = nn.ModuleList([self.w_c[idx][jdx] - for idx in self.w_c - for jdx in self.w_c[idx]]) - - self.batch_norm = None - # if args.mode == 'train': - # self.batch_norm = nn.BatchNorm1d(self.shared_hid) - # else: - # self.batch_norm = None - - self.reset_parameters() - self.static_init_hidden = utils.keydefaultdict(self.init_hidden) - - def setDAG(self, dag): - if self.dag is None: - self.dag = dag - - def forward(self, word_seq, hidden=None): - inputs = torch.transpose(word_seq, 0, 1) - - time_steps = inputs.size(0) - batch_size = inputs.size(1) - - self.w_hh = _get_dropped_weights(self.w_hh_raw, - 0.5, - self.training) - self.w_hc = _get_dropped_weights(self.w_hc_raw, - 0.5, - self.training) - - # hidden = self.static_init_hidden[batch_size] if hidden is None else hidden - hidden = self.static_init_hidden[batch_size] - - embed = self.encoder(inputs) - - embed = self.lockdrop(embed, 0.65 if self.training else 0) - - # The norm of hidden states are clipped here because - # otherwise ENAS is especially prone to exploding activations on the - # forward pass. This could probably be fixed in a more elegant way, but - # it might be exposing a weakness in the ENAS algorithm as currently - # proposed. - # - # For more details, see - # https://github.com/carpedm20/ENAS-pytorch/issues/6 - clipped_num = 0 - max_clipped_norm = 0 - h1tohT = [] - logits = [] - for step in range(time_steps): - x_t = embed[step] - logit, hidden = self.cell(x_t, hidden, self.dag) - - hidden_norms = hidden.norm(dim=-1) - max_norm = 25.0 - if hidden_norms.data.max() > max_norm: - # Just directly use the torch slice operations - # in PyTorch v0.4. - # - # This workaround for PyTorch v0.3.1 does everything in numpy, - # because the PyTorch slicing and slice assignment is too - # flaky. 
- hidden_norms = hidden_norms.data.cpu().numpy() - - clipped_num += 1 - if hidden_norms.max() > max_clipped_norm: - max_clipped_norm = hidden_norms.max() - - clip_select = hidden_norms > max_norm - clip_norms = hidden_norms[clip_select] - - mask = np.ones(hidden.size()) - normalizer = max_norm / clip_norms - normalizer = normalizer[:, np.newaxis] - - mask[clip_select] = normalizer - - if self.use_cuda: - hidden *= torch.autograd.Variable( - torch.FloatTensor(mask).cuda(), requires_grad=False) - else: - hidden *= torch.autograd.Variable( - torch.FloatTensor(mask), requires_grad=False) - logits.append(logit) - h1tohT.append(hidden) - - h1tohT = torch.stack(h1tohT) - output = torch.stack(logits) - raw_output = output - - output = self.lockdrop(output, 0.4 if self.training else 0) - - # Pooling - output = torch.mean(output, 0) - - decoded = self.decoder(output) - - extra_out = {'dropped': decoded, - 'hiddens': h1tohT, - 'raw': raw_output} - return {'pred': decoded, 'hidden': hidden, 'extra_out': extra_out} - - def cell(self, x, h_prev, dag): - """Computes a single pass through the discovered RNN cell.""" - c = {} - h = {} - f = {} - - f[0] = self.get_f(dag[-1][0].name) - c[0] = torch.sigmoid(self.w_xc(x) + F.linear(h_prev, self.w_hc, None)) - h[0] = (c[0] * f[0](self.w_xh(x) + F.linear(h_prev, self.w_hh, None)) + - (1 - c[0]) * h_prev) - - leaf_node_ids = [] - q = collections.deque() - q.append(0) - - # Computes connections from the parent nodes `node_id` - # to their child nodes `next_id` recursively, skipping leaf nodes. A - # leaf node is a node whose id == `self.num_blocks`. - # - # Connections between parent i and child j should be computed as - # h_j = c_j*f_{ij}{(W^h_{ij}*h_i)} + (1 - c_j)*h_i, - # where c_j = \sigmoid{(W^c_{ij}*h_i)} - # - # See Training details from Section 3.1 of the paper. - # - # The following algorithm does a breadth-first (since `q.popleft()` is - # used) search over the nodes and computes all the hidden states. - while True: - if len(q) == 0: - break - - node_id = q.popleft() - nodes = dag[node_id] - - for next_node in nodes: - next_id = next_node.id - if next_id == self.num_blocks: - leaf_node_ids.append(node_id) - assert len(nodes) == 1, ('parent of leaf node should have ' - 'only one child') - continue - - w_h = self.w_h[node_id][next_id] - w_c = self.w_c[node_id][next_id] - - f[next_id] = self.get_f(next_node.name) - c[next_id] = torch.sigmoid(w_c(h[node_id])) - h[next_id] = (c[next_id] * f[next_id](w_h(h[node_id])) + - (1 - c[next_id]) * h[node_id]) - - q.append(next_id) - - # Instead of averaging loose ends, perhaps there should - # be a set of separate unshared weights for each "loose" connection - # between each node in a cell and the output. - # - # As it stands, all weights W^h_{ij} are doing double duty by - # connecting both from i to j, as well as from i to the output. 
- - # average all the loose ends - leaf_nodes = [h[node_id] for node_id in leaf_node_ids] - output = torch.mean(torch.stack(leaf_nodes, 2), -1) - - # stabilizing the Updates of omega - if self.batch_norm is not None: - output = self.batch_norm(output) - - return output, h[self.num_blocks - 1] - - def init_hidden(self, batch_size): - zeros = torch.zeros(batch_size, self.shared_hid) - return utils.get_variable(zeros, self.use_cuda, requires_grad=False) - - def get_f(self, name): - name = name.lower() - if name == 'relu': - f = torch.relu - elif name == 'tanh': - f = torch.tanh - elif name == 'identity': - f = lambda x: x - elif name == 'sigmoid': - f = torch.sigmoid - return f - - @property - def num_parameters(self): - def size(p): - return np.prod(p.size()) - - return sum([size(param) for param in self.parameters()]) - - def reset_parameters(self): - init_range = 0.025 - # init_range = 0.025 if self.args.mode == 'train' else 0.04 - for param in self.parameters(): - param.data.uniform_(-init_range, init_range) - self.decoder.bias.data.fill_(0) - - def predict(self, word_seq): - """ - - :param word_seq: torch.LongTensor, [batch_size, seq_len] - :return predict: dict of torch.LongTensor, [batch_size, seq_len] - """ - output = self(word_seq) - _, predict = output['pred'].max(dim=1) - return {'pred': predict} diff --git a/fastNLP/models/enas_trainer.py b/fastNLP/models/enas_trainer.py deleted file mode 100644 index 7abcc45f..00000000 --- a/fastNLP/models/enas_trainer.py +++ /dev/null @@ -1,380 +0,0 @@ -# Code Modified from https://github.com/carpedm20/ENAS-pytorch -import math -import numpy as np -import time -import torch - -from datetime import datetime, timedelta - -from torch.optim import Adam - -try: - from tqdm.auto import tqdm -except: - from ..core.utils import _pseudo_tqdm as tqdm - -from ..core.trainer import Trainer -from ..core.batch import DataSetIter -from ..core.callback import CallbackManager, CallbackException -from ..core.dataset import DataSet -from ..core.utils import _move_dict_value_to_device -from . import enas_utils as utils -from ..core.utils import _build_args - - -def _get_no_grad_ctx_mgr(): - """Returns a the `torch.no_grad` context manager for PyTorch version >= - 0.4, or a no-op context manager otherwise. - """ - return torch.no_grad() - - -class ENASTrainer(Trainer): - """A class to wrap training code.""" - - def __init__(self, train_data, model, controller, **kwargs): - """Constructor for training algorithm. 
- :param DataSet train_data: the training data - :param torch.nn.modules.module model: a PyTorch model - :param torch.nn.modules.module controller: a PyTorch model - """ - self.final_epochs = kwargs['final_epochs'] - kwargs.pop('final_epochs') - super(ENASTrainer, self).__init__(train_data, model, **kwargs) - self.controller_step = 0 - self.shared_step = 0 - self.max_length = 35 - - self.shared = model - self.controller = controller - - self.shared_optim = Adam( - self.shared.parameters(), - lr=20.0, - weight_decay=1e-7) - - self.controller_optim = Adam( - self.controller.parameters(), - lr=3.5e-4) - - def train(self, load_best_model=True): - """ - :param bool load_best_model: 该参数只有在初始化提供了dev_data的情况下有效,如果True, trainer将在返回之前重新加载dev表现 - 最好的模型参数。 - :return results: 返回一个字典类型的数据, - 内含以下内容:: - - seconds: float, 表示训练时长 - 以下三个内容只有在提供了dev_data的情况下会有。 - best_eval: Dict of Dict, 表示evaluation的结果 - best_epoch: int,在第几个epoch取得的最佳值 - best_step: int, 在第几个step(batch)更新取得的最佳值 - - """ - results = {} - if self.n_epochs <= 0: - print(f"training epoch is {self.n_epochs}, nothing was done.") - results['seconds'] = 0. - return results - try: - if torch.cuda.is_available() and "cuda" in self.device: - self.model = self.model.cuda() - self._model_device = self.model.parameters().__next__().device - self._mode(self.model, is_test=False) - - self.start_time = str(datetime.now().strftime('%Y-%m-%d-%H-%M-%S')) - start_time = time.time() - print("training epochs started " + self.start_time, flush=True) - - try: - self.callback_manager.on_train_begin() - self._train() - self.callback_manager.on_train_end() - except (CallbackException, KeyboardInterrupt) as e: - self.callback_manager.on_exception(e) - - if self.dev_data is not None: - print( - "\nIn Epoch:{}/Step:{}, got best dev performance:".format(self.best_dev_epoch, self.best_dev_step) + - self.tester._format_eval_results(self.best_dev_perf), ) - results['best_eval'] = self.best_dev_perf - results['best_epoch'] = self.best_dev_epoch - results['best_step'] = self.best_dev_step - if load_best_model: - model_name = "best_" + "_".join([self.model.__class__.__name__, self.metric_key, self.start_time]) - load_succeed = self._load_model(self.model, model_name) - if load_succeed: - print("Reloaded the best model.") - else: - print("Fail to reload best model.") - finally: - pass - results['seconds'] = round(time.time() - start_time, 2) - - return results - - def _train(self): - if not self.use_tqdm: - from fastNLP.core.utils import _pseudo_tqdm as inner_tqdm - else: - inner_tqdm = tqdm - self.step = 0 - start = time.time() - total_steps = (len(self.train_data) // self.batch_size + int( - len(self.train_data) % self.batch_size != 0)) * self.n_epochs - with inner_tqdm(total=total_steps, postfix='loss:{0:<6.5f}', leave=False, dynamic_ncols=True) as pbar: - avg_loss = 0 - data_iterator = DataSetIter(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False, - prefetch=self.prefetch) - for epoch in range(1, self.n_epochs + 1): - pbar.set_description_str(desc="Epoch {}/{}".format(epoch, self.n_epochs)) - last_stage = (epoch > self.n_epochs + 1 - self.final_epochs) - if epoch == self.n_epochs + 1 - self.final_epochs: - print('Entering the final stage. (Only train the selected structure)') - # early stopping - self.callback_manager.on_epoch_begin() - - # 1. Training the shared parameters omega of the child models - self.train_shared(pbar) - - # 2. 
Training the controller parameters theta - if not last_stage: - self.train_controller() - - if ((self.validate_every > 0 and self.step % self.validate_every == 0) or - (self.validate_every < 0 and self.step % len(data_iterator) == 0)) \ - and self.dev_data is not None: - if not last_stage: - self.derive() - eval_res = self._do_validation(epoch=epoch, step=self.step) - eval_str = "Evaluation at Epoch {}/{}. Step:{}/{}. ".format(epoch, self.n_epochs, self.step, - total_steps) + \ - self.tester._format_eval_results(eval_res) - pbar.write(eval_str) - - # lr decay; early stopping - self.callback_manager.on_epoch_end() - # =============== epochs end =================== # - pbar.close() - # ============ tqdm end ============== # - - def get_loss(self, inputs, targets, hidden, dags): - """Computes the loss for the same batch for M models. - - This amounts to an estimate of the loss, which is turned into an - estimate for the gradients of the shared model. - """ - if not isinstance(dags, list): - dags = [dags] - - loss = 0 - for dag in dags: - self.shared.setDAG(dag) - inputs = _build_args(self.shared.forward, **inputs) - inputs['hidden'] = hidden - result = self.shared(**inputs) - output, hidden, extra_out = result['pred'], result['hidden'], result['extra_out'] - - self.callback_manager.on_loss_begin(targets, result) - sample_loss = self._compute_loss(result, targets) - loss += sample_loss - - assert len(dags) == 1, 'there are multiple `hidden` for multple `dags`' - return loss, hidden, extra_out - - def train_shared(self, pbar=None, max_step=None, dag=None): - """Train the language model for 400 steps of minibatches of 64 - examples. - - Args: - max_step: Used to run extra training steps as a warm-up. - dag: If not None, is used instead of calling sample(). - - BPTT is truncated at 35 timesteps. - - For each weight update, gradients are estimated by sampling M models - from the fixed controller policy, and averaging their gradients - computed on a batch of training data. - """ - model = self.shared - model.train() - self.controller.eval() - - hidden = self.shared.init_hidden(self.batch_size) - - abs_max_grad = 0 - abs_max_hidden_norm = 0 - step = 0 - raw_total_loss = 0 - total_loss = 0 - train_idx = 0 - avg_loss = 0 - data_iterator = DataSetIter(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False, - prefetch=self.prefetch) - - for batch_x, batch_y in data_iterator: - _move_dict_value_to_device(batch_x, batch_y, device=self._model_device) - indices = data_iterator.get_batch_indices() - # negative sampling; replace unknown; re-weight batch_y - self.callback_manager.on_batch_begin(batch_x, batch_y, indices) - # prediction = self._data_forward(self.model, batch_x) - - dags = self.controller.sample(1) - inputs, targets = batch_x, batch_y - # self.callback_manager.on_loss_begin(batch_y, prediction) - loss, hidden, extra_out = self.get_loss(inputs, - targets, - hidden, - dags) - hidden.detach_() - - avg_loss += loss.item() - - # Is loss NaN or inf? 
requires_grad = False - self.callback_manager.on_backward_begin(loss) - self._grad_backward(loss) - self.callback_manager.on_backward_end() - - self._update() - self.callback_manager.on_step_end() - - if (self.step + 1) % self.print_every == 0: - if self.use_tqdm: - print_output = "loss:{0:<6.5f}".format(avg_loss / self.print_every) - pbar.update(self.print_every) - else: - end = time.time() - diff = timedelta(seconds=round(end - start)) - print_output = "[epoch: {:>3} step: {:>4}] train loss: {:>4.6} time: {}".format( - epoch, self.step, avg_loss, diff) - pbar.set_postfix_str(print_output) - avg_loss = 0 - self.step += 1 - step += 1 - self.shared_step += 1 - self.callback_manager.on_batch_end() - # ================= mini-batch end ==================== # - - def get_reward(self, dag, entropies, hidden, valid_idx=0): - """Computes the perplexity of a single sampled model on a minibatch of - validation data. - """ - if not isinstance(entropies, np.ndarray): - entropies = entropies.data.cpu().numpy() - - data_iterator = DataSetIter(self.dev_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False, - prefetch=self.prefetch) - - for inputs, targets in data_iterator: - valid_loss, hidden, _ = self.get_loss(inputs, targets, hidden, dag) - valid_loss = utils.to_item(valid_loss.data) - - valid_ppl = math.exp(valid_loss) - - R = 80 / valid_ppl - - rewards = R + 1e-4 * entropies - - return rewards, hidden - - def train_controller(self): - """Fixes the shared parameters and updates the controller parameters. - - The controller is updated with a score function gradient estimator - (i.e., REINFORCE), with the reward being c/valid_ppl, where valid_ppl - is computed on a minibatch of validation data. - - A moving average baseline is used. - - The controller is trained for 2000 steps per epoch (i.e., - first (Train Shared) phase -> second (Train Controller) phase). - """ - model = self.controller - model.train() - # Why can't we call shared.eval() here? Leads to loss - # being uniformly zero for the controller. - # self.shared.eval() - - avg_reward_base = None - baseline = None - adv_history = [] - entropy_history = [] - reward_history = [] - - hidden = self.shared.init_hidden(self.batch_size) - total_loss = 0 - valid_idx = 0 - for step in range(20): - # sample models - dags, log_probs, entropies = self.controller.sample( - with_details=True) - - # calculate reward - np_entropies = entropies.data.cpu().numpy() - # No gradients should be backpropagated to the - # shared model during controller training, obviously. 
- with _get_no_grad_ctx_mgr(): - rewards, hidden = self.get_reward(dags, - np_entropies, - hidden, - valid_idx) - - reward_history.extend(rewards) - entropy_history.extend(np_entropies) - - # moving average baseline - if baseline is None: - baseline = rewards - else: - decay = 0.95 - baseline = decay * baseline + (1 - decay) * rewards - - adv = rewards - baseline - adv_history.extend(adv) - - # policy loss - loss = -log_probs * utils.get_variable(adv, - 'cuda' in self.device, - requires_grad=False) - - loss = loss.sum() # or loss.mean() - - # update - self.controller_optim.zero_grad() - loss.backward() - - self.controller_optim.step() - - total_loss += utils.to_item(loss.data) - - if ((step % 50) == 0) and (step > 0): - reward_history, adv_history, entropy_history = [], [], [] - total_loss = 0 - - self.controller_step += 1 - # prev_valid_idx = valid_idx - # valid_idx = ((valid_idx + self.max_length) % - # (self.valid_data.size(0) - 1)) - # # Whenever we wrap around to the beginning of the - # # validation data, we reset the hidden states. - # if prev_valid_idx > valid_idx: - # hidden = self.shared.init_hidden(self.batch_size) - - def derive(self, sample_num=10, valid_idx=0): - """We are always deriving based on the very first batch - of validation data? This seems wrong... - """ - hidden = self.shared.init_hidden(self.batch_size) - - dags, _, entropies = self.controller.sample(sample_num, - with_details=True) - - max_R = 0 - best_dag = None - for dag in dags: - R, _ = self.get_reward(dag, entropies, hidden, valid_idx) - if R.max() > max_R: - max_R = R.max() - best_dag = dag - - self.model.setDAG(best_dag) diff --git a/fastNLP/models/enas_utils.py b/fastNLP/models/enas_utils.py deleted file mode 100644 index 4e402a9a..00000000 --- a/fastNLP/models/enas_utils.py +++ /dev/null @@ -1,54 +0,0 @@ -# Code Modified from https://github.com/carpedm20/ENAS-pytorch - -from collections import defaultdict -import collections - -import numpy as np -import torch -from torch.autograd import Variable - - -def detach(h): - if type(h) == Variable: - return Variable(h.data) - else: - return tuple(detach(v) for v in h) - - -def get_variable(inputs, cuda=False, **kwargs): - if type(inputs) in [list, np.ndarray]: - inputs = torch.Tensor(inputs) - if cuda: - out = Variable(inputs.cuda(), **kwargs) - else: - out = Variable(inputs, **kwargs) - return out - - -def update_lr(optimizer, lr): - for param_group in optimizer.param_groups: - param_group['lr'] = lr - - -Node = collections.namedtuple('Node', ['id', 'name']) - - -class keydefaultdict(defaultdict): - def __missing__(self, key): - if self.default_factory is None: - raise KeyError(key) - else: - ret = self[key] = self.default_factory(key) - return ret - - -def to_item(x): - """Converts x, possibly scalar and possibly tensor, to a Python scalar.""" - if isinstance(x, (float, int)): - return x - - if float(torch.__version__[0:3]) < 0.4: - assert (x.dim() == 1) and (len(x) == 1) - return x[0] - - return x.item() diff --git a/fastNLP/models/sequence_labeling.py b/fastNLP/models/sequence_labeling.py index 4bf3f95f..ab232d04 100644 --- a/fastNLP/models/sequence_labeling.py +++ b/fastNLP/models/sequence_labeling.py @@ -1,10 +1,10 @@ """ - 本模块实现了几种序列标注模型 +本模块实现了几种序列标注模型 """ __all__ = [ "SeqLabeling", "AdvSeqLabel", - # "BiLSTMCRF" + "BiLSTMCRF" ] import torch @@ -12,55 +12,54 @@ import torch.nn as nn import torch.nn.functional as F from .base_model import BaseModel -from ..embeddings import embedding -from ..modules import decoder, encoder -from ..modules.decoder.crf 
import allowed_transitions -from ..core.utils import seq_len_to_mask from ..core.const import Const as C -from ..modules import LSTM +from ..core.utils import seq_len_to_mask from ..embeddings import get_embeddings from ..modules import ConditionalRandomField +from ..modules import LSTM +from ..modules import decoder, encoder +from ..modules.decoder.crf import allowed_transitions class BiLSTMCRF(BaseModel): """ - 结构为BiLSTM + FC + Dropout + CRF. - - .. todo:: - 继续补充文档 + 结构为embedding + BiLSTM + FC + Dropout + CRF. - :param embed: tuple: - :param num_classes: - :param num_layers: - :param hidden_size: - :param dropout: - :param target_vocab: - :param encoding_type: """ def __init__(self, embed, num_classes, num_layers=1, hidden_size=100, dropout=0.5, - target_vocab=None, encoding_type=None): + target_vocab=None): + """ + + :param embed: 支持(1)fastNLP的各种Embedding, (2) tuple, 指明num_embedding, dimension, 如(1000, 100) + :param num_classes: 一共多少个类 + :param num_layers: BiLSTM的层数 + :param hidden_size: BiLSTM的hidden_size,实际hidden size为该值的两倍(前向、后向) + :param dropout: dropout的概率,0为不dropout + :param target_vocab: Vocabulary对象,target与index的对应关系。如果传入该值,将自动避免非法的解码序列。 + """ super().__init__() self.embed = get_embeddings(embed) if num_layers>1: - self.lstm = LSTM(embed.embedding_dim, num_layers=num_layers, hidden_size=hidden_size, bidirectional=True, + self.lstm = LSTM(self.embed.embedding_dim, num_layers=num_layers, hidden_size=hidden_size, bidirectional=True, batch_first=True, dropout=dropout) else: - self.lstm = LSTM(embed.embedding_dim, num_layers=num_layers, hidden_size=hidden_size, bidirectional=True, + self.lstm = LSTM(self.embed.embedding_dim, num_layers=num_layers, hidden_size=hidden_size, bidirectional=True, batch_first=True) self.dropout = nn.Dropout(dropout) - self.fc = nn.Linear(hidden_size, num_classes) + self.fc = nn.Linear(hidden_size*2, num_classes) trans = None - if target_vocab is not None and encoding_type is not None: - trans = allowed_transitions(target_vocab.idx2word, encoding_type=encoding_type, include_start_end=True) + if target_vocab is not None: + assert len(target_vocab)==num_classes, "The number of classes should be same with the length of target vocabulary." 
+ trans = allowed_transitions(target_vocab.idx2word, include_start_end=True) self.crf = ConditionalRandomField(num_classes, include_start_end_trans=True, allowed_transitions=trans) def _forward(self, words, seq_len=None, target=None): words = self.embed(words) - feats = self.lstm(words, seq_len=seq_len) + feats, _ = self.lstm(words, seq_len=seq_len) feats = self.fc(feats) feats = self.dropout(feats) logits = F.log_softmax(feats, dim=-1) @@ -81,26 +80,26 @@ class BiLSTMCRF(BaseModel): class SeqLabeling(BaseModel): """ - 别名::class:`fastNLP.models.SeqLabeling` :class:`fastNLP.models.sequence_labeling.SeqLabeling` - 一个基础的Sequence labeling的模型。 用于做sequence labeling的基础类。结构包含一层Embedding,一层LSTM(单向,一层),一层FC,以及一层CRF。 - :param tuple(int,int),torch.FloatTensor,nn.Embedding,numpy.ndarray init_embed: Embedding的大小(传入tuple(int, int), - 第一个int为vocab_zie, 第二个int为embed_dim); 如果为Tensor, Embedding, ndarray等则直接使用该值初始化Embedding - :param int hidden_size: LSTM隐藏层的大小 - :param int num_classes: 一共有多少类 """ - def __init__(self, init_embed, hidden_size, num_classes): + def __init__(self, embed, hidden_size, num_classes): + """ + + :param tuple(int,int),torch.FloatTensor,nn.Embedding,numpy.ndarray embed: Embedding的大小(传入tuple(int, int), + 第一个int为vocab_zie, 第二个int为embed_dim); 如果为Tensor, embedding, ndarray等则直接使用该值初始化Embedding + :param int hidden_size: LSTM隐藏层的大小 + :param int num_classes: 一共有多少类 + """ super(SeqLabeling, self).__init__() - self.Embedding = embedding.Embedding(init_embed) - self.Rnn = encoder.LSTM(self.Embedding.embedding_dim, hidden_size) - self.Linear = nn.Linear(hidden_size, num_classes) - self.Crf = decoder.ConditionalRandomField(num_classes) - self.mask = None - + self.embedding = get_embeddings(embed) + self.rnn = encoder.LSTM(self.embedding.embedding_dim, hidden_size) + self.fc = nn.Linear(hidden_size, num_classes) + self.crf = decoder.ConditionalRandomField(num_classes) + def forward(self, words, seq_len, target): """ :param torch.LongTensor words: [batch_size, max_len],序列的index @@ -109,17 +108,14 @@ class SeqLabeling(BaseModel): :return y: If truth is None, return list of [decode path(list)]. Used in testing and predicting. If truth is not None, return loss, a scalar. Used in training. """ - assert words.shape[0] == seq_len.shape[0] - assert target.shape == words.shape - self.mask = self._make_mask(words, seq_len) - - x = self.Embedding(words) + mask = seq_len_to_mask(seq_len, max_len=words.size(1)) + x = self.embedding(words) # [batch_size, max_len, word_emb_dim] - x, _ = self.Rnn(x, seq_len) + x, _ = self.rnn(x, seq_len) # [batch_size, max_len, hidden_size * direction] - x = self.Linear(x) + x = self.fc(x) # [batch_size, max_len, num_classes] - return {C.LOSS: self._internal_loss(x, target)} + return {C.LOSS: self._internal_loss(x, target, mask)} def predict(self, words, seq_len): """ @@ -129,18 +125,18 @@ class SeqLabeling(BaseModel): :param torch.LongTensor seq_len: [batch_size,] :return: {'pred': xx}, [batch_size, max_len] """ - self.mask = self._make_mask(words, seq_len) + mask = seq_len_to_mask(seq_len, max_len=words.size(1)) - x = self.Embedding(words) + x = self.embedding(words) # [batch_size, max_len, word_emb_dim] - x, _ = self.Rnn(x, seq_len) + x, _ = self.rnn(x, seq_len) # [batch_size, max_len, hidden_size * direction] - x = self.Linear(x) + x = self.fc(x) # [batch_size, max_len, num_classes] - pred = self._decode(x) + pred = self._decode(x, mask) return {C.OUTPUT: pred} - def _internal_loss(self, x, y): + def _internal_loss(self, x, y, mask): """ Negative log likelihood loss. 
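The rewritten `BiLSTMCRF` above takes either a fastNLP embedding object or a `(num_embeddings, embedding_dim)` tuple, sizes its projection as `hidden_size * 2` for the bidirectional LSTM, and derives legal CRF transitions from `target_vocab` alone (the separate `encoding_type` argument is gone). A construction sketch, for illustration only; the tag vocabulary is invented and the import assumes the class is re-exported as listed in `__all__`:

```python
import torch
from fastNLP import Vocabulary
from fastNLP.models import BiLSTMCRF

tag_vocab = Vocabulary(unknown=None, padding=None)
tag_vocab.add_word_lst(['B-ORG', 'I-ORG', 'O'])

model = BiLSTMCRF(embed=(1000, 100),            # (num_embeddings, embedding_dim) tuple
                  num_classes=len(tag_vocab),   # must match len(target_vocab), see the assert above
                  num_layers=1, hidden_size=100, dropout=0.5,
                  target_vocab=tag_vocab)

words = torch.randint(0, 1000, (2, 7))
seq_len = torch.LongTensor([7, 5])
pred = model.predict(words, seq_len)            # expected to return {'pred': [2, 7]} Viterbi paths
```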
:param x: Tensor, [batch_size, max_len, tag_size] @@ -150,49 +146,39 @@ class SeqLabeling(BaseModel): """ x = x.float() y = y.long() - assert x.shape[:2] == y.shape - assert y.shape == self.mask.shape - total_loss = self.Crf(x, y, self.mask) + total_loss = self.crf(x, y, mask) return torch.mean(total_loss) - def _make_mask(self, x, seq_len): - batch_size, max_len = x.size(0), x.size(1) - mask = seq_len_to_mask(seq_len) - mask = mask.view(batch_size, max_len) - mask = mask.to(x).float() - return mask - - def _decode(self, x): + def _decode(self, x, mask): """ :param torch.FloatTensor x: [batch_size, max_len, tag_size] :return prediction: [batch_size, max_len] """ - tag_seq, _ = self.Crf.viterbi_decode(x, self.mask) + tag_seq, _ = self.crf.viterbi_decode(x, mask) return tag_seq class AdvSeqLabel(nn.Module): """ - 别名::class:`fastNLP.models.AdvSeqLabel` :class:`fastNLP.models.sequence_labeling.AdvSeqLabel` - 更复杂的Sequence Labelling模型。结构为Embedding, LayerNorm, 双向LSTM(两层),FC,LayerNorm,DropOut,FC,CRF。 - - :param tuple(int,int),torch.FloatTensor,nn.Embedding,numpy.ndarray init_embed: Embedding的大小(传入tuple(int, int), - 第一个int为vocab_zie, 第二个int为embed_dim); 如果为Tensor, Embedding, ndarray等则直接使用该值初始化Embedding - :param int hidden_size: LSTM的隐层大小 - :param int num_classes: 有多少个类 - :param float dropout: LSTM中以及DropOut层的drop概率 - :param dict id2words: tag id转为其tag word的表。用于在CRF解码时防止解出非法的顺序,比如'BMES'这个标签规范中,'S' - 不能出现在'B'之后。这里也支持类似与'B-NN',即'-'前为标签类型的指示,后面为具体的tag的情况。这里不但会保证 - 'B-NN'后面不为'S-NN'还会保证'B-NN'后面不会出现'M-xx'(任何非'M-NN'和'E-NN'的情况。) - :param str encoding_type: 支持"BIO", "BMES", "BEMSO", 只有在id2words不为None的情况有用。 """ - def __init__(self, init_embed, hidden_size, num_classes, dropout=0.3, id2words=None, encoding_type='bmes'): + def __init__(self, embed, hidden_size, num_classes, dropout=0.3, id2words=None, encoding_type='bmes'): + """ + :param tuple(int,int),torch.FloatTensor,nn.Embedding,numpy.ndarray embed: Embedding的大小(传入tuple(int, int), + 第一个int为vocab_zie, 第二个int为embed_dim); 如果为Tensor, Embedding, ndarray等则直接使用该值初始化Embedding + :param int hidden_size: LSTM的隐层大小 + :param int num_classes: 有多少个类 + :param float dropout: LSTM中以及DropOut层的drop概率 + :param dict id2words: tag id转为其tag word的表。用于在CRF解码时防止解出非法的顺序,比如'BMES'这个标签规范中,'S' + 不能出现在'B'之后。这里也支持类似与'B-NN',即'-'前为标签类型的指示,后面为具体的tag的情况。这里不但会保证 + 'B-NN'后面不为'S-NN'还会保证'B-NN'后面不会出现'M-xx'(任何非'M-NN'和'E-NN'的情况。) + :param str encoding_type: 支持"BIO", "BMES", "BEMSO", 只有在id2words不为None的情况有用。 + """ super().__init__() - self.Embedding = embedding.Embedding(init_embed) + self.Embedding = get_embeddings(embed) self.norm1 = torch.nn.LayerNorm(self.Embedding.embedding_dim) self.Rnn = encoder.LSTM(input_size=self.Embedding.embedding_dim, hidden_size=hidden_size, num_layers=2, dropout=dropout, @@ -210,36 +196,29 @@ class AdvSeqLabel(nn.Module): allowed_transitions=allowed_transitions(id2words, encoding_type=encoding_type)) - def _decode(self, x): + def _decode(self, x, mask): """ :param torch.FloatTensor x: [batch_size, max_len, tag_size] + :param torch.ByteTensor mask: [batch_size, max_len] :return torch.LongTensor, [batch_size, max_len] """ - tag_seq, _ = self.Crf.viterbi_decode(x, self.mask) + tag_seq, _ = self.Crf.viterbi_decode(x, mask) return tag_seq - def _internal_loss(self, x, y): + def _internal_loss(self, x, y, mask): """ Negative log likelihood loss. 
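`SeqLabeling` above shows the other recurring change in this file: the mask is rebuilt from `seq_len` inside `forward`/`predict` and passed explicitly to `_internal_loss` and `_decode`, so the module no longer stores `self.mask` between calls. A short end-to-end sketch with made-up sizes:

```python
import torch
from fastNLP.models.sequence_labeling import SeqLabeling

model = SeqLabeling(embed=(1000, 50), hidden_size=64, num_classes=5)

words = torch.randint(0, 1000, (2, 6))
seq_len = torch.LongTensor([6, 4])
target = torch.randint(0, 5, (2, 6))

loss_dict = model(words, seq_len, target)  # {'loss': scalar CRF negative log likelihood}
pred_dict = model.predict(words, seq_len)  # {'pred': [2, 6]} decoded tag ids
# no mask state is carried on the module between the two calls
```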
:param x: Tensor, [batch_size, max_len, tag_size] :param y: Tensor, [batch_size, max_len] + :param mask: Tensor, [batch_size, max_len] :return loss: a scalar Tensor """ x = x.float() y = y.long() - assert x.shape[:2] == y.shape - assert y.shape == self.mask.shape - total_loss = self.Crf(x, y, self.mask) + total_loss = self.Crf(x, y, mask) return torch.mean(total_loss) - def _make_mask(self, x, seq_len): - batch_size, max_len = x.size(0), x.size(1) - mask = seq_len_to_mask(seq_len) - mask = mask.view(batch_size, max_len) - mask = mask.to(x).float() - return mask - def _forward(self, words, seq_len, target=None): """ :param torch.LongTensor words: [batch_size, mex_len] @@ -251,15 +230,13 @@ class AdvSeqLabel(nn.Module): words = words.long() seq_len = seq_len.long() - self.mask = self._make_mask(words, seq_len) - - # seq_len = seq_len.long() + mask = seq_len_to_mask(seq_len, max_len=words.size(1)) + target = target.long() if target is not None else None if next(self.parameters()).is_cuda: words = words.cuda() - self.mask = self.mask.cuda() - + x = self.Embedding(words) x = self.norm1(x) # [batch_size, max_len, word_emb_dim] @@ -272,9 +249,9 @@ class AdvSeqLabel(nn.Module): x = self.drop(x) x = self.Linear2(x) if target is not None: - return {"loss": self._internal_loss(x, target)} + return {"loss": self._internal_loss(x, target, mask)} else: - return {"pred": self._decode(x)} + return {"pred": self._decode(x, mask)} def forward(self, words, seq_len, target): """ diff --git a/fastNLP/models/snli.py b/fastNLP/models/snli.py index 8e35b6bc..1661d191 100644 --- a/fastNLP/models/snli.py +++ b/fastNLP/models/snli.py @@ -1,3 +1,7 @@ +""" +.. todo:: + doc +""" __all__ = [ "ESIM" ] @@ -5,34 +9,38 @@ __all__ = [ import torch import torch.nn as nn import torch.nn.functional as F - from torch.nn import CrossEntropyLoss from .base_model import BaseModel -from ..embeddings.embedding import TokenEmbedding from ..core.const import Const from ..core.utils import seq_len_to_mask +from ..embeddings.embedding import TokenEmbedding, Embedding +from ..modules.encoder import BiAttention class ESIM(BaseModel): """ - 别名::class:`fastNLP.models.ESIM` :class:`fastNLP.models.snli.ESIM` - ESIM model的一个PyTorch实现 论文参见: https://arxiv.org/pdf/1609.06038.pdf - :param fastNLP.TokenEmbedding init_embedding: 初始化的TokenEmbedding - :param int hidden_size: 隐藏层大小,默认值为Embedding的维度 - :param int num_labels: 目标标签种类数量,默认值为3 - :param float dropout_rate: dropout的比率,默认值为0.3 - :param float dropout_embed: 对Embedding的dropout比率,默认值为0.1 """ - def __init__(self, init_embedding: TokenEmbedding, hidden_size=None, num_labels=3, dropout_rate=0.3, + def __init__(self, embed, hidden_size=None, num_labels=3, dropout_rate=0.3, dropout_embed=0.1): + """ + + :param embed: 初始化的Embedding + :param int hidden_size: 隐藏层大小,默认值为Embedding的维度 + :param int num_labels: 目标标签种类数量,默认值为3 + :param float dropout_rate: dropout的比率,默认值为0.3 + :param float dropout_embed: 对Embedding的dropout比率,默认值为0.1 + """ super(ESIM, self).__init__() - self.embedding = init_embedding + if isinstance(embed, TokenEmbedding) or isinstance(embed, Embedding): + self.embedding = embed + else: + self.embedding = Embedding(embed) self.dropout_embed = EmbedDropout(p=dropout_embed) if hidden_size is None: hidden_size = self.embedding.embed_size @@ -43,7 +51,7 @@ class ESIM(BaseModel): nn.Linear(8 * hidden_size, hidden_size), nn.ReLU()) nn.init.xavier_uniform_(self.interfere[1].weight.data) - self.bi_attention = SoftmaxAttention() + self.bi_attention = BiAttention() self.rnn_high = 
BiRNN(self.embedding.embed_size, hidden_size, dropout_rate=dropout_rate) # self.rnn_high = LSTM(hidden_size, hidden_size, dropout=dropout_rate, bidirectional=True,) @@ -167,48 +175,3 @@ class BiRNN(nn.Module): output = torch.cat([output, padding], 1) return output - -def masked_softmax(tensor, mask): - tensor_shape = tensor.size() - reshaped_tensor = tensor.view(-1, tensor_shape[-1]) - - # Reshape the mask so it matches the size of the input tensor. - while mask.dim() < tensor.dim(): - mask = mask.unsqueeze(1) - mask = mask.expand_as(tensor).contiguous().float() - reshaped_mask = mask.view(-1, mask.size()[-1]) - result = F.softmax(reshaped_tensor * reshaped_mask, dim=-1) - result = result * reshaped_mask - # 1e-13 is added to avoid divisions by zero. - result = result / (result.sum(dim=-1, keepdim=True) + 1e-13) - return result.view(*tensor_shape) - - -def weighted_sum(tensor, weights, mask): - w_sum = weights.bmm(tensor) - while mask.dim() < w_sum.dim(): - mask = mask.unsqueeze(1) - mask = mask.transpose(-1, -2) - mask = mask.expand_as(w_sum).contiguous().float() - return w_sum * mask - - -class SoftmaxAttention(nn.Module): - - def forward(self, premise_batch, premise_mask, hypothesis_batch, hypothesis_mask): - similarity_matrix = premise_batch.bmm(hypothesis_batch.transpose(2, 1) - .contiguous()) - - prem_hyp_attn = masked_softmax(similarity_matrix, hypothesis_mask) - hyp_prem_attn = masked_softmax(similarity_matrix.transpose(1, 2) - .contiguous(), - premise_mask) - - attended_premises = weighted_sum(hypothesis_batch, - prem_hyp_attn, - premise_mask) - attended_hypotheses = weighted_sum(premise_batch, - hyp_prem_attn, - hypothesis_mask) - - return attended_premises, attended_hypotheses diff --git a/fastNLP/models/star_transformer.py b/fastNLP/models/star_transformer.py index b95d1c25..117a63a2 100644 --- a/fastNLP/models/star_transformer.py +++ b/fastNLP/models/star_transformer.py @@ -11,31 +11,19 @@ __all__ = [ import torch from torch import nn -from ..modules.encoder.star_transformer import StarTransformer +from ..core.const import Const from ..core.utils import seq_len_to_mask from ..embeddings.utils import get_embeddings -from ..core.const import Const +from ..modules.encoder.star_transformer import StarTransformer class StarTransEnc(nn.Module): """ - 别名::class:`fastNLP.models.StarTransEnc` :class:`fastNLP.models.star_transformer.StarTransEnc` - 带word embedding的Star-Transformer Encoder - :param init_embed: 单词词典, 可以是 tuple, 包括(num_embedings, embedding_dim), 即 - embedding的大小和每个词的维度. 也可以传入 nn.Embedding 对象, - 此时就以传入的对象作为embedding - :param hidden_size: 模型中特征维度. - :param num_layers: 模型层数. - :param num_head: 模型中multi-head的head个数. - :param head_dim: 模型中multi-head中每个head特征维度. - :param max_len: 模型能接受的最大输入长度. - :param emb_dropout: 词嵌入的dropout概率. - :param dropout: 模型除词嵌入外的dropout概率. """ - def __init__(self, init_embed, + def __init__(self, embed, hidden_size, num_layers, num_head, @@ -43,8 +31,20 @@ class StarTransEnc(nn.Module): max_len, emb_dropout, dropout): + """ + + :param embed: 单词词典, 可以是 tuple, 包括(num_embedings, embedding_dim), 即 + embedding的大小和每个词的维度. 也可以传入 nn.Embedding 对象,此时就以传入的对象作为embedding + :param hidden_size: 模型中特征维度. + :param num_layers: 模型层数. + :param num_head: 模型中multi-head的head个数. + :param head_dim: 模型中multi-head中每个head特征维度. + :param max_len: 模型能接受的最大输入长度. + :param emb_dropout: 词嵌入的dropout概率. + :param dropout: 模型除词嵌入外的dropout概率. 
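For the `ESIM` change above: `embed` no longer has to be a `TokenEmbedding`; anything else is wrapped in `Embedding`, so a plain `(vocab_size, dim)` tuple also works, and the attention step now reuses `BiAttention` from `modules.encoder` instead of the local `SoftmaxAttention`. A hedged sketch; the input names below (`words1`/`words2`, `seq_len1`/`seq_len2`) follow fastNLP's usual `Const` convention and are an assumption, since `forward` is not shown in this hunk:

```python
import torch
from fastNLP.models.snli import ESIM

model = ESIM(embed=(3000, 100), num_labels=3, dropout_rate=0.3)  # tuple is wrapped in Embedding

premise = torch.randint(0, 3000, (2, 9))      # words1
hypothesis = torch.randint(0, 3000, (2, 7))   # words2
out = model(premise, hypothesis,
            seq_len1=torch.LongTensor([9, 6]),
            seq_len2=torch.LongTensor([7, 7]))
print(out['pred'].shape)                      # [2, num_labels] class logits (assumed output key)
```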
+ """ super(StarTransEnc, self).__init__() - self.embedding = get_embeddings(init_embed) + self.embedding = get_embeddings(embed) emb_dim = self.embedding.embedding_dim self.emb_fc = nn.Linear(emb_dim, hidden_size) # self.emb_drop = nn.Dropout(emb_dropout) @@ -104,25 +104,11 @@ class _NLICls(nn.Module): class STSeqLabel(nn.Module): """ - 别名::class:`fastNLP.models.STSeqLabel` :class:`fastNLP.models.star_transformer.STSeqLabel` - 用于序列标注的Star-Transformer模型 - :param init_embed: 单词词典, 可以是 tuple, 包括(num_embedings, embedding_dim), 即 - embedding的大小和每个词的维度. 也可以传入 nn.Embedding 对象, - 此时就以传入的对象作为embedding - :param num_cls: 输出类别个数 - :param hidden_size: 模型中特征维度. Default: 300 - :param num_layers: 模型层数. Default: 4 - :param num_head: 模型中multi-head的head个数. Default: 8 - :param head_dim: 模型中multi-head中每个head特征维度. Default: 32 - :param max_len: 模型能接受的最大输入长度. Default: 512 - :param cls_hidden_size: 分类器隐层维度. Default: 600 - :param emb_dropout: 词嵌入的dropout概率. Default: 0.1 - :param dropout: 模型除词嵌入外的dropout概率. Default: 0.1 """ - def __init__(self, init_embed, num_cls, + def __init__(self, embed, num_cls, hidden_size=300, num_layers=4, num_head=8, @@ -131,8 +117,22 @@ class STSeqLabel(nn.Module): cls_hidden_size=600, emb_dropout=0.1, dropout=0.1, ): + """ + + :param embed: 单词词典, 可以是 tuple, 包括(num_embedings, embedding_dim), 即 + embedding的大小和每个词的维度. 也可以传入 nn.Embedding 对象, 此时就以传入的对象作为embedding + :param num_cls: 输出类别个数 + :param hidden_size: 模型中特征维度. Default: 300 + :param num_layers: 模型层数. Default: 4 + :param num_head: 模型中multi-head的head个数. Default: 8 + :param head_dim: 模型中multi-head中每个head特征维度. Default: 32 + :param max_len: 模型能接受的最大输入长度. Default: 512 + :param cls_hidden_size: 分类器隐层维度. Default: 600 + :param emb_dropout: 词嵌入的dropout概率. Default: 0.1 + :param dropout: 模型除词嵌入外的dropout概率. Default: 0.1 + """ super(STSeqLabel, self).__init__() - self.enc = StarTransEnc(init_embed=init_embed, + self.enc = StarTransEnc(embed=embed, hidden_size=hidden_size, num_layers=num_layers, num_head=num_head, @@ -169,25 +169,11 @@ class STSeqLabel(nn.Module): class STSeqCls(nn.Module): """ - 别名::class:`fastNLP.models.STSeqCls` :class:`fastNLP.models.star_transformer.STSeqCls` - 用于分类任务的Star-Transformer - :param init_embed: 单词词典, 可以是 tuple, 包括(num_embedings, embedding_dim), 即 - embedding的大小和每个词的维度. 也可以传入 nn.Embedding 对象, - 此时就以传入的对象作为embedding - :param num_cls: 输出类别个数 - :param hidden_size: 模型中特征维度. Default: 300 - :param num_layers: 模型层数. Default: 4 - :param num_head: 模型中multi-head的head个数. Default: 8 - :param head_dim: 模型中multi-head中每个head特征维度. Default: 32 - :param max_len: 模型能接受的最大输入长度. Default: 512 - :param cls_hidden_size: 分类器隐层维度. Default: 600 - :param emb_dropout: 词嵌入的dropout概率. Default: 0.1 - :param dropout: 模型除词嵌入外的dropout概率. Default: 0.1 """ - def __init__(self, init_embed, num_cls, + def __init__(self, embed, num_cls, hidden_size=300, num_layers=4, num_head=8, @@ -196,8 +182,22 @@ class STSeqCls(nn.Module): cls_hidden_size=600, emb_dropout=0.1, dropout=0.1, ): + """ + + :param embed: 单词词典, 可以是 tuple, 包括(num_embedings, embedding_dim), 即 + embedding的大小和每个词的维度. 也可以传入 nn.Embedding 对象, 此时就以传入的对象作为embedding + :param num_cls: 输出类别个数 + :param hidden_size: 模型中特征维度. Default: 300 + :param num_layers: 模型层数. Default: 4 + :param num_head: 模型中multi-head的head个数. Default: 8 + :param head_dim: 模型中multi-head中每个head特征维度. Default: 32 + :param max_len: 模型能接受的最大输入长度. Default: 512 + :param cls_hidden_size: 分类器隐层维度. Default: 600 + :param emb_dropout: 词嵌入的dropout概率. Default: 0.1 + :param dropout: 模型除词嵌入外的dropout概率. 
Default: 0.1 + """ super(STSeqCls, self).__init__() - self.enc = StarTransEnc(init_embed=init_embed, + self.enc = StarTransEnc(embed=embed, hidden_size=hidden_size, num_layers=num_layers, num_head=num_head, @@ -234,25 +234,11 @@ class STSeqCls(nn.Module): class STNLICls(nn.Module): """ - 别名::class:`fastNLP.models.STNLICls` :class:`fastNLP.models.star_transformer.STNLICls` - 用于自然语言推断(NLI)的Star-Transformer - :param init_embed: 单词词典, 可以是 tuple, 包括(num_embedings, embedding_dim), 即 - embedding的大小和每个词的维度. 也可以传入 nn.Embedding 对象, - 此时就以传入的对象作为embedding - :param num_cls: 输出类别个数 - :param hidden_size: 模型中特征维度. Default: 300 - :param num_layers: 模型层数. Default: 4 - :param num_head: 模型中multi-head的head个数. Default: 8 - :param head_dim: 模型中multi-head中每个head特征维度. Default: 32 - :param max_len: 模型能接受的最大输入长度. Default: 512 - :param cls_hidden_size: 分类器隐层维度. Default: 600 - :param emb_dropout: 词嵌入的dropout概率. Default: 0.1 - :param dropout: 模型除词嵌入外的dropout概率. Default: 0.1 """ - def __init__(self, init_embed, num_cls, + def __init__(self, embed, num_cls, hidden_size=300, num_layers=4, num_head=8, @@ -261,8 +247,22 @@ class STNLICls(nn.Module): cls_hidden_size=600, emb_dropout=0.1, dropout=0.1, ): + """ + + :param embed: 单词词典, 可以是 tuple, 包括(num_embedings, embedding_dim), 即 + embedding的大小和每个词的维度. 也可以传入 nn.Embedding 对象, 此时就以传入的对象作为embedding + :param num_cls: 输出类别个数 + :param hidden_size: 模型中特征维度. Default: 300 + :param num_layers: 模型层数. Default: 4 + :param num_head: 模型中multi-head的head个数. Default: 8 + :param head_dim: 模型中multi-head中每个head特征维度. Default: 32 + :param max_len: 模型能接受的最大输入长度. Default: 512 + :param cls_hidden_size: 分类器隐层维度. Default: 600 + :param emb_dropout: 词嵌入的dropout概率. Default: 0.1 + :param dropout: 模型除词嵌入外的dropout概率. Default: 0.1 + """ super(STNLICls, self).__init__() - self.enc = StarTransEnc(init_embed=init_embed, + self.enc = StarTransEnc(embed=embed, hidden_size=hidden_size, num_layers=num_layers, num_head=num_head, diff --git a/fastNLP/modules/__init__.py b/fastNLP/modules/__init__.py index 7959e454..77283806 100644 --- a/fastNLP/modules/__init__.py +++ b/fastNLP/modules/__init__.py @@ -9,7 +9,7 @@ .. csv-table:: :header: "类型", "功能", "常见组件" - "embedding", 参见 :doc:`/fastNLP.embeddings` , "Elmo, Bert" + "embedding", 参见 :mod:`/fastNLP.embeddings` , "Elmo, Bert" "encoder", "将输入编码为具有表示能力的向量", "CNN, LSTM, Transformer" "decoder", "将具有某种表示意义的向量解码为需要的输出形式 ", "MLP, CRF" "其它", "配合其它组件使用的组件", "Dropout" @@ -36,6 +36,7 @@ __all__ = [ "MaxPool", "MaxPoolWithMask", + "KMaxPool", "AvgPool", "AvgPoolWithMask", @@ -47,10 +48,18 @@ __all__ = [ "allowed_transitions", "TimestepDropout", + + 'summary' ] +import sys + from . import decoder from . import encoder from .decoder import * from .dropout import TimestepDropout from .encoder import * +from .utils import summary +from ..doc_utils import doc_process + +doc_process(sys.modules[__name__]) diff --git a/fastNLP/modules/decoder/__init__.py b/fastNLP/modules/decoder/__init__.py index 664618b2..57acb172 100644 --- a/fastNLP/modules/decoder/__init__.py +++ b/fastNLP/modules/decoder/__init__.py @@ -1,3 +1,7 @@ +""" +.. 
todo:: + doc +""" __all__ = [ "MLP", "ConditionalRandomField", @@ -6,6 +10,6 @@ __all__ = [ ] from .crf import ConditionalRandomField +from .crf import allowed_transitions from .mlp import MLP from .utils import viterbi_decode -from .crf import allowed_transitions diff --git a/fastNLP/modules/decoder/crf.py b/fastNLP/modules/decoder/crf.py index 7c496868..669501e9 100644 --- a/fastNLP/modules/decoder/crf.py +++ b/fastNLP/modules/decoder/crf.py @@ -1,37 +1,54 @@ +"""undocumented""" + __all__ = [ "ConditionalRandomField", "allowed_transitions" ] +from typing import Union + import torch from torch import nn from ..utils import initial_parameter +from ...core.metrics import _get_encoding_type_from_tag_vocab, _check_tag_vocab_and_encoding_type +from ...core.vocabulary import Vocabulary -def allowed_transitions(id2target, encoding_type='bio', include_start_end=False): +def allowed_transitions(tag_vocab:Union[Vocabulary, dict], encoding_type=None, include_start_end=False): """ - 别名::class:`fastNLP.modules.allowed_transitions` :class:`fastNLP.modules.decoder.allowed_transitions` - 给定一个id到label的映射表,返回所有可以跳转的(from_tag_id, to_tag_id)列表。 - :param dict id2target: key是label的indices,value是str类型的tag或tag-label。value可以是只有tag的, 比如"B", "M"; 也可以是 - "B-NN", "M-NN", tag和label之间一定要用"-"隔开。一般可以通过Vocabulary.idx2word得到id2label。 - :param str encoding_type: 支持"bio", "bmes", "bmeso", "bioes"。 + :param ~fastNLP.Vocabulary,dict tag_vocab: 支持类型为tag或tag-label。只有tag的,比如"B", "M"; 也可以是"B-NN", "M-NN", + tag和label之间一定要用"-"隔开。如果传入dict,格式需要形如{0:"O", 1:"B-tag1"},即index在前,tag在后。 + :param str encoding_type: 支持"bio", "bmes", "bmeso", "bioes"。默认为None,通过vocab自动推断 :param bool include_start_end: 是否包含开始与结尾的转换。比如在bio中,b/o可以在开头,但是i不能在开头; 为True,返回的结果中会包含(start_idx, b_idx), (start_idx, o_idx), 但是不包含(start_idx, i_idx); start_idx=len(id2label), end_idx=len(id2label)+1。为False, 返回的结果中不含与开始结尾相关的内容 :return: List[Tuple(int, int)]], 内部的Tuple是可以进行跳转的(from_tag_id, to_tag_id)。 """ - num_tags = len(id2target) + if encoding_type is None: + encoding_type = _get_encoding_type_from_tag_vocab(tag_vocab) + else: + encoding_type = encoding_type.lower() + _check_tag_vocab_and_encoding_type(tag_vocab, encoding_type) + + pad_token = '<pad>' + unk_token = '<unk>' + + if isinstance(tag_vocab, Vocabulary): + id_label_lst = list(tag_vocab.idx2word.items()) + pad_token = tag_vocab.padding + unk_token = tag_vocab.unknown + else: + id_label_lst = list(tag_vocab.items()) + + num_tags = len(tag_vocab) start_idx = num_tags end_idx = num_tags + 1 - encoding_type = encoding_type.lower() allowed_trans = [] - id_label_lst = list(id2target.items()) if include_start_end: id_label_lst += [(start_idx, 'start'), (end_idx, 'end')] - def split_tag_label(from_label): from_label = from_label.lower() if from_label in ['start', 'end']: @@ -43,11 +60,11 @@ def allowed_transitions(id2target, encoding_type='bio', include_start_end=False) return from_tag, from_label for from_id, from_label in id_label_lst: - if from_label in ['<pad>', '<unk>']: + if from_label in [pad_token, unk_token]: continue from_tag, from_label = split_tag_label(from_label) for to_id, to_label in id_label_lst: - if to_label in ['<pad>', '<unk>']: + if to_label in [pad_token, unk_token]: continue to_tag, to_label = split_tag_label(to_label) if _is_transition_allowed(encoding_type, from_tag, from_label, to_tag, to_label): @@ -151,22 +168,21 @@ class ConditionalRandomField(nn.Module): """ - 别名::class:`fastNLP.modules.ConditionalRandomField` 
:class:`fastNLP.modules.decoder.ConditionalRandomField` - - 条件随机场。 - 提供forward()以及viterbi_decode()两个方法,分别用于训练与inference。 + 条件随机场。提供forward()以及viterbi_decode()两个方法,分别用于训练与inference。 - :param int num_tags: 标签的数量 - :param bool include_start_end_trans: 是否考虑各个tag作为开始以及结尾的分数。 - :param List[Tuple[from_tag_id(int), to_tag_id(int)]] allowed_transitions: 内部的Tuple[from_tag_id(int), - to_tag_id(int)]视为允许发生的跃迁,其他没有包含的跃迁认为是禁止跃迁,可以通过 - allowed_transitions()函数得到;如果为None,则所有跃迁均为合法 - :param str initial_method: 初始化方法。见initial_parameter """ def __init__(self, num_tags, include_start_end_trans=False, allowed_transitions=None, initial_method=None): - + """ + + :param int num_tags: 标签的数量 + :param bool include_start_end_trans: 是否考虑各个tag作为开始以及结尾的分数。 + :param List[Tuple[from_tag_id(int), to_tag_id(int)]] allowed_transitions: 内部的Tuple[from_tag_id(int), + to_tag_id(int)]视为允许发生的跃迁,其他没有包含的跃迁认为是禁止跃迁,可以通过 + allowed_transitions()函数得到;如果为None,则所有跃迁均为合法 + :param str initial_method: 初始化方法。见initial_parameter + """ super(ConditionalRandomField, self).__init__() self.include_start_end_trans = include_start_end_trans @@ -208,7 +224,7 @@ class ConditionalRandomField(nn.Module): trans_score = self.trans_m.view(1, n_tags, n_tags) tmp = alpha.view(batch_size, n_tags, 1) + emit_score + trans_score alpha = torch.logsumexp(tmp, 1).masked_fill(flip_mask[i].view(batch_size, 1), 0) + \ - alpha.masked_fill(mask[i].byte().view(batch_size, 1), 0) + alpha.masked_fill(mask[i].eq(1).view(batch_size, 1), 0) if self.include_start_end_trans: alpha = alpha + self.end_scores.view(1, -1) @@ -228,7 +244,7 @@ class ConditionalRandomField(nn.Module): seq_idx = torch.arange(seq_len, dtype=torch.long, device=logits.device) # trans_socre [L-1, B] - mask = mask.byte() + mask = mask.eq(1) flip_mask = mask.eq(0) trans_score = self.trans_m[tags[:seq_len - 1], tags[1:]].masked_fill(flip_mask[1:, :], 0) # emit_score [L, B] @@ -276,7 +292,7 @@ class ConditionalRandomField(nn.Module): """ batch_size, seq_len, n_tags = logits.size() logits = logits.transpose(0, 1).data # L, B, H - mask = mask.transpose(0, 1).data.byte() # L, B + mask = mask.transpose(0, 1).data.eq(1) # L, B # dp vpath = logits.new_zeros((seq_len, batch_size, n_tags), dtype=torch.long) diff --git a/fastNLP/modules/decoder/mlp.py b/fastNLP/modules/decoder/mlp.py index 9d9d80f2..0f23f481 100644 --- a/fastNLP/modules/decoder/mlp.py +++ b/fastNLP/modules/decoder/mlp.py @@ -1,3 +1,5 @@ +"""undocumented""" + __all__ = [ "MLP" ] @@ -10,16 +12,8 @@ from ..utils import initial_parameter class MLP(nn.Module): """ - 别名::class:`fastNLP.modules.MLP` :class:`fastNLP.modules.decoder.MLP` - 多层感知器 - :param List[int] size_layer: 一个int的列表,用来定义MLP的层数,列表中的数字为每一层是hidden数目。MLP的层数为 len(size_layer) - 1 - :param Union[str,func,List[str]] activation: 一个字符串或者函数的列表,用来定义每一个隐层的激活函数,字符串包括relu,tanh和 - sigmoid,默认值为relu - :param Union[str,func] output_activation: 字符串或者函数,用来定义输出层的激活函数,默认值为None,表示输出层没有激活函数 - :param str initial_method: 参数初始化方式 - :param float dropout: dropout概率,默认值为0 .. 
note:: 隐藏层的激活函数通过activation定义。一个str/function或者一个str/function的list可以被传入activation。 @@ -42,6 +36,15 @@ class MLP(nn.Module): """ def __init__(self, size_layer, activation='relu', output_activation=None, initial_method=None, dropout=0.0): + """ + + :param List[int] size_layer: 一个int的列表,用来定义MLP的层数,列表中的数字为每一层是hidden数目。MLP的层数为 len(size_layer) - 1 + :param Union[str,func,List[str]] activation: 一个字符串或者函数的列表,用来定义每一个隐层的激活函数,字符串包括relu,tanh和 + sigmoid,默认值为relu + :param Union[str,func] output_activation: 字符串或者函数,用来定义输出层的激活函数,默认值为None,表示输出层没有激活函数 + :param str initial_method: 参数初始化方式 + :param float dropout: dropout概率,默认值为0 + """ super(MLP, self).__init__() self.hiddens = nn.ModuleList() self.output = None diff --git a/fastNLP/modules/decoder/utils.py b/fastNLP/modules/decoder/utils.py index 9e773336..e0d2af68 100644 --- a/fastNLP/modules/decoder/utils.py +++ b/fastNLP/modules/decoder/utils.py @@ -1,3 +1,5 @@ +"""undocumented""" + __all__ = [ "viterbi_decode" ] @@ -6,8 +8,6 @@ import torch def viterbi_decode(logits, transitions, mask=None, unpad=False): r""" - 别名::class:`fastNLP.modules.viterbi_decode` :class:`fastNLP.modules.decoder.viterbi_decode` - 给定一个特征矩阵以及转移分数矩阵,计算出最佳的路径以及对应的分数 :param torch.FloatTensor logits: batch_size x max_len x num_tags,特征矩阵。 @@ -27,7 +27,7 @@ def viterbi_decode(logits, transitions, mask=None, unpad=False): "compatible." logits = logits.transpose(0, 1).data # L, B, H if mask is not None: - mask = mask.transpose(0, 1).data.byte() # L, B + mask = mask.transpose(0, 1).data.eq(1) # L, B else: mask = logits.new_ones((seq_len, batch_size), dtype=torch.uint8) diff --git a/fastNLP/modules/dropout.py b/fastNLP/modules/dropout.py index 0ea2a2d9..24c20cc6 100644 --- a/fastNLP/modules/dropout.py +++ b/fastNLP/modules/dropout.py @@ -1,4 +1,8 @@ -__all__ = [] +"""undocumented""" + +__all__ = [ + "TimestepDropout" +] import torch diff --git a/fastNLP/modules/encoder/__init__.py b/fastNLP/modules/encoder/__init__.py index 1e99a0fd..cbb42d7e 100644 --- a/fastNLP/modules/encoder/__init__.py +++ b/fastNLP/modules/encoder/__init__.py @@ -1,3 +1,8 @@ +""" +.. 
todo:: + doc +""" + __all__ = [ # "BertModel", @@ -18,19 +23,21 @@ __all__ = [ "MaxPool", "MaxPoolWithMask", + "KMaxPool", "AvgPool", "AvgPoolWithMask", "MultiHeadAttention", + "BiAttention", + "SelfAttention", ] +from .attention import MultiHeadAttention, BiAttention, SelfAttention from .bert import BertModel from .char_encoder import ConvolutionCharEncoder, LSTMCharEncoder from .conv_maxpool import ConvMaxpool from .lstm import LSTM +from .pooling import MaxPool, MaxPoolWithMask, AvgPool, AvgPoolWithMask, KMaxPool from .star_transformer import StarTransformer from .transformer import TransformerEncoder from .variational_rnn import VarRNN, VarLSTM, VarGRU - -from .pooling import MaxPool, MaxPoolWithMask, AvgPool, AvgPoolWithMask -from .attention import MultiHeadAttention diff --git a/fastNLP/modules/encoder/_elmo.py b/fastNLP/modules/encoder/_elmo.py index befae8bc..554cf8a9 100644 --- a/fastNLP/modules/encoder/_elmo.py +++ b/fastNLP/modules/encoder/_elmo.py @@ -1,7 +1,9 @@ -""" +"""undocumented 这个页面的代码大量参考了 allenNLP """ +__all__ = [] + from typing import Optional, Tuple, List, Callable import torch diff --git a/fastNLP/modules/encoder/attention.py b/fastNLP/modules/encoder/attention.py index fe3f7fd8..b48be579 100644 --- a/fastNLP/modules/encoder/attention.py +++ b/fastNLP/modules/encoder/attention.py @@ -1,5 +1,9 @@ +"""undocumented""" + __all__ = [ - "MultiHeadAttention" + "MultiHeadAttention", + "BiAttention", + "SelfAttention", ] import math @@ -13,8 +17,7 @@ from fastNLP.modules.utils import initial_parameter class DotAttention(nn.Module): """ - .. todo:: - 补上文档 + Transformer当中的DotAttention """ def __init__(self, key_size, value_size, dropout=0.0): @@ -28,14 +31,14 @@ class DotAttention(nn.Module): def forward(self, Q, K, V, mask_out=None): """ - :param Q: [batch, seq_len_q, key_size] - :param K: [batch, seq_len_k, key_size] - :param V: [batch, seq_len_k, value_size] - :param mask_out: [batch, 1, seq_len] or [batch, seq_len_q, seq_len_k] + :param Q: [..., seq_len_q, key_size] + :param K: [..., seq_len_k, key_size] + :param V: [..., seq_len_k, value_size] + :param mask_out: [..., 1, seq_len] or [..., seq_len_q, seq_len_k] """ - output = torch.matmul(Q, K.transpose(1, 2)) / self.scale + output = torch.matmul(Q, K.transpose(-1, -2)) / self.scale if mask_out is not None: - output.masked_fill_(mask_out, -1e18) + output.masked_fill_(mask_out, -1e9) output = self.softmax(output) output = self.drop(output) return torch.matmul(output, V) @@ -43,16 +46,18 @@ class DotAttention(nn.Module): class MultiHeadAttention(nn.Module): """ - 别名::class:`fastNLP.modules.MultiHeadAttention` :class:`fastNLP.modules.encoder.MultiHeadAttention` - - :param input_size: int, 输入维度的大小。同时也是输出维度的大小。 - :param key_size: int, 每个head的维度大小。 - :param value_size: int,每个head中value的维度。 - :param num_head: int,head的数量。 - :param dropout: float。 + Transformer当中的MultiHeadAttention """ def __init__(self, input_size, key_size, value_size, num_head, dropout=0.1): + """ + + :param input_size: int, 输入维度的大小。同时也是输出维度的大小。 + :param key_size: int, 每个head的维度大小。 + :param value_size: int,每个head中value的维度。 + :param num_head: int,head的数量。 + :param dropout: float。 + """ super(MultiHeadAttention, self).__init__() self.input_size = input_size self.key_size = key_size @@ -63,17 +68,16 @@ class MultiHeadAttention(nn.Module): self.q_in = nn.Linear(input_size, in_size) self.k_in = nn.Linear(input_size, in_size) self.v_in = nn.Linear(input_size, in_size) - # follow the paper, do not apply dropout within dot-product self.attention = 
DotAttention(key_size=key_size, value_size=value_size, dropout=dropout) self.out = nn.Linear(value_size * num_head, input_size) self.reset_parameters() def reset_parameters(self): sqrt = math.sqrt - nn.init.normal_(self.q_in.weight, mean=0, std=sqrt(2.0 / (self.input_size + self.key_size))) - nn.init.normal_(self.k_in.weight, mean=0, std=sqrt(2.0 / (self.input_size + self.key_size))) - nn.init.normal_(self.v_in.weight, mean=0, std=sqrt(2.0 / (self.input_size + self.value_size))) - nn.init.xavier_normal_(self.out.weight) + nn.init.normal_(self.q_in.weight, mean=0, std=sqrt(1.0 / self.input_size)) + nn.init.normal_(self.k_in.weight, mean=0, std=sqrt(1.0 / self.input_size)) + nn.init.normal_(self.v_in.weight, mean=0, std=sqrt(1.0 / self.input_size)) + nn.init.normal_(self.out.weight, mean=0, std=sqrt(1.0 / self.input_size)) def forward(self, Q, K, V, atte_mask_out=None): """ @@ -87,101 +91,102 @@ class MultiHeadAttention(nn.Module): sk = K.size(1) d_k, d_v, n_head = self.key_size, self.value_size, self.num_head # input linear - q = self.q_in(Q).view(batch, sq, n_head, d_k) - k = self.k_in(K).view(batch, sk, n_head, d_k) - v = self.v_in(V).view(batch, sk, n_head, d_v) - - # transpose q, k and v to do batch attention - q = q.permute(2, 0, 1, 3).contiguous().view(-1, sq, d_k) - k = k.permute(2, 0, 1, 3).contiguous().view(-1, sk, d_k) - v = v.permute(2, 0, 1, 3).contiguous().view(-1, sk, d_v) + q = self.q_in(Q).view(batch, sq, n_head, d_k).transpose(1, 2) + k = self.k_in(K).view(batch, sk, n_head, d_k).transpose(1, 2) + v = self.v_in(V).view(batch, sk, n_head, d_v).transpose(1, 2) + if atte_mask_out is not None: - atte_mask_out = atte_mask_out.repeat(n_head, 1, 1) - atte = self.attention(q, k, v, atte_mask_out).view(n_head, batch, sq, d_v) + atte_mask_out = atte_mask_out[:,None,:,:] # [bsz,1,1,len] + atte = self.attention(q, k, v, atte_mask_out).view(batch, n_head, sq, d_v) # concat all heads, do output linear - atte = atte.permute(1, 2, 0, 3).contiguous().view(batch, sq, -1) + atte = atte.transpose(1, 2).contiguous().view(batch, sq, -1) output = self.out(atte) return output +def _masked_softmax(tensor, mask): + tensor_shape = tensor.size() + reshaped_tensor = tensor.view(-1, tensor_shape[-1]) + + # Reshape the mask so it matches the size of the input tensor. + while mask.dim() < tensor.dim(): + mask = mask.unsqueeze(1) + mask = mask.expand_as(tensor).contiguous().float() + reshaped_mask = mask.view(-1, mask.size()[-1]) + result = F.softmax(reshaped_tensor * reshaped_mask, dim=-1) + result = result * reshaped_mask + # 1e-13 is added to avoid divisions by zero. + result = result / (result.sum(dim=-1, keepdim=True) + 1e-13) + return result.view(*tensor_shape) + + +def _weighted_sum(tensor, weights, mask): + w_sum = weights.bmm(tensor) + while mask.dim() < w_sum.dim(): + mask = mask.unsqueeze(1) + mask = mask.transpose(-1, -2) + mask = mask.expand_as(w_sum).contiguous().float() + return w_sum * mask + + class BiAttention(nn.Module): - r"""Bi Attention module - - .. todo:: - 这个模块的负责人来继续完善一下 - - Calculate Bi Attention matrix `e` - + r""" + Bi Attention module + + 对于给定的两个向量序列 :math:`a_i` 和 :math:`b_j` , BiAttention模块将通过以下的公式来计算attention结果 + .. 
math:: - + \begin{array}{ll} \\ - e_ij = {a}^{\mathbf{T}}_{i}{b}_{j} \\ - a_i = - b_j = + e_{ij} = {a}^{\mathrm{T}}_{i}{b}_{j} \\ + {\hat{a}}_{i} = \sum_{j=1}^{\mathcal{l}_{b}}{\frac{\mathrm{exp}(e_{ij})}{\sum_{k=1}^{\mathcal{l}_{b}}{\mathrm{exp}(e_{ik})}}}{b}_{j} \\ + {\hat{b}}_{j} = \sum_{i=1}^{\mathcal{l}_{a}}{\frac{\mathrm{exp}(e_{ij})}{\sum_{k=1}^{\mathcal{l}_{a}}{\mathrm{exp}(e_{ik})}}}{a}_{i} \\ \end{array} - - """ - def __init__(self): - super(BiAttention, self).__init__() - self.inf = 10e12 + """ - def forward(self, in_x1, in_x2, x1_len, x2_len): + def forward(self, premise_batch, premise_mask, hypothesis_batch, hypothesis_mask): """ - :param torch.Tensor in_x1: [batch_size, x1_seq_len, hidden_size] 第一句的特征表示 - :param torch.Tensor in_x2: [batch_size, x2_seq_len, hidden_size] 第二句的特征表示 - :param torch.Tensor x1_len: [batch_size, x1_seq_len] 第一句的0/1mask矩阵 - :param torch.Tensor x2_len: [batch_size, x2_seq_len] 第二句的0/1mask矩阵 - :return: torch.Tensor out_x1: [batch_size, x1_seq_len, hidden_size] 第一句attend到的特征表示 - torch.Tensor out_x2: [batch_size, x2_seq_len, hidden_size] 第一句attend到的特征表示 - + :param torch.Tensor premise_batch: [batch_size, a_seq_len, hidden_size] + :param torch.Tensor premise_mask: [batch_size, a_seq_len] + :param torch.Tensor hypothesis_batch: [batch_size, b_seq_len, hidden_size] + :param torch.Tensor hypothesis_mask: [batch_size, b_seq_len] + :return: torch.Tensor attended_premises: [batch_size, a_seq_len, hidden_size] torch.Tensor attended_hypotheses: [batch_size, b_seq_len, hidden_size] """ + similarity_matrix = premise_batch.bmm(hypothesis_batch.transpose(2, 1) + .contiguous()) - assert in_x1.size()[0] == in_x2.size()[0] - assert in_x1.size()[2] == in_x2.size()[2] - # The batch size and hidden size must be equal. - assert in_x1.size()[1] == x1_len.size()[1] and in_x2.size()[1] == x2_len.size()[1] - # The seq len in in_x and x_len must be equal. - assert in_x1.size()[0] == x1_len.size()[0] and x1_len.size()[0] == x2_len.size()[0] - - batch_size = in_x1.size()[0] - x1_max_len = in_x1.size()[1] - x2_max_len = in_x2.size()[1] - - in_x2_t = torch.transpose(in_x2, 1, 2) # [batch_size, hidden_size, x2_seq_len] + prem_hyp_attn = _masked_softmax(similarity_matrix, hypothesis_mask) + hyp_prem_attn = _masked_softmax(similarity_matrix.transpose(1, 2) + .contiguous(), + premise_mask) - attention_matrix = torch.bmm(in_x1, in_x2_t) # [batch_size, x1_seq_len, x2_seq_len] + attended_premises = _weighted_sum(hypothesis_batch, + prem_hyp_attn, + premise_mask) + attended_hypotheses = _weighted_sum(premise_batch, + hyp_prem_attn, + hypothesis_mask) - a_mask = x1_len.le(0.5).float() * -self.inf # [batch_size, x1_seq_len] - a_mask = a_mask.view(batch_size, x1_max_len, -1) - a_mask = a_mask.expand(-1, -1, x2_max_len) # [batch_size, x1_seq_len, x2_seq_len] - b_mask = x2_len.le(0.5).float() * -self.inf - b_mask = b_mask.view(batch_size, -1, x2_max_len) - b_mask = b_mask.expand(-1, x1_max_len, -1) # [batch_size, x1_seq_len, x2_seq_len] - - attention_a = F.softmax(attention_matrix + a_mask, dim=2) # [batch_size, x1_seq_len, x2_seq_len] - attention_b = F.softmax(attention_matrix + b_mask, dim=1) # [batch_size, x1_seq_len, x2_seq_len] - - out_x1 = torch.bmm(attention_a, in_x2) # [batch_size, x1_seq_len, hidden_size] - attention_b_t = torch.transpose(attention_b, 1, 2) - out_x2 = torch.bmm(attention_b_t, in_x1) # [batch_size, x2_seq_len, hidden_size] - - return out_x1, out_x2 + return attended_premises, attended_hypotheses class SelfAttention(nn.Module): """ - Self Attention Module. 
- - :param int input_size: 输入tensor的hidden维度 - :param int attention_unit: 输出tensor的hidden维度 - :param int attention_hops: - :param float drop: dropout概率,默认值为0.5 - :param str initial_method: 初始化参数方法 + 这是一个基于论文 `A structured self-attentive sentence embedding `_ + 的Self Attention Module. """ def __init__(self, input_size, attention_unit=300, attention_hops=10, drop=0.5, initial_method=None, ): + """ + + :param int input_size: 输入tensor的hidden维度 + :param int attention_unit: 输出tensor的hidden维度 + :param int attention_hops: + :param float drop: dropout概率,默认值为0.5 + :param str initial_method: 初始化参数方法 + """ super(SelfAttention, self).__init__() self.attention_hops = attention_hops @@ -209,9 +214,9 @@ class SelfAttention(nn.Module): def forward(self, input, input_origin): """ - :param torch.Tensor input: [baz, senLen, h_dim] 要做attention的矩阵 - :param torch.Tensor input_origin: [baz , senLen] 原始token的index组成的矩阵,含有pad部分内容 - :return torch.Tensor output1: [baz, multi-head , h_dim] 经过attention操作后输入矩阵的结果 + :param torch.Tensor input: [batch_size, seq_len, hidden_size] 要做attention的矩阵 + :param torch.Tensor input_origin: [batch_size, seq_len] 原始token的index组成的矩阵,含有pad部分内容 + :return torch.Tensor output1: [batch_size, multi-head, hidden_size] 经过attention操作后输入矩阵的结果 :return torch.Tensor output2: [1] attention惩罚项,是一个标量 """ input = input.contiguous() diff --git a/fastNLP/modules/encoder/bert.py b/fastNLP/modules/encoder/bert.py index ce175df1..821b9c5c 100644 --- a/fastNLP/modules/encoder/bert.py +++ b/fastNLP/modules/encoder/bert.py @@ -1,4 +1,4 @@ -""" +"""undocumented 这个页面的代码很大程度上参考(复制粘贴)了https://github.com/huggingface/pytorch-pretrained-BERT的代码, 如果你发现该代码对你 有用,也请引用一下他们。 """ @@ -8,22 +8,40 @@ __all__ = [ ] import collections - -import unicodedata import copy import json import math import os +import unicodedata import torch from torch import nn -import sys +import numpy as np from ..utils import _get_file_name_base_on_postfix +from ...io.file_utils import _get_embedding_url, cached_path, PRETRAINED_BERT_MODEL_DIR +from ...core import logger CONFIG_FILE = 'bert_config.json' VOCAB_NAME = 'vocab.txt' +BERT_KEY_RENAME_MAP_1 = { + 'gamma': 'weight', + 'beta': 'bias', + 'distilbert.embeddings': 'bert.embeddings', + 'distilbert.transformer': 'bert.encoder', +} + +BERT_KEY_RENAME_MAP_2 = { + 'q_lin': 'self.query', + 'k_lin': 'self.key', + 'v_lin': 'self.value', + 'out_lin': 'output.dense', + 'sa_layer_norm': 'attention.output.LayerNorm', + 'ffn.lin1': 'intermediate.dense', + 'ffn.lin2': 'output.dense', + 'output_layer_norm': 'output.LayerNorm', +} class BertConfig(object): @@ -134,6 +152,19 @@ def swish(x): ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish} +def _get_bert_dir(model_dir_or_name: str = 'en-base-uncased'): + if model_dir_or_name.lower() in PRETRAINED_BERT_MODEL_DIR: + model_url = _get_embedding_url('bert', model_dir_or_name.lower()) + model_dir = cached_path(model_url, name='embedding') + # 检查是否存在 + elif os.path.isdir(os.path.abspath(os.path.expanduser(model_dir_or_name))): + model_dir = os.path.abspath(os.path.expanduser(model_dir_or_name)) + else: + logger.error(f"Cannot recognize BERT dir or name ``{model_dir_or_name}``.") + raise ValueError(f"Cannot recognize BERT dir or name ``{model_dir_or_name}``.") + return str(model_dir) + + class BertLayerNorm(nn.Module): def __init__(self, hidden_size, eps=1e-12): """Construct a layernorm module in the TF style (epsilon inside the square root). 
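A minimal usage sketch of the name-or-directory resolution introduced in the hunk above, assuming the 'en-base-uncased' alias is one of the keys registered in PRETRAINED_BERT_MODEL_DIR and that a local directory holds the expected .json config, .bin weights and vocab.txt files:

from fastNLP.modules.encoder.bert import BertModel, BertTokenizer

# a registered alias is resolved by _get_bert_dir, which downloads and caches the files via _get_embedding_url + cached_path
model = BertModel.from_pretrained('en-base-uncased')
tokenizer = BertTokenizer.from_pretrained('en-base-uncased')

# a local directory is used as-is after expanding the user path; the placeholder path below is illustrative only
model = BertModel.from_pretrained('/path/to/bert/dir')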
@@ -150,6 +181,55 @@ class BertLayerNorm(nn.Module): return self.weight * x + self.bias +class DistilBertEmbeddings(nn.Module): + def __init__(self, config): + super(DistilBertEmbeddings, self).__init__() + + def create_sinusoidal_embeddings(n_pos, dim, out): + position_enc = np.array([ + [pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] + for pos in range(n_pos) + ]) + out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2])) + out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2])) + out.detach_() + out.requires_grad = False + + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + if config.sinusoidal_pos_embds: + create_sinusoidal_embeddings(n_pos=config.max_position_embeddings, + dim=config.hidden_size, + out=self.position_embeddings.weight) + + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=1e-12) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, input_ids, token_type_ids): + """ + Parameters + ---------- + input_ids: torch.tensor(bs, max_seq_length) + The token ids to embed. + token_type_ids: no used. + Outputs + ------- + embeddings: torch.tensor(bs, max_seq_length, dim) + The embedded tokens (plus position embeddings, no token_type embeddings) + """ + seq_length = input_ids.size(1) + position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) # (max_seq_length) + position_ids = position_ids.unsqueeze(0).expand_as(input_ids) # (bs, max_seq_length) + + word_embeddings = self.word_embeddings(input_ids) # (bs, max_seq_length, dim) + position_embeddings = self.position_embeddings(position_ids) # (bs, max_seq_length, dim) + + embeddings = word_embeddings + position_embeddings # (bs, max_seq_length, dim) + embeddings = self.LayerNorm(embeddings) # (bs, max_seq_length, dim) + embeddings = self.dropout(embeddings) # (bs, max_seq_length, dim) + return embeddings + + class BertEmbeddings(nn.Module): """Construct the embeddings from word, position and token_type embeddings. """ @@ -336,31 +416,11 @@ class BertPooler(nn.Module): class BertModel(nn.Module): """ - 别名::class:`fastNLP.modules.BertModel` :class:`fastNLP.modules.encoder.BertModel` - BERT(Bidirectional Embedding Representations from Transformers). - 如果你想使用预训练好的权重矩阵,请在以下网址下载. 
- sources:: - - 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin", - 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-pytorch_model.bin", - 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin", - 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-pytorch_model.bin", - 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-pytorch_model.bin", - 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-pytorch_model.bin", - 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-pytorch_model.bin", - 'bert-base-german-cased': "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-pytorch_model.bin", - 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-pytorch_model.bin", - 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-pytorch_model.bin", - 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-pytorch_model.bin", - 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-pytorch_model.bin", - 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin" - - 用预训练权重矩阵来建立BERT模型:: - model = BertModel.from_pretrained("path/to/weights/directory") + model = BertModel.from_pretrained(model_dir_or_name) 用随机初始化权重矩阵来建立BERT模型:: @@ -391,9 +451,22 @@ class BertModel(nn.Module): super(BertModel, self).__init__() self.config = config self.hidden_size = self.config.hidden_size - self.embeddings = BertEmbeddings(config) + self.model_type = 'bert' + if hasattr(config, 'sinusoidal_pos_embds'): + self.model_type = 'distilbert' + elif 'model_type' in kwargs: + self.model_type = kwargs['model_type'].lower() + + if self.model_type == 'distilbert': + self.embeddings = DistilBertEmbeddings(config) + else: + self.embeddings = BertEmbeddings(config) + self.encoder = BertEncoder(config) - self.pooler = BertPooler(config) + if self.model_type != 'distilbert': + self.pooler = BertPooler(config) + else: + logger.info('DistilBert has NOT pooler, will use hidden states of [CLS] token as pooled output.') self.apply(self.init_bert_weights) def init_bert_weights(self, module): @@ -435,41 +508,69 @@ class BertModel(nn.Module): extended_attention_mask, output_all_encoded_layers=output_all_encoded_layers) sequence_output = encoded_layers[-1] - pooled_output = self.pooler(sequence_output) + if self.model_type != 'distilbert': + pooled_output = self.pooler(sequence_output) + else: + pooled_output = sequence_output[:, 0] if not output_all_encoded_layers: encoded_layers = encoded_layers[-1] return encoded_layers, pooled_output @classmethod - def from_pretrained(cls, pretrained_model_dir, *inputs, **kwargs): + def from_pretrained(cls, model_dir_or_name, *inputs, **kwargs): state_dict = kwargs.get('state_dict', None) kwargs.pop('state_dict', None) kwargs.pop('cache_dir', None) kwargs.pop('from_tf', None) + + # 
get model dir from name or dir + pretrained_model_dir = _get_bert_dir(model_dir_or_name) + # Load config config_file = _get_file_name_base_on_postfix(pretrained_model_dir, '.json') config = BertConfig.from_json_file(config_file) - # logger.info("Model config {}".format(config)) - # Instantiate model. - model = cls(config, *inputs, **kwargs) + if state_dict is None: weights_path = _get_file_name_base_on_postfix(pretrained_model_dir, '.bin') state_dict = torch.load(weights_path, map_location='cpu') + else: + logger.error(f'Cannot load parameters through `state_dict` variable.') + raise RuntimeError(f'Cannot load parameters through `state_dict` variable.') + model_type = 'BERT' old_keys = [] new_keys = [] for key in state_dict.keys(): new_key = None - if 'gamma' in key: - new_key = key.replace('gamma', 'weight') - if 'beta' in key: - new_key = key.replace('beta', 'bias') + for key_name in BERT_KEY_RENAME_MAP_1: + if key_name in key: + new_key = key.replace(key_name, BERT_KEY_RENAME_MAP_1[key_name]) + if 'distilbert' in key: + model_type = 'DistilBert' + break if new_key: old_keys.append(key) new_keys.append(new_key) for old_key, new_key in zip(old_keys, new_keys): state_dict[new_key] = state_dict.pop(old_key) + old_keys = [] + new_keys = [] + for key in state_dict.keys(): + new_key = None + for key_name in BERT_KEY_RENAME_MAP_2: + if key_name in key: + new_key = key.replace(key_name, BERT_KEY_RENAME_MAP_2[key_name]) + break + if new_key: + old_keys.append(key) + new_keys.append(new_key) + for old_key, new_key in zip(old_keys, new_keys): + state_dict[new_key] = state_dict.pop(old_key) + + # Instantiate model. + model = cls(config, model_type=model_type, *inputs, **kwargs) + missing_keys = [] unexpected_keys = [] error_msgs = [] @@ -489,11 +590,13 @@ class BertModel(nn.Module): load(model, prefix='' if hasattr(model, 'bert') else 'bert.') if len(missing_keys) > 0: - print("Weights of {} not initialized from pretrained model: {}".format( + logger.warning("Weights of {} not initialized from pretrained model: {}".format( model.__class__.__name__, missing_keys)) if len(unexpected_keys) > 0: - print("Weights from pretrained model not used in {}: {}".format( + logger.warning("Weights from pretrained model not used in {}: {}".format( model.__class__.__name__, unexpected_keys)) + + logger.info(f"Load pre-trained {model_type} parameters from file {weights_path}.") return model @@ -563,6 +666,8 @@ class WordpieceTokenizer(object): output_tokens.append(self.unk_token) else: output_tokens.extend(sub_tokens) + if len(output_tokens) == 0: # 防止里面全是空格或者回车符号 + return [self.unk_token] return output_tokens @@ -672,14 +777,14 @@ class BasicTokenizer(object): # as is Japanese Hiragana and Katakana. Those alphabets are used to write # space-separated words, so they are not treated specially and handled # like the all of the other languages. 
- if ((cp >= 0x4E00 and cp <= 0x9FFF) or # - (cp >= 0x3400 and cp <= 0x4DBF) or # - (cp >= 0x20000 and cp <= 0x2A6DF) or # - (cp >= 0x2A700 and cp <= 0x2B73F) or # - (cp >= 0x2B740 and cp <= 0x2B81F) or # - (cp >= 0x2B820 and cp <= 0x2CEAF) or - (cp >= 0xF900 and cp <= 0xFAFF) or # - (cp >= 0x2F800 and cp <= 0x2FA1F)): # + if (((cp >= 0x4E00) and (cp <= 0x9FFF)) or # + ((cp >= 0x3400) and (cp <= 0x4DBF)) or # + ((cp >= 0x20000) and (cp <= 0x2A6DF)) or # + ((cp >= 0x2A700) and (cp <= 0x2B73F)) or # + ((cp >= 0x2B740) and (cp <= 0x2B81F)) or # + ((cp >= 0x2B820) and (cp <= 0x2CEAF)) or + ((cp >= 0xF900) and (cp <= 0xFAFF)) or # + ((cp >= 0x2F800) and (cp <= 0x2FA1F))): # return True return False @@ -729,8 +834,8 @@ def _is_punctuation(char): # Characters such as "^", "$", and "`" are not in the Unicode # Punctuation class but we treat them as punctuation anyways, for # consistency. - if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or - (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + if (((cp >= 33) and (cp <= 47)) or ((cp >= 58) and (cp <= 64)) or + ((cp >= 91) and (cp <= 96)) or ((cp >= 123) and (cp <= 126))): return True cat = unicodedata.category(char) if cat.startswith("P"): @@ -797,7 +902,7 @@ class BertTokenizer(object): for token in tokens: ids.append(self.vocab[token]) if len(ids) > self.max_len: - print( + logger.warning( "Token indices sequence length is longer than the specified maximum " " sequence length for this BERT model ({} > {}). Running this" " sequence through BERT will result in indexing errors".format(len(ids), self.max_len) @@ -821,38 +926,39 @@ class BertTokenizer(object): with open(vocab_file, "w", encoding="utf-8") as writer: for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): if index != token_index: - print("Saving vocabulary to {}: vocabulary indices are not consecutive." - " Please check that the vocabulary is not corrupted!".format(vocab_file)) + logger.warning("Saving vocabulary to {}: vocabulary indices are not consecutive." + " Please check that the vocabulary is not corrupted!".format(vocab_file)) index = token_index writer.write(token + u'\n') index += 1 return vocab_file @classmethod - def from_pretrained(cls, model_dir, *inputs, **kwargs): + def from_pretrained(cls, model_dir_or_name, *inputs, **kwargs): """ - 给定path,直接读取vocab. - + 给定模型的名字或者路径,直接读取vocab. """ + model_dir = _get_bert_dir(model_dir_or_name) pretrained_model_name_or_path = _get_file_name_base_on_postfix(model_dir, '.txt') - print("loading vocabulary file {}".format(pretrained_model_name_or_path)) + logger.info("loading vocabulary file {}".format(pretrained_model_name_or_path)) max_len = 512 kwargs['max_len'] = min(kwargs.get('max_position_embeddings', int(1e12)), max_len) # Instantiate tokenizer. tokenizer = cls(pretrained_model_name_or_path, *inputs, **kwargs) return tokenizer + class _WordPieceBertModel(nn.Module): """ 这个模块用于直接计算word_piece的结果. 
""" - def __init__(self, model_dir: str, layers: str = '-1'): + def __init__(self, model_dir_or_name: str, layers: str = '-1', pooled_cls: bool=False): super().__init__() - self.tokenzier = BertTokenizer.from_pretrained(model_dir) - self.encoder = BertModel.from_pretrained(model_dir) + self.tokenzier = BertTokenizer.from_pretrained(model_dir_or_name) + self.encoder = BertModel.from_pretrained(model_dir_or_name) # 检查encoder_layer_number是否合理 encoder_layer_number = len(self.encoder.encoder.layer) self.layers = list(map(int, layers.split(','))) @@ -866,9 +972,11 @@ class _WordPieceBertModel(nn.Module): self._cls_index = self.tokenzier.vocab['[CLS]'] self._sep_index = self.tokenzier.vocab['[SEP]'] + self._wordpiece_unknown_index = self.tokenzier.vocab['[UNK]'] self._wordpiece_pad_index = self.tokenzier.vocab['[PAD]'] # 需要用于生成word_piece + self.pooled_cls = pooled_cls - def index_dataset(self, *datasets, field_name): + def index_dataset(self, *datasets, field_name, add_cls_sep=True): """ 使用bert的tokenizer新生成word_pieces列加入到datasets中,并将他们设置为input。如果首尾不是 [CLS]与[SEP]会在首尾额外加入[CLS]与[SEP], 且将word_pieces这一列的pad value设置为了bert的pad value。 @@ -884,10 +992,11 @@ class _WordPieceBertModel(nn.Module): tokens = self.tokenzier.wordpiece_tokenizer.tokenize(word) word_piece_ids = self.tokenzier.convert_tokens_to_ids(tokens) word_pieces.extend(word_piece_ids) - if word_pieces[0] != self._cls_index: - word_pieces.insert(0, self._cls_index) - if word_pieces[-1] != self._sep_index: - word_pieces.insert(-1, self._sep_index) + if add_cls_sep: + if word_pieces[0] != self._cls_index: + word_pieces.insert(0, self._cls_index) + if word_pieces[-1] != self._sep_index: + word_pieces.insert(-1, self._sep_index) return word_pieces for index, dataset in enumerate(datasets): @@ -896,7 +1005,7 @@ class _WordPieceBertModel(nn.Module): is_input=True) dataset.set_pad_val('word_pieces', self._wordpiece_pad_index) except Exception as e: - print(f"Exception happens when processing the {index} dataset.") + logger.error(f"Exception happens when processing the {index} dataset.") raise e def forward(self, word_pieces, token_type_ids=None): @@ -909,10 +1018,13 @@ class _WordPieceBertModel(nn.Module): batch_size, max_len = word_pieces.size() attn_masks = word_pieces.ne(self._wordpiece_pad_index) - bert_outputs, _ = self.encoder(word_pieces, token_type_ids=token_type_ids, attention_mask=attn_masks, - output_all_encoded_layers=True) + bert_outputs, pooled_cls = self.encoder(word_pieces, token_type_ids=token_type_ids, attention_mask=attn_masks, + output_all_encoded_layers=True) # output_layers = [self.layers] # len(self.layers) x batch_size x max_word_piece_length x hidden_size outputs = bert_outputs[0].new_zeros((len(self.layers), batch_size, max_len, bert_outputs[0].size(-1))) for l_index, l in enumerate(self.layers): - outputs[l_index] = bert_outputs[l] + bert_output = bert_outputs[l] + if l in (len(bert_outputs)-1, -1) and self.pooled_cls: + bert_output[:, 0] = pooled_cls + outputs[l_index] = bert_output return outputs diff --git a/fastNLP/modules/encoder/char_encoder.py b/fastNLP/modules/encoder/char_encoder.py index 6a6e1470..786a2467 100644 --- a/fastNLP/modules/encoder/char_encoder.py +++ b/fastNLP/modules/encoder/char_encoder.py @@ -1,3 +1,5 @@ +"""undocumented""" + __all__ = [ "ConvolutionCharEncoder", "LSTMCharEncoder" @@ -11,18 +13,19 @@ from ..utils import initial_parameter # from torch.nn.init import xavier_uniform class ConvolutionCharEncoder(nn.Module): """ - 别名::class:`fastNLP.modules.ConvolutionCharEncoder` 
:class:`fastNLP.modules.encoder.ConvolutionCharEncoder` - char级别的卷积编码器. - :param int char_emb_size: char级别embedding的维度. Default: 50 - :例: 有26个字符, 每一个的embedding是一个50维的向量, 所以输入的向量维度为50. - :param tuple feature_maps: 一个由int组成的tuple. tuple的长度是char级别卷积操作的数目, 第`i`个int表示第`i`个卷积操作的filter. - :param tuple kernels: 一个由int组成的tuple. tuple的长度是char级别卷积操作的数目, 第`i`个int表示第`i`个卷积操作的卷积核. - :param initial_method: 初始化参数的方式, 默认为`xavier normal` """ def __init__(self, char_emb_size=50, feature_maps=(40, 30, 30), kernels=(1, 3, 5), initial_method=None): + """ + + :param int char_emb_size: char级别embedding的维度. Default: 50 + :例: 有26个字符, 每一个的embedding是一个50维的向量, 所以输入的向量维度为50. + :param tuple feature_maps: 一个由int组成的tuple. tuple的长度是char级别卷积操作的数目, 第`i`个int表示第`i`个卷积操作的filter. + :param tuple kernels: 一个由int组成的tuple. tuple的长度是char级别卷积操作的数目, 第`i`个int表示第`i`个卷积操作的卷积核. + :param initial_method: 初始化参数的方式, 默认为`xavier normal` + """ super(ConvolutionCharEncoder, self).__init__() self.convs = nn.ModuleList([ nn.Conv2d(1, feature_maps[i], kernel_size=(char_emb_size, kernels[i]), bias=True, @@ -58,11 +61,7 @@ class ConvolutionCharEncoder(nn.Module): class LSTMCharEncoder(nn.Module): """ - 别名::class:`fastNLP.modules.LSTMCharEncoder` :class:`fastNLP.modules.encoder.LSTMCharEncoder` - char级别基于LSTM的encoder. - - """ def __init__(self, char_emb_size=50, hidden_size=None, initial_method=None): diff --git a/fastNLP/modules/encoder/conv_maxpool.py b/fastNLP/modules/encoder/conv_maxpool.py index 8ce6b163..f19a92f3 100644 --- a/fastNLP/modules/encoder/conv_maxpool.py +++ b/fastNLP/modules/encoder/conv_maxpool.py @@ -1,3 +1,5 @@ +"""undocumented""" + __all__ = [ "ConvMaxpool" ] @@ -8,19 +10,20 @@ import torch.nn.functional as F class ConvMaxpool(nn.Module): """ - 别名::class:`fastNLP.modules.ConvMaxpool` :class:`fastNLP.modules.encoder.ConvMaxpool` - 集合了Convolution和Max-Pooling于一体的层。给定一个batch_size x max_len x input_size的输入,返回batch_size x sum(output_channels) 大小的matrix。在内部,是先使用CNN给输入做卷积,然后经过activation激活层,在通过在长度(max_len) 这一维进行max_pooling。最后得到每个sample的一个向量表示。 - :param int in_channels: 输入channel的大小,一般是embedding的维度; 或encoder的output维度 - :param int,tuple(int) out_channels: 输出channel的数量。如果为list,则需要与kernel_sizes的数量保持一致 - :param int,tuple(int) kernel_sizes: 输出channel的kernel大小。 - :param str activation: Convolution后的结果将通过该activation后再经过max-pooling。支持relu, sigmoid, tanh """ def __init__(self, in_channels, out_channels, kernel_sizes, activation="relu"): + """ + + :param int in_channels: 输入channel的大小,一般是embedding的维度; 或encoder的output维度 + :param int,tuple(int) out_channels: 输出channel的数量。如果为list,则需要与kernel_sizes的数量保持一致 + :param int,tuple(int) kernel_sizes: 输出channel的kernel大小。 + :param str activation: Convolution后的结果将通过该activation后再经过max-pooling。支持relu, sigmoid, tanh + """ super(ConvMaxpool, self).__init__() for kernel_size in kernel_sizes: diff --git a/fastNLP/modules/encoder/lstm.py b/fastNLP/modules/encoder/lstm.py index e2358132..06b437ef 100644 --- a/fastNLP/modules/encoder/lstm.py +++ b/fastNLP/modules/encoder/lstm.py @@ -1,7 +1,8 @@ -""" +"""undocumented 轻量封装的 Pytorch LSTM 模块. 可在 forward 时传入序列的长度, 自动对padding做合适的处理. """ + __all__ = [ "LSTM" ] @@ -13,23 +14,24 @@ import torch.nn.utils.rnn as rnn class LSTM(nn.Module): """ - 别名::class:`fastNLP.modules.LSTM` :class:`fastNLP.modules.encoder.LSTM` - LSTM 模块, 轻量封装的Pytorch LSTM. 在提供seq_len的情况下,将自动使用pack_padded_sequence; 同时默认将forget gate的bias初始化 - 为1; 且可以应对DataParallel中LSTM的使用问题。 + 为1; 且可以应对DataParallel中LSTM的使用问题。 - :param input_size: 输入 `x` 的特征维度 - :param hidden_size: 隐状态 `h` 的特征维度. - :param num_layers: rnn的层数. 
Default: 1 - :param dropout: 层间dropout概率. Default: 0 - :param bidirectional: 若为 ``True``, 使用双向的RNN. Default: ``False`` - :param batch_first: 若为 ``True``, 输入和输出 ``Tensor`` 形状为 - :(batch, seq, feature). Default: ``False`` - :param bias: 如果为 ``False``, 模型将不会使用bias. Default: ``True`` """ def __init__(self, input_size, hidden_size=100, num_layers=1, dropout=0.0, batch_first=True, bidirectional=False, bias=True): + """ + + :param input_size: 输入 `x` 的特征维度 + :param hidden_size: 隐状态 `h` 的特征维度. + :param num_layers: rnn的层数. Default: 1 + :param dropout: 层间dropout概率. Default: 0 + :param bidirectional: 若为 ``True``, 使用双向的RNN. Default: ``False`` + :param batch_first: 若为 ``True``, 输入和输出 ``Tensor`` 形状为 + :(batch, seq, feature). Default: ``False`` + :param bias: 如果为 ``False``, 模型将不会使用bias. Default: ``True`` + """ super(LSTM, self).__init__() self.batch_first = batch_first self.lstm = nn.LSTM(input_size, hidden_size, num_layers, bias=bias, batch_first=batch_first, diff --git a/fastNLP/modules/encoder/pooling.py b/fastNLP/modules/encoder/pooling.py index d8aa54ad..80ff419d 100644 --- a/fastNLP/modules/encoder/pooling.py +++ b/fastNLP/modules/encoder/pooling.py @@ -1,6 +1,9 @@ +"""undocumented""" + __all__ = [ "MaxPool", "MaxPoolWithMask", + "KMaxPool", "AvgPool", "AvgPoolWithMask" ] @@ -10,22 +13,22 @@ import torch.nn as nn class MaxPool(nn.Module): """ - 别名::class:`fastNLP.modules.MaxPool` :class:`fastNLP.modules.encoder.MaxPool` - Max-pooling模块。 - :param stride: 窗口移动大小,默认为kernel_size - :param padding: padding的内容,默认为0 - :param dilation: 控制窗口内元素移动距离的大小 - :param dimension: MaxPool的维度,支持1,2,3维。 - :param kernel_size: max pooling的窗口大小,默认为tensor最后k维,其中k为dimension - :param ceil_mode: """ def __init__(self, stride=None, padding=0, dilation=1, dimension=1, kernel_size=None, ceil_mode=False): - + """ + + :param stride: 窗口移动大小,默认为kernel_size + :param padding: padding的内容,默认为0 + :param dilation: 控制窗口内元素移动距离的大小 + :param dimension: MaxPool的维度,支持1,2,3维。 + :param kernel_size: max pooling的窗口大小,默认为tensor最后k维,其中k为dimension + :param ceil_mode: + """ super(MaxPool, self).__init__() - assert (1 <= dimension) and (dimension <= 3) + assert dimension in [1, 2, 3], f'Now we only support 1d, 2d, or 3d Pooling' self.dimension = dimension self.stride = stride self.padding = padding @@ -35,12 +38,12 @@ class MaxPool(nn.Module): def forward(self, x): if self.dimension == 1: + x = torch.transpose(x, 1, 2) # [N,L,C] -> [N,C,L] pooling = nn.MaxPool1d( stride=self.stride, padding=self.padding, dilation=self.dilation, kernel_size=self.kernel_size if self.kernel_size is not None else x.size(-1), return_indices=False, ceil_mode=self.ceil_mode ) - x = torch.transpose(x, 1, 2) # [N,L,C] -> [N,C,L] elif self.dimension == 2: pooling = nn.MaxPool2d( stride=self.stride, padding=self.padding, dilation=self.dilation, @@ -48,7 +51,7 @@ class MaxPool(nn.Module): return_indices=False, ceil_mode=self.ceil_mode ) else: - pooling = nn.MaxPool2d( + pooling = nn.MaxPool3d( stride=self.stride, padding=self.padding, dilation=self.dilation, kernel_size=self.kernel_size if self.kernel_size is not None else (x.size(-3), x.size(-2), x.size(-1)), return_indices=False, ceil_mode=self.ceil_mode @@ -59,8 +62,6 @@ class MaxPool(nn.Module): class MaxPoolWithMask(nn.Module): """ - 别名::class:`fastNLP.modules.MaxPoolWithMask` :class:`fastNLP.modules.encoder.MaxPoolWithMask` - 带mask矩阵的max pooling。在做max-pooling的时候不会考虑mask值为0的位置。 """ @@ -99,8 +100,6 @@ class KMaxPool(nn.Module): class AvgPool(nn.Module): """ - 别名::class:`fastNLP.modules.AvgPool` 
:class:`fastNLP.modules.encoder.AvgPool` - 给定形如[batch_size, max_len, hidden_size]的输入,在最后一维进行avg pooling. 输出为[batch_size, hidden_size] """ @@ -126,8 +125,6 @@ class AvgPool(nn.Module): class AvgPoolWithMask(nn.Module): """ - 别名::class:`fastNLP.modules.AvgPoolWithMask` :class:`fastNLP.modules.encoder.AvgPoolWithMask` - 给定形如[batch_size, max_len, hidden_size]的输入,在最后一维进行avg pooling. 输出为[batch_size, hidden_size], pooling 的时候只会考虑mask为1的位置 """ diff --git a/fastNLP/modules/encoder/star_transformer.py b/fastNLP/modules/encoder/star_transformer.py index 3927a494..85b1ac4d 100644 --- a/fastNLP/modules/encoder/star_transformer.py +++ b/fastNLP/modules/encoder/star_transformer.py @@ -1,6 +1,7 @@ -""" +"""undocumented Star-Transformer 的encoder部分的 Pytorch 实现 """ + __all__ = [ "StarTransformer" ] @@ -13,24 +14,24 @@ from torch.nn import functional as F class StarTransformer(nn.Module): """ - 别名::class:`fastNLP.modules.StarTransformer` :class:`fastNLP.modules.encoder.StarTransformer` - - Star-Transformer 的encoder部分。 输入3d的文本输入, 返回相同长度的文本编码 paper: https://arxiv.org/abs/1902.09113 - :param int hidden_size: 输入维度的大小。同时也是输出维度的大小。 - :param int num_layers: star-transformer的层数 - :param int num_head: head的数量。 - :param int head_dim: 每个head的维度大小。 - :param float dropout: dropout 概率. Default: 0.1 - :param int max_len: int or None, 如果为int,输入序列的最大长度, - 模型会为输入序列加上position embedding。 - 若为`None`,忽略加上position embedding的步骤. Default: `None` """ def __init__(self, hidden_size, num_layers, num_head, head_dim, dropout=0.1, max_len=None): + """ + + :param int hidden_size: 输入维度的大小。同时也是输出维度的大小。 + :param int num_layers: star-transformer的层数 + :param int num_head: head的数量。 + :param int head_dim: 每个head的维度大小。 + :param float dropout: dropout 概率. Default: 0.1 + :param int max_len: int or None, 如果为int,输入序列的最大长度, + 模型会为输入序列加上position embedding。 + 若为`None`,忽略加上position embedding的步骤. Default: `None` + """ super(StarTransformer, self).__init__() self.iters = num_layers @@ -68,7 +69,7 @@ class StarTransformer(nn.Module): smask = torch.cat([torch.zeros(B, 1, ).byte().to(mask), mask], 1) embs = data.permute(0, 2, 1)[:, :, :, None] # B H L 1 - if self.pos_emb and False: + if self.pos_emb: P = self.pos_emb(torch.arange(L, dtype=torch.long, device=embs.device) \ .view(1, L)).permute(0, 2, 1).contiguous()[:, :, :, None] # 1 H L 1 embs = embs + P diff --git a/fastNLP/modules/encoder/transformer.py b/fastNLP/modules/encoder/transformer.py index bc488e54..323091b0 100644 --- a/fastNLP/modules/encoder/transformer.py +++ b/fastNLP/modules/encoder/transformer.py @@ -1,38 +1,30 @@ +"""undocumented""" + __all__ = [ "TransformerEncoder" ] from torch import nn -from fastNLP.modules.encoder.attention import MultiHeadAttention -from ..dropout import TimestepDropout +from .attention import MultiHeadAttention class TransformerEncoder(nn.Module): """ - 别名::class:`fastNLP.modules.TransformerEncoder` :class:`fastNLP.modules.encoder.TransformerEncoder` - - transformer的encoder模块,不包含embedding层 - :param int num_layers: transformer的层数 - :param int model_size: 输入维度的大小。同时也是输出维度的大小。 - :param int inner_size: FFN层的hidden大小 - :param int key_size: 每个head的维度大小。 - :param int value_size: 每个head中value的维度。 - :param int num_head: head的数量。 - :param float dropout: dropout概率. 
Default: 0.1 """ class SubLayer(nn.Module): def __init__(self, model_size, inner_size, key_size, value_size, num_head, dropout=0.1): super(TransformerEncoder.SubLayer, self).__init__() self.atte = MultiHeadAttention(model_size, key_size, value_size, num_head, dropout) - self.norm1 = nn.LayerNorm(model_size) + self.norm1 = nn.LayerNorm(model_size, eps=1e-6) self.ffn = nn.Sequential(nn.Linear(model_size, inner_size), nn.ReLU(), - nn.Linear(inner_size, model_size), - TimestepDropout(dropout), ) - self.norm2 = nn.LayerNorm(model_size) + nn.Dropout(dropout), + nn.Linear(inner_size, model_size)) + self.norm2 = nn.LayerNorm(model_size, eps=1e-6) + self.dropout = nn.Dropout(dropout) def forward(self, input, seq_mask=None, atte_mask_out=None): """ @@ -41,17 +33,32 @@ class TransformerEncoder(nn.Module): :param seq_mask: [batch, seq_len] :return: [batch, seq_len, model_size] """ + if seq_mask is None: # 防止后续乘法时出错 + seq_mask = 1 + input = self.norm1(input) attention = self.atte(input, input, input, atte_mask_out) - norm_atte = self.norm1(attention + input) + input = input + self.dropout(attention) attention *= seq_mask - output = self.ffn(norm_atte) - output = self.norm2(output + norm_atte) - output *= seq_mask - return output + input = self.norm2(input) + output = self.ffn(input) + input = input + self.dropout(output) + input *= seq_mask + return input def __init__(self, num_layers, **kargs): + """ + + :param int num_layers: transformer的层数 + :param int model_size: 输入维度的大小。同时也是输出维度的大小。 + :param int inner_size: FFN层的hidden大小 + :param int key_size: 每个head的维度大小。 + :param int value_size: 每个head中value的维度。 + :param int num_head: head的数量。 + :param float dropout: dropout概率. Default: 0.1 + """ super(TransformerEncoder, self).__init__() self.layers = nn.ModuleList([self.SubLayer(**kargs) for _ in range(num_layers)]) + self.norm = nn.LayerNorm(kargs['model_size'], eps=1e-6) def forward(self, x, seq_mask=None): """ @@ -64,8 +71,8 @@ class TransformerEncoder(nn.Module): if seq_mask is None: atte_mask_out = None else: - atte_mask_out = (seq_mask < 1)[:, None, :] + atte_mask_out = (seq_mask == 0)[:, None, :] seq_mask = seq_mask[:, :, None] for layer in self.layers: output = layer(output, seq_mask, atte_mask_out) - return output + return self.norm(output) diff --git a/fastNLP/modules/encoder/variational_rnn.py b/fastNLP/modules/encoder/variational_rnn.py index 8e5e804b..b09b3af9 100644 --- a/fastNLP/modules/encoder/variational_rnn.py +++ b/fastNLP/modules/encoder/variational_rnn.py @@ -1,6 +1,8 @@ +"""undocumented +Variational RNN 及相关模型的 fastNLP实现,相关论文参考: +`A Theoretically Grounded Application of Dropout in Recurrent Neural Networks (Yarin Gal and Zoubin Ghahramani, 2016) `_ """ -Variational RNN 的 Pytorch 实现 -""" + __all__ = [ "VarRNN", "VarLSTM", @@ -105,22 +107,25 @@ class VarRNNBase(nn.Module): 论文参考: `A Theoretically Grounded Application of Dropout in Recurrent Neural Networks (Yarin Gal and Zoubin Ghahramani, 2016) https://arxiv.org/abs/1512.05287`. - :param mode: rnn 模式, (lstm or not) - :param Cell: rnn cell 类型, (lstm, gru, etc) - :param input_size: 输入 `x` 的特征维度 - :param hidden_size: 隐状态 `h` 的特征维度 - :param num_layers: rnn的层数. Default: 1 - :param bias: 如果为 ``False``, 模型将不会使用bias. Default: ``True`` - :param batch_first: 若为 ``True``, 输入和输出 ``Tensor`` 形状为 - (batch, seq, feature). Default: ``False`` - :param input_dropout: 对输入的dropout概率. Default: 0 - :param hidden_dropout: 对每个隐状态的dropout概率. Default: 0 - :param bidirectional: 若为 ``True``, 使用双向的RNN. 
Default: ``False`` """ def __init__(self, mode, Cell, input_size, hidden_size, num_layers=1, bias=True, batch_first=False, input_dropout=0, hidden_dropout=0, bidirectional=False): + """ + + :param mode: rnn 模式, (lstm or not) + :param Cell: rnn cell 类型, (lstm, gru, etc) + :param input_size: 输入 `x` 的特征维度 + :param hidden_size: 隐状态 `h` 的特征维度 + :param num_layers: rnn的层数. Default: 1 + :param bias: 如果为 ``False``, 模型将不会使用bias. Default: ``True`` + :param batch_first: 若为 ``True``, 输入和输出 ``Tensor`` 形状为 + (batch, seq, feature). Default: ``False`` + :param input_dropout: 对输入的dropout概率. Default: 0 + :param hidden_dropout: 对每个隐状态的dropout概率. Default: 0 + :param bidirectional: 若为 ``True``, 使用双向的RNN. Default: ``False`` + """ super(VarRNNBase, self).__init__() self.mode = mode self.input_size = input_size @@ -222,22 +227,24 @@ class VarRNNBase(nn.Module): class VarLSTM(VarRNNBase): """ - 别名::class:`fastNLP.modules.VarLSTM` :class:`fastNLP.modules.encoder.VarLSTM` - Variational Dropout LSTM. + 相关论文参考:`A Theoretically Grounded Application of Dropout in Recurrent Neural Networks (Yarin Gal and Zoubin Ghahramani, 2016) `_ - :param input_size: 输入 `x` 的特征维度 - :param hidden_size: 隐状态 `h` 的特征维度 - :param num_layers: rnn的层数. Default: 1 - :param bias: 如果为 ``False``, 模型将不会使用bias. Default: ``True`` - :param batch_first: 若为 ``True``, 输入和输出 ``Tensor`` 形状为 - (batch, seq, feature). Default: ``False`` - :param input_dropout: 对输入的dropout概率. Default: 0 - :param hidden_dropout: 对每个隐状态的dropout概率. Default: 0 - :param bidirectional: 若为 ``True``, 使用双向的LSTM. Default: ``False`` """ def __init__(self, *args, **kwargs): + """ + + :param input_size: 输入 `x` 的特征维度 + :param hidden_size: 隐状态 `h` 的特征维度 + :param num_layers: rnn的层数. Default: 1 + :param bias: 如果为 ``False``, 模型将不会使用bias. Default: ``True`` + :param batch_first: 若为 ``True``, 输入和输出 ``Tensor`` 形状为 + (batch, seq, feature). Default: ``False`` + :param input_dropout: 对输入的dropout概率. Default: 0 + :param hidden_dropout: 对每个隐状态的dropout概率. Default: 0 + :param bidirectional: 若为 ``True``, 使用双向的LSTM. Default: ``False`` + """ super(VarLSTM, self).__init__( mode="LSTM", Cell=nn.LSTMCell, *args, **kwargs) @@ -247,22 +254,24 @@ class VarLSTM(VarRNNBase): class VarRNN(VarRNNBase): """ - 别名::class:`fastNLP.modules.VarRNN` :class:`fastNLP.modules.encoder.VarRNN` - Variational Dropout RNN. - - :param input_size: 输入 `x` 的特征维度 - :param hidden_size: 隐状态 `h` 的特征维度 - :param num_layers: rnn的层数. Default: 1 - :param bias: 如果为 ``False``, 模型将不会使用bias. Default: ``True`` - :param batch_first: 若为 ``True``, 输入和输出 ``Tensor`` 形状为 - (batch, seq, feature). Default: ``False`` - :param input_dropout: 对输入的dropout概率. Default: 0 - :param hidden_dropout: 对每个隐状态的dropout概率. Default: 0 - :param bidirectional: 若为 ``True``, 使用双向的RNN. Default: ``False`` + 相关论文参考:`A Theoretically Grounded Application of Dropout in Recurrent Neural Networks (Yarin Gal and Zoubin Ghahramani, 2016) `_ + """ def __init__(self, *args, **kwargs): + """ + + :param input_size: 输入 `x` 的特征维度 + :param hidden_size: 隐状态 `h` 的特征维度 + :param num_layers: rnn的层数. Default: 1 + :param bias: 如果为 ``False``, 模型将不会使用bias. Default: ``True`` + :param batch_first: 若为 ``True``, 输入和输出 ``Tensor`` 形状为 + (batch, seq, feature). Default: ``False`` + :param input_dropout: 对输入的dropout概率. Default: 0 + :param hidden_dropout: 对每个隐状态的dropout概率. Default: 0 + :param bidirectional: 若为 ``True``, 使用双向的RNN. 
Default: ``False`` + """ super(VarRNN, self).__init__( mode="RNN", Cell=nn.RNNCell, *args, **kwargs) @@ -272,22 +281,24 @@ class VarRNN(VarRNNBase): class VarGRU(VarRNNBase): """ - 别名::class:`fastNLP.modules.VarGRU` :class:`fastNLP.modules.encoder.VarGRU` - Variational Dropout GRU. - - :param input_size: 输入 `x` 的特征维度 - :param hidden_size: 隐状态 `h` 的特征维度 - :param num_layers: rnn的层数. Default: 1 - :param bias: 如果为 ``False``, 模型将不会使用bias. Default: ``True`` - :param batch_first: 若为 ``True``, 输入和输出 ``Tensor`` 形状为 - (batch, seq, feature). Default: ``False`` - :param input_dropout: 对输入的dropout概率. Default: 0 - :param hidden_dropout: 对每个隐状态的dropout概率. Default: 0 - :param bidirectional: 若为 ``True``, 使用双向的GRU. Default: ``False`` + 相关论文参考:`A Theoretically Grounded Application of Dropout in Recurrent Neural Networks (Yarin Gal and Zoubin Ghahramani, 2016) `_ + """ def __init__(self, *args, **kwargs): + """ + + :param input_size: 输入 `x` 的特征维度 + :param hidden_size: 隐状态 `h` 的特征维度 + :param num_layers: rnn的层数. Default: 1 + :param bias: 如果为 ``False``, 模型将不会使用bias. Default: ``True`` + :param batch_first: 若为 ``True``, 输入和输出 ``Tensor`` 形状为 + (batch, seq, feature). Default: ``False`` + :param input_dropout: 对输入的dropout概率. Default: 0 + :param hidden_dropout: 对每个隐状态的dropout概率. Default: 0 + :param bidirectional: 若为 ``True``, 使用双向的GRU. Default: ``False`` + """ super(VarGRU, self).__init__( mode="GRU", Cell=nn.GRUCell, *args, **kwargs) diff --git a/fastNLP/modules/utils.py b/fastNLP/modules/utils.py index dbae9c73..54993479 100644 --- a/fastNLP/modules/utils.py +++ b/fastNLP/modules/utils.py @@ -1,3 +1,14 @@ +""" +.. todo:: + doc +""" + +__all__ = [ + "initial_parameter", + "summary" +] + +import os from functools import reduce import torch @@ -39,7 +50,7 @@ def initial_parameter(net, initial_method=None): init_method = init.uniform_ else: init_method = init.xavier_normal_ - + def weights_init(m): # classname = m.__class__.__name__ if isinstance(m, nn.Conv2d) or isinstance(m, nn.Conv1d) or isinstance(m, nn.Conv3d): # for all the cnn @@ -65,7 +76,7 @@ def initial_parameter(net, initial_method=None): else: init.normal_(w.data) # bias # print("init else") - + net.apply(weights_init) @@ -78,19 +89,22 @@ def summary(model: nn.Module): """ train = [] nontrain = [] - + buffer = [] + def layer_summary(module: nn.Module): def count_size(sizes): - return reduce(lambda x, y: x*y, sizes) - + return reduce(lambda x, y: x * y, sizes) + for p in module.parameters(recurse=False): if p.requires_grad: train.append(count_size(p.shape)) else: nontrain.append(count_size(p.shape)) + for p in module.buffers(): + buffer.append(count_size(p)) for subm in module.children(): layer_summary(subm) - + layer_summary(model) total_train = sum(train) total_nontrain = sum(nontrain) @@ -99,8 +113,9 @@ def summary(model: nn.Module): strings.append('Total params: {:,}'.format(total)) strings.append('Trainable params: {:,}'.format(total_train)) strings.append('Non-trainable params: {:,}'.format(total_nontrain)) + strings.append("Buffer params: {:,}".format(sum(buffer))) max_len = len(max(strings, key=len)) - bar = '-'*(max_len + 3) + bar = '-' * (max_len + 3) strings = [bar] + strings + [bar] print('\n'.join(strings)) return total, total_train, total_nontrain @@ -111,7 +126,7 @@ def get_dropout_mask(drop_p: float, tensor: torch.Tensor): 根据tensor的形状,生成一个mask :param drop_p: float, 以多大的概率置为0。 - :param tensor:torch.Tensor + :param tensor: torch.Tensor :return: torch.FloatTensor. 
与tensor一样的shape """ mask_x = torch.ones_like(tensor) @@ -119,7 +134,6 @@ def get_dropout_mask(drop_p: float, tensor: torch.Tensor): training=False, inplace=True) return mask_x -import glob def _get_file_name_base_on_postfix(dir_path, postfix): """ @@ -128,9 +142,9 @@ def _get_file_name_base_on_postfix(dir_path, postfix): :param postfix: 形如".bin", ".json"等 :return: str,文件的路径 """ - files = glob.glob(os.path.join(dir_path, '*' + postfix)) + files = list(filter(lambda filename: filename.endswith(postfix), os.listdir(os.path.join(dir_path)))) if len(files) == 0: - raise FileNotFoundError(f"There is no file endswith *.{postfix} file in {dir_path}") + raise FileNotFoundError(f"There is no file endswith *{postfix} file in {dir_path}") elif len(files) > 1: - raise FileExistsError(f"There are multiple *.{postfix} files in {dir_path}") - return os.path.join(dir_path, files[0]) \ No newline at end of file + raise FileExistsError(f"There are multiple *{postfix} files in {dir_path}") + return os.path.join(dir_path, files[0]) diff --git a/legacy/api/README.md b/legacy/api/README.md deleted file mode 100644 index 73560f9f..00000000 --- a/legacy/api/README.md +++ /dev/null @@ -1,44 +0,0 @@ -# fastNLP 高级接口 - -### 环境与配置 -1. 系统环境:linux/ubuntu(推荐) -2. 编程语言:Python>=3.6 -3. Python包依赖 - - **torch==1.0** - - numpy>=1.14.2 - -### 中文分词 -```python -text = ['编者按:7月12日,英国航空航天系统公司公布了该公司研制的第一款高科技隐形无人机雷电之神。', - '这款飞行从外型上来看酷似电影中的太空飞行器,据英国方面介绍,可以实现洲际远程打击。', - '那么这款无人机到底有多厉害?'] -from fastNLP.api import CWS -cws = CWS(device='cpu') -print(cws.predict(text)) -# ['编者 按 : 7月 12日 , 英国 航空 航天 系统 公司 公布 了 该 公司 研制 的 第一 款 高 科技 隐形 无人 机雷电 之 神 。', '这 款 飞行 从 外型 上 来 看 酷似 电影 中 的 太空 飞行器 , 据 英国 方面 介绍 , 可以 实现 洲际 远程 打击 。', '那么 这 款 无人 机 到底 有 多 厉害 ?'] -``` - -### 词性标注 -```python -# 输入已分词序列 -text = [['编者', '按:', '7月', '12日', ',', '英国', '航空', '航天', '系统', '公司', '公布', '了', '该', '公司', - '研制', '的', '第一款', '高科技', '隐形', '无人机', '雷电之神', '。'], - ['那么', '这', '款', '无人机', '到底', '有', '多', '厉害', '?']] -from fastNLP.api import POS -pos = POS(device='cpu') -print(pos.predict(text)) -# [['编者/NN', '按:/NN', '7月/NT', '12日/NT', ',/PU', '英国/NR', '航空/NN', '航天/NN', '系统/NN', '公司/NN', '公布/VV', '了/AS', '该/DT', '公司/NN', '研制/VV', '的/DEC', '第一款/NN', '高科技/NN', '隐形/AD', '无人机/VV', '雷电之神/NN', '。/PU'], ['那么/AD', '这/DT', '款/NN', '无人机/VV', '到底/AD', '有/VE', '多/AD', '厉害/VA', '?/PU']] -``` - -### 句法分析 -```python -text = [['编者', '按:', '7月', '12日', ',', '英国', '航空', '航天', '系统', '公司', '公布', '了', '该', '公司', - '研制', '的', '第一款', '高科技', '隐形', '无人机', '雷电之神', '。'], - ['那么', '这', '款', '无人机', '到底', '有', '多', '厉害', '?']] -from fastNLP.api import Parser -parser = Parser(device='cpu') -print(parser.predict(text)) -# [['2/nn', '4/nn', '4/nn', '20/tmod', '11/punct', '10/nn', '10/nn', '10/nn', '10/nn', '11/nsubj', '20/dep', '11/asp', '14/det', '15/nsubj', '18/rcmod', '15/cpm', '18/nn', '11/dobj', '20/advmod', '0/root', '20/dobj', '20/punct'], ['4/advmod', '3/det', '8/xsubj', '8/dep', '8/advmod', '8/dep', '8/advmod', '0/root', '8/punct']] -``` - -完整样例见`examples.py` \ No newline at end of file diff --git a/legacy/api/__init__.py b/legacy/api/__init__.py deleted file mode 100644 index 5171d8c2..00000000 --- a/legacy/api/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -__all__ = ["CWS", "POS", "Parser"] -from .api import CWS, POS, Parser diff --git a/legacy/api/api.py b/legacy/api/api.py deleted file mode 100644 index 1408731f..00000000 --- a/legacy/api/api.py +++ /dev/null @@ -1,463 +0,0 @@ -import warnings - -import torch - -warnings.filterwarnings('ignore') -import os - -from fastNLP.core.dataset import DataSet 
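The hunks from here on delete the legacy `fastNLP.api` package (`CWS`, `POS`, `Parser` and their processors). For reference, the sketch below reconstructs the load-and-predict pattern those deleted classes follow (`API.load` reads a serialized dict holding a `pipeline`; `POS.predict` wraps tokenized input in a `DataSet`, runs the pipeline, and reads the `tag` field). The helper names are illustrative only and are not part of the current fastNLP API.

```python
# Illustrative sketch of the legacy load/predict flow removed in this hunk;
# `load_legacy_pipeline` and `tag_tokens` are hypothetical helper names.
import torch
from fastNLP.core.dataset import DataSet

def load_legacy_pipeline(path):
    state = torch.load(path, map_location='cpu')   # checkpoint dict, as in API.load
    return state['pipeline']                       # a Pipeline of processors

def tag_tokens(pipeline, token_lists):
    dataset = DataSet()
    dataset.add_field('words', token_lists)        # one token list per sentence
    pipeline(dataset)                              # processors add new fields in place
    return dataset.field_arrays['tag'].content     # as read back in POS.predict
```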
-from .utils import load_url -from .processor import ModelProcessor -from fastNLP.io.dataset_loader import _cut_long_sentence -from fastNLP.io.data_loader import ConllLoader -from fastNLP.core.instance import Instance -from ..api.pipeline import Pipeline -from fastNLP.core.metrics import SpanFPreRecMetric -from .processor import IndexerProcessor - -# TODO add pretrain urls -model_urls = { - "cws": "http://123.206.98.91:8888/download/cws_lstm_ctb9_1_20-09908656.pkl", - "pos": "http://123.206.98.91:8888/download/pos_tag_model_20190119-43f8b435.pkl", - "parser": "http://123.206.98.91:8888/download/parser_20190204-c72ca5c0.pkl" -} - - -class ConllCWSReader(object): - """Deprecated. Use ConllLoader for all types of conll-format files.""" - - def __init__(self): - pass - - def load(self, path, cut_long_sent=False): - """ - 返回的DataSet只包含raw_sentence这个field,内容为str。 - 假定了输入为conll的格式,以空行隔开两个句子,每行共7列,即 - :: - - 1 编者按 编者按 NN O 11 nmod:topic - 2 : : PU O 11 punct - 3 7月 7月 NT DATE 4 compound:nn - 4 12日 12日 NT DATE 11 nmod:tmod - 5 , , PU O 11 punct - - 1 这 这 DT O 3 det - 2 款 款 M O 1 mark:clf - 3 飞行 飞行 NN O 8 nsubj - 4 从 从 P O 5 case - 5 外型 外型 NN O 8 nmod:prep - - """ - datalist = [] - with open(path, 'r', encoding='utf-8') as f: - sample = [] - for line in f: - if line.startswith('\n'): - datalist.append(sample) - sample = [] - elif line.startswith('#'): - continue - else: - sample.append(line.strip().split()) - if len(sample) > 0: - datalist.append(sample) - - ds = DataSet() - for sample in datalist: - # print(sample) - res = self.get_char_lst(sample) - if res is None: - continue - line = ' '.join(res) - if cut_long_sent: - sents = _cut_long_sentence(line) - else: - sents = [line] - for raw_sentence in sents: - ds.append(Instance(raw_sentence=raw_sentence)) - return ds - - def get_char_lst(self, sample): - if len(sample) == 0: - return None - text = [] - for w in sample: - t1, t2, t3, t4 = w[1], w[3], w[6], w[7] - if t3 == '_': - return None - text.append(t1) - return text - - -class ConllxDataLoader(ConllLoader): - """返回“词级别”的标签信息,包括词、词性、(句法)头依赖、(句法)边标签。跟``ZhConllPOSReader``完全不同。 - - Deprecated. Use ConllLoader for all types of conll-format files. - """ - - def __init__(self): - headers = [ - 'words', 'pos_tags', 'heads', 'labels', - ] - indexs = [ - 1, 3, 6, 7, - ] - super(ConllxDataLoader, self).__init__(headers=headers, indexes=indexs) - - -class API: - def __init__(self): - self.pipeline = None - self._dict = None - - def predict(self, *args, **kwargs): - """Do prediction for the given input. - """ - raise NotImplementedError - - def test(self, file_path): - """Test performance over the given data set. - - :param str file_path: - :return: a dictionary of metric values - """ - raise NotImplementedError - - def load(self, path, device): - if os.path.exists(os.path.expanduser(path)): - _dict = torch.load(path, map_location='cpu') - else: - _dict = load_url(path, map_location='cpu') - self._dict = _dict - self.pipeline = _dict['pipeline'] - for processor in self.pipeline.pipeline: - if isinstance(processor, ModelProcessor): - processor.set_model_device(device) - - -class POS(API): - """FastNLP API for Part-Of-Speech tagging. - - :param str model_path: the path to the model. - :param str device: device name such as "cpu" or "cuda:0". Use the same notation as PyTorch. 
- - """ - - def __init__(self, model_path=None, device='cpu'): - super(POS, self).__init__() - if model_path is None: - model_path = model_urls['pos'] - - self.load(model_path, device) - - def predict(self, content): - """predict函数的介绍, - 函数介绍的第二句,这句话不会换行 - - :param content: list of list of str. Each string is a token(word). - :return answer: list of list of str. Each string is a tag. - """ - if not hasattr(self, "pipeline"): - raise ValueError("You have to load model first.") - - sentence_list = content - # 1. 检查sentence的类型 - for sentence in sentence_list: - if not all((type(obj) == str for obj in sentence)): - raise ValueError("Input must be list of list of string.") - - # 2. 组建dataset - dataset = DataSet() - dataset.add_field("words", sentence_list) - - # 3. 使用pipeline - self.pipeline(dataset) - - def merge_tag(words_list, tags_list): - rtn = [] - for words, tags in zip(words_list, tags_list): - rtn.append([w + "/" + t for w, t in zip(words, tags)]) - return rtn - - output = dataset.field_arrays["tag"].content - if isinstance(content, str): - return output[0] - elif isinstance(content, list): - return merge_tag(content, output) - - def test(self, file_path): - test_data = ConllxDataLoader().load(file_path) - - save_dict = self._dict - tag_vocab = save_dict["tag_vocab"] - pipeline = save_dict["pipeline"] - index_tag = IndexerProcessor(vocab=tag_vocab, field_name="tag", new_added_field_name="truth", is_input=False) - pipeline.pipeline = [index_tag] + pipeline.pipeline - - test_data.rename_field("pos_tags", "tag") - pipeline(test_data) - test_data.set_target("truth") - prediction = test_data.field_arrays["predict"].content - truth = test_data.field_arrays["truth"].content - seq_len = test_data.field_arrays["word_seq_origin_len"].content - - # padding by hand - max_length = max([len(seq) for seq in prediction]) - for idx in range(len(prediction)): - prediction[idx] = list(prediction[idx]) + ([0] * (max_length - len(prediction[idx]))) - truth[idx] = list(truth[idx]) + ([0] * (max_length - len(truth[idx]))) - evaluator = SpanFPreRecMetric(tag_vocab=tag_vocab, pred="predict", target="truth", - seq_len="word_seq_origin_len") - evaluator({"predict": torch.Tensor(prediction), "word_seq_origin_len": torch.Tensor(seq_len)}, - {"truth": torch.Tensor(truth)}) - test_result = evaluator.get_metric() - f1 = round(test_result['f'] * 100, 2) - pre = round(test_result['pre'] * 100, 2) - rec = round(test_result['rec'] * 100, 2) - - return {"F1": f1, "precision": pre, "recall": rec} - - -class CWS(API): - """ - 中文分词高级接口。 - - :param model_path: 当model_path为None,使用默认位置的model。如果默认位置不存在,则自动下载模型 - :param device: str,可以为'cpu', 'cuda'或'cuda:0'等。会将模型load到相应device进行推断。 - """ - - def __init__(self, model_path=None, device='cpu'): - - super(CWS, self).__init__() - if model_path is None: - model_path = model_urls['cws'] - - self.load(model_path, device) - - def predict(self, content): - """ - 分词接口。 - - :param content: str或List[str], 例如: "中文分词很重要!", 返回的结果是"中文 分词 很 重要 !"。 如果传入的为List[str],比如 - [ "中文分词很重要!", ...], 返回的结果["中文 分词 很 重要 !", ...]。 - :return: str或List[str], 根据输入的的类型决定。 - """ - if not hasattr(self, 'pipeline'): - raise ValueError("You have to load model first.") - - sentence_list = [] - # 1. 检查sentence的类型 - if isinstance(content, str): - sentence_list.append(content) - elif isinstance(content, list): - sentence_list = content - - # 2. 组建dataset - dataset = DataSet() - dataset.add_field('raw_sentence', sentence_list) - - # 3. 
使用pipeline - self.pipeline(dataset) - - output = dataset.get_field('output').content - if isinstance(content, str): - return output[0] - elif isinstance(content, list): - return output - - def test(self, filepath): - """ - 传入一个分词文件路径,返回该数据集上分词f1, precision, recall。 - 分词文件应该为:: - - 1 编者按 编者按 NN O 11 nmod:topic - 2 : : PU O 11 punct - 3 7月 7月 NT DATE 4 compound:nn - 4 12日 12日 NT DATE 11 nmod:tmod - 5 , , PU O 11 punct - - 1 这 这 DT O 3 det - 2 款 款 M O 1 mark:clf - 3 飞行 飞行 NN O 8 nsubj - 4 从 从 P O 5 case - 5 外型 外型 NN O 8 nmod:prep - - 以空行分割两个句子,有内容的每行有7列。 - - :param filepath: str, 文件路径路径。 - :return: float, float, float. 分别f1, precision, recall. - """ - tag_proc = self._dict['tag_proc'] - cws_model = self.pipeline.pipeline[-2].model - pipeline = self.pipeline.pipeline[:-2] - - pipeline.insert(1, tag_proc) - pp = Pipeline(pipeline) - - reader = ConllCWSReader() - - # te_filename = '/home/hyan/ctb3/test.conllx' - te_dataset = reader.load(filepath) - pp(te_dataset) - - from ..core.tester import Tester - from ..core.metrics import SpanFPreRecMetric - - tester = Tester(data=te_dataset, model=cws_model, metrics=SpanFPreRecMetric(tag_proc.get_vocab()), batch_size=64, - verbose=0) - eval_res = tester.test() - - f1 = eval_res['SpanFPreRecMetric']['f'] - pre = eval_res['SpanFPreRecMetric']['pre'] - rec = eval_res['SpanFPreRecMetric']['rec'] - # print("f1:{:.2f}, pre:{:.2f}, rec:{:.2f}".format(f1, pre, rec)) - - return {"F1": f1, "precision": pre, "recall": rec} - - -class Parser(API): - def __init__(self, model_path=None, device='cpu'): - super(Parser, self).__init__() - if model_path is None: - model_path = model_urls['parser'] - - self.pos_tagger = POS(device=device) - self.load(model_path, device) - - def predict(self, content): - if not hasattr(self, 'pipeline'): - raise ValueError("You have to load model first.") - - # 1. 利用POS得到分词和pos tagging结果 - pos_out = self.pos_tagger.predict(content) - # pos_out = ['这里/NN 是/VB 分词/NN 结果/NN'.split()] - - # 2. 组建dataset - dataset = DataSet() - dataset.add_field('wp', pos_out) - dataset.apply(lambda x: [''] + [w.split('/')[0] for w in x['wp']], new_field_name='words') - dataset.apply(lambda x: [''] + [w.split('/')[1] for w in x['wp']], new_field_name='pos') - dataset.rename_field("words", "raw_words") - - # 3. 
使用pipeline - self.pipeline(dataset) - dataset.apply(lambda x: [str(arc) for arc in x['arc_pred']], new_field_name='arc_pred') - dataset.apply(lambda x: [arc + '/' + label for arc, label in - zip(x['arc_pred'], x['label_pred_seq'])][1:], new_field_name='output') - # output like: [['2/top', '0/root', '4/nn', '2/dep']] - return dataset.field_arrays['output'].content - - def load_test_file(self, path): - def get_one(sample): - sample = list(map(list, zip(*sample))) - if len(sample) == 0: - return None - for w in sample[7]: - if w == '_': - print('Error Sample {}'.format(sample)) - return None - # return word_seq, pos_seq, head_seq, head_tag_seq - return sample[1], sample[3], list(map(int, sample[6])), sample[7] - - datalist = [] - with open(path, 'r', encoding='utf-8') as f: - sample = [] - for line in f: - if line.startswith('\n'): - datalist.append(sample) - sample = [] - elif line.startswith('#'): - continue - else: - sample.append(line.split('\t')) - if len(sample) > 0: - datalist.append(sample) - - data = [get_one(sample) for sample in datalist] - data_list = list(filter(lambda x: x is not None, data)) - return data_list - - def test(self, filepath): - data = self.load_test_file(filepath) - - def convert(data): - BOS = '' - dataset = DataSet() - for sample in data: - word_seq = [BOS] + sample[0] - pos_seq = [BOS] + sample[1] - heads = [0] + sample[2] - head_tags = [BOS] + sample[3] - dataset.append(Instance(raw_words=word_seq, - pos=pos_seq, - gold_heads=heads, - arc_true=heads, - tags=head_tags)) - return dataset - - ds = convert(data) - pp = self.pipeline - for p in pp: - if p.field_name == 'word_list': - p.field_name = 'gold_words' - elif p.field_name == 'pos_list': - p.field_name = 'gold_pos' - # ds.rename_field("words", "raw_words") - # ds.rename_field("tag", "pos") - pp(ds) - head_cor, label_cor, total = 0, 0, 0 - for ins in ds: - head_gold = ins['gold_heads'] - head_pred = ins['arc_pred'] - length = len(head_gold) - total += length - for i in range(length): - head_cor += 1 if head_pred[i] == head_gold[i] else 0 - uas = head_cor / total - # print('uas:{:.2f}'.format(uas)) - - for p in pp: - if p.field_name == 'gold_words': - p.field_name = 'word_list' - elif p.field_name == 'gold_pos': - p.field_name = 'pos_list' - - return {"USA": round(uas, 5)} - - -class Analyzer: - def __init__(self, device='cpu'): - - self.cws = CWS(device=device) - self.pos = POS(device=device) - self.parser = Parser(device=device) - - def predict(self, content, seg=False, pos=False, parser=False): - if seg is False and pos is False and parser is False: - seg = True - output_dict = {} - if seg: - seg_output = self.cws.predict(content) - output_dict['seg'] = seg_output - if pos: - pos_output = self.pos.predict(content) - output_dict['pos'] = pos_output - if parser: - parser_output = self.parser.predict(content) - output_dict['parser'] = parser_output - - return output_dict - - def test(self, filepath): - output_dict = {} - if self.cws: - seg_output = self.cws.test(filepath) - output_dict['seg'] = seg_output - if self.pos: - pos_output = self.pos.test(filepath) - output_dict['pos'] = pos_output - if self.parser: - parser_output = self.parser.test(filepath) - output_dict['parser'] = parser_output - - return output_dict diff --git a/legacy/api/converter.py b/legacy/api/converter.py deleted file mode 100644 index 4e03e465..00000000 --- a/legacy/api/converter.py +++ /dev/null @@ -1,181 +0,0 @@ -import re - - -class SpanConverter: - def __init__(self, replace_tag, pattern): - super(SpanConverter, self).__init__() - - 
self.replace_tag = replace_tag - self.pattern = pattern - - def find_certain_span_and_replace(self, sentence): - replaced_sentence = '' - prev_end = 0 - for match in re.finditer(self.pattern, sentence): - start, end = match.span() - span = sentence[start:end] - replaced_sentence += sentence[prev_end:start] + self.span_to_special_tag(span) - prev_end = end - replaced_sentence += sentence[prev_end:] - - return replaced_sentence - - def span_to_special_tag(self, span): - - return self.replace_tag - - def find_certain_span(self, sentence): - spans = [] - for match in re.finditer(self.pattern, sentence): - spans.append(match.span()) - return spans - - -class AlphaSpanConverter(SpanConverter): - def __init__(self): - replace_tag = '' - # 理想状态下仅处理纯为字母的情况, 但不处理<[a-zA-Z]+>(因为这应该是特殊的tag). - pattern = '[a-zA-Z]+(?=[\u4e00-\u9fff ,%.!<\\-"])' - - super(AlphaSpanConverter, self).__init__(replace_tag, pattern) - - -class DigitSpanConverter(SpanConverter): - def __init__(self): - replace_tag = '' - pattern = '\d[\d\\.]*(?=[\u4e00-\u9fff ,%.!<-])' - - super(DigitSpanConverter, self).__init__(replace_tag, pattern) - - def span_to_special_tag(self, span): - # return self.special_tag - if span[0] == '0' and len(span) > 2: - return '' - decimal_point_count = 0 # one might have more than one decimal pointers - for idx, char in enumerate(span): - if char == '.' or char == '﹒' or char == '·': - decimal_point_count += 1 - if span[-1] == '.' or span[-1] == '﹒' or span[-1] == '·': - # last digit being decimal point means this is not a number - if decimal_point_count == 1: - return span - else: - return '' - if decimal_point_count == 1: - return '' - elif decimal_point_count > 1: - return '' - else: - return '' - - -class TimeConverter(SpanConverter): - def __init__(self): - replace_tag = '' - pattern = '\d+[::∶][\d::∶]+(?=[\u4e00-\u9fff ,%.!<-])' - - super().__init__(replace_tag, pattern) - - -class MixNumAlphaConverter(SpanConverter): - def __init__(self): - replace_tag = '' - pattern = None - - super().__init__(replace_tag, pattern) - - def find_certain_span_and_replace(self, sentence): - replaced_sentence = '' - start = 0 - matching_flag = False - number_flag = False - alpha_flag = False - link_flag = False - slash_flag = False - bracket_flag = False - for idx in range(len(sentence)): - if re.match('[0-9a-zA-Z/\\(\\)\'′&\\-]', sentence[idx]): - if not matching_flag: - replaced_sentence += sentence[start:idx] - start = idx - if re.match('[0-9]', sentence[idx]): - number_flag = True - elif re.match('[\'′&\\-]', sentence[idx]): - link_flag = True - elif re.match('/', sentence[idx]): - slash_flag = True - elif re.match('[\\(\\)]', sentence[idx]): - bracket_flag = True - else: - alpha_flag = True - matching_flag = True - elif re.match('[\\.]', sentence[idx]): - pass - else: - if matching_flag: - if (number_flag and alpha_flag) or (link_flag and alpha_flag) \ - or (slash_flag and alpha_flag) or (link_flag and number_flag) \ - or (number_flag and bracket_flag) or (bracket_flag and alpha_flag): - span = sentence[start:idx] - start = idx - replaced_sentence += self.span_to_special_tag(span) - matching_flag = False - number_flag = False - alpha_flag = False - link_flag = False - slash_flag = False - bracket_flag = False - - replaced_sentence += sentence[start:] - return replaced_sentence - - def find_certain_span(self, sentence): - spans = [] - start = 0 - matching_flag = False - number_flag = False - alpha_flag = False - link_flag = False - slash_flag = False - bracket_flag = False - for idx in range(len(sentence)): - if 
re.match('[0-9a-zA-Z/\\(\\)\'′&\\-]', sentence[idx]): - if not matching_flag: - start = idx - if re.match('[0-9]', sentence[idx]): - number_flag = True - elif re.match('[\'′&\\-]', sentence[idx]): - link_flag = True - elif re.match('/', sentence[idx]): - slash_flag = True - elif re.match('[\\(\\)]', sentence[idx]): - bracket_flag = True - else: - alpha_flag = True - matching_flag = True - elif re.match('[\\.]', sentence[idx]): - pass - else: - if matching_flag: - if (number_flag and alpha_flag) or (link_flag and alpha_flag) \ - or (slash_flag and alpha_flag) or (link_flag and number_flag) \ - or (number_flag and bracket_flag) or (bracket_flag and alpha_flag): - spans.append((start, idx)) - start = idx - - matching_flag = False - number_flag = False - alpha_flag = False - link_flag = False - slash_flag = False - bracket_flag = False - - return spans - - -class EmailConverter(SpanConverter): - def __init__(self): - replaced_tag = "" - pattern = '[0-9a-zA-Z]+[@][.﹒0-9a-zA-Z@]+(?=[\u4e00-\u9fff ,%.!<\\-"$])' - - super(EmailConverter, self).__init__(replaced_tag, pattern) diff --git a/legacy/api/examples.py b/legacy/api/examples.py deleted file mode 100644 index c1b2e155..00000000 --- a/legacy/api/examples.py +++ /dev/null @@ -1,56 +0,0 @@ -""" -api/example.py contains all API examples provided by fastNLP. -It is used as a tutorial for API or a test script since it is difficult to test APIs in travis. - -""" -from . import CWS, POS, Parser - -text = ['编者按:7月12日,英国航空航天系统公司公布了该公司研制的第一款高科技隐形无人机雷电之神。', - '这款飞行从外型上来看酷似电影中的太空飞行器,据英国方面介绍,可以实现洲际远程打击。', - '那么这款无人机到底有多厉害?'] - - -def chinese_word_segmentation(): - cws = CWS(device='cpu') - print(cws.predict(text)) - - -def chinese_word_segmentation_test(): - cws = CWS(device='cpu') - print(cws.test("../../test/data_for_tests/zh_sample.conllx")) - - -def pos_tagging(): - # 输入已分词序列 - text = [['编者', '按:', '7月', '12日', ',', '英国', '航空', '航天', '系统', '公司', '公布', '了', '该', '公司', - '研制', '的', '第一款', '高科技', '隐形', '无人机', '雷电之神', '。'], - ['那么', '这', '款', '无人机', '到底', '有', '多', '厉害', '?']] - pos = POS(device='cpu') - print(pos.predict(text)) - - -def pos_tagging_test(): - pos = POS(device='cpu') - print(pos.test("../../test/data_for_tests/zh_sample.conllx")) - - -def syntactic_parsing(): - text = [['编者', '按:', '7月', '12日', ',', '英国', '航空', '航天', '系统', '公司', '公布', '了', '该', '公司', - '研制', '的', '第一款', '高科技', '隐形', '无人机', '雷电之神', '。'], - ['那么', '这', '款', '无人机', '到底', '有', '多', '厉害', '?']] - parser = Parser(device='cpu') - print(parser.predict(text)) - - -def syntactic_parsing_test(): - parser = Parser(device='cpu') - print(parser.test("../../test/data_for_tests/zh_sample.conllx")) - - -if __name__ == "__main__": - # chinese_word_segmentation() - # chinese_word_segmentation_test() - # pos_tagging() - # pos_tagging_test() - syntactic_parsing() - # syntactic_parsing_test() diff --git a/legacy/api/pipeline.py b/legacy/api/pipeline.py deleted file mode 100644 index 2cec16b3..00000000 --- a/legacy/api/pipeline.py +++ /dev/null @@ -1,33 +0,0 @@ -from ..api.processor import Processor - - -class Pipeline: - """ - Pipeline takes a DataSet object as input, runs multiple processors sequentially, and - outputs a DataSet object. 
- """ - - def __init__(self, processors=None): - self.pipeline = [] - if isinstance(processors, list): - for proc in processors: - assert isinstance(proc, Processor), "Must be a Processor, not {}.".format(type(proc)) - self.pipeline = processors - - def add_processor(self, processor): - assert isinstance(processor, Processor), "Must be a Processor, not {}.".format(type(processor)) - self.pipeline.append(processor) - - def process(self, dataset): - assert len(self.pipeline) != 0, "You need to add some processor first." - - for proc in self.pipeline: - dataset = proc(dataset) - - return dataset - - def __call__(self, *args, **kwargs): - return self.process(*args, **kwargs) - - def __getitem__(self, item): - return self.pipeline[item] diff --git a/legacy/api/processor.py b/legacy/api/processor.py deleted file mode 100644 index 4c442ed2..00000000 --- a/legacy/api/processor.py +++ /dev/null @@ -1,428 +0,0 @@ -import re -from collections import defaultdict - -import torch - -from fastNLP.core.batch import Batch -from fastNLP.core.dataset import DataSet -from fastNLP.core.sampler import SequentialSampler -from fastNLP.core.vocabulary import Vocabulary - - -class Processor(object): - def __init__(self, field_name, new_added_field_name): - """ - - :param field_name: 处理哪个field - :param new_added_field_name: 如果为None,则认为是field_name,即覆盖原有的field - """ - self.field_name = field_name - if new_added_field_name is None: - self.new_added_field_name = field_name - else: - self.new_added_field_name = new_added_field_name - - def process(self, *args, **kwargs): - raise NotImplementedError - - def __call__(self, *args, **kwargs): - return self.process(*args, **kwargs) - - -class FullSpaceToHalfSpaceProcessor(Processor): - """全角转半角,以字符为处理单元 - - """ - - def __init__(self, field_name, change_alpha=True, change_digit=True, change_punctuation=True, - change_space=True): - super(FullSpaceToHalfSpaceProcessor, self).__init__(field_name, None) - - self.change_alpha = change_alpha - self.change_digit = change_digit - self.change_punctuation = change_punctuation - self.change_space = change_space - - FH_SPACE = [(u" ", u" ")] - FH_NUM = [ - (u"0", u"0"), (u"1", u"1"), (u"2", u"2"), (u"3", u"3"), (u"4", u"4"), - (u"5", u"5"), (u"6", u"6"), (u"7", u"7"), (u"8", u"8"), (u"9", u"9")] - FH_ALPHA = [ - (u"a", u"a"), (u"b", u"b"), (u"c", u"c"), (u"d", u"d"), (u"e", u"e"), - (u"f", u"f"), (u"g", u"g"), (u"h", u"h"), (u"i", u"i"), (u"j", u"j"), - (u"k", u"k"), (u"l", u"l"), (u"m", u"m"), (u"n", u"n"), (u"o", u"o"), - (u"p", u"p"), (u"q", u"q"), (u"r", u"r"), (u"s", u"s"), (u"t", u"t"), - (u"u", u"u"), (u"v", u"v"), (u"w", u"w"), (u"x", u"x"), (u"y", u"y"), - (u"z", u"z"), - (u"A", u"A"), (u"B", u"B"), (u"C", u"C"), (u"D", u"D"), (u"E", u"E"), - (u"F", u"F"), (u"G", u"G"), (u"H", u"H"), (u"I", u"I"), (u"J", u"J"), - (u"K", u"K"), (u"L", u"L"), (u"M", u"M"), (u"N", u"N"), (u"O", u"O"), - (u"P", u"P"), (u"Q", u"Q"), (u"R", u"R"), (u"S", u"S"), (u"T", u"T"), - (u"U", u"U"), (u"V", u"V"), (u"W", u"W"), (u"X", u"X"), (u"Y", u"Y"), - (u"Z", u"Z")] - # 谨慎使用标点符号转换, 因为"5.12特大地震"转换后可能就成了"5.12特大地震" - FH_PUNCTUATION = [ - (u'%', u'%'), (u'!', u'!'), (u'"', u'\"'), (u''', u'\''), (u'#', u'#'), - (u'¥', u'$'), (u'&', u'&'), (u'(', u'('), (u')', u')'), (u'*', u'*'), - (u'+', u'+'), (u',', u','), (u'-', u'-'), (u'.', u'.'), (u'/', u'/'), - (u':', u':'), (u';', u';'), (u'<', u'<'), (u'=', u'='), (u'>', u'>'), - (u'?', u'?'), (u'@', u'@'), (u'[', u'['), (u']', u']'), (u'\', u'\\'), - (u'^', u'^'), (u'_', u'_'), (u'`', u'`'), (u'~', u'~'), (u'{', 
u'{'), - (u'}', u'}'), (u'|', u'|')] - FHs = [] - if self.change_alpha: - FHs = FH_ALPHA - if self.change_digit: - FHs += FH_NUM - if self.change_punctuation: - FHs += FH_PUNCTUATION - if self.change_space: - FHs += FH_SPACE - self.convert_map = {k: v for k, v in FHs} - - def process(self, dataset): - assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset)) - - def inner_proc(ins): - sentence = ins[self.field_name] - new_sentence = [""] * len(sentence) - for idx, char in enumerate(sentence): - if char in self.convert_map: - char = self.convert_map[char] - new_sentence[idx] = char - return "".join(new_sentence) - - dataset.apply(inner_proc, new_field_name=self.field_name) - return dataset - - -class PreAppendProcessor(Processor): - """ - 向某个field的起始增加data(应该为str类型)。该field需要为list类型。即新增的field为 - [data] + instance[field_name] - - """ - - def __init__(self, data, field_name, new_added_field_name=None): - super(PreAppendProcessor, self).__init__(field_name, new_added_field_name) - self.data = data - - def process(self, dataset): - dataset.apply(lambda ins: [self.data] + ins[self.field_name], new_field_name=self.new_added_field_name) - return dataset - - -class SliceProcessor(Processor): - """ - 从某个field中只取部分内容。等价于instance[field_name][start:end:step] - - """ - - def __init__(self, start, end, step, field_name, new_added_field_name=None): - super(SliceProcessor, self).__init__(field_name, new_added_field_name) - for o in (start, end, step): - assert isinstance(o, int) or o is None - self.slice = slice(start, end, step) - - def process(self, dataset): - dataset.apply(lambda ins: ins[self.field_name][self.slice], new_field_name=self.new_added_field_name) - return dataset - - -class Num2TagProcessor(Processor): - """ - 将一句话中的数字转换为某个tag。 - - """ - - def __init__(self, tag, field_name, new_added_field_name=None): - """ - - :param tag: str, 将数字转换为该tag - :param field_name: - :param new_added_field_name: - """ - super(Num2TagProcessor, self).__init__(field_name, new_added_field_name) - self.tag = tag - self.pattern = r'[-+]?([0-9]+[.]?[0-9]*)+[/eE]?[-+]?([0-9]+[.]?[0-9]*)' - - def process(self, dataset): - - def inner_proc(ins): - s = ins[self.field_name] - new_s = [None] * len(s) - for i, w in enumerate(s): - if re.search(self.pattern, w) is not None: - w = self.tag - new_s[i] = w - return new_s - - dataset.apply(inner_proc, new_field_name=self.new_added_field_name) - return dataset - - -class IndexerProcessor(Processor): - """ - 给定一个vocabulary , 将指定field转换为index形式。指定field应该是一维的list,比如 - ['我', '是', xxx] - """ - - def __init__(self, vocab, field_name, new_added_field_name, delete_old_field=False, is_input=True): - - assert isinstance(vocab, Vocabulary), "Only Vocabulary class is allowed, not {}.".format(type(vocab)) - - super(IndexerProcessor, self).__init__(field_name, new_added_field_name) - self.vocab = vocab - self.delete_old_field = delete_old_field - self.is_input = is_input - - def set_vocab(self, vocab): - assert isinstance(vocab, Vocabulary), "Only Vocabulary class is allowed, not {}.".format(type(vocab)) - - self.vocab = vocab - - def process(self, dataset): - assert isinstance(dataset, DataSet), "Only DataSet class is allowed, not {}.".format(type(dataset)) - dataset.apply(lambda ins: [self.vocab.to_index(token) for token in ins[self.field_name]], - new_field_name=self.new_added_field_name) - if self.is_input: - dataset.set_input(self.new_added_field_name) - - if self.delete_old_field: - dataset.delete_field(self.field_name) - - return dataset - - 
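Every processor deleted in this file follows the same contract: `__init__(field_name, new_added_field_name)` plus a `process(dataset)` that writes a new field and returns the dataset. A minimal sketch of that contract, using a hypothetical `LowerCaseProcessor` that is not part of fastNLP:

```python
# Hypothetical processor following the deleted Processor contract above.
class LowerCaseProcessor(Processor):
    def __init__(self, field_name, new_added_field_name=None):
        super().__init__(field_name, new_added_field_name)

    def process(self, dataset):
        # lower-case every token of `field_name`, writing the result to the target field
        dataset.apply(lambda ins: [w.lower() for w in ins[self.field_name]],
                      new_field_name=self.new_added_field_name)
        return dataset
```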
-class VocabProcessor(Processor): - """ - 传入若干个DataSet以建立vocabulary。 - - """ - - def __init__(self, field_name, min_freq=1, max_size=None): - super(VocabProcessor, self).__init__(field_name, None) - self.vocab = Vocabulary(min_freq=min_freq, max_size=max_size) - - def process(self, *datasets): - for dataset in datasets: - assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset)) - dataset.apply(lambda ins: self.vocab.update(ins[self.field_name])) - - def get_vocab(self): - self.vocab.build_vocab() - return self.vocab - - -class SeqLenProcessor(Processor): - """ - 根据某个field新增一个sequence length的field。取该field的第一维 - - """ - - def __init__(self, field_name, new_added_field_name='seq_lens', is_input=True): - super(SeqLenProcessor, self).__init__(field_name, new_added_field_name) - self.is_input = is_input - - def process(self, dataset): - assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset)) - dataset.apply(lambda ins: len(ins[self.field_name]), new_field_name=self.new_added_field_name) - if self.is_input: - dataset.set_input(self.new_added_field_name) - return dataset - - -from fastNLP.core.utils import _build_args - - -class ModelProcessor(Processor): - def __init__(self, model, seq_len_field_name='seq_lens', batch_size=32): - """ - 传入一个model,在process()时传入一个dataset,该processor会通过Batch将DataSet的内容输出给model.predict或者model.forward. - model输出的内容会被增加到dataset中,field_name由model输出决定。如果生成的内容维度不是(Batch_size, )与 - (Batch_size, 1),则使用seqence length这个field进行unpad - TODO 这个类需要删除对seq_lens的依赖。 - - :param seq_len_field_name: - :param batch_size: - """ - super(ModelProcessor, self).__init__(None, None) - self.batch_size = batch_size - self.seq_len_field_name = seq_len_field_name - self.model = model - - def process(self, dataset): - self.model.eval() - assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset)) - data_iterator = Batch(dataset, batch_size=self.batch_size, sampler=SequentialSampler()) - - batch_output = defaultdict(list) - predict_func = self.model.forward - with torch.no_grad(): - for batch_x, _ in data_iterator: - refined_batch_x = _build_args(predict_func, **batch_x) - prediction = predict_func(**refined_batch_x) - seq_lens = batch_x[self.seq_len_field_name].tolist() - - for key, value in prediction.items(): - tmp_batch = [] - value = value.cpu().numpy() - if len(value.shape) == 1 or (len(value.shape) == 2 and value.shape[1] == 1): - batch_output[key].extend(value.tolist()) - else: - for idx, seq_len in enumerate(seq_lens): - tmp_batch.append(value[idx, :seq_len]) - batch_output[key].extend(tmp_batch) - if not self.seq_len_field_name in prediction: - batch_output[self.seq_len_field_name].extend(seq_lens) - - # TODO 当前的实现会导致之后的processor需要知道model输出的output的key是什么 - for field_name, fields in batch_output.items(): - dataset.add_field(field_name, fields, is_input=True, is_target=False) - - return dataset - - def set_model(self, model): - self.model = model - - def set_model_device(self, device): - device = torch.device(device) - self.model.to(device) - - -class Index2WordProcessor(Processor): - """ - 将DataSet中某个为index的field根据vocab转换为str - - """ - - def __init__(self, vocab, field_name, new_added_field_name): - super(Index2WordProcessor, self).__init__(field_name, new_added_field_name) - self.vocab = vocab - - def process(self, dataset): - dataset.apply(lambda ins: [self.vocab.to_word(w) for w in ins[self.field_name]], - new_field_name=self.new_added_field_name) - return dataset 
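The `ModelProcessor` removed above batches a `DataSet`, calls the model's `forward`, and un-pads each per-token output back to its true length before writing it into the dataset. The un-padding step in isolation, as a small numpy-based sketch with an illustrative name:

```python
import numpy as np

def unpad_batch(values: np.ndarray, seq_lens):
    """values: padded model output of shape [batch, max_len, ...];
    keep only the first seq_len positions of each sample."""
    return [values[i, :seq_len] for i, seq_len in enumerate(seq_lens)]
```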
- - -class SetTargetProcessor(Processor): - def __init__(self, *fields, flag=True): - super(SetTargetProcessor, self).__init__(None, None) - self.fields = fields - self.flag = flag - - def process(self, dataset): - dataset.set_target(*self.fields, flag=self.flag) - return dataset - - -class SetInputProcessor(Processor): - def __init__(self, *fields, flag=True): - super(SetInputProcessor, self).__init__(None, None) - self.fields = fields - self.flag = flag - - def process(self, dataset): - dataset.set_input(*self.fields, flag=self.flag) - return dataset - - -class VocabIndexerProcessor(Processor): - """ - 根据DataSet创建Vocabulary,并将其用数字index。新生成的index的field会被放在new_added_filed_name, 如果没有提供 - new_added_field_name, 则覆盖原有的field_name. - - """ - - def __init__(self, field_name, new_added_filed_name=None, min_freq=1, max_size=None, - verbose=0, is_input=True): - """ - - :param field_name: 从哪个field_name创建词表,以及对哪个field_name进行index操作 - :param new_added_filed_name: index时,生成的index field的名称,如果不传入,则覆盖field_name. - :param min_freq: 创建的Vocabulary允许的单词最少出现次数. - :param max_size: 创建的Vocabulary允许的最大的单词数量 - :param verbose: 0, 不输出任何信息;1,输出信息 - :param bool is_input: - """ - super(VocabIndexerProcessor, self).__init__(field_name, new_added_filed_name) - self.min_freq = min_freq - self.max_size = max_size - - self.verbose = verbose - self.is_input = is_input - - def construct_vocab(self, *datasets): - """ - 使用传入的DataSet创建vocabulary - - :param datasets: DataSet类型的数据,用于构建vocabulary - :return: - """ - self.vocab = Vocabulary(min_freq=self.min_freq, max_size=self.max_size) - for dataset in datasets: - assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset)) - dataset.apply(lambda ins: self.vocab.update(ins[self.field_name])) - self.vocab.build_vocab() - if self.verbose: - print("Vocabulary Constructed, has {} items.".format(len(self.vocab))) - - def process(self, *datasets, only_index_dataset=None): - """ - 若还未建立Vocabulary,则使用dataset中的DataSet建立vocabulary;若已经有了vocabulary则使用已有的vocabulary。得到vocabulary - 后,则会index datasets与only_index_dataset。 - - :param datasets: DataSet类型的数据 - :param only_index_dataset: DataSet, or list of DataSet. 该参数中的内容只会被用于index,不会被用于生成vocabulary。 - :return: - """ - if len(datasets) == 0 and not hasattr(self, 'vocab'): - raise RuntimeError("You have to construct vocabulary first. 
Or you have to pass datasets to construct it.") - if not hasattr(self, 'vocab'): - self.construct_vocab(*datasets) - else: - if self.verbose: - print("Using constructed vocabulary with {} items.".format(len(self.vocab))) - to_index_datasets = [] - if len(datasets) != 0: - for dataset in datasets: - assert isinstance(dataset, DataSet), "Only DataSet class is allowed, not {}.".format(type(dataset)) - to_index_datasets.append(dataset) - - if not (only_index_dataset is None): - if isinstance(only_index_dataset, list): - for dataset in only_index_dataset: - assert isinstance(dataset, DataSet), "Only DataSet class is allowed, not {}.".format(type(dataset)) - to_index_datasets.append(dataset) - elif isinstance(only_index_dataset, DataSet): - to_index_datasets.append(only_index_dataset) - else: - raise TypeError('Only DataSet or list of DataSet is allowed, not {}.'.format(type(only_index_dataset))) - - for dataset in to_index_datasets: - assert isinstance(dataset, DataSet), "Only DataSet class is allowed, not {}.".format(type(dataset)) - dataset.apply(lambda ins: [self.vocab.to_index(token) for token in ins[self.field_name]], - new_field_name=self.new_added_field_name, is_input=self.is_input) - # 只返回一个,infer时为了跟其他processor保持一致 - if len(to_index_datasets) == 1: - return to_index_datasets[0] - - def set_vocab(self, vocab): - assert isinstance(vocab, Vocabulary), "Only fastNLP.core.Vocabulary is allowed, not {}.".format(type(vocab)) - self.vocab = vocab - - def delete_vocab(self): - del self.vocab - - def get_vocab_size(self): - return len(self.vocab) - - def set_verbose(self, verbose): - """ - 设置processor verbose状态。 - - :param verbose: int, 0,不输出任何信息;1,输出vocab 信息。 - :return: - """ - self.verbose = verbose diff --git a/legacy/api/utils.py b/legacy/api/utils.py deleted file mode 100644 index 184e5fe6..00000000 --- a/legacy/api/utils.py +++ /dev/null @@ -1,134 +0,0 @@ -import hashlib -import os -import re -import shutil -import sys -import tempfile - -import torch - -try: - from requests.utils import urlparse - from requests import get as urlopen - requests_available = True -except ImportError: - requests_available = False - if sys.version_info[0] == 2: - from urlparse import urlparse # noqa f811 - from urllib2 import urlopen # noqa f811 - else: - from urllib.request import urlopen - from urllib.parse import urlparse -try: - from tqdm.auto import tqdm -except: - from fastNLP.core.utils import _pseudo_tqdm as tqdm - -# matches bfd8deac from resnet18-bfd8deac.pth -HASH_REGEX = re.compile(r'-([a-f0-9]*)\.') - - -def load_url(url, model_dir=None, map_location=None, progress=True): - r"""Loads the Torch serialized object at the given URL. - - If the object is already present in `model_dir`, it's deserialized and - returned. The filename part of the URL should follow the naming convention - ``filename-.ext`` where ```` is the first eight or more - digits of the SHA256 hash of the contents of the file. The hash is used to - ensure unique names and to verify the contents of the file. - - The default value of `model_dir` is ``$TORCH_HOME/models`` where - ``$TORCH_HOME`` defaults to ``~/.torch``. The default directory can be - overridden with the ``$TORCH_MODEL_ZOO`` environment variable. 
- - Args: - url (string): URL of the object to download - model_dir (string, optional): directory in which to save the object - map_location (optional): a function or a dict specifying how to remap storage locations (see torch.load) - progress (bool, optional): whether or not to display a progress bar to stderr - - Example: - # >>> state_dict = model_zoo.load_url('https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth') - - """ - if model_dir is None: - torch_home = os.path.expanduser(os.getenv('fastNLP_HOME', '~/.fastNLP')) - model_dir = os.getenv('fastNLP_MODEL_ZOO', os.path.join(torch_home, 'models')) - if not os.path.exists(model_dir): - os.makedirs(model_dir) - parts = urlparse(url) - filename = os.path.basename(parts.path) - cached_file = os.path.join(model_dir, filename) - if not os.path.exists(cached_file): - sys.stderr.write('Downloading: "{}" to {}\n'.format(url, cached_file)) - # hash_prefix = HASH_REGEX.search(filename).group(1) - _download_url_to_file(url, cached_file, hash_prefix=None, progress=progress) - return torch.load(cached_file, map_location=map_location) - - -def _download_url_to_file(url, dst, hash_prefix, progress): - if requests_available: - u = urlopen(url, stream=True) - file_size = int(u.headers["Content-Length"]) - u = u.raw - else: - u = urlopen(url) - meta = u.info() - if hasattr(meta, 'getheaders'): - file_size = int(meta.getheaders("Content-Length")[0]) - else: - file_size = int(meta.get_all("Content-Length")[0]) - - f = tempfile.NamedTemporaryFile(delete=False) - try: - if hash_prefix is not None: - sha256 = hashlib.sha256() - with tqdm(total=file_size, disable=not progress) as pbar: - while True: - buffer = u.read(8192) - if len(buffer) == 0: - break - f.write(buffer) - if hash_prefix is not None: - sha256.update(buffer) - pbar.update(len(buffer)) - - f.close() - if hash_prefix is not None: - digest = sha256.hexdigest() - if digest[:len(hash_prefix)] != hash_prefix: - raise RuntimeError('invalid hash value (expected "{}", got "{}")' - .format(hash_prefix, digest)) - shutil.move(f.name, dst) - finally: - f.close() - if os.path.exists(f.name): - os.remove(f.name) - - -if tqdm is None: - # fake tqdm if it's not installed - class tqdm(object): - - def __init__(self, total, disable=False): - self.total = total - self.disable = disable - self.n = 0 - - def update(self, n): - if self.disable: - return - - self.n += n - sys.stderr.write("\r{0:.1f}%".format(100 * self.n / float(self.total))) - sys.stderr.flush() - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - if self.disable: - return - - sys.stderr.write('\n') - diff --git a/legacy/automl/enas_controller.py b/legacy/automl/enas_controller.py deleted file mode 100644 index 6ddbb211..00000000 --- a/legacy/automl/enas_controller.py +++ /dev/null @@ -1,223 +0,0 @@ -# Code Modified from https://github.com/carpedm20/ENAS-pytorch -"""A module with NAS controller-related code.""" -import collections -import os - -import torch -import torch.nn.functional as F - -import fastNLP.automl.enas_utils as utils -from fastNLP.automl.enas_utils import Node - - -def _construct_dags(prev_nodes, activations, func_names, num_blocks): - """Constructs a set of DAGs based on the actions, i.e., previous nodes and - activation functions, sampled from the controller/policy pi. - - Args: - prev_nodes: Previous node actions from the policy. - activations: Activations sampled from the policy. - func_names: Mapping from activation function names to functions. 
- num_blocks: Number of blocks in the target RNN cell. - - Returns: - A list of DAGs defined by the inputs. - - RNN cell DAGs are represented in the following way: - - 1. Each element (node) in a DAG is a list of `Node`s. - - 2. The `Node`s in the list dag[i] correspond to the subsequent nodes - that take the output from node i as their own input. - - 3. dag[-1] is the node that takes input from x^{(t)} and h^{(t - 1)}. - dag[-1] always feeds dag[0]. - dag[-1] acts as if `w_xc`, `w_hc`, `w_xh` and `w_hh` are its - weights. - - 4. dag[N - 1] is the node that produces the hidden state passed to - the next timestep. dag[N - 1] is also always a leaf node, and therefore - is always averaged with the other leaf nodes and fed to the output - decoder. - """ - dags = [] - for nodes, func_ids in zip(prev_nodes, activations): - dag = collections.defaultdict(list) - - # add first node - dag[-1] = [Node(0, func_names[func_ids[0]])] - dag[-2] = [Node(0, func_names[func_ids[0]])] - - # add following nodes - for jdx, (idx, func_id) in enumerate(zip(nodes, func_ids[1:])): - dag[utils.to_item(idx)].append(Node(jdx + 1, func_names[func_id])) - - leaf_nodes = set(range(num_blocks)) - dag.keys() - - # merge with avg - for idx in leaf_nodes: - dag[idx] = [Node(num_blocks, 'avg')] - - # This is actually y^{(t)}. h^{(t)} is node N - 1 in - # the graph, where N Is the number of nodes. I.e., h^{(t)} takes - # only one other node as its input. - # last h[t] node - last_node = Node(num_blocks + 1, 'h[t]') - dag[num_blocks] = [last_node] - dags.append(dag) - - return dags - - -class Controller(torch.nn.Module): - """Based on - https://github.com/pytorch/examples/blob/master/word_language_model/model.py - - RL controllers do not necessarily have much to do with - language models. - - Base the controller RNN on the GRU from: - https://github.com/ikostrikov/pytorch-a2c-ppo-acktr/blob/master/model.py - """ - def __init__(self, num_blocks=4, controller_hid=100, cuda=False): - torch.nn.Module.__init__(self) - - # `num_tokens` here is just the activation function - # for every even step, - self.shared_rnn_activations = ['tanh', 'ReLU', 'identity', 'sigmoid'] - self.num_tokens = [len(self.shared_rnn_activations)] - self.controller_hid = controller_hid - self.use_cuda = cuda - self.num_blocks = num_blocks - for idx in range(num_blocks): - self.num_tokens += [idx + 1, len(self.shared_rnn_activations)] - self.func_names = self.shared_rnn_activations - - num_total_tokens = sum(self.num_tokens) - - self.encoder = torch.nn.Embedding(num_total_tokens, - controller_hid) - self.lstm = torch.nn.LSTMCell(controller_hid, controller_hid) - - # Perhaps these weights in the decoder should be - # shared? At least for the activation functions, which all have the - # same size. 
- self.decoders = [] - for idx, size in enumerate(self.num_tokens): - decoder = torch.nn.Linear(controller_hid, size) - self.decoders.append(decoder) - - self._decoders = torch.nn.ModuleList(self.decoders) - - self.reset_parameters() - self.static_init_hidden = utils.keydefaultdict(self.init_hidden) - - def _get_default_hidden(key): - return utils.get_variable( - torch.zeros(key, self.controller_hid), - self.use_cuda, - requires_grad=False) - - self.static_inputs = utils.keydefaultdict(_get_default_hidden) - - def reset_parameters(self): - init_range = 0.1 - for param in self.parameters(): - param.data.uniform_(-init_range, init_range) - for decoder in self.decoders: - decoder.bias.data.fill_(0) - - def forward(self, # pylint:disable=arguments-differ - inputs, - hidden, - block_idx, - is_embed): - if not is_embed: - embed = self.encoder(inputs) - else: - embed = inputs - - hx, cx = self.lstm(embed, hidden) - logits = self.decoders[block_idx](hx) - - logits /= 5.0 - - # # exploration - # if self.args.mode == 'train': - # logits = (2.5 * F.tanh(logits)) - - return logits, (hx, cx) - - def sample(self, batch_size=1, with_details=False, save_dir=None): - """Samples a set of `args.num_blocks` many computational nodes from the - controller, where each node is made up of an activation function, and - each node except the last also includes a previous node. - """ - if batch_size < 1: - raise Exception(f'Wrong batch_size: {batch_size} < 1') - - # [B, L, H] - inputs = self.static_inputs[batch_size] - hidden = self.static_init_hidden[batch_size] - - activations = [] - entropies = [] - log_probs = [] - prev_nodes = [] - # The RNN controller alternately outputs an activation, - # followed by a previous node, for each block except the last one, - # which only gets an activation function. The last node is the output - # node, and its previous node is the average of all leaf nodes. - for block_idx in range(2*(self.num_blocks - 1) + 1): - logits, hidden = self.forward(inputs, - hidden, - block_idx, - is_embed=(block_idx == 0)) - - probs = F.softmax(logits, dim=-1) - log_prob = F.log_softmax(logits, dim=-1) - # .mean() for entropy? - entropy = -(log_prob * probs).sum(1, keepdim=False) - - action = probs.multinomial(num_samples=1).data - selected_log_prob = log_prob.gather( - 1, utils.get_variable(action, requires_grad=False)) - - # why the [:, 0] here? Should it be .squeeze(), or - # .view()? Same below with `action`. 
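On the question raised in the comment above: `multinomial(num_samples=1)` returns a `[batch, 1]` tensor and `gather(1, action)` keeps that trailing dimension, so `[:, 0]` (equivalently `.squeeze(1)`) reduces both back to `[batch]`. A self-contained sketch of the same sampling step, assuming a recent PyTorch:

```python
import torch
import torch.nn.functional as F

def sample_action(logits: torch.Tensor):
    """Sample one action per row; return (action, selected log-prob, entropy),
    each of shape [batch], mirroring the controller's sampling step."""
    probs = F.softmax(logits, dim=-1)
    log_probs = F.log_softmax(logits, dim=-1)
    entropy = -(log_probs * probs).sum(dim=-1)         # [batch]
    action = torch.multinomial(probs, num_samples=1)   # [batch, 1]
    selected = log_probs.gather(1, action)[:, 0]       # [batch]
    return action[:, 0], selected, entropy
```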
- entropies.append(entropy) - log_probs.append(selected_log_prob[:, 0]) - - # 0: function, 1: previous node - mode = block_idx % 2 - inputs = utils.get_variable( - action[:, 0] + sum(self.num_tokens[:mode]), - requires_grad=False) - - if mode == 0: - activations.append(action[:, 0]) - elif mode == 1: - prev_nodes.append(action[:, 0]) - - prev_nodes = torch.stack(prev_nodes).transpose(0, 1) - activations = torch.stack(activations).transpose(0, 1) - - dags = _construct_dags(prev_nodes, - activations, - self.func_names, - self.num_blocks) - - if save_dir is not None: - for idx, dag in enumerate(dags): - utils.draw_network(dag, - os.path.join(save_dir, f'graph{idx}.png')) - - if with_details: - return dags, torch.cat(log_probs), torch.cat(entropies) - - return dags - - def init_hidden(self, batch_size): - zeros = torch.zeros(batch_size, self.controller_hid) - return (utils.get_variable(zeros, self.use_cuda, requires_grad=False), - utils.get_variable(zeros.clone(), self.use_cuda, requires_grad=False)) diff --git a/legacy/automl/enas_model.py b/legacy/automl/enas_model.py deleted file mode 100644 index 4f9fb449..00000000 --- a/legacy/automl/enas_model.py +++ /dev/null @@ -1,388 +0,0 @@ -# Code Modified from https://github.com/carpedm20/ENAS-pytorch - -"""Module containing the shared RNN model.""" -import collections - -import numpy as np -import torch -import torch.nn.functional as F -from torch import nn -from torch.autograd import Variable - -import fastNLP.automl.enas_utils as utils -from fastNLP.models.base_model import BaseModel - - -def _get_dropped_weights(w_raw, dropout_p, is_training): - """Drops out weights to implement DropConnect. - - Args: - w_raw: Full, pre-dropout, weights to be dropped out. - dropout_p: Proportion of weights to drop out. - is_training: True iff _shared_ model is training. - - Returns: - The dropped weights. - - Why does torch.nn.functional.dropout() return: - 1. `torch.autograd.Variable()` on the training loop - 2. `torch.nn.Parameter()` on the controller or eval loop, when - training = False... - - Even though the call to `_setweights` in the Smerity repo's - `weight_drop.py` does not have this behaviour, and `F.dropout` always - returns `torch.autograd.Variable` there, even when `training=False`? - - The above TODO is the reason for the hacky check for `torch.nn.Parameter`. - """ - dropped_w = F.dropout(w_raw, p=dropout_p, training=is_training) - - if isinstance(dropped_w, torch.nn.Parameter): - dropped_w = dropped_w.clone() - - return dropped_w - -class EmbeddingDropout(torch.nn.Embedding): - """Class for dropping out embeddings by zero'ing out parameters in the - embedding matrix. - - This is equivalent to dropping out particular words, e.g., in the sentence - 'the quick brown fox jumps over the lazy dog', dropping out 'the' would - lead to the sentence '### quick brown fox jumps over ### lazy dog' (in the - embedding vector space). - - See 'A Theoretically Grounded Application of Dropout in Recurrent Neural - Networks', (Gal and Ghahramani, 2016). - """ - def __init__(self, - num_embeddings, - embedding_dim, - max_norm=None, - norm_type=2, - scale_grad_by_freq=False, - sparse=False, - dropout=0.1, - scale=None): - """Embedding constructor. - - Args: - dropout: Dropout probability. - scale: Used to scale parameters of embedding weight matrix that are - not dropped out. Note that this is _in addition_ to the - `1/(1 - dropout)` scaling. - - See `torch.nn.Embedding` for remaining arguments. 
- """ - torch.nn.Embedding.__init__(self, - num_embeddings=num_embeddings, - embedding_dim=embedding_dim, - max_norm=max_norm, - norm_type=norm_type, - scale_grad_by_freq=scale_grad_by_freq, - sparse=sparse) - self.dropout = dropout - assert (dropout >= 0.0) and (dropout < 1.0), ('Dropout must be >= 0.0 ' - 'and < 1.0') - self.scale = scale - - def forward(self, inputs): # pylint:disable=arguments-differ - """Embeds `inputs` with the dropped out embedding weight matrix.""" - if self.training: - dropout = self.dropout - else: - dropout = 0 - - if dropout: - mask = self.weight.data.new(self.weight.size(0), 1) - mask.bernoulli_(1 - dropout) - mask = mask.expand_as(self.weight) - mask = mask / (1 - dropout) - masked_weight = self.weight * Variable(mask) - else: - masked_weight = self.weight - if self.scale and self.scale != 1: - masked_weight = masked_weight * self.scale - - return F.embedding(inputs, - masked_weight, - max_norm=self.max_norm, - norm_type=self.norm_type, - scale_grad_by_freq=self.scale_grad_by_freq, - sparse=self.sparse) - - -class LockedDropout(nn.Module): - # code from https://github.com/salesforce/awd-lstm-lm/blob/master/locked_dropout.py - def __init__(self): - super().__init__() - - def forward(self, x, dropout=0.5): - if not self.training or not dropout: - return x - m = x.data.new(1, x.size(1), x.size(2)).bernoulli_(1 - dropout) - mask = Variable(m, requires_grad=False) / (1 - dropout) - mask = mask.expand_as(x) - return mask * x - - -class ENASModel(BaseModel): - """Shared RNN model.""" - def __init__(self, embed_num, num_classes, num_blocks=4, cuda=False, shared_hid=1000, shared_embed=1000): - super(ENASModel, self).__init__() - - self.use_cuda = cuda - - self.shared_hid = shared_hid - self.num_blocks = num_blocks - self.decoder = nn.Linear(self.shared_hid, num_classes) - self.encoder = EmbeddingDropout(embed_num, - shared_embed, - dropout=0.1) - self.lockdrop = LockedDropout() - self.dag = None - - # Tie weights - # self.decoder.weight = self.encoder.weight - - # Since W^{x, c} and W^{h, c} are always summed, there - # is no point duplicating their bias offset parameter. Likewise for - # W^{x, h} and W^{h, h}. - self.w_xc = nn.Linear(shared_embed, self.shared_hid) - self.w_xh = nn.Linear(shared_embed, self.shared_hid) - - # The raw weights are stored here because the hidden-to-hidden weights - # are weight dropped on the forward pass. 
- self.w_hc_raw = torch.nn.Parameter( - torch.Tensor(self.shared_hid, self.shared_hid)) - self.w_hh_raw = torch.nn.Parameter( - torch.Tensor(self.shared_hid, self.shared_hid)) - self.w_hc = None - self.w_hh = None - - self.w_h = collections.defaultdict(dict) - self.w_c = collections.defaultdict(dict) - - for idx in range(self.num_blocks): - for jdx in range(idx + 1, self.num_blocks): - self.w_h[idx][jdx] = nn.Linear(self.shared_hid, - self.shared_hid, - bias=False) - self.w_c[idx][jdx] = nn.Linear(self.shared_hid, - self.shared_hid, - bias=False) - - self._w_h = nn.ModuleList([self.w_h[idx][jdx] - for idx in self.w_h - for jdx in self.w_h[idx]]) - self._w_c = nn.ModuleList([self.w_c[idx][jdx] - for idx in self.w_c - for jdx in self.w_c[idx]]) - - self.batch_norm = None - # if args.mode == 'train': - # self.batch_norm = nn.BatchNorm1d(self.shared_hid) - # else: - # self.batch_norm = None - - self.reset_parameters() - self.static_init_hidden = utils.keydefaultdict(self.init_hidden) - - def setDAG(self, dag): - if self.dag is None: - self.dag = dag - - def forward(self, word_seq, hidden=None): - inputs = torch.transpose(word_seq, 0, 1) - - time_steps = inputs.size(0) - batch_size = inputs.size(1) - - - self.w_hh = _get_dropped_weights(self.w_hh_raw, - 0.5, - self.training) - self.w_hc = _get_dropped_weights(self.w_hc_raw, - 0.5, - self.training) - - # hidden = self.static_init_hidden[batch_size] if hidden is None else hidden - hidden = self.static_init_hidden[batch_size] - - embed = self.encoder(inputs) - - embed = self.lockdrop(embed, 0.65 if self.training else 0) - - # The norm of hidden states are clipped here because - # otherwise ENAS is especially prone to exploding activations on the - # forward pass. This could probably be fixed in a more elegant way, but - # it might be exposing a weakness in the ENAS algorithm as currently - # proposed. - # - # For more details, see - # https://github.com/carpedm20/ENAS-pytorch/issues/6 - clipped_num = 0 - max_clipped_norm = 0 - h1tohT = [] - logits = [] - for step in range(time_steps): - x_t = embed[step] - logit, hidden = self.cell(x_t, hidden, self.dag) - - hidden_norms = hidden.norm(dim=-1) - max_norm = 25.0 - if hidden_norms.data.max() > max_norm: - # Just directly use the torch slice operations - # in PyTorch v0.4. - # - # This workaround for PyTorch v0.3.1 does everything in numpy, - # because the PyTorch slicing and slice assignment is too - # flaky. 
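On PyTorch 0.4 and later the numpy round-trip that follows is avoidable; the same per-row rescaling of over-norm hidden states can be written with tensor operations only. A hedged sketch of that equivalent, assuming `hidden` is a 2-D `[batch, hid]` tensor (the function name is illustrative):

```python
import torch

def clip_hidden_norm(hidden: torch.Tensor, max_norm: float = 25.0) -> torch.Tensor:
    """Rescale any row of `hidden` whose L2 norm exceeds `max_norm` down to `max_norm`."""
    norms = hidden.norm(dim=-1, keepdim=True)   # [batch, 1]
    scale = (max_norm / norms).clamp(max=1.0)   # rows already under the limit keep scale 1
    return hidden * scale

h = torch.randn(4, 16) * 10
clipped = clip_hidden_norm(h)
print(clipped.norm(dim=-1))  # every entry is <= 25.0 (up to floating-point error)
```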
- hidden_norms = hidden_norms.data.cpu().numpy() - - clipped_num += 1 - if hidden_norms.max() > max_clipped_norm: - max_clipped_norm = hidden_norms.max() - - clip_select = hidden_norms > max_norm - clip_norms = hidden_norms[clip_select] - - mask = np.ones(hidden.size()) - normalizer = max_norm/clip_norms - normalizer = normalizer[:, np.newaxis] - - mask[clip_select] = normalizer - - if self.use_cuda: - hidden *= torch.autograd.Variable( - torch.FloatTensor(mask).cuda(), requires_grad=False) - else: - hidden *= torch.autograd.Variable( - torch.FloatTensor(mask), requires_grad=False) - logits.append(logit) - h1tohT.append(hidden) - - h1tohT = torch.stack(h1tohT) - output = torch.stack(logits) - raw_output = output - - output = self.lockdrop(output, 0.4 if self.training else 0) - - #Pooling - output = torch.mean(output, 0) - - decoded = self.decoder(output) - - extra_out = {'dropped': decoded, - 'hiddens': h1tohT, - 'raw': raw_output} - return {'pred': decoded, 'hidden': hidden, 'extra_out': extra_out} - - def cell(self, x, h_prev, dag): - """Computes a single pass through the discovered RNN cell.""" - c = {} - h = {} - f = {} - - f[0] = self.get_f(dag[-1][0].name) - c[0] = torch.sigmoid(self.w_xc(x) + F.linear(h_prev, self.w_hc, None)) - h[0] = (c[0]*f[0](self.w_xh(x) + F.linear(h_prev, self.w_hh, None)) + - (1 - c[0])*h_prev) - - leaf_node_ids = [] - q = collections.deque() - q.append(0) - - # Computes connections from the parent nodes `node_id` - # to their child nodes `next_id` recursively, skipping leaf nodes. A - # leaf node is a node whose id == `self.num_blocks`. - # - # Connections between parent i and child j should be computed as - # h_j = c_j*f_{ij}{(W^h_{ij}*h_i)} + (1 - c_j)*h_i, - # where c_j = \sigmoid{(W^c_{ij}*h_i)} - # - # See Training details from Section 3.1 of the paper. - # - # The following algorithm does a breadth-first (since `q.popleft()` is - # used) search over the nodes and computes all the hidden states. - while True: - if len(q) == 0: - break - - node_id = q.popleft() - nodes = dag[node_id] - - for next_node in nodes: - next_id = next_node.id - if next_id == self.num_blocks: - leaf_node_ids.append(node_id) - assert len(nodes) == 1, ('parent of leaf node should have ' - 'only one child') - continue - - w_h = self.w_h[node_id][next_id] - w_c = self.w_c[node_id][next_id] - - f[next_id] = self.get_f(next_node.name) - c[next_id] = torch.sigmoid(w_c(h[node_id])) - h[next_id] = (c[next_id]*f[next_id](w_h(h[node_id])) + - (1 - c[next_id])*h[node_id]) - - q.append(next_id) - - # Instead of averaging loose ends, perhaps there should - # be a set of separate unshared weights for each "loose" connection - # between each node in a cell and the output. - # - # As it stands, all weights W^h_{ij} are doing double duty by - # connecting both from i to j, as well as from i to the output. 
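Each edge of the sampled DAG therefore applies a highway-style update: a learned gate decides how much of the transformed parent state to mix into the child state. A minimal sketch of a single parent-to-child step, matching the formula `h_j = c_j * f_ij(W^h_ij h_i) + (1 - c_j) * h_i` from the comments above (the sizes and the choice of `tanh` are placeholders):

```python
import torch
from torch import nn

hid = 32
w_h = nn.Linear(hid, hid, bias=False)   # W^h_{ij}
w_c = nn.Linear(hid, hid, bias=False)   # W^c_{ij}
f = torch.tanh                          # activation chosen by the controller for node j

h_parent = torch.randn(4, hid)          # h_i for a batch of 4
c = torch.sigmoid(w_c(h_parent))        # c_j = sigmoid(W^c_{ij} h_i)
h_child = c * f(w_h(h_parent)) + (1 - c) * h_parent  # h_j
print(h_child.shape)  # torch.Size([4, 32])
```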
- - # average all the loose ends - leaf_nodes = [h[node_id] for node_id in leaf_node_ids] - output = torch.mean(torch.stack(leaf_nodes, 2), -1) - - # stabilizing the Updates of omega - if self.batch_norm is not None: - output = self.batch_norm(output) - - return output, h[self.num_blocks - 1] - - def init_hidden(self, batch_size): - zeros = torch.zeros(batch_size, self.shared_hid) - return utils.get_variable(zeros, self.use_cuda, requires_grad=False) - - def get_f(self, name): - name = name.lower() - if name == 'relu': - f = torch.relu - elif name == 'tanh': - f = torch.tanh - elif name == 'identity': - f = lambda x: x - elif name == 'sigmoid': - f = torch.sigmoid - return f - - - @property - def num_parameters(self): - def size(p): - return np.prod(p.size()) - return sum([size(param) for param in self.parameters()]) - - - def reset_parameters(self): - init_range = 0.025 - # init_range = 0.025 if self.args.mode == 'train' else 0.04 - for param in self.parameters(): - param.data.uniform_(-init_range, init_range) - self.decoder.bias.data.fill_(0) - - def predict(self, word_seq): - """ - - :param word_seq: torch.LongTensor, [batch_size, seq_len] - :return predict: dict of torch.LongTensor, [batch_size, seq_len] - """ - output = self(word_seq) - _, predict = output['pred'].max(dim=1) - return {'pred': predict} diff --git a/legacy/automl/enas_trainer.py b/legacy/automl/enas_trainer.py deleted file mode 100644 index e3524aa9..00000000 --- a/legacy/automl/enas_trainer.py +++ /dev/null @@ -1,383 +0,0 @@ -# Code Modified from https://github.com/carpedm20/ENAS-pytorch - -import math -import time -from datetime import datetime -from datetime import timedelta - -import numpy as np -import torch - -try: - from tqdm.auto import tqdm -except: - from fastNLP.core.utils import _pseudo_tqdm as tqdm - -from fastNLP.core.batch import Batch -from fastNLP.core.callback import CallbackException -from fastNLP.core.dataset import DataSet -from fastNLP.core.utils import _move_dict_value_to_device -import fastNLP -from . import enas_utils as utils -from fastNLP.core.utils import _build_args - -from torch.optim import Adam - - -def _get_no_grad_ctx_mgr(): - """Returns a the `torch.no_grad` context manager for PyTorch version >= - 0.4, or a no-op context manager otherwise. - """ - return torch.no_grad() - - -class ENASTrainer(fastNLP.Trainer): - """A class to wrap training code.""" - def __init__(self, train_data, model, controller, **kwargs): - """Constructor for training algorithm. 
- :param DataSet train_data: the training data - :param torch.nn.modules.module model: a PyTorch model - :param torch.nn.modules.module controller: a PyTorch model - """ - self.final_epochs = kwargs['final_epochs'] - kwargs.pop('final_epochs') - super(ENASTrainer, self).__init__(train_data, model, **kwargs) - self.controller_step = 0 - self.shared_step = 0 - self.max_length = 35 - - self.shared = model - self.controller = controller - - self.shared_optim = Adam( - self.shared.parameters(), - lr=20.0, - weight_decay=1e-7) - - self.controller_optim = Adam( - self.controller.parameters(), - lr=3.5e-4) - - def train(self, load_best_model=True): - """ - :param bool load_best_model: 该参数只有在初始化提供了dev_data的情况下有效,如果True, trainer将在返回之前重新加载dev表现 - 最好的模型参数。 - :return results: 返回一个字典类型的数据, - 内含以下内容:: - - seconds: float, 表示训练时长 - 以下三个内容只有在提供了dev_data的情况下会有。 - best_eval: Dict of Dict, 表示evaluation的结果 - best_epoch: int,在第几个epoch取得的最佳值 - best_step: int, 在第几个step(batch)更新取得的最佳值 - - """ - results = {} - if self.n_epochs <= 0: - print(f"training epoch is {self.n_epochs}, nothing was done.") - results['seconds'] = 0. - return results - try: - if torch.cuda.is_available() and self.use_cuda: - self.model = self.model.cuda() - self._model_device = self.model.parameters().__next__().device - self._mode(self.model, is_test=False) - - self.start_time = str(datetime.now().strftime('%Y-%m-%d-%H-%M-%S')) - start_time = time.time() - print("training epochs started " + self.start_time, flush=True) - - try: - self.callback_manager.on_train_begin() - self._train() - self.callback_manager.on_train_end(self.model) - except (CallbackException, KeyboardInterrupt) as e: - self.callback_manager.on_exception(e, self.model) - - if self.dev_data is not None: - print("\nIn Epoch:{}/Step:{}, got best dev performance:".format(self.best_dev_epoch, self.best_dev_step) + - self.tester._format_eval_results(self.best_dev_perf),) - results['best_eval'] = self.best_dev_perf - results['best_epoch'] = self.best_dev_epoch - results['best_step'] = self.best_dev_step - if load_best_model: - model_name = "best_" + "_".join([self.model.__class__.__name__, self.metric_key, self.start_time]) - load_succeed = self._load_model(self.model, model_name) - if load_succeed: - print("Reloaded the best model.") - else: - print("Fail to reload best model.") - finally: - pass - results['seconds'] = round(time.time() - start_time, 2) - - return results - - def _train(self): - if not self.use_tqdm: - from fastNLP.core.utils import _pseudo_tqdm as inner_tqdm - else: - inner_tqdm = tqdm - self.step = 0 - start = time.time() - total_steps = (len(self.train_data) // self.batch_size + int( - len(self.train_data) % self.batch_size != 0)) * self.n_epochs - with inner_tqdm(total=total_steps, postfix='loss:{0:<6.5f}', leave=False, dynamic_ncols=True) as pbar: - avg_loss = 0 - data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False, - prefetch=self.prefetch) - for epoch in range(1, self.n_epochs+1): - pbar.set_description_str(desc="Epoch {}/{}".format(epoch, self.n_epochs)) - last_stage = (epoch > self.n_epochs + 1 - self.final_epochs) - if epoch == self.n_epochs + 1 - self.final_epochs: - print('Entering the final stage. (Only train the selected structure)') - # early stopping - self.callback_manager.on_epoch_begin(epoch, self.n_epochs) - - # 1. Training the shared parameters omega of the child models - self.train_shared(pbar) - - # 2. 
Training the controller parameters theta - if not last_stage: - self.train_controller() - - if ((self.validate_every > 0 and self.step % self.validate_every == 0) or - (self.validate_every < 0 and self.step % len(data_iterator) == 0)) \ - and self.dev_data is not None: - if not last_stage: - self.derive() - eval_res = self._do_validation(epoch=epoch, step=self.step) - eval_str = "Evaluation at Epoch {}/{}. Step:{}/{}. ".format(epoch, self.n_epochs, self.step, - total_steps) + \ - self.tester._format_eval_results(eval_res) - pbar.write(eval_str) - - # lr decay; early stopping - self.callback_manager.on_epoch_end(epoch, self.n_epochs, self.optimizer) - # =============== epochs end =================== # - pbar.close() - # ============ tqdm end ============== # - - - def get_loss(self, inputs, targets, hidden, dags): - """Computes the loss for the same batch for M models. - - This amounts to an estimate of the loss, which is turned into an - estimate for the gradients of the shared model. - """ - if not isinstance(dags, list): - dags = [dags] - - loss = 0 - for dag in dags: - self.shared.setDAG(dag) - inputs = _build_args(self.shared.forward, **inputs) - inputs['hidden'] = hidden - result = self.shared(**inputs) - output, hidden, extra_out = result['pred'], result['hidden'], result['extra_out'] - - self.callback_manager.on_loss_begin(targets, result) - sample_loss = self._compute_loss(result, targets) - loss += sample_loss - - assert len(dags) == 1, 'there are multiple `hidden` for multple `dags`' - return loss, hidden, extra_out - - def train_shared(self, pbar=None, max_step=None, dag=None): - """Train the language model for 400 steps of minibatches of 64 - examples. - - Args: - max_step: Used to run extra training steps as a warm-up. - dag: If not None, is used instead of calling sample(). - - BPTT is truncated at 35 timesteps. - - For each weight update, gradients are estimated by sampling M models - from the fixed controller policy, and averaging their gradients - computed on a batch of training data. - """ - model = self.shared - model.train() - self.controller.eval() - - hidden = self.shared.init_hidden(self.batch_size) - - abs_max_grad = 0 - abs_max_hidden_norm = 0 - step = 0 - raw_total_loss = 0 - total_loss = 0 - train_idx = 0 - avg_loss = 0 - data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False, - prefetch=self.prefetch) - - for batch_x, batch_y in data_iterator: - _move_dict_value_to_device(batch_x, batch_y, device=self._model_device) - indices = data_iterator.get_batch_indices() - # negative sampling; replace unknown; re-weight batch_y - self.callback_manager.on_batch_begin(batch_x, batch_y, indices) - # prediction = self._data_forward(self.model, batch_x) - - dags = self.controller.sample(1) - inputs, targets = batch_x, batch_y - # self.callback_manager.on_loss_begin(batch_y, prediction) - loss, hidden, extra_out = self.get_loss(inputs, - targets, - hidden, - dags) - hidden.detach_() - - avg_loss += loss.item() - - # Is loss NaN or inf? 
requires_grad = False - self.callback_manager.on_backward_begin(loss, self.model) - self._grad_backward(loss) - self.callback_manager.on_backward_end(self.model) - - self._update() - self.callback_manager.on_step_end(self.optimizer) - - if (self.step+1) % self.print_every == 0: - if self.use_tqdm: - print_output = "loss:{0:<6.5f}".format(avg_loss / self.print_every) - pbar.update(self.print_every) - else: - end = time.time() - diff = timedelta(seconds=round(end - start)) - print_output = "[epoch: {:>3} step: {:>4}] train loss: {:>4.6} time: {}".format( - epoch, self.step, avg_loss, diff) - pbar.set_postfix_str(print_output) - avg_loss = 0 - self.step += 1 - step += 1 - self.shared_step += 1 - self.callback_manager.on_batch_end() - # ================= mini-batch end ==================== # - - - def get_reward(self, dag, entropies, hidden, valid_idx=0): - """Computes the perplexity of a single sampled model on a minibatch of - validation data. - """ - if not isinstance(entropies, np.ndarray): - entropies = entropies.data.cpu().numpy() - - data_iterator = Batch(self.dev_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False, - prefetch=self.prefetch) - - for inputs, targets in data_iterator: - valid_loss, hidden, _ = self.get_loss(inputs, targets, hidden, dag) - valid_loss = utils.to_item(valid_loss.data) - - valid_ppl = math.exp(valid_loss) - - R = 80 / valid_ppl - - rewards = R + 1e-4 * entropies - - return rewards, hidden - - def train_controller(self): - """Fixes the shared parameters and updates the controller parameters. - - The controller is updated with a score function gradient estimator - (i.e., REINFORCE), with the reward being c/valid_ppl, where valid_ppl - is computed on a minibatch of validation data. - - A moving average baseline is used. - - The controller is trained for 2000 steps per epoch (i.e., - first (Train Shared) phase -> second (Train Controller) phase). - """ - model = self.controller - model.train() - # Why can't we call shared.eval() here? Leads to loss - # being uniformly zero for the controller. - # self.shared.eval() - - avg_reward_base = None - baseline = None - adv_history = [] - entropy_history = [] - reward_history = [] - - hidden = self.shared.init_hidden(self.batch_size) - total_loss = 0 - valid_idx = 0 - for step in range(20): - # sample models - dags, log_probs, entropies = self.controller.sample( - with_details=True) - - # calculate reward - np_entropies = entropies.data.cpu().numpy() - # No gradients should be backpropagated to the - # shared model during controller training, obviously. 
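The update described in this docstring combines the shaped reward (`R = c / valid_ppl` plus a small entropy bonus), an exponential-moving-average baseline, and a score-function (REINFORCE) loss over the controller's log-probabilities. A compact standalone sketch of one such step, stripped of the Trainer plumbing (all numeric values below are made up for illustration):

```python
import math

import numpy as np
import torch

# Made-up values standing in for one controller step.
valid_ppl = math.exp(4.2)                      # perplexity of the sampled child model
entropies = np.array([1.3, 0.9, 1.1])          # controller entropies for this sample
log_probs = torch.tensor([-1.2, -0.7, -1.5])   # controller log-probs for this sample

rewards = 80.0 / valid_ppl + 1e-4 * entropies  # R = c / valid_ppl plus entropy bonus

# Exponential moving-average baseline reduces the variance of the gradient estimate.
baseline = np.array([0.5, 0.5, 0.5])           # baseline carried over from earlier steps
decay = 0.95
baseline = decay * baseline + (1 - decay) * rewards

adv = torch.as_tensor(rewards - baseline, dtype=torch.float32)
# In the trainer, gradients flow only through `log_probs` (the controller outputs).
policy_loss = (-log_probs * adv).sum()
print(policy_loss.item())
```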
- with _get_no_grad_ctx_mgr(): - rewards, hidden = self.get_reward(dags, - np_entropies, - hidden, - valid_idx) - - - reward_history.extend(rewards) - entropy_history.extend(np_entropies) - - # moving average baseline - if baseline is None: - baseline = rewards - else: - decay = 0.95 - baseline = decay * baseline + (1 - decay) * rewards - - adv = rewards - baseline - adv_history.extend(adv) - - # policy loss - loss = -log_probs*utils.get_variable(adv, - self.use_cuda, - requires_grad=False) - - loss = loss.sum() # or loss.mean() - - # update - self.controller_optim.zero_grad() - loss.backward() - - self.controller_optim.step() - - total_loss += utils.to_item(loss.data) - - if ((step % 50) == 0) and (step > 0): - reward_history, adv_history, entropy_history = [], [], [] - total_loss = 0 - - self.controller_step += 1 - # prev_valid_idx = valid_idx - # valid_idx = ((valid_idx + self.max_length) % - # (self.valid_data.size(0) - 1)) - # # Whenever we wrap around to the beginning of the - # # validation data, we reset the hidden states. - # if prev_valid_idx > valid_idx: - # hidden = self.shared.init_hidden(self.batch_size) - - def derive(self, sample_num=10, valid_idx=0): - """We are always deriving based on the very first batch - of validation data? This seems wrong... - """ - hidden = self.shared.init_hidden(self.batch_size) - - dags, _, entropies = self.controller.sample(sample_num, - with_details=True) - - max_R = 0 - best_dag = None - for dag in dags: - R, _ = self.get_reward(dag, entropies, hidden, valid_idx) - if R.max() > max_R: - max_R = R.max() - best_dag = dag - - self.model.setDAG(best_dag) diff --git a/legacy/automl/enas_utils.py b/legacy/automl/enas_utils.py deleted file mode 100644 index 7a53dd12..00000000 --- a/legacy/automl/enas_utils.py +++ /dev/null @@ -1,53 +0,0 @@ -# Code Modified from https://github.com/carpedm20/ENAS-pytorch - -from __future__ import print_function - -import collections -from collections import defaultdict - -import numpy as np -import torch -from torch.autograd import Variable - - -def detach(h): - if type(h) == Variable: - return Variable(h.data) - else: - return tuple(detach(v) for v in h) - -def get_variable(inputs, cuda=False, **kwargs): - if type(inputs) in [list, np.ndarray]: - inputs = torch.Tensor(inputs) - if cuda: - out = Variable(inputs.cuda(), **kwargs) - else: - out = Variable(inputs, **kwargs) - return out - -def update_lr(optimizer, lr): - for param_group in optimizer.param_groups: - param_group['lr'] = lr - -Node = collections.namedtuple('Node', ['id', 'name']) - - -class keydefaultdict(defaultdict): - def __missing__(self, key): - if self.default_factory is None: - raise KeyError(key) - else: - ret = self[key] = self.default_factory(key) - return ret - - -def to_item(x): - """Converts x, possibly scalar and possibly tensor, to a Python scalar.""" - if isinstance(x, (float, int)): - return x - - if float(torch.__version__[0:3]) < 0.4: - assert (x.dim() == 1) and (len(x) == 1) - return x[0] - - return x.item() diff --git a/legacy/component/__init__.py b/legacy/component/__init__.py deleted file mode 100644 index c6784aef..00000000 --- a/legacy/component/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .bert_tokenizer import BertTokenizer diff --git a/legacy/component/bert_tokenizer.py b/legacy/component/bert_tokenizer.py deleted file mode 100644 index 6354076d..00000000 --- a/legacy/component/bert_tokenizer.py +++ /dev/null @@ -1,378 +0,0 @@ -""" -bert_tokenizer.py is modified from huggingface/pytorch-pretrained-BERT, which is licensed under 
the Apache License 2.0. -""" -import collections -import os -import unicodedata -from io import open - - -PRETRAINED_VOCAB_ARCHIVE_MAP = { - 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", - 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", - 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", - 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", - 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt", - 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", - 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt", -} -PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = { - 'bert-base-uncased': 512, - 'bert-large-uncased': 512, - 'bert-base-cased': 512, - 'bert-large-cased': 512, - 'bert-base-multilingual-uncased': 512, - 'bert-base-multilingual-cased': 512, - 'bert-base-chinese': 512, -} -VOCAB_NAME = 'vocab.txt' - - -def load_vocab(vocab_file): - """Loads a vocabulary file into a dictionary.""" - vocab = collections.OrderedDict() - index = 0 - with open(vocab_file, "r", encoding="utf-8") as reader: - while True: - token = reader.readline() - if not token: - break - token = token.strip() - vocab[token] = index - index += 1 - return vocab - - -def whitespace_tokenize(text): - """Runs basic whitespace cleaning and splitting on a piece of text.""" - text = text.strip() - if not text: - return [] - tokens = text.split() - return tokens - - -class BertTokenizer(object): - """Runs end-to-end tokenization: punctuation splitting + wordpiece""" - - def __init__(self, vocab_file, do_lower_case=True, max_len=None, do_basic_tokenize=True, - never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")): - """Constructs a BertTokenizer. - Args: - vocab_file: Path to a one-wordpiece-per-line vocabulary file - do_lower_case: Whether to lower case the input - Only has an effect when do_wordpiece_only=False - do_basic_tokenize: Whether to do basic tokenization before wordpiece. - max_len: An artificial maximum length to truncate tokenized sequences to; - Effective maximum length is always the minimum of this - value (if specified) and the underlying BERT model's - sequence length. - never_split: List of tokens which will never be split during tokenization. - Only has an effect when do_wordpiece_only=False - """ - if not os.path.isfile(vocab_file): - raise ValueError( - "Can't find a vocabulary file at path '{}'. 
To load the vocabulary from a Google pretrained " - "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)) - self.vocab = load_vocab(vocab_file) - self.ids_to_tokens = collections.OrderedDict( - [(ids, tok) for tok, ids in self.vocab.items()]) - self.do_basic_tokenize = do_basic_tokenize - if do_basic_tokenize: - self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case, - never_split=never_split) - self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) - self.max_len = max_len if max_len is not None else int(1e12) - - def tokenize(self, text): - split_tokens = [] - if self.do_basic_tokenize: - for token in self.basic_tokenizer.tokenize(text): - for sub_token in self.wordpiece_tokenizer.tokenize(token): - split_tokens.append(sub_token) - else: - split_tokens = self.wordpiece_tokenizer.tokenize(text) - return split_tokens - - def convert_tokens_to_ids(self, tokens): - """Converts a sequence of tokens into ids using the vocab.""" - ids = [] - for token in tokens: - ids.append(self.vocab[token]) - if len(ids) > self.max_len: - print( - "WARNING!\n\"" - "Token indices sequence length is longer than the specified maximum " - "sequence length for this BERT model ({} > {}). Running this" - " sequence through BERT will result in indexing errors".format(len(ids), self.max_len) - ) - return ids - - def convert_ids_to_tokens(self, ids): - """Converts a sequence of ids in wordpiece tokens using the vocab.""" - tokens = [] - for i in ids: - tokens.append(self.ids_to_tokens[i]) - return tokens - - def save_vocabulary(self, vocab_path): - """Save the tokenizer vocabulary to a directory or file.""" - index = 0 - if os.path.isdir(vocab_path): - vocab_file = os.path.join(vocab_path, VOCAB_NAME) - with open(vocab_file, "w", encoding="utf-8") as writer: - for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): - if index != token_index: - print("Saving vocabulary to {}: vocabulary indices are not consecutive." - " Please check that the vocabulary is not corrupted!".format(vocab_file)) - index = token_index - writer.write(token + u'\n') - index += 1 - return vocab_file - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs): - """ - Instantiate a PreTrainedBertModel from a pre-trained model file. - Download and cache the pre-trained model file if needed. - """ - if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: - vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path] - if '-cased' in pretrained_model_name_or_path and kwargs.get('do_lower_case', True): - print("The pre-trained model you are loading is a cased model but you have not set " - "`do_lower_case` to False. We are setting `do_lower_case=False` for you but " - "you may want to check this behavior.") - kwargs['do_lower_case'] = False - elif '-cased' not in pretrained_model_name_or_path and not kwargs.get('do_lower_case', True): - print("The pre-trained model you are loading is an uncased model but you have set " - "`do_lower_case` to False. 
We are setting `do_lower_case=True` for you " - "but you may want to check this behavior.") - kwargs['do_lower_case'] = True - else: - vocab_file = pretrained_model_name_or_path - if os.path.isdir(vocab_file): - vocab_file = os.path.join(vocab_file, VOCAB_NAME) - # redirect to the cache, if necessary - resolved_vocab_file = vocab_file - print("loading vocabulary file {}".format(vocab_file)) - if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP: - # if we're using a pretrained model, ensure the tokenizer wont index sequences longer - # than the number of positional embeddings - max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path] - kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) - # Instantiate tokenizer. - tokenizer = cls(resolved_vocab_file, *inputs, **kwargs) - return tokenizer - - -class BasicTokenizer(object): - """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" - - def __init__(self, - do_lower_case=True, - never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")): - """Constructs a BasicTokenizer. - Args: - do_lower_case: Whether to lower case the input. - """ - self.do_lower_case = do_lower_case - self.never_split = never_split - - def tokenize(self, text): - """Tokenizes a piece of text.""" - text = self._clean_text(text) - # This was added on November 1st, 2018 for the multilingual and Chinese - # models. This is also applied to the English models now, but it doesn't - # matter since the English models were not trained on any Chinese data - # and generally don't have any Chinese data in them (there are Chinese - # characters in the vocabulary because Wikipedia does have some Chinese - # words in the English Wikipedia.). - text = self._tokenize_chinese_chars(text) - orig_tokens = whitespace_tokenize(text) - split_tokens = [] - for token in orig_tokens: - if self.do_lower_case and token not in self.never_split: - token = token.lower() - token = self._run_strip_accents(token) - split_tokens.extend(self._run_split_on_punc(token)) - - output_tokens = whitespace_tokenize(" ".join(split_tokens)) - return output_tokens - - def _run_strip_accents(self, text): - """Strips accents from a piece of text.""" - text = unicodedata.normalize("NFD", text) - output = [] - for char in text: - cat = unicodedata.category(char) - if cat == "Mn": - continue - output.append(char) - return "".join(output) - - def _run_split_on_punc(self, text): - """Splits punctuation on a piece of text.""" - if text in self.never_split: - return [text] - chars = list(text) - i = 0 - start_new_word = True - output = [] - while i < len(chars): - char = chars[i] - if _is_punctuation(char): - output.append([char]) - start_new_word = True - else: - if start_new_word: - output.append([]) - start_new_word = False - output[-1].append(char) - i += 1 - - return ["".join(x) for x in output] - - def _tokenize_chinese_chars(self, text): - """Adds whitespace around any CJK character.""" - output = [] - for char in text: - cp = ord(char) - if self._is_chinese_char(cp): - output.append(" ") - output.append(char) - output.append(" ") - else: - output.append(char) - return "".join(output) - - def _is_chinese_char(self, cp): - """Checks whether CP is the codepoint of a CJK character.""" - # This defines a "chinese character" as anything in the CJK Unicode block: - # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) - # - # Note that the CJK Unicode block is NOT all Japanese and Korean characters, - # despite 
its name. The modern Korean Hangul alphabet is a different block, - # as is Japanese Hiragana and Katakana. Those alphabets are used to write - # space-separated words, so they are not treated specially and handled - # like the all of the other languages. - if ((cp >= 0x4E00 and cp <= 0x9FFF) or # - (cp >= 0x3400 and cp <= 0x4DBF) or # - (cp >= 0x20000 and cp <= 0x2A6DF) or # - (cp >= 0x2A700 and cp <= 0x2B73F) or # - (cp >= 0x2B740 and cp <= 0x2B81F) or # - (cp >= 0x2B820 and cp <= 0x2CEAF) or - (cp >= 0xF900 and cp <= 0xFAFF) or # - (cp >= 0x2F800 and cp <= 0x2FA1F)): # - return True - - return False - - def _clean_text(self, text): - """Performs invalid character removal and whitespace cleanup on text.""" - output = [] - for char in text: - cp = ord(char) - if cp == 0 or cp == 0xfffd or _is_control(char): - continue - if _is_whitespace(char): - output.append(" ") - else: - output.append(char) - return "".join(output) - - -class WordpieceTokenizer(object): - """Runs WordPiece tokenization.""" - - def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): - self.vocab = vocab - self.unk_token = unk_token - self.max_input_chars_per_word = max_input_chars_per_word - - def tokenize(self, text): - """Tokenizes a piece of text into its word pieces. - This uses a greedy longest-match-first algorithm to perform tokenization - using the given vocabulary. - For example: - input = "unaffable" - output = ["un", "##aff", "##able"] - Args: - text: A single token or whitespace separated tokens. This should have - already been passed through `BasicTokenizer`. - Returns: - A list of wordpiece tokens. - """ - - output_tokens = [] - for token in whitespace_tokenize(text): - chars = list(token) - if len(chars) > self.max_input_chars_per_word: - output_tokens.append(self.unk_token) - continue - - is_bad = False - start = 0 - sub_tokens = [] - while start < len(chars): - end = len(chars) - cur_substr = None - while start < end: - substr = "".join(chars[start:end]) - if start > 0: - substr = "##" + substr - if substr in self.vocab: - cur_substr = substr - break - end -= 1 - if cur_substr is None: - is_bad = True - break - sub_tokens.append(cur_substr) - start = end - - if is_bad: - output_tokens.append(self.unk_token) - else: - output_tokens.extend(sub_tokens) - return output_tokens - - -def _is_whitespace(char): - """Checks whether `chars` is a whitespace character.""" - # \t, \n, and \r are technically contorl characters but we treat them - # as whitespace since they are generally considered as such. - if char == " " or char == "\t" or char == "\n" or char == "\r": - return True - cat = unicodedata.category(char) - if cat == "Zs": - return True - return False - - -def _is_control(char): - """Checks whether `chars` is a control character.""" - # These are technically control characters but we count them as whitespace - # characters. - if char == "\t" or char == "\n" or char == "\r": - return False - cat = unicodedata.category(char) - if cat.startswith("C"): - return True - return False - - -def _is_punctuation(char): - """Checks whether `chars` is a punctuation character.""" - cp = ord(char) - # We treat all non-letter/number ASCII as punctuation. - # Characters such as "^", "$", and "`" are not in the Unicode - # Punctuation class but we treat them as punctuation anyways, for - # consistency. 
- if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or - (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): - return True - cat = unicodedata.category(char) - if cat.startswith("P"): - return True - return False - diff --git a/reproduction/README.md b/reproduction/README.md index c2478713..1ddca315 100644 --- a/reproduction/README.md +++ b/reproduction/README.md @@ -3,8 +3,8 @@ 复现的模型有: - [Star-Transformer](Star_transformer) -- [Biaffine](https://github.com/fastnlp/fastNLP/blob/999a14381747068e9e6a7cc370037b320197db00/fastNLP/models/biaffine_parser.py#L239) -- [CNNText](https://github.com/fastnlp/fastNLP/blob/999a14381747068e9e6a7cc370037b320197db00/fastNLP/models/cnn_text_classification.py#L12) +- [Biaffine](https://github.com/fastnlp/fastNLP/blob/master/fastNLP/models/biaffine_parser.py) +- [CNNText](https://github.com/fastnlp/fastNLP/blob/master/fastNLP/models/cnn_text_classification.py) - ... # 任务复现 @@ -17,11 +17,11 @@ ## Sequence Labeling (序列标注) -- [NER](seqence_labelling/ner) +- [NER](sequence_labelling/ner) -## Coreference Resolution (共指消解) -- [Coreference Resolution 共指消解任务复现](coreference_resolution) +## Coreference Resolution (指代消解) +- [Coreference Resolution 指代消解任务复现](coreference_resolution) ## Summarization (摘要) diff --git a/reproduction/Summarization/Baseline/data/dataloader.py b/reproduction/Summarization/Baseline/data/dataloader.py deleted file mode 100644 index 47cd0856..00000000 --- a/reproduction/Summarization/Baseline/data/dataloader.py +++ /dev/null @@ -1,188 +0,0 @@ -import pickle -import numpy as np - -from fastNLP.core.vocabulary import Vocabulary -from fastNLP.io.base_loader import DataBundle -from fastNLP.io.dataset_loader import JsonLoader -from fastNLP.core.const import Const - -from tools.logger import * - -WORD_PAD = "[PAD]" -WORD_UNK = "[UNK]" -DOMAIN_UNK = "X" -TAG_UNK = "X" - - -class SummarizationLoader(JsonLoader): - """ - 读取summarization数据集,读取的DataSet包含fields:: - - text: list(str),document - summary: list(str), summary - text_wd: list(list(str)),tokenized document - summary_wd: list(list(str)), tokenized summary - labels: list(int), - flatten_label: list(int), 0 or 1, flatten labels - domain: str, optional - tag: list(str), optional - - 数据来源: CNN_DailyMail Newsroom DUC - """ - - def __init__(self): - super(SummarizationLoader, self).__init__() - - def _load(self, path): - ds = super(SummarizationLoader, self)._load(path) - - def _lower_text(text_list): - return [text.lower() for text in text_list] - - def _split_list(text_list): - return [text.split() for text in text_list] - - def _convert_label(label, sent_len): - np_label = np.zeros(sent_len, dtype=int) - if label != []: - np_label[np.array(label)] = 1 - return np_label.tolist() - - ds.apply(lambda x: _lower_text(x['text']), new_field_name='text') - ds.apply(lambda x: _lower_text(x['summary']), new_field_name='summary') - ds.apply(lambda x:_split_list(x['text']), new_field_name='text_wd') - ds.apply(lambda x:_split_list(x['summary']), new_field_name='summary_wd') - ds.apply(lambda x:_convert_label(x["label"], len(x["text"])), new_field_name="flatten_label") - - return ds - - def process(self, paths, vocab_size, vocab_path, sent_max_len, doc_max_timesteps, domain=False, tag=False, load_vocab_file=True): - """ - :param paths: dict path for each dataset - :param vocab_size: int max_size for vocab - :param vocab_path: str vocab path - :param sent_max_len: int max token number of the sentence - :param doc_max_timesteps: int max sentence number of the document - :param domain: bool build vocab for 
publication, use 'X' for unknown - :param tag: bool build vocab for tag, use 'X' for unknown - :param load_vocab_file: bool build vocab (False) or load vocab (True) - :return: DataBundle - datasets: dict keys correspond to the paths dict - vocabs: dict key: vocab(if "train" in paths), domain(if domain=True), tag(if tag=True) - embeddings: optional - """ - - def _pad_sent(text_wd): - pad_text_wd = [] - for sent_wd in text_wd: - if len(sent_wd) < sent_max_len: - pad_num = sent_max_len - len(sent_wd) - sent_wd.extend([WORD_PAD] * pad_num) - else: - sent_wd = sent_wd[:sent_max_len] - pad_text_wd.append(sent_wd) - return pad_text_wd - - def _token_mask(text_wd): - token_mask_list = [] - for sent_wd in text_wd: - token_num = len(sent_wd) - if token_num < sent_max_len: - mask = [1] * token_num + [0] * (sent_max_len - token_num) - else: - mask = [1] * sent_max_len - token_mask_list.append(mask) - return token_mask_list - - def _pad_label(label): - text_len = len(label) - if text_len < doc_max_timesteps: - pad_label = label + [0] * (doc_max_timesteps - text_len) - else: - pad_label = label[:doc_max_timesteps] - return pad_label - - def _pad_doc(text_wd): - text_len = len(text_wd) - if text_len < doc_max_timesteps: - padding = [WORD_PAD] * sent_max_len - pad_text = text_wd + [padding] * (doc_max_timesteps - text_len) - else: - pad_text = text_wd[:doc_max_timesteps] - return pad_text - - def _sent_mask(text_wd): - text_len = len(text_wd) - if text_len < doc_max_timesteps: - sent_mask = [1] * text_len + [0] * (doc_max_timesteps - text_len) - else: - sent_mask = [1] * doc_max_timesteps - return sent_mask - - - datasets = {} - train_ds = None - for key, value in paths.items(): - ds = self.load(value) - # pad sent - ds.apply(lambda x:_pad_sent(x["text_wd"]), new_field_name="pad_text_wd") - ds.apply(lambda x:_token_mask(x["text_wd"]), new_field_name="pad_token_mask") - # pad document - ds.apply(lambda x:_pad_doc(x["pad_text_wd"]), new_field_name="pad_text") - ds.apply(lambda x:_sent_mask(x["pad_text_wd"]), new_field_name="seq_len") - ds.apply(lambda x:_pad_label(x["flatten_label"]), new_field_name="pad_label") - - # rename field - ds.rename_field("pad_text", Const.INPUT) - ds.rename_field("seq_len", Const.INPUT_LEN) - ds.rename_field("pad_label", Const.TARGET) - - # set input and target - ds.set_input(Const.INPUT, Const.INPUT_LEN) - ds.set_target(Const.TARGET, Const.INPUT_LEN) - - datasets[key] = ds - if "train" in key: - train_ds = datasets[key] - - vocab_dict = {} - if load_vocab_file == False: - logger.info("[INFO] Build new vocab from training dataset!") - if train_ds == None: - raise ValueError("Lack train file to build vocabulary!") - - vocabs = Vocabulary(max_size=vocab_size, padding=WORD_PAD, unknown=WORD_UNK) - vocabs.from_dataset(train_ds, field_name=["text_wd","summary_wd"]) - vocab_dict["vocab"] = vocabs - else: - logger.info("[INFO] Load existing vocab from %s!" 
% vocab_path) - word_list = [] - with open(vocab_path, 'r', encoding='utf8') as vocab_f: - cnt = 2 # pad and unk - for line in vocab_f: - pieces = line.split("\t") - word_list.append(pieces[0]) - cnt += 1 - if cnt > vocab_size: - break - vocabs = Vocabulary(max_size=vocab_size, padding=WORD_PAD, unknown=WORD_UNK) - vocabs.add_word_lst(word_list) - vocabs.build_vocab() - vocab_dict["vocab"] = vocabs - - if domain == True: - domaindict = Vocabulary(padding=None, unknown=DOMAIN_UNK) - domaindict.from_dataset(train_ds, field_name="publication") - vocab_dict["domain"] = domaindict - if tag == True: - tagdict = Vocabulary(padding=None, unknown=TAG_UNK) - tagdict.from_dataset(train_ds, field_name="tag") - vocab_dict["tag"] = tagdict - - for ds in datasets.values(): - vocab_dict["vocab"].index_dataset(ds, field_name=Const.INPUT, new_field_name=Const.INPUT) - - return DataBundle(vocabs=vocab_dict, datasets=datasets) - - - diff --git a/reproduction/Summarization/Baseline/model/Encoder.py b/reproduction/Summarization/Baseline/model/Encoder.py index 8a30fd29..271270b3 100644 --- a/reproduction/Summarization/Baseline/model/Encoder.py +++ b/reproduction/Summarization/Baseline/model/Encoder.py @@ -94,6 +94,8 @@ class Encoder(nn.Module): if self._hps.cuda: input_pos = input_pos.cuda() enc_pos_embed_input = self.position_embedding(input_pos.long()) # [batch_size*N, D] + # print(enc_embed_input.size()) + # print(enc_pos_embed_input.size()) enc_conv_input = enc_embed_input + enc_pos_embed_input enc_conv_input = enc_conv_input.unsqueeze(1) # (batch * N,Ci,L,D) enc_conv_output = [F.relu(conv(enc_conv_input)).squeeze(3) for conv in self.convs] # kernel_sizes * (batch*N, Co, W) diff --git a/reproduction/Summarization/Baseline/model/LSTMModel.py b/reproduction/Summarization/Baseline/model/LSTMModel.py index 1fae03dd..3dfbf6ba 100644 --- a/reproduction/Summarization/Baseline/model/LSTMModel.py +++ b/reproduction/Summarization/Baseline/model/LSTMModel.py @@ -17,11 +17,12 @@ class SummarizationModel(nn.Module): """ :param hps: hyperparameters for the model - :param vocab: vocab object + :param embed: word embedding """ super(SummarizationModel, self).__init__() self._hps = hps + self.Train = (hps.mode == 'train') # sentence encoder self.encoder = Encoder(hps, embed) @@ -45,18 +46,19 @@ class SummarizationModel(nn.Module): self.wh = nn.Linear(self.d_v, 2) - def forward(self, input, input_len, Train): + def forward(self, words, seq_len): """ :param input: [batch_size, N, seq_len], word idx long tensor :param input_len: [batch_size, N], 1 for sentence and 0 for padding - :param Train: True for train and False for eval and test - :param return_atten: True or False to return multi-head attention output self.output_slf_attn :return: p_sent: [batch_size, N, 2] output_slf_attn: (option) [n_head, batch_size, N, N] """ + input = words + input_len = seq_len + # -- Sentence Encoder self.sent_embedding = self.encoder(input) # [batch, N, Co * kernel_sizes] @@ -67,7 +69,7 @@ class SummarizationModel(nn.Module): self.inputs[0] = self.sent_embedding.permute(1, 0, 2) # [N, batch, Co * kernel_sizes] self.input_masks[0] = input_len.permute(1, 0).unsqueeze(2) - self.lstm_output_state = self.deep_lstm(self.inputs, self.input_masks, Train) # [batch, N, hidden_size] + self.lstm_output_state = self.deep_lstm(self.inputs, self.input_masks, Train=self.train) # [batch, N, hidden_size] # -- Prepare masks batch_size, N = input_len.size() diff --git a/reproduction/Summarization/Baseline/model/Loss.py 
b/reproduction/Summarization/Baseline/model/Loss.py index 24f10748..e5244261 100644 --- a/reproduction/Summarization/Baseline/model/Loss.py +++ b/reproduction/Summarization/Baseline/model/Loss.py @@ -21,7 +21,7 @@ import torch import torch.nn.functional as F from fastNLP.core.losses import LossBase -from tools.logger import * +from fastNLP.core._logger import logger class MyCrossEntropyLoss(LossBase): def __init__(self, pred=None, target=None, mask=None, padding_idx=-100, reduce='mean'): diff --git a/reproduction/Summarization/Baseline/model/Metric.py b/reproduction/Summarization/Baseline/model/Metric.py index 441c27b1..df2cd9eb 100644 --- a/reproduction/Summarization/Baseline/model/Metric.py +++ b/reproduction/Summarization/Baseline/model/Metric.py @@ -20,14 +20,60 @@ from __future__ import division import torch +import torch.nn.functional as F + from rouge import Rouge from fastNLP.core.const import Const from fastNLP.core.metrics import MetricBase -from tools.logger import * +# from tools.logger import * +from fastNLP.core._logger import logger from tools.utils import pyrouge_score_all, pyrouge_score_all_multi + +class LossMetric(MetricBase): + def __init__(self, pred=None, target=None, mask=None, padding_idx=-100, reduce='mean'): + super().__init__() + + self._init_param_map(pred=pred, target=target, mask=mask) + self.padding_idx = padding_idx + self.reduce = reduce + self.loss = 0.0 + self.iteration = 0 + + def evaluate(self, pred, target, mask): + """ + + :param pred: [batch, N, 2] + :param target: [batch, N] + :param input_mask: [batch, N] + :return: + """ + + batch, N, _ = pred.size() + pred = pred.view(-1, 2) + target = target.view(-1) + loss = F.cross_entropy(input=pred, target=target, + ignore_index=self.padding_idx, reduction=self.reduce) + loss = loss.view(batch, -1) + loss = loss.masked_fill(mask.eq(0), 0) + loss = loss.sum(1).mean() + self.loss += loss + self.iteration += 1 + + def get_metric(self, reset=True): + epoch_avg_loss = self.loss / self.iteration + if reset: + self.loss = 0.0 + self.iteration = 0 + metric = {"loss": -epoch_avg_loss} + logger.info(metric) + return metric + + + + class LabelFMetric(MetricBase): def __init__(self, pred=None, target=None): super().__init__() diff --git a/reproduction/Summarization/Baseline/model/TForiginal.py b/reproduction/Summarization/Baseline/model/TForiginal.py index e66bc061..a08a9213 100644 --- a/reproduction/Summarization/Baseline/model/TForiginal.py +++ b/reproduction/Summarization/Baseline/model/TForiginal.py @@ -51,7 +51,7 @@ class TransformerModel(nn.Module): ffn_inner_hidden_size: FFN hiddens size atten_dropout_prob: dropout size doc_max_timesteps: max sentence number of the document - :param vocab: + :param embed: word embedding """ super(TransformerModel, self).__init__() diff --git a/reproduction/Summarization/Baseline/tools/Callback.py b/reproduction/Summarization/Baseline/tools/Callback.py index 7f2e01c0..3fe27daa 100644 --- a/reproduction/Summarization/Baseline/tools/Callback.py +++ b/reproduction/Summarization/Baseline/tools/Callback.py @@ -28,7 +28,7 @@ from fastNLP.core.const import Const from fastNLP.io.model_io import ModelSaver from fastNLP.core.callback import Callback, EarlyStopError -from tools.logger import * +from fastNLP.core._logger import logger class TrainCallback(Callback): def __init__(self, hps, patience=3, quit_all=True): @@ -36,6 +36,9 @@ class TrainCallback(Callback): self._hps = hps self.patience = patience self.wait = 0 + self.train_loss = 0.0 + self.prev_train_avg_loss = 1000.0 + 
self.train_dir = os.path.join(self._hps.save_root, "train") if type(quit_all) != bool: raise ValueError("In KeyBoardInterrupt, quit_all arguemnt must be a bool.") @@ -43,20 +46,7 @@ class TrainCallback(Callback): def on_epoch_begin(self): self.epoch_start_time = time.time() - - # def on_loss_begin(self, batch_y, predict_y): - # """ - # - # :param batch_y: dict - # input_len: [batch, N] - # :param predict_y: dict - # p_sent: [batch, N, 2] - # :return: - # """ - # input_len = batch_y[Const.INPUT_LEN] - # batch_y[Const.TARGET] = batch_y[Const.TARGET] * ((1 - input_len) * -100) - # # predict_y["p_sent"] = predict_y["p_sent"] * input_len.unsqueeze(-1) - # # logger.debug(predict_y["p_sent"][0:5,:,:]) + self.model.Train = True def on_backward_begin(self, loss): """ @@ -72,19 +62,34 @@ class TrainCallback(Callback): logger.info(name) logger.info(param.grad.data.sum()) raise Exception("train Loss is not finite. Stopping.") + self.train_loss += loss.data def on_backward_end(self): if self._hps.grad_clip: torch.nn.utils.clip_grad_norm_(self.model.parameters(), self._hps.max_grad_norm) + torch.cuda.empty_cache() def on_epoch_end(self): - logger.info(' | end of epoch {:3d} | time: {:5.2f}s | ' - .format(self.epoch, (time.time() - self.epoch_start_time))) + epoch_avg_loss = self.train_loss / self.n_steps + logger.info(' | end of epoch {:3d} | time: {:5.2f}s | train loss: {:5.6f}' + .format(self.epoch, (time.time() - self.epoch_start_time), epoch_avg_loss)) + if self.prev_train_avg_loss < epoch_avg_loss: + save_file = os.path.join(self.train_dir, "earlystop.pkl") + self.save_model(save_file) + else: + self.prev_train_avg_loss = epoch_avg_loss + self.train_loss = 0.0 + + # save epoch + save_file = os.path.join(self.train_dir, "epoch_%d.pkl" % self.epoch) + self.save_model(save_file) + def on_valid_begin(self): self.valid_start_time = time.time() + self.model.Train = False def on_valid_end(self, eval_result, metric_key, optimizer, is_better_eval): logger.info(' | end of valid {:3d} | time: {:5.2f}s | ' @@ -95,9 +100,7 @@ class TrainCallback(Callback): if self.wait == self.patience: train_dir = os.path.join(self._hps.save_root, "train") save_file = os.path.join(train_dir, "earlystop.pkl") - saver = ModelSaver(save_file) - saver.save_pytorch(self.model) - logger.info('[INFO] Saving early stop model to %s', save_file) + self.save_model(save_file) raise EarlyStopError("Early stopping raised.") else: self.wait += 1 @@ -111,14 +114,12 @@ class TrainCallback(Callback): param_group['lr'] = new_lr logger.info("[INFO] The learning rate now is %f", new_lr) + def on_exception(self, exception): if isinstance(exception, KeyboardInterrupt): logger.error("[Error] Caught keyboard interrupt on worker. 
Stopping supervisor...") - train_dir = os.path.join(self._hps.save_root, "train") - save_file = os.path.join(train_dir, "earlystop.pkl") - saver = ModelSaver(save_file) - saver.save_pytorch(self.model) - logger.info('[INFO] Saving early stop model to %s', save_file) + save_file = os.path.join(self.train_dir, "earlystop.pkl") + self.save_model(save_file) if self.quit_all is True: sys.exit(0) # 直接退出程序 @@ -127,6 +128,11 @@ class TrainCallback(Callback): else: raise exception # 抛出陌生Error + def save_model(self, save_file): + saver = ModelSaver(save_file) + saver.save_pytorch(self.model) + logger.info('[INFO] Saving model to %s', save_file) + diff --git a/reproduction/Summarization/Baseline/tools/Encoder.py b/reproduction/Summarization/Baseline/tools/Encoder.py deleted file mode 100644 index f77944a6..00000000 --- a/reproduction/Summarization/Baseline/tools/Encoder.py +++ /dev/null @@ -1,562 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np - -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch.autograd import * -import torch.nn.init as init - -import data -from tools.logger import * -from transformer.Models import get_sinusoid_encoding_table - -class Encoder(nn.Module): - def __init__(self, hps, vocab): - super(Encoder, self).__init__() - - self._hps = hps - self._vocab = vocab - self.sent_max_len = hps.sent_max_len - - vocab_size = len(vocab) - logger.info("[INFO] Vocabulary size is %d", vocab_size) - embed_size = hps.word_emb_dim - sent_max_len = hps.sent_max_len - - input_channels = 1 - out_channels = hps.output_channel - min_kernel_size = hps.min_kernel_size - max_kernel_size = hps.max_kernel_size - width = embed_size - - # word embedding - self.embed = nn.Embedding(vocab_size, embed_size, padding_idx=vocab.word2id('[PAD]')) - if hps.word_embedding: - word2vec = data.Word_Embedding(hps.embedding_path, vocab) - word_vecs = word2vec.load_my_vecs(embed_size) - # pretrained_weight = word2vec.add_unknown_words_by_zero(word_vecs, embed_size) - pretrained_weight = word2vec.add_unknown_words_by_avg(word_vecs, embed_size) - pretrained_weight = np.array(pretrained_weight) - self.embed.weight.data.copy_(torch.from_numpy(pretrained_weight)) - self.embed.weight.requires_grad = hps.embed_train - - # position embedding - self.position_embedding = nn.Embedding.from_pretrained(get_sinusoid_encoding_table(sent_max_len + 1, embed_size, padding_idx=0), freeze=True) - - # cnn - self.convs = nn.ModuleList([nn.Conv2d(input_channels, out_channels, kernel_size = (height, width)) for height in range(min_kernel_size, max_kernel_size+1)]) - logger.info("[INFO] Initing W for CNN.......") - for conv in self.convs: - init_weight_value = 6.0 - init.xavier_normal_(conv.weight.data, gain=np.sqrt(init_weight_value)) - fan_in, fan_out = Encoder.calculate_fan_in_and_fan_out(conv.weight.data) - std = np.sqrt(init_weight_value) * np.sqrt(2.0 / (fan_in + fan_out)) - - def calculate_fan_in_and_fan_out(tensor): - dimensions = tensor.ndimension() - if dimensions < 2: - logger.error("[Error] Fan in and fan out can not be computed for tensor with less than 2 dimensions") - raise ValueError("[Error] Fan in and fan out can not be computed for tensor with less than 2 dimensions") - - if dimensions == 2: # Linear - fan_in = tensor.size(1) - fan_out = tensor.size(0) - else: - num_input_fmaps = tensor.size(1) - num_output_fmaps = tensor.size(0) - receptive_field_size = 1 - if tensor.dim() > 2: - receptive_field_size = 
tensor[0][0].numel() - fan_in = num_input_fmaps * receptive_field_size - fan_out = num_output_fmaps * receptive_field_size - - return fan_in, fan_out - - def forward(self, input): - # input: a batch of Example object [batch_size, N, seq_len] - vocab = self._vocab - - batch_size, N, _ = input.size() - input = input.view(-1, input.size(2)) # [batch_size*N, L] - input_sent_len = ((input!=vocab.word2id('[PAD]')).sum(dim=1)).int() # [batch_size*N, 1] - enc_embed_input = self.embed(input) # [batch_size*N, L, D] - - input_pos = torch.Tensor([np.hstack((np.arange(1, sentlen + 1), np.zeros(self.sent_max_len - sentlen))) for sentlen in input_sent_len]) - if self._hps.cuda: - input_pos = input_pos.cuda() - enc_pos_embed_input = self.position_embedding(input_pos.long()) # [batch_size*N, D] - enc_conv_input = enc_embed_input + enc_pos_embed_input - enc_conv_input = enc_conv_input.unsqueeze(1) # (batch * N,Ci,L,D) - enc_conv_output = [F.relu(conv(enc_conv_input)).squeeze(3) for conv in self.convs] # kernel_sizes * (batch*N, Co, W) - enc_maxpool_output = [F.max_pool1d(x, x.size(2)).squeeze(2) for x in enc_conv_output] # kernel_sizes * (batch*N, Co) - sent_embedding = torch.cat(enc_maxpool_output, 1) # (batch*N, Co * kernel_sizes) - sent_embedding = sent_embedding.view(batch_size, N, -1) - return sent_embedding - -class DomainEncoder(Encoder): - def __init__(self, hps, vocab, domaindict): - super(DomainEncoder, self).__init__(hps, vocab) - - # domain embedding - self.domain_embedding = nn.Embedding(domaindict.size(), hps.domain_emb_dim) - self.domain_embedding.weight.requires_grad = True - - def forward(self, input, domain): - """ - :param input: [batch_size, N, seq_len], N sentence number, seq_len token number - :param domain: [batch_size] - :return: sent_embedding: [batch_size, N, Co * kernel_sizes] - """ - - batch_size, N, _ = input.size() - - sent_embedding = super().forward(input) - enc_domain_input = self.domain_embedding(domain) # [batch, D] - enc_domain_input = enc_domain_input.unsqueeze(1).expand(batch_size, N, -1) # [batch, N, D] - sent_embedding = torch.cat((sent_embedding, enc_domain_input), dim=2) - return sent_embedding - -class MultiDomainEncoder(Encoder): - def __init__(self, hps, vocab, domaindict): - super(MultiDomainEncoder, self).__init__(hps, vocab) - - self.domain_size = domaindict.size() - - # domain embedding - self.domain_embedding = nn.Embedding(self.domain_size, hps.domain_emb_dim) - self.domain_embedding.weight.requires_grad = True - - def forward(self, input, domain): - """ - :param input: [batch_size, N, seq_len], N sentence number, seq_len token number - :param domain: [batch_size, domain_size] - :return: sent_embedding: [batch_size, N, Co * kernel_sizes] - """ - - batch_size, N, _ = input.size() - - # logger.info(domain[:5, :]) - - sent_embedding = super().forward(input) - domain_padding = torch.arange(self.domain_size).unsqueeze(0).expand(batch_size, -1) - domain_padding = domain_padding.cuda().view(-1) if self._hps.cuda else domain_padding.view(-1) # [batch * domain_size] - - enc_domain_input = self.domain_embedding(domain_padding) # [batch * domain_size, D] - enc_domain_input = enc_domain_input.view(batch_size, self.domain_size, -1) * domain.unsqueeze(-1).float() # [batch, domain_size, D] - - # logger.info(enc_domain_input[:5,:]) # [batch, domain_size, D] - - enc_domain_input = enc_domain_input.sum(1) / domain.sum(1).float().unsqueeze(-1) # [batch, D] - enc_domain_input = enc_domain_input.unsqueeze(1).expand(batch_size, N, -1) # [batch, N, D] - sent_embedding = 
torch.cat((sent_embedding, enc_domain_input), dim=2) - return sent_embedding - - -class BertEncoder(nn.Module): - def __init__(self, hps): - super(BertEncoder, self).__init__() - - from pytorch_pretrained_bert.modeling import BertModel - - self._hps = hps - self.sent_max_len = hps.sent_max_len - self._cuda = hps.cuda - - embed_size = hps.word_emb_dim - sent_max_len = hps.sent_max_len - - input_channels = 1 - out_channels = hps.output_channel - min_kernel_size = hps.min_kernel_size - max_kernel_size = hps.max_kernel_size - width = embed_size - - # word embedding - self._bert = BertModel.from_pretrained("/remote-home/dqwang/BERT/pre-train/uncased_L-24_H-1024_A-16") - self._bert.eval() - for p in self._bert.parameters(): - p.requires_grad = False - - self.word_embedding_proj = nn.Linear(4096, embed_size) - - # position embedding - self.position_embedding = nn.Embedding.from_pretrained(get_sinusoid_encoding_table(sent_max_len + 1, embed_size, padding_idx=0), freeze=True) - - # cnn - self.convs = nn.ModuleList([nn.Conv2d(input_channels, out_channels, kernel_size = (height, width)) for height in range(min_kernel_size, max_kernel_size+1)]) - logger.info("[INFO] Initing W for CNN.......") - for conv in self.convs: - init_weight_value = 6.0 - init.xavier_normal_(conv.weight.data, gain=np.sqrt(init_weight_value)) - fan_in, fan_out = Encoder.calculate_fan_in_and_fan_out(conv.weight.data) - std = np.sqrt(init_weight_value) * np.sqrt(2.0 / (fan_in + fan_out)) - - def calculate_fan_in_and_fan_out(tensor): - dimensions = tensor.ndimension() - if dimensions < 2: - logger.error("[Error] Fan in and fan out can not be computed for tensor with less than 2 dimensions") - raise ValueError("[Error] Fan in and fan out can not be computed for tensor with less than 2 dimensions") - - if dimensions == 2: # Linear - fan_in = tensor.size(1) - fan_out = tensor.size(0) - else: - num_input_fmaps = tensor.size(1) - num_output_fmaps = tensor.size(0) - receptive_field_size = 1 - if tensor.dim() > 2: - receptive_field_size = tensor[0][0].numel() - fan_in = num_input_fmaps * receptive_field_size - fan_out = num_output_fmaps * receptive_field_size - - return fan_in, fan_out - - def pad_encoder_input(self, input_list): - """ - :param input_list: N [seq_len, hidden_state] - :return: enc_sent_input_pad: list, N [max_len, hidden_state] - """ - max_len = self.sent_max_len - enc_sent_input_pad = [] - _, hidden_size = input_list[0].size() - for i in range(len(input_list)): - article_words = input_list[i] # [seq_len, hidden_size] - seq_len = article_words.size(0) - if seq_len > max_len: - pad_words = article_words[:max_len, :] - else: - pad_tensor = torch.zeros(max_len - seq_len, hidden_size).cuda() if self._cuda else torch.zeros(max_len - seq_len, hidden_size) - pad_words = torch.cat([article_words, pad_tensor], dim=0) - enc_sent_input_pad.append(pad_words) - return enc_sent_input_pad - - def forward(self, inputs, input_masks, enc_sent_len): - """ - - :param inputs: a batch of Example object [batch_size, doc_len=512] - :param input_masks: 0 or 1, [batch, doc_len=512] - :param enc_sent_len: sentence original length [batch, N] - :return: - """ - - - # Use Bert to get word embedding - batch_size, N = enc_sent_len.size() - input_pad_list = [] - for i in range(batch_size): - tokens_id = inputs[i] - input_mask = input_masks[i] - sent_len = enc_sent_len[i] - input_ids = tokens_id.unsqueeze(0) - input_mask = input_mask.unsqueeze(0) - - out, _ = self._bert(input_ids, token_type_ids=None, attention_mask=input_mask) - out = torch.cat(out[-4:], 
dim=-1).squeeze(0) # [doc_len=512, hidden_state=4096] - - _, hidden_size = out.size() - - # restore the sentence - last_end = 1 - enc_sent_input = [] - for length in sent_len: - if length != 0 and last_end < 511: - enc_sent_input.append(out[last_end: min(511, last_end + length), :]) - last_end += length - else: - pad_tensor = torch.zeros(self.sent_max_len, hidden_size).cuda() if self._hps.cuda else torch.zeros(self.sent_max_len, hidden_size) - enc_sent_input.append(pad_tensor) - - - # pad the sentence - enc_sent_input_pad = self.pad_encoder_input(enc_sent_input) # [N, seq_len, hidden_state=4096] - input_pad_list.append(torch.stack(enc_sent_input_pad)) - - input_pad = torch.stack(input_pad_list) - - input_pad = input_pad.view(batch_size*N, self.sent_max_len, -1) - enc_sent_len = enc_sent_len.view(-1) # [batch_size*N] - enc_embed_input = self.word_embedding_proj(input_pad) # [batch_size * N, L, D] - - sent_pos_list = [] - for sentlen in enc_sent_len: - sent_pos = list(range(1, min(self.sent_max_len, sentlen) + 1)) - for k in range(self.sent_max_len - sentlen): - sent_pos.append(0) - sent_pos_list.append(sent_pos) - input_pos = torch.Tensor(sent_pos_list).long() - - if self._hps.cuda: - input_pos = input_pos.cuda() - enc_pos_embed_input = self.position_embedding(input_pos.long()) # [batch_size*N, D] - enc_conv_input = enc_embed_input + enc_pos_embed_input - enc_conv_input = enc_conv_input.unsqueeze(1) # (batch * N,Ci,L,D) - enc_conv_output = [F.relu(conv(enc_conv_input)).squeeze(3) for conv in self.convs] # kernel_sizes * (batch*N, Co, W) - enc_maxpool_output = [F.max_pool1d(x, x.size(2)).squeeze(2) for x in enc_conv_output] # kernel_sizes * (batch*N, Co) - sent_embedding = torch.cat(enc_maxpool_output, 1) # (batch*N, Co * kernel_sizes) - sent_embedding = sent_embedding.view(batch_size, N, -1) - return sent_embedding - - -class BertTagEncoder(BertEncoder): - def __init__(self, hps, domaindict): - super(BertTagEncoder, self).__init__(hps) - - # domain embedding - self.domain_embedding = nn.Embedding(domaindict.size(), hps.domain_emb_dim) - self.domain_embedding.weight.requires_grad = True - - def forward(self, inputs, input_masks, enc_sent_len, domain): - sent_embedding = super().forward(inputs, input_masks, enc_sent_len) - - batch_size, N = enc_sent_len.size() - - enc_domain_input = self.domain_embedding(domain) # [batch, D] - enc_domain_input = enc_domain_input.unsqueeze(1).expand(batch_size, N, -1) # [batch, N, D] - sent_embedding = torch.cat((sent_embedding, enc_domain_input), dim=2) - - return sent_embedding - -class ELMoEndoer(nn.Module): - def __init__(self, hps): - super(ELMoEndoer, self).__init__() - - self._hps = hps - self.sent_max_len = hps.sent_max_len - - from allennlp.modules.elmo import Elmo - - elmo_dim = 1024 - options_file = "/remote-home/dqwang/ELMo/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json" - weight_file = "/remote-home/dqwang/ELMo/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5" - - # elmo_dim = 512 - # options_file = "/remote-home/dqwang/ELMo/elmo_2x2048_256_2048cnn_1xhighway_options.json" - # weight_file = "/remote-home/dqwang/ELMo/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5" - - embed_size = hps.word_emb_dim - sent_max_len = hps.sent_max_len - - input_channels = 1 - out_channels = hps.output_channel - min_kernel_size = hps.min_kernel_size - max_kernel_size = hps.max_kernel_size - width = embed_size - - # elmo embedding - self.elmo = Elmo(options_file, weight_file, 1, dropout=0) - self.embed_proj = nn.Linear(elmo_dim, embed_size) - - # position 
embedding - self.position_embedding = nn.Embedding.from_pretrained(get_sinusoid_encoding_table(sent_max_len + 1, embed_size, padding_idx=0), freeze=True) - - # cnn - self.convs = nn.ModuleList([nn.Conv2d(input_channels, out_channels, kernel_size = (height, width)) for height in range(min_kernel_size, max_kernel_size+1)]) - logger.info("[INFO] Initing W for CNN.......") - for conv in self.convs: - init_weight_value = 6.0 - init.xavier_normal_(conv.weight.data, gain=np.sqrt(init_weight_value)) - fan_in, fan_out = Encoder.calculate_fan_in_and_fan_out(conv.weight.data) - std = np.sqrt(init_weight_value) * np.sqrt(2.0 / (fan_in + fan_out)) - - def calculate_fan_in_and_fan_out(tensor): - dimensions = tensor.ndimension() - if dimensions < 2: - logger.error("[Error] Fan in and fan out can not be computed for tensor with less than 2 dimensions") - raise ValueError("[Error] Fan in and fan out can not be computed for tensor with less than 2 dimensions") - - if dimensions == 2: # Linear - fan_in = tensor.size(1) - fan_out = tensor.size(0) - else: - num_input_fmaps = tensor.size(1) - num_output_fmaps = tensor.size(0) - receptive_field_size = 1 - if tensor.dim() > 2: - receptive_field_size = tensor[0][0].numel() - fan_in = num_input_fmaps * receptive_field_size - fan_out = num_output_fmaps * receptive_field_size - - return fan_in, fan_out - - def forward(self, input): - # input: a batch of Example object [batch_size, N, seq_len, character_len] - - batch_size, N, seq_len, _ = input.size() - input = input.view(batch_size * N, seq_len, -1) # [batch_size*N, seq_len, character_len] - input_sent_len = ((input.sum(-1)!=0).sum(dim=1)).int() # [batch_size*N, 1] - logger.debug(input_sent_len.view(batch_size, -1)) - enc_embed_input = self.elmo(input)['elmo_representations'][0] # [batch_size*N, L, D] - enc_embed_input = self.embed_proj(enc_embed_input) - - # input_pos = torch.Tensor([np.hstack((np.arange(1, sentlen + 1), np.zeros(self.sent_max_len - sentlen))) for sentlen in input_sent_len]) - - sent_pos_list = [] - for sentlen in input_sent_len: - sent_pos = list(range(1, min(self.sent_max_len, sentlen) + 1)) - for k in range(self.sent_max_len - sentlen): - sent_pos.append(0) - sent_pos_list.append(sent_pos) - input_pos = torch.Tensor(sent_pos_list).long() - - if self._hps.cuda: - input_pos = input_pos.cuda() - enc_pos_embed_input = self.position_embedding(input_pos.long()) # [batch_size*N, D] - enc_conv_input = enc_embed_input + enc_pos_embed_input - enc_conv_input = enc_conv_input.unsqueeze(1) # (batch * N,Ci,L,D) - enc_conv_output = [F.relu(conv(enc_conv_input)).squeeze(3) for conv in self.convs] # kernel_sizes * (batch*N, Co, W) - enc_maxpool_output = [F.max_pool1d(x, x.size(2)).squeeze(2) for x in enc_conv_output] # kernel_sizes * (batch*N, Co) - sent_embedding = torch.cat(enc_maxpool_output, 1) # (batch*N, Co * kernel_sizes) - sent_embedding = sent_embedding.view(batch_size, N, -1) - return sent_embedding - -class ELMoEndoer2(nn.Module): - def __init__(self, hps): - super(ELMoEndoer2, self).__init__() - - self._hps = hps - self._cuda = hps.cuda - self.sent_max_len = hps.sent_max_len - - from allennlp.modules.elmo import Elmo - - elmo_dim = 1024 - options_file = "/remote-home/dqwang/ELMo/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json" - weight_file = "/remote-home/dqwang/ELMo/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5" - - # elmo_dim = 512 - # options_file = "/remote-home/dqwang/ELMo/elmo_2x2048_256_2048cnn_1xhighway_options.json" - # weight_file = 
"/remote-home/dqwang/ELMo/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5" - - embed_size = hps.word_emb_dim - sent_max_len = hps.sent_max_len - - input_channels = 1 - out_channels = hps.output_channel - min_kernel_size = hps.min_kernel_size - max_kernel_size = hps.max_kernel_size - width = embed_size - - # elmo embedding - self.elmo = Elmo(options_file, weight_file, 1, dropout=0) - self.embed_proj = nn.Linear(elmo_dim, embed_size) - - # position embedding - self.position_embedding = nn.Embedding.from_pretrained(get_sinusoid_encoding_table(sent_max_len + 1, embed_size, padding_idx=0), freeze=True) - - # cnn - self.convs = nn.ModuleList([nn.Conv2d(input_channels, out_channels, kernel_size = (height, width)) for height in range(min_kernel_size, max_kernel_size+1)]) - logger.info("[INFO] Initing W for CNN.......") - for conv in self.convs: - init_weight_value = 6.0 - init.xavier_normal_(conv.weight.data, gain=np.sqrt(init_weight_value)) - fan_in, fan_out = Encoder.calculate_fan_in_and_fan_out(conv.weight.data) - std = np.sqrt(init_weight_value) * np.sqrt(2.0 / (fan_in + fan_out)) - - def calculate_fan_in_and_fan_out(tensor): - dimensions = tensor.ndimension() - if dimensions < 2: - logger.error("[Error] Fan in and fan out can not be computed for tensor with less than 2 dimensions") - raise ValueError("[Error] Fan in and fan out can not be computed for tensor with less than 2 dimensions") - - if dimensions == 2: # Linear - fan_in = tensor.size(1) - fan_out = tensor.size(0) - else: - num_input_fmaps = tensor.size(1) - num_output_fmaps = tensor.size(0) - receptive_field_size = 1 - if tensor.dim() > 2: - receptive_field_size = tensor[0][0].numel() - fan_in = num_input_fmaps * receptive_field_size - fan_out = num_output_fmaps * receptive_field_size - - return fan_in, fan_out - - def pad_encoder_input(self, input_list): - """ - :param input_list: N [seq_len, hidden_state] - :return: enc_sent_input_pad: list, N [max_len, hidden_state] - """ - max_len = self.sent_max_len - enc_sent_input_pad = [] - _, hidden_size = input_list[0].size() - for i in range(len(input_list)): - article_words = input_list[i] # [seq_len, hidden_size] - seq_len = article_words.size(0) - if seq_len > max_len: - pad_words = article_words[:max_len, :] - else: - pad_tensor = torch.zeros(max_len - seq_len, hidden_size).cuda() if self._cuda else torch.zeros(max_len - seq_len, hidden_size) - pad_words = torch.cat([article_words, pad_tensor], dim=0) - enc_sent_input_pad.append(pad_words) - return enc_sent_input_pad - - def forward(self, inputs, input_masks, enc_sent_len): - """ - - :param inputs: a batch of Example object [batch_size, doc_len=512, character_len=50] - :param input_masks: 0 or 1, [batch, doc_len=512] - :param enc_sent_len: sentence original length [batch, N] - :return: - sent_embedding: [batch, N, D] - """ - - # Use Bert to get word embedding - batch_size, N = enc_sent_len.size() - input_pad_list = [] - - elmo_output = self.elmo(inputs)['elmo_representations'][0] # [batch_size, 512, D] - elmo_output = elmo_output * input_masks.unsqueeze(-1).float() - # print("END elmo") - - for i in range(batch_size): - sent_len = enc_sent_len[i] # [1, N] - out = elmo_output[i] - - _, hidden_size = out.size() - - # restore the sentence - last_end = 0 - enc_sent_input = [] - for length in sent_len: - if length != 0 and last_end < 512: - enc_sent_input.append(out[last_end : min(512, last_end + length), :]) - last_end += length - else: - pad_tensor = torch.zeros(self.sent_max_len, hidden_size).cuda() if self._hps.cuda else 
torch.zeros(self.sent_max_len, hidden_size) - enc_sent_input.append(pad_tensor) - - # pad the sentence - enc_sent_input_pad = self.pad_encoder_input(enc_sent_input) # [N, seq_len, hidden_state=4096] - input_pad_list.append(torch.stack(enc_sent_input_pad)) # batch * [N, max_len, hidden_state] - - input_pad = torch.stack(input_pad_list) - - input_pad = input_pad.view(batch_size * N, self.sent_max_len, -1) - enc_sent_len = enc_sent_len.view(-1) # [batch_size*N] - enc_embed_input = self.embed_proj(input_pad) # [batch_size * N, L, D] - - # input_pos = torch.Tensor([np.hstack((np.arange(1, sentlen + 1), np.zeros(self.sent_max_len - sentlen))) for sentlen in input_sent_len]) - - sent_pos_list = [] - for sentlen in enc_sent_len: - sent_pos = list(range(1, min(self.sent_max_len, sentlen) + 1)) - for k in range(self.sent_max_len - sentlen): - sent_pos.append(0) - sent_pos_list.append(sent_pos) - input_pos = torch.Tensor(sent_pos_list).long() - - if self._hps.cuda: - input_pos = input_pos.cuda() - enc_pos_embed_input = self.position_embedding(input_pos.long()) # [batch_size*N, D] - enc_conv_input = enc_embed_input + enc_pos_embed_input - enc_conv_input = enc_conv_input.unsqueeze(1) # (batch * N,Ci,L,D) - enc_conv_output = [F.relu(conv(enc_conv_input)).squeeze(3) for conv in self.convs] # kernel_sizes * (batch*N, Co, W) - enc_maxpool_output = [F.max_pool1d(x, x.size(2)).squeeze(2) for x in enc_conv_output] # kernel_sizes * (batch*N, Co) - sent_embedding = torch.cat(enc_maxpool_output, 1) # (batch*N, Co * kernel_sizes) - sent_embedding = sent_embedding.view(batch_size, N, -1) - return sent_embedding \ No newline at end of file diff --git a/reproduction/Summarization/Baseline/train.py b/reproduction/Summarization/Baseline/train.py index b3170307..fa45a6fc 100644 --- a/reproduction/Summarization/Baseline/train.py +++ b/reproduction/Summarization/Baseline/train.py @@ -21,6 +21,7 @@ import os import sys import json +import shutil import argparse import datetime @@ -32,20 +33,25 @@ os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches' sys.path.append('/remote-home/dqwang/FastNLP/fastNLP_brxx/') +from fastNLP.core._logger import logger +# from fastNLP.core._logger import _init_logger from fastNLP.core.const import Const from fastNLP.core.trainer import Trainer, Tester +from fastNLP.io.pipe.summarization import ExtCNNDMPipe from fastNLP.io.model_io import ModelLoader, ModelSaver from fastNLP.io.embed_loader import EmbedLoader -from tools.logger import * -from data.dataloader import SummarizationLoader +# from tools.logger import * # from model.TransformerModel import TransformerModel from model.TForiginal import TransformerModel -from model.Metric import LabelFMetric, FastRougeMetric, PyRougeMetric +from model.LSTMModel import SummarizationModel +from model.Metric import LossMetric, LabelFMetric, FastRougeMetric, PyRougeMetric from model.Loss import MyCrossEntropyLoss from tools.Callback import TrainCallback + + def setup_training(model, train_loader, valid_loader, hps): """Does setup before starting training (run_training)""" @@ -60,32 +66,23 @@ def setup_training(model, train_loader, valid_loader, hps): else: logger.info("[INFO] Create new model for training...") - try: - run_training(model, train_loader, valid_loader, hps) # this is an infinite loop until interrupted - except KeyboardInterrupt: - logger.error("[Error] Caught keyboard interrupt on worker. 
Stopping supervisor...") - save_file = os.path.join(train_dir, "earlystop.pkl") - saver = ModelSaver(save_file) - saver.save_pytorch(model) - logger.info('[INFO] Saving early stop model to %s', save_file) + run_training(model, train_loader, valid_loader, hps) # this is an infinite loop until interrupted def run_training(model, train_loader, valid_loader, hps): - """Repeatedly runs training iterations, logging loss to screen and writing summaries""" logger.info("[INFO] Starting run_training") train_dir = os.path.join(hps.save_root, "train") - if not os.path.exists(train_dir): os.makedirs(train_dir) + if os.path.exists(train_dir): shutil.rmtree(train_dir) + os.makedirs(train_dir) eval_dir = os.path.join(hps.save_root, "eval") # make a subdir of the root dir for eval data if not os.path.exists(eval_dir): os.makedirs(eval_dir) - lr = hps.lr - optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr) + optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=hps.lr) criterion = MyCrossEntropyLoss(pred = "p_sent", target=Const.TARGET, mask=Const.INPUT_LEN, reduce='none') - # criterion = torch.nn.CrossEntropyLoss(reduce="none") trainer = Trainer(model=model, train_data=train_loader, optimizer=optimizer, loss=criterion, - n_epochs=hps.n_epochs, print_every=100, dev_data=valid_loader, metrics=[LabelFMetric(pred="prediction"), FastRougeMetric(hps, pred="prediction")], - metric_key="f", validate_every=-1, save_path=eval_dir, + n_epochs=hps.n_epochs, print_every=100, dev_data=valid_loader, metrics=[LossMetric(pred = "p_sent", target=Const.TARGET, mask=Const.INPUT_LEN, reduce='none'), LabelFMetric(pred="prediction"), FastRougeMetric(hps, pred="prediction")], + metric_key="loss", validate_every=-1, save_path=eval_dir, callbacks=[TrainCallback(hps, patience=5)], use_tqdm=False) train_info = trainer.train(load_best_model=True) @@ -98,8 +95,8 @@ def run_training(model, train_loader, valid_loader, hps): saver.save_pytorch(model) logger.info('[INFO] Saving eval best model to %s', bestmodel_save_path) -def run_test(model, loader, hps, limited=False): - """Repeatedly runs eval iterations, logging to screen and writing summaries. Saves the model with the best loss seen so far.""" + +def run_test(model, loader, hps): test_dir = os.path.join(hps.save_root, "test") # make a subdir of the root dir for eval data eval_dir = os.path.join(hps.save_root, "eval") if not os.path.exists(test_dir) : os.makedirs(test_dir) @@ -113,8 +110,8 @@ def run_test(model, loader, hps, limited=False): train_dir = os.path.join(hps.save_root, "train") bestmodel_load_path = os.path.join(train_dir, 'earlystop.pkl') else: - logger.error("None of such model! Must be one of evalbestmodel/trainbestmodel/earlystop") - raise ValueError("None of such model! Must be one of evalbestmodel/trainbestmodel/earlystop") + logger.error("None of such model! Must be one of evalbestmodel/earlystop") + raise ValueError("None of such model! 
Must be one of evalbestmodel/earlystop") logger.info("[INFO] Restoring %s for testing...The path is %s", hps.test_model, bestmodel_load_path) modelloader = ModelLoader() @@ -174,13 +171,11 @@ def main(): # Training parser.add_argument('--lr', type=float, default=0.0001, help='learning rate') parser.add_argument('--lr_descent', action='store_true', default=False, help='learning rate descent') - parser.add_argument('--warmup_steps', type=int, default=4000, help='warmup_steps') parser.add_argument('--grad_clip', action='store_true', default=False, help='for gradient clipping') parser.add_argument('--max_grad_norm', type=float, default=10, help='for gradient clipping max gradient normalization') # test parser.add_argument('-m', type=int, default=3, help='decode summary length') - parser.add_argument('--limited', action='store_true', default=False, help='limited decode summary length') parser.add_argument('--test_model', type=str, default='evalbestmodel', help='choose different model to test [evalbestmodel/evalbestFmodel/trainbestmodel/trainbestFmodel/earlystop]') parser.add_argument('--use_pyrouge', action='store_true', default=False, help='use_pyrouge') @@ -195,36 +190,42 @@ def main(): VOCAL_FILE = args.vocab_path LOG_PATH = args.log_root - # train_log setting + # # train_log setting if not os.path.exists(LOG_PATH): if args.mode == "train": os.makedirs(LOG_PATH) else: - logger.exception("[Error] Logdir %s doesn't exist. Run in train mode to create it.", LOG_PATH) raise Exception("[Error] Logdir %s doesn't exist. Run in train mode to create it." % (LOG_PATH)) nowTime=datetime.datetime.now().strftime('%Y%m%d_%H%M%S') log_path = os.path.join(LOG_PATH, args.mode + "_" + nowTime) - file_handler = logging.FileHandler(log_path) - file_handler.setFormatter(formatter) - logger.addHandler(file_handler) + # logger = _init_logger(path=log_path) + # file_handler = logging.FileHandler(log_path) + # file_handler.setFormatter(formatter) + # logger.addHandler(file_handler) logger.info("Pytorch %s", torch.__version__) - sum_loader = SummarizationLoader() + # dataset hps = args + dbPipe = ExtCNNDMPipe(vocab_size=hps.vocab_size, + vocab_path=VOCAL_FILE, + sent_max_len=hps.sent_max_len, + doc_max_timesteps=hps.doc_max_timesteps) if hps.mode == 'test': - paths = {"test": DATA_FILE} hps.recurrent_dropout_prob = 0.0 hps.atten_dropout_prob = 0.0 hps.ffn_dropout_prob = 0.0 logger.info(hps) + paths = {"test": DATA_FILE} + db = dbPipe.process_from_file(paths) else: paths = {"train": DATA_FILE, "valid": VALID_FILE} + db = dbPipe.process_from_file(paths) - dataInfo = sum_loader.process(paths=paths, vocab_size=hps.vocab_size, vocab_path=VOCAL_FILE, sent_max_len=hps.sent_max_len, doc_max_timesteps=hps.doc_max_timesteps, load_vocab=os.path.exists(VOCAL_FILE)) + # embedding if args.embedding == "glove": - vocab = dataInfo.vocabs["vocab"] + vocab = db.get_vocab("vocab") embed = torch.nn.Embedding(len(vocab), hps.word_emb_dim) if hps.word_embedding: embed_loader = EmbedLoader() @@ -235,26 +236,31 @@ def main(): logger.error("[ERROR] embedding To Be Continued!") sys.exit(1) + # model if args.sentence_encoder == "transformer" and args.sentence_decoder == "SeqLab": model_param = json.load(open("config/transformer.config", "rb")) hps.__dict__.update(model_param) model = TransformerModel(hps, embed) + elif args.sentence_encoder == "deeplstm" and args.sentence_decoder == "SeqLab": + model_param = json.load(open("config/deeplstm.config", "rb")) + hps.__dict__.update(model_param) + model = SummarizationModel(hps, embed) else: 
logger.error("[ERROR] Model To Be Continued!") sys.exit(1) - - logger.info(hps) - if hps.cuda: model = model.cuda() logger.info("[INFO] Use cuda") + + logger.info(hps) + if hps.mode == 'train': - dataInfo.datasets["valid"].set_target("text", "summary") - setup_training(model, dataInfo.datasets["train"], dataInfo.datasets["valid"], hps) + db.get_dataset("valid").set_target("text", "summary") + setup_training(model, db.get_dataset("train"), db.get_dataset("valid"), hps) elif hps.mode == 'test': logger.info("[INFO] Decoding...") - dataInfo.datasets["test"].set_target("text", "summary") - run_test(model, dataInfo.datasets["test"], hps, limited=hps.limited) + db.get_dataset("test").set_target("text", "summary") + run_test(model, db.get_dataset("test"), hps, limited=hps.limited) else: logger.error("The 'mode' flag must be one of train/eval/test") raise ValueError("The 'mode' flag must be one of train/eval/test") diff --git a/reproduction/Summarization/BertSum/dataloader.py b/reproduction/Summarization/BertSum/dataloader.py index c5201261..6af797e4 100644 --- a/reproduction/Summarization/BertSum/dataloader.py +++ b/reproduction/Summarization/BertSum/dataloader.py @@ -3,7 +3,7 @@ from datetime import timedelta from fastNLP.io.dataset_loader import JsonLoader from fastNLP.modules.encoder._bert import BertTokenizer -from fastNLP.io.base_loader import DataBundle +from fastNLP.io.data_bundle import DataBundle from fastNLP.core.const import Const class BertData(JsonLoader): diff --git a/reproduction/Summarization/README.md b/reproduction/Summarization/README.md index b584269f..1df15d56 100644 --- a/reproduction/Summarization/README.md +++ b/reproduction/Summarization/README.md @@ -18,7 +18,7 @@ FastNLP中实现的模型包括: 这里提供的摘要任务数据集包括: -- CNN/DailyMail +- CNN/DailyMail ([Get To The Point: Summarization with Pointer-Generator Networks](http://arxiv.org/abs/1704.04368)) - Newsroom - The New York Times Annotated Corpus - NYT @@ -110,11 +110,11 @@ $ python -m pyrouge.test LSTM + Sequence Labeling - python train.py --cuda --gpu --sentence_encoder deeplstm --sentence_decoder seqlab --save_root --log_root --lr_descent --grad_clip --max_grad_norm 10 + python train.py --cuda --gpu --sentence_encoder deeplstm --sentence_decoder SeqLab --save_root --log_root --lr_descent --grad_clip --max_grad_norm 10 Transformer + Sequence Labeling - python train.py --cuda --gpu --sentence_encoder transformer --sentence_decoder seqlab --save_root --log_root --lr_descent --grad_clip --max_grad_norm 10 + python train.py --cuda --gpu --sentence_encoder transformer --sentence_decoder SeqLab --save_root --log_root --lr_descent --grad_clip --max_grad_norm 10 diff --git a/reproduction/coreference_resolution/README.md b/reproduction/coreference_resolution/README.md index 7cbcd052..c1a286e5 100644 --- a/reproduction/coreference_resolution/README.md +++ b/reproduction/coreference_resolution/README.md @@ -1,4 +1,4 @@ -# 共指消解复现 +# 指代消解复现 ## 介绍 Coreference resolution是查找文本中指向同一现实实体的所有表达式的任务。 对于涉及自然语言理解的许多更高级别的NLP任务来说, diff --git a/reproduction/coreference_resolution/data_load/cr_loader.py b/reproduction/coreference_resolution/data_load/cr_loader.py deleted file mode 100644 index a424b0d1..00000000 --- a/reproduction/coreference_resolution/data_load/cr_loader.py +++ /dev/null @@ -1,68 +0,0 @@ -from fastNLP.io.dataset_loader import JsonLoader,DataSet,Instance -from fastNLP.io.file_reader import _read_json -from fastNLP.core.vocabulary import Vocabulary -from fastNLP.io.base_loader import DataBundle -from reproduction.coreference_resolution.model.config 
import Config -import reproduction.coreference_resolution.model.preprocess as preprocess - - -class CRLoader(JsonLoader): - def __init__(self, fields=None, dropna=False): - super().__init__(fields, dropna) - - def _load(self, path): - """ - 加载数据 - :param path: - :return: - """ - dataset = DataSet() - for idx, d in _read_json(path, fields=self.fields_list, dropna=self.dropna): - if self.fields: - ins = {self.fields[k]: v for k, v in d.items()} - else: - ins = d - dataset.append(Instance(**ins)) - return dataset - - def process(self, paths, **kwargs): - data_info = DataBundle() - for name in ['train', 'test', 'dev']: - data_info.datasets[name] = self.load(paths[name]) - - config = Config() - vocab = Vocabulary().from_dataset(*data_info.datasets.values(), field_name='sentences') - vocab.build_vocab() - word2id = vocab.word2idx - - char_dict = preprocess.get_char_dict(config.char_path) - data_info.vocabs = vocab - - genres = {g: i for i, g in enumerate(["bc", "bn", "mz", "nw", "pt", "tc", "wb"])} - - for name, ds in data_info.datasets.items(): - ds.apply(lambda x: preprocess.doc2numpy(x['sentences'], word2id, char_dict, max(config.filter), - config.max_sentences, is_train=name=='train')[0], - new_field_name='doc_np') - ds.apply(lambda x: preprocess.doc2numpy(x['sentences'], word2id, char_dict, max(config.filter), - config.max_sentences, is_train=name=='train')[1], - new_field_name='char_index') - ds.apply(lambda x: preprocess.doc2numpy(x['sentences'], word2id, char_dict, max(config.filter), - config.max_sentences, is_train=name=='train')[2], - new_field_name='seq_len') - ds.apply(lambda x: preprocess.speaker2numpy(x["speakers"], config.max_sentences, is_train=name=='train'), - new_field_name='speaker_ids_np') - ds.apply(lambda x: genres[x["doc_key"][:2]], new_field_name='genre') - - ds.set_ignore_type('clusters') - ds.set_padder('clusters', None) - ds.set_input("sentences", "doc_np", "speaker_ids_np", "genre", "char_index", "seq_len") - ds.set_target("clusters") - - # train_dev, test = self.ds.split(348 / (2802 + 343 + 348), shuffle=False) - # train, dev = train_dev.split(343 / (2802 + 343), shuffle=False) - - return data_info - - - diff --git a/reproduction/coreference_resolution/model/model_re.py b/reproduction/coreference_resolution/model/model_re.py index 9dd90ec4..eaa2941b 100644 --- a/reproduction/coreference_resolution/model/model_re.py +++ b/reproduction/coreference_resolution/model/model_re.py @@ -8,6 +8,7 @@ from fastNLP.models.base_model import BaseModel from fastNLP.modules.encoder.variational_rnn import VarLSTM from reproduction.coreference_resolution.model import preprocess from fastNLP.io.embed_loader import EmbedLoader +from fastNLP.core.const import Const import random # 设置seed @@ -415,7 +416,7 @@ class Model(BaseModel): return predicted_clusters - def forward(self, sentences, doc_np, speaker_ids_np, genre, char_index, seq_len): + def forward(self, words1 , words2, words3, words4, chars, seq_len): """ 实际输入都是tensor :param sentences: 句子,被fastNLP转化成了numpy, @@ -426,6 +427,14 @@ class Model(BaseModel): :param seq_len: 被fastNLP转化成了Tensor :return: """ + + sentences = words3 + doc_np = words4 + speaker_ids_np = words2 + genre = words1 + char_index = chars + + # change for fastNLP sentences = sentences[0].tolist() doc_tensor = doc_np[0] diff --git a/reproduction/coreference_resolution/model/softmax_loss.py b/reproduction/coreference_resolution/model/softmax_loss.py index c75a31d6..1c1fcc69 100644 --- a/reproduction/coreference_resolution/model/softmax_loss.py +++ 
b/reproduction/coreference_resolution/model/softmax_loss.py @@ -11,18 +11,18 @@ class SoftmaxLoss(LossBase): 允许多标签分类 """ - def __init__(self, antecedent_scores=None, clusters=None, mention_start_tensor=None, mention_end_tensor=None): + def __init__(self, antecedent_scores=None, target=None, mention_start_tensor=None, mention_end_tensor=None): """ :param pred: :param target: """ super().__init__() - self._init_param_map(antecedent_scores=antecedent_scores, clusters=clusters, + self._init_param_map(antecedent_scores=antecedent_scores, target=target, mention_start_tensor=mention_start_tensor, mention_end_tensor=mention_end_tensor) - def get_loss(self, antecedent_scores, clusters, mention_start_tensor, mention_end_tensor): - antecedent_labels = get_labels(clusters[0], mention_start_tensor, mention_end_tensor, + def get_loss(self, antecedent_scores, target, mention_start_tensor, mention_end_tensor): + antecedent_labels = get_labels(target[0], mention_start_tensor, mention_end_tensor, Config().max_antecedents) antecedent_labels = torch.from_numpy(antecedent_labels*1).to(torch.device("cuda:" + Config().cuda)) diff --git a/reproduction/coreference_resolution/test/test_dataloader.py b/reproduction/coreference_resolution/test/test_dataloader.py deleted file mode 100644 index 0d9dae52..00000000 --- a/reproduction/coreference_resolution/test/test_dataloader.py +++ /dev/null @@ -1,14 +0,0 @@ -import unittest -from ..data_load.cr_loader import CRLoader - -class Test_CRLoader(unittest.TestCase): - def test_cr_loader(self): - train_path = 'data/train.english.jsonlines.mini' - dev_path = 'data/dev.english.jsonlines.minid' - test_path = 'data/test.english.jsonlines' - cr = CRLoader() - data_info = cr.process({'train':train_path,'dev':dev_path,'test':test_path}) - - print(data_info.datasets['train'][0]) - print(data_info.datasets['dev'][0]) - print(data_info.datasets['test'][0]) diff --git a/reproduction/coreference_resolution/train.py b/reproduction/coreference_resolution/train.py index a231a575..d5445cd5 100644 --- a/reproduction/coreference_resolution/train.py +++ b/reproduction/coreference_resolution/train.py @@ -1,5 +1,3 @@ -import sys -sys.path.append('../..') import torch from torch.optim import Adam @@ -7,17 +5,14 @@ from torch.optim import Adam from fastNLP.core.callback import Callback, GradientClipCallback from fastNLP.core.trainer import Trainer -from reproduction.coreference_resolution.data_load.cr_loader import CRLoader +from fastNLP.io.pipe.coreference import CoReferencePipe +from fastNLP.core.const import Const + from reproduction.coreference_resolution.model.config import Config from reproduction.coreference_resolution.model.model_re import Model from reproduction.coreference_resolution.model.softmax_loss import SoftmaxLoss from reproduction.coreference_resolution.model.metric import CRMetric -from fastNLP import SequentialSampler -from fastNLP import cache_results - -# torch.backends.cudnn.benchmark = False -# torch.backends.cudnn.deterministic = True class LRCallback(Callback): def __init__(self, parameters, decay_rate=1e-3): @@ -36,18 +31,13 @@ if __name__ == "__main__": print(config) - @cache_results('cache.pkl') def cache(): - cr_train_dev_test = CRLoader() - - data_info = cr_train_dev_test.process({'train': config.train_path, 'dev': config.dev_path, - 'test': config.test_path}) - return data_info - data_info = cache() - print("数据集划分:\ntrain:", str(len(data_info.datasets["train"])), - "\ndev:" + str(len(data_info.datasets["dev"])) + "\ntest:" + str(len(data_info.datasets["test"]))) - # 
print(data_info) - model = Model(data_info.vocabs, config) + bundle = CoReferencePipe(config).process_from_file({'train': config.train_path, 'dev': config.dev_path, + 'test': config.test_path}) + return bundle + data_bundle = cache() + print(data_bundle) + model = Model(data_bundle.get_vocab(Const.INPUTS(0)), config) print(model) loss = SoftmaxLoss() @@ -58,11 +48,12 @@ if __name__ == "__main__": lr_decay_callback = LRCallback(optim.param_groups, config.lr_decay) - trainer = Trainer(model=model, train_data=data_info.datasets["train"], dev_data=data_info.datasets["dev"], - loss=loss, metrics=metric, check_code_level=-1,sampler=None, - batch_size=1, device=torch.device("cuda:" + config.cuda), metric_key='f', n_epochs=config.epoch, + trainer = Trainer(model=model, train_data=data_bundle.datasets["train"], dev_data=data_bundle.datasets["dev"], + loss=loss, metrics=metric, check_code_level=-1, sampler=None, + batch_size=1, device=torch.device("cuda:" + config.cuda) if torch.cuda.is_available() else None, + metric_key='f', n_epochs=config.epoch, optimizer=optim, - save_path='/remote-home/xxliu/pycharm/fastNLP/fastNLP/reproduction/coreference_resolution/save', + save_path=None, callbacks=[lr_decay_callback, GradientClipCallback(clip_value=5)]) print() diff --git a/reproduction/coreference_resolution/valid.py b/reproduction/coreference_resolution/valid.py index 826332c6..e79642b8 100644 --- a/reproduction/coreference_resolution/valid.py +++ b/reproduction/coreference_resolution/valid.py @@ -1,7 +1,8 @@ import torch from reproduction.coreference_resolution.model.config import Config from reproduction.coreference_resolution.model.metric import CRMetric -from reproduction.coreference_resolution.data_load.cr_loader import CRLoader +from fastNLP.io.pipe.coreference import CoReferencePipe + from fastNLP import Tester import argparse @@ -11,13 +12,12 @@ if __name__=='__main__': parser.add_argument('--path') args = parser.parse_args() - cr_loader = CRLoader() config = Config() - data_info = cr_loader.process({'train': config.train_path, 'dev': config.dev_path, - 'test': config.test_path}) + bundle = CoReferencePipe(Config()).process_from_file( + {'train': config.train_path, 'dev': config.dev_path, 'test': config.test_path}) metirc = CRMetric() model = torch.load(args.path) - tester = Tester(data_info.datasets['test'],model,metirc,batch_size=1,device="cuda:0") + tester = Tester(bundle.get_dataset("test"),model,metirc,batch_size=1,device="cuda:0") tester.test() print('test over') diff --git a/reproduction/joint_cws_parse/data/data_loader.py b/reproduction/joint_cws_parse/data/data_loader.py index 3e6fec4b..4df46b04 100644 --- a/reproduction/joint_cws_parse/data/data_loader.py +++ b/reproduction/joint_cws_parse/data/data_loader.py @@ -1,6 +1,6 @@ -from fastNLP.io.base_loader import DataSetLoader, DataBundle +from fastNLP.io.data_bundle import DataSetLoader, DataBundle from fastNLP.io.data_loader import ConllLoader import numpy as np diff --git a/reproduction/joint_cws_parse/models/CharParser.py b/reproduction/joint_cws_parse/models/CharParser.py index c07c070e..7d89cacb 100644 --- a/reproduction/joint_cws_parse/models/CharParser.py +++ b/reproduction/joint_cws_parse/models/CharParser.py @@ -224,11 +224,11 @@ class CharBiaffineParser(BiaffineParser): batch_size, seq_len, _ = arc_pred.shape flip_mask = (mask == 0) - _arc_pred = arc_pred.clone() - _arc_pred.masked_fill_(flip_mask.unsqueeze(1), -float('inf')) + # _arc_pred = arc_pred.clone() + _arc_pred = arc_pred.masked_fill(flip_mask.unsqueeze(1), 
-float('inf')) - arc_true[:, 0].fill_(-1) - label_true[:, 0].fill_(-1) + arc_true.data[:, 0].fill_(-1) + label_true.data[:, 0].fill_(-1) arc_nll = F.cross_entropy(_arc_pred.view(-1, seq_len), arc_true.view(-1), ignore_index=-1) label_nll = F.cross_entropy(label_pred.view(-1, label_pred.size(-1)), label_true.view(-1), ignore_index=-1) diff --git a/reproduction/joint_cws_parse/train.py b/reproduction/joint_cws_parse/train.py index 0c34614b..ed4b07f0 100644 --- a/reproduction/joint_cws_parse/train.py +++ b/reproduction/joint_cws_parse/train.py @@ -14,6 +14,7 @@ from torch.optim.lr_scheduler import StepLR from fastNLP import Tester from fastNLP import GradientClipCallback, LRScheduler import os +from fastNLP import cache_results def set_random_seed(random_seed=666): import random, numpy, torch @@ -39,43 +40,42 @@ label_mlp_size = 100 batch_size = 32 update_every = 4 n_epochs = 100 -data_folder = '' # 填写在数据所在文件夹, 文件夹下应该有train, dev, test等三个文件 -vector_folder = '' # 预训练的vector,下面应该包含三个文件: 1grams_t3_m50_corpus.txt, 2grams_t3_m50_corpus.txt, 3grams_t3_m50_corpus.txt +data_name = 'new_ctb7' #################################################### +data_folder = f'/remote-home/hyan01/exps/JointCwsPosParser/data/{data_name}/output' # 填写在数据所在文件夹, 文件夹下应该有train, dev, test等三个文件 +vector_folder = '/remote-home/hyan01/exps/CWS/pretrain/vectors' # 预训练的vector,下面应该包含三个文件: 1grams_t3_m50_corpus.txt, 2grams_t3_m50_corpus.txt, 3grams_t3_m50_corpus.txt set_random_seed(1234) device = 0 -# @cache_results('caches/{}.pkl'.format(data_name)) -# def get_data(): -data = CTBxJointLoader().process(data_folder) - -char_labels_vocab = data.vocabs['char_labels'] - -pre_chars_vocab = data.vocabs['pre_chars'] -pre_bigrams_vocab = data.vocabs['pre_bigrams'] -pre_trigrams_vocab = data.vocabs['pre_trigrams'] - -chars_vocab = data.vocabs['chars'] -bigrams_vocab = data.vocabs['bigrams'] -trigrams_vocab = data.vocabs['trigrams'] - -pre_chars_embed = StaticEmbedding(pre_chars_vocab, - model_dir_or_name=os.path.join(vector_folder, '1grams_t3_m50_corpus.txt'), - init_method=uniform_init, normalize=False) -pre_chars_embed.embedding.weight.data = pre_chars_embed.embedding.weight.data/pre_chars_embed.embedding.weight.data.std() -pre_bigrams_embed = StaticEmbedding(pre_bigrams_vocab, - model_dir_or_name=os.path.join(vector_folder, '2grams_t3_m50_corpus.txt'), - init_method=uniform_init, normalize=False) -pre_bigrams_embed.embedding.weight.data = pre_bigrams_embed.embedding.weight.data/pre_bigrams_embed.embedding.weight.data.std() -pre_trigrams_embed = StaticEmbedding(pre_trigrams_vocab, - model_dir_or_name=os.path.join(vector_folder, '3grams_t3_m50_corpus.txt'), - init_method=uniform_init, normalize=False) -pre_trigrams_embed.embedding.weight.data = pre_trigrams_embed.embedding.weight.data/pre_trigrams_embed.embedding.weight.data.std() - - # return chars_vocab, bigrams_vocab, trigrams_vocab, char_labels_vocab, pre_chars_embed, pre_bigrams_embed, pre_trigrams_embed, data - -# chars_vocab, bigrams_vocab, trigrams_vocab, char_labels_vocab, pre_chars_embed, pre_bigrams_embed, pre_trigrams_embed, data = get_data() +@cache_results('caches/{}.pkl'.format(data_name)) +def get_data(): + data = CTBxJointLoader().process(data_folder) + char_labels_vocab = data.vocabs['char_labels'] + + pre_chars_vocab = data.vocabs['pre_chars'] + pre_bigrams_vocab = data.vocabs['pre_bigrams'] + pre_trigrams_vocab = data.vocabs['pre_trigrams'] + + chars_vocab = data.vocabs['chars'] + bigrams_vocab = data.vocabs['bigrams'] + trigrams_vocab = data.vocabs['trigrams'] + 
pre_chars_embed = StaticEmbedding(pre_chars_vocab, + model_dir_or_name=os.path.join(vector_folder, '1grams_t3_m50_corpus.txt'), + init_method=uniform_init, normalize=False) + pre_chars_embed.embedding.weight.data = pre_chars_embed.embedding.weight.data / pre_chars_embed.embedding.weight.data.std() + pre_bigrams_embed = StaticEmbedding(pre_bigrams_vocab, + model_dir_or_name=os.path.join(vector_folder, '2grams_t3_m50_corpus.txt'), + init_method=uniform_init, normalize=False) + pre_bigrams_embed.embedding.weight.data = pre_bigrams_embed.embedding.weight.data / pre_bigrams_embed.embedding.weight.data.std() + pre_trigrams_embed = StaticEmbedding(pre_trigrams_vocab, + model_dir_or_name=os.path.join(vector_folder, '3grams_t3_m50_corpus.txt'), + init_method=uniform_init, normalize=False) + pre_trigrams_embed.embedding.weight.data = pre_trigrams_embed.embedding.weight.data / pre_trigrams_embed.embedding.weight.data.std() + + return chars_vocab, bigrams_vocab, trigrams_vocab, char_labels_vocab, pre_chars_embed, pre_bigrams_embed, pre_trigrams_embed, data + +chars_vocab, bigrams_vocab, trigrams_vocab, char_labels_vocab, pre_chars_embed, pre_bigrams_embed, pre_trigrams_embed, data = get_data() print(data) model = CharParser(char_vocab_size=len(chars_vocab), @@ -104,11 +104,24 @@ optimizer = optim.Adam([param for param in model.parameters() if param.requires_ sampler = BucketSampler(seq_len_field_name='seq_lens') callbacks = [] + +from fastNLP.core.callback import Callback +from torch.optim.lr_scheduler import LambdaLR +class SchedulerCallback(Callback): + def __init__(self, scheduler): + super().__init__() + self.scheduler = scheduler + + def on_backward_end(self): + if self.step % self.update_every==0: + self.scheduler.step() + +scheduler = LambdaLR(optimizer, lr_lambda=lambda step:(0.75)**(step//5000)) # scheduler = LambdaLR(optimizer, lr_lambda=lambda step:(0.75)**(step//5000)) -scheduler = StepLR(optimizer, step_size=18, gamma=0.75) -# optim_callback = OptimizerCallback(optimizer, scheduler, update_every) +# scheduler = StepLR(optimizer, step_size=18, gamma=0.75) +scheduler_callback = SchedulerCallback(scheduler) # callbacks.append(optim_callback) -scheduler_callback = LRScheduler(scheduler) +# scheduler_callback = LRScheduler(scheduler) callbacks.append(scheduler_callback) callbacks.append(GradientClipCallback(clip_type='value', clip_value=5)) @@ -119,6 +132,6 @@ callbacks.append(dev_callback) trainer = Trainer(data.datasets['train'], model, loss=None, metrics=metrics, n_epochs=n_epochs, batch_size=batch_size, print_every=3, validate_every=-1, dev_data=data.datasets['dev'], save_path=None, optimizer=optimizer, - check_code_level=0, metric_key='u_f1', sampler=sampler, prefetch=True, use_tqdm=True, + check_code_level=0, metric_key='u_f1', sampler=sampler, num_workers=2, use_tqdm=True, device=device, callbacks=callbacks, update_every=update_every) trainer.train() \ No newline at end of file diff --git a/reproduction/matching/data/MatchingDataLoader.py b/reproduction/matching/data/MatchingDataLoader.py deleted file mode 100644 index bba26a8a..00000000 --- a/reproduction/matching/data/MatchingDataLoader.py +++ /dev/null @@ -1,435 +0,0 @@ -""" -这个文件的内容已合并到fastNLP.io.data_loader里,这个文件的内容不再更新 -""" - - -import os - -from typing import Union, Dict - -from fastNLP.core.const import Const -from fastNLP.core.vocabulary import Vocabulary -from fastNLP.io.base_loader import DataBundle, DataSetLoader -from fastNLP.io.dataset_loader import JsonLoader, CSVLoader -from fastNLP.io.file_utils import _get_base_url, 
cached_path, PRETRAINED_BERT_MODEL_DIR -from fastNLP.modules.encoder._bert import BertTokenizer - - -class MatchingLoader(DataSetLoader): - """ - 别名::class:`fastNLP.io.MatchingLoader` :class:`fastNLP.io.dataset_loader.MatchingLoader` - - 读取Matching任务的数据集 - - :param dict paths: key是数据集名称(如train、dev、test),value是对应的文件名 - """ - - def __init__(self, paths: dict=None): - self.paths = paths - - def _load(self, path): - """ - :param str path: 待读取数据集的路径名 - :return: fastNLP.DataSet ds: 返回一个DataSet对象,里面必须包含3个field:其中两个分别为两个句子 - 的原始字符串文本,第三个为标签 - """ - raise NotImplementedError - - def process(self, paths: Union[str, Dict[str, str]], dataset_name: str=None, - to_lower=False, seq_len_type: str=None, bert_tokenizer: str=None, - cut_text: int = None, get_index=True, auto_pad_length: int=None, - auto_pad_token: str='', set_input: Union[list, str, bool]=True, - set_target: Union[list, str, bool] = True, concat: Union[str, list, bool]=None, ) -> DataBundle: - """ - :param paths: str或者Dict[str, str]。如果是str,则为数据集所在的文件夹或者是全路径文件名:如果是文件夹, - 则会从self.paths里面找对应的数据集名称与文件名。如果是Dict,则为数据集名称(如train、dev、test)和 - 对应的全路径文件名。 - :param str dataset_name: 如果在paths里传入的是一个数据集的全路径文件名,那么可以用dataset_name来定义 - 这个数据集的名字,如果不定义则默认为train。 - :param bool to_lower: 是否将文本自动转为小写。默认值为False。 - :param str seq_len_type: 提供的seq_len类型,支持 ``seq_len`` :提供一个数字作为句子长度; ``mask`` : - 提供一个0/1的mask矩阵作为句子长度; ``bert`` :提供segment_type_id(第一个句子为0,第二个句子为1)和 - attention mask矩阵(0/1的mask矩阵)。默认值为None,即不提供seq_len - :param str bert_tokenizer: bert tokenizer所使用的词表所在的文件夹路径 - :param int cut_text: 将长于cut_text的内容截掉。默认为None,即不截。 - :param bool get_index: 是否需要根据词表将文本转为index - :param int auto_pad_length: 是否需要将文本自动pad到一定长度(超过这个长度的文本将会被截掉),默认为不会自动pad - :param str auto_pad_token: 自动pad的内容 - :param set_input: 如果为True,则会自动将相关的field(名字里含有Const.INPUT的)设置为input,如果为False - 则不会将任何field设置为input。如果传入str或者List[str],则会根据传入的内容将相对应的field设置为input, - 于此同时其他field不会被设置为input。默认值为True。 - :param set_target: set_target将控制哪些field可以被设置为target,用法与set_input一致。默认值为True。 - :param concat: 是否需要将两个句子拼接起来。如果为False则不会拼接。如果为True则会在两个句子之间插入一个。 - 如果传入一个长度为4的list,则分别表示插在第一句开始前、第一句结束后、第二句开始前、第二句结束后的标识符。如果 - 传入字符串 ``bert`` ,则会采用bert的拼接方式,等价于['[CLS]', '[SEP]', '', '[SEP]']. 
- :return: - """ - if isinstance(set_input, str): - set_input = [set_input] - if isinstance(set_target, str): - set_target = [set_target] - if isinstance(set_input, bool): - auto_set_input = set_input - else: - auto_set_input = False - if isinstance(set_target, bool): - auto_set_target = set_target - else: - auto_set_target = False - if isinstance(paths, str): - if os.path.isdir(paths): - path = {n: os.path.join(paths, self.paths[n]) for n in self.paths.keys()} - else: - path = {dataset_name if dataset_name is not None else 'train': paths} - else: - path = paths - - data_info = DataBundle() - for data_name in path.keys(): - data_info.datasets[data_name] = self._load(path[data_name]) - - for data_name, data_set in data_info.datasets.items(): - if auto_set_input: - data_set.set_input(Const.INPUTS(0), Const.INPUTS(1)) - if auto_set_target: - if Const.TARGET in data_set.get_field_names(): - data_set.set_target(Const.TARGET) - - if to_lower: - for data_name, data_set in data_info.datasets.items(): - data_set.apply(lambda x: [w.lower() for w in x[Const.INPUTS(0)]], new_field_name=Const.INPUTS(0), - is_input=auto_set_input) - data_set.apply(lambda x: [w.lower() for w in x[Const.INPUTS(1)]], new_field_name=Const.INPUTS(1), - is_input=auto_set_input) - - if bert_tokenizer is not None: - if bert_tokenizer.lower() in PRETRAINED_BERT_MODEL_DIR: - PRETRAIN_URL = _get_base_url('bert') - model_name = PRETRAINED_BERT_MODEL_DIR[bert_tokenizer] - model_url = PRETRAIN_URL + model_name - model_dir = cached_path(model_url) - # 检查是否存在 - elif os.path.isdir(bert_tokenizer): - model_dir = bert_tokenizer - else: - raise ValueError(f"Cannot recognize BERT tokenizer from {bert_tokenizer}.") - - words_vocab = Vocabulary(padding='[PAD]', unknown='[UNK]') - with open(os.path.join(model_dir, 'vocab.txt'), 'r') as f: - lines = f.readlines() - lines = [line.strip() for line in lines] - words_vocab.add_word_lst(lines) - words_vocab.build_vocab() - - tokenizer = BertTokenizer.from_pretrained(model_dir) - - for data_name, data_set in data_info.datasets.items(): - for fields in data_set.get_field_names(): - if Const.INPUT in fields: - data_set.apply(lambda x: tokenizer.tokenize(' '.join(x[fields])), new_field_name=fields, - is_input=auto_set_input) - - if isinstance(concat, bool): - concat = 'default' if concat else None - if concat is not None: - if isinstance(concat, str): - CONCAT_MAP = {'bert': ['[CLS]', '[SEP]', '', '[SEP]'], - 'default': ['', '', '', '']} - if concat.lower() in CONCAT_MAP: - concat = CONCAT_MAP[concat] - else: - concat = 4 * [concat] - assert len(concat) == 4, \ - f'Please choose a list with 4 symbols which at the beginning of first sentence ' \ - f'the end of first sentence, the begin of second sentence, and the end of second' \ - f'sentence. 
Your input is {concat}' - - for data_name, data_set in data_info.datasets.items(): - data_set.apply(lambda x: [concat[0]] + x[Const.INPUTS(0)] + [concat[1]] + [concat[2]] + - x[Const.INPUTS(1)] + [concat[3]], new_field_name=Const.INPUT) - data_set.apply(lambda x: [w for w in x[Const.INPUT] if len(w) > 0], new_field_name=Const.INPUT, - is_input=auto_set_input) - - if seq_len_type is not None: - if seq_len_type == 'seq_len': # - for data_name, data_set in data_info.datasets.items(): - for fields in data_set.get_field_names(): - if Const.INPUT in fields: - data_set.apply(lambda x: len(x[fields]), - new_field_name=fields.replace(Const.INPUT, Const.INPUT_LEN), - is_input=auto_set_input) - elif seq_len_type == 'mask': - for data_name, data_set in data_info.datasets.items(): - for fields in data_set.get_field_names(): - if Const.INPUT in fields: - data_set.apply(lambda x: [1] * len(x[fields]), - new_field_name=fields.replace(Const.INPUT, Const.INPUT_LEN), - is_input=auto_set_input) - elif seq_len_type == 'bert': - for data_name, data_set in data_info.datasets.items(): - if Const.INPUT not in data_set.get_field_names(): - raise KeyError(f'Field ``{Const.INPUT}`` not in {data_name} data set: ' - f'got {data_set.get_field_names()}') - data_set.apply(lambda x: [0] * (len(x[Const.INPUTS(0)]) + 2) + [1] * (len(x[Const.INPUTS(1)]) + 1), - new_field_name=Const.INPUT_LENS(0), is_input=auto_set_input) - data_set.apply(lambda x: [1] * len(x[Const.INPUT_LENS(0)]), - new_field_name=Const.INPUT_LENS(1), is_input=auto_set_input) - - if auto_pad_length is not None: - cut_text = min(auto_pad_length, cut_text if cut_text is not None else auto_pad_length) - - if cut_text is not None: - for data_name, data_set in data_info.datasets.items(): - for fields in data_set.get_field_names(): - if (Const.INPUT in fields) or ((Const.INPUT_LEN in fields) and (seq_len_type != 'seq_len')): - data_set.apply(lambda x: x[fields][: cut_text], new_field_name=fields, - is_input=auto_set_input) - - data_set_list = [d for n, d in data_info.datasets.items()] - assert len(data_set_list) > 0, f'There are NO data sets in data info!' 
- - if bert_tokenizer is None: - words_vocab = Vocabulary(padding=auto_pad_token) - words_vocab = words_vocab.from_dataset(*[d for n, d in data_info.datasets.items() if 'train' in n], - field_name=[n for n in data_set_list[0].get_field_names() - if (Const.INPUT in n)], - no_create_entry_dataset=[d for n, d in data_info.datasets.items() - if 'train' not in n]) - target_vocab = Vocabulary(padding=None, unknown=None) - target_vocab = target_vocab.from_dataset(*[d for n, d in data_info.datasets.items() if 'train' in n], - field_name=Const.TARGET) - data_info.vocabs = {Const.INPUT: words_vocab, Const.TARGET: target_vocab} - - if get_index: - for data_name, data_set in data_info.datasets.items(): - for fields in data_set.get_field_names(): - if Const.INPUT in fields: - data_set.apply(lambda x: [words_vocab.to_index(w) for w in x[fields]], new_field_name=fields, - is_input=auto_set_input) - - if Const.TARGET in data_set.get_field_names(): - data_set.apply(lambda x: target_vocab.to_index(x[Const.TARGET]), new_field_name=Const.TARGET, - is_input=auto_set_input, is_target=auto_set_target) - - if auto_pad_length is not None: - if seq_len_type == 'seq_len': - raise RuntimeError(f'the sequence will be padded with the length {auto_pad_length}, ' - f'so the seq_len_type cannot be `{seq_len_type}`!') - for data_name, data_set in data_info.datasets.items(): - for fields in data_set.get_field_names(): - if Const.INPUT in fields: - data_set.apply(lambda x: x[fields] + [words_vocab.to_index(words_vocab.padding)] * - (auto_pad_length - len(x[fields])), new_field_name=fields, - is_input=auto_set_input) - elif (Const.INPUT_LEN in fields) and (seq_len_type != 'seq_len'): - data_set.apply(lambda x: x[fields] + [0] * (auto_pad_length - len(x[fields])), - new_field_name=fields, is_input=auto_set_input) - - for data_name, data_set in data_info.datasets.items(): - if isinstance(set_input, list): - data_set.set_input(*[inputs for inputs in set_input if inputs in data_set.get_field_names()]) - if isinstance(set_target, list): - data_set.set_target(*[target for target in set_target if target in data_set.get_field_names()]) - - return data_info - - -class SNLILoader(MatchingLoader, JsonLoader): - """ - 别名::class:`fastNLP.io.SNLILoader` :class:`fastNLP.io.dataset_loader.SNLILoader` - - 读取SNLI数据集,读取的DataSet包含fields:: - - words1: list(str),第一句文本, premise - words2: list(str), 第二句文本, hypothesis - target: str, 真实标签 - - 数据来源: https://nlp.stanford.edu/projects/snli/snli_1.0.zip - """ - - def __init__(self, paths: dict=None): - fields = { - 'sentence1_binary_parse': Const.INPUTS(0), - 'sentence2_binary_parse': Const.INPUTS(1), - 'gold_label': Const.TARGET, - } - paths = paths if paths is not None else { - 'train': 'snli_1.0_train.jsonl', - 'dev': 'snli_1.0_dev.jsonl', - 'test': 'snli_1.0_test.jsonl'} - MatchingLoader.__init__(self, paths=paths) - JsonLoader.__init__(self, fields=fields) - - def _load(self, path): - ds = JsonLoader._load(self, path) - - parentheses_table = str.maketrans({'(': None, ')': None}) - - ds.apply(lambda ins: ins[Const.INPUTS(0)].translate(parentheses_table).strip().split(), - new_field_name=Const.INPUTS(0)) - ds.apply(lambda ins: ins[Const.INPUTS(1)].translate(parentheses_table).strip().split(), - new_field_name=Const.INPUTS(1)) - ds.drop(lambda x: x[Const.TARGET] == '-') - return ds - - -class RTELoader(MatchingLoader, CSVLoader): - """ - 别名::class:`fastNLP.io.RTELoader` :class:`fastNLP.io.dataset_loader.RTELoader` - - 读取RTE数据集,读取的DataSet包含fields:: - - words1: list(str),第一句文本, premise - words2: 
list(str), 第二句文本, hypothesis - target: str, 真实标签 - - 数据来源: - """ - - def __init__(self, paths: dict=None): - paths = paths if paths is not None else { - 'train': 'train.tsv', - 'dev': 'dev.tsv', - 'test': 'test.tsv' # test set has not label - } - MatchingLoader.__init__(self, paths=paths) - self.fields = { - 'sentence1': Const.INPUTS(0), - 'sentence2': Const.INPUTS(1), - 'label': Const.TARGET, - } - CSVLoader.__init__(self, sep='\t') - - def _load(self, path): - ds = CSVLoader._load(self, path) - - for k, v in self.fields.items(): - if v in ds.get_field_names(): - ds.rename_field(k, v) - for fields in ds.get_all_fields(): - if Const.INPUT in fields: - ds.apply(lambda x: x[fields].strip().split(), new_field_name=fields) - - return ds - - -class QNLILoader(MatchingLoader, CSVLoader): - """ - 别名::class:`fastNLP.io.QNLILoader` :class:`fastNLP.io.dataset_loader.QNLILoader` - - 读取QNLI数据集,读取的DataSet包含fields:: - - words1: list(str),第一句文本, premise - words2: list(str), 第二句文本, hypothesis - target: str, 真实标签 - - 数据来源: - """ - - def __init__(self, paths: dict=None): - paths = paths if paths is not None else { - 'train': 'train.tsv', - 'dev': 'dev.tsv', - 'test': 'test.tsv' # test set has not label - } - MatchingLoader.__init__(self, paths=paths) - self.fields = { - 'question': Const.INPUTS(0), - 'sentence': Const.INPUTS(1), - 'label': Const.TARGET, - } - CSVLoader.__init__(self, sep='\t') - - def _load(self, path): - ds = CSVLoader._load(self, path) - - for k, v in self.fields.items(): - if v in ds.get_field_names(): - ds.rename_field(k, v) - for fields in ds.get_all_fields(): - if Const.INPUT in fields: - ds.apply(lambda x: x[fields].strip().split(), new_field_name=fields) - - return ds - - -class MNLILoader(MatchingLoader, CSVLoader): - """ - 别名::class:`fastNLP.io.MNLILoader` :class:`fastNLP.io.dataset_loader.MNLILoader` - - 读取MNLI数据集,读取的DataSet包含fields:: - - words1: list(str),第一句文本, premise - words2: list(str), 第二句文本, hypothesis - target: str, 真实标签 - - 数据来源: - """ - - def __init__(self, paths: dict=None): - paths = paths if paths is not None else { - 'train': 'train.tsv', - 'dev_matched': 'dev_matched.tsv', - 'dev_mismatched': 'dev_mismatched.tsv', - 'test_matched': 'test_matched.tsv', - 'test_mismatched': 'test_mismatched.tsv', - # 'test_0.9_matched': 'multinli_0.9_test_matched_unlabeled.txt', - # 'test_0.9_mismatched': 'multinli_0.9_test_mismatched_unlabeled.txt', - - # test_0.9_mathed与mismatched是MNLI0.9版本的(数据来源:kaggle) - } - MatchingLoader.__init__(self, paths=paths) - CSVLoader.__init__(self, sep='\t') - self.fields = { - 'sentence1_binary_parse': Const.INPUTS(0), - 'sentence2_binary_parse': Const.INPUTS(1), - 'gold_label': Const.TARGET, - } - - def _load(self, path): - ds = CSVLoader._load(self, path) - - for k, v in self.fields.items(): - if k in ds.get_field_names(): - ds.rename_field(k, v) - - if Const.TARGET in ds.get_field_names(): - if ds[0][Const.TARGET] == 'hidden': - ds.delete_field(Const.TARGET) - - parentheses_table = str.maketrans({'(': None, ')': None}) - - ds.apply(lambda ins: ins[Const.INPUTS(0)].translate(parentheses_table).strip().split(), - new_field_name=Const.INPUTS(0)) - ds.apply(lambda ins: ins[Const.INPUTS(1)].translate(parentheses_table).strip().split(), - new_field_name=Const.INPUTS(1)) - if Const.TARGET in ds.get_field_names(): - ds.drop(lambda x: x[Const.TARGET] == '-') - return ds - - -class QuoraLoader(MatchingLoader, CSVLoader): - """ - 别名::class:`fastNLP.io.QuoraLoader` :class:`fastNLP.io.dataset_loader.QuoraLoader` - - 读取MNLI数据集,读取的DataSet包含fields:: - - words1: 
list(str),第一句文本, premise - words2: list(str), 第二句文本, hypothesis - target: str, 真实标签 - - 数据来源: - """ - - def __init__(self, paths: dict=None): - paths = paths if paths is not None else { - 'train': 'train.tsv', - 'dev': 'dev.tsv', - 'test': 'test.tsv', - } - MatchingLoader.__init__(self, paths=paths) - CSVLoader.__init__(self, sep='\t', headers=(Const.TARGET, Const.INPUTS(0), Const.INPUTS(1), 'pairID')) - - def _load(self, path): - ds = CSVLoader._load(self, path) - return ds diff --git a/reproduction/matching/matching_bert.py b/reproduction/matching/matching_bert.py index 3ed75fd1..05377dff 100644 --- a/reproduction/matching/matching_bert.py +++ b/reproduction/matching/matching_bert.py @@ -2,26 +2,35 @@ import random import numpy as np import torch -from fastNLP.core import Trainer, Tester, AccuracyMetric, Const, Adam -from fastNLP.io.data_loader import SNLILoader, RTELoader, MNLILoader, QNLILoader, QuoraLoader - -from reproduction.matching.model.bert import BertForNLI +from fastNLP.core import Trainer, Tester, AccuracyMetric, Const +from fastNLP.core.callback import WarmupCallback, EvaluateCallback +from fastNLP.core.optimizer import AdamW +from fastNLP.embeddings import BertEmbedding +from fastNLP.io.pipe.matching import SNLIBertPipe, RTEBertPipe, MNLIBertPipe,\ + QNLIBertPipe, QuoraBertPipe +from fastNLP.models.bert import BertForSentenceMatching # define hyper-parameters class BERTConfig: task = 'snli' + batch_size_per_gpu = 6 n_epochs = 6 lr = 2e-5 - seq_len_type = 'bert' + warm_up_rate = 0.1 seed = 42 + save_path = None # 模型存储的位置,None表示不存储模型。 + train_dataset_name = 'train' dev_dataset_name = 'dev' test_dataset_name = 'test' - save_path = None # 模型存储的位置,None表示不存储模型。 - bert_dir = 'path/to/bert/dir' # 预训练BERT参数文件的文件夹 + + to_lower = True # 忽略大小写 + tokenizer = 'spacy' # 使用spacy进行分词 + + bert_model_dir_or_name = 'bert-base-uncased' arg = BERTConfig() @@ -37,58 +46,52 @@ if n_gpu > 0: # load data set if arg.task == 'snli': - data_info = SNLILoader().process( - paths='path/to/snli/data', to_lower=True, seq_len_type=arg.seq_len_type, - bert_tokenizer=arg.bert_dir, cut_text=512, - get_index=True, concat='bert', - ) + data_bundle = SNLIBertPipe(lower=arg.to_lower, tokenizer=arg.tokenizer).process_from_file() elif arg.task == 'rte': - data_info = RTELoader().process( - paths='path/to/rte/data', to_lower=True, seq_len_type=arg.seq_len_type, - bert_tokenizer=arg.bert_dir, cut_text=512, - get_index=True, concat='bert', - ) + data_bundle = RTEBertPipe(lower=arg.to_lower, tokenizer=arg.tokenizer).process_from_file() elif arg.task == 'qnli': - data_info = QNLILoader().process( - paths='path/to/qnli/data', to_lower=True, seq_len_type=arg.seq_len_type, - bert_tokenizer=arg.bert_dir, cut_text=512, - get_index=True, concat='bert', - ) + data_bundle = QNLIBertPipe(lower=arg.to_lower, tokenizer=arg.tokenizer).process_from_file() elif arg.task == 'mnli': - data_info = MNLILoader().process( - paths='path/to/mnli/data', to_lower=True, seq_len_type=arg.seq_len_type, - bert_tokenizer=arg.bert_dir, cut_text=512, - get_index=True, concat='bert', - ) + data_bundle = MNLIBertPipe(lower=arg.to_lower, tokenizer=arg.tokenizer).process_from_file() elif arg.task == 'quora': - data_info = QuoraLoader().process( - paths='path/to/quora/data', to_lower=True, seq_len_type=arg.seq_len_type, - bert_tokenizer=arg.bert_dir, cut_text=512, - get_index=True, concat='bert', - ) + data_bundle = QuoraBertPipe(lower=arg.to_lower, tokenizer=arg.tokenizer).process_from_file() else: raise RuntimeError(f'NOT support {arg.task} task yet!') 
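For readers tracking the API change above (the deleted `MatchingLoader`/`SNLILoader.process` calls are replaced by Pipe classes), here is a minimal standalone sketch of the new loading flow. It relies only on names that appear in this diff (`SNLIBertPipe`, `process_from_file`, `get_dataset`, the data_bundle vocabs); how `process_from_file` locates or downloads the raw SNLI files may vary across fastNLP versions, so treat this as a sketch rather than canonical usage.

```python
# Sketch: load SNLI with the new Pipe API (names as used in matching_bert.py above).
from fastNLP.core import Const
from fastNLP.io.pipe.matching import SNLIBertPipe

data_bundle = SNLIBertPipe(lower=True, tokenizer='spacy').process_from_file()
print(data_bundle)                                  # lists the datasets and vocabs it holds

train_ds = data_bundle.get_dataset('train')         # a fastNLP DataSet, ready for the Trainer
num_labels = len(data_bundle.vocabs[Const.TARGET])  # 3 classes for SNLI
```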
+print(data_bundle) # print details in data_bundle + +# load embedding +embed = BertEmbedding(data_bundle.vocabs[Const.INPUT], model_dir_or_name=arg.bert_model_dir_or_name) + # define model -model = BertForNLI(class_num=len(data_info.vocabs[Const.TARGET]), bert_dir=arg.bert_dir) +model = BertForSentenceMatching(embed, num_labels=len(data_bundle.vocabs[Const.TARGET])) + +# define optimizer and callback +optimizer = AdamW(lr=arg.lr, params=model.parameters()) +callbacks = [WarmupCallback(warmup=arg.warm_up_rate, schedule='linear'), ] + +if arg.task in ['snli']: + callbacks.append(EvaluateCallback(data=data_bundle.datasets[arg.test_dataset_name])) + # evaluate test set in every epoch if task is snli. # define trainer -trainer = Trainer(train_data=data_info.datasets[arg.train_dataset_name], model=model, - optimizer=Adam(lr=arg.lr, model_params=model.parameters()), +trainer = Trainer(train_data=data_bundle.get_dataset(arg.train_dataset_name), model=model, + optimizer=optimizer, batch_size=torch.cuda.device_count() * arg.batch_size_per_gpu, n_epochs=arg.n_epochs, print_every=-1, - dev_data=data_info.datasets[arg.dev_dataset_name], + dev_data=data_bundle.get_dataset(arg.dev_dataset_name), metrics=AccuracyMetric(), metric_key='acc', device=[i for i in range(torch.cuda.device_count())], check_code_level=-1, - save_path=arg.save_path) + save_path=arg.save_path, + callbacks=callbacks) # train model trainer.train(load_best_model=True) # define tester tester = Tester( - data=data_info.datasets[arg.test_dataset_name], + data=data_bundle.get_dataset(arg.test_dataset_name), model=model, metrics=AccuracyMetric(), batch_size=torch.cuda.device_count() * arg.batch_size_per_gpu, diff --git a/reproduction/matching/matching_cntn.py b/reproduction/matching/matching_cntn.py index 098f3bc4..9be716ba 100644 --- a/reproduction/matching/matching_cntn.py +++ b/reproduction/matching/matching_cntn.py @@ -1,9 +1,9 @@ import argparse import torch -from fastNLP.core import Trainer, Tester, Adam, AccuracyMetric, Const +from fastNLP.core import Trainer, Tester, Adam, AccuracyMetric, Const, CrossEntropyLoss from fastNLP.embeddings import StaticEmbedding -from fastNLP.io.data_loader import QNLILoader, RTELoader, SNLILoader, MNLILoader +from fastNLP.io.pipe.matching import SNLIPipe, RTEPipe, MNLIPipe, QNLIPipe from reproduction.matching.model.cntn import CNTNModel @@ -13,14 +13,12 @@ argument.add_argument('--embedding', choices=['glove', 'word2vec'], default='glo argument.add_argument('--batch-size-per-gpu', type=int, default=256) argument.add_argument('--n-epochs', type=int, default=200) argument.add_argument('--lr', type=float, default=1e-5) -argument.add_argument('--seq-len-type', choices=['mask', 'seq_len'], default='mask') argument.add_argument('--save-dir', type=str, default=None) argument.add_argument('--cntn-depth', type=int, default=1) argument.add_argument('--cntn-ns', type=int, default=200) argument.add_argument('--cntn-k-top', type=int, default=10) argument.add_argument('--cntn-r', type=int, default=5) argument.add_argument('--dataset', choices=['qnli', 'rte', 'snli', 'mnli'], default='qnli') -argument.add_argument('--max-len', type=int, default=50) arg = argument.parse_args() # dataset dict @@ -45,30 +43,25 @@ else: num_labels = 3 # load data set -if arg.dataset == 'qnli': - data_info = QNLILoader().process( - paths='path/to/qnli/data', to_lower=True, seq_len_type=arg.seq_len_type, bert_tokenizer=None, - get_index=True, concat=False, auto_pad_length=arg.max_len) +if arg.dataset == 'snli': + data_bundle = 
SNLIPipe(lower=True, tokenizer='raw').process_from_file()
 elif arg.dataset == 'rte':
-    data_info = RTELoader().process(
-        paths='path/to/rte/data', to_lower=True, seq_len_type=arg.seq_len_type, bert_tokenizer=None,
-        get_index=True, concat=False, auto_pad_length=arg.max_len)
-elif arg.dataset == 'snli':
-    data_info = SNLILoader().process(
-        paths='path/to/snli/data', to_lower=True, seq_len_type=arg.seq_len_type, bert_tokenizer=None,
-        get_index=True, concat=False, auto_pad_length=arg.max_len)
+    data_bundle = RTEPipe(lower=True, tokenizer='raw').process_from_file()
+elif arg.dataset == 'qnli':
+    data_bundle = QNLIPipe(lower=True, tokenizer='raw').process_from_file()
 elif arg.dataset == 'mnli':
-    data_info = MNLILoader().process(
-        paths='path/to/mnli/data', to_lower=True, seq_len_type=arg.seq_len_type, bert_tokenizer=None,
-        get_index=True, concat=False, auto_pad_length=arg.max_len)
+    data_bundle = MNLIPipe(lower=True, tokenizer='raw').process_from_file()
 else:
-    raise ValueError(f'now we only support [qnli,rte,snli,mnli] dataset for cntn model!')
+    raise RuntimeError(f'NOT support {arg.dataset} dataset yet!')
+
+print(data_bundle)  # print details in data_bundle
 
 # load embedding
 if arg.embedding == 'word2vec':
-    embedding = StaticEmbedding(data_info.vocabs[Const.INPUT], model_dir_or_name='en-word2vec-300', requires_grad=True)
+    embedding = StaticEmbedding(data_bundle.vocabs[Const.INPUTS(0)], model_dir_or_name='en-word2vec-300',
+                                requires_grad=True)
 elif arg.embedding == 'glove':
-    embedding = StaticEmbedding(data_info.vocabs[Const.INPUT], model_dir_or_name='en-glove-840b-300',
+    embedding = StaticEmbedding(data_bundle.vocabs[Const.INPUTS(0)], model_dir_or_name='en-glove-840b-300d',
                                 requires_grad=True)
 else:
     raise ValueError(f'now we only support word2vec or glove embedding for cntn model!')
@@ -79,11 +72,12 @@ model = CNTNModel(embedding, ns=arg.cntn_ns, k_top=arg.cntn_k_top, num_labels=nu
 print(model)
 
 # define trainer
-trainer = Trainer(train_data=data_info.datasets['train'], model=model,
+trainer = Trainer(train_data=data_bundle.datasets['train'], model=model,
                   optimizer=Adam(lr=arg.lr, model_params=model.parameters()),
+                  loss=CrossEntropyLoss(),
                   batch_size=torch.cuda.device_count() * arg.batch_size_per_gpu,
                   n_epochs=arg.n_epochs, print_every=-1,
-                  dev_data=data_info.datasets[dev_dict[arg.dataset]],
+                  dev_data=data_bundle.datasets[dev_dict[arg.dataset]],
                   metrics=AccuracyMetric(), metric_key='acc',
                   device=[i for i in range(torch.cuda.device_count())],
                   check_code_level=-1)
@@ -93,7 +87,7 @@ trainer.train(load_best_model=True)
 
 # define tester
 tester = Tester(
-    data=data_info.datasets[test_dict[arg.dataset]],
+    data=data_bundle.datasets[test_dict[arg.dataset]],
     model=model,
     metrics=AccuracyMetric(),
     batch_size=torch.cuda.device_count() * arg.batch_size_per_gpu,
diff --git a/reproduction/matching/matching_esim.py b/reproduction/matching/matching_esim.py
index 2ff6916a..9d50c0fb 100644
--- a/reproduction/matching/matching_esim.py
+++ b/reproduction/matching/matching_esim.py
@@ -6,10 +6,11 @@ from torch.optim import Adamax
 from torch.optim.lr_scheduler import StepLR
 
 from fastNLP.core import Trainer, Tester, AccuracyMetric, Const
-from fastNLP.core.callback import GradientClipCallback, LRScheduler
-from fastNLP.embeddings.static_embedding import StaticEmbedding
-from fastNLP.embeddings.elmo_embedding import ElmoEmbedding
-from fastNLP.io.data_loader import SNLILoader, RTELoader, MNLILoader, QNLILoader, QuoraLoader
+from fastNLP.core.callback import GradientClipCallback, LRScheduler, EvaluateCallback
+from fastNLP.core.losses import CrossEntropyLoss +from fastNLP.embeddings import StaticEmbedding +from fastNLP.embeddings import ElmoEmbedding +from fastNLP.io.pipe.matching import SNLIPipe, RTEPipe, MNLIPipe, QNLIPipe, QuoraPipe from fastNLP.models.snli import ESIM @@ -17,18 +18,21 @@ from fastNLP.models.snli import ESIM class ESIMConfig: task = 'snli' + embedding = 'glove' + batch_size_per_gpu = 196 n_epochs = 30 lr = 2e-3 - seq_len_type = 'seq_len' - # seq_len表示在process的时候用len(words)来表示长度信息; - # mask表示用0/1掩码矩阵来表示长度信息; seed = 42 + save_path = None # 模型存储的位置,None表示不存储模型。 + train_dataset_name = 'train' dev_dataset_name = 'dev' test_dataset_name = 'test' - save_path = None # 模型存储的位置,None表示不存储模型。 + + to_lower = True # 忽略大小写 + tokenizer = 'spacy' # 使用spacy进行分词 arg = ESIMConfig() @@ -44,43 +48,32 @@ if n_gpu > 0: # load data set if arg.task == 'snli': - data_info = SNLILoader().process( - paths='path/to/snli/data', to_lower=False, seq_len_type=arg.seq_len_type, - get_index=True, concat=False, - ) + data_bundle = SNLIPipe(lower=arg.to_lower, tokenizer=arg.tokenizer).process_from_file() elif arg.task == 'rte': - data_info = RTELoader().process( - paths='path/to/rte/data', to_lower=False, seq_len_type=arg.seq_len_type, - get_index=True, concat=False, - ) + data_bundle = RTEPipe(lower=arg.to_lower, tokenizer=arg.tokenizer).process_from_file() elif arg.task == 'qnli': - data_info = QNLILoader().process( - paths='path/to/qnli/data', to_lower=False, seq_len_type=arg.seq_len_type, - get_index=True, concat=False, - ) + data_bundle = QNLIPipe(lower=arg.to_lower, tokenizer=arg.tokenizer).process_from_file() elif arg.task == 'mnli': - data_info = MNLILoader().process( - paths='path/to/mnli/data', to_lower=False, seq_len_type=arg.seq_len_type, - get_index=True, concat=False, - ) + data_bundle = MNLIPipe(lower=arg.to_lower, tokenizer=arg.tokenizer).process_from_file() elif arg.task == 'quora': - data_info = QuoraLoader().process( - paths='path/to/quora/data', to_lower=False, seq_len_type=arg.seq_len_type, - get_index=True, concat=False, - ) + data_bundle = QuoraPipe(lower=arg.to_lower, tokenizer=arg.tokenizer).process_from_file() else: raise RuntimeError(f'NOT support {arg.task} task yet!') +print(data_bundle) # print details in data_bundle + # load embedding if arg.embedding == 'elmo': - embedding = ElmoEmbedding(data_info.vocabs[Const.INPUT], requires_grad=True) + embedding = ElmoEmbedding(data_bundle.vocabs[Const.INPUTS(0)], model_dir_or_name='en-medium', + requires_grad=True) elif arg.embedding == 'glove': - embedding = StaticEmbedding(data_info.vocabs[Const.INPUT], requires_grad=True, normalize=False) + embedding = StaticEmbedding(data_bundle.vocabs[Const.INPUTS(0)], model_dir_or_name='en-glove-840b-300d', + requires_grad=True, normalize=False) else: raise RuntimeError(f'NOT support {arg.embedding} embedding yet!') # define model -model = ESIM(embedding, num_labels=len(data_info.vocabs[Const.TARGET])) +model = ESIM(embedding, num_labels=len(data_bundle.vocabs[Const.TARGET])) # define optimizer and callback optimizer = Adamax(lr=arg.lr, params=model.parameters()) @@ -91,23 +84,29 @@ callbacks = [ LRScheduler(scheduler), ] +if arg.task in ['snli']: + callbacks.append(EvaluateCallback(data=data_bundle.datasets[arg.test_dataset_name])) + # evaluate test set in every epoch if task is snli. 
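Because the reworked models later in this diff (BertForNLI, CNTNModel, ESIMModel) no longer accept `target` in `forward()` and no longer return `Const.LOSS`, the training scripts now pass an explicit loss to the Trainer. A minimal sketch of that wiring, assuming `data_bundle` and `model` are already built as in the scripts above:

```python
# Sketch: the model returns {Const.OUTPUT: logits} only, so the Trainer computes the loss
# from the 'pred' output and the 'target' field that the matching pipes set on each DataSet.
from fastNLP.core import Trainer, AccuracyMetric
from fastNLP.core.losses import CrossEntropyLoss

trainer = Trainer(train_data=data_bundle.datasets['train'], model=model,
                  loss=CrossEntropyLoss(),            # replaces the loss formerly computed in forward()
                  dev_data=data_bundle.datasets['dev'],
                  metrics=AccuracyMetric(), metric_key='acc')
```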
+ # define trainer -trainer = Trainer(train_data=data_info.datasets[arg.train_dataset_name], model=model, +trainer = Trainer(train_data=data_bundle.datasets[arg.train_dataset_name], model=model, optimizer=optimizer, + loss=CrossEntropyLoss(), batch_size=torch.cuda.device_count() * arg.batch_size_per_gpu, n_epochs=arg.n_epochs, print_every=-1, - dev_data=data_info.datasets[arg.dev_dataset_name], + dev_data=data_bundle.datasets[arg.dev_dataset_name], metrics=AccuracyMetric(), metric_key='acc', device=[i for i in range(torch.cuda.device_count())], check_code_level=-1, - save_path=arg.save_path) + save_path=arg.save_path, + callbacks=callbacks) # train model trainer.train(load_best_model=True) # define tester tester = Tester( - data=data_info.datasets[arg.test_dataset_name], + data=data_bundle.datasets[arg.test_dataset_name], model=model, metrics=AccuracyMetric(), batch_size=torch.cuda.device_count() * arg.batch_size_per_gpu, diff --git a/reproduction/matching/matching_mwan.py b/reproduction/matching/matching_mwan.py index 31af54c5..026ea7b4 100644 --- a/reproduction/matching/matching_mwan.py +++ b/reproduction/matching/matching_mwan.py @@ -6,12 +6,11 @@ from torch.optim import Adadelta from torch.optim.lr_scheduler import StepLR from fastNLP import CrossEntropyLoss -from fastNLP import cache_results from fastNLP.core import Trainer, Tester, AccuracyMetric, Const -from fastNLP.core.callback import LRScheduler, FitlogCallback +from fastNLP.core.callback import LRScheduler, EvaluateCallback from fastNLP.embeddings import StaticEmbedding -from fastNLP.io.data_loader import MNLILoader, QNLILoader, SNLILoader, RTELoader +from fastNLP.io.pipe.matching import SNLIPipe, RTEPipe, MNLIPipe, QNLIPipe, QuoraPipe from reproduction.matching.model.mwan import MwanModel import fitlog @@ -46,47 +45,25 @@ for k in arg.__dict__: # load data set if arg.task == 'snli': - @cache_results(f'snli_mwan.pkl') - def read_snli(): - data_info = SNLILoader().process( - paths='path/to/snli/data', to_lower=True, seq_len_type=None, bert_tokenizer=None, - get_index=True, concat=False, extra_split=['/','%','-'], - ) - return data_info - data_info = read_snli() + data_bundle = SNLIPipe(lower=True, tokenizer='spacy').process_from_file() elif arg.task == 'rte': - @cache_results(f'rte_mwan.pkl') - def read_rte(): - data_info = RTELoader().process( - paths='path/to/rte/data', to_lower=True, seq_len_type=None, bert_tokenizer=None, - get_index=True, concat=False, extra_split=['/','%','-'], - ) - return data_info - data_info = read_rte() + data_bundle = RTEPipe(lower=True, tokenizer='spacy').process_from_file() elif arg.task == 'qnli': - data_info = QNLILoader().process( - paths='path/to/qnli/data', to_lower=True, seq_len_type=None, bert_tokenizer=None, - get_index=True, concat=False , cut_text=512, extra_split=['/','%','-'], - ) + data_bundle = QNLIPipe(lower=True, tokenizer='spacy').process_from_file() elif arg.task == 'mnli': - @cache_results(f'mnli_v0.9_mwan.pkl') - def read_mnli(): - data_info = MNLILoader().process( - paths='path/to/mnli/data', to_lower=True, seq_len_type=None, bert_tokenizer=None, - get_index=True, concat=False, extra_split=['/','%','-'], - ) - return data_info - data_info = read_mnli() + data_bundle = MNLIPipe(lower=True, tokenizer='spacy').process_from_file() +elif arg.task == 'quora': + data_bundle = QuoraPipe(lower=True, tokenizer='spacy').process_from_file() else: raise RuntimeError(f'NOT support {arg.task} task yet!') -print(data_info) -print(len(data_info.vocabs['words'])) +print(data_bundle) 
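+# Note: the pipes register the word vocabulary under Const.INPUTS(0) (the 'words1' field)
+# rather than under Const.INPUT ('words') as the old loaders did, hence the lookups below.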
+print(len(data_bundle.vocabs[Const.INPUTS(0)])) model = MwanModel( - num_class = len(data_info.vocabs[Const.TARGET]), - EmbLayer = StaticEmbedding(data_info.vocabs[Const.INPUT], requires_grad=False, normalize=False), + num_class = len(data_bundle.vocabs[Const.TARGET]), + EmbLayer = StaticEmbedding(data_bundle.vocabs[Const.INPUTS(0)], requires_grad=False, normalize=False), ElmoLayer = None, args_of_imm = { "input_size" : 300 , @@ -105,21 +82,20 @@ callbacks = [ ] if arg.task in ['snli']: - callbacks.append(FitlogCallback(data_info.datasets[arg.testset_name], verbose=1)) + callbacks.append(EvaluateCallback(data=data_bundle.datasets[arg.testset_name])) elif arg.task == 'mnli': - callbacks.append(FitlogCallback({'dev_matched': data_info.datasets['dev_matched'], - 'dev_mismatched': data_info.datasets['dev_mismatched']}, - verbose=1)) + callbacks.append(EvaluateCallback(data={'dev_matched': data_bundle.datasets['dev_matched'], + 'dev_mismatched': data_bundle.datasets['dev_mismatched']},)) trainer = Trainer( - train_data = data_info.datasets['train'], + train_data = data_bundle.datasets['train'], model = model, optimizer = optimizer, num_workers = 0, batch_size = arg.batch_size, n_epochs = arg.n_epochs, print_every = -1, - dev_data = data_info.datasets[arg.devset_name], + dev_data = data_bundle.datasets[arg.devset_name], metrics = AccuracyMetric(pred = "pred" , target = "target"), metric_key = 'acc', device = [i for i in range(torch.cuda.device_count())], @@ -130,7 +106,7 @@ trainer = Trainer( trainer.train(load_best_model=True) tester = Tester( - data=data_info.datasets[arg.testset_name], + data=data_bundle.datasets[arg.testset_name], model=model, metrics=AccuracyMetric(), batch_size=arg.batch_size, diff --git a/reproduction/matching/model/bert.py b/reproduction/matching/model/bert.py index a21f8c36..73a0c533 100644 --- a/reproduction/matching/model/bert.py +++ b/reproduction/matching/model/bert.py @@ -3,39 +3,28 @@ import torch import torch.nn as nn from fastNLP.core.const import Const -from fastNLP.models import BaseModel -from fastNLP.embeddings.bert import BertModel +from fastNLP.models.base_model import BaseModel +from fastNLP.embeddings import BertEmbedding class BertForNLI(BaseModel): - # TODO: still in progress - def __init__(self, class_num=3, bert_dir=None): + def __init__(self, bert_embed: BertEmbedding, class_num=3): super(BertForNLI, self).__init__() - if bert_dir is not None: - self.bert = BertModel.from_pretrained(bert_dir) - else: - self.bert = BertModel() - hidden_size = self.bert.pooler.dense._parameters['bias'].size(-1) - self.classifier = nn.Linear(hidden_size, class_num) - - def forward(self, words, seq_len1, seq_len2, target=None): + self.embed = bert_embed + self.classifier = nn.Linear(self.embed.embedding_dim, class_num) + + def forward(self, words): """ :param torch.Tensor words: [batch_size, seq_len] input_ids - :param torch.Tensor seq_len1: [batch_size, seq_len] token_type_ids - :param torch.Tensor seq_len2: [batch_size, seq_len] attention_mask - :param torch.Tensor target: [batch] :return: """ - _, pooled_output = self.bert(words, seq_len1, seq_len2) - logits = self.classifier(pooled_output) + hidden = self.embed(words) + logits = self.classifier(hidden) - if target is not None: - loss_func = torch.nn.CrossEntropyLoss() - loss = loss_func(logits, target) - return {Const.OUTPUT: logits, Const.LOSS: loss} return {Const.OUTPUT: logits} - def predict(self, words, seq_len1, seq_len2, target=None): - return self.forward(words, seq_len1, seq_len2) + def predict(self, 
words): + logits = self.forward(words)[Const.OUTPUT] + return {Const.OUTPUT: logits.argmax(dim=-1)} diff --git a/reproduction/matching/model/cntn.py b/reproduction/matching/model/cntn.py index a0a104a3..cfa5e5a8 100644 --- a/reproduction/matching/model/cntn.py +++ b/reproduction/matching/model/cntn.py @@ -3,10 +3,8 @@ import torch.nn as nn import torch.nn.functional as F import numpy as np -from torch.nn import CrossEntropyLoss - -from fastNLP.models import BaseModel -from fastNLP.embeddings.embedding import TokenEmbedding +from fastNLP.models.base_model import BaseModel +from fastNLP.embeddings import TokenEmbedding from fastNLP.core.const import Const @@ -83,13 +81,12 @@ class CNTNModel(BaseModel): self.weight_V = nn.Linear(2 * ns, r) self.weight_u = nn.Sequential(nn.Dropout(p=dropout_rate), nn.Linear(r, num_labels)) - def forward(self, words1, words2, seq_len1, seq_len2, target=None): + def forward(self, words1, words2, seq_len1, seq_len2): """ :param words1: [batch, seq_len, emb_size] Question. :param words2: [batch, seq_len, emb_size] Answer. :param seq_len1: [batch] :param seq_len2: [batch] - :param target: [batch] Glod labels. :return: """ in_q = self.embedding(words1) @@ -109,12 +106,7 @@ class CNTNModel(BaseModel): in_a = self.fc_q(in_a.view(in_a.size(0), -1)) score = torch.tanh(self.weight_u(self.weight_M(in_q, in_a) + self.weight_V(torch.cat((in_q, in_a), -1)))) - if target is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(score, target) - return {Const.LOSS: loss, Const.OUTPUT: score} - else: - return {Const.OUTPUT: score} + return {Const.OUTPUT: score} - def predict(self, **kwargs): - return self.forward(**kwargs) + def predict(self, words1, words2, seq_len1, seq_len2): + return self.forward(words1, words2, seq_len1, seq_len2) diff --git a/reproduction/matching/model/esim.py b/reproduction/matching/model/esim.py index 87e5ba65..d704e2f8 100644 --- a/reproduction/matching/model/esim.py +++ b/reproduction/matching/model/esim.py @@ -2,10 +2,8 @@ import torch import torch.nn as nn import torch.nn.functional as F -from torch.nn import CrossEntropyLoss - -from fastNLP.models import BaseModel -from fastNLP.embeddings.embedding import TokenEmbedding +from fastNLP.models.base_model import BaseModel +from fastNLP.embeddings import TokenEmbedding from fastNLP.core.const import Const from fastNLP.core.utils import seq_len_to_mask @@ -42,13 +40,12 @@ class ESIMModel(BaseModel): nn.init.xavier_uniform_(self.classifier[1].weight.data) nn.init.xavier_uniform_(self.classifier[4].weight.data) - def forward(self, words1, words2, seq_len1, seq_len2, target=None): + def forward(self, words1, words2, seq_len1, seq_len2): """ :param words1: [batch, seq_len] :param words2: [batch, seq_len] :param seq_len1: [batch] :param seq_len2: [batch] - :param target: :return: """ mask1 = seq_len_to_mask(seq_len1, words1.size(1)) @@ -82,16 +79,10 @@ class ESIMModel(BaseModel): logits = torch.tanh(self.classifier(out)) # logits = self.classifier(out) - if target is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits, target) - - return {Const.LOSS: loss, Const.OUTPUT: logits} - else: - return {Const.OUTPUT: logits} + return {Const.OUTPUT: logits} - def predict(self, **kwargs): - pred = self.forward(**kwargs)[Const.OUTPUT].argmax(-1) + def predict(self, words1, words2, seq_len1, seq_len2): + pred = self.forward(words1, words2, seq_len1, seq_len2)[Const.OUTPUT].argmax(-1) return {Const.OUTPUT: pred} # input [batch_size, len , hidden] diff --git 
a/reproduction/matching/test/test_snlidataloader.py b/reproduction/matching/test/test_snlidataloader.py deleted file mode 100644 index 60b3ad59..00000000 --- a/reproduction/matching/test/test_snlidataloader.py +++ /dev/null @@ -1,10 +0,0 @@ -import unittest -from ..data import MatchingDataLoader -from fastNLP.core.vocabulary import Vocabulary - - -class TestCWSDataLoader(unittest.TestCase): - def test_case1(self): - snli_loader = MatchingDataLoader() - # TODO: still in progress - diff --git a/reproduction/multi-criteria-cws/README.md b/reproduction/multi-criteria-cws/README.md new file mode 100644 index 00000000..0f4ab8d8 --- /dev/null +++ b/reproduction/multi-criteria-cws/README.md @@ -0,0 +1,61 @@ + + +# Multi-Criteria-CWS + +An implementation of [Multi-Criteria Chinese Word Segmentation with Transformer](http://arxiv.org/abs/1906.12035) with fastNLP. + +## Dataset +### Overview +We use the same datasets listed in paper. +- sighan2005 + - pku + - msr + - as + - cityu +- sighan2008 + - ctb + - ckip + - cityu (combined with data in sighan2005) + - ncc + - sxu + +### Preprocess +First, download OpenCC to convert between Traditional Chinese and Simplified Chinese. +``` shell +pip install opencc-python-reimplemented +``` +Then, set a path to save processed data, and run the shell script to process the data. +```shell +export DATA_DIR=path/to/processed-data +bash make_data.sh path/to/sighan2005 path/to/sighan2008 +``` +It would take a few minutes to finish the process. + +## Model +We use transformer to build the model, as described in paper. + +## Train +Finally, to train the model, run the shell script. +The `train.sh` takes one argument, the GPU-IDs to use, for example: +``` shell +bash train.sh 0,1 +``` +This command use GPUs with ID 0 and 1. + +Note: Please refer to the paper for details of hyper-parameters. And modify the settings in `train.sh` to match your experiment environment. + +Type +``` shell +python main.py --help +``` +to learn all arguments to be specified in training. + +## Performance + +Results on the test sets of eight CWS datasets with multi-criteria learning. + +| Dataset | MSRA | AS | PKU | CTB | CKIP | CITYU | NCC | SXU | Avg. | +| -------------- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- | +| Original paper | 98.05 | 96.44 | 96.41 | 96.99 | 96.51 | 96.91 | 96.04 | 97.61 | 96.87 | +| Ours | 96.92 | 95.71 | 95.65 | 95.96 | 96.00 | 96.09 | 94.61 | 96.64 | 95.95 | + diff --git a/reproduction/multi-criteria-cws/data-prepare.py b/reproduction/multi-criteria-cws/data-prepare.py new file mode 100644 index 00000000..1d6e89b5 --- /dev/null +++ b/reproduction/multi-criteria-cws/data-prepare.py @@ -0,0 +1,262 @@ +import os +import re +import argparse +from opencc import OpenCC + +cc = OpenCC("t2s") + +from utils import make_sure_path_exists, append_tags + +sighan05_root = "" +sighan08_root = "" +data_path = "" + +E_pun = u",.!?[]()<>\"\"''," +C_pun = u",。!?【】()《》“”‘’、" +Table = {ord(f): ord(t) for f, t in zip(C_pun, E_pun)} +Table[12288] = 32 # 全半角空格 + + +def C_trans_to_E(string): + return string.translate(Table) + + +def normalize(ustring): + """全角转半角""" + rstring = "" + for uchar in ustring: + inside_code = ord(uchar) + if inside_code == 12288: # 全角空格直接转换 + inside_code = 32 + elif 65281 <= inside_code <= 65374: # 全角字符(除空格)根据关系转化 + inside_code -= 65248 + + rstring += chr(inside_code) + return rstring + + +def preprocess(text): + rNUM = u"(-|\+)?\d+((\.|·)\d+)?%?" 
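+    # Together with rENG on the next line, rNUM drives the normalization applied below:
+    # digit sequences are rewritten to "0" and Latin-script tokens to "X", which keeps the
+    # character vocabulary small across the different CWS corpora.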
+ rENG = u"[A-Za-z_]+.*" + sent = normalize(C_trans_to_E(text.strip())).split() + new_sent = [] + for word in sent: + word = re.sub(u"\s+", "", word, flags=re.U) + word = re.sub(rNUM, u"0", word, flags=re.U) + word = re.sub(rENG, u"X", word) + new_sent.append(word) + return new_sent + + +def to_sentence_list(text, split_long_sentence=False): + text = preprocess(text) + delimiter = set() + delimiter.update("。!?:;…、,(),;!?、,\"'") + delimiter.add("……") + sent_list = [] + sent = [] + sent_len = 0 + for word in text: + sent.append(word) + sent_len += len(word) + if word in delimiter or (split_long_sentence and sent_len >= 50): + sent_list.append(sent) + sent = [] + sent_len = 0 + + if len(sent) > 0: + sent_list.append(sent) + + return sent_list + + +def is_traditional(dataset): + return dataset in ["as", "cityu", "ckip"] + + +def convert_file( + src, des, need_cc=False, split_long_sentence=False, encode="utf-8-sig" +): + with open(src, encoding=encode) as src, open(des, "w", encoding="utf-8") as des: + for line in src: + for sent in to_sentence_list(line, split_long_sentence): + line = " ".join(sent) + "\n" + if need_cc: + line = cc.convert(line) + des.write(line) + # if len(''.join(sent)) > 200: + # print(' '.join(sent)) + + +def split_train_dev(dataset): + root = data_path + "/" + dataset + "/raw/" + with open(root + "train-all.txt", encoding="UTF-8") as src, open( + root + "train.txt", "w", encoding="UTF-8" + ) as train, open(root + "dev.txt", "w", encoding="UTF-8") as dev: + lines = src.readlines() + idx = int(len(lines) * 0.9) + for line in lines[:idx]: + train.write(line) + for line in lines[idx:]: + dev.write(line) + + +def combine_files(one, two, out): + if os.path.exists(out): + os.remove(out) + with open(one, encoding="utf-8") as one, open(two, encoding="utf-8") as two, open( + out, "a", encoding="utf-8" + ) as out: + for line in one: + out.write(line) + for line in two: + out.write(line) + + +def bmes_tag(input_file, output_file): + with open(input_file, encoding="utf-8") as input_data, open( + output_file, "w", encoding="utf-8" + ) as output_data: + for line in input_data: + word_list = line.strip().split() + for word in word_list: + if len(word) == 1 or ( + len(word) > 2 and word[0] == "<" and word[-1] == ">" + ): + output_data.write(word + "\tS\n") + else: + output_data.write(word[0] + "\tB\n") + for w in word[1 : len(word) - 1]: + output_data.write(w + "\tM\n") + output_data.write(word[len(word) - 1] + "\tE\n") + output_data.write("\n") + + +def make_bmes(dataset="pku"): + path = data_path + "/" + dataset + "/" + make_sure_path_exists(path + "bmes") + bmes_tag(path + "raw/train.txt", path + "bmes/train.txt") + bmes_tag(path + "raw/train-all.txt", path + "bmes/train-all.txt") + bmes_tag(path + "raw/dev.txt", path + "bmes/dev.txt") + bmes_tag(path + "raw/test.txt", path + "bmes/test.txt") + + +def convert_sighan2005_dataset(dataset): + global sighan05_root + root = os.path.join(data_path, dataset) + make_sure_path_exists(root) + make_sure_path_exists(root + "/raw") + file_path = "{}/{}_training.utf8".format(sighan05_root, dataset) + convert_file( + file_path, "{}/raw/train-all.txt".format(root), is_traditional(dataset), True + ) + if dataset == "as": + file_path = "{}/{}_testing_gold.utf8".format(sighan05_root, dataset) + else: + file_path = "{}/{}_test_gold.utf8".format(sighan05_root, dataset) + convert_file( + file_path, "{}/raw/test.txt".format(root), is_traditional(dataset), False + ) + split_train_dev(dataset) + + +def convert_sighan2008_dataset(dataset, utf=16): + global 
sighan08_root + root = os.path.join(data_path, dataset) + make_sure_path_exists(root) + make_sure_path_exists(root + "/raw") + convert_file( + "{}/{}_train_utf{}.seg".format(sighan08_root, dataset, utf), + "{}/raw/train-all.txt".format(root), + is_traditional(dataset), + True, + "utf-{}".format(utf), + ) + convert_file( + "{}/{}_seg_truth&resource/{}_truth_utf{}.seg".format( + sighan08_root, dataset, dataset, utf + ), + "{}/raw/test.txt".format(root), + is_traditional(dataset), + False, + "utf-{}".format(utf), + ) + split_train_dev(dataset) + + +def extract_conll(src, out): + words = [] + with open(src, encoding="utf-8") as src, open(out, "w", encoding="utf-8") as out: + for line in src: + line = line.strip() + if len(line) == 0: + out.write(" ".join(words) + "\n") + words = [] + continue + cells = line.split() + words.append(cells[1]) + + +def make_joint_corpus(datasets, joint): + parts = ["dev", "test", "train", "train-all"] + for part in parts: + old_file = "{}/{}/raw/{}.txt".format(data_path, joint, part) + if os.path.exists(old_file): + os.remove(old_file) + elif not os.path.exists(os.path.dirname(old_file)): + os.makedirs(os.path.dirname(old_file)) + for name in datasets: + append_tags( + os.path.join(data_path, name, "raw"), + os.path.dirname(old_file), + name, + part, + encode="utf-8", + ) + + +def convert_all_sighan2005(datasets): + for dataset in datasets: + print(("Converting sighan bakeoff 2005 corpus: {}".format(dataset))) + convert_sighan2005_dataset(dataset) + make_bmes(dataset) + + +def convert_all_sighan2008(datasets): + for dataset in datasets: + print(("Converting sighan bakeoff 2008 corpus: {}".format(dataset))) + convert_sighan2008_dataset(dataset, 16) + make_bmes(dataset) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # fmt: off + parser.add_argument("--sighan05", required=True, type=str, help="path to sighan2005 dataset") + parser.add_argument("--sighan08", required=True, type=str, help="path to sighan2008 dataset") + parser.add_argument("--data_path", required=True, type=str, help="path to save dataset") + # fmt: on + + args, _ = parser.parse_known_args() + sighan05_root = args.sighan05 + sighan08_root = args.sighan08 + data_path = args.data_path + + print("Converting sighan2005 Simplified Chinese corpus") + datasets = "pku", "msr", "as", "cityu" + convert_all_sighan2005(datasets) + + print("Combining sighan2005 corpus to one joint Simplified Chinese corpus") + datasets = "pku", "msr", "as", "cityu" + make_joint_corpus(datasets, "joint-sighan2005") + make_bmes("joint-sighan2005") + + # For researchers who have access to sighan2008 corpus, use official corpora please. 
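+    # Note: make_joint_corpus() uses append_tags() (from utils, not shown here) to tag each
+    # sentence with its source corpus; downstream, data-process.py and main.py treat the first
+    # token (e.g. "<pku>") as the criterion id (the "task" field).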
+ print("Converting sighan2008 Simplified Chinese corpus") + datasets = "ctb", "ckip", "cityu", "ncc", "sxu" + convert_all_sighan2008(datasets) + print("Combining those 8 sighan corpora to one joint corpus") + datasets = "pku", "msr", "as", "ctb", "ckip", "cityu", "ncc", "sxu" + make_joint_corpus(datasets, "joint-sighan2008") + make_bmes("joint-sighan2008") + diff --git a/reproduction/multi-criteria-cws/data-process.py b/reproduction/multi-criteria-cws/data-process.py new file mode 100644 index 00000000..829580ef --- /dev/null +++ b/reproduction/multi-criteria-cws/data-process.py @@ -0,0 +1,166 @@ +import os +import sys + +import codecs +import argparse +from _pickle import load, dump +import collections +from utils import get_processing_word, is_dataset_tag, make_sure_path_exists, get_bmes +from fastNLP import Instance, DataSet, Vocabulary, Const + +max_len = 0 + + +def expand(x): + sent = [""] + x[1:] + [""] + return [x + y for x, y in zip(sent[:-1], sent[1:])] + + +def read_file(filename, processing_word=get_processing_word(lowercase=False)): + dataset = DataSet() + niter = 0 + with codecs.open(filename, "r", "utf-8-sig") as f: + words, tags = [], [] + for line in f: + line = line.strip() + if len(line) == 0 or line.startswith("-DOCSTART-"): + if len(words) != 0: + assert len(words) > 2 + if niter == 1: + print(words, tags) + niter += 1 + dataset.append(Instance(ori_words=words[:-1], ori_tags=tags[:-1])) + words, tags = [], [] + else: + word, tag = line.split() + word = processing_word(word) + words.append(word) + tags.append(tag.lower()) + + dataset.apply_field(lambda x: [x[0]], field_name="ori_words", new_field_name="task") + dataset.apply_field( + lambda x: len(x), field_name="ori_tags", new_field_name="seq_len" + ) + dataset.apply_field( + lambda x: expand(x), field_name="ori_words", new_field_name="bi1" + ) + return dataset + + +def main(): + parser = argparse.ArgumentParser() + # fmt: off + parser.add_argument("--data_path", required=True, type=str, help="all of datasets pkl paths") + # fmt: on + + options, _ = parser.parse_known_args() + + train_set, test_set = DataSet(), DataSet() + + input_dir = os.path.join(options.data_path, "joint-sighan2008/bmes") + options.output = os.path.join(options.data_path, "total_dataset.pkl") + print(input_dir, options.output) + + for fn in os.listdir(input_dir): + if fn not in ["test.txt", "train-all.txt"]: + continue + print(fn) + abs_fn = os.path.join(input_dir, fn) + ds = read_file(abs_fn) + if "test.txt" == fn: + test_set = ds + else: + train_set = ds + + print( + "num samples of total train, test: {}, {}".format(len(train_set), len(test_set)) + ) + + uni_vocab = Vocabulary(min_freq=None).from_dataset( + train_set, test_set, field_name="ori_words" + ) + # bi_vocab = Vocabulary(min_freq=3, max_size=50000).from_dataset(train_set,test_set, field_name="bi1") + bi_vocab = Vocabulary(min_freq=3, max_size=None).from_dataset( + train_set, field_name="bi1", no_create_entry_dataset=[test_set] + ) + tag_vocab = Vocabulary(min_freq=None, padding="s", unknown=None).from_dataset( + train_set, field_name="ori_tags" + ) + task_vocab = Vocabulary(min_freq=None, padding=None, unknown=None).from_dataset( + train_set, field_name="task" + ) + + def to_index(dataset): + uni_vocab.index_dataset(dataset, field_name="ori_words", new_field_name="uni") + tag_vocab.index_dataset(dataset, field_name="ori_tags", new_field_name="tags") + task_vocab.index_dataset(dataset, field_name="task", new_field_name="task") + + dataset.apply_field(lambda x: x[1:], field_name="bi1", 
new_field_name="bi2") + dataset.apply_field(lambda x: x[:-1], field_name="bi1", new_field_name="bi1") + bi_vocab.index_dataset(dataset, field_name="bi1", new_field_name="bi1") + bi_vocab.index_dataset(dataset, field_name="bi2", new_field_name="bi2") + + dataset.set_input("task", "uni", "bi1", "bi2", "seq_len") + dataset.set_target("tags") + return dataset + + train_set = to_index(train_set) + test_set = to_index(test_set) + + output = {} + output["train_set"] = train_set + output["test_set"] = test_set + output["uni_vocab"] = uni_vocab + output["bi_vocab"] = bi_vocab + output["tag_vocab"] = tag_vocab + output["task_vocab"] = task_vocab + + print(tag_vocab.word2idx) + print(task_vocab.word2idx) + + make_sure_path_exists(os.path.dirname(options.output)) + + print("Saving dataset to {}".format(os.path.abspath(options.output))) + with open(options.output, "wb") as outfile: + dump(output, outfile) + + print(len(task_vocab), len(tag_vocab), len(uni_vocab), len(bi_vocab)) + dic = {} + tokens = {} + + def process(words): + name = words[0][1:-1] + if name not in dic: + dic[name] = set() + tokens[name] = 0 + tokens[name] += len(words[1:]) + dic[name].update(words[1:]) + + train_set.apply_field(process, "ori_words", None) + for name in dic.keys(): + print(name, len(dic[name]), tokens[name]) + + with open(os.path.join(os.path.dirname(options.output), "oovdict.pkl"), "wb") as f: + dump(dic, f) + + def get_max_len(ds): + global max_len + max_len = 0 + + def find_max_len(words): + global max_len + if max_len < len(words): + max_len = len(words) + + ds.apply_field(find_max_len, "ori_words", None) + return max_len + + print( + "train max len: {}, test max len: {}".format( + get_max_len(train_set), get_max_len(test_set) + ) + ) + + +if __name__ == "__main__": + main() diff --git a/reproduction/multi-criteria-cws/main.py b/reproduction/multi-criteria-cws/main.py new file mode 100644 index 00000000..049a1974 --- /dev/null +++ b/reproduction/multi-criteria-cws/main.py @@ -0,0 +1,506 @@ +import _pickle as pickle +import argparse +import collections +import logging +import math +import os +import pickle +import random +import sys +import time +from sys import maxsize + +import fastNLP +import fastNLP.embeddings +import numpy as np +import torch +import torch.distributed as dist +import torch.nn as nn +from fastNLP import BucketSampler, DataSetIter, SequentialSampler, logger +from torch.nn.parallel import DistributedDataParallel +from torch.utils.data.distributed import DistributedSampler + +import models +import optm +import utils + +NONE_TAG = "" +START_TAG = "" +END_TAG = "" + +DEFAULT_WORD_EMBEDDING_SIZE = 100 +DEBUG_SCALE = 200 + +# ===-----------------------------------------------------------------------=== +# Argument parsing +# ===-----------------------------------------------------------------------=== +# fmt: off +parser = argparse.ArgumentParser() +parser.add_argument("--dataset", required=True, dest="dataset", help="processed data dir") +parser.add_argument("--word-embeddings", dest="word_embeddings", help="File from which to read in pretrained embeds") +parser.add_argument("--bigram-embeddings", dest="bigram_embeddings", help="File from which to read in pretrained embeds") +parser.add_argument("--crf", dest="crf", action="store_true", help="crf") +# parser.add_argument("--devi", default="0", dest="devi", help="gpu") +parser.add_argument("--step", default=0, dest="step", type=int,help="step") +parser.add_argument("--num-epochs", default=100, dest="num_epochs", type=int, + help="Number of full passes 
through training set") +parser.add_argument("--batch-size", default=128, dest="batch_size", type=int, + help="Minibatch size of training set") +parser.add_argument("--d_model", default=256, dest="d_model", type=int, help="d_model") +parser.add_argument("--d_ff", default=1024, dest="d_ff", type=int, help="d_ff") +parser.add_argument("--N", default=6, dest="N", type=int, help="N") +parser.add_argument("--h", default=4, dest="h", type=int, help="h") +parser.add_argument("--factor", default=2, dest="factor", type=float, help="Initial learning rate") +parser.add_argument("--dropout", default=0.2, dest="dropout", type=float, + help="Amount of dropout(not keep rate, but drop rate) to apply to embeddings part of graph") +parser.add_argument("--log-dir", default="result", dest="log_dir", + help="Directory where to write logs / serialized models") +parser.add_argument("--task-name", default=time.strftime("%Y-%m-%d-%H-%M-%S"), dest="task_name", + help="Name for this task, use a comprehensive one") +parser.add_argument("--no-model", dest="no_model", action="store_true", help="Don't serialize model") +parser.add_argument("--always-model", dest="always_model", action="store_true", + help="Always serialize model after every epoch") +parser.add_argument("--old-model", dest="old_model", help="Path to old model for incremental training") +parser.add_argument("--skip-dev", dest="skip_dev", action="store_true", help="Skip dev set, would save some time") +parser.add_argument("--freeze", dest="freeze", action="store_true", help="freeze pretrained embedding") +parser.add_argument("--only-task", dest="only_task", action="store_true", help="only train task embedding") +parser.add_argument("--subset", dest="subset", help="Only train and test on a subset of the whole dataset") +parser.add_argument("--seclude", dest="seclude", help="train and test except a subset") +parser.add_argument("--instances", default=None, dest="instances", type=int,help="num of instances of subset") + +parser.add_argument("--seed", dest="python_seed", type=int, default=random.randrange(maxsize), + help="Random seed of Python and NumPy") +parser.add_argument("--debug", dest="debug", default=False, action="store_true", help="Debug mode") +parser.add_argument("--test", dest="test", action="store_true", help="Test mode") +parser.add_argument('--local_rank', type=int, default=None) +parser.add_argument('--init_method', type=str, default='env://') +# fmt: on + +options, _ = parser.parse_known_args() +print("unknown args", _) +task_name = options.task_name +root_dir = "{}/{}".format(options.log_dir, task_name) +utils.make_sure_path_exists(root_dir) + +if options.local_rank is not None: + torch.cuda.set_device(options.local_rank) + dist.init_process_group("nccl", init_method=options.init_method) + + +def init_logger(): + if not os.path.exists(root_dir): + os.mkdir(root_dir) + log_formatter = logging.Formatter("%(asctime)s - %(message)s") + logger = logging.getLogger() + file_handler = logging.FileHandler("{0}/info.log".format(root_dir), mode="w") + file_handler.setFormatter(log_formatter) + logger.addHandler(file_handler) + console_handler = logging.StreamHandler() + console_handler.setFormatter(log_formatter) + logger.addHandler(console_handler) + if options.local_rank is None or options.local_rank == 0: + logger.setLevel(logging.INFO) + else: + logger.setLevel(logging.WARNING) + return logger + + +# ===-----------------------------------------------------------------------=== +# Set up logging +# 
===-----------------------------------------------------------------------=== +# logger = init_logger() +logger.add_file("{}/info.log".format(root_dir), "INFO") +logger.setLevel(logging.INFO if dist.get_rank() == 0 else logging.WARNING) + +# ===-----------------------------------------------------------------------=== +# Log some stuff about this run +# ===-----------------------------------------------------------------------=== +logger.info(" ".join(sys.argv)) +logger.info("") +logger.info(options) + +if options.debug: + logger.info("DEBUG MODE") + options.num_epochs = 2 + options.batch_size = 20 + +random.seed(options.python_seed) +np.random.seed(options.python_seed % (2 ** 32 - 1)) +torch.cuda.manual_seed_all(options.python_seed) +logger.info("Python random seed: {}".format(options.python_seed)) + +# ===-----------------------------------------------------------------------=== +# Read in dataset +# ===-----------------------------------------------------------------------=== +dataset = pickle.load(open(options.dataset + "/total_dataset.pkl", "rb")) +train_set = dataset["train_set"] +test_set = dataset["test_set"] +uni_vocab = dataset["uni_vocab"] +bi_vocab = dataset["bi_vocab"] +task_vocab = dataset["task_vocab"] +tag_vocab = dataset["tag_vocab"] +for v in (bi_vocab, uni_vocab, tag_vocab, task_vocab): + if hasattr(v, "_word2idx"): + v.word2idx = v._word2idx +for ds in (train_set, test_set): + ds.rename_field("ori_words", "words") + +logger.info("{} {}".format(bi_vocab.to_word(0), tag_vocab.word2idx)) +logger.info(task_vocab.word2idx) +if options.skip_dev: + dev_set = test_set +else: + train_set, dev_set = train_set.split(0.1) + +logger.info("{} {} {}".format(len(train_set), len(dev_set), len(test_set))) + +if options.debug: + train_set = train_set[0:DEBUG_SCALE] + dev_set = dev_set[0:DEBUG_SCALE] + test_set = test_set[0:DEBUG_SCALE] + +# ===-----------------------------------------------------------------------=== +# Build model and trainer +# ===-----------------------------------------------------------------------=== + +# =============================== +if dist.get_rank() != 0: + dist.barrier() + +if options.word_embeddings is None: + init_embedding = None +else: + # logger.info("Load: {}".format(options.word_embeddings)) + # init_embedding = utils.embedding_load_with_cache(options.word_embeddings, options.cache_dir, uni_vocab, normalize=False) + init_embedding = fastNLP.embeddings.StaticEmbedding( + uni_vocab, options.word_embeddings, word_drop=0.01 + ) + +bigram_embedding = None +if options.bigram_embeddings: + # logger.info("Load: {}".format(options.bigram_embeddings)) + # bigram_embedding = utils.embedding_load_with_cache(options.bigram_embeddings, options.cache_dir, bi_vocab, normalize=False) + bigram_embedding = fastNLP.embeddings.StaticEmbedding( + bi_vocab, options.bigram_embeddings + ) + +if dist.get_rank() == 0: + dist.barrier() +# =============================== + +# select subset training +if options.seclude is not None: + setname = "<{}>".format(options.seclude) + logger.info("seclude {}".format(setname)) + train_set.drop(lambda x: x["words"][0] == setname, inplace=True) + test_set.drop(lambda x: x["words"][0] == setname, inplace=True) + dev_set.drop(lambda x: x["words"][0] == setname, inplace=True) + +if options.subset is not None: + setname = "<{}>".format(options.subset) + logger.info("select {}".format(setname)) + train_set.drop(lambda x: x["words"][0] != setname, inplace=True) + test_set.drop(lambda x: x["words"][0] != setname, inplace=True) + dev_set.drop(lambda 
x: x["words"][0] != setname, inplace=True) + +# build model and optimizer +i2t = None +if options.crf: + # i2t=utils.to_id_list(tag_vocab.word2idx) + i2t = {} + for x, y in tag_vocab.word2idx.items(): + i2t[y] = x + logger.info(i2t) + +freeze = True if options.freeze else False +model = models.make_CWS( + d_model=options.d_model, + N=options.N, + h=options.h, + d_ff=options.d_ff, + dropout=options.dropout, + word_embedding=init_embedding, + bigram_embedding=bigram_embedding, + tag_size=len(tag_vocab), + task_size=len(task_vocab), + crf=i2t, + freeze=freeze, +) + +device = "cpu" + +if torch.cuda.device_count() > 0: + if options.local_rank is not None: + device = "cuda:{}".format(options.local_rank) + # model=nn.DataParallel(model) + model = model.to(device) + model = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[options.local_rank], output_device=options.local_rank + ) + else: + device = "cuda:0" + model.to(device) + + +if options.only_task and options.old_model is not None: + logger.info("fix para except task embedding") + for name, para in model.named_parameters(): + if name.find("task_embed") == -1: + para.requires_grad = False + else: + para.requires_grad = True + logger.info(name) + +optimizer = optm.NoamOpt( + options.d_model, + options.factor, + 4000, + torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9), +) + +optimizer._step = options.step + +best_model_file_name = "{}/model.bin".format(root_dir) + +if options.local_rank is None: + train_sampler = BucketSampler( + batch_size=options.batch_size, seq_len_field_name="seq_len" + ) +else: + train_sampler = DistributedSampler( + train_set, dist.get_world_size(), dist.get_rank() + ) +dev_sampler = SequentialSampler() + +i2t = utils.to_id_list(tag_vocab.word2idx) +i2task = utils.to_id_list(task_vocab.word2idx) +dev_set.set_input("words") +test_set.set_input("words") +test_batch = DataSetIter(test_set, options.batch_size, num_workers=2) + +word_dic = pickle.load(open(options.dataset + "/oovdict.pkl", "rb")) + + +def batch_to_device(batch, device): + for k, v in batch.items(): + if torch.is_tensor(v): + batch[k] = v.to(device) + return batch + + +def tester(model, test_batch, write_out=False): + res = [] + prf = utils.CWSEvaluator(i2t) + prf_dataset = {} + oov_dataset = {} + + logger.info("start evaluation") + # import ipdb; ipdb.set_trace() + with torch.no_grad(): + for batch_x, batch_y in test_batch: + batch_to_device(batch_x, device) + # batch_to_device(batch_y, device) + if bigram_embedding is not None: + out = model( + batch_x["task"], + batch_x["uni"], + batch_x["seq_len"], + batch_x["bi1"], + batch_x["bi2"], + ) + else: + out = model(batch_x["task"], batch_x["uni"], batch_x["seq_len"]) + out = out["pred"] + # print(out) + num = out.size(0) + out = out.detach().cpu().numpy() + for i in range(num): + length = int(batch_x["seq_len"][i]) + + out_tags = out[i, 1:length].tolist() + sentence = batch_x["words"][i] + gold_tags = batch_y["tags"][i][1:length].numpy().tolist() + dataset_name = sentence[0] + sentence = sentence[1:] + # print(out_tags,gold_tags) + assert utils.is_dataset_tag(dataset_name), dataset_name + assert len(gold_tags) == len(out_tags) and len(gold_tags) == len( + sentence + ) + + if dataset_name not in prf_dataset: + prf_dataset[dataset_name] = utils.CWSEvaluator(i2t) + oov_dataset[dataset_name] = utils.CWS_OOV( + word_dic[dataset_name[1:-1]] + ) + + prf_dataset[dataset_name].add_instance(gold_tags, out_tags) + prf.add_instance(gold_tags, out_tags) + + if write_out: + gold_strings = 
utils.to_tag_strings(i2t, gold_tags) + obs_strings = utils.to_tag_strings(i2t, out_tags) + + word_list = utils.bmes_to_words(sentence, obs_strings) + oov_dataset[dataset_name].update( + utils.bmes_to_words(sentence, gold_strings), word_list + ) + + raw_string = " ".join(word_list) + res.append(dataset_name + " " + raw_string + " " + dataset_name) + + Ap = 0.0 + Ar = 0.0 + Af = 0.0 + Aoov = 0.0 + tot = 0 + nw = 0.0 + for dataset_name, performance in sorted(prf_dataset.items()): + p = performance.result() + if write_out: + nw = oov_dataset[dataset_name].oov() + # nw = 0 + logger.info( + "{}\t{:04.2f}\t{:04.2f}\t{:04.2f}\t{:04.2f}".format( + dataset_name, p[0], p[1], p[2], nw + ) + ) + else: + logger.info( + "{}\t{:04.2f}\t{:04.2f}\t{:04.2f}".format( + dataset_name, p[0], p[1], p[2] + ) + ) + Ap += p[0] + Ar += p[1] + Af += p[2] + Aoov += nw + tot += 1 + + prf = prf.result() + logger.info( + "{}\t{:04.2f}\t{:04.2f}\t{:04.2f}".format("TOT", prf[0], prf[1], prf[2]) + ) + if not write_out: + logger.info( + "{}\t{:04.2f}\t{:04.2f}\t{:04.2f}".format( + "AVG", Ap / tot, Ar / tot, Af / tot + ) + ) + else: + logger.info( + "{}\t{:04.2f}\t{:04.2f}\t{:04.2f}\t{:04.2f}".format( + "AVG", Ap / tot, Ar / tot, Af / tot, Aoov / tot + ) + ) + return prf[-1], res + + +# start training +if not options.test: + if options.old_model: + # incremental training + logger.info("Incremental training from old model: {}".format(options.old_model)) + model.load_state_dict(torch.load(options.old_model, map_location="cuda:0")) + + logger.info("Number training instances: {}".format(len(train_set))) + logger.info("Number dev instances: {}".format(len(dev_set))) + + train_batch = DataSetIter( + batch_size=options.batch_size, + dataset=train_set, + sampler=train_sampler, + num_workers=4, + ) + dev_batch = DataSetIter( + batch_size=options.batch_size, + dataset=dev_set, + sampler=dev_sampler, + num_workers=4, + ) + + best_f1 = 0.0 + for epoch in range(int(options.num_epochs)): + logger.info("Epoch {} out of {}".format(epoch + 1, options.num_epochs)) + train_loss = 0.0 + model.train() + tot = 0 + t1 = time.time() + for batch_x, batch_y in train_batch: + model.zero_grad() + if bigram_embedding is not None: + out = model( + batch_x["task"], + batch_x["uni"], + batch_x["seq_len"], + batch_x["bi1"], + batch_x["bi2"], + batch_y["tags"], + ) + else: + out = model( + batch_x["task"], batch_x["uni"], batch_x["seq_len"], batch_y["tags"] + ) + loss = out["loss"] + train_loss += loss.item() + tot += 1 + loss.backward() + # nn.utils.clip_grad_value_(model.parameters(), 1) + optimizer.step() + + t2 = time.time() + train_loss = train_loss / tot + logger.info( + "time: {} loss: {} step: {}".format(t2 - t1, train_loss, optimizer._step) + ) + # Evaluate dev data + if options.skip_dev and dist.get_rank() == 0: + logger.info("Saving model to {}".format(best_model_file_name)) + torch.save(model.module.state_dict(), best_model_file_name) + continue + + model.eval() + if dist.get_rank() == 0: + f1, _ = tester(model.module, dev_batch) + if f1 > best_f1: + best_f1 = f1 + logger.info("- new best score!") + if not options.no_model: + logger.info("Saving model to {}".format(best_model_file_name)) + torch.save(model.module.state_dict(), best_model_file_name) + + elif options.always_model: + logger.info("Saving model to {}".format(best_model_file_name)) + torch.save(model.module.state_dict(), best_model_file_name) + dist.barrier() + +# Evaluate test data (once) +logger.info("\nNumber test instances: {}".format(len(test_set))) + + +if not options.skip_dev: + 
if options.test: + model.module.load_state_dict( + torch.load(options.old_model, map_location="cuda:0") + ) + else: + model.module.load_state_dict( + torch.load(best_model_file_name, map_location="cuda:0") + ) + +if dist.get_rank() == 0: + for name, para in model.named_parameters(): + if name.find("task_embed") != -1: + tm = para.detach().cpu().numpy() + logger.info(tm.shape) + np.save("{}/task.npy".format(root_dir), tm) + break + +_, res = tester(model.module, test_batch, True) + +if dist.get_rank() == 0: + with open("{}/testout.txt".format(root_dir), "w", encoding="utf-8") as raw_writer: + for sent in res: + raw_writer.write(sent) + raw_writer.write("\n") + diff --git a/reproduction/multi-criteria-cws/make_data.sh b/reproduction/multi-criteria-cws/make_data.sh new file mode 100644 index 00000000..9c2b09d8 --- /dev/null +++ b/reproduction/multi-criteria-cws/make_data.sh @@ -0,0 +1,14 @@ +if [ -z "$DATA_DIR" ] +then + DATA_DIR="./data" +fi + +mkdir -vp $DATA_DIR + +cmd="python -u ./data-prepare.py --sighan05 $1 --sighan08 $2 --data_path $DATA_DIR" +echo $cmd +eval $cmd + +cmd="python -u ./data-process.py --data_path $DATA_DIR" +echo $cmd +eval $cmd diff --git a/legacy/automl/__init__.py b/reproduction/multi-criteria-cws/model.py similarity index 100% rename from legacy/automl/__init__.py rename to reproduction/multi-criteria-cws/model.py diff --git a/reproduction/multi-criteria-cws/models.py b/reproduction/multi-criteria-cws/models.py new file mode 100644 index 00000000..965da651 --- /dev/null +++ b/reproduction/multi-criteria-cws/models.py @@ -0,0 +1,200 @@ +import fastNLP +import torch +import math +from fastNLP.modules.encoder.transformer import TransformerEncoder +from fastNLP.modules.decoder.crf import ConditionalRandomField +from fastNLP import Const +import copy +import numpy as np +from torch.autograd import Variable +import torch.autograd as autograd +import torch.nn as nn +import torch.nn.functional as F +import transformer + + +class PositionalEncoding(nn.Module): + "Implement the PE function." + + def __init__(self, d_model, dropout, max_len=512): + super(PositionalEncoding, self).__init__() + self.dropout = nn.Dropout(p=dropout) + + # Compute the positional encodings once in log space. 
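+        # Standard sinusoidal encoding (Vaswani et al., 2017):
+        #   PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
+        #   PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))
+        # div_term below is exp(-(2i) * ln(10000) / d_model), i.e. the inverse frequencies.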
+ pe = torch.zeros(max_len, d_model).float() + position = torch.arange(0, max_len).unsqueeze(1).float() + div_term = torch.exp( + torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model) + ) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.register_buffer("pe", pe) + + def forward(self, x): + x = x + Variable(self.pe[:, : x.size(1)], requires_grad=False) + return self.dropout(x) + + +class Embedding(nn.Module): + def __init__( + self, + task_size, + d_model, + word_embedding=None, + bi_embedding=None, + word_size=None, + freeze=True, + ): + super(Embedding, self).__init__() + self.task_size = task_size + self.embed_dim = 0 + + self.task_embed = nn.Embedding(task_size, d_model) + if word_embedding is not None: + # self.uni_embed = nn.Embedding.from_pretrained(torch.FloatTensor(word_embedding), freeze=freeze) + # self.embed_dim+=word_embedding.shape[1] + self.uni_embed = word_embedding + self.embed_dim += word_embedding.embedding_dim + else: + if bi_embedding is not None: + self.embed_dim += bi_embedding.shape[1] + else: + self.embed_dim = d_model + assert word_size is not None + self.uni_embed = Embedding(word_size, self.embed_dim) + + if bi_embedding is not None: + # self.bi_embed = nn.Embedding.from_pretrained(torch.FloatTensor(bi_embedding), freeze=freeze) + # self.embed_dim += bi_embedding.shape[1]*2 + self.bi_embed = bi_embedding + self.embed_dim += bi_embedding.embedding_dim * 2 + + print("Trans Freeze", freeze, self.embed_dim) + + if d_model != self.embed_dim: + self.F = nn.Linear(self.embed_dim, d_model) + else: + self.F = None + + self.d_model = d_model + + def forward(self, task, uni, bi1=None, bi2=None): + y_task = self.task_embed(task[:, 0:1]) + y = self.uni_embed(uni[:, 1:]) + if bi1 is not None: + assert self.bi_embed is not None + + y = torch.cat([y, self.bi_embed(bi1), self.bi_embed(bi2)], dim=-1) + # y2=self.bi_embed(bi) + # y=torch.cat([y,y2[:,:-1,:],y2[:,1:,:]],dim=-1) + + # y=torch.cat([y_task,y],dim=1) + if self.F is not None: + y = self.F(y) + y = torch.cat([y_task, y], dim=1) + return y * math.sqrt(self.d_model) + + +def seq_len_to_mask(seq_len, max_len=None): + if isinstance(seq_len, np.ndarray): + assert ( + len(np.shape(seq_len)) == 1 + ), f"seq_len can only have one dimension, got {len(np.shape(seq_len))}." + if max_len is None: + max_len = int(seq_len.max()) + broad_cast_seq_len = np.tile(np.arange(max_len), (len(seq_len), 1)) + mask = broad_cast_seq_len < seq_len.reshape(-1, 1) + + elif isinstance(seq_len, torch.Tensor): + assert ( + seq_len.dim() == 1 + ), f"seq_len can only have one dimension, got {seq_len.dim() == 1}." 
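# Illustrative sketch (not from the patch): what seq_len_to_mask produces.
# A length vector is broadcast against a position ramp, giving one boolean
# row of valid (non-padding) positions per sequence.
import torch

seq_len = torch.tensor([2, 3])
max_len = 4
mask = torch.arange(max_len).expand(len(seq_len), -1) < seq_len.unsqueeze(1)
# mask:
# tensor([[ True,  True, False, False],
#         [ True,  True,  True, False]])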
+ batch_size = seq_len.size(0) + if max_len is None: + max_len = seq_len.max().long() + broad_cast_seq_len = torch.arange(max_len).expand(batch_size, -1).to(seq_len) + mask = broad_cast_seq_len.lt(seq_len.unsqueeze(1)) + else: + raise TypeError("Only support 1-d numpy.ndarray or 1-d torch.Tensor.") + + return mask + + +class CWSModel(nn.Module): + def __init__(self, encoder, src_embed, position, d_model, tag_size, crf=None): + super(CWSModel, self).__init__() + self.encoder = encoder + self.src_embed = src_embed + self.pos = copy.deepcopy(position) + self.proj = nn.Linear(d_model, tag_size) + self.tag_size = tag_size + if crf is None: + self.crf = None + self.loss_f = nn.CrossEntropyLoss(reduction="mean", ignore_index=-100) + else: + print("crf") + trans = fastNLP.modules.decoder.crf.allowed_transitions( + crf, encoding_type="bmes" + ) + self.crf = ConditionalRandomField(tag_size, allowed_transitions=trans) + # self.norm=nn.LayerNorm(d_model) + + def forward(self, task, uni, seq_len, bi1=None, bi2=None, tags=None): + # mask=fastNLP.core.utils.seq_len_to_mask(seq_len,uni.size(1)) # for dev 0.5.1 + mask = seq_len_to_mask(seq_len, uni.size(1)) + out = self.src_embed(task, uni, bi1, bi2) + out = self.pos(out) + # out=self.norm(out) + out = self.proj(self.encoder(out, mask.float())) + + if self.crf is not None: + if tags is not None: + out = self.crf(out, tags, mask) + return {"loss": out} + else: + out, _ = self.crf.viterbi_decode(out, mask) + return {"pred": out} + else: + if tags is not None: + out = out.contiguous().view(-1, self.tag_size) + tags = tags.data.masked_fill_(mask == 0, -100).view(-1) + loss = self.loss_f(out, tags) + return {"loss": loss} + else: + out = torch.argmax(out, dim=-1) + return {"pred": out} + + +def make_CWS( + N=6, + d_model=256, + d_ff=1024, + h=4, + dropout=0.2, + tag_size=4, + task_size=8, + bigram_embedding=None, + word_embedding=None, + word_size=None, + crf=None, + freeze=True, +): + c = copy.deepcopy + # encoder=TransformerEncoder(num_layers=N,model_size=d_model,inner_size=d_ff,key_size=d_model//h,value_size=d_model//h,num_head=h,dropout=dropout) + encoder = transformer.make_encoder( + N=N, d_model=d_model, h=h, dropout=dropout, d_ff=d_ff + ) + + position = PositionalEncoding(d_model, dropout) + + embed = Embedding( + task_size, d_model, word_embedding, bigram_embedding, word_size, freeze + ) + model = CWSModel(encoder, embed, position, d_model, tag_size, crf=crf) + + for p in model.parameters(): + if p.dim() > 1 and p.requires_grad: + nn.init.xavier_uniform_(p) + + return model diff --git a/reproduction/multi-criteria-cws/optm.py b/reproduction/multi-criteria-cws/optm.py new file mode 100644 index 00000000..a2b68de5 --- /dev/null +++ b/reproduction/multi-criteria-cws/optm.py @@ -0,0 +1,49 @@ +import torch +import torch.optim as optim + + +class NoamOpt: + "Optim wrapper that implements rate." 
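# Illustrative sketch (not from the patch): the schedule NoamOpt implements is
#   lr(step) = factor * d_model**-0.5 * min(step**-0.5, step * warmup**-1.5)
# i.e. linear warm-up for `warmup` steps, then inverse-square-root decay.
# The constants below are hypothetical defaults that happen to match values
# used elsewhere in this patch (d_model=256, factor=2, warmup=4000).
def noam_lr(step, d_model=256, factor=2, warmup=4000):
    return factor * d_model ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)

# noam_lr(1)     ~ 4.9e-07  (warming up)
# noam_lr(4000)  ~ 2.0e-03  (peak, reached exactly at `warmup` steps)
# noam_lr(16000) ~ 9.9e-04  (decaying as 1/sqrt(step))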
+ + def __init__(self, model_size, factor, warmup, optimizer): + self.optimizer = optimizer + self._step = 0 + self.warmup = warmup + self.factor = factor + self.model_size = model_size + self._rate = 0 + + def step(self): + "Update parameters and rate" + self._step += 1 + rate = self.rate() + for p in self.optimizer.param_groups: + p["lr"] = rate + self._rate = rate + self.optimizer.step() + + def rate(self, step=None): + "Implement `lrate` above" + if step is None: + step = self._step + lr = self.factor * ( + self.model_size ** (-0.5) + * min(step ** (-0.5), step * self.warmup ** (-1.5)) + ) + # if step>self.warmup: lr = max(1e-4,lr) + return lr + + +def get_std_opt(model): + return NoamOpt( + model.src_embed[0].d_model, + 2, + 4000, + torch.optim.Adam( + filter(lambda p: p.requires_grad, model.parameters()), + lr=0, + betas=(0.9, 0.98), + eps=1e-9, + ), + ) + diff --git a/reproduction/multi-criteria-cws/train.py b/reproduction/multi-criteria-cws/train.py new file mode 100644 index 00000000..fce914a1 --- /dev/null +++ b/reproduction/multi-criteria-cws/train.py @@ -0,0 +1,138 @@ +from fastNLP import (Trainer, Tester, Callback, GradientClipCallback, LRScheduler, SpanFPreRecMetric) +import torch +import torch.cuda +from torch.optim import Adam, SGD +from argparse import ArgumentParser +import logging +from .utils import set_seed + + +class LoggingCallback(Callback): + def __init__(self, filepath=None): + super().__init__() + # create file handler and set level to debug + if filepath is not None: + file_handler = logging.FileHandler(filepath, "a") + else: + file_handler = logging.StreamHandler() + + file_handler.setLevel(logging.DEBUG) + file_handler.setFormatter( + logging.Formatter(fmt='%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt='%m/%d/%Y %H:%M:%S')) + + # create logger and set level to debug + logger = logging.getLogger() + logger.handlers = [] + logger.setLevel(logging.DEBUG) + logger.propagate = False + logger.addHandler(file_handler) + self.log_writer = logger + + def on_backward_begin(self, loss): + if self.step % self.trainer.print_every == 0: + self.log_writer.info( + 'Step/Epoch {}/{}: Loss {}'.format(self.step, self.epoch, loss.item())) + + def on_valid_end(self, eval_result, metric_key, optimizer, is_better_eval): + self.log_writer.info( + 'Step/Epoch {}/{}: Eval result {}'.format(self.step, self.epoch, eval_result)) + + def on_backward_end(self): + pass + + +def main(): + parser = ArgumentParser() + register_args(parser) + args = parser.parse_known_args()[0] + + set_seed(args.seed) + if args.train: + train(args) + if args.eval: + evaluate(args) + +def get_optim(args): + name = args.optim.strip().split(' ')[0].lower() + p = args.optim.strip() + l = p.find('(') + r = p.find(')') + optim_args = eval('dict({})'.format(p[[l+1,r]])) + if name == 'sgd': + return SGD(**optim_args) + elif name == 'adam': + return Adam(**optim_args) + else: + raise ValueError(args.optim) + +def load_model_from_path(args): + pass + +def train(args): + data = get_data(args) + train_data = data['train'] + dev_data = data['dev'] + model = get_model(args) + optimizer = get_optim(args) + device = 'cuda' if torch.cuda.is_available() else 'cpu' + callbacks = [] + trainer = Trainer( + train_data=train_data, + model=model, + optimizer=optimizer, + loss=None, + batch_size=args.batch_size, + n_epochs=args.epochs, + num_workers=4, + metrics=SpanFPreRecMetric( + tag_vocab=data['tag_vocab'], encoding_type=data['encoding_type'], + ignore_labels=data['ignore_labels']), + metric_key='f1', + 
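# Illustrative sketch (not from the patch): how an optimizer spec string such
# as 'adam (lr=2e-3, weight_decay=0.0)' can be parsed. The keyword part is the
# slice between the parentheses, spec[l + 1:r], and the torch optimizers also
# need the model parameters as their first argument. `build_optimizer` is a
# hypothetical name, not something defined in train.py.
from torch.optim import Adam, SGD

def build_optimizer(spec, params):
    name = spec.strip().split(' ')[0].lower()
    l, r = spec.find('('), spec.find(')')
    kwargs = eval('dict({})'.format(spec[l + 1:r]))  # {'lr': 0.002, 'weight_decay': 0.0}
    if name == 'adam':
        return Adam(params, **kwargs)
    if name == 'sgd':
        return SGD(params, **kwargs)
    raise ValueError(spec)

# usage: optimizer = build_optimizer('adam (lr=2e-3, weight_decay=0.0)', model.parameters())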
dev_data=dev_data, + save_path=args.save_path, + device=device, + callbacks=callbacks, + check_code_level=-1, + ) + + print(trainer.train()) + + + +def evaluate(args): + data = get_data(args) + test_data = data['test'] + model = load_model_from_path(args) + device = 'cuda' if torch.cuda.is_available() else 'cpu' + + tester = Tester( + data=test_data, model=model, batch_size=args.batch_size, + num_workers=2, device=device, + metrics=SpanFPreRecMetric( + tag_vocab=data['tag_vocab'], encoding_type=data['encoding_type'], + ignore_labels=data['ignore_labels']), + ) + print(tester.test()) + +def register_args(parser): + parser.add_argument('--optim', type=str, default='adam (lr=2e-3, weight_decay=0.0)') + parser.add_argument('--batch_size', type=int, default=128) + parser.add_argument('--epochs', type=int, default=10) + parser.add_argument('--save_path', type=str, default=None) + parser.add_argument('--data_path', type=str, required=True) + parser.add_argument('--log_path', type=str, default=None) + parser.add_argument('--model_config', type=str, required=True) + parser.add_argument('--load_path', type=str, default=None) + parser.add_argument('--train', action='store_true', default=False) + parser.add_argument('--eval', action='store_true', default=False) + parser.add_argument('--seed', type=int, default=42, help='rng seed') + +def get_model(args): + pass + +def get_data(args): + return torch.load(args.data_path) + +if __name__ == '__main__': + main() diff --git a/reproduction/multi-criteria-cws/train.sh b/reproduction/multi-criteria-cws/train.sh new file mode 100644 index 00000000..aa47b8af --- /dev/null +++ b/reproduction/multi-criteria-cws/train.sh @@ -0,0 +1,26 @@ +export EXP_NAME=release04 +export NGPU=2 +export PORT=9988 +export CUDA_DEVICE_ORDER=PCI_BUS_ID +export CUDA_VISIBLE_DEVICES=$1 + +if [ -z "$DATA_DIR" ] +then + DATA_DIR="./data" +fi + +echo $CUDA_VISIBLE_DEVICES +cmd=" +python -m torch.distributed.launch --nproc_per_node=$NGPU --master_port $PORT\ + main.py \ + --word-embeddings cn-char-fastnlp-100d \ + --bigram-embeddings cn-bi-fastnlp-100d \ + --num-epochs 100 \ + --batch-size 256 \ + --seed 1234 \ + --task-name $EXP_NAME \ + --dataset $DATA_DIR \ + --freeze \ +" +echo $cmd +eval $cmd diff --git a/reproduction/multi-criteria-cws/transformer.py b/reproduction/multi-criteria-cws/transformer.py new file mode 100644 index 00000000..fc352e44 --- /dev/null +++ b/reproduction/multi-criteria-cws/transformer.py @@ -0,0 +1,152 @@ +import numpy as np +import torch +import torch.autograd as autograd +import torch.nn as nn +import torch.nn.functional as F +import math, copy, time +from torch.autograd import Variable + +# import matplotlib.pyplot as plt + + +def clones(module, N): + "Produce N identical layers." + return nn.ModuleList([copy.deepcopy(module) for _ in range(N)]) + + +def subsequent_mask(size): + "Mask out subsequent positions." 
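# Illustrative sketch (not from the patch): what subsequent_mask returns for
# size=3. Row i is True at positions <= i, so position i may only attend to
# itself and earlier positions.
import numpy as np
import torch

m = torch.from_numpy(np.triu(np.ones((1, 3, 3)), k=1).astype("uint8")) == 0
# m[0]:
# tensor([[ True, False, False],
#         [ True,  True, False],
#         [ True,  True,  True]])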
+ attn_shape = (1, size, size) + subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype("uint8") + return torch.from_numpy(subsequent_mask) == 0 + + +def attention(query, key, value, mask=None, dropout=None): + "Compute 'Scaled Dot Product Attention'" + d_k = query.size(-1) + scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k) + if mask is not None: + # print(scores.size(),mask.size()) # [bsz,1,1,len] + scores = scores.masked_fill(mask == 0, -1e9) + p_attn = F.softmax(scores, dim=-1) + if dropout is not None: + p_attn = dropout(p_attn) + return torch.matmul(p_attn, value), p_attn + + +class MultiHeadedAttention(nn.Module): + def __init__(self, h, d_model, dropout=0.1): + "Take in model size and number of heads." + super(MultiHeadedAttention, self).__init__() + assert d_model % h == 0 + # We assume d_v always equals d_k + self.d_k = d_model // h + self.h = h + self.linears = clones(nn.Linear(d_model, d_model), 4) + self.attn = None + self.dropout = nn.Dropout(p=dropout) + + def forward(self, query, key, value, mask=None): + "Implements Figure 2" + if mask is not None: + # Same mask applied to all h heads. + mask = mask.unsqueeze(1) + + nbatches = query.size(0) + + # 1) Do all the linear projections in batch from d_model => h x d_k + query, key, value = [ + l(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2) + for l, x in zip(self.linears, (query, key, value)) + ] + + # 2) Apply attention on all the projected vectors in batch. + x, self.attn = attention(query, key, value, mask=mask, dropout=self.dropout) + + # 3) "Concat" using a view and apply a final linear. + x = x.transpose(1, 2).contiguous().view(nbatches, -1, self.h * self.d_k) + return self.linears[-1](x) + + +class LayerNorm(nn.Module): + "Construct a layernorm module (See citation for details)." + + def __init__(self, features, eps=1e-6): + super(LayerNorm, self).__init__() + self.a_2 = nn.Parameter(torch.ones(features)) + self.b_2 = nn.Parameter(torch.zeros(features)) + self.eps = eps + + def forward(self, x): + mean = x.mean(-1, keepdim=True) + std = x.std(-1, keepdim=True) + return self.a_2 * (x - mean) / (std + self.eps) + self.b_2 + + +class PositionwiseFeedForward(nn.Module): + "Implements FFN equation." + + def __init__(self, d_model, d_ff, dropout=0.1): + super(PositionwiseFeedForward, self).__init__() + self.w_1 = nn.Linear(d_model, d_ff) + self.w_2 = nn.Linear(d_ff, d_model) + self.dropout = nn.Dropout(dropout) + + def forward(self, x): + return self.w_2(self.dropout(F.relu(self.w_1(x)))) + + +class SublayerConnection(nn.Module): + """ + A residual connection followed by a layer norm. + Note for code simplicity the norm is first as opposed to last. + """ + + def __init__(self, size, dropout): + super(SublayerConnection, self).__init__() + self.norm = LayerNorm(size) + self.dropout = nn.Dropout(dropout) + + def forward(self, x, sublayer): + "Apply residual connection to any sublayer with the same size." + return x + self.dropout(sublayer(self.norm(x))) + + +class EncoderLayer(nn.Module): + "Encoder is made up of self-attn and feed forward (defined below)" + + def __init__(self, size, self_attn, feed_forward, dropout): + super(EncoderLayer, self).__init__() + self.self_attn = self_attn + self.feed_forward = feed_forward + self.sublayer = clones(SublayerConnection(size, dropout), 2) + self.size = size + + def forward(self, x, mask): + "Follow Figure 1 (left) for connections." 
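# Illustrative sketch (not from the patch): a pure shape walk-through of the
# head split/merge performed in MultiHeadedAttention above, with hypothetical
# sizes batch=2, length=5, d_model=256, h=4 and a single shared projection.
import torch
import torch.nn as nn

nbatches, seq_len, d_model, h = 2, 5, 256, 4
d_k = d_model // h                                       # 64
x = torch.randn(nbatches, seq_len, d_model)
proj = nn.Linear(d_model, d_model)
q = proj(x).view(nbatches, -1, h, d_k).transpose(1, 2)   # (2, 4, 5, 64)
scores = q @ q.transpose(-2, -1) / d_k ** 0.5            # (2, 4, 5, 5)
out = (scores.softmax(-1) @ q).transpose(1, 2).contiguous().view(nbatches, -1, h * d_k)
assert out.shape == (nbatches, seq_len, d_model)         # heads merged back to d_model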
+ x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask)) + return self.sublayer[1](x, self.feed_forward) + + +class Encoder(nn.Module): + "Core encoder is a stack of N layers" + + def __init__(self, layer, N): + super(Encoder, self).__init__() + self.layers = clones(layer, N) + self.norm = LayerNorm(layer.size) + + def forward(self, x, mask): + # print(x.size(),mask.size()) + "Pass the input (and mask) through each layer in turn." + mask = mask.byte().unsqueeze(-2) + for layer in self.layers: + x = layer(x, mask) + return self.norm(x) + + +def make_encoder(N=6, d_model=512, d_ff=2048, h=8, dropout=0.1): + c = copy.deepcopy + attn = MultiHeadedAttention(h, d_model) + ff = PositionwiseFeedForward(d_model, d_ff, dropout) + return Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N) diff --git a/reproduction/multi-criteria-cws/utils.py b/reproduction/multi-criteria-cws/utils.py new file mode 100644 index 00000000..aeb7e43c --- /dev/null +++ b/reproduction/multi-criteria-cws/utils.py @@ -0,0 +1,308 @@ +import numpy as np +import torch +import torch.cuda +import random +import os +import sys +import errno +import time +import codecs +import hashlib +import _pickle as pickle +import warnings +from fastNLP.io import EmbedLoader + +UNK_TAG = "" + + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + +def bmes_to_words(chars, tags): + result = [] + if len(chars) == 0: + return result + word = chars[0] + + for c, t in zip(chars[1:], tags[1:]): + if t.upper() == "B" or t.upper() == "S": + result.append(word) + word = "" + word += c + if len(word) != 0: + result.append(word) + + return result + + +def bmes_to_index(tags): + result = [] + if len(tags) == 0: + return result + word = (0, 0) + + for i, t in enumerate(tags): + if i == 0: + word = (0, 0) + elif t.upper() == "B" or t.upper() == "S": + result.append(word) + word = (i, 0) + word = (word[0], word[1] + 1) + if word[1] != 0: + result.append(word) + return result + + +def get_bmes(sent): + x = [] + y = [] + for word in sent: + length = len(word) + tag = ["m"] * length if length > 1 else ["s"] * length + if length > 1: + tag[0] = "b" + tag[-1] = "e" + x += list(word) + y += tag + return x, y + + +class CWSEvaluator: + def __init__(self, i2t): + self.correct_preds = 0.0 + self.total_preds = 0.0 + self.total_correct = 0.0 + self.i2t = i2t + + def add_instance(self, pred_tags, gold_tags): + pred_tags = [self.i2t[i] for i in pred_tags] + gold_tags = [self.i2t[i] for i in gold_tags] + # Evaluate PRF + lab_gold_chunks = set(bmes_to_index(gold_tags)) + lab_pred_chunks = set(bmes_to_index(pred_tags)) + self.correct_preds += len(lab_gold_chunks & lab_pred_chunks) + self.total_preds += len(lab_pred_chunks) + self.total_correct += len(lab_gold_chunks) + + def result(self, percentage=True): + p = self.correct_preds / self.total_preds if self.correct_preds > 0 else 0 + r = self.correct_preds / self.total_correct if self.correct_preds > 0 else 0 + f1 = 2 * p * r / (p + r) if p + r > 0 else 0 + if percentage: + p *= 100 + r *= 100 + f1 *= 100 + return p, r, f1 + + +class CWS_OOV: + def __init__(self, dic): + self.dic = dic + self.recall = 0 + self.tot = 0 + + def update(self, gold_sent, pred_sent): + i = 0 + j = 0 + id = 0 + for w in gold_sent: + if w not in self.dic: + self.tot += 1 + while i + len(pred_sent[id]) <= j: + i += len(pred_sent[id]) + id += 1 + if ( + i == j + and len(pred_sent[id]) == len(w) + and w.find(pred_sent[id]) != -1 + ): + self.recall += 1 + j += 
len(w) + # print(gold_sent,pred_sent,self.tot) + + def oov(self, percentage=True): + ins = 1.0 * self.recall / self.tot + if percentage: + ins *= 100 + return ins + + +def get_processing_word( + vocab_words=None, vocab_chars=None, lowercase=False, chars=False +): + def f(word): + # 0. get chars of words + if vocab_chars is not None and chars: + char_ids = [] + for char in word: + # ignore chars out of vocabulary + if char in vocab_chars: + char_ids += [vocab_chars[char]] + + # 1. preprocess word + if lowercase: + word = word.lower() + if word.isdigit(): + word = "0" + + # 2. get id of word + if vocab_words is not None: + if word in vocab_words: + word = vocab_words[word] + else: + word = vocab_words[UNK_TAG] + + # 3. return tuple char ids, word id + if vocab_chars is not None and chars: + return char_ids, word + else: + return word + + return f + + +def append_tags(src, des, name, part, encode="utf-16"): + with open("{}/{}.txt".format(src, part), encoding=encode) as input, open( + "{}/{}.txt".format(des, part), "a", encoding=encode + ) as output: + for line in input: + line = line.strip() + if len(line) > 0: + output.write("<{}> {} ".format(name, line, name)) + output.write("\n") + + +def is_dataset_tag(word): + return len(word) > 2 and word[0] == "<" and word[-1] == ">" + + +def to_tag_strings(i2ts, tag_mapping, pos_separate_col=True): + senlen = len(tag_mapping) + key_value_strs = [] + + for j in range(senlen): + val = i2ts[tag_mapping[j]] + pos_str = val + key_value_strs.append(pos_str) + return key_value_strs + + +def to_id_list(w2i): + i2w = [None] * len(w2i) + for w, i in w2i.items(): + i2w[i] = w + return i2w + + +def make_sure_path_exists(path): + try: + os.makedirs(path) + except OSError as exception: + if exception.errno != errno.EEXIST: + raise + + +def md5_for_file(fn): + md5 = hashlib.md5() + with open(fn, "rb") as f: + for chunk in iter(lambda: f.read(128 * md5.block_size), b""): + md5.update(chunk) + return md5.hexdigest() + + +def embedding_match_vocab( + vocab, + emb, + ori_vocab, + dtype=np.float32, + padding="", + unknown="", + normalize=True, + error="ignore", + init_method=None, +): + dim = emb.shape[-1] + matrix = np.random.randn(len(vocab), dim).astype(dtype) + hit_flags = np.zeros(len(vocab), dtype=bool) + + if init_method: + matrix = init_method(matrix) + for word, idx in ori_vocab.word2idx.items(): + try: + if word == padding and vocab.padding is not None: + word = vocab.padding + elif word == unknown and vocab.unknown is not None: + word = vocab.unknown + if word in vocab: + index = vocab.to_index(word) + matrix[index] = emb[idx] + hit_flags[index] = True + except Exception as e: + if error == "ignore": + warnings.warn("Error occurred at the {} line.".format(idx)) + else: + print("Error occurred at the {} line.".format(idx)) + raise e + + total_hits = np.sum(hit_flags) + print( + "Found {} out of {} words in the pre-training embedding.".format( + total_hits, len(vocab) + ) + ) + if init_method is None: + found_vectors = matrix[hit_flags] + if len(found_vectors) != 0: + mean = np.mean(found_vectors, axis=0, keepdims=True) + std = np.std(found_vectors, axis=0, keepdims=True) + unfound_vec_num = len(vocab) - total_hits + r_vecs = np.random.randn(unfound_vec_num, dim).astype(dtype) * std + mean + matrix[hit_flags == False] = r_vecs + + if normalize: + matrix /= np.linalg.norm(matrix, axis=1, keepdims=True) + + return matrix + + +def embedding_load_with_cache(emb_file, cache_dir, vocab, **kwargs): + def match_cache(file, cache_dir): + md5 = md5_for_file(file) + 
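# Illustrative sketch (not from the patch): a round trip through the BMES
# helpers defined earlier in this file. get_bmes flattens a segmented sentence
# into characters plus b/m/e/s tags, and bmes_to_words recovers the original
# segmentation from those tags. The example sentence is hypothetical.
sent = ["我们", "是", "朋友"]
chars, tags = get_bmes(sent)      # chars: ['我', '们', '是', '朋', '友']
                                  # tags:  ['b', 'e', 's', 'b', 'e']
assert bmes_to_words(chars, tags) == sent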
cache_files = os.listdir(cache_dir) + for fn in cache_files: + if md5 in fn.split("-")[-1]: + return os.path.join(cache_dir, fn), True + return ( + "{}-{}.pkl".format(os.path.join(cache_dir, os.path.basename(file)), md5), + False, + ) + + def get_cache(file): + if not os.path.exists(file): + return None + with open(file, "rb") as f: + emb = pickle.load(f) + return emb + + os.makedirs(cache_dir, exist_ok=True) + cache_fn, match = match_cache(emb_file, cache_dir) + if not match: + print("cache missed, re-generating cache at {}".format(cache_fn)) + emb, ori_vocab = EmbedLoader.load_without_vocab( + emb_file, padding=None, unknown=None, normalize=False + ) + with open(cache_fn, "wb") as f: + pickle.dump((emb, ori_vocab), f) + + else: + print("cache matched at {}".format(cache_fn)) + + # use cache + print("loading embeddings ...") + emb = get_cache(cache_fn) + assert emb is not None + return embedding_match_vocab(vocab, emb[0], emb[1], **kwargs) diff --git a/reproduction/seqence_labelling/chinese_ner/data/ChineseNER.py b/reproduction/seqence_labelling/chinese_ner/data/ChineseNER.py deleted file mode 100644 index cec5ab76..00000000 --- a/reproduction/seqence_labelling/chinese_ner/data/ChineseNER.py +++ /dev/null @@ -1,115 +0,0 @@ - - -from fastNLP.io.base_loader import DataSetLoader, DataBundle -from fastNLP.io import ConllLoader -from reproduction.seqence_labelling.ner.data.utils import iob2bioes, iob2 -from fastNLP import Const -from reproduction.utils import check_dataloader_paths -from fastNLP import Vocabulary - -class ChineseNERLoader(DataSetLoader): - """ - 读取中文命名实体数据集,包括PeopleDaily, MSRA-NER, Weibo。数据在这里可以找到https://github.com/OYE93/Chinese-NLP-Corpus/tree/master/NER - 请确保输入数据的格式如下, 共两列,第一列为字,第二列为标签,不同句子以空行隔开 - 我 O - 们 O - 变 O - 而 O - 以 O - 书 O - 会 O - ... 
- - """ - def __init__(self, encoding_type:str='bioes'): - """ - - :param str encoding_type: 支持bio和bioes格式 - """ - super().__init__() - self._loader = ConllLoader(headers=['raw_chars', 'target'], indexes=[0, 1]) - - assert encoding_type in ('bio', 'bioes') - - self._tag_converters = [iob2] - if encoding_type == 'bioes': - self._tag_converters.append(iob2bioes) - - def load(self, path:str): - dataset = self._loader.load(path) - def convert_tag_schema(tags): - for converter in self._tag_converters: - tags = converter(tags) - return tags - if self._tag_converters: - dataset.apply_field(convert_tag_schema, field_name=Const.TARGET, new_field_name=Const.TARGET) - return dataset - - def process(self, paths, bigrams=False, trigrams=False): - """ - - :param paths: - :param bool, bigrams: 是否包含生成bigram feature, [a, b, c, d] -> [ab, bc, cd, d] - :param bool, trigrams: 是否包含trigram feature,[a, b, c, d] -> [abc, bcd, cd, d] - :return: DataBundle - 包含以下的fields - raw_chars: List[str] - chars: List[int] - seq_len: int, 字的长度 - bigrams: List[int], optional - trigrams: List[int], optional - target: List[int] - """ - paths = check_dataloader_paths(paths) - data = DataBundle() - input_fields = [Const.CHAR_INPUT, Const.INPUT_LEN, Const.TARGET] - target_fields = [Const.TARGET, Const.INPUT_LEN] - - for name, path in paths.items(): - dataset = self.load(path) - if bigrams: - dataset.apply_field(lambda raw_chars: [c1+c2 for c1, c2 in zip(raw_chars, raw_chars[1:]+[''])], - field_name='raw_chars', new_field_name='bigrams') - - if trigrams: - dataset.apply_field(lambda raw_chars: [c1+c2+c3 for c1, c2, c3 in zip(raw_chars, - raw_chars[1:]+[''], - raw_chars[2:]+['']*2)], - field_name='raw_chars', new_field_name='trigrams') - data.datasets[name] = dataset - - char_vocab = Vocabulary().from_dataset(data.datasets['train'], field_name='raw_chars', - no_create_entry_dataset=[dataset for name, dataset in data.datasets.items() if name!='train']) - char_vocab.index_dataset(*data.datasets.values(), field_name='raw_chars', new_field_name=Const.CHAR_INPUT) - data.vocabs[Const.CHAR_INPUT] = char_vocab - - target_vocab = Vocabulary(unknown=None, padding=None).from_dataset(data.datasets['train'], field_name=Const.TARGET) - target_vocab.index_dataset(*data.datasets.values(), field_name=Const.TARGET) - data.vocabs[Const.TARGET] = target_vocab - - if bigrams: - bigram_vocab = Vocabulary().from_dataset(data.datasets['train'], field_name='bigrams', - no_create_entry_dataset=[dataset for name, dataset in - data.datasets.items() if name != 'train']) - bigram_vocab.index_dataset(*data.datasets.values(), field_name='bigrams', new_field_name='bigrams') - data.vocabs['bigrams'] = bigram_vocab - input_fields.append('bigrams') - - if trigrams: - trigram_vocab = Vocabulary().from_dataset(data.datasets['train'], field_name='trigrams', - no_create_entry_dataset=[dataset for name, dataset in - data.datasets.items() if name != 'train']) - trigram_vocab.index_dataset(*data.datasets.values(), field_name='trigrams', new_field_name='trigrams') - data.vocabs['trigrams'] = trigram_vocab - input_fields.append('trigrams') - - for name, dataset in data.datasets.items(): - dataset.add_seq_len(Const.CHAR_INPUT) - dataset.set_input(*input_fields) - dataset.set_target(*target_fields) - - return data - - - - diff --git a/reproduction/seqence_labelling/cws/data/CWSDataLoader.py b/reproduction/seqence_labelling/cws/data/CWSDataLoader.py deleted file mode 100644 index 3c82d814..00000000 --- a/reproduction/seqence_labelling/cws/data/CWSDataLoader.py +++ /dev/null @@ 
-1,249 +0,0 @@ - -from fastNLP.io.embed_loader import EmbeddingOption, EmbedLoader -from fastNLP.core.vocabulary import VocabularyOption -from fastNLP.io.base_loader import DataSetLoader, DataBundle -from typing import Union, Dict, List, Iterator -from fastNLP import DataSet -from fastNLP import Instance -from fastNLP import Vocabulary -from fastNLP import Const -from reproduction.utils import check_dataloader_paths -from functools import partial - -class SigHanLoader(DataSetLoader): - """ - 任务相关的说明可以在这里找到http://sighan.cs.uchicago.edu/ - 支持的数据格式为,一行一句,不同的word用空格隔开。如下例 - - 共同 创造 美好 的 新 世纪 —— 二○○一年 新年 - 女士 们 , 先生 们 , 同志 们 , 朋友 们 : - - 读取sighan中的数据集,返回的DataSet将包含以下的内容fields: - raw_chars: list(str), 每个元素是一个汉字 - chars: list(str), 每个元素是一个index(汉字对应的index) - target: list(int), 根据不同的encoding_type会有不同的变化 - - :param target_type: target的类型,当前支持以下的两种: "bmes", "shift_relay" - """ - - def __init__(self, target_type:str): - super().__init__() - - if target_type.lower() not in ('bmes', 'shift_relay'): - raise ValueError("target_type only supports 'bmes', 'shift_relay'.") - - self.target_type = target_type - if target_type=='bmes': - self._word_len_to_target = self._word_len_to_bems - elif target_type=='shift_relay': - self._word_len_to_target = self._word_lens_to_relay - - @staticmethod - def _word_lens_to_relay(word_lens: Iterator[int]): - """ - [1, 2, 3, ..] 转换为[0, 1, 0, 2, 1, 0,](start指示seg有多长); - :param word_lens: - :return: {'target': , 'end_seg_mask':, 'start_seg_mask':} - """ - tags = [] - end_seg_mask = [] - start_seg_mask = [] - for word_len in word_lens: - tags.extend([idx for idx in range(word_len - 1, -1, -1)]) - end_seg_mask.extend([0] * (word_len - 1) + [1]) - start_seg_mask.extend([1] + [0] * (word_len - 1)) - return {'target': tags, 'end_seg_mask': end_seg_mask, 'start_seg_mask': start_seg_mask} - - @staticmethod - def _word_len_to_bems(word_lens:Iterator[int])->Dict[str, List[str]]: - """ - - :param word_lens: 每个word的长度 - :return: - """ - tags = [] - for word_len in word_lens: - if word_len==1: - tags.append('S') - else: - tags.append('B') - for _ in range(word_len-2): - tags.append('M') - tags.append('E') - return {'target':tags} - - @staticmethod - def _gen_bigram(chars:List[str])->List[str]: - """ - - :param chars: - :return: - """ - return [c1+c2 for c1, c2 in zip(chars, chars[1:]+[''])] - - def load(self, path:str, bigram:bool=False)->DataSet: - """ - :param path: str - :param bigram: 是否使用bigram feature - :return: - """ - dataset = DataSet() - with open(path, 'r', encoding='utf-8') as f: - for line in f: - line = line.strip() - if not line: # 去掉空行 - continue - parts = line.split() - word_lens = map(len, parts) - chars = list(''.join(parts)) - tags = self._word_len_to_target(word_lens) - assert len(chars)==len(tags['target']) - dataset.append(Instance(raw_chars=chars, **tags, seq_len=len(chars))) - if len(dataset)==0: - raise RuntimeError(f"{path} has no valid data.") - if bigram: - dataset.apply_field(self._gen_bigram, field_name='raw_chars', new_field_name='bigrams') - return dataset - - def process(self, paths: Union[str, Dict[str, str]], char_vocab_opt:VocabularyOption=None, - char_embed_opt:EmbeddingOption=None, bigram_vocab_opt:VocabularyOption=None, - bigram_embed_opt:EmbeddingOption=None, L:int=4): - """ - 支持的数据格式为一行一个sample,并且用空格隔开不同的词语。例如 - - Option:: - - 共同 创造 美好 的 新 世纪 —— 二○○一年 新年 贺词 - ( 二○○○年 十二月 三十一日 ) ( 附 图片 1 张 ) - 女士 们 , 先生 们 , 同志 们 , 朋友 们 : - - paths支持两种格式,第一种是str,第二种是Dict[str, str]. - - Option:: - - # 1. 
str类型 - # 1.1 传入具体的文件路径 - data = SigHanLoader('bmes').process('/path/to/cws/data.txt') # 将读取data.txt的内容 - # 包含以下的内容data.vocabs['chars']:Vocabulary对象, - # data.vocabs['target']: Vocabulary对象,根据encoding_type可能会没有该值 - # data.embeddings['chars']: Embedding对象. 只有提供了预训练的词向量的路径才有该项 - # data.datasets['train']: DataSet对象 - # 包含的field有: - # raw_chars: list[str], 每个元素是一个汉字 - # chars: list[int], 每个元素是汉字对应的index - # target: list[int], 根据encoding_type有对应的变化 - # 1.2 传入一个目录, 里面必须包含train.txt文件 - data = SigHanLoader('bmes').process('path/to/cws/') #将尝试在该目录下读取 train.txt, test.txt以及dev.txt - # 包含以下的内容data.vocabs['chars']: Vocabulary对象 - # data.vocabs['target']:Vocabulary对象 - # data.embeddings['chars']: 仅在提供了预训练embedding路径的情况下,为Embedding对象; - # data.datasets['train']: DataSet对象 - # 包含的field有: - # raw_chars: list[str], 每个元素是一个汉字 - # chars: list[int], 每个元素是汉字对应的index - # target: list[int], 根据encoding_type有对应的变化 - # data.datasets['dev']: DataSet对象,如果文件夹下包含了dev.txt;内容与data.datasets['train']一样 - - # 2. dict类型, key是文件的名称,value是对应的读取路径. 必须包含'train'这个key - paths = {'train': '/path/to/train/train.txt', 'test':'/path/to/test/test.txt', 'dev':'/path/to/dev/dev.txt'} - data = SigHanLoader(paths).process(paths) - # 结果与传入目录时是一致的,但是可以传入多个数据集。data.datasets中的key将与这里传入的一致 - - :param paths: 支持传入目录,文件路径,以及dict。 - :param char_vocab_opt: 用于构建chars的vocabulary参数,默认为min_freq=2 - :param char_embed_opt: 用于读取chars的Embedding的参数,默认不读取pretrained的embedding - :param bigram_vocab_opt: 用于构建bigram的vocabulary参数,默认不使用bigram, 仅在指定该参数的情况下会带有bigrams这个field。 - 为List[int], 每个instance长度与chars一样, abcde的bigram为ab bc cd de e - :param bigram_embed_opt: 用于读取预训练bigram的参数,仅在传入bigram_vocab_opt有效 - :param L: 当target_type为shift_relay时传入的segment长度 - :return: - """ - # 推荐大家使用这个check_data_loader_paths进行paths的验证 - paths = check_dataloader_paths(paths) - datasets = {} - data = DataBundle() - bigram = bigram_vocab_opt is not None - for name, path in paths.items(): - dataset = self.load(path, bigram=bigram) - datasets[name] = dataset - input_fields = [] - target_fields = [] - # 创建vocab - char_vocab = Vocabulary(min_freq=2) if char_vocab_opt is None else Vocabulary(**char_vocab_opt) - char_vocab.from_dataset(datasets['train'], field_name='raw_chars') - char_vocab.index_dataset(*datasets.values(), field_name='raw_chars', new_field_name='chars') - data.vocabs[Const.CHAR_INPUT] = char_vocab - input_fields.extend([Const.CHAR_INPUT, Const.INPUT_LEN, Const.TARGET]) - target_fields.append(Const.TARGET) - # 创建target - if self.target_type == 'bmes': - target_vocab = Vocabulary(unknown=None, padding=None) - target_vocab.add_word_lst(['B']*4+['M']*3+['E']*2+['S']) - target_vocab.index_dataset(*datasets.values(), field_name='target') - data.vocabs[Const.TARGET] = target_vocab - if char_embed_opt is not None: - char_embed = EmbedLoader.load_with_vocab(**char_embed_opt, vocab=char_vocab) - data.embeddings['chars'] = char_embed - if bigram: - bigram_vocab = Vocabulary(**bigram_vocab_opt) - bigram_vocab.from_dataset(datasets['train'], field_name='bigrams') - bigram_vocab.index_dataset(*datasets.values(), field_name='bigrams') - data.vocabs['bigrams'] = bigram_vocab - if bigram_embed_opt is not None: - bigram_embed = EmbedLoader.load_with_vocab(**bigram_embed_opt, vocab=bigram_vocab) - data.embeddings['bigrams'] = bigram_embed - input_fields.append('bigrams') - if self.target_type == 'shift_relay': - func = partial(self._clip_target, L=L) - for name, dataset in datasets.items(): - res = dataset.apply_field(func, field_name='target') - relay_target = [res_i[0] for res_i in res] - 
relay_mask = [res_i[1] for res_i in res] - dataset.add_field('relay_target', relay_target, is_input=True, is_target=False, ignore_type=False) - dataset.add_field('relay_mask', relay_mask, is_input=True, is_target=False, ignore_type=False) - if self.target_type == 'shift_relay': - input_fields.extend(['end_seg_mask']) - target_fields.append('start_seg_mask') - # 将dataset加入DataInfo - for name, dataset in datasets.items(): - dataset.set_input(*input_fields) - dataset.set_target(*target_fields) - data.datasets[name] = dataset - - return data - - @staticmethod - def _clip_target(target:List[int], L:int): - """ - - 只有在target_type为shift_relay的使用 - :param target: List[int] - :param L: - :return: - """ - relay_target_i = [] - tmp = [] - for j in range(len(target) - 1): - tmp.append(target[j]) - if target[j] > target[j + 1]: - pass - else: - relay_target_i.extend([L - 1 if t >= L else t for t in tmp[::-1]]) - tmp = [] - # 处理未结束的部分 - if len(tmp) == 0: - relay_target_i.append(0) - else: - tmp.append(target[-1]) - relay_target_i.extend([L - 1 if t >= L else t for t in tmp[::-1]]) - relay_mask_i = [] - j = 0 - while j < len(target): - seg_len = target[j] + 1 - if target[j] < L: - relay_mask_i.extend([0] * (seg_len)) - else: - relay_mask_i.extend([1] * (seg_len - L) + [0] * L) - j = seg_len + j - return relay_target_i, relay_mask_i - diff --git a/reproduction/seqence_labelling/cws/test/test_CWSDataLoader.py b/reproduction/seqence_labelling/cws/test/test_CWSDataLoader.py deleted file mode 100644 index f4260849..00000000 --- a/reproduction/seqence_labelling/cws/test/test_CWSDataLoader.py +++ /dev/null @@ -1,17 +0,0 @@ - - -import unittest -from ..data.CWSDataLoader import SigHanLoader -from fastNLP.core.vocabulary import VocabularyOption - - -class TestCWSDataLoader(unittest.TestCase): - def test_case1(self): - cws_loader = SigHanLoader(target_type='bmes') - data = cws_loader.process('pku_demo.txt') - print(data.datasets) - - def test_calse2(self): - cws_loader = SigHanLoader(target_type='bmes') - data = cws_loader.process('pku_demo.txt', bigram_vocab_opt=VocabularyOption()) - print(data.datasets) \ No newline at end of file diff --git a/reproduction/seqence_labelling/cws/train_shift_relay.py b/reproduction/seqence_labelling/cws/train_shift_relay.py deleted file mode 100644 index 55576575..00000000 --- a/reproduction/seqence_labelling/cws/train_shift_relay.py +++ /dev/null @@ -1,64 +0,0 @@ - -import os - -from fastNLP import cache_results -from reproduction.seqence_labelling.cws.data.CWSDataLoader import SigHanLoader -from reproduction.seqence_labelling.cws.model.model import ShiftRelayCWSModel -from fastNLP.io.embed_loader import EmbeddingOption -from fastNLP.core.vocabulary import VocabularyOption -from fastNLP import Trainer -from torch.optim import Adam -from fastNLP import BucketSampler -from fastNLP import GradientClipCallback -from reproduction.seqence_labelling.cws.model.metric import RelayMetric - - -# 借助一下fastNLP的自动缓存机制,但是只能缓存4G以下的结果 -@cache_results(None) -def prepare_data(): - data = SigHanLoader(target_type='shift_relay').process(file_dir, char_embed_opt=char_embed_opt, - bigram_vocab_opt=bigram_vocab_opt, - bigram_embed_opt=bigram_embed_opt, - L=L) - return data - -#########hyper -L = 4 -hidden_size = 200 -num_layers = 1 -drop_p = 0.2 -lr = 0.02 - -#########hyper -device = 0 - -# !!!!这里千万不要放完全路径,因为这样会暴露你们在服务器上的用户名,比较危险。所以一定要使用相对路径,最好把数据放到 -# 你们的reproduction路径下,然后设置.gitignore -file_dir = '/path/to/' -char_embed_path = '/pretrain/vectors/1grams_t3_m50_corpus.txt' -bigram_embed_path = 
'/pretrain/vectors/2grams_t3_m50_corpus.txt' -bigram_vocab_opt = VocabularyOption(min_freq=3) -char_embed_opt = EmbeddingOption(embed_filepath=char_embed_path) -bigram_embed_opt = EmbeddingOption(embed_filepath=bigram_embed_path) - -data_name = os.path.basename(file_dir) -cache_fp = 'caches/{}.pkl'.format(data_name) - -data = prepare_data(_cache_fp=cache_fp, _refresh=True) - -model = ShiftRelayCWSModel(char_embed=data.embeddings['chars'], bigram_embed=data.embeddings['bigrams'], - hidden_size=hidden_size, num_layers=num_layers, - L=L, num_bigram_per_char=1, drop_p=drop_p) - -sampler = BucketSampler(batch_size=32) -optimizer = Adam(model.parameters(), lr=lr) -clipper = GradientClipCallback(clip_value=5, clip_type='value') -callbacks = [clipper] -# if pretrain: -# fixer = FixEmbedding([model.char_embedding, model.bigram_embedding], fix_until=fix_until) -# callbacks.append(fixer) -trainer = Trainer(data.datasets['train'], model, optimizer=optimizer, loss=None, batch_size=32, sampler=sampler, - update_every=5, n_epochs=3, print_every=5, dev_data=data.datasets['dev'], metrics=RelayMetric(), - metric_key='f', validate_every=-1, save_path=None, use_tqdm=True, device=device, callbacks=callbacks, - check_code_level=0) -trainer.train() \ No newline at end of file diff --git a/reproduction/seqence_labelling/ner/data/Conll2003Loader.py b/reproduction/seqence_labelling/ner/data/Conll2003Loader.py deleted file mode 100644 index 1aeddcf8..00000000 --- a/reproduction/seqence_labelling/ner/data/Conll2003Loader.py +++ /dev/null @@ -1,93 +0,0 @@ - -from fastNLP.core.vocabulary import VocabularyOption -from fastNLP.io.base_loader import DataSetLoader, DataBundle -from typing import Union, Dict -from fastNLP import Vocabulary -from fastNLP import Const -from reproduction.utils import check_dataloader_paths - -from fastNLP.io import ConllLoader -from reproduction.seqence_labelling.ner.data.utils import iob2bioes, iob2 - - -class Conll2003DataLoader(DataSetLoader): - def __init__(self, task:str='ner', encoding_type:str='bioes'): - """ - 加载Conll2003格式的英语语料,该数据集的信息可以在https://www.clips.uantwerpen.be/conll2003/ner/找到。当task为pos - 时,返回的DataSet中target取值于第2列; 当task为chunk时,返回的DataSet中target取值于第3列;当task为ner时,返回 - 的DataSet中target取值于第4列。所有"-DOCSTART- -X- O O"将被忽略,这会导致数据的数量少于很多文献报道的值,但 - 鉴于"-DOCSTART- -X- O O"只是用于文档分割的符号,并不应该作为预测对象,所以我们忽略了数据中的-DOCTSTART-开头的行 - ner与chunk任务读取后的数据的target将为encoding_type类型。pos任务读取后就是pos列的数据。 - - :param task: 指定需要标注任务。可选ner, pos, chunk - """ - assert task in ('ner', 'pos', 'chunk') - index = {'ner':3, 'pos':1, 'chunk':2}[task] - self._loader = ConllLoader(headers=['raw_words', 'target'], indexes=[0, index]) - self._tag_converters = [] - if task in ('ner', 'chunk'): - self._tag_converters = [iob2] - if encoding_type == 'bioes': - self._tag_converters.append(iob2bioes) - - def load(self, path: str): - dataset = self._loader.load(path) - def convert_tag_schema(tags): - for converter in self._tag_converters: - tags = converter(tags) - return tags - if self._tag_converters: - dataset.apply_field(convert_tag_schema, field_name=Const.TARGET, new_field_name=Const.TARGET) - return dataset - - def process(self, paths: Union[str, Dict[str, str]], word_vocab_opt:VocabularyOption=None, lower:bool=False): - """ - 读取并处理数据。数据中的'-DOCSTART-'开头的行会被忽略 - - :param paths: - :param word_vocab_opt: vocabulary的初始化值 - :param lower: 是否将所有字母转为小写。 - :return: - """ - # 读取数据 - paths = check_dataloader_paths(paths) - data = DataBundle() - input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN] - target_fields = [Const.TARGET, 
Const.INPUT_LEN] - for name, path in paths.items(): - dataset = self.load(path) - dataset.apply_field(lambda words: words, field_name='raw_words', new_field_name=Const.INPUT) - if lower: - dataset.words.lower() - data.datasets[name] = dataset - - # 对construct vocab - word_vocab = Vocabulary(min_freq=2) if word_vocab_opt is None else Vocabulary(**word_vocab_opt) - word_vocab.from_dataset(data.datasets['train'], field_name=Const.INPUT, - no_create_entry_dataset=[dataset for name, dataset in data.datasets.items() if name!='train']) - word_vocab.index_dataset(*data.datasets.values(), field_name=Const.INPUT, new_field_name=Const.INPUT) - data.vocabs[Const.INPUT] = word_vocab - - # cap words - cap_word_vocab = Vocabulary() - cap_word_vocab.from_dataset(data.datasets['train'], field_name='raw_words', - no_create_entry_dataset=[dataset for name, dataset in data.datasets.items() if name!='train']) - cap_word_vocab.index_dataset(*data.datasets.values(), field_name='raw_words', new_field_name='cap_words') - input_fields.append('cap_words') - data.vocabs['cap_words'] = cap_word_vocab - - # 对target建vocab - target_vocab = Vocabulary(unknown=None, padding=None) - target_vocab.from_dataset(*data.datasets.values(), field_name=Const.TARGET) - target_vocab.index_dataset(*data.datasets.values(), field_name=Const.TARGET) - data.vocabs[Const.TARGET] = target_vocab - - for name, dataset in data.datasets.items(): - dataset.add_seq_len(Const.INPUT, new_field_name=Const.INPUT_LEN) - dataset.set_input(*input_fields) - dataset.set_target(*target_fields) - - return data - -if __name__ == '__main__': - pass \ No newline at end of file diff --git a/reproduction/seqence_labelling/ner/data/OntoNoteLoader.py b/reproduction/seqence_labelling/ner/data/OntoNoteLoader.py deleted file mode 100644 index a6070f39..00000000 --- a/reproduction/seqence_labelling/ner/data/OntoNoteLoader.py +++ /dev/null @@ -1,152 +0,0 @@ -from fastNLP.core.vocabulary import VocabularyOption -from fastNLP.io.base_loader import DataSetLoader, DataBundle -from typing import Union, Dict -from fastNLP import DataSet -from fastNLP import Vocabulary -from fastNLP import Const -from reproduction.utils import check_dataloader_paths - -from fastNLP.io import ConllLoader -from reproduction.seqence_labelling.ner.data.utils import iob2bioes, iob2 - -class OntoNoteNERDataLoader(DataSetLoader): - """ - 用于读取处理为Conll格式后的OntoNote数据。将OntoNote数据处理为conll格式的过程可以参考https://github.com/yhcc/OntoNotes-5.0-NER。 - - """ - def __init__(self, encoding_type:str='bioes'): - assert encoding_type in ('bioes', 'bio') - self.encoding_type = encoding_type - if encoding_type=='bioes': - self.encoding_method = iob2bioes - else: - self.encoding_method = iob2 - - def load(self, path:str)->DataSet: - """ - 给定一个文件路径,读取数据。返回的DataSet包含以下的field - raw_words: List[str] - target: List[str] - - :param path: - :return: - """ - dataset = ConllLoader(headers=['raw_words', 'target'], indexes=[3, 10]).load(path) - def convert_to_bio(tags): - bio_tags = [] - flag = None - for tag in tags: - label = tag.strip("()*") - if '(' in tag: - bio_label = 'B-' + label - flag = label - elif flag: - bio_label = 'I-' + flag - else: - bio_label = 'O' - if ')' in tag: - flag = None - bio_tags.append(bio_label) - return self.encoding_method(bio_tags) - - def convert_word(words): - converted_words = [] - for word in words: - word = word.replace('/.', '.') # 有些结尾的.是/.形式的 - if not word.startswith('-'): - converted_words.append(word) - continue - # 以下是由于这些符号被转义了,再转回来 - tfrs = {'-LRB-':'(', - '-RRB-': ')', - '-LSB-': '[', - 
'-RSB-': ']', - '-LCB-': '{', - '-RCB-': '}' - } - if word in tfrs: - converted_words.append(tfrs[word]) - else: - converted_words.append(word) - return converted_words - - dataset.apply_field(convert_word, field_name='raw_words', new_field_name='raw_words') - dataset.apply_field(convert_to_bio, field_name='target', new_field_name='target') - - return dataset - - def process(self, paths: Union[str, Dict[str, str]], word_vocab_opt:VocabularyOption=None, - lower:bool=True)->DataBundle: - """ - 读取并处理数据。返回的DataInfo包含以下的内容 - vocabs: - word: Vocabulary - target: Vocabulary - datasets: - train: DataSet - words: List[int], 被设置为input - target: int. label,被同时设置为input和target - seq_len: int. 句子的长度,被同时设置为input和target - raw_words: List[str] - xxx(根据传入的paths可能有所变化) - - :param paths: - :param word_vocab_opt: vocabulary的初始化值 - :param lower: 是否使用小写 - :return: - """ - paths = check_dataloader_paths(paths) - data = DataBundle() - input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN] - target_fields = [Const.TARGET, Const.INPUT_LEN] - for name, path in paths.items(): - dataset = self.load(path) - dataset.apply_field(lambda words: words, field_name='raw_words', new_field_name=Const.INPUT) - if lower: - dataset.words.lower() - data.datasets[name] = dataset - - # 对construct vocab - word_vocab = Vocabulary(min_freq=2) if word_vocab_opt is None else Vocabulary(**word_vocab_opt) - word_vocab.from_dataset(data.datasets['train'], field_name=Const.INPUT, - no_create_entry_dataset=[dataset for name, dataset in data.datasets.items() if name!='train']) - word_vocab.index_dataset(*data.datasets.values(), field_name=Const.INPUT, new_field_name=Const.INPUT) - data.vocabs[Const.INPUT] = word_vocab - - # cap words - cap_word_vocab = Vocabulary() - cap_word_vocab.from_dataset(*data.datasets.values(), field_name='raw_words') - cap_word_vocab.index_dataset(*data.datasets.values(), field_name='raw_words', new_field_name='cap_words') - input_fields.append('cap_words') - data.vocabs['cap_words'] = cap_word_vocab - - # 对target建vocab - target_vocab = Vocabulary(unknown=None, padding=None) - target_vocab.from_dataset(*data.datasets.values(), field_name=Const.TARGET) - target_vocab.index_dataset(*data.datasets.values(), field_name=Const.TARGET) - data.vocabs[Const.TARGET] = target_vocab - - for name, dataset in data.datasets.items(): - dataset.add_seq_len(Const.INPUT, new_field_name=Const.INPUT_LEN) - dataset.set_input(*input_fields) - dataset.set_target(*target_fields) - - return data - - -if __name__ == '__main__': - loader = OntoNoteNERDataLoader() - dataset = loader.load('/hdd/fudanNLP/fastNLP/others/data/v4/english/test.txt') - print(dataset.target.value_count()) - print(dataset[:4]) - - -""" -train 115812 2200752 -development 15680 304684 -test 12217 230111 - -train 92403 1901772 -valid 13606 279180 -test 10258 204135 -""" \ No newline at end of file diff --git a/reproduction/seqence_labelling/ner/data/utils.py b/reproduction/seqence_labelling/ner/data/utils.py deleted file mode 100644 index 8f7af792..00000000 --- a/reproduction/seqence_labelling/ner/data/utils.py +++ /dev/null @@ -1,49 +0,0 @@ -from typing import List - -def iob2(tags:List[str])->List[str]: - """ - 检查数据是否是合法的IOB数据,如果是IOB1会被自动转换为IOB2。 - - :param tags: 需要转换的tags - """ - for i, tag in enumerate(tags): - if tag == "O": - continue - split = tag.split("-") - if len(split) != 2 or split[0] not in ["I", "B"]: - raise TypeError("The encoding schema is not a valid IOB type.") - if split[0] == "B": - continue - elif i == 0 or tags[i - 1] == "O": # conversion IOB1 to 
IOB2 - tags[i] = "B" + tag[1:] - elif tags[i - 1][1:] == tag[1:]: - continue - else: # conversion IOB1 to IOB2 - tags[i] = "B" + tag[1:] - return tags - -def iob2bioes(tags:List[str])->List[str]: - """ - 将iob的tag转换为bmeso编码 - :param tags: - :return: - """ - new_tags = [] - for i, tag in enumerate(tags): - if tag == 'O': - new_tags.append(tag) - else: - split = tag.split('-')[0] - if split == 'B': - if i+1!=len(tags) and tags[i+1].split('-')[0] == 'I': - new_tags.append(tag) - else: - new_tags.append(tag.replace('B-', 'S-')) - elif split == 'I': - if i + 1= 3.7.3 ++ fastNLP >= dev.0.5.0 ++ pytorch >= 1.1.0 ++ numpy >= 1.16.4 ++ fitlog >= 0.2.0 +## 支持的数据集: ++ Resume,可以从[这里](https://github.com/jiesutd/LatticeLSTM)下载 ++ Ontonote ++ [Weibo](https://github.com/hltcoe/golden-horse) + +未包含的数据集可以通过提供增加类似 load_data.py 中 load_ontonotes4ner 这个输出格式的函数来增加对其的支持 +## 性能: +|数据集| 目前达到的F1分数(test)|原文中的F1分数(test)| +|:----:|:----:|:----:| +|Weibo|58.66|58.79| +|Resume|95.18|94.46| +|Ontonote|73.62|73.88| + +备注:Weibo数据集我用的是V2版本,也就是更新过的版本,根据杨杰博士Github上LatticeLSTM仓库里的某个issue,应该是一致的。 + +## 如有任何疑问请联系: ++ lixiaonan_xdu@outlook.com + +--- + +# Batch Parallel LatticeLSTM ++ paper:https://arxiv.org/abs/1805.02023 ++ when batch is 10,the computation efficiency exceeds that of [original code](https://github.com/jiesutd/LatticeLSTM)。 ++ set the path of embeddings and corpus before you run main.py ++ this code set has been added to fastNLP + +## Environment: ++ python >= 3.7.3 ++ fastNLP >= dev.0.5.0 ++ pytorch >= 1.1.0 ++ numpy >= 1.16.4 ++ fitlog >= 0.2.0 + +## Dataset: ++ Resume,downloaded from [here](https://github.com/jiesutd/LatticeLSTM) ++ Ontonote ++ [Weibo](https://github.com/hltcoe/golden-horse) + +to those unincluded dataset, you can write the interface function whose output form is like *load_ontonotes4ner* in load_data.py + +## Performance: +|Dataset|F1 of my code(test)|F1 in paper(test)| +|:----:|:----:|:----:| +|Weibo|58.66|58.79| +|Resume|95.18|94.46| +|Ontonote|73.62|73.88| + +PS:The Weibo dataset I use is V2, namely revised version. 
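The loaders in `load_data.py` added by this patch share a common pattern: read the corpus into fastNLP `DataSet`s, build and index `Vocabulary` objects, mark input/target fields, and return everything behind a `@cache_results` decorator. The exact output fields of `load_ontonotes4ner` are not shown in this diff, so the sketch below is only an assumed outline of a custom loader in that spirit (all names and cache paths are hypothetical):

from fastNLP import Vocabulary, cache_results
from fastNLP.io.data_loader import ConllLoader

@cache_results(_cache_fp='cache/my_ner.pkl', _refresh=False)
def load_my_ner(path):
    bundle = ConllLoader(['chars', 'target']).load(path)
    datasets = bundle.datasets                      # expects 'train', 'dev', 'test'

    char_vocab = Vocabulary().from_dataset(datasets['train'], field_name='chars')
    label_vocab = Vocabulary(padding=None, unknown=None).from_dataset(datasets['train'], field_name='target')
    char_vocab.index_dataset(*datasets.values(), field_name='chars')
    label_vocab.index_dataset(*datasets.values(), field_name='target')

    for ds in datasets.values():
        ds.add_seq_len('chars')
        ds.set_input('chars', 'seq_len', 'target')
        ds.set_target('target', 'seq_len')

    vocabs = {'char': char_vocab, 'label': label_vocab}
    embeddings = {}                                 # optionally StaticEmbedding objects
    return datasets, vocabs, embeddings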
+## If any confusion, please contact: ++ lixiaonan_xdu@outlook.com diff --git a/reproduction/sequence_labelling/chinese_ner/LatticeLSTM/load_data.py b/reproduction/sequence_labelling/chinese_ner/LatticeLSTM/load_data.py new file mode 100644 index 00000000..fcba17db --- /dev/null +++ b/reproduction/sequence_labelling/chinese_ner/LatticeLSTM/load_data.py @@ -0,0 +1,854 @@ +from fastNLP.io import CSVLoader +from fastNLP import Vocabulary +from fastNLP import Const +import numpy as np +import fitlog +import pickle +import os +from fastNLP.embeddings import StaticEmbedding +from fastNLP import cache_results + + +@cache_results(_cache_fp='mtl16', _refresh=False) +def load_16_task(dict_path): + ''' + + :param dict_path: /remote-home/txsun/fnlp/MTL-LT/data + :return: + ''' + task_path = os.path.join(dict_path,'data.pkl') + embedding_path = os.path.join(dict_path,'word_embedding.npy') + + embedding = np.load(embedding_path).astype(np.float32) + + task_list = pickle.load(open(task_path, 'rb'))['task_lst'] + + for t in task_list: + t.train_set.rename_field('words_idx', 'words') + t.dev_set.rename_field('words_idx', 'words') + t.test_set.rename_field('words_idx', 'words') + + t.train_set.rename_field('label', 'target') + t.dev_set.rename_field('label', 'target') + t.test_set.rename_field('label', 'target') + + t.train_set.add_seq_len('words') + t.dev_set.add_seq_len('words') + t.test_set.add_seq_len('words') + + t.train_set.set_input(Const.INPUT, Const.INPUT_LEN) + t.dev_set.set_input(Const.INPUT, Const.INPUT_LEN) + t.test_set.set_input(Const.INPUT, Const.INPUT_LEN) + + return task_list,embedding + + +@cache_results(_cache_fp='SST2', _refresh=False) +def load_sst2(dict_path,embedding_path=None): + ''' + + :param dict_path: /remote-home/xnli/data/corpus/text_classification/SST-2/ + :param embedding_path: glove 300d txt + :return: + ''' + train_path = os.path.join(dict_path,'train.tsv') + dev_path = os.path.join(dict_path,'dev.tsv') + + loader = CSVLoader(headers=('words', 'target'), sep='\t') + train_data = loader.load(train_path).datasets['train'] + dev_data = loader.load(dev_path).datasets['train'] + + train_data.apply_field(lambda x: x.split(), field_name='words', new_field_name='words') + dev_data.apply_field(lambda x: x.split(), field_name='words', new_field_name='words') + + train_data.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len') + dev_data.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len') + + vocab = Vocabulary(min_freq=2) + vocab.from_dataset(train_data, field_name='words') + vocab.from_dataset(dev_data, field_name='words') + + # pretrained_embedding = load_word_emb(embedding_path, 300, vocab) + + label_vocab = Vocabulary(padding=None, unknown=None).from_dataset(train_data, field_name='target') + + label_vocab.index_dataset(train_data, field_name='target') + label_vocab.index_dataset(dev_data, field_name='target') + + vocab.index_dataset(train_data, field_name='words', new_field_name='words') + vocab.index_dataset(dev_data, field_name='words', new_field_name='words') + + train_data.set_input(Const.INPUT, Const.INPUT_LEN) + train_data.set_target(Const.TARGET) + + dev_data.set_input(Const.INPUT, Const.INPUT_LEN) + dev_data.set_target(Const.TARGET) + + if embedding_path is not None: + pretrained_embedding = load_word_emb(embedding_path, 300, vocab) + return (train_data,dev_data),(vocab,label_vocab),pretrained_embedding + + else: + return (train_data,dev_data),(vocab,label_vocab) + +@cache_results(_cache_fp='OntonotesPOS', _refresh=False) 
+def load_conllized_ontonote_POS(path,embedding_path=None): + from fastNLP.io.data_loader import ConllLoader + header2index = {'words':3,'POS':4,'NER':10} + headers = ['words','POS'] + + if 'NER' in headers: + print('警告!通过 load_conllized_ontonote 函数读出来的NER标签不是BIOS,是纯粹的conll格式,是错误的!') + indexes = list(map(lambda x:header2index[x],headers)) + + loader = ConllLoader(headers,indexes) + + bundle = loader.load(path) + + # print(bundle.datasets) + + train_set = bundle.datasets['train'] + dev_set = bundle.datasets['dev'] + test_set = bundle.datasets['test'] + + + + + # train_set = loader.load(os.path.join(path,'train.txt')) + # dev_set = loader.load(os.path.join(path, 'dev.txt')) + # test_set = loader.load(os.path.join(path, 'test.txt')) + + # print(len(train_set)) + + train_set.add_seq_len('words','seq_len') + dev_set.add_seq_len('words','seq_len') + test_set.add_seq_len('words','seq_len') + + + + # print(dataset['POS']) + + vocab = Vocabulary(min_freq=1) + vocab.from_dataset(train_set,field_name='words') + vocab.from_dataset(dev_set, field_name='words') + vocab.from_dataset(test_set, field_name='words') + + vocab.index_dataset(train_set,field_name='words') + vocab.index_dataset(dev_set, field_name='words') + vocab.index_dataset(test_set, field_name='words') + + + + + label_vocab_dict = {} + + for i,h in enumerate(headers): + if h == 'words': + continue + label_vocab_dict[h] = Vocabulary(min_freq=1,padding=None,unknown=None) + label_vocab_dict[h].from_dataset(train_set,field_name=h) + + label_vocab_dict[h].index_dataset(train_set,field_name=h) + label_vocab_dict[h].index_dataset(dev_set,field_name=h) + label_vocab_dict[h].index_dataset(test_set,field_name=h) + + train_set.set_input(Const.INPUT, Const.INPUT_LEN) + train_set.set_target(headers[1]) + + dev_set.set_input(Const.INPUT, Const.INPUT_LEN) + dev_set.set_target(headers[1]) + + test_set.set_input(Const.INPUT, Const.INPUT_LEN) + test_set.set_target(headers[1]) + + if len(headers) > 2: + print('警告:由于任务数量大于1,所以需要每次手动设置target!') + + + print('train:',len(train_set),'dev:',len(dev_set),'test:',len(test_set)) + + if embedding_path is not None: + pretrained_embedding = load_word_emb(embedding_path, 300, vocab) + return (train_set,dev_set,test_set),(vocab,label_vocab_dict),pretrained_embedding + else: + return (train_set, dev_set, test_set), (vocab, label_vocab_dict) + + +@cache_results(_cache_fp='OntonotesNER', _refresh=False) +def load_conllized_ontonote_NER(path,embedding_path=None): + from fastNLP.io.pipe.conll import OntoNotesNERPipe + ontoNotesNERPipe = OntoNotesNERPipe(lower=True,target_pad_val=-100) + bundle_NER = ontoNotesNERPipe.process_from_file(path) + + train_set_NER = bundle_NER.datasets['train'] + dev_set_NER = bundle_NER.datasets['dev'] + test_set_NER = bundle_NER.datasets['test'] + + train_set_NER.add_seq_len('words','seq_len') + dev_set_NER.add_seq_len('words','seq_len') + test_set_NER.add_seq_len('words','seq_len') + + + NER_vocab = bundle_NER.get_vocab('target') + word_vocab = bundle_NER.get_vocab('words') + + if embedding_path is not None: + + embed = StaticEmbedding(vocab=word_vocab, model_dir_or_name=embedding_path, word_dropout=0.01, + dropout=0.5,lower=True) + + + # pretrained_embedding = load_word_emb(embedding_path, 300, word_vocab) + return (train_set_NER,dev_set_NER,test_set_NER),\ + (word_vocab,NER_vocab),embed + else: + return (train_set_NER, dev_set_NER, test_set_NER), (NER_vocab, word_vocab) + +@cache_results(_cache_fp='OntonotesPOSNER', _refresh=False) + +def load_conllized_ontonote_NER_POS(path,embedding_path=None): 
+ from fastNLP.io.pipe.conll import OntoNotesNERPipe + ontoNotesNERPipe = OntoNotesNERPipe(lower=True) + bundle_NER = ontoNotesNERPipe.process_from_file(path) + + train_set_NER = bundle_NER.datasets['train'] + dev_set_NER = bundle_NER.datasets['dev'] + test_set_NER = bundle_NER.datasets['test'] + + NER_vocab = bundle_NER.get_vocab('target') + word_vocab = bundle_NER.get_vocab('words') + + (train_set_POS,dev_set_POS,test_set_POS),(_,POS_vocab) = load_conllized_ontonote_POS(path) + POS_vocab = POS_vocab['POS'] + + train_set_NER.add_field('pos',train_set_POS['POS'],is_target=True) + dev_set_NER.add_field('pos', dev_set_POS['POS'], is_target=True) + test_set_NER.add_field('pos', test_set_POS['POS'], is_target=True) + + if train_set_NER.has_field('target'): + train_set_NER.rename_field('target','ner') + + if dev_set_NER.has_field('target'): + dev_set_NER.rename_field('target','ner') + + if test_set_NER.has_field('target'): + test_set_NER.rename_field('target','ner') + + + + if train_set_NER.has_field('pos'): + train_set_NER.rename_field('pos','posid') + if dev_set_NER.has_field('pos'): + dev_set_NER.rename_field('pos','posid') + if test_set_NER.has_field('pos'): + test_set_NER.rename_field('pos','posid') + + if train_set_NER.has_field('ner'): + train_set_NER.rename_field('ner','nerid') + if dev_set_NER.has_field('ner'): + dev_set_NER.rename_field('ner','nerid') + if test_set_NER.has_field('ner'): + test_set_NER.rename_field('ner','nerid') + + if embedding_path is not None: + + embed = StaticEmbedding(vocab=word_vocab, model_dir_or_name=embedding_path, word_dropout=0.01, + dropout=0.5,lower=True) + + return (train_set_NER,dev_set_NER,test_set_NER),\ + (word_vocab,POS_vocab,NER_vocab),embed + else: + return (train_set_NER, dev_set_NER, test_set_NER), (NER_vocab, word_vocab) + +@cache_results(_cache_fp='Ontonotes3', _refresh=True) +def load_conllized_ontonote_pkl(path,embedding_path=None): + + data_bundle = pickle.load(open(path,'rb')) + train_set = data_bundle.datasets['train'] + dev_set = data_bundle.datasets['dev'] + test_set = data_bundle.datasets['test'] + + train_set.rename_field('pos','posid') + train_set.rename_field('ner','nerid') + train_set.rename_field('chunk','chunkid') + + dev_set.rename_field('pos','posid') + dev_set.rename_field('ner','nerid') + dev_set.rename_field('chunk','chunkid') + + test_set.rename_field('pos','posid') + test_set.rename_field('ner','nerid') + test_set.rename_field('chunk','chunkid') + + + word_vocab = data_bundle.vocabs['words'] + pos_vocab = data_bundle.vocabs['pos'] + ner_vocab = data_bundle.vocabs['ner'] + chunk_vocab = data_bundle.vocabs['chunk'] + + + if embedding_path is not None: + + embed = StaticEmbedding(vocab=word_vocab, model_dir_or_name=embedding_path, word_dropout=0.01, + dropout=0.5,lower=True) + + return (train_set,dev_set,test_set),\ + (word_vocab,pos_vocab,ner_vocab,chunk_vocab),embed + else: + return (train_set, dev_set, test_set), (word_vocab,ner_vocab) + # print(data_bundle) + + + + + + + + + + +# @cache_results(_cache_fp='Conll2003', _refresh=False) +# def load_conll_2003(path,embedding_path=None): +# f = open(path, 'rb') +# data_pkl = pickle.load(f) +# +# task_lst = data_pkl['task_lst'] +# vocabs = data_pkl['vocabs'] +# # word_vocab = vocabs['words'] +# # pos_vocab = vocabs['pos'] +# # chunk_vocab = vocabs['chunk'] +# # ner_vocab = vocabs['ner'] +# +# if embedding_path is not None: +# embed = StaticEmbedding(vocab=vocabs['words'], model_dir_or_name=embedding_path, word_dropout=0.01, +# dropout=0.5) +# return task_lst,vocabs,embed +# 
else: +# return task_lst,vocabs + +# @cache_results(_cache_fp='Conll2003_mine', _refresh=False) +@cache_results(_cache_fp='Conll2003_mine_embed_100', _refresh=True) +def load_conll_2003_mine(path,embedding_path=None,pad_val=-100): + f = open(path, 'rb') + + data_pkl = pickle.load(f) + # print(data_pkl) + # print(data_pkl) + train_set = data_pkl[0]['train'] + dev_set = data_pkl[0]['dev'] + test_set = data_pkl[0]['test'] + + train_set.set_pad_val('posid',pad_val) + train_set.set_pad_val('nerid', pad_val) + train_set.set_pad_val('chunkid', pad_val) + + dev_set.set_pad_val('posid',pad_val) + dev_set.set_pad_val('nerid', pad_val) + dev_set.set_pad_val('chunkid', pad_val) + + test_set.set_pad_val('posid',pad_val) + test_set.set_pad_val('nerid', pad_val) + test_set.set_pad_val('chunkid', pad_val) + + if train_set.has_field('task_id'): + + train_set.delete_field('task_id') + + if dev_set.has_field('task_id'): + dev_set.delete_field('task_id') + + if test_set.has_field('task_id'): + test_set.delete_field('task_id') + + if train_set.has_field('words_idx'): + train_set.rename_field('words_idx','words') + + if dev_set.has_field('words_idx'): + dev_set.rename_field('words_idx','words') + + if test_set.has_field('words_idx'): + test_set.rename_field('words_idx','words') + + + + word_vocab = data_pkl[1]['words'] + pos_vocab = data_pkl[1]['pos'] + ner_vocab = data_pkl[1]['ner'] + chunk_vocab = data_pkl[1]['chunk'] + + if embedding_path is not None: + embed = StaticEmbedding(vocab=word_vocab, model_dir_or_name=embedding_path, word_dropout=0.01, + dropout=0.5,lower=True) + return (train_set,dev_set,test_set),(word_vocab,pos_vocab,ner_vocab,chunk_vocab),embed + else: + return (train_set,dev_set,test_set),(word_vocab,pos_vocab,ner_vocab,chunk_vocab) + + +def load_conllized_ontonote_pkl_yf(path): + def init_task(task): + task_name = task.task_name + for ds in [task.train_set, task.dev_set, task.test_set]: + if ds.has_field('words'): + ds.rename_field('words', 'x') + else: + ds.rename_field('words_idx', 'x') + if ds.has_field('label'): + ds.rename_field('label', 'y') + else: + ds.rename_field(task_name, 'y') + ds.set_input('x', 'y', 'task_id') + ds.set_target('y') + + if task_name in ['ner', 'chunk'] or 'pos' in task_name: + ds.set_input('seq_len') + ds.set_target('seq_len') + return task + #/remote-home/yfshao/workdir/datasets/conll03/data.pkl + def pload(fn): + with open(fn, 'rb') as f: + return pickle.load(f) + + DB = pload(path) + task_lst = DB['task_lst'] + vocabs = DB['vocabs'] + task_lst = [init_task(task) for task in task_lst] + + return task_lst, vocabs + + +@cache_results(_cache_fp='weiboNER old uni+bi', _refresh=False) +def load_weibo_ner_old(path,unigram_embedding_path=None,bigram_embedding_path=None,index_token=True, + normlize={'char':True,'bigram':True,'word':False}): + from fastNLP.io.data_loader import ConllLoader + from utils import get_bigrams + + loader = ConllLoader(['chars','target']) + # from fastNLP.io.file_reader import _read_conll + # from fastNLP.core import Instance,DataSet + # def _load(path): + # ds = DataSet() + # for idx, data in _read_conll(path, indexes=loader.indexes, dropna=loader.dropna, + # encoding='ISO-8859-1'): + # ins = {h: data[i] for i, h in enumerate(loader.headers)} + # ds.append(Instance(**ins)) + # return ds + # from fastNLP.io.utils import check_loader_paths + # paths = check_loader_paths(path) + # datasets = {name: _load(path) for name, path in paths.items()} + datasets = {} + train_path = os.path.join(path,'train.all.bmes') + dev_path = 
os.path.join(path,'dev.all.bmes') + test_path = os.path.join(path,'test.all.bmes') + datasets['train'] = loader.load(train_path).datasets['train'] + datasets['dev'] = loader.load(dev_path).datasets['train'] + datasets['test'] = loader.load(test_path).datasets['train'] + + for k,v in datasets.items(): + print('{}:{}'.format(k,len(v))) + + vocabs = {} + word_vocab = Vocabulary() + bigram_vocab = Vocabulary() + label_vocab = Vocabulary(padding=None,unknown=None) + + for k,v in datasets.items(): + # ignore the word segmentation tag + v.apply_field(lambda x: [w[0] for w in x],'chars','chars') + v.apply_field(get_bigrams,'chars','bigrams') + + + word_vocab.from_dataset(datasets['train'],field_name='chars',no_create_entry_dataset=[datasets['dev'],datasets['test']]) + label_vocab.from_dataset(datasets['train'],field_name='target') + print('label_vocab:{}\n{}'.format(len(label_vocab),label_vocab.idx2word)) + + + for k,v in datasets.items(): + # v.set_pad_val('target',-100) + v.add_seq_len('chars',new_field_name='seq_len') + + + vocabs['char'] = word_vocab + vocabs['label'] = label_vocab + + + bigram_vocab.from_dataset(datasets['train'],field_name='bigrams',no_create_entry_dataset=[datasets['dev'],datasets['test']]) + if index_token: + word_vocab.index_dataset(*list(datasets.values()), field_name='raw_words', new_field_name='words') + bigram_vocab.index_dataset(*list(datasets.values()),field_name='raw_bigrams',new_field_name='bigrams') + label_vocab.index_dataset(*list(datasets.values()), field_name='raw_target', new_field_name='target') + + # for k,v in datasets.items(): + # v.set_input('chars','bigrams','seq_len','target') + # v.set_target('target','seq_len') + + vocabs['bigram'] = bigram_vocab + + embeddings = {} + + if unigram_embedding_path is not None: + unigram_embedding = StaticEmbedding(word_vocab, model_dir_or_name=unigram_embedding_path, + word_dropout=0.01,normalize=normlize['char']) + embeddings['char'] = unigram_embedding + + if bigram_embedding_path is not None: + bigram_embedding = StaticEmbedding(bigram_vocab, model_dir_or_name=bigram_embedding_path, + word_dropout=0.01,normalize=normlize['bigram']) + embeddings['bigram'] = bigram_embedding + + return datasets, vocabs, embeddings + + +@cache_results(_cache_fp='weiboNER uni+bi', _refresh=False) +def load_weibo_ner(path,unigram_embedding_path=None,bigram_embedding_path=None,index_token=True, + normlize={'char':True,'bigram':True,'word':False}): + from fastNLP.io.loader import ConllLoader + from utils import get_bigrams + + loader = ConllLoader(['chars','target']) + bundle = loader.load(path) + + datasets = bundle.datasets + for k,v in datasets.items(): + print('{}:{}'.format(k,len(v))) + # print(*list(datasets.keys())) + vocabs = {} + word_vocab = Vocabulary() + bigram_vocab = Vocabulary() + label_vocab = Vocabulary(padding=None,unknown=None) + + for k,v in datasets.items(): + # ignore the word segmentation tag + v.apply_field(lambda x: [w[0] for w in x],'chars','chars') + v.apply_field(get_bigrams,'chars','bigrams') + + + word_vocab.from_dataset(datasets['train'],field_name='chars',no_create_entry_dataset=[datasets['dev'],datasets['test']]) + label_vocab.from_dataset(datasets['train'],field_name='target') + print('label_vocab:{}\n{}'.format(len(label_vocab),label_vocab.idx2word)) + + + for k,v in datasets.items(): + # v.set_pad_val('target',-100) + v.add_seq_len('chars',new_field_name='seq_len') + + + vocabs['char'] = word_vocab + vocabs['label'] = label_vocab + + + 
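+ # the bigram vocabulary is likewise built from the training split only; dev/test-only bigrams are registered via no_create_entry_dataset so that StaticEmbedding can back them with pretrained vectors (or map them to <unk>) rather than learning new entries for them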
bigram_vocab.from_dataset(datasets['train'],field_name='bigrams',no_create_entry_dataset=[datasets['dev'],datasets['test']]) + if index_token: + word_vocab.index_dataset(*list(datasets.values()), field_name='raw_words', new_field_name='words') + bigram_vocab.index_dataset(*list(datasets.values()),field_name='raw_bigrams',new_field_name='bigrams') + label_vocab.index_dataset(*list(datasets.values()), field_name='raw_target', new_field_name='target') + + # for k,v in datasets.items(): + # v.set_input('chars','bigrams','seq_len','target') + # v.set_target('target','seq_len') + + vocabs['bigram'] = bigram_vocab + + embeddings = {} + + if unigram_embedding_path is not None: + unigram_embedding = StaticEmbedding(word_vocab, model_dir_or_name=unigram_embedding_path, + word_dropout=0.01,normalize=normlize['char']) + embeddings['char'] = unigram_embedding + + if bigram_embedding_path is not None: + bigram_embedding = StaticEmbedding(bigram_vocab, model_dir_or_name=bigram_embedding_path, + word_dropout=0.01,normalize=normlize['bigram']) + embeddings['bigram'] = bigram_embedding + + return datasets, vocabs, embeddings + + + +# datasets,vocabs = load_weibo_ner('/remote-home/xnli/data/corpus/sequence_labelling/ner_weibo') +# +# print(datasets['train'][:5]) +# print(vocabs['word'].idx2word) +# print(vocabs['target'].idx2word) + + +@cache_results(_cache_fp='cache/ontonotes4ner',_refresh=False) +def load_ontonotes4ner(path,char_embedding_path=None,bigram_embedding_path=None,index_token=True, + normalize={'char':True,'bigram':True,'word':False}): + from fastNLP.io.loader import ConllLoader + from utils import get_bigrams + + train_path = os.path.join(path,'train.char.bmes') + dev_path = os.path.join(path,'dev.char.bmes') + test_path = os.path.join(path,'test.char.bmes') + + loader = ConllLoader(['chars','target']) + train_bundle = loader.load(train_path) + dev_bundle = loader.load(dev_path) + test_bundle = loader.load(test_path) + + + datasets = dict() + datasets['train'] = train_bundle.datasets['train'] + datasets['dev'] = dev_bundle.datasets['train'] + datasets['test'] = test_bundle.datasets['train'] + + + datasets['train'].apply_field(get_bigrams,field_name='chars',new_field_name='bigrams') + datasets['dev'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams') + datasets['test'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams') + + datasets['train'].add_seq_len('chars') + datasets['dev'].add_seq_len('chars') + datasets['test'].add_seq_len('chars') + + + + char_vocab = Vocabulary() + bigram_vocab = Vocabulary() + label_vocab = Vocabulary(padding=None,unknown=None) + print(datasets.keys()) + print(len(datasets['dev'])) + print(len(datasets['test'])) + print(len(datasets['train'])) + char_vocab.from_dataset(datasets['train'],field_name='chars', + no_create_entry_dataset=[datasets['dev'],datasets['test']] ) + bigram_vocab.from_dataset(datasets['train'],field_name='bigrams', + no_create_entry_dataset=[datasets['dev'],datasets['test']]) + label_vocab.from_dataset(datasets['train'],field_name='target') + if index_token: + char_vocab.index_dataset(datasets['train'],datasets['dev'],datasets['test'], + field_name='chars',new_field_name='chars') + bigram_vocab.index_dataset(datasets['train'],datasets['dev'],datasets['test'], + field_name='bigrams',new_field_name='bigrams') + label_vocab.index_dataset(datasets['train'],datasets['dev'],datasets['test'], + field_name='target',new_field_name='target') + + vocabs = {} + vocabs['char'] = char_vocab + vocabs['label'] = label_vocab + 
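+ # note: index_token=True would overwrite 'chars'/'bigrams'/'target' with vocabulary ids right here; main.py calls this loader with index_token=False and lets equip_chinese_ner_with_skip() do the indexing once the lexicon fields have been added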
vocabs['bigram'] = bigram_vocab + vocabs['label'] = label_vocab + + embeddings = {} + if char_embedding_path is not None: + char_embedding = StaticEmbedding(char_vocab,char_embedding_path,word_dropout=0.01, + normalize=normalize['char']) + embeddings['char'] = char_embedding + + if bigram_embedding_path is not None: + bigram_embedding = StaticEmbedding(bigram_vocab,bigram_embedding_path,word_dropout=0.01, + normalize=normalize['bigram']) + embeddings['bigram'] = bigram_embedding + + return datasets,vocabs,embeddings + + + +@cache_results(_cache_fp='cache/resume_ner',_refresh=False) +def load_resume_ner(path,char_embedding_path=None,bigram_embedding_path=None,index_token=True, + normalize={'char':True,'bigram':True,'word':False}): + from fastNLP.io.data_loader import ConllLoader + from utils import get_bigrams + + train_path = os.path.join(path,'train.char.bmes') + dev_path = os.path.join(path,'dev.char.bmes') + test_path = os.path.join(path,'test.char.bmes') + + loader = ConllLoader(['chars','target']) + train_bundle = loader.load(train_path) + dev_bundle = loader.load(dev_path) + test_bundle = loader.load(test_path) + + + datasets = dict() + datasets['train'] = train_bundle.datasets['train'] + datasets['dev'] = dev_bundle.datasets['train'] + datasets['test'] = test_bundle.datasets['train'] + + + datasets['train'].apply_field(get_bigrams,field_name='chars',new_field_name='bigrams') + datasets['dev'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams') + datasets['test'].apply_field(get_bigrams, field_name='chars', new_field_name='bigrams') + + datasets['train'].add_seq_len('chars') + datasets['dev'].add_seq_len('chars') + datasets['test'].add_seq_len('chars') + + + + char_vocab = Vocabulary() + bigram_vocab = Vocabulary() + label_vocab = Vocabulary(padding=None,unknown=None) + print(datasets.keys()) + print(len(datasets['dev'])) + print(len(datasets['test'])) + print(len(datasets['train'])) + char_vocab.from_dataset(datasets['train'],field_name='chars', + no_create_entry_dataset=[datasets['dev'],datasets['test']] ) + bigram_vocab.from_dataset(datasets['train'],field_name='bigrams', + no_create_entry_dataset=[datasets['dev'],datasets['test']]) + label_vocab.from_dataset(datasets['train'],field_name='target') + if index_token: + char_vocab.index_dataset(datasets['train'],datasets['dev'],datasets['test'], + field_name='chars',new_field_name='chars') + bigram_vocab.index_dataset(datasets['train'],datasets['dev'],datasets['test'], + field_name='bigrams',new_field_name='bigrams') + label_vocab.index_dataset(datasets['train'],datasets['dev'],datasets['test'], + field_name='target',new_field_name='target') + + vocabs = {} + vocabs['char'] = char_vocab + vocabs['label'] = label_vocab + vocabs['bigram'] = bigram_vocab + + embeddings = {} + if char_embedding_path is not None: + char_embedding = StaticEmbedding(char_vocab,char_embedding_path,word_dropout=0.01,normalize=normalize['char']) + embeddings['char'] = char_embedding + + if bigram_embedding_path is not None: + bigram_embedding = StaticEmbedding(bigram_vocab,bigram_embedding_path,word_dropout=0.01,normalize=normalize['bigram']) + embeddings['bigram'] = bigram_embedding + + return datasets,vocabs,embeddings + + +@cache_results(_cache_fp='need_to_defined_fp',_refresh=False) +def equip_chinese_ner_with_skip(datasets,vocabs,embeddings,w_list,word_embedding_path=None, + normalize={'char':True,'bigram':True,'word':False}): + from utils_ import Trie,get_skip_path + from functools import partial + w_trie = Trie() + for w in w_list: + 
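+ # each lexicon word is inserted into the trie; get_skip_path(chars, w_trie) (from utils_) then appears to return [start, end, word] triples for every lexicon word matched in a sentence
+ # e.g., assuming '南京市' is in w_list, a sentence starting with 南 京 市 would yield the triple [0, 2, '南京市'], which skips2skips_l2r below files under end position 2 as [0, '南京市']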
w_trie.insert(w) + + # for k,v in datasets.items(): + # v.apply_field(partial(get_skip_path,w_trie=w_trie),'chars','skips') + + def skips2skips_l2r(chars,w_trie): + ''' + + :param lexicons: list[[int,int,str]] + :return: skips_l2r + ''' + # print(lexicons) + # print('******') + + lexicons = get_skip_path(chars,w_trie=w_trie) + + + # max_len = max(list(map(lambda x:max(x[:2]),lexicons)))+1 if len(lexicons) != 0 else 0 + + result = [[] for _ in range(len(chars))] + + for lex in lexicons: + s = lex[0] + e = lex[1] + w = lex[2] + + result[e].append([s,w]) + + return result + + def skips2skips_r2l(chars,w_trie): + ''' + + :param lexicons: list[[int,int,str]] + :return: skips_l2r + ''' + # print(lexicons) + # print('******') + + lexicons = get_skip_path(chars,w_trie=w_trie) + + + # max_len = max(list(map(lambda x:max(x[:2]),lexicons)))+1 if len(lexicons) != 0 else 0 + + result = [[] for _ in range(len(chars))] + + for lex in lexicons: + s = lex[0] + e = lex[1] + w = lex[2] + + result[s].append([e,w]) + + return result + + for k,v in datasets.items(): + v.apply_field(partial(skips2skips_l2r,w_trie=w_trie),'chars','skips_l2r') + + for k,v in datasets.items(): + v.apply_field(partial(skips2skips_r2l,w_trie=w_trie),'chars','skips_r2l') + + # print(v['skips_l2r'][0]) + word_vocab = Vocabulary() + word_vocab.add_word_lst(w_list) + vocabs['word'] = word_vocab + for k,v in datasets.items(): + v.apply_field(lambda x:[ list(map(lambda x:x[0],p)) for p in x],'skips_l2r','skips_l2r_source') + v.apply_field(lambda x:[ list(map(lambda x:x[1],p)) for p in x], 'skips_l2r', 'skips_l2r_word') + + for k,v in datasets.items(): + v.apply_field(lambda x:[ list(map(lambda x:x[0],p)) for p in x],'skips_r2l','skips_r2l_source') + v.apply_field(lambda x:[ list(map(lambda x:x[1],p)) for p in x], 'skips_r2l', 'skips_r2l_word') + + for k,v in datasets.items(): + v.apply_field(lambda x:list(map(len,x)), 'skips_l2r_word', 'lexicon_count') + v.apply_field(lambda x: + list(map(lambda y: + list(map(lambda z:word_vocab.to_index(z),y)),x)), + 'skips_l2r_word',new_field_name='skips_l2r_word') + + v.apply_field(lambda x:list(map(len,x)), 'skips_r2l_word', 'lexicon_count_back') + + v.apply_field(lambda x: + list(map(lambda y: + list(map(lambda z:word_vocab.to_index(z),y)),x)), + 'skips_r2l_word',new_field_name='skips_r2l_word') + + + + + + if word_embedding_path is not None: + word_embedding = StaticEmbedding(word_vocab,word_embedding_path,word_dropout=0,normalize=normalize['word']) + embeddings['word'] = word_embedding + + vocabs['char'].index_dataset(datasets['train'], datasets['dev'], datasets['test'], + field_name='chars', new_field_name='chars') + vocabs['bigram'].index_dataset(datasets['train'], datasets['dev'], datasets['test'], + field_name='bigrams', new_field_name='bigrams') + vocabs['label'].index_dataset(datasets['train'], datasets['dev'], datasets['test'], + field_name='target', new_field_name='target') + + return datasets,vocabs,embeddings + + + +@cache_results(_cache_fp='cache/load_yangjie_rich_pretrain_word_list',_refresh=False) +def load_yangjie_rich_pretrain_word_list(embedding_path,drop_characters=True): + f = open(embedding_path,'r') + lines = f.readlines() + w_list = [] + for line in lines: + splited = line.strip().split(' ') + w = splited[0] + w_list.append(w) + + if drop_characters: + w_list = list(filter(lambda x:len(x) != 1, w_list)) + + return w_list + + + +# from pathes import * +# +# datasets,vocabs,embeddings = load_ontonotes4ner(ontonote4ner_cn_path, +# 
yangjie_rich_pretrain_unigram_path,yangjie_rich_pretrain_bigram_path) +# print(datasets.keys()) +# print(vocabs.keys()) +# print(embeddings) +# yangjie_rich_pretrain_word_path +# datasets['train'].set_pad_val \ No newline at end of file diff --git a/reproduction/sequence_labelling/chinese_ner/LatticeLSTM/main.py b/reproduction/sequence_labelling/chinese_ner/LatticeLSTM/main.py new file mode 100644 index 00000000..a2df5a91 --- /dev/null +++ b/reproduction/sequence_labelling/chinese_ner/LatticeLSTM/main.py @@ -0,0 +1,205 @@ +import torch.nn as nn +# print(1111111111) +# from pathes import * +from load_data import load_ontonotes4ner,equip_chinese_ner_with_skip,load_yangjie_rich_pretrain_word_list,\ + load_resume_ner,load_weibo_ner,load_weibo_ner_old +from fastNLP.embeddings import StaticEmbedding +from models import LatticeLSTM_SeqLabel,LSTM_SeqLabel,LatticeLSTM_SeqLabel_V1 +from fastNLP import CrossEntropyLoss,SpanFPreRecMetric,Trainer,AccuracyMetric,LossInForward +import torch.optim as optim +import argparse +import torch +import sys +from utils_ import LatticeLexiconPadder,SpanFPreRecMetric_YJ +from fastNLP import Tester +import fitlog +from fastNLP.core.callback import FitlogCallback +from utils import set_seed +import os +from fastNLP import LRScheduler +from torch.optim.lr_scheduler import LambdaLR + +parser = argparse.ArgumentParser() +parser.add_argument('--device',default='cuda:1') +parser.add_argument('--debug',default=False) + +parser.add_argument('--norm_embed',default=False) +parser.add_argument('--batch',default=1) +parser.add_argument('--test_batch',default=1024) +parser.add_argument('--optim',default='sgd',help='adam|sgd') +parser.add_argument('--lr',default=0.045) +parser.add_argument('--model',default='lattice',help='lattice|lstm') +parser.add_argument('--skip_before_head',default=False)#in paper it's false +parser.add_argument('--hidden',default=113) +parser.add_argument('--momentum',default=0) +parser.add_argument('--bi',default=True) +parser.add_argument('--dataset',default='weibo',help='resume|ontonote|weibo|msra') +parser.add_argument('--use_bigram',default=True) + +parser.add_argument('--embed_dropout',default=0.5) +parser.add_argument('--gaz_dropout',default=-1) +parser.add_argument('--output_dropout',default=0.5) +parser.add_argument('--epoch',default=100) +parser.add_argument('--seed',default=100) + +args = parser.parse_args() + +set_seed(args.seed) + +fit_msg_list = [args.model,'bi' if args.bi else 'uni',str(args.batch)] +if args.model == 'lattice': + fit_msg_list.append(str(args.skip_before_head)) +fit_msg = ' '.join(fit_msg_list) +fitlog.commit(__file__,fit_msg=fit_msg) + +device = torch.device(args.device) +for k,v in args.__dict__.items(): + print(k,v) + +refresh_data = False + + +from pathes import * +# ontonote4ner_cn_path = 0 +# yangjie_rich_pretrain_unigram_path = 0 +# yangjie_rich_pretrain_bigram_path = 0 +# resume_ner_path = 0 +# weibo_ner_path = 0 + +if args.dataset == 'ontonote': + datasets,vocabs,embeddings = load_ontonotes4ner(ontonote4ner_cn_path,yangjie_rich_pretrain_unigram_path,yangjie_rich_pretrain_bigram_path, + _refresh=refresh_data,index_token=False, + ) +elif args.dataset == 'resume': + datasets,vocabs,embeddings = load_resume_ner(resume_ner_path,yangjie_rich_pretrain_unigram_path,yangjie_rich_pretrain_bigram_path, + _refresh=refresh_data,index_token=False, + ) +elif args.dataset == 'weibo': + datasets,vocabs,embeddings = load_weibo_ner(weibo_ner_path,yangjie_rich_pretrain_unigram_path,yangjie_rich_pretrain_bigram_path, + 
_refresh=refresh_data,index_token=False, + ) + +elif args.dataset == 'weibo_old': + datasets,vocabs,embeddings = load_weibo_ner_old(weibo_ner_old_path,yangjie_rich_pretrain_unigram_path,yangjie_rich_pretrain_bigram_path, + _refresh=refresh_data,index_token=False, + ) +if args.dataset == 'ontonote': + args.batch = 10 + args.lr = 0.045 +elif args.dataset == 'resume': + args.batch = 1 + args.lr = 0.015 +elif args.dataset == 'weibo': + args.batch = 10 + args.gaz_dropout = 0.1 + args.embed_dropout = 0.1 + args.output_dropout = 0.1 +elif args.dataset == 'weibo_old': + args.embed_dropout = 0.1 + args.output_dropout = 0.1 + +if args.gaz_dropout < 0: + args.gaz_dropout = args.embed_dropout + +fitlog.add_hyper(args) +w_list = load_yangjie_rich_pretrain_word_list(yangjie_rich_pretrain_word_path, + _refresh=refresh_data) + +cache_name = os.path.join('cache',args.dataset+'_lattice') +datasets,vocabs,embeddings = equip_chinese_ner_with_skip(datasets,vocabs,embeddings,w_list,yangjie_rich_pretrain_word_path, + _refresh=refresh_data,_cache_fp=cache_name) + +print(datasets['train'][0]) +print('vocab info:') +for k,v in vocabs.items(): + print('{}:{}'.format(k,len(v))) + +for k,v in datasets.items(): + if args.model == 'lattice': + v.set_ignore_type('skips_l2r_word','skips_l2r_source','skips_r2l_word', 'skips_r2l_source') + if args.skip_before_head: + v.set_padder('skips_l2r_word',LatticeLexiconPadder()) + v.set_padder('skips_l2r_source',LatticeLexiconPadder()) + v.set_padder('skips_r2l_word',LatticeLexiconPadder()) + v.set_padder('skips_r2l_source',LatticeLexiconPadder(pad_val_dynamic=True)) + else: + v.set_padder('skips_l2r_word',LatticeLexiconPadder()) + v.set_padder('skips_r2l_word', LatticeLexiconPadder()) + v.set_padder('skips_l2r_source', LatticeLexiconPadder(-1)) + v.set_padder('skips_r2l_source', LatticeLexiconPadder(pad_val_dynamic=True,dynamic_offset=1)) + if args.bi: + v.set_input('chars','bigrams','seq_len', + 'skips_l2r_word','skips_l2r_source','lexicon_count', + 'skips_r2l_word', 'skips_r2l_source','lexicon_count_back', + 'target', + use_1st_ins_infer_dim_type=True) + else: + v.set_input('chars','bigrams','seq_len', + 'skips_l2r_word','skips_l2r_source','lexicon_count', + 'target', + use_1st_ins_infer_dim_type=True) + v.set_target('target','seq_len') + + v['target'].set_pad_val(0) + elif args.model == 'lstm': + v.set_ignore_type('skips_l2r_word','skips_l2r_source') + v.set_padder('skips_l2r_word',LatticeLexiconPadder()) + v.set_padder('skips_l2r_source',LatticeLexiconPadder()) + v.set_input('chars','bigrams','seq_len','target', + use_1st_ins_infer_dim_type=True) + v.set_target('target','seq_len') + + v['target'].set_pad_val(0) + +print(datasets['dev']['skips_l2r_word'][100]) + + +if args.model =='lattice': + model = LatticeLSTM_SeqLabel_V1(embeddings['char'],embeddings['bigram'],embeddings['word'], + hidden_size=args.hidden,label_size=len(vocabs['label']),device=args.device, + embed_dropout=args.embed_dropout,output_dropout=args.output_dropout, + skip_batch_first=True,bidirectional=args.bi,debug=args.debug, + skip_before_head=args.skip_before_head,use_bigram=args.use_bigram, + gaz_dropout=args.gaz_dropout + ) +elif args.model == 'lstm': + model = LSTM_SeqLabel(embeddings['char'],embeddings['bigram'],embeddings['word'], + hidden_size=args.hidden,label_size=len(vocabs['label']),device=args.device, + bidirectional=args.bi, + embed_dropout=args.embed_dropout,output_dropout=args.output_dropout, + use_bigram=args.use_bigram) + + +loss = LossInForward() +encoding_type = 'bmeso' +if args.dataset == 
'weibo': + encoding_type = 'bio' +f1_metric = SpanFPreRecMetric(vocabs['label'],pred='pred',target='target',seq_len='seq_len',encoding_type=encoding_type) +acc_metric = AccuracyMetric(pred='pred',target='target',seq_len='seq_len') +metrics = [f1_metric,acc_metric] + +if args.optim == 'adam': + optimizer = optim.Adam(model.parameters(),lr=args.lr) +elif args.optim == 'sgd': + optimizer = optim.SGD(model.parameters(),lr=args.lr,momentum=args.momentum) + + + + +callbacks = [ + FitlogCallback({'test':datasets['test'],'train':datasets['train']}), + LRScheduler(lr_scheduler=LambdaLR(optimizer, lambda ep: 1 / (1 + 0.03)**ep)) +] +print('label_vocab:{}\n{}'.format(len(vocabs['label']),vocabs['label'].idx2word)) +trainer = Trainer(datasets['train'],model, + optimizer=optimizer, + loss=loss, + metrics=metrics, + dev_data=datasets['dev'], + device=device, + batch_size=args.batch, + n_epochs=args.epoch, + dev_batch_size=args.test_batch, + callbacks=callbacks) + +trainer.train() \ No newline at end of file diff --git a/reproduction/sequence_labelling/chinese_ner/LatticeLSTM/models.py b/reproduction/sequence_labelling/chinese_ner/LatticeLSTM/models.py new file mode 100644 index 00000000..0b419015 --- /dev/null +++ b/reproduction/sequence_labelling/chinese_ner/LatticeLSTM/models.py @@ -0,0 +1,310 @@ +import torch.nn as nn +from fastNLP.embeddings import StaticEmbedding +from fastNLP.modules import LSTM, ConditionalRandomField +import torch +from fastNLP import seq_len_to_mask +from utils import better_init_rnn,print_info + + +class LatticeLSTM_SeqLabel(nn.Module): + def __init__(self, char_embed, bigram_embed, word_embed, hidden_size, label_size, bias=True, bidirectional=False, + device=None, embed_dropout=0, output_dropout=0, skip_batch_first=True,debug=False, + skip_before_head=False,use_bigram=True,vocabs=None): + if device is None: + self.device = torch.device('cpu') + else: + self.device = torch.device(device) + from modules import LatticeLSTMLayer_sup_back_V0 + super().__init__() + self.debug = debug + self.skip_batch_first = skip_batch_first + self.char_embed_size = char_embed.embedding.weight.size(1) + self.bigram_embed_size = bigram_embed.embedding.weight.size(1) + self.word_embed_size = word_embed.embedding.weight.size(1) + self.hidden_size = hidden_size + self.label_size = label_size + self.bidirectional = bidirectional + self.use_bigram = use_bigram + self.vocabs = vocabs + + if self.use_bigram: + self.input_size = self.char_embed_size + self.bigram_embed_size + else: + self.input_size = self.char_embed_size + + self.char_embed = char_embed + self.bigram_embed = bigram_embed + self.word_embed = word_embed + self.encoder = LatticeLSTMLayer_sup_back_V0(self.input_size,self.word_embed_size, + self.hidden_size, + left2right=True, + bias=bias, + device=self.device, + debug=self.debug, + skip_before_head=skip_before_head) + if self.bidirectional: + self.encoder_back = LatticeLSTMLayer_sup_back_V0(self.input_size, + self.word_embed_size, self.hidden_size, + left2right=False, + bias=bias, + device=self.device, + debug=self.debug, + skip_before_head=skip_before_head) + + self.output = nn.Linear(self.hidden_size * (2 if self.bidirectional else 1), self.label_size) + self.crf = ConditionalRandomField(label_size, True) + + self.crf.trans_m = nn.Parameter(torch.zeros(size=[label_size, label_size],requires_grad=True)) + if self.crf.include_start_end_trans: + self.crf.start_scores = nn.Parameter(torch.zeros(size=[label_size],requires_grad=True)) + self.crf.end_scores = 
nn.Parameter(torch.zeros(size=[label_size],requires_grad=True)) + + self.loss_func = nn.CrossEntropyLoss() + self.embed_dropout = nn.Dropout(embed_dropout) + self.output_dropout = nn.Dropout(output_dropout) + + def forward(self, chars, bigrams, seq_len, target, + skips_l2r_source, skips_l2r_word, lexicon_count, + skips_r2l_source=None, skips_r2l_word=None, lexicon_count_back=None): + # print('skips_l2r_word_id:{}'.format(skips_l2r_word.size())) + batch = chars.size(0) + max_seq_len = chars.size(1) + # max_lexicon_count = skips_l2r_word.size(2) + + + embed_char = self.char_embed(chars) + if self.use_bigram: + + embed_bigram = self.bigram_embed(bigrams) + + embedding = torch.cat([embed_char, embed_bigram], dim=-1) + else: + + embedding = embed_char + + + embed_nonword = self.embed_dropout(embedding) + + # skips_l2r_word = torch.reshape(skips_l2r_word,shape=[batch,-1]) + embed_word = self.word_embed(skips_l2r_word) + embed_word = self.embed_dropout(embed_word) + # embed_word = torch.reshape(embed_word,shape=[batch,max_seq_len,max_lexicon_count,-1]) + + + encoded_h, encoded_c = self.encoder(embed_nonword, seq_len, skips_l2r_source, embed_word, lexicon_count) + + if self.bidirectional: + embed_word_back = self.word_embed(skips_r2l_word) + embed_word_back = self.embed_dropout(embed_word_back) + encoded_h_back, encoded_c_back = self.encoder_back(embed_nonword, seq_len, skips_r2l_source, + embed_word_back, lexicon_count_back) + encoded_h = torch.cat([encoded_h, encoded_h_back], dim=-1) + + encoded_h = self.output_dropout(encoded_h) + + pred = self.output(encoded_h) + + mask = seq_len_to_mask(seq_len) + + if self.training: + loss = self.crf(pred, target, mask) + return {'loss': loss} + else: + pred, path = self.crf.viterbi_decode(pred, mask) + return {'pred': pred} + + # batch_size, sent_len = pred.shape[0], pred.shape[1] + # loss = self.loss_func(pred.reshape(batch_size * sent_len, -1), target.reshape(batch_size * sent_len)) + # return {'pred':pred,'loss':loss} + +class LatticeLSTM_SeqLabel_V1(nn.Module): + def __init__(self, char_embed, bigram_embed, word_embed, hidden_size, label_size, bias=True, bidirectional=False, + device=None, embed_dropout=0, output_dropout=0, skip_batch_first=True,debug=False, + skip_before_head=False,use_bigram=True,vocabs=None,gaz_dropout=0): + if device is None: + self.device = torch.device('cpu') + else: + self.device = torch.device(device) + from modules import LatticeLSTMLayer_sup_back_V1 + super().__init__() + self.count = 0 + self.debug = debug + self.skip_batch_first = skip_batch_first + self.char_embed_size = char_embed.embedding.weight.size(1) + self.bigram_embed_size = bigram_embed.embedding.weight.size(1) + self.word_embed_size = word_embed.embedding.weight.size(1) + self.hidden_size = hidden_size + self.label_size = label_size + self.bidirectional = bidirectional + self.use_bigram = use_bigram + self.vocabs = vocabs + + if self.use_bigram: + self.input_size = self.char_embed_size + self.bigram_embed_size + else: + self.input_size = self.char_embed_size + + self.char_embed = char_embed + self.bigram_embed = bigram_embed + self.word_embed = word_embed + self.encoder = LatticeLSTMLayer_sup_back_V1(self.input_size,self.word_embed_size, + self.hidden_size, + left2right=True, + bias=bias, + device=self.device, + debug=self.debug, + skip_before_head=skip_before_head) + if self.bidirectional: + self.encoder_back = LatticeLSTMLayer_sup_back_V1(self.input_size, + self.word_embed_size, self.hidden_size, + left2right=False, + bias=bias, + device=self.device, + 
debug=self.debug, + skip_before_head=skip_before_head) + + self.output = nn.Linear(self.hidden_size * (2 if self.bidirectional else 1), self.label_size) + self.crf = ConditionalRandomField(label_size, True) + + self.crf.trans_m = nn.Parameter(torch.zeros(size=[label_size, label_size],requires_grad=True)) + if self.crf.include_start_end_trans: + self.crf.start_scores = nn.Parameter(torch.zeros(size=[label_size],requires_grad=True)) + self.crf.end_scores = nn.Parameter(torch.zeros(size=[label_size],requires_grad=True)) + + self.loss_func = nn.CrossEntropyLoss() + self.embed_dropout = nn.Dropout(embed_dropout) + self.gaz_dropout = nn.Dropout(gaz_dropout) + self.output_dropout = nn.Dropout(output_dropout) + + def forward(self, chars, bigrams, seq_len, target, + skips_l2r_source, skips_l2r_word, lexicon_count, + skips_r2l_source=None, skips_r2l_word=None, lexicon_count_back=None): + + batch = chars.size(0) + max_seq_len = chars.size(1) + + + + embed_char = self.char_embed(chars) + if self.use_bigram: + + embed_bigram = self.bigram_embed(bigrams) + + embedding = torch.cat([embed_char, embed_bigram], dim=-1) + else: + + embedding = embed_char + + + embed_nonword = self.embed_dropout(embedding) + + # skips_l2r_word = torch.reshape(skips_l2r_word,shape=[batch,-1]) + embed_word = self.word_embed(skips_l2r_word) + embed_word = self.embed_dropout(embed_word) + + + + encoded_h, encoded_c = self.encoder(embed_nonword, seq_len, skips_l2r_source, embed_word, lexicon_count) + + if self.bidirectional: + embed_word_back = self.word_embed(skips_r2l_word) + embed_word_back = self.embed_dropout(embed_word_back) + encoded_h_back, encoded_c_back = self.encoder_back(embed_nonword, seq_len, skips_r2l_source, + embed_word_back, lexicon_count_back) + encoded_h = torch.cat([encoded_h, encoded_h_back], dim=-1) + + encoded_h = self.output_dropout(encoded_h) + + pred = self.output(encoded_h) + + mask = seq_len_to_mask(seq_len) + + if self.training: + loss = self.crf(pred, target, mask) + return {'loss': loss} + else: + pred, path = self.crf.viterbi_decode(pred, mask) + return {'pred': pred} + + +class LSTM_SeqLabel(nn.Module): + def __init__(self, char_embed, bigram_embed, word_embed, hidden_size, label_size, bias=True, + bidirectional=False, device=None, embed_dropout=0, output_dropout=0,use_bigram=True): + + if device is None: + self.device = torch.device('cpu') + else: + self.device = torch.device(device) + super().__init__() + self.char_embed_size = char_embed.embedding.weight.size(1) + self.bigram_embed_size = bigram_embed.embedding.weight.size(1) + self.word_embed_size = word_embed.embedding.weight.size(1) + self.hidden_size = hidden_size + self.label_size = label_size + self.bidirectional = bidirectional + self.use_bigram = use_bigram + + self.char_embed = char_embed + self.bigram_embed = bigram_embed + self.word_embed = word_embed + + if self.use_bigram: + self.input_size = self.char_embed_size + self.bigram_embed_size + else: + self.input_size = self.char_embed_size + + self.encoder = LSTM(self.input_size, self.hidden_size, + bidirectional=self.bidirectional) + + better_init_rnn(self.encoder.lstm) + + + self.output = nn.Linear(self.hidden_size * (2 if self.bidirectional else 1), self.label_size) + + self.debug = True + self.loss_func = nn.CrossEntropyLoss() + self.embed_dropout = nn.Dropout(embed_dropout) + self.output_dropout = nn.Dropout(output_dropout) + self.crf = ConditionalRandomField(label_size, True) + + def forward(self, chars, bigrams, seq_len, target): + if self.debug: + + 
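+ # debug mode: dump the input tensor shapes; self.debug is hard-coded to True in __init__, so LSTM_SeqLabel currently always prints these and stops at exit(1208) further down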
print_info('chars:{}'.format(chars.size())) + print_info('bigrams:{}'.format(bigrams.size())) + print_info('seq_len:{}'.format(seq_len.size())) + print_info('target:{}'.format(target.size())) + embed_char = self.char_embed(chars) + + if self.use_bigram: + + embed_bigram = self.bigram_embed(bigrams) + + embedding = torch.cat([embed_char, embed_bigram], dim=-1) + else: + + embedding = embed_char + + embedding = self.embed_dropout(embedding) + + encoded_h, encoded_c = self.encoder(embedding, seq_len) + + encoded_h = self.output_dropout(encoded_h) + + pred = self.output(encoded_h) + + mask = seq_len_to_mask(seq_len) + + # pred = self.crf(pred) + + # batch_size, sent_len = pred.shape[0], pred.shape[1] + # loss = self.loss_func(pred.reshape(batch_size * sent_len, -1), target.reshape(batch_size * sent_len)) + if self.debug: + print('debug mode:finish') + exit(1208) + if self.training: + loss = self.crf(pred, target, mask) + return {'loss': loss} + else: + pred, path = self.crf.viterbi_decode(pred, mask) + return {'pred': pred} diff --git a/reproduction/sequence_labelling/chinese_ner/LatticeLSTM/modules.py b/reproduction/sequence_labelling/chinese_ner/LatticeLSTM/modules.py new file mode 100644 index 00000000..70182250 --- /dev/null +++ b/reproduction/sequence_labelling/chinese_ner/LatticeLSTM/modules.py @@ -0,0 +1,638 @@ +import torch.nn as nn +import torch +from fastNLP.core.utils import seq_len_to_mask +from utils import better_init_rnn +import numpy as np + + +class WordLSTMCell_yangjie(nn.Module): + + """A basic LSTM cell.""" + + def __init__(self, input_size, hidden_size, use_bias=True,debug=False, left2right=True): + """ + Most parts are copied from torch.nn.LSTMCell. + """ + + super().__init__() + self.left2right = left2right + self.debug = debug + self.input_size = input_size + self.hidden_size = hidden_size + self.use_bias = use_bias + self.weight_ih = nn.Parameter( + torch.FloatTensor(input_size, 3 * hidden_size)) + self.weight_hh = nn.Parameter( + torch.FloatTensor(hidden_size, 3 * hidden_size)) + if use_bias: + self.bias = nn.Parameter(torch.FloatTensor(3 * hidden_size)) + else: + self.register_parameter('bias', None) + self.reset_parameters() + + def reset_parameters(self): + """ + Initialize parameters following the way proposed in the paper. + """ + nn.init.orthogonal(self.weight_ih.data) + weight_hh_data = torch.eye(self.hidden_size) + weight_hh_data = weight_hh_data.repeat(1, 3) + with torch.no_grad(): + self.weight_hh.set_(weight_hh_data) + # The bias is just set to zero vectors. + if self.use_bias: + nn.init.constant(self.bias.data, val=0) + + def forward(self, input_, hx): + """ + Args: + input_: A (batch, input_size) tensor containing input + features. + hx: A tuple (h_0, c_0), which contains the initial hidden + and cell state, where the size of both states is + (batch, hidden_size). + Returns: + h_1, c_1: Tensors containing the next hidden and cell state. 
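+ Note: in this implementation only the new cell state is returned,
+ computed as c_1 = sigmoid(f) * c_0 + sigmoid(i) * tanh(g); there is
+ no output gate here, because the character-level MultiInputLSTMCell
+ decides how much of each word cell to keep.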
+ """ + + h_0, c_0 = hx + + + + batch_size = h_0.size(0) + bias_batch = (self.bias.unsqueeze(0).expand(batch_size, *self.bias.size())) + wh_b = torch.addmm(bias_batch, h_0, self.weight_hh) + wi = torch.mm(input_, self.weight_ih) + f, i, g = torch.split(wh_b + wi, split_size_or_sections=self.hidden_size, dim=1) + c_1 = torch.sigmoid(f)*c_0 + torch.sigmoid(i)*torch.tanh(g) + + return c_1 + + def __repr__(self): + s = '{name}({input_size}, {hidden_size})' + return s.format(name=self.__class__.__name__, **self.__dict__) + + +class MultiInputLSTMCell_V0(nn.Module): + def __init__(self, char_input_size, hidden_size, use_bias=True,debug=False): + super().__init__() + self.char_input_size = char_input_size + self.hidden_size = hidden_size + self.use_bias = use_bias + + self.weight_ih = nn.Parameter( + torch.FloatTensor(char_input_size, 3 * hidden_size) + ) + + self.weight_hh = nn.Parameter( + torch.FloatTensor(hidden_size, 3 * hidden_size) + ) + + self.alpha_weight_ih = nn.Parameter( + torch.FloatTensor(char_input_size, hidden_size) + ) + + self.alpha_weight_hh = nn.Parameter( + torch.FloatTensor(hidden_size, hidden_size) + ) + + if self.use_bias: + self.bias = nn.Parameter(torch.FloatTensor(3 * hidden_size)) + self.alpha_bias = nn.Parameter(torch.FloatTensor(hidden_size)) + else: + self.register_parameter('bias', None) + self.register_parameter('alpha_bias', None) + + self.debug = debug + self.reset_parameters() + + def reset_parameters(self): + """ + Initialize parameters following the way proposed in the paper. + """ + nn.init.orthogonal(self.weight_ih.data) + nn.init.orthogonal(self.alpha_weight_ih.data) + + weight_hh_data = torch.eye(self.hidden_size) + weight_hh_data = weight_hh_data.repeat(1, 3) + with torch.no_grad(): + self.weight_hh.set_(weight_hh_data) + + alpha_weight_hh_data = torch.eye(self.hidden_size) + alpha_weight_hh_data = alpha_weight_hh_data.repeat(1, 1) + with torch.no_grad(): + self.alpha_weight_hh.set_(alpha_weight_hh_data) + + # The bias is just set to zero vectors. 
+ if self.use_bias: + nn.init.constant_(self.bias.data, val=0) + nn.init.constant_(self.alpha_bias.data, val=0) + + def forward(self, inp, skip_c, skip_count, hx): + ''' + + :param inp: chars B * hidden + :param skip_c: 由跳边得到的c, B * X * hidden + :param skip_count: 这个batch中每个example中当前位置的跳边的数量,用于mask + :param hx: + :return: + ''' + max_skip_count = torch.max(skip_count).item() + + + + if True: + h_0, c_0 = hx + batch_size = h_0.size(0) + + bias_batch = (self.bias.unsqueeze(0).expand(batch_size, *self.bias.size())) + + wi = torch.matmul(inp, self.weight_ih) + wh = torch.matmul(h_0, self.weight_hh) + + + + i, o, g = torch.split(wh + wi + bias_batch, split_size_or_sections=self.hidden_size, dim=1) + + i = torch.sigmoid(i).unsqueeze(1) + o = torch.sigmoid(o).unsqueeze(1) + g = torch.tanh(g).unsqueeze(1) + + + + alpha_wi = torch.matmul(inp, self.alpha_weight_ih) + alpha_wi.unsqueeze_(1) + + # alpha_wi = alpha_wi.expand(1,skip_count,self.hidden_size) + alpha_wh = torch.matmul(skip_c, self.alpha_weight_hh) + + alpha_bias_batch = self.alpha_bias.unsqueeze(0) + + alpha = torch.sigmoid(alpha_wi + alpha_wh + alpha_bias_batch) + + skip_mask = seq_len_to_mask(skip_count,max_len=skip_c.size()[1]) + + skip_mask = 1 - skip_mask + + + skip_mask = skip_mask.unsqueeze(-1).expand(*skip_mask.size(), self.hidden_size) + + skip_mask = (skip_mask).float()*1e20 + + alpha = alpha - skip_mask + + alpha = torch.exp(torch.cat([i, alpha], dim=1)) + + + + alpha_sum = torch.sum(alpha, dim=1, keepdim=True) + + alpha = torch.div(alpha, alpha_sum) + + merge_i_c = torch.cat([g, skip_c], dim=1) + + c_1 = merge_i_c * alpha + + c_1 = c_1.sum(1, keepdim=True) + # h_1 = o * c_1 + h_1 = o * torch.tanh(c_1) + + return h_1.squeeze(1), c_1.squeeze(1) + + else: + + h_0, c_0 = hx + batch_size = h_0.size(0) + + bias_batch = (self.bias.unsqueeze(0).expand(batch_size, *self.bias.size())) + + wi = torch.matmul(inp, self.weight_ih) + wh = torch.matmul(h_0, self.weight_hh) + + i, o, g = torch.split(wh + wi + bias_batch, split_size_or_sections=self.hidden_size, dim=1) + + i = torch.sigmoid(i).unsqueeze(1) + o = torch.sigmoid(o).unsqueeze(1) + g = torch.tanh(g).unsqueeze(1) + + c_1 = g + h_1 = o * c_1 + + return h_1,c_1 + +class MultiInputLSTMCell_V1(nn.Module): + def __init__(self, char_input_size, hidden_size, use_bias=True,debug=False): + super().__init__() + self.char_input_size = char_input_size + self.hidden_size = hidden_size + self.use_bias = use_bias + + self.weight_ih = nn.Parameter( + torch.FloatTensor(char_input_size, 3 * hidden_size) + ) + + self.weight_hh = nn.Parameter( + torch.FloatTensor(hidden_size, 3 * hidden_size) + ) + + self.alpha_weight_ih = nn.Parameter( + torch.FloatTensor(char_input_size, hidden_size) + ) + + self.alpha_weight_hh = nn.Parameter( + torch.FloatTensor(hidden_size, hidden_size) + ) + + if self.use_bias: + self.bias = nn.Parameter(torch.FloatTensor(3 * hidden_size)) + self.alpha_bias = nn.Parameter(torch.FloatTensor(hidden_size)) + else: + self.register_parameter('bias', None) + self.register_parameter('alpha_bias', None) + + self.debug = debug + self.reset_parameters() + + def reset_parameters(self): + """ + Initialize parameters following the way proposed in the paper. 
+ """ + nn.init.orthogonal(self.weight_ih.data) + nn.init.orthogonal(self.alpha_weight_ih.data) + + weight_hh_data = torch.eye(self.hidden_size) + weight_hh_data = weight_hh_data.repeat(1, 3) + with torch.no_grad(): + self.weight_hh.set_(weight_hh_data) + + alpha_weight_hh_data = torch.eye(self.hidden_size) + alpha_weight_hh_data = alpha_weight_hh_data.repeat(1, 1) + with torch.no_grad(): + self.alpha_weight_hh.set_(alpha_weight_hh_data) + + # The bias is just set to zero vectors. + if self.use_bias: + nn.init.constant_(self.bias.data, val=0) + nn.init.constant_(self.alpha_bias.data, val=0) + + def forward(self, inp, skip_c, skip_count, hx): + ''' + + :param inp: chars B * hidden + :param skip_c: 由跳边得到的c, B * X * hidden + :param skip_count: 这个batch中每个example中当前位置的跳边的数量,用于mask + :param hx: + :return: + ''' + max_skip_count = torch.max(skip_count).item() + + + + if True: + h_0, c_0 = hx + batch_size = h_0.size(0) + + bias_batch = (self.bias.unsqueeze(0).expand(batch_size, *self.bias.size())) + + wi = torch.matmul(inp, self.weight_ih) + wh = torch.matmul(h_0, self.weight_hh) + + + i, o, g = torch.split(wh + wi + bias_batch, split_size_or_sections=self.hidden_size, dim=1) + + i = torch.sigmoid(i).unsqueeze(1) + o = torch.sigmoid(o).unsqueeze(1) + g = torch.tanh(g).unsqueeze(1) + + + + ##basic lstm start + + f = 1 - i + c_1_basic = f*c_0.unsqueeze(1) + i*g + c_1_basic = c_1_basic.squeeze(1) + + + + + + alpha_wi = torch.matmul(inp, self.alpha_weight_ih) + alpha_wi.unsqueeze_(1) + + + alpha_wh = torch.matmul(skip_c, self.alpha_weight_hh) + + alpha_bias_batch = self.alpha_bias.unsqueeze(0) + + alpha = torch.sigmoid(alpha_wi + alpha_wh + alpha_bias_batch) + + skip_mask = seq_len_to_mask(skip_count,max_len=skip_c.size()[1]).float() + + skip_mask = 1 - skip_mask + + + skip_mask = skip_mask.unsqueeze(-1).expand(*skip_mask.size(), self.hidden_size) + + skip_mask = (skip_mask).float()*1e20 + + alpha = alpha - skip_mask + + alpha = torch.exp(torch.cat([i, alpha], dim=1)) + + + + alpha_sum = torch.sum(alpha, dim=1, keepdim=True) + + alpha = torch.div(alpha, alpha_sum) + + merge_i_c = torch.cat([g, skip_c], dim=1) + + c_1 = merge_i_c * alpha + + c_1 = c_1.sum(1, keepdim=True) + # h_1 = o * c_1 + c_1 = c_1.squeeze(1) + count_select = (skip_count != 0).float().unsqueeze(-1) + + + + + c_1 = c_1*count_select + c_1_basic*(1-count_select) + + + o = o.squeeze(1) + h_1 = o * torch.tanh(c_1) + + return h_1, c_1 + +class LatticeLSTMLayer_sup_back_V0(nn.Module): + def __init__(self, char_input_size, word_input_size, hidden_size, left2right, + bias=True,device=None,debug=False,skip_before_head=False): + super().__init__() + + self.skip_before_head = skip_before_head + + self.hidden_size = hidden_size + + self.char_cell = MultiInputLSTMCell_V0(char_input_size, hidden_size, bias,debug) + + self.word_cell = WordLSTMCell_yangjie(word_input_size,hidden_size,bias,debug=self.debug) + + self.word_input_size = word_input_size + self.left2right = left2right + self.bias = bias + self.device = device + self.debug = debug + + def forward(self, inp, seq_len, skip_sources, skip_words, skip_count, init_state=None): + ''' + + :param inp: batch * seq_len * embedding, chars + :param seq_len: batch, length of chars + :param skip_sources: batch * seq_len * X, 跳边的起点 + :param skip_words: batch * seq_len * X * embedding, 跳边的词 + :param lexicon_count: batch * seq_len, count of lexicon per example per position + :param init_state: the hx of rnn + :return: + ''' + + + if self.left2right: + + max_seq_len = max(seq_len) + batch_size = inp.size(0) 
+ c_ = torch.zeros(size=[batch_size, 1, self.hidden_size], requires_grad=True).to(self.device) + h_ = torch.zeros(size=[batch_size, 1, self.hidden_size], requires_grad=True).to(self.device) + + for i in range(max_seq_len): + max_lexicon_count = max(torch.max(skip_count[:, i]).item(), 1) + h_0, c_0 = h_[:, i, :], c_[:, i, :] + + skip_word_flat = skip_words[:, i, :max_lexicon_count].contiguous() + + skip_word_flat = skip_word_flat.view(batch_size*max_lexicon_count,self.word_input_size) + skip_source_flat = skip_sources[:, i, :max_lexicon_count].contiguous().view(batch_size, max_lexicon_count) + + + index_0 = torch.tensor(range(batch_size)).unsqueeze(1).expand(batch_size,max_lexicon_count) + index_1 = skip_source_flat + + if not self.skip_before_head: + c_x = c_[[index_0, index_1+1]] + h_x = h_[[index_0, index_1+1]] + else: + c_x = c_[[index_0,index_1]] + h_x = h_[[index_0,index_1]] + + c_x_flat = c_x.view(batch_size*max_lexicon_count,self.hidden_size) + h_x_flat = h_x.view(batch_size*max_lexicon_count,self.hidden_size) + + + + + c_1_flat = self.word_cell(skip_word_flat,(h_x_flat,c_x_flat)) + + c_1_skip = c_1_flat.view(batch_size,max_lexicon_count,self.hidden_size) + + h_1,c_1 = self.char_cell(inp[:,i,:],c_1_skip,skip_count[:,i],(h_0,c_0)) + + + h_ = torch.cat([h_,h_1.unsqueeze(1)],dim=1) + c_ = torch.cat([c_, c_1.unsqueeze(1)], dim=1) + + return h_[:,1:],c_[:,1:] + else: + mask_for_seq_len = seq_len_to_mask(seq_len) + + max_seq_len = max(seq_len) + batch_size = inp.size(0) + c_ = torch.zeros(size=[batch_size, 1, self.hidden_size], requires_grad=True).to(self.device) + h_ = torch.zeros(size=[batch_size, 1, self.hidden_size], requires_grad=True).to(self.device) + + for i in reversed(range(max_seq_len)): + max_lexicon_count = max(torch.max(skip_count[:, i]).item(), 1) + + + + h_0, c_0 = h_[:, 0, :], c_[:, 0, :] + + skip_word_flat = skip_words[:, i, :max_lexicon_count].contiguous() + + skip_word_flat = skip_word_flat.view(batch_size*max_lexicon_count,self.word_input_size) + skip_source_flat = skip_sources[:, i, :max_lexicon_count].contiguous().view(batch_size, max_lexicon_count) + + + index_0 = torch.tensor(range(batch_size)).unsqueeze(1).expand(batch_size,max_lexicon_count) + index_1 = skip_source_flat-i + + if not self.skip_before_head: + c_x = c_[[index_0, index_1-1]] + h_x = h_[[index_0, index_1-1]] + else: + c_x = c_[[index_0,index_1]] + h_x = h_[[index_0,index_1]] + + c_x_flat = c_x.view(batch_size*max_lexicon_count,self.hidden_size) + h_x_flat = h_x.view(batch_size*max_lexicon_count,self.hidden_size) + + + + + c_1_flat = self.word_cell(skip_word_flat,(h_x_flat,c_x_flat)) + + c_1_skip = c_1_flat.view(batch_size,max_lexicon_count,self.hidden_size) + + h_1,c_1 = self.char_cell(inp[:,i,:],c_1_skip,skip_count[:,i],(h_0,c_0)) + + + h_1_mask = h_1.masked_fill(1-mask_for_seq_len[:,i].unsqueeze(-1),0) + c_1_mask = c_1.masked_fill(1 - mask_for_seq_len[:, i].unsqueeze(-1), 0) + + + h_ = torch.cat([h_1_mask.unsqueeze(1),h_],dim=1) + c_ = torch.cat([c_1_mask.unsqueeze(1),c_], dim=1) + + return h_[:,:-1],c_[:,:-1] + +class LatticeLSTMLayer_sup_back_V1(nn.Module): + # V1与V0的不同在于,V1在当前位置完全无lexicon匹配时,会采用普通的lstm计算公式, + # 普通的lstm计算公式与杨杰实现的lattice lstm在lexicon数量为0时不同 + def __init__(self, char_input_size, word_input_size, hidden_size, left2right, + bias=True,device=None,debug=False,skip_before_head=False): + super().__init__() + + self.debug = debug + + self.skip_before_head = skip_before_head + + self.hidden_size = hidden_size + + self.char_cell = MultiInputLSTMCell_V1(char_input_size, hidden_size, 
bias,debug) + + self.word_cell = WordLSTMCell_yangjie(word_input_size,hidden_size,bias,debug=self.debug) + + self.word_input_size = word_input_size + self.left2right = left2right + self.bias = bias + self.device = device + + def forward(self, inp, seq_len, skip_sources, skip_words, skip_count, init_state=None): + ''' + + :param inp: batch * seq_len * embedding, chars + :param seq_len: batch, length of chars + :param skip_sources: batch * seq_len * X, 跳边的起点 + :param skip_words: batch * seq_len * X * embedding_size, 跳边的词 + :param lexicon_count: batch * seq_len, + lexicon_count[i,j]为第i个例子以第j个位子为结尾匹配到的词的数量 + :param init_state: the hx of rnn + :return: + ''' + + + if self.left2right: + + max_seq_len = max(seq_len) + batch_size = inp.size(0) + c_ = torch.zeros(size=[batch_size, 1, self.hidden_size], requires_grad=True).to(self.device) + h_ = torch.zeros(size=[batch_size, 1, self.hidden_size], requires_grad=True).to(self.device) + + for i in range(max_seq_len): + max_lexicon_count = max(torch.max(skip_count[:, i]).item(), 1) + h_0, c_0 = h_[:, i, :], c_[:, i, :] + + #为了使rnn能够计算B*lexicon_count*embedding_size的张量,需要将其reshape成二维张量 + #为了匹配pytorch的[]取址方式,需要将reshape成二维张量 + + skip_word_flat = skip_words[:, i, :max_lexicon_count].contiguous() + + skip_word_flat = skip_word_flat.view(batch_size*max_lexicon_count,self.word_input_size) + skip_source_flat = skip_sources[:, i, :max_lexicon_count].contiguous().view(batch_size, max_lexicon_count) + + + index_0 = torch.tensor(range(batch_size)).unsqueeze(1).expand(batch_size,max_lexicon_count) + index_1 = skip_source_flat + + + if not self.skip_before_head: + c_x = c_[[index_0, index_1+1]] + h_x = h_[[index_0, index_1+1]] + else: + c_x = c_[[index_0,index_1]] + h_x = h_[[index_0,index_1]] + + c_x_flat = c_x.view(batch_size*max_lexicon_count,self.hidden_size) + h_x_flat = h_x.view(batch_size*max_lexicon_count,self.hidden_size) + + + + c_1_flat = self.word_cell(skip_word_flat,(h_x_flat,c_x_flat)) + + c_1_skip = c_1_flat.view(batch_size,max_lexicon_count,self.hidden_size) + + h_1,c_1 = self.char_cell(inp[:,i,:],c_1_skip,skip_count[:,i],(h_0,c_0)) + + + h_ = torch.cat([h_,h_1.unsqueeze(1)],dim=1) + c_ = torch.cat([c_, c_1.unsqueeze(1)], dim=1) + + return h_[:,1:],c_[:,1:] + else: + mask_for_seq_len = seq_len_to_mask(seq_len) + + max_seq_len = max(seq_len) + batch_size = inp.size(0) + c_ = torch.zeros(size=[batch_size, 1, self.hidden_size], requires_grad=True).to(self.device) + h_ = torch.zeros(size=[batch_size, 1, self.hidden_size], requires_grad=True).to(self.device) + + for i in reversed(range(max_seq_len)): + max_lexicon_count = max(torch.max(skip_count[:, i]).item(), 1) + + + h_0, c_0 = h_[:, 0, :], c_[:, 0, :] + + skip_word_flat = skip_words[:, i, :max_lexicon_count].contiguous() + + skip_word_flat = skip_word_flat.view(batch_size*max_lexicon_count,self.word_input_size) + skip_source_flat = skip_sources[:, i, :max_lexicon_count].contiguous().view(batch_size, max_lexicon_count) + + + index_0 = torch.tensor(range(batch_size)).unsqueeze(1).expand(batch_size,max_lexicon_count) + index_1 = skip_source_flat-i + + if not self.skip_before_head: + c_x = c_[[index_0, index_1-1]] + h_x = h_[[index_0, index_1-1]] + else: + c_x = c_[[index_0,index_1]] + h_x = h_[[index_0,index_1]] + + c_x_flat = c_x.view(batch_size*max_lexicon_count,self.hidden_size) + h_x_flat = h_x.view(batch_size*max_lexicon_count,self.hidden_size) + + + + + c_1_flat = self.word_cell(skip_word_flat,(h_x_flat,c_x_flat)) + + + + c_1_skip = c_1_flat.view(batch_size,max_lexicon_count,self.hidden_size) + + 
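+ # the character cell now fuses the character input with the word cells in c_1_skip; as the class comment above explains, MultiInputLSTMCell_V1 falls back to a plain coupled-gate LSTM update (c = (1-i)*c_prev + i*g) at positions where skip_count is 0, which is where V1 differs from Yang Jie's original formulation (V0)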
h_1,c_1 = self.char_cell(inp[:,i,:],c_1_skip,skip_count[:,i],(h_0,c_0)) + + + h_1_mask = h_1.masked_fill(~ mask_for_seq_len[:,i].unsqueeze(-1),0) + c_1_mask = c_1.masked_fill(~ mask_for_seq_len[:, i].unsqueeze(-1), 0) + + + h_ = torch.cat([h_1_mask.unsqueeze(1),h_],dim=1) + c_ = torch.cat([c_1_mask.unsqueeze(1),c_], dim=1) + + + + return h_[:,:-1],c_[:,:-1] + + + + diff --git a/reproduction/sequence_labelling/chinese_ner/LatticeLSTM/pathes.py b/reproduction/sequence_labelling/chinese_ner/LatticeLSTM/pathes.py new file mode 100644 index 00000000..fe3f6162 --- /dev/null +++ b/reproduction/sequence_labelling/chinese_ner/LatticeLSTM/pathes.py @@ -0,0 +1,24 @@ + + +glove_100_path = 'en-glove-6b-100d' +glove_50_path = 'en-glove-6b-50d' +glove_200_path = '' +glove_300_path = 'en-glove-840b-300' +fasttext_path = 'en-fasttext' #300 +tencent_chinese_word_path = 'cn' # tencent 200 +fasttext_cn_path = 'cn-fasttext' # 300 +yangjie_rich_pretrain_unigram_path = '/remote-home/xnli/data/pretrain/chinese/gigaword_chn.all.a2b.uni.ite50.vec' +yangjie_rich_pretrain_bigram_path = '/remote-home/xnli/data/pretrain/chinese/gigaword_chn.all.a2b.bi.ite50.vec' +yangjie_rich_pretrain_word_path = '/remote-home/xnli/data/pretrain/chinese/ctb.50d.vec' + + +conll_2003_path = '/remote-home/xnli/data/corpus/multi_task/conll_2013/data_mine.pkl' +conllized_ontonote_path = '/remote-home/txsun/data/OntoNotes-5.0-NER-master/v12/english' +conllized_ontonote_pkl_path = '/remote-home/txsun/data/ontonotes5.pkl' +sst2_path = '/remote-home/xnli/data/corpus/text_classification/SST-2/' +# weibo_ner_path = '/remote-home/xnli/data/corpus/sequence_labelling/ner_weibo' +ontonote4ner_cn_path = '/remote-home/xnli/data/corpus/sequence_labelling/chinese_ner/OntoNote4NER' +msra_ner_cn_path = '/remote-home/xnli/data/corpus/sequence_labelling/chinese_ner/MSRANER' +resume_ner_path = '/remote-home/xnli/data/corpus/sequence_labelling/chinese_ner/ResumeNER' +weibo_ner_path = '/remote-home/xnli/data/corpus/sequence_labelling/chinese_ner/WeiboNER' +weibo_ner_old_path = '/remote-home/xnli/data/corpus/sequence_labelling/chinese_ner/WeiboNER_old' \ No newline at end of file diff --git a/reproduction/sequence_labelling/chinese_ner/LatticeLSTM/small.py b/reproduction/sequence_labelling/chinese_ner/LatticeLSTM/small.py new file mode 100644 index 00000000..c877d96f --- /dev/null +++ b/reproduction/sequence_labelling/chinese_ner/LatticeLSTM/small.py @@ -0,0 +1,126 @@ +from utils_ import get_skip_path_trivial, Trie, get_skip_path +from load_data import load_yangjie_rich_pretrain_word_list, load_ontonotes4ner, equip_chinese_ner_with_skip +from pathes import * +from functools import partial +from fastNLP import cache_results +from fastNLP.embeddings.static_embedding import StaticEmbedding +import torch +import torch.nn as nn +import torch.nn.functional as F +from fastNLP.core.metrics import _bmes_tag_to_spans,_bmeso_tag_to_spans +from load_data import load_resume_ner + + +# embed = StaticEmbedding(None,embedding_dim=2) +# datasets,vocabs,embeddings = load_ontonotes4ner(ontonote4ner_cn_path,yangjie_rich_pretrain_unigram_path,yangjie_rich_pretrain_bigram_path, +# _refresh=True,index_token=False) +# +# w_list = load_yangjie_rich_pretrain_word_list(yangjie_rich_pretrain_word_path, +# _refresh=False) +# +# datasets,vocabs,embeddings = equip_chinese_ner_with_skip(datasets,vocabs,embeddings,w_list,yangjie_rich_pretrain_word_path, +# _refresh=True) +# + +def reverse_style(input_string): + target_position = input_string.index('[') + input_len = len(input_string) + 
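+ # rotate the accumulated tag string so the span indices come first: a tag
+ # built up as 'ner[3,5' and closed with ']' by the caller becomes '[3,5]ner'
+ # here (illustrative label and indices, not from the source);
+ # transform_YJ_to_fastNLP later parses this into ('ner', (3, 6)), with an
+ # exclusive end index.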
output_string = input_string[target_position:input_len] + input_string[0:target_position] + # print('in:{}.out:{}'.format(input_string, output_string)) + return output_string + + + + + +def get_yangjie_bmeso(label_list): + def get_ner_BMESO_yj(label_list): + # list_len = len(word_list) + # assert(list_len == len(label_list)), "word list size unmatch with label list" + list_len = len(label_list) + begin_label = 'b-' + end_label = 'e-' + single_label = 's-' + whole_tag = '' + index_tag = '' + tag_list = [] + stand_matrix = [] + for i in range(0, list_len): + # wordlabel = word_list[i] + current_label = label_list[i].lower() + if begin_label in current_label: + if index_tag != '': + tag_list.append(whole_tag + ',' + str(i - 1)) + whole_tag = current_label.replace(begin_label, "", 1) + '[' + str(i) + index_tag = current_label.replace(begin_label, "", 1) + + elif single_label in current_label: + if index_tag != '': + tag_list.append(whole_tag + ',' + str(i - 1)) + whole_tag = current_label.replace(single_label, "", 1) + '[' + str(i) + tag_list.append(whole_tag) + whole_tag = "" + index_tag = "" + elif end_label in current_label: + if index_tag != '': + tag_list.append(whole_tag + ',' + str(i)) + whole_tag = '' + index_tag = '' + else: + continue + if (whole_tag != '') & (index_tag != ''): + tag_list.append(whole_tag) + tag_list_len = len(tag_list) + + for i in range(0, tag_list_len): + if len(tag_list[i]) > 0: + tag_list[i] = tag_list[i] + ']' + insert_list = reverse_style(tag_list[i]) + stand_matrix.append(insert_list) + # print stand_matrix + return stand_matrix + + def transform_YJ_to_fastNLP(span): + span = span[1:] + span_split = span.split(']') + # print('span_list:{}'.format(span_split)) + span_type = span_split[1] + # print('span_split[0].split(','):{}'.format(span_split[0].split(','))) + if ',' in span_split[0]: + b, e = span_split[0].split(',') + else: + b = span_split[0] + e = b + + b = int(b) + e = int(e) + + e += 1 + + return (span_type, (b, e)) + yj_form = get_ner_BMESO_yj(label_list) + # print('label_list:{}'.format(label_list)) + # print('yj_from:{}'.format(yj_form)) + fastNLP_form = list(map(transform_YJ_to_fastNLP,yj_form)) + return fastNLP_form + + +# tag_list = ['O', 'B-singer', 'M-singer', 'E-singer', 'O', 'O'] +# span_list = get_ner_BMES(tag_list) +# print(span_list) +# yangjie_label_list = ['B-NAME', 'E-NAME', 'O', 'B-CONT', 'M-CONT', 'E-CONT', 'B-RACE', 'E-RACE', 'B-TITLE', 'M-TITLE', 'E-TITLE', 'B-EDU', 'M-EDU', 'E-EDU', 'B-ORG', 'M-ORG', 'E-ORG', 'M-NAME', 'B-PRO', 'M-PRO', 'E-PRO', 'S-RACE', 'S-NAME', 'B-LOC', 'M-LOC', 'E-LOC', 'M-RACE', 'S-ORG'] +# my_label_list = ['O', 'M-ORG', 'M-TITLE', 'B-TITLE', 'E-TITLE', 'B-ORG', 'E-ORG', 'M-EDU', 'B-NAME', 'E-NAME', 'B-EDU', 'E-EDU', 'M-NAME', 'M-PRO', 'M-CONT', 'B-PRO', 'E-PRO', 'B-CONT', 'E-CONT', 'M-LOC', 'B-RACE', 'E-RACE', 'S-NAME', 'B-LOC', 'E-LOC', 'M-RACE', 'S-RACE', 'S-ORG'] +# yangjie_label = set(yangjie_label_list) +# my_label = set(my_label_list) + +a = torch.tensor([0,2,0,3]) +b = (a==0) +print(b) +print(b.float()) +from fastNLP import RandomSampler + +# f = open('/remote-home/xnli/weight_debug/lattice_yangjie.pkl','rb') +# weight_dict = torch.load(f) +# print(weight_dict.keys()) +# for k,v in weight_dict.items(): +# print("{}:{}".format(k,v.size())) \ No newline at end of file diff --git a/reproduction/sequence_labelling/chinese_ner/LatticeLSTM/utils.py b/reproduction/sequence_labelling/chinese_ner/LatticeLSTM/utils.py new file mode 100644 index 00000000..8c64c43c --- /dev/null +++ 
b/reproduction/sequence_labelling/chinese_ner/LatticeLSTM/utils.py @@ -0,0 +1,361 @@ +import torch.nn.functional as F +import torch +import random +import numpy as np +from fastNLP import Const +from fastNLP import CrossEntropyLoss +from fastNLP import AccuracyMetric +from fastNLP import Tester +import os +from fastNLP import logger +def should_mask(name, t=''): + if 'bias' in name: + return False + if 'embedding' in name: + splited = name.split('.') + if splited[-1]!='weight': + return False + if 'embedding' in splited[-2]: + return False + if 'c0' in name: + return False + if 'h0' in name: + return False + + if 'output' in name and t not in name: + return False + + return True +def get_init_mask(model): + init_masks = {} + for name, param in model.named_parameters(): + if should_mask(name): + init_masks[name+'.mask'] = torch.ones_like(param) + # logger.info(init_masks[name+'.mask'].requires_grad) + + return init_masks + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed+100) + torch.manual_seed(seed+200) + torch.cuda.manual_seed_all(seed+300) + +def get_parameters_size(model): + result = {} + for name,p in model.state_dict().items(): + result[name] = p.size() + + return result + +def prune_by_proportion_model(model,proportion,task): + # print('this time prune to ',proportion*100,'%') + for name, p in model.named_parameters(): + # print(name) + if not should_mask(name,task): + continue + + tensor = p.data.cpu().numpy() + index = np.nonzero(model.mask[task][name+'.mask'].data.cpu().numpy()) + # print(name,'alive count',len(index[0])) + alive = tensor[index] + # print('p and mask size:',p.size(),print(model.mask[task][name+'.mask'].size())) + percentile_value = np.percentile(abs(alive), (1 - proportion) * 100) + # tensor = p + # index = torch.nonzero(model.mask[task][name+'.mask']) + # # print('nonzero len',index) + # alive = tensor[index] + # print('alive size:',alive.shape) + # prune_by_proportion_model() + + # percentile_value = torch.topk(abs(alive), int((1-proportion)*len(index[0]))).values + # print('the',(1-proportion)*len(index[0]),'th big') + # print('threshold:',percentile_value) + + prune_by_threshold_parameter(p, model.mask[task][name+'.mask'],percentile_value) + # for + +def prune_by_proportion_model_global(model,proportion,task): + # print('this time prune to ',proportion*100,'%') + alive = None + for name, p in model.named_parameters(): + # print(name) + if not should_mask(name,task): + continue + + tensor = p.data.cpu().numpy() + index = np.nonzero(model.mask[task][name+'.mask'].data.cpu().numpy()) + # print(name,'alive count',len(index[0])) + if alive is None: + alive = tensor[index] + else: + alive = np.concatenate([alive,tensor[index]],axis=0) + + percentile_value = np.percentile(abs(alive), (1 - proportion) * 100) + + for name, p in model.named_parameters(): + if should_mask(name,task): + prune_by_threshold_parameter(p, model.mask[task][name+'.mask'],percentile_value) + + +def prune_by_threshold_parameter(p, mask, threshold): + p_abs = torch.abs(p) + + new_mask = (p_abs > threshold).float() + # print(mask) + mask[:]*=new_mask + + +def one_time_train_and_prune_single_task(trainer,PRUNE_PER, + optimizer_init_state_dict=None, + model_init_state_dict=None, + is_global=None, + ): + + + from fastNLP import Trainer + + + trainer.optimizer.load_state_dict(optimizer_init_state_dict) + trainer.model.load_state_dict(model_init_state_dict) + # print('metrics:',metrics.__dict__) + # print('loss:',loss.__dict__) + # print('trainer input:',task.train_set.get_input_name()) + 
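+ # one train-and-prune round: the optimizer and model were rewound to their
+ # initial state dicts above, the model is retrained below, and then either the
+ # global or the per-parameter magnitude criterion shrinks model.mask, keeping
+ # roughly PRUNE_PER of the currently unmasked weights (lottery-ticket-style
+ # iterative pruning when repeated from iterative_train_and_prune_single_task).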
# trainer = Trainer(model=model, train_data=task.train_set, dev_data=task.dev_set, loss=loss, metrics=metrics, + # optimizer=optimizer, n_epochs=EPOCH, batch_size=BATCH, device=device,callbacks=callbacks) + + + trainer.train(load_best_model=True) + # tester = Tester(task.train_set, model, metrics, BATCH, device=device, verbose=1,use_tqdm=False) + # print('FOR DEBUG: test train_set:',tester.test()) + # print('**'*20) + # if task.test_set: + # tester = Tester(task.test_set, model, metrics, BATCH, device=device, verbose=1) + # tester.test() + if is_global: + + prune_by_proportion_model_global(trainer.model, PRUNE_PER, trainer.model.now_task) + + else: + prune_by_proportion_model(trainer.model, PRUNE_PER, trainer.model.now_task) + + + +# def iterative_train_and_prune_single_task(get_trainer,ITER,PRUNE,is_global=False,save_path=None): +def iterative_train_and_prune_single_task(get_trainer,args,model,train_set,dev_set,test_set,device,save_path=None): + + ''' + + :param trainer: + :param ITER: + :param PRUNE: + :param is_global: + :param save_path: should be a dictionary which will be filled with mask and state dict + :return: + ''' + + + + from fastNLP import Trainer + import torch + import math + import copy + PRUNE = args.prune + ITER = args.iter + trainer = get_trainer(args,model,train_set,dev_set,test_set,device) + optimizer_init_state_dict = copy.deepcopy(trainer.optimizer.state_dict()) + model_init_state_dict = copy.deepcopy(trainer.model.state_dict()) + if save_path is not None: + if not os.path.exists(save_path): + os.makedirs(save_path) + # if not os.path.exists(os.path.join(save_path, 'model_init.pkl')): + # f = open(os.path.join(save_path, 'model_init.pkl'), 'wb') + # torch.save(trainer.model.state_dict(),f) + + + mask_count = 0 + model = trainer.model + task = trainer.model.now_task + for name, p in model.mask[task].items(): + mask_count += torch.sum(p).item() + init_mask_count = mask_count + logger.info('init mask count:{}'.format(mask_count)) + # logger.info('{}th traning mask count: {} / {} = {}%'.format(i, mask_count, init_mask_count, + # mask_count / init_mask_count * 100)) + + prune_per_iter = math.pow(PRUNE, 1 / ITER) + + + for i in range(ITER): + trainer = get_trainer(args,model,train_set,dev_set,test_set,device) + one_time_train_and_prune_single_task(trainer,prune_per_iter,optimizer_init_state_dict,model_init_state_dict) + if save_path is not None: + f = open(os.path.join(save_path,task+'_mask_'+str(i)+'.pkl'),'wb') + torch.save(model.mask[task],f) + + mask_count = 0 + for name, p in model.mask[task].items(): + mask_count += torch.sum(p).item() + logger.info('{}th traning mask count: {} / {} = {}%'.format(i,mask_count,init_mask_count,mask_count/init_mask_count*100)) + + +def get_appropriate_cuda(task_scale='s'): + if task_scale not in {'s','m','l'}: + logger.info('task scale wrong!') + exit(2) + import pynvml + pynvml.nvmlInit() + total_cuda_num = pynvml.nvmlDeviceGetCount() + for i in range(total_cuda_num): + logger.info(i) + handle = pynvml.nvmlDeviceGetHandleByIndex(i) # 这里的0是GPU id + memInfo = pynvml.nvmlDeviceGetMemoryInfo(handle) + utilizationInfo = pynvml.nvmlDeviceGetUtilizationRates(handle) + logger.info(i, 'mem:', memInfo.used / memInfo.total, 'util:',utilizationInfo.gpu) + if memInfo.used / memInfo.total < 0.15 and utilizationInfo.gpu <0.2: + logger.info(i,memInfo.used / memInfo.total) + return 'cuda:'+str(i) + + if task_scale=='s': + max_memory=2000 + elif task_scale=='m': + max_memory=6000 + else: + max_memory = 9000 + + max_id = -1 + for i in 
range(total_cuda_num): + handle = pynvml.nvmlDeviceGetHandleByIndex(0) # 这里的0是GPU id + memInfo = pynvml.nvmlDeviceGetMemoryInfo(handle) + utilizationInfo = pynvml.nvmlDeviceGetUtilizationRates(handle) + if max_memory < memInfo.free: + max_memory = memInfo.free + max_id = i + + if id == -1: + logger.info('no appropriate gpu, wait!') + exit(2) + + return 'cuda:'+str(max_id) + + # if memInfo.used / memInfo.total < 0.5: + # return + +def print_mask(mask_dict): + def seq_mul(*X): + res = 1 + for x in X: + res*=x + return res + + for name,p in mask_dict.items(): + total_size = seq_mul(*p.size()) + unmasked_size = len(np.nonzero(p)) + + print(name,':',unmasked_size,'/',total_size,'=',unmasked_size/total_size*100,'%') + + + print() + + +def check_words_same(dataset_1,dataset_2,field_1,field_2): + if len(dataset_1[field_1]) != len(dataset_2[field_2]): + logger.info('CHECK: example num not same!') + return False + + for i, words in enumerate(dataset_1[field_1]): + if len(dataset_1[field_1][i]) != len(dataset_2[field_2][i]): + logger.info('CHECK {} th example length not same'.format(i)) + logger.info('1:{}'.format(dataset_1[field_1][i])) + logger.info('2:'.format(dataset_2[field_2][i])) + return False + + # for j,w in enumerate(words): + # if dataset_1[field_1][i][j] != dataset_2[field_2][i][j]: + # print('CHECK', i, 'th example has words different!') + # print('1:',dataset_1[field_1][i]) + # print('2:',dataset_2[field_2][i]) + # return False + + logger.info('CHECK: totally same!') + + return True + +def get_now_time(): + import time + from datetime import datetime, timezone, timedelta + dt = datetime.utcnow() + # print(dt) + tzutc_8 = timezone(timedelta(hours=8)) + local_dt = dt.astimezone(tzutc_8) + result = ("_{}_{}_{}__{}_{}_{}".format(local_dt.year, local_dt.month, local_dt.day, local_dt.hour, local_dt.minute, + local_dt.second)) + + return result + + +def get_bigrams(words): + result = [] + for i,w in enumerate(words): + if i!=len(words)-1: + result.append(words[i]+words[i+1]) + else: + result.append(words[i]+'') + + return result + +def print_info(*inp,islog=False,sep=' '): + from fastNLP import logger + if islog: + print(*inp,sep=sep) + else: + inp = sep.join(map(str,inp)) + logger.info(inp) + +def better_init_rnn(rnn,coupled=False): + import torch.nn as nn + if coupled: + repeat_size = 3 + else: + repeat_size = 4 + # print(list(rnn.named_parameters())) + if hasattr(rnn,'num_layers'): + for i in range(rnn.num_layers): + nn.init.orthogonal(getattr(rnn,'weight_ih_l'+str(i)).data) + weight_hh_data = torch.eye(rnn.hidden_size) + weight_hh_data = weight_hh_data.repeat(1, repeat_size) + with torch.no_grad(): + getattr(rnn,'weight_hh_l'+str(i)).set_(weight_hh_data) + nn.init.constant(getattr(rnn,'bias_ih_l'+str(i)).data, val=0) + nn.init.constant(getattr(rnn,'bias_hh_l'+str(i)).data, val=0) + + if rnn.bidirectional: + for i in range(rnn.num_layers): + nn.init.orthogonal(getattr(rnn, 'weight_ih_l' + str(i)+'_reverse').data) + weight_hh_data = torch.eye(rnn.hidden_size) + weight_hh_data = weight_hh_data.repeat(1, repeat_size) + with torch.no_grad(): + getattr(rnn, 'weight_hh_l' + str(i)+'_reverse').set_(weight_hh_data) + nn.init.constant(getattr(rnn, 'bias_ih_l' + str(i)+'_reverse').data, val=0) + nn.init.constant(getattr(rnn, 'bias_hh_l' + str(i)+'_reverse').data, val=0) + + + else: + nn.init.orthogonal(rnn.weight_ih.data) + weight_hh_data = torch.eye(rnn.hidden_size) + weight_hh_data = weight_hh_data.repeat(repeat_size,1) + with torch.no_grad(): + rnn.weight_hh.set_(weight_hh_data) + # The bias is 
just set to zero vectors. + print('rnn param size:{},{}'.format(rnn.weight_hh.size(),type(rnn))) + if rnn.bias: + nn.init.constant(rnn.bias_ih.data, val=0) + nn.init.constant(rnn.bias_hh.data, val=0) + + # print(list(rnn.named_parameters())) + + + + + + diff --git a/reproduction/sequence_labelling/chinese_ner/LatticeLSTM/utils_.py b/reproduction/sequence_labelling/chinese_ner/LatticeLSTM/utils_.py new file mode 100644 index 00000000..dfc05486 --- /dev/null +++ b/reproduction/sequence_labelling/chinese_ner/LatticeLSTM/utils_.py @@ -0,0 +1,405 @@ +import collections +from fastNLP import cache_results +def get_skip_path(chars,w_trie): + sentence = ''.join(chars) + result = w_trie.get_lexicon(sentence) + + return result + +# @cache_results(_cache_fp='cache/get_skip_path_trivial',_refresh=True) +def get_skip_path_trivial(chars,w_list): + chars = ''.join(chars) + w_set = set(w_list) + result = [] + # for i in range(len(chars)): + # result.append([]) + for i in range(len(chars)-1): + for j in range(i+2,len(chars)+1): + if chars[i:j] in w_set: + result.append([i,j-1,chars[i:j]]) + + return result + + +class TrieNode: + def __init__(self): + self.children = collections.defaultdict(TrieNode) + self.is_w = False + +class Trie: + def __init__(self): + self.root = TrieNode() + + def insert(self,w): + + current = self.root + for c in w: + current = current.children[c] + + current.is_w = True + + def search(self,w): + ''' + + :param w: + :return: + -1:not w route + 0:subroute but not word + 1:subroute and word + ''' + current = self.root + + for c in w: + current = current.children.get(c) + + if current is None: + return -1 + + if current.is_w: + return 1 + else: + return 0 + + def get_lexicon(self,sentence): + result = [] + for i in range(len(sentence)): + current = self.root + for j in range(i, len(sentence)): + current = current.children.get(sentence[j]) + if current is None: + break + + if current.is_w: + result.append([i,j,sentence[i:j+1]]) + + return result + +from fastNLP.core.field import Padder +import numpy as np +import torch +from collections import defaultdict +class LatticeLexiconPadder(Padder): + + def __init__(self, pad_val=0, pad_val_dynamic=False,dynamic_offset=0, **kwargs): + ''' + + :param pad_val: + :param pad_val_dynamic: if True, pad_val is the seq_len + :param kwargs: + ''' + self.pad_val = pad_val + self.pad_val_dynamic = pad_val_dynamic + self.dynamic_offset = dynamic_offset + + def __call__(self, contents, field_name, field_ele_dtype, dim: int): + # 与autoPadder中 dim=2 的情况一样 + max_len = max(map(len, contents)) + + max_len = max(max_len,1)#avoid 0 size dim which causes cuda wrong + + max_word_len = max([max([len(content_ii) for content_ii in content_i]) for + content_i in contents]) + + max_word_len = max(max_word_len,1) + if self.pad_val_dynamic: + # print('pad_val_dynamic:{}'.format(max_len-1)) + + array = np.full((len(contents), max_len, max_word_len), max_len-1+self.dynamic_offset, + dtype=field_ele_dtype) + + else: + array = np.full((len(contents), max_len, max_word_len), self.pad_val, dtype=field_ele_dtype) + for i, content_i in enumerate(contents): + for j, content_ii in enumerate(content_i): + array[i, j, :len(content_ii)] = content_ii + array = torch.tensor(array) + + return array + +from fastNLP.core.metrics import MetricBase + +def get_yangjie_bmeso(label_list,ignore_labels=None): + def get_ner_BMESO_yj(label_list): + def reverse_style(input_string): + target_position = input_string.index('[') + input_len = len(input_string) + output_string = 
input_string[target_position:input_len] + input_string[0:target_position] + # print('in:{}.out:{}'.format(input_string, output_string)) + return output_string + + # list_len = len(word_list) + # assert(list_len == len(label_list)), "word list size unmatch with label list" + list_len = len(label_list) + begin_label = 'b-' + end_label = 'e-' + single_label = 's-' + whole_tag = '' + index_tag = '' + tag_list = [] + stand_matrix = [] + for i in range(0, list_len): + # wordlabel = word_list[i] + current_label = label_list[i].lower() + if begin_label in current_label: + if index_tag != '': + tag_list.append(whole_tag + ',' + str(i - 1)) + whole_tag = current_label.replace(begin_label, "", 1) + '[' + str(i) + index_tag = current_label.replace(begin_label, "", 1) + + elif single_label in current_label: + if index_tag != '': + tag_list.append(whole_tag + ',' + str(i - 1)) + whole_tag = current_label.replace(single_label, "", 1) + '[' + str(i) + tag_list.append(whole_tag) + whole_tag = "" + index_tag = "" + elif end_label in current_label: + if index_tag != '': + tag_list.append(whole_tag + ',' + str(i)) + whole_tag = '' + index_tag = '' + else: + continue + if (whole_tag != '') & (index_tag != ''): + tag_list.append(whole_tag) + tag_list_len = len(tag_list) + + for i in range(0, tag_list_len): + if len(tag_list[i]) > 0: + tag_list[i] = tag_list[i] + ']' + insert_list = reverse_style(tag_list[i]) + stand_matrix.append(insert_list) + # print stand_matrix + return stand_matrix + + def transform_YJ_to_fastNLP(span): + span = span[1:] + span_split = span.split(']') + # print('span_list:{}'.format(span_split)) + span_type = span_split[1] + # print('span_split[0].split(','):{}'.format(span_split[0].split(','))) + if ',' in span_split[0]: + b, e = span_split[0].split(',') + else: + b = span_split[0] + e = b + + b = int(b) + e = int(e) + + e += 1 + + return (span_type, (b, e)) + yj_form = get_ner_BMESO_yj(label_list) + # print('label_list:{}'.format(label_list)) + # print('yj_from:{}'.format(yj_form)) + fastNLP_form = list(map(transform_YJ_to_fastNLP,yj_form)) + return fastNLP_form +class SpanFPreRecMetric_YJ(MetricBase): + r""" + 别名::class:`fastNLP.SpanFPreRecMetric` :class:`fastNLP.core.metrics.SpanFPreRecMetric` + + 在序列标注问题中,以span的方式计算F, pre, rec. + 比如中文Part of speech中,会以character的方式进行标注,句子 `中国在亚洲` 对应的POS可能为(以BMES为例) + ['B-NN', 'E-NN', 'S-DET', 'B-NN', 'E-NN']。该metric就是为类似情况下的F1计算。 + 最后得到的metric结果为:: + + { + 'f': xxx, # 这里使用f考虑以后可以计算f_beta值 + 'pre': xxx, + 'rec':xxx + } + + 若only_gross=False, 即还会返回各个label的metric统计值:: + + { + 'f': xxx, + 'pre': xxx, + 'rec':xxx, + 'f-label': xxx, + 'pre-label': xxx, + 'rec-label':xxx, + ... + } + + :param tag_vocab: 标签的 :class:`~fastNLP.Vocabulary` 。支持的标签为"B"(没有label);或"B-xxx"(xxx为某种label,比如POS中的NN), + 在解码时,会将相同xxx的认为是同一个label,比如['B-NN', 'E-NN']会被合并为一个'NN'. + :param str pred: 用该key在evaluate()时从传入dict中取出prediction数据。 为None,则使用 `pred` 取数据 + :param str target: 用该key在evaluate()时从传入dict中取出target数据。 为None,则使用 `target` 取数据 + :param str seq_len: 用该key在evaluate()时从传入dict中取出sequence length数据。为None,则使用 `seq_len` 取数据。 + :param str encoding_type: 目前支持bio, bmes, bmeso, bioes + :param list ignore_labels: str 组成的list. 这个list中的class不会被用于计算。例如在POS tagging时传入['NN'],则不会计算'NN'这 + 个label + :param bool only_gross: 是否只计算总的f1, precision, recall的值;如果为False,不仅返回总的f1, pre, rec, 还会返回每个 + label的f1, pre, rec + :param str f_type: `micro` 或 `macro` . 
`micro` :通过先计算总体的TP,FN和FP的数量,再计算f, precision, recall; `macro` : + 分布计算每个类别的f, precision, recall,然后做平均(各类别f的权重相同) + :param float beta: f_beta分数, :math:`f_{beta} = \frac{(1 + {beta}^{2})*(pre*rec)}{({beta}^{2}*pre + rec)}` . + 常用为beta=0.5, 1, 2. 若为0.5则精确率的权重高于召回率;若为1,则两者平等;若为2,则召回率权重高于精确率。 + """ + def __init__(self, tag_vocab, pred=None, target=None, seq_len=None, encoding_type='bio', ignore_labels=None, + only_gross=True, f_type='micro', beta=1): + from fastNLP.core import Vocabulary + from fastNLP.core.metrics import _bmes_tag_to_spans,_bio_tag_to_spans,\ + _bioes_tag_to_spans,_bmeso_tag_to_spans + from collections import defaultdict + + encoding_type = encoding_type.lower() + + if not isinstance(tag_vocab, Vocabulary): + raise TypeError("tag_vocab can only be fastNLP.Vocabulary, not {}.".format(type(tag_vocab))) + if f_type not in ('micro', 'macro'): + raise ValueError("f_type only supports `micro` or `macro`', got {}.".format(f_type)) + + self.encoding_type = encoding_type + # print('encoding_type:{}'self.encoding_type) + if self.encoding_type == 'bmes': + self.tag_to_span_func = _bmes_tag_to_spans + elif self.encoding_type == 'bio': + self.tag_to_span_func = _bio_tag_to_spans + elif self.encoding_type == 'bmeso': + self.tag_to_span_func = _bmeso_tag_to_spans + elif self.encoding_type == 'bioes': + self.tag_to_span_func = _bioes_tag_to_spans + elif self.encoding_type == 'bmesoyj': + self.tag_to_span_func = get_yangjie_bmeso + # self.tag_to_span_func = + else: + raise ValueError("Only support 'bio', 'bmes', 'bmeso' type.") + + self.ignore_labels = ignore_labels + self.f_type = f_type + self.beta = beta + self.beta_square = self.beta ** 2 + self.only_gross = only_gross + + super().__init__() + self._init_param_map(pred=pred, target=target, seq_len=seq_len) + + self.tag_vocab = tag_vocab + + self._true_positives = defaultdict(int) + self._false_positives = defaultdict(int) + self._false_negatives = defaultdict(int) + + def evaluate(self, pred, target, seq_len): + from fastNLP.core.utils import _get_func_signature + """evaluate函数将针对一个批次的预测结果做评价指标的累计 + + :param pred: [batch, seq_len] 或者 [batch, seq_len, len(tag_vocab)], 预测的结果 + :param target: [batch, seq_len], 真实值 + :param seq_len: [batch] 文本长度标记 + :return: + """ + if not isinstance(pred, torch.Tensor): + raise TypeError(f"`pred` in {_get_func_signature(self.evaluate)} must be torch.Tensor," + f"got {type(pred)}.") + if not isinstance(target, torch.Tensor): + raise TypeError(f"`target` in {_get_func_signature(self.evaluate)} must be torch.Tensor," + f"got {type(target)}.") + + if not isinstance(seq_len, torch.Tensor): + raise TypeError(f"`seq_lens` in {_get_func_signature(self.evaluate)} must be torch.Tensor," + f"got {type(seq_len)}.") + + if pred.size() == target.size() and len(target.size()) == 2: + pass + elif len(pred.size()) == len(target.size()) + 1 and len(target.size()) == 2: + num_classes = pred.size(-1) + pred = pred.argmax(dim=-1) + if (target >= num_classes).any(): + raise ValueError("A gold label passed to SpanBasedF1Metric contains an " + "id >= {}, the number of classes.".format(num_classes)) + else: + raise RuntimeError(f"In {_get_func_signature(self.evaluate)}, when pred have " + f"size:{pred.size()}, target should have size: {pred.size()} or " + f"{pred.size()[:-1]}, got {target.size()}.") + + batch_size = pred.size(0) + pred = pred.tolist() + target = target.tolist() + for i in range(batch_size): + pred_tags = pred[i][:int(seq_len[i])] + gold_tags = target[i][:int(seq_len[i])] + + pred_str_tags = 
[self.tag_vocab.to_word(tag) for tag in pred_tags] + gold_str_tags = [self.tag_vocab.to_word(tag) for tag in gold_tags] + + pred_spans = self.tag_to_span_func(pred_str_tags, ignore_labels=self.ignore_labels) + gold_spans = self.tag_to_span_func(gold_str_tags, ignore_labels=self.ignore_labels) + + for span in pred_spans: + if span in gold_spans: + self._true_positives[span[0]] += 1 + gold_spans.remove(span) + else: + self._false_positives[span[0]] += 1 + for span in gold_spans: + self._false_negatives[span[0]] += 1 + + def get_metric(self, reset=True): + """get_metric函数将根据evaluate函数累计的评价指标统计量来计算最终的评价结果.""" + evaluate_result = {} + if not self.only_gross or self.f_type == 'macro': + tags = set(self._false_negatives.keys()) + tags.update(set(self._false_positives.keys())) + tags.update(set(self._true_positives.keys())) + f_sum = 0 + pre_sum = 0 + rec_sum = 0 + for tag in tags: + tp = self._true_positives[tag] + fn = self._false_negatives[tag] + fp = self._false_positives[tag] + f, pre, rec = self._compute_f_pre_rec(tp, fn, fp) + f_sum += f + pre_sum += pre + rec_sum += rec + if not self.only_gross and tag != '': # tag!=''防止无tag的情况 + f_key = 'f-{}'.format(tag) + pre_key = 'pre-{}'.format(tag) + rec_key = 'rec-{}'.format(tag) + evaluate_result[f_key] = f + evaluate_result[pre_key] = pre + evaluate_result[rec_key] = rec + + if self.f_type == 'macro': + evaluate_result['f'] = f_sum / len(tags) + evaluate_result['pre'] = pre_sum / len(tags) + evaluate_result['rec'] = rec_sum / len(tags) + + if self.f_type == 'micro': + f, pre, rec = self._compute_f_pre_rec(sum(self._true_positives.values()), + sum(self._false_negatives.values()), + sum(self._false_positives.values())) + evaluate_result['f'] = f + evaluate_result['pre'] = pre + evaluate_result['rec'] = rec + + if reset: + self._true_positives = defaultdict(int) + self._false_positives = defaultdict(int) + self._false_negatives = defaultdict(int) + + for key, value in evaluate_result.items(): + evaluate_result[key] = round(value, 6) + + return evaluate_result + + def _compute_f_pre_rec(self, tp, fn, fp): + """ + + :param tp: int, true positive + :param fn: int, false negative + :param fp: int, false positive + :return: (f, pre, rec) + """ + pre = tp / (fp + tp + 1e-13) + rec = tp / (fn + tp + 1e-13) + f = (1 + self.beta_square) * pre * rec / (self.beta_square * pre + rec + 1e-13) + + return f, pre, rec + + + + diff --git a/reproduction/sequence_labelling/chinese_ner/readme.md b/reproduction/sequence_labelling/chinese_ner/readme.md new file mode 100644 index 00000000..3a9d37d8 --- /dev/null +++ b/reproduction/sequence_labelling/chinese_ner/readme.md @@ -0,0 +1,30 @@ +使用以下中文NERPipe自动下载的统计数据 + +| MsraNERPipe | # of sents | # of tokens | +| ----------- | ---------- | ----------- | +| train | 41747 | 1954374 | +| dev | 4617 | 215505 | +| test | 4365 | 172601 | +| total | 50729 | 2342480 | +这里报道的统计数据,与[https://arxiv.org/pdf/1805.02023.pdf]()报道的一致 + + + +| WeiboNERPipe | # of sents | # of tokens | +| ------------ | ---------- | ----------- | +| train | 1350 | 73778 | +| dev | 270 | 14509 | +| test | 270 | 14842 | +| total | 1890 | 1890 | +这里报道的统计数据与[https://www.cs.cmu.edu/~ark/EMNLP-2015/proceedings/EMNLP/pdf/EMNLP064.pdf]()一致 + + + + +| PeopleDailyPipe | # of sents | # of tokens | +| --------------- | ---------- | ----------- | +| train | 50658 | 2169879 | +| dev | 4631 | 172601 | +| test | 68 | 2270 | +| total | 55357 | 2344750 | +这里使用的数据与[https://arxiv.org/pdf/1906.08101.pdf]()的数据是一致的 diff --git a/reproduction/seqence_labelling/chinese_ner/train_bert.py 
b/reproduction/sequence_labelling/chinese_ner/train_bert.py similarity index 65% rename from reproduction/seqence_labelling/chinese_ner/train_bert.py rename to reproduction/sequence_labelling/chinese_ner/train_bert.py index a34b7d01..b12c8f75 100644 --- a/reproduction/seqence_labelling/chinese_ner/train_bert.py +++ b/reproduction/sequence_labelling/chinese_ner/train_bert.py @@ -12,22 +12,23 @@ sys.path.append('../../../') from torch import nn from fastNLP.embeddings import BertEmbedding, Embedding -from reproduction.seqence_labelling.chinese_ner.data.ChineseNER import ChineseNERLoader from fastNLP import Trainer, Const from fastNLP import BucketSampler, SpanFPreRecMetric, GradientClipCallback from fastNLP.modules import MLP from fastNLP.core.callback import WarmupCallback from fastNLP import CrossEntropyLoss from fastNLP.core.optimizer import AdamW -import os +from fastNLP.io import MsraNERPipe, MsraNERLoader, WeiboNERPipe from fastNLP import cache_results encoding_type = 'bio' -@cache_results('caches/msra.pkl') +@cache_results('caches/weibo.pkl', _refresh=False) def get_data(): - data = ChineseNERLoader(encoding_type=encoding_type).process("MSRA/") + # data_dir = MsraNERLoader().download(dev_ratio=0) + # data = MsraNERPipe(encoding_type=encoding_type, target_pad_val=-100).process_from_file(data_dir) + data = WeiboNERPipe(encoding_type=encoding_type).process_from_file() return data data = get_data() print(data) @@ -35,10 +36,10 @@ print(data) class BertCNNER(nn.Module): def __init__(self, embed, tag_size): super().__init__() - - self.embedding = Embedding(embed, dropout=0.1) + self.embedding = embed self.tag_size = tag_size self.mlp = MLP(size_layer=[self.embedding.embedding_dim, tag_size]) + def forward(self, chars): # batch_size, max_len = words.size() chars = self.embedding(chars) @@ -46,11 +47,15 @@ class BertCNNER(nn.Module): return {Const.OUTPUT: outputs} -embed = BertEmbedding(data.vocabs[Const.CHAR_INPUT], model_dir_or_name='en-base', - pool_method='max', requires_grad=True, layers='11') + def predict(self, chars): + # batch_size, max_len = words.size() + chars = self.embedding(chars) + outputs = self.mlp(chars) -for name, dataset in data.datasets.items(): - dataset.set_pad_val(Const.TARGET, -100) + return {Const.OUTPUT: outputs} + +embed = BertEmbedding(data.get_vocab(Const.CHAR_INPUT), model_dir_or_name='cn-wwm-ext', + pool_method='first', requires_grad=True, layers='11', include_cls_sep=False, dropout=0.5) callbacks = [ GradientClipCallback(clip_type='norm', clip_value=1), @@ -58,7 +63,7 @@ callbacks = [ ] model = BertCNNER(embed, len(data.vocabs[Const.TARGET])) -optimizer = AdamW(model.parameters(), lr=1e-4) +optimizer = AdamW(model.parameters(), lr=3e-5) for name, dataset in data.datasets.items(): original_len = len(dataset) @@ -66,13 +71,11 @@ for name, dataset in data.datasets.items(): clipped_len = len(dataset) print("Delete {} instances in {}.".format(original_len-clipped_len, name)) -os.environ['CUDA_VISIBLE_DEVICES'] = '0,1' - trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer, sampler=BucketSampler(), - device=[0, 1], dev_data=data.datasets['test'], batch_size=20, + device=0, dev_data=data.datasets['test'], batch_size=6, metrics=SpanFPreRecMetric(tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type), loss=CrossEntropyLoss(reduction='sum'), callbacks=callbacks, num_workers=2, n_epochs=5, - check_code_level=-1, update_every=3) + check_code_level=0, update_every=3) trainer.train() diff --git 
a/reproduction/seqence_labelling/chinese_ner/train_cn_ner.py b/reproduction/sequence_labelling/chinese_ner/train_cn_ner.py similarity index 58% rename from reproduction/seqence_labelling/chinese_ner/train_cn_ner.py rename to reproduction/sequence_labelling/chinese_ner/train_cn_ner.py index 53a85186..58b32265 100644 --- a/reproduction/seqence_labelling/chinese_ner/train_cn_ner.py +++ b/reproduction/sequence_labelling/chinese_ner/train_cn_ner.py @@ -1,7 +1,6 @@ +import sys +sys.path.append('../../..') - - -from reproduction.seqence_labelling.chinese_ner.data.ChineseNER import ChineseNERLoader from fastNLP.embeddings import StaticEmbedding from torch import nn @@ -14,7 +13,51 @@ import torch.nn.functional as F from fastNLP import seq_len_to_mask from fastNLP.core.const import Const as C from fastNLP import SpanFPreRecMetric, Trainer -from fastNLP import cache_results +from fastNLP import cache_results, Vocabulary +from fastNLP.io.pipe.utils import _add_chars_field, _indexize + +from fastNLP.io.pipe import Pipe +from fastNLP.core.utils import iob2bioes, iob2 +from fastNLP.io import MsraNERLoader, WeiboNERLoader + +class ChineseNERPipe(Pipe): + def __init__(self, encoding_type: str = 'bio', target_pad_val=0, bigram=False): + if encoding_type == 'bio': + self.convert_tag = iob2 + else: + self.convert_tag = lambda words: iob2bioes(iob2(words)) + self.target_pad_val = int(target_pad_val) + self.bigram = bigram + + def process(self, data_bundle): + data_bundle.copy_field(C.RAW_CHAR, C.CHAR_INPUT) + input_fields = [C.TARGET, C.CHAR_INPUT, C.INPUT_LEN] + target_fields = [C.TARGET, C.INPUT_LEN] + if self.bigram: + for dataset in data_bundle.datasets.values(): + dataset.apply_field(lambda chars:[c1+c2 for c1, c2 in zip(chars, chars[1:]+[''])], + field_name=C.CHAR_INPUT, new_field_name='bigrams') + bigram_vocab = Vocabulary() + bigram_vocab.from_dataset(data_bundle.get_dataset('train'),field_name='bigrams', + no_create_entry_dataset=[ds for name, ds in data_bundle.datasets.items() if name!='train']) + bigram_vocab.index_dataset(*data_bundle.datasets.values(), field_name='bigrams') + data_bundle.set_vocab(bigram_vocab, field_name='bigrams') + input_fields.append('bigrams') + + _add_chars_field(data_bundle, lower=False) + + # index + _indexize(data_bundle, input_field_names=C.CHAR_INPUT, target_field_names=C.TARGET) + + for name, dataset in data_bundle.datasets.items(): + dataset.set_pad_val(C.TARGET, self.target_pad_val) + dataset.add_seq_len(C.CHAR_INPUT) + + data_bundle.set_input(*input_fields) + data_bundle.set_target(*target_fields) + + return data_bundle + class CNBiLSTMCRFNER(nn.Module): def __init__(self, char_embed, num_classes, bigram_embed=None, trigram_embed=None, num_layers=1, hidden_size=100, @@ -73,22 +116,21 @@ class CNBiLSTMCRFNER(nn.Module): return self._forward(chars, bigrams, trigrams, seq_len) # data_bundle = pickle.load(open('caches/msra.pkl', 'rb')) -@cache_results('caches/msra.pkl', _refresh=True) +@cache_results('caches/weibo-lstm.pkl', _refresh=False) def get_data(): - data_bundle = ChineseNERLoader().process('MSRA-NER/', bigrams=True) - char_embed = StaticEmbedding(data_bundle.vocabs['chars'], - model_dir_or_name='cn-char') - bigram_embed = StaticEmbedding(data_bundle.vocabs['bigrams'], - model_dir_or_name='cn-bigram') + data_bundle = WeiboNERLoader().load() + data_bundle = ChineseNERPipe(encoding_type='bioes', bigram=True).process(data_bundle) + char_embed = StaticEmbedding(data_bundle.get_vocab(C.CHAR_INPUT), model_dir_or_name='cn-fasttext') + bigram_embed = 
StaticEmbedding(data_bundle.get_vocab('bigrams'), embedding_dim=100, min_freq=3) return data_bundle, char_embed, bigram_embed data_bundle, char_embed, bigram_embed = get_data() +# data_bundle = get_data() print(data_bundle) + # exit(0) -data_bundle.datasets['train'].set_input('target') -data_bundle.datasets['dev'].set_input('target') model = CNBiLSTMCRFNER(char_embed, num_classes=len(data_bundle.vocabs['target']), bigram_embed=bigram_embed) -Trainer(data_bundle.datasets['train'], model, batch_size=640, +Trainer(data_bundle.datasets['train'], model, batch_size=20, metrics=SpanFPreRecMetric(data_bundle.vocabs['target'], encoding_type='bioes'), - num_workers=2, dev_data=data_bundle. datasets['dev'], device=3).train() + num_workers=2, dev_data=data_bundle. datasets['dev'], device=0).train() diff --git a/reproduction/sequence_labelling/cws/data/cws_shift_pipe.py b/reproduction/sequence_labelling/cws/data/cws_shift_pipe.py new file mode 100644 index 00000000..0ae4064d --- /dev/null +++ b/reproduction/sequence_labelling/cws/data/cws_shift_pipe.py @@ -0,0 +1,202 @@ +from fastNLP.io.pipe import Pipe +from fastNLP.io import DataBundle +from fastNLP.io.loader import CWSLoader +from fastNLP import Const +from itertools import chain +from fastNLP.io.pipe.utils import _indexize +from functools import partial +from fastNLP.io.pipe.cws import _find_and_replace_alpha_spans, _find_and_replace_digit_spans + + +def _word_lens_to_relay(word_lens): + """ + [1, 2, 3, ..] 转换为[0, 1, 0, 2, 1, 0,](start指示seg有多长); + :param word_lens: + :return: + """ + tags = [] + for word_len in word_lens: + tags.extend([idx for idx in range(word_len - 1, -1, -1)]) + return tags + +def _word_lens_to_end_seg_mask(word_lens): + """ + [1, 2, 3, ..] 转换为[0, 1, 0, 2, 1, 0,](start指示seg有多长); + :param word_lens: + :return: + """ + end_seg_mask = [] + for word_len in word_lens: + end_seg_mask.extend([0] * (word_len - 1) + [1]) + return end_seg_mask + +def _word_lens_to_start_seg_mask(word_lens): + """ + [1, 2, 3, ..] 转换为[0, 1, 0, 2, 1, 0,](start指示seg有多长); + :param word_lens: + :return: + """ + start_seg_mask = [] + for word_len in word_lens: + start_seg_mask.extend([1] + [0] * (word_len - 1)) + return start_seg_mask + + +class CWSShiftRelayPipe(Pipe): + """ + + :param str,None dataset_name: 支持'pku', 'msra', 'cityu', 'as', None + :param int L: ShiftRelay模型的超参数 + :param bool replace_num_alpha: 是否将数字和字母用特殊字符替换。 + :param bool bigrams: 是否增加一列bigram. bigram的构成是['复', '旦', '大', '学', ...]->["复旦", "旦大", ...] + :param bool trigrams: 是否增加一列trigram. trigram的构成是 ['复', '旦', '大', '学', ...]->["复旦大", "旦大学", ...] + """ + def __init__(self, dataset_name=None, L=5, replace_num_alpha=True, bigrams=True): + self.dataset_name = dataset_name + self.bigrams = bigrams + self.replace_num_alpha = replace_num_alpha + self.L = L + + def _tokenize(self, data_bundle): + """ + 将data_bundle中的'chars'列切分成一个一个的word. 
+ 例如输入是"共同 创造 美好.."->[[共, 同], [创, 造], [...], ] + + :param data_bundle: + :return: + """ + def split_word_into_chars(raw_chars): + words = raw_chars.split() + chars = [] + for word in words: + char = [] + subchar = [] + for c in word: + if c=='<': + subchar.append(c) + continue + if c=='>' and subchar[0]=='<': + char.append(''.join(subchar)) + subchar = [] + if subchar: + subchar.append(c) + else: + char.append(c) + char.extend(subchar) + chars.append(char) + return chars + + for name, dataset in data_bundle.datasets.items(): + dataset.apply_field(split_word_into_chars, field_name=Const.CHAR_INPUT, + new_field_name=Const.CHAR_INPUT) + return data_bundle + + def process(self, data_bundle: DataBundle) -> DataBundle: + """ + 可以处理的DataSet需要包含raw_words列 + + .. csv-table:: + :header: "raw_words" + + "上海 浦东 开发 与 法制 建设 同步" + "新华社 上海 二月 十日 电 ( 记者 谢金虎 、 张持坚 )" + "..." + + :param data_bundle: + :return: + """ + data_bundle.copy_field(Const.RAW_WORD, Const.CHAR_INPUT) + + if self.replace_num_alpha: + data_bundle.apply_field(_find_and_replace_alpha_spans, Const.CHAR_INPUT, Const.CHAR_INPUT) + data_bundle.apply_field(_find_and_replace_digit_spans, Const.CHAR_INPUT, Const.CHAR_INPUT) + + self._tokenize(data_bundle) + input_field_names = [Const.CHAR_INPUT] + target_field_names = [] + + for name, dataset in data_bundle.datasets.items(): + dataset.apply_field(lambda chars:_word_lens_to_relay(map(len, chars)), field_name=Const.CHAR_INPUT, + new_field_name=Const.TARGET) + dataset.apply_field(lambda chars:_word_lens_to_start_seg_mask(map(len, chars)), field_name=Const.CHAR_INPUT, + new_field_name='start_seg_mask') + dataset.apply_field(lambda chars:_word_lens_to_end_seg_mask(map(len, chars)), field_name=Const.CHAR_INPUT, + new_field_name='end_seg_mask') + dataset.apply_field(lambda chars:list(chain(*chars)), field_name=Const.CHAR_INPUT, + new_field_name=Const.CHAR_INPUT) + target_field_names.append('start_seg_mask') + input_field_names.append('end_seg_mask') + if self.bigrams: + for name, dataset in data_bundle.datasets.items(): + dataset.apply_field(lambda chars: [c1+c2 for c1, c2 in zip(chars, chars[1:]+[''])], + field_name=Const.CHAR_INPUT, new_field_name='bigrams') + input_field_names.append('bigrams') + + _indexize(data_bundle, ['chars', 'bigrams'], []) + + func = partial(_clip_target, L=self.L) + for name, dataset in data_bundle.datasets.items(): + res = dataset.apply_field(func, field_name='target') + relay_target = [res_i[0] for res_i in res] + relay_mask = [res_i[1] for res_i in res] + dataset.add_field('relay_target', relay_target, is_input=True, is_target=False, ignore_type=False) + dataset.add_field('relay_mask', relay_mask, is_input=True, is_target=False, ignore_type=False) + input_field_names.append('relay_target') + input_field_names.append('relay_mask') + + input_fields = [Const.TARGET, Const.INPUT_LEN] + input_field_names + target_fields = [Const.TARGET, Const.INPUT_LEN] + target_field_names + for name, dataset in data_bundle.datasets.items(): + dataset.add_seq_len(Const.CHAR_INPUT) + + data_bundle.set_input(*input_fields) + data_bundle.set_target(*target_fields) + + return data_bundle + + def process_from_file(self, paths=None) -> DataBundle: + """ + + :param str paths: + :return: + """ + if self.dataset_name is None and paths is None: + raise RuntimeError("You have to set `paths` when calling process_from_file() or `dataset_name `when initialization.") + if self.dataset_name is not None and paths is not None: + raise RuntimeError("You cannot specify `paths` and `dataset_name` simultaneously") 
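+ # CWSLoader below fills the raw_words field; process() then derives the
+ # shift-relay supervision from the gold word lengths. As an illustration
+ # (numbers not from the source), word lengths [1, 2, 3] give
+ # target (relay)  [0, 1, 0, 2, 1, 0]  -- each char counts down to its word end,
+ # start_seg_mask  [1, 1, 0, 1, 0, 0]  and  end_seg_mask  [1, 0, 1, 0, 0, 1].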
+ data_bundle = CWSLoader(self.dataset_name).load(paths) + return self.process(data_bundle) + +def _clip_target(target, L:int): + """ + + 只有在target_type为shift_relay的使用 + :param target: List[int] + :param L: + :return: + """ + relay_target_i = [] + tmp = [] + for j in range(len(target) - 1): + tmp.append(target[j]) + if target[j] > target[j + 1]: + pass + else: + relay_target_i.extend([L - 1 if t >= L else t for t in tmp[::-1]]) + tmp = [] + # 处理未结束的部分 + if len(tmp) == 0: + relay_target_i.append(0) + else: + tmp.append(target[-1]) + relay_target_i.extend([L - 1 if t >= L else t for t in tmp[::-1]]) + relay_mask_i = [] + j = 0 + while j < len(target): + seg_len = target[j] + 1 + if target[j] < L: + relay_mask_i.extend([0] * (seg_len)) + else: + relay_mask_i.extend([1] * (seg_len - L) + [0] * L) + j = seg_len + j + return relay_target_i, relay_mask_i diff --git a/reproduction/sequence_labelling/cws/model/bilstm_crf_cws.py b/reproduction/sequence_labelling/cws/model/bilstm_crf_cws.py new file mode 100644 index 00000000..4f87a81c --- /dev/null +++ b/reproduction/sequence_labelling/cws/model/bilstm_crf_cws.py @@ -0,0 +1,60 @@ + +import torch +from fastNLP.modules import LSTM +from fastNLP.modules import allowed_transitions, ConditionalRandomField +from fastNLP import seq_len_to_mask +from torch import nn +from fastNLP import Const +import torch.nn.functional as F + +class BiLSTMCRF(nn.Module): + def __init__(self, char_embed, hidden_size, num_layers, target_vocab=None, bigram_embed=None, trigram_embed=None, + dropout=0.5): + super().__init__() + + embed_size = char_embed.embed_size + self.char_embed = char_embed + if bigram_embed: + embed_size += bigram_embed.embed_size + self.bigram_embed = bigram_embed + if trigram_embed: + embed_size += trigram_embed.embed_size + self.trigram_embed = trigram_embed + + self.lstm = LSTM(embed_size, hidden_size=hidden_size//2, bidirectional=True, batch_first=True, + num_layers=num_layers) + self.dropout = nn.Dropout(p=dropout) + self.fc = nn.Linear(hidden_size, len(target_vocab)) + + transitions = None + if target_vocab: + transitions = allowed_transitions(target_vocab, include_start_end=True, encoding_type='bmes') + + self.crf = ConditionalRandomField(num_tags=len(target_vocab), allowed_transitions=transitions) + + def _forward(self, chars, bigrams, trigrams, seq_len, target=None): + chars = self.char_embed(chars) + if bigrams is not None: + bigrams = self.bigram_embed(bigrams) + chars = torch.cat([chars, bigrams], dim=-1) + if trigrams is not None: + trigrams = self.trigram_embed(trigrams) + chars = torch.cat([chars, trigrams], dim=-1) + + output, _ = self.lstm(chars, seq_len) + output = self.dropout(output) + output = self.fc(output) + output = F.log_softmax(output, dim=-1) + mask = seq_len_to_mask(seq_len) + if target is None: + pred, _ = self.crf.viterbi_decode(output, mask) + return {Const.OUTPUT:pred} + else: + loss = self.crf.forward(output, tags=target, mask=mask) + return {Const.LOSS:loss} + + def forward(self, chars, seq_len, target, bigrams=None, trigrams=None): + return self._forward(chars, bigrams, trigrams, seq_len, target) + + def predict(self, chars, seq_len, bigrams=None, trigrams=None): + return self._forward(chars, bigrams, trigrams, seq_len) \ No newline at end of file diff --git a/reproduction/seqence_labelling/cws/model/model.py b/reproduction/sequence_labelling/cws/model/bilstm_shift_relay.py similarity index 72% rename from reproduction/seqence_labelling/cws/model/model.py rename to 
reproduction/sequence_labelling/cws/model/bilstm_shift_relay.py index de945ac3..efba5c41 100644 --- a/reproduction/seqence_labelling/cws/model/model.py +++ b/reproduction/sequence_labelling/cws/model/bilstm_shift_relay.py @@ -1,8 +1,6 @@ from torch import nn import torch -from fastNLP.embeddings import Embedding -import numpy as np -from reproduction.seqence_labelling.cws.model.module import FeatureFunMax, SemiCRFShiftRelay +from reproduction.sequence_labelling.cws.model.module import FeatureFunMax, SemiCRFShiftRelay from fastNLP.modules import LSTM class ShiftRelayCWSModel(nn.Module): @@ -21,25 +19,21 @@ class ShiftRelayCWSModel(nn.Module): :param num_bigram_per_char: 每个character对应的bigram的数量 :param drop_p: Dropout的大小 """ - def __init__(self, char_embed:Embedding, bigram_embed:Embedding, hidden_size:int=400, num_layers:int=1, - L:int=6, num_bigram_per_char:int=1, drop_p:float=0.2): + def __init__(self, char_embed, bigram_embed, hidden_size:int=400, num_layers:int=1, L:int=6, drop_p:float=0.2): super().__init__() - self.char_embedding = Embedding(char_embed, dropout=drop_p) - self._pretrained_embed = False - if isinstance(char_embed, np.ndarray): - self._pretrained_embed = True - self.bigram_embedding = Embedding(bigram_embed, dropout=drop_p) - self.lstm = LSTM(100 * (num_bigram_per_char + 1), hidden_size // 2, num_layers=num_layers, bidirectional=True, + self.char_embedding = char_embed + self.bigram_embedding = bigram_embed + self.lstm = LSTM(char_embed.embed_size+bigram_embed.embed_size, hidden_size // 2, num_layers=num_layers, + bidirectional=True, batch_first=True) self.feature_fn = FeatureFunMax(hidden_size, L) self.semi_crf_relay = SemiCRFShiftRelay(L) self.feat_drop = nn.Dropout(drop_p) self.reset_param() - # self.feature_fn.reset_parameters() def reset_param(self): for name, param in self.named_parameters(): - if 'embedding' in name and self._pretrained_embed: + if 'embedding' in name: continue if 'bias_hh' in name: nn.init.constant_(param, 0) @@ -51,10 +45,8 @@ class ShiftRelayCWSModel(nn.Module): nn.init.xavier_uniform_(param) def get_feats(self, chars, bigrams, seq_len): - batch_size, max_len = chars.size() chars = self.char_embedding(chars) bigrams = self.bigram_embedding(bigrams) - bigrams = bigrams.view(bigrams.size(0), max_len, -1) chars = torch.cat([chars, bigrams], dim=-1) feats, _ = self.lstm(chars, seq_len) feats = self.feat_drop(feats) diff --git a/reproduction/seqence_labelling/cws/model/metric.py b/reproduction/sequence_labelling/cws/model/metric.py similarity index 100% rename from reproduction/seqence_labelling/cws/model/metric.py rename to reproduction/sequence_labelling/cws/model/metric.py diff --git a/reproduction/seqence_labelling/cws/model/module.py b/reproduction/sequence_labelling/cws/model/module.py similarity index 100% rename from reproduction/seqence_labelling/cws/model/module.py rename to reproduction/sequence_labelling/cws/model/module.py diff --git a/reproduction/sequence_labelling/cws/readme.md b/reproduction/sequence_labelling/cws/readme.md new file mode 100644 index 00000000..a25bb0ed --- /dev/null +++ b/reproduction/sequence_labelling/cws/readme.md @@ -0,0 +1,32 @@ +四个数据集的统计信息,最原始的数据可以从[http://sighan.cs.uchicago.edu/bakeoff2005/]()下载。 + +| pku | # of sents | # of tokens | +| ----- | ---------- | ----------- | +| train | 17173 | 1650222 | +| dev | 1881 | 176226 | +| test | 1944 | 172733 | +| total | 20998 | 1999181 | + + +| cityu | # of sents | # of tokens | +| ----- | ---------- | ----------- | +| train | 47696 | 2164907 | +| dev | 5323 | 238447 | 
+| test | 1492 | 67690 | +| total | 54511 | 2471044 | + + +| msra | # of sents | # of tokens | +| ----- | ---------- | ----------- | +| train | 78242 | 3644550 | +| dev | 8676 | 405919 | +| test | 3985 | 184355 | +| total | 90903 | 4234824 | + + +| as | # of sents | # of tokens | +| ----- | ---------- | ----------- | +| train | 638273 | 7536586 | +| dev | 70680 | 831464 | +| test | 14429 | 197681 | +| total | 723382 | 8565731 | diff --git a/reproduction/sequence_labelling/cws/train_bilstm_crf.py b/reproduction/sequence_labelling/cws/train_bilstm_crf.py new file mode 100644 index 00000000..30760d8f --- /dev/null +++ b/reproduction/sequence_labelling/cws/train_bilstm_crf.py @@ -0,0 +1,52 @@ +import sys +sys.path.append('../../..') + +from fastNLP.io.pipe.cws import CWSPipe +from reproduction.sequence_labelling.cws.model.bilstm_crf_cws import BiLSTMCRF +from fastNLP import Trainer, cache_results +from fastNLP.embeddings import StaticEmbedding +from fastNLP import EvaluateCallback, BucketSampler, SpanFPreRecMetric, GradientClipCallback +from torch.optim import Adagrad + +###########hyper +dataname = 'pku' +hidden_size = 400 +num_layers = 1 +lr = 0.05 +###########hyper + + +@cache_results('{}.pkl'.format(dataname), _refresh=False) +def get_data(): + data_bundle = CWSPipe(dataset_name=dataname, bigrams=True, trigrams=False).process_from_file() + char_embed = StaticEmbedding(data_bundle.get_vocab('chars'), dropout=0.33, word_dropout=0.01, + model_dir_or_name='~/exps/CWS/pretrain/vectors/1grams_t3_m50_corpus.txt') + bigram_embed = StaticEmbedding(data_bundle.get_vocab('bigrams'), dropout=0.33,min_freq=3, word_dropout=0.01, + model_dir_or_name='~/exps/CWS/pretrain/vectors/2grams_t3_m50_corpus.txt') + return data_bundle, char_embed, bigram_embed + +data_bundle, char_embed, bigram_embed = get_data() +print(data_bundle) + +model = BiLSTMCRF(char_embed, hidden_size, num_layers, target_vocab=data_bundle.get_vocab('target'), bigram_embed=bigram_embed, + trigram_embed=None, dropout=0.3) +model.cuda() + +callbacks = [] +callbacks.append(EvaluateCallback(data_bundle.get_dataset('test'))) +callbacks.append(GradientClipCallback(clip_type='value', clip_value=5)) +optimizer = Adagrad(model.parameters(), lr=lr) + +metrics = [] +metric1 = SpanFPreRecMetric(tag_vocab=data_bundle.get_vocab('target'), encoding_type='bmes') +metrics.append(metric1) + +trainer = Trainer(data_bundle.get_dataset('train'), model, optimizer=optimizer, loss=None, + batch_size=128, sampler=BucketSampler(), update_every=1, + num_workers=1, n_epochs=10, print_every=5, + dev_data=data_bundle.get_dataset('dev'), + metrics=metrics, + metric_key=None, + validate_every=-1, save_path=None, use_tqdm=True, device=0, + callbacks=callbacks, check_code_level=0, dev_batch_size=128) +trainer.train() diff --git a/reproduction/sequence_labelling/cws/train_shift_relay.py b/reproduction/sequence_labelling/cws/train_shift_relay.py new file mode 100644 index 00000000..1a519028 --- /dev/null +++ b/reproduction/sequence_labelling/cws/train_shift_relay.py @@ -0,0 +1,53 @@ + +import sys +sys.path.append('../../..') + +from fastNLP import cache_results +from reproduction.sequence_labelling.cws.data.cws_shift_pipe import CWSShiftRelayPipe +from reproduction.sequence_labelling.cws.model.bilstm_shift_relay import ShiftRelayCWSModel +from fastNLP import Trainer +from torch.optim import Adam +from fastNLP import BucketSampler +from fastNLP import GradientClipCallback +from reproduction.sequence_labelling.cws.model.metric import RelayMetric +from fastNLP.embeddings import 
StaticEmbedding +from fastNLP import EvaluateCallback + +#########hyper +L = 4 +hidden_size = 200 +num_layers = 1 +drop_p = 0.2 +lr = 0.008 +data_name = 'pku' +#########hyper +device = 0 + +cache_fp = 'caches/{}.pkl'.format(data_name) +@cache_results(_cache_fp=cache_fp, _refresh=True) # 将结果缓存到cache_fp中,这样下次运行就直接读取,而不需要再次运行 +def prepare_data(): + data_bundle = CWSShiftRelayPipe(dataset_name=data_name, L=L).process_from_file() + # 预训练的character embedding和bigram embedding + char_embed = StaticEmbedding(data_bundle.get_vocab('chars'), dropout=0.5, word_dropout=0.01, + model_dir_or_name='~/exps/CWS/pretrain/vectors/1grams_t3_m50_corpus.txt') + bigram_embed = StaticEmbedding(data_bundle.get_vocab('bigrams'), dropout=0.5, min_freq=3, word_dropout=0.01, + model_dir_or_name='~/exps/CWS/pretrain/vectors/2grams_t3_m50_corpus.txt') + + return data_bundle, char_embed, bigram_embed + +data, char_embed, bigram_embed = prepare_data() + +model = ShiftRelayCWSModel(char_embed=char_embed, bigram_embed=bigram_embed, + hidden_size=hidden_size, num_layers=num_layers, drop_p=drop_p, L=L) + +sampler = BucketSampler() +optimizer = Adam(model.parameters(), lr=lr) +clipper = GradientClipCallback(clip_value=5, clip_type='value') # 截断太大的梯度 +evaluator = EvaluateCallback(data.get_dataset('test')) # 额外测试在test集上的效果 +callbacks = [clipper, evaluator] + +trainer = Trainer(data.get_dataset('train'), model, optimizer=optimizer, loss=None, batch_size=128, sampler=sampler, + update_every=1, n_epochs=10, print_every=5, dev_data=data.get_dataset('dev'), metrics=RelayMetric(), + metric_key='f', validate_every=-1, save_path=None, use_tqdm=True, device=device, callbacks=callbacks, + check_code_level=0, num_workers=1) +trainer.train() \ No newline at end of file diff --git a/reproduction/seqence_labelling/ner/README.md b/reproduction/sequence_labelling/ner/README.md similarity index 100% rename from reproduction/seqence_labelling/ner/README.md rename to reproduction/sequence_labelling/ner/README.md diff --git a/reproduction/Summarization/Baseline/data/__init__.py b/reproduction/sequence_labelling/ner/__init__.py similarity index 100% rename from reproduction/Summarization/Baseline/data/__init__.py rename to reproduction/sequence_labelling/ner/__init__.py diff --git a/reproduction/sequence_labelling/ner/model/bert_crf.py b/reproduction/sequence_labelling/ner/model/bert_crf.py new file mode 100644 index 00000000..8061d116 --- /dev/null +++ b/reproduction/sequence_labelling/ner/model/bert_crf.py @@ -0,0 +1,31 @@ + + +from torch import nn +from fastNLP.modules import ConditionalRandomField, allowed_transitions +import torch.nn.functional as F + +class BertCRF(nn.Module): + def __init__(self, embed, tag_vocab, encoding_type='bio'): + super().__init__() + self.embed = embed + self.fc = nn.Linear(self.embed.embed_size, len(tag_vocab)) + trans = allowed_transitions(tag_vocab, encoding_type=encoding_type, include_start_end=True) + self.crf = ConditionalRandomField(len(tag_vocab), include_start_end_trans=True, allowed_transitions=trans) + + def _forward(self, words, target): + mask = words.ne(0) + words = self.embed(words) + words = self.fc(words) + logits = F.log_softmax(words, dim=-1) + if target is not None: + loss = self.crf(logits, target, mask) + return {'loss': loss} + else: + paths, _ = self.crf.viterbi_decode(logits, mask) + return {'pred': paths} + + def forward(self, words, target): + return self._forward(words, target) + + def predict(self, words): + return self._forward(words, None) diff --git 
a/reproduction/seqence_labelling/ner/model/dilated_cnn.py b/reproduction/sequence_labelling/ner/model/dilated_cnn.py similarity index 99% rename from reproduction/seqence_labelling/ner/model/dilated_cnn.py rename to reproduction/sequence_labelling/ner/model/dilated_cnn.py index 89d51d56..a691560a 100644 --- a/reproduction/seqence_labelling/ner/model/dilated_cnn.py +++ b/reproduction/sequence_labelling/ner/model/dilated_cnn.py @@ -2,7 +2,7 @@ import torch import torch.nn as nn import torch.nn.functional as F from fastNLP.modules.decoder import ConditionalRandomField -from fastNLP.modules.encoder import Embedding +from fastNLP.embeddings import Embedding from fastNLP.core.utils import seq_len_to_mask from fastNLP.core.const import Const as C diff --git a/reproduction/seqence_labelling/ner/model/lstm_cnn_crf.py b/reproduction/sequence_labelling/ner/model/lstm_cnn_crf.py similarity index 82% rename from reproduction/seqence_labelling/ner/model/lstm_cnn_crf.py rename to reproduction/sequence_labelling/ner/model/lstm_cnn_crf.py index 79d704ba..1d51ab79 100644 --- a/reproduction/seqence_labelling/ner/model/lstm_cnn_crf.py +++ b/reproduction/sequence_labelling/ner/model/lstm_cnn_crf.py @@ -1,19 +1,16 @@ -import torch from torch import nn from fastNLP import seq_len_to_mask -from fastNLP.modules import Embedding from fastNLP.modules import LSTM from fastNLP.modules import ConditionalRandomField, allowed_transitions import torch.nn.functional as F from fastNLP import Const class CNNBiLSTMCRF(nn.Module): - def __init__(self, embed, char_embed, hidden_size, num_layers, tag_vocab, dropout=0.5, encoding_type='bioes'): + def __init__(self, embed, hidden_size, num_layers, tag_vocab, dropout=0.5, encoding_type='bioes'): super().__init__() self.embedding = embed - self.char_embedding = char_embed - self.lstm = LSTM(input_size=self.embedding.embedding_dim+self.char_embedding.embedding_dim, + self.lstm = LSTM(input_size=self.embedding.embedding_dim, hidden_size=hidden_size//2, num_layers=num_layers, bidirectional=True, batch_first=True) self.fc = nn.Linear(hidden_size, len(tag_vocab)) @@ -33,9 +30,7 @@ class CNNBiLSTMCRF(nn.Module): nn.init.zeros_(param) def _forward(self, words, seq_len, target=None): - word_embeds = self.embedding(words) - char_embeds = self.char_embedding(words) - words = torch.cat((word_embeds, char_embeds), dim=-1) + words = self.embedding(words) outputs, _ = self.lstm(words, seq_len) self.dropout(outputs) diff --git a/reproduction/sequence_labelling/ner/train_bert.py b/reproduction/sequence_labelling/ner/train_bert.py new file mode 100644 index 00000000..a90e9998 --- /dev/null +++ b/reproduction/sequence_labelling/ner/train_bert.py @@ -0,0 +1,52 @@ + + +""" +使用Bert进行英文命名实体识别 + +""" + +import sys + +sys.path.append('../../../') + +from reproduction.sequence_labelling.ner.model.bert_crf import BertCRF +from fastNLP.embeddings import BertEmbedding +from fastNLP import Trainer, Const +from fastNLP import BucketSampler, SpanFPreRecMetric, GradientClipCallback +from fastNLP.core.callback import WarmupCallback +from fastNLP.core.optimizer import AdamW +from fastNLP.io import Conll2003NERPipe + +from fastNLP import cache_results, EvaluateCallback + +encoding_type = 'bioes' + +@cache_results('caches/conll2003.pkl', _refresh=False) +def load_data(): + # 替换路径 + paths = 'data/conll2003' + data = Conll2003NERPipe(encoding_type=encoding_type).process_from_file(paths) + return data +data = load_data() +print(data) + +embed = BertEmbedding(data.get_vocab(Const.INPUT), model_dir_or_name='en-base-cased', 
+ pool_method='max', requires_grad=True, layers='11', include_cls_sep=False, dropout=0.5, + word_dropout=0.01) + +callbacks = [ + GradientClipCallback(clip_type='norm', clip_value=1), + WarmupCallback(warmup=0.1, schedule='linear'), + EvaluateCallback(data.get_dataset('test')) + ] + +model = BertCRF(embed, tag_vocab=data.get_vocab('target'), encoding_type=encoding_type) +optimizer = AdamW(model.parameters(), lr=2e-5) + +trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer, sampler=BucketSampler(), + device=0, dev_data=data.datasets['dev'], batch_size=6, + metrics=SpanFPreRecMetric(tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type), + loss=None, callbacks=callbacks, num_workers=2, n_epochs=5, + check_code_level=0, update_every=3, test_use_tqdm=False) +trainer.train() + diff --git a/reproduction/sequence_labelling/ner/train_cnn_lstm_crf_conll2003.py b/reproduction/sequence_labelling/ner/train_cnn_lstm_crf_conll2003.py new file mode 100644 index 00000000..d74963ab --- /dev/null +++ b/reproduction/sequence_labelling/ner/train_cnn_lstm_crf_conll2003.py @@ -0,0 +1,54 @@ +import sys +sys.path.append('../../..') + +from fastNLP.embeddings import CNNCharEmbedding, StaticEmbedding, StackEmbedding + +from reproduction.sequence_labelling.ner.model.lstm_cnn_crf import CNNBiLSTMCRF +from fastNLP import Trainer +from fastNLP import SpanFPreRecMetric +from fastNLP import BucketSampler +from fastNLP import Const +from torch.optim import SGD +from fastNLP import GradientClipCallback +from fastNLP.core.callback import EvaluateCallback, LRScheduler +from torch.optim.lr_scheduler import LambdaLR +from fastNLP import cache_results + +from fastNLP.io.pipe.conll import Conll2003NERPipe +encoding_type = 'bioes' +@cache_results('caches/conll2003_new.pkl', _refresh=True) +def load_data(): + # 替换路径 + paths = {'test':"NER/corpus/CoNLL-2003/eng.testb", + 'train':"NER/corpus/CoNLL-2003/eng.train", + 'dev':"NER/corpus/CoNLL-2003/eng.testa"} + data = Conll2003NERPipe(encoding_type=encoding_type).process_from_file(paths) + return data +data = load_data() +print(data) + +char_embed = CNNCharEmbedding(vocab=data.get_vocab('words'), embed_size=30, char_emb_size=30, filter_nums=[30], + kernel_sizes=[3], word_dropout=0, dropout=0.5) +word_embed = StaticEmbedding(vocab=data.get_vocab('words'), + model_dir_or_name='en-glove-6b-100d', + requires_grad=True, lower=True, word_dropout=0.01, dropout=0.5) +word_embed.embedding.weight.data = word_embed.embedding.weight.data/word_embed.embedding.weight.data.std() +embed = StackEmbedding([word_embed, char_embed]) + +model = CNNBiLSTMCRF(embed, hidden_size=200, num_layers=1, tag_vocab=data.vocabs[Const.TARGET], + encoding_type=encoding_type) + +callbacks = [ + GradientClipCallback(clip_type='value', clip_value=5), + EvaluateCallback(data=data.get_dataset('test')) # 额外对test上的数据进行性能评测 + ] + +optimizer = SGD(model.parameters(), lr=0.008, momentum=0.9) +scheduler = LRScheduler(LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.05 * epoch))) +callbacks.append(scheduler) + +trainer = Trainer(train_data=data.get_dataset('train'), model=model, optimizer=optimizer, sampler=BucketSampler(), + device=0, dev_data=data.get_dataset('dev'), batch_size=20, + metrics=SpanFPreRecMetric(tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type), + callbacks=callbacks, num_workers=2, n_epochs=100, dev_batch_size=512) +trainer.train() \ No newline at end of file diff --git a/reproduction/seqence_labelling/ner/train_idcnn.py 
b/reproduction/sequence_labelling/ner/train_idcnn.py similarity index 78% rename from reproduction/seqence_labelling/ner/train_idcnn.py rename to reproduction/sequence_labelling/ner/train_idcnn.py index 53f2798f..7f4e43af 100644 --- a/reproduction/seqence_labelling/ner/train_idcnn.py +++ b/reproduction/sequence_labelling/ner/train_idcnn.py @@ -1,4 +1,4 @@ -from reproduction.seqence_labelling.ner.data.OntoNoteLoader import OntoNoteNERDataLoader +from fastNLP.io import OntoNotesNERPipe from fastNLP.core.callback import LRScheduler from fastNLP import GradientClipCallback from torch.optim.lr_scheduler import LambdaLR @@ -8,16 +8,12 @@ from fastNLP import BucketSampler from fastNLP import SpanFPreRecMetric from fastNLP import Trainer, Tester from fastNLP.core.metrics import MetricBase -from reproduction.seqence_labelling.ner.model.dilated_cnn import IDCNN +from reproduction.sequence_labelling.ner.model.dilated_cnn import IDCNN from fastNLP.core.utils import Option -from fastNLP.embeddings.embedding import StaticEmbedding +from fastNLP.embeddings import StaticEmbedding from fastNLP.core.utils import cache_results -from fastNLP.core.vocabulary import VocabularyOption import torch.cuda import os -os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/' -os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches' -os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" encoding_type = 'bioes' @@ -40,18 +36,8 @@ ops = Option( @cache_results('ontonotes-case-cache') def load_data(): print('loading data') - data = OntoNoteNERDataLoader(encoding_type=encoding_type).process( - paths = get_path('workdir/datasets/ontonotes-v4'), - lower=False, - word_vocab_opt=VocabularyOption(min_freq=0), - ) - # data = Conll2003DataLoader(task='ner', encoding_type=encoding_type).process( - # paths=get_path('workdir/datasets/conll03'), - # lower=False, word_vocab_opt=VocabularyOption(min_freq=0) - # ) - - # char_embed = CNNCharEmbedding(vocab=data.vocabs['cap_words'], embed_size=30, char_emb_size=30, filter_nums=[30], - # kernel_sizes=[3]) + data = OntoNotesNERPipe(encoding_type=encoding_type).process_from_file( + paths = get_path('workdir/datasets/ontonotes-v4')) print('loading embedding') word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT], model_dir_or_name='en-glove-840b-300', diff --git a/reproduction/sequence_labelling/ner/train_ontonote.py b/reproduction/sequence_labelling/ner/train_ontonote.py new file mode 100644 index 00000000..a0484ec3 --- /dev/null +++ b/reproduction/sequence_labelling/ner/train_ontonote.py @@ -0,0 +1,66 @@ +import sys + +sys.path.append('../../..') + +from fastNLP.embeddings import CNNCharEmbedding, StaticEmbedding, StackEmbedding + +from reproduction.sequence_labelling.ner.model.lstm_cnn_crf import CNNBiLSTMCRF +from fastNLP import Trainer +from fastNLP import SpanFPreRecMetric +from fastNLP import Const +from torch.optim import SGD +from torch.optim.lr_scheduler import LambdaLR +from fastNLP import GradientClipCallback +from fastNLP import BucketSampler +from fastNLP.core.callback import EvaluateCallback, LRScheduler +from fastNLP import cache_results +from fastNLP.io.pipe.conll import OntoNotesNERPipe + +#######hyper +normalize = False +lr = 0.01 +dropout = 0.5 +batch_size = 32 +data_name = 'ontonote' +#######hyper + + +encoding_type = 'bioes' + +@cache_results('caches/ontonotes.pkl', _refresh=True) +def cache(): + data = OntoNotesNERPipe(encoding_type=encoding_type).process_from_file('../../../../others/data/v4/english') + char_embed = 
CNNCharEmbedding(vocab=data.vocabs['words'], embed_size=30, char_emb_size=30, filter_nums=[30], + kernel_sizes=[3], dropout=dropout) + word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT], + model_dir_or_name='en-glove-6b-100d', + requires_grad=True, + normalize=normalize, + word_dropout=0.01, + dropout=dropout, + lower=True, + min_freq=1) + return data, char_embed, word_embed +data, char_embed, word_embed = cache() + +print(data) + +embed = StackEmbedding([word_embed, char_embed]) +model = CNNBiLSTMCRF(embed, hidden_size=1200, num_layers=1, tag_vocab=data.vocabs[Const.TARGET], + encoding_type=encoding_type, dropout=dropout) + +callbacks = [ + GradientClipCallback(clip_value=5, clip_type='value'), + EvaluateCallback(data.datasets['test']) + ] + +optimizer = SGD(model.parameters(), lr=lr, momentum=0.9) +scheduler = LRScheduler(LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.05 * epoch))) +callbacks.append(scheduler) + + +trainer = Trainer(train_data=data.get_dataset('train'), model=model, optimizer=optimizer, sampler=BucketSampler(num_buckets=100), + device=0, dev_data=data.get_dataset('dev'), batch_size=batch_size, + metrics=SpanFPreRecMetric(tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type), + callbacks=callbacks, num_workers=1, n_epochs=100, dev_batch_size=256) +trainer.train() \ No newline at end of file diff --git a/reproduction/text_classification/README.md b/reproduction/text_classification/README.md index 8bdfb9fe..5767d9e8 100644 --- a/reproduction/text_classification/README.md +++ b/reproduction/text_classification/README.md @@ -11,6 +11,20 @@ LSTM+self_attention:论文链接[A Structured Self-attentive Sentence Embedding] AWD-LSTM:论文链接[Regularizing and Optimizing LSTM Language Models](https://arxiv.org/pdf/1708.02182.pdf) +#数据集来源 +IMDB:http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz +SST-2:https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8 +SST:https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip +yelp_full:https://drive.google.com/drive/folders/0Bz8a_Dbh9Qhbfll6bVpmNUtUcFdjYmF2SEpmZUZUcVNiMUw1TWN6RDV3a0JHT3kxLVhVR2M +yelp_polarity:https://drive.google.com/drive/folders/0Bz8a_Dbh9Qhbfll6bVpmNUtUcFdjYmF2SEpmZUZUcVNiMUw1TWN6RDV3a0JHT3kxLVhVR2M + +dataset |classes | train samples | dev samples | test samples|refer| +:---: | :---: | :---: | :---: | :---: | :---: | +yelp_polarity | 2 |560k | - |38k|[char_cnn](https://arxiv.org/pdf/1509.01626v3.pdf)| +yelp_full | 5|650k | - |50k|[char_cnn](https://arxiv.org/pdf/1509.01626v3.pdf)| +IMDB | 2 |25k | - |25k|[IMDB](https://ai.stanford.edu/~ang/papers/acl11-WordVectorsSentimentAnalysis.pdf)| +sst-2 | 2 |67k | 872 |1.8k|[GLUE](https://arxiv.org/pdf/1804.07461.pdf)| + # 数据集及复现结果汇总 使用fastNLP复现的结果vs论文汇报结果(/前为fastNLP实现,后面为论文报道,-表示论文没有在该数据集上列出结果) diff --git a/reproduction/text_classification/data/IMDBLoader.py b/reproduction/text_classification/data/IMDBLoader.py index 94244431..1585fe44 100644 --- a/reproduction/text_classification/data/IMDBLoader.py +++ b/reproduction/text_classification/data/IMDBLoader.py @@ -1,6 +1,6 @@ from fastNLP.io.embed_loader import EmbeddingOption, EmbedLoader from fastNLP.core.vocabulary import VocabularyOption -from fastNLP.io.base_loader import DataSetLoader, DataBundle +from fastNLP.io.data_bundle import DataSetLoader, DataBundle from typing import Union, Dict, List, Iterator from fastNLP import DataSet from fastNLP import Instance diff --git 
a/reproduction/text_classification/data/MTL16Loader.py b/reproduction/text_classification/data/MTL16Loader.py index 68969069..225fffe6 100644 --- a/reproduction/text_classification/data/MTL16Loader.py +++ b/reproduction/text_classification/data/MTL16Loader.py @@ -1,6 +1,6 @@ from fastNLP.io.embed_loader import EmbeddingOption, EmbedLoader from fastNLP.core.vocabulary import VocabularyOption -from fastNLP.io.base_loader import DataSetLoader, DataBundle +from fastNLP.io.data_bundle import DataSetLoader, DataBundle from typing import Union, Dict, List, Iterator from fastNLP import DataSet from fastNLP import Instance diff --git a/reproduction/text_classification/data/sstloader.py b/reproduction/text_classification/data/sstloader.py index fa4d1837..4e860279 100644 --- a/reproduction/text_classification/data/sstloader.py +++ b/reproduction/text_classification/data/sstloader.py @@ -1,6 +1,6 @@ from typing import Iterable from nltk import Tree -from fastNLP.io.base_loader import DataBundle, DataSetLoader +from fastNLP.io.data_bundle import DataBundle, DataSetLoader from fastNLP.core.vocabulary import VocabularyOption, Vocabulary from fastNLP import DataSet from fastNLP import Instance @@ -11,11 +11,7 @@ from reproduction.utils import check_dataloader_paths, get_tokenizer class SSTLoader(DataSetLoader): - URL = 'https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip' - DATA_DIR = 'sst/' - """ - 别名::class:`fastNLP.io.SSTLoader` :class:`fastNLP.io.dataset_loader.SSTLoader` 读取SST数据集, DataSet包含fields:: words: list(str) 需要分类的文本 target: str 文本的标签 @@ -23,6 +19,10 @@ class SSTLoader(DataSetLoader): :param subtree: 是否将数据展开为子树,扩充数据量. Default: ``False`` :param fine_grained: 是否使用SST-5标准,若 ``False`` , 使用SST-2。Default: ``False`` """ + + URL = 'https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip' + DATA_DIR = 'sst/' + def __init__(self, subtree=False, fine_grained=False): self.subtree = subtree tag_v = {'0': 'very negative', '1': 'negative', '2': 'neutral', diff --git a/reproduction/text_classification/data/yelpLoader.py b/reproduction/text_classification/data/yelpLoader.py index d2272a88..1f7634fc 100644 --- a/reproduction/text_classification/data/yelpLoader.py +++ b/reproduction/text_classification/data/yelpLoader.py @@ -4,7 +4,7 @@ from typing import Iterable from fastNLP import DataSet, Instance, Vocabulary from fastNLP.core.vocabulary import VocabularyOption from fastNLP.io import JsonLoader -from fastNLP.io.base_loader import DataBundle,DataSetLoader +from fastNLP.io.data_bundle import DataBundle,DataSetLoader from fastNLP.io.embed_loader import EmbeddingOption from fastNLP.io.file_reader import _read_json from typing import Union, Dict diff --git a/reproduction/text_classification/model/BertTC.py b/reproduction/text_classification/model/BertTC.py new file mode 100644 index 00000000..702c0cd1 --- /dev/null +++ b/reproduction/text_classification/model/BertTC.py @@ -0,0 +1,24 @@ +from fastNLP.embeddings import BertEmbedding +import torch +import torch.nn as nn +from fastNLP.core.const import Const as C + +class BertTC(nn.Module): + def __init__(self, vocab,num_class,bert_model_dir_or_name,fine_tune=False): + super(BertTC, self).__init__() + self.embed=BertEmbedding(vocab, requires_grad=fine_tune, + model_dir_or_name=bert_model_dir_or_name,include_cls_sep=True) + self.classifier = nn.Linear(self.embed.embedding_dim, num_class) + + def forward(self, words): + embedding_cls=self.embed(words)[:,0] + output=self.classifier(embedding_cls) + return {C.OUTPUT: output} + + def predict(self,words): + 
return self.forward(words) + +if __name__=="__main__": + ta=torch.tensor([[1,2,3],[4,5,6],[7,8,9]]) + tb=ta[:,0] + print(tb) diff --git a/reproduction/text_classification/model/awdlstm_module.py b/reproduction/text_classification/model/awdlstm_module.py index 87bfe730..a586ed2d 100644 --- a/reproduction/text_classification/model/awdlstm_module.py +++ b/reproduction/text_classification/model/awdlstm_module.py @@ -17,8 +17,6 @@ from .weight_drop import WeightDrop class LSTM(nn.Module): """ - 别名::class:`fastNLP.modules.LSTM` :class:`fastNLP.modules.encoder.lstm.LSTM` - LSTM 模块, 轻量封装的Pytorch LSTM. 在提供seq_len的情况下,将自动使用pack_padded_sequence; 同时默认将forget gate的bias初始化 为1; 且可以应对DataParallel中LSTM的使用问题。 diff --git a/reproduction/text_classification/model/dpcnn.py b/reproduction/text_classification/model/dpcnn.py index ae2d46bd..b63c6d38 100644 --- a/reproduction/text_classification/model/dpcnn.py +++ b/reproduction/text_classification/model/dpcnn.py @@ -1,6 +1,5 @@ import torch import torch.nn as nn -from fastNLP.embeddings.utils import get_embeddings from fastNLP.core import Const as C @@ -64,7 +63,8 @@ class RegionEmbedding(nn.Module): kernel_sizes = [5, 9] assert isinstance( kernel_sizes, list), 'kernel_sizes should be List(int)' - self.embed = get_embeddings(init_embed) + # self.embed = nn.Embedding.from_pretrained(torch.tensor(init_embed).float(), freeze=False) + self.embed = init_embed try: embed_dim = self.embed.embedding_dim except Exception: diff --git a/reproduction/text_classification/train_awdlstm.py b/reproduction/text_classification/train_awdlstm.py index b2a67fdb..7537e6f7 100644 --- a/reproduction/text_classification/train_awdlstm.py +++ b/reproduction/text_classification/train_awdlstm.py @@ -1,11 +1,9 @@ # 这个模型需要在pytorch=0.4下运行,weight_drop不支持1.0 -# 首先需要加入以下的路径到环境变量,因为当前只对内部测试开放,所以需要手动申明一下路径 -import os -os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/' -os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches' +import sys +sys.path.append('../..') -from fastNLP.io.data_loader import IMDBLoader +from fastNLP.io.pipe.classification import IMDBPipe from fastNLP.embeddings import StaticEmbedding from model.awd_lstm import AWDLSTMSentiment @@ -32,15 +30,14 @@ opt=Config() # load data -dataloader=IMDBLoader() -datainfo=dataloader.process(opt.datapath) +data_bundle=IMDBPipe.process_from_file(opt.datapath) -# print(datainfo.datasets["train"]) -# print(datainfo) +# print(data_bundle.datasets["train"]) +# print(data_bundle) # define model -vocab=datainfo.vocabs['words'] +vocab=data_bundle.vocabs['words'] embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-840b-300', requires_grad=True) model=AWDLSTMSentiment(init_embed=embed, num_classes=opt.num_classes, hidden_dim=opt.hidden_dim, num_layers=opt.num_layers, nfc=opt.nfc, wdrop=opt.wdrop) @@ -52,11 +49,11 @@ optimizer= Adam([param for param in model.parameters() if param.requires_grad==T def train(datainfo, model, optimizer, loss, metrics, opt): - trainer = Trainer(datainfo.datasets['train'], model, optimizer=optimizer, loss=loss, - metrics=metrics, dev_data=datainfo.datasets['test'], device=0, check_code_level=-1, + trainer = Trainer(data_bundle.datasets['train'], model, optimizer=optimizer, loss=loss, + metrics=metrics, dev_data=data_bundle.datasets['test'], device=0, check_code_level=-1, n_epochs=opt.train_epoch, save_path=opt.save_model_path) trainer.train() if __name__ == "__main__": - train(datainfo, model, optimizer, loss, metrics, opt) + train(data_bundle, model, optimizer, loss, metrics, opt) 
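Editor's note, not part of the patch: the text-classification refactors above (and train_lstm.py / train_lstm_att.py below) all replace the internal `IMDBLoader`/`datainfo` API with the public `IMDBPipe` + `DataBundle` API and then feed the result into `Trainer`. The following is a minimal consolidated sketch of that flow for readers skimming the diff; the dataset path, the hyper-parameter values, and the assumption that it is run from `reproduction/text_classification` (so `model.lstm` resolves, as in the original scripts) are illustrative, not taken from the patch.

```python
import sys
sys.path.append('../..')  # run from reproduction/text_classification, as the scripts above assume

from fastNLP.io.pipe.classification import IMDBPipe
from fastNLP.embeddings import StaticEmbedding
from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric
from torch.optim import Adam
from model.lstm import BiLSTMSentiment  # reproduction model, same import as train_lstm.py

# 1. load data: the Pipe reads the raw files, builds vocabularies and returns a DataBundle
data_bundle = IMDBPipe().process_from_file('path/to/aclImdb')  # hypothetical dataset path

# 2. build an embedding on the DataBundle's word vocabulary
vocab = data_bundle.vocabs['words']
embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-840b-300', requires_grad=True)

# 3. model, loss, metric, optimizer (hyper-parameter values here are illustrative only)
model = BiLSTMSentiment(init_embed=embed, num_classes=2, hidden_dim=300, num_layers=1, nfc=128)
optimizer = Adam([p for p in model.parameters() if p.requires_grad], lr=1e-3)

# 4. wire everything into Trainer; device=0 assumes a single visible GPU
trainer = Trainer(data_bundle.datasets['train'], model, optimizer=optimizer,
                  loss=CrossEntropyLoss(), metrics=AccuracyMetric(),
                  dev_data=data_bundle.datasets['test'], device=0,
                  n_epochs=10, check_code_level=-1)
trainer.train()
```

The same Pipe → embedding → model → Trainer wiring appears in the CWS and NER scripts earlier in this diff; only the Pipe class, the model, and the metric change.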
diff --git a/reproduction/text_classification/train_char_cnn.py b/reproduction/text_classification/train_char_cnn.py index 0b8fc535..55d830e6 100644 --- a/reproduction/text_classification/train_char_cnn.py +++ b/reproduction/text_classification/train_char_cnn.py @@ -1,14 +1,8 @@ -# 首先需要加入以下的路径到环境变量,因为当前只对内部测试开放,所以需要手动申明一下路径 -import os -os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/' -os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches' - import sys sys.path.append('../..') from fastNLP.core.const import Const as C import torch.nn as nn -from fastNLP.io.data_loader import YelpLoader -#from data.sstLoader import sst2Loader +from fastNLP.io.pipe.classification import YelpFullPipe,YelpPolarityPipe,SST2Pipe,IMDBPipe from model.char_cnn import CharacterLevelCNN from fastNLP import CrossEntropyLoss, AccuracyMetric from fastNLP.core.trainer import Trainer @@ -26,19 +20,9 @@ class Config(): model_dir_or_name="en-base-uncased" embedding_grad= False, bert_embedding_larers= '4,-2,-1' - train_epoch= 50 + train_epoch= 100 num_classes=2 task= "yelp_p" - #yelp_p - datapath = {"train": "/remote-home/ygwang/yelp_polarity/train.csv", - "test": "/remote-home/ygwang/yelp_polarity/test.csv"} - #IMDB - #datapath = {"train": "/remote-home/ygwang/IMDB_data/train.csv", - # "test": "/remote-home/ygwang/IMDB_data/test.csv"} - # sst - # datapath = {"train": "/remote-home/ygwang/workspace/GLUE/SST-2/train.tsv", - # "dev": "/remote-home/ygwang/workspace/GLUE/SST-2/dev.tsv"} - lr=0.01 batch_size=128 model_size="large" @@ -46,6 +30,8 @@ class Config(): extra_characters='' max_length=1014 weight_decay = 1e-5 + to_lower=True + tokenizer = 'spacy' # 使用spacy进行分词 char_cnn_config={ "alphabet": { @@ -111,12 +97,37 @@ ops=Config ##1.task相关信息:利用dataloader载入dataInfo #dataloader=SST2Loader() #dataloader=IMDBLoader() -dataloader=YelpLoader(fine_grained=True) -datainfo=dataloader.process(ops.datapath,char_level_op=True,split_dev_op=False) +# dataloader=YelpLoader(fine_grained=True) +# datainfo=dataloader.process(ops.datapath,char_level_op=True,split_dev_op=False) char_vocab=ops.char_cnn_config["alphabet"]["en"]["lower"]["alphabet"] ops.number_of_characters=len(char_vocab) ops.embedding_dim=ops.number_of_characters +# load data set +if ops.task == 'yelp_p': + data_bundle = YelpPolarityPipe(lower=ops.to_lower, tokenizer=ops.tokenizer).process_from_file() +elif ops.task == 'yelp_f': + data_bundle = YelpFullPipe(lower=ops.to_lower, tokenizer=ops.tokenizer).process_from_file() +elif ops.task == 'imdb': + data_bundle = IMDBPipe(lower=ops.to_lower, tokenizer=ops.tokenizer).process_from_file() +elif ops.task == 'sst-2': + data_bundle = SST2Pipe(lower=ops.to_lower, tokenizer=ops.tokenizer).process_from_file() +else: + raise RuntimeError(f'NOT support {ops.task} task yet!') + +print(data_bundle) + +def wordtochar(words): + chars = [] + + #for word in words: + #word = word.lower() + for char in words: + chars.append(char) + #chars.append('') + #chars.pop() + return chars + #chartoindex def chartoindex(chars): max_seq_len=ops.max_length @@ -136,13 +147,18 @@ def chartoindex(chars): char_index_list=[zero_index]*max_seq_len return char_index_list -for dataset in datainfo.datasets.values(): + +for dataset in data_bundle.datasets.values(): + dataset.apply_field(wordtochar, field_name="raw_words", new_field_name='chars') dataset.apply_field(chartoindex,field_name='chars',new_field_name='chars') -datainfo.datasets['train'].set_input('chars') -datainfo.datasets['test'].set_input('chars') 
-datainfo.datasets['train'].set_target('target') -datainfo.datasets['test'].set_target('target') +# print(data_bundle.datasets['train'][0]['chars']) +# print(data_bundle.datasets['train'][0]['raw_words']) + +data_bundle.datasets['train'].set_input('chars') +data_bundle.datasets['test'].set_input('chars') +data_bundle.datasets['train'].set_target('target') +data_bundle.datasets['test'].set_target('target') ##2. 定义/组装模型,这里可以随意,就如果是fastNLP封装好的,类似CNNText就直接用初始化调用就好了,这里只是给出一个伪框架表示占位,在这里建立符合fastNLP输入输出规范的model class ModelFactory(nn.Module): @@ -165,7 +181,7 @@ class ModelFactory(nn.Module): ## 2.或直接复用fastNLP的模型 #vocab=datainfo.vocabs['words'] -vocab_label=datainfo.vocabs['target'] +vocab_label=data_bundle.vocabs['target'] ''' # emded_char=CNNCharEmbedding(vocab) # embed_word = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50', requires_grad=True) @@ -189,7 +205,6 @@ model=CharacterLevelCNN(ops,embedding) ## 3. 声明loss,metric,optimizer loss=CrossEntropyLoss metric=AccuracyMetric -#optimizer= SGD([param for param in model.parameters() if param.requires_grad==True], lr=ops.lr) optimizer = SGD([param for param in model.parameters() if param.requires_grad == True], lr=ops.lr, momentum=0.9, weight_decay=ops.weight_decay) callbacks = [] @@ -203,14 +218,10 @@ callbacks.append( def train(model,datainfo,loss,metrics,optimizer,num_epochs=100): trainer = Trainer(datainfo.datasets['train'], model, optimizer=optimizer, loss=loss(target='target'),batch_size=ops.batch_size, metrics=[metrics(target='target')], dev_data=datainfo.datasets['test'], device=[0,1,2], check_code_level=-1, - n_epochs=num_epochs) + n_epochs=num_epochs,callbacks=callbacks) print(trainer.train()) if __name__=="__main__": - #print(vocab_label) - - #print(datainfo.datasets["train"]) - train(model,datainfo,loss,metric,optimizer,num_epochs=ops.train_epoch) - \ No newline at end of file + train(model,data_bundle,loss,metric,optimizer,num_epochs=ops.train_epoch) diff --git a/reproduction/text_classification/train_dpcnn.py b/reproduction/text_classification/train_dpcnn.py index 6cce453b..c7f5751c 100644 --- a/reproduction/text_classification/train_dpcnn.py +++ b/reproduction/text_classification/train_dpcnn.py @@ -8,19 +8,18 @@ from fastNLP.core.trainer import Trainer from fastNLP import CrossEntropyLoss, AccuracyMetric from fastNLP.embeddings import StaticEmbedding from reproduction.text_classification.model.dpcnn import DPCNN -from fastNLP.io.data_loader import YelpLoader from fastNLP.core.sampler import BucketSampler from fastNLP.core import LRScheduler from fastNLP.core.const import Const as C from fastNLP.core.vocabulary import VocabularyOption from utils.util_init import set_rng_seeds +from fastNLP import logger import os -os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/' -os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches' -os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" - +from fastNLP.io import YelpFullPipe, YelpPolarityPipe +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # hyper +logger.add_file('log', 'INFO') class Config(): seed = 12345 @@ -45,46 +44,41 @@ class Config(): self.datapath = {k: os.path.join(self.datadir, v) for k, v in self.datafile.items()} - ops = Config() set_rng_seeds(ops.seed) -print('RNG SEED: {}'.format(ops.seed)) +logger.info('RNG SEED %d'%ops.seed) # 1.task相关信息:利用dataloader载入dataInfo -#datainfo=SSTLoader(fine_grained=True).process(paths=ops.datapath, train_ds=['train']) - @cache_results(ops.model_dir_or_name+'-data-cache') def load_data(): - datainfo = 
YelpLoader(fine_grained=True, lower=True).process( - paths=ops.datapath, train_ds=['train'], src_vocab_op=ops.src_vocab_op) + datainfo = YelpFullPipe(lower=True, tokenizer='raw').process_from_file(ops.datapath) for ds in datainfo.datasets.values(): ds.apply_field(len, C.INPUT, C.INPUT_LEN) ds.set_input(C.INPUT, C.INPUT_LEN) ds.set_target(C.TARGET) - embedding = StaticEmbedding( - datainfo.vocabs['words'], model_dir_or_name='en-glove-840b-300', requires_grad=ops.embedding_grad, - normalize=False - ) - return datainfo, embedding + + return datainfo -datainfo, embedding = load_data() +datainfo = load_data() +embedding = StaticEmbedding( + datainfo.vocabs['words'], model_dir_or_name='en-glove-6b-100d', requires_grad=ops.embedding_grad, + normalize=False) embedding.embedding.weight.data /= embedding.embedding.weight.data.std() -print(embedding.embedding.weight.mean(), embedding.embedding.weight.std()) +print(embedding.embedding.weight.data.mean(), embedding.embedding.weight.data.std()) # 2.或直接复用fastNLP的模型 -# embedding = StackEmbedding([StaticEmbedding(vocab), CNNCharEmbedding(vocab, 100)]) - -print(datainfo) -print(datainfo.datasets['train'][0]) +# datainfo.datasets['train'] = datainfo.datasets['train'][:1000] # for debug purpose +# datainfo.datasets['test'] = datainfo.datasets['test'][:1000] +logger.info(datainfo) model = DPCNN(init_embed=embedding, num_cls=len(datainfo.vocabs[C.TARGET]), embed_dropout=ops.embed_dropout, cls_dropout=ops.cls_dropout) -print(model) +# print(model) # 3. 声明loss,metric,optimizer loss = CrossEntropyLoss(pred=C.OUTPUT, target=C.TARGET) @@ -95,27 +89,28 @@ optimizer = SGD([param for param in model.parameters() if param.requires_grad == callbacks = [] callbacks.append(LRScheduler(CosineAnnealingLR(optimizer, 5))) -# callbacks.append( -# LRScheduler(LambdaLR(optimizer, lambda epoch: ops.lr if epoch < -# ops.train_epoch * 0.8 else ops.lr * 0.1)) -# ) -# callbacks.append( -# FitlogCallback(data=datainfo.datasets, verbose=1) -# ) device = 'cuda:0' if torch.cuda.is_available() else 'cpu' -print(device) +# print(device) +logger.info(device) # 4.定义train方法 +# normal trainer trainer = Trainer(datainfo.datasets['train'], model, optimizer=optimizer, loss=loss, sampler=BucketSampler(num_buckets=50, batch_size=ops.batch_size), - metrics=[metric], + metrics=[metric], use_tqdm=False, save_path='save', dev_data=datainfo.datasets['test'], device=device, check_code_level=-1, batch_size=ops.batch_size, callbacks=callbacks, n_epochs=ops.train_epoch, num_workers=4) +# distributed trainer +# trainer = DistTrainer(datainfo.datasets['train'], model, optimizer=optimizer, loss=loss, +# metrics=[metric], +# dev_data=datainfo.datasets['test'], device='cuda', +# batch_size_per_gpu=ops.batch_size, callbacks_all=callbacks, +# n_epochs=ops.train_epoch, num_workers=4) if __name__ == "__main__": diff --git a/reproduction/text_classification/train_lstm.py b/reproduction/text_classification/train_lstm.py index 40f77061..a23be0cb 100644 --- a/reproduction/text_classification/train_lstm.py +++ b/reproduction/text_classification/train_lstm.py @@ -1,9 +1,7 @@ -# 首先需要加入以下的路径到环境变量,因为当前只对内部测试开放,所以需要手动申明一下路径 -import os -os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/' -os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches' +import sys +sys.path.append('../..') -from fastNLP.io.data_loader import IMDBLoader +from fastNLP.io.pipe.classification import IMDBPipe from fastNLP.embeddings import StaticEmbedding from model.lstm import BiLSTMSentiment @@ -29,15 +27,14 @@ 
opt=Config() # load data -dataloader=IMDBLoader() -datainfo=dataloader.process(opt.datapath) +data_bundle=IMDBPipe.process_from_file(opt.datapath) -# print(datainfo.datasets["train"]) -# print(datainfo) +# print(data_bundle.datasets["train"]) +# print(data_bundle) # define model -vocab=datainfo.vocabs['words'] +vocab=data_bundle.vocabs['words'] embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-840b-300', requires_grad=True) model=BiLSTMSentiment(init_embed=embed, num_classes=opt.num_classes, hidden_dim=opt.hidden_dim, num_layers=opt.num_layers, nfc=opt.nfc) @@ -48,12 +45,12 @@ metrics=AccuracyMetric() optimizer= Adam([param for param in model.parameters() if param.requires_grad==True], lr=opt.lr) -def train(datainfo, model, optimizer, loss, metrics, opt): - trainer = Trainer(datainfo.datasets['train'], model, optimizer=optimizer, loss=loss, - metrics=metrics, dev_data=datainfo.datasets['test'], device=0, check_code_level=-1, +def train(data_bundle, model, optimizer, loss, metrics, opt): + trainer = Trainer(data_bundle.datasets['train'], model, optimizer=optimizer, loss=loss, + metrics=metrics, dev_data=data_bundle.datasets['test'], device=0, check_code_level=-1, n_epochs=opt.train_epoch, save_path=opt.save_model_path) trainer.train() if __name__ == "__main__": - train(datainfo, model, optimizer, loss, metrics, opt) \ No newline at end of file + train(data_bundle, model, optimizer, loss, metrics, opt) \ No newline at end of file diff --git a/reproduction/text_classification/train_lstm_att.py b/reproduction/text_classification/train_lstm_att.py index 1052f606..a2b8612d 100644 --- a/reproduction/text_classification/train_lstm_att.py +++ b/reproduction/text_classification/train_lstm_att.py @@ -1,9 +1,7 @@ -# 首先需要加入以下的路径到环境变量,因为当前只对内部测试开放,所以需要手动申明一下路径 -import os -os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/' -os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches' +import sys +sys.path.append('../..') -from fastNLP.io.data_loader import IMDBLoader +from fastNLP.io.pipe.classification import IMDBPipe from fastNLP.embeddings import StaticEmbedding from model.lstm_self_attention import BiLSTM_SELF_ATTENTION @@ -31,15 +29,14 @@ opt=Config() # load data -dataloader=IMDBLoader() -datainfo=dataloader.process(opt.datapath) +data_bundle=IMDBPipe.process_from_file(opt.datapath) -# print(datainfo.datasets["train"]) -# print(datainfo) +# print(data_bundle.datasets["train"]) +# print(data_bundle) # define model -vocab=datainfo.vocabs['words'] +vocab=data_bundle.vocabs['words'] embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-840b-300', requires_grad=True) model=BiLSTM_SELF_ATTENTION(init_embed=embed, num_classes=opt.num_classes, hidden_dim=opt.hidden_dim, num_layers=opt.num_layers, attention_unit=opt.attention_unit, attention_hops=opt.attention_hops, nfc=opt.nfc) @@ -50,12 +47,12 @@ metrics=AccuracyMetric() optimizer= Adam([param for param in model.parameters() if param.requires_grad==True], lr=opt.lr) -def train(datainfo, model, optimizer, loss, metrics, opt): - trainer = Trainer(datainfo.datasets['train'], model, optimizer=optimizer, loss=loss, - metrics=metrics, dev_data=datainfo.datasets['test'], device=0, check_code_level=-1, +def train(data_bundle, model, optimizer, loss, metrics, opt): + trainer = Trainer(data_bundle.datasets['train'], model, optimizer=optimizer, loss=loss, + metrics=metrics, dev_data=data_bundle.datasets['test'], device=0, check_code_level=-1, n_epochs=opt.train_epoch, save_path=opt.save_model_path) trainer.train() if 
__name__ == "__main__": - train(datainfo, model, optimizer, loss, metrics, opt) + train(data_bundle, model, optimizer, loss, metrics, opt) diff --git a/requirements.txt b/requirements.txt index db0b89ac..b07aed3f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,4 @@ nltk>=3.4.1 prettytable>=0.7.2 requests spacy +prettytable>=0.7.2 \ No newline at end of file diff --git a/setup.py b/setup.py index 0dbef455..72f92d16 100644 --- a/setup.py +++ b/setup.py @@ -11,15 +11,19 @@ with open('LICENSE', encoding='utf-8') as f: with open('requirements.txt', encoding='utf-8') as f: reqs = f.read() +pkgs = [p for p in find_packages() if p.startswith('fastNLP')] +print(pkgs) + setup( name='FastNLP', - version='dev0.5.0', + version='0.4.10', + url='https://github.com/fastnlp/fastNLP', description='fastNLP: Deep Learning Toolkit for NLP, developed by Fudan FastNLP Team', long_description=readme, long_description_content_type='text/markdown', license='Apache License', author='FudanNLP', python_requires='>=3.6', - packages=find_packages(), + packages=pkgs, install_requires=reqs.strip().split('\n'), ) diff --git a/reproduction/coreference_resolution/data_load/__init__.py b/test/core/__init__.py similarity index 100% rename from reproduction/coreference_resolution/data_load/__init__.py rename to test/core/__init__.py diff --git a/test/core/test_batch.py b/test/core/test_batch.py index aa9808ee..d9898bc7 100644 --- a/test/core/test_batch.py +++ b/test/core/test_batch.py @@ -3,7 +3,7 @@ import unittest import numpy as np import torch -from fastNLP import DataSetIter +from fastNLP import DataSetIter, TorchLoaderIter from fastNLP import DataSet from fastNLP import Instance from fastNLP import SequentialSampler @@ -149,7 +149,33 @@ class TestCase1(unittest.TestCase): batch = DataSetIter(dataset, batch_size=batch_size, sampler=SequentialSampler()) for batch_x, batch_y in batch: pass - + + def testTensorLoaderIter(self): + class FakeData: + def __init__(self, return_dict=True): + self.x = [[1,2,3], [4,5,6]] + self.return_dict = return_dict + + def __len__(self): + return len(self.x) + + def __getitem__(self, i): + x = self.x[i] + y = 0 + if self.return_dict: + return {'x':x}, {'y':y} + return x, y + + data1 = FakeData() + dataiter = TorchLoaderIter(data1, batch_size=2) + for x, y in dataiter: + print(x, y) + + def func(): + data2 = FakeData(return_dict=False) + dataiter = TorchLoaderIter(data2, batch_size=2) + self.assertRaises(Exception, func) + """ def test_multi_workers_batch(self): batch_size = 32 diff --git a/test/core/test_callbacks.py b/test/core/test_callbacks.py index 909295c0..db95a32d 100644 --- a/test/core/test_callbacks.py +++ b/test/core/test_callbacks.py @@ -1,35 +1,35 @@ +import os +import tempfile import unittest import numpy as np import torch -from fastNLP.core.callback import EarlyStopCallback, GradientClipCallback, LRScheduler, ControlC, \ - LRFinder, TensorboardCallback +from fastNLP import AccuracyMetric +from fastNLP import BCELoss from fastNLP import DataSet from fastNLP import Instance -from fastNLP import BCELoss -from fastNLP import AccuracyMetric from fastNLP import SGD from fastNLP import Trainer +from fastNLP.core.callback import EarlyStopCallback, GradientClipCallback, LRScheduler, ControlC, \ + LRFinder, TensorboardCallback +from fastNLP.core.callback import EvaluateCallback, FitlogCallback, SaveModelCallback +from fastNLP.core.callback import WarmupCallback from fastNLP.models.base_model import NaiveClassifier -from fastNLP.core.callback import EarlyStopError def 
prepare_env(): - def prepare_fake_dataset(): - mean = np.array([-3, -3]) - cov = np.array([[1, 0], [0, 1]]) - class_A = np.random.multivariate_normal(mean, cov, size=(1000,)) - - mean = np.array([3, 3]) - cov = np.array([[1, 0], [0, 1]]) - class_B = np.random.multivariate_normal(mean, cov, size=(1000,)) - - data_set = DataSet([Instance(x=[float(item[0]), float(item[1])], y=[0.0]) for item in class_A] + - [Instance(x=[float(item[0]), float(item[1])], y=[1.0]) for item in class_B]) - return data_set + mean = np.array([-3, -3]) + cov = np.array([[1, 0], [0, 1]]) + class_A = np.random.multivariate_normal(mean, cov, size=(1000,)) + + mean = np.array([3, 3]) + cov = np.array([[1, 0], [0, 1]]) + class_B = np.random.multivariate_normal(mean, cov, size=(1000,)) + + data_set = DataSet([Instance(x=[float(item[0]), float(item[1])], y=[0.0]) for item in class_A] + + [Instance(x=[float(item[0]), float(item[1])], y=[1.0]) for item in class_B]) - data_set = prepare_fake_dataset() data_set.set_input("x") data_set.set_target("y") model = NaiveClassifier(2, 1) @@ -37,6 +37,12 @@ def prepare_env(): class TestCallback(unittest.TestCase): + def setUp(self): + self.tempdir = tempfile.mkdtemp() + + def tearDown(self): + pass + # shutil.rmtree(self.tempdir) def test_gradient_clip(self): data_set, model = prepare_env() @@ -85,6 +91,11 @@ class TestCallback(unittest.TestCase): metrics=AccuracyMetric(pred="predict", target="y"), use_tqdm=False, callbacks=[TensorboardCallback("loss", "metric")], check_code_level=2) trainer.train() + import os + import shutil + path = os.path.join("./", 'tensorboard_logs_{}'.format(trainer.start_time)) + if os.path.exists(path): + shutil.rmtree(path) def test_readonly_property(self): from fastNLP.core.callback import Callback @@ -108,3 +119,98 @@ class TestCallback(unittest.TestCase): check_code_level=2) trainer.train() assert passed_epochs == list(range(1, total_epochs + 1)) + + def test_evaluate_callback(self): + data_set, model = prepare_env() + from fastNLP import Tester + tester = Tester(data=data_set, model=model, metrics=AccuracyMetric(pred="predict", target="y")) + evaluate_callback = EvaluateCallback(data_set, tester) + + trainer = Trainer(data_set, model, optimizer=SGD(lr=0.1), loss=BCELoss(pred="predict", target="y"), + batch_size=32, n_epochs=5, print_every=50, dev_data=data_set, + metrics=AccuracyMetric(pred="predict", target="y"), use_tqdm=False, + callbacks=evaluate_callback, check_code_level=2) + trainer.train() + + def test_fitlog_callback(self): + import fitlog + fitlog.set_log_dir(self.tempdir) + data_set, model = prepare_env() + from fastNLP import Tester + tester = Tester(data=data_set, model=model, metrics=AccuracyMetric(pred="predict", target="y")) + fitlog_callback = FitlogCallback(data_set, tester) + + trainer = Trainer(data_set, model, optimizer=SGD(lr=0.1), loss=BCELoss(pred="predict", target="y"), + batch_size=32, n_epochs=5, print_every=50, dev_data=data_set, + metrics=AccuracyMetric(pred="predict", target="y"), use_tqdm=True, + callbacks=fitlog_callback, check_code_level=2) + trainer.train() + + def test_save_model_callback(self): + data_set, model = prepare_env() + top = 3 + save_model_callback = SaveModelCallback(self.tempdir, top=top) + trainer = Trainer(data_set, model, optimizer=SGD(lr=0.1), loss=BCELoss(pred="predict", target="y"), + batch_size=32, n_epochs=5, print_every=50, dev_data=data_set, + metrics=AccuracyMetric(pred="predict", target="y"), use_tqdm=True, + callbacks=save_model_callback, check_code_level=2) + trainer.train() + + timestamp = 
os.listdir(self.tempdir)[0] + self.assertEqual(len(os.listdir(os.path.join(self.tempdir, timestamp))), top) + + def test_warmup_callback(self): + data_set, model = prepare_env() + warmup_callback = WarmupCallback() + trainer = Trainer(data_set, model, optimizer=SGD(lr=0.1), loss=BCELoss(pred="predict", target="y"), + batch_size=32, n_epochs=5, print_every=50, dev_data=data_set, + metrics=AccuracyMetric(pred="predict", target="y"), use_tqdm=True, + callbacks=warmup_callback, check_code_level=2) + trainer.train() + + def test_early_stop_callback(self): + """ + 需要观察是否真的 EarlyStop + """ + data_set, model = prepare_env() + trainer = Trainer(data_set, model, optimizer=SGD(lr=0.1), loss=BCELoss(pred="predict", target="y"), + batch_size=2, n_epochs=10, print_every=5, dev_data=data_set, + metrics=AccuracyMetric(pred="predict", target="y"), use_tqdm=True, + callbacks=EarlyStopCallback(1), check_code_level=2) + trainer.train() + + +def test_control_C(): + # 用于测试 ControlC , 再两次训练时用 Control+C 进行退出,如果最后不显示 "Test failed!" 则通过测试 + from fastNLP import ControlC, Callback + import time + + line1 = "\n\n\n\n\n*************************" + line2 = "*************************\n\n\n\n\n" + + class Wait(Callback): + def on_epoch_end(self): + time.sleep(5) + + data_set, model = prepare_env() + + print(line1 + "Test starts!" + line2) + trainer = Trainer(data_set, model, optimizer=SGD(lr=0.1), loss=BCELoss(pred="predict", target="y"), + batch_size=32, n_epochs=20, dev_data=data_set, + metrics=AccuracyMetric(pred="predict", target="y"), use_tqdm=True, + callbacks=[Wait(), ControlC(False)], check_code_level=2) + trainer.train() + + print(line1 + "Program goes on ..." + line2) + + trainer = Trainer(data_set, model, optimizer=SGD(lr=0.1), loss=BCELoss(pred="predict", target="y"), + batch_size=32, n_epochs=20, dev_data=data_set, + metrics=AccuracyMetric(pred="predict", target="y"), use_tqdm=True, + callbacks=[Wait(), ControlC(True)], check_code_level=2) + trainer.train() + + print(line1 + "Test failed!" 
+ line2) + + +if __name__ == "__main__": + test_control_C() diff --git a/test/core/test_dataset.py b/test/core/test_dataset.py index 0228f207..e05148a6 100644 --- a/test/core/test_dataset.py +++ b/test/core/test_dataset.py @@ -1,4 +1,5 @@ import os +import sys import unittest from fastNLP import DataSet @@ -79,6 +80,16 @@ class TestDataSetMethods(unittest.TestCase): self.assertFalse("x" in dd.field_arrays) self.assertTrue("y" in dd.field_arrays) + def test_delete_instance(self): + dd = DataSet() + old_length = 2 + dd.add_field("x", [[1, 2, 3]] * old_length) + dd.add_field("y", [[1, 2, 3, 4]] * old_length) + dd.delete_instance(0) + self.assertEqual(len(dd), old_length-1) + dd.delete_instance(0) + self.assertEqual(len(dd), old_length-2) + def test_getitem(self): ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40}) ins_1, ins_0 = ds[0], ds[1] @@ -124,6 +135,14 @@ class TestDataSetMethods(unittest.TestCase): ds.apply(lambda ins: (len(ins["x"]), "hahaha"), new_field_name="k", ignore_type=True) # expect no exception raised + def test_apply_cannot_modify_instance(self): + ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40}) + def modify_inplace(instance): + instance['words'] = 1 + + with self.assertRaises(TypeError): + ds.apply(modify_inplace) + def test_drop(self): ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6], [7, 8, 9, 0]] * 20}) ds.drop(lambda ins: len(ins["y"]) < 3, inplace=True) @@ -171,8 +190,9 @@ class TestDataSetMethods(unittest.TestCase): def test_apply2(self): def split_sent(ins): return ins['raw_sentence'].split() - csv_loader = CSVLoader(headers=['raw_sentence', 'label'],sep='\t') - dataset = csv_loader.load('test/data_for_tests/tutorial_sample_dataset.csv') + csv_loader = CSVLoader(headers=['raw_sentence', 'label'], sep='\t') + data_bundle = csv_loader.load('test/data_for_tests/tutorial_sample_dataset.csv') + dataset = data_bundle.datasets['train'] dataset.drop(lambda x: len(x['raw_sentence'].split()) == 0, inplace=True) dataset.apply(split_sent, new_field_name='words', is_input=True) # print(dataset) @@ -217,4 +237,17 @@ class TestDataSetIter(unittest.TestCase): def test__repr__(self): ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10}) for iter in ds: - self.assertEqual(iter.__repr__(), "{'x': [1, 2, 3, 4] type=list,\n'y': [5, 6] type=list}") + self.assertEqual(iter.__repr__(), """+--------------+--------+ +| x | y | ++--------------+--------+ +| [1, 2, 3, 4] | [5, 6] | ++--------------+--------+""") + + +class TestDataSetFieldMeta(unittest.TestCase): + def test_print_field_meta(self): + ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10}) + ds.print_field_meta() + + ds.set_input('x') + ds.print_field_meta() diff --git a/test/core/test_dist_trainer.py b/test/core/test_dist_trainer.py new file mode 100644 index 00000000..d2a11a76 --- /dev/null +++ b/test/core/test_dist_trainer.py @@ -0,0 +1,176 @@ +import os +import shutil +import subprocess +import unittest +from argparse import ArgumentParser + +import numpy as np +import torch.cuda + +from fastNLP import AccuracyMetric +from fastNLP import CrossEntropyLoss, BCELoss +from fastNLP import DataSet +from fastNLP import Instance +from fastNLP import SGD +from fastNLP.core.callback import EchoCallback +from fastNLP.core.dist_trainer import DistTrainer, get_local_rank +from fastNLP.models.base_model import NaiveClassifier + + +def prepare_fake_dataset(): + mean = np.array([-3, -3]) + cov = np.array([[1, 0], [0, 1]]) + class_A = np.random.multivariate_normal(mean, cov, size=(1000,)) + + mean = 
np.array([3, 3]) + cov = np.array([[1, 0], [0, 1]]) + class_B = np.random.multivariate_normal(mean, cov, size=(1000,)) + + data_set = DataSet([Instance(x=[float(item[0]), float(item[1])], y=0) for item in class_A] + + [Instance(x=[float(item[0]), float(item[1])], y=1) for item in class_B]) + return data_set + + +def prepare_fake_dataset2(*args, size=100): + ys = np.random.randint(4, size=100, dtype=np.int64) + data = {'y': ys} + for arg in args: + data[arg] = np.random.randn(size, 5) + return DataSet(data=data) + + +def set_rng_seed(seed): + np.random.seed(seed) + + +def prepare_env(): + def prepare_fake_dataset(): + mean = np.array([-3, -3]) + cov = np.array([[1, 0], [0, 1]]) + class_A = np.random.multivariate_normal(mean, cov, size=(1000,)) + + mean = np.array([3, 3]) + cov = np.array([[1, 0], [0, 1]]) + class_B = np.random.multivariate_normal(mean, cov, size=(1000,)) + + data_set = DataSet([Instance(x=[float(item[0]), float(item[1])], y=[0.0]) for item in class_A] + + [Instance(x=[float(item[0]), float(item[1])], y=[1.0]) for item in class_B]) + return data_set + + data_set = prepare_fake_dataset() + data_set.set_input("x") + data_set.set_target("y") + model = NaiveClassifier(2, 1) + return data_set, model + + +class TestDistTrainer(unittest.TestCase): + save_path = './save_cp' + + def run1(self): + # test distributed training + print('local rank', get_local_rank()) + set_rng_seed(100) + data_set = prepare_fake_dataset() + data_set.set_input("x", flag=True) + data_set.set_target("y", flag=True) + + model = NaiveClassifier(2, 2) + + trainer = DistTrainer( + model=model, train_data=data_set, optimizer=SGD(lr=0.1), + loss=CrossEntropyLoss(pred="predict", target="y"), + batch_size_per_gpu=8, n_epochs=3, print_every=50, save_path=self.save_path, + ) + trainer.train() + """ + # 应该正确运行 + """ + if trainer.is_master and os.path.exists(self.save_path): + shutil.rmtree(self.save_path) + + def run2(self): + # test fp16 with distributed training + print('local rank', get_local_rank()) + set_rng_seed(100) + data_set = prepare_fake_dataset() + data_set.set_input("x", flag=True) + data_set.set_target("y", flag=True) + + model = NaiveClassifier(2, 2) + + trainer = DistTrainer( + model=model, train_data=data_set, optimizer=SGD(lr=0.1), + loss=CrossEntropyLoss(pred="predict", target="y"), + batch_size_per_gpu=8, n_epochs=3, print_every=50, save_path=self.save_path, + fp16='O1' + ) + trainer.train() + """ + # 应该正确运行 + """ + if trainer.is_master and os.path.exists(self.save_path): + shutil.rmtree(self.save_path) + + def run3(self): + set_rng_seed(100) + data_set, model = prepare_env() + trainer = DistTrainer( + data_set, model, optimizer=None, + loss=BCELoss(pred="predict", target="y"), + n_epochs=3, print_every=50, + callbacks_all=[EchoCallback('callbacks_all')], + callbacks_master=[EchoCallback('callbacks_master')] + ) + trainer.train() + + def run4(self): + set_rng_seed(100) + data_set, model = prepare_env() + + train_set, dev_set = data_set.split(0.3) + + model = NaiveClassifier(2, 1) + + trainer = DistTrainer( + train_set, model, optimizer=SGD(lr=0.1), + loss=BCELoss(pred="predict", target="y"), + batch_size_per_gpu=32, n_epochs=3, print_every=50, dev_data=dev_set, + metrics=AccuracyMetric(pred="predict", target="y"), validate_every=-1, save_path=self.save_path, + ) + trainer.train() + """ + # 应该正确运行 + """ + if trainer.is_master and os.path.exists(self.save_path): + shutil.rmtree(self.save_path) + + def run_dist(self, run_id): + if torch.cuda.is_available(): + ngpu = min(2, torch.cuda.device_count()) + 
path = os.path.abspath(__file__) + cmd = ['python', '-m', 'torch.distributed.launch', + '--nproc_per_node', str(ngpu), path, '--test', str(run_id)] + print(' '.join(cmd)) + subprocess.check_call(cmd) + + def test_normal_run(self): + self.run_dist(1) + + def no_test_fp16(self): + self.run_dist(2) + + def test_callback(self): + self.run_dist(3) + + def test_dev_data(self): + self.run_dist(4) + + +if __name__ == '__main__': + runner = TestDistTrainer() + parser = ArgumentParser() + parser.add_argument('--test', type=int) + args, _ = parser.parse_known_args() + if args.test and hasattr(runner, 'run%s' % args.test): + getattr(runner, 'run%s' % args.test)() diff --git a/test/core/test_field.py b/test/core/test_field.py index e9053f37..c46e2de2 100644 --- a/test/core/test_field.py +++ b/test/core/test_field.py @@ -170,22 +170,22 @@ class TestFieldArray(unittest.TestCase): def test_append(self): with self.assertRaises(Exception): - fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1, 2, 3, 4, 5]], is_input=True) + fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1, 2, 3, 4, 5]], is_input=True, use_1st_ins_infer_dim_type=False) fa.append(0) with self.assertRaises(Exception): - fa = FieldArray("y", [1.1, 2.2, 3.3, 4.4, 5.5], is_input=True) + fa = FieldArray("y", [1.1, 2.2, 3.3, 4.4, 5.5], is_input=True, use_1st_ins_infer_dim_type=False) fa.append([1, 2, 3, 4, 5]) with self.assertRaises(Exception): - fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1, 2, 3, 4, 5]], is_input=True) + fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1, 2, 3, 4, 5]], is_input=True, use_1st_ins_infer_dim_type=False) fa.append([]) with self.assertRaises(Exception): - fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1, 2, 3, 4, 5]], is_input=True) + fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1, 2, 3, 4, 5]], is_input=True, use_1st_ins_infer_dim_type=False) fa.append(["str", 0, 0, 0, 1.89]) - fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1.0, 2.0, 3.0, 4.0, 5.0]], is_input=True) + fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1.0, 2.0, 3.0, 4.0, 5.0]], is_input=True, use_1st_ins_infer_dim_type=False) fa.append([1.2, 2.3, 3.4, 4.5, 5.6]) self.assertEqual(len(fa), 3) self.assertEqual(fa[2], [1.2, 2.3, 3.4, 4.5, 5.6]) diff --git a/test/core/test_logger.py b/test/core/test_logger.py new file mode 100644 index 00000000..610f42bd --- /dev/null +++ b/test/core/test_logger.py @@ -0,0 +1,33 @@ +from fastNLP import logger +import unittest +from unittest.mock import patch +import os +import io +import tempfile +import shutil + +class TestLogger(unittest.TestCase): + msg = 'some test logger msg' + + def setUp(self): + self.tmpdir = tempfile.mkdtemp() + + def tearDown(self): + pass + # shutil.rmtree(self.tmpdir) + + def test_add_file(self): + fn = os.path.join(self.tmpdir, 'log.txt') + logger.add_file(fn) + logger.info(self.msg) + with open(fn, 'r') as f: + line = ''.join([l for l in f]) + print(line) + self.assertTrue(self.msg in line) + + @patch('sys.stdout', new_callable=io.StringIO) + def test_stdout(self, mock_out): + for i in range(3): + logger.info(self.msg) + + self.assertEqual([self.msg for i in range(3)], mock_out.getvalue().strip().split('\n')) diff --git a/test/core/test_loss.py b/test/core/test_loss.py index 8db54615..9ba8159f 100644 --- a/test/core/test_loss.py +++ b/test/core/test_loss.py @@ -4,7 +4,6 @@ import torch import torch.nn.functional as F import fastNLP as loss -from fastNLP.core.losses import squash, unpad class TestLoss(unittest.TestCase): @@ -73,15 +72,3 @@ class 
TestLosserError(unittest.TestCase): with self.assertRaises(Exception): ans = l1({"my_predict": a}, {"truth": b, "my": a}) - - -class TestLossUtils(unittest.TestCase): - def test_squash(self): - a, b = squash(torch.randn(3, 5), torch.randn(3, 5)) - self.assertEqual(tuple(a.size()), (3, 5)) - self.assertEqual(tuple(b.size()), (15,)) - - def test_unpad(self): - a, b = unpad(torch.randn(5, 8, 3), torch.randn(5, 8)) - self.assertEqual(tuple(a.size()), (5, 8, 3)) - self.assertEqual(tuple(b.size()), (5, 8)) diff --git a/test/core/test_metrics.py b/test/core/test_metrics.py index 9c8a586c..16711064 100644 --- a/test/core/test_metrics.py +++ b/test/core/test_metrics.py @@ -7,10 +7,16 @@ from fastNLP import AccuracyMetric from fastNLP.core.metrics import _pred_topk, _accuracy_topk from fastNLP.core.vocabulary import Vocabulary from collections import Counter -from fastNLP.core.metrics import SpanFPreRecMetric +from fastNLP.core.metrics import SpanFPreRecMetric, CMRC2018Metric def _generate_tags(encoding_type, number_labels=4): + """ + + :param encoding_type: 例如BIOES, BMES, BIO等 + :param number_labels: 多少个label,大于1 + :return: + """ vocab = {} for i in range(number_labels): label = str(i) @@ -184,7 +190,7 @@ class TestAccuracyMetric(unittest.TestCase): self.assertDictEqual(metric.get_metric(), {'acc': 1.}) -class SpanF1PreRecMetric(unittest.TestCase): +class SpanFPreRecMetricTest(unittest.TestCase): def test_case1(self): from fastNLP.core.metrics import _bmes_tag_to_spans from fastNLP.core.metrics import _bio_tag_to_spans @@ -338,6 +344,97 @@ class SpanF1PreRecMetric(unittest.TestCase): for key, value in expected_metric.items(): self.assertAlmostEqual(value, metric_value[key], places=5) + def test_auto_encoding_type_infer(self): + # 检查是否可以自动check encode的类型 + vocabs = {} + import random + for encoding_type in ['bio', 'bioes', 'bmeso']: + vocab = Vocabulary(unknown=None, padding=None) + for i in range(random.randint(10, 100)): + label = str(random.randint(1, 10)) + for tag in encoding_type: + if tag!='o': + vocab.add_word(f'{tag}-{label}') + else: + vocab.add_word('o') + vocabs[encoding_type] = vocab + for e in ['bio', 'bioes', 'bmeso']: + with self.subTest(e=e): + metric = SpanFPreRecMetric(tag_vocab=vocabs[e]) + assert metric.encoding_type == e + + bmes_vocab = _generate_tags('bmes') + vocab = Vocabulary() + for tag, index in bmes_vocab.items(): + vocab.add_word(tag) + metric = SpanFPreRecMetric(vocab) + assert metric.encoding_type == 'bmes' + + # 一些无法check的情况 + vocab = Vocabulary() + for i in range(10): + vocab.add_word(str(i)) + with self.assertRaises(Exception): + metric = SpanFPreRecMetric(vocab) + + def test_encoding_type(self): + # 检查传入的tag_vocab与encoding_type不符合时,是否会报错 + vocabs = {} + import random + from itertools import product + for encoding_type in ['bio', 'bioes', 'bmeso']: + vocab = Vocabulary(unknown=None, padding=None) + for i in range(random.randint(10, 100)): + label = str(random.randint(1, 10)) + for tag in encoding_type: + if tag!='o': + vocab.add_word(f'{tag}-{label}') + else: + vocab.add_word('o') + vocabs[encoding_type] = vocab + for e1, e2 in product(['bio', 'bioes', 'bmeso'], ['bio', 'bioes', 'bmeso']): + with self.subTest(e1=e1, e2=e2): + if e1==e2: + metric = SpanFPreRecMetric(vocabs[e1], encoding_type=e2) + else: + s2 = set(e2) + s2.update(set(e1)) + if s2==set(e2): + continue + with self.assertRaises(AssertionError): + metric = SpanFPreRecMetric(vocabs[e1], encoding_type=e2) + for encoding_type in ['bio', 'bioes', 'bmeso']: + with self.assertRaises(AssertionError): + metric 
= SpanFPreRecMetric(vocabs[encoding_type], encoding_type='bmes') + + with self.assertWarns(Warning): + vocab = Vocabulary(unknown=None, padding=None).add_word_lst(list('bmes')) + metric = SpanFPreRecMetric(vocab, encoding_type='bmeso') + vocab = Vocabulary().add_word_lst(list('bmes')) + metric = SpanFPreRecMetric(vocab, encoding_type='bmeso') + + +class TestCMRC2018Metric(unittest.TestCase): + def test_case1(self): + # check that the metric is computed correctly + import torch + metric = CMRC2018Metric() + + raw_chars = [list("abcsdef"), list("123456s789")] + context_len = torch.LongTensor([3, 6]) + answers = [["abc", "abc", "abc"], ["12", "12", "12"]] + pred_start = torch.randn(2, max(map(len, raw_chars))) + pred_end = torch.randn(2, max(map(len, raw_chars))) + pred_start[0, 0] = 1000 # picks out exactly "abc" + pred_end[0, 2] = 1000 + pred_start[1, 1] = 1000 # picks out "234" + pred_end[1, 3] = 1000 + + metric.evaluate(answers, raw_chars, context_len, pred_start, pred_end) + + eval_res = metric.get_metric() + self.assertDictEqual(eval_res, {'f1': 70.0, 'em': 50.0}) + class TestUsefulFunctions(unittest.TestCase): # 测试metrics.py中一些看上去挺有用的函数 @@ -347,3 +444,6 @@ class TestUsefulFunctions(unittest.TestCase): _ = _pred_topk(np.random.randint(0, 3, size=(10, 1))) # 跑通即可 + + + diff --git a/test/core/test_optimizer.py b/test/core/test_optimizer.py index b9a1c271..2f2487c7 100644 --- a/test/core/test_optimizer.py +++ b/test/core/test_optimizer.py @@ -2,7 +2,7 @@ import unittest import torch -from fastNLP import SGD, Adam +from fastNLP import SGD, Adam, AdamW class TestOptim(unittest.TestCase): @@ -52,3 +52,12 @@ class TestOptim(unittest.TestCase): self.assertEqual(optim.__dict__["settings"]["lr"], 0.001) res = optim.construct_from_pytorch(torch.nn.Linear(10, 3).parameters()) self.assertTrue(isinstance(res, torch.optim.Adam)) + + def test_AdamW(self): + optim = AdamW(params=torch.nn.Linear(10, 3).parameters()) + self.assertTrue('lr' in optim.defaults) + self.assertTrue('weight_decay' in optim.defaults) + + optim = AdamW(params=torch.nn.Linear(10, 3).parameters(), lr=0.002, weight_decay=0.989) + self.assertEqual(optim.defaults['lr'], 0.002) + self.assertEqual(optim.defaults['weight_decay'], 0.989) diff --git a/test/core/test_utils.py b/test/core/test_utils.py index 363d5fa1..0093c3e8 100644 --- a/test/core/test_utils.py +++ b/test/core/test_utils.py @@ -10,7 +10,8 @@ import torch from torch import nn from fastNLP.core.utils import _move_model_to_device, _get_model_device import numpy as np -from fastNLP.core.utils import seq_len_to_mask +from fastNLP.core.utils import seq_len_to_mask, get_seq_len +from fastNLP.core.utils import iob2, iob2bioes class Model(nn.Module): def __init__(self): @@ -119,7 +120,8 @@ class TestCache(unittest.TestCase): def test_cache_save(self): try: start_time = time.time() - embed, vocab, d = process_data_1('test/data_for_tests/word2vec_test.txt', 'test/data_for_tests/cws_train') + embed, vocab, d = process_data_1('test/data_for_tests/embedding/small_static_embedding/word2vec_test.txt', + 'test/data_for_tests/cws_train') end_time = time.time() pre_time = end_time - start_time with open('test/demo1.pkl', 'rb') as f: @@ -128,7 +130,8 @@ class TestCache(unittest.TestCase): for i in range(embed.shape[0]): self.assertListEqual(embed[i].tolist(), _embed[i].tolist()) start_time = time.time() - embed, vocab, d = process_data_1('test/data_for_tests/word2vec_test.txt', 'test/data_for_tests/cws_train') + embed, vocab, d = process_data_1('test/data_for_tests/embedding/small_static_embedding/word2vec_test.txt', + 'test/data_for_tests/cws_train')
end_time = time.time() read_time = end_time - start_time print("Read using {:.3f}, while prepare using:{:.3f}".format(read_time, pre_time)) @@ -139,7 +142,7 @@ class TestCache(unittest.TestCase): def test_cache_save_overwrite_path(self): try: start_time = time.time() - embed, vocab, d = process_data_1('test/data_for_tests/word2vec_test.txt', 'test/data_for_tests/cws_train', + embed, vocab, d = process_data_1('test/data_for_tests/embedding/small_static_embedding/word2vec_test.txt', 'test/data_for_tests/cws_train', _cache_fp='test/demo_overwrite.pkl') end_time = time.time() pre_time = end_time - start_time @@ -149,7 +152,8 @@ class TestCache(unittest.TestCase): for i in range(embed.shape[0]): self.assertListEqual(embed[i].tolist(), _embed[i].tolist()) start_time = time.time() - embed, vocab, d = process_data_1('test/data_for_tests/word2vec_test.txt', 'test/data_for_tests/cws_train', + embed, vocab, d = process_data_1('test/data_for_tests/embedding/small_static_embedding/word2vec_test.txt', + 'test/data_for_tests/cws_train', _cache_fp='test/demo_overwrite.pkl') end_time = time.time() read_time = end_time - start_time @@ -161,7 +165,8 @@ class TestCache(unittest.TestCase): def test_cache_refresh(self): try: start_time = time.time() - embed, vocab, d = process_data_1('test/data_for_tests/word2vec_test.txt', 'test/data_for_tests/cws_train', + embed, vocab, d = process_data_1('test/data_for_tests/embedding/small_static_embedding/word2vec_test.txt', + 'test/data_for_tests/cws_train', _refresh=True) end_time = time.time() pre_time = end_time - start_time @@ -171,7 +176,8 @@ class TestCache(unittest.TestCase): for i in range(embed.shape[0]): self.assertListEqual(embed[i].tolist(), _embed[i].tolist()) start_time = time.time() - embed, vocab, d = process_data_1('test/data_for_tests/word2vec_test.txt', 'test/data_for_tests/cws_train', + embed, vocab, d = process_data_1('test/data_for_tests/embedding/small_static_embedding/word2vec_test.txt', + 'test/data_for_tests/cws_train', _refresh=True) end_time = time.time() read_time = end_time - start_time @@ -258,4 +264,27 @@ class TestSeqLenToMask(unittest.TestCase): # 3. 
pad到指定长度 seq_len = torch.randint(1, 10, size=(10, )) mask = seq_len_to_mask(seq_len, 100) - self.assertEqual(100, mask.size(1)) \ No newline at end of file + self.assertEqual(100, mask.size(1)) + + +class TestUtils(unittest.TestCase): + def test_get_seq_len(self): + seq_len = torch.randint(1, 10, size=(10, )) + mask = seq_len_to_mask(seq_len) + new_seq_len = get_seq_len(mask) + self.assertSequenceEqual(seq_len.tolist(), new_seq_len.tolist()) + + def test_iob2(self): + tags = ['B-NP', 'O', 'B-NP', 'B-VP', 'B-NP', 'I-NP', 'O', 'B-NP', 'B-PP', 'B-NP', 'I-NP', 'O', 'B-NP', 'I-NP', 'B-NP', 'O', 'B-NP', 'I-NP', 'I-NP'] + convert_tags = ['B-NP', 'O', 'B-NP', 'B-VP', 'B-NP', 'I-NP', 'O', 'B-NP', 'B-PP', 'B-NP', 'I-NP', 'O', 'B-NP', 'I-NP', 'B-NP', 'O', 'B-NP', 'I-NP', 'I-NP'] + self.assertSequenceEqual(convert_tags, iob2(tags)) + + tags = ['I-NP', 'O', 'I-NP', 'I-VP', 'B-NP', 'I-NP', 'O', 'I-NP', 'I-PP', 'B-NP', 'I-NP', 'O', 'B-NP', 'I-NP', 'B-NP', 'O', 'B-NP', 'I-NP', 'I-NP'] + self.assertSequenceEqual(convert_tags, iob2(tags)) + + def test_iob2bioes(self): + tags = ['B-NP', 'O', 'B-NP', 'B-VP', 'B-NP', 'I-NP', 'O', 'B-NP', 'B-PP', 'B-NP', 'I-NP', 'O', 'B-NP', 'I-NP', 'B-NP', 'O', 'B-NP', 'I-NP', 'I-NP'] + convert_tags = ['S-NP', 'O', 'S-NP', 'S-VP', 'B-NP', 'E-NP', 'O', 'S-NP', 'S-PP', 'B-NP', 'E-NP', 'O', 'B-NP', 'E-NP', 'S-NP', 'O', 'B-NP', 'I-NP', 'E-NP'] + + self.assertSequenceEqual(convert_tags, iob2bioes(tags)) + diff --git a/test/data_for_tests/embedding/small_bert/config.json b/test/data_for_tests/embedding/small_bert/config.json new file mode 100644 index 00000000..3e516872 --- /dev/null +++ b/test/data_for_tests/embedding/small_bert/config.json @@ -0,0 +1,13 @@ +{ + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 16, + "initializer_range": 0.02, + "intermediate_size": 64, + "max_position_embeddings": 32, + "num_attention_heads": 4, + "num_hidden_layers": 2, + "type_vocab_size": 2, + "vocab_size": 20 +} \ No newline at end of file diff --git a/test/data_for_tests/embedding/small_bert/small_pytorch_model.bin b/test/data_for_tests/embedding/small_bert/small_pytorch_model.bin new file mode 100644 index 00000000..fe968fb5 Binary files /dev/null and b/test/data_for_tests/embedding/small_bert/small_pytorch_model.bin differ diff --git a/test/data_for_tests/embedding/small_bert/vocab.txt b/test/data_for_tests/embedding/small_bert/vocab.txt new file mode 100644 index 00000000..565e67af --- /dev/null +++ b/test/data_for_tests/embedding/small_bert/vocab.txt @@ -0,0 +1,20 @@ +[PAD] +[UNK] +[CLS] +[SEP] +this +is +a +small +bert +model +vocab +file +and +only +twenty +line +for +the +whole +text diff --git a/test/data_for_tests/embedding/small_elmo/char.dic b/test/data_for_tests/embedding/small_elmo/char.dic new file mode 100644 index 00000000..74285f34 --- /dev/null +++ b/test/data_for_tests/embedding/small_elmo/char.dic @@ -0,0 +1,229 @@ +! 33 +" 34 +# 35 +$ 36 +% 37 +& 38 +' 39 +( 40 +) 41 +* 42 ++ 43 +, 44 +- 45 +. 46 +/ 47 +0 48 +1 49 +2 50 +3 51 +4 52 +5 53 +6 54 +7 55 +8 56 +9 57 +: 58 +; 59 +< 60 += 61 +> 62 +? 
63 +@ 64 +A 65 +B 66 +C 67 +D 68 +E 69 +F 70 +G 71 +H 72 +I 73 +J 74 +K 75 +L 76 +M 77 +N 78 +O 79 +P 80 +Q 81 +R 82 +S 83 +T 84 +U 85 +V 86 +W 87 +X 88 +Y 89 +Z 90 +[ 91 +\ 92 +] 93 +^ 94 +_ 95 +` 96 +a 97 +b 98 +c 99 +d 100 +e 101 +f 102 +g 103 +h 104 +i 105 +j 106 +k 107 +l 108 +m 109 +n 110 +o 111 +p 112 +q 113 +r 114 +s 115 +t 116 +u 117 +v 118 +w 119 +x 120 +y 121 +z 122 +{ 123 +| 124 +} 125 +~ 126 + 127 +€ 128 + 129 +‚ 130 +ƒ 131 +„ 132 +† 134 +‡ 135 +ˆ 136 +‰ 137 +Š 138 +‹ 139 +Œ 140 + 141 +Ž 142 + 143 + 144 +‘ 145 +’ 146 +“ 147 +” 148 +• 149 +– 150 +— 151 +˜ 152 +™ 153 +š 154 +› 155 +œ 156 + 157 +ž 158 +Ÿ 159 +  160 +¡ 161 +¢ 162 +£ 163 +¤ 164 +¥ 165 +¦ 166 +§ 167 +¨ 168 +© 169 +ª 170 +« 171 +¬ 172 +­ 173 +® 174 +¯ 175 +° 176 +± 177 +² 178 +³ 179 +´ 180 +µ 181 +¶ 182 +· 183 +¸ 184 +¹ 185 +º 186 +» 187 +¼ 188 +½ 189 +¾ 190 +¿ 191 +À 192 +Á 193 + 194 +à 195 +Ä 196 +Å 197 +Æ 198 +Ç 199 +È 200 +É 201 +Ê 202 +Ë 203 +Ì 204 +Í 205 +Î 206 +Ï 207 +Ð 208 +Ñ 209 +Ò 210 +Ó 211 +Ô 212 +Õ 213 +Ö 214 +× 215 +Ø 216 +Ù 217 +Ú 218 +Û 219 +Ü 220 +Ý 221 +Þ 222 +ß 223 +à 224 +á 225 +â 226 +ã 227 +ä 228 +å 229 +æ 230 +ç 231 +è 232 +é 233 +ê 234 +ë 235 +ì 236 +í 237 +î 238 +ï 239 +ð 240 +ñ 241 +ò 242 +ó 243 +ô 244 +õ 245 +ö 246 +÷ 247 +ø 248 +ù 249 +ú 250 +û 251 +ü 252 +ý 253 +þ 254 +ÿ 255 + 256 + 257 + 258 + 259 + 260 + 1 + -1 diff --git a/test/data_for_tests/embedding/small_elmo/elmo_1x16_16_32cnn_1xhighway_options.json b/test/data_for_tests/embedding/small_elmo/elmo_1x16_16_32cnn_1xhighway_options.json new file mode 100644 index 00000000..9c02ef72 --- /dev/null +++ b/test/data_for_tests/embedding/small_elmo/elmo_1x16_16_32cnn_1xhighway_options.json @@ -0,0 +1,29 @@ +{ + "lstm": { + "use_skip_connections": true, + "projection_dim": 16, + "cell_clip": 3, + "proj_clip": 3, + "dim": 16, + "n_layers": 1 + }, + "char_cnn": { + "activation": "relu", + "filters": [ + [ + 1, + 16 + ], + [ + 2, + 16 + ] + ], + "n_highway": 1, + "embedding": { + "dim": 4 + }, + "n_characters": 262, + "max_characters_per_token": 50 + } +} diff --git a/test/data_for_tests/embedding/small_elmo/elmo_mini_for_testing.pkl b/test/data_for_tests/embedding/small_elmo/elmo_mini_for_testing.pkl new file mode 100644 index 00000000..4c72f3d5 Binary files /dev/null and b/test/data_for_tests/embedding/small_elmo/elmo_mini_for_testing.pkl differ diff --git a/test/data_for_tests/glove.6B.50d_test.txt b/test/data_for_tests/embedding/small_static_embedding/glove.6B.50d_test.txt similarity index 100% rename from test/data_for_tests/glove.6B.50d_test.txt rename to test/data_for_tests/embedding/small_static_embedding/glove.6B.50d_test.txt diff --git a/test/data_for_tests/word2vec_test.txt b/test/data_for_tests/embedding/small_static_embedding/word2vec_test.txt similarity index 100% rename from test/data_for_tests/word2vec_test.txt rename to test/data_for_tests/embedding/small_static_embedding/word2vec_test.txt diff --git a/test/data_for_tests/io/BQCorpus/dev.txt b/test/data_for_tests/io/BQCorpus/dev.txt new file mode 100644 index 00000000..2bd7414e --- /dev/null +++ b/test/data_for_tests/io/BQCorpus/dev.txt @@ -0,0 +1,6 @@ +sentence1,sentence2,label +综合评分不足什么原因,综合评估的依据,0 +什么时候我能使用微粒贷,你就赶快给我开通就行了,0 +如何修改每个月的还款日期,可以申请延期还款日吗?,0 +没什么问的,不能登陆就是我最大的问题了,登录不上,1 +你的意思是不能取现,借到的钱可不可以提出来,1 diff --git a/test/data_for_tests/io/BQCorpus/test.txt b/test/data_for_tests/io/BQCorpus/test.txt new file mode 100644 index 00000000..949583ad --- /dev/null +++ b/test/data_for_tests/io/BQCorpus/test.txt @@ -0,0 +1,6 @@ +sentence1,sentence2,label +你电话号码多少,你们的客服电话是多少?,1 
+10000块日利息是多少,0.05%就是借2000块,利息为1块钱一天,1 +17号还款了,我现在想提前几天还,怎么弄,一直按时还款,提前还款,怎么会评估不足,0 +我昨晚申请的,现在钱没到,也没有人联系我,审核多久才会打电话,1 +假如我贷四万还款怎么,18号还款日可以不凌晨扣款,我18日下午还款可以吗,0 diff --git a/test/data_for_tests/io/BQCorpus/train.txt b/test/data_for_tests/io/BQCorpus/train.txt new file mode 100644 index 00000000..f2ac4e84 --- /dev/null +++ b/test/data_for_tests/io/BQCorpus/train.txt @@ -0,0 +1,6 @@ +sentence1,sentence2,label +一天了还是不能登录,你好,用app干嘛但是无法登入,1 +为什么我的钱包点开,没显示微粒贷呀,点击我进入钱包,没有,借款的,提示呀!,1 +什么要求,借款没有,0 +微信注册的手机号停机了,还可以办理吗,没有邀请可以注册嘛,0 +开通微粒贷,开通微粒贷!强烈要求,1 diff --git a/test/data_for_tests/io/ChnSentiCorp/dev.txt b/test/data_for_tests/io/ChnSentiCorp/dev.txt new file mode 100644 index 00000000..9387b569 --- /dev/null +++ b/test/data_for_tests/io/ChnSentiCorp/dev.txt @@ -0,0 +1,7 @@ +label text_a +1 基金痛所有投资项目一样,必须先要有所了解,才能把握分寸,不至于跟风而造成损失。此本基金入门的书是一个不错的选择,不像一般的书一样偏重概念,虽然也涉及到概念,但作者用自己的方式解读,使第一次接触基金的人能更好的理解。内容以非常容易理解的语言象大众普及了基金的很多观念,对于普通基民来说,要想有所收获,必须了解基金界的很多情况,在关键的时候才不会盲目跟风。对于新手,强烈推荐。 +1 系统很好装,LED屏是不错,就是16比9的比例看起来比较长,是14.0的屏。外观比较酷,适合年轻人,键盘模仿SONY的,还不错。 +1 这书的装帧很好的,既适合家庭收藏亦适合阅读了解。了解一个人,通过他的书信,而且是家书,再好不过了,而了解这个人也更了解些那个时代,那个社会,给我们现代人些许启发吧。而我从中也知道了他的学习习惯、方法以及教子方面。比较有收获。软精装的封面,封面要是每个唐老师那个照片就更好了,分上下册便于阅读。内里字体有分别:信是用的启功老师的手写字体,评点是宋体。 +0 屏幕没有坏点和暗点,这个比较不错。配置性价比较高,目前使用已有半个月,基本正常。 +0 典型的国营酒店,管理层缺乏责任心,管理混乱。房间里的大灯镜灯台灯都是坏的,只有一盏床头灯可用,不知道酒店是怎么维护的。最可气的是结帐时竟然要求客人赔偿房间里已损坏很久的鞋盒,简直是讹诈。 +0 普通游客旅馆 还三星 让我伤心 店名好大 奇差无比 补充点评 2006年12月8日 : 还说有地下车库 谁敢下去 晕 狭小 黑暗 要卡壳儿的 CTRIP上怎么让它这么忽悠顾客的 ?!!!!!!! diff --git a/test/data_for_tests/io/ChnSentiCorp/test.txt b/test/data_for_tests/io/ChnSentiCorp/test.txt new file mode 100644 index 00000000..35f7d2c5 --- /dev/null +++ b/test/data_for_tests/io/ChnSentiCorp/test.txt @@ -0,0 +1,7 @@ +label text_a +0 v系统和XP系统能做到二选一就更好了,毕竟大部分人还是更偏爱XP系统。 +0 自带的Linix系统上上网还可以,想玩其他的功能毫无疑问得换XP.偶在京东订的时候为了装XP方便,一起买了阿帕奇的USB光驱。到货后,发现该USB光驱无法引导系统光盘启动,已验证过该光驱读写功能正常。 +1 非常不错的酒店,依山傍水,里面大片森林,散散步很不错,坐在湖边也休息也是不错的选择;房间很幽静,房间的设施很好,服务员态度也很好。 +0 5月8日付款成功,当当网显示5月10日发货,可是至今还没看到货物,也没收到任何通知,简不知怎么说好!!! +1 收到书,还未打开就被封面的鲜艳色彩及版样吸引,迫不急待的打开,书内的设计及彩图也不错,色泽及印刷质量都称的上好,没有味道,贴图也从简入深。价格也不贵。拿回家,小宝贝也很喜欢,我家宝宝只有2岁5个月对于她贴片不太好撕,大一些的贴片要我来帮她撕。不过,今天再玩时已经比昨天撕的好很多了,可以锻炼她的小手呢。等这几本用完了,我想我还会再给她买一些类似的书。 +0 挺失望的,还不如买一本张爱玲文集呢,以<色戒>命名,可这篇文章仅仅10多页,且无头无尾的,完全比不上里面的任意一篇其它文章. diff --git a/test/data_for_tests/io/ChnSentiCorp/train.txt b/test/data_for_tests/io/ChnSentiCorp/train.txt new file mode 100644 index 00000000..9e53f1bd --- /dev/null +++ b/test/data_for_tests/io/ChnSentiCorp/train.txt @@ -0,0 +1,7 @@ +label text_a +1 很好的酒店,很规范植得一住.餐厅一般不应该的,不知道为什么. 宾馆反馈 2008年4月17日 : 餐厅现已重新装修,用餐环境较以前要好的多。谢谢您的宝贵意见! +0 这是我看过文字写得很糟糕的书,因为买了,还是耐着性子看完了,但是总体来说不好,文字、内容、结构都不好 +1 拿房时没大床房了,给我们免费升成套房,这点还蛮满意的。酒店大致不错,有国内五星水准。比国际品牌的要差一点。酒店有点年纪了,维修要加强,比如我们浴室的下水就堵塞不通,这些在客人入住前就该发觉修好。其它都还可以。 +1 开始看了2005年的几位朋友的评价,都不敢去入住。没想到现在改观了很多,房间虽小,但很整洁。下次再来的话,还会选择这个酒店。只是希望宽带能一直免费! +0 本机预装的Vista跟瑞星杀软不兼容,蓝屏,不能进入系统,不能自行卸载!!千万小心别装,用卡巴可以。 +0 跟心灵鸡汤没什么本质区别嘛,至少我不喜欢这样读经典,把经典都解读成这样有点去中国化的味道了 diff --git a/test/data_for_tests/io/LCQMC/dev.txt b/test/data_for_tests/io/LCQMC/dev.txt new file mode 100644 index 00000000..3e253c93 --- /dev/null +++ b/test/data_for_tests/io/LCQMC/dev.txt @@ -0,0 +1,6 @@ +开初婚未育证明怎么弄? 初婚未育情况证明怎么开? 1 +脚气怎么治疗 醋怎么治疗脚气 0 +世界是先有男人还是先有女人 世界上是先有男人还是先有女人 1 +有什么小说软件好用的 那个看小说的阅读器较好 1 +网上兼职是做什么的,手机可以做吗 手机可以做什么网上兼职,拍单子是什么 0 +郑州有什么好玩的地方? 
郑州有什么好玩的地方啊 1 diff --git a/test/data_for_tests/io/LCQMC/test.txt b/test/data_for_tests/io/LCQMC/test.txt new file mode 100644 index 00000000..bc694d3a --- /dev/null +++ b/test/data_for_tests/io/LCQMC/test.txt @@ -0,0 +1,5 @@ +谁有狂三这张高清的 这张高清图,谁有 0 +淘宝模特叫什么?急 淘宝的模特她叫什么 1 +不要嘛用韩语怎么说 韩语的请不要走怎么说 0 +倒瓜子脸适合什么发型 额头高又是瓜子脸的女生适合什么刘海 0 +淘宝流量怎么买 刚淘宝店如何才能有流量 0 diff --git a/test/data_for_tests/io/LCQMC/train.txt b/test/data_for_tests/io/LCQMC/train.txt new file mode 100644 index 00000000..9f6d4924 --- /dev/null +++ b/test/data_for_tests/io/LCQMC/train.txt @@ -0,0 +1,6 @@ +喜欢打篮球的男生喜欢什么样的女生 爱打篮球的男生喜欢什么样的女生 1 +你帮我设计小说的封面吧 谁能帮我给小说设计个封面? 0 +移动手机卡刷砖 关于移动手机卡 0 +有什么好听的短信铃声啊 有什么好听的韩剧短信铃声 0 +人生的三大事是什么 人生三大事是什么? 1 +您好是后8位的 您提供后8位即可, 1 diff --git a/test/data_for_tests/io/MNLI/dev_matched.tsv b/test/data_for_tests/io/MNLI/dev_matched.tsv new file mode 100755 index 00000000..ace2dd27 --- /dev/null +++ b/test/data_for_tests/io/MNLI/dev_matched.tsv @@ -0,0 +1,6 @@ +index promptID pairID genre sentence1_binary_parse sentence2_binary_parse sentence1_parse sentence2_parse sentence1 sentence2 label1 label2 label3 label4 label5 gold_label +0 63735 63735n slate ( ( The ( new rights ) ) ( are ( nice enough ) ) ) ( Everyone ( really ( likes ( the ( newest benefits ) ) ) ) ) (ROOT (S (NP (DT The) (JJ new) (NNS rights)) (VP (VBP are) (ADJP (JJ nice) (RB enough))))) (ROOT (S (NP (NN Everyone)) (VP (ADVP (RB really)) (VBZ likes) (NP (DT the) (JJS newest) (NNS benefits))))) The new rights are nice enough Everyone really likes the newest benefits neutral entailment neutral neutral neutral neutral +1 91383 91383c government ( ( This site ) ( ( includes ( ( ( ( a list ) ( of ( all ( award winners ) ) ) ) and ) ( ( a ( searchable database ) ) ( of ( Government ( Executive articles ) ) ) ) ) ) . ) ) ( ( ( The ( Government ( Executive articles ) ) ) ( housed ( on ( the website ) ) ) ) ( ( ( are not ) ( able ( to ( be searched ) ) ) ) . ) ) (ROOT (S (NP (DT This) (NN site)) (VP (VBZ includes) (NP (NP (NP (DT a) (NN list)) (PP (IN of) (NP (DT all) (NN award) (NNS winners)))) (CC and) (NP (NP (DT a) (JJ searchable) (NN database)) (PP (IN of) (NP (NNP Government) (NNP Executive) (NNS articles)))))) (. .))) (ROOT (S (NP (NP (DT The) (NNP Government) (NNP Executive) (NNS articles)) (VP (VBN housed) (PP (IN on) (NP (DT the) (NN website))))) (VP (VBP are) (RB not) (ADJP (JJ able) (S (VP (TO to) (VP (VB be) (ADJP (JJ searched))))))) (. .))) This site includes a list of all award winners and a searchable database of Government Executive articles. The Government Executive articles housed on the website are not able to be searched. contradiction contradiction contradiction contradiction contradiction contradiction +2 755 755e telephone ( ( ( ( uh ( i ( ( do n't ) ( know ( ( i i ) ( have ( ( mixed emotions ) ( about ( him ( ( uh sometimes ) ( i ( like him ) ) ) ) ) ) ) ) ) ) ) ) but ) ( ( at ( the ( same times ) ) ) ( i ( love ( to ( see somebody ) ) ) ) ) ) ( beat him ) ) ( I ( ( ( ( ( ( like him ) ( for ( the ( most part ) ) ) ) , ) but ) ( ( would still ) ( enjoy ( seeing ( someone ( beat him ) ) ) ) ) ) . 
) ) (ROOT (SINV (S (S (INTJ (UH uh)) (NP (FW i)) (VP (VBP do) (RB n't) (VP (VB know) (NP (NP (FW i) (FW i)) (SBAR (S (VP (VBP have) (VP (VBN mixed) (NP (NNS emotions)) (PP (IN about) (S (NP (PRP him)) (VP (VBG uh) (ADVP (RB sometimes)) (NP (NP (FW i)) (PP (IN like) (NP (PRP him))))))))))))))) (CC but) (S (PP (IN at) (NP (DT the) (JJ same) (NNS times))) (NP (FW i)) (VP (VBP love) (S (VP (TO to) (VP (VB see) (NP (NN somebody)))))))) (VP (VBD beat)) (NP (PRP him)))) (ROOT (S (NP (PRP I)) (VP (VP (VBP like) (NP (PRP him)) (PP (IN for) (NP (DT the) (JJS most) (NN part)))) (, ,) (CC but) (VP (MD would) (ADVP (RB still)) (VP (VB enjoy) (S (VP (VBG seeing) (S (NP (NN someone)) (VP (VB beat) (NP (PRP him))))))))) (. .))) uh i don't know i i have mixed emotions about him uh sometimes i like him but at the same times i love to see somebody beat him I like him for the most part, but would still enjoy seeing someone beat him. entailment entailment entailment entailment entailment entailment +3 78013 78013c telephone ( yeah ( ( i i ) ( think ( ( my ( favorite restaurant ) ) ( ( is always ) ( been ( ( the ( one closest ) ) ( you ( ( know ( the closest ) ) ( ( as long ) ( as ( it ( 's ( it ( meets ( ( the ( minimum criteria ) ) ( you ( know ( of ( good food ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ( ( My ( favorite restaurants ) ) ( ( ( ( are always ) ( ( ( ( ( at least ) a ) hundred ) miles ) away ) ) ( from ( my house ) ) ) . ) ) (ROOT (S (VP (VB yeah) (NP (NP (FW i) (FW i)) (SBAR (S (VP (VBP think) (SBAR (S (NP (PRP$ my) (JJ favorite) (NN restaurant)) (VP (VBZ is) (ADVP (RB always)) (VP (VBN been) (NP (NP (DT the) (CD one) (JJS closest)) (SBAR (S (NP (PRP you)) (VP (VBP know) (NP (DT the) (JJS closest)) (ADVP (ADVP (RB as) (RB long)) (SBAR (IN as) (S (NP (PRP it)) (VP (VBZ 's) (SBAR (S (NP (PRP it)) (VP (VBZ meets) (NP (NP (DT the) (JJ minimum) (NNS criteria)) (SBAR (S (NP (PRP you)) (VP (VBP know) (PP (IN of) (NP (JJ good) (NN food))))))))))))))))))))))))))))) (ROOT (S (NP (PRP$ My) (JJ favorite) (NNS restaurants)) (VP (VBP are) (ADVP (RB always)) (ADVP (NP (QP (IN at) (JJS least) (DT a) (CD hundred)) (NNS miles)) (RB away)) (PP (IN from) (NP (PRP$ my) (NN house)))) (. .))) yeah i i think my favorite restaurant is always been the one closest you know the closest as long as it's it meets the minimum criteria you know of good food My favorite restaurants are always at least a hundred miles away from my house. contradiction contradiction contradiction contradiction contradiction contradiction +4 96377 96377c telephone ( i ( ( do n't ) ( know ( um ( do ( you ( do ( ( a lot ) ( of camping ) ) ) ) ) ) ) ) ) ( I ( ( know exactly ) . ) ) (ROOT (S (NP (FW i)) (VP (VBP do) (RB n't) (VP (VB know) (SBAR (S (NP (NN um)) (VP (VBP do) (SBAR (S (NP (PRP you)) (VP (VBP do) (NP (NP (DT a) (NN lot)) (PP (IN of) (NP (NN camping)))))))))))))) (ROOT (S (NP (PRP I)) (VP (VBP know) (ADVP (RB exactly))) (. .))) i don't know um do you do a lot of camping I know exactly. 
contradiction contradiction contradiction contradiction contradiction contradiction diff --git a/test/data_for_tests/io/MNLI/dev_mismatched.tsv b/test/data_for_tests/io/MNLI/dev_mismatched.tsv new file mode 100755 index 00000000..a1da8897 --- /dev/null +++ b/test/data_for_tests/io/MNLI/dev_mismatched.tsv @@ -0,0 +1,6 @@ +index promptID pairID genre sentence1_binary_parse sentence2_binary_parse sentence1_parse sentence2_parse sentence1 sentence2 label1 label2 label3 label4 label5 gold_label +0 75290 75290c letters ( ( Your contribution ) ( ( helped ( make ( it ( possible ( for ( us ( to ( ( provide ( our students ) ) ( with ( a ( quality education ) ) ) ) ) ) ) ) ) ) ) . ) ) ( ( Your contributions ) ( ( were ( of ( ( no help ) ( with ( ( our ( students ' ) ) education ) ) ) ) ) . ) ) (ROOT (S (NP (PRP$ Your) (NN contribution)) (VP (VBD helped) (VP (VB make) (S (NP (PRP it)) (ADJP (JJ possible)) (SBAR (IN for) (S (NP (PRP us)) (VP (TO to) (VP (VB provide) (NP (PRP$ our) (NNS students)) (PP (IN with) (NP (DT a) (NN quality) (NN education)))))))))) (. .))) (ROOT (S (NP (PRP$ Your) (NNS contributions)) (VP (VBD were) (PP (IN of) (NP (NP (DT no) (NN help)) (PP (IN with) (NP (NP (PRP$ our) (NNS students) (POS ')) (NN education)))))) (. .))) Your contribution helped make it possible for us to provide our students with a quality education. Your contributions were of no help with our students' education. contradiction contradiction contradiction contradiction contradiction contradiction +1 133794 133794c verbatim ( ( ( ( ( ( The answer ) ( ( ( ( has nothing ) ( to ( do ( with ( their cause ) ) ) ) ) , ) however ) ) , ) but ) ( ( with ( ( ( ( ( ( ( ( the ( simple fact ) ) ( that ( dictionaries ( ( are not ) ( exercises ( in ( bi-unique substitutability ) ) ) ) ) ) ) ; ) ( in ( ( ( other words ) , ) ( if ( ( one ( of ( ( the senses ) ( of run ) ) ) ) ( ( is ` ) ( ( ( ( operate ' ) -LRB- ) ( as ( in ( She ( runs ( an ( engine factory ) ) ) ) ) ) ) -RRB- ) ) ) ) ) ) ) , ) ( that ( ( does not ) ( ( make it ) ( ( valid ( to ( assume ( that ( one ( can ( substitute ( ( operate ( for run ) ) ( in ( We ( ( run ( in ( ( the marathon ) ( every year ) ) ) ) . ) ) ) ) ) ) ) ) ) ) ) ( Although ( ( ( ( recognizing this ) ( as ( ( a shortcoming ) ( of dictionaries ) ) ) ) and ) ( ( ( assigning it ) arbitrarily ) ( to ( what ( , ( ( for ( lack ( of ( a ( better term ) ) ) ) ) ( , ( we ( might ( call ( ( the genius ) ( of ( the language ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) , ) ( might ( seem ( trivial ( to ( the ( casual observer ) ) ) ) ) ) ) ) ( , ( it ( is ( ( a ( valid matter ) ) ( for ( concern ( in ( ( the realm ) ( of lexicology ) ) ) ) ) ) ) ) ) ) ) . ) ( Dictionaries ( ( ( are indeed ) ( exercises ( in ( bi-unique substitutability ) ) ) ) . 
) ) (ROOT (S (S (NP (DT The) (NN answer)) (VP (VBZ has) (ADVP (NN nothing)) (S (VP (TO to) (VP (VB do) (PP (IN with) (NP (PRP$ their) (NN cause)))))) (, ,) (ADVP (RB however)))) (, ,) (CC but) (S (SBAR (IN with) (S (NP (NP (DT the) (JJ simple) (NN fact)) (SBAR (IN that) (S (NP (NNS dictionaries)) (VP (VBP are) (RB not) (NP (NP (NNS exercises)) (PP (IN in) (NP (JJ bi-unique) (NN substitutability))))))) (: ;) (PP (IN in) (NP (NP (JJ other) (NNS words)) (, ,) (SBAR (IN if) (S (NP (NP (CD one)) (PP (IN of) (NP (NP (DT the) (NNS senses)) (PP (IN of) (NP (NN run)))))) (VP (VBZ is) (`` `) (VP (VB operate) ('' ') (-LRB- -LRB-) (SBAR (RB as) (IN in) (S (NP (PRP She)) (VP (VBZ runs) (NP (DT an) (NN engine) (NN factory))))) (-RRB- -RRB-))))))) (, ,) (SBAR (WHNP (WDT that)) (S (VP (VBZ does) (RB not) (VP (VB make) (NP (PRP it)) (S (ADJP (JJ valid) (S (VP (TO to) (VP (VB assume) (SBAR (IN that) (S (NP (PRP one)) (VP (MD can) (VP (VB substitute) (VP (VB operate) (PP (IN for) (NP (NN run))) (SBAR (IN in) (S (NP (PRP We)) (VP (VB run) (PP (IN in) (NP (NP (DT the) (NN marathon)) (NP (DT every) (NN year)))) (. .))))))))))))) (SBAR (IN Although) (S (S (VP (VBG recognizing) (NP (DT this)) (PP (IN as) (NP (NP (DT a) (NN shortcoming)) (PP (IN of) (NP (NNS dictionaries))))))) (CC and) (S (VP (VBG assigning) (NP (PRP it)) (ADVP (RB arbitrarily)) (PP (TO to) (SBAR (WHNP (WP what)) (S (, ,) (PP (IN for) (NP (NP (NN lack)) (PP (IN of) (NP (DT a) (JJR better) (NN term))))) (, ,) (NP (PRP we)) (VP (MD might) (VP (VB call) (NP (NP (DT the) (NN genius)) (PP (IN of) (NP (DT the) (NN language)))))))))))))))))) (, ,)) (VP (MD might) (VP (VB seem) (ADJP (JJ trivial) (PP (TO to) (NP (DT the) (JJ casual) (NN observer)))))))) (, ,) (NP (PRP it)) (VP (VBZ is) (NP (NP (DT a) (JJ valid) (NN matter)) (PP (IN for) (NP (NP (NN concern)) (PP (IN in) (NP (NP (DT the) (NN realm)) (PP (IN of) (NP (NN lexicology)))))))))) (. .))) (ROOT (S (NP (NNS Dictionaries)) (VP (VBP are) (ADVP (RB indeed)) (NP (NP (NNS exercises)) (PP (IN in) (NP (JJ bi-unique) (NN substitutability))))) (. .))) The answer has nothing to do with their cause, however, but with the simple fact that dictionaries are not exercises in bi-unique substitutability; in other words, if one of the senses of run is `operate' (as in She runs an engine factory ), that does not make it valid to assume that one can substitute operate for run in We run in the marathon every year . Although recognizing this as a shortcoming of dictionaries and assigning it arbitrarily to what, for lack of a better term, we might call the genius of the language, might seem trivial to the casual observer, it is a valid matter for concern in the realm of lexicology. Dictionaries are indeed exercises in bi-unique substitutability. contradiction contradiction contradiction contradiction contradiction contradiction +2 3628 3628c verbatim ( We ( ( serve ( ( a ( classic ( Tuscan meal ) ) ) ( that ( includes ( ( a ( Florentine terrine ) ) ( made ( with ( dick ( and ( chicken livers ) ) ) ) ) ) ) ) ) ) . ) ) ( We ( ( serve ( ( a meal ) ( of ( Florentine terrine ) ) ) ) . ) ) (ROOT (S (NP (PRP We)) (VP (VBP serve) (NP (NP (DT a) (JJ classic) (NNP Tuscan) (NN meal)) (SBAR (WHNP (WDT that)) (S (VP (VBZ includes) (NP (NP (DT a) (JJ Florentine) (NN terrine)) (VP (VBN made) (PP (IN with) (NP (NN dick) (CC and) (NN chicken) (NNS livers)))))))))) (. .))) (ROOT (S (NP (PRP We)) (VP (VBP serve) (NP (NP (DT a) (NN meal)) (PP (IN of) (NP (NNP Florentine) (NN terrine))))) (. 
.))) We serve a classic Tuscan meal that includes a Florentine terrine made with dick and chicken livers. We serve a meal of Florentine terrine. contradiction neutral entailment entailment entailment entailment +3 89411 89411c letters ( ( ( A ( few months ) ) ago ) ( , ( ( ( ( Carl Newton ) and ) I ) ( ( ( wrote ( a letter ) ) ( asking ( you ( to ( ( consider ( a ( financial contribution ) ) ) ( to ( ( graduate Endodontics ) ( at ( Indiana University ) ) ) ) ) ) ) ) ) . ) ) ) ) ( ( ( ( Carl Newton ) and ) I ) ( ( ( have never ) ( ( had ( any ( other ( previous contact ) ) ) ) ( with you ) ) ) . ) ) (ROOT (S (ADVP (NP (DT A) (JJ few) (NNS months)) (RB ago)) (, ,) (NP (NP (NNP Carl) (NNP Newton)) (CC and) (NP (PRP I))) (VP (VBD wrote) (NP (DT a) (NN letter)) (S (VP (VBG asking) (S (NP (PRP you)) (VP (TO to) (VP (VB consider) (NP (DT a) (JJ financial) (NN contribution)) (PP (TO to) (NP (NP (JJ graduate) (NNS Endodontics)) (PP (IN at) (NP (NNP Indiana) (NNP University))))))))))) (. .))) (ROOT (S (NP (NP (NNP Carl) (NNP Newton)) (CC and) (NP (PRP I))) (VP (VBP have) (ADVP (RB never)) (VP (VBN had) (NP (DT any) (JJ other) (JJ previous) (NN contact)) (PP (IN with) (NP (PRP you))))) (. .))) A few months ago, Carl Newton and I wrote a letter asking you to consider a financial contribution to graduate Endodontics at Indiana University. Carl Newton and I have never had any other previous contact with you. contradiction contradiction contradiction contradiction contradiction contradiction +4 136158 136158e facetoface ( I ( ( was ( on ( ( this earth ) ( you ( know ( ( , ( ( I ( 've ( lived ( on ( ( this earth ) ( for ( some reason ) ) ) ) ) ) ) , ) ) ( I ( just ( ( do n't ) ( know ( what ( it ( is yet ) ) ) ) ) ) ) ) ) ) ) ) ) . ) ) ( I ( ( ( ( do n't ) yet ) ( ( know ( the reason ) ) ( why ( I ( have ( lived ( on earth ) ) ) ) ) ) ) . ) ) (ROOT (S (NP (PRP I)) (VP (VBD was) (PP (IN on) (NP (NP (DT this) (NN earth)) (SBAR (S (NP (PRP you)) (VP (VBP know) (SBAR (S (PRN (, ,) (S (NP (PRP I)) (VP (VBP 've) (VP (VBN lived) (PP (IN on) (NP (NP (DT this) (NN earth)) (PP (IN for) (NP (DT some) (NN reason)))))))) (, ,)) (NP (PRP I)) (ADVP (RB just)) (VP (VBP do) (RB n't) (VP (VB know) (SBAR (WHNP (WP what)) (S (NP (PRP it)) (VP (VBZ is) (ADVP (RB yet))))))))))))))) (. .))) (ROOT (S (NP (PRP I)) (VP (VBP do) (RB n't) (ADVP (RB yet)) (VP (VB know) (NP (DT the) (NN reason)) (SBAR (WHADVP (WRB why)) (S (NP (PRP I)) (VP (VBP have) (VP (VBN lived) (PP (IN on) (NP (NN earth))))))))) (. .))) I was on this earth you know, I've lived on this earth for some reason, I just don't know what it is yet. I don't yet know the reason why I have lived on earth. entailment entailment entailment entailment entailment entailment diff --git a/test/data_for_tests/io/MNLI/test_matched.tsv b/test/data_for_tests/io/MNLI/test_matched.tsv new file mode 100755 index 00000000..b90c2d2a --- /dev/null +++ b/test/data_for_tests/io/MNLI/test_matched.tsv @@ -0,0 +1,6 @@ +index promptID pairID genre sentence1_binary_parse sentence2_binary_parse sentence1_parse sentence2_parse sentence1 sentence2 +0 31493 31493 travel ( ( ( ( ( ( ( ( Hierbas , ) ( ans seco ) ) , ) ( ans dulce ) ) , ) and ) frigola ) ( ( ( are just ) ( ( a ( few names ) ) ( worth ( ( keeping ( a look-out ) ) for ) ) ) ) . ) ) ( Hierbas ( ( is ( ( a name ) ( worth ( ( looking out ) for ) ) ) ) . 
) ) (ROOT (S (NP (NP (NNS Hierbas)) (, ,) (NP (NN ans) (NN seco)) (, ,) (NP (NN ans) (NN dulce)) (, ,) (CC and) (NP (NN frigola))) (VP (VBP are) (ADVP (RB just)) (NP (NP (DT a) (JJ few) (NNS names)) (PP (JJ worth) (S (VP (VBG keeping) (NP (DT a) (NN look-out)) (PP (IN for))))))) (. .))) (ROOT (S (NP (NNS Hierbas)) (VP (VBZ is) (NP (NP (DT a) (NN name)) (PP (JJ worth) (S (VP (VBG looking) (PRT (RP out)) (PP (IN for))))))) (. .))) Hierbas, ans seco, ans dulce, and frigola are just a few names worth keeping a look-out for. Hierbas is a name worth looking out for. +1 92164 92164 government ( ( ( The extent ) ( of ( the ( behavioral effects ) ) ) ) ( ( would ( ( depend ( in ( part ( on ( ( the structure ) ( of ( ( ( the ( individual ( account program ) ) ) and ) ( any limits ) ) ) ) ) ) ) ) ( on ( accessing ( the funds ) ) ) ) ) . ) ) ( ( Many people ) ( ( would ( be ( very ( unhappy ( to ( ( loose control ) ( over ( their ( own money ) ) ) ) ) ) ) ) ) . ) ) (ROOT (S (NP (NP (DT The) (NN extent)) (PP (IN of) (NP (DT the) (JJ behavioral) (NNS effects)))) (VP (MD would) (VP (VB depend) (PP (IN in) (NP (NP (NN part)) (PP (IN on) (NP (NP (DT the) (NN structure)) (PP (IN of) (NP (NP (DT the) (JJ individual) (NN account) (NN program)) (CC and) (NP (DT any) (NNS limits)))))))) (PP (IN on) (S (VP (VBG accessing) (NP (DT the) (NNS funds))))))) (. .))) (ROOT (S (NP (JJ Many) (NNS people)) (VP (MD would) (VP (VB be) (ADJP (RB very) (JJ unhappy) (PP (TO to) (NP (NP (JJ loose) (NN control)) (PP (IN over) (NP (PRP$ their) (JJ own) (NN money)))))))) (. .))) The extent of the behavioral effects would depend in part on the structure of the individual account program and any limits on accessing the funds. Many people would be very unhappy to loose control over their own money. +2 9662 9662 government ( ( ( Timely access ) ( to information ) ) ( ( is ( in ( ( the ( best interests ) ) ( of ( ( ( both GAO ) and ) ( the agencies ) ) ) ) ) ) . ) ) ( It ( ( ( is ( in ( ( everyone 's ) ( best interest ) ) ) ) ( to ( ( have access ) ( to ( information ( in ( a ( timely manner ) ) ) ) ) ) ) ) . ) ) (ROOT (S (NP (NP (JJ Timely) (NN access)) (PP (TO to) (NP (NN information)))) (VP (VBZ is) (PP (IN in) (NP (NP (DT the) (JJS best) (NNS interests)) (PP (IN of) (NP (NP (DT both) (NNP GAO)) (CC and) (NP (DT the) (NNS agencies))))))) (. .))) (ROOT (S (NP (PRP It)) (VP (VBZ is) (PP (IN in) (NP (NP (NN everyone) (POS 's)) (JJS best) (NN interest))) (S (VP (TO to) (VP (VB have) (NP (NN access)) (PP (TO to) (NP (NP (NN information)) (PP (IN in) (NP (DT a) (JJ timely) (NN manner))))))))) (. .))) Timely access to information is in the best interests of both GAO and the agencies. It is in everyone's best interest to have access to information in a timely manner. +3 5991 5991 travel ( ( Based ( in ( ( the ( Auvergnat ( spa town ) ) ) ( of Vichy ) ) ) ) ( , ( ( the ( French government ) ) ( often ( ( ( ( proved ( more zealous ) ) ( than ( its masters ) ) ) ( in ( ( ( suppressing ( civil liberties ) ) and ) ( ( drawing up ) ( anti-Jewish legislation ) ) ) ) ) . ) ) ) ) ) ( ( The ( French government ) ) ( ( passed ( ( anti-Jewish laws ) ( aimed ( at ( helping ( the Nazi ) ) ) ) ) ) . 
) ) (ROOT (S (PP (VBN Based) (PP (IN in) (NP (NP (DT the) (NNP Auvergnat) (NN spa) (NN town)) (PP (IN of) (NP (NNP Vichy)))))) (, ,) (NP (DT the) (JJ French) (NN government)) (ADVP (RB often)) (VP (VBD proved) (NP (JJR more) (NNS zealous)) (PP (IN than) (NP (PRP$ its) (NNS masters))) (PP (IN in) (S (VP (VP (VBG suppressing) (NP (JJ civil) (NNS liberties))) (CC and) (VP (VBG drawing) (PRT (RP up)) (NP (JJ anti-Jewish) (NN legislation))))))) (. .))) (ROOT (S (NP (DT The) (JJ French) (NN government)) (VP (VBD passed) (NP (NP (JJ anti-Jewish) (NNS laws)) (VP (VBN aimed) (PP (IN at) (S (VP (VBG helping) (NP (DT the) (JJ Nazi)))))))) (. .))) Based in the Auvergnat spa town of Vichy, the French government often proved more zealous than its masters in suppressing civil liberties and drawing up anti-Jewish legislation. The French government passed anti-Jewish laws aimed at helping the Nazi. +4 50156 50156 travel ( ( ( ( ( Built ( in 1870 ) ) ( , ( ( ( its canopy ) ( of ( stained ( glass ( and ( cast iron ) ) ) ) ) ) ( is ( ( the oldest ) ( in Dublin ) ) ) ) ) ) ; ) ( ( its ( enthusiastic ( interior decoration ) ) ) ( ( is also ) ( typical ( of ( the era ) ) ) ) ) ) . ) ( It ( ( ( ( was ( constructed ( in 1870 ) ) ) and ) ( has ( ( the ( oldest canopy ) ) ( in Dublin ) ) ) ) . ) ) (ROOT (S (S (S (VP (VBN Built) (PP (IN in) (NP (CD 1870))))) (, ,) (NP (NP (PRP$ its) (NN canopy)) (PP (IN of) (NP (JJ stained) (NN glass) (CC and) (NN cast) (NN iron)))) (VP (VBZ is) (NP (NP (DT the) (JJS oldest)) (PP (IN in) (NP (NNP Dublin)))))) (: ;) (S (NP (PRP$ its) (JJ enthusiastic) (JJ interior) (NN decoration)) (VP (VBZ is) (ADVP (RB also)) (ADJP (JJ typical) (PP (IN of) (NP (DT the) (NN era)))))) (. .))) (ROOT (S (NP (PRP It)) (VP (VP (VBD was) (VP (VBN constructed) (PP (IN in) (NP (CD 1870))))) (CC and) (VP (VBZ has) (NP (NP (DT the) (JJS oldest) (NN canopy)) (PP (IN in) (NP (NNP Dublin)))))) (. .))) Built in 1870, its canopy of stained glass and cast iron is the oldest in Dublin; its enthusiastic interior decoration is also typical of the era. It was constructed in 1870 and has the oldest canopy in Dublin. diff --git a/test/data_for_tests/io/MNLI/test_mismatched.tsv b/test/data_for_tests/io/MNLI/test_mismatched.tsv new file mode 100755 index 00000000..798cd395 --- /dev/null +++ b/test/data_for_tests/io/MNLI/test_mismatched.tsv @@ -0,0 +1,6 @@ +index promptID pairID genre sentence1_binary_parse sentence2_binary_parse sentence1_parse sentence2_parse sentence1 sentence2 +0 16130 16130 facetoface ( ( What ( have ( you decided ) ) ) ( , ( what ( ( ( are you ) ( going ( to do ) ) ) ? ) ) ) ) ( So ( what ( ( 's ( your decision ) ) ? ) ) ) (ROOT (SBARQ (SBAR (WHNP (WP What)) (S (VP (VBP have) (S (NP (PRP you)) (VP (VBD decided)))))) (, ,) (WHNP (WP what)) (SQ (VBP are) (NP (PRP you)) (VP (VBG going) (S (VP (TO to) (VP (VB do)))))) (. ?))) (ROOT (SBARQ (RB So) (WHNP (WP what)) (SQ (VBZ 's) (NP (PRP$ your) (NN decision))) (. ?))) What have you decided, what are you going to do? So what's your decision? +1 128269 128269 oup ( ( ( Women 's ) clothing ) ( ( is ( characterized ( by ( ( great diversity ) ( in ( ( styles and ) ( short ( production runs ) ) ) ) ) ) ) ) . ) ) ( ( ( Men 's ) clothing ) ( typically ( ( ( has ( the ( ( most stylistic ) diversity ) ) ) ( unlike ( ( the blandness ) ( of ( ( women 's ) fashion ) ) ) ) ) . 
) ) ) (ROOT (S (NP (NP (NNP Women) (POS 's)) (NN clothing)) (VP (VBZ is) (VP (VBN characterized) (PP (IN by) (NP (NP (JJ great) (NN diversity)) (PP (IN in) (NP (NP (NNS styles)) (CC and) (NP (JJ short) (NN production) (NNS runs)))))))) (. .))) (ROOT (S (NP (NP (NNP Men) (POS 's)) (NN clothing)) (ADVP (RB typically)) (VP (VBZ has) (NP (DT the) (ADJP (RBS most) (JJ stylistic)) (NN diversity)) (PP (IN unlike) (NP (NP (DT the) (NN blandness)) (PP (IN of) (NP (NP (NNS women) (POS 's)) (NN fashion)))))) (. .))) Women's clothing is characterized by great diversity in styles and short production runs. Men's clothing typically has the most stylistic diversity unlike the blandness of women's fashion. +2 130938 130938 nineeleven ( ( ( ( ( Reports ( from ( ( two ( flight attendants ) ) ( in ( the ( coach cabin ) ) ) ) ) ) , ) ( ( ( Betty Ong ) and ) ( Madeline ( Amy Sweeney ) ) ) ) , ) ( ( ( tell us ) ( ( most ( of what ) ) ( we ( know ( about ( how ( ( the hijacking ) happened ) ) ) ) ) ) ) . ) ) ( ( ( The report ) ( on ( the hijacking ) ) ) ( ( ( was ( ( over ( five hundred ) ) pages ) ) long ) . ) ) (ROOT (S (NP (NP (NP (NNS Reports)) (PP (IN from) (NP (NP (CD two) (NN flight) (NNS attendants)) (PP (IN in) (NP (DT the) (NN coach) (NN cabin)))))) (, ,) (NP (NP (NNP Betty) (NNP Ong)) (CC and) (NP (NNP Madeline) (NNP Amy) (NNP Sweeney))) (, ,)) (VP (VBP tell) (NP (PRP us)) (SBAR (WHNP (JJS most) (WHPP (IN of) (WHNP (WP what)))) (S (NP (PRP we)) (VP (VBP know) (PP (IN about) (SBAR (WHADVP (WRB how)) (S (NP (DT the) (NN hijacking)) (VP (VBD happened))))))))) (. .))) (ROOT (S (NP (NP (DT The) (NN report)) (PP (IN on) (NP (DT the) (NN hijacking)))) (VP (VBD was) (NP (QP (RB over) (CD five) (CD hundred)) (NNS pages)) (ADVP (RB long))) (. .))) Reports from two flight attendants in the coach cabin, Betty Ong and Madeline Amy Sweeney, tell us most of what we know about how the hijacking happened. The report on the hijacking was over five hundred pages long. +3 40009 40009 nineeleven ( ( At ( about 9:20 ) ) ( , ( ( ( security personnel ) ( at ( FAA headquarters ) ) ) ( ( ( ( set up ) ( a ( hijacking teleconference ) ) ) ( with ( ( ( several agencies ) , ) ( including ( the ( Defense Department ) ) ) ) ) ) . ) ) ) ) ( ( The teleconference ) ( ( lasted ( for ( 13 ( straight hours ) ) ) ) . ) ) (ROOT (S (PP (IN At) (NP (QP (RB about) (CD 9:20)))) (, ,) (NP (NP (NN security) (NNS personnel)) (PP (IN at) (NP (NNP FAA) (NNS headquarters)))) (VP (VBD set) (PRT (RP up)) (NP (DT a) (VBG hijacking) (NN teleconference)) (PP (IN with) (NP (NP (JJ several) (NNS agencies)) (, ,) (PP (VBG including) (NP (DT the) (NNP Defense) (NNP Department)))))) (. .))) (ROOT (S (NP (DT The) (NN teleconference)) (VP (VBD lasted) (PP (IN for) (NP (CD 13) (JJ straight) (NNS hours)))) (. .))) At about 9:20, security personnel at FAA headquarters set up a hijacking teleconference with several agencies, including the Defense Department. The teleconference lasted for 13 straight hours. +4 105266 105266 nineeleven ( So ( we ( ( 've ( ( got ( ( a couple ) ( of aircraft ) ) ) ( ( up there ) ( that ( ( have ( those instructions ) ) ( at ( this ( present time ) ) ) ) ) ) ) ) ? ) ) ) ( ( At ( the ( present time ) ) ) ( , ( there ( ( ( ( ( were n't ) ( ( any aircraft ) ( in ( the air ) ) ) ) , ) right ) ? 
) ) ) ) (ROOT (S (IN So) (NP (PRP we)) (VP (VBP 've) (VP (VBD got) (NP (NP (DT a) (NN couple)) (PP (IN of) (NP (NN aircraft)))) (ADVP (ADVP (RB up) (RB there)) (SBAR (WHNP (WDT that)) (S (VP (VBP have) (NP (DT those) (NNS instructions)) (PP (IN at) (NP (DT this) (JJ present) (NN time))))))))) (. ?))) (ROOT (S (PP (IN At) (NP (DT the) (JJ present) (NN time))) (, ,) (NP (EX there)) (VP (VBD were) (RB n't) (NP (NP (DT any) (NN aircraft)) (PP (IN in) (NP (DT the) (NN air)))) (, ,) (ADJP (JJ right))) (. ?))) So we've got a couple of aircraft up there that have those instructions at this present time? At the present time, there weren't any aircraft in the air, right? diff --git a/test/data_for_tests/io/MNLI/train.tsv b/test/data_for_tests/io/MNLI/train.tsv new file mode 100755 index 00000000..4ceebefd --- /dev/null +++ b/test/data_for_tests/io/MNLI/train.tsv @@ -0,0 +1,7 @@ +index promptID pairID genre sentence1_binary_parse sentence2_binary_parse sentence1_parse sentence2_parse sentence1 sentence2 label1 gold_label +0 31193 31193n government ( ( Conceptually ( cream skimming ) ) ( ( has ( ( ( two ( basic dimensions ) ) - ) ( ( product and ) geography ) ) ) . ) ) ( ( ( Product and ) geography ) ( ( are ( what ( make ( cream ( skimming work ) ) ) ) ) . ) ) (ROOT (S (NP (JJ Conceptually) (NN cream) (NN skimming)) (VP (VBZ has) (NP (NP (CD two) (JJ basic) (NNS dimensions)) (: -) (NP (NN product) (CC and) (NN geography)))) (. .))) (ROOT (S (NP (NN Product) (CC and) (NN geography)) (VP (VBP are) (SBAR (WHNP (WP what)) (S (VP (VBP make) (NP (NP (NN cream)) (VP (VBG skimming) (NP (NN work)))))))) (. .))) Conceptually cream skimming has two basic dimensions - product and geography. Product and geography are what make cream skimming work. neutral neutral +1 101457 101457e telephone ( you ( ( know ( during ( ( ( the season ) and ) ( i guess ) ) ) ) ( at ( at ( ( your level ) ( uh ( you ( ( ( lose them ) ( to ( the ( next level ) ) ) ) ( if ( ( if ( they ( decide ( to ( recall ( the ( the ( parent team ) ) ) ) ) ) ) ) ( ( the Braves ) ( decide ( to ( call ( to ( ( recall ( a guy ) ) ( from ( ( triple A ) ( ( ( then ( ( a ( double ( A guy ) ) ) ( ( goes up ) ( to ( replace him ) ) ) ) ) and ) ( ( a ( single ( A guy ) ) ) ( ( goes up ) ( to ( replace him ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ( You ( ( ( ( lose ( the things ) ) ( to ( the ( following level ) ) ) ) ( if ( ( the people ) recall ) ) ) . ) ) (ROOT (S (NP (PRP you)) (VP (VBP know) (PP (IN during) (NP (NP (DT the) (NN season)) (CC and) (NP (FW i) (FW guess)))) (PP (IN at) (IN at) (NP (NP (PRP$ your) (NN level)) (SBAR (S (INTJ (UH uh)) (NP (PRP you)) (VP (VBP lose) (NP (PRP them)) (PP (TO to) (NP (DT the) (JJ next) (NN level))) (SBAR (IN if) (S (SBAR (IN if) (S (NP (PRP they)) (VP (VBP decide) (S (VP (TO to) (VP (VB recall) (NP (DT the) (DT the) (NN parent) (NN team)))))))) (NP (DT the) (NNPS Braves)) (VP (VBP decide) (S (VP (TO to) (VP (VB call) (S (VP (TO to) (VP (VB recall) (NP (DT a) (NN guy)) (PP (IN from) (NP (NP (RB triple) (DT A)) (SBAR (S (S (ADVP (RB then)) (NP (DT a) (JJ double) (NNP A) (NN guy)) (VP (VBZ goes) (PRT (RP up)) (S (VP (TO to) (VP (VB replace) (NP (PRP him))))))) (CC and) (S (NP (DT a) (JJ single) (NNP A) (NN guy)) (VP (VBZ goes) (PRT (RP up)) (S (VP (TO to) (VP (VB replace) (NP (PRP him)))))))))))))))))))))))))))) (ROOT (S (NP (PRP You)) (VP (VBP lose) (NP (DT the) (NNS things)) (PP (TO to) (NP (DT the) (JJ following) (NN level))) (SBAR (IN if) (S (NP (DT the) (NNS people)) (VP (VBP recall))))) (. 
.))) you know during the season and i guess at at your level uh you lose them to the next level if if they decide to recall the the parent team the Braves decide to call to recall a guy from triple A then a double A guy goes up to replace him and a single A guy goes up to replace him You lose the things to the following level if the people recall. entailment entailment +2 134793 134793e fiction ( ( One ( of ( our number ) ) ) ( ( will ( ( ( carry out ) ( your instructions ) ) minutely ) ) . ) ) ( ( ( A member ) ( of ( my team ) ) ) ( ( will ( ( execute ( your orders ) ) ( with ( immense precision ) ) ) ) . ) ) (ROOT (S (NP (NP (CD One)) (PP (IN of) (NP (PRP$ our) (NN number)))) (VP (MD will) (VP (VB carry) (PRT (RP out)) (NP (PRP$ your) (NNS instructions)) (ADVP (RB minutely)))) (. .))) (ROOT (S (NP (NP (DT A) (NN member)) (PP (IN of) (NP (PRP$ my) (NN team)))) (VP (MD will) (VP (VB execute) (NP (PRP$ your) (NNS orders)) (PP (IN with) (NP (JJ immense) (NN precision))))) (. .))) One of our number will carry out your instructions minutely. A member of my team will execute your orders with immense precision. entailment entailment +3 37397 37397e fiction ( ( How ( ( ( do you ) know ) ? ) ) ( ( All this ) ( ( ( is ( their information ) ) again ) . ) ) ) ( ( This information ) ( ( belongs ( to them ) ) . ) ) (ROOT (S (SBARQ (WHADVP (WRB How)) (SQ (VBP do) (NP (PRP you)) (VP (VB know))) (. ?)) (NP (PDT All) (DT this)) (VP (VBZ is) (NP (PRP$ their) (NN information)) (ADVP (RB again))) (. .))) (ROOT (S (NP (DT This) (NN information)) (VP (VBZ belongs) (PP (TO to) (NP (PRP them)))) (. .))) How do you know? All this is their information again. This information belongs to them. entailment entailment +4 50563 50563n telephone ( yeah ( i ( ( tell you ) ( what ( ( though ( if ( you ( go ( price ( some ( of ( those ( tennis shoes ) ) ) ) ) ) ) ) ) ( i ( can ( see ( why ( now ( you ( know ( they ( 're ( ( getting up ) ( in ( the ( hundred ( dollar range ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ( ( The ( tennis shoes ) ) ( ( have ( ( a range ) ( of prices ) ) ) . ) ) (ROOT (S (VP (VB yeah) (S (NP (FW i)) (VP (VB tell) (NP (PRP you)) (SBAR (WHNP (WP what)) (S (SBAR (RB though) (IN if) (S (NP (PRP you)) (VP (VBP go) (VP (VB price) (NP (NP (DT some)) (PP (IN of) (NP (DT those) (NN tennis) (NNS shoes)))))))) (NP (FW i)) (VP (MD can) (VP (VB see) (SBAR (WHADVP (WRB why)) (S (ADVP (RB now)) (NP (PRP you)) (VP (VBP know) (SBAR (S (NP (PRP they)) (VP (VBP 're) (VP (VBG getting) (PRT (RP up)) (PP (IN in) (NP (DT the) (CD hundred) (NN dollar) (NN range))))))))))))))))))) (ROOT (S (NP (DT The) (NN tennis) (NNS shoes)) (VP (VBP have) (NP (NP (DT a) (NN range)) (PP (IN of) (NP (NNS prices))))) (. .))) yeah i tell you what though if you go price some of those tennis shoes i can see why now you know they're getting up in the hundred dollar range The tennis shoes have a range of prices. neutral neutral +11 11877 11877c travel ( ( Fun ( for ( ( adults and ) children ) ) ) . ) ( ( Fun ( for ( only children ) ) ) . ) (ROOT (S (VP (VB Fun) (PP (IN for) (NP (NNS adults) (CC and) (NNS children)))) (. .))) (ROOT (S (VP (VB Fun) (PP (IN for) (NP (JJ only) (NNS children)))) (. .))) Fun for adults and children. Fun for only children. 
contradiction contradiction diff --git a/test/data_for_tests/io/MSRA_NER/dev.conll b/test/data_for_tests/io/MSRA_NER/dev.conll new file mode 100755 index 00000000..792efce8 --- /dev/null +++ b/test/data_for_tests/io/MSRA_NER/dev.conll @@ -0,0 +1,38 @@ +把 O +欧 B-LOC + +美 B-LOC +、 O + +港 B-LOC +台 B-LOC + +流 O +行 O + +的 O +食 O + +品 O +类 O + +图 O +谱 O + +马 B-PER +列 B-PER + +主 O +义 O + +在 O +中 B-LOC + +国 I-LOC +传 O + +播 O +的 O + +历 O +史 O \ No newline at end of file diff --git a/test/data_for_tests/io/MSRA_NER/test.conll b/test/data_for_tests/io/MSRA_NER/test.conll new file mode 100755 index 00000000..d611fcdd --- /dev/null +++ b/test/data_for_tests/io/MSRA_NER/test.conll @@ -0,0 +1,31 @@ +中 B-ORG +共 I-ORG + +中 I-ORG +央 I-ORG + +致 O +中 B-ORG + +国 I-ORG +致 I-ORG + +公 I-ORG +党 I-ORG + +十 I-ORG +一 I-ORG + +大 I-ORG +的 O + +贺 O +词 O + + +各 O + +位 O +代 O + +表 O diff --git a/test/data_for_tests/io/MSRA_NER/train.conll b/test/data_for_tests/io/MSRA_NER/train.conll new file mode 100755 index 00000000..9edd3aef --- /dev/null +++ b/test/data_for_tests/io/MSRA_NER/train.conll @@ -0,0 +1,60 @@ +是 O +我 O + +们 O +收 O + +藏 O +北 B-LOC + +京 I-LOC +史 O + +料 O + +调 O +查 O + +范 O +围 O + +涉 O +及 O + +故 B-LOC +宫 I-LOC + +、 O +历 B-LOC + +博 I-LOC +、 O + +古 B-ORG +研 I-ORG + +所 I-ORG +、 O + +北 B-LOC +大 I-LOC + +清 I-LOC +华 I-LOC + +图 I-LOC +书 I-LOC + +馆 I-LOC +. O + +夏 B-PER +财 I-PER + +兴 I-PER +家 O + +分 O +到 O + +田 O diff --git a/test/data_for_tests/io/OntoNotes/dev.txt b/test/data_for_tests/io/OntoNotes/dev.txt new file mode 100644 index 00000000..e99207a1 --- /dev/null +++ b/test/data_for_tests/io/OntoNotes/dev.txt @@ -0,0 +1,10 @@ + +bc/msnbc/00/msnbc_0000 0 0 Hi UH (TOP(FRAG(INTJ*) - - - Dan_Abrams * - +bc/msnbc/00/msnbc_0000 0 1 everyone NN (NP*) - - - Dan_Abrams * - +bc/msnbc/00/msnbc_0000 0 2 /. . *)) - - - Dan_Abrams * - + +bc/msnbc/00/msnbc_0000 0 0 first RB (TOP(S(ADVP* - - - Dan_Abrams * (ARGM-TMP* * * * - +bc/msnbc/00/msnbc_0000 0 1 up RB * - - - Dan_Abrams * * * * * - +bc/msnbc/00/msnbc_0000 0 2 on IN (PP* - - - Dan_Abrams * * * * * - +bc/msnbc/00/msnbc_0000 0 3 the DT (NP* - - - Dan_Abrams * * * * * - +bc/msnbc/00/msnbc_0000 0 4 docket NN *)) docket - - Dan_Abrams * * * * * - diff --git a/test/data_for_tests/io/OntoNotes/test.txt b/test/data_for_tests/io/OntoNotes/test.txt new file mode 100644 index 00000000..c94069e0 --- /dev/null +++ b/test/data_for_tests/io/OntoNotes/test.txt @@ -0,0 +1,10 @@ + +bc/msnbc/00/msnbc_0007 0 0 Dealing VBG (TOP(VP* deal 01 - speaker_1 * (V*) - +bc/msnbc/00/msnbc_0007 0 1 with IN (PP* - - - speaker_1 * (ARG1* - +bc/msnbc/00/msnbc_0007 0 2 serial JJ (NP(NP* - - - speaker_1 * * (156 +bc/msnbc/00/msnbc_0007 0 3 crimes NNS *) crime - 1 speaker_1 * * 156) +bc/msnbc/00/msnbc_0007 0 4 per FW (ADVP* - - - speaker_1 * * - +bc/msnbc/00/msnbc_0007 0 5 se FW *))) - - - speaker_1 * *) - +bc/msnbc/00/msnbc_0007 0 6 /. . 
*)) - - - speaker_1 * * - + +bc/msnbc/00/msnbc_0007 0 0 We PRP (TOP(S(NP*) - - - speaker_1 * (ARG0*) * (90) diff --git a/test/data_for_tests/io/OntoNotes/train.txt b/test/data_for_tests/io/OntoNotes/train.txt new file mode 100644 index 00000000..36f14c73 --- /dev/null +++ b/test/data_for_tests/io/OntoNotes/train.txt @@ -0,0 +1,50 @@ + +bc/msnbc/00/msnbc_0003 0 0 The DT (TOP(S(NP* - - - Chris_Matthews * * (ARG1* * * * * - +bc/msnbc/00/msnbc_0003 0 1 move NN *) move 02 2 Chris_Matthews * (V*) *) * * * * - +bc/msnbc/00/msnbc_0003 0 2 comes VBZ (VP* come 03 2 Chris_Matthews * * (V*) * * * * - +bc/msnbc/00/msnbc_0003 0 3 a DT (SBAR(NP* - - - Chris_Matthews (DATE* * (ARGM-TMP* * * * * - +bc/msnbc/00/msnbc_0003 0 4 month NN *) month - 2 Chris_Matthews *) * * * * * * - +bc/msnbc/00/msnbc_0003 0 5 before IN * - - - Chris_Matthews * * * * * * * - +bc/msnbc/00/msnbc_0003 0 6 the DT (S(NP* - - - Chris_Matthews * * * * (ARG1* (ARG0* * - +bc/msnbc/00/msnbc_0003 0 7 Senate NNP *) - - - Chris_Matthews (ORG) * * * *) *) * - +bc/msnbc/00/msnbc_0003 0 8 is VBZ (VP* be 03 - Chris_Matthews * * * (V*) * * * - +bc/msnbc/00/msnbc_0003 0 9 scheduled VBN (VP* schedule 01 - Chris_Matthews * * * * (V*) * * - +bc/msnbc/00/msnbc_0003 0 10 to TO (S(VP* - - - Chris_Matthews * * * * (ARG2* * * - +bc/msnbc/00/msnbc_0003 0 11 hold VB (VP* hold 04 8 Chris_Matthews * * * * * (V*) * - +bc/msnbc/00/msnbc_0003 0 12 confirmation NN (NP(NP* - - - Chris_Matthews * * * * * (ARG1* (ARG2*) - +bc/msnbc/00/msnbc_0003 0 13 hearings NNS *) hearing 01 1 Chris_Matthews * * * * * * (V*) - +bc/msnbc/00/msnbc_0003 0 14 on IN (PP* - - - Chris_Matthews * * * * * * (ARG1* - +bc/msnbc/00/msnbc_0003 0 15 President NNP (NP(NP(NP* - - - Chris_Matthews * * * * * * * (194 +bc/msnbc/00/msnbc_0003 0 16 Bush NNP * - - - Chris_Matthews (PERSON) * * * * * * - +bc/msnbc/00/msnbc_0003 0 17 's POS *) - - - Chris_Matthews * * * * * * * 194) +bc/msnbc/00/msnbc_0003 0 18 Supreme NNP (NML* - - - Chris_Matthews (ORG* * * * * * * - +bc/msnbc/00/msnbc_0003 0 19 Court NNP *) - - - Chris_Matthews *) * * * * * * - +bc/msnbc/00/msnbc_0003 0 20 nominee NN *) - - - Chris_Matthews * * * * * * * - +bc/msnbc/00/msnbc_0003 0 21 John NNP (NP* - - - Chris_Matthews (PERSON* * * * * * * - +bc/msnbc/00/msnbc_0003 0 22 Roberts NNP *)))))))))))) - - - Chris_Matthews *) * *) * *) *) *) - +bc/msnbc/00/msnbc_0003 0 23 /. . 
*)) - - - Chris_Matthews * * * * * * * - + +bc/msnbc/00/msnbc_0003 0 0 Senator NNP (TOP(S(NP(NP* - - - Chris_Matthews * (ARG1* * * (162 +bc/msnbc/00/msnbc_0003 0 1 Chris NNP * - - - Chris_Matthews (PERSON* * * * - +bc/msnbc/00/msnbc_0003 0 2 Dodd NNP *) - - - Chris_Matthews *) * * * - +bc/msnbc/00/msnbc_0003 0 3 of IN (PP* - - - Chris_Matthews * * * * - +bc/msnbc/00/msnbc_0003 0 4 Connecticut NNP (NP*))) - - - Chris_Matthews (GPE) *) * * 162) +bc/msnbc/00/msnbc_0003 0 5 was VBD (VP* be 01 1 Chris_Matthews * (V*) * * - +bc/msnbc/00/msnbc_0003 0 6 among IN (PP* - - - Chris_Matthews * (ARG2* * * - +bc/msnbc/00/msnbc_0003 0 7 those DT (NP(NP* - - - Chris_Matthews * * (ARG0* * - +bc/msnbc/00/msnbc_0003 0 8 Democrats NNPS *) - - - Chris_Matthews (NORP) * *) * - +bc/msnbc/00/msnbc_0003 0 9 who WP (SBAR(WHNP*) - - - Chris_Matthews * * (R-ARG0*) * - +bc/msnbc/00/msnbc_0003 0 10 spoke VBD (S(VP* speak 03 5 Chris_Matthews * * (V*) * - +bc/msnbc/00/msnbc_0003 0 11 out RP (PRT*) - - - Chris_Matthews * * * * - +bc/msnbc/00/msnbc_0003 0 12 against IN (PP* - - - Chris_Matthews * * (ARG1* * - +bc/msnbc/00/msnbc_0003 0 13 Bolton NNP (NP(NP* - - - Chris_Matthews (PERSON) * * (ARG1* (31|(130 +bc/msnbc/00/msnbc_0003 0 14 's POS *) - - - Chris_Matthews * * * *) 31) +bc/msnbc/00/msnbc_0003 0 15 appointment NN *)) appointment 01 1 Chris_Matthews * * *) (V*) 130) +bc/msnbc/00/msnbc_0003 0 16 today NN (NP*))))))) today - 2 Chris_Matthews (DATE) *) (ARGM-TMP*) * (121) +bc/msnbc/00/msnbc_0003 0 17 /. . *)) - - - Chris_Matthews * * * * - + +bc/msnbc/00/msnbc_0003 0 0 I PRP (TOP(S(NP*) - - - Christopher_Dodd * * (ARG0*) * (162) +bc/msnbc/00/msnbc_0003 0 1 just RB (ADVP*) - - - Christopher_Dodd * * (ARGM-ADV*) * - +bc/msnbc/00/msnbc_0003 0 2 do VBP (VP* do 01 - Christopher_Dodd * (V*) * * - +bc/msnbc/00/msnbc_0003 0 3 n't RB * - - - Christopher_Dodd * * (ARGM-NEG*) * - +bc/msnbc/00/msnbc_0003 0 4 think VB (VP* think 01 1 Christopher_Dodd * * (V*) * - diff --git a/test/data_for_tests/io/QNLI/dev.tsv b/test/data_for_tests/io/QNLI/dev.tsv new file mode 100755 index 00000000..ac4ecabe --- /dev/null +++ b/test/data_for_tests/io/QNLI/dev.tsv @@ -0,0 +1,6 @@ +index question sentence label +0 What came into force after the new constitution was herald? As of that day, the new constitution heralding the Second Republic came into force. entailment +1 What is the first major city in the stream of the Rhine? The most important tributaries in this area are the Ill below of Strasbourg, the Neckar in Mannheim and the Main across from Mainz. not_entailment +2 What is the minimum required if you want to teach in Canada? In most provinces a second Bachelor's Degree such as a Bachelor of Education is required to become a qualified teacher. not_entailment +3 How was Temüjin kept imprisoned by the Tayichi'ud? The Tayichi'ud enslaved Temüjin (reportedly with a cangue, a sort of portable stocks), but with the help of a sympathetic guard, the father of Chilaun (who later became a general of Genghis Khan), he was able to escape from the ger (yurt) in the middle of the night by hiding in a river crevice.[citation needed] entailment +4 What did Herr Gott, dich loben wir become known as ? He paraphrased the Te Deum as "Herr Gott, dich loben wir" with a simplified form of the melody. 
not_entailment diff --git a/test/data_for_tests/io/QNLI/test.tsv b/test/data_for_tests/io/QNLI/test.tsv new file mode 100755 index 00000000..55bfbeaa --- /dev/null +++ b/test/data_for_tests/io/QNLI/test.tsv @@ -0,0 +1,6 @@ +index question sentence +0 What organization is devoted to Jihad against Israel? For some decades prior to the First Palestine Intifada in 1987, the Muslim Brotherhood in Palestine took a "quiescent" stance towards Israel, focusing on preaching, education and social services, and benefiting from Israel's "indulgence" to build up a network of mosques and charitable organizations. +1 In what century was the Yarrow-Schlick-Tweedy balancing system used? In the late 19th century, the Yarrow-Schlick-Tweedy balancing 'system' was used on some marine triple expansion engines. +2 The largest brand of what store in the UK is located in Kingston Park? Close to Newcastle, the largest indoor shopping centre in Europe, the MetroCentre, is located in Gateshead. +3 What does the IPCC rely on for research? In principle, this means that any significant new evidence or events that change our understanding of climate science between this deadline and publication of an IPCC report cannot be included. +4 What is the principle about relating spin and space variables? Thus in the case of two fermions there is a strictly negative correlation between spatial and spin variables, whereas for two bosons (e.g. quanta of electromagnetic waves, photons) the correlation is strictly positive. diff --git a/test/data_for_tests/io/QNLI/train.tsv b/test/data_for_tests/io/QNLI/train.tsv new file mode 100755 index 00000000..fc0b966e --- /dev/null +++ b/test/data_for_tests/io/QNLI/train.tsv @@ -0,0 +1,6 @@ +index question sentence label +0 When did the third Digimon series begin? Unlike the two seasons before it and most of the seasons that followed, Digimon Tamers takes a darker and more realistic approach to its story featuring Digimon who do not reincarnate after their deaths and more complex character development in the original Japanese. not_entailment +1 Which missile batteries often have individual launchers several kilometres from one another? When MANPADS is operated by specialists, batteries may have several dozen teams deploying separately in small sections; self-propelled air defence guns may deploy in pairs. not_entailment +2 What two things does Popper argue Tarski's theory involves in an evaluation of truth? He bases this interpretation on the fact that examples such as the one described above refer to two things: assertions and the facts to which they refer. entailment +3 What is the name of the village 9 miles north of Calafat where the Ottoman forces attacked the Russians? On 31 December 1853, the Ottoman forces at Calafat moved against the Russian force at Chetatea or Cetate, a small village nine miles north of Calafat, and engaged them on 6 January 1854. entailment +4 What famous palace is located in London? London contains four World Heritage Sites: the Tower of London; Kew Gardens; the site comprising the Palace of Westminster, Westminster Abbey, and St Margaret's Church; and the historic settlement of Greenwich (in which the Royal Observatory, Greenwich marks the Prime Meridian, 0° longitude, and GMT). not_entailment diff --git a/test/data_for_tests/io/Quora/dev.tsv b/test/data_for_tests/io/Quora/dev.tsv new file mode 100644 index 00000000..8182f190 --- /dev/null +++ b/test/data_for_tests/io/Quora/dev.tsv @@ -0,0 +1,2 @@ +1 How do I get funding for my web based startup idea ? 
How do I get seed funding pre product ? 327970 +0 Is honey a viable alternative to sugar for diabetics ? How would you compare the United States ' euthanasia laws to Denmark ? 90348 diff --git a/test/data_for_tests/io/Quora/test.tsv b/test/data_for_tests/io/Quora/test.tsv new file mode 100644 index 00000000..9582aa14 --- /dev/null +++ b/test/data_for_tests/io/Quora/test.tsv @@ -0,0 +1,2 @@ +1 What should I do to avoid sleeping in class ? How do I not sleep in a boring class ? 50018 +0 Do women support each other more than men do ? Do women need more compliments than men ? 126924 diff --git a/test/data_for_tests/io/Quora/train.tsv b/test/data_for_tests/io/Quora/train.tsv new file mode 100644 index 00000000..e82940c9 --- /dev/null +++ b/test/data_for_tests/io/Quora/train.tsv @@ -0,0 +1,2 @@ +1 What is your review of Hidden Figures -LRB- 2016 movie -RRB- ? What are your impressions of Hidden Figures -LRB- 2017 movie -RRB- ? 11877 +0 Currently , all Supreme Court Justices come from very elite law schools , is it similar for the best lawyers in private practice ? What 's your type of jungle -LRB- concrete or nature -RRB- and why ? 221489 diff --git a/test/data_for_tests/io/RTE/dev.tsv b/test/data_for_tests/io/RTE/dev.tsv new file mode 100644 index 00000000..f8f72536 --- /dev/null +++ b/test/data_for_tests/io/RTE/dev.tsv @@ -0,0 +1,6 @@ +index sentence1 sentence2 label +0 Dana Reeve, the widow of the actor Christopher Reeve, has died of lung cancer at age 44, according to the Christopher Reeve Foundation. Christopher Reeve had an accident. not_entailment +1 Yet, we now are discovering that antibiotics are losing their effectiveness against illness. Disease-causing bacteria are mutating faster than we can come up with new antibiotics to fight the new variations. Bacteria is winning the war against antibiotics. entailment +2 Cairo is now home to some 15 million people - a burgeoning population that produces approximately 10,000 tonnes of rubbish per day, putting an enormous strain on public services. In the past 10 years, the government has tried hard to encourage private investment in the refuse sector, but some estimate 4,000 tonnes of waste is left behind every day, festering in the heat as it waits for someone to clear it up. It is often the people in the poorest neighbourhoods that are worst affected. But in some areas they are fighting back. In Shubra, one of the northern districts of the city, the residents have taken to the streets armed with dustpans and brushes to clean up public areas which have been used as public dumps. 15 million tonnes of rubbish are produced daily in Cairo. not_entailment +3 The Amish community in Pennsylvania, which numbers about 55,000, lives an agrarian lifestyle, shunning technological advances like electricity and automobiles. And many say their insular lifestyle gives them a sense that they are protected from the violence of American society. But as residents gathered near the school, some wearing traditional garb and arriving in horse-drawn buggies, they said that sense of safety had been shattered. "If someone snaps and wants to do something stupid, there's no distance that's going to stop them," said Jake King, 56, an Amish lantern maker who knew several families whose children had been shot. Pennsylvania has the biggest Amish community in the U.S. not_entailment +4 Security forces were on high alert after an election campaign in which more than 1,000 people, including seven election candidates, have been killed. 
Security forces were on high alert after a campaign marred by violence. entailment diff --git a/test/data_for_tests/io/RTE/test.tsv b/test/data_for_tests/io/RTE/test.tsv new file mode 100644 index 00000000..e52dfac4 --- /dev/null +++ b/test/data_for_tests/io/RTE/test.tsv @@ -0,0 +1,6 @@ +index sentence1 sentence2 +0 Mangla was summoned after Madhumita's sister Nidhi Shukla, who was the first witness in the case. Shukla is related to Mangla. +1 Authorities in Brazil say that more than 200 people are being held hostage in a prison in the country's remote, Amazonian-jungle state of Rondonia. Authorities in Brazil hold 200 people as hostage. +2 A mercenary group faithful to the warmongering policy of former Somozist colonel Enrique Bermudez attacked an IFA truck belonging to the interior ministry at 0900 on 26 March in El Jicote, wounded and killed an interior ministry worker and wounded five others. An interior ministry worker was killed by a mercenary group. +3 The British ambassador to Egypt, Derek Plumbly, told Reuters on Monday that authorities had compiled the list of 10 based on lists from tour companies and from families whose relatives have not been in contact since the bombings. Derek Plumbly resides in Egypt. +4 Tibone estimated diamond production at four mines operated by Debswana -- Botswana's 50-50 joint venture with De Beers -- could reach 33 million carats this year. Botswana is a business partner of De Beers. diff --git a/test/data_for_tests/io/RTE/train.tsv b/test/data_for_tests/io/RTE/train.tsv new file mode 100644 index 00000000..70e5414f --- /dev/null +++ b/test/data_for_tests/io/RTE/train.tsv @@ -0,0 +1,6 @@ +index sentence1 sentence2 label +0 No Weapons of Mass Destruction Found in Iraq Yet. Weapons of Mass Destruction Found in Iraq. not_entailment +1 A place of sorrow, after Pope John Paul II died, became a place of celebration, as Roman Catholic faithful gathered in downtown Chicago to mark the installation of new Pope Benedict XVI. Pope Benedict XVI is the new leader of the Roman Catholic Church. entailment +2 Herceptin was already approved to treat the sickest breast cancer patients, and the company said, Monday, it will discuss with federal regulators the possibility of prescribing the drug for more breast cancer patients. Herceptin can be used to treat breast cancer. entailment +3 Judie Vivian, chief executive at ProMedica, a medical service company that helps sustain the 2-year-old Vietnam Heart Institute in Ho Chi Minh City (formerly Saigon), said that so far about 1,500 children have received treatment. The previous name of Ho Chi Minh City was Saigon. entailment +4 A man is due in court later charged with the murder 26 years ago of a teenager whose case was the first to be featured on BBC One's Crimewatch. Colette Aram, 16, was walking to her boyfriend's house in Keyworth, Nottinghamshire, on 30 October 1983 when she disappeared. Her body was later found in a field close to her home. Paul Stewart Hutchinson, 50, has been charged with murder and is due before Nottingham magistrates later. Paul Stewart Hutchinson is accused of having stabbed a girl. 
not_entailment diff --git a/test/data_for_tests/io/SNLI/snli_1.0_dev.jsonl b/test/data_for_tests/io/SNLI/snli_1.0_dev.jsonl new file mode 100755 index 00000000..2d091c73 --- /dev/null +++ b/test/data_for_tests/io/SNLI/snli_1.0_dev.jsonl @@ -0,0 +1,5 @@ +{"annotator_labels": ["neutral", "entailment", "neutral", "neutral", "neutral"], "captionID": "4705552913.jpg#2", "gold_label": "neutral", "pairID": "4705552913.jpg#2r1n", "sentence1": "Two women are embracing while holding to go packages.", "sentence1_binary_parse": "( ( Two women ) ( ( are ( embracing ( while ( holding ( to ( go packages ) ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (CD Two) (NNS women)) (VP (VBP are) (VP (VBG embracing) (SBAR (IN while) (S (NP (VBG holding)) (VP (TO to) (VP (VB go) (NP (NNS packages)))))))) (. .)))", "sentence2": "The sisters are hugging goodbye while holding to go packages after just eating lunch.", "sentence2_binary_parse": "( ( The sisters ) ( ( are ( ( hugging goodbye ) ( while ( holding ( to ( ( go packages ) ( after ( just ( eating lunch ) ) ) ) ) ) ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT The) (NNS sisters)) (VP (VBP are) (VP (VBG hugging) (NP (UH goodbye)) (PP (IN while) (S (VP (VBG holding) (S (VP (TO to) (VP (VB go) (NP (NNS packages)) (PP (IN after) (S (ADVP (RB just)) (VP (VBG eating) (NP (NN lunch))))))))))))) (. .)))"} +{"annotator_labels": ["entailment", "entailment", "entailment", "entailment", "entailment"], "captionID": "4705552913.jpg#2", "gold_label": "entailment", "pairID": "4705552913.jpg#2r1e", "sentence1": "Two women are embracing while holding to go packages.", "sentence1_binary_parse": "( ( Two women ) ( ( are ( embracing ( while ( holding ( to ( go packages ) ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (CD Two) (NNS women)) (VP (VBP are) (VP (VBG embracing) (SBAR (IN while) (S (NP (VBG holding)) (VP (TO to) (VP (VB go) (NP (NNS packages)))))))) (. .)))", "sentence2": "Two woman are holding packages.", "sentence2_binary_parse": "( ( Two woman ) ( ( are ( holding packages ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (CD Two) (NN woman)) (VP (VBP are) (VP (VBG holding) (NP (NNS packages)))) (. .)))"} +{"annotator_labels": ["contradiction", "contradiction", "contradiction", "contradiction", "contradiction"], "captionID": "4705552913.jpg#2", "gold_label": "contradiction", "pairID": "4705552913.jpg#2r1c", "sentence1": "Two women are embracing while holding to go packages.", "sentence1_binary_parse": "( ( Two women ) ( ( are ( embracing ( while ( holding ( to ( go packages ) ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (CD Two) (NNS women)) (VP (VBP are) (VP (VBG embracing) (SBAR (IN while) (S (NP (VBG holding)) (VP (TO to) (VP (VB go) (NP (NNS packages)))))))) (. .)))", "sentence2": "The men are fighting outside a deli.", "sentence2_binary_parse": "( ( The men ) ( ( are ( fighting ( outside ( a deli ) ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT The) (NNS men)) (VP (VBP are) (VP (VBG fighting) (PP (IN outside) (NP (DT a) (NNS deli))))) (. 
.)))"} +{"annotator_labels": ["entailment", "entailment", "entailment", "entailment", "entailment"], "captionID": "2407214681.jpg#0", "gold_label": "entailment", "pairID": "2407214681.jpg#0r1e", "sentence1": "Two young children in blue jerseys, one with the number 9 and one with the number 2 are standing on wooden steps in a bathroom and washing their hands in a sink.", "sentence1_binary_parse": "( ( ( Two ( young children ) ) ( in ( ( ( ( ( blue jerseys ) , ) ( one ( with ( the ( number 9 ) ) ) ) ) and ) ( one ( with ( the ( number 2 ) ) ) ) ) ) ) ( ( are ( ( ( standing ( on ( ( wooden steps ) ( in ( a bathroom ) ) ) ) ) and ) ( ( washing ( their hands ) ) ( in ( a sink ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (NP (CD Two) (JJ young) (NNS children)) (PP (IN in) (NP (NP (JJ blue) (NNS jerseys)) (, ,) (NP (NP (CD one)) (PP (IN with) (NP (DT the) (NN number) (CD 9)))) (CC and) (NP (NP (CD one)) (PP (IN with) (NP (DT the) (NN number) (CD 2))))))) (VP (VBP are) (VP (VP (VBG standing) (PP (IN on) (NP (NP (JJ wooden) (NNS steps)) (PP (IN in) (NP (DT a) (NN bathroom)))))) (CC and) (VP (VBG washing) (NP (PRP$ their) (NNS hands)) (PP (IN in) (NP (DT a) (NN sink)))))) (. .)))", "sentence2": "Two kids in numbered jerseys wash their hands.", "sentence2_binary_parse": "( ( ( Two kids ) ( in ( numbered jerseys ) ) ) ( ( wash ( their hands ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (NP (CD Two) (NNS kids)) (PP (IN in) (NP (JJ numbered) (NNS jerseys)))) (VP (VBP wash) (NP (PRP$ their) (NNS hands))) (. .)))"} +{"annotator_labels": ["neutral", "neutral", "neutral", "entailment", "entailment"], "captionID": "2407214681.jpg#0", "gold_label": "neutral", "pairID": "2407214681.jpg#0r1n", "sentence1": "Two young children in blue jerseys, one with the number 9 and one with the number 2 are standing on wooden steps in a bathroom and washing their hands in a sink.", "sentence1_binary_parse": "( ( ( Two ( young children ) ) ( in ( ( ( ( ( blue jerseys ) , ) ( one ( with ( the ( number 9 ) ) ) ) ) and ) ( one ( with ( the ( number 2 ) ) ) ) ) ) ) ( ( are ( ( ( standing ( on ( ( wooden steps ) ( in ( a bathroom ) ) ) ) ) and ) ( ( washing ( their hands ) ) ( in ( a sink ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (NP (CD Two) (JJ young) (NNS children)) (PP (IN in) (NP (NP (JJ blue) (NNS jerseys)) (, ,) (NP (NP (CD one)) (PP (IN with) (NP (DT the) (NN number) (CD 9)))) (CC and) (NP (NP (CD one)) (PP (IN with) (NP (DT the) (NN number) (CD 2))))))) (VP (VBP are) (VP (VP (VBG standing) (PP (IN on) (NP (NP (JJ wooden) (NNS steps)) (PP (IN in) (NP (DT a) (NN bathroom)))))) (CC and) (VP (VBG washing) (NP (PRP$ their) (NNS hands)) (PP (IN in) (NP (DT a) (NN sink)))))) (. .)))", "sentence2": "Two kids at a ballgame wash their hands.", "sentence2_binary_parse": "( ( ( Two kids ) ( at ( a ballgame ) ) ) ( ( wash ( their hands ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (NP (CD Two) (NNS kids)) (PP (IN at) (NP (DT a) (NN ballgame)))) (VP (VBP wash) (NP (PRP$ their) (NNS hands))) (. 
.)))"} diff --git a/test/data_for_tests/io/SNLI/snli_1.0_test.jsonl b/test/data_for_tests/io/SNLI/snli_1.0_test.jsonl new file mode 100755 index 00000000..49d40720 --- /dev/null +++ b/test/data_for_tests/io/SNLI/snli_1.0_test.jsonl @@ -0,0 +1,5 @@ +{"annotator_labels": ["neutral", "contradiction", "contradiction", "neutral", "neutral"], "captionID": "2677109430.jpg#1", "gold_label": "neutral", "pairID": "2677109430.jpg#1r1n", "sentence1": "This church choir sings to the masses as they sing joyous songs from the book at a church.", "sentence1_binary_parse": "( ( This ( church choir ) ) ( ( ( sings ( to ( the masses ) ) ) ( as ( they ( ( sing ( joyous songs ) ) ( from ( ( the book ) ( at ( a church ) ) ) ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (DT This) (NN church) (NN choir)) (VP (VBZ sings) (PP (TO to) (NP (DT the) (NNS masses))) (SBAR (IN as) (S (NP (PRP they)) (VP (VBP sing) (NP (JJ joyous) (NNS songs)) (PP (IN from) (NP (NP (DT the) (NN book)) (PP (IN at) (NP (DT a) (NN church))))))))) (. .)))", "sentence2": "The church has cracks in the ceiling.", "sentence2_binary_parse": "( ( The church ) ( ( has ( cracks ( in ( the ceiling ) ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT The) (NN church)) (VP (VBZ has) (NP (NP (NNS cracks)) (PP (IN in) (NP (DT the) (NN ceiling))))) (. .)))"} +{"annotator_labels": ["entailment", "entailment", "entailment", "neutral", "entailment"], "captionID": "2677109430.jpg#1", "gold_label": "entailment", "pairID": "2677109430.jpg#1r1e", "sentence1": "This church choir sings to the masses as they sing joyous songs from the book at a church.", "sentence1_binary_parse": "( ( This ( church choir ) ) ( ( ( sings ( to ( the masses ) ) ) ( as ( they ( ( sing ( joyous songs ) ) ( from ( ( the book ) ( at ( a church ) ) ) ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (DT This) (NN church) (NN choir)) (VP (VBZ sings) (PP (TO to) (NP (DT the) (NNS masses))) (SBAR (IN as) (S (NP (PRP they)) (VP (VBP sing) (NP (JJ joyous) (NNS songs)) (PP (IN from) (NP (NP (DT the) (NN book)) (PP (IN at) (NP (DT a) (NN church))))))))) (. .)))", "sentence2": "The church is filled with song.", "sentence2_binary_parse": "( ( The church ) ( ( is ( filled ( with song ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT The) (NN church)) (VP (VBZ is) (VP (VBN filled) (PP (IN with) (NP (NN song))))) (. .)))"} +{"annotator_labels": ["contradiction", "contradiction", "contradiction", "contradiction", "contradiction"], "captionID": "2677109430.jpg#1", "gold_label": "contradiction", "pairID": "2677109430.jpg#1r1c", "sentence1": "This church choir sings to the masses as they sing joyous songs from the book at a church.", "sentence1_binary_parse": "( ( This ( church choir ) ) ( ( ( sings ( to ( the masses ) ) ) ( as ( they ( ( sing ( joyous songs ) ) ( from ( ( the book ) ( at ( a church ) ) ) ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (DT This) (NN church) (NN choir)) (VP (VBZ sings) (PP (TO to) (NP (DT the) (NNS masses))) (SBAR (IN as) (S (NP (PRP they)) (VP (VBP sing) (NP (JJ joyous) (NNS songs)) (PP (IN from) (NP (NP (DT the) (NN book)) (PP (IN at) (NP (DT a) (NN church))))))))) (. .)))", "sentence2": "A choir singing at a baseball game.", "sentence2_binary_parse": "( ( ( A choir ) ( singing ( at ( a ( baseball game ) ) ) ) ) . )", "sentence2_parse": "(ROOT (NP (NP (DT A) (NN choir)) (VP (VBG singing) (PP (IN at) (NP (DT a) (NN baseball) (NN game)))) (. 
.)))"} +{"annotator_labels": ["neutral", "neutral", "neutral", "neutral", "neutral"], "captionID": "6160193920.jpg#4", "gold_label": "neutral", "pairID": "6160193920.jpg#4r1n", "sentence1": "A woman with a green headscarf, blue shirt and a very big grin.", "sentence1_binary_parse": "( ( ( A woman ) ( with ( ( ( ( ( a ( green headscarf ) ) , ) ( blue shirt ) ) and ) ( a ( ( very big ) grin ) ) ) ) ) . )", "sentence1_parse": "(ROOT (NP (NP (DT A) (NN woman)) (PP (IN with) (NP (NP (DT a) (JJ green) (NN headscarf)) (, ,) (NP (JJ blue) (NN shirt)) (CC and) (NP (DT a) (ADJP (RB very) (JJ big)) (NN grin)))) (. .)))", "sentence2": "The woman is young.", "sentence2_binary_parse": "( ( The woman ) ( ( is young ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT The) (NN woman)) (VP (VBZ is) (ADJP (JJ young))) (. .)))"} +{"annotator_labels": ["entailment", "entailment", "contradiction", "entailment", "neutral"], "captionID": "6160193920.jpg#4", "gold_label": "entailment", "pairID": "6160193920.jpg#4r1e", "sentence1": "A woman with a green headscarf, blue shirt and a very big grin.", "sentence1_binary_parse": "( ( ( A woman ) ( with ( ( ( ( ( a ( green headscarf ) ) , ) ( blue shirt ) ) and ) ( a ( ( very big ) grin ) ) ) ) ) . )", "sentence1_parse": "(ROOT (NP (NP (DT A) (NN woman)) (PP (IN with) (NP (NP (DT a) (JJ green) (NN headscarf)) (, ,) (NP (JJ blue) (NN shirt)) (CC and) (NP (DT a) (ADJP (RB very) (JJ big)) (NN grin)))) (. .)))", "sentence2": "The woman is very happy.", "sentence2_binary_parse": "( ( The woman ) ( ( is ( very happy ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT The) (NN woman)) (VP (VBZ is) (ADJP (RB very) (JJ happy))) (. .)))"} diff --git a/test/data_for_tests/io/SNLI/snli_1.0_train.jsonl b/test/data_for_tests/io/SNLI/snli_1.0_train.jsonl new file mode 100755 index 00000000..8be03c11 --- /dev/null +++ b/test/data_for_tests/io/SNLI/snli_1.0_train.jsonl @@ -0,0 +1,5 @@ +{"annotator_labels": ["neutral"], "captionID": "3416050480.jpg#4", "gold_label": "neutral", "pairID": "3416050480.jpg#4r1n", "sentence1": "A person on a horse jumps over a broken down airplane.", "sentence1_binary_parse": "( ( ( A person ) ( on ( a horse ) ) ) ( ( jumps ( over ( a ( broken ( down airplane ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN on) (NP (DT a) (NN horse)))) (VP (VBZ jumps) (PP (IN over) (NP (DT a) (JJ broken) (JJ down) (NN airplane)))) (. .)))", "sentence2": "A person is training his horse for a competition.", "sentence2_binary_parse": "( ( A person ) ( ( is ( ( training ( his horse ) ) ( for ( a competition ) ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) (VP (VBG training) (NP (PRP$ his) (NN horse)) (PP (IN for) (NP (DT a) (NN competition))))) (. .)))"} +{"annotator_labels": ["contradiction"], "captionID": "3416050480.jpg#4", "gold_label": "contradiction", "pairID": "3416050480.jpg#4r1c", "sentence1": "A person on a horse jumps over a broken down airplane.", "sentence1_binary_parse": "( ( ( A person ) ( on ( a horse ) ) ) ( ( jumps ( over ( a ( broken ( down airplane ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN on) (NP (DT a) (NN horse)))) (VP (VBZ jumps) (PP (IN over) (NP (DT a) (JJ broken) (JJ down) (NN airplane)))) (. .)))", "sentence2": "A person is at a diner, ordering an omelette.", "sentence2_binary_parse": "( ( A person ) ( ( ( ( is ( at ( a diner ) ) ) , ) ( ordering ( an omelette ) ) ) . 
) )", "sentence2_parse": "(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) (PP (IN at) (NP (DT a) (NN diner))) (, ,) (S (VP (VBG ordering) (NP (DT an) (NN omelette))))) (. .)))"} +{"annotator_labels": ["entailment"], "captionID": "3416050480.jpg#4", "gold_label": "entailment", "pairID": "3416050480.jpg#4r1e", "sentence1": "A person on a horse jumps over a broken down airplane.", "sentence1_binary_parse": "( ( ( A person ) ( on ( a horse ) ) ) ( ( jumps ( over ( a ( broken ( down airplane ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN on) (NP (DT a) (NN horse)))) (VP (VBZ jumps) (PP (IN over) (NP (DT a) (JJ broken) (JJ down) (NN airplane)))) (. .)))", "sentence2": "A person is outdoors, on a horse.", "sentence2_binary_parse": "( ( A person ) ( ( ( ( is outdoors ) , ) ( on ( a horse ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) (ADVP (RB outdoors)) (, ,) (PP (IN on) (NP (DT a) (NN horse)))) (. .)))"} +{"annotator_labels": ["neutral"], "captionID": "2267923837.jpg#2", "gold_label": "neutral", "pairID": "2267923837.jpg#2r1n", "sentence1": "Children smiling and waving at camera", "sentence1_binary_parse": "( Children ( ( ( smiling and ) waving ) ( at camera ) ) )", "sentence1_parse": "(ROOT (NP (S (NP (NNP Children)) (VP (VBG smiling) (CC and) (VBG waving) (PP (IN at) (NP (NN camera)))))))", "sentence2": "They are smiling at their parents", "sentence2_binary_parse": "( They ( are ( smiling ( at ( their parents ) ) ) ) )", "sentence2_parse": "(ROOT (S (NP (PRP They)) (VP (VBP are) (VP (VBG smiling) (PP (IN at) (NP (PRP$ their) (NNS parents)))))))"} +{"annotator_labels": ["entailment"], "captionID": "2267923837.jpg#2", "gold_label": "entailment", "pairID": "2267923837.jpg#2r1e", "sentence1": "Children smiling and waving at camera", "sentence1_binary_parse": "( Children ( ( ( smiling and ) waving ) ( at camera ) ) )", "sentence1_parse": "(ROOT (NP (S (NP (NNP Children)) (VP (VBG smiling) (CC and) (VBG waving) (PP (IN at) (NP (NN camera)))))))", "sentence2": "There are children present", "sentence2_binary_parse": "( There ( ( are children ) present ) )", "sentence2_parse": "(ROOT (S (NP (EX There)) (VP (VBP are) (NP (NNS children)) (ADVP (RB present)))))"} diff --git a/test/data_for_tests/io/SST-2/dev.tsv b/test/data_for_tests/io/SST-2/dev.tsv new file mode 100755 index 00000000..3fec0fa6 --- /dev/null +++ b/test/data_for_tests/io/SST-2/dev.tsv @@ -0,0 +1,6 @@ +sentence label +it 's a charming and often affecting journey . 1 +unflinchingly bleak and desperate 0 +allows us to hope that nolan is poised to embark a major career as a commercial yet inventive filmmaker . 1 +the acting , costumes , music , cinematography and sound are all astounding given the production 's austere locales . 1 +it 's slow -- very , very slow . 0 diff --git a/test/data_for_tests/io/SST-2/test.tsv b/test/data_for_tests/io/SST-2/test.tsv new file mode 100755 index 00000000..6ad46368 --- /dev/null +++ b/test/data_for_tests/io/SST-2/test.tsv @@ -0,0 +1,6 @@ +index sentence +0 uneasy mishmash of styles and genres . +1 this film 's relationship to actual tension is the same as what christmas-tree flocking in a spray can is to actual snow : a poor -- if durable -- imitation . +2 by the end of no such thing the audience , like beatrice , has a watchful affection for the monster . +3 director rob marshall went out gunning to make a great one . +4 lathan and diggs have considerable personal charm , and their screen rapport makes the old story seem new . 
diff --git a/test/data_for_tests/io/SST-2/train.tsv b/test/data_for_tests/io/SST-2/train.tsv new file mode 100755 index 00000000..4d7ea56c --- /dev/null +++ b/test/data_for_tests/io/SST-2/train.tsv @@ -0,0 +1,6 @@ +sentence label +hide new secretions from the parental units 0 +contains no wit , only labored gags 0 +that loves its characters and communicates something rather beautiful about human nature 1 +remains utterly satisfied to remain the same throughout 0 +on the worst revenge-of-the-nerds clichés the filmmakers could dredge up 0 diff --git a/test/data_for_tests/io/SST/dev.txt b/test/data_for_tests/io/SST/dev.txt new file mode 100755 index 00000000..46fca6bf --- /dev/null +++ b/test/data_for_tests/io/SST/dev.txt @@ -0,0 +1,6 @@ +(3 (2 It) (4 (4 (2 's) (4 (3 (2 a) (4 (3 lovely) (2 film))) (3 (2 with) (4 (3 (3 lovely) (2 performances)) (2 (2 by) (2 (2 (2 Buy) (2 and)) (2 Accorsi))))))) (2 .))) +(2 (2 (1 No) (2 one)) (1 (1 (2 goes) (2 (1 (2 (2 unindicted) (2 here)) (2 ,)) (2 (2 which) (3 (2 (2 is) (2 probably)) (3 (2 for) (4 (2 the) (4 best))))))) (2 .))) +(3 (2 And) (4 (3 (2 if) (1 (2 you) (1 (2 (2 (2 're) (1 not)) (2 nearly)) (4 (3 (3 moved) (2 (2 to) (1 tears))) (2 (2 by) (2 (2 (2 a) (2 couple)) (2 (2 of) (2 scenes)))))))) (2 (2 ,) (2 (2 you) (2 (2 (2 've) (1 (2 got) (2 (3 (2 ice) (2 water)) (2 (2 in) (2 (2 your) (2 veins)))))) (2 .)))))) +(4 (4 (2 A) (4 (3 (3 warm) (2 ,)) (3 funny))) (3 (2 ,) (3 (4 (4 engaging) (2 film)) (2 .)))) +(4 (3 (2 Uses) (3 (3 (4 (3 sharp) (4 (3 (4 humor) (2 and)) (2 insight))) (2 (2 into) (3 (2 human) (2 nature)))) (2 (2 to) (2 (2 examine) (2 (2 class) (1 conflict)))))) (2 (2 ,) (2 (2 (2 adolescent) (2 (2 (2 yearning) (2 ,)) (3 (2 (2 the) (2 roots)) (3 (2 of) (2 (2 friendship) (2 (2 and) (2 (2 sexual) (2 identity)))))))) (2 .)))) +(2 (2 (2 Half) (1 (2 (2 (2 (2 (2 Submarine) (2 flick)) (2 ,)) (2 (2 Half) (2 (2 Ghost) (2 Story)))) (2 ,)) (2 (2 All) (2 (2 in) (2 (2 one) (2 criminally)))))) (1 (1 neglected) (2 film))) diff --git a/test/data_for_tests/io/SST/test.txt b/test/data_for_tests/io/SST/test.txt new file mode 100755 index 00000000..ebf325d8 --- /dev/null +++ b/test/data_for_tests/io/SST/test.txt @@ -0,0 +1,6 @@ +(2 (3 (3 Effective) (2 but)) (1 (1 too-tepid) (2 biopic))) +(3 (3 (2 If) (3 (2 you) (3 (2 sometimes) (2 (2 like) (3 (2 to) (3 (3 (2 go) (2 (2 to) (2 (2 the) (2 movies)))) (3 (2 to) (3 (2 have) (4 fun))))))))) (2 (2 ,) (2 (2 Wasabi) (3 (3 (2 is) (2 (2 a) (2 (3 good) (2 (2 place) (2 (2 to) (2 start)))))) (2 .))))) +(4 (4 (4 (3 (2 Emerges) (3 (2 as) (3 (2 something) (3 rare)))) (2 ,)) (4 (2 (2 an) (2 (2 issue) (2 movie))) (3 (2 that) (3 (3 (2 's) (4 (3 (3 (2 so) (4 honest)) (2 and)) (3 (2 keenly) (2 observed)))) (2 (2 that) (2 (2 it) (2 (1 (2 does) (2 n't)) (2 (2 feel) (2 (2 like) (2 one)))))))))) (2 .)) +(2 (2 (2 The) (2 film)) (3 (3 (3 (3 provides) (2 (2 some) (3 (4 great) (2 insight)))) (3 (2 into) (3 (2 (2 the) (2 (2 neurotic) (2 mindset))) (3 (2 of) (2 (2 (2 (2 (2 all) (2 comics)) (2 --)) (2 even)) (3 (2 those) (4 (2 who) (4 (2 have) (4 (2 reached) (4 (4 (2 the) (3 (2 absolute) (2 top))) (2 (2 of) (2 (2 the) (2 game))))))))))))) (2 .))) +(4 (4 (2 Offers) (3 (3 (2 that) (3 (3 rare) (2 combination))) (2 (2 of) (3 (3 (3 entertainment) (2 and)) (2 education))))) (2 .)) +(3 (2 Perhaps) (4 (2 (1 (1 no) (2 picture)) (2 (2 ever) (2 made))) (3 (3 (2 (2 has) (2 (2 more) (3 literally))) (3 (2 showed) (2 (2 that) (2 (1 (2 (2 the) (1 road)) (1 (2 to) (0 hell))) (3 (2 is) (3 (2 paved) (3 (2 with) (3 (3 good) (2 intentions))))))))) (2 .)))) diff --git 
a/test/data_for_tests/io/SST/train.txt b/test/data_for_tests/io/SST/train.txt new file mode 100755 index 00000000..d5296ab0 --- /dev/null +++ b/test/data_for_tests/io/SST/train.txt @@ -0,0 +1,6 @@ +(3 (2 (2 The) (2 Rock)) (4 (3 (2 is) (4 (2 destined) (2 (2 (2 (2 (2 to) (2 (2 be) (2 (2 the) (2 (2 21st) (2 (2 (2 Century) (2 's)) (2 (3 new) (2 (2 ``) (2 Conan)))))))) (2 '')) (2 and)) (3 (2 that) (3 (2 he) (3 (2 's) (3 (2 going) (3 (2 to) (4 (3 (2 make) (3 (3 (2 a) (3 splash)) (2 (2 even) (3 greater)))) (2 (2 than) (2 (2 (2 (2 (1 (2 Arnold) (2 Schwarzenegger)) (2 ,)) (2 (2 Jean-Claud) (2 (2 Van) (2 Damme)))) (2 or)) (2 (2 Steven) (2 Segal))))))))))))) (2 .))) +(4 (4 (4 (2 The) (4 (3 gorgeously) (3 (2 elaborate) (2 continuation)))) (2 (2 (2 of) (2 ``)) (2 (2 The) (2 (2 (2 Lord) (2 (2 of) (2 (2 the) (2 Rings)))) (2 (2 '') (2 trilogy)))))) (2 (3 (2 (2 is) (2 (2 so) (2 huge))) (2 (2 that) (3 (2 (2 (2 a) (2 column)) (2 (2 of) (2 words))) (2 (2 (2 (2 can) (1 not)) (3 adequately)) (2 (2 describe) (2 (3 (2 (2 co-writer\/director) (2 (2 Peter) (3 (2 Jackson) (2 's)))) (3 (2 expanded) (2 vision))) (2 (2 of) (2 (2 (2 J.R.R.) (2 (2 Tolkien) (2 's))) (2 Middle-earth))))))))) (2 .))) +(3 (3 (2 (2 (2 (2 (2 Singer\/composer) (2 (2 Bryan) (2 Adams))) (2 (2 contributes) (2 (2 (2 a) (2 slew)) (2 (2 of) (2 songs))))) (2 (2 --) (2 (2 (2 (2 a) (2 (2 few) (3 potential))) (2 (2 (2 hits) (2 ,)) (2 (2 (2 a) (2 few)) (1 (1 (2 more) (1 (2 simply) (2 intrusive))) (2 (2 to) (2 (2 the) (2 story))))))) (2 --)))) (2 but)) (3 (4 (2 the) (3 (2 whole) (2 package))) (2 (3 certainly) (3 (2 captures) (2 (1 (2 the) (2 (2 (2 intended) (2 (2 ,) (2 (2 er) (2 ,)))) (3 spirit))) (2 (2 of) (2 (2 the) (2 piece)))))))) (2 .)) +(2 (2 (2 You) (2 (2 'd) (2 (2 think) (2 (2 by) (2 now))))) (2 (2 America) (2 (2 (2 would) (1 (2 have) (2 (2 (2 had) (1 (2 enough) (2 (2 of) (2 (2 plucky) (2 (2 British) (1 eccentrics)))))) (4 (2 with) (4 (3 hearts) (3 (2 of) (3 gold))))))) (2 .)))) +(3 (2 ``) (3 (2 Frailty) (4 (2 '') (3 (4 (3 (2 has) (3 (2 been) (3 (4 (3 (3 written) (3 (2 so) (3 well))) (2 ,)) (2 (2 (2 that) (2 even)) (1 (2 (2 a) (2 simple)) (1 (2 ``) (0 Goddammit))))))) (2 !)) (2 ''))))) +(4 (2 (2 Whether) (2 (2 (2 (2 or) (1 not)) (3 (2 you) (2 (2 're) (3 (3 enlightened) (2 (2 by) (2 (2 any) (2 (2 of) (2 (2 Derrida) (2 's))))))))) (2 (2 lectures) (2 (2 on) (2 (2 ``) (2 (2 (2 (2 (2 (2 the) (2 other)) (2 '')) (2 and)) (2 ``)) (2 (2 the) (2 self)))))))) (3 (2 ,) (3 (2 '') (3 (2 Derrida) (3 (3 (2 is) (4 (2 an) (4 (4 (2 undeniably) (3 (4 (3 fascinating) (2 and)) (4 playful))) (2 fellow)))) (2 .)))))) diff --git a/test/data_for_tests/io/THUCNews/dev.txt b/test/data_for_tests/io/THUCNews/dev.txt new file mode 100644 index 00000000..e40ee4a0 --- /dev/null +++ b/test/data_for_tests/io/THUCNews/dev.txt @@ -0,0 +1,9 @@ +体育 调查-您如何评价热火客场胜绿军总分3-1夺赛点?新浪体育讯四年了,终于赢球了,热火在凯尔特人的主场经过加时98-90艰难战胜对手,总比分3-1领先,詹姆斯拿下35分14个篮板,韦德28分9篮板,波什20分12个篮板。您如何评价这场比赛? 
+娱乐 盘点好莱坞明星新年目标 布兰妮迪亚兹在列(图)新年伊始,又是制定新一年目标的时候了。大到关注环保、寻找真爱,小到改掉坏毛病、改变生活习惯,这些都是美国演艺明星在2009年中的目标。●告别烟圈好莱坞女星卡梅隆·迪亚兹计划在新的一年戒烟,和她目标相同者还有《实习医生格蕾》中的凯瑟琳·海格尔及《飞跃贝弗利》中的布莱恩·奥斯汀·格林。格林说:“每年我似乎都说要戒烟,看看今年行不行吧。”●不咬指甲女歌手布兰妮( 听歌)希望自己“改掉咬手指甲的毛病”。此外,她还表示:“我希望自己不再焦虑,以前的我无时无刻不在焦虑中,我要学会让自己幸福。”●寻觅真爱凭借《灵魂歌王》一片夺得2005年奥斯卡()奖的杰米·福克斯希望自己能在2009年找到真爱。●回归平静去年刚刚与男友分手的影星安妮·海瑟薇则希望过上平静的生活。●享受滑雪因出演《灵异第六感》而一举成名的影星黑利·乔尔·奥斯门特的最大愿望就是重拾自己滑雪的爱好,并从美国犹他州的某座高山上直冲而下。●致力环保曾主演《异形》和《冰风暴》等片的女演员西戈尼·威弗表示要为环保事业贡献力量。她说:“我不再使用塑料袋,手头现有的这些我也要循环使用。”●亲近素食《绝望主妇》中的伊娃·朗格利亚的目标是努力尝试吃素。●活络筋骨热门电视剧《汉娜·蒙塔娜》的主角麦莉·赛勒斯关心的问题则是“多做运动”。●回馈世界要说计划最为抽象的当数帕丽斯·希尔顿,她说:“我已经长大了,成熟了,我要怀着一颗感恩的心,开始回馈世界。”●计划“计划”1983年出演《战争游戏》的马修·布罗德里克的新年计划最别具一格,他的计划就是在2009年“拟订计划”。○据新华社 +家居 蓝景丽家尹勃乐居思路清晰 创新开拓(图)     新浪家居谢娟讯  10月16日,易居中国与新浪合资公司中国房产信息集团(简称CRIC)在美国纳斯达克成功上市。此消息一出,家居业界大腕在分享喜悦的同时,纷纷来电来函,向中国房产信息集团成功登陆纳斯达克表示祝贺,同时对CRIC在未来发展提出了中肯的建议和期待。新浪家居电话连线业内数位大腕,倾听他们对此事的看法,以及对中国房产信息集团上市寄语。【CRIC(中国房产信息集团)纳斯达克挂牌上市】       采访嘉宾:蓝景丽家总经理 尹勃         新浪家居:您好,尹总,我是新浪乐居家居频道编辑谢娟,感谢您接受本次访谈。   尹勃:您好。       新浪家居:北京时间2009年10月16日,易居中国与新浪合资公司中国房产信息集团在美国纳斯达克成功上市融资2亿美元。您是否知道此事?您对此有怎样的看法?       尹勃:刚刚知道!对家居很好的促进作用,希望能够加大北京市场支持力度,给予北京市场更高的重视。   新浪家居:感谢您的肯定。同时也希望您能给予建设性的意见。       尹勃:在罗总的带领下做的比较有声势,目前的思路更清晰。希望乐居做到较其他媒体更有高度,活动更有所创新。   新浪家居:您有怎样的祝语?             尹勃:祝新浪乐居越办越好,带动北京家居市场更上一层楼!      【嘉宾简介】       尹勃:(蓝景丽家总经理 北京市建筑装饰协会家装委员会副会长 北京市场协会家居分会副会长 北京家具协会常务理事 中国建材市场协会理事会副理事长)家居流通卖场一路走来,从昔日倒爷式的地摊、棚户到今天品牌型的综合、主题式购物广场,经历了多少时代的洗礼。尹勃作为这个行业中翘楚企业的负责人,见证了整个家具行业的变迁。名字后面这一连串的职务介绍足以说明他在整个行业中举足轻重的影响力,也更加肯定了他对“蓝景丽家”这个行业航母的巨大贡献。      【推荐阅读】        蓝景丽家十一精彩促销撼京城       百城万店无假货蓝景丽家启动       乐居装修日首战告捷 蓝景丽家销售额逆势暴涨       【媒体声音】      中国证券报:新浪易居合资公司CRIC登陆纳市       上证报:新浪易居合资公司CRIC逆市登陆纳市       第一财经日报:CRIC上市首日市值20亿美元       新华网:新浪与易居合资公司CRIC登陆纳斯达克       专访丁祖昱:CRIC在做前人没有做过的事情       专访罗军:CRIC具有巨大的商业潜力       专访曹国伟:在某些垂直领域会做更多尝试 【更多】     上市背景资料:      美国东部时间10月16日(北京时间10月16日)消息,易居中国与新浪合资公司中国房产信息集团(以下简称CRIC)在美国纳斯达克挂牌上市,首日开盘价12.28美元,超出发行价0.28美元。CRIC为易居中国与新浪的合资公司,股票代码为CRIC,发行价12美元,共发行美国存托股票(ADS)1800万股,同时承销商有权在未来30天内,行使总额达到270万股的超额配售权,此次IPO共计募集资金约2.16亿美元。作为中国在美国的地产科技第一股,CRIC是中国最大的专业房地产信息服务公司,并且拥有同时覆盖线上线下的房地产综合信息和服务平台。CRIC的成功上市,也创造了两家在美国上市的中国公司,分拆各自极具成长力的业务后进行合并,并进行二次上市的先河。CRIC联席董事长、CEO周忻表示;“我们很高兴看到CRIC成功上市,此次IPO将确立CRIC作为中国房地产信息服务第一品牌的地位,并有利于CRIC继续推进国内最大和最先进的房地产信息系统建设,使CRIC成为同时覆盖线上和线下的强大中国房地产网络信息服务平台,为房地产开发商、供应商、专业机构以及个人用户提供多元化房地产信息服务。CRIC联席董事长、新浪CEO曹国伟表示:“CRIC的成功上市,是易居中国和新浪合作的重要一步,也是我们在垂直领域商业模式探索的有益尝试,我们很高兴有机会发挥双方的协同效应。而进一步拓展和深化互联网垂直领域的商机,建立公司在细分市场的核心竞争力并做大做强,这也是新浪未来长远战略的重要组成部分。     +房产 弘阳大厦骏馆开盘 首日热销1亿昨天,位于南京大桥北路69号的红太阳销售中心人头攒动,当天开盘的弘阳大厦·骏馆取得了开门红,由于产品品质高端、户型精致总价又低,吸引了一拨又一拨看房者,当天销售额突破了一个亿。弘阳大厦·骏馆位于南京市浦口区大桥北路西侧,紧邻已建成的旭日华庭金棕榈园区旁,用地总面积6万多平米,包括一个包含酒店公寓、商业及办公的综合楼,一个酒店式公寓以及8万平方米的居住建筑和15000平方米的商业。弘阳大厦作为这块地块中的综合楼,主楼高99.65米,共28层,是集办公、商业、餐饮、公寓为一体的泛配套复合多功能商住楼。此次推出的弘阳大厦·骏馆,是弘阳大厦其中5-22层的酒店式公寓,主力户型为41-75平米商住先锋小户型。由于项目地处桥北新城的核心位置,离市区仅一桥之隔,规划中的地铁与过江隧道近在咫尺,兼具成熟配套资源优势。公共交通也非常方便,131、132、鼓珍、鼓扬、汉江、中六、汉六等多条公交线路可以直达该项目。除了地处桥北核心地段,具备传统的生活多方面配套以外,弘阳大厦·骏馆还拥有同属弘阳集团旗下的华东MALL完美商业配套。 我要评论 +教育 名师解析标准读音在四级考试中的重要性对于中国学生而言,都知道口语和听力很重要,但就是怎么也不好过关,究其原因就是他们英语发音不标准。一、口语。一口标准而流利的口语可以立即提升你的形象,给人以很好的第一印象。举例1:汤姆汉克斯主演的电影《幸福终点站》中有一个情节,大家应该很熟悉:他将a man of mystery“一个神秘的人”读成了a man of misery“一个痛苦的人”,意思相差了十万八千里,自然造成理解障碍。举例2:中文中v和w没有任何区别,说“我wo”的时候,如果上齿咬着下唇的话,也无所谓,因为不会产生任何歧义。但是英文中不一样,这两个音区别很大。vine表示“葡萄藤”;而wine则表示“葡萄酒”。green wine表示“新酒”;而green vine则表示“绿色的葡萄藤”。读错了音意思差别可就大了去了。举例3:一位外国人在中国马路上迷了路,见到一位姑娘,立即冲上前去,说道:“我想吻(问)你...”吓得姑娘连忙跑掉,就是因为读音的问题,外国人在中国也会遭遇理解障碍。二、听力。听力在四级考试中占35%的份额,如果听力不如意的话,考试想要及格真的是很难。听力过程中学生可能会有以下几种体会:1. 根本听不清楚读音——因为不熟悉英文的读音规则;2. 听清了读音,但对应不出是哪个单词——词汇量不够,没有好好记单词;3. 
听清了读音,也知道是哪个单词,但忘了啥意思了——还是词汇量不够,对于单词不熟悉;4. 对于spot dictation题型而言,听清了,知道是哪个单词,但就是—写就出现拼写错误——还是词汇没记好。第一,注意单词的读音,英式的和美式的。如:It's very hot today. 中hot美语中几乎就读成了hut这个词的读音了。第二,句子一连读、失去爆破等,连单词的影子都找不到了。如:This-is-an ol(d) pi(c)ture-of-a bi(g) car。横线表示连读,连读起来都不知道到底是一个词还是几个词了,括号里是不发音的,所以这个句子一旦读出来就完全走了样了。但听力中这种现象确是很常见的。要想练习好听力,首先要练习好英文的读音,包括词和句的读音规则。尤其对于外地孩子来说,就更重要了。如湖南的孩子说“我来自湖南”,由于方言影响就成了“我来自弗兰”。而这些人都不认为自己的读音是错误的,所以他听别人这样说的时候也认为是正确的。总之,如果我们平时的读音是错误的话,当听到正确读音时反而会不知道是哪个词,所以要想加强听力,首先要加强自己的读音。(党敏) +时尚 组图:10款艳丽泳装熟女穿出少女情怀导语:时下的泳装注重层次和线条感的悠闲设计,流露出自然的气质。 简洁的色彩搭配,甜美感觉凸显少女情怀,抽象概念化的异域花卉,颜色和谐、明快,印花纱裙,感觉轻盈,细致有女人味。 +时政 台“中选会”称12月5日选举时程不变新华网消息 据台联合晚报报道,台“中选会”上午如期召开幕僚选务会议,仍按原定12月5日举办“三合一”选举时程进行相关作业规划。“中选会”将在9月4日发布选举公告。基于考量莫拉克风灾灾后重建,以及H1N1疫情发烧,有部分蓝绿政治人物倡议延后年底“三合一”选举。据了解,到目前为止,年底“三合一”选举的相关选务作业仍如期进行。“中选会”表示,“中选会”是选务机关,是否延选,仍须由政策决定,在政策未改变前,“中选会”将依既定时程,规划年底“三合一”选举的相关选务作业。 +游戏 《天问》国家系统神秘美丽女儿国初探传说在遥远的西域,有一个神秘美丽的国家,上至国王,下至百姓,全国居民都是美丽温婉的女性。唐僧四师徒一路西行,就是来到了这个风光如画的女性之国。粉色帷幔随风飘扬,阳光照耀着的粉色砖墙闪闪发亮;清澈的泉水边,风情万种的女子们悠闲地编制精美的地毯,蝴蝶在花香中起舞……西梁女国就是一位端坐西域的温柔而美丽的少女,带着神秘的微笑注视来来往往的游客。解阳山是全新的练级场景, 山上微风吹拂,仙鹤悠闲地梳理着翎羽,处处透露平和安逸的气氛。但是山顶一座简陋的道观,竟藏着不少金银财宝?西梁女国百姓最珍视的一口泉水,也隐藏在道观山之上,这里到底隐藏着什么秘密?在解阳山上有一个神秘的副本波月洞,里面溶岩密布,石柱高耸,组成了各种美妙的景观。然而,波月洞盘踞着以毒蝎精领导的一群女妖,这帮妖精已与女儿国争战多年。当群侠得知毒蝎精近来甚至企图绑架女儿国太子,以要挟国王就范时,不论是出于怜香惜玉,还是英雄救美,一场的激烈的战争终将不可避免的开始了…… +科技 五彩时尚MP3 三星U5仅售299元 三星YP-U5(2GB)共有蓝、粉、白、红、黑五种时尚漂亮颜色可供选择。色彩感很浓烈。三星YP-U5(2GB)的背面还提供了一个背夹,再加上五颜六色的款式,使它看上去很像一个美发卡。机身很小巧,三围尺寸只有25×88×11.8mm,重量也只有23g,完全可以随身携带。在机身正面可以看到一个OLED冷光屏,显示的字体比较清晰。三星YP-U5(2GB)可以支持mp3、wma、ogg、Flac音频格式文件播放,此外,它支持三星最新的DNSe代3代音效,5种音效,提供自动、正常、工作室、摇滚、节奏及布鲁斯、舞厅、音乐厅7种选择,也可以进行自定义,对EQ和3D进行调节,效果非常好。除了出色的音乐播放功能以外,三星YP-U5(2GB)还支持FM收音机、歌词显示、MIC录音等功能。编辑点评:U系列是三星主打平价市场的产品,主要针对学生、办公室一族。相信这款音质出众、色彩绚丽的时尚MP3,也将为学生和年轻白领一族的个性生活增添亮丽色彩。    三星YP-U5(2GB)      [参考价格] 299元    [联系方式] 13434155009      diff --git a/test/data_for_tests/io/THUCNews/test.txt b/test/data_for_tests/io/THUCNews/test.txt new file mode 100644 index 00000000..81d00e65 --- /dev/null +++ b/test/data_for_tests/io/THUCNews/test.txt @@ -0,0 +1,9 @@ +体育 凯尔特人vs尼克斯前瞻III纽约背水战 甜瓜必杀令新浪体育讯北京时间4月23日上午7点,凯尔特人将迎移师纽约城,挑战尼克斯,这是两队首轮的第三次交锋。前两场比赛中,小斯和安东尼轮番打出现象级的表现,可惜都无法为尼克斯带来一场胜利。目前凯尔特人总比分2-0领先,对尼克斯而言,他们没有退路。“第三场在主场击败,这是一场必胜的战争,我们根本输不起,这是本赛季为止将要面临的最艰难的一场比赛。”安东尼说。然而运气却不在纽约这边,他们接连以小分差输掉两场,与此同时,比卢普斯和小斯又接连出现伤病,第三场比赛两人的状态仍旧未知,小斯缺席了球队的训练,他在第二场下半场因为背部痉挛休战,但小斯仍希望能够在第三场出战,比卢普斯则有膝伤在身,能否复出还要等赛前决定。第二场比赛中,比卢普斯休战,小斯下半场未打,比尔-沃克全场11投0中,但是尼克斯凭借安东尼的42分17个篮板6次助攻,顽强的将比赛拖到最后一秒,直到最后时刻杰弗里斯的传球被KG抢断,才遗憾落败。德安东尼说:“很遗憾他们两不能上场,但从积极方面看,下半场球队打出的顽强表现,让我们信心满满。”小斯在第一场拿到28分11个篮板,但是安东尼在那场饱受犯规困扰,18投5中只拿到15分,下半场11投1中,尼克斯最终85-87落败,纽约人相信,如果安东尼和小斯同时发挥,他们有很大机会扳倒绿巨人。“我想这是一种精神折磨,你知道自己打得有多努力,有多棒,但两次我们都距离胜利差之毫厘。”安东尼说。第三战将是尼克斯自从2004年4月25日以来,首次在麦迪逊广场花园首次举办季后赛,这座举世闻名的篮球麦加殿堂已有七年未曾染指季后赛。对凯尔特人而言,他们的进攻出现了不少问题,季后赛前两场分别是靠雷-阿伦和凯文-加内特的关键球才勉强击败对手。里弗斯表示,球队表现需要提高,奥尼尔第三场能否出战还是谜,雷-阿伦连续两场打出不俗表现,隆多则在第二场砍下30分7次助攻,他们将尼克斯的命中率限制到35.6%,但与此同时,他们也丢失了大量的防守篮板,上场比赛尼克斯抢下了20个进攻篮板,而凯尔特人只有9个。小斯曾在这轮系列赛中和格伦-戴维斯大打口水仗,此战重回纽约,尼克斯急需他的发挥,接下来就看小斯带伤出战,能为尼克斯提供多少支援了。两队预计首发:凯尔特人:隆多、阿伦、皮尔斯、加内特、小奥尼尔尼克斯:道格拉斯、菲尔德斯、图里亚夫、安东尼、小斯(木瓜丁) +娱乐 
独家探班李康生蔡明亮短片《自转》(组图)新浪娱乐讯蔡明亮(阿亮)导演、李康生(小康)演出的银幕组合让两人在国际影坛挣出一席地位,如今两人“角色互换”!李康生执导台湾公视《台北异想》影片中的短片──《自转》,请出已20年没站在镜头前的蔡明亮当演员,阿亮为了爱徒再次“下海”演戏,没想到自称对演员施以爱的教育的小康,拍第一场戏就让阿亮吃了18次NG,现场更放催泪音乐,让感情丰富的阿亮流下真情的眼泪。台湾公视的《台北异想》影片,概念将一天从清晨六点起分为八个时段,邀来李康生、郑芬芬、钮承泽、林靖杰等八位导演,拍摄八部十分钟短片,接力诠释24小时的台北故事。小康选了凌晨四时至六时的时段发挥,他说:“2006年,舞蹈家伍国柱、罗曼菲相继过世让我感触很深,蔡明亮拍摄电影《洞》时,罗曼菲担任舞蹈编排,她直率、认真的性格留给大家很深的印象。因此特别选择她凌晨四点多辞世的时段,拍摄《自转》,也希望将这部短片献给她。”蔡明亮自从20年前曾在电视单元剧中饰演乐团主唱后,即不再以演员身分现身萤光幕前,为了挺爱徒再站镜头前,阿亮坦言,剧中虽只需扮演自己,但被拍仍令他紧张,要不是近几年常受访,被媒体训练出减少对镜头的恐惧,不然他不会让自己名列演员名单中。被阿亮指导演戏惯了的小康,如何回过头来对恩师教戏?他虽说:“我让演员自由发挥,采取『爱的教育』!”但光是陆奕静炒咖啡豆,阿亮静坐咖啡厅一隅,这全剧第一个镜头就磨了十八次,现场播放雷光夏广播录音和林怀民舞作《挽歌》音乐,更催出阿亮的男儿泪,阿亮说:“我就是想到了罗曼菲,更感受到美好的事物都会消失,真想再看一次罗曼菲跳舞。”《自转》的最后一场戏,陆奕静衬着音乐转圈跳舞,阿亮也即兴起舞,但连两天熬夜赶戏体力透支,加上不停转圈,她拍到呕吐、阿亮则晕眩不止,小康却满意称赞:“这两人跳得不错嘛!”小康当导演,从第一场戏折腾演员到末场戏,堪称“有始有终”,蔡明亮笑说:“未来我还是选择继续当导演吧。”台湾特派记者郑伟柏/台北报导 声明:新浪网独家稿件,转载请注明出处。 +家居 打好算盘最省钱瓷砖选购法面对导购小姐的微笑更是心中打鼓:人家说的好像挺有道理,但会觉得说得越好,会不会上当啊,是不是有猫腻呢?本文从建筑卫生陶瓷角度来分析,其它建材选购原理也与之相差无几。瓷砖的选购很讲究,要知道瓷砖这玩意儿一旦铺上了要是再发现有问题,后果是很严重的!下面列出的几点问题是在装修前一定要想清楚的,这些问题往往决定了以后选择瓷砖的种类、规格、价位甚至家居的整体风格。1、到底铺什么?这个问题好像问得很白痴,但这却是最基本的,首先你得充分了解哪些空间适合用哪些瓷砖啊!其实这个问题的关键不是用什么铺地,而是各种材料该怎么搭配。比如:有些业主希望在客厅铺瓷砖,同时在卧室选择木地板,这样问题就产生了:如果客厅铺普通玻化砖,卧室铺强化复合地板,那么卧室与客厅就会存在3cm左右的高度差,这主要是由于强化地板下没有打龙骨造成的。那么是不是在卧室选择实木地板就行了呢?当然不是。通常实木地板由厂家安装都会使用3×2cm的龙骨,如果为了和客厅的瓷砖找平最好使用5×4cm规格的龙骨,但是各个地板厂商对于更换龙骨的服务条款可是不同的。所以要充分与业主沟通,毕竟我们的目的是要让业主满意,了解业主的最基本的要求,然后根据业主的原始思路,找出最合适的方案。如果业主希望选择地板与地砖混铺的方式,就一定要规划好,避免不必要的麻烦。下面介绍两种基本搭配方式:瓷砖+强化地板=铺地板的房间用水泥灰浆垫高3cm,瓷砖+实木地板=地板下采用5×4cm规格的龙骨。2、选择什么规格的地砖?是铺600的?800的?还是1000的或是其它规格的?这是一个问题!现在的地砖,尤其是客厅使用的地砖主要是500mm、600mm、 800mm和1000mm(即1米)等规格,其中使用最多的是600mm和800mm两种。那么该如何选择呢?建议根据铺贴的面积及家具的摆放进行选择。由于单位面积中600mm的砖比800mm的砖铺贴数量要多,所以视觉上能产生空间的扩张感,同时在铺贴边角时的废料率要低于800mm的砖,而空间大时铺800甚至1米规格的砖就显得大气。因此建议小于40平米的空间选择600mm规格的地砖;而大于40平米的空间则可以选择800mm或一米的地砖。值得注意的是,如果在房间中家具过多(如卧室),盖住大块地面时,最好也采用600mm的地砖。3、该铺怎样的砖?到底是选择铺怎样的砖呢?是仿古砖还是抛光砖?仿古砖自然、柔务,在复古风格、尤其是拼花上有着玻化砖无法比拟的优势。同时,由于表面釉层的保护,对于茶水、墨水甚至热烟头的抗污能力也优于玻化砖。但是玻化砖也并非一无是处。随着技术的发展,现在玻化砖表面玻化层的密实度、光洁度已经相当的高,不仅能够使居室显得更加亮堂,还决不会像釉面砖由于外力碰撞、摩擦产生釉面破损的现象。所以选择什么样的砖要根据你要体现的风格,要明亮、大气就选抛光砖,要自然、温馨就选仿古砖。建议居室空间、客厅如果采光相对有限选择玻化砖,而光线充足的客厅和和需防滑的厨房和卫生间地面,及阳台等可选择仿古砖或其它釉面砖。4、“微晶玉”、“微晶石”、“微晶钻”都是什么意思?很多人逛建材城最头疼的恐怕就是记录瓷砖的名字了。什么“微晶玉”、“微晶石”、“微晶钻”、“超炫石”、“聚晶玉”等等。其实大家根本没必要记住这些拗口的名字,它们描述的都是同一种东西——玻化砖,这些名字只是厂商为了区分产品的档次,进一步细化市场而使用的代号罢了。在选择时大家只要坚持自己的预算,尽量选择适合自己的产品就行了。微晶石表面很炫,但其硬度只有莫氏五度左右,不耐磨,不适于用在地面,比较适于用在外墙干挂。 +房产 迪拜危机启示录:空中楼阁迟早要倒塌美国拉斯维加斯,又一家奢侈至极的酒店在这个“罪恶之城”绽放。但此次,相较酒店豪华的各种天价服务和开幕典礼上的好莱坞群星璀璨外,似乎其幕后的主人更吸引人们的眼球--迪拜世界。仅仅一周前,迪拜世界这个名词牵动了世界每个角落的神经。11月25日,迪拜主权财富基金迪拜世界宣布,暂缓偿还债务。根据评级机构穆迪的估算,迪拜的债务预计接近1000亿美元。巨大的数额勾起了人们对去年雷曼兄弟倒闭以来那波汹涌澎湃的国际金融危机的回忆。汇丰、渣打、巴克莱、苏格兰皇家银行等在内的多家银行涉及在内。人们开始担心,我国是否也会因此受到波及。庆幸的是,国内几大商业银行随即申明表示,没有涉及迪拜世界、迪拜政府和其他相关迪拜主权基金及机构发行的债权。有所涉及的,比例也相当的小。记者致电多家研究所银行业分析师,均表示认为此事对国内银行业影响不大,目前没有特别关注。因此,公众的目光从银行投向了导致其债务根源的房地产业。迪拜世界的房产项目,现在已经成为了全世界最大的烂尾楼。而就在这债务问题凸显的时刻,其旗下的“重型”项目却闪亮登场。“城市中心”酒店的开幕,似乎使得地产行业最尴尬的一面展现在了公众眼中。反观我国的地产行业,近期拍卖地王频现,房屋交易价格再次飙升,种种迹象也让人们对其产生了许多担忧。有专家对记者表示,在高速成长时期,楼价和地价互相推动的背后,是资金的不断流入。在那些光鲜的大楼后被后默默支撑的是债券、贷款等各种负债工具。一个原本是沙漠中人口只有十几万的小城,在几乎没有任何实业的基础上,居然吸引了世界上各方的资金,建成了一个人口上百万的豪华都市。房地产市场的巨大利益诱惑在其中占据了重大的因素。不断高涨的楼市,加上免税的便利,使得国际游资疯狂涌入。在聚集了巨大资金后,其所投资的项目遍布世界,美国这次的拉斯维加斯“城市中心”项目,迪拜世界就砸了近50亿美元。这种推动与反推动作用,给予了人们一个璀璨的迪拜,但当问题暴露,留下的却是满目疮痍。“迪拜危机对我们而言更多的是警示作用。”中国社科院金融研究所中国经济评价中心主任刘煜辉在接受《证券日报》记者采访时如此表示。他认为,目前为止迪拜危机对我国银行业的影响不多,但由于有过全球金融危机的影响,心理上的波动是会有的。此外,刘煜辉还告诉记者,任何以过度负债支撑起来的价格上涨或资产泡沫都是需要高度警惕。因为一旦泡沫破裂,就会带来破坏性较强的连锁反应。相信通过这次迪拜危机的警示,国内更多的行业会关注本行业内的负债和泡沫,对于投机性行为和高风险项目将会更加冷静。我要评论 +教育 知名美国私立寄宿中学来华招生行程序号 学校 时间 地点 学校情况 1、北野山中学Northfield Mount Hermon School10月26日 星期三PM1:00 美丽园龙都美国教育部认可的示范型学校2、Cranbrook school10月27日 
星期四AM8:40-10:20美丽园龙都每年本校学生的AP考试成绩都位列于全国成绩最好的学校之中3、The Storm King School10月29日 星期六PM4:30上海南京西路1515号嘉里中心1809室纽约州一所私立男女混合精英寄宿中学4、Villanova Preparatory School10月30日 星期日PM1:00-4:00虹桥万豪酒店美国唯一一所的男女混合寄宿制天主教教会学校5、Wyoming Seminary Upper School11月1日 星期二AM10:00香格里拉美国著名的百年贵族名校,也是美国东北部最古老的中学及大学预科学校6、胡桃山音乐学校Walnut Hill School11月2日 星期三PM1:00浦东香格里拉美国最古老的艺术高中7、弗莱堡学校Fryeburg Academy11月3日 星期四PM2:00-6:00上海南京西路1515号嘉里中心1809室一所独特的提供寄宿和走读学习的学校8、St.Johnsbury Academy11月8日 星期二AM9:00-12:00上海南京西路1515号嘉里中心1809室美国中学中拥有最棒校园的男女合校寄宿学校9、波特茅斯教会学校Portsmouth Abbey School11月8日 星期二PM1:00-3:00北京朝阳区建外SOHO,A座9层全国首屈一指的天主教混合住宿学校10、波特茅斯教会学校Portsmouth Abbey School11月15日 星期三PM1:00-4:00上海南京西路1515号嘉里中心1809室全国首屈一指的天主教混合住宿学校11、库欣高中Cushing Academy11月第三周待定美国最悠久男女合校寄宿中学之一12、West NottinghamAcademy11月19日 星期六PM2:00上海南京西路1515号嘉里中心1809室美国最早的学校,245年历史13、格瑞尔女子中学The Grier School11月26日 星期六PM9:45明天广场万豪历史悠久的著名女子寄宿学校14、萨菲尔德学院Suffield Academy11月30日 星期三 待定有170多年历史,是一所男女同校的私立中学15、威利斯顿 • 诺塞普顿中学The Williston Northampton School12月1日 星期四PM2:00-4:00上海南京西路1515号嘉里中心1809室学校以其优质的教学质量而闻名16、菲利普斯埃克塞特Philips Exeter Academy12月2日星期五PM6:30-8:30北京建国饭店牡丹厅(北京建国门外大街5号)“美国高中的哈佛” 、全美国最好的私立寄宿制高中17、菲利普斯埃克塞特Philips Exeter Academy12月3日星期六PM2:30-4:30上海浦东香格里拉浦江楼2层青岛厅“美国高中的哈佛” 、全美国最好的私立寄宿制高中18、菲利普斯埃克塞特Philips Exeter Academy12月5日星期一PM6:30-8:30浙江图书馆1楼文澜厅(杭州西湖区曙光路73号)“美国高中的哈佛” 、全美国最好的私立寄宿制高中19、坎特伯雷中学Canterbury School12月5日  星期一AM9:00-12:00 待定走读与寄宿都有的男女合校20、西城中学/威斯顿中学Westtown School12月5日 星期一AM9:00待定一所拥有205年悠远传统的中学21菲利普斯埃克塞特Philips Exeter Academy12月6日 星期二PM6:30-8:30广州天河区林和中路6号海肮威斯汀酒店5楼蓝厅“美国高中的哈佛” 、全美国最好的私立寄宿制高中22菲利普斯埃克塞特Philips Exeter Academy12月7日 星期三PM6:30-8:30深圳格兰云天酒店26楼云河厅(福田区深南中路3024号)“美国高中的哈佛” 、全美国最好的私立寄宿制高中23Cheshire Academy12月18日 星期日待定美国最早的传统寄宿中学24The Governor’s Academy待定待定美国最古老的寄宿高中之一25Peddie School待定待定著名的具有悠久历史的男女混合寄宿学校26Westover School待定待定美国著名的大学预备女子私立寄宿中学27Rabun Gap-Nacoochee School待定待定一所6-12年级的大学预备住宿走读中学28Ben Lippen School待定待定一所为学生提供大学准备课程的教会学院29George Stevens Academy待定待定一所拥有200多年历史的学校 +时尚 组图:纽约2011时装周 博主编辑街拍自成风景导语:纽约2011春夏时装秀正在如火如荼地进行着,打开任何时尚网站,你都可以看到这RUNWAY秀的图片,所以我不想在这里赘述了,反而我觉得秀场外这些赶赴现场的模特们和时尚博主以及时尚编辑的街拍更有意思。 +时政 台当局开放大陆银联卡在台刷卡消费中国台湾网7月16日消息 据台湾《联合报》报道,台当局“金管会”昨天发布修正“两岸金融业务往来许可办法”,开放大陆银联卡在台刷卡消费。最快9月初大陆民众就可以持银联卡在台刷卡消费,将可提高大陆游客赴台观光、消费意愿,并为台湾每年新增1000亿元(新台币,下同)刷卡商机。岛内银行也将可办理相关收单业务,对收单银行的手续费年收益至少可多出20亿元的贡献。报道称,台当局“金管会银行局副局长”萧长瑞表示,办法发布生效后,“金管会”就可开始受理岛内收单银行、联合信用卡中心等申请,台湾的联合信用卡中心也要跟大陆银联公司签约,估计最快9月初银联卡就可进入台湾。大陆银联卡赴台使用研议多时,消算等技术层面问题一直待克服,昨天“金管会”正式发布相关规定开放银联卡赴台,也代表技术面问题都已解决。根据“金管会”昨天发布的两岸金融业务往来许可办法第二条及第七条之一修正案,明定岛内信用卡业务机构经主管机关许可者,可以与银联公司从事信用卡或转帐卡的业务往来。主要包括银联卡在岛内刷卡消费的收单业务,以及交易授权与清算业务等两项。至于岛内银行发行银联卡的发卡业务则未开放。(高大林) +游戏 腾讯手游在线 《幻想西游》勇创新高根据腾讯QQ游戏中心2009年11月26日显示的在线数据,由腾讯和广州银汉联合运营的《幻想西游》再创新高,同时在线达到54336!54336同时在线一举打破之前的在线记录,创造手机游戏在线新高,这是《幻想西游》的光荣,也是手机游戏界的光荣!罗马不是一天建成的,《幻想西游》运营三年以前,开发组一直注重提升游戏品质和馈玩家,做属于玩家自己的游戏。这次创造在线人数新高,就是对开发组最高的褒奖。11月期间,《幻想西游》举行了“美在西游”系列活动吸引了数千美女玩家报名,6万多玩家参与了本次活动,掀起了11月的活动高潮。11月25日感恩节,开发组成员更是身怀感恩之心,化身GM来到游戏中倾听玩家的心声,并且心甘情愿地被玩家击败后奉上了感恩节礼物。12月将进入“美在西游”决赛阶段,广州银汉笑迎八方客,热情地邀请来自全国各地的美女玩家和跨服帮战优秀代表共聚羊城,共叙三年幻想情,畅谈西游未来路。《幻想西游》是根据名著《西游记》改编的手机网络游戏,具有操作简洁,界面美观,互动性好,娱乐性强的特点,营造出一个充满梦幻的西游世界。进入游戏:手机访问 http://3g.qq.com,选择游戏-网游-幻想手机官网 http://wap.01234.com.cn,选择快速进入 +科技 配18-135mm镜头 佳能7D国庆带票促销中(中关村在线数码影像行情报道)佳能EOS-7D是一款拥有1800万像素成像能力,每秒钟8张连怕性能,并具备高清摄像功能的单反相机。这款单反相机于上周登陆中关村市场,是目前APS-C规格单反中的旗舰机型。今天笔者在市场上了解到,配备有18-135mm防抖镜头的7D套机,价格为13800元带发票。EOS 7D实现了在约1800万有效像素的高画质下,高达约8张/秒的连拍速度。并搭载了高速智能的自动对焦系统等众多新功能。EOS 7D不仅达到了约1800万的有效像素,还实现了低噪点的精细图像表现。其搭载的CMOS图像感应器是佳能自行研发生产的产品。在提高像素感光度的同时,对像素内的晶体管进行了改良实现了更高的S/N(信噪)比。7D的常用ISO感光度为100-6400,扩展ISO感光度最高为12800。图像信号传输是在将单通道序列读取高速化的同时,采用8通道进行高速读取。与EOS 
50D相比要快约1.3倍,实现了约8张/秒的高速连拍。另外,对更换镜头时以及反光镜、快门等动作时产生的感应器灰尘也采用了相应的综合除尘措施;同时还搭载了可从相机硬件和附带软件两方面进行除尘的“EOS综合除尘系统”,在除尘功能上考虑得十分周到。快门单元和机身盖采用了不易产生碎屑的特殊材料;即便是不小心进入了灰尘,也可以通过超声波使图像感应器最前面的低通滤镜产生振动将灰尘抖落。低通滤镜表面进行了氟涂层处理,不论是对难以脱落的具有较高粘度的灰尘还是潮湿的灰尘都有着很好的除尘效果。双DIGIC 4数字影像处理器实现了对通过8个通道从图像感应器中高速读取出的,具有约1800万像素的庞大数据的迅速且高精度处理。搭载了2个高性能数字影像处理器DIGIC 4,能够对各种数据进行并行处理,即使是约1800万有效像素也可以实现最高约8张/秒连拍的高速图像处理。EOS 7D搭载了多达19个的自动对焦点,并且提高了每个对焦点的对焦精度。19个对焦点全部采用对应F5.6光束的十字型自动对焦感应器。将用于检测纵向线条的横向线型自动对焦感应器与用于检测横向线条的纵向线型自动对焦感应器呈十字型排列,从而实现了很高的被摄体捕捉能力。中央对焦点在相对于F5.6光束十字型自动对焦感应器的斜方向上配置了对应F2.8光束精度更高的十字型自动对焦感应器。通过中央八向双十字自动对焦感应器的协同工作,实现了高速且高精度的合焦。追踪被摄体的人工智能伺服自动对焦功能也在EOS 7D上得到了大幅的进化。EOS 7D的光学取景器具有约100%的视野率和约1倍(100%)的放大倍率,同时具有29.4°的视角和22毫米的眼点,其光学性能在历代EOS单反相机中也名列前茅。通过视野率约100%的光学取景器观察到的范围与实际拍摄的范围基本一致,因此能够得到非常精确的构图。此外,EOS 7D还在光学取景器内搭载了具有背透型液晶面板的“智能信息显示光学取景器”,它能够在对焦屏上显示网格线和三维电子水准仪等内容。EOS 7D的机身外壳采用了重量轻,刚性高且具有电磁屏蔽效果的镁合金材料。表面涂层采用了与EOS数码单反相机中顶级的EOS-1D系列相同的涂层材料及工艺。此外,EOS 7D还具有防水滴防尘构造,镁合金的外部部件变为高精度接缝构造,电池仓、存储卡插槽盖以及各操作按钮周围等都采用了密封部件,来保护相机的内部。EOS 7D背面的液晶监视器采用了具有160°的广视角(上下左右方向)及高清晰的92万点新型液晶监视器——“3.0"清晰显示液晶监视器II型”,其内部构造也经过重新研发,采用了新技术。7D机身上分别设置了专用的“实时显示/短片拍摄开关 ”和相应的“开始/停止按钮 ”,并且短片拍摄时能够在手动模式下对曝光进行控制。此外,可实现每秒30/25/24帧,分辨率1920×1080像素的全高清短片拍摄,在使用高清画质(分辨率1280×720像素)及标清画质(分辨率640×480像素)时,能够以每秒60/50帧进行拍摄。编辑观点:佳能7D的出现,再一次丰富了E0S产品系列中APS-C规格单反的阵营。佳能也终于有了可以和尼康D300级别单反正面对抗的产品。而出色的性能表现,不论是摄影爱好者还是专业人士,都会对其青睐有加。而上市价格也比较合理,只是希望7D不要重蹈5D II缺货涨价的覆辙。 diff --git a/test/data_for_tests/io/THUCNews/train.txt b/test/data_for_tests/io/THUCNews/train.txt new file mode 100644 index 00000000..65ca8a36 --- /dev/null +++ b/test/data_for_tests/io/THUCNews/train.txt @@ -0,0 +1,9 @@ +体育 火箭这一胜有更多意义 这是联盟最差击败联盟王者根据ESPN记者亨利-艾伯特的报道,对于一支NBA球队来说,在比赛最后24秒落后一两分或者和对方打成平局,这时候得分能力的高下就将决定最后的胜负。根据近五年来的统计,在这样的关键时刻下,联盟里最擅长得分的球队是黄蜂队,而最不擅长得分的球队则是火箭队。今天这两支球队狭路相逢,最后的24秒正是这样的情形。如果根据近5年火箭和黄蜂的表现来开,那火箭输定了。可是,奇迹出现了,火箭在距离比赛还有22秒的时候以88-87领先对手1分,但是他们并未停下得分的脚步,通过马丁和科特尼-李的三次罚球,他们最终让联盟最会把握最后时刻的王者球队黄蜂最终只是在临近终场的时候由大卫-韦斯特投进了无关紧要的一球,而以2分的优势胜出。一向不善于打关键球的火箭队今天却在最后时刻顶住了压力,力挽狂澜,这相当于火箭用自己最差的技能战胜了全联盟此项技能最强的球队。这和我们以往印象中的火箭截然不同。以往火箭总是在最后时刻无人挺身而出。然而马丁的出色发挥保证了火箭在最后时刻对对手篮筐的冲击力,他不断地抢断、造对手犯规,让黄蜂无法跟上火箭的得分脚步。在今天的比赛中,我们没有看到那支曾经缩手缩脚的球队,也许交易截止日期过了之后,所有的球员终于能安心稳定下来打球了吧。所以一度拥有巨大领先优势、穿着庆祝节日盛装队服的黄蜂最后俨然不敢接受这样的现实,我们至少从保罗的眼神中读出了这失望。所以,这场比赛的胜利对于火箭来说有着更深一层的意义。不论火箭是否已经达到脱胎换骨的境界,至少全明星后的四连胜对火箭冲击季后赛这个短期目标来说,是个极好的兆头。(大猩猩) +娱乐 《山楂树》电影比原著还干净 删减情节曝光(图)《山楂树之恋》小说有20万字,要将原著的全部内容压缩到一部110分钟的电影里,实属不易。事实上,电影里删掉了小说原著中的几场吻戏和激情戏的大部分内容,比小说原著还“干净”。张艺谋自己在说到改编的时候也表示,“其实原作中很多情节我都拍了,但是实在是太长了,我希望能将更多的笔墨放在老三和静秋身上,又能让故事平静地娓娓道来,所以剪掉了大半,后来还做了一些字幕将一些年代关系简化掉。 ”删除部分——长林喜欢静秋小说:静秋刚到生产队长家时,队长老婆希望把她说给自己的二儿子长林,而憨厚的长林也确实喜欢静秋。于是他偷偷地以自己的方式表达着他的爱,然而当他知道老三喜欢静秋时,也觉得自己配不上静秋,默默地就收回了自己的这份感情。影片:影片中这个分支被彻底删掉了,长林到静秋家送过一次核桃和冰糖,但都是老三让他去的。不过静秋在队长家吃饭时,队长一一介绍大哥二哥三哥的时候,长林突然间站起来的反常表现,还是可以看出他面对静秋时候的紧张。很显然,张艺谋其实拍了长林这段,但后来剪掉了。大量枝杈人物小说:为了让故事更丰满,小说中有很多配角在不同的阶段出现。例如,为了表现静秋被欺负,安排了王长生、万驼子这样的反面角色,也安排了成医生一家的出场,静秋对于白血病的一些知识都是从成医生那儿得来的。书中的静秋有个哥哥,为了能让哥哥顺利娶媳妇,这一家人也是做了不少牺牲和努力。影片:这些人物不复存在,张艺谋明确表示,为了有充分空间描述静秋和老三的爱情,不得不舍弃。老三的告别信小说:静秋无意中得知老三得了白血病。两人在医院度过了难忘的一夜,静秋向老三表示:“如果你死了,我也去死。 ”因此,老三选择了离开,并留下一封告别信,表示自己根本没得白血病,只是感冒了,而他不打算要静秋了。影片:老三早早就就澄清自己只是感冒,而之后又不告而别,令静秋既迷惑又伤心,那封告别信并没有出现。更多亲密片段小说:虽然号称“史上最干净的爱情”,小说中也有老三亲吻静秋的描写,包括二人在医院度过难忘一夜中“床戏”的描写。影片:张艺谋拍得比作者写得更干净,能算得上亲密的只有老三用军大衣拥静秋入怀,在医院难忘一夜里,老三和静秋手握着手和衣而眠。对此,张艺谋的解释是,对于影片来说,小说中某些场面还是较为“露骨”,毕竟要考虑到国内电影的审查制度,而且两张清纯的面庞经不起附加太多的“性”。作者有话——改编忠实度把握不好而小说《山楂树之恋》的作者艾米,在接受专访时曾表示,电影删掉的原著中的几场吻戏,没什么道理。《山楂树之恋》的主线就是静秋由惧怕“失足”到主动要求“失足”的转变过程,每场吻戏都是这个过程不可或缺的部分。如果去掉,就等于去掉了故事的主线,静秋后来的要求“失足”就会显得突兀。艾米同时指出:“我以为,这两位导演改编的忠实度把握得不好。仅从现在已经透露出的信息来看,就做了几个很没水平的改编。 ”记者 王琳娜 陈妍妮 +家居 物业交地产公司 以月租10万英镑放盘一年(图)   
丹尼尔明年9月担纲演百老汇剧《恋马狂》时,正好方便落脚,但他似乎并非如此打算,因为他已把物业交地产公司,以月租10万英镑(150万人民币)放盘一年。租客将可享用会所设施,包括泳池和蒸气浴室,以及酒店公寓服务。 +房产 开发商频频拿地 市场复苏谨防再起炒作风10日,经过50次举牌,广州市城市建设有限公司以总价34500万元夺得广州天河区珠江新城一地块,折合楼面地价15324元/平方米,而此前珠江新城最高楼面地价为11912元/平方米。 今年2月份以来,随着楼市“小阳春”的到来,沉寂了多个月的土地交易市场再起波澜,开发商们在土地收储上的集体爆发引人关注。再露繁荣景象的土地市场反映出房地产企业充足的资本和对后市的信心,同时,随之高涨的地价、房价也让人们担心,新一轮炒地提价的闸门是否已经悄然打开。 信心加资本撬动土地市场全面复苏 从绿地集团(企业专区,旗下楼盘)分别以9.57亿元和12亿元的价格接连拿下上海松江区辰花路15号B地块和徐汇区斜土街道107街坊,创今年上海土地出让价格的新高,到富力地产(企业专区,旗下楼盘)10.22亿元拿下北京广渠门外10号地,再到中洲宝城26.1亿元拿下深圳3宗捆绑商住地块,雅戈尔10.28亿元拿下宁波“地王”。一个多月的时间内,国内“地王”频现。 中国指数研究院最新的统计数据显示,6月1日至7日,全国20个重点城市共推出土地124宗,环比增加25%,推出土地面积608万平方米,环比增加25%,成交土地面积173万平方米,环比增加14%。 “优质地块一直是开发商们收储的对象,只不过去年楼市的低迷抑制了开发商的热情。”易居中国房地产研究院综合部部长杨红旭在接受采访时指出,目前的情况表明冷落已久的土地市场开始复苏,地产商对后市的预期正在转好,信心正在增强。 国内地产巨头万科近日发布的公告显示,在过去的一个多月中,公司已斥资23亿元多处拿地。这与其两个月前对于国内楼市“尚需进一步观察”的谨慎表态形成了鲜明的对比。 万科能在短时间内连连出手,表明公司“不差钱”。上述公告显示,5月份万科实现销售面积69.7万平方米,销售金额64.1亿元,同比分别增长19.3%和19.7%。这一销售额已经接近2007年高峰时期的单月最高纪录。而今年1至5月,万科的销售总额已达238.9亿元,较2008年同期大涨20.9%。 嘉华(中国)投资有限公司总经理助理谷文胜表示,近期国内楼市十分活跃,开发商在短时间内回笼了大量资金,而开发项目资本金比例也降低了15个百分点,这都使开发商的财务状况大大改善,现金流增加,出于持续发展的需要,买地是很自然的。 地价楼价再入上升通道引发担忧 然而伴随着土地市场的不断回暖,房地产市场成交价格的不断冲高也越来越成为人们关心的问题。 根据国家发展改革委、国家统计局调查显示,5月份,全国70个大中城市房屋销售价格同比下降0.6%,降幅比上月缩小0.5个百分点;环比上涨0.6%,涨幅比上月扩大0.2个百分点。 北京、上海、深圳等地不断传出各类楼市涨价新闻,其中北京朝阳区一处楼盘一个月内每平方米房价上涨5000元的消息更是加重了购房者对后市的担忧。就在富力集团高价拿下广渠门外10号地之后,周边的二手房价格就开始跟风上涨,虽然尚无准确的统计数据,但据业内人士透露,部分业主跟风涨价的行为已经在京城房地产市场上营造出了浓浓的涨价氛围。 “现在开发商又在大量买地,土地市场和楼市会不会再像2007年一样被炒出一波高涨的行情?”正准备买房的丁先生向记者表达了自己的担忧。 丁先生的担忧不无道理,一边是高调拿地,一边是悄悄涨价。虽然综合全国土地收储和开发的情况看,开发商先前收储的土地并没有完全消化,市场供求关系也没有发生根本性的变化。但主要开发商在土地市场上的频频出手,还是很容易让人联想起2007年地价、房价交替上涨的火暴局面。 市场复苏谨防再起炒作之风 “目前的土地市场仍处于恢复性增长阶段,尚未到达繁荣期。”面对地产商纷纷布局土地市场的现状,杨红旭表示,现在还处于宏观经济的低谷期,很多开发商仍旧不敢对后市过于乐观。开发商们在土地市场上频频出手、高价成交,虽然客观上会使楼市预期升温。但土地市场的回暖和楼市的回暖毕竟还是两回事。在宏观经济形势没有发生根本性变化之前,盲目看高后市的地产商有可能碰壁。 北京我爱我家市场研究部高级研究员秦瑞表示,开发商高价拿地之后,地块周边二手房的业主常常会盲目跟风追涨,但从目前的市场环境来看,较高的房价只可能吓退对价格特别敏感的刚性需求,进而导致成交量的萎缩,加重市场的观望情绪。 对于一季度的楼市暖春,再次走上炒地涨价之路,无论是对开发商还是中小业主都不一定是件好事。机构分析人士认为,造成目前房价普涨、开发商收地加快的原因,一方面是市场回暖,另一方面是开发商的去库存已接近尾声,开发商注意力将转向购地、新开工面积和涨价上。 不过“去年以来的经验让购房者变聪明了”,秦瑞告诉记者,如果现在开发商或是中小业主盲目利用市场回暖的时机涨价,那么购房者很可能会再次持币观望,交易量的回落不可避免,房价的持续上涨也不会有市场的依托。 把握推地节奏警惕泡沫出现 谷文胜表示,企业决定买地与否的主要根据是对宏观经济形势的判断和对未来的预期,但“也可能是在全球性通胀预期的驱动下进行资产保值的一种选择,毕竟,持有土地的风险要小于持有现金的风险”。 尽管对购买土地是否真能规避通胀风险存有不同意见。但业内人士还是普遍认为,当土地交易市场成为投资市场,泡沫就随时可能浮现。在全球经济尚未好转、国内信贷相对宽松的背景下,如果将土地进行资本化杠杆运作,频频制造高价抢地的现象,泡沫便会被迅速吹大。 目前看来,地方政府较好地掌握了推地节奏,企业也还比较理性,没有盲目抢地的现象。不少房地产企业判断,“只要政府调控得当,今年应该不会出现像2007年那么多的‘地王’”。 长期调研楼市的上海市政协人资环建委员会专职副主任孙钟炬认为,要让房地产业回归理性、减少泡沫,就需要降低房产成本,而地价成本是房价成本的一个重要组成部分。 “拿地还是要谨慎,现在把地价抬得过高,未来可能心生悔意,就如2007年很多高价拿地企业一样。”杨红旭说。(记者 罗宇凡 叶锋) 我要评论 +教育 澳驻华使馆:政府公布多项国际教育新规澳大利亚驻华使领馆教育处17日通报称,澳大利亚移民与公民事务部长克里斯·鲍恩(Chris Bowen)议员及教育、技能、工作和劳资关系部长克里斯·埃文斯(Chris Evans)参议员今日宣布将对学生签证项目进行复审以及为国际教育行业制订的多项具体措施。埃文斯表示,澳币升值,全球金融危机在一些国家的持续影响,以及逐步加剧的来自美国、新西兰和加拿大等国为吸引国际学生而形成的竞争,给澳大利亚国际教育行业带来的压力在不断增加。他说,国际教育行业的规模和性质在过去十年中也发生了剧大的变化,因此我们采取政府各部门间通力合作的方式来应对这些变化是至关重要的。复审担负着提高国际教育行业的持续竞争力和加强优化学生签证项目两项任务,将为教育机构和各利益相关方提供机会,阐述他们对国际教育行业未来的远见卓识。据介绍,吉拉德政府已任命了澳大利亚勋章获得者迈克尔(Michael 
Knight)负责复审工作,并于2011年中旬向鲍恩和埃文斯提交复审报告。鲍恩指出,复审工作将考察主要利益相关方与学生签证申请要求之间所建立起来的合作伙伴框架,并将就如何建立一个更加有效的合作伙伴框架提出建议。同时还将审视各种更好的针对学生签证案例中移民风险的管理方法,遏制违规及滥用学生签证项目的行为,并考虑各类学生签证对不同教育类别的适宜性。他介绍说,政府还将采取多项措施,在继续坚持优化学生签证项目的同时,精简低风险人群的签证申请审理程序。这些措施有力支撑了政府近期为优化学生签证项目而采取的改革措施,并再次强调技术移民项目应为澳大利亚中长期经济发展提供所需的高端技能。这些措施包括:——按照近期澳大利亚移民与公民事务部进行的评估等级复审的建议,从2011年4月起,降低一些学生签证评估等级。作为这项决策的一部分,来自中国和印度的高等教育类别的学生签证申请评估等级将会被降低;——调整规定使预付的寄宿学校住宿费可以从签证申请所要求的生活费中扣除;——促进政府和国际教育行业间的信息交流,这包括即将在移民部网站上公布学生签证季度统计数据,以便院校跟踪了解学生签证新趋势;——使职业教育与培训(VET)学生签证评估等级(AL)4的签证申请人能够就读证书级别的配套课程,并能满足获得学生签证的要求。使馆介绍说,今天的这项宣布是对最近澳大利亚政府为加强国际教育行业而实施的多项措施的补充。这些措施包括:针对《2000年海外学生教育服务(ESOS)法案》的贝尔德复审(BairdReview),要求所有提供国际教育的院校于2010年底前重新注册的《海外学生教育服务(ESOS)法案》修正案,以及发布由澳大利亚政府理事会(Councilof Australian Government)制订的《澳大利亚国际学生战略》。埃文斯说:“保持澳大利亚教育继续被高度公认为能够为赴澳留学的国际学生提供高质量课程是十分重要的。”即将于明年成立的国家职业教育与培训规范局(National VET Regulator)和高等教育质量和标准署(Tertiary Education Quality Standards Agency)将保障职业教育与培训和高等教育行业继续保持高质量。 +时尚 组图:香肩美锁骨 性感不张扬女人哪个部位最美最性感?不是红唇,不是翘臀,更不是波胸,而是肩膀。锁骨,是你身着斜肩上装引来同性羡慕的地方,是被抹胸曳地长礼服衬托得最耀眼的地方,它的美充满灵性,让女人立刻有了一种轻盈的气质。它堪称女人身上一道最美的风景线。今夏,单肩装将低调并一枝独秀地流行着,一抹香肩半边锁骨的靓丽,同时造就了几个层次的美感,不对称、错落感、优雅、性感……一切都在那微微倾斜的一道色彩。单肩休闲衫 搭配牛仔最IN如果你认为,单肩风潮仅仅适用于相对正式的礼服或小洋装,那你就大错特错了,一款棉质的普通T恤,只需在剪裁上作一些调整,同时将领口开大,就能轻松呈现出当季最In的单肩感觉,在斜肩处露出细细的肩带,搭配牛仔裤就很好看。时尚女王凯特-摩丝永远懂得美的定义,就连最普通的T恤,一样可以穿出最Fashion的感觉。单肩小洋装 呈现多样风格短款单肩连衣裙根据面料、剪裁的不同,往往可以展现出多样、多变的风格。礼服型的单肩连衣裙充满野性;而缎面、丝绸材质的连衣裙则散发着迷人的青春气息。“绯闻女孩”布莱克-莱弗利一袭玫红色缎面单肩小洋装,玲珑曲线凸显无遗。 +时政 全国95%以上地市建立特邀监察员制度新华网北京3月13日电(记者李亚杰)记者日前从监察部获悉,自1989年以来,监察部已聘请了四批特邀监察员,共计130人次。目前,全国31个省、自治区、直辖市,95%以上的地(市)、65%以上的县和中央国家机关的十多个部委,建立了特邀监察员制度。特邀监察员制度是中国共产党领导的多党合作和政治协商制度的重要组成部分,也是民主监督、参政议政在反腐败领域的成功实践。监察部有关负责人表示,自1989年建立特邀监察员制度以来,监察部一直高度重视,把这项工作作为监察机关的一项重要工作来抓,明确把专门监督与群众监督相结合的制度坚持得如何、特邀监察员工作是加强了还是削弱了,作为衡量和判断在纪检监察机关合署办公体制下行政监察职能是否得到加强的六条标准之一。特邀监察员工作开展近20年来,特邀监察员制度在实践中进一步得到完善和发展,特邀监察员队伍不断壮大,工作领域逐步拓宽,在党风廉政建设和反腐败工作中的作用也越来越明显。1989年5月,经过充分酝酿并经中央同意,监察部作出建立特邀监察员制度的决定。同年12月,监察部从民革、民盟、民建、民进、农工党、致公党、九三学社、台盟8个民主党派和全国工商联聘请了21位专家、学者为监察部首批特邀监察员。之后,特邀监察员工作在全国各级纪检监察机关逐步推开。1996年11月,监察部召开了全国纪检监察机关特邀监察员工作座谈会,这是特邀监察员制度建立以来召开的第一次全国性会议,总结交流了全国纪检监察机关开展特邀监察员工作的经验和做法,有力地推动了特邀监察员工作的深入开展。2004年10月颁布实施的《中华人民共和国行政监察法实施条例》进一步明确:监察机关根据工作需要,可以在国家行政机关、企业、事业单位、社会团体中聘请特邀监察员。聘请特邀监察员的具体办法由国务院监察机关规定。之后,监察部先后制定颁布了《监察部关于聘请特邀监察员的几点意见》、《关于改进特邀监察员工作的几点意见》、《中央纪委监察部关于加强和改进行政监察工作的意见》等一系列法规、文件和规定,明确了特邀监察员工作的总体要求和主要内容。即将颁布施行的《中国人民共和国行政监察法》,将进一步明确特邀监察员选聘程序、职责权限等,为特邀监察员全面履行职责提供法律依据。各地也结合工作实际,纷纷制定颁布了切实可行的工作制度。北京、上海、河南、广东、广西、山东、福建、四川、深圳等地还根据实践发展不断修订、完善特邀监察员工作办法等制度规定,特邀监察员工作的规范化、制度化水平不断提高。 +游戏 经典无法复制!《神鬼寓言3》PC版评析《神鬼寓言3》在一个异彩纷呈的虚拟世界,人类在电脑治下民主共存 -- 再没有什么比这更能激发想象的火花了。我的一个小巧玲珑的世界,我可以予取予求。力量感在我周身涌起,因为这结果完全由我来主宰。若是不想眼看着那帮狼人们凌虐镇子,我或者施法送出火球,或者挥舞宝剑,怎样都能拯救世界。我也可以将镇子寻求保护的一丝光芒熄灭干净,看着怪物们把尖叫的无辜百姓给撕成碎片。这些方面,《神鬼寓言3》做得可圈可点,但是 -- 太罕见了。在阿尔比昂大陆最新的故事里,纵然Lionhead工作室用令人荡气回肠的道德抉择设置了无数奇思妙想和激动时刻,它们却被深埋在了一堆毫不丰满的人物形象、冗长的故事和狗血情节里。如果你从来没玩儿过《神鬼寓言》,Xbox-360独占的《神鬼寓言2》也错过了 -- 没关系的,别担心为了了解《神鬼寓言3》而做好功课的事儿。所有需要你知道的,开篇全交代了:国王是个恶棍,需要被干掉。并不是遵循着最初的故事,总之我 -- 就是主角,从城堡里跑了,混迹市井之中,在阿尔比昂这个奇妙的大陆中徘徊,以期攒足人气资本,把国王搞下来,我自己坐这把交椅。《神鬼寓言3》所耍的手段在于,并不是我戴上王冠就终章了。那些我帮过的人,我还得给出承诺来;一旦取得王位,我得决定是旧账一律不认,还是一律兑现。这事儿让我真的很不舒服。我费大力气拯救出的那些人,敢情谁都不是跑龙套的,都等着最后来向我讨债,都等着我登基之后捎只胳膊带把手儿去拉他们一把。而且大多数的这种事儿都跟王国的安全这种更高层次的要求是冲突的。我不得不在践行诺言与保证阿尔比昂的安全之间竭力求取平衡,小心翼翼如履薄冰。这种构思其实挺不错,但是本来挺好的一件事儿,感觉怎么就这么恶心呢。首先这些人物就有问题。绝大多数的这些角色都同样地逡巡。相比行动来说,还是口音和衣着能有些区分。置他们的吁求不顾而去推广童工或者把妓院夷为平地,我这是多么撕心裂肺的抉择啊!除了我的导师与伙伴沃特,以及暴君洛根之外,剩下的角色全都一个心眼儿,根本就不比普通的三维物件强到哪里去。作为国王而背弃承诺之时,我真是毫无任何感觉,仅仅按下键盘命令,让他们滚,如是而已。穿插在《神鬼寓言3》的主线故事之中,有很多招募的任务 -- 
几乎就没有哪个有意思。也有分支任务,可大部分都是教科书一般的护送或者刺杀任务。我可以购置实业,但是只有最基本的项目可供自定义。一个饶有趣味的帝国管理游戏就这样被剥夺了,成了一个单调、乏味的流程,仅仅在金钱进入游戏里钱包的那轻轻一声响更是放大了这一点。我可以杀死或者审判阿尔比昂的百姓,但是与此一道的各种冷笑话和莫名其妙的大打出手,完全把这种感受给毁了。哪怕是黎民们当面儿大喊大叫说我是“刽子手”,我也照旧可以傻乎乎地跳舞、做支线任务、去约会,搞不好就结婚了,还拖家带口的。游戏中的形成、发展和关系的维系,全因为这个设定被束缚住了。就算是《神鬼寓言3》在某些方面引入了阴谋和神秘的元素,例如我被丢到一个黑暗荒芜的洞穴之后,我不得不面对各种恐惧,这使得我无法探索每一个角落。恐惧在这个大陆上是最强大的邪恶,而且大约会在游戏进程的三分之二处出现,而且仅仅会遭遇几次而已。游戏给人的感觉就是完成度不高,而且赶工迹象明显。寻找游戏中的收集元素、参与小鸡快跑比赛、镇压地精等等事情都让人很难一直保持兴趣。而当我最终坐上王座之后,《神鬼寓言3》所能提供的选择少得可怜。还好《神鬼寓言3》有一些时尚和幽默。有些台词写得还是非常有意思的。虽然这样的台词对塑造人物没有任何意义,但是会让你一直一直笑。阿尔比昂仍然是个美丽的地方,而且角色模型、动画和环境光跟随构造除了这个美丽的世界。从墓地的薄雾到荒漠的午后阳光,这样一个充满生机的地方非常令人赞叹。配音做的很专业。任务繁多,讲述了一个宏大的故事,而且还有很多娱乐元素,不过所有这些都相互孤立,让本该成为一款佳作的《神鬼寓言3》就这样沦为了一款毫不出彩的作品。战斗过程令人兴奋,但是缺乏打击感。由于战斗过程的乏味,所以战斗无法使玩家的注意力从游戏剧情和肤浅的人物问题上转移开。格斗武器,枪支和魔法本质上来说都是一样的。基本上都是闪躲和攻击,这样的方法可以用来对付所有遇到的敌人。说实话,这样的战斗系统着实令人失望。武器升级所带来的外观和属性上的改变让我切实感受到了游戏的进程,不过由于战斗系统的失败,这样的设定也让人感到无聊。整体感觉6.5分:漂亮的界面,不过与PC平台毫不相称。杂乱无章的故事与游戏节奏画面表现7.5分:一些很棒的动画和特效,还有多彩和谐的艺术风格声效表现8.0分:令人振奋的音乐,配音表演相当完美上手体验6.0分:有很多可以做的内容,但只有很小部分令人兴奋。单调的战斗,重复的任务,只有很小部分值得情感投入耐玩性5.5分:你或许从合作游戏和大量的收集感到愉悦,但这也无法更改核心游戏体验总评6.0分:还行吧 +科技 摩托罗拉:GPON在FTTH中比EPON更有优势作 者:鲁义轩2009年,在国内光进铜退的火热趋势下,摩托罗拉携其在国际市场上已经获得丰富运营经验的GPON解决方案,大举进入中国的光通信市场。对于这一个时间点的选择,摩托罗拉宽带及移动网络事业部网络接入解决方案部全球营销与传播总监FloydWagoner的解释是:中国利用GPON推进光线到户的时机正在趋于成熟,而摩托罗拉在国际上的GPON研发和运营经验,可以更好地提升国内运营商推进FTTH的效率。GPON的国际性优势在亚洲地区,推进光线到户的多种技术中,EPON一直是非常强大并且主流的技术。而在亚洲以外的国际很多地区,运营商都开始越来越多地关注GPON,今年GPON预计占到全球光纤到户市场的40%。在FloydWagoner看来,EPON虽然仍然强大,而GPON的实力在显著加强。在带宽方面,GPON比EPON上下行带宽都加强了至少一倍。因为EPON利用率相对于GPON要低一些,在相同的用户部署、相同终端情况下,统计数据表明EPON支持上、下行29Mbit/s的带宽,而GPON可以达到下行79Mbit/s上行37Mbit/s的实际带宽,从根本上提升了对数据业务的支持。在服务的质量保证(QoS)上,目前EPON的业务主要是数据业务,而运营商要推广三网融合等复杂的业务,服务质量保证要求会更高。在这方面,GPON有了更好的机制来保证多业务服务质量的实现。此外,在部署的方便性上,光线路中的光功率意味着传输距离的长短。EPON的速率是24dB,而GPON是28dB,在相同的条件下,GPON的传输距离更远。运营商可以把ONT布置在更远的位置,节省线路的成本,将来可以覆盖更多、更远的终端单元。综合比较,无论在技术方面还是在业务保障方面以及在材料方面,GPON到现在为止所体现的趋势更加地优于EPON。而且GPON的成本价格已经下降很多,得到越来越多的运营商的青睐。目前国内中国电信、中国联通以及中国移动都已经表示过把GPON作为下一步光网络发展的优选。创新性的GPONONT和OLT据FloydWagoner介绍,凭借在全球FTTH领域积累的经验,摩托罗拉开发了创新产品,以满足服务供应商提供更低密度的OLT、满足更高密度的 MDU环境以及具集成功能的室内ONT等方面的需求。创新性的GPONONT和OLT,可以将光纤延伸至服务供应商网络的边缘,从而保证用户在任何地方都能享用端到端的超宽带服务。同时,摩托罗拉的FTTH网元管理系统AXSvision,还能简化网管界面,并帮助运营商加速新型、丰富的个性化娱乐业务推出速度。 diff --git a/test/data_for_tests/io/WeiboSenti100k/dev.txt b/test/data_for_tests/io/WeiboSenti100k/dev.txt new file mode 100644 index 00000000..fdca0212 --- /dev/null +++ b/test/data_for_tests/io/WeiboSenti100k/dev.txt @@ -0,0 +1,7 @@ +label text +1 多谢小莲,好运满满[爱你] +1 能在他乡遇老友真不赖,哈哈,珠儿,我也要用这个拼图软件!BTW,小飞人儿终于要飞回家啦,深圳行,谢谢每位工作人员的照顾![爱你] +0 [衰]补鞋的说鞋子是进口的,质量太好,刀子都切不进去!所以说大家以后别买进口,到时补都没的补![爱你] +0 第五季都没看了[泪]要补起来 +1 美图好诗![鼓掌] //@言家楼:回复@俺叫老鬼:【七律。感时】 叶随风舞身何处, 鸟逆风行觅树梢。 岁月风来无退路, 激流风助有波涛。 寒微风动曾言志, 富贵风骚似不牢。 雪竹风梅诗未尽, 休云风雨剪春刀。//鸢肩格:藏珠“风”。 +0 没敢问,她男朋友在旁边呢。。[泪]//@好饭换坏饭: 你问问她能不能调成静音模式 diff --git a/test/data_for_tests/io/WeiboSenti100k/test.txt b/test/data_for_tests/io/WeiboSenti100k/test.txt new file mode 100644 index 00000000..3d071fb2 --- /dev/null +++ b/test/data_for_tests/io/WeiboSenti100k/test.txt @@ -0,0 +1,8 @@ +label text +1 钟爱大粉的亲们,这一茬我们又种大粉了,座果也不错,能吃上了[嘻嘻] +0 //@北京全攻略: 我擦。。。牛逼~果断收藏[衰] +1 都有我都有我~~~我的2012注定是美美的精彩的不得了啊~哈哈哈[太开心]//@哆啦胖兔梦: 转发微博。 +1 这周的成果就是这样 刻的好累但是很喜欢[嘻嘻]#我的橡皮章# +1 你把我整?了。[抓狂] //@窦智耀:开 往大稿艺术区店开 带上祝贺的花篮。。。昨夜 杨家火锅 你把我灌醉。。。今夜 我要学会排队等位。再贺开业大吉![鼓掌][鼓掌][鼓掌] +1 [爱你]亲们,我刚刚发表了一篇文章,有图有真相,速来围观![围观]||#蚂蜂窝游记#《新疆,雨中的野核桃沟》,查看更多精彩>>> http://t.cn/zR4BMN3 (分享自 @蚂蜂窝旅游攻略) +0 [泪]//@平安北京: 珍爱生命,小心驾驶,驾车时请勿接打电话! 
diff --git a/test/data_for_tests/io/WeiboSenti100k/train.txt b/test/data_for_tests/io/WeiboSenti100k/train.txt new file mode 100644 index 00000000..4f0adf27 --- /dev/null +++ b/test/data_for_tests/io/WeiboSenti100k/train.txt @@ -0,0 +1,7 @@ +label text +1 //@实用小百科:这才是吃货本色[哈哈] +0 回复@邋遢大王诗文:好的[ok] //@邋遢大王诗文:回复@静冈叔叔:[ok]木有问题!回来了和我联系 //@静冈叔叔:回复@西瓜叫高荔蜒啊:在富士山静冈机场有很多小丸子的土产啊[嘻嘻] //@西瓜叫高荔蜒啊:祝你一路顺风~ 想要小丸子的お土?~[泪] +1 我花了两年最后被抢的只剩下一枚,情何以堪! //@自由橙的小窝:@程诗然 同学集卡速度最快,我花了两年时间才集全 //@怯弱的狮子Susan: 回复@阮导:@墙墙-墙根俱乐部 看你多抢手!快给我们各发一套吧![嘻嘻] //@阮导:回复@怯弱的狮子Susan:所以。。。。你要给我找一套撒。。哈哈哈哈哈!!! +1 KIMSCLOSET的年会,海鲜自助餐,太丰盛了!大家吃的HIGH,喝的HIGH,聊的HIGH!太开心了![哈哈][爱你] +1 在iPhone的便携鱼眼镜头之下,扣肉蝴蝶饱子显得多诱人呀![围观][馋嘴][嘻嘻] +0 英织,你知道不知道,他是我最最最爱的大叔,你跟他靠这么近,我的心都碎了!!!你说你说你说,你有没有他的签名![泪] diff --git a/test/data_for_tests/io/XNLI/dev.txt b/test/data_for_tests/io/XNLI/dev.txt new file mode 100644 index 00000000..eced8fac --- /dev/null +++ b/test/data_for_tests/io/XNLI/dev.txt @@ -0,0 +1,7 @@ +language gold_label sentence1_binary_parse sentence2_binary_parse sentence1_parse sentence2_parse sentence1 sentence2 promptID pairID genre label1 label2 label3 label4 label5 sentence1_tokenized sentence2_tokenized match +zh neutral 他说,妈妈,我回来了。 校车把他放下后,他立即给他妈妈打了电话。 1 1 facetoface neutral contradiction neutral neutral neutral 他 说 , 妈妈 , 我 回来 了 。 校车 把 他 放下 后 , 他 立即 给 他 妈妈 打 了 电话 。 True +zh contradiction 他说,妈妈,我回来了。 他没说一句话。 1 2 facetoface contradiction contradiction contradiction contradiction contradiction 他 说 , 妈妈 , 我 回来 了 。 他 没 说 一 句 话 。 True +zh entailment 他说,妈妈,我回来了。 他告诉他的妈妈他已经回到家了。 1 3 facetoface entailment entailment neutral entailment entailment 他 说 , 妈妈 , 我 回来 了 。 他 告诉 他 的 妈妈 他 已经 回到家 了 。 True +zh neutral 他们停止了跟这家交朋友,因为他们决定了当白人。 种族紧张局势开始时,他们不再探望这家人。 13 39 facetoface neutral entailment entailment entailment entailment 他们 停止 了 跟 这家 交朋友 , 因为 他们 决定 了 当 白人 。 种族 紧张 局势 开始 时 , 他们 不再 探望 这家 人 。 False +zh contradiction 老太太以前常说她姐姐和姐丈是如何决定要搬到奥古斯塔城里去,并且被当做白人看待。 奶奶的妹妹是白人,搬到了德克萨斯州。 17 49 facetoface contradiction contradiction contradiction contradiction neutral 老太太 以前 常 说 她 姐姐 和 姐丈 是 如何 决定 要 搬 到 奥古斯塔 城里 去 , 并且 被 当做 白人 看待 。 奶奶 的 妹妹 是 白人 , 搬 到 了 德克萨斯州 。 True +zh entailment 老太太以前常说她姐姐和姐丈是如何决定要搬到奥古斯塔城里去,并且被当做白人看待。 奶奶的姐姐不是白人。 17 50 facetoface entailment entailment contradiction neutral entailment 老太太 以前 常 说 她 姐姐 和 姐丈 是 如何 决定 要 搬 到 奥古斯塔 城里 去 , 并且 被 当做 白人 看待 。 奶奶 的 姐姐 不 是 白人 。 True diff --git a/test/data_for_tests/io/XNLI/test.txt b/test/data_for_tests/io/XNLI/test.txt new file mode 100644 index 00000000..d5ff4c24 --- /dev/null +++ b/test/data_for_tests/io/XNLI/test.txt @@ -0,0 +1,7 @@ +language gold_label sentence1_binary_parse sentence2_binary_parse sentence1_parse sentence2_parse sentence1 sentence2 promptID pairID genre label1 label2 label3 label4 label5 sentence1_tokenized sentence2_tokenized match +zh contradiction 嗯,我根本没想过,但是我很沮丧,最后我又和他说话了。 我还没有和他再次谈论。 2 4 facetoface contradiction contradiction contradiction contradiction contradiction 嗯 , 我 根本 没 想 过 , 但是 我 很 沮丧 , 最后 我 又 和 他 说话 了 。 我 还 没有 和 他 再次 谈论 。 True +zh entailment 嗯,我根本没想过,但是我很沮丧,最后我又和他说话了。 我非常沮丧,我刚刚开始跟他说话。 2 5 facetoface entailment entailment entailment entailment entailment 嗯 , 我 根本 没 想 过 , 但是 我 很 沮丧 , 最后 我 又 和 他 说话 了 。 我 非常 沮丧 , 我 刚刚 开始 跟 他 说话 。 True +zh neutral 嗯,我根本没想过,但是我很沮丧,最后我又和他说话了。 我们谈得很好。 2 6 facetoface neutral neutral neutral neutral neutral 嗯 , 我 根本 没 想 过 , 但是 我 很 沮丧 , 最后 我 又 和 他 说话 了 。 我们 谈 得 很 好 。 True +zh neutral 而我当初认为这是一个特权,我现在仍然这样想,我是唯一的922 Ex-O,也是我的AFFC空军职业生涯。 我不知道那天我不是唯一一个在场的人。 3 7 facetoface neutral contradiction contradiction contradiction contradiction 而 我 当初 认为 这 是 一个 特权 , 我 现在 仍然 这样 想 , 我 是 唯一 
的 922 Ex-O , 也 是 我 的 AFFC 空军 职业生涯 。 我 不 知道 那天 我 不 是 唯一 一个 在场 的 人 。 False +zh contradiction 而我当初认为这是一个特权,我现在仍然这样想,我是唯一的922 Ex-O,也是我的AFFC空军职业生涯。 我们都被赋予了相同的确切数字,无论我们被许诺了何种特权,都是谎言。 3 9 facetoface contradiction contradiction entailment contradiction contradiction 而 我 当初 认为 这 是 一个 特权 , 我 现在 仍然 这样 想 , 我 是 唯一 的 922 Ex-O , 也 是 我 的 AFFC 空军 职业生涯 。 我们 都 被 赋予 了 相同 的 确切 数字 , 无论 我们 被 许诺 了 何种 特权 , 都 是 谎言 。 True +zh entailment 这是Fannie Flono,她在佐治亚州奥古斯塔长大,她会讲述她童年时的一些故事。 Fannie Flono就在这里,她将与我们分享她在奥古斯塔成长的童年故事。 12 35 facetoface entailment entailment entailment entailment entailment 这 是 Fannie Flono , 她 在 佐治亚州 奥古斯塔 长大 , 她 会讲 述 她 童年 时 的 一些 故事 。 Fannie Flono 就 在 这里 , 她 将 与 我们 分享 她 在 奥古斯塔 成 长 的 童年 故事 。 True diff --git a/test/data_for_tests/io/XNLI/train.txt b/test/data_for_tests/io/XNLI/train.txt new file mode 100644 index 00000000..8a2fd3a3 --- /dev/null +++ b/test/data_for_tests/io/XNLI/train.txt @@ -0,0 +1,9 @@ +premise hypo label +我们 家里 有 一个 但 我 没 找到 我 可以 用 的 时间 我们 家里 有 一个 但 我 从来 没有 时间 使用 它 . entailment +该镇 仍然 充满 雕塑家 , piazza alberica 是 一个 夏季 雕塑 比赛 的 现场 14 天 来 制作 一个 杰作 . 几乎 所有 的 雕塑家 都 离开 了 piazza alberica 为 其他 城市 . contradictory +土耳其 的 面包车 是 自己 坐 下 来 的 , 但 他们 喜欢 玩和呃 , 他们 喜欢 和 他们 一起 玩 , 他们 把 他们 的 社会 从 它 . neutral +好 吗 ? 我 问 benignantly , 因为 她 犹豫 了 . 我 抓住 她 的 胳膊 和 她 愤怒地 , 问 , 好 吗 ? contradictory +一 段 时间 来 看 , 这 一 运动 似乎 要 取得 成功 , 但 政治 事件 , 加 上 帕内尔 在 一个 令 人 愤慨 的 离婚案 中 被 称为 共同 答辩人 , 导致 许多 人 撤回 他们 的 支持 . 帕内尔 在 一个 令 人 愤慨 的 离婚 问题 上 的 法律 问题 使 这 场 运动 受到 了 影响 . entailment +看 在 这里 , 他 说 我们 不 希望 任何 律师 混在 这 一 点 . 他 说 看看 那 张 纸 neutral +Soderstrom 在 创伤 中心 进行 了 多次 筛选 测试 . 测试 必须 在 创伤 中心 进行 比较 , 否则 就 会 无效 . neutral +嗯 , 这 是 一 种 明显 的 我 的 意思 是 , 他们 甚至 把 它 带 到 现在 呢 , 他们 在 电视 上 做 广告 , 你 知道 如果 你 知道 , 如果 你 知道 这样 做 , 或者 如果 你 需要 这 个呃 , 我们 会 告 你 和 你 你 不用 给 我们 钱 , 但 他们 不 告诉 你 的 是 如果 他们 赢 了 你 给 他们 至少 三分之一 他们 赢 的 东西 , 所以 我 不 知道 它 是呃 , 它 得到 了 现在 做 更 多 的 生意 , 而 不 是呃 实际上 是 在 处理 犯罪 而 不 是 与 呃嗯 他们 的 律师 只 是 为了 钱 , 我 相信 , 我 知道 我 同意 你 , 我 认为 你 是 真实 的 你. 非常 正确 的 是 , 我 认为 他们 应该 有 同等 数量 的 你 知道 也许 他们 可以 有 几 个 , 但 我 认为 大多数 他们 应该 不 是 律师 在 事实 , 这 是 方式 他们 已经 进入 政治 , 这 是 因为 在 法律 上 , 你 知道 的 循环 和 一切 , 但 我 不 知道 我们 是 在 马里兰州 和呃 , 我们 有 同样 的 东西 人满为患 , 和呃 他们 让 他们 出来 我 的 意思 是 只 是 普通 的 监狱 判决 的 事情 , 他们 让. 他们 是 因为 他们 没有 任何 地方 可以 留住 他们 所以 你 可以 知道呃 , 除非 是 一个 重大 的 罪行 , 但呃 , 即使 是 小小的 东西 , 我 的 意思 是 那些 在 美国 失去 的 人 是 受害者 和 谁 可能 是 抢劫 或 毒品 , 或者 其他 什么 , 他们 是 谁 要 支付 , 他们 是 一个 会 受苦 , 另 一个 你 知道 的 人 , 如果 他们 被 逮捕 , 如果 他们 逮捕 他们嗯 , 然后 呢 , 你 知道 的 时间 法律 接管 了 一 半 时间 呃 他们 要么 让 他们 走 , 或者 他们 下 了 一个 句子 , 因为 他们 有 一个 律师 , 你 知道 的 感觉 他们 是 不 是 所有 在 一起 当 他们 做到 了 .它 我 不 知道 我们 怎么 到 这 一 点 , 虽然 . 
neutral diff --git a/test/data_for_tests/io/cmrc/dev.json b/test/data_for_tests/io/cmrc/dev.json new file mode 100644 index 00000000..c9069efe --- /dev/null +++ b/test/data_for_tests/io/cmrc/dev.json @@ -0,0 +1,155 @@ +{ + "version": "v1.0", + "data": [ + { + "paragraphs": [ + { + "id": "DEV_0", + "context": "《战国无双3》()是由光荣和ω-force开发的战国无双系列的正统第三续作。本作以三大故事为主轴,分别是以武田信玄等人为主的《关东三国志》,织田信长等人为主的《战国三杰》,石田三成等人为主的《关原的年轻武者》,丰富游戏内的剧情。此部份专门介绍角色,欲知武器情报、奥义字或擅长攻击类型等,请至战国无双系列1.由于乡里大辅先生因故去世,不得不寻找其他声优接手。从猛将传 and Z开始。2.战国无双 编年史的原创男女主角亦有专属声优。此模式是任天堂游戏谜之村雨城改编的新增模式。本作中共有20张战场地图(不含村雨城),后来发行的猛将传再新增3张战场地图。但游戏内战役数量繁多,部分地图会有兼用的状况,战役虚实则是以光荣发行的2本「战国无双3 人物真书」内容为主,以下是相关介绍。(注:前方加☆者为猛将传新增关卡及地图。)合并本篇和猛将传的内容,村雨城模式剔除,战国史模式可直接游玩。主打两大模式「战史演武」&「争霸演武」。系列作品外传作品", + "qas": [ + { + "question": "《战国无双3》是由哪两个公司合作开发的?", + "id": "DEV_0_QUERY_0", + "answers": [ + { + "text": "光荣和ω-force", + "answer_start": 11 + }, + { + "text": "光荣和ω-force", + "answer_start": 11 + }, + { + "text": "光荣和ω-force", + "answer_start": 11 + } + ] + }, + { + "question": "男女主角亦有专属声优这一模式是由谁改编的?", + "id": "DEV_0_QUERY_1", + "answers": [ + { + "text": "村雨城", + "answer_start": 226 + }, + { + "text": "村雨城", + "answer_start": 226 + }, + { + "text": "任天堂游戏谜之村雨城", + "answer_start": 219 + } + ] + }, + { + "question": "战国史模式主打哪两个模式?", + "id": "DEV_0_QUERY_2", + "answers": [ + { + "text": "「战史演武」&「争霸演武」", + "answer_start": 395 + }, + { + "text": "「战史演武」&「争霸演武」", + "answer_start": 395 + }, + { + "text": "「战史演武」&「争霸演武」", + "answer_start": 395 + } + ] + } + ] + } + ], + "id": "DEV_0", + "title": "战国无双3" + }, + { + "paragraphs": [ + { + "id": "DEV_1", + "context": "锣鼓经是大陆传统器乐及戏曲里面常用的打击乐记谱方法,以中文字的声音模拟敲击乐的声音,纪录打击乐的各种不同的演奏方法。常用的节奏型称为「锣鼓点」。而锣鼓是戏曲节奏的支柱,除了加强演员身段动作的节奏感,也作为音乐的引子和尾声,提示音乐的板式和速度,以及作为唱腔和念白的伴奏,令诗句的韵律更加抑扬顿锉,段落分明。锣鼓的运用有约定俗成的程式,依照角色行当的身份、性格、情绪以及环境,配合相应的锣鼓点。锣鼓亦可以模仿大自然的音响效果,如雷电、波浪等等。戏曲锣鼓所运用的敲击乐器主要分为鼓、锣、钹和板四类型:鼓类包括有单皮鼓(板鼓)、大鼓、大堂鼓(唐鼓)、小堂鼓、怀鼓、花盆鼓等;锣类有大锣、小锣(手锣)、钲锣、筛锣、马锣、镗锣、云锣;钹类有铙钹、大钹、小钹、水钹、齐钹、镲钹、铰子、碰钟等;打拍子用的檀板、木鱼、梆子等。因为京剧的锣鼓通常由四位乐师负责,又称为四大件,领奏的师傅称为:「鼓佬」,其职责有如西方乐队的指挥,负责控制速度以及利用各种手势提示乐师演奏不同的锣鼓点。粤剧吸收了部份京剧的锣鼓,但以木鱼和沙的代替了京剧的板和鼓,作为打拍子的主要乐器。以下是京剧、昆剧和粤剧锣鼓中乐器对应的口诀用字:", + "qas": [ + { + "question": "锣鼓经是什么?", + "id": "DEV_1_QUERY_0", + "answers": [ + { + "text": "大陆传统器乐及戏曲里面常用的打击乐记谱方法", + "answer_start": 4 + }, + { + "text": "大陆传统器乐及戏曲里面常用的打击乐记谱方法", + "answer_start": 4 + }, + { + "text": "大陆传统器乐及戏曲里面常用的打击乐记谱方法", + "answer_start": 4 + } + ] + }, + { + "question": "锣鼓经常用的节奏型称为什么?", + "id": "DEV_1_QUERY_1", + "answers": [ + { + "text": "锣鼓点", + "answer_start": 67 + }, + { + "text": "锣鼓点", + "answer_start": 67 + }, + { + "text": "锣鼓点", + "answer_start": 67 + } + ] + }, + { + "question": "锣鼓经运用的程式是什么?", + "id": "DEV_1_QUERY_2", + "answers": [ + { + "text": "依照角色行当的身份、性格、情绪以及环境,配合相应的锣鼓点。", + "answer_start": 167 + }, + { + "text": "依照角色行当的身份、性格、情绪以及环境,配合相应的锣鼓点。", + "answer_start": 167 + }, + { + "text": "依照角色行当的身份、性格、情绪以及环境,配合相应的锣鼓点", + "answer_start": 167 + } + ] + }, + { + "question": "戏曲锣鼓所运用的敲击乐器主要有什么类型?", + "id": "DEV_1_QUERY_3", + "answers": [ + { + "text": "鼓、锣、钹和板", + "answer_start": 237 + }, + { + "text": "鼓、锣、钹和板", + "answer_start": 237 + }, + { + "text": "鼓、锣、钹和板", + "answer_start": 237 + } + ] + } + ] + } + ], + "id": "DEV_1", + "title": "锣鼓经" + } + ] +} \ No newline at end of file diff --git a/test/data_for_tests/io/cmrc/train.json b/test/data_for_tests/io/cmrc/train.json new file mode 100644 index 00000000..823b9c80 --- /dev/null +++ b/test/data_for_tests/io/cmrc/train.json @@ -0,0 +1,161 @@ +{ + "version": "v1.0", + "data": [ + { + "paragraphs": [ + { + "id": "TRAIN_186", + "context": 
"范廷颂枢机(,),圣名保禄·若瑟(),是越南罗马天主教枢机。1963年被任为主教;1990年被擢升为天主教河内总教区宗座署理;1994年被擢升为总主教,同年年底被擢升为枢机;2009年2月离世。范廷颂于1919年6月15日在越南宁平省天主教发艳教区出生;童年时接受良好教育后,被一位越南神父带到河内继续其学业。范廷颂于1940年在河内大修道院完成神学学业。范廷颂于1949年6月6日在河内的主教座堂晋铎;及后被派到圣女小德兰孤儿院服务。1950年代,范廷颂在河内堂区创建移民接待中心以收容到河内避战的难民。1954年,法越战争结束,越南民主共和国建都河内,当时很多天主教神职人员逃至越南的南方,但范廷颂仍然留在河内。翌年管理圣若望小修院;惟在1960年因捍卫修院的自由、自治及拒绝政府在修院设政治课的要求而被捕。1963年4月5日,教宗任命范廷颂为天主教北宁教区主教,同年8月15日就任;其牧铭为「我信天主的爱」。由于范廷颂被越南政府软禁差不多30年,因此他无法到所属堂区进行牧灵工作而专注研读等工作。范廷颂除了面对战争、贫困、被当局迫害天主教会等问题外,也秘密恢复修院、创建女修会团体等。1990年,教宗若望保禄二世在同年6月18日擢升范廷颂为天主教河内总教区宗座署理以填补该教区总主教的空缺。1994年3月23日,范廷颂被教宗若望保禄二世擢升为天主教河内总教区总主教并兼天主教谅山教区宗座署理;同年11月26日,若望保禄二世擢升范廷颂为枢机。范廷颂在1995年至2001年期间出任天主教越南主教团主席。2003年4月26日,教宗若望保禄二世任命天主教谅山教区兼天主教高平教区吴光杰主教为天主教河内总教区署理主教;及至2005年2月19日,范廷颂因获批辞去总主教职务而荣休;吴光杰同日真除天主教河内总教区总主教职务。范廷颂于2009年2月22日清晨在河内离世,享年89岁;其葬礼于同月26日上午在天主教河内总教区总主教座堂举行。", + "qas": [ + { + "question": "范廷颂是什么时候被任为主教的?", + "id": "TRAIN_186_QUERY_0", + "answers": [ + { + "text": "1963年", + "answer_start": 30 + } + ] + }, + { + "question": "1990年,范廷颂担任什么职务?", + "id": "TRAIN_186_QUERY_1", + "answers": [ + { + "text": "1990年被擢升为天主教河内总教区宗座署理", + "answer_start": 41 + } + ] + }, + { + "question": "范廷颂是于何时何地出生的?", + "id": "TRAIN_186_QUERY_2", + "answers": [ + { + "text": "范廷颂于1919年6月15日在越南宁平省天主教发艳教区出生", + "answer_start": 97 + } + ] + }, + { + "question": "1994年3月,范廷颂担任什么职务?", + "id": "TRAIN_186_QUERY_3", + "answers": [ + { + "text": "1994年3月23日,范廷颂被教宗若望保禄二世擢升为天主教河内总教区总主教并兼天主教谅山教区宗座署理", + "answer_start": 548 + } + ] + }, + { + "question": "范廷颂是何时去世的?", + "id": "TRAIN_186_QUERY_4", + "answers": [ + { + "text": "范廷颂于2009年2月22日清晨在河内离世", + "answer_start": 759 + } + ] + } + ] + } + ], + "id": "TRAIN_186", + "title": "范廷颂" + }, + { + "paragraphs": [ + { + "id": "TRAIN_54", + "context": "安雅·罗素法(,),来自俄罗斯圣彼得堡的模特儿。她是《全美超级模特儿新秀大赛》第十季的亚军。2008年,安雅宣布改回出生时的名字:安雅·罗素法(Anya Rozova),在此之前是使用安雅·冈()。安雅于俄罗斯出生,后来被一个居住在美国夏威夷群岛欧胡岛檀香山的家庭领养。安雅十七岁时曾参与香奈儿、路易·威登及芬迪(Fendi)等品牌的非正式时装秀。2007年,她于瓦伊帕胡高级中学毕业。毕业后,她当了一名售货员。她曾为Russell Tanoue拍摄照片,Russell Tanoue称赞她是「有前途的新面孔」。安雅在半准决赛面试时说她对模特儿行业充满热诚,所以参加全美超级模特儿新秀大赛。她于比赛中表现出色,曾五次首名入围,平均入围顺序更拿下历届以来最优异的成绩(2.64),另外胜出三次小挑战,分别获得与评判尼祖·百克拍照、为柠檬味道的七喜拍摄广告的机会及十万美元、和盖马蒂洛(Gai Mattiolo)设计的晚装。在最后两强中,安雅与另一名参赛者惠妮·汤姆森为范思哲走秀,但评判认为她在台上不够惠妮突出,所以选了惠妮当冠军,安雅屈居亚军(但就整体表现来说,部份网友认为安雅才是第十季名副其实的冠军。)安雅在比赛拿五次第一,也胜出多次小挑战。安雅赛后再次与Russell Tanoue合作,为2008年4月30日出版的MidWeek杂志拍摄封面及内页照。其后她参加了V杂志与Supreme模特儿公司合办的模特儿选拔赛2008。她其后更与Elite签约。最近她与香港的模特儿公司 Style International Management 签约,并在香港发展其模特儿事业。她曾在很多香港的时装杂志中任模特儿,《Jet》、《东方日报》、《Elle》等。", + "qas": [ + { + "question": "安雅·罗素法参加了什么比赛获得了亚军?", + "id": "TRAIN_54_QUERY_0", + "answers": [ + { + "text": "《全美超级模特儿新秀大赛》第十季", + "answer_start": 26 + } + ] + }, + { + "question": "Russell Tanoue对安雅·罗素法的评价是什么?", + "id": "TRAIN_54_QUERY_1", + "answers": [ + { + "text": "有前途的新面孔", + "answer_start": 247 + } + ] + }, + { + "question": "安雅·罗素法合作过的香港杂志有哪些?", + "id": "TRAIN_54_QUERY_2", + "answers": [ + { + "text": "《Jet》、《东方日报》、《Elle》等", + "answer_start": 706 + } + ] + }, + { + "question": "毕业后的安雅·罗素法职业是什么?", + "id": "TRAIN_54_QUERY_3", + "answers": [ + { + "text": "售货员", + "answer_start": 202 + } + ] + } + ] + } + ], + "id": "TRAIN_54", + "title": "安雅·罗素法" + }, + { + "paragraphs": [ + { + "id": "TRAIN_756", + "context": 
"为日本漫画足球小将翼的一个角色,自小父母离异,与父亲一起四处为家,每个地方也是待一会便离开,但他仍然能够保持优秀的学业成绩。在第一次南葛市生活时,与同样就读于南葛小学的大空翼为黄金拍档,曾效力球队包括南葛小学、南葛高中、日本少年队、日本青年军、日本奥运队。效力日本青年军期间,因救同母异父的妹妹导致被车撞至断脚,在决赛周只在决赛的下半场十五分钟开始上场,成为日本队夺得世青冠军的其中一名功臣。基本资料绰号:球场上的艺术家出身地:日本南葛市诞生日:5月5日星座:金牛座球衣号码:11担任位置:中场、攻击中场、右中场擅长脚:右脚所属队伍:盘田山叶故事发展岬太郎在小学期间不断转换学校,在南葛小学就读时在全国大赛中夺得冠军;国中三年随父亲孤单地在法国留学;回国后三年的高中生涯一直输给日本王牌射手日向小次郎率领的东邦学院。在【Golden 23】年代,大空翼、日向小次郎等名将均转战海外,他与松山光、三杉淳组成了「3M」组合(松山光Hikaru Matsuyama、岬太郎Taro Misaki、三杉淳Jyun Misugi)。必杀技1. 回力刀射门2. S. S. S. 射门3. 双人射门(与大空翼合作)", + "qas": [ + { + "question": "岬太郎在第一次南葛市生活时的搭档是谁?", + "id": "TRAIN_756_QUERY_0", + "answers": [ + { + "text": "大空翼", + "answer_start": 84 + } + ] + }, + { + "question": "日本队夺得世青冠军,岬太郎发挥了什么作用?", + "id": "TRAIN_756_QUERY_1", + "answers": [ + { + "text": "在决赛周只在决赛的下半场十五分钟开始上场,成为日本队夺得世青冠军的其中一名功臣。", + "answer_start": 156 + } + ] + }, + { + "question": "岬太郎与谁一起组成了「3M」组合?", + "id": "TRAIN_756_QUERY_2", + "answers": [ + { + "text": "他与松山光、三杉淳组成了「3M」组合(松山光Hikaru Matsuyama、岬太郎Taro Misaki、三杉淳Jyun Misugi)。", + "answer_start": 391 + } + ] + } + ] + } + ], + "id": "TRAIN_756", + "title": "岬太郎" + } + ] +} \ No newline at end of file diff --git a/test/data_for_tests/io/cnndm/dev.label.jsonl b/test/data_for_tests/io/cnndm/dev.label.jsonl new file mode 100644 index 00000000..52a56ab0 --- /dev/null +++ b/test/data_for_tests/io/cnndm/dev.label.jsonl @@ -0,0 +1,4 @@ +{"label": [1, 19, 25], "text": ["marseille , france -lrb- cnn -rrb- the french prosecutor leading an investigation into the crash of germanwings flight 9525 insisted wednesday that he was not aware of any video footage from on board the plane .", "marseille prosecutor brice robin told cnn that `` so far no videos were used in the crash investigation . ''", "he added , `` a person who has such a video needs to immediately give it to the investigators . ''", "robin 's comments follow claims by two magazines , german daily bild and french paris match , of a cell phone video showing the harrowing final seconds from on board germanwings flight 9525 as it crashed into the french alps .", "all 150 on board were killed .", "paris match and bild reported that the video was recovered from a phone at the wreckage site .", "the two publications described the supposed video , but did not post it on their websites .", "the publications said that they watched the video , which was found by a source close to the investigation .", "`` one can hear cries of ` my god ' in several languages , '' paris match reported .", "`` metallic banging can also be heard more than three times , perhaps of the pilot trying to open the cockpit door with a heavy object .", "towards the end , after a heavy shake , stronger than the others , the screaming intensifies .", "then nothing . ''", "`` it is a very disturbing scene , '' said julian reichelt , editor-in-chief of bild online .", "an official with france 's accident investigation agency , the bea , said the agency is not aware of any such video .", "lt. col. jean-marc menichini , a french gendarmerie spokesman in charge of communications on rescue efforts around the germanwings crash site , told cnn that the reports were `` completely wrong '' and `` unwarranted . ''", "cell phones have been collected at the site , he said , but that they `` had n't been exploited yet . 
''", "menichini said he believed the cell phones would need to be sent to the criminal research institute in rosny sous-bois , near paris , in order to be analyzed by specialized technicians working hand-in-hand with investigators .", "but none of the cell phones found so far have been sent to the institute , menichini said .", "asked whether staff involved in the search could have leaked a memory card to the media , menichini answered with a categorical `` no . ''", "reichelt told `` erin burnett : outfront '' that he had watched the video and stood by the report , saying bild and paris match are `` very confident '' that the clip is real .", "he noted that investigators only revealed they 'd recovered cell phones from the crash site after bild and paris match published their reports .", "`` that is something we did not know before .", "... overall we can say many things of the investigation were n't revealed by the investigation at the beginning , '' he said .", "what was mental state of germanwings co-pilot ?", "german airline lufthansa confirmed tuesday that co-pilot andreas lubitz had battled depression years before he took the controls of germanwings flight 9525 , which he 's accused of deliberately crashing last week in the french alps .", "lubitz told his lufthansa flight training school in 2009 that he had a `` previous episode of severe depression , '' the airline said tuesday .", "email correspondence between lubitz and the school discovered in an internal investigation , lufthansa said , included medical documents he submitted in connection with resuming his flight training .", "the announcement indicates that lufthansa , the parent company of germanwings , knew of lubitz 's battle with depression , allowed him to continue training and ultimately put him in the cockpit .", "lufthansa , whose ceo carsten spohr previously said lubitz was 100 % fit to fly , described its statement tuesday as a `` swift and seamless clarification '' and said it was sharing the information and documents -- including training and medical records -- with public prosecutors .", "spohr traveled to the crash site wednesday , where recovery teams have been working for the past week to recover human remains and plane debris scattered across a steep mountainside .", "he saw the crisis center set up in seyne-les-alpes , laid a wreath in the village of le vernet , closer to the crash site , where grieving families have left flowers at a simple stone memorial .", "menichini told cnn late tuesday that no visible human remains were left at the site but recovery teams would keep searching .", "french president francois hollande , speaking tuesday , said that it should be possible to identify all the victims using dna analysis by the end of the week , sooner than authorities had previously suggested .", "in the meantime , the recovery of the victims ' personal belongings will start wednesday , menichini said .", "among those personal belongings could be more cell phones belonging to the 144 passengers and six crew on board .", "check out the latest from our correspondents .", "the details about lubitz 's correspondence with the flight school during his training were among several developments as investigators continued to delve into what caused the crash and lubitz 's possible motive for downing the jet .", "a lufthansa spokesperson told cnn on tuesday that lubitz had a valid medical certificate , had passed all his examinations and `` held all the licenses required . 
''", "earlier , a spokesman for the prosecutor 's office in dusseldorf , christoph kumpa , said medical records reveal lubitz suffered from suicidal tendencies at some point before his aviation career and underwent psychotherapy before he got his pilot 's license .", "kumpa emphasized there 's no evidence suggesting lubitz was suicidal or acting aggressively before the crash .", "investigators are looking into whether lubitz feared his medical condition would cause him to lose his pilot 's license , a european government official briefed on the investigation told cnn on tuesday .", "while flying was `` a big part of his life , '' the source said , it 's only one theory being considered .", "another source , a law enforcement official briefed on the investigation , also told cnn that authorities believe the primary motive for lubitz to bring down the plane was that he feared he would not be allowed to fly because of his medical problems .", "lubitz 's girlfriend told investigators he had seen an eye doctor and a neuropsychologist , both of whom deemed him unfit to work recently and concluded he had psychological issues , the european government official said .", "but no matter what details emerge about his previous mental health struggles , there 's more to the story , said brian russell , a forensic psychologist .", "`` psychology can explain why somebody would turn rage inward on themselves about the fact that maybe they were n't going to keep doing their job and they 're upset about that and so they 're suicidal , '' he said .", "`` but there is no mental illness that explains why somebody then feels entitled to also take that rage and turn it outward on 149 other people who had nothing to do with the person 's problems . ''", "germanwings crash compensation : what we know .", "who was the captain of germanwings flight 9525 ?", "cnn 's margot haddad reported from marseille and pamela brown from dusseldorf , while laura smith-spark wrote from london .", "cnn 's frederik pleitgen , pamela boykoff , antonia mortensen , sandrine amiel and anna-maja rappard contributed to this report ."], "summary": ["marseille prosecutor says `` so far no videos were used in the crash investigation '' despite media reports .", "journalists at bild and paris match are `` very confident '' the video clip is real , an editor says .", "andreas lubitz had informed his lufthansa training school of an episode of severe depression , airline says ."], "publication": "cnndm", "compression": 22.283333333333335, "coverage": 0.8666666666666667, "density": 4.6} +{"label": [3, 5, 24], "text": ["-lrb- cnn -rrb- the palestinian authority officially became the 123rd member of the international criminal court on wednesday , a step that gives the court jurisdiction over alleged crimes in palestinian territories .", "the formal accession was marked with a ceremony at the hague , in the netherlands , where the court is based .", "the palestinians signed the icc 's founding rome statute in january , when they also accepted its jurisdiction over alleged crimes committed `` in the occupied palestinian territory , including east jerusalem , since june 13 , 2014 . 
''", "later that month , the icc opened a preliminary examination into the situation in palestinian territories , paving the way for possible war crimes investigations against israelis .", "as members of the court , palestinians may be subject to counter-charges as well .", "israel and the united states , neither of which is an icc member , opposed the palestinians ' efforts to join the body .", "but palestinian foreign minister riad al-malki , speaking at wednesday 's ceremony , said it was a move toward greater justice .", "`` as palestine formally becomes a state party to the rome statute today , the world is also a step closer to ending a long era of impunity and injustice , '' he said , according to an icc news release .", "`` indeed , today brings us closer to our shared goals of justice and peace . ''", "judge kuniko ozaki , a vice president of the icc , said acceding to the treaty was just the first step for the palestinians .", "`` as the rome statute today enters into force for the state of palestine , palestine acquires all the rights as well as responsibilities that come with being a state party to the statute .", "these are substantive commitments , which can not be taken lightly , '' she said .", "rights group human rights watch welcomed the development .", "`` governments seeking to penalize palestine for joining the icc should immediately end their pressure , and countries that support universal acceptance of the court 's treaty should speak out to welcome its membership , '' said balkees jarrah , international justice counsel for the group .", "`` what 's objectionable is the attempts to undermine international justice , not palestine 's decision to join a treaty to which over 100 countries around the world are members . ''", "in january , when the preliminary icc examination was opened , israeli prime minister benjamin netanyahu described it as an outrage , saying the court was overstepping its boundaries .", "the united states also said it `` strongly '' disagreed with the court 's decision .", "`` as we have said repeatedly , we do not believe that palestine is a state and therefore we do not believe that it is eligible to join the icc , '' the state department said in a statement .", "it urged the warring sides to resolve their differences through direct negotiations .", "`` we will continue to oppose actions against israel at the icc as counterproductive to the cause of peace , '' it said .", "but the icc begs to differ with the definition of a state for its purposes and refers to the territories as `` palestine . ''", "while a preliminary examination is not a formal investigation , it allows the court to review evidence and determine whether to investigate suspects on both sides .", "prosecutor fatou bensouda said her office would `` conduct its analysis in full independence and impartiality . 
''", "the war between israel and hamas militants in gaza last summer left more than 2,000 people dead .", "the inquiry will include alleged war crimes committed since june .", "the international criminal court was set up in 2002 to prosecute genocide , crimes against humanity and war crimes .", "cnn 's vasco cotovio , kareem khadder and faith karimi contributed to this report ."], "summary": ["membership gives the icc jurisdiction over alleged crimes committed in palestinian territories since last june .", "israel and the united states opposed the move , which could open the door to war crimes investigations against israelis ."], "publication": "cnndm", "compression": 17.57894736842105, "coverage": 0.8947368421052632, "density": 3.1052631578947367} +{"label": [0, 6], "text": ["-lrb- cnn -rrb- governments around the world are using the threat of terrorism -- real or perceived -- to advance executions , amnesty international alleges in its annual report on the death penalty .", "`` the dark trend of governments using the death penalty in a futile attempt to tackle real or imaginary threats to state security and public safety was stark last year , '' said salil shetty , amnesty 's secretary general in a release .", "`` it is shameful that so many states around the world are essentially playing with people 's lives -- putting people to death for ` terrorism ' or to quell internal instability on the ill-conceived premise of deterrence . ''", "the report , `` death sentences and executions 2014 , '' cites the example of pakistan lifting a six-year moratorium on the execution of civilians following the horrific attack on a school in peshawar in december .", "china is also mentioned , as having used the death penalty as a tool in its `` strike hard '' campaign against terrorism in the restive far-western province of xinjiang .", "the annual report catalogs the use of state-sanctioned killing as a punitive measure across the globe , and this year 's edition contains some mixed findings .", "on one hand , the number of executions worldwide has gone down by almost 22 % on the previous year .", "at least 607 people were executed around the world in 2014 , compared to 778 in 2013 .", "amnesty 's figures do not include statistics on executions carried out in china , where information on the practice is regarded as a state secret .", "belarus and vietnam , too , do not release data on death penalty cases .", "`` the long-term trend is definitely positive -- we are seeing a decrease in the number of executions -lrb- worldwide -rrb- , '' audrey gaughran , amnesty 's director of global issues , told cnn .", "`` a number of countries are closer to abolition , and there are some signs that some countries will be abolitionist by 2015 .", "-lrb- there are -rrb- signals of a world that is nearing abolition . ''", "while the report notes some encouraging signs , it also highlights a marked increase in the number of people sentenced to death in 2014 .", "at least 2,466 people globally are confirmed to have been handed the sentence last year , an increase of 28 % compared with 2013 .", "the report notes that the spike in sentencing is attributable to mass-sentencing in countries including egypt and nigeria , `` against scores of people in some cases . 
''", "the organization found `` positive developments '' worldwide , with most regions seeming to show reductions in the number of executions .", "opinion : sharp spike in death sentences .", "sub-saharan africa , for example , saw a 28 % fall in reported cases , and executions recorded in the middle east and north africa were down 23 % compared to 2013 .", "`` even though we 've highlighted some of the negative developments ... i think we would always highlight that there are positive developments , '' gaughran said .", "`` across the board , with the exception of europe and central asia there were fewer reports of executions in every region . ''", "the resumption of the use of capital punishment in belarus -- the only country in europe and central asia to execute people -- after a two year hiatus spoiled an near-universal decrease in countries using the death penalty by region .", "the united states has the dubious distinction of being the only country in the americas to conduct executions , but the number of convicts put to death here fell slightly , from 39 in 2013 to 35 in 2014 .", "the state of washington also imposed a moratorium on executions last year .", "the u.s. remains one of the worst offenders for imposing capital punishment , with only iran -lrb- 289 + -rrb- , iraq -lrb- 61 + -rrb- , and saudi arabia -lrb- 90 + -rrb- executing more people in 2014 .", "while figures are not available , amnesty estimates that china also executes `` thousands '' of prisoners each year , `` more than the rest of the world put together . ''", "the report also highlights the imperfections in the judiciary processes that lead to many sentenced to death .", "`` in the majority of countries where people were sentenced to death or executed , the death penalty was imposed after proceedings that did not meet international fair trial standards , '' the report stated .", "`` in 2014 amnesty international raised particular concerns in relation to court proceedings in afghanistan , bangladesh , china , egypt , iran , iraq , north korea , pakistan , saudi arabia and sri lanka . ''", "the united nations secretary-general , ban ki-moon , last year stressed the need to move toward abolition of capital punishment .", "`` the taking of life is too irreversible for one human being to inflict it on another , '' he said , in marking world day against death penalty in october .", "`` we must continue to argue strongly that the death penalty is unjust and incompatible with fundamental human rights . 
''", "amnesty estimates that at least 19,094 people were believed to be on death row at the end of 2014 ."], "summary": ["amnesty 's annual death penalty report catalogs encouraging signs , but setbacks in numbers of those sentenced to death .", "organization claims that governments around the world are using the threat of terrorism to advance executions .", "the number of executions worldwide has gone down by almost 22 % compared with 2013 , but death sentences up by 28 % ."], "publication": "cnndm", "compression": 14.841269841269842, "coverage": 0.8888888888888888, "density": 5.079365079365079} +{"label": [8, 9, 34], "text": ["-lrb- cnn -rrb- on may 28 , 2014 , some 7,000 people gathered in a stadium in china 's northwestern xinjiang region .", "but they had not come to watch the local football team or any other grand sporting event .", "instead , the authorities paraded scores of prisoners dressed in orange jumpsuits .", "armed soldiers guarded the exits .", "in the patently unfair , open air trial that followed , 55 people were found guilty of a range of offenses linked to violent attacks in the region and jailed .", "three were sentenced to death .", "the public mass sentencing was part a china 's `` strike hard '' campaign against unrest in xinjiang , a campaign the government claims was launched to combat `` terrorism '' and `` separatism . ''", "but it was also indicative of a trend that was starkly evident last year around the world -- governments using the death penalty in a misguided , and often cynical , attempt to tackle crime and terrorism .", "today , amnesty international releases its annual review of the death penalty worldwide .", "much of it makes for grim reading .", "in pakistan , the government lifted a six-year moratorium on the execution of civilians in the wake of the horrific taliban attack on a school in peshawar in december .", "more than 60 people have been put to death since , and the government has threatened to send thousands more death row prisoners to the gallows .", "iran and iraq executed people for `` terrorism , '' and other countries expanded the scope of capital crimes in their penal codes .", "in a year when abhorrent summary executions by armed groups were branded on the global consciousness as never before , governments are themselves resorting to more executions in a knee-jerk reaction to terrorism .", "other countries made use of executions in similarly flawed attempts to address -- or appear to address -- crime rates .", "jordan ended an eight-year moratorium in december , putting 11 murder convicts to death , with the government saying it was a move to end a surge in violent crime .", "in indonesia , authorities announced plans to execute mainly drug traffickers to tackle a public safety `` national emergency . 
''", "six people have already been executed this year .", "a sharp spike in death sentences recorded in 2014 -- up more than 500 on the previous year -- can also be attributed to governments using the death penalty as a political tool .", "the rise was largely because of developments in egypt and nigeria , where courts imposed hundreds of death sentences in the context of internal political instability or crime and armed conflict .", "the simple fact is that governments using the death penalty to tackle crime and security threats are deceiving themselves or the public or both .", "there is no evidence that the threat of execution is more of a deterrent to crime than a prison sentence , as united nations and other studies have repeatedly confirmed .", "it is high time that world leaders stop using the death penalty as an easy way out when times get tough .", "at amnesty international , we have campaigned for an end to the death penalty for decades .", "thankfully , most of the world now appears to agree with us .", "the numbers speak for themselves .", "in 1945 when the united nations was founded , only eight countries had abolished the death penalty .", "today , 140 states are abolitionist in law or practice .", "last year , we recorded executions in 22 countries , down by almost a half from 20 years ago .", "despite the troubling developments we recorded last year , there was still much good news to be found .", "the number of executions recorded around the world dropped significantly in 2014 compared with the previous year , from 778 to 607 .", "this number does not include china , where more people are put to death than the rest of the world put together , but with death penalty statistics treated as a state secret , the true figure is impossible to determine .", "executions were recorded in only three countries in sub-saharan africa -- equatorial guinea , somalia and sudan -- and the number of people put to death went down by more than a quarter .", "the americas continued to be execution-free , apart from the united states .", "those governments that still execute need to realize that they are on the wrong side of history .", "they must join the vast majority of countries which have dropped the ultimate cruel punishment .", "fighting for an end to the death penalty remains an uphill task , but all of us must try to make the world free of this punishment .", "with determination , i know that we can achieve this goal ."], "summary": ["amnesty international releases its annual review of the death penalty worldwide ; much of it makes for grim reading .", "salil shetty : countries that use executions to deal with problems are on the wrong side of history ."], "publication": "cnndm", "compression": 20.85, "coverage": 0.825, "density": 6.375} diff --git a/test/data_for_tests/io/cnndm/test.label.jsonl b/test/data_for_tests/io/cnndm/test.label.jsonl new file mode 100644 index 00000000..d74ebd9f --- /dev/null +++ b/test/data_for_tests/io/cnndm/test.label.jsonl @@ -0,0 +1,4 @@ +{"label": [2, 3], "text": ["-lrb- cnn -rrb- the rev.", "robert h. 
schuller , california televangelist and founder of the television ministry `` hour of power , '' died thursday , according to his family .", "he was 88 years old .", "schuller , also the founder of crystal cathedral megachurch , had been diagnosed with esophageal cancer in august 2013 , a release from `` hour of power '' said .", "`` my father-in-law passed away peacefully early this morning .", "he was a great dad and a great man of god , '' said schuller 's daughter-in-law , donna schuller , in a twitter message .", "schuller 's life followed an almost shakespearean arc .", "he was born in a iowa farmhouse without running water and longed to preach from his earliest days .", "in his autobiography , `` prayer : my soul 's adventure with god , '' he described standing alone by a river and picturing himself delivering sermons to a rapt congregation .", "after attending a hope college and western theological seminary in michigan , he met his wife of more than 60 years , arvella , while preaching at her church -lrb- she was the organist -rrb- .", "with their young family in tow , the schullers caravanned west to california , where he rented a drive-in theater and preached from the roof of the snack bar .", "it was beneath the dignity of christian ministry , some local pastors huffed .", "the `` passion pits '' where teenagers necked was no place for the gospel .", "schuller was undeterred , and he quickly outgrew the drive-in .", "he called the explosive growth of his tiny congregation a `` miracle , '' though his many mainstream critics had other names for it .", "his confident , breezy version of christianity -- too breezy , by some estimations -- drew hordes of seekers and lapsed christians who were put off by the hellfire fulminations of many post-war american preachers .", "schuller sold a softer , gentler message , which borrowed heavily , he acknowledged , from the father of the feel-good gospel , norman vincent peale .", "he preached not to convert or condemn people , but to encourage them , a sentiment he called `` possibility thinking . ''", "people loved it .", "`` evangelicalism at its best wants to be innovative and reach people , '' said timothy larsen , a professor of christian thought at wheaton college in illinois .", "`` and schuller was a master at that . ''", "`` what he got right is that the gospel is good news , '' larsen continued .", "`` and he preached an uplifting message about personal transformation and uplift and hope . ''", "some of schuller 's favored phrases , though , struck others as cornpone christianity .", "`` turn your hurt into a halo ? ''", "said randall balmer , a professor of american religious history at dartmouth college , citing one such phrase .", "`` that 's pretty weak tea . ''", "still , balmer gives schuller some credit .", "`` it may be bad theology , but it 's brilliant marketing . 
''", "in 1970 , schuller began broadcasting `` hour of power , '' believed to be one of the first , if not the very first , sunday service to be shown regularly on television .", "with his genial smile , priestly robes and gray hair , he looked and talked like a guy who wanted nothing more than to see his flock succeed .", "the show , which ran for decades , reached millions , making schuller a televangelist before the term became tarnished by the sins of his many successors .", "schuller 's crowning achievement , at least architecturally , still stands in orange county , california , though it is now owned by the roman catholic church .", "the crystal cathedral , a great gleaming edifice with 10,000 glass panels , gave worshipers a look at the clouds that house the heavens , while schuller preached in the pulpit below .", "the message was clear to many : the road to the former ran through the latter .", "during the 1980s and 1990s , schuller 's star continued to rise , with presidents stopping by the crystal cathedral -- often during campaigns , it should be said -- and future megachurch pastors like rick warren and bill hybels seeking his advice .", "as schuller aged , though , his family was beset by a succession scandal straight from the pages of `` king lear . ''", "he tried to install his only son , bobby jr. , as pastor of crystal cathedral .", "but the preaching styles of father and son were too different for the congregation -- measured at times at 10,000 strong -- to countenance .", "bobby schuller jr. left `` hour of power '' and the pulpit at crystal cathedral after a short time .", "as the family searched for a new successor and tussled over finances , viewers and donations to the church and its television show dropped precipitously .", "crystal cathedral ministries filed for bankruptcy in 2010 , citing debts of more than $ 43 million , according to the associated press .", "schuller 's empire , which once soared as high as his glassy cathedral , had fallen to dust .", "eventually , schuller 's grandson , also named bobby , took over `` hour of power , '' though at a different church .", "in a statement on thursday , the younger schuller recalled standing atop crystal cathedral 's 12-story tower of hope with his grandfather as they surveyed the surrounding landscape .", "`` you could see the whole world from there , '' he said .", "people we 've lost in 2015 .", "cnn 's stella chan reported from los angeles ."], "summary": ["the rev.", "robert schuller , 88 , had been diagnosed with esophageal cancer in 2013 .", "his tv show , `` hour of power , '' was enormously popular in the 1970s and 1980s ."], "publication": "cnndm", "compression": 26.342105263157894, "coverage": 0.8421052631578947, "density": 3.4210526315789473} +{"label": [4, 6], "text": ["-lrb- cnn -rrb- never mind cats having nine lives .", "a stray pooch in washington state has used up at least three of her own after being hit by a car , apparently whacked on the head with a hammer in a misguided mercy killing and then buried in a field -- only to survive .", "that 's according to washington state university , where the dog -- a friendly white-and-black bully breed mix now named theia -- has been receiving care at the veterinary teaching hospital .", "four days after her apparent death , the dog managed to stagger to a nearby farm , dirt-covered and emaciated , where she was found by a worker who took her to a vet for help .", "she was taken in by moses lake , washington , resident sara mellado .", "`` considering 
everything that she 's been through , she 's incredibly gentle and loving , '' mellado said , according to wsu news .", "`` she 's a true miracle dog and she deserves a good life . ''", "theia is only one year old but the dog 's brush with death did not leave her unscathed .", "she suffered a dislocated jaw , leg injuries and a caved-in sinus cavity -- and still requires surgery to help her breathe .", "the veterinary hospital 's good samaritan fund committee awarded some money to help pay for the dog 's treatment , but mellado has set up a fundraising page to help meet the remaining cost of the dog 's care .", "she 's also created a facebook page to keep supporters updated .", "donors have already surpassed the $ 10,000 target , inspired by theia 's tale of survival against the odds .", "on the fundraising page , mellado writes , `` she is in desperate need of extensive medical procedures to fix her nasal damage and reset her jaw .", "i agreed to foster her until she finally found a loving home . ''", "she is dedicated to making sure theia gets the medical attention she needs , mellado adds , and wants to `` make sure she gets placed in a family where this will never happen to her again ! ''", "any additional funds raised will be `` paid forward '' to help other animals .", "theia is not the only animal to apparently rise from the grave in recent weeks .", "a cat in tampa , florida , found seemingly dead after he was hit by a car in january , showed up alive in a neighbor 's yard five days after he was buried by his owner .", "the cat was in bad shape , with maggots covering open wounds on his body and a ruined left eye , but remarkably survived with the help of treatment from the humane society ."], "summary": ["theia , a bully breed mix , was apparently hit by a car , whacked with a hammer and buried in a field .", "`` she 's a true miracle dog and she deserves a good life , '' says sara mellado , who is looking for a home for theia ."], "publication": "cnndm", "compression": 9.150943396226415, "coverage": 0.9433962264150944, "density": 4.7924528301886795} +{"label": [32, 36], "text": ["-lrb- cnn -rrb- if you 've been following the news lately , there are certain things you doubtless know about mohammad javad zarif .", "he is , of course , the iranian foreign minister .", "he has been u.s. secretary of state john kerry 's opposite number in securing a breakthrough in nuclear discussions that could lead to an end to sanctions against iran -- if the details can be worked out in the coming weeks .", "and he received a hero 's welcome as he arrived in iran on a sunny friday morning .", "`` long live zarif , '' crowds chanted as his car rolled slowly down the packed street .", "you may well have read that he is `` polished '' and , unusually for one burdened with such weighty issues , `` jovial . ''", "an internet search for `` mohammad javad zarif '' and `` jovial '' yields thousands of results .", "he certainly has gone a long way to bring iran in from the cold and allow it to rejoin the international community .", "but there are some facts about zarif that are less well-known .", "here are six : .", "in september 2013 , zarif tweeted `` happy rosh hashanah , '' referring to the jewish new year .", "that prompted christine pelosi , the daughter of house minority leader nancy pelosi , to respond with a tweet of her own : `` thanks .", "the new year would be even sweeter if you would end iran 's holocaust denial , sir . 
''", "and , perhaps to her surprise , pelosi got a response .", "`` iran never denied it , '' zarif tweeted back .", "`` the man who was perceived to be denying it is now gone .", "happy new year . ''", "the reference was likely to former iranian president mahmoud ahmadinejad , who had left office the previous month .", "zarif was nominated to be foreign minister by ahmadinejad 's successor , hassan rouhami .", "his foreign ministry notes , perhaps defensively , that `` due to the political and security conditions of the time , he decided to continue his education in the united states . ''", "that is another way of saying that he was outside the country during the demonstrations against the shah of iran , which began in 1977 , and during the iranian revolution , which drove the shah from power in 1979 .", "zarif left the country in 1977 , received his undergraduate degree from san francisco state university in 1981 , his master 's in international relations from the university of denver in 1984 and his doctorate from the university of denver in 1988 .", "both of his children were born in the united states .", "the website of the iranian foreign ministry , which zarif runs , can not even agree with itself on when he was born .", "the first sentence of his official biography , perhaps in a nod to the powers that be in tehran , says zarif was `` born to a religious traditional family in tehran in 1959 . ''", "later on the same page , however , his date of birth is listed as january 8 , 1960 .", "and the iranian diplomacy website says he was born in in 1961 .", "so he is 54 , 55 or maybe even 56 .", "whichever , he is still considerably younger than his opposite number , kerry , who is 71 .", "the feds investigated him over his alleged role in controlling the alavi foundation , a charitable organization .", "the u.s. justice department said the organization was secretly run on behalf of the iranian government to launder money and get around u.s. sanctions .", "but last year , a settlement in the case , under which the foundation agreed to give a 36-story building in manhattan along with other properties to the u.s. government , did not mention zarif 's name .", "early in the iranian revolution , zarif was among the students who took over the iranian consulate in san francisco .", "the aim , says the website iranian.com -- which cites zarif 's memoirs , titled `` mr. ambassador '' -- was to expel from the consulate people who were not sufficiently islamic .", "later , the website says , zarif went to make a similar protest at the iranian mission to the united nations .", "in response , the iranian ambassador to the united nations offered him a job .", "in fact , he has now spent more time with kerry than any other foreign minister in the world .", "and that amount of quality time will only increase as the two men , with help from other foreign ministers as well , try to meet a june 30 deadline for nailing down the details of the agreement they managed to outline this week in switzerland ."], "summary": ["mohammad javad zarif has spent more time with john kerry than any other foreign minister .", "he once participated in a takeover of the iranian consulate in san francisco .", "the iranian foreign minister tweets in english ."], "publication": "cnndm", "compression": 20.85, "coverage": 0.825, "density": 2.825} +{"label": [2], "text": ["-lrb- cnn -rrb- for the first time in eight years , a tv legend returned to doing what he does best .", "contestants told to `` come on down ! 
''", "on the april 1 edition of `` the price is right '' encountered not host drew carey but another familiar face in charge of the proceedings .", "instead , there was bob barker , who hosted the tv game show for 35 years before stepping down in 2007 .", "looking spry at 91 , barker handled the first price-guessing game of the show , the classic `` lucky seven , '' before turning hosting duties over to carey , who finished up .", "despite being away from the show for most of the past eight years , barker did n't seem to miss a beat ."], "summary": ["bob barker returned to host `` the price is right '' on wednesday .", "barker , 91 , had retired as host in 2007 ."], "publication": "cnndm", "compression": 5.346153846153846, "coverage": 0.8076923076923077, "density": 2.5} diff --git a/test/data_for_tests/io/cnndm/train.cnndm.jsonl b/test/data_for_tests/io/cnndm/train.cnndm.jsonl new file mode 100644 index 00000000..97719a61 --- /dev/null +++ b/test/data_for_tests/io/cnndm/train.cnndm.jsonl @@ -0,0 +1,10 @@ +{"label": [1, 19, 25], "text": ["marseille , france -lrb- cnn -rrb- the french prosecutor leading an investigation into the crash of germanwings flight 9525 insisted wednesday that he was not aware of any video footage from on board the plane .", "marseille prosecutor brice robin told cnn that `` so far no videos were used in the crash investigation . ''", "he added , `` a person who has such a video needs to immediately give it to the investigators . ''", "robin 's comments follow claims by two magazines , german daily bild and french paris match , of a cell phone video showing the harrowing final seconds from on board germanwings flight 9525 as it crashed into the french alps .", "all 150 on board were killed .", "paris match and bild reported that the video was recovered from a phone at the wreckage site .", "the two publications described the supposed video , but did not post it on their websites .", "the publications said that they watched the video , which was found by a source close to the investigation .", "`` one can hear cries of ` my god ' in several languages , '' paris match reported .", "`` metallic banging can also be heard more than three times , perhaps of the pilot trying to open the cockpit door with a heavy object .", "towards the end , after a heavy shake , stronger than the others , the screaming intensifies .", "then nothing . ''", "`` it is a very disturbing scene , '' said julian reichelt , editor-in-chief of bild online .", "an official with france 's accident investigation agency , the bea , said the agency is not aware of any such video .", "lt. col. jean-marc menichini , a french gendarmerie spokesman in charge of communications on rescue efforts around the germanwings crash site , told cnn that the reports were `` completely wrong '' and `` unwarranted . ''", "cell phones have been collected at the site , he said , but that they `` had n't been exploited yet . ''", "menichini said he believed the cell phones would need to be sent to the criminal research institute in rosny sous-bois , near paris , in order to be analyzed by specialized technicians working hand-in-hand with investigators .", "but none of the cell phones found so far have been sent to the institute , menichini said .", "asked whether staff involved in the search could have leaked a memory card to the media , menichini answered with a categorical `` no . 
''", "reichelt told `` erin burnett : outfront '' that he had watched the video and stood by the report , saying bild and paris match are `` very confident '' that the clip is real .", "he noted that investigators only revealed they 'd recovered cell phones from the crash site after bild and paris match published their reports .", "`` that is something we did not know before .", "... overall we can say many things of the investigation were n't revealed by the investigation at the beginning , '' he said .", "what was mental state of germanwings co-pilot ?", "german airline lufthansa confirmed tuesday that co-pilot andreas lubitz had battled depression years before he took the controls of germanwings flight 9525 , which he 's accused of deliberately crashing last week in the french alps .", "lubitz told his lufthansa flight training school in 2009 that he had a `` previous episode of severe depression , '' the airline said tuesday .", "email correspondence between lubitz and the school discovered in an internal investigation , lufthansa said , included medical documents he submitted in connection with resuming his flight training .", "the announcement indicates that lufthansa , the parent company of germanwings , knew of lubitz 's battle with depression , allowed him to continue training and ultimately put him in the cockpit .", "lufthansa , whose ceo carsten spohr previously said lubitz was 100 % fit to fly , described its statement tuesday as a `` swift and seamless clarification '' and said it was sharing the information and documents -- including training and medical records -- with public prosecutors .", "spohr traveled to the crash site wednesday , where recovery teams have been working for the past week to recover human remains and plane debris scattered across a steep mountainside .", "he saw the crisis center set up in seyne-les-alpes , laid a wreath in the village of le vernet , closer to the crash site , where grieving families have left flowers at a simple stone memorial .", "menichini told cnn late tuesday that no visible human remains were left at the site but recovery teams would keep searching .", "french president francois hollande , speaking tuesday , said that it should be possible to identify all the victims using dna analysis by the end of the week , sooner than authorities had previously suggested .", "in the meantime , the recovery of the victims ' personal belongings will start wednesday , menichini said .", "among those personal belongings could be more cell phones belonging to the 144 passengers and six crew on board .", "check out the latest from our correspondents .", "the details about lubitz 's correspondence with the flight school during his training were among several developments as investigators continued to delve into what caused the crash and lubitz 's possible motive for downing the jet .", "a lufthansa spokesperson told cnn on tuesday that lubitz had a valid medical certificate , had passed all his examinations and `` held all the licenses required . 
''", "earlier , a spokesman for the prosecutor 's office in dusseldorf , christoph kumpa , said medical records reveal lubitz suffered from suicidal tendencies at some point before his aviation career and underwent psychotherapy before he got his pilot 's license .", "kumpa emphasized there 's no evidence suggesting lubitz was suicidal or acting aggressively before the crash .", "investigators are looking into whether lubitz feared his medical condition would cause him to lose his pilot 's license , a european government official briefed on the investigation told cnn on tuesday .", "while flying was `` a big part of his life , '' the source said , it 's only one theory being considered .", "another source , a law enforcement official briefed on the investigation , also told cnn that authorities believe the primary motive for lubitz to bring down the plane was that he feared he would not be allowed to fly because of his medical problems .", "lubitz 's girlfriend told investigators he had seen an eye doctor and a neuropsychologist , both of whom deemed him unfit to work recently and concluded he had psychological issues , the european government official said .", "but no matter what details emerge about his previous mental health struggles , there 's more to the story , said brian russell , a forensic psychologist .", "`` psychology can explain why somebody would turn rage inward on themselves about the fact that maybe they were n't going to keep doing their job and they 're upset about that and so they 're suicidal , '' he said .", "`` but there is no mental illness that explains why somebody then feels entitled to also take that rage and turn it outward on 149 other people who had nothing to do with the person 's problems . ''", "germanwings crash compensation : what we know .", "who was the captain of germanwings flight 9525 ?", "cnn 's margot haddad reported from marseille and pamela brown from dusseldorf , while laura smith-spark wrote from london .", "cnn 's frederik pleitgen , pamela boykoff , antonia mortensen , sandrine amiel and anna-maja rappard contributed to this report ."], "summary": ["marseille prosecutor says `` so far no videos were used in the crash investigation '' despite media reports .", "journalists at bild and paris match are `` very confident '' the video clip is real , an editor says .", "andreas lubitz had informed his lufthansa training school of an episode of severe depression , airline says ."], "publication": "CNN", "compression": 22.283333333333335, "coverage": 0.8666666666666667, "density": 4.6} +{"label": [3, 5, 24], "text": ["-lrb- cnn -rrb- the palestinian authority officially became the 123rd member of the international criminal court on wednesday , a step that gives the court jurisdiction over alleged crimes in palestinian territories .", "the formal accession was marked with a ceremony at the hague , in the netherlands , where the court is based .", "the palestinians signed the icc 's founding rome statute in january , when they also accepted its jurisdiction over alleged crimes committed `` in the occupied palestinian territory , including east jerusalem , since june 13 , 2014 . 
''", "later that month , the icc opened a preliminary examination into the situation in palestinian territories , paving the way for possible war crimes investigations against israelis .", "as members of the court , palestinians may be subject to counter-charges as well .", "israel and the united states , neither of which is an icc member , opposed the palestinians ' efforts to join the body .", "but palestinian foreign minister riad al-malki , speaking at wednesday 's ceremony , said it was a move toward greater justice .", "`` as palestine formally becomes a state party to the rome statute today , the world is also a step closer to ending a long era of impunity and injustice , '' he said , according to an icc news release .", "`` indeed , today brings us closer to our shared goals of justice and peace . ''", "judge kuniko ozaki , a vice president of the icc , said acceding to the treaty was just the first step for the palestinians .", "`` as the rome statute today enters into force for the state of palestine , palestine acquires all the rights as well as responsibilities that come with being a state party to the statute .", "these are substantive commitments , which can not be taken lightly , '' she said .", "rights group human rights watch welcomed the development .", "`` governments seeking to penalize palestine for joining the icc should immediately end their pressure , and countries that support universal acceptance of the court 's treaty should speak out to welcome its membership , '' said balkees jarrah , international justice counsel for the group .", "`` what 's objectionable is the attempts to undermine international justice , not palestine 's decision to join a treaty to which over 100 countries around the world are members . ''", "in january , when the preliminary icc examination was opened , israeli prime minister benjamin netanyahu described it as an outrage , saying the court was overstepping its boundaries .", "the united states also said it `` strongly '' disagreed with the court 's decision .", "`` as we have said repeatedly , we do not believe that palestine is a state and therefore we do not believe that it is eligible to join the icc , '' the state department said in a statement .", "it urged the warring sides to resolve their differences through direct negotiations .", "`` we will continue to oppose actions against israel at the icc as counterproductive to the cause of peace , '' it said .", "but the icc begs to differ with the definition of a state for its purposes and refers to the territories as `` palestine . ''", "while a preliminary examination is not a formal investigation , it allows the court to review evidence and determine whether to investigate suspects on both sides .", "prosecutor fatou bensouda said her office would `` conduct its analysis in full independence and impartiality . 
''", "the war between israel and hamas militants in gaza last summer left more than 2,000 people dead .", "the inquiry will include alleged war crimes committed since june .", "the international criminal court was set up in 2002 to prosecute genocide , crimes against humanity and war crimes .", "cnn 's vasco cotovio , kareem khadder and faith karimi contributed to this report ."], "summary": ["membership gives the icc jurisdiction over alleged crimes committed in palestinian territories since last june .", "israel and the united states opposed the move , which could open the door to war crimes investigations against israelis ."], "publication": "CNN", "compression": 17.57894736842105, "coverage": 0.8947368421052632, "density": 3.1052631578947367} +{"label": [0, 6], "text": ["-lrb- cnn -rrb- governments around the world are using the threat of terrorism -- real or perceived -- to advance executions , amnesty international alleges in its annual report on the death penalty .", "`` the dark trend of governments using the death penalty in a futile attempt to tackle real or imaginary threats to state security and public safety was stark last year , '' said salil shetty , amnesty 's secretary general in a release .", "`` it is shameful that so many states around the world are essentially playing with people 's lives -- putting people to death for ` terrorism ' or to quell internal instability on the ill-conceived premise of deterrence . ''", "the report , `` death sentences and executions 2014 , '' cites the example of pakistan lifting a six-year moratorium on the execution of civilians following the horrific attack on a school in peshawar in december .", "china is also mentioned , as having used the death penalty as a tool in its `` strike hard '' campaign against terrorism in the restive far-western province of xinjiang .", "the annual report catalogs the use of state-sanctioned killing as a punitive measure across the globe , and this year 's edition contains some mixed findings .", "on one hand , the number of executions worldwide has gone down by almost 22 % on the previous year .", "at least 607 people were executed around the world in 2014 , compared to 778 in 2013 .", "amnesty 's figures do not include statistics on executions carried out in china , where information on the practice is regarded as a state secret .", "belarus and vietnam , too , do not release data on death penalty cases .", "`` the long-term trend is definitely positive -- we are seeing a decrease in the number of executions -lrb- worldwide -rrb- , '' audrey gaughran , amnesty 's director of global issues , told cnn .", "`` a number of countries are closer to abolition , and there are some signs that some countries will be abolitionist by 2015 .", "-lrb- there are -rrb- signals of a world that is nearing abolition . ''", "while the report notes some encouraging signs , it also highlights a marked increase in the number of people sentenced to death in 2014 .", "at least 2,466 people globally are confirmed to have been handed the sentence last year , an increase of 28 % compared with 2013 .", "the report notes that the spike in sentencing is attributable to mass-sentencing in countries including egypt and nigeria , `` against scores of people in some cases . 
''", "the organization found `` positive developments '' worldwide , with most regions seeming to show reductions in the number of executions .", "opinion : sharp spike in death sentences .", "sub-saharan africa , for example , saw a 28 % fall in reported cases , and executions recorded in the middle east and north africa were down 23 % compared to 2013 .", "`` even though we 've highlighted some of the negative developments ... i think we would always highlight that there are positive developments , '' gaughran said .", "`` across the board , with the exception of europe and central asia there were fewer reports of executions in every region . ''", "the resumption of the use of capital punishment in belarus -- the only country in europe and central asia to execute people -- after a two year hiatus spoiled an near-universal decrease in countries using the death penalty by region .", "the united states has the dubious distinction of being the only country in the americas to conduct executions , but the number of convicts put to death here fell slightly , from 39 in 2013 to 35 in 2014 .", "the state of washington also imposed a moratorium on executions last year .", "the u.s. remains one of the worst offenders for imposing capital punishment , with only iran -lrb- 289 + -rrb- , iraq -lrb- 61 + -rrb- , and saudi arabia -lrb- 90 + -rrb- executing more people in 2014 .", "while figures are not available , amnesty estimates that china also executes `` thousands '' of prisoners each year , `` more than the rest of the world put together . ''", "the report also highlights the imperfections in the judiciary processes that lead to many sentenced to death .", "`` in the majority of countries where people were sentenced to death or executed , the death penalty was imposed after proceedings that did not meet international fair trial standards , '' the report stated .", "`` in 2014 amnesty international raised particular concerns in relation to court proceedings in afghanistan , bangladesh , china , egypt , iran , iraq , north korea , pakistan , saudi arabia and sri lanka . ''", "the united nations secretary-general , ban ki-moon , last year stressed the need to move toward abolition of capital punishment .", "`` the taking of life is too irreversible for one human being to inflict it on another , '' he said , in marking world day against death penalty in october .", "`` we must continue to argue strongly that the death penalty is unjust and incompatible with fundamental human rights . 
''", "amnesty estimates that at least 19,094 people were believed to be on death row at the end of 2014 ."], "summary": ["amnesty 's annual death penalty report catalogs encouraging signs , but setbacks in numbers of those sentenced to death .", "organization claims that governments around the world are using the threat of terrorism to advance executions .", "the number of executions worldwide has gone down by almost 22 % compared with 2013 , but death sentences up by 28 % ."], "publication": "CNN", "compression": 14.841269841269842, "coverage": 0.8888888888888888, "density": 5.079365079365079} +{"label": [8, 9, 34], "text": ["-lrb- cnn -rrb- on may 28 , 2014 , some 7,000 people gathered in a stadium in china 's northwestern xinjiang region .", "but they had not come to watch the local football team or any other grand sporting event .", "instead , the authorities paraded scores of prisoners dressed in orange jumpsuits .", "armed soldiers guarded the exits .", "in the patently unfair , open air trial that followed , 55 people were found guilty of a range of offenses linked to violent attacks in the region and jailed .", "three were sentenced to death .", "the public mass sentencing was part a china 's `` strike hard '' campaign against unrest in xinjiang , a campaign the government claims was launched to combat `` terrorism '' and `` separatism . ''", "but it was also indicative of a trend that was starkly evident last year around the world -- governments using the death penalty in a misguided , and often cynical , attempt to tackle crime and terrorism .", "today , amnesty international releases its annual review of the death penalty worldwide .", "much of it makes for grim reading .", "in pakistan , the government lifted a six-year moratorium on the execution of civilians in the wake of the horrific taliban attack on a school in peshawar in december .", "more than 60 people have been put to death since , and the government has threatened to send thousands more death row prisoners to the gallows .", "iran and iraq executed people for `` terrorism , '' and other countries expanded the scope of capital crimes in their penal codes .", "in a year when abhorrent summary executions by armed groups were branded on the global consciousness as never before , governments are themselves resorting to more executions in a knee-jerk reaction to terrorism .", "other countries made use of executions in similarly flawed attempts to address -- or appear to address -- crime rates .", "jordan ended an eight-year moratorium in december , putting 11 murder convicts to death , with the government saying it was a move to end a surge in violent crime .", "in indonesia , authorities announced plans to execute mainly drug traffickers to tackle a public safety `` national emergency . 
''", "six people have already been executed this year .", "a sharp spike in death sentences recorded in 2014 -- up more than 500 on the previous year -- can also be attributed to governments using the death penalty as a political tool .", "the rise was largely because of developments in egypt and nigeria , where courts imposed hundreds of death sentences in the context of internal political instability or crime and armed conflict .", "the simple fact is that governments using the death penalty to tackle crime and security threats are deceiving themselves or the public or both .", "there is no evidence that the threat of execution is more of a deterrent to crime than a prison sentence , as united nations and other studies have repeatedly confirmed .", "it is high time that world leaders stop using the death penalty as an easy way out when times get tough .", "at amnesty international , we have campaigned for an end to the death penalty for decades .", "thankfully , most of the world now appears to agree with us .", "the numbers speak for themselves .", "in 1945 when the united nations was founded , only eight countries had abolished the death penalty .", "today , 140 states are abolitionist in law or practice .", "last year , we recorded executions in 22 countries , down by almost a half from 20 years ago .", "despite the troubling developments we recorded last year , there was still much good news to be found .", "the number of executions recorded around the world dropped significantly in 2014 compared with the previous year , from 778 to 607 .", "this number does not include china , where more people are put to death than the rest of the world put together , but with death penalty statistics treated as a state secret , the true figure is impossible to determine .", "executions were recorded in only three countries in sub-saharan africa -- equatorial guinea , somalia and sudan -- and the number of people put to death went down by more than a quarter .", "the americas continued to be execution-free , apart from the united states .", "those governments that still execute need to realize that they are on the wrong side of history .", "they must join the vast majority of countries which have dropped the ultimate cruel punishment .", "fighting for an end to the death penalty remains an uphill task , but all of us must try to make the world free of this punishment .", "with determination , i know that we can achieve this goal ."], "summary": ["amnesty international releases its annual review of the death penalty worldwide ; much of it makes for grim reading .", "salil shetty : countries that use executions to deal with problems are on the wrong side of history ."], "publication": "CNN", "compression": 20.85, "coverage": 0.825, "density": 6.375} +{"label": [2, 3], "text": ["-lrb- cnn -rrb- seventy years ago , anne frank died of typhus in a nazi concentration camp at the age of 15 .", "just two weeks after her supposed death on march 31 , 1945 , the bergen-belsen concentration camp where she had been imprisoned was liberated -- timing that showed how close the jewish diarist had been to surviving the holocaust .", "but new research released by the anne frank house shows that anne and her older sister , margot frank , died at least a month earlier than previously thought .", "researchers re-examined archives of the red cross , the international training service and the bergen-belsen memorial , along with testimonies of survivors .", "they concluded that anne and margot probably did not survive to 
march 1945 -- contradicting the date of death which had previously been determined by dutch authorities .", "in 1944 , anne and seven others hiding in the amsterdam secret annex were arrested and sent to the auschwitz-birkenau concentration camp .", "anne frank 's final entry .", "that same year , anne and margot were separated from their mother and sent away to work as slave labor at the bergen-belsen camp in germany .", "days at the camp were filled with terror and dread , witnesses said .", "the sisters stayed in a section of the overcrowded camp with no lighting , little water and no latrine .", "they slept on lice-ridden straw and violent storms shredded the tents , according to the researchers .", "like the other prisoners , the sisters endured long hours at roll call .", "her classmate , nannette blitz , recalled seeing anne there in december 1944 : `` she was no more than a skeleton by then .", "she was wrapped in a blanket ; she could n't bear to wear her clothes anymore because they were crawling with lice . ''", "listen to anne frank 's friends describe her concentration camp experience .", "as the russians advanced further , the bergen-belsen concentration camp became even more crowded , bringing more disease .", "a deadly typhus outbreak caused thousands to die each day .", "typhus is an infectious disease caused by lice that breaks out in places with poor hygiene .", "the disease causes high fever , chills and skin eruptions .", "`` because of the lice infesting the bedstraw and her clothes , anne was exposed to the main carrier of epidemic typhus for an extended period , '' museum researchers wrote .", "they concluded that it 's unlikely the sisters survived until march , because witnesses at the camp said the sisters both had symptoms before february 7 .", "`` most deaths caused by typhus occur around twelve days after the first symptoms appear , '' wrote authors erika prins and gertjan broek .", "the exact dates of death for anne and margot remain unclear .", "margot died before anne .", "`` anne never gave up hope , '' said blitz , her friend .", "`` she was absolutely convinced she would survive . 
''", "her diary endures as one of the world 's most popular books .", "read more about anne frank 's cousin , a keeper of her legacy ."], "summary": ["museum : anne frank died earlier than previously believed .", "researchers re-examined archives and testimonies of survivors .", "anne and older sister margot frank are believed to have died in february 1945 ."], "publication": "CNN", "compression": 14.864864864864865, "coverage": 0.8378378378378378, "density": 2.189189189189189} +{"label": [1, 2, 10, 14, 19], "text": ["it is a week which has seen him in deep water - both on and off the pitch .", "just days after dallas cowboys ' greg hardy was suspended from 10 nfl games he appeared to get into trouble when he drove his luxury car through flash floods in dallas , getting stuck when the car could not make it through the rising , fast flowing waters .", "the 25-year-old was forced to abandon his bentley , leaving it stranded until the waters receded and the car could be towed away .", "it took the tow truck several hours to successfully remove the car and hardy was later seen returning to the vehicle to collect some of his possessions .", "he left in another luxury car , a white ferrari .", "scroll down for video .", "greg hardy found himself in more deep water when he was forced to abandon his bentley in flash floods .", "the problem with his car comes as more bad news for hardy who was suspended by the nfl just days ago after an incident of domestic abuse that allegedly occurred last year .", "hardy , who signed with the dallas cowboys last month , will be forced to sit out the first 10 games of the season and will not receive his salary for these games .", "last year hardy , 25 , was convicted by a judge in charlotte , north carolina of beating , strangling and threatening to kill his ex-girlfriend , nicki holder .", "those charges were later dropped on an appeal when holder could not be located to testify .", "a two month investigation by the nfl followed and officials decided he had to be suspended .", "hardy was informed in a letter from nfl commissioner roger goodell that the probe determined there was ` sufficient credible evidence that hardy engaged in conduct that violated nfl policies in multiple respects . '", "hardy was dropped by his previous team , the carolina panthers , because of these charges last season , but was still able to collect his salary during that time , which was roughly $ 770,000 a week .", "hardy previously played for the carolina panthers but was dropped after allegations of domestic abuse emerged and was then signed by dallas cowboys and suspended for 10 games by the nfl .", "hardy is seen talking to officials after his bentley got stuck in flash floods in dallas this week . 
'", "i understand that i need to step away from football right now and take care of this legal matter , ' hardy said in a statement after he was cut from the panthers .", "the panthers had originally agreed to wait to take action until hardy had a jury trial regarding the incident in may .", "his previous conviction was the result of a bench trial .", "a jury trial ultimately led to all charges being dropped .", "holder told police that hardy choked her , slammed her against a bathtub , threw her to the floor and threatened to kill her after a fight at his charlotte condo .", "the dallas cowboys star was seen attempting to drive his bentley during the floods , but had to abandon it .", "it took officials and a tow truck several hours to pull the luxury bentley free from dallas flood waters .", "this all came at a time when the league was under heavy scrutiny in the wake of two abuse scandals involving stars ray rice and adrian peterson .", "many were upset with the punishments those two received , feeling the nfl was too lenient .", "video of rice punching then-fianc\u00e9e janay palmer went public last monday , and peterson was indicted on charges of reckless or negligent injury to a child on friday for an incident in which he hit his son with a switch back in may .", "hardy -lrb- above -rrb- was convicted by a judge last july of beating , strangling and threatening to kill ex-girlfriend nicki holder .", "the nfl announced that hardy would be suspended without pay for 10 games at the start of the 2015 season .", "holder -lrb- above with hardy -rrb- told police that he choked her , slammed her against a bathtub , threw her to the floor and threatened to kill her after a fight at his condo .", "rice was definitely suspended from the nfl and had his contract terminated by the baltimore ravens , while peterson , who was sidelined by the minnesota vikings last sunday , has now been suspended by the team .", "both men are expected by many to return to play in the 2015 , with peterson back on the vikings after an nfl decision and rice winning a wrongful termination suit during the off-season .", "rice even pocketed roughly $ 1.6 million in back pay ."], "summary": ["hardy was convicted of domestic abuse against ex-girlfriend nicki holder and was suspended from the dallas cowboys for 10 days by the nfl .", "charges were eventually dropped after holder could not be located when hardy 's lawyers appealed the decision and asked for a jury trial .", "this week he got stuck in his bentley in deep flash flood waters in dallas .", "hardy was forced to abandon his car and it was towed away hours later ."], "publication": "DailyMail", "compression": 9.845238095238095, "coverage": 0.9047619047619048, "density": 2.3333333333333335} +{"label": [1, 2], "text": ["an hiv self-testing kit is on sale for the first time in the uk .", "the 99.7 per cent accurate biosure hiv self test enables people to test themselves when and where they like .", "an estimated 26,000 people in the uk have hiv but are unaware of it and may be transmitting the disease to others .", "the 99.7 per cent accurate biosure hiv self test enables people to test themselves when and where they like .", "the testing kit , on sale online , uses a small amount of blood from a finger-prick sample to detect the presence of hiv antibodies , giving a result in just 15 minutes .", "treatments available mean hiv is now a manageable disease -- but late diagnosis can have a devastating impact on health and life expectancy .", "the national aids trust 
warns that 40 per cent of those living with hiv remain undiagnosed for at least four years , with those diagnosed late 11 times more likely to die in the first year after diagnosis .", "the testing kit , on sale online , uses a small amount of blood from a finger-prick sample to detect the presence of hiv antibodies , giving a result in just 15 minutes .", "biosure founder brigette bard said it is a significant step towards normalising hiv testing , adding : ` knowing your hiv status is critical and the launch of this product will empower people to discreetly test themselves when it is convenient to them and in a place where they feel comfortable . '", "positive test results need to be confirmed by a healthcare professional and those in high-risk groups are recommended to be tested every three months .", "the only alternative currently available is ` home sampling ' , which involves collecting a blood sample 160 times larger than that for the self-test and posting it to a laboratory , with results given five days later .", "biosure founder brigette bard said it is a significant step towards normalising hiv testing ."], "summary": ["the 99.7 per cent accurate biosure hiv self test enables people to test themselves when and where they like .", "an estimated 26,000 people in the uk have hiv but are unaware of it .", "treatments available mean hiv is now a manageable disease ."], "publication": "DailyMail", "compression": 7.468085106382978, "coverage": 0.9574468085106383, "density": 14.446808510638299} +{"label": [4, 10, 15], "text": ["everyone knows the tortoise beat the hare , but this little fellow has gone one better and beaten two cheetahs .", "these pictures capture the amazing moment when one of the notoriously slow-moving reptiles escaped becoming big cat fast food by retreating into its shell before scuttling off across desert sands .", "the baffled cheetahs surrounded the tortoise and attempted to scare it out of its shell with snarls but the reptile kept well tucked up inside its tough exterior forcing the big cats to wander off in search of another snack .", "hard target : the tortoise attempts a quick getaway under the watchful eye of one of the curious cheetahs .", "confused : the two cheetahs exchange glances as they move in to size up their potential meal .", "the intriguing scene was captured by john mullineux , a chemical engineer from secunda , south africa .", "he said : ` while driving on the sandy tracks of the kalahari desert in south africa , i came across two cheetahs lying in the shade near the road .", "` shortly after i stopped , they got up and slowly headed to the dunes .", "` halfway up the red sandy dune the younger one stopped to inspect a tortoise , the older one also stopped and tried to bite the shell but could n't manage it .", "now you see me : the tortoise retreats into its shell as the big cats get too close for comfort .", "snarl : one of the cheetahs gets up close and personal to the little reptile and tries to scare it out of its shell .", "` by the time the older cheetah had made it to the top of the dune , the younger one decided to run off and follow rather than spend more time at the hard meal .", "` the tortoise then casually moved on as if nothing unusual had happened .", "from a young age i have loved cheetahs for their elegance and speed - seeing two so close was dream but seeing them size up their lunch was unique .", "` it was something that was both exciting and naturally beautiful at the same time . 
'", "slow and steady : the tortoise continues his escape across the sands of the kalahari desert in south africa .", "john mullineux , a chemical engineer from secunda , south africa , spotted the scene while driving along a desert track .", "one of the cheetahs appears to admit defeat and wander off throwing a last glance of its shoulder at the lucky tortoise ."], "summary": ["amazing scene captured on film in south africa 's kalahari desert .", "two of the big cats approach the little reptile as it scuttled across the sands .", "but they were denied their meal and forced to wander off disappointed ."], "publication": "DailyMail", "compression": 10.209302325581396, "coverage": 0.7674418604651163, "density": 1.4651162790697674} +{"label": [4, 9, 33], "text": ["angus hawley 's brother has spoken of his shock after his brother , the ex-husband of antonia kidman , died of a suspected heart attack , age 46 , in new york on saturday .", "speaking to daily mail australia on monday , david hawley said : ` it 's a real shock , he was one of the fittest men i 've ever met -- he 's swimming everyday . '", "responding to a question about whether angus had a history of heart problems , david answered : ` no , no , not that we know of ' , adding : ` he 's so fit , i do n't understand . '", "scroll down for video .", "` he did n't have heart problems ' angus hawley 's brother reveals shock after ex-husband of antonia kidman dies from a suspected heart attack in new york after ` returning from a swim ' .", "angus and antonia pictured together in 2005 at the chuan spa opening in the langham hotel .", "mr hawley , who was in new york attending a business conference at the time , collapsed after returning from a swim .", "` he did go for a big swim in the morning , he trains very hard , ' david said of his brother , who he described as a ` bit of a fitness fanatic ' and was known to lead a healthy and active lifestyle . '", "i think his body clock was round the wrong way and it just got everything round the wrong way and he 's over done it . '", "mr hawley was a father to four children , lucia , 16 , hamish , 14 , james , 12 , and sybella , eight , all of whom he shared with nicole kidman 's sister antonia before their 2007 split .", "the children are reportedly set to join the family in sydney as they rally around david 's second wife prue fisher , who he married in palm beach in 2011 .", "sad news : antonia kidman 's former husband angus hawley has died of a suspected heart attack aged 46 in new york .", "the pair are seen here in 2003 .", "fitness fanatic : mr hawley 's brother says he does n't ` understand ' the death of his fit and healthy brother , pictured with his wife prue fisher in 2011 .", "led an active lifestyle : mr hawley , 46 , is believed to have suffered a heart attack after returning from a swim .", "the former couple are pictured above with antonia 's parents janelle and the late dr. antony kidman .", "david described his brother , a business development manager at valor private wealth , as ` one of the most beautiful men that i have ever known .", "` he is absolutely adored by everybody , he made everybody feel like he 's their best friend and that 's why everybody loved him .", "and he loved everybody else , it 's just a really emotional time . 
'", "prue is being comforted by her family in sydney , after they traveled from orange in new south wales to be by her side .", "she was reportedly seen at the bondi icebergs public pool , a place her late husband often frequented , on sunday .", "moved on : both antonia and mr hawley remarried following their divorce in 2007 - she to businessman craig marran -lrb- l -rrb- in 2010 , and he to sydney fashion boutique manager prue the following year -lrb- r -rrb- .", "david described prue as ` devastated ' saying she 's ` terrible , terrible ' , adding , ` it 's a huge hole in our lives .", "` they were absolutely devoted to each other and prue 's relationship with angus 's children was fantastic , ' said david of his late brother 's wife .", "` his wife adores him , and he adored her , his four children , it 's just so sad .", "it 's a tragic loss to our family and to his family , it 's just a nightmare .", "` no matter what happens for the rest of her life , she 'll still be my sister-in-law . '", "on saturday another of angus 's brothers phillip released a statement , describing his death as ` sudden ' and ` very unexpected ' to news.com.au .", "wedding day : antonia and angus wed in 1996 , they were together for 11 years before their divorced was finalised in 2007 .", "legacy : the 46-year-old was a father to four children in lucia , 16 , hamish , 14 , james , 12 , and sybella , eight , all of whom he shared with nicole kidman 's sister antonia , pictured .", "` there are no further details at this time as it only occurred last night , our time , ' the statement read .", "reports about his death have as yet been mixed , with news.com.au saying that mr hawley went to dinner with a friend in new york and then went into cardiac arrest .", "he is said to have later passed away in the ambulance on the way to hospital .", "mr hawley 's death comes less than seven months after the sudden passing of nicole and antonia 's father dr. antony kidman , who also suffered a suspected heart attack , in singapore .", "family tragedy : mr hawley 's death comes less than seven months after the sudden passing of nicole and antonia 's father dr. antony , who also suffered a heart attack , in singapore .", "both 44-years-old antonia and her ex husband both remarried following their divorce in 2007 - she to businessman craig marran in 2010 , and he to sydney fashion boutique manager prue , the following year .", "he has kept himself largely out of the spotlight following his split from antonia and a battle with depression .", "the father of four checked himself into a sydney rehab clinic in 2007 following a period of mental health issues .", "tragic : antonia 's second husband craig marran accompanied her , her sister nicole and husband keith urban to dr. antony 's funeral in september last year .", "he told woman 's day in 2009 : ' i was depressed , out of control and full of self-loathing , and i resorted to drugs to get through it . '", "i was n't in a happy place and it was an appalling thing , but i was sick , and at least i was big enough to do something about it . '", "merivale hotel founder justin hemmes , has paid tribute to his good friend angus , explaining to the daily telegraph that the pair became friends at just four years old .", "family man : dr. 
antony kidman was visiting antonia and her family in singapore when he passed away .", "day of mourning : antonia 's six children lucia , hamish , james , sybella , nicholas , two , and alexander , one , attended the funeral along with nicole 's daughters sunday rose and faith .", "support : keith and craig acted as pallbearers at the funeral , as did family friends russell crowe and channel nine newsreader peter overton .", "` he was my next door neighbour but quickly became a best friend , one i was fortunate enough to have by my side ever since , ' he said , describing mr hawley as ` the most caring , thoughtful and loving man . '", "` the most loving father to his four wonderful children and adoring wife .", "his family was his treasure .", "his kids were his life , ' he continued .", "mr hawley 's death is the second devastating loss the kidman family has suffered in the past seven months , after dr. antony kidman sadly collapsed and died in a singapore hotel last september at the age of 75 .", "family photo : antonia , janelle , dr. antony and nicole are seen here in 1990 .", "nicole said at his funeral she was ` so lucky ' to be her father 's daughter .", "close knit : nicole and antonia are pictured here with their late father in 1990 .", "a respected sydney psychologist , dr. antony was in the country visiting antonia and his six grandchildren .", "antonia , a journalist and writer , is currently based in singapore with her second husband with whom she shares two sons , nicholas , two , and alexander , one .", "she remembered the close relationship she had with her father at his funeral last year and said they were ` similar in many ways ' .", "new home : antonia resides in singapore with second husband craig .", "she 's pictured here with nicole , who lives in nashville with keith urban , in 2005 .", "` i 'm so lucky to be his daughter , ' 47-year-old nicole said , ` and that he chose my mother to make me with . '", "appearing on ellen last october , nicole said husband keith urban had to carry her , sometimes literally , because she was ` so devastated ' by the loss .", "daily mail australia has contacted the kidman family 's management .", "tribute : a good friend of mr hawley , merivale founder justin hemmes has described him as ` the most caring , thoughtful and loving man '"], "summary": ["angus hawley 's brother said his late sibling ` did n't have heart problems ' he is reported to have had a suspected heart attack in new york .", "angus was a father of four children - lucia , hamish , james and sybella .", "he had all four with nicole kidman 's sister antonia before their 2007 split .", "both 44-year-old antonia and angus , 46 , remarried following their divorce .", "angus ' death comes seven months after dr. 
antony kidman 's death .", "nicole and antonia 's father also died of a heart attack in singapore ."], "publication": "DailyMail", "compression": 15.157407407407407, "coverage": 0.9259259259259259, "density": 3.740740740740741} +{"label": [7, 17], "text": ["despite the hype surrounding its first watch , the iphone is still the engine behind apple 's phenomenal success , its latest figures have revealed .", "the results far surpassed most analysts ' expectations for the first three months of the year , when sales traditionally fall from their holiday-season peak .", "apple sold more than 61 million iphones in the quarter , accounting for more than two-thirds of its $ 58 billion in revenue for the quarter and the lion 's share of its $ 13.6 billion in profit - and up 40 % from a year ago .", "sales of iphones in china were also revealed to have outstripped those in the us .", "apple sold more than 61 million iphones in the quarter , accounting for more than two-thirds of its $ 58 billion in revenue for the quarter and the lion 's share of its $ 13.6 billion in profit .", "$ 58 billion in revenue , $ 13.6 billion in profit .", "$ 200 billion in cash , up from around $ 150 billion a year ago .", "more than 61 million iphones sole .", "ipad revenue fell 29 % to $ 5.4 billion .", "revenue from mac computers rose 2 % from a year earlier , to $ 5.6 billion .", "` we are thrilled by the continued strength of iphone , mac and the app store , which drove our best march quarter results ever , ' said tim cook , apple 's ceo .", "` we 're seeing a higher rate of people switching to iphone than we 've experienced in previous cycles , and we 're off to an exciting start to the june quarter with the launch of apple watch . '", "as expected , the numbers were down from the previous quarter , when holiday shoppers snapped up a record 74 million of apple 's new iphone 6 , 6 plus and older models .", "but it was a 40 percent increase over the number of iphones sold in the first three months of 2014 .", "` we 're seeing great results all over the world , ' apple chief financial officer luca maestri told the associated press , adding that iphone sales grew 72 percent in china , where the company has big hopes for expansion .", "other products played a much smaller role .", "revenue from mac computers rose 2 percent from a year earlier , to $ 5.6 billion , while ipad revenue fell 29 percent , to $ 5.4 billion -- continuing a steady decline in tablet sales .", "apple did n't report any results for the new apple watch , which it began selling this month , after the quarter ended .", "maestri said customer response had been ` positive . '", "analysts estimate about 2 million have sold to date , suggesting early demand is healthy but not of blockbuster proportions .", "apple shares have gained more than 50 percent over the last year , making it the world 's most valuable company .", "` it 's been really great to see the reaction of customers , ' said cook .", "` the response has been overwhelming .", "we ca n't wait to see more of the inspiring apps developers dream up . 
'", "the iphone is another story .", "since it began offering models with bigger screens last fall , apple has vied with south korea 's samsung for the no.", "1 position in the global smartphone market .", "by some estimates , apple outsold samsung in the quarter that ended in december , and analysts will be watching closely when samsung reports its latest results this week .", "apple also announced an expansion of its effort to return more of its sizable cash war chest to investors .", "the company said it will raise its quarterly dividend by 11 percent , to 52 cents a share , and has increased a $ 90 billion stock buyback program to $ 140 billion .", "apple did n't report any results for the new apple watch , which it began selling this month , after the quarter ended .", "in total , the company said the program will return $ 200 billion to investors by the end of march 2017 .", "as iphone sales have surged , so has apple 's stock .", "apple shares have gained more than 50 percent over the last year , making it the world 's most valuable company .", "the stock closed monday at $ 132.65 , up 1.8 percent for the day , and was rising in late trading .", "the iphone is n't just apple 's ` dominant product , ' said frank gillett , a tech industry analyst at forrester research .", "` it 's more than anything else what 's driving the success of their company . '", "market researchers , however , expect growth in the world smartphone market will slow this year , particularly at the higher price range where apple competes , as most consumers in developed countries have already bought one .", "that could make it difficult for apple to maintain its recent pace .", "` they 're extremely dependent on the iphone , ' said investment colin gillis at bgc partners .", "` at some point , the market dynamics change , ' he said , adding that ` the question is what could replace the iphone ' if sales begin to slow .", "customers looking at apple iphones in an apple store in shanghai , china , on january 14 , 2014 .", "apple ceo tim cook has said he 's optimistic about new markets such as china , where apple has made a strong showing against samsung and china 's xiaomi .", "and even if apple is increasingly selling new iphones to people who are simply upgrading older models , ` that 's still a pretty healthy market , ' said gartner analyst van baker , noting that more than 700 million iphones have been sold since the first model was introduced in 2007 .", "maestri also stressed the potential for new products like apple watch and apple pay , the company 's mobile payment service .", "while these currently provide minimal revenue , analysts say they have big potential .", "and they are designed to work closely with the iphone , which means each may bolster the other 's popularity in the future , gillett said ."], "summary": ["apple sold more than 61 million iphones in the quarter .", "apple did n't report any results for the new apple watch .", "believed around 2 million watches have been sold , according to estimates ."], "publication": "DailyMail", "compression": 28.657894736842106, "coverage": 0.868421052631579, "density": 6.342105263157895} diff --git a/test/data_for_tests/io/cnndm/vocab b/test/data_for_tests/io/cnndm/vocab new file mode 100644 index 00000000..26e83ade --- /dev/null +++ b/test/data_for_tests/io/cnndm/vocab @@ -0,0 +1,100 @@ +. 
12172211 +the 11896296 +, 9609022 +to 5751102 +a 5100569 +and 4892246 +of 4867879 +in 4431149 +'s 2202754 +was 2086001 +for 1995054 +that 1944328 +' 1880335 +on 1858606 +` 1821696 +is 1797908 +he 1678396 +it 1603145 +with 1497568 +said 1348297 +: 1344327 +his 1302056 +at 1260578 +as 1230256 +i 1089458 +by 1064355 +have 1016505 +from 1015625 +has 969042 +her 935151 +be 932950 +'' 904149 +`` 898933 +but 884494 +are 865728 +she 850971 +they 816011 +an 766001 +not 738121 +had 725375 +who 722127 +this 721027 +after 669231 +were 655187 +been 647432 +their 645014 +we 625684 +will 577581 +when 506811 +-rrb- 501827 +n't 499765 +-lrb- 497508 +one 490666 +which 465040 +you 461359 +-- 460450 +up 437177 +more 433177 +out 432343 +about 428037 +would 400420 +- 399113 +or 399001 +there 389590 +people 386121 +new 380970 +also 380041 +all 350670 +two 343787 +can 341110 +him 338345 +do 330166 +into 319067 +last 315857 +so 308507 +than 306701 +just 305759 +time 302071 +police 301341 +could 298919 +told 298384 +over 297568 +if 297292 +what 293759 +years 288999 +first 283683 +no 274488 +my 273829 +year 272392 +them 270715 +its 269566 +now 262011 +before 260991 +mr 250970 +other 247663 +some 245191 +being 243458 +home 229570 +like 229425 +did 227833 diff --git a/test/data_for_tests/io/conll2003/dev.txt b/test/data_for_tests/io/conll2003/dev.txt new file mode 100644 index 00000000..90834721 --- /dev/null +++ b/test/data_for_tests/io/conll2003/dev.txt @@ -0,0 +1,49 @@ +-DOCSTART- -X- -X- O + +CRICKET NNP B-NP O +- : O O +LEICESTERSHIRE NNP B-NP B-ORG +TAKE NNP I-NP O +OVER IN B-PP O +AT NNP B-NP O +TOP NNP I-NP O +AFTER NNP I-NP O +INNINGS NNP I-NP O +VICTORY NN I-NP O +. . O O + +LONDON NNP B-NP B-LOC +1996-08-30 CD I-NP O + +Phil NNP B-NP B-PER +Simmons NNP I-NP I-PER +took VBD B-VP O +four CD B-NP O +for IN B-PP O +38 CD B-NP O +on IN B-PP O +Friday NNP B-NP O +as IN B-PP O +Leicestershire NNP B-NP B-ORG +beat VBD B-VP O +Somerset NNP B-NP B-ORG +by IN B-PP O +an DT B-NP O +innings NN I-NP O +and CC O O +39 CD B-NP O +runs NNS I-NP O +in IN B-PP O +two CD B-NP O +days NNS I-NP O +to TO B-VP O +take VB I-VP O +over IN B-PP O +at IN B-PP O +the DT B-NP O +head NN I-NP O +of IN B-PP O +the DT B-NP O +county NN I-NP O +championship NN I-NP O +. . O O diff --git a/test/data_for_tests/io/conll2003/test.txt b/test/data_for_tests/io/conll2003/test.txt new file mode 100644 index 00000000..b5b3aef0 --- /dev/null +++ b/test/data_for_tests/io/conll2003/test.txt @@ -0,0 +1,51 @@ +-DOCSTART- -X- -X- O + +SOCCER NN B-NP O +- : O O +JAPAN NNP B-NP B-LOC +GET VB B-VP O +LUCKY NNP B-NP O +WIN NNP I-NP O +, , O O +THE NP B-NP B-PER +CHINA NNP I-NP I-PER +IN IN B-PP O +SURPRISE DT B-NP O +DEFEAT NN I-NP O +. . O O + +Nadim NNP B-NP B-PER +Ladki NNP I-NP I-PER + +AL-AIN NNP B-NP B-LOC +, , O O +United NNP B-NP B-LOC +Arab NNP I-NP I-LOC +Emirates NNPS I-NP I-LOC +1996-12-06 CD I-NP O + +Japan NNP B-NP B-LOC +began VBD B-VP O +the DT B-NP O +defence NN I-NP O +of IN B-PP O +their PRP$ B-NP O +Asian JJ I-NP B-MISC +Cup NNP I-NP I-MISC +title NN I-NP O +with IN B-PP O +a DT B-NP O +lucky JJ I-NP O +2-1 CD I-NP O +win VBP B-VP O +against IN B-PP O +Syria NNP B-NP B-LOC +in IN B-PP O +a DT B-NP O +Group NNP I-NP O +C NNP I-NP O +championship NN I-NP O +match NN I-NP O +on IN B-PP O +Friday NNP B-NP O +. . 
O O diff --git a/test/data_for_tests/io/conll2003/train.txt b/test/data_for_tests/io/conll2003/train.txt new file mode 100644 index 00000000..4f0c4bf2 --- /dev/null +++ b/test/data_for_tests/io/conll2003/train.txt @@ -0,0 +1,48 @@ +-DOCSTART- -X- -X- O + +EU NNP B-NP B-ORG +rejects VBZ B-VP O +German JJ B-NP B-MISC +call NN I-NP O +to TO B-VP O +boycott VB I-VP O +British JJ B-NP B-MISC +lamb NN I-NP O +. . O O + +Peter NNP B-NP B-PER +Blackburn NNP I-NP I-PER + +BRUSSELS NNP B-NP B-LOC +1996-08-22 CD I-NP O + +The DT B-NP O +European NNP I-NP B-ORG +Commission NNP I-NP I-ORG +said VBD B-VP O +on IN B-PP O +Thursday NNP B-NP O +it PRP B-NP O +disagreed VBD B-VP O +with IN B-PP O +German JJ B-NP B-MISC +advice NN I-NP O +to TO B-PP O +consumers NNS B-NP O +to TO B-VP O +shun VB I-VP O +British JJ B-NP B-MISC +lamb NN I-NP O +until IN B-SBAR O +scientists NNS B-NP O +determine VBP B-VP O +whether IN B-SBAR O +mad JJ B-NP O +cow NN I-NP O +disease NN I-NP O +can MD B-VP O +be VB I-VP O +transmitted VBN I-VP O +to TO B-PP O +sheep NN B-NP O +. . O O diff --git a/test/data_for_tests/io/coreference/coreference_dev.json b/test/data_for_tests/io/coreference/coreference_dev.json new file mode 100644 index 00000000..bb6592d3 --- /dev/null +++ b/test/data_for_tests/io/coreference/coreference_dev.json @@ -0,0 +1 @@ +{"doc_key": "bc/cctv/00/cctv_0000_0", "speakers": [["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"]], "clusters": [[[70, 70], [485, 486], [500, 500], [73, 73], [55, 55], [153, 154], [366, 366]]], "sentences": [["In", "the", "summer", "of", "2005", ",", "a", "picture", "that", "people", "have", "long", "been", "looking", "forward", "to", "started", "emerging", "with", "frequency", "in", "various", "major", "Hong", "Kong", "media", "."], ["With", "their", "unique", "charm", ",", "these", "well", "-", "known", "cartoon", "images", "once", "again", "caused", "Hong", "Kong", "to", "be", "a", "focus", "of", "worldwide", "attention", "."]]} diff --git a/test/data_for_tests/io/coreference/coreference_test.json b/test/data_for_tests/io/coreference/coreference_test.json new file mode 100644 index 00000000..9577da0e --- /dev/null +++ b/test/data_for_tests/io/coreference/coreference_test.json @@ -0,0 +1 @@ +{"doc_key": "bc/cctv/00/cctv_0005_0", "speakers": [["speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1"], ["speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", 
"speaker#1"]], "clusters": [[[57, 59], [25, 27], [42, 44]]], "sentences": [["--", "basically", ",", "it", "was", "unanimously", "agreed", "upon", "by", "the", "various", "relevant", "parties", "."], ["To", "express", "its", "determination", ",", "the", "Chinese", "securities", "regulatory", "department", "compares", "this", "stock", "reform", "to", "a", "die", "that", "has", "been", "cast", "."]]} \ No newline at end of file diff --git a/test/data_for_tests/io/coreference/coreference_train.json b/test/data_for_tests/io/coreference/coreference_train.json new file mode 100644 index 00000000..0c2940df --- /dev/null +++ b/test/data_for_tests/io/coreference/coreference_train.json @@ -0,0 +1 @@ +{"doc_key": "bc/cctv/00/cctv_0001_0", "speakers": [["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"]], "clusters": [[[113, 114], [42, 45], [88, 91]]], "sentences": [["What", "kind", "of", "memory", "?"], ["We", "respectfully", "invite", "you", "to", "watch", "a", "special", "edition", "of", "Across", "China", "."]]} diff --git a/test/data_for_tests/io/cws_as/dev.txt b/test/data_for_tests/io/cws_as/dev.txt new file mode 100755 index 00000000..f4c96e9e --- /dev/null +++ b/test/data_for_tests/io/cws_as/dev.txt @@ -0,0 +1,6 @@ +時間 : +三月 十日 ( 星期四 ) 上午 十時 。 +並 辦理 加州 大學 退休 等 手續 。 +包括 一九七八年 獲有 數學 諾貝爾 之 稱 的 費爾茲獎 , +在 台大 的 四 年 裡 , +他 語重心長 的 勉勵 同學 們 一 番 話 , diff --git a/test/data_for_tests/io/cws_as/test.txt b/test/data_for_tests/io/cws_as/test.txt new file mode 100755 index 00000000..a61009b2 --- /dev/null +++ b/test/data_for_tests/io/cws_as/test.txt @@ -0,0 +1,6 @@ +許多 社區 長青 學苑 多 開設 有 書法 、 插花 、 土風舞班 , +文山區 長青 學苑 則 有 個 十分 特別 的 「 英文 歌唱班 」 , +成員 年齡 均 超過 六十 歲 , +這 群 白髮蒼蒼 , +爺爺 、 奶奶級 的 學員 唱起 英文 歌 來 字正腔圓 , +有模有樣 。 diff --git a/test/data_for_tests/io/cws_as/train.txt b/test/data_for_tests/io/cws_as/train.txt new file mode 100755 index 00000000..b6eab6a3 --- /dev/null +++ b/test/data_for_tests/io/cws_as/train.txt @@ -0,0 +1,6 @@ +地點 : +學術 活動 中心 一樓 簡報室 。 +主講 : +民族所 所長 莊英章 先生 。 +講題 : +閩 、 台 漢人 社會 研究 的 若干 考察 。 diff --git a/test/data_for_tests/io/cws_cityu/dev.txt b/test/data_for_tests/io/cws_cityu/dev.txt new file mode 100755 index 00000000..eac550f2 --- /dev/null +++ b/test/data_for_tests/io/cws_cityu/dev.txt @@ -0,0 +1,6 @@ +立會 選情 告一段落 民主 進程 還 看 明天 +所謂 「 左 」 的 勢力 , 是 指 以 鄭經翰 、 梁國雄 ( 長毛 ) 為 代表 的 激進 民主 勢力 , 他們 尖銳 批評 中央 和 特區 政府 , 積極 為 基層 勞工 爭取 福利 , 可能 會 為 民主派 與 中央 和解 增加 困難 , 牽制 民主黨 走 中產 溫和 路線 。 +特區 政府 應該 積極 與 民主派 改善 關係 , 尤其 要 爭取 中間 及 「 右 」 翼 的 民主 勢力 , 因為 這些 人 背後 反映 的 是 香港 的 主流 民意 , 除了 民主 步伐 和 涉及 中央 的 敏感 政治 議題 , 他們 和 建制派 的 溫和 力量 沒有 基本 不同 , 很 容易 達成 跨 黨派 的 共識 , 令 特區 政府 處於 不得不 從 的 被動 位置 , 23 條 立法 撤回 、 追究 SARS 責任 等 , 都 是 記憶猶新 的 例子 。 +為 何秀蘭 喝彩 為 香港 人 神傷 +單說 立法會 , 自 91 年 以來 , 經歷 5 次 類似 的 地區 直選 。 +點票 過程 出現 的 笑話 更 多 。 diff --git a/test/data_for_tests/io/cws_cityu/test.txt b/test/data_for_tests/io/cws_cityu/test.txt new file mode 100755 index 00000000..aa838fe2 --- /dev/null +++ b/test/data_for_tests/io/cws_cityu/test.txt @@ -0,0 +1,6 @@ +「 練 得 銅皮鐵骨 」 露宿 早 慣 蚊叮 +本 港 約 有 450 至 600 名 露宿者 , 其中 近 四分之一 , 即 約 150 人 露宿 在 深水埗 。 +有 外展 社工 稱 , 露宿者 日間 多 到 商場 等 冷氣 場所 避暑 , 流連 至 晚上 11 、 12 時 , 才 用 紙皮 在 公園 外 「 打地鋪 」 , 他們 早已 「 練 得 一 身 銅皮鐵骨 」 , 徹夜 被 蚊 叮 也 習以為常 , 但 社工 在 炎夏 仍 會 頻頻 給 他們 派發 蚊香 。 +基督教 關懷 無家者 協會 的 外展 社工 , 過去 一直 有 探訪 李鄭屋 遊樂場 外 的 露宿者 , 該 會 總幹事 賴淑芬 說 , 該 處 的 露宿者 只 有 數 人 , 且 流動性 很 大 。 +不管 被 多少 蚊 叮 也 沒 什 感覺 +她 指 這些 露宿者 日間 都 會 流連 於 冷氣 場所 , 晚上 才 到 遊樂場 露宿 , 但 礙於 遊樂場 
晚上 關門 , 他們 只 可 在 外圍 「 打地鋪 」 。 diff --git a/test/data_for_tests/io/cws_cityu/train.txt b/test/data_for_tests/io/cws_cityu/train.txt new file mode 100755 index 00000000..6338621c --- /dev/null +++ b/test/data_for_tests/io/cws_cityu/train.txt @@ -0,0 +1,6 @@ +立法會 選舉 出現 了 戲劇性 的 結果 , 儘管 投票率 創下 新高 , 而 過去 經驗 顯示 高 投票率 對 民主派 較 有利 , 但 由於 名單 協調 不當 及 配票 策略 失誤 , 加上 醜聞 影響 選情 , 民主黨 的 議席 比 上 一 屆 減少 , 由 第 一 大 黨 跌 至 第 三 ; +而 泛民主派 在 30 席 普選 中 亦 只能 取得 18 席 , 比 選前 預期 的 20 席 少 ; +但 在 功能 組別 選舉 卻 有 意外 收穫 , 除 保住 原有 的 5 個 議席 , 還 搶佔 了 醫學 和 會計 兩 個 專業 界別 , 令 議席 總數 達到 25 席 , 比 上 一 屆 多 了 3 席 。 +更 值得 注意 的 是 , 泛民主派 候選人 在 普選 中 合共 取得 110萬 張 選票 , 佔 178萬 選票 總數 的 62 % , 顯示 多數 市民 認同 早日 實現 全面 普選 的 民主 訴求 , 這 一 點 應 為 政府 及 各 黨派 人士 所 尊重 。 +須 為 2012 全面 普選 創造 條件 +親 建制 陣營 方面 , 民建聯 和 自由黨 都 取得 佳績 , 分別 取得 12 席 和 11 席 , 成為 立法會 內 的 第 一 及 第 二 大 黨 。 diff --git a/test/data_for_tests/io/cws_msra/dev.txt b/test/data_for_tests/io/cws_msra/dev.txt new file mode 100644 index 00000000..9c6b34ee --- /dev/null +++ b/test/data_for_tests/io/cws_msra/dev.txt @@ -0,0 +1,2 @@ +“ 人们 常 说 生活 是 一 部 教科书 , 而 血 与 火 的 战争 更 是 不可多得 的 教科书 , 她 确实 是 名副其实 的 ‘ 我 的 大学 ’ 。 +他 “ 严格要求 自己 , 从 一个 科举 出身 的 进士 成为 一个 伟大 的 民主主义 者 , 进而 成为 一 位 杰出 的 党外 共产主义 战士 , 献身 于 崇高 的 共产主义 事业 。 diff --git a/test/data_for_tests/io/cws_msra/test.txt b/test/data_for_tests/io/cws_msra/test.txt new file mode 100644 index 00000000..8d5c6b3c --- /dev/null +++ b/test/data_for_tests/io/cws_msra/test.txt @@ -0,0 +1,2 @@ +扬帆 远东 做 与 中国 合作 的 先行 +希腊 的 经济 结构 较 特殊 。 diff --git a/test/data_for_tests/io/cws_msra/train.txt b/test/data_for_tests/io/cws_msra/train.txt new file mode 100644 index 00000000..35c2cad0 --- /dev/null +++ b/test/data_for_tests/io/cws_msra/train.txt @@ -0,0 +1,3 @@ +“ 心 静 渐 知 春 似 海 , 花 深 每 觉 影 生 香 。 +“ 吃 屎 的 东西 , 连 一 捆 麦 也 铡 不 动 呀 ? +复旦大学 百年 校庆 。 \ No newline at end of file diff --git a/test/data_for_tests/io/cws_pku/dev.txt b/test/data_for_tests/io/cws_pku/dev.txt new file mode 100755 index 00000000..df77c5ca --- /dev/null +++ b/test/data_for_tests/io/cws_pku/dev.txt @@ -0,0 +1,6 @@ +在 十五大 精神 指引 下 胜利 前进 —— 元旦 献辞 +我们 即将 以 丰收 的 喜悦 送 走 牛年 , 以 昂扬 的 斗志 迎来 虎年 。 我们 伟大 祖国 在 新 的 一 年 , 将 是 充满 生机 、 充满 希望 的 一 年 。 +李 鹏 在 北京 考察 企业 +李 鹏 说 : “ 作为 首都 的 电力 工作者 , 你们 为 首都 的 各项 重大 活动 的 顺利 进行 , 为 保障 人民 群众 的 工作 、 生活 和 学习 , 为 促进 首都 经济 的 发展 作出 了 自己 的 贡献 。 明天 就 是 元旦 , 你们 还有 许多 同志 要 坚守 岗位 , 我 向 你们 、 向 全体 电力 工作者 表示 感谢 。 现在 , 我们 的 首都 已经 结束 了 拉 闸 限 电 的 历史 , 希望 依靠 大家 , 使 拉 闸 限 电 的 历史 永远 不再 重演 。 同时 , 也 希望 你们 安全 生产 、 经济 调度 , 实现 经济 增长 方式 的 转变 。 ” 李 鹏 最后 向 电业 职工 , 向 全 北京市 的 人民 拜年 , 向 大家 致以 新春 的 问候 , 祝愿 电力 事业 取得 新 的 成绩 , 祝愿 北京市 在 改革 、 发展 和 稳定 的 各项 工作 中 取得 新 的 成就 。 +( 附 图片 1 张 ) +据 介绍 , 播音员 、 主持人 持证 上岗 工作 , 是 在 1996年 全国 广播 影视 系统 语言 工作 会议 上 提 出来 的 , 它 是 加强 宣传 队伍 建设 , 促进 语言 文字 走向 标准化 、 规范化 的 重要 举措 。 播音员 、 主持人 只有 通过 汉语 普通话 水平 测试 和 政治 、 业务 考核 后 才 能 获得 上岗 资格 证书 。 diff --git a/test/data_for_tests/io/cws_pku/test.txt b/test/data_for_tests/io/cws_pku/test.txt new file mode 100755 index 00000000..c7ad3e85 --- /dev/null +++ b/test/data_for_tests/io/cws_pku/test.txt @@ -0,0 +1,6 @@ +共同 创造 美好 的 新 世纪 —— 二○○一年 新年 贺词 +( 二○○○年 十二月 三十一日 ) ( 附 图片 1 张 ) +女士 们 , 先生 们 , 同志 们 , 朋友 们 : +2001年 新年 钟声 即将 敲响 。 人类 社会 前进 的 航船 就要 驶入 21 世纪 的 新 航程 。 中国 人民 进入 了 向 现代化 建设 第三 步 战略 目标 迈进 的 新 征程 。 +在 这个 激动人心 的 时刻 , 我 很 高兴 通过 中国 国际 广播 电台 、 中央 人民 广播 电台 和 中央 电视台 , 向 全国 各族 人民 , 向 香港 特别 行政区 同胞 、 澳门 特别 行政区 同胞 和 台湾 同胞 、 海外 侨胞 , 向 世界 各国 的 朋友 们 , 致以 新 世纪 第一 个 新年 的 祝贺 ! 
+过去 的 一 年 , 是 我国 社会主义 改革 开放 和 现代化 建设 进程 中 具有 标志 意义 的 一 年 。 在 中国 共产党 的 领导 下 , 全国 各族 人民 团结 奋斗 , 国民经济 继续 保持 较 快 的 发展 势头 , 经济 结构 的 战略性 调整 顺利 部署 实施 。 西部 大 开发 取得 良好 开端 。 精神文明 建设 和 民主 法制 建设 进一步 加强 。 我们 在 过去 几 年 取得 成绩 的 基础 上 , 胜利 完成 了 第九 个 五年计划 。 我国 已 进入 了 全面 建设 小康 社会 , 加快 社会主义 现代化 建设 的 新 的 发展 阶段 。 diff --git a/test/data_for_tests/io/cws_pku/train.txt b/test/data_for_tests/io/cws_pku/train.txt new file mode 100755 index 00000000..d28dbd8b --- /dev/null +++ b/test/data_for_tests/io/cws_pku/train.txt @@ -0,0 +1,6 @@ +迈向 充满 希望 的 新 世纪 —— 一九九八年 新年 讲话 ( 附 图片 1 张 ) +中共中央 总书记 、 国家 主席 江 泽民 +( 一九九七年 十二月 三十一日 ) +12月 31日 , 中共中央 总书记 、 国家 主席 江 泽民 发表 1998年 新年 讲话 《 迈向 充满 希望 的 新 世纪 》 。 ( 新华社 记者 兰 红光 摄 ) +同胞 们 、 朋友 们 、 女士 们 、 先生 们 : +在 1998年 来临 之际 , 我 十分 高兴 地 通过 中央 人民 广播 电台 、 中国 国际 广播 电台 和 中央 电视台 , 向 全国 各族 人民 , 向 香港 特别 行政区 同胞 、 澳门 和 台湾 同胞 、 海外 侨胞 , 向 世界 各国 的 朋友 们 , 致以 诚挚 的 问候 和 良好 的 祝愿 ! diff --git a/test/data_for_tests/io/imdb/dev.txt b/test/data_for_tests/io/imdb/dev.txt new file mode 100644 index 00000000..423e158b --- /dev/null +++ b/test/data_for_tests/io/imdb/dev.txt @@ -0,0 +1,6 @@ +neg You can never have seen either film and still know that The Jerk Too is a disaster. The question is not, "How did it get made," because if you throw money at anyone and tell them to make a film, they will do so.

No. The question is "Why, oh why, did Steve Martin allow it to be made?" I think he needed the money to fight a nuisance lawsuit and was determined it not cost him anything. He knew the sequel was going to be so frightful, that out of pride, he wouldn't even count it's royalties as income.

The only way this sequel could not be an embarrassment is to have had Carl Gottlieb and Steve Martin revive the nation's favorite poor black family.

And "dcreasy2001" (aka Mark Blankfield?): It's just transparently obvious that you worked on this film in some sad capacity, and the only way you can feel better about your involvement is to be the sequel's lone cheerleader as an IMDb user comment. I was praying for you to veer over into satire, but alas, you were really making an effort at spin. Why not 10 stars? +neg The Hazing is confused mumbo-jumbo that wants so hard to be The Evil Dead that it even references Bruce Campbell several times. The problem is, it is simply not in the same league as that terrific movie. This movie is nowhere near as original. The plot has been used before, by Kevin Tenney in Night of the Demons, and that was a lot more fun. This flick wastes too much time with complicated exposition before getting the kids into the spooky mansion and starting the demonic happenings.

Brad Dourif is, as usual, not given much to do here, but when he is on screen he puts in another over-the-top performance that would make Christopher Walken jealous. As for the acting of the kids, it's passable but by no means good. The shaky camera work is more annoying than clever or atmospheric. There are a few good moments when the first guy gets possessed and throws around some deadly one liners while dispatching his victims, but it was never scary for a second. The gore level is mid-range to low, but the director tries to make up for it by showing the actresses topless a few times. All in all, just okay if you have 87 minutes to waste. +neg I have seen bad movies before, but this one takes the "Worst Movie of a Lifetime" award by far !! Anthony Hopkins has to be completely mentally ill to have his name attached to this one - anywhere ! I will never see another movie with him in it, directing it, etc., etc. ! I can't believe the other actors & actresses that I liked, (in this picture), that stooped so low to be a part of this disaster ! There must be some great drugs out there ! For anyone to not be embarrassed to be a part of such a film, is beyond me ! Save your money on this one ! HUGE FLOP from beginning to end ! Shame on you Mr. Hopkins ! Also, shame on Christian Slater ! I can't believe you put your reputations on the line for this one ! +neg You may want to know up front that I am not a Mormon, unlike a good number of those who have already reviewed this film. I mention this so you'll understand that the way I look at the film may differ greatly from those in the faith. For some, being critical of the film might be seen as being critical of the faith--and that is NOT my intention. So, my review is that of an outsider trying to look inside and learn more about who this man and his people were. Well, after seeing the film, I doubt if I have learned much at all. Since I have been a history teacher, I have a good basic understanding about Young as well as Joseph Smith as well as the teachings of the church. But anyone wanting to see this film to really learn anything will probably be disappointed because the film seems so gosh-darn nice--too nice and too unrealistic in its portrayal. Plus, you learn practically nothing about the church's beliefs other than they are nice people, work hard and some have many wives (and this latter part is only barely hinted at in the film). Instead, the people are almost cartoon-like in their simplistic portrayals. Joseph Smith and Brigham Young and their followers are angelic, the non-Mormons were all devils and Brian Donlevy (playing EXACTLY the same sort of role Edward G. Robinson later played in THE TEN COMMANDMENTS) is the trouble-maker who claims to be a Mormon but just comes along so the film can have a bad guy. It's all so very simple....too simple. Almost like an indoctrination film or infomercial.

Brigham Young especially was a very complex man--with many good points (an excellent organizer and visionary) as well as bad (don't even get me started on his views about Blacks within the church or intermarriage). To portray him in such vague terms is just plain silly. It's also a lot like how Gandhi was portrayed in the film with Ben Kingsley--only the facts that led to his being almost super-human were emphasized. Heck, now that I think about that, this is the trouble with most religious films--they often come off as one-dimensional, trite and bland. Let's have a full and more complete film of these men--one that will stick to facts and not emotional appeals.

Now if you can ignore the fact that you won't learn very much about the faith or its second leader, the film is enjoyable enough. It's obvious someone at 20th Century-Fox really cared about the film, as they had a wonderful cast of both premier actors (Tyrone Power), up and coming actors (Linda Darnell, Jane Darwell and Vincent Price) and wonderful character actors (Dean Jagger, John Carradine and Brian Donlevy). The film also had wonderful location shooting and lots of gloss. It just didn't have a lot to tell us other than they were all "swell". Plus, there were plenty of factual errors and a few just plain dumb scenes. A few of the mistakes include Young taking over the helm immediately after the death of Joseph Smith (it was three years later), no mention of the various Mormon denominations and splinter groups, talk of "gold in California"--even though it was 1847 and gold wouldn't be discovered until 1948, as well as no specific mention of polygamy or Smith's many wives. Just plain dumb scenes include Carradine pulling out a gun and waving it about in the courtroom scene--and no one seemed to care--even though it was a very hostile audience! Don't you think at least the judge would tell him to put it away and stop threatening people with it?!

One final comment. Do not, I repeat, do not watch this film when it's shown on American Movie Classics (a one great station that has sunk a lot in recent years). While I am critical of the film because of its simplistic message, I was horrified with the complete disrespect the station had for the church and its traditions. What I mean is this. The film was punctuated with ads for penis enlargement formulas as well as tons of pop-ups (some advertising a show that features the "sexiest cast"). Talk about disrespectful and gross and I would be just as offended if they did this for any other religious film. By doing this, they not only insult the faith but marginalize their market--after all, who is into hearing about these things AND the life of Brigham Young?! Is this a movie, in this form, that you can show to your kids or recommend to others?! +pos Fifteen years later and Paris Is Burning is still aflame. This is a classic in black gay films, right up there with the other honorary black gay films, The Color Purple and Mahoganoy. This seminal work captures underground and underclass (i.e."underserved) black and Latin gay culture and community like no other work before or since, including all the sentimental Harlem Rennaissance gay retrospectives and renderings. They're good, but this is the best (dare I say the only "real") film you'll find on the subject. It's Relentlessy Cunty (the classic house music invention)comes to Hollywood, non-stop, hilarious camp (like only we do it) and dead-on social critique. All this by a white female director (who obviously must have been a Sister Gurl or Mizz Thing in a former life.) I could go on, but I think you get the point by now: I love this movie! +pos I have been an admirer of Edward Burtynsky's work for years, and it was such a pleasure to be able to see the man at work, thanks to Jennifer Baichwal's documentary. The severe beauty of the ship-breaking yard in Bangladesh, the stone quarry in Vermont, the enormous assembly plant in China, the beleaguered old neighbourhoods in Shanghai that are just waiting to be torn down: these landscapes are captured so well by the photographer and the filmmaker.

At times I thought of old TV documentaries on abandoned coal mines and plastic-mold factories; the sort of stuff I grew up watching. Burtynsky's work has the great value of pointing out how the industrial activity has only shifted to Asia, it has not stopped. The strangest scene for me was the computer scrap-yard somewhere in China--the waste had a threatening air about it, while the workers were very jovial. diff --git a/test/data_for_tests/io/imdb/test.txt b/test/data_for_tests/io/imdb/test.txt new file mode 100644 index 00000000..68768ec6 --- /dev/null +++ b/test/data_for_tests/io/imdb/test.txt @@ -0,0 +1,6 @@ +neg Alan Rickman & Emma Thompson give good performances with southern/New Orleans accents in this detective flick. It's worth seeing for their scenes- and Rickman's scene with Hal Holbrook. These three actors mannage to entertain us no matter what the movie, it seems. The plot for the movie shows potential, but one gets the impression in watching the film that it was not pulled off as well as it could have been. The fact that it is cluttered by a rather uninteresting subplot and mostly uninteresting kidnappers really muddles things. The movie is worth a view- if for nothing more than entertaining performances by Rickman, Thompson, and Holbrook. +neg I have seen this movie and I did not care for this movie anyhow. I would not think about going to Paris because I do not like this country and its national capital. I do not like to learn french anyhow because I do not understand their language. Why would I go to France when I rather go to Germany or the United Kingdom? Germany and the United Kingdom are the nations I tolerate. Apparently the Olsen Twins do not understand the French language just like me. Therefore I will not bother the France trip no matter what. I might as well stick to the United Kingdom and meet single women and play video games if there is a video arcade. That is all. +neg In Los Angeles, the alcoholic and lazy Hank Chinaski (Matt Dillon) performs a wide range of non-qualified functions just to get enough money to drink and gamble in horse races. His primary and only objective is writing and having sexy with dirty women.

"Factotum" is an uninteresting, pointless and extremely boring movie about an irresponsible drunken vagrant that works a couple of days or weeks just to get enough money to buy spirits and gamble, being immediately fired due to his reckless behavior. In accordance with IMDb, this character would be the fictional alter-ego of the author Charles Bukowski, and based on this story, I will certainly never read any of his novels. Honestly, if the viewer likes this theme of alcoholic couples, better off watching the touching and heartbreaking Hector Babenco's "Ironweed" or Marco Ferreri's "Storie di Ordinaria Follia" that is based on the life of the same writer. My vote is four.

Title (Brazil): "Factotum – Sem Destino" ("Factotum – Without Destiny") +neg This film is bundled along with "Gli fumavano le Colt... lo chiamavano Camposanto" and both films leave a lot to be desired in the way of their DVD prints. First, both films are very dark--occasionally making it hard to see exactly what's happening. Second, neither film has subtitles and you are forced to watch a dubbed film--though "Il Prezzo del Potere" does seem to have a better dub. Personally, I always prefer subtitles but for the non-purists out there this isn't a problem. These DVD problems, however, are not the fault of the original film makers--just the indifferent package being marketed four decades later.

As for the film, it's about the assassination of President Garfield. This is a MAJOR problem, as Van Johnson looks about as much like Garfield as Judy Garland. In no way whatsoever does he look like Garfield. He's missing the beard, has the wrong hair color and style and is just not even close in any way (trust me on this, I am an American History teacher and we are paid to know these sort of things!). The real life Garfield was a Civil War general and looked like the guys on the Smith Brothers cough drop boxes. Plus, using some other actor to provide the voice for Johnson in the dubbing is just surreal. Never before or since has Van Johnson sounded quite so macho!! He was a fine actor...but certainly not a convincing general or macho president.

In addition to the stupid casting, President Garfield's death was in no way like this film. It's obvious that the film makers are actually cashing in on the crazy speculation about conspiracies concerning the death of JFK, not Garfield. Garfield was shot in Washington, DC (not Dallas) by a lone gunman with severe mental problems--not a group of men with rifles. However, according to most experts, what actually killed Garfield (over two months later) were incompetent doctors--who probed and probed and probed to retrieve a bullet (to no avail) and never bothered cleaning their hands or implements in the process. In other words, like George Washington (who was basically killed by repeated bloodletting when suffering with pneumonia) he died due to malpractice. In the movie they got nothing right whatsoever...other than indeed President Garfield was shot.

Because the film bears almost no similarity to real history, it's like a history lesson as taught from someone from another planet or someone with a severe brain injury. Why not also include ninjas, fighting robots and the Greek gods while you're at it?!?! Aside from some decent acting and production values, because the script is utter cow crap, I don't recommend anyone watch it. It's just a complete and utter mess. +neg I only comment on really very good films and on utter rubbish. My aim is to help people who want to see great films to spend their time - and money - wisely.

I also want to stop people wasting their time on garbage, and want to publicize the fact that the director/producer of these garbage films can't get away with it for very long. We will find out who you are and will vote with out feet - and wallets.

This film clearly falls into the garbage category.

The director and writer is John Shiban. It's always a bad sign when the writer is also the director. Maybe he wants two pay cheques. He shouldn't get any. So remember the name - John SHIBAN. And if you see anything else by him, forget it.

I won't say anything about the plot - others have already. I am a little worried by how much the director likes to zoom in to the poor girl's face when she is crying and screaming. These long duration shots are a little worrying and may say something about the state of mind of Mr. Shiban. Maybe he should get psychiatric help.

Enough already. It's crap - don't waste your time on it. +neg When you look at the cover and read stuff about it an entirely different type of movie comes to mind than what you get here. Then again maybe I read the summary for the other movie called "Mausolem" instead as there were two movies of this title released about the same time with both featuring plots that had key elements in common. However, reading stuff about that movie here I know I saw this one and not that one and that movie is even less what one would imagine a movie with that title would be about. I will be honest, I expect more of a zombie type picture and you get that in this movie to some degree. However, there is more stuff involving the occult and strange powers as the opening scene of the people being taken away by the coroner at the beginning of the film will attest to. The movie also has the old theme of kids going somewhere they do not belong to have some crazy party, in this case it is in fact a mausoleum. The other movie I do not think really has that key feature playing that prominent role in the movie and I see the score for this one is higher too, still it was just not the movie I was expecting. diff --git a/test/data_for_tests/io/imdb/train.txt b/test/data_for_tests/io/imdb/train.txt new file mode 100644 index 00000000..bbf4d799 --- /dev/null +++ b/test/data_for_tests/io/imdb/train.txt @@ -0,0 +1,6 @@ +neg The monster from Enemy Mine somehow made his way into a small mountain community, where he has taken up residence. He's being hunted by a female doctor-turned-vigilante who is out to exterminate him. This female assassin, who looks like a refugee from a Motley Crue video, rides around on a motorcycle and tries to save a bunch of kids who have chosen to have a Big Chill weekend right smack dab in the middle of the monster's turf. Decapitations and lots of blood are primarily in place to draw attention away from the story which limps along like a bad version of the Island of Dr. Moreau (and yes, it's worse than the one with Val Kilmer). +neg I'll try to use words to describe this on....

I saw the original, which was good in its own way, but back then I should have feared a sequel.

And I was 'afraid' when I picked this one up, but now that I've seen it, I have to say, it's even worse then I thought. Why these movies still get money still makes my mind spin.

Let's start with the actors;they aren't all that good, but it has to be said, some make heads turn by being just plain awful. But what can an actor do with a script like this one. It's trying to be a copy of the original only this time the places have changed, any form of story is gone and any attempt of actually coming up with something that hasn't been done before, fails miserably. In a futile attempt to get it up-to-date, they try to make it exciting by making use of the whole 'big-brother' theme , but that has been worn out ages ago and offers nothing but a filler for between the beginning and the end. An attempt was made to try to save the movie by making a ton of references to the '83 original, but it just ended up being plain funny and sometimes a bit sad. In conclusion, if you have nothing , and I mean nothing , to do... go watch it, or play Frisbee... with the DVD.... by yourself. It'll offer you the same amount of fun.. I promise +pos Most yeti pictures are fatally undermined by a grave paucity of energy and enthusiasm. Not so this gloriously bent, batty and berserk over-the-top Italian-made shot-in-Canada kitsch gut-buster: It's a wildly ripe and vigorously moronic ghastly marvel which reaches a stunning apotheosis of righteously over-baked "what the hell's going on?" crackpot excess and inanity.

A freighter ship crew discovers the body of a 30-foot yeti that resembles a hirsute 70's disco stud (complete with jumbo wavy afro) perfectly preserved in a large chunk of ice. They dethaw the beast, jolt him back to life with electric charges, grossly mistreat him, and keep the poor hairy Goliath in an enormous glass booth. Before you can say "Hey, the filmmakers are obviously ripping off 'King Kong'," our titanic abominable snowdude breaks free of his cage, grabs the first luscious nubile blonde Euro vixen (the gorgeous Pheonix Grant) he lays lustful eyes on, and storms away with his new lady love. The yeti gets recaptured and flown to Toronto to be showed off to a gawking audience. Of course, he breaks free again, nabs the vixen, and goes on the expected stomping around the city rampage.

The sublimely stupid dialogue (sample line: "Philosophy has no place in science, professor"), cheesy (far from) special effects (the horrendous transparent blue screen work and cruddy Tonka toy miniatures are especially uproarious in their very jaw-dropping awfulness), clunky (mis)direction, and a heavy-handed script that even attempts a clumsily sincere "Is the yeti a man or a beast?" ethical debate all combine together to create one of the single most delightfully ridiculous giant monster flicks to ever roar its absurd way across the big screen. Better still, we also have a few funky offbeat touches to add extra shoddy spice to the already succulently schlocky cinematic brew: the vixen accidentally brushes against one of the yeti's nipples, which causes it to harden and elicits a big, leering grin of approval from the lecherous behemoth (!); the vixen nurses the yeti's wounded hand while he makes goo-goo eyes at her, the yeti smashes windows with his feet while climbing a towering office building, and the furry fellow even breaks a man's neck with his toes (!!). Overall, this singularly screwball and shamefully unheralded should-be camp classic stands tall as a remarkable monolith of infectiously asinine celluloid lunacy that's eminently worthy of a substantial hardcore underground cult following. +pos One of the best movies I ever saw was an Irish movie titled Philadelphia,Here I Come. I read the play before I saw the movie and loved them both. It's the story of a young man preparing to leave Ireland to go to America because he can't earn a living in Ireland. It is told both from the perspective of the young man(whom the other characters in the film can see) and another young man representing his uncensored thoughts and feelings., but who cannot be seen by the other characters in the film. It is a very sad movie, but deeply touching, and I would recommend this film to anyone who wants something to think about. I love any Irish movie, or almost any movie about Ireland, and any film that has the late Irish actor Donal McCann in it gets my vote.I would watch that man chew gum for 2 hours on screen, and unfortunately,I have.Terrible shame to have lost him so young. +pos There is such rubbish on the cable movie channels that I hit a gem with this one. From beginning to end it had me gripped and deserves top marks.

Father of two sons hears messages from "God" to kill people who he is told are 'demons'.

When the opening credits showed the director as one of the cast that can often be a warning of a bad film; exceptionally it is the reverse here as the drama is non-stop from beginning to end.

And there is not one moment in the movie when one is not fully enthralled as there are no unnecessary or needless sub-plots, and the script is first class.

All the actors give wholly convincing performances especially the lead child actor who is exceptional.

This film is at least as good as the likes of 'Silence of the Lambs'. +pos This is a nice piece of work. Very sexy and engaging enough plot to keep my interest throughout. Its main disadvantage is that it seems like it was made-for-TV: Full screen, and though there were several sex scenes, there was absolutely no nudity (but boy did it come close!). Strange, too, since Netflix shows that it was rated R.

Nonetheless, very titillating, and I wish Alicia Silverstone made more movies like this.

One Netflix reviewer stated that it was part of a series, but I have been unable to find out what series that is. I'd like to find out, though, because this movie was THAT good.

Walt D in LV. 8/23/2005 diff --git a/test/data_for_tests/io/peopledaily/dev.txt b/test/data_for_tests/io/peopledaily/dev.txt new file mode 100755 index 00000000..4769eb79 --- /dev/null +++ b/test/data_for_tests/io/peopledaily/dev.txt @@ -0,0 +1,7 @@ +中 B-ORG +共 I-ORG +中 I-ORG +央 I-ORG + +致 O +中 B-ORG diff --git a/test/data_for_tests/io/peopledaily/test.txt b/test/data_for_tests/io/peopledaily/test.txt new file mode 100755 index 00000000..1a983ebd --- /dev/null +++ b/test/data_for_tests/io/peopledaily/test.txt @@ -0,0 +1,41 @@ +美 B-LOC +国 I-LOC + +的 O +华 B-PER + +莱 B-PER +士 B-PER + +中 B-ORG +共 I-ORG + +中 I-ORG +央 I-ORG + +举 O +办 O + +《 O +“ O + +一 O +国 O + +两 O +制 O + +” O +与 O + +香 B-LOC +港 I-LOC + +基 O +本 O + +法 O +》 O + +讲 O +座 O diff --git a/test/data_for_tests/io/peopledaily/train.txt b/test/data_for_tests/io/peopledaily/train.txt new file mode 100755 index 00000000..4fb5f61b --- /dev/null +++ b/test/data_for_tests/io/peopledaily/train.txt @@ -0,0 +1,46 @@ +我 O +们 O + +收 O +藏 O + +北 B-LOC +京 I-LOC + +史 O +料 O + +历 B-LOC +博 I-LOC + +、 O +古 B-ORG +研 I-ORG +所 I-ORG + +、 O +北 B-LOC + +大 I-LOC +清 I-LOC + +华 I-LOC +图 I-LOC + +书 I-LOC +馆 I-LOC + +我 O +们 O + +是 O +受 O + +到 O +郑 B-PER + +振 I-PER +铎 I-PER + +先 O +生 O diff --git a/test/data_for_tests/io/weibo_NER/dev.conll b/test/data_for_tests/io/weibo_NER/dev.conll new file mode 100755 index 00000000..11db48f8 --- /dev/null +++ b/test/data_for_tests/io/weibo_NER/dev.conll @@ -0,0 +1,21 @@ +老 B-PER.NOM +百 I-PER.NOM +姓 I-PER.NOM + +心 O + +新 B-GPE.NAM +乡 I-GPE.NAM + +年 O + +大 B-ORG.NOM +学 I-ORG.NOM + +同 O + +宿 B-LOC.NOM +舍 I-LOC.NOM + +三 O +年 O diff --git a/test/data_for_tests/io/weibo_NER/test.conll b/test/data_for_tests/io/weibo_NER/test.conll new file mode 100755 index 00000000..b92e7efa --- /dev/null +++ b/test/data_for_tests/io/weibo_NER/test.conll @@ -0,0 +1,17 @@ +感 O +动 O + +了 O + +李 B-PER.NAM +开 I-PER.NAM +复 I-PER.NAM + +小 B-ORG.NOM +学 I-ORG.NOM + +美 O +术 O + +新 O +课 O \ No newline at end of file diff --git a/test/data_for_tests/io/weibo_NER/train.conll b/test/data_for_tests/io/weibo_NER/train.conll new file mode 100755 index 00000000..6d6182c0 --- /dev/null +++ b/test/data_for_tests/io/weibo_NER/train.conll @@ -0,0 +1,69 @@ +坏 O +男 B-PER.NOM +人 I-PER.NOM + +男 B-PER.NOM +人 I-PER.NOM +帮 I-PER.NOM + + +不 O + +南 B-GPE.NAM +都 I-GPE.NAM + +南 B-GPE.NAM +方 I-GPE.NAM +都 I-GPE.NAM +市 I-GPE.NAM + +的 O + +那 B-LOC.NOM +座 I-LOC.NOM + +来 O + +学 B-ORG.NOM +校 I-ORG.NOM + +的 O + +卫 B-ORG.NAM +生 I-ORG.NAM +部 I-ORG.NAM + +台 B-GPE.NAM +灣 I-GPE.NAM + +火 B-LOC.NAM +焰 I-LOC.NAM +山 I-LOC.NAM + +的 O + +成 O +李 B-PER.NAM +力 I-PER.NAM +帆 I-PER.NAM + +我 O + +南 B-GPE.NAM +都 I-GPE.NAM + +深 B-GPE.NAM +圳 I-GPE.NAM + +一 O +个 O + +国 B-GPE.NOM +家 I-GPE.NOM + +以 O + +民 B-PER.NOM + +为 O +本 O diff --git a/test/data_for_tests/io/yelp_review_full/dev.csv b/test/data_for_tests/io/yelp_review_full/dev.csv new file mode 100755 index 00000000..ecc93b0b --- /dev/null +++ b/test/data_for_tests/io/yelp_review_full/dev.csv @@ -0,0 +1,6 @@ +"2","Two meals, on the recommendation of a friend who lives near the place, and after the second trip, I was compelled to write. 'Rocky' would definitely describe the experiences.\n\nOn the first trip, I went to try their (at that time)raved about Reuben. 
And YET to find a true good Reuben in da burgh, I tried it.\n\nWell, they were out of the proper bread, and the guy had to run to the store to buy the closest thing he could find, which was not the proper bread, and instead of one of their 'raved about' Reubens, I received two mini-Reubens, which basically took the guts from one Reuben, and spread it out onto two sandwiches on regular sized bread. I ate it. It wasn't great, but they swore it was because they'd run out of the bread. Bread or not, it still wasn't great. The atmosphere was pleasant in that 'blue collar bar' kind of way, and the staff was very nice, but not a winning pitch on the Reuben.\n\nThe second trip was after a long day of moving furniture with the same friend. Sat in the back room, instead of the bar, which felt more like a restaurant, of course, with the big screen TV covering the sports of the moment.\n\nI was in the mood for dinner this time, and after a scan, decided on fried chicken and mashed potatoes with the salad bar. My friend ordered one of her faves, the breaded pork chops.\n\nWe hit the salad bar, which was uber-basic. Three soups (mostly vegetable loaded, which left me out), basic iceberg lettuce mix (very probably out of a bag), a few veggie toppings, and three or four dressings. It was a basic salad, no big deal. More or less an appetizer filler before the meal.\n\nThe mind-blower in this trip was the ordering of the fried chicken dinner. Our waiter looked like a 19 year old gas station attendant, skinny little blonde guy with a sweet but incredibly naive face, and an air of vapidity, which was confirmed when I placed my order. I asked what chicken pieces came in the dinner, and asked if it was possible to only get dark meat. I never imagined how confusing a question that could possibly be. It literally took him two trips back to the kitchen to 'ask', and the child honestly had no clue what 'white meat' and 'dark meat' meant. The first answer he came back with was that the chicken came in a pre-portioned prepared bag, kind of Kentucky Fried Chicken style...which didn't answer my question, thus prompting the second trip. \n\nAfter the second trip back I heard the cook holler 'Tell him I'll fix him up'. \n\nWell, the chicken was prepackaged dreck like you'd find in the freezer case of Walmart, tiny and not good, and the potatoes had that slight tinge of chem-spuds flavor, laden with some kind of chopped up green (parsley?), and a side of that basic brown gravy served up in 5 gallon buckets.\n\nThank goodness for the basic salad bar.\n\nEven my friend admitted that her pork chops were different and not what she'd expected. They also appeared to be from a freezer bag.\n\nThe irony was that the boy who didn't know white meat from dark meat, was chatting with some other customers...about baseball...and he was a genius about the mindless sport of baseball. Ahhhh da burgh.\n\nThird base? Nah...why bother when there are so many other options around. Go on in a grab a beer and chat black and gold if you happen to be in Carnegie...they can help you out all types of ways in that area. Just don't go hungry if you actually have tastebuds.\n\nFrom what I understand it 'used to be' really good homecooked food. But apparently, mama has left the kitchen." +"4","I belong to this gym... I live in the South section of Pittsburgh, and I find that this gym is not too far from me. The staff is friendly, the equipment is quite good. You get two free personal training sessions when you join. 
They have lots of weights (which my boyfriend uses) and a decent cardio room. The only thing I would say is to increase some of the cardio equipment. Water is only $1 a bottle!" +"3","I've been to Papa J's twice and had mixed experiences.\n\nBoth times I had the banana pepper appetizer, which is great and goes really well with the FRESH and delicious bread and cheese they give you at the start of your meal.\n\nFor entrees, me and my girlfriend have had mixed experience. I've had the fish sandwich (very good) and the eggplant parm sandwich (okay). My girlfriend got the salad with bread and basil on it, but the basil was over powering and the bread was soggy with the dressing. \n\nThe service is also a mixed bag. The first time our server went out of her way to take care of us and even MADE me cocktail sauce for my fish sandwich. The second time, the server was lackluster, didn't know anything about the menu and wasn't able to take proper care of us. \n\nI would return to Papa J's, but I my terrible experience last time isn't enough to say it would be my first pick of places to eat around Carnegie/Robinson." +"4","Yay, I'm a fan but sometimes service is a little slow, it was very good for us this visit. Go to Papa j's every once in a while but mostly for the White Pizza. It is the best white pizza I have ever had. Order the white pizza on our visit this weekend... it has garlic, spinach, feta cheese and we usually add some veggie on top. It was delicious! Order fried calamari and it was OK...note to self next time try the calamari roman style.\n\nLike the dinning room with the hardwood floors and bright lighting. \n\nThe bar was jumping thou never go to the bar." +"3","Had dinner at Papa J's with a group of 6. I loved how the restaurant is in a old brick building with large windows. It felt like a neighborhood restaurant. On a Saturday night, the restaurant was full but not crowded. We were seated in a room with poor acoustics. It was difficult to hear people at our table and the waitress. While she tried, I can see the asperation in her face when she had to repeat the specials to both sides of the table.\n\nPeople ordered bourbon on the rocks before dinner which seemed watered down, while my lemon drop was made nice. The bread was delicious! Can you describe it to be creamy? The fried zucchini was lightly breaded and not too oily. It was a large portion made up of 2 sliced zucchinis.\n\nWe ordered a variety of dishes. The pasta dish was dry with more pasta than sauce or meat. Those who ordered the fish special thought it was delicious. The shrimp dish was enjoyed as well. I had the chicken marsala which was pretty good. The marsala sauce wasn't too thick, and the chicken moist.\n\nHard to tell if the deserts were \""homemade.\"" The tiramisu and spumoni were small in portion and meant for one. \n\nOn the whole, I was on the fence with my overall impression of Papa J's. \""A-ok\"" probably is the best way to describe it." +"2","Rather typical SnS. Had a good lunch crowd. Milkshake was good but not as good as EnP down the street. It took to long to get the burger for some reason, 25 minutes, I realized cooked to order but this is a little long for SnS. Ordered the Guacamole Steakburger and it only had a small portion of Gauc...not your usual amount..kitchen was not up to speed on portion sizing for some reason. Definitely did not look like the picture on the website. Oh well!" 
diff --git a/test/data_for_tests/io/yelp_review_full/test.csv b/test/data_for_tests/io/yelp_review_full/test.csv new file mode 100755 index 00000000..63d84891 --- /dev/null +++ b/test/data_for_tests/io/yelp_review_full/test.csv @@ -0,0 +1,6 @@ +"1","I got 'new' tires from them and within two weeks got a flat. I took my car to a local mechanic to see if i could get the hole patched, but they said the reason I had a flat was because the previous patch had blown - WAIT, WHAT? I just got the tire and never needed to have it patched? This was supposed to be a new tire. \nI took the tire over to Flynn's and they told me that someone punctured my tire, then tried to patch it. So there are resentful tire slashers? I find that very unlikely. After arguing with the guy and telling him that his logic was far fetched he said he'd give me a new tire \""this time\"". \nI will never go back to Flynn's b/c of the way this guy treated me and the simple fact that they gave me a used tire!" +"1","Don't waste your time. We had two different people come to our house to give us estimates for a deck (one of them the OWNER). Both times, we never heard from them. Not a call, not the estimate, nothing." +"1","All I can say is the worst! We were the only 2 people in the place for lunch, the place was freezing and loaded with kids toys! 2 bicycles, a scooter, and an electronic keyboard graced the dining room. A fish tank with filthy, slimy fingerprints smeared all over it is there for your enjoyment.\n\nOur food came... no water to drink, no tea, medium temperature food. Of course its cold, just like the room, I never took my jacket off! The plates are too small, you food spills over onto some semi-clean tables as you sit in your completely worn out booth seat. The fried noodles were out of a box and nasty, the shrimp was mushy, the fried rice was bright yellow.\n\nWe asked for water, they brought us 1 in a SOLO cup for 2 people. I asked for hot tea, they said 10 minutes. What Chinese restaurant does not have hot tea available upon request?\n\nOver all.... my first and last visit to this place. The only good point was that it was cheap, and deservingly so." +"1","I have been to this restaurant twice and was disappointed both times. I won't go back. The first time we were there almost 3 hours. It took forever to order and then forever for our food to come and the place was empty. When I complained the manager was very rude and tried to blame us for taking to long to order. It made no sense, how could we order when the waitress wasn't coming to the table? After arguing with me he ended up taking $6 off of our $200+ bill. Ridiculous. If it were up to me I would have never returned. Unfortunately my family decided to go here again tonight. Again it took a long time to get our food. My food was cold and bland, my kids food was cold. My husbands salmon was burnt to a crisp and my sister in law took one bite of her trout and refused to eat any more because she claims it was so disgusting. The wedding soup and bread were good, but that's it! My drink sat empty throughout my meal and never got refilled even when I asked. Bad food, slow service and rude managers. I'll pass on this place if my family decides to go again. Not worth it at all with all the other good Italian options around." +"1","Food was NOT GOOD at all! My husband & I ate here a couple weeks ago for the first time. I ordered a salad & basil pesto cream pasta & my husband ordered the spinach & feta pasta. 
The salad was just a huge plate of spring mix (nothing else in it) with WAY to much vinegar dressing. My lettuce was drowning in the vinegar. My pesto pasta had no flavor (did not taste like a cream sauce to me) & the pesto was so runny/watery & way too much sauce not enough noodles. My husband's pasta had even less flavor than mine. We ate about a quarter of the food & couldn't even finish it. We took it home & it was so bad I didn't even eat my leftovers. And I hate wasting food!! Plus the prices are expensive for the amount of food you get & of course the poor quality. Don't waste your time eating here. There are much better Italian restaurants in Pittsburgh." +"3","This is a tiny Starbucks and it locations like this (although cute) makes you wonder if your really meant to hang out or just grab your coffee and leave. Leaving is always a good idea at this location anyway since you have a nice fountain in the back with benches and it is a central part of the Waterfront Shopping. \n\nStarbuck isn't my favorite coffee chain by any means. Is it just me or do all Starbuck coffees taste a little burnt and bitter? No matter how trendy, cool and upscale their establishments are I can't get around the yicky tasting bitterness of Staryucks regular coffees. Talk about over roasting a bean...Maybe something has changed with their regular coffee but I have not drank it in about a year. I am not one for soy caramel latte foofy stuff. Still I'll give the establishment tres estrellas for the fact that their espresso is acceptable and doesn't taste half as bad as the regular coffee bean." diff --git a/test/data_for_tests/io/yelp_review_full/train.csv b/test/data_for_tests/io/yelp_review_full/train.csv new file mode 100755 index 00000000..032d423a --- /dev/null +++ b/test/data_for_tests/io/yelp_review_full/train.csv @@ -0,0 +1,6 @@ +"5","dr. goldberg offers everything i look for in a general practitioner. he's nice and easy to talk to without being patronizing; he's always on time in seeing his patients; he's affiliated with a top-notch hospital (nyu) which my parents have explained to me is very important in case something happens and you need surgery; and you can get referrals to see specialists without having to see him first. really, what more do you need? i'm sitting here trying to think of any complaints i have about him, but i'm really drawing a blank." +"2","Unfortunately, the frustration of being Dr. Goldberg's patient is a repeat of the experience I've had with so many other doctors in NYC -- good doctor, terrible staff. It seems that his staff simply never answers the phone. It usually takes 2 hours of repeated calling to get an answer. Who has time for that or wants to deal with it? I have run into this problem with many other doctors and I just don't get it. You have office workers, you have patients with medical needs, why isn't anyone answering the phone? It's incomprehensible and not work the aggravation. It's with regret that I feel that I have to give Dr. Goldberg 2 stars." +"4","Been going to Dr. Goldberg for over 10 years. I think I was one of his 1st patients when he started at MHMG. He's been great over the years and is really all about the big picture. It is because of him, not my now former gyn Dr. Markoff, that I found out I have fibroids. He explores all options with you and is very patient and understanding. He doesn't judge and asks all the right questions. Very thorough and wants to be kept in the loop on every aspect of your medical health and your life." 
+"3","Got a letter in the mail last week that said Dr. Goldberg is moving to Arizona to take a new position there in June. He will be missed very much. \n\nI think finding a new doctor in NYC that you actually like might almost be as awful as trying to find a date!" +"1","I don't know what Dr. Goldberg was like before moving to Arizona, but let me tell you, STAY AWAY from this doctor and this office. I was going to Dr. Johnson before he left and Goldberg took over when Johnson left. He is not a caring doctor. He is only interested in the co-pay and having you come in for medication refills every month. He will not give refills and could less about patients's financial situations. Trying to get your 90 days mail away pharmacy prescriptions through this guy is a joke. And to make matters even worse, his office staff is incompetent. 90% of the time when you call the office, they'll put you through to a voice mail, that NO ONE ever answers or returns your call. Both my adult children and husband have decided to leave this practice after experiencing such frustration. The entire office has an attitude like they are doing you a favor. Give me a break! Stay away from this doc and the practice. You deserve better and they will not be there when you really need them. I have never felt compelled to write a bad review about anyone until I met this pathetic excuse for a doctor who is all about the money." +"5","Top notch doctor in a top notch practice. Can't say I am surprised when I was referred to him by another doctor who I think is wonderful and because he went to one of the best medical schools in the country. \nIt is really easy to get an appointment. There is minimal wait to be seen and his bedside manner is great." diff --git a/test/data_for_tests/io/yelp_review_polarity/dev.csv b/test/data_for_tests/io/yelp_review_polarity/dev.csv new file mode 100755 index 00000000..09228213 --- /dev/null +++ b/test/data_for_tests/io/yelp_review_polarity/dev.csv @@ -0,0 +1,6 @@ +"1","Hoofah." +"1","Two meals, on the recommendation of a friend who lives near the place, and after the second trip, I was compelled to write. 'Rocky' would definitely describe the experiences.\n\nOn the first trip, I went to try their (at that time)raved about Reuben. And YET to find a true good Reuben in da burgh, I tried it.\n\nWell, they were out of the proper bread, and the guy had to run to the store to buy the closest thing he could find, which was not the proper bread, and instead of one of their 'raved about' Reubens, I received two mini-Reubens, which basically took the guts from one Reuben, and spread it out onto two sandwiches on regular sized bread. I ate it. It wasn't great, but they swore it was because they'd run out of the bread. Bread or not, it still wasn't great. The atmosphere was pleasant in that 'blue collar bar' kind of way, and the staff was very nice, but not a winning pitch on the Reuben.\n\nThe second trip was after a long day of moving furniture with the same friend. Sat in the back room, instead of the bar, which felt more like a restaurant, of course, with the big screen TV covering the sports of the moment.\n\nI was in the mood for dinner this time, and after a scan, decided on fried chicken and mashed potatoes with the salad bar. My friend ordered one of her faves, the breaded pork chops.\n\nWe hit the salad bar, which was uber-basic. Three soups (mostly vegetable loaded, which left me out), basic iceberg lettuce mix (very probably out of a bag), a few veggie toppings, and three or four dressings. 
It was a basic salad, no big deal. More or less an appetizer filler before the meal.\n\nThe mind-blower in this trip was the ordering of the fried chicken dinner. Our waiter looked like a 19 year old gas station attendant, skinny little blonde guy with a sweet but incredibly naive face, and an air of vapidity, which was confirmed when I placed my order. I asked what chicken pieces came in the dinner, and asked if it was possible to only get dark meat. I never imagined how confusing a question that could possibly be. It literally took him two trips back to the kitchen to 'ask', and the child honestly had no clue what 'white meat' and 'dark meat' meant. The first answer he came back with was that the chicken came in a pre-portioned prepared bag, kind of Kentucky Fried Chicken style...which didn't answer my question, thus prompting the second trip. \n\nAfter the second trip back I heard the cook holler 'Tell him I'll fix him up'. \n\nWell, the chicken was prepackaged dreck like you'd find in the freezer case of Walmart, tiny and not good, and the potatoes had that slight tinge of chem-spuds flavor, laden with some kind of chopped up green (parsley?), and a side of that basic brown gravy served up in 5 gallon buckets.\n\nThank goodness for the basic salad bar.\n\nEven my friend admitted that her pork chops were different and not what she'd expected. They also appeared to be from a freezer bag.\n\nThe irony was that the boy who didn't know white meat from dark meat, was chatting with some other customers...about baseball...and he was a genius about the mindless sport of baseball. Ahhhh da burgh.\n\nThird base? Nah...why bother when there are so many other options around. Go on in a grab a beer and chat black and gold if you happen to be in Carnegie...they can help you out all types of ways in that area. Just don't go hungry if you actually have tastebuds.\n\nFrom what I understand it 'used to be' really good homecooked food. But apparently, mama has left the kitchen." +"2","I've lived in Pittsburgh for 6 years, and in Carnegie for over 2 years, and by far, this is the best greasy spoon joint I've found. If you can stomach the wait (no reservations, naturally), you'll enjoy overflowing plates of goodness, thanks to the well-seasoned griddle where all of the food is made. \n\nHere are the highlights:\n\n-Cheap: Breakfast for two can be well under $10, with lunch around the same.\n-Crowded: Get there early and expect to wait. They close pretty early on the weekends too (oddly, at 12:45pm)\n-Cash only\n-Huge portions: When ordering fries or homefries, always get the half order, unless you're a lumberjack\n-About those homefries: They're often undercooked. I've had better, believe me. My favorite things to eat in life are potato products.\n-My favorite item: hot sausage sandwich on thick Italian toast, with cheese, lettuce, tomato and mayo" +"2","Classic breakfast joint. Grimy looking hole in the wall located on one end of a seedy looking strip mall. Window is opaque due to the grease so you can't hardly see inside. On the outside, there are about a dozen people waiting to get in. When you finally do get inside, you see that there are 15 tables and a counter, all occupied by people from all walks of life.\n\nWhat's the attraction behind this flea hole? The FOOD! Lots of it and dirt cheap. I sat at a vacant stool behind the formica counter and ordered the mixed grill. Potatoes, eggs, sausage, bacon and Italian toast. A giant mound of food guaranteed to sooth any hangover. 
I swear the full mixed grill had two pounds of food. Neat thing is that the grill is right in front of you so you can see your potatoes and eggs frying in a pool of fresh grease. All that food, plus coffee and tip for around ten bucks. Cash only, so put that plastic away.\n\nOnly bad thing that could happen is some douche bag from the Food Network or Travel Channel will make this place famous, and then I'll never be able to get in." +"1","Some of the worst pizza I've ever had. We used a coupon from the paper for a 2 topping 8 cut Sicilian. First of all the pizza wasn't even cut through, and the sad attempt at cutting was so uneven that 4 of the slices were about an inch wide, while the others were about 4\"" each. The toppings were scarce, they used mini pepperoni and put maybe 8 on the whole pizza. The onions were huge chunks and the mushrooms were straight from a can. The worst part though was the thick doughy crust that tasted more like a fishy sourdough roll. I'm serious... It was so noticeable that it made me wonder if the dough was bad or if they for some weird reason put fish sauce in it. It was gross. \n\nWe also ordered steak and Italian hoagies. The veggies were old and wilted, and there was no dressing on either. The Italian had deli meat that was clearly bottom of the line and not very generous. The \""steak\"" (if you an call it that) was greyish instead of brown and looked like it was a processed meat chopped into pieces. No flavor or seasoning and the texture was reminiscent of spam. It was so bad that I only ate 1/4 of it and tossed the rest. \n\nI have ordered from here in the past and always been disappointed. I thought I would give them another try since I'd never ordered a Sicilian pizza from there. What a mistake. I will never order from them again!" +"1","Terrible service. Food unremarkable. Waiter disappeared for 45 minutes to serve larger group due to staffing mismanagement. Saved his tip by discounting meal after I complained. All and all, a very crude and unpleasant dining experience for me and my guests. Not to be repeated, never again!" diff --git a/test/data_for_tests/io/yelp_review_polarity/test.csv b/test/data_for_tests/io/yelp_review_polarity/test.csv new file mode 100755 index 00000000..95ac34f3 --- /dev/null +++ b/test/data_for_tests/io/yelp_review_polarity/test.csv @@ -0,0 +1,6 @@ +"2","Contrary to other reviews, I have zero complaints about the service or the prices. I have been getting tire service here for the past 5 years now, and compared to my experience with places like Pep Boys, these guys are experienced and know what they're doing. \nAlso, this is one place that I do not feel like I am being taken advantage of, just because of my gender. Other auto mechanics have been notorious for capitalizing on my ignorance of cars, and have sucked my bank account dry. But here, my service and road coverage has all been well explained - and let up to me to decide. \nAnd they just renovated the waiting room. It looks a lot better than it did in previous years." +"1","Last summer I had an appointment to get new tires and had to wait a super long time. I also went in this week for them to fix a minor problem with a tire they put on. They \""fixed\"" it for free, and the very next morning I had the same issue. I called to complain, and the \""manager\"" didn't even apologize!!! So frustrated. Never going back. They seem overpriced, too." +"2","Friendly staff, same starbucks fair you get anywhere else. Sometimes the lines can get long." +"1","The food is good. 
Unfortunately the service is very hit or miss. The main issue seems to be with the kitchen, the waiters and waitresses are often very apologetic for the long waits and it's pretty obvious that some of them avoid the tables after taking the initial order to avoid hearing complaints." +"2","Even when we didn't have a car Filene's Basement was worth the bus trip to the Waterfront. I always find something (usually I find 3-4 things and spend about $60) and better still, I am always still wearing the clothes and shoes 3 months later. \n\nI kind of suspect this is the best shopping in Pittsburgh; it's much better than the usual department stores, better than Marshall's and TJ Maxx and better than the Saks downtown, even when it has a sale. Selection, bargains AND quality.\n\nI like this Filene's better than Gabriel Brothers, which are harder to get to. Gabriel Brothers are a real discount shopper's challenge and I'm afraid I didn't live in Pittsburgh long enough to develop the necessary skills . . . Filene's was still up and running in June 2007 when I left town." +"2","Picture Billy Joel's \""Piano Man\"" DOUBLED mixed with beer, a rowdy crowd, and comedy - Welcome to Sing Sing! A unique musical experience found in Homestead.\n\nIf you're looking to grab a bite to eat or a beer, come on in! Serving food and brews from Rock Bottom Brewery, Sing Sing keeps your tummy full while you listen to two (or more) amazingly talented pianists take your musical requests. They'll play anything you'd like, for tips of course. Wanting to hear Britney Spears? Toto? Duran Duran? Yep, they play that... new or old.\n\nThe crowd makes the show, so make sure you come ready for a good time. If the crowd is dead, it's harder for the Guys to get a reaction. If you're wanting to have some fun, it can be a GREAT time! It's the perfect place for Birthday parties - especially if you want to embarrass a friend. The guys will bring them up to the pianos and perform a little ditty. For being a good sport, you get the coveted Sing Sing bumper sticker. Now who wouldn't want that?\n\nDueling Pianos and brews... time to Shut Up & Sing Sing!" diff --git a/test/data_for_tests/io/yelp_review_polarity/train.csv b/test/data_for_tests/io/yelp_review_polarity/train.csv new file mode 100755 index 00000000..6b72a7d6 --- /dev/null +++ b/test/data_for_tests/io/yelp_review_polarity/train.csv @@ -0,0 +1,6 @@ +"1","Unfortunately, the frustration of being Dr. Goldberg's patient is a repeat of the experience I've had with so many other doctors in NYC -- good doctor, terrible staff. It seems that his staff simply never answers the phone. It usually takes 2 hours of repeated calling to get an answer. Who has time for that or wants to deal with it? I have run into this problem with many other doctors and I just don't get it. You have office workers, you have patients with medical needs, why isn't anyone answering the phone? It's incomprehensible and not work the aggravation. It's with regret that I feel that I have to give Dr. Goldberg 2 stars." +"2","Been going to Dr. Goldberg for over 10 years. I think I was one of his 1st patients when he started at MHMG. He's been great over the years and is really all about the big picture. It is because of him, not my now former gyn Dr. Markoff, that I found out I have fibroids. He explores all options with you and is very patient and understanding. He doesn't judge and asks all the right questions. Very thorough and wants to be kept in the loop on every aspect of your medical health and your life." 
+"1","I don't know what Dr. Goldberg was like before moving to Arizona, but let me tell you, STAY AWAY from this doctor and this office. I was going to Dr. Johnson before he left and Goldberg took over when Johnson left. He is not a caring doctor. He is only interested in the co-pay and having you come in for medication refills every month. He will not give refills and could less about patients's financial situations. Trying to get your 90 days mail away pharmacy prescriptions through this guy is a joke. And to make matters even worse, his office staff is incompetent. 90% of the time when you call the office, they'll put you through to a voice mail, that NO ONE ever answers or returns your call. Both my adult children and husband have decided to leave this practice after experiencing such frustration. The entire office has an attitude like they are doing you a favor. Give me a break! Stay away from this doc and the practice. You deserve better and they will not be there when you really need them. I have never felt compelled to write a bad review about anyone until I met this pathetic excuse for a doctor who is all about the money." +"1","I'm writing this review to give you a heads up before you see this Doctor. The office staff and administration are very unprofessional. I left a message with multiple people regarding my bill, and no one ever called me back. I had to hound them to get an answer about my bill. \n\nSecond, and most important, make sure your insurance is going to cover Dr. Goldberg's visits and blood work. He recommended to me that I get a physical, and he knew I was a student because I told him. I got the physical done. Later, I found out my health insurance doesn't pay for preventative visits. I received an $800.00 bill for the blood work. I can't pay for my bill because I'm a student and don't have any cash flow at this current time. I can't believe the Doctor wouldn't give me a heads up to make sure my insurance would cover work that wasn't necessary and was strictly preventative. The office can't do anything to help me cover the bill. In addition, the office staff said the onus is on me to make sure my insurance covers visits. Frustrating situation!" +"2","All the food is great here. But the best thing they have is their wings. Their wings are simply fantastic!! The \""Wet Cajun\"" are by the best & most popular. I also like the seasoned salt wings. Wing Night is Monday & Wednesday night, $0.75 whole wings!\n\nThe dining area is nice. Very family friendly! The bar is very nice is well. This place is truly a Yinzer's dream!! \""Pittsburgh Dad\"" would love this place n'at!!" +"1","Wing sauce is like water. Pretty much a lot of butter and some hot sauce (franks red hot maybe). The whole wings are good size and crispy, but for $1 a wing the sauce could be better. The hot and extra hot are about the same flavor/heat. The fish sandwich is good and is a large portion, sides are decent." 
diff --git a/test/data_for_tests/sample_mnli.tsv b/test/data_for_tests/sample_mnli.tsv new file mode 100644 index 00000000..9a30b95b --- /dev/null +++ b/test/data_for_tests/sample_mnli.tsv @@ -0,0 +1,12 @@ +index promptID pairID genre sentence1_binary_parse sentence2_binary_parse sentence1_parse sentence2_parse sentence1 sentence2 label1 label2 label3 label4 label5 gold_label +0 63735 63735n slate ( ( The ( new rights ) ) ( are ( nice enough ) ) ) ( Everyone ( really ( likes ( the ( newest benefits ) ) ) ) ) (ROOT (S (NP (DT The) (JJ new) (NNS rights)) (VP (VBP are) (ADJP (JJ nice) (RB enough))))) (ROOT (S (NP (NN Everyone)) (VP (ADVP (RB really)) (VBZ likes) (NP (DT the) (JJS newest) (NNS benefits))))) The new rights are nice enough Everyone really likes the newest benefits neutral entailment neutral neutral neutral neutral +1 91383 91383c government ( ( This site ) ( ( includes ( ( ( ( a list ) ( of ( all ( award winners ) ) ) ) and ) ( ( a ( searchable database ) ) ( of ( Government ( Executive articles ) ) ) ) ) ) . ) ) ( ( ( The ( Government ( Executive articles ) ) ) ( housed ( on ( the website ) ) ) ) ( ( ( are not ) ( able ( to ( be searched ) ) ) ) . ) ) (ROOT (S (NP (DT This) (NN site)) (VP (VBZ includes) (NP (NP (NP (DT a) (NN list)) (PP (IN of) (NP (DT all) (NN award) (NNS winners)))) (CC and) (NP (NP (DT a) (JJ searchable) (NN database)) (PP (IN of) (NP (NNP Government) (NNP Executive) (NNS articles)))))) (. .))) (ROOT (S (NP (NP (DT The) (NNP Government) (NNP Executive) (NNS articles)) (VP (VBN housed) (PP (IN on) (NP (DT the) (NN website))))) (VP (VBP are) (RB not) (ADJP (JJ able) (S (VP (TO to) (VP (VB be) (ADJP (JJ searched))))))) (. .))) This site includes a list of all award winners and a searchable database of Government Executive articles. The Government Executive articles housed on the website are not able to be searched. contradiction contradiction contradiction contradiction contradiction contradiction +2 755 755e telephone ( ( ( ( uh ( i ( ( do n't ) ( know ( ( i i ) ( have ( ( mixed emotions ) ( about ( him ( ( uh sometimes ) ( i ( like him ) ) ) ) ) ) ) ) ) ) ) ) but ) ( ( at ( the ( same times ) ) ) ( i ( love ( to ( see somebody ) ) ) ) ) ) ( beat him ) ) ( I ( ( ( ( ( ( like him ) ( for ( the ( most part ) ) ) ) , ) but ) ( ( would still ) ( enjoy ( seeing ( someone ( beat him ) ) ) ) ) ) . ) ) (ROOT (SINV (S (S (INTJ (UH uh)) (NP (FW i)) (VP (VBP do) (RB n't) (VP (VB know) (NP (NP (FW i) (FW i)) (SBAR (S (VP (VBP have) (VP (VBN mixed) (NP (NNS emotions)) (PP (IN about) (S (NP (PRP him)) (VP (VBG uh) (ADVP (RB sometimes)) (NP (NP (FW i)) (PP (IN like) (NP (PRP him))))))))))))))) (CC but) (S (PP (IN at) (NP (DT the) (JJ same) (NNS times))) (NP (FW i)) (VP (VBP love) (S (VP (TO to) (VP (VB see) (NP (NN somebody)))))))) (VP (VBD beat)) (NP (PRP him)))) (ROOT (S (NP (PRP I)) (VP (VP (VBP like) (NP (PRP him)) (PP (IN for) (NP (DT the) (JJS most) (NN part)))) (, ,) (CC but) (VP (MD would) (ADVP (RB still)) (VP (VB enjoy) (S (VP (VBG seeing) (S (NP (NN someone)) (VP (VB beat) (NP (PRP him))))))))) (. .))) uh i don't know i i have mixed emotions about him uh sometimes i like him but at the same times i love to see somebody beat him I like him for the most part, but would still enjoy seeing someone beat him. 
entailment entailment entailment entailment entailment entailment +3 78013 78013c telephone ( yeah ( ( i i ) ( think ( ( my ( favorite restaurant ) ) ( ( is always ) ( been ( ( the ( one closest ) ) ( you ( ( know ( the closest ) ) ( ( as long ) ( as ( it ( 's ( it ( meets ( ( the ( minimum criteria ) ) ( you ( know ( of ( good food ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ( ( My ( favorite restaurants ) ) ( ( ( ( are always ) ( ( ( ( ( at least ) a ) hundred ) miles ) away ) ) ( from ( my house ) ) ) . ) ) (ROOT (S (VP (VB yeah) (NP (NP (FW i) (FW i)) (SBAR (S (VP (VBP think) (SBAR (S (NP (PRP$ my) (JJ favorite) (NN restaurant)) (VP (VBZ is) (ADVP (RB always)) (VP (VBN been) (NP (NP (DT the) (CD one) (JJS closest)) (SBAR (S (NP (PRP you)) (VP (VBP know) (NP (DT the) (JJS closest)) (ADVP (ADVP (RB as) (RB long)) (SBAR (IN as) (S (NP (PRP it)) (VP (VBZ 's) (SBAR (S (NP (PRP it)) (VP (VBZ meets) (NP (NP (DT the) (JJ minimum) (NNS criteria)) (SBAR (S (NP (PRP you)) (VP (VBP know) (PP (IN of) (NP (JJ good) (NN food))))))))))))))))))))))))))))) (ROOT (S (NP (PRP$ My) (JJ favorite) (NNS restaurants)) (VP (VBP are) (ADVP (RB always)) (ADVP (NP (QP (IN at) (JJS least) (DT a) (CD hundred)) (NNS miles)) (RB away)) (PP (IN from) (NP (PRP$ my) (NN house)))) (. .))) yeah i i think my favorite restaurant is always been the one closest you know the closest as long as it's it meets the minimum criteria you know of good food My favorite restaurants are always at least a hundred miles away from my house. contradiction contradiction contradiction contradiction contradiction contradiction +4 96377 96377c telephone ( i ( ( do n't ) ( know ( um ( do ( you ( do ( ( a lot ) ( of camping ) ) ) ) ) ) ) ) ) ( I ( ( know exactly ) . ) ) (ROOT (S (NP (FW i)) (VP (VBP do) (RB n't) (VP (VB know) (SBAR (S (NP (NN um)) (VP (VBP do) (SBAR (S (NP (PRP you)) (VP (VBP do) (NP (NP (DT a) (NN lot)) (PP (IN of) (NP (NN camping)))))))))))))) (ROOT (S (NP (PRP I)) (VP (VBP know) (ADVP (RB exactly))) (. .))) i don't know um do you do a lot of camping I know exactly. contradiction contradiction contradiction contradiction contradiction contradiction +5 139749 139749c telephone ( well ( that ( would ( be ( ( a help ) ( i ( wish ( they ( would ( do ( that ( ( ( here ( we ( have ( got ( so ( ( little ( landfill space ) ) ( left ( that ( we ( 're ( going ( to ( ( run out ) ( before ( ( the end ) ( of ( this decade ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) and ) ( it ( ( 's really ) ( going ( to be ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ( We ( ( have ( plenty ( of ( space ( in ( the landfill ) ) ) ) ) ) . ) ) (ROOT (FRAG (ADVP (RB well)) (SBAR (WHNP (WDT that)) (S (VP (MD would) (VP (VB be) (NP (NP (DT a) (NN help)) (SBAR (S (NP (FW i)) (VP (VBP wish) (SBAR (S (NP (PRP they)) (VP (MD would) (VP (VB do) (SBAR (IN that) (S (S (ADVP (RB here)) (NP (PRP we)) (VP (VBP have) (VP (VBN got) (SBAR (IN so) (S (NP (JJ little) (NN landfill) (NN space)) (VP (VBD left) (SBAR (IN that) (S (NP (PRP we)) (VP (VBP 're) (VP (VBG going) (S (VP (TO to) (VP (VB run) (PRT (RP out)) (PP (IN before) (NP (NP (DT the) (NN end)) (PP (IN of) (NP (DT this) (NN decade)))))))))))))))))) (CC and) (S (NP (PRP it)) (VP (VBZ 's) (ADVP (RB really)) (VP (VBG going) (S (VP (TO to) (VP (VB be))))))))))))))))))))))) (ROOT (S (NP (PRP We)) (VP (VBP have) (NP (NP (RB plenty)) (PP (IN of) (NP (NP (NN space)) (PP (IN in) (NP (DT the) (NN landfill))))))) (. 
.))) well that would be a help i wish they would do that here we have got so little landfill space left that we're going to run out before the end of this decade and it's really going to be We have plenty of space in the landfill. contradiction contradiction contradiction contradiction contradiction contradiction +6 101415 101415c telephone ( yeah ( ( ( i know ) and ) ( i ( did ( that ( ( ( all ( through college ) ) and ) ( it ( worked too ) ) ) ) ) ) ) ) ( I ( ( ( did ( that all ) ) ( through college ) ) ( but ( it ( never worked ) ) ) ) ) (ROOT (S (VP (VB yeah) (S (S (NP (FW i)) (VP (VBP know))) (CC and) (S (NP (FW i)) (VP (VBD did) (SBAR (IN that) (S (S (NP (DT all)) (PP (IN through) (NP (NN college)))) (CC and) (S (NP (PRP it)) (VP (VBD worked) (ADVP (RB too)))))))))))) (ROOT (S (NP (PRP I)) (VP (VBD did) (ADVP (IN that) (DT all)) (PP (IN through) (NP (NN college))) (SBAR (CC but) (S (NP (PRP it)) (ADVP (RB never)) (VP (VBD worked))))))) yeah i know and i did that all through college and it worked too I did that all through college but it never worked contradiction contradiction contradiction contradiction contradiction contradiction +7 93958 93958n travel ( ( ( ( ( Calcutta ( seems ( to ( be ( ( the ( only ( other ( production center ) ) ) ) ( ( having ( any pretensions ) ) ( to ( ( artistic creativity ) ( at all ) ) ) ) ) ) ) ) ) , ) but ) ( ironically ( you ( ( 're actually ) ( ( more ( likely ( to ( see ( ( the works ) ( of ( ( ( Satyajit Ray ) or ) ( ( Mrinal Sen ) ( shown ( in ( Europe ( or ( North America ) ) ) ) ) ) ) ) ) ) ) ) ) ( than ( in ( India itself ) ) ) ) ) ) ) ) . ) ( ( Most ( of ( ( Mrinal ( Sen 's ) ) work ) ) ) ( ( can ( be ( found ( in ( European collections ) ) ) ) ) . ) ) (ROOT (S (S (NP (NNP Calcutta)) (VP (VBZ seems) (S (VP (TO to) (VP (VB be) (NP (NP (DT the) (JJ only) (JJ other) (NN production) (NN center)) (VP (VBG having) (NP (DT any) (NNS pretensions)) (PP (TO to) (NP (NP (JJ artistic) (NN creativity)) (ADVP (IN at) (DT all))))))))))) (, ,) (CC but) (S (ADVP (RB ironically)) (NP (PRP you)) (VP (VBP 're) (ADVP (RB actually)) (ADJP (ADJP (RBR more) (JJ likely) (S (VP (TO to) (VP (VB see) (NP (NP (DT the) (NNS works)) (PP (IN of) (NP (NP (NNP Satyajit) (NNP Ray)) (CC or) (NP (NP (NNP Mrinal) (NNP Sen)) (VP (VBN shown) (PP (IN in) (NP (NNP Europe) (CC or) (NNP North) (NNP America)))))))))))) (ADVP (IN than) (PP (IN in) (S (VP (VBG India) (NP (PRP itself))))))))) (. .))) (ROOT (S (NP (NP (JJS Most)) (PP (IN of) (NP (NP (NNP Mrinal) (NNP Sen) (POS 's)) (NN work)))) (VP (MD can) (VP (VB be) (VP (VBN found) (PP (IN in) (NP (JJ European) (NNS collections)))))) (. .))) Calcutta seems to be the only other production center having any pretensions to artistic creativity at all, but ironically you're actually more likely to see the works of Satyajit Ray or Mrinal Sen shown in Europe or North America than in India itself. Most of Mrinal Sen's work can be found in European collections. neutral neutral entailment neutral neutral neutral +8 12567 12567c slate ( ( If ( ( that investor ) ( were ( willing ( to ( pay ( extra ( for ( ( the security ) ( of ( limited downside ) ) ) ) ) ) ) ) ) ) ) ( , ( she ( ( could ( ( buy ( put options ) ) ( with ( ( a ( strike price ) ) ( of ( ( ( $ 98 ) , ) ( which ( would ( ( ( lock ( in ( ( her profit ) ( on ( ( the shares ) ( at ( $ 18 ) ) ) ) ) ) ) , ) ( less ( whatever ( ( the options ) cost ) ) ) ) ) ) ) ) ) ) ) ) . ) ) ) ) ( ( THe ( strike price ) ) ( ( could ( be ( $ 8 ) ) ) . 
) ) (ROOT (S (SBAR (IN If) (S (NP (DT that) (NN investor)) (VP (VBD were) (ADJP (JJ willing) (S (VP (TO to) (VP (VB pay) (NP (NP (JJ extra)) (PP (IN for) (NP (NP (DT the) (NN security)) (PP (IN of) (NP (JJ limited) (NN downside))))))))))))) (, ,) (NP (PRP she)) (VP (MD could) (VP (VB buy) (NP (NN put) (NNS options)) (PP (IN with) (NP (NP (DT a) (NN strike) (NN price)) (PP (IN of) (NP (NP ($ $) (CD 98)) (, ,) (SBAR (WHNP (WDT which)) (S (VP (MD would) (VP (VB lock) (PP (IN in) (NP (NP (PRP$ her) (NN profit)) (PP (IN on) (NP (NP (DT the) (NNS shares)) (PP (IN at) (NP ($ $) (CD 18))))))) (, ,) (ADVP (ADVP (RBR less)) (SBAR (WHNP (WDT whatever)) (S (NP (DT the) (NNS options)) (VP (VBD cost))))))))))))))) (. .))) (ROOT (S (NP (NNP THe) (NN strike) (NN price)) (VP (MD could) (VP (VB be) (NP ($ $) (CD 8)))) (. .))) If that investor were willing to pay extra for the security of limited downside, she could buy put options with a strike price of $98, which would lock in her profit on the shares at $18, less whatever the options cost. THe strike price could be $8. contradiction contradiction contradiction contradiction contradiction contradiction +9 117487 117487n slate ( ( 3 -RRB- ) ( ( Dare ( you ( ( ( rise ( to ( ( ( ( the occasion ) , ) ( like Raskolnikov ) ) , ) ) ) and ) ( reject ( ( the ( petty rules ) ) ( that ( govern ( lesser men ) ) ) ) ) ) ) ) ? ) ) ( ( ( Would you ) ( ( ( rise up ) and ) ( defeaat ( ( all ( evil lords ) ) ( in ( the town ) ) ) ) ) ) ? ) (ROOT (S (LST (LS 3) (-RRB- -RRB-)) (VP (VB Dare) (S (NP (PRP you)) (VP (VP (VB rise) (PP (TO to) (NP (NP (DT the) (NN occasion)) (, ,) (PP (IN like) (NP (NNP Raskolnikov))) (, ,)))) (CC and) (VP (VB reject) (NP (NP (DT the) (JJ petty) (NNS rules)) (SBAR (WHNP (WDT that)) (S (VP (VBP govern) (NP (JJR lesser) (NNS men)))))))))) (. ?))) (ROOT (SQ (MD Would) (NP (PRP you)) (VP (VP (VB rise) (PRT (RP up))) (CC and) (VP (VB defeaat) (NP (NP (DT all) (JJ evil) (NNS lords)) (PP (IN in) (NP (DT the) (NN town)))))) (. ?))) 3) Dare you rise to the occasion, like Raskolnikov, and reject the petty rules that govern lesser men? Would you rise up and defeaat all evil lords in the town? neutral neutral neutral neutral neutral neutral +10 9616 9616c travel ( ( The ( ( most important ) directions ) ) ( ( ( are ( simply ( ( up and ) up ) ) ) ( ( ( ( ( ( ( ( leads eventually ) ( to ( the cathedral ) ) ) and ) ( fortress ( commanding ( the hilltop ) ) ) ) , ) and ) down ) ( inevitably ( ( leads ( to ( one ( of ( three gates ) ) ) ) ) ( through ( ( the wall ) ( to ( the ( new town ) ) ) ) ) ) ) ) ) . ) ) ( Go ( ( downwards ( to ( one ( of ( ( ( the gates ) , ) ( ( all ( of which ) ) ( will ( ( lead you ) ( into ( the cathedral ) ) ) ) ) ) ) ) ) ) . ) ) (ROOT (S (NP (DT The) (ADJP (RBS most) (JJ important)) (NNS directions)) (VP (VBP are) (PRN (ADVP (RB simply)) (ADVP (RB up) (CC and) (RB up))) (VP (VP (VBZ leads) (ADVP (RB eventually)) (PP (TO to) (NP (DT the) (NN cathedral)))) (CC and) (VP (VBZ fortress) (NP (JJ commanding) (DT the) (NN hilltop))) (, ,) (CC and) (ADVP (RB down)) (VP (ADVP (RB inevitably)) (VBZ leads) (PP (TO to) (NP (NP (CD one)) (PP (IN of) (NP (CD three) (NNS gates))))) (PP (IN through) (NP (NP (DT the) (NN wall)) (PP (TO to) (NP (DT the) (JJ new) (NN town)))))))) (. 
.))) (ROOT (S (NP (NNP Go)) (VP (VBZ downwards) (PP (TO to) (NP (NP (CD one)) (PP (IN of) (NP (NP (DT the) (NNS gates)) (, ,) (SBAR (WHNP (DT all) (WHPP (IN of) (WHNP (WDT which)))) (S (VP (MD will) (VP (VB lead) (NP (PRP you)) (PP (IN into) (NP (DT the) (NN cathedral)))))))))))) (. .))) The most important directions are simply up and up leads eventually to the cathedral and fortress commanding the hilltop, and down inevitably leads to one of three gates through the wall to the new town. Go downwards to one of the gates, all of which will lead you into the cathedral. contradiction contradiction entailment contradiction contradiction contradiction diff --git a/reproduction/coreference_resolution/test/__init__.py b/test/embeddings/__init__.py similarity index 100% rename from reproduction/coreference_resolution/test/__init__.py rename to test/embeddings/__init__.py diff --git a/test/embeddings/test_bert_embedding.py b/test/embeddings/test_bert_embedding.py new file mode 100644 index 00000000..2a8550c3 --- /dev/null +++ b/test/embeddings/test_bert_embedding.py @@ -0,0 +1,48 @@ +import unittest +from fastNLP import Vocabulary +from fastNLP.embeddings import BertEmbedding, BertWordPieceEncoder +import torch +import os + +@unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") +class TestDownload(unittest.TestCase): + def test_download(self): + # import os + vocab = Vocabulary().add_word_lst("This is a test .".split()) + embed = BertEmbedding(vocab, model_dir_or_name='en') + words = torch.LongTensor([[2, 3, 4, 0]]) + print(embed(words).size()) + + for pool_method in ['first', 'last', 'max', 'avg']: + for include_cls_sep in [True, False]: + embed = BertEmbedding(vocab, model_dir_or_name='en', pool_method=pool_method, + include_cls_sep=include_cls_sep) + print(embed(words).size()) + + def test_word_drop(self): + vocab = Vocabulary().add_word_lst("This is a test .".split()) + embed = BertEmbedding(vocab, model_dir_or_name='en', dropout=0.1, word_dropout=0.2) + for i in range(10): + words = torch.LongTensor([[2, 3, 4, 0]]) + print(embed(words).size()) + + +class TestBertEmbedding(unittest.TestCase): + def test_bert_embedding_1(self): + vocab = Vocabulary().add_word_lst("this is a test . [SEP]".split()) + embed = BertEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_bert', word_dropout=0.1) + requires_grad = embed.requires_grad + embed.requires_grad = not requires_grad + embed.train() + words = torch.LongTensor([[2, 3, 4, 0]]) + result = embed(words) + self.assertEqual(result.size(), (1, 4, 16)) + + +class TestBertWordPieceEncoder(unittest.TestCase): + def test_bert_word_piece_encoder(self): + embed = BertWordPieceEncoder(model_dir_or_name='test/data_for_tests/embedding/small_bert', word_dropout=0.1) + from fastNLP import DataSet + ds = DataSet({'words': ["this is a test . 
[SEP]".split()]}) + embed.index_datasets(ds, field_name='words') + self.assertTrue(ds.has_field('word_pieces')) diff --git a/test/embeddings/test_elmo_embedding.py b/test/embeddings/test_elmo_embedding.py new file mode 100644 index 00000000..ed6910b4 --- /dev/null +++ b/test/embeddings/test_elmo_embedding.py @@ -0,0 +1,37 @@ + +import unittest +from fastNLP import Vocabulary +from fastNLP.embeddings import ElmoEmbedding +import torch +import os + +@unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") +class TestDownload(unittest.TestCase): + def test_download_small(self): + # import os + vocab = Vocabulary().add_word_lst("This is a test .".split()) + elmo_embed = ElmoEmbedding(vocab, model_dir_or_name='en-small') + words = torch.LongTensor([[0, 1, 2]]) + print(elmo_embed(words).size()) + + +# 首先保证所有权重可以加载;上传权重;验证可以下载 + + +class TestRunElmo(unittest.TestCase): + def test_elmo_embedding(self): + vocab = Vocabulary().add_word_lst("This is a test .".split()) + elmo_embed = ElmoEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_elmo', layers='0,1') + words = torch.LongTensor([[0, 1, 2]]) + hidden = elmo_embed(words) + print(hidden.size()) + self.assertEqual(hidden.size(), (1, 3, elmo_embed.embedding_dim)) + + def test_elmo_embedding_layer_assertion(self): + vocab = Vocabulary().add_word_lst("This is a test .".split()) + try: + elmo_embed = ElmoEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_elmo', + layers='0,1,2') + except AssertionError as e: + print(e) + diff --git a/test/embeddings/test_static_embedding.py b/test/embeddings/test_static_embedding.py index 0c8fc739..7d1e8302 100644 --- a/test/embeddings/test_static_embedding.py +++ b/test/embeddings/test_static_embedding.py @@ -3,13 +3,140 @@ import unittest from fastNLP.embeddings import StaticEmbedding from fastNLP import Vocabulary import torch +import os + + +class TestLoad(unittest.TestCase): + def test_norm1(self): + # 测试只对可以找到的norm + vocab = Vocabulary().add_word_lst(['the', 'a', 'notinfile']) + embed = StaticEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_static_embedding/' + 'glove.6B.50d_test.txt', + only_norm_found_vector=True) + self.assertEqual(round(torch.norm(embed(torch.LongTensor([[2]]))).item(), 4), 1) + self.assertNotEqual(torch.norm(embed(torch.LongTensor([[4]]))).item(), 1) + + def test_norm2(self): + # 测试对所有都norm + vocab = Vocabulary().add_word_lst(['the', 'a', 'notinfile']) + embed = StaticEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_static_embedding/' + 'glove.6B.50d_test.txt', + normalize=True) + self.assertEqual(round(torch.norm(embed(torch.LongTensor([[2]]))).item(), 4), 1) + self.assertEqual(round(torch.norm(embed(torch.LongTensor([[4]]))).item(), 4), 1) + + def test_dropword(self): + # 测试是否可以通过drop word + vocab = Vocabulary().add_word_lst([chr(i) for i in range(1, 200)]) + embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=10, dropout=0.1, word_dropout=0.4) + for i in range(10): + length = torch.randint(1, 50, (1,)).item() + batch = torch.randint(1, 4, (1,)).item() + words = torch.randint(1, 200, (batch, length)).long() + embed(words) class TestRandomSameEntry(unittest.TestCase): def test_same_vector(self): - vocab = Vocabulary().add_word_lst(["The", "the", "THE"]) + vocab = Vocabulary().add_word_lst(["The", "the", "THE", 'a', "A"]) embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5, lower=True) - words = torch.LongTensor([[vocab.to_index(word) for word in ["The", "the", 
"THE"]]]) + words = torch.LongTensor([[vocab.to_index(word) for word in ["The", "the", "THE", 'a', 'A']]]) words = embed(words) embed_0 = words[0, 0] - for i in range(1, words.size(1)): + for i in range(1, 3): assert torch.sum(embed_0==words[0, i]).eq(len(embed_0)) + embed_0 = words[0, 3] + for i in range(3, 5): + assert torch.sum(embed_0 == words[0, i]).eq(len(embed_0)) + + @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") + def test_same_vector2(self): + vocab = Vocabulary().add_word_lst(["The", 'a', 'b', "the", "THE", "B", 'a', "A"]) + embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6B-100d', + lower=True) + words = torch.LongTensor([[vocab.to_index(word) for word in ["The", "the", "THE", 'b', "B", 'a', 'A']]]) + words = embed(words) + embed_0 = words[0, 0] + for i in range(1, 3): + assert torch.sum(embed_0==words[0, i]).eq(len(embed_0)) + embed_0 = words[0, 3] + for i in range(3, 5): + assert torch.sum(embed_0 == words[0, i]).eq(len(embed_0)) + + @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") + def test_same_vector3(self): + # 验证lower + word_lst = ["The", "the"] + no_create_word_lst = ['of', 'Of', 'With', 'with'] + vocab = Vocabulary().add_word_lst(word_lst) + vocab.add_word_lst(no_create_word_lst, no_create_entry=True) + embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6B-100d', + lower=True) + words = torch.LongTensor([[vocab.to_index(word) for word in word_lst+no_create_word_lst]]) + words = embed(words) + + lowered_word_lst = [word.lower() for word in word_lst] + lowered_no_create_word_lst = [word.lower() for word in no_create_word_lst] + lowered_vocab = Vocabulary().add_word_lst(lowered_word_lst) + lowered_vocab.add_word_lst(lowered_no_create_word_lst, no_create_entry=True) + lowered_embed = StaticEmbedding(lowered_vocab, model_dir_or_name='en-glove-6B-100d', + lower=False) + lowered_words = torch.LongTensor([[lowered_vocab.to_index(word) for word in lowered_word_lst+lowered_no_create_word_lst]]) + lowered_words = lowered_embed(lowered_words) + + all_words = word_lst + no_create_word_lst + + for idx, (word_i, word_j) in enumerate(zip(words[0], lowered_words[0])): + with self.subTest(idx=idx, word=all_words[idx]): + assert torch.sum(word_i == word_j).eq(lowered_embed.embed_size) + + @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") + def test_same_vector4(self): + # 验证在有min_freq下的lower + word_lst = ["The", "the", "the", "The", "a", "A"] + no_create_word_lst = ['of', 'Of', "Of", "of", 'With', 'with'] + all_words = word_lst[:-2] + no_create_word_lst[:-2] + vocab = Vocabulary(min_freq=2).add_word_lst(word_lst) + vocab.add_word_lst(no_create_word_lst, no_create_entry=True) + embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6B-100d', + lower=True) + words = torch.LongTensor([[vocab.to_index(word) for word in all_words]]) + words = embed(words) + + lowered_word_lst = [word.lower() for word in word_lst] + lowered_no_create_word_lst = [word.lower() for word in no_create_word_lst] + lowered_vocab = Vocabulary().add_word_lst(lowered_word_lst) + lowered_vocab.add_word_lst(lowered_no_create_word_lst, no_create_entry=True) + lowered_embed = StaticEmbedding(lowered_vocab, model_dir_or_name='en-glove-6B-100d', + lower=False) + lowered_words = torch.LongTensor([[lowered_vocab.to_index(word.lower()) for word in all_words]]) + lowered_words = lowered_embed(lowered_words) + + for idx in range(len(all_words)): + word_i, word_j = words[0, idx], lowered_words[0, idx] + with self.subTest(idx=idx, word=all_words[idx]): + assert torch.sum(word_i 
== word_j).eq(lowered_embed.embed_size) + + @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") + def test_same_vector5(self): + # check that word vectors stay the same after applying min_freq + word_lst = ["they", "the", "they", "the", 'he', 'he', "a", "A"] + no_create_word_lst = ['of', "of", "she", "she", 'With', 'with'] + all_words = word_lst[:-2] + no_create_word_lst[:-2] + vocab = Vocabulary().add_word_lst(word_lst) + vocab.add_word_lst(no_create_word_lst, no_create_entry=True) + embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6B-100d', + lower=False, min_freq=2) + words = torch.LongTensor([[vocab.to_index(word) for word in all_words]]) + words = embed(words) + + min_freq_vocab = Vocabulary(min_freq=2).add_word_lst(word_lst) + min_freq_vocab.add_word_lst(no_create_word_lst, no_create_entry=True) + min_freq_embed = StaticEmbedding(min_freq_vocab, model_dir_or_name='en-glove-6B-100d', + lower=False) + min_freq_words = torch.LongTensor([[min_freq_vocab.to_index(word.lower()) for word in all_words]]) + min_freq_words = min_freq_embed(min_freq_words) + + for idx in range(len(all_words)): + word_i, word_j = words[0, idx], min_freq_words[0, idx] + with self.subTest(idx=idx, word=all_words[idx]): + assert torch.sum(word_i == word_j).eq(min_freq_embed.embed_size) \ No newline at end of file diff --git a/reproduction/seqence_labelling/chinese_ner/data/__init__.py b/test/io/__init__.py similarity index 100% rename from reproduction/seqence_labelling/chinese_ner/data/__init__.py rename to test/io/__init__.py diff --git a/test/io/loader/test_classification_loader.py b/test/io/loader/test_classification_loader.py new file mode 100644 index 00000000..f4ecd47d --- /dev/null +++ b/test/io/loader/test_classification_loader.py @@ -0,0 +1,49 @@ + +import unittest + +import os + +from fastNLP.io import DataBundle +from fastNLP.io.loader.classification import YelpFullLoader, YelpPolarityLoader, IMDBLoader, \ + SSTLoader, SST2Loader, ChnSentiCorpLoader, THUCNewsLoader, WeiboSenti100kLoader + + +@unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") +class TestDownload(unittest.TestCase): + def test_download(self): + for loader in [YelpFullLoader, YelpPolarityLoader, IMDBLoader, SST2Loader, SSTLoader, ChnSentiCorpLoader]: + loader().download() + + def test_load(self): + for loader in [YelpFullLoader, YelpPolarityLoader, IMDBLoader, SST2Loader, SSTLoader, ChnSentiCorpLoader]: + data_bundle = loader().load() + print(data_bundle) + + +class TestLoad(unittest.TestCase): + def test_process_from_file(self): + data_set_dict = { + 'yelp.p': ('test/data_for_tests/io/yelp_review_polarity', YelpPolarityLoader, (6, 6, 6), False), + 'yelp.f': ('test/data_for_tests/io/yelp_review_full', YelpFullLoader, (6, 6, 6), False), + 'sst-2': ('test/data_for_tests/io/SST-2', SST2Loader, (5, 5, 5), True), + 'sst': ('test/data_for_tests/io/SST', SSTLoader, (6, 6, 6), False), + 'imdb': ('test/data_for_tests/io/imdb', IMDBLoader, (6, 6, 6), False), + 'ChnSentiCorp': ('test/data_for_tests/io/ChnSentiCorp', ChnSentiCorpLoader, (6, 6, 6), False), + 'THUCNews': ('test/data_for_tests/io/THUCNews', THUCNewsLoader, (9, 9, 9), False), + 'WeiboSenti100k': ('test/data_for_tests/io/WeiboSenti100k', WeiboSenti100kLoader, (7, 6, 6), False), + } + for k, v in data_set_dict.items(): + path, loader, data_set, warns = v + with self.subTest(loader=loader): + if warns: + with self.assertWarns(Warning): + data_bundle = loader().load(path) + else: + data_bundle = loader().load(path) + + self.assertTrue(isinstance(data_bundle, DataBundle)) + 
self.assertEqual(len(data_set), data_bundle.num_dataset) + for x, y in zip(data_set, data_bundle.iter_datasets()): + name, dataset = y + self.assertEqual(x, len(dataset)) + diff --git a/test/io/loader/test_conll_loader.py b/test/io/loader/test_conll_loader.py new file mode 100644 index 00000000..6668cccf --- /dev/null +++ b/test/io/loader/test_conll_loader.py @@ -0,0 +1,37 @@ + +import unittest +import os +from fastNLP.io.loader.conll import MsraNERLoader, PeopleDailyNERLoader, WeiboNERLoader, \ + Conll2003Loader + + +class TestMSRANER(unittest.TestCase): + @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") + def test_download(self): + MsraNERLoader().download(re_download=False) + data_bundle = MsraNERLoader().load() + print(data_bundle) + + +class TestPeopleDaily(unittest.TestCase): + @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") + def test_download(self): + PeopleDailyNERLoader().download() + + +class TestWeiboNER(unittest.TestCase): + @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") + def test_download(self): + WeiboNERLoader().download() + + +class TestConll2003Loader(unittest.TestCase): + def test_load(self): + Conll2003Loader()._load('test/data_for_tests/conll_2003_example.txt') + + +class TestConllLoader(unittest.TestCase): + def test_conll(self): + db = Conll2003Loader().load('test/data_for_tests/io/conll2003') + print(db) + diff --git a/test/io/loader/test_coreference_loader.py b/test/io/loader/test_coreference_loader.py new file mode 100644 index 00000000..02f3a1c5 --- /dev/null +++ b/test/io/loader/test_coreference_loader.py @@ -0,0 +1,26 @@ +from fastNLP.io.loader.coreference import CoReferenceLoader +import unittest + + +class TestCR(unittest.TestCase): + def test_load(self): + + test_root = "test/data_for_tests/io/coreference/" + train_path = test_root+"coreference_train.json" + dev_path = test_root+"coreference_dev.json" + test_path = test_root+"coreference_test.json" + paths = {"train": train_path, "dev": dev_path, "test": test_path} + + bundle1 = CoReferenceLoader().load(paths) + bundle2 = CoReferenceLoader().load(test_root) + print(bundle1) + print(bundle2) + + self.assertEqual(bundle1.num_dataset, 3) + self.assertEqual(bundle2.num_dataset, 3) + self.assertEqual(bundle1.num_vocab, 0) + self.assertEqual(bundle2.num_vocab, 0) + + self.assertEqual(len(bundle1.get_dataset('train')), 1) + self.assertEqual(len(bundle1.get_dataset('dev')), 1) + self.assertEqual(len(bundle1.get_dataset('test')), 1) diff --git a/test/io/loader/test_cws_loader.py b/test/io/loader/test_cws_loader.py new file mode 100644 index 00000000..80ca0406 --- /dev/null +++ b/test/io/loader/test_cws_loader.py @@ -0,0 +1,24 @@ +import unittest +import os +from fastNLP.io.loader import CWSLoader + + +class TestCWSLoader(unittest.TestCase): + @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") + def test_download(self): + dataset_names = ['pku', 'cityu', 'as', 'msra'] + for dataset_name in dataset_names: + with self.subTest(dataset_name=dataset_name): + data_bundle = CWSLoader(dataset_name=dataset_name).load() + print(data_bundle) + + +class TestRunCWSLoader(unittest.TestCase): + def test_cws_loader(self): + dataset_names = ['msra', 'cityu', 'as', 'msra'] + for dataset_name in dataset_names: + with self.subTest(dataset_name=dataset_name): + data_bundle = CWSLoader(dataset_name=dataset_name).load( + f'test/data_for_tests/io/cws_{dataset_name}' + ) + print(data_bundle) diff --git a/test/io/loader/test_matching_loader.py b/test/io/loader/test_matching_loader.py new file mode 
100644 index 00000000..70367f6d --- /dev/null +++ b/test/io/loader/test_matching_loader.py @@ -0,0 +1,50 @@ + +import unittest + +import os + +from fastNLP.io import DataBundle +from fastNLP.io.loader.matching import RTELoader, QNLILoader, SNLILoader, QuoraLoader, MNLILoader, \ + BQCorpusLoader, CNXNLILoader, LCQMCLoader + + +@unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") +class TestMatchingDownload(unittest.TestCase): + def test_download(self): + for loader in [RTELoader, QNLILoader, SNLILoader, MNLILoader]: + loader().download() + with self.assertRaises(Exception): + QuoraLoader().load() + + def test_load(self): + for loader in [RTELoader, QNLILoader, SNLILoader, MNLILoader]: + data_bundle = loader().load() + print(data_bundle) + + +class TestMatchingLoad(unittest.TestCase): + def test_load(self): + data_set_dict = { + 'RTE': ('test/data_for_tests/io/RTE', RTELoader, (5, 5, 5), True), + 'SNLI': ('test/data_for_tests/io/SNLI', SNLILoader, (5, 5, 5), False), + 'QNLI': ('test/data_for_tests/io/QNLI', QNLILoader, (5, 5, 5), True), + 'MNLI': ('test/data_for_tests/io/MNLI', MNLILoader, (5, 5, 5, 5, 6), True), + 'Quora': ('test/data_for_tests/io/Quora', QuoraLoader, (2, 2, 2), False), + 'BQCorpus': ('test/data_for_tests/io/BQCorpus', BQCorpusLoader, (5, 5, 5), False), + 'XNLI': ('test/data_for_tests/io/XNLI', CNXNLILoader, (6, 8, 6), False), + 'LCQMC': ('test/data_for_tests/io/LCQMC', LCQMCLoader, (5, 6, 6), False), + } + for k, v in data_set_dict.items(): + path, loader, instance, warns = v + if warns: + with self.assertWarns(Warning): + data_bundle = loader().load(path) + else: + data_bundle = loader().load(path) + + self.assertTrue(isinstance(data_bundle, DataBundle)) + self.assertEqual(len(instance), data_bundle.num_dataset) + for x, y in zip(instance, data_bundle.iter_datasets()): + name, dataset = y + self.assertEqual(x, len(dataset)) + diff --git a/test/io/loader/test_qa_loader.py b/test/io/loader/test_qa_loader.py new file mode 100644 index 00000000..eea067cd --- /dev/null +++ b/test/io/loader/test_qa_loader.py @@ -0,0 +1,14 @@ +import unittest + +from fastNLP.io.loader.qa import CMRC2018Loader + +class TestCMRC2018Loader(unittest.TestCase): + def test__load(self): + loader = CMRC2018Loader() + dataset = loader._load('test/data_for_tests/io/cmrc/train.json') + print(dataset) + + def test_load(self): + loader = CMRC2018Loader() + data_bundle = loader.load('test/data_for_tests/io/cmrc/') + print(data_bundle) diff --git a/test/io/pipe/test_classification.py b/test/io/pipe/test_classification.py new file mode 100644 index 00000000..036530c3 --- /dev/null +++ b/test/io/pipe/test_classification.py @@ -0,0 +1,69 @@ +import unittest +import os + +from fastNLP.io import DataBundle +from fastNLP.io.pipe.classification import SSTPipe, SST2Pipe, IMDBPipe, YelpFullPipe, YelpPolarityPipe +from fastNLP.io.pipe.classification import ChnSentiCorpPipe, THUCNewsPipe, WeiboSenti100kPipe + + +@unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") +class TestClassificationPipe(unittest.TestCase): + def test_process_from_file(self): + for pipe in [YelpPolarityPipe, SST2Pipe, IMDBPipe, YelpFullPipe, SSTPipe]: + with self.subTest(pipe=pipe): + print(pipe) + data_bundle = pipe(tokenizer='raw').process_from_file() + print(data_bundle) + + +class TestRunPipe(unittest.TestCase): + def test_load(self): + for pipe in [IMDBPipe]: + data_bundle = pipe(tokenizer='raw').process_from_file('test/data_for_tests/io/imdb') + print(data_bundle) + + +@unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") 
+class TestCNClassificationPipe(unittest.TestCase): + def test_process_from_file(self): + for pipe in [ChnSentiCorpPipe]: + with self.subTest(pipe=pipe): + data_bundle = pipe(bigrams=True, trigrams=True).process_from_file() + print(data_bundle) + + +class TestRunClassificationPipe(unittest.TestCase): + def test_process_from_file(self): + data_set_dict = { + 'yelp.p': ('test/data_for_tests/io/yelp_review_polarity', YelpPolarityPipe, (6, 6, 6), (1176, 2), False), + 'yelp.f': ('test/data_for_tests/io/yelp_review_full', YelpFullPipe, (6, 6, 6), (1023, 5), False), + 'sst-2': ('test/data_for_tests/io/SST-2', SST2Pipe, (5, 5, 5), (139, 2), True), + 'sst': ('test/data_for_tests/io/SST', SSTPipe, (6, 354, 6), (232, 5), False), + 'imdb': ('test/data_for_tests/io/imdb', IMDBPipe, (6, 6, 6), (1670, 2), False), + 'ChnSentiCorp': ('test/data_for_tests/io/ChnSentiCorp', ChnSentiCorpPipe, (6, 6, 6), (529, 1296, 1483, 2), False), + 'Chn-THUCNews': ('test/data_for_tests/io/THUCNews', THUCNewsPipe, (9, 9, 9), (1864, 9), False), + 'Chn-WeiboSenti100k': ('test/data_for_tests/io/WeiboSenti100k', WeiboSenti100kPipe, (7, 6, 6), (452, 2), False), + } + for k, v in data_set_dict.items(): + path, pipe, data_set, vocab, warns = v + with self.subTest(pipe=pipe): + if 'Chn' not in k: + if warns: + with self.assertWarns(Warning): + data_bundle = pipe(tokenizer='raw').process_from_file(path) + else: + data_bundle = pipe(tokenizer='raw').process_from_file(path) + else: + data_bundle = pipe(bigrams=True, trigrams=True).process_from_file(path) + + self.assertTrue(isinstance(data_bundle, DataBundle)) + self.assertEqual(len(data_set), data_bundle.num_dataset) + for x, y in zip(data_set, data_bundle.iter_datasets()): + name, dataset = y + self.assertEqual(x, len(dataset)) + + self.assertEqual(len(vocab), data_bundle.num_vocab) + for x, y in zip(vocab, data_bundle.iter_vocabs()): + name, vocabs = y + self.assertEqual(x, len(vocabs)) + diff --git a/test/io/pipe/test_conll.py b/test/io/pipe/test_conll.py new file mode 100644 index 00000000..ad41ae18 --- /dev/null +++ b/test/io/pipe/test_conll.py @@ -0,0 +1,52 @@ +import unittest +import os +from fastNLP.io import MsraNERPipe, PeopleDailyPipe, WeiboNERPipe, Conll2003Pipe, Conll2003NERPipe, \ + OntoNotesNERPipe + + +@unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") +class TestConllPipe(unittest.TestCase): + def test_process_from_file(self): + for pipe in [MsraNERPipe, PeopleDailyPipe, WeiboNERPipe]: + with self.subTest(pipe=pipe): + print(pipe) + data_bundle = pipe(bigrams=True, trigrams=True).process_from_file() + print(data_bundle) + data_bundle = pipe(encoding_type='bioes').process_from_file() + print(data_bundle) + + +class TestRunPipe(unittest.TestCase): + def test_conll2003(self): + for pipe in [Conll2003Pipe, Conll2003NERPipe]: + with self.subTest(pipe=pipe): + print(pipe) + data_bundle = pipe().process_from_file('test/data_for_tests/conll_2003_example.txt') + print(data_bundle) + + +class TestNERPipe(unittest.TestCase): + def test_process_from_file(self): + data_dict = { + 'weibo_NER': WeiboNERPipe, + 'peopledaily': PeopleDailyPipe, + 'MSRA_NER': MsraNERPipe, + } + for k, v in data_dict.items(): + pipe = v + with self.subTest(pipe=pipe): + data_bundle = pipe(bigrams=True, trigrams=True).process_from_file(f'test/data_for_tests/io/{k}') + print(data_bundle) + data_bundle = pipe(encoding_type='bioes').process_from_file(f'test/data_for_tests/io/{k}') + print(data_bundle) + + +class TestConll2003Pipe(unittest.TestCase): + def test_conll(self): + with 
self.assertWarns(Warning): + data_bundle = Conll2003Pipe().process_from_file('test/data_for_tests/io/conll2003') + print(data_bundle) + + def test_OntoNotes(self): + data_bundle = OntoNotesNERPipe().process_from_file('test/data_for_tests/io/OntoNotes') + print(data_bundle) diff --git a/test/io/pipe/test_coreference.py b/test/io/pipe/test_coreference.py new file mode 100644 index 00000000..3a492419 --- /dev/null +++ b/test/io/pipe/test_coreference.py @@ -0,0 +1,33 @@ +import unittest +from fastNLP.io.pipe.coreference import CoReferencePipe + + +class TestCR(unittest.TestCase): + + def test_load(self): + class Config(): + max_sentences = 50 + filter = [3, 4, 5] + char_path = None + config = Config() + + file_root_path = "test/data_for_tests/io/coreference/" + train_path = file_root_path + "coreference_train.json" + dev_path = file_root_path + "coreference_dev.json" + test_path = file_root_path + "coreference_test.json" + + paths = {"train": train_path, "dev": dev_path, "test": test_path} + + bundle1 = CoReferencePipe(config).process_from_file(paths) + bundle2 = CoReferencePipe(config).process_from_file(file_root_path) + print(bundle1) + print(bundle2) + self.assertEqual(bundle1.num_dataset, 3) + self.assertEqual(bundle2.num_dataset, 3) + self.assertEqual(bundle1.num_vocab, 1) + self.assertEqual(bundle2.num_vocab, 1) + + self.assertEqual(len(bundle1.get_dataset('train')), 1) + self.assertEqual(len(bundle1.get_dataset('dev')), 1) + self.assertEqual(len(bundle1.get_dataset('test')), 1) + self.assertEqual(len(bundle1.get_vocab('words1')), 84) diff --git a/test/io/pipe/test_cws.py b/test/io/pipe/test_cws.py new file mode 100644 index 00000000..09fce3f0 --- /dev/null +++ b/test/io/pipe/test_cws.py @@ -0,0 +1,24 @@ + +import unittest +import os +from fastNLP.io.pipe.cws import CWSPipe + + +class TestCWSPipe(unittest.TestCase): + @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") + def test_process_from_file(self): + dataset_names = ['pku', 'cityu', 'as', 'msra'] + for dataset_name in dataset_names: + with self.subTest(dataset_name=dataset_name): + data_bundle = CWSPipe(dataset_name=dataset_name).process_from_file() + print(data_bundle) + + +class TestRunCWSPipe(unittest.TestCase): + def test_process_from_file(self): + dataset_names = ['msra', 'cityu', 'as', 'pku'] + for dataset_name in dataset_names: + with self.subTest(dataset_name=dataset_name): + data_bundle = CWSPipe(bigrams=True, trigrams=True).\ + process_from_file(f'test/data_for_tests/io/cws_{dataset_name}') + print(data_bundle) diff --git a/test/io/pipe/test_matching.py b/test/io/pipe/test_matching.py new file mode 100644 index 00000000..bfd65db2 --- /dev/null +++ b/test/io/pipe/test_matching.py @@ -0,0 +1,107 @@ + +import unittest +import os + +from fastNLP.io import DataBundle +from fastNLP.io.pipe.matching import SNLIPipe, RTEPipe, QNLIPipe, QuoraPipe, MNLIPipe, \ + CNXNLIPipe, BQCorpusPipe, LCQMCPipe +from fastNLP.io.pipe.matching import SNLIBertPipe, RTEBertPipe, QNLIBertPipe, QuoraBertPipe, MNLIBertPipe, \ + CNXNLIBertPipe, BQCorpusBertPipe, LCQMCBertPipe + + +@unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") +class TestMatchingPipe(unittest.TestCase): + def test_process_from_file(self): + for pipe in [SNLIPipe, RTEPipe, QNLIPipe, MNLIPipe]: + with self.subTest(pipe=pipe): + print(pipe) + data_bundle = pipe(tokenizer='raw').process_from_file() + print(data_bundle) + + +@unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") +class TestMatchingBertPipe(unittest.TestCase): + def test_process_from_file(self): + for 
pipe in [SNLIBertPipe, RTEBertPipe, QNLIBertPipe, MNLIBertPipe]: + with self.subTest(pipe=pipe): + print(pipe) + data_bundle = pipe(tokenizer='raw').process_from_file() + print(data_bundle) + + +class TestRunMatchingPipe(unittest.TestCase): + + def test_load(self): + data_set_dict = { + 'RTE': ('test/data_for_tests/io/RTE', RTEPipe, RTEBertPipe, (5, 5, 5), (449, 2), True), + 'SNLI': ('test/data_for_tests/io/SNLI', SNLIPipe, SNLIBertPipe, (5, 5, 5), (110, 3), False), + 'QNLI': ('test/data_for_tests/io/QNLI', QNLIPipe, QNLIBertPipe, (5, 5, 5), (372, 2), True), + 'MNLI': ('test/data_for_tests/io/MNLI', MNLIPipe, MNLIBertPipe, (5, 5, 5, 5, 6), (459, 3), True), + 'BQCorpus': ('test/data_for_tests/io/BQCorpus', BQCorpusPipe, BQCorpusBertPipe, (5, 5, 5), (32, 2), False), + 'XNLI': ('test/data_for_tests/io/XNLI', CNXNLIPipe, CNXNLIBertPipe, (6, 8, 6), (39, 3), False), + 'LCQMC': ('test/data_for_tests/io/LCQMC', LCQMCPipe, LCQMCBertPipe, (5, 6, 6), (36, 2), False), + } + for k, v in data_set_dict.items(): + path, pipe1, pipe2, data_set, vocab, warns = v + if warns: + with self.assertWarns(Warning): + data_bundle1 = pipe1(tokenizer='raw').process_from_file(path) + data_bundle2 = pipe2(tokenizer='raw').process_from_file(path) + else: + data_bundle1 = pipe1(tokenizer='raw').process_from_file(path) + data_bundle2 = pipe2(tokenizer='raw').process_from_file(path) + + self.assertTrue(isinstance(data_bundle1, DataBundle)) + self.assertEqual(len(data_set), data_bundle1.num_dataset) + print(k) + print(data_bundle1) + print(data_bundle2) + for x, y in zip(data_set, data_bundle1.iter_datasets()): + name, dataset = y + self.assertEqual(x, len(dataset)) + self.assertEqual(len(data_set), data_bundle2.num_dataset) + for x, y in zip(data_set, data_bundle2.iter_datasets()): + name, dataset = y + self.assertEqual(x, len(dataset)) + + self.assertEqual(len(vocab), data_bundle1.num_vocab) + for x, y in zip(vocab, data_bundle1.iter_vocabs()): + name, vocabs = y + self.assertEqual(x, len(vocabs)) + self.assertEqual(len(vocab), data_bundle2.num_vocab) + for x, y in zip(vocab, data_bundle1.iter_vocabs()): + name, vocabs = y + self.assertEqual(x + 1 if name == 'words' else x, len(vocabs)) + + def test_spacy(self): + data_set_dict = { + 'Quora': ('test/data_for_tests/io/Quora', QuoraPipe, QuoraBertPipe, (2, 2, 2), (93, 2)), + } + for k, v in data_set_dict.items(): + path, pipe1, pipe2, data_set, vocab = v + + data_bundle1 = pipe1(tokenizer='spacy').process_from_file(path) + data_bundle2 = pipe2(tokenizer='spacy').process_from_file(path) + + self.assertTrue(isinstance(data_bundle1, DataBundle)) + self.assertEqual(len(data_set), data_bundle1.num_dataset) + print(k) + print(data_bundle1) + print(data_bundle2) + for x, y in zip(data_set, data_bundle1.iter_datasets()): + name, dataset = y + self.assertEqual(x, len(dataset)) + self.assertEqual(len(data_set), data_bundle2.num_dataset) + for x, y in zip(data_set, data_bundle2.iter_datasets()): + name, dataset = y + self.assertEqual(x, len(dataset)) + + self.assertEqual(len(vocab), data_bundle1.num_vocab) + for x, y in zip(vocab, data_bundle1.iter_vocabs()): + name, vocabs = y + self.assertEqual(x, len(vocabs)) + self.assertEqual(len(vocab), data_bundle2.num_vocab) + for x, y in zip(vocab, data_bundle1.iter_vocabs()): + name, vocabs = y + self.assertEqual(x + 1 if name == 'words' else x, len(vocabs)) + diff --git a/test/io/pipe/test_qa.py b/test/io/pipe/test_qa.py new file mode 100644 index 00000000..ad6581f9 --- /dev/null +++ b/test/io/pipe/test_qa.py @@ -0,0 +1,24 @@ + +import 
unittest +from fastNLP.io.pipe.qa import CMRC2018BertPipe +from fastNLP.io.loader.qa import CMRC2018Loader + + +class CMRC2018PipeTest(unittest.TestCase): + def test_process(self): + data_bundle = CMRC2018Loader().load('test/data_for_tests/io/cmrc/') + pipe = CMRC2018BertPipe() + data_bundle = pipe.process(data_bundle) + + for name, dataset in data_bundle.iter_datasets(): + for ins in dataset: + if 'target_start' in ins: + # the extracted answer span matches the gold answer + start_index = ins['target_start'] + end_index = ins['target_end']+1 + extract_answer = ''.join(ins['raw_chars'][start_index:end_index]) + self.assertEqual(extract_answer, ins['answers'][0]) + # check that context_len is correct + raw_chars = ins['raw_chars'] + expect_len = raw_chars.index('[SEP]') + self.assertEqual(expect_len, ins['context_len']) diff --git a/test/io/pipe/test_summary.py b/test/io/pipe/test_summary.py new file mode 100644 index 00000000..32508a15 --- /dev/null +++ b/test/io/pipe/test_summary.py @@ -0,0 +1,69 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# __author__="Danqing Wang" + +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import unittest +import os + +from fastNLP.io import DataBundle +from fastNLP.io.pipe.summarization import ExtCNNDMPipe + + +class TestRunExtCNNDMPipe(unittest.TestCase): + + def test_load(self): + data_dir = 'test/data_for_tests/io/cnndm' + vocab_size = 100000 + VOCAL_FILE = 'test/data_for_tests/io/cnndm/vocab' + sent_max_len = 100 + doc_max_timesteps = 50 + dbPipe = ExtCNNDMPipe(vocab_size=vocab_size, + vocab_path=VOCAL_FILE, + sent_max_len=sent_max_len, + doc_max_timesteps=doc_max_timesteps) + dbPipe2 = ExtCNNDMPipe(vocab_size=vocab_size, + vocab_path=VOCAL_FILE, + sent_max_len=sent_max_len, + doc_max_timesteps=doc_max_timesteps, + domain=True) + db = dbPipe.process_from_file(data_dir) + db2 = dbPipe2.process_from_file(data_dir) + + self.assertTrue(isinstance(db, DataBundle)) + self.assertTrue(isinstance(db2, DataBundle)) + + dbPipe3 = ExtCNNDMPipe(vocab_size=vocab_size, + sent_max_len=sent_max_len, + doc_max_timesteps=doc_max_timesteps, + domain=True) + db3 = dbPipe3.process_from_file(data_dir) + self.assertTrue(isinstance(db3, DataBundle)) + + with self.assertRaises(RuntimeError): + dbPipe4 = ExtCNNDMPipe(vocab_size=vocab_size, + sent_max_len=sent_max_len, + doc_max_timesteps=doc_max_timesteps) + db4 = dbPipe4.process_from_file(os.path.join(data_dir, 'train.cnndm.jsonl')) + + dbPipe5 = ExtCNNDMPipe(vocab_size=vocab_size, + vocab_path=VOCAL_FILE, + sent_max_len=sent_max_len, + doc_max_timesteps=doc_max_timesteps,) + db5 = dbPipe5.process_from_file(os.path.join(data_dir, 'train.cnndm.jsonl')) + self.assertIsInstance(db5, DataBundle) + diff --git a/test/io/test_dataset_loader.py b/test/io/test_dataset_loader.py deleted file mode 100644 index 492545f6..00000000 --- a/test/io/test_dataset_loader.py +++ /dev/null @@ -1,77 +0,0 @@ -import unittest -import os -from fastNLP.io import CSVLoader, JsonLoader -from fastNLP.io.data_loader import 
SSTLoader, SNLILoader, Conll2003Loader, PeopleDailyCorpusLoader - - -class TestDatasetLoader(unittest.TestCase): - - def test_Conll2003Loader(self): - """ - Test the the loader of Conll2003 dataset - """ - dataset_path = "test/data_for_tests/conll_2003_example.txt" - loader = Conll2003Loader() - dataset_2003 = loader.load(dataset_path) - - def test_PeopleDailyCorpusLoader(self): - data_set = PeopleDailyCorpusLoader().load("test/data_for_tests/people_daily_raw.txt") - - def test_CSVLoader(self): - ds = CSVLoader(sep='\t', headers=['words', 'label']) \ - .load('test/data_for_tests/tutorial_sample_dataset.csv') - assert len(ds) > 0 - - def test_SNLILoader(self): - ds = SNLILoader().load('test/data_for_tests/sample_snli.jsonl') - assert len(ds) == 3 - - def test_JsonLoader(self): - ds = JsonLoader().load('test/data_for_tests/sample_snli.jsonl') - assert len(ds) == 3 - - def no_test_SST(self): - train_data = """(3 (2 (2 The) (2 Rock)) (4 (3 (2 is) (4 (2 destined) (2 (2 (2 (2 (2 to) (2 (2 be) (2 (2 the) (2 (2 21st) (2 (2 (2 Century) (2 's)) (2 (3 new) (2 (2 ``) (2 Conan)))))))) (2 '')) (2 and)) (3 (2 that) (3 (2 he) (3 (2 's) (3 (2 going) (3 (2 to) (4 (3 (2 make) (3 (3 (2 a) (3 splash)) (2 (2 even) (3 greater)))) (2 (2 than) (2 (2 (2 (2 (1 (2 Arnold) (2 Schwarzenegger)) (2 ,)) (2 (2 Jean-Claud) (2 (2 Van) (2 Damme)))) (2 or)) (2 (2 Steven) (2 Segal))))))))))))) (2 .))) -(4 (4 (4 (2 The) (4 (3 gorgeously) (3 (2 elaborate) (2 continuation)))) (2 (2 (2 of) (2 ``)) (2 (2 The) (2 (2 (2 Lord) (2 (2 of) (2 (2 the) (2 Rings)))) (2 (2 '') (2 trilogy)))))) (2 (3 (2 (2 is) (2 (2 so) (2 huge))) (2 (2 that) (3 (2 (2 (2 a) (2 column)) (2 (2 of) (2 words))) (2 (2 (2 (2 can) (1 not)) (3 adequately)) (2 (2 describe) (2 (3 (2 (2 co-writer\/director) (2 (2 Peter) (3 (2 Jackson) (2 's)))) (3 (2 expanded) (2 vision))) (2 (2 of) (2 (2 (2 J.R.R.) 
(2 (2 Tolkien) (2 's))) (2 Middle-earth))))))))) (2 .))) -(3 (3 (2 (2 (2 (2 (2 Singer\/composer) (2 (2 Bryan) (2 Adams))) (2 (2 contributes) (2 (2 (2 a) (2 slew)) (2 (2 of) (2 songs))))) (2 (2 --) (2 (2 (2 (2 a) (2 (2 few) (3 potential))) (2 (2 (2 hits) (2 ,)) (2 (2 (2 a) (2 few)) (1 (1 (2 more) (1 (2 simply) (2 intrusive))) (2 (2 to) (2 (2 the) (2 story))))))) (2 --)))) (2 but)) (3 (4 (2 the) (3 (2 whole) (2 package))) (2 (3 certainly) (3 (2 captures) (2 (1 (2 the) (2 (2 (2 intended) (2 (2 ,) (2 (2 er) (2 ,)))) (3 spirit))) (2 (2 of) (2 (2 the) (2 piece)))))))) (2 .)) -(2 (2 (2 You) (2 (2 'd) (2 (2 think) (2 (2 by) (2 now))))) (2 (2 America) (2 (2 (2 would) (1 (2 have) (2 (2 (2 had) (1 (2 enough) (2 (2 of) (2 (2 plucky) (2 (2 British) (1 eccentrics)))))) (4 (2 with) (4 (3 hearts) (3 (2 of) (3 gold))))))) (2 .)))) -""" - test_data = """(3 (2 Yet) (3 (2 (2 the) (2 act)) (3 (4 (3 (2 is) (3 (2 still) (4 charming))) (2 here)) (2 .)))) -(4 (2 (2 Whether) (2 (2 (2 (2 or) (1 not)) (3 (2 you) (2 (2 're) (3 (3 enlightened) (2 (2 by) (2 (2 any) (2 (2 of) (2 (2 Derrida) (2 's))))))))) (2 (2 lectures) (2 (2 on) (2 (2 ``) (2 (2 (2 (2 (2 (2 the) (2 other)) (2 '')) (2 and)) (2 ``)) (2 (2 the) (2 self)))))))) (3 (2 ,) (3 (2 '') (3 (2 Derrida) (3 (3 (2 is) (4 (2 an) (4 (4 (2 undeniably) (3 (4 (3 fascinating) (2 and)) (4 playful))) (2 fellow)))) (2 .)))))) -(4 (3 (2 (2 Just) (2 (2 the) (2 labour))) (3 (2 involved) (3 (2 in) (4 (2 creating) (3 (3 (2 the) (3 (3 layered) (2 richness))) (3 (2 of) (3 (2 (2 the) (2 imagery)) (2 (2 in) (3 (2 (2 this) (2 chiaroscuro)) (2 (2 of) (2 (2 (2 madness) (2 and)) (2 light)))))))))))) (3 (3 (2 is) (4 astonishing)) (2 .))) -(3 (3 (2 Part) (3 (2 of) (4 (2 (2 the) (3 charm)) (2 (2 of) (2 (2 Satin) (2 Rouge)))))) (3 (3 (2 is) (3 (2 that) (3 (2 it) (2 (1 (2 avoids) (2 (2 the) (1 obvious))) (3 (2 with) (3 (3 (3 humour) (2 and)) (2 lightness))))))) (2 .))) -(4 (2 (2 a) (2 (2 screenplay) (2 more))) (3 (4 ingeniously) (2 (2 constructed) (2 (2 (2 (2 than) (2 ``)) (2 Memento)) (2 ''))))) -(3 (2 ``) (3 (2 (2 Extreme) (2 Ops)) (3 (2 '') (4 (4 (3 exceeds) (2 expectations)) (2 .))))) -""" - train, test = 'train--', 'test--' - with open(train, 'w', encoding='utf-8') as f: - f.write(train_data) - with open(test, 'w', encoding='utf-8') as f: - f.write(test_data) - - loader = SSTLoader() - info = loader.process( - {train: train, test: test}, - train_ds=[train], - src_vocab_op=dict(min_freq=2) - ) - assert len(list(info.vocabs.items())) == 2 - assert len(list(info.datasets.items())) == 2 - print(info.vocabs) - print(info.datasets) - os.remove(train), os.remove(test) - - def test_import(self): - import fastNLP - from fastNLP.io import SNLILoader - ds = SNLILoader().process('test/data_for_tests/sample_snli.jsonl', to_lower=True, - get_index=True, seq_len_type='seq_len', extra_split=['-']) - assert 'train' in ds.datasets - assert len(ds.datasets) == 1 - assert len(ds.datasets['train']) == 3 - - ds = SNLILoader().process('test/data_for_tests/sample_snli.jsonl', to_lower=True, - get_index=True, seq_len_type='seq_len') - assert 'train' in ds.datasets - assert len(ds.datasets) == 1 - assert len(ds.datasets['train']) == 3 diff --git a/test/io/test_embed_loader.py b/test/io/test_embed_loader.py index bbfe8858..70b367ec 100644 --- a/test/io/test_embed_loader.py +++ b/test/io/test_embed_loader.py @@ -8,8 +8,8 @@ from fastNLP.io import EmbedLoader class TestEmbedLoader(unittest.TestCase): def test_load_with_vocab(self): vocab = Vocabulary() - glove = "test/data_for_tests/glove.6B.50d_test.txt" - word2vec = 
"test/data_for_tests/word2vec_test.txt" + glove = "test/data_for_tests/embedding/small_static_embedding/glove.6B.50d_test.txt" + word2vec = "test/data_for_tests/embedding/small_static_embedding/word2vec_test.txt" vocab.add_word('the') vocab.add_word('none') g_m = EmbedLoader.load_with_vocab(glove, vocab) @@ -20,8 +20,8 @@ class TestEmbedLoader(unittest.TestCase): def test_load_without_vocab(self): words = ['the', 'of', 'in', 'a', 'to', 'and'] - glove = "test/data_for_tests/glove.6B.50d_test.txt" - word2vec = "test/data_for_tests/word2vec_test.txt" + glove = "test/data_for_tests/embedding/small_static_embedding/glove.6B.50d_test.txt" + word2vec = "test/data_for_tests/embedding/small_static_embedding/word2vec_test.txt" g_m, vocab = EmbedLoader.load_without_vocab(glove) self.assertEqual(g_m.shape, (8, 50)) for word in words: diff --git a/test/io/test_model_io.py b/test/io/test_model_io.py new file mode 100644 index 00000000..b8960492 --- /dev/null +++ b/test/io/test_model_io.py @@ -0,0 +1,25 @@ +import os +import unittest + +from fastNLP.io import ModelSaver, ModelLoader +from fastNLP.models import CNNText + + +class TestModelIO(unittest.TestCase): + def test_save_and_load(self): + model = CNNText((10, 10), 2) + saver = ModelSaver('tmp') + loader = ModelLoader() + saver.save_pytorch(model) + + new_cnn = CNNText((10, 10), 2) + loader.load_pytorch(new_cnn, 'tmp') + + new_model = loader.load_pytorch_model('tmp') + + for i in range(10): + for j in range(10): + self.assertEqual(model.embed.embed.weight[i, j], new_cnn.embed.embed.weight[i, j]) + self.assertEqual(model.embed.embed.weight[i, j], new_model["embed.embed.weight"][i, j]) + + os.system('rm tmp') diff --git a/test/models/test_bert.py b/test/models/test_bert.py index 05ee6d5a..c3ba9454 100644 --- a/test/models/test_bert.py +++ b/test/models/test_bert.py @@ -2,68 +2,170 @@ import unittest import torch -from fastNLP.models.bert import * +from fastNLP.core import Vocabulary, Const +from fastNLP.models.bert import BertForSequenceClassification, BertForQuestionAnswering, \ + BertForTokenClassification, BertForMultipleChoice, BertForSentenceMatching +from fastNLP.embeddings.bert_embedding import BertEmbedding class TestBert(unittest.TestCase): def test_bert_1(self): - from fastNLP.core.const import Const - from fastNLP.modules.encoder.bert import BertConfig + vocab = Vocabulary().add_word_lst("this is a test .".split()) + embed = BertEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_bert', + include_cls_sep=True) - model = BertForSequenceClassification(2, BertConfig(32000)) + model = BertForSequenceClassification(embed, 2) - input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) - input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + input_ids = torch.LongTensor([[1, 2, 3], [5, 6, 0]]) - pred = model(input_ids, token_type_ids, input_mask) + pred = model(input_ids) self.assertTrue(isinstance(pred, dict)) self.assertTrue(Const.OUTPUT in pred) self.assertEqual(tuple(pred[Const.OUTPUT].shape), (2, 2)) + pred = model(input_ids) + self.assertTrue(isinstance(pred, dict)) + self.assertTrue(Const.OUTPUT in pred) + self.assertEqual(tuple(pred[Const.OUTPUT].shape), (2, 2)) + + def test_bert_1_w(self): + vocab = Vocabulary().add_word_lst("this is a test .".split()) + embed = BertEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_bert', + include_cls_sep=False) + + with self.assertWarns(Warning): + model = BertForSequenceClassification(embed, 2) + + 
input_ids = torch.LongTensor([[1, 2, 3], [5, 6, 0]]) + + pred = model.predict(input_ids) + self.assertTrue(isinstance(pred, dict)) + self.assertTrue(Const.OUTPUT in pred) + self.assertEqual(tuple(pred[Const.OUTPUT].shape), (2,)) + def test_bert_2(self): - from fastNLP.core.const import Const - from fastNLP.modules.encoder.bert import BertConfig - model = BertForMultipleChoice(2, BertConfig(32000)) + vocab = Vocabulary().add_word_lst("this is a test [SEP] .".split()) + embed = BertEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_bert', + include_cls_sep=True) + + model = BertForMultipleChoice(embed, 2) - input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) - input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + input_ids = torch.LongTensor([[[2, 6, 7], [1, 6, 5]]]) + print(input_ids.size()) - pred = model(input_ids, token_type_ids, input_mask) + pred = model(input_ids) self.assertTrue(isinstance(pred, dict)) self.assertTrue(Const.OUTPUT in pred) self.assertEqual(tuple(pred[Const.OUTPUT].shape), (1, 2)) + def test_bert_2_w(self): + + vocab = Vocabulary().add_word_lst("this is a test [SEP] .".split()) + embed = BertEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_bert', + include_cls_sep=False) + + with self.assertWarns(Warning): + model = BertForMultipleChoice(embed, 2) + + input_ids = torch.LongTensor([[[2, 6, 7], [1, 6, 5]]]) + print(input_ids.size()) + + pred = model.predict(input_ids) + self.assertTrue(isinstance(pred, dict)) + self.assertTrue(Const.OUTPUT in pred) + self.assertEqual(tuple(pred[Const.OUTPUT].shape), (1,)) + def test_bert_3(self): - from fastNLP.core.const import Const - from fastNLP.modules.encoder.bert import BertConfig - model = BertForTokenClassification(7, BertConfig(32000)) + vocab = Vocabulary().add_word_lst("this is a test [SEP] .".split()) + embed = BertEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_bert', + include_cls_sep=False) + model = BertForTokenClassification(embed, 7) - input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) - input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + input_ids = torch.LongTensor([[1, 2, 3], [6, 5, 0]]) - pred = model(input_ids, token_type_ids, input_mask) + pred = model(input_ids) self.assertTrue(isinstance(pred, dict)) self.assertTrue(Const.OUTPUT in pred) self.assertEqual(tuple(pred[Const.OUTPUT].shape), (2, 3, 7)) + def test_bert_3_w(self): + + vocab = Vocabulary().add_word_lst("this is a test [SEP] .".split()) + embed = BertEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_bert', + include_cls_sep=True) + + with self.assertWarns(Warning): + model = BertForTokenClassification(embed, 7) + + input_ids = torch.LongTensor([[1, 2, 3], [6, 5, 0]]) + + pred = model.predict(input_ids) + self.assertTrue(isinstance(pred, dict)) + self.assertTrue(Const.OUTPUT in pred) + self.assertEqual(tuple(pred[Const.OUTPUT].shape), (2, 3)) + def test_bert_4(self): - from fastNLP.core.const import Const - from fastNLP.modules.encoder.bert import BertConfig + vocab = Vocabulary().add_word_lst("this is a test [SEP] .".split()) + embed = BertEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_bert', + include_cls_sep=False) + model = BertForQuestionAnswering(embed) + + input_ids = torch.LongTensor([[1, 2, 3], [6, 5, 0]]) + + pred = model(input_ids) + self.assertTrue(isinstance(pred, dict)) + 
self.assertTrue('pred_start' in pred) + self.assertTrue('pred_end' in pred) + self.assertEqual(tuple(pred['pred_start'].shape), (2, 3)) + self.assertEqual(tuple(pred['pred_end'].shape), (2, 3)) + + def test_bert_for_question_answering_train(self): + from fastNLP import CMRC2018Loss + from fastNLP.io import CMRC2018BertPipe + from fastNLP import Trainer + + data_bundle = CMRC2018BertPipe().process_from_file('test/data_for_tests/io/cmrc') + data_bundle.rename_field('chars', 'words') + train_data = data_bundle.get_dataset('train') + vocab = data_bundle.get_vocab('words') + + embed = BertEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_bert', + include_cls_sep=False, auto_truncate=True) + model = BertForQuestionAnswering(embed) + loss = CMRC2018Loss() + + trainer = Trainer(train_data, model, loss=loss, use_tqdm=False) + trainer.train(load_best_model=False) + + def test_bert_5(self): - model = BertForQuestionAnswering(BertConfig(32000)) + vocab = Vocabulary().add_word_lst("this is a test [SEP] .".split()) + embed = BertEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_bert', + include_cls_sep=True) + model = BertForSentenceMatching(embed) - input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) - input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + input_ids = torch.LongTensor([[1, 2, 3], [6, 5, 0]]) - pred = model(input_ids, token_type_ids, input_mask) + pred = model(input_ids) self.assertTrue(isinstance(pred, dict)) - self.assertTrue(Const.OUTPUTS(0) in pred) - self.assertTrue(Const.OUTPUTS(1) in pred) - self.assertEqual(tuple(pred[Const.OUTPUTS(0)].shape), (2, 3)) - self.assertEqual(tuple(pred[Const.OUTPUTS(1)].shape), (2, 3)) + self.assertTrue(Const.OUTPUT in pred) + self.assertEqual(tuple(pred[Const.OUTPUT].shape), (2, 2)) + + def test_bert_5_w(self): + + vocab = Vocabulary().add_word_lst("this is a test [SEP] .".split()) + embed = BertEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_bert', + include_cls_sep=False) + + with self.assertWarns(Warning): + model = BertForSentenceMatching(embed) + + input_ids = torch.LongTensor([[1, 2, 3], [6, 5, 0]]) + + pred = model.predict(input_ids) + self.assertTrue(isinstance(pred, dict)) + self.assertTrue(Const.OUTPUT in pred) + self.assertEqual(tuple(pred[Const.OUTPUT].shape), (2,)) + diff --git a/test/models/test_biaffine_parser.py b/test/models/test_biaffine_parser.py index 4f93b994..4b38d816 100644 --- a/test/models/test_biaffine_parser.py +++ b/test/models/test_biaffine_parser.py @@ -27,7 +27,7 @@ def prepare_parser_data(): class TestBiaffineParser(unittest.TestCase): def test_train(self): - model = BiaffineParser(init_embed=(VOCAB_SIZE, 10), + model = BiaffineParser(embed=(VOCAB_SIZE, 10), pos_vocab_size=VOCAB_SIZE, pos_emb_dim=10, rnn_hidden_size=10, arc_mlp_size=10, @@ -37,7 +37,7 @@ class TestBiaffineParser(unittest.TestCase): RUNNER.run_model(model, ds, loss=ParserLoss(), metrics=ParserMetric()) def test_train2(self): - model = BiaffineParser(init_embed=(VOCAB_SIZE, 10), + model = BiaffineParser(embed=(VOCAB_SIZE, 10), pos_vocab_size=VOCAB_SIZE, pos_emb_dim=10, rnn_hidden_size=16, arc_mlp_size=10, diff --git a/test/models/test_cnn_text_classification.py b/test/models/test_cnn_text_classification.py index 2ea48220..29154bd6 100644 --- a/test/models/test_cnn_text_classification.py +++ b/test/models/test_cnn_text_classification.py @@ -6,12 +6,24 @@ from fastNLP.models.cnn_text_classification import CNNText class 
TestCNNText(unittest.TestCase): + def init_model(self, kernel_sizes, kernel_nums=(1,3,5)): + model = CNNText((VOCAB_SIZE, 30), + NUM_CLS, + kernel_nums=kernel_nums, + kernel_sizes=kernel_sizes) + return model + def test_case1(self): # 测试能否正常运行CNN - init_emb = (VOCAB_SIZE, 30) - model = CNNText(init_emb, - NUM_CLS, - kernel_nums=(1, 3, 5), - kernel_sizes=(1, 3, 5), - dropout=0.5) + model = self.init_model((1,3,5)) + RUNNER.run_model_with_task(TEXT_CLS, model) + + def test_init_model(self): + self.assertRaises(Exception, self.init_model, (2,4)) + self.assertRaises(Exception, self.init_model, (2,)) + + def test_output(self): + model = self.init_model((3,), (1,)) + global MAX_LEN + MAX_LEN = 2 RUNNER.run_model_with_task(TEXT_CLS, model) diff --git a/test/models/test_sequence_labeling.py b/test/models/test_sequence_labeling.py index 3a70e381..815d7047 100644 --- a/test/models/test_sequence_labeling.py +++ b/test/models/test_sequence_labeling.py @@ -3,9 +3,24 @@ import unittest from .model_runner import * -from fastNLP.models.sequence_labeling import SeqLabeling, AdvSeqLabel +from fastNLP.models.sequence_labeling import SeqLabeling, AdvSeqLabel, BiLSTMCRF from fastNLP.core.losses import LossInForward +class TestBiLSTM(unittest.TestCase): + def test_case1(self): + # 测试能否正常运行CNN + init_emb = (VOCAB_SIZE, 30) + model = BiLSTMCRF(init_emb, + hidden_size=30, + num_classes=NUM_CLS) + + data = RUNNER.prepare_pos_tagging_data() + data.set_input('target') + loss = LossInForward() + metric = AccuracyMetric(pred=C.OUTPUT, target=C.TARGET, seq_len=C.INPUT_LEN) + RUNNER.run_model(model, data, loss, metric) + + class TesSeqLabel(unittest.TestCase): def test_case1(self): # 测试能否正常运行CNN diff --git a/test/models/test_snli.py b/test/models/test_snli.py new file mode 100644 index 00000000..7a588a4c --- /dev/null +++ b/test/models/test_snli.py @@ -0,0 +1,9 @@ +import unittest +from .model_runner import * +from fastNLP.models.snli import ESIM + + +class TestSNLIModel(unittest.TestCase): + def test_snli(self): + model = ESIM((VOCAB_SIZE, 10), num_labels=NUM_CLS, dropout_rate=0) + RUNNER.run_model_with_task(NLI, model) diff --git a/reproduction/seqence_labelling/cws/test/__init__.py b/test/modules/__init__.py similarity index 100% rename from reproduction/seqence_labelling/cws/test/__init__.py rename to test/modules/__init__.py diff --git a/reproduction/seqence_labelling/ner/__init__.py b/test/modules/decoder/__init__.py similarity index 100% rename from reproduction/seqence_labelling/ner/__init__.py rename to test/modules/decoder/__init__.py diff --git a/test/modules/decoder/test_CRF.py b/test/modules/decoder/test_CRF.py index 647af7d3..94b4ab7a 100644 --- a/test/modules/decoder/test_CRF.py +++ b/test/modules/decoder/test_CRF.py @@ -1,6 +1,6 @@ import unittest - +from fastNLP import Vocabulary class TestCRF(unittest.TestCase): def test_case1(self): @@ -14,7 +14,8 @@ class TestCRF(unittest.TestCase): id2label = {0: 'B', 1:'M', 2:'E', 3:'S'} expected_res = {(0, 1), (0, 2), (1, 1), (1, 2), (2, 0), (2, 3), (2, 5), (3, 0), (3, 3), (3, 5), (4, 0), (4, 3)} - self.assertSetEqual(expected_res, set(allowed_transitions(id2label, encoding_type='BMES', include_start_end=True))) + self.assertSetEqual(expected_res, set( + allowed_transitions(id2label, encoding_type='BMES', include_start_end=True))) id2label = {0: 'B', 1: 'I', 2:'O', 3: '', 4:""} allowed_transitions(id2label, include_start_end=True) @@ -37,7 +38,100 @@ class TestCRF(unittest.TestCase): expected_res = {(0, 1), (0, 2), (1, 1), (1, 2), (2, 0), (2, 3), (2, 4), (2, 7), 
(2, 9), (3, 0), (3, 3), (3, 4), (3, 7), (3, 9), (4, 5), (4, 6), (5, 5), (5, 6), (6, 0), (6, 3), (6, 4), (6, 7), (6, 9), (7, 0), (7, 3), (7, 4), (7, 7), (7, 9), (8, 0), (8, 3), (8, 4), (8, 7)} - self.assertSetEqual(expected_res, set(allowed_transitions(id2label, encoding_type='BMES', include_start_end=True))) + self.assertSetEqual(expected_res, set( + allowed_transitions(id2label, include_start_end=True))) + + def test_case11(self): + # 测试自动推断encoding类型 + from fastNLP.modules.decoder.crf import allowed_transitions + + id2label = {0: 'B', 1: 'I', 2: 'O'} + expected_res = {(0, 0), (0, 1), (0, 2), (0, 4), (1, 0), (1, 1), (1, 2), (1, 4), (2, 0), (2, 2), + (2, 4), (3, 0), (3, 2)} + self.assertSetEqual(expected_res, set(allowed_transitions(id2label, include_start_end=True))) + + id2label = {0: 'B', 1: 'M', 2: 'E', 3: 'S'} + expected_res = {(0, 1), (0, 2), (1, 1), (1, 2), (2, 0), (2, 3), (2, 5), (3, 0), (3, 3), (3, 5), (4, 0), (4, 3)} + self.assertSetEqual(expected_res, set( + allowed_transitions(id2label, include_start_end=True))) + + id2label = {0: 'B', 1: 'I', 2: 'O', 3: '', 4: ""} + allowed_transitions(id2label, include_start_end=True) + + labels = ['O'] + for label in ['X', 'Y']: + for tag in 'BI': + labels.append('{}-{}'.format(tag, label)) + id2label = {idx: label for idx, label in enumerate(labels)} + expected_res = {(0, 0), (0, 1), (0, 3), (0, 6), (1, 0), (1, 1), (1, 2), (1, 3), (1, 6), (2, 0), (2, 1), + (2, 2), (2, 3), (2, 6), (3, 0), (3, 1), (3, 3), (3, 4), (3, 6), (4, 0), (4, 1), (4, 3), + (4, 4), (4, 6), (5, 0), (5, 1), (5, 3)} + self.assertSetEqual(expected_res, set(allowed_transitions(id2label, include_start_end=True))) + + labels = [] + for label in ['X', 'Y']: + for tag in 'BMES': + labels.append('{}-{}'.format(tag, label)) + id2label = {idx: label for idx, label in enumerate(labels)} + expected_res = {(0, 1), (0, 2), (1, 1), (1, 2), (2, 0), (2, 3), (2, 4), (2, 7), (2, 9), (3, 0), (3, 3), (3, 4), + (3, 7), (3, 9), (4, 5), (4, 6), (5, 5), (5, 6), (6, 0), (6, 3), (6, 4), (6, 7), (6, 9), (7, 0), + (7, 3), (7, 4), (7, 7), (7, 9), (8, 0), (8, 3), (8, 4), (8, 7)} + self.assertSetEqual(expected_res, set( + allowed_transitions(id2label, include_start_end=True))) + + def test_case12(self): + # 测试能否通过vocab生成转移矩阵 + from fastNLP.modules.decoder.crf import allowed_transitions + + id2label = {0: 'B', 1: 'I', 2: 'O'} + vocab = Vocabulary(unknown=None, padding=None) + for idx, tag in id2label.items(): + vocab.add_word(tag) + expected_res = {(0, 0), (0, 1), (0, 2), (0, 4), (1, 0), (1, 1), (1, 2), (1, 4), (2, 0), (2, 2), + (2, 4), (3, 0), (3, 2)} + self.assertSetEqual(expected_res, set(allowed_transitions(vocab, include_start_end=True))) + + id2label = {0: 'B', 1: 'M', 2: 'E', 3: 'S'} + vocab = Vocabulary(unknown=None, padding=None) + for idx, tag in id2label.items(): + vocab.add_word(tag) + expected_res = {(0, 1), (0, 2), (1, 1), (1, 2), (2, 0), (2, 3), (2, 5), (3, 0), (3, 3), (3, 5), (4, 0), (4, 3)} + self.assertSetEqual(expected_res, set( + allowed_transitions(vocab, include_start_end=True))) + + id2label = {0: 'B', 1: 'I', 2: 'O', 3: '', 4: ""} + vocab = Vocabulary() + for idx, tag in id2label.items(): + vocab.add_word(tag) + allowed_transitions(vocab, include_start_end=True) + + labels = ['O'] + for label in ['X', 'Y']: + for tag in 'BI': + labels.append('{}-{}'.format(tag, label)) + id2label = {idx: label for idx, label in enumerate(labels)} + expected_res = {(0, 0), (0, 1), (0, 3), (0, 6), (1, 0), (1, 1), (1, 2), (1, 3), (1, 6), (2, 0), (2, 1), + (2, 2), (2, 3), (2, 6), (3, 0), (3, 1), (3, 
3), (3, 4), (3, 6), (4, 0), (4, 1), (4, 3), + (4, 4), (4, 6), (5, 0), (5, 1), (5, 3)} + vocab = Vocabulary(unknown=None, padding=None) + for idx, tag in id2label.items(): + vocab.add_word(tag) + self.assertSetEqual(expected_res, set(allowed_transitions(vocab, include_start_end=True))) + + labels = [] + for label in ['X', 'Y']: + for tag in 'BMES': + labels.append('{}-{}'.format(tag, label)) + id2label = {idx: label for idx, label in enumerate(labels)} + vocab = Vocabulary(unknown=None, padding=None) + for idx, tag in id2label.items(): + vocab.add_word(tag) + expected_res = {(0, 1), (0, 2), (1, 1), (1, 2), (2, 0), (2, 3), (2, 4), (2, 7), (2, 9), (3, 0), (3, 3), (3, 4), + (3, 7), (3, 9), (4, 5), (4, 6), (5, 5), (5, 6), (6, 0), (6, 3), (6, 4), (6, 7), (6, 9), (7, 0), + (7, 3), (7, 4), (7, 7), (7, 9), (8, 0), (8, 3), (8, 4), (8, 7)} + self.assertSetEqual(expected_res, set( + allowed_transitions(vocab, include_start_end=True))) + def test_case2(self): # 测试CRF能否避免解码出非法跃迁, 使用allennlp做了验证。 diff --git a/test/modules/encoder/test_bert.py b/test/modules/decoder/test_bert.py similarity index 92% rename from test/modules/encoder/test_bert.py rename to test/modules/decoder/test_bert.py index 0fcf01e4..56946f5d 100644 --- a/test/modules/encoder/test_bert.py +++ b/test/modules/decoder/test_bert.py @@ -3,7 +3,7 @@ import unittest import torch -from fastNLP.models.bert import BertModel +from fastNLP.modules.encoder.bert import BertModel class TestBert(unittest.TestCase): diff --git a/reproduction/seqence_labelling/ner/test/__init__.py b/test/modules/encoder/__init__.py similarity index 100% rename from reproduction/seqence_labelling/ner/test/__init__.py rename to test/modules/encoder/__init__.py diff --git a/test/modules/encoder/test_pooling.py b/test/modules/encoder/test_pooling.py new file mode 100644 index 00000000..5adca4ff --- /dev/null +++ b/test/modules/encoder/test_pooling.py @@ -0,0 +1,41 @@ +import unittest + +import torch + +from fastNLP.modules.encoder.pooling import MaxPool, MaxPoolWithMask, KMaxPool, AvgPool, AvgPoolWithMask + + +class TestPooling(unittest.TestCase): + def test_MaxPool(self): + max_pool_1d = MaxPool(dimension=1) + x = torch.randn(5, 6, 7) + self.assertEqual(max_pool_1d(x).size(), (5, 7)) + + max_pool_2d = MaxPool(dimension=2) + self.assertEqual(max_pool_2d(x).size(), (5, 1)) + + max_pool_3d = MaxPool(dimension=3) + x = torch.randn(4, 5, 6, 7) + self.assertEqual(max_pool_3d(x).size(), (4, 1, 1)) + + def test_MaxPoolWithMask(self): + pool = MaxPoolWithMask() + x = torch.randn(5, 6, 7) + mask = (torch.randn(5, 6) > 0).long() + self.assertEqual(pool(x, mask).size(), (5, 7)) + + def test_KMaxPool(self): + k_pool = KMaxPool(k=3) + x = torch.randn(4, 5, 6) + self.assertEqual(k_pool(x).size(), (4, 15)) + + def test_AvgPool(self): + pool = AvgPool() + x = torch.randn(4, 5, 6) + self.assertEqual(pool(x).size(), (4, 5)) + + def test_AvgPoolWithMask(self): + pool = AvgPoolWithMask() + x = torch.randn(5, 6, 7) + mask = (torch.randn(5, 6) > 0).long() + self.assertEqual(pool(x, mask).size(), (5, 7)) diff --git a/test/modules/test_utils.py b/test/modules/test_utils.py new file mode 100644 index 00000000..340fedd9 --- /dev/null +++ b/test/modules/test_utils.py @@ -0,0 +1,20 @@ +import unittest + +import torch + +from fastNLP.models import CNNText +from fastNLP.modules.utils import get_dropout_mask, summary + + +class TestUtil(unittest.TestCase): + def test_get_dropout_mask(self): + tensor = torch.randn(3, 4) + mask = get_dropout_mask(0.3, tensor) + self.assertSequenceEqual(mask.size(), 
torch.Size([3, 4])) + + def test_summary(self): + model = CNNText(embed=(4, 4), num_classes=2, kernel_nums=(9,5), kernel_sizes=(1,3)) + # 4 * 4 + 4 * (9 * 1 + 5 * 3) + 2 * (9 + 5 + 1) = 142 + self.assertSequenceEqual((142, 142, 0), summary(model)) + model.embed.requires_grad = False + self.assertSequenceEqual((142, 126, 16), summary(model)) diff --git a/test/test_tutorials.py b/test/test_tutorials.py index 6f4a8347..3ec0e381 100644 --- a/test/test_tutorials.py +++ b/test/test_tutorials.py @@ -5,14 +5,13 @@ from fastNLP import Instance from fastNLP import Vocabulary from fastNLP.core.losses import CrossEntropyLoss from fastNLP.core.metrics import AccuracyMetric - +from fastNLP.io.loader import CSVLoader class TestTutorial(unittest.TestCase): def test_fastnlp_10min_tutorial(self): # 从csv读取数据到DataSet sample_path = "test/data_for_tests/tutorial_sample_dataset.csv" - dataset = DataSet.read_csv(sample_path, headers=('raw_sentence', 'label'), - sep='\t') + dataset = CSVLoader(headers=['raw_sentence', 'label'], sep=' ')._load(sample_path) print(len(dataset)) print(dataset[0]) print(dataset[-3]) @@ -110,7 +109,7 @@ class TestTutorial(unittest.TestCase): def test_fastnlp_1min_tutorial(self): # tutorials/fastnlp_1min_tutorial.ipynb data_path = "test/data_for_tests/tutorial_sample_dataset.csv" - ds = DataSet.read_csv(data_path, headers=('raw_sentence', 'label'), sep='\t') + ds = CSVLoader(headers=['raw_sentence', 'label'], sep=' ')._load(data_path) print(ds[1]) # 将所有数字转为小写 diff --git a/tutorials/README.md b/tutorials/README.md index 83df2bb9..2c228af2 100644 --- a/tutorials/README.md +++ b/tutorials/README.md @@ -1,7 +1,3 @@ # fastNLP 教程 -### 上手教程 Quick Start -`quickstart.ipynb` [Click Here](https://github.com/fastnlp/fastNLP/tree/master/tutorials/quickstart.ipynb) - -### 详细教程 Tutorial 1 -十分钟上手:`tutorial_1.ipynb` [Click Here](https://github.com/fastnlp/fastNLP/tree/master/tutorials/tutorial_1.ipynb) +这里只保留了部分的 \ No newline at end of file diff --git a/tutorials/bert_embedding_tutorial.ipynb b/tutorials/bert_embedding_tutorial.ipynb new file mode 100644 index 00000000..a893fef0 --- /dev/null +++ b/tutorials/bert_embedding_tutorial.ipynb @@ -0,0 +1,470 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# BertEmbedding的各种用法\n", + "fastNLP的BertEmbedding以pytorch-transformer.BertModel的代码为基础,是一个使用BERT对words进行编码的Embedding。\n", + "\n", + "使用BertEmbedding和fastNLP.models.bert里面模型可以搭建BERT应用到五种下游任务的模型。\n", + "\n", + "*预训练好的Embedding参数及数据集的介绍和自动下载功能见 [Embedding教程](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_3_embedding.html) 和 [数据处理教程](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_4_load_dataset.html)。*\n", + "\n", + "## 1. BERT for Squence Classification\n", + "在文本分类任务中,我们采用SST数据集作为例子来介绍BertEmbedding的使用方法。" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "import torch\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "In total 3 datasets:\n", + "\ttest has 2210 instances.\n", + "\ttrain has 8544 instances.\n", + "\tdev has 1101 instances.\n", + "In total 2 vocabs:\n", + "\twords has 21701 entries.\n", + "\ttarget has 5 entries." 
+ ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 载入数据集\n", + "from fastNLP.io import SSTPipe\n", + "data_bundle = SSTPipe(subtree=False, train_subtree=False, lower=False, tokenizer='raw').process_from_file()\n", + "data_bundle" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "loading vocabulary file /remote-home/source/fastnlp_caches/embedding/bert-base-cased/vocab.txt\n", + "Load pre-trained BERT parameters from file /remote-home/source/fastnlp_caches/embedding/bert-base-cased/pytorch_model.bin.\n", + "Start to generate word pieces for word.\n", + "Found(Or segment into word pieces) 21701 words out of 21701.\n" + ] + } + ], + "source": [ + "# 载入BertEmbedding\n", + "from fastNLP.embeddings import BertEmbedding\n", + "embed = BertEmbedding(data_bundle.get_vocab('words'), model_dir_or_name='en-base-cased', include_cls_sep=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# 载入模型\n", + "from fastNLP.models import BertForSequenceClassification\n", + "model = BertForSequenceClassification(embed, len(data_bundle.get_vocab('target')))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "input fields after batch(if batch size is 2):\n", + "\twords: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 37]) \n", + "\tseq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n", + "target fields after batch(if batch size is 2):\n", + "\ttarget: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n", + "\n", + "training epochs started 2019-09-11-17-35-26\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=268), HTML(value='')), layout=Layout(display=…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=18), HTML(value='')), layout=Layout(display='…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluate data in 2.08 seconds!\n", + "Evaluation on dev at Epoch 1/2. Step:134/268: \n", + "AccuracyMetric: acc=0.459582\n", + "\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=18), HTML(value='')), layout=Layout(display='…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluate data in 2.2 seconds!\n", + "Evaluation on dev at Epoch 2/2. 
Step:268/268: \n", + "AccuracyMetric: acc=0.468665\n", + "\n", + "\n", + "In Epoch:2/Step:268, got best dev performance:\n", + "AccuracyMetric: acc=0.468665\n", + "Reloaded the best model.\n" + ] + }, + { + "data": { + "text/plain": [ + "{'best_eval': {'AccuracyMetric': {'acc': 0.468665}},\n", + " 'best_epoch': 2,\n", + " 'best_step': 268,\n", + " 'seconds': 114.5}" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 训练模型\n", + "from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric, Adam\n", + "trainer = Trainer(data_bundle.get_dataset('train'), model, \n", + " optimizer=Adam(model_params=model.parameters(), lr=2e-5), \n", + " loss=CrossEntropyLoss(), device=[0],\n", + " batch_size=64, dev_data=data_bundle.get_dataset('dev'), \n", + " metrics=AccuracyMetric(), n_epochs=2, print_every=1)\n", + "trainer.train()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=18), HTML(value='')), layout=Layout(display='…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "Evaluate data in 4.52 seconds!\n", + "[tester] \n", + "AccuracyMetric: acc=0.504072\n" + ] + }, + { + "data": { + "text/plain": [ + "{'AccuracyMetric': {'acc': 0.504072}}" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 测试结果并删除模型\n", + "from fastNLP import Tester\n", + "tester = Tester(data_bundle.get_dataset('test'), model, batch_size=128, metrics=AccuracyMetric())\n", + "tester.test()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "## 2. BERT for Sentence Matching\n", + "在Matching任务中,我们采用RTE数据集作为例子来介绍BertEmbedding的使用方法。" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "In total 3 datasets:\n", + "\ttest has 3000 instances.\n", + "\ttrain has 2490 instances.\n", + "\tdev has 277 instances.\n", + "In total 2 vocabs:\n", + "\twords has 41281 entries.\n", + "\ttarget has 2 entries." 
+ ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 载入数据集\n", + "from fastNLP.io import RTEBertPipe\n", + "data_bundle = RTEBertPipe(lower=False, tokenizer='raw').process_from_file()\n", + "data_bundle" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "loading vocabulary file /remote-home/source/fastnlp_caches/embedding/bert-base-cased/vocab.txt\n", + "Load pre-trained BERT parameters from file /remote-home/source/fastnlp_caches/embedding/bert-base-cased/pytorch_model.bin.\n", + "Start to generate word pieces for word.\n", + "Found(Or segment into word pieces) 41279 words out of 41281.\n" + ] + } + ], + "source": [ + "# 载入BertEmbedding\n", + "from fastNLP.embeddings import BertEmbedding\n", + "embed = BertEmbedding(data_bundle.get_vocab('words'), model_dir_or_name='en-base-cased', include_cls_sep=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# 载入模型\n", + "from fastNLP.models import BertForSentenceMatching\n", + "model = BertForSentenceMatching(embed, len(data_bundle.get_vocab('target')))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "input fields after batch(if batch size is 2):\n", + "\twords: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 45]) \n", + "\tseq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n", + "target fields after batch(if batch size is 2):\n", + "\ttarget: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n", + "\n", + "training epochs started 2019-09-11-17-37-36\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=312), HTML(value='')), layout=Layout(display=…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=18), HTML(value='')), layout=Layout(display='…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluate data in 1.72 seconds!\n", + "Evaluation on dev at Epoch 1/2. Step:156/312: \n", + "AccuracyMetric: acc=0.624549\n", + "\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=18), HTML(value='')), layout=Layout(display='…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluate data in 1.74 seconds!\n", + "Evaluation on dev at Epoch 2/2. 
Step:312/312: \n", + "AccuracyMetric: acc=0.649819\n", + "\n", + "\n", + "In Epoch:2/Step:312, got best dev performance:\n", + "AccuracyMetric: acc=0.649819\n", + "Reloaded the best model.\n" + ] + }, + { + "data": { + "text/plain": [ + "{'best_eval': {'AccuracyMetric': {'acc': 0.649819}},\n", + " 'best_epoch': 2,\n", + " 'best_step': 312,\n", + " 'seconds': 109.87}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 训练模型\n", + "from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric, Adam\n", + "trainer = Trainer(data_bundle.get_dataset('train'), model, \n", + " optimizer=Adam(model_params=model.parameters(), lr=2e-5), \n", + " loss=CrossEntropyLoss(), device=[0],\n", + " batch_size=16, dev_data=data_bundle.get_dataset('dev'), \n", + " metrics=AccuracyMetric(), n_epochs=2, print_every=1)\n", + "trainer.train()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tutorials/cn_cls_example.png b/tutorials/cn_cls_example.png new file mode 100644 index 00000000..5055bb02 Binary files /dev/null and b/tutorials/cn_cls_example.png differ diff --git a/tutorials/quickstart.ipynb b/tutorials/quickstart.ipynb deleted file mode 100644 index 00c30c93..00000000 --- a/tutorials/quickstart.ipynb +++ /dev/null @@ -1,280 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "# 快速入门" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'raw_sentence': A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . type=str,\n", - "'label': 1 type=str}" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from fastNLP.io import CSVLoader\n", - "\n", - "loader = CSVLoader(headers=('raw_sentence', 'label'), sep='\\t')\n", - "dataset = loader.load(\"./sample_data/tutorial_sample_dataset.csv\")\n", - "dataset[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'raw_sentence': A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . type=str,\n", - "'label': 1 type=str,\n", - "'sentence': a series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . 
type=str,\n", - "'words': ['a', 'series', 'of', 'escapades', 'demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'goose', 'is', 'also', 'good', 'for', 'the', 'gander', ',', 'some', 'of', 'which', 'occasionally', 'amuses', 'but', 'none', 'of', 'which', 'amounts', 'to', 'much', 'of', 'a', 'story', '.'] type=list}" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# 将所有字母转为小写, 并所有句子变成单词序列\n", - "dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='sentence')\n", - "dataset.apply(lambda x: x['sentence'].split(), new_field_name='words', is_input=True)\n", - "dataset[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'raw_sentence': A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . type=str,\n", - "'label': 1 type=str,\n", - "'sentence': a series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . type=str,\n", - "'words': [4, 1, 6, 1, 1, 2, 1, 11, 153, 10, 28, 17, 2, 1, 10, 1, 28, 17, 2, 1, 5, 154, 6, 149, 1, 1, 23, 1, 6, 149, 1, 8, 30, 6, 4, 35, 3] type=list}" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from fastNLP import Vocabulary\n", - "\n", - "# 使用Vocabulary类统计单词,并将单词序列转化为数字序列\n", - "vocab = Vocabulary(min_freq=2).from_dataset(dataset, field_name='words')\n", - "vocab.index_dataset(dataset, field_name='words',new_field_name='words')\n", - "dataset[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'raw_sentence': A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . type=str,\n", - "'label': 1 type=str,\n", - "'sentence': a series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . 
type=str,\n", - "'words': [4, 1, 6, 1, 1, 2, 1, 11, 153, 10, 28, 17, 2, 1, 10, 1, 28, 17, 2, 1, 5, 154, 6, 149, 1, 1, 23, 1, 6, 149, 1, 8, 30, 6, 4, 35, 3] type=list,\n", - "'target': 1 type=int}" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# 将label转为整数,并设置为 target\n", - "dataset.apply(lambda x: int(x['label']), new_field_name='target', is_target=True)\n", - "dataset[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "CNNText(\n", - " (embed): Embedding(\n", - " 177, 50\n", - " (dropout): Dropout(p=0.0)\n", - " )\n", - " (conv_pool): ConvMaxpool(\n", - " (convs): ModuleList(\n", - " (0): Conv1d(50, 3, kernel_size=(3,), stride=(1,), padding=(2,))\n", - " (1): Conv1d(50, 4, kernel_size=(4,), stride=(1,), padding=(2,))\n", - " (2): Conv1d(50, 5, kernel_size=(5,), stride=(1,), padding=(2,))\n", - " )\n", - " )\n", - " (dropout): Dropout(p=0.1)\n", - " (fc): Linear(in_features=12, out_features=5, bias=True)\n", - ")" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from fastNLP.models import CNNText\n", - "model = CNNText((len(vocab),50), num_classes=5, padding=2, dropout=0.1)\n", - "model" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(62, 15)" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# 分割训练集/验证集\n", - "train_data, dev_data = dataset.split(0.2)\n", - "len(train_data), len(dev_data)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "input fields after batch(if batch size is 2):\n", - "\twords: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 26]) \n", - "target fields after batch(if batch size is 2):\n", - "\ttarget: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n", - "\n", - "training epochs started 2019-05-09-10-59-39\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=20), HTML(value='')), layout=Layout(display='…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Evaluation at Epoch 1/10. Step:2/20. AccuracyMetric: acc=0.333333\n", - "\n", - "Evaluation at Epoch 2/10. Step:4/20. AccuracyMetric: acc=0.533333\n", - "\n", - "Evaluation at Epoch 3/10. Step:6/20. AccuracyMetric: acc=0.533333\n", - "\n", - "Evaluation at Epoch 4/10. Step:8/20. AccuracyMetric: acc=0.533333\n", - "\n", - "Evaluation at Epoch 5/10. Step:10/20. AccuracyMetric: acc=0.6\n", - "\n", - "Evaluation at Epoch 6/10. Step:12/20. AccuracyMetric: acc=0.8\n", - "\n", - "Evaluation at Epoch 7/10. Step:14/20. AccuracyMetric: acc=0.8\n", - "\n", - "Evaluation at Epoch 8/10. Step:16/20. AccuracyMetric: acc=0.733333\n", - "\n", - "Evaluation at Epoch 9/10. Step:18/20. AccuracyMetric: acc=0.733333\n", - "\n", - "Evaluation at Epoch 10/10. Step:20/20. 
AccuracyMetric: acc=0.733333\n", - "\n", - "\n", - "In Epoch:6/Step:12, got best dev performance:AccuracyMetric: acc=0.8\n", - "Reloaded the best model.\n" - ] - }, - { - "data": { - "text/plain": [ - "{'best_eval': {'AccuracyMetric': {'acc': 0.8}},\n", - " 'best_epoch': 6,\n", - " 'best_step': 12,\n", - " 'seconds': 0.22}" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric\n", - "\n", - "# 定义trainer并进行训练\n", - "trainer = Trainer(model=model, train_data=train_data, dev_data=dev_data,\n", - " loss=CrossEntropyLoss(), metrics=AccuracyMetric())\n", - "trainer.train()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.7" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/tutorials/sample_data/tutorial_sample_dataset.csv b/tutorials/sample_data/tutorial_sample_dataset.csv deleted file mode 100644 index e5c0a74f..00000000 --- a/tutorials/sample_data/tutorial_sample_dataset.csv +++ /dev/null @@ -1,77 +0,0 @@ -A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . 1 -This quiet , introspective and entertaining independent is worth seeking . 4 -Even fans of Ismail Merchant 's work , I suspect , would have a hard time sitting through this one . 1 -A positively thrilling combination of ethnography and all the intrigue , betrayal , deceit and murder of a Shakespearean tragedy or a juicy soap opera . 3 -Aggressive self-glorification and a manipulative whitewash . 1 -A comedy-drama of nearly epic proportions rooted in a sincere performance by the title character undergoing midlife crisis . 4 -Narratively , Trouble Every Day is a plodding mess . 1 -The Importance of Being Earnest , so thick with wit it plays like a reading from Bartlett 's Familiar Quotations 3 -But it does n't leave you with much . 1 -You could hate it for the same reason . 1 -There 's little to recommend Snow Dogs , unless one considers cliched dialogue and perverse escapism a source of high hilarity . 1 -Kung Pow is Oedekerk 's realization of his childhood dream to be in a martial-arts flick , and proves that sometimes the dreams of youth should remain just that . 1 -The performances are an absolute joy . 4 -Fresnadillo has something serious to say about the ways in which extravagant chance can distort our perspective and throw us off the path of good sense . 3 -I still like Moonlight Mile , better judgment be damned . 3 -A welcome relief from baseball movies that try too hard to be mythic , this one is a sweet and modest and ultimately winning story . 3 -a bilingual charmer , just like the woman who inspired it 3 -Like a less dizzily gorgeous companion to Mr. Wong 's In the Mood for Love -- very much a Hong Kong movie despite its mainland setting . 2 -As inept as big-screen remakes of The Avengers and The Wild Wild West . 1 -It 's everything you 'd expect -- but nothing more . 2 -Best indie of the year , so far . 
4 -Hatfield and Hicks make the oddest of couples , and in this sense the movie becomes a study of the gambles of the publishing world , offering a case study that exists apart from all the movie 's political ramifications . 3 -It 's like going to a house party and watching the host defend himself against a frothing ex-girlfriend . 1 -That the Chuck Norris `` grenade gag '' occurs about 7 times during Windtalkers is a good indication of how serious-minded the film is . 2 -The plot is romantic comedy boilerplate from start to finish . 2 -It arrives with an impeccable pedigree , mongrel pep , and almost indecipherable plot complications . 2 -A film that clearly means to preach exclusively to the converted . 2 -While The Importance of Being Earnest offers opportunities for occasional smiles and chuckles , it does n't give us a reason to be in the theater beyond Wilde 's wit and the actors ' performances . 1 -The latest vapid actor 's exercise to appropriate the structure of Arthur Schnitzler 's Reigen . 1 -More vaudeville show than well-constructed narrative , but on those terms it 's inoffensive and actually rather sweet . 2 -Nothing more than a run-of-the-mill action flick . 2 -Hampered -- no , paralyzed -- by a self-indulgent script ... that aims for poetry and ends up sounding like satire . 0 -Ice Age is the first computer-generated feature cartoon to feel like other movies , and that makes for some glacial pacing early on . 2 -There 's very little sense to what 's going on here , but the makers serve up the cliches with considerable dash . 2 -Cattaneo should have followed the runaway success of his first film , The Full Monty , with something different . 2 -They 're the unnamed , easily substitutable forces that serve as whatever terror the heroes of horror movies try to avoid . 1 -It almost feels as if the movie is more interested in entertaining itself than in amusing us . 1 -The movie 's progression into rambling incoherence gives new meaning to the phrase ` fatal script error . ' 0 -I still like Moonlight Mile , better judgment be damned . 3 -A welcome relief from baseball movies that try too hard to be mythic , this one is a sweet and modest and ultimately winning story . 3 -a bilingual charmer , just like the woman who inspired it 3 -Like a less dizzily gorgeous companion to Mr. Wong 's In the Mood for Love -- very much a Hong Kong movie despite its mainland setting . 2 -As inept as big-screen remakes of The Avengers and The Wild Wild West . 1 -It 's everything you 'd expect -- but nothing more . 2 -Best indie of the year , so far . 4 -Hatfield and Hicks make the oddest of couples , and in this sense the movie becomes a study of the gambles of the publishing world , offering a case study that exists apart from all the movie 's political ramifications . 3 -It 's like going to a house party and watching the host defend himself against a frothing ex-girlfriend . 1 -That the Chuck Norris `` grenade gag '' occurs about 7 times during Windtalkers is a good indication of how serious-minded the film is . 2 -The plot is romantic comedy boilerplate from start to finish . 2 -It arrives with an impeccable pedigree , mongrel pep , and almost indecipherable plot complications . 2 -A film that clearly means to preach exclusively to the converted . 2 -I still like Moonlight Mile , better judgment be damned . 3 -A welcome relief from baseball movies that try too hard to be mythic , this one is a sweet and modest and ultimately winning story . 
3 -a bilingual charmer , just like the woman who inspired it 3 -Like a less dizzily gorgeous companion to Mr. Wong 's In the Mood for Love -- very much a Hong Kong movie despite its mainland setting . 2 -As inept as big-screen remakes of The Avengers and The Wild Wild West . 1 -It 's everything you 'd expect -- but nothing more . 2 -Best indie of the year , so far . 4 -Hatfield and Hicks make the oddest of couples , and in this sense the movie becomes a study of the gambles of the publishing world , offering a case study that exists apart from all the movie 's political ramifications . 3 -It 's like going to a house party and watching the host defend himself against a frothing ex-girlfriend . 1 -That the Chuck Norris `` grenade gag '' occurs about 7 times during Windtalkers is a good indication of how serious-minded the film is . 2 -The plot is romantic comedy boilerplate from start to finish . 2 -It arrives with an impeccable pedigree , mongrel pep , and almost indecipherable plot complications . 2 -A film that clearly means to preach exclusively to the converted . 2 -I still like Moonlight Mile , better judgment be damned . 3 -A welcome relief from baseball movies that try too hard to be mythic , this one is a sweet and modest and ultimately winning story . 3 -a bilingual charmer , just like the woman who inspired it 3 -Like a less dizzily gorgeous companion to Mr. Wong 's In the Mood for Love -- very much a Hong Kong movie despite its mainland setting . 2 -As inept as big-screen remakes of The Avengers and The Wild Wild West . 1 -It 's everything you 'd expect -- but nothing more . 2 -Best indie of the year , so far . 4 -Hatfield and Hicks make the oddest of couples , and in this sense the movie becomes a study of the gambles of the publishing world , offering a case study that exists apart from all the movie 's political ramifications . 3 -It 's like going to a house party and watching the host defend himself against a frothing ex-girlfriend . 1 -That the Chuck Norris `` grenade gag '' occurs about 7 times during Windtalkers is a good indication of how serious-minded the film is . 2 -The plot is romantic comedy boilerplate from start to finish . 2 -It arrives with an impeccable pedigree , mongrel pep , and almost indecipherable plot complications . 2 -A film that clearly means to preach exclusively to the converted . 2 \ No newline at end of file diff --git a/tutorials/tutorial_1.ipynb b/tutorials/tutorial_1.ipynb deleted file mode 100644 index db302238..00000000 --- a/tutorials/tutorial_1.ipynb +++ /dev/null @@ -1,831 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "# 详细指南" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 数据读入" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'raw_sentence': A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . 
type=str,\n", - "'label': 1 type=str}" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from fastNLP.io import CSVLoader\n", - "\n", - "loader = CSVLoader(headers=('raw_sentence', 'label'), sep='\\t')\n", - "dataset = loader.load(\"./sample_data/tutorial_sample_dataset.csv\")\n", - "dataset[0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Instance表示一个样本,由一个或多个field(域,属性,特征)组成,每个field有名字和值。\n", - "\n", - "在初始化Instance时即可定义它包含的域,使用 \"field_name=field_value\"的写法。" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'raw_sentence': fake data type=str,\n", - "'label': 0 type=str}" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from fastNLP import Instance\n", - "\n", - "dataset.append(Instance(raw_sentence='fake data', label='0'))\n", - "dataset[-1]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 数据处理" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'raw_sentence': A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . type=str,\n", - "'label': 1 type=str,\n", - "'sentence': a series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . type=str,\n", - "'words': [4, 1, 6, 1, 1, 2, 1, 11, 153, 10, 28, 17, 2, 1, 10, 1, 28, 17, 2, 1, 5, 154, 6, 149, 1, 1, 23, 1, 6, 149, 1, 8, 30, 6, 4, 35, 3] type=list,\n", - "'target': 1 type=int}" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from fastNLP import Vocabulary\n", - "\n", - "# 将所有字母转为小写, 并所有句子变成单词序列\n", - "dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='sentence')\n", - "dataset.apply_field(lambda x: x.split(), field_name='sentence', new_field_name='words')\n", - "\n", - "# 使用Vocabulary类统计单词,并将单词序列转化为数字序列\n", - "vocab = Vocabulary(min_freq=2).from_dataset(dataset, field_name='words')\n", - "vocab.index_dataset(dataset, field_name='words',new_field_name='words')\n", - "\n", - "# 将label转为整数\n", - "dataset.apply(lambda x: int(x['label']), new_field_name='target')\n", - "dataset[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'raw_sentence': A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . type=str,\n", - "'label': 1 type=str,\n", - "'sentence': a series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . 
type=str,\n", - "'words': [4, 1, 6, 1, 1, 2, 1, 11, 153, 10, 28, 17, 2, 1, 10, 1, 28, 17, 2, 1, 5, 154, 6, 149, 1, 1, 23, 1, 6, 149, 1, 8, 30, 6, 4, 35, 3] type=list,\n", - "'target': 1 type=int,\n", - "'seq_len': 37 type=int}\n" - ] - } - ], - "source": [ - "# 增加长度信息\n", - "dataset.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')\n", - "print(dataset[0])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 使用内置模块CNNText\n", - "设置为符合内置模块的名称" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "CNNText(\n", - " (embed): Embedding(\n", - " 177, 50\n", - " (dropout): Dropout(p=0.0)\n", - " )\n", - " (conv_pool): ConvMaxpool(\n", - " (convs): ModuleList(\n", - " (0): Conv1d(50, 3, kernel_size=(3,), stride=(1,), padding=(2,))\n", - " (1): Conv1d(50, 4, kernel_size=(4,), stride=(1,), padding=(2,))\n", - " (2): Conv1d(50, 5, kernel_size=(5,), stride=(1,), padding=(2,))\n", - " )\n", - " )\n", - " (dropout): Dropout(p=0.1)\n", - " (fc): Linear(in_features=12, out_features=5, bias=True)\n", - ")" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from fastNLP.models import CNNText\n", - "\n", - "model_cnn = CNNText((len(vocab),50), num_classes=5, padding=2, dropout=0.1)\n", - "model_cnn" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "我们在使用内置模块的时候,还应该使用应该注意把 field 设定成符合内置模型输入输出的名字。" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "words\n", - "seq_len\n", - "target\n" - ] - } - ], - "source": [ - "from fastNLP import Const\n", - "\n", - "dataset.rename_field('words', Const.INPUT)\n", - "dataset.rename_field('seq_len', Const.INPUT_LEN)\n", - "dataset.rename_field('target', Const.TARGET)\n", - "\n", - "dataset.set_input(Const.INPUT, Const.INPUT_LEN)\n", - "dataset.set_target(Const.TARGET)\n", - "\n", - "print(Const.INPUT)\n", - "print(Const.INPUT_LEN)\n", - "print(Const.TARGET)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 分割训练集/验证集/测试集" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "(64, 7, 7)" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "train_dev_data, test_data = dataset.split(0.1)\n", - "train_data, dev_data = train_dev_data.split(0.1)\n", - "len(train_data), len(dev_data), len(test_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 训练(model_cnn)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### loss\n", - "训练模型需要提供一个损失函数\n", - "\n", - "下面提供了一个在分类问题中常用的交叉熵损失。注意它的**初始化参数**。\n", - "\n", - "pred参数对应的是模型的forward返回的dict的一个key的名字,这里是\"output\"。\n", - "\n", - "target参数对应的是dataset作为标签的field的名字,这里是\"label_seq\"。" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "from fastNLP import CrossEntropyLoss\n", - "\n", - "# loss = CrossEntropyLoss()\n", - "# 等价于\n", - "loss = CrossEntropyLoss(pred=Const.OUTPUT, target=Const.TARGET)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Metric\n", - "定义评价指标\n", - "\n", - "这里使用准确率。参数的“命名规则”跟上面类似。\n", - "\n", - "pred参数对应的是模型的predict方法返回的dict的一个key的名字,这里是\"predict\"。\n", - "\n", - 
"target参数对应的是dataset作为标签的field的名字,这里是\"label_seq\"。" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "from fastNLP import AccuracyMetric\n", - "\n", - "# metrics=AccuracyMetric()\n", - "# 等价于\n", - "metrics=AccuracyMetric(pred=Const.OUTPUT, target=Const.TARGET)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "input fields after batch(if batch size is 2):\n", - "\twords: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 16]) \n", - "\tseq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n", - "target fields after batch(if batch size is 2):\n", - "\ttarget: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n", - "\n", - "training epochs started 2019-05-12-21-38-34\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=20), HTML(value='')), layout=Layout(display='…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Evaluation at Epoch 1/10. Step:2/20. AccuracyMetric: acc=0.285714\n", - "\n", - "Evaluation at Epoch 2/10. Step:4/20. AccuracyMetric: acc=0.428571\n", - "\n", - "Evaluation at Epoch 3/10. Step:6/20. AccuracyMetric: acc=0.428571\n", - "\n", - "Evaluation at Epoch 4/10. Step:8/20. AccuracyMetric: acc=0.428571\n", - "\n", - "Evaluation at Epoch 5/10. Step:10/20. AccuracyMetric: acc=0.428571\n", - "\n", - "Evaluation at Epoch 6/10. Step:12/20. AccuracyMetric: acc=0.428571\n", - "\n", - "Evaluation at Epoch 7/10. Step:14/20. AccuracyMetric: acc=0.428571\n", - "\n", - "Evaluation at Epoch 8/10. Step:16/20. AccuracyMetric: acc=0.857143\n", - "\n", - "Evaluation at Epoch 9/10. Step:18/20. AccuracyMetric: acc=0.857143\n", - "\n", - "Evaluation at Epoch 10/10. Step:20/20. 
AccuracyMetric: acc=0.857143\n", - "\n", - "\n", - "In Epoch:8/Step:16, got best dev performance:AccuracyMetric: acc=0.857143\n", - "Reloaded the best model.\n" - ] - }, - { - "data": { - "text/plain": [ - "{'best_eval': {'AccuracyMetric': {'acc': 0.857143}},\n", - " 'best_epoch': 8,\n", - " 'best_step': 16,\n", - " 'seconds': 0.21}" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from fastNLP import Trainer\n", - "\n", - "trainer = Trainer(model=model_cnn, train_data=train_data, dev_data=dev_data, loss=loss, metrics=metrics)\n", - "trainer.train()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 测试(model_cnn)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[tester] \n", - "AccuracyMetric: acc=0.857143\n" - ] - }, - { - "data": { - "text/plain": [ - "{'AccuracyMetric': {'acc': 0.857143}}" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from fastNLP import Tester\n", - "\n", - "tester = Tester(test_data, model_cnn, metrics=AccuracyMetric())\n", - "tester.test()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 编写自己的模型\n", - "\n", - "完全支持 pytorch 的模型,与 pytorch 唯一不同的是返回结果是一个字典,字典中至少需要包含 \"pred\" 这个字段" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "import torch\n", - "import torch.nn as nn\n", - "\n", - "class LSTMText(nn.Module):\n", - " def __init__(self, vocab_size, embedding_dim, output_dim, hidden_dim=64, num_layers=2, dropout=0.5):\n", - " super().__init__()\n", - "\n", - " self.embedding = nn.Embedding(vocab_size, embedding_dim)\n", - " self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, bidirectional=True, dropout=dropout)\n", - " self.fc = nn.Linear(hidden_dim * 2, output_dim)\n", - " self.dropout = nn.Dropout(dropout)\n", - "\n", - " def forward(self, words):\n", - " # (input) words : (batch_size, seq_len)\n", - " words = words.permute(1,0)\n", - " # words : (seq_len, batch_size)\n", - "\n", - " embedded = self.dropout(self.embedding(words))\n", - " # embedded : (seq_len, batch_size, embedding_dim)\n", - " output, (hidden, cell) = self.lstm(embedded)\n", - " # output: (seq_len, batch_size, hidden_dim * 2)\n", - " # hidden: (num_layers * 2, batch_size, hidden_dim)\n", - " # cell: (num_layers * 2, batch_size, hidden_dim)\n", - "\n", - " hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)\n", - " hidden = self.dropout(hidden)\n", - " # hidden: (batch_size, hidden_dim * 2)\n", - "\n", - " pred = self.fc(hidden.squeeze(0))\n", - " # result: (batch_size, output_dim)\n", - " return {\"pred\":pred}" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "input fields after batch(if batch size is 2):\n", - "\twords: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 16]) \n", - "\tseq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n", - "target fields after batch(if batch size is 2):\n", - "\ttarget: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n", - "\n", - "training epochs started 2019-05-12-21-38-36\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 
0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=20), HTML(value='')), layout=Layout(display='…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Evaluation at Epoch 1/10. Step:2/20. AccuracyMetric: acc=0.571429\n", - "\n", - "Evaluation at Epoch 2/10. Step:4/20. AccuracyMetric: acc=0.571429\n", - "\n", - "Evaluation at Epoch 3/10. Step:6/20. AccuracyMetric: acc=0.571429\n", - "\n", - "Evaluation at Epoch 4/10. Step:8/20. AccuracyMetric: acc=0.571429\n", - "\n", - "Evaluation at Epoch 5/10. Step:10/20. AccuracyMetric: acc=0.714286\n", - "\n", - "Evaluation at Epoch 6/10. Step:12/20. AccuracyMetric: acc=0.857143\n", - "\n", - "Evaluation at Epoch 7/10. Step:14/20. AccuracyMetric: acc=0.857143\n", - "\n", - "Evaluation at Epoch 8/10. Step:16/20. AccuracyMetric: acc=0.857143\n", - "\n", - "Evaluation at Epoch 9/10. Step:18/20. AccuracyMetric: acc=0.857143\n", - "\n", - "Evaluation at Epoch 10/10. Step:20/20. AccuracyMetric: acc=0.857143\n", - "\n", - "\n", - "In Epoch:6/Step:12, got best dev performance:AccuracyMetric: acc=0.857143\n", - "Reloaded the best model.\n" - ] - }, - { - "data": { - "text/plain": [ - "{'best_eval': {'AccuracyMetric': {'acc': 0.857143}},\n", - " 'best_epoch': 6,\n", - " 'best_step': 12,\n", - " 'seconds': 2.15}" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "model_lstm = LSTMText(len(vocab),50,5)\n", - "trainer = Trainer(model=model_lstm, train_data=train_data, dev_data=dev_data, loss=loss, metrics=metrics)\n", - "trainer.train()" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[tester] \n", - "AccuracyMetric: acc=0.857143\n" - ] - }, - { - "data": { - "text/plain": [ - "{'AccuracyMetric': {'acc': 0.857143}}" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tester = Tester(test_data, model_lstm, metrics=AccuracyMetric())\n", - "tester.test()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 使用 Batch编写自己的训练过程" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Epoch 0 Avg Loss: 3.11 18ms\n", - "Epoch 1 Avg Loss: 2.88 30ms\n", - "Epoch 2 Avg Loss: 2.69 42ms\n", - "Epoch 3 Avg Loss: 2.47 54ms\n", - "Epoch 4 Avg Loss: 2.38 67ms\n", - "Epoch 5 Avg Loss: 2.10 78ms\n", - "Epoch 6 Avg Loss: 2.06 91ms\n", - "Epoch 7 Avg Loss: 1.92 103ms\n", - "Epoch 8 Avg Loss: 1.91 114ms\n", - "Epoch 9 Avg Loss: 1.76 126ms\n", - "[tester] \n", - "AccuracyMetric: acc=0.571429\n" - ] - }, - { - "data": { - "text/plain": [ - "{'AccuracyMetric': {'acc': 0.571429}}" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from fastNLP import BucketSampler\n", - "from fastNLP import Batch\n", - "import torch\n", - "import time\n", - "\n", - "model = CNNText((len(vocab),50), num_classes=5, padding=2, dropout=0.1)\n", - "\n", - "def train(epoch, data):\n", - " optim = torch.optim.Adam(model.parameters(), lr=0.001)\n", - " lossfunc = torch.nn.CrossEntropyLoss()\n", - " batch_size = 32\n", - "\n", - " # 定义一个Batch,传入DataSet,规定batch_size和去batch的规则。\n", - " # 顺序(Sequential),随机(Random),相似长度组成一个batch(Bucket)\n", - " train_sampler = 
BucketSampler(batch_size=batch_size, seq_len_field_name='seq_len')\n", - " train_batch = Batch(batch_size=batch_size, dataset=data, sampler=train_sampler)\n", - " \n", - " start_time = time.time()\n", - " for i in range(epoch):\n", - " loss_list = []\n", - " for batch_x, batch_y in train_batch:\n", - " optim.zero_grad()\n", - " output = model(batch_x['words'])\n", - " loss = lossfunc(output['pred'], batch_y['target'])\n", - " loss.backward()\n", - " optim.step()\n", - " loss_list.append(loss.item())\n", - " print('Epoch {:d} Avg Loss: {:.2f}'.format(i, sum(loss_list) / len(loss_list)),end=\" \")\n", - " print('{:d}ms'.format(round((time.time()-start_time)*1000)))\n", - " loss_list.clear()\n", - " \n", - "train(10, train_data)\n", - "tester = Tester(test_data, model, metrics=AccuracyMetric())\n", - "tester.test()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 使用 Callback 实现自己想要的效果" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "input fields after batch(if batch size is 2):\n", - "\twords: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 16]) \n", - "\tseq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n", - "target fields after batch(if batch size is 2):\n", - "\ttarget: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n", - "\n", - "training epochs started 2019-05-12-21-38-40\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=20), HTML(value='')), layout=Layout(display='…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Evaluation at Epoch 1/10. Step:2/20. AccuracyMetric: acc=0.285714\n", - "\n", - "Sum Time: 51ms\n", - "\n", - "\n", - "Evaluation at Epoch 2/10. Step:4/20. AccuracyMetric: acc=0.285714\n", - "\n", - "Sum Time: 69ms\n", - "\n", - "\n", - "Evaluation at Epoch 3/10. Step:6/20. AccuracyMetric: acc=0.285714\n", - "\n", - "Sum Time: 91ms\n", - "\n", - "\n", - "Evaluation at Epoch 4/10. Step:8/20. AccuracyMetric: acc=0.571429\n", - "\n", - "Sum Time: 107ms\n", - "\n", - "\n", - "Evaluation at Epoch 5/10. Step:10/20. AccuracyMetric: acc=0.571429\n", - "\n", - "Sum Time: 125ms\n", - "\n", - "\n", - "Evaluation at Epoch 6/10. Step:12/20. AccuracyMetric: acc=0.571429\n", - "\n", - "Sum Time: 142ms\n", - "\n", - "\n", - "Evaluation at Epoch 7/10. Step:14/20. AccuracyMetric: acc=0.571429\n", - "\n", - "Sum Time: 158ms\n", - "\n", - "\n", - "Evaluation at Epoch 8/10. Step:16/20. AccuracyMetric: acc=0.571429\n", - "\n", - "Sum Time: 176ms\n", - "\n", - "\n", - "Evaluation at Epoch 9/10. Step:18/20. AccuracyMetric: acc=0.714286\n", - "\n", - "Sum Time: 193ms\n", - "\n", - "\n", - "Evaluation at Epoch 10/10. Step:20/20. 
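The comment in the custom training loop above mentions three batching strategies: sequential, random, and bucketed by similar length. Assuming your fastNLP version exposes `SequentialSampler` and `RandomSampler` alongside `BucketSampler` (treat that import as an assumption), switching strategies only means handing a different sampler to `Batch`:

```python
from fastNLP import Batch, BucketSampler
# SequentialSampler / RandomSampler are assumed to be importable from the
# top-level package, next to BucketSampler.
from fastNLP import SequentialSampler, RandomSampler

batch_size = 32

# Keep instances in dataset order (handy for debugging).
seq_batch = Batch(batch_size=batch_size, dataset=train_data, sampler=SequentialSampler())

# Shuffle instances every epoch.
rand_batch = Batch(batch_size=batch_size, dataset=train_data, sampler=RandomSampler())

# Group instances of similar length to reduce padding, as in the loop above.
bucket_batch = Batch(batch_size=batch_size, dataset=train_data,
                     sampler=BucketSampler(batch_size=batch_size, seq_len_field_name='seq_len'))

# All three iterate the same way.
for batch_x, batch_y in seq_batch:
    print(batch_x['words'].shape, batch_y['target'].shape)
    break
```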
AccuracyMetric: acc=0.857143\n", - "\n", - "Sum Time: 212ms\n", - "\n", - "\n", - "\n", - "In Epoch:10/Step:20, got best dev performance:AccuracyMetric: acc=0.857143\n", - "Reloaded the best model.\n" - ] - }, - { - "data": { - "text/plain": [ - "{'best_eval': {'AccuracyMetric': {'acc': 0.857143}},\n", - " 'best_epoch': 10,\n", - " 'best_step': 20,\n", - " 'seconds': 0.2}" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from fastNLP import Callback\n", - "\n", - "start_time = time.time()\n", - "\n", - "class MyCallback(Callback):\n", - " def on_epoch_end(self):\n", - " print('Sum Time: {:d}ms\\n\\n'.format(round((time.time()-start_time)*1000)))\n", - " \n", - "\n", - "model = CNNText((len(vocab),50), num_classes=5, padding=2, dropout=0.1)\n", - "trainer = Trainer(model=model, train_data=train_data, dev_data=dev_data,\n", - " loss=CrossEntropyLoss(), metrics=AccuracyMetric(), callbacks=[MyCallback()])\n", - "trainer.train()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.7" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/tutorials/tutorial_10_callback.ipynb b/tutorials/tutorial_10_callback.ipynb new file mode 100644 index 00000000..ed71a9b0 --- /dev/null +++ b/tutorials/tutorial_10_callback.ipynb @@ -0,0 +1,622 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 使用 Callback 自定义你的训练过程" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- 什么是 Callback\n", + "- 使用 Callback \n", + "- 一些常用的 Callback\n", + "- 自定义实现 Callback" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "什么是Callback\n", + "------\n", + "\n", + "Callback 是与 Trainer 紧密结合的模块,利用 Callback 可以在 Trainer 训练时,加入自定义的操作,比如梯度裁剪,学习率调节,测试模型的性能等。定义的 Callback 会在训练的特定阶段被调用。\n", + "\n", + "fastNLP 中提供了很多常用的 Callback ,开箱即用。" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "使用 Callback\n", + " ------\n", + "\n", + "使用 Callback 很简单,将需要的 callback 按 list 存储,以对应参数 ``callbacks`` 传入对应的 Trainer。Trainer 在训练时就会自动执行这些 Callback 指定的操作了。" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": "2019-09-17T07:34:46.465871Z", + "start_time": "2019-09-17T07:34:30.648758Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "In total 3 datasets:\n", + "\ttest has 1200 instances.\n", + "\ttrain has 9600 instances.\n", + "\tdev has 1200 instances.\n", + "In total 2 vocabs:\n", + "\tchars has 4409 entries.\n", + "\ttarget has 2 entries.\n", + "\n", + "training epochs started 2019-09-17-03-34-34\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=900), HTML(value='')), layout=Layout(display=…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + 
"text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluate data in 0.1 seconds!\n", + "Evaluation on dev at Epoch 1/3. Step:300/900: \n", + "AccuracyMetric: acc=0.863333\n", + "\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluate data in 0.11 seconds!\n", + "Evaluation on dev at Epoch 2/3. Step:600/900: \n", + "AccuracyMetric: acc=0.886667\n", + "\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluate data in 0.1 seconds!\n", + "Evaluation on dev at Epoch 3/3. Step:900/900: \n", + "AccuracyMetric: acc=0.890833\n", + "\n", + "\r\n", + "In Epoch:3/Step:900, got best dev performance:\n", + "AccuracyMetric: acc=0.890833\n", + "Reloaded the best model.\n" + ] + } + ], + "source": [ + "from fastNLP import (Callback, EarlyStopCallback,\n", + " Trainer, CrossEntropyLoss, AccuracyMetric)\n", + "from fastNLP.models import CNNText\n", + "import torch.cuda\n", + "\n", + "# prepare data\n", + "def get_data():\n", + " from fastNLP.io import ChnSentiCorpPipe as pipe\n", + " data = pipe().process_from_file()\n", + " print(data)\n", + " data.rename_field('chars', 'words')\n", + " train_data = data.datasets['train']\n", + " dev_data = data.datasets['dev']\n", + " test_data = data.datasets['test']\n", + " vocab = data.vocabs['words']\n", + " tgt_vocab = data.vocabs['target']\n", + " return train_data, dev_data, test_data, vocab, tgt_vocab\n", + "\n", + "# prepare model\n", + "train_data, dev_data, _, vocab, tgt_vocab = get_data()\n", + "device = 'cuda:0' if torch.cuda.is_available() else 'cpu'\n", + "model = CNNText((len(vocab),50), num_classes=len(tgt_vocab))\n", + "\n", + "# define callback\n", + "callbacks=[EarlyStopCallback(5)]\n", + "\n", + "# pass callbacks to Trainer\n", + "def train_with_callback(cb_list):\n", + " trainer = Trainer(\n", + " device=device,\n", + " n_epochs=3,\n", + " model=model, \n", + " train_data=train_data, \n", + " dev_data=dev_data, \n", + " loss=CrossEntropyLoss(), \n", + " metrics=AccuracyMetric(), \n", + " callbacks=cb_list, \n", + " check_code_level=-1\n", + " )\n", + " trainer.train()\n", + "\n", + "train_with_callback(callbacks)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "fastNLP 中的 Callback\n", + "-------\n", + "fastNLP 中提供了很多常用的 Callback,如梯度裁剪,训练时早停和测试验证集,fitlog 等等。具体 Callback 请参考 fastNLP.core.callbacks" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2019-09-17T07:35:02.182727Z", + "start_time": "2019-09-17T07:34:49.443863Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "training epochs started 2019-09-17-03-34-49\n" + 
] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=900), HTML(value='')), layout=Layout(display=…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluate data in 0.13 seconds!\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluate data in 0.12 seconds!\n", + "Evaluation on data-test:\n", + "AccuracyMetric: acc=0.890833\n", + "Evaluation on dev at Epoch 1/3. Step:300/900: \n", + "AccuracyMetric: acc=0.890833\n", + "\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluate data in 0.09 seconds!\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluate data in 0.09 seconds!\n", + "Evaluation on data-test:\n", + "AccuracyMetric: acc=0.8875\n", + "Evaluation on dev at Epoch 2/3. Step:600/900: \n", + "AccuracyMetric: acc=0.8875\n", + "\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluate data in 0.11 seconds!\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluate data in 0.1 seconds!\n", + "Evaluation on data-test:\n", + "AccuracyMetric: acc=0.885\n", + "Evaluation on dev at Epoch 3/3. 
Step:900/900: \n", + "AccuracyMetric: acc=0.885\n", + "\n", + "\r\n", + "In Epoch:1/Step:300, got best dev performance:\n", + "AccuracyMetric: acc=0.890833\n", + "Reloaded the best model.\n" + ] + } + ], + "source": [ + "from fastNLP import EarlyStopCallback, GradientClipCallback, EvaluateCallback\n", + "callbacks = [\n", + " EarlyStopCallback(5),\n", + " GradientClipCallback(clip_value=5, clip_type='value'),\n", + " EvaluateCallback(dev_data)\n", + "]\n", + "\n", + "train_with_callback(callbacks)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "自定义 Callback\n", + "------\n", + "\n", + "这里我们以一个简单的 Callback作为例子,它的作用是打印每一个 Epoch 平均训练 loss。\n", + "\n", + "#### 创建 Callback\n", + " \n", + "要自定义 Callback,我们要实现一个类,继承 fastNLP.Callback。\n", + "\n", + "这里我们定义 MyCallBack ,继承 fastNLP.Callback 。\n", + "\n", + "#### 指定 Callback 调用的阶段\n", + " \n", + "Callback 中所有以 on_ 开头的类方法会在 Trainer 的训练中在特定阶段调用。 如 on_train_begin() 会在训练开始时被调用,on_epoch_end() 会在每个 epoch 结束时调用。 具体有哪些类方法,参见 Callback 文档。\n", + "\n", + "这里, MyCallBack 在求得loss时调用 on_backward_begin() 记录当前 loss ,在每一个 epoch 结束时调用 on_epoch_end() ,求当前 epoch 平均loss并输出。\n", + "\n", + "#### 使用 Callback 的属性访问 Trainer 的内部信息\n", + " \n", + "为了方便使用,可以使用 Callback 的属性,访问 Trainer 中的对应信息,如 optimizer, epoch, n_epochs,分别对应训练时的优化器,当前 epoch 数,和总 epoch 数。 具体可访问的属性,参见文档 Callback 。\n", + "\n", + "这里, MyCallBack 为了求平均 loss ,需要知道当前 epoch 的总步数,可以通过 self.step 属性得到当前训练了多少步。\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2019-09-17T07:43:10.907139Z", + "start_time": "2019-09-17T07:42:58.488177Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "training epochs started 2019-09-17-03-42-58\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=900), HTML(value='')), layout=Layout(display=…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluate data in 0.11 seconds!\n", + "Evaluation on dev at Epoch 1/3. Step:300/900: \n", + "AccuracyMetric: acc=0.883333\n", + "\n", + "Avg loss at epoch 1, 0.100254\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluate data in 0.1 seconds!\n", + "Evaluation on dev at Epoch 2/3. 
Step:600/900: \n", + "AccuracyMetric: acc=0.8775\n", + "\n", + "Avg loss at epoch 2, 0.183511\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluate data in 0.13 seconds!\n", + "Evaluation on dev at Epoch 3/3. Step:900/900: \n", + "AccuracyMetric: acc=0.875833\n", + "\n", + "Avg loss at epoch 3, 0.257103\n", + "\r\n", + "In Epoch:1/Step:300, got best dev performance:\n", + "AccuracyMetric: acc=0.883333\n", + "Reloaded the best model.\n" + ] + } + ], + "source": [ + "from fastNLP import Callback\n", + "from fastNLP import logger\n", + "\n", + "class MyCallBack(Callback):\n", + " \"\"\"Print average loss in each epoch\"\"\"\n", + " def __init__(self):\n", + " super().__init__()\n", + " self.total_loss = 0\n", + " self.start_step = 0\n", + " \n", + " def on_backward_begin(self, loss):\n", + " self.total_loss += loss.item()\n", + " \n", + " def on_epoch_end(self):\n", + " n_steps = self.step - self.start_step\n", + " avg_loss = self.total_loss / n_steps\n", + " logger.info('Avg loss at epoch %d, %.6f', self.epoch, avg_loss)\n", + " self.start_step = self.step\n", + "\n", + "callbacks = [MyCallBack()]\n", + "train_with_callback(callbacks)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tutorials/文本分类.ipynb b/tutorials/文本分类.ipynb new file mode 100644 index 00000000..de29f632 --- /dev/null +++ b/tutorials/文本分类.ipynb @@ -0,0 +1,834 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 文本分类(Text classification)\n", + "文本分类任务是将一句话或一段话划分到某个具体的类别。比如垃圾邮件识别,文本情绪分类等。\n", + "\n", + "Example:: \n", + "1,商务大床房,房间很大,床有2M宽,整体感觉经济实惠不错!\n", + "\n", + "\n", + "其中开头的1是只这条评论的标签,表示是正面的情绪。我们将使用到的数据可以通过http://dbcloud.irocn.cn:8989/api/public/dl/dataset/chn_senti_corp.zip 下载并解压,当然也可以通过fastNLP自动下载该数据。\n", + "\n", + "数据中的内容如下图所示。接下来,我们将用fastNLP在这个数据上训练一个分类网络。" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![jupyter](./cn_cls_example.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 步骤\n", + "一共有以下的几个步骤 \n", + "(1) 读取数据 \n", + "(2) 预处理数据 \n", + "(3) 选择预训练词向量 \n", + "(4) 创建模型 \n", + "(5) 训练模型 " + ] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "### (1) 读取数据\n", + "fastNLP提供多种数据的自动下载与自动加载功能,对于这里我们要用到的数据,我们可以用\\ref{Loader}自动下载并加载该数据。更多有关Loader的使用可以参考\\ref{Loader}" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from fastNLP.io import ChnSentiCorpLoader\n", + "\n", + "loader = ChnSentiCorpLoader() # 初始化一个中文情感分类的loader\n", + "data_dir = loader.download() # 这一行代码将自动下载数据到默认的缓存地址, 并将该地址返回\n", + "data_bundle = loader.load(data_dir) # 这一行代码将从{data_dir}处读取数据至DataBundle" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "DataBundle的相关介绍,可以参考\\ref{}。我们可以打印该data_bundle的基本信息。" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "In total 3 datasets:\n", + "\tdev has 1200 instances.\n", + "\ttrain has 9600 instances.\n", + "\ttest has 1200 instances.\n", + "In total 0 vocabs:\n", + "\n" + ] + } + ], + "source": [ + "print(data_bundle)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "可以看出,该data_bundle中一个含有三个\\ref{DataSet}。通过下面的代码,我们可以查看DataSet的基本情况" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DataSet({'raw_chars': 选择珠江花园的原因就是方便,有电动扶梯直接到达海边,周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般,但还算整洁。 泳池在大堂的屋顶,因此很小,不过女儿倒是喜欢。 包的早餐是西式的,还算丰富。 服务吗,一般 type=str,\n", + "'target': 1 type=str},\n", + "{'raw_chars': 15.4寸笔记本的键盘确实爽,基本跟台式机差不多了,蛮喜欢数字小键盘,输数字特方便,样子也很美观,做工也相当不错 type=str,\n", + "'target': 1 type=str})\n" + ] + } + ], + "source": [ + "print(data_bundle.get_dataset('train')[:2]) # 查看Train集前两个sample" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### (2) 预处理数据\n", + "在NLP任务中,预处理一般包括: (a)将一整句话切分成汉字或者词; (b)将文本转换为index \n", + "\n", + "fastNLP中也提供了多种数据集的处理类,这里我们直接使用fastNLP的ChnSentiCorpPipe。更多关于Pipe的说明可以参考\\ref{Pipe}。" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from fastNLP.io import ChnSentiCorpPipe\n", + "\n", + "pipe = ChnSentiCorpPipe()\n", + "data_bundle = pipe.process(data_bundle) # 所有的Pipe都实现了process()方法,且输入输出都为DataBundle类型" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "In total 3 datasets:\n", + "\tdev has 1200 instances.\n", + "\ttrain has 9600 instances.\n", + "\ttest has 1200 instances.\n", + "In total 2 vocabs:\n", + "\tchars has 4409 entries.\n", + "\ttarget has 2 entries.\n", + "\n" + ] + } + ], + "source": [ + "print(data_bundle) # 打印data_bundle,查看其变化" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "可以看到除了之前已经包含的3个\\ref{DataSet}, 还新增了两个\\ref{Vocabulary}。我们可以打印DataSet中的内容" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DataSet({'raw_chars': 选择珠江花园的原因就是方便,有电动扶梯直接到达海边,周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般,但还算整洁。 泳池在大堂的屋顶,因此很小,不过女儿倒是喜欢。 包的早餐是西式的,还算丰富。 服务吗,一般 type=str,\n", + "'target': 1 type=int,\n", + "'chars': [338, 464, 1400, 784, 468, 739, 3, 289, 151, 21, 5, 88, 143, 2, 9, 81, 134, 2573, 766, 233, 196, 23, 536, 342, 297, 2, 405, 698, 132, 281, 74, 744, 1048, 74, 420, 387, 74, 412, 433, 74, 2021, 180, 8, 219, 1929, 213, 4, 34, 31, 96, 363, 8, 230, 2, 66, 18, 229, 331, 768, 4, 11, 1094, 479, 17, 35, 593, 3, 1126, 967, 2, 151, 245, 12, 44, 2, 6, 52, 260, 263, 635, 5, 152, 
162, 4, 11, 336, 3, 154, 132, 5, 236, 443, 3, 2, 18, 229, 761, 700, 4, 11, 48, 59, 653, 2, 8, 230] type=list,\n", + "'seq_len': 106 type=int},\n", + "{'raw_chars': 15.4寸笔记本的键盘确实爽,基本跟台式机差不多了,蛮喜欢数字小键盘,输数字特方便,样子也很美观,做工也相当不错 type=str,\n", + "'target': 1 type=int,\n", + "'chars': [50, 133, 20, 135, 945, 520, 343, 24, 3, 301, 176, 350, 86, 785, 2, 456, 24, 461, 163, 443, 128, 109, 6, 47, 7, 2, 916, 152, 162, 524, 296, 44, 301, 176, 2, 1384, 524, 296, 259, 88, 143, 2, 92, 67, 26, 12, 277, 269, 2, 188, 223, 26, 228, 83, 6, 63] type=list,\n", + "'seq_len': 56 type=int})\n" + ] + } + ], + "source": [ + "print(data_bundle.get_dataset('train')[:2])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "新增了一列为数字列表的chars,以及变为数字的target列。可以看出这两列的名称和刚好与data_bundle中两个Vocabulary的名称是一致的,我们可以打印一下Vocabulary看一下里面的内容。" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Vocabulary(['选', '择', '珠', '江', '花']...)\n" + ] + } + ], + "source": [ + "char_vocab = data_bundle.get_vocab('chars')\n", + "print(char_vocab)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Vocabulary是一个记录着词语与index之间映射关系的类,比如" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "'选'的index是338\n", + "index:338对应的汉字是选\n" + ] + } + ], + "source": [ + "index = char_vocab.to_index('选')\n", + "print(\"'选'的index是{}\".format(index)) # 这个值与上面打印出来的第一个instance的chars的第一个index是一致的\n", + "print(\"index:{}对应的汉字是{}\".format(index, char_vocab.to_word(index))) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### (3) 选择预训练词向量 \n", + "由于Word2vec, Glove, Elmo, Bert等预训练模型可以增强模型的性能,所以在训练具体任务前,选择合适的预训练词向量非常重要。在fastNLP中我们提供了多种Embedding使得加载这些预训练模型的过程变得更加便捷。更多关于Embedding的说明可以参考\\ref{Embedding}。这里我们先给出一个使用word2vec的中文汉字预训练的示例,之后再给出一个使用Bert的文本分类。这里使用的预训练词向量为'cn-fastnlp-100d',fastNLP将自动下载该embedding至本地缓存,fastNLP支持使用名字指定的Embedding以及相关说明可以参见\\ref{Embedding}" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 4321 out of 4409 words in the pre-training embedding.\n" + ] + } + ], + "source": [ + "from fastNLP.embeddings import StaticEmbedding\n", + "\n", + "word2vec_embed = StaticEmbedding(char_vocab, model_dir_or_name='cn-char-fastnlp-100d')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### (4) 创建模型\n", + "这里我们使用到的模型结构如下所示,补图" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "from torch import nn\n", + "from fastNLP.modules import LSTM\n", + "import torch\n", + "\n", + "# 定义模型\n", + "class BiLSTMMaxPoolCls(nn.Module):\n", + " def __init__(self, embed, num_classes, hidden_size=400, num_layers=1, dropout=0.3):\n", + " super().__init__()\n", + " self.embed = embed\n", + " \n", + " self.lstm = LSTM(self.embed.embedding_dim, hidden_size=hidden_size//2, num_layers=num_layers, \n", + " batch_first=True, bidirectional=True)\n", + " self.dropout_layer = nn.Dropout(dropout)\n", + " self.fc = nn.Linear(hidden_size, num_classes)\n", + " \n", + " def forward(self, chars, seq_len): # 这里的名称必须和DataSet中相应的field对应,比如之前我们DataSet中有chars,这里就必须为chars\n", + " # chars:[batch_size, max_len]\n", + " # seq_len: [batch_size, ]\n", + " chars = self.embed(chars)\n", + " outputs, _ = self.lstm(chars, seq_len)\n", + " outputs 
= self.dropout_layer(outputs)\n", + " outputs, _ = torch.max(outputs, dim=1)\n", + " outputs = self.fc(outputs)\n", + " \n", + " return {'pred':outputs} # [batch_size,], 返回值必须是dict类型,且预测值的key建议设为pred\n", + "\n", + "# 初始化模型\n", + "model = BiLSTMMaxPoolCls(word2vec_embed, len(data_bundle.get_vocab('target')))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### (5) 训练模型\n", + "fastNLP提供了Trainer对象来组织训练过程,包括完成loss计算(所以在初始化Trainer的时候需要指定loss类型),梯度更新(所以在初始化Trainer的时候需要提供优化器optimizer)以及在验证集上的性能验证(所以在初始化时需要提供一个Metric)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "input fields after batch(if batch size is 2):\n", + "\ttarget: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n", + "\tchars: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 106]) \n", + "\tseq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n", + "target fields after batch(if batch size is 2):\n", + "\ttarget: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n", + "\tseq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n", + "\n", + "Evaluate data in 0.01 seconds!\n", + "training epochs started 2019-09-03-23-57-10\n" + ] + }, + { + "data": { + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=3000), HTML(value='')), layout=Layout(display…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "Evaluate data in 0.43 seconds!\n", + "\r", + "Evaluation on dev at Epoch 1/10. Step:300/3000: \n", + "\r", + "AccuracyMetric: acc=0.81\n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "Evaluate data in 0.44 seconds!\n", + "\r", + "Evaluation on dev at Epoch 2/10. Step:600/3000: \n", + "\r", + "AccuracyMetric: acc=0.8675\n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "Evaluate data in 0.44 seconds!\n", + "\r", + "Evaluation on dev at Epoch 3/10. Step:900/3000: \n", + "\r", + "AccuracyMetric: acc=0.878333\n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "Evaluate data in 0.43 seconds!\n", + "\r", + "Evaluation on dev at Epoch 4/10. 
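Before handing the model to the Trainer, it can be useful to sanity-check it. The snippet below is an optional sketch in plain PyTorch that reuses the `model` and `char_vocab` objects defined above; the fake batch and the expected `[2, 2]` output shape (two samples, two target classes) are only for illustration.

```python
import torch

# Number of trainable parameters (embedding + LSTM + classification head).
n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'{n_params:,} trainable parameters')

# Tiny fake batch, just to confirm the output contract: a dict with a 'pred'
# entry of shape [batch_size, num_classes]. Ids are random but in-vocabulary.
device = next(model.parameters()).device
chars = torch.randint(1, len(char_vocab), (2, 10)).to(device)  # [2, 10] fake char ids
seq_len = torch.tensor([10, 7]).to(device)                     # lengths <= max_len
with torch.no_grad():
    out = model(chars, seq_len)
print(out['pred'].shape)  # expected: torch.Size([2, 2])
```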
Step:1200/3000: \n", + "\r", + "AccuracyMetric: acc=0.873333\n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "Evaluate data in 0.44 seconds!\n", + "\r", + "Evaluation on dev at Epoch 5/10. Step:1500/3000: \n", + "\r", + "AccuracyMetric: acc=0.878333\n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "Evaluate data in 0.42 seconds!\n", + "\r", + "Evaluation on dev at Epoch 6/10. Step:1800/3000: \n", + "\r", + "AccuracyMetric: acc=0.895833\n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "Evaluate data in 0.44 seconds!\n", + "\r", + "Evaluation on dev at Epoch 7/10. Step:2100/3000: \n", + "\r", + "AccuracyMetric: acc=0.8975\n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "Evaluate data in 0.43 seconds!\n", + "\r", + "Evaluation on dev at Epoch 8/10. Step:2400/3000: \n", + "\r", + "AccuracyMetric: acc=0.894167\n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "Evaluate data in 0.48 seconds!\n", + "\r", + "Evaluation on dev at Epoch 9/10. Step:2700/3000: \n", + "\r", + "AccuracyMetric: acc=0.8875\n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "Evaluate data in 0.43 seconds!\n", + "\r", + "Evaluation on dev at Epoch 10/10. 
Step:3000/3000: \n", + "\r", + "AccuracyMetric: acc=0.895833\n", + "\n", + "\r\n", + "In Epoch:7/Step:2100, got best dev performance:\n", + "AccuracyMetric: acc=0.8975\n", + "Reloaded the best model.\n" + ] + }, + { + "data": { + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=19), HTML(value='')), layout=Layout(display='…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "Evaluate data in 0.34 seconds!\n", + "[tester] \n", + "AccuracyMetric: acc=0.8975\n" + ] + }, + { + "data": { + "text/plain": [ + "{'AccuracyMetric': {'acc': 0.8975}}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from fastNLP import Trainer\n", + "from fastNLP import CrossEntropyLoss\n", + "from torch.optim import Adam\n", + "from fastNLP import AccuracyMetric\n", + "\n", + "loss = CrossEntropyLoss()\n", + "optimizer = Adam(model.parameters(), lr=0.001)\n", + "metric = AccuracyMetric()\n", + "device = 0 if torch.cuda.is_available() else 'cpu' # 如果有gpu的话在gpu上运行,训练速度会更快\n", + "\n", + "trainer = Trainer(train_data=data_bundle.get_dataset('train'), model=model, loss=loss, \n", + " optimizer=optimizer, batch_size=32, dev_data=data_bundle.get_dataset('dev'),\n", + " metrics=metric, device=device)\n", + "trainer.train() # 开始训练,训练完成之后默认会加载在dev上表现最好的模型\n", + "\n", + "# 在测试集上测试一下模型的性能\n", + "from fastNLP import Tester\n", + "print(\"Performance on test is:\")\n", + "tester = Tester(data=data_bundle.get_dataset('test'), model=model, metrics=metric, batch_size=64, device=device)\n", + "tester.test()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 使用Bert进行文本分类" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "loading vocabulary file /home/yh/.fastNLP/embedding/bert-chinese-wwm/vocab.txt\n", + "Load pre-trained BERT parameters from file /home/yh/.fastNLP/embedding/bert-chinese-wwm/chinese_wwm_pytorch.bin.\n", + "Start to generating word pieces for word.\n", + "Found(Or segment into word pieces) 4286 words out of 4409.\n", + "input fields after batch(if batch size is 2):\n", + "\ttarget: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n", + "\tchars: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 106]) \n", + "\tseq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n", + "target fields after batch(if batch size is 2):\n", + "\ttarget: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n", + "\tseq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n", + "\n", + "Evaluate data in 0.05 seconds!\n", + "training epochs started 2019-09-04-00-02-37\n" + ] + }, + { + "data": { + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=3600), HTML(value='')), layout=Layout(display…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=150), HTML(value='')), layout=Layout(display=…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "Evaluate data in 15.89 seconds!\n", + "\r", + "Evaluation on dev at Epoch 1/3. 
Step:1200/3600: \n", + "\r", + "AccuracyMetric: acc=0.9\n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=150), HTML(value='')), layout=Layout(display=…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "Evaluate data in 15.92 seconds!\n", + "\r", + "Evaluation on dev at Epoch 2/3. Step:2400/3600: \n", + "\r", + "AccuracyMetric: acc=0.904167\n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=150), HTML(value='')), layout=Layout(display=…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "Evaluate data in 15.91 seconds!\n", + "\r", + "Evaluation on dev at Epoch 3/3. Step:3600/3600: \n", + "\r", + "AccuracyMetric: acc=0.918333\n", + "\n", + "\r\n", + "In Epoch:3/Step:3600, got best dev performance:\n", + "AccuracyMetric: acc=0.918333\n", + "Reloaded the best model.\n", + "Performance on test is:\n" + ] + }, + { + "data": { + "text/plain": [ + "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=19), HTML(value='')), layout=Layout(display='…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r", + "Evaluate data in 29.24 seconds!\n", + "[tester] \n", + "AccuracyMetric: acc=0.919167\n" + ] + }, + { + "data": { + "text/plain": [ + "{'AccuracyMetric': {'acc': 0.919167}}" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 只需要切换一下Embedding即可\n", + "from fastNLP.embeddings import BertEmbedding\n", + "\n", + "# 这里为了演示一下效果,所以默认Bert不更新权重\n", + "bert_embed = BertEmbedding(char_vocab, model_dir_or_name='cn', auto_truncate=True, requires_grad=False)\n", + "model = BiLSTMMaxPoolCls(bert_embed, len(data_bundle.get_vocab('target')), )\n", + "\n", + "\n", + "import torch\n", + "from fastNLP import Trainer\n", + "from fastNLP import CrossEntropyLoss\n", + "from torch.optim import Adam\n", + "from fastNLP import AccuracyMetric\n", + "\n", + "loss = CrossEntropyLoss()\n", + "optimizer = Adam(model.parameters(), lr=2e-5)\n", + "metric = AccuracyMetric()\n", + "device = 0 if torch.cuda.is_available() else 'cpu' # 如果有gpu的话在gpu上运行,训练速度会更快\n", + "\n", + "trainer = Trainer(train_data=data_bundle.get_dataset('train'), model=model, loss=loss, \n", + " optimizer=optimizer, batch_size=16, dev_data=data_bundle.get_dataset('test'),\n", + " metrics=metric, device=device, n_epochs=3)\n", + "trainer.train() # 开始训练,训练完成之后默认会加载在dev上表现最好的模型\n", + "\n", + "# 在测试集上测试一下模型的性能\n", + "from fastNLP import Tester\n", + "print(\"Performance on test is:\")\n", + "tester = Tester(data=data_bundle.get_dataset('test'), model=model, metrics=metric, batch_size=64, device=device)\n", + "tester.test()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}