You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

tutorial_1_data_preprocess.ipynb 8.5 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "markdown",
  5. "metadata": {},
  6. "source": [
  7. "# fastNLP中的DataSet"
  8. ]
  9. },
  10. {
  11. "cell_type": "code",
  12. "execution_count": 1,
  13. "metadata": {},
  14. "outputs": [
  15. {
  16. "name": "stdout",
  17. "output_type": "stream",
  18. "text": [
  19. "+------------------------------+---------------------------------------------+---------+\n",
  20. "| raw_words | words | seq_len |\n",
  21. "+------------------------------+---------------------------------------------+---------+\n",
  22. "| This is the first instance . | ['this', 'is', 'the', 'first', 'instance... | 6 |\n",
  23. "| Second instance . | ['Second', 'instance', '.'] | 3 |\n",
  24. "| Third instance . | ['Third', 'instance', '.'] | 3 |\n",
  25. "+------------------------------+---------------------------------------------+---------+\n"
  26. ]
  27. }
  28. ],
  29. "source": [
  30. "from fastNLP import DataSet\n",
  31. "data = {'raw_words':[\"This is the first instance .\", \"Second instance .\", \"Third instance .\"],\n",
  32. " 'words': [['this', 'is', 'the', 'first', 'instance', '.'], ['Second', 'instance', '.'], ['Third', 'instance', '.']],\n",
  33. " 'seq_len': [6, 3, 3]}\n",
  34. "dataset = DataSet(data)\n",
  35. "# 传入的dict的每个key的value应该为具有相同长度的list\n",
  36. "print(dataset)"
  37. ]
  38. },
  39. {
  40. "cell_type": "markdown",
  41. "metadata": {},
  42. "source": [
  43. "## DataSet的构建"
  44. ]
  45. },
  46. {
  47. "cell_type": "code",
  48. "execution_count": 2,
  49. "metadata": {},
  50. "outputs": [
  51. {
  52. "data": {
  53. "text/plain": [
  54. "+----------------------------+---------------------------------------------+---------+\n",
  55. "| raw_words | words | seq_len |\n",
  56. "+----------------------------+---------------------------------------------+---------+\n",
  57. "| This is the first instance | ['this', 'is', 'the', 'first', 'instance... | 6 |\n",
  58. "+----------------------------+---------------------------------------------+---------+"
  59. ]
  60. },
  61. "execution_count": 2,
  62. "metadata": {},
  63. "output_type": "execute_result"
  64. }
  65. ],
  66. "source": [
  67. "from fastNLP import DataSet\n",
  68. "from fastNLP import Instance\n",
  69. "dataset = DataSet()\n",
  70. "instance = Instance(raw_words=\"This is the first instance\",\n",
  71. " words=['this', 'is', 'the', 'first', 'instance', '.'],\n",
  72. " seq_len=6)\n",
  73. "dataset.append(instance)\n",
  74. "dataset"
  75. ]
  76. },
  77. {
  78. "cell_type": "code",
  79. "execution_count": 3,
  80. "metadata": {},
  81. "outputs": [
  82. {
  83. "data": {
  84. "text/plain": [
  85. "+----------------------------+---------------------------------------------+---------+\n",
  86. "| raw_words | words | seq_len |\n",
  87. "+----------------------------+---------------------------------------------+---------+\n",
  88. "| This is the first instance | ['this', 'is', 'the', 'first', 'instance... | 6 |\n",
  89. "| Second instance . | ['Second', 'instance', '.'] | 3 |\n",
  90. "+----------------------------+---------------------------------------------+---------+"
  91. ]
  92. },
  93. "execution_count": 3,
  94. "metadata": {},
  95. "output_type": "execute_result"
  96. }
  97. ],
  98. "source": [
  99. "from fastNLP import DataSet\n",
  100. "from fastNLP import Instance\n",
  101. "dataset = DataSet([\n",
  102. " Instance(raw_words=\"This is the first instance\",\n",
  103. " words=['this', 'is', 'the', 'first', 'instance', '.'],\n",
  104. " seq_len=6),\n",
  105. " Instance(raw_words=\"Second instance .\",\n",
  106. " words=['Second', 'instance', '.'],\n",
  107. " seq_len=3)\n",
  108. " ])\n",
  109. "dataset"
  110. ]
  111. },
  112. {
  113. "cell_type": "markdown",
  114. "metadata": {},
  115. "source": [
  116. "## DataSet的删除"
  117. ]
  118. },
  119. {
  120. "cell_type": "code",
  121. "execution_count": 4,
  122. "metadata": {},
  123. "outputs": [
  124. {
  125. "data": {
  126. "text/plain": [
  127. "+----+---+\n",
  128. "| a | c |\n",
  129. "+----+---+\n",
  130. "| -5 | 0 |\n",
  131. "| -4 | 0 |\n",
  132. "| -3 | 0 |\n",
  133. "| -2 | 0 |\n",
  134. "| -1 | 0 |\n",
  135. "| 0 | 0 |\n",
  136. "| 1 | 0 |\n",
  137. "| 2 | 0 |\n",
  138. "| 3 | 0 |\n",
  139. "| 4 | 0 |\n",
  140. "+----+---+"
  141. ]
  142. },
  143. "execution_count": 4,
  144. "metadata": {},
  145. "output_type": "execute_result"
  146. }
  147. ],
  148. "source": [
  149. "from fastNLP import DataSet\n",
  150. "dataset = DataSet({'a': range(-5, 5), 'c': [0]*10})\n",
  151. "dataset"
  152. ]
  153. },
  154. {
  155. "cell_type": "code",
  156. "execution_count": 5,
  157. "metadata": {},
  158. "outputs": [
  159. {
  160. "data": {
  161. "text/plain": [
  162. "+---+\n",
  163. "| c |\n",
  164. "+---+\n",
  165. "| 0 |\n",
  166. "| 0 |\n",
  167. "| 0 |\n",
  168. "| 0 |\n",
  169. "+---+"
  170. ]
  171. },
  172. "execution_count": 5,
  173. "metadata": {},
  174. "output_type": "execute_result"
  175. }
  176. ],
  177. "source": [
  178. "# 不改变dataset,生成一个删除了满足条件的instance的新 DataSet\n",
  179. "dropped_dataset = dataset.drop(lambda ins:ins['a']<0, inplace=False)\n",
  180. "# 在dataset中删除满足条件的instance\n",
  181. "dataset.drop(lambda ins:ins['a']<0)\n",
  182. "# 删除第3个instance\n",
  183. "dataset.delete_instance(2)\n",
  184. "# 删除名为'a'的field\n",
  185. "dataset.delete_field('a')\n",
  186. "dataset"
  187. ]
  188. },
  189. {
  190. "cell_type": "markdown",
  191. "metadata": {},
  192. "source": [
  193. "## 简单的数据预处理"
  194. ]
  195. },
  196. {
  197. "cell_type": "code",
  198. "execution_count": 6,
  199. "metadata": {},
  200. "outputs": [
  201. {
  202. "name": "stdout",
  203. "output_type": "stream",
  204. "text": [
  205. "False\n"
  206. ]
  207. },
  208. {
  209. "data": {
  210. "text/plain": [
  211. "4"
  212. ]
  213. },
  214. "execution_count": 6,
  215. "metadata": {},
  216. "output_type": "execute_result"
  217. }
  218. ],
  219. "source": [
  220. "# 检查是否存在名为'a'的field\n",
  221. "print(dataset.has_field('a')) # 或 ('a' in dataset)\n",
  223. "# 将名为'c'的field改名为'b'\n",
  223. "dataset.rename_field('c', 'b')\n",
  224. "# DataSet的长度\n",
  225. "len(dataset)"
  226. ]
  227. },
  228. {
  229. "cell_type": "code",
  230. "execution_count": 7,
  231. "metadata": {},
  232. "outputs": [
  233. {
  234. "data": {
  235. "text/plain": [
  236. "+------------------------------+-------------------------------------------------+\n",
  237. "| raw_words | words |\n",
  238. "+------------------------------+-------------------------------------------------+\n",
  239. "| This is the first instance . | ['This', 'is', 'the', 'first', 'instance', '.'] |\n",
  240. "| Second instance . | ['Second', 'instance', '.'] |\n",
  241. "| Third instance . | ['Third', 'instance', '.'] |\n",
  242. "+------------------------------+-------------------------------------------------+"
  243. ]
  244. },
  245. "execution_count": 7,
  246. "metadata": {},
  247. "output_type": "execute_result"
  248. }
  249. ],
  250. "source": [
  251. "from fastNLP import DataSet\n",
  252. "data = {'raw_words':[\"This is the first instance .\", \"Second instance .\", \"Third instance .\"]}\n",
  253. "dataset = DataSet(data)\n",
  254. "\n",
  255. "# 将句子分成单词形式, 详见DataSet.apply()方法\n",
  256. "dataset.apply(lambda ins: ins['raw_words'].split(), new_field_name='words')\n",
  257. "\n",
  258. "# 或使用DataSet.apply_field()\n",
  259. "dataset.apply_field(lambda sent:sent.split(), field_name='raw_words', new_field_name='words')\n",
  260. "\n",
  261. "# 除了匿名函数,也可以定义函数传递进去\n",
  262. "def get_words(instance):\n",
  263. " sentence = instance['raw_words']\n",
  264. " words = sentence.split()\n",
  265. " return words\n",
  266. "dataset.apply(get_words, new_field_name='words')\n",
  267. "dataset"
  268. ]
  269. }
  270. ],
  271. "metadata": {
  272. "kernelspec": {
  273. "display_name": "Python Now",
  274. "language": "python",
  275. "name": "now"
  276. },
  277. "language_info": {
  278. "codemirror_mode": {
  279. "name": "ipython",
  280. "version": 3
  281. },
  282. "file_extension": ".py",
  283. "mimetype": "text/x-python",
  284. "name": "python",
  285. "nbconvert_exporter": "python",
  286. "pygments_lexer": "ipython3",
  287. "version": "3.8.0"
  288. }
  289. },
  290. "nbformat": 4,
  291. "nbformat_minor": 2
  292. }