You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-'), and can be up to 35 characters long.

test_decisionTree.ipynb 20 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "code",
  5. "execution_count": 8,
  6. "id": "64ff4cb2-6a11-4558-9b58-02d23d391b34",
  7. "metadata": {},
  8. "outputs": [],
  9. "source": [
  10. "import pandas as pd\n",
  11. "import json\n",
  12. "from sklearn import tree\n",
  13. "from sklearn.model_selection import train_test_split as tsplit \n",
  14. "from sklearn.metrics import classification_report\n",
  15. "from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n",
  16. "from sklearn.preprocessing import OneHotEncoder"
  17. ]
  18. },
  19. {
  20. "cell_type": "code",
  21. "execution_count": 9,
  22. "id": "4afcae4b-305f-4ce6-af54-08edba088e0b",
  23. "metadata": {},
  24. "outputs": [],
  25. "source": [
  26. "def transform_value(val):\n",
  27. " return val.split(':')[1]"
  28. ]
  29. },
  30. {
  31. "cell_type": "code",
  32. "execution_count": 10,
  33. "id": "1b1287ad-40c8-4059-ad05-097bad2feac7",
  34. "metadata": {},
  35. "outputs": [],
  36. "source": [
  37. "def extract_data(s):\n",
  38. " data_temp = pd.read_json(s)\n",
  39. " columns = ['AV', 'AC', 'PR', 'UI', 'S', 'C', 'I', 'A']\n",
  40. " vectorString = data_temp['vectorString']\n",
  41. " temp = []\n",
  42. " for i in range(vectorString.size):\n",
  43. " part = vectorString[i].split('/')\n",
  44. " list_items = part[1::]\n",
  45. " temp.append(list_items)\n",
  46. " data = pd.DataFrame(temp, columns=columns)\n",
  47. " data = data.applymap(transform_value)\n",
  48. " data['severity'] = data_temp['severity']\n",
  49. " return data"
  50. ]
  51. },
  52. {
  53. "cell_type": "code",
  54. "execution_count": 11,
  55. "id": "6962b88e-2523-4bde-8fa6-df96bfbc5221",
  56. "metadata": {},
  57. "outputs": [
  58. {
  59. "name": "stdout",
  60. "output_type": "stream",
  61. "text": [
  62. " AV AC PR UI S C I A severity\n",
  63. "0 N L N R C L L N MEDIUM\n",
  64. "1 N L N N U N N H HIGH\n",
  65. "2 N L N N U N N H HIGH\n",
  66. "3 N L N R C L L N MEDIUM\n",
  67. "4 N L N R C L L N MEDIUM\n",
  68. ".. .. .. .. .. .. .. .. .. ...\n",
  69. "705 N L N N U H H H CRITICAL\n",
  70. "706 L L L N U H N N MEDIUM\n",
  71. "707 N L N N U H H H CRITICAL\n",
  72. "708 N L N N U N L L MEDIUM\n",
  73. "709 N L N N U H N N HIGH\n",
  74. "\n",
  75. "[710 rows x 9 columns]\n"
  76. ]
  77. }
  78. ],
  79. "source": [
  80. "data_train = extract_data('SIR_train_set.json')\n",
  81. "data_test = extract_data('SIR_test_set.json')\n",
  82. "data_validation = extract_data('SIR_validation_set.json')\n",
  83. "data_train\n",
  84. "print(data_test)"
  85. ]
  86. },
  87. {
  88. "cell_type": "code",
  89. "execution_count": 12,
  90. "id": "49ccfdf6-99f0-4c5e-9772-03e500e6b6d6",
  91. "metadata": {},
  92. "outputs": [
  93. {
  94. "data": {
  95. "text/html": [
  96. "<div>\n",
  97. "<style scoped>\n",
  98. " .dataframe tbody tr th:only-of-type {\n",
  99. " vertical-align: middle;\n",
  100. " }\n",
  101. "\n",
  102. " .dataframe tbody tr th {\n",
  103. " vertical-align: top;\n",
  104. " }\n",
  105. "\n",
  106. " .dataframe thead th {\n",
  107. " text-align: right;\n",
  108. " }\n",
  109. "</style>\n",
  110. "<table border=\"1\" class=\"dataframe\">\n",
  111. " <thead>\n",
  112. " <tr style=\"text-align: right;\">\n",
  113. " <th></th>\n",
  114. " <th>AV</th>\n",
  115. " <th>AC</th>\n",
  116. " <th>PR</th>\n",
  117. " <th>UI</th>\n",
  118. " <th>S</th>\n",
  119. " <th>C</th>\n",
  120. " <th>I</th>\n",
  121. " <th>A</th>\n",
  122. " </tr>\n",
  123. " </thead>\n",
  124. " <tbody>\n",
  125. " <tr>\n",
  126. " <th>0</th>\n",
  127. " <td>N</td>\n",
  128. " <td>L</td>\n",
  129. " <td>N</td>\n",
  130. " <td>N</td>\n",
  131. " <td>U</td>\n",
  132. " <td>H</td>\n",
  133. " <td>N</td>\n",
  134. " <td>N</td>\n",
  135. " </tr>\n",
  136. " <tr>\n",
  137. " <th>1</th>\n",
  138. " <td>N</td>\n",
  139. " <td>L</td>\n",
  140. " <td>N</td>\n",
  141. " <td>N</td>\n",
  142. " <td>U</td>\n",
  143. " <td>H</td>\n",
  144. " <td>H</td>\n",
  145. " <td>H</td>\n",
  146. " </tr>\n",
  147. " <tr>\n",
  148. " <th>2</th>\n",
  149. " <td>N</td>\n",
  150. " <td>L</td>\n",
  151. " <td>N</td>\n",
  152. " <td>N</td>\n",
  153. " <td>U</td>\n",
  154. " <td>H</td>\n",
  155. " <td>N</td>\n",
  156. " <td>N</td>\n",
  157. " </tr>\n",
  158. " <tr>\n",
  159. " <th>3</th>\n",
  160. " <td>N</td>\n",
  161. " <td>H</td>\n",
  162. " <td>N</td>\n",
  163. " <td>N</td>\n",
  164. " <td>U</td>\n",
  165. " <td>H</td>\n",
  166. " <td>H</td>\n",
  167. " <td>H</td>\n",
  168. " </tr>\n",
  169. " <tr>\n",
  170. " <th>4</th>\n",
  171. " <td>N</td>\n",
  172. " <td>L</td>\n",
  173. " <td>N</td>\n",
  174. " <td>R</td>\n",
  175. " <td>U</td>\n",
  176. " <td>H</td>\n",
  177. " <td>H</td>\n",
  178. " <td>H</td>\n",
  179. " </tr>\n",
  180. " <tr>\n",
  181. " <th>...</th>\n",
  182. " <td>...</td>\n",
  183. " <td>...</td>\n",
  184. " <td>...</td>\n",
  185. " <td>...</td>\n",
  186. " <td>...</td>\n",
  187. " <td>...</td>\n",
  188. " <td>...</td>\n",
  189. " <td>...</td>\n",
  190. " </tr>\n",
  191. " <tr>\n",
  192. " <th>5619</th>\n",
  193. " <td>N</td>\n",
  194. " <td>L</td>\n",
  195. " <td>N</td>\n",
  196. " <td>N</td>\n",
  197. " <td>U</td>\n",
  198. " <td>N</td>\n",
  199. " <td>N</td>\n",
  200. " <td>H</td>\n",
  201. " </tr>\n",
  202. " <tr>\n",
  203. " <th>5620</th>\n",
  204. " <td>N</td>\n",
  205. " <td>L</td>\n",
  206. " <td>N</td>\n",
  207. " <td>R</td>\n",
  208. " <td>C</td>\n",
  209. " <td>L</td>\n",
  210. " <td>L</td>\n",
  211. " <td>N</td>\n",
  212. " </tr>\n",
  213. " <tr>\n",
  214. " <th>5621</th>\n",
  215. " <td>N</td>\n",
  216. " <td>L</td>\n",
  217. " <td>N</td>\n",
  218. " <td>R</td>\n",
  219. " <td>U</td>\n",
  220. " <td>N</td>\n",
  221. " <td>H</td>\n",
  222. " <td>N</td>\n",
  223. " </tr>\n",
  224. " <tr>\n",
  225. " <th>5622</th>\n",
  226. " <td>N</td>\n",
  227. " <td>L</td>\n",
  228. " <td>N</td>\n",
  229. " <td>R</td>\n",
  230. " <td>U</td>\n",
  231. " <td>N</td>\n",
  232. " <td>H</td>\n",
  233. " <td>N</td>\n",
  234. " </tr>\n",
  235. " <tr>\n",
  236. " <th>5623</th>\n",
  237. " <td>N</td>\n",
  238. " <td>L</td>\n",
  239. " <td>L</td>\n",
  240. " <td>R</td>\n",
  241. " <td>C</td>\n",
  242. " <td>L</td>\n",
  243. " <td>L</td>\n",
  244. " <td>N</td>\n",
  245. " </tr>\n",
  246. " </tbody>\n",
  247. "</table>\n",
  248. "<p>5624 rows × 8 columns</p>\n",
  249. "</div>"
  250. ],
  251. "text/plain": [
  252. " AV AC PR UI S C I A\n",
  253. "0 N L N N U H N N\n",
  254. "1 N L N N U H H H\n",
  255. "2 N L N N U H N N\n",
  256. "3 N H N N U H H H\n",
  257. "4 N L N R U H H H\n",
  258. "... .. .. .. .. .. .. .. ..\n",
  259. "5619 N L N N U N N H\n",
  260. "5620 N L N R C L L N\n",
  261. "5621 N L N R U N H N\n",
  262. "5622 N L N R U N H N\n",
  263. "5623 N L L R C L L N\n",
  264. "\n",
  265. "[5624 rows x 8 columns]"
  266. ]
  267. },
  268. "execution_count": 12,
  269. "metadata": {},
  270. "output_type": "execute_result"
  271. }
  272. ],
  273. "source": [
  274. "lw = data_train[['AV', 'AC', 'PR', 'UI', 'S', 'C', 'I', 'A']]\n",
  275. "lw"
  276. ]
  277. },
  278. {
  279. "cell_type": "code",
  280. "execution_count": 13,
  281. "id": "eef35137-c9f8-49cb-8232-506d564f1fb4",
  282. "metadata": {},
  283. "outputs": [
  284. {
  285. "name": "stdout",
  286. "output_type": "stream",
  287. "text": [
  288. " AV_A AV_L AV_N AV_P AC_H AC_L PR_H PR_L PR_N UI_N ... S_U \\\n",
  289. "0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0 ... 1.0 \n",
  290. "1 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0 ... 1.0 \n",
  291. "2 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0 ... 1.0 \n",
  292. "3 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0 1.0 ... 1.0 \n",
  293. "4 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 ... 1.0 \n",
  294. "... ... ... ... ... ... ... ... ... ... ... ... ... \n",
  295. "5619 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0 ... 1.0 \n",
  296. "5620 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 ... 0.0 \n",
  297. "5621 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 ... 1.0 \n",
  298. "5622 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 ... 1.0 \n",
  299. "5623 0.0 0.0 1.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 ... 0.0 \n",
  300. "\n",
  301. " C_H C_L C_N I_H I_L I_N A_H A_L A_N \n",
  302. "0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 \n",
  303. "1 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 \n",
  304. "2 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 \n",
  305. "3 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 \n",
  306. "4 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 \n",
  307. "... ... ... ... ... ... ... ... ... ... \n",
  308. "5619 0.0 0.0 1.0 0.0 0.0 1.0 1.0 0.0 0.0 \n",
  309. "5620 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 \n",
  310. "5621 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 1.0 \n",
  311. "5622 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 1.0 \n",
  312. "5623 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 \n",
  313. "\n",
  314. "[5624 rows x 22 columns]\n"
  315. ]
  316. },
  317. {
  318. "name": "stderr",
  319. "output_type": "stream",
  320. "text": [
  321. "C:\\Users\\lw\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:972: FutureWarning: `sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.\n",
  322. " warnings.warn(\n"
  323. ]
  324. }
  325. ],
  326. "source": [
  327. "def encode(data):\n",
  328. " # 初始化 OneHotEncoder\n",
  329. " encoder = OneHotEncoder(sparse=False)\n",
  330. "\n",
  331. " # 转换字符数据为数值\n",
  332. " encoded_features = encoder.fit_transform(data[['AV', 'AC', 'PR', 'UI', 'S', 'C', 'I', 'A']])\n",
  333. " encoded_data = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(['AV', 'AC', 'PR', 'UI', 'S', 'C', 'I', 'A']))\n",
  334. " return encoded_data\n",
  335. "print(encode(lw))\n"
  336. ]
  337. },
  338. {
  339. "cell_type": "code",
  340. "execution_count": 16,
  341. "id": "25bbd901-d4aa-44cb-8f1f-2720c553bfad",
  342. "metadata": {},
  343. "outputs": [
  344. {
  345. "name": "stdout",
  346. "output_type": "stream",
  347. "text": [
  348. " AV_A AV_L AV_N AV_P AC_H AC_L PR_H PR_L PR_N UI_N ... S_U \\\n",
  349. "0 0 0.0 1.0 0 0.0 1.0 0.0 0.0 1.0 0.0 ... 0.0 \n",
  350. "1 0 0.0 1.0 0 0.0 1.0 0.0 0.0 1.0 1.0 ... 1.0 \n",
  351. "2 0 0.0 1.0 0 0.0 1.0 0.0 0.0 1.0 1.0 ... 1.0 \n",
  352. "3 0 0.0 1.0 0 0.0 1.0 0.0 0.0 1.0 0.0 ... 0.0 \n",
  353. "4 0 0.0 1.0 0 0.0 1.0 0.0 0.0 1.0 0.0 ... 0.0 \n",
  354. ".. ... ... ... ... ... ... ... ... ... ... ... ... \n",
  355. "705 0 0.0 1.0 0 0.0 1.0 0.0 0.0 1.0 1.0 ... 1.0 \n",
  356. "706 0 1.0 0.0 0 0.0 1.0 0.0 1.0 0.0 1.0 ... 1.0 \n",
  357. "707 0 0.0 1.0 0 0.0 1.0 0.0 0.0 1.0 1.0 ... 1.0 \n",
  358. "708 0 0.0 1.0 0 0.0 1.0 0.0 0.0 1.0 1.0 ... 1.0 \n",
  359. "709 0 0.0 1.0 0 0.0 1.0 0.0 0.0 1.0 1.0 ... 1.0 \n",
  360. "\n",
  361. " C_H C_L C_N I_H I_L I_N A_H A_L A_N \n",
  362. "0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 \n",
  363. "1 0.0 0.0 1.0 0.0 0.0 1.0 1.0 0.0 0.0 \n",
  364. "2 0.0 0.0 1.0 0.0 0.0 1.0 1.0 0.0 0.0 \n",
  365. "3 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 \n",
  366. "4 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 \n",
  367. ".. ... ... ... ... ... ... ... ... ... \n",
  368. "705 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 \n",
  369. "706 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 \n",
  370. "707 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 \n",
  371. "708 0.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0 \n",
  372. "709 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 \n",
  373. "\n",
  374. "[710 rows x 22 columns]\n"
  375. ]
  376. },
  377. {
  378. "name": "stderr",
  379. "output_type": "stream",
  380. "text": [
  381. "C:\\Users\\lw\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:972: FutureWarning: `sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.\n",
  382. " warnings.warn(\n"
  383. ]
  384. }
  385. ],
  386. "source": [
  387. "x_test = encode(data_test[['AV', 'AC', 'PR', 'UI', 'S', 'C', 'I', 'A']])\n",
  388. "x_test.insert(0, 'AV_A', 0)\n",
  389. "x_test.insert(3, 'AV_P', 0)\n",
  390. "print(x_test)"
  391. ]
  392. },
  393. {
  394. "cell_type": "code",
  395. "execution_count": 15,
  396. "id": "12c94e10-99e6-48ed-b659-6d99bd41d049",
  397. "metadata": {},
  398. "outputs": [
  399. {
  400. "name": "stdout",
  401. "output_type": "stream",
  402. "text": [
  403. "分类报告:\n",
  404. " precision recall f1-score support\n",
  405. "\n",
  406. " CRITICAL 0.99 0.97 0.98 155\n",
  407. " HIGH 0.98 1.00 0.99 241\n",
  408. " LOW 0.00 0.00 0.00 0\n",
  409. " MEDIUM 1.00 0.99 1.00 314\n",
  410. "\n",
  411. " accuracy 0.99 710\n",
  412. " macro avg 0.74 0.74 0.74 710\n",
  413. "weighted avg 0.99 0.99 0.99 710\n",
  414. "\n",
  415. "测试集分类的准确率:0.9901\n",
  416. "\n",
  417. "分类报告:\n",
  418. " precision recall f1-score support\n",
  419. "\n",
  420. " CRITICAL 1.00 1.00 1.00 137\n",
  421. " HIGH 1.00 1.00 1.00 228\n",
  422. " LOW 0.67 0.80 0.73 5\n",
  423. " MEDIUM 0.99 0.99 0.99 333\n",
  424. "\n",
  425. " accuracy 0.99 703\n",
  426. " macro avg 0.92 0.95 0.93 703\n",
  427. "weighted avg 0.99 0.99 0.99 703\n",
  428. "\n",
  429. "验证集分类的准确率:0.9943\n"
  430. ]
  431. },
  432. {
  433. "name": "stderr",
  434. "output_type": "stream",
  435. "text": [
  436. "C:\\Users\\lw\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:972: FutureWarning: `sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.\n",
  437. " warnings.warn(\n",
  438. "C:\\Users\\lw\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:972: FutureWarning: `sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.\n",
  439. " warnings.warn(\n",
  440. "C:\\Users\\lw\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:972: FutureWarning: `sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.\n",
  441. " warnings.warn(\n",
  442. "C:\\Users\\lw\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1469: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
  443. " _warn_prf(average, modifier, msg_start, len(result))\n",
  444. "C:\\Users\\lw\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1469: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
  445. " _warn_prf(average, modifier, msg_start, len(result))\n",
  446. "C:\\Users\\lw\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1469: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
  447. " _warn_prf(average, modifier, msg_start, len(result))\n"
  448. ]
  449. }
  450. ],
  451. "source": [
  452. "x_train = encode(data_train[['AV', 'AC', 'PR', 'UI', 'S', 'C', 'I', 'A']])\n",
  453. "y_train = data_train['severity']\n",
  454. "x_test = encode(data_test[['AV', 'AC', 'PR', 'UI', 'S', 'C', 'I', 'A']])\n",
  455. "x_test.insert(0, 'AV_A', 0)\n",
  456. "x_test.insert(3, 'AV_P', 0)\n",
  457. "y_test = data_test['severity']\n",
  458. "x_validation = encode(data_validation[['AV', 'AC', 'PR', 'UI', 'S', 'C', 'I', 'A']])\n",
  459. "y_validation = data_validation['severity']\n",
  460. "# 创建并训练决策树分类器\n",
  461. "m = tree.DecisionTreeClassifier()\n",
  462. "m.fit(x_train, y_train)\n",
  463. "\n",
  464. "# 使用模型进行预测\n",
  465. "y_test_pred = m.predict(x_test)\n",
  466. "\n",
  467. "# 打印测试集分类报告\n",
  468. "print('分类报告:\\n', classification_report(y_test, y_test_pred))\n",
  469. "\n",
  470. "# 打印准确率\n",
  471. "test_accuracy = m.score(x_test, y_test)\n",
  472. "print('测试集分类的准确率:%0.4f' % test_accuracy)\n",
  473. "\n",
  474. "print()\n",
  475. "\n",
  476. "# 使用模型进行预测\n",
  477. "y_validation_pred = m.predict(x_validation)\n",
  478. "\n",
  479. "# 打印测试集分类报告\n",
  480. "print('分类报告:\\n', classification_report(y_validation, y_validation_pred))\n",
  481. "\n",
  482. "# 打印准确率\n",
  483. "validation_accuracy = m.score(x_validation, y_validation)\n",
  484. "print('验证集分类的准确率:%0.4f' % validation_accuracy)"
  485. ]
  486. },
  487. {
  488. "cell_type": "code",
  489. "execution_count": null,
  490. "id": "98e4b40f-8269-4c7b-94a3-567c5f48184d",
  491. "metadata": {},
  492. "outputs": [],
  493. "source": []
  494. },
  495. {
  496. "cell_type": "code",
  497. "execution_count": null,
  498. "id": "9d307a7e-c229-4eb1-8376-0366cbcc961b",
  499. "metadata": {},
  500. "outputs": [],
  501. "source": []
  502. },
  503. {
  504. "cell_type": "code",
  505. "execution_count": null,
  506. "id": "29780f8c-bbc1-4bb3-8c4d-bb1fec7ab7e3",
  507. "metadata": {},
  508. "outputs": [],
  509. "source": []
  510. }
  511. ],
  512. "metadata": {
  513. "kernelspec": {
  514. "display_name": "Python 3 (ipykernel)",
  515. "language": "python",
  516. "name": "python3"
  517. },
  518. "language_info": {
  519. "codemirror_mode": {
  520. "name": "ipython",
  521. "version": 3
  522. },
  523. "file_extension": ".py",
  524. "mimetype": "text/x-python",
  525. "name": "python",
  526. "nbconvert_exporter": "python",
  527. "pygments_lexer": "ipython3",
  528. "version": "3.11.4"
  529. }
  530. },
  531. "nbformat": 4,
  532. "nbformat_minor": 5
  533. }

在信息安全领域,漏洞评估和管理是关键任务之一。本作品探讨了如何利用预训练文本大模型来评估和研判漏洞的严重等级,具体基于通用漏洞评分系统(CVSS)。传统漏洞评分方法依赖于手动分析和专家评审;而基于自然语言处理的文本大模型凭借其深度学习能力,可以自动化地处理和分析大量安全相关文本数据,从而提高漏洞评估的效率和准确性。结合词干提取、词形还原等预处理手段,能够更好地发挥自然语言处理文本大模型的预测能力与准确度。