You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

csv_process.ipynb 30 kB


  1. {
  2. "cells": [
  3. {
  4. "cell_type": "code",
  5. "execution_count": 1,
  6. "id": "676be61a-bd65-4510-8357-94859f596330",
  7. "metadata": {},
  8. "outputs": [],
  9. "source": [
  10. "import pandas as pd\n",
  11. "import json"
  12. ]
  13. },
  14. {
  15. "cell_type": "code",
  16. "execution_count": 2,
  17. "id": "94f312ae-87d2-4d5e-ae75-9fc85a2a980c",
  18. "metadata": {},
  19. "outputs": [],
  20. "source": [
  21. "data = pd.read_json('../data/SIR_test_set.json')"
  22. ]
  23. },
  24. {
  25. "cell_type": "code",
  26. "execution_count": 3,
  27. "id": "7d949b04-4929-4921-a818-4e8cbb57826b",
  28. "metadata": {},
  29. "outputs": [
  30. {
  31. "name": "stdout",
  32. "output_type": "stream",
  33. "text": [
  34. " CVE_ID Issue_Url_old \\\n",
  35. "0 CVE-2021-45822 https://github.com/btiteam/xbtit-3.1/issues/7 \n",
  36. "1 CVE-2021-45769 https://github.com/mz-automation/libiec61850/i... \n",
  37. "2 CVE-2021-45773 https://github.com/mz-automation/lib60870/issu... \n",
  38. "3 CVE-2022-25014 https://github.com/gamonoid/icehrm/issues/283 \n",
  39. "4 CVE-2022-25013 https://github.com/gamonoid/icehrm/issues/284 \n",
  40. ".. ... ... \n",
  41. "705 CVE-2022-32417 https://github.com/Snakinya/Vuln/issues/1 \n",
  42. "706 CVE-2021-34485 https://github.com/github/advisory-database/is... \n",
  43. "707 CVE-2021-44906 https://github.com/minimistjs/minimist/issues/11 \n",
  44. "708 CVE-2020-8927 https://github.com/github/advisory-database/is... \n",
  45. "709 CVE-2021-31402 https://github.com/cfug/dio/issues/1752 \n",
  46. "\n",
  47. " Issue_Url_new \\\n",
  48. "0 https://github.com/btiteam/xbtit-3.1/issues/7 \n",
  49. "1 https://github.com/mz-automation/libiec61850/i... \n",
  50. "2 https://github.com/mz-automation/lib60870/issu... \n",
  51. "3 https://github.com/gamonoid/icehrm/issues/283 \n",
  52. "4 https://github.com/gamonoid/icehrm/issues/284 \n",
  53. ".. ... \n",
  54. "705 https://github.com/snakinya/vuln/issues/1 \n",
  55. "706 https://github.com/github/advisory-database/is... \n",
  56. "707 https://github.com/minimistjs/minimist/issues/11 \n",
  57. "708 https://github.com/github/advisory-database/is... \n",
  58. "709 https://github.com/cfug/dio/issues/1752 \n",
  59. "\n",
  60. " Repo_new Issue_Created_At \\\n",
  61. "0 btiteam/xbtit-3.1 2021-12-22 20:25:58+00:00 \n",
  62. "1 mz-automation/libiec61850 2021-12-23 00:53:55+00:00 \n",
  63. "2 mz-automation/lib60870 2021-12-23 06:01:26+00:00 \n",
  64. "3 gamonoid/icehrm 2021-12-23 08:09:18+00:00 \n",
  65. "4 gamonoid/icehrm 2021-12-23 08:13:20+00:00 \n",
  66. ".. ... ... \n",
  67. "705 Snakinya/Vuln 2022-08-04 10:38:48+00:00 \n",
  68. "706 github/advisory-database 2022-10-12 20:44:32+00:00 \n",
  69. "707 minimistjs/minimist 2022-10-19 14:23:14+00:00 \n",
  70. "708 github/advisory-database 2022-10-31 20:04:11+00:00 \n",
  71. "709 cfug/dio 2023-03-21 16:54:52+00:00 \n",
  72. "\n",
  73. " description \\\n",
  74. "0 Stored & Reflected XSS affecting Xbtit NUMBERT... \n",
  75. "1 NULL Pointer Dereference in APITAG NULL Pointe... \n",
  76. "2 NULL Pointer Dereference in APITAG NULL Pointe... \n",
  77. "3 Reflected XSS vulnerability NUMBERTAG in icehr... \n",
  78. "4 Reflected XSS vulnerabilities NUMBERTAG in ice... \n",
  79. ".. ... \n",
  80. "705 pboot cms NUMBERTAG RCE. 漏洞详情: URLTAG 声明 APITA... \n",
  81. "706 .NET CVE backfill round NUMBERTAG Hello, Pleas... \n",
  82. "707 Backport of NUMBERTAG fixes to NUMBERTAG Thank... \n",
  83. "708 Update impacted packages for CVETAG . Hi, This... \n",
  84. "709 CVE Dio NUMBERTAG Google OVS Scanner. Package ... \n",
  85. "\n",
  86. " vectorString severity baseScore \\\n",
  87. "0 CVSS:3.1/AV:N/AC:L/PR:N/UI:R/S:C/C:L/I:L/A:N MEDIUM 6.1 \n",
  88. "1 CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:N/I:N/A:H HIGH 7.5 \n",
  89. "2 CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:N/I:N/A:H HIGH 7.5 \n",
  90. "3 CVSS:3.1/AV:N/AC:L/PR:N/UI:R/S:C/C:L/I:L/A:N MEDIUM 6.1 \n",
  91. "4 CVSS:3.1/AV:N/AC:L/PR:N/UI:R/S:C/C:L/I:L/A:N MEDIUM 6.1 \n",
  92. ".. ... ... ... \n",
  93. "705 CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H CRITICAL 9.8 \n",
  94. "706 CVSS:3.1/AV:L/AC:L/PR:L/UI:N/S:U/C:H/I:N/A:N MEDIUM 5.5 \n",
  95. "707 CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H CRITICAL 9.8 \n",
  96. "708 CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:N/I:L/A:L MEDIUM 6.5 \n",
  97. "709 CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:N/A:N HIGH 7.5 \n",
  98. "\n",
  99. " impactScore exploitabilityScore \n",
  100. "0 2.7 2.8 \n",
  101. "1 3.6 3.9 \n",
  102. "2 3.6 3.9 \n",
  103. "3 2.7 2.8 \n",
  104. "4 2.7 2.8 \n",
  105. ".. ... ... \n",
  106. "705 5.9 3.9 \n",
  107. "706 3.6 1.8 \n",
  108. "707 5.9 3.9 \n",
  109. "708 2.5 3.9 \n",
  110. "709 3.6 3.9 \n",
  111. "\n",
  112. "[710 rows x 11 columns]\n"
  113. ]
  114. }
  115. ],
  116. "source": [
  117. "train_data_temp = pd.DataFrame()\n",
  118. "print(data)"
  119. ]
  120. },
  121. {
  122. "cell_type": "code",
  123. "execution_count": 4,
  124. "id": "d4272d23-2c40-416a-aa83-40b09817ea0a",
  125. "metadata": {},
  126. "outputs": [],
  127. "source": [
  128. "train_data_temp['description'] = data['description']"
  129. ]
  130. },
  131. {
  132. "cell_type": "code",
  133. "execution_count": 5,
  134. "id": "101f87d6-38d7-4562-a572-49ab74eec58d",
  135. "metadata": {},
  136. "outputs": [
  137. {
  138. "name": "stdout",
  139. "output_type": "stream",
  140. "text": [
  141. "0 False\n",
  142. "1 False\n",
  143. "2 False\n",
  144. "3 False\n",
  145. "4 False\n",
  146. " ... \n",
  147. "705 False\n",
  148. "706 False\n",
  149. "707 False\n",
  150. "708 False\n",
  151. "709 False\n",
  152. "Name: description, Length: 710, dtype: bool\n"
  153. ]
  154. }
  155. ],
  156. "source": [
  157. "print(train_data_temp['description'].isna())"
  158. ]
  159. },
  160. {
  161. "cell_type": "code",
  162. "execution_count": 6,
  163. "id": "85b0cd35-3862-43fb-b4ab-d88c0ceae6da",
  164. "metadata": {},
  165. "outputs": [
  166. {
  167. "name": "stdout",
  168. "output_type": "stream",
  169. "text": [
  170. "Empty DataFrame\n",
  171. "Columns: [description]\n",
  172. "Index: []\n"
  173. ]
  174. }
  175. ],
  176. "source": [
  177. "# 获取 NaN 值的行索引\n",
  178. "nan_rows = train_data_temp[train_data_temp['description'].isna()]\n",
  179. "print(nan_rows)"
  180. ]
  181. },
  182. {
  183. "cell_type": "code",
  184. "execution_count": 7,
  185. "id": "8eaf202e-b96b-4f79-8b3b-89e4757add04",
  186. "metadata": {},
  187. "outputs": [],
  188. "source": [
  189. "vectorString = data['vectorString']"
  190. ]
  191. },
  192. {
  193. "cell_type": "code",
  194. "execution_count": 8,
  195. "id": "49331613-93f9-4d86-9e43-384c16ff8813",
  196. "metadata": {},
  197. "outputs": [
  198. {
  199. "name": "stdout",
  200. "output_type": "stream",
  201. "text": [
  202. " AV AC PR UI S C I A\n",
  203. "0 N L N R C L L N\n",
  204. "1 N L N N U N N H\n",
  205. "2 N L N N U N N H\n",
  206. "3 N L N R C L L N\n",
  207. "4 N L N R C L L N\n",
  208. ".. .. .. .. .. .. .. .. ..\n",
  209. "705 N L N N U H H H\n",
  210. "706 L L L N U H N N\n",
  211. "707 N L N N U H H H\n",
  212. "708 N L N N U N L L\n",
  213. "709 N L N N U H N N\n",
  214. "\n",
  215. "[710 rows x 8 columns]\n"
  216. ]
  217. },
  218. {
  219. "name": "stderr",
  220. "output_type": "stream",
  221. "text": [
  222. "C:\\Users\\lx\\AppData\\Local\\Temp\\ipykernel_38864\\3052899741.py:14: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.\n",
  223. " train_data = train_data.applymap(transform_value)\n"
  224. ]
  225. }
  226. ],
  227. "source": [
  228. "#转换数据\n",
  229. "def transform_value(val):\n",
  230. " return val.split(':')[1]\n",
  231. " \n",
  232. "columns = ['AV', 'AC', 'PR', 'UI', 'S', 'C', 'I', 'A']\n",
  233. "\n",
  234. "temp = []\n",
  235. "\n",
  236. "for i in range(vectorString.size):\n",
  237. " part = vectorString[i].split('/')\n",
  238. " list_items = part[1::]\n",
  239. " temp.append(list_items)\n",
  240. "train_data = pd.DataFrame(temp, columns=columns)\n",
  241. "train_data = train_data.applymap(transform_value)\n",
  242. "print(train_data)"
  243. ]
  244. },
  245. {
  246. "cell_type": "code",
  247. "execution_count": 9,
  248. "id": "79a6f3ee-0517-4a4f-b26a-6f2dabf9d3b0",
  249. "metadata": {},
  250. "outputs": [],
  251. "source": [
  252. "def calculate_cvss_score(params):\n",
  253. " # 字典映射分值\n",
  254. " AV = {'N': 0.85, 'A': 0.62, 'L': 0.55, 'P': 0.2}\n",
  255. " AC = {'L': 0.77, 'H': 0.44}\n",
  256. " PR = {'N': 0.85, 'L': 0.68, 'H': 0.5}\n",
  257. " UI = {'N': 0.85, 'R': 0.62}\n",
  258. " S = {'U': 1, 'C': 1.08}\n",
  259. " C = {'N': 0, 'L': 0.22, 'H': 0.56}\n",
  260. " I = {'N': 0, 'L': 0.22, 'H': 0.56}\n",
  261. " A = {'N': 0, 'L': 0.22, 'H': 0.56}\n",
  262. "\n",
  263. " # 获取参数值\n",
  264. " av = AV[params['AV']]\n",
  265. " ac = AC[params['AC']]\n",
  266. " pr = PR[params['PR']]\n",
  267. " ui = UI[params['UI']]\n",
  268. " s = S[params['S']]\n",
  269. " c = C[params['C']]\n",
  270. " i = I[params['I']]\n",
  271. " a = A[params['A']]\n",
  272. "\n",
  273. " # 计算临时分数\n",
  274. " impact = 1 - (1 - c) * (1 - i) * (1 - a)\n",
  275. " exploitability = 8.22 * av * ac * pr * ui\n",
  276. "\n",
  277. " if impact == 0:\n",
  278. " base_score = 0\n",
  279. " else:\n",
  280. " if s == 1: # 未改变\n",
  281. " base_score = round(min(1.176 * (exploitability + impact), 10), 1)\n",
  282. " else: # 改变\n",
  283. " base_score = round(min(1.08 * (exploitability + impact), 10), 1)\n",
  284. "\n",
  285. " return base_score"
  286. ]
  287. },
  288. {
  289. "cell_type": "code",
  290. "execution_count": 10,
  291. "id": "622cf1dd-082c-4d2a-a880-34d22e96d053",
  292. "metadata": {},
  293. "outputs": [
  294. {
  295. "name": "stdout",
  296. "output_type": "stream",
  297. "text": [
  298. " AV AC PR UI S C I A score\n",
  299. "0 N L N R C L L N 3.5\n",
  300. "1 N L N N U N N H 5.2\n",
  301. "2 N L N N U N N H 5.2\n",
  302. "3 N L N R C L L N 3.5\n",
  303. "4 N L N R C L L N 3.5\n",
  304. ".. .. .. .. .. .. .. .. .. ...\n",
  305. "705 N L N N U H H H 5.6\n",
  306. "706 L L L N U H N N 3.0\n",
  307. "707 N L N N U H H H 5.6\n",
  308. "708 N L N N U N L L 5.0\n",
  309. "709 N L N N U H N N 5.2\n",
  310. "\n",
  311. "[710 rows x 9 columns]\n"
  312. ]
  313. }
  314. ],
  315. "source": [
  316. "# 为每一行创建字典\n",
  317. "train_dicts = train_data.apply(lambda row: {col: row[col][0] for col in train_data.columns}, axis=1)\n",
  318. "train_score = train_dicts.apply(calculate_cvss_score)\n",
  319. "train_data['score'] = train_score\n",
  320. "print(train_data)"
  321. ]
  322. },
  323. {
  324. "cell_type": "code",
  325. "execution_count": 11,
  326. "id": "f767e3c9-634b-4c0d-9145-eb4c013e1a6e",
  327. "metadata": {},
  328. "outputs": [],
  329. "source": [
  330. "dict = {\n",
  331. " 'AV': {\n",
  332. " 'N': 'NETWORK',\n",
  333. " 'A': 'ADJACENT',\n",
  334. " 'L': 'LOCAL',\n",
  335. " 'P': 'PHYSICAL'\n",
  336. " },\n",
  337. " 'AC': {\n",
  338. " 'L': 'LOW',\n",
  339. " 'H': 'HIGH'\n",
  340. " }, \n",
  341. " 'PR': {\n",
  342. " 'N': 'NONE',\n",
  343. " 'L': 'LOW',\n",
  344. " 'H': 'HIGH'\n",
  345. " }, \n",
  346. " 'UI': {\n",
  347. " 'N': 'NONE',\n",
  348. " 'R': 'REQUIRED'\n",
  349. " },\n",
  350. " 'S': {\n",
  351. " 'U': 'UNCHANGED',\n",
  352. " 'C': 'CHANGED'\n",
  353. " },\n",
  354. " 'C': {\n",
  355. " 'N': 'NONE',\n",
  356. " 'L': 'LOW',\n",
  357. " 'H': 'HIGH'\n",
  358. " },\n",
  359. " 'I': {\n",
  360. " 'N': 'NONE',\n",
  361. " 'L': 'LOW',\n",
  362. " 'H': 'HIGH'\n",
  363. " },\n",
  364. " 'A': {\n",
  365. " 'N': 'NONE', \n",
  366. " 'L': 'LOW',\n",
  367. " 'H': 'HIGH'\n",
  368. " }\n",
  369. "}"
  370. ]
  371. },
  372. {
  373. "cell_type": "code",
  374. "execution_count": 12,
  375. "id": "d42106a3-9eb5-4580-9143-d7ae061b6d4c",
  376. "metadata": {},
  377. "outputs": [
  378. {
  379. "data": {
  380. "text/plain": " AV AC PR UI S C I A score\n0 NETWORK LOW NONE REQUIRED CHANGED LOW LOW NONE 3.5\n1 NETWORK LOW NONE NONE UNCHANGED NONE NONE HIGH 5.2\n2 NETWORK LOW NONE NONE UNCHANGED NONE NONE HIGH 5.2\n3 NETWORK LOW NONE REQUIRED CHANGED LOW LOW NONE 3.5\n4 NETWORK LOW NONE REQUIRED CHANGED LOW LOW NONE 3.5\n.. ... ... ... ... ... ... ... ... ...\n705 NETWORK LOW NONE NONE UNCHANGED HIGH HIGH HIGH 5.6\n706 LOCAL LOW LOW NONE UNCHANGED HIGH NONE NONE 3.0\n707 NETWORK LOW NONE NONE UNCHANGED HIGH HIGH HIGH 5.6\n708 NETWORK LOW NONE NONE UNCHANGED NONE LOW LOW 5.0\n709 NETWORK LOW NONE NONE UNCHANGED HIGH NONE NONE 5.2\n\n[710 rows x 9 columns]",
  381. "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>AV</th>\n <th>AC</th>\n <th>PR</th>\n <th>UI</th>\n <th>S</th>\n <th>C</th>\n <th>I</th>\n <th>A</th>\n <th>score</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>NETWORK</td>\n <td>LOW</td>\n <td>NONE</td>\n <td>REQUIRED</td>\n <td>CHANGED</td>\n <td>LOW</td>\n <td>LOW</td>\n <td>NONE</td>\n <td>3.5</td>\n </tr>\n <tr>\n <th>1</th>\n <td>NETWORK</td>\n <td>LOW</td>\n <td>NONE</td>\n <td>NONE</td>\n <td>UNCHANGED</td>\n <td>NONE</td>\n <td>NONE</td>\n <td>HIGH</td>\n <td>5.2</td>\n </tr>\n <tr>\n <th>2</th>\n <td>NETWORK</td>\n <td>LOW</td>\n <td>NONE</td>\n <td>NONE</td>\n <td>UNCHANGED</td>\n <td>NONE</td>\n <td>NONE</td>\n <td>HIGH</td>\n <td>5.2</td>\n </tr>\n <tr>\n <th>3</th>\n <td>NETWORK</td>\n <td>LOW</td>\n <td>NONE</td>\n <td>REQUIRED</td>\n <td>CHANGED</td>\n <td>LOW</td>\n <td>LOW</td>\n <td>NONE</td>\n <td>3.5</td>\n </tr>\n <tr>\n <th>4</th>\n <td>NETWORK</td>\n <td>LOW</td>\n <td>NONE</td>\n <td>REQUIRED</td>\n <td>CHANGED</td>\n <td>LOW</td>\n <td>LOW</td>\n <td>NONE</td>\n <td>3.5</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>705</th>\n <td>NETWORK</td>\n <td>LOW</td>\n <td>NONE</td>\n <td>NONE</td>\n <td>UNCHANGED</td>\n <td>HIGH</td>\n <td>HIGH</td>\n <td>HIGH</td>\n <td>5.6</td>\n </tr>\n <tr>\n <th>706</th>\n <td>LOCAL</td>\n <td>LOW</td>\n <td>LOW</td>\n <td>NONE</td>\n <td>UNCHANGED</td>\n <td>HIGH</td>\n <td>NONE</td>\n <td>NONE</td>\n <td>3.0</td>\n </tr>\n <tr>\n <th>707</th>\n <td>NETWORK</td>\n <td>LOW</td>\n <td>NONE</td>\n <td>NONE</td>\n <td>UNCHANGED</td>\n <td>HIGH</td>\n <td>HIGH</td>\n <td>HIGH</td>\n <td>5.6</td>\n </tr>\n <tr>\n <th>708</th>\n <td>NETWORK</td>\n <td>LOW</td>\n <td>NONE</td>\n <td>NONE</td>\n <td>UNCHANGED</td>\n <td>NONE</td>\n <td>LOW</td>\n <td>LOW</td>\n <td>5.0</td>\n </tr>\n <tr>\n <th>709</th>\n <td>NETWORK</td>\n <td>LOW</td>\n <td>NONE</td>\n <td>NONE</td>\n <td>UNCHANGED</td>\n <td>HIGH</td>\n <td>NONE</td>\n <td>NONE</td>\n <td>5.2</td>\n </tr>\n </tbody>\n</table>\n<p>710 rows × 9 columns</p>\n</div>"
  382. },
  383. "execution_count": 12,
  384. "metadata": {},
  385. "output_type": "execute_result"
  386. }
  387. ],
  388. "source": [
  389. "# 替换 DataFrame 中的值\n",
  390. "train_data.replace(dict, inplace=True)\n",
  391. "train_data"
  392. ]
  393. },
  394. {
  395. "cell_type": "code",
  396. "execution_count": 13,
  397. "id": "f07b2d92-8271-46c8-ab90-a19570dd2566",
  398. "metadata": {},
  399. "outputs": [],
  400. "source": [
  401. "train_data.insert(0, 'description', train_data_temp)"
  402. ]
  403. },
  404. {
  405. "cell_type": "code",
  406. "execution_count": 14,
  407. "id": "5ca97546-e120-4e80-b7dc-67a00c1bbf45",
  408. "metadata": {},
  409. "outputs": [
  410. {
  411. "name": "stdout",
  412. "output_type": "stream",
  413. "text": [
  414. " description AV AC PR \\\n",
  415. "0 Stored & Reflected XSS affecting Xbtit NUMBERT... NETWORK LOW NONE \n",
  416. "1 NULL Pointer Dereference in APITAG NULL Pointe... NETWORK LOW NONE \n",
  417. "2 NULL Pointer Dereference in APITAG NULL Pointe... NETWORK LOW NONE \n",
  418. "3 Reflected XSS vulnerability NUMBERTAG in icehr... NETWORK LOW NONE \n",
  419. "4 Reflected XSS vulnerabilities NUMBERTAG in ice... NETWORK LOW NONE \n",
  420. ".. ... ... ... ... \n",
  421. "705 pboot cms NUMBERTAG RCE. 漏洞详情: URLTAG 声明 APITA... NETWORK LOW NONE \n",
  422. "706 .NET CVE backfill round NUMBERTAG Hello, Pleas... LOCAL LOW LOW \n",
  423. "707 Backport of NUMBERTAG fixes to NUMBERTAG Thank... NETWORK LOW NONE \n",
  424. "708 Update impacted packages for CVETAG . Hi, This... NETWORK LOW NONE \n",
  425. "709 CVE Dio NUMBERTAG Google OVS Scanner. Package ... NETWORK LOW NONE \n",
  426. "\n",
  427. " UI S C I A score \n",
  428. "0 REQUIRED CHANGED LOW LOW NONE 3.5 \n",
  429. "1 NONE UNCHANGED NONE NONE HIGH 5.2 \n",
  430. "2 NONE UNCHANGED NONE NONE HIGH 5.2 \n",
  431. "3 REQUIRED CHANGED LOW LOW NONE 3.5 \n",
  432. "4 REQUIRED CHANGED LOW LOW NONE 3.5 \n",
  433. ".. ... ... ... ... ... ... \n",
  434. "705 NONE UNCHANGED HIGH HIGH HIGH 5.6 \n",
  435. "706 NONE UNCHANGED HIGH NONE NONE 3.0 \n",
  436. "707 NONE UNCHANGED HIGH HIGH HIGH 5.6 \n",
  437. "708 NONE UNCHANGED NONE LOW LOW 5.0 \n",
  438. "709 NONE UNCHANGED HIGH NONE NONE 5.2 \n",
  439. "\n",
  440. "[710 rows x 10 columns]\n"
  441. ]
  442. }
  443. ],
  444. "source": [
  445. "print(train_data)"
  446. ]
  447. },
  448. {
  449. "cell_type": "code",
  450. "execution_count": 15,
  451. "id": "3e5e14c5-8f88-43d3-945c-1505e11a2490",
  452. "metadata": {},
  453. "outputs": [
  454. {
  455. "name": "stdout",
  456. "output_type": "stream",
  457. "text": [
  458. " description AV AC PR \\\n",
  459. "0 Stored & Reflected XSS affecting Xbtit NUMBERT... NETWORK LOW NONE \n",
  460. "10 illegal memcpy during njs_vmcode_typeof in PAT... NETWORK LOW NONE \n",
  461. "11 Heap UAF in njs_await_fulfilled. Env CODETAG P... NETWORK LOW NONE \n",
  462. "48 Add nonce to the logout link. The logout link ... NETWORK LOW NONE \n",
  463. "50 Divide By Zero in H5T__complete_copy () at PAT... NETWORK LOW NONE \n",
  464. ".. ... ... ... ... \n",
  465. "705 pboot cms NUMBERTAG RCE. 漏洞详情: URLTAG 声明 APITA... NETWORK LOW NONE \n",
  466. "706 .NET CVE backfill round NUMBERTAG Hello, Pleas... LOCAL LOW LOW \n",
  467. "707 Backport of NUMBERTAG fixes to NUMBERTAG Thank... NETWORK LOW NONE \n",
  468. "708 Update impacted packages for CVETAG . Hi, This... NETWORK LOW NONE \n",
  469. "709 CVE Dio NUMBERTAG Google OVS Scanner. Package ... NETWORK LOW NONE \n",
  470. "\n",
  471. " UI S C I A score prefix \n",
  472. "0 REQUIRED CHANGED LOW LOW NONE 3.5 Stored & R \n",
  473. "10 NONE UNCHANGED HIGH HIGH HIGH 5.6 illegal me \n",
  474. "11 NONE UNCHANGED HIGH HIGH HIGH 5.6 Heap UAF i \n",
  475. "48 REQUIRED CHANGED NONE HIGH NONE 3.7 Add nonce \n",
  476. "50 REQUIRED UNCHANGED NONE NONE HIGH 4.0 Divide By \n",
  477. ".. ... ... ... ... ... ... ... \n",
  478. "705 NONE UNCHANGED HIGH HIGH HIGH 5.6 pboot cms \n",
  479. "706 NONE UNCHANGED HIGH NONE NONE 3.0 .NET CVE b \n",
  480. "707 NONE UNCHANGED HIGH HIGH HIGH 5.6 Backport o \n",
  481. "708 NONE UNCHANGED NONE LOW LOW 5.0 Update imp \n",
  482. "709 NONE UNCHANGED HIGH NONE NONE 5.2 CVE Dio NU \n",
  483. "\n",
  484. "[264 rows x 11 columns]\n"
  485. ]
  486. }
  487. ],
  488. "source": [
  489. "# 提取前20个字符\n",
  490. "train_data['prefix'] = train_data['description'].str[:10]\n",
  491. "\n",
  492. "# 计算每个前20个字符的出现次数\n",
  493. "prefix_counts = train_data['prefix'].value_counts()\n",
  494. "\n",
  495. "# 只保留那些前20个字符出现次数为1的描述\n",
  496. "unique_prefixes = prefix_counts[prefix_counts == 1].index\n",
  497. "unique_descriptions = train_data[train_data['prefix'].isin(unique_prefixes)]\n",
  498. "print(unique_descriptions)"
  499. ]
  500. },
  501. {
  502. "cell_type": "code",
  503. "execution_count": 16,
  504. "id": "d660a495-fdab-41fc-932c-6e593babc88e",
  505. "metadata": {},
  506. "outputs": [
  507. {
  508. "name": "stdout",
  509. "output_type": "stream",
  510. "text": [
  511. " description AV AC PR \\\n",
  512. "0 Stored & Reflected XSS affecting Xbtit NUMBERT... NETWORK LOW NONE \n",
  513. "10 illegal memcpy during njs_vmcode_typeof in PAT... NETWORK LOW NONE \n",
  514. "11 Heap UAF in njs_await_fulfilled. Env CODETAG P... NETWORK LOW NONE \n",
  515. "48 Add nonce to the logout link. The logout link ... NETWORK LOW NONE \n",
  516. "50 Divide By Zero in H5T__complete_copy () at PAT... NETWORK LOW NONE \n",
  517. ".. ... ... ... ... \n",
  518. "698 A NUMBERTAG specific heap buffer overflow with... NETWORK LOW NONE \n",
  519. "699 Mitigation for CVETAG . Hi there. It appears a... NETWORK LOW NONE \n",
  520. "703 Contact APITAG Product Security Team and ask t... NETWORK LOW NONE \n",
  521. "707 Backport of NUMBERTAG fixes to NUMBERTAG Thank... NETWORK LOW NONE \n",
  522. "709 CVE Dio NUMBERTAG Google OVS Scanner. Package ... NETWORK LOW NONE \n",
  523. "\n",
  524. " UI S C I A score prefix \n",
  525. "0 REQUIRED CHANGED LOW LOW NONE 3.5 Stored & R \n",
  526. "10 NONE UNCHANGED HIGH HIGH HIGH 5.6 illegal me \n",
  527. "11 NONE UNCHANGED HIGH HIGH HIGH 5.6 Heap UAF i \n",
  528. "48 REQUIRED CHANGED NONE HIGH NONE 3.7 Add nonce \n",
  529. "50 REQUIRED UNCHANGED NONE NONE HIGH 4.0 Divide By \n",
  530. ".. ... ... ... ... ... ... ... \n",
  531. "698 NONE UNCHANGED HIGH HIGH HIGH 5.6 A NUMBERTA \n",
  532. "699 NONE UNCHANGED NONE NONE HIGH 5.2 Mitigation \n",
  533. "703 NONE UNCHANGED LOW LOW LOW 5.2 Contact AP \n",
  534. "707 NONE UNCHANGED HIGH HIGH HIGH 5.6 Backport o \n",
  535. "709 NONE UNCHANGED HIGH NONE NONE 5.2 CVE Dio NU \n",
  536. "\n",
  537. "[197 rows x 11 columns]\n"
  538. ]
  539. }
  540. ],
  541. "source": [
  542. "# 删除描述长度大于1000的行\n",
  543. "temp = pd.DataFrame()\n",
  544. "filtered_train_data = pd.DataFrame()\n",
  545. "temp = unique_descriptions[unique_descriptions['description'].str.len() <= 1000]\n",
  546. "filtered_train_data = temp[temp['description'].str.len() > 100]\n",
  547. "print(filtered_train_data)"
  548. ]
  549. },
  550. {
  551. "cell_type": "code",
  552. "execution_count": 17,
  553. "id": "eaa6e29e-7fc1-45b9-809b-a4d43686362c",
  554. "metadata": {},
  555. "outputs": [
  556. {
  557. "name": "stderr",
  558. "output_type": "stream",
  559. "text": [
  560. "C:\\Users\\lx\\AppData\\Local\\Temp\\ipykernel_38864\\3885197511.py:1: SettingWithCopyWarning: \n",
  561. "A value is trying to be set on a copy of a slice from a DataFrame\n",
  562. "\n",
  563. "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
  564. " filtered_train_data.sort_values(by='prefix', inplace=True)\n"
  565. ]
  566. }
  567. ],
  568. "source": [
  569. "filtered_train_data.sort_values(by='prefix', inplace=True)"
  570. ]
  571. },
  572. {
  573. "cell_type": "code",
  574. "execution_count": 18,
  575. "id": "db7adb10-83a9-438f-9681-ac02293cba3e",
  576. "metadata": {},
  577. "outputs": [
  578. {
  579. "name": "stdout",
  580. "output_type": "stream",
  581. "text": [
  582. " description AV AC PR \\\n",
  583. "698 A NUMBERTAG specific heap buffer overflow with... NETWORK LOW NONE \n",
  584. "488 A Remote Code Execution (RCE) vulnerability ex... NETWORK LOW HIGH \n",
  585. "51 A heap use after free in in H5AC_unpin_entry. ... NETWORK LOW NONE \n",
  586. "408 A package should never try to do unrelated thi... NETWORK LOW NONE \n",
  587. "236 A security vulnerability which will lead to co... NETWORK LOW NONE \n",
  588. ".. ... ... ... ... \n",
  589. "480 一个后台存储型xss漏洞. When adding movie names, malicio... NETWORK LOW LOW \n",
  590. "639 关于 CVETAG 漏洞,不要在发 issues 了!!!. APITAG 如果你从前端传递... NETWORK LOW NONE \n",
  591. "606 后台服务器组中存在XSS漏洞. 进入后台,点击视频 >服务器组 >添加, 在名称框插入pay... NETWORK LOW LOW \n",
  592. "509 固定的cookie NUMBERTAG APITAG FILETAG NUMBERTAG H... NETWORK LOW NONE \n",
  593. "52 默认的 APITAG 为什么选择 APITAG 呢?. 版本情况 JDK版本: corret... NETWORK LOW NONE \n",
  594. "\n",
  595. " UI S C I A score \n",
  596. "698 NONE UNCHANGED HIGH HIGH HIGH 5.6 \n",
  597. "488 NONE UNCHANGED HIGH HIGH HIGH 3.8 \n",
  598. "51 REQUIRED UNCHANGED HIGH HIGH HIGH 4.4 \n",
  599. "408 NONE UNCHANGED HIGH HIGH HIGH 5.6 \n",
  600. "236 NONE UNCHANGED HIGH HIGH HIGH 5.6 \n",
  601. ".. ... ... ... ... ... ... \n",
  602. "480 REQUIRED CHANGED LOW LOW NONE 2.9 \n",
  603. "639 NONE UNCHANGED HIGH HIGH HIGH 5.6 \n",
  604. "606 REQUIRED CHANGED LOW LOW NONE 2.9 \n",
  605. "509 NONE UNCHANGED HIGH HIGH HIGH 5.6 \n",
  606. "52 NONE UNCHANGED HIGH HIGH HIGH 5.6 \n",
  607. "\n",
  608. "[197 rows x 10 columns]\n"
  609. ]
  610. },
  611. {
  612. "name": "stderr",
  613. "output_type": "stream",
  614. "text": [
  615. "C:\\Users\\lx\\AppData\\Local\\Temp\\ipykernel_38864\\3846312463.py:1: SettingWithCopyWarning: \n",
  616. "A value is trying to be set on a copy of a slice from a DataFrame\n",
  617. "\n",
  618. "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
  619. " filtered_train_data.drop('prefix', axis=1, inplace=True)\n"
  620. ]
  621. }
  622. ],
  623. "source": [
  624. "filtered_train_data.drop('prefix', axis=1, inplace=True)\n",
  625. "print(filtered_train_data)"
  626. ]
  627. },
  628. {
  629. "cell_type": "code",
  630. "execution_count": 19,
  631. "id": "9cc98bb5-1ae4-4024-b9cc-d3db88996221",
  632. "metadata": {},
  633. "outputs": [
  634. {
  635. "data": {
  636. "text/plain": "2.0"
  637. },
  638. "execution_count": 19,
  639. "metadata": {},
  640. "output_type": "execute_result"
  641. }
  642. ],
  643. "source": [
  644. "filtered_train_data['score'].min()"
  645. ]
  646. },
  647. {
  648. "cell_type": "code",
  649. "execution_count": 20,
  650. "id": "0c2e00f4-cf7b-4b15-8901-537abb4524e5",
  651. "metadata": {},
  652. "outputs": [],
  653. "source": [
  654. "filtered_train_data.to_csv(r\"../dataset/filtered_test_dataset.csv\",header=None,index=None)"
  655. ]
  656. },
  657. {
  658. "cell_type": "code",
  659. "execution_count": 20,
  660. "id": "080b3051-8eb8-4c4f-81ad-d3a215c2f693",
  661. "metadata": {},
  662. "outputs": [],
  663. "source": []
  664. }
  665. ],
  666. "metadata": {
  667. "kernelspec": {
  668. "display_name": "Python 3 (ipykernel)",
  669. "language": "python",
  670. "name": "python3"
  671. },
  672. "language_info": {
  673. "codemirror_mode": {
  674. "name": "ipython",
  675. "version": 3
  676. },
  677. "file_extension": ".py",
  678. "mimetype": "text/x-python",
  679. "name": "python",
  680. "nbconvert_exporter": "python",
  681. "pygments_lexer": "ipython3",
  682. "version": "3.11.4"
  683. }
  684. },
  685. "nbformat": 4,
  686. "nbformat_minor": 5
  687. }

在信息安全领域,漏洞评估和管理是关键任务之一。本作品探讨了如何利用预训练文本大模型来评估和研判漏洞的严重等级,具体基于通用漏洞评分系统。传统漏洞评分方法依赖于手动分析和专家评审。而基于自然语言处理文本大模型通过其深度学习能力,可以自动化地处理和分析大量的安全相关文本数据,从而提高漏洞评估的效率和准确性。结合词干提取、词性还原能够更好地发挥自然语言处理文本大模型的预测能力与准确度。