|
|
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 28,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "import pandas as pd\n",
- "import json\n",
- "from sklearn.decomposition import PCA\n",
- "from sklearn.cluster import KMeans\n",
- "import pandas as pd\n",
- "import numpy as np\n",
- "from sklearn.preprocessing import StandardScaler\n",
- "from sklearn import preprocessing\n",
- "import matplotlib.pyplot as plt"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "outputs": [],
- "source": [
- "# Read the SIR training set and keep its CVE_ID column in a separate variable.\n",
- "train_set_path = r'../..//data/SIR_train_set.json'\n",
- "data = pd.read_json(train_set_path)\n",
- "CVE_ID = data.loc[:, 'CVE_ID']"
- ],
- "metadata": {
- "collapsed": false
- }
- },
- {
- "cell_type": "code",
- "execution_count": 30,
- "outputs": [
- {
- "data": {
- "text/plain": " baseScore impactScore exploitabilityScore severity\n0 7.5 3.6 3.9 HIGH\n1 9.8 5.9 3.9 CRITICAL\n2 7.5 3.6 3.9 HIGH\n3 8.1 5.9 2.2 HIGH\n4 8.8 5.9 2.8 HIGH\n... ... ... ... ...\n5619 7.5 3.6 3.9 HIGH\n5620 6.1 2.7 2.8 MEDIUM\n5621 6.5 3.6 2.8 MEDIUM\n5622 6.5 3.6 2.8 MEDIUM\n5623 5.4 2.7 2.3 MEDIUM\n\n[5624 rows x 4 columns]",
- "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>baseScore</th>\n <th>impactScore</th>\n <th>exploitabilityScore</th>\n <th>severity</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>7.5</td>\n <td>3.6</td>\n <td>3.9</td>\n <td>HIGH</td>\n </tr>\n <tr>\n <th>1</th>\n <td>9.8</td>\n <td>5.9</td>\n <td>3.9</td>\n <td>CRITICAL</td>\n </tr>\n <tr>\n <th>2</th>\n <td>7.5</td>\n <td>3.6</td>\n <td>3.9</td>\n <td>HIGH</td>\n </tr>\n <tr>\n <th>3</th>\n <td>8.1</td>\n <td>5.9</td>\n <td>2.2</td>\n <td>HIGH</td>\n </tr>\n <tr>\n <th>4</th>\n <td>8.8</td>\n <td>5.9</td>\n <td>2.8</td>\n <td>HIGH</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>5619</th>\n <td>7.5</td>\n <td>3.6</td>\n <td>3.9</td>\n <td>HIGH</td>\n </tr>\n <tr>\n <th>5620</th>\n <td>6.1</td>\n <td>2.7</td>\n <td>2.8</td>\n <td>MEDIUM</td>\n </tr>\n <tr>\n <th>5621</th>\n <td>6.5</td>\n <td>3.6</td>\n <td>2.8</td>\n <td>MEDIUM</td>\n </tr>\n <tr>\n <th>5622</th>\n <td>6.5</td>\n <td>3.6</td>\n <td>2.8</td>\n <td>MEDIUM</td>\n </tr>\n <tr>\n <th>5623</th>\n <td>5.4</td>\n <td>2.7</td>\n <td>2.3</td>\n <td>MEDIUM</td>\n </tr>\n </tbody>\n</table>\n<p>5624 rows × 4 columns</p>\n</div>"
- },
- "execution_count": 30,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Keep the three numeric CVSS scores together with the textual severity rating.\n",
- "columns_1 = ['baseScore', 'impactScore', 'exploitabilityScore', 'severity']\n",
- "train_data = data.loc[:, columns_1]\n",
- "train_data"
- ],
- "metadata": {
- "collapsed": false
- }
- },
- {
- "cell_type": "code",
- "execution_count": 31,
- "outputs": [
- {
- "data": {
- "text/plain": "0 CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:N/A:N\n1 CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H\n2 CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:N/A:N\n3 CVSS:3.0/AV:N/AC:H/PR:N/UI:N/S:U/C:H/I:H/A:H\n4 CVSS:3.1/AV:N/AC:L/PR:N/UI:R/S:U/C:H/I:H/A:H\n ... \n5619 CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:N/I:N/A:H\n5620 CVSS:3.1/AV:N/AC:L/PR:N/UI:R/S:C/C:L/I:L/A:N\n5621 CVSS:3.1/AV:N/AC:L/PR:N/UI:R/S:U/C:N/I:H/A:N\n5622 CVSS:3.1/AV:N/AC:L/PR:N/UI:R/S:U/C:N/I:H/A:N\n5623 CVSS:3.1/AV:N/AC:L/PR:L/UI:R/S:C/C:L/I:L/A:N\nName: vectorString, Length: 5624, dtype: object"
- },
- "execution_count": 31,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Raw CVSS vector strings such as 'CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:N/A:N'.\n",
- "vectorString = data.loc[:, 'vectorString']\n",
- "vectorString"
- ],
- "metadata": {
- "collapsed": false
- }
- },
- {
- "cell_type": "code",
- "execution_count": 32,
- "outputs": [
- {
- "data": {
- "text/plain": "{'AV': {'network': 2897, 'adjacent': 5516, 'local': 2334},\n 'AC': {'low': 2659, 'medium': 5396, 'high': 2152},\n 'Au': {'none': 3904, 'single': 2309, 'multiple': 3674},\n 'C': {'none': 3904, 'partial': 7704, 'complete': 3143},\n 'I': {'none': 3904, 'partial': 7704, 'complete': 3143},\n 'A': {'none': 3904, 'partial': 7704, 'complete': 3143},\n 'severity': {'low': 2659, 'medium': 5396, 'high': 2152}}"
- },
- "execution_count": 32,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Load the CVSS v2 label-word table from its JSON file.\n",
- "# JSON text is UTF-8, so the encoding is stated explicitly instead of relying\n",
- "# on the platform default.\n",
- "with open('../..//data/label_word_ids_CVSS2.json', 'r', encoding='utf-8') as file:\n",
- "    cvss2 = json.load(file)\n",
- "cvss2"
- ],
- "metadata": {
- "collapsed": false
- }
- },
- {
- "cell_type": "code",
- "execution_count": 33,
- "outputs": [
- {
- "data": {
- "text/plain": "9"
- },
- "execution_count": 33,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Load the CVSS v3 label-word table and show how many top-level entries it has.\n",
- "# JSON text is UTF-8, so the encoding is stated explicitly instead of relying\n",
- "# on the platform default.\n",
- "with open('../../data/label_word_ids.json', 'r', encoding='utf-8') as file:\n",
- "    cvss = json.load(file)\n",
- "len(cvss)"
- ],
- "metadata": {
- "collapsed": false
- }
- },
- {
- "cell_type": "code",
- "execution_count": 34,
- "outputs": [
- {
- "data": {
- "text/plain": "{'AV': {'N': 0.20251660258650822,\n 'A': 0.38559944075498076,\n 'L': 0.16315973435861586,\n 'P': 0.24872422229989513},\n 'AC': {'L': 0.5526917480773228, 'H': 0.4473082519226772},\n 'PR': {'N': 0.44796328169822147,\n 'L': 0.3051061388410786,\n 'H': 0.24693057946069993},\n 'UI': {'N': 0.5477760628595482, 'R': 0.4522239371404518},\n 'S': {'U': 0.8439380911435942, 'C': 0.15606190885640583},\n 'C': {'N': 0.44796328169822147,\n 'L': 0.3051061388410786,\n 'H': 0.24693057946069993},\n 'I': {'N': 0.44796328169822147,\n 'L': 0.3051061388410786,\n 'H': 0.24693057946069993},\n 'A': {'N': 0.44796328169822147,\n 'L': 0.3051061388410786,\n 'H': 0.24693057946069993},\n 'severity': {'low': 0.18472974850632207,\n 'medium': 0.37487842156454076,\n 'high': 0.14950673891899402,\n 'critical': 0.2908850910101431}}"
- },
- "execution_count": 34,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "columns_2 = ['AV', 'AC', 'PR', 'UI', 'S', 'C', 'I', 'A']\n",
- "\n",
- "# Re-key every base metric by the upper-cased first letter of each value name,\n",
- "# i.e. the one-letter codes used in the CVSS vector strings.\n",
- "for column in columns_2:\n",
- "    cvss[column] = {name[0].upper(): weight for name, weight in cvss[column].items()}\n",
- "\n",
- "# Total weight of each entry (the base metrics and 'severity' alike).\n",
- "summ = {}\n",
- "for key, values in cvss.items():\n",
- "    summ[key] = sum(values.values())\n",
- "\n",
- "# Divide every weight by its entry's total so that each entry sums to 1.\n",
- "normalized = {}\n",
- "for key, values in cvss.items():\n",
- "    normalized[key] = {subkey: value / summ[key] for subkey, value in values.items()}\n",
- "cvss = normalized\n",
- "cvss"
- ],
- "metadata": {
- "collapsed": false
- }
- },
- {
- "cell_type": "code",
- "execution_count": 35,
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "C:\\Users\\lx\\AppData\\Local\\Temp\\ipykernel_58136\\555386878.py:10: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.\n",
- " train_data_temp = train_data_temp.applymap(transform_value)\n"
- ]
- },
- {
- "data": {
- "text/plain": " AV AC PR UI S C I A\n0 N L N N U H N N\n1 N L N N U H H H\n2 N L N N U H N N\n3 N H N N U H H H\n4 N L N R U H H H\n... .. .. .. .. .. .. .. ..\n5619 N L N N U N N H\n5620 N L N R C L L N\n5621 N L N R U N H N\n5622 N L N R U N H N\n5623 N L L R C L L N\n\n[5624 rows x 8 columns]",
- "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>AV</th>\n <th>AC</th>\n <th>PR</th>\n <th>UI</th>\n <th>S</th>\n <th>C</th>\n <th>I</th>\n <th>A</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>N</td>\n <td>L</td>\n <td>N</td>\n <td>N</td>\n <td>U</td>\n <td>H</td>\n <td>N</td>\n <td>N</td>\n </tr>\n <tr>\n <th>1</th>\n <td>N</td>\n <td>L</td>\n <td>N</td>\n <td>N</td>\n <td>U</td>\n <td>H</td>\n <td>H</td>\n <td>H</td>\n </tr>\n <tr>\n <th>2</th>\n <td>N</td>\n <td>L</td>\n <td>N</td>\n <td>N</td>\n <td>U</td>\n <td>H</td>\n <td>N</td>\n <td>N</td>\n </tr>\n <tr>\n <th>3</th>\n <td>N</td>\n <td>H</td>\n <td>N</td>\n <td>N</td>\n <td>U</td>\n <td>H</td>\n <td>H</td>\n <td>H</td>\n </tr>\n <tr>\n <th>4</th>\n <td>N</td>\n <td>L</td>\n <td>N</td>\n <td>R</td>\n <td>U</td>\n <td>H</td>\n <td>H</td>\n <td>H</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>5619</th>\n <td>N</td>\n <td>L</td>\n <td>N</td>\n <td>N</td>\n <td>U</td>\n <td>N</td>\n <td>N</td>\n <td>H</td>\n </tr>\n <tr>\n <th>5620</th>\n <td>N</td>\n <td>L</td>\n <td>N</td>\n <td>R</td>\n <td>C</td>\n <td>L</td>\n <td>L</td>\n <td>N</td>\n </tr>\n <tr>\n <th>5621</th>\n <td>N</td>\n <td>L</td>\n <td>N</td>\n <td>R</td>\n <td>U</td>\n <td>N</td>\n <td>H</td>\n <td>N</td>\n </tr>\n <tr>\n <th>5622</th>\n <td>N</td>\n <td>L</td>\n <td>N</td>\n <td>R</td>\n <td>U</td>\n <td>N</td>\n <td>H</td>\n <td>N</td>\n </tr>\n <tr>\n <th>5623</th>\n <td>N</td>\n <td>L</td>\n <td>L</td>\n <td>R</td>\n <td>C</td>\n <td>L</td>\n <td>L</td>\n <td>N</td>\n </tr>\n </tbody>\n</table>\n<p>5624 rows × 8 
columns</p>\n</div>"
- },
- "execution_count": 35,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Parse each CVSS vector string into one column per base metric.\n",
- "def transform_value(val):\n",
- "    \"\"\"Return the value part of a 'NAME:VALUE' token, e.g. 'AV:N' -> 'N'.\"\"\"\n",
- "    return val.split(':')[1]\n",
- "\n",
- "# Iterate over the values themselves rather than vectorString[i], which is a\n",
- "# label lookup and only works with a default 0..n-1 index. The first token of\n",
- "# every vector ('CVSS:3.x') is dropped.\n",
- "temp = [vector.split('/')[1:] for vector in vectorString]\n",
- "# Strip the metric names while building the frame, so the deprecated\n",
- "# DataFrame.applymap is not needed. Reusing vectorString's index keeps the\n",
- "# rows aligned with the source data.\n",
- "train_data_temp = pd.DataFrame(\n",
- "    [[transform_value(token) for token in tokens] for tokens in temp],\n",
- "    columns=columns_2,\n",
- "    index=vectorString.index,\n",
- ")\n",
- "train_data_temp"
- ],
- "metadata": {
- "collapsed": false
- }
- },
- {
- "cell_type": "code",
- "execution_count": 36,
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "C:\\Users\\lx\\AppData\\Local\\Temp\\ipykernel_58136\\1422158133.py:2: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
- " train_data_temp.replace(cvss, inplace=True)\n"
- ]
- },
- {
- "data": {
- "text/plain": " AV AC PR UI S C I \\\n0 0.202517 0.552692 0.447963 0.547776 0.843938 0.246931 0.447963 \n1 0.202517 0.552692 0.447963 0.547776 0.843938 0.246931 0.246931 \n2 0.202517 0.552692 0.447963 0.547776 0.843938 0.246931 0.447963 \n3 0.202517 0.447308 0.447963 0.547776 0.843938 0.246931 0.246931 \n4 0.202517 0.552692 0.447963 0.452224 0.843938 0.246931 0.246931 \n... ... ... ... ... ... ... ... \n5619 0.202517 0.552692 0.447963 0.547776 0.843938 0.447963 0.447963 \n5620 0.202517 0.552692 0.447963 0.452224 0.156062 0.305106 0.305106 \n5621 0.202517 0.552692 0.447963 0.452224 0.843938 0.447963 0.246931 \n5622 0.202517 0.552692 0.447963 0.452224 0.843938 0.447963 0.246931 \n5623 0.202517 0.552692 0.305106 0.452224 0.156062 0.305106 0.305106 \n\n A \n0 0.447963 \n1 0.246931 \n2 0.447963 \n3 0.246931 \n4 0.246931 \n... ... \n5619 0.246931 \n5620 0.447963 \n5621 0.447963 \n5622 0.447963 \n5623 0.447963 \n\n[5624 rows x 8 columns]",
- "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>AV</th>\n <th>AC</th>\n <th>PR</th>\n <th>UI</th>\n <th>S</th>\n <th>C</th>\n <th>I</th>\n <th>A</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0.202517</td>\n <td>0.552692</td>\n <td>0.447963</td>\n <td>0.547776</td>\n <td>0.843938</td>\n <td>0.246931</td>\n <td>0.447963</td>\n <td>0.447963</td>\n </tr>\n <tr>\n <th>1</th>\n <td>0.202517</td>\n <td>0.552692</td>\n <td>0.447963</td>\n <td>0.547776</td>\n <td>0.843938</td>\n <td>0.246931</td>\n <td>0.246931</td>\n <td>0.246931</td>\n </tr>\n <tr>\n <th>2</th>\n <td>0.202517</td>\n <td>0.552692</td>\n <td>0.447963</td>\n <td>0.547776</td>\n <td>0.843938</td>\n <td>0.246931</td>\n <td>0.447963</td>\n <td>0.447963</td>\n </tr>\n <tr>\n <th>3</th>\n <td>0.202517</td>\n <td>0.447308</td>\n <td>0.447963</td>\n <td>0.547776</td>\n <td>0.843938</td>\n <td>0.246931</td>\n <td>0.246931</td>\n <td>0.246931</td>\n </tr>\n <tr>\n <th>4</th>\n <td>0.202517</td>\n <td>0.552692</td>\n <td>0.447963</td>\n <td>0.452224</td>\n <td>0.843938</td>\n <td>0.246931</td>\n <td>0.246931</td>\n <td>0.246931</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>5619</th>\n <td>0.202517</td>\n <td>0.552692</td>\n <td>0.447963</td>\n <td>0.547776</td>\n <td>0.843938</td>\n <td>0.447963</td>\n <td>0.447963</td>\n <td>0.246931</td>\n </tr>\n <tr>\n <th>5620</th>\n <td>0.202517</td>\n <td>0.552692</td>\n <td>0.447963</td>\n <td>0.452224</td>\n <td>0.156062</td>\n <td>0.305106</td>\n <td>0.305106</td>\n <td>0.447963</td>\n </tr>\n <tr>\n <th>5621</th>\n <td>0.202517</td>\n <td>0.552692</td>\n 
<td>0.447963</td>\n <td>0.452224</td>\n <td>0.843938</td>\n <td>0.447963</td>\n <td>0.246931</td>\n <td>0.447963</td>\n </tr>\n <tr>\n <th>5622</th>\n <td>0.202517</td>\n <td>0.552692</td>\n <td>0.447963</td>\n <td>0.452224</td>\n <td>0.843938</td>\n <td>0.447963</td>\n <td>0.246931</td>\n <td>0.447963</td>\n </tr>\n <tr>\n <th>5623</th>\n <td>0.202517</td>\n <td>0.552692</td>\n <td>0.305106</td>\n <td>0.452224</td>\n <td>0.156062</td>\n <td>0.305106</td>\n <td>0.305106</td>\n <td>0.447963</td>\n </tr>\n </tbody>\n</table>\n<p>5624 rows × 8 columns</p>\n</div>"
- },
- "execution_count": 36,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Replace every metric letter with its normalised weight from `cvss`.\n",
- "# Only the columns present in the frame are passed (`cvss` also holds 'severity'),\n",
- "# the result is rebound instead of mutated in place, and the cast makes the float\n",
- "# dtype explicit instead of depending on replace()'s deprecated silent downcasting.\n",
- "metric_weights = {column: cvss[column] for column in train_data_temp.columns}\n",
- "train_data_temp = train_data_temp.replace(metric_weights).astype(float)\n",
- "train_data_temp"
- ],
- "metadata": {
- "collapsed": false
- }
- },
- {
- "cell_type": "code",
- "execution_count": 37,
- "outputs": [
- {
- "data": {
- "text/plain": " AV AC PR UI S C I \\\n0 0.202517 0.552692 0.447963 0.547776 0.843938 0.246931 0.447963 \n1 0.202517 0.552692 0.447963 0.547776 0.843938 0.246931 0.246931 \n2 0.202517 0.552692 0.447963 0.547776 0.843938 0.246931 0.447963 \n3 0.202517 0.447308 0.447963 0.547776 0.843938 0.246931 0.246931 \n4 0.202517 0.552692 0.447963 0.452224 0.843938 0.246931 0.246931 \n... ... ... ... ... ... ... ... \n5619 0.202517 0.552692 0.447963 0.547776 0.843938 0.447963 0.447963 \n5620 0.202517 0.552692 0.447963 0.452224 0.156062 0.305106 0.305106 \n5621 0.202517 0.552692 0.447963 0.452224 0.843938 0.447963 0.246931 \n5622 0.202517 0.552692 0.447963 0.452224 0.843938 0.447963 0.246931 \n5623 0.202517 0.552692 0.305106 0.452224 0.156062 0.305106 0.305106 \n\n A baseScore impactScore exploitabilityScore severity \n0 0.447963 7.5 3.6 3.9 HIGH \n1 0.246931 9.8 5.9 3.9 CRITICAL \n2 0.447963 7.5 3.6 3.9 HIGH \n3 0.246931 8.1 5.9 2.2 HIGH \n4 0.246931 8.8 5.9 2.8 HIGH \n... ... ... ... ... ... \n5619 0.246931 7.5 3.6 3.9 HIGH \n5620 0.447963 6.1 2.7 2.8 MEDIUM \n5621 0.447963 6.5 3.6 2.8 MEDIUM \n5622 0.447963 6.5 3.6 2.8 MEDIUM \n5623 0.447963 5.4 2.7 2.3 MEDIUM \n\n[5624 rows x 12 columns]",
- "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>AV</th>\n <th>AC</th>\n <th>PR</th>\n <th>UI</th>\n <th>S</th>\n <th>C</th>\n <th>I</th>\n <th>A</th>\n <th>baseScore</th>\n <th>impactScore</th>\n <th>exploitabilityScore</th>\n <th>severity</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0.202517</td>\n <td>0.552692</td>\n <td>0.447963</td>\n <td>0.547776</td>\n <td>0.843938</td>\n <td>0.246931</td>\n <td>0.447963</td>\n <td>0.447963</td>\n <td>7.5</td>\n <td>3.6</td>\n <td>3.9</td>\n <td>HIGH</td>\n </tr>\n <tr>\n <th>1</th>\n <td>0.202517</td>\n <td>0.552692</td>\n <td>0.447963</td>\n <td>0.547776</td>\n <td>0.843938</td>\n <td>0.246931</td>\n <td>0.246931</td>\n <td>0.246931</td>\n <td>9.8</td>\n <td>5.9</td>\n <td>3.9</td>\n <td>CRITICAL</td>\n </tr>\n <tr>\n <th>2</th>\n <td>0.202517</td>\n <td>0.552692</td>\n <td>0.447963</td>\n <td>0.547776</td>\n <td>0.843938</td>\n <td>0.246931</td>\n <td>0.447963</td>\n <td>0.447963</td>\n <td>7.5</td>\n <td>3.6</td>\n <td>3.9</td>\n <td>HIGH</td>\n </tr>\n <tr>\n <th>3</th>\n <td>0.202517</td>\n <td>0.447308</td>\n <td>0.447963</td>\n <td>0.547776</td>\n <td>0.843938</td>\n <td>0.246931</td>\n <td>0.246931</td>\n <td>0.246931</td>\n <td>8.1</td>\n <td>5.9</td>\n <td>2.2</td>\n <td>HIGH</td>\n </tr>\n <tr>\n <th>4</th>\n <td>0.202517</td>\n <td>0.552692</td>\n <td>0.447963</td>\n <td>0.452224</td>\n <td>0.843938</td>\n <td>0.246931</td>\n <td>0.246931</td>\n <td>0.246931</td>\n <td>8.8</td>\n <td>5.9</td>\n <td>2.8</td>\n <td>HIGH</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n 
<td>...</td>\n </tr>\n <tr>\n <th>5619</th>\n <td>0.202517</td>\n <td>0.552692</td>\n <td>0.447963</td>\n <td>0.547776</td>\n <td>0.843938</td>\n <td>0.447963</td>\n <td>0.447963</td>\n <td>0.246931</td>\n <td>7.5</td>\n <td>3.6</td>\n <td>3.9</td>\n <td>HIGH</td>\n </tr>\n <tr>\n <th>5620</th>\n <td>0.202517</td>\n <td>0.552692</td>\n <td>0.447963</td>\n <td>0.452224</td>\n <td>0.156062</td>\n <td>0.305106</td>\n <td>0.305106</td>\n <td>0.447963</td>\n <td>6.1</td>\n <td>2.7</td>\n <td>2.8</td>\n <td>MEDIUM</td>\n </tr>\n <tr>\n <th>5621</th>\n <td>0.202517</td>\n <td>0.552692</td>\n <td>0.447963</td>\n <td>0.452224</td>\n <td>0.843938</td>\n <td>0.447963</td>\n <td>0.246931</td>\n <td>0.447963</td>\n <td>6.5</td>\n <td>3.6</td>\n <td>2.8</td>\n <td>MEDIUM</td>\n </tr>\n <tr>\n <th>5622</th>\n <td>0.202517</td>\n <td>0.552692</td>\n <td>0.447963</td>\n <td>0.452224</td>\n <td>0.843938</td>\n <td>0.447963</td>\n <td>0.246931</td>\n <td>0.447963</td>\n <td>6.5</td>\n <td>3.6</td>\n <td>2.8</td>\n <td>MEDIUM</td>\n </tr>\n <tr>\n <th>5623</th>\n <td>0.202517</td>\n <td>0.552692</td>\n <td>0.305106</td>\n <td>0.452224</td>\n <td>0.156062</td>\n <td>0.305106</td>\n <td>0.305106</td>\n <td>0.447963</td>\n <td>5.4</td>\n <td>2.7</td>\n <td>2.3</td>\n <td>MEDIUM</td>\n </tr>\n </tbody>\n</table>\n<p>5624 rows × 12 columns</p>\n</div>"
- },
- "execution_count": 37,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Join the encoded metric columns with the score/severity columns.\n",
- "# The right-hand side is rebuilt from `data` rather than taken from `train_data`,\n",
- "# so re-running this cell does not append the metric columns a second time.\n",
- "train_data = pd.concat([train_data_temp, data[columns_1]], axis=1)\n",
- "train_data"
- ],
- "metadata": {
- "collapsed": false
- }
- },
- {
- "cell_type": "code",
- "execution_count": 38,
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "C:\\Users\\lx\\AppData\\Local\\Temp\\ipykernel_58136\\4130220277.py:2: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
- " train_data['severity'] = train_data['severity'].replace(category_replacement)\n"
- ]
- },
- {
- "data": {
- "text/plain": " AV AC PR UI S C I \\\n0 0.202517 0.552692 0.447963 0.547776 0.843938 0.246931 0.447963 \n1 0.202517 0.552692 0.447963 0.547776 0.843938 0.246931 0.246931 \n2 0.202517 0.552692 0.447963 0.547776 0.843938 0.246931 0.447963 \n3 0.202517 0.447308 0.447963 0.547776 0.843938 0.246931 0.246931 \n4 0.202517 0.552692 0.447963 0.452224 0.843938 0.246931 0.246931 \n... ... ... ... ... ... ... ... \n5619 0.202517 0.552692 0.447963 0.547776 0.843938 0.447963 0.447963 \n5620 0.202517 0.552692 0.447963 0.452224 0.156062 0.305106 0.305106 \n5621 0.202517 0.552692 0.447963 0.452224 0.843938 0.447963 0.246931 \n5622 0.202517 0.552692 0.447963 0.452224 0.843938 0.447963 0.246931 \n5623 0.202517 0.552692 0.305106 0.452224 0.156062 0.305106 0.305106 \n\n A baseScore impactScore exploitabilityScore severity \n0 0.447963 7.5 3.6 3.9 0.149507 \n1 0.246931 9.8 5.9 3.9 0.290885 \n2 0.447963 7.5 3.6 3.9 0.149507 \n3 0.246931 8.1 5.9 2.2 0.149507 \n4 0.246931 8.8 5.9 2.8 0.149507 \n... ... ... ... ... ... \n5619 0.246931 7.5 3.6 3.9 0.149507 \n5620 0.447963 6.1 2.7 2.8 0.374878 \n5621 0.447963 6.5 3.6 2.8 0.374878 \n5622 0.447963 6.5 3.6 2.8 0.374878 \n5623 0.447963 5.4 2.7 2.3 0.374878 \n\n[5624 rows x 12 columns]",
- "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>AV</th>\n <th>AC</th>\n <th>PR</th>\n <th>UI</th>\n <th>S</th>\n <th>C</th>\n <th>I</th>\n <th>A</th>\n <th>baseScore</th>\n <th>impactScore</th>\n <th>exploitabilityScore</th>\n <th>severity</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0.202517</td>\n <td>0.552692</td>\n <td>0.447963</td>\n <td>0.547776</td>\n <td>0.843938</td>\n <td>0.246931</td>\n <td>0.447963</td>\n <td>0.447963</td>\n <td>7.5</td>\n <td>3.6</td>\n <td>3.9</td>\n <td>0.149507</td>\n </tr>\n <tr>\n <th>1</th>\n <td>0.202517</td>\n <td>0.552692</td>\n <td>0.447963</td>\n <td>0.547776</td>\n <td>0.843938</td>\n <td>0.246931</td>\n <td>0.246931</td>\n <td>0.246931</td>\n <td>9.8</td>\n <td>5.9</td>\n <td>3.9</td>\n <td>0.290885</td>\n </tr>\n <tr>\n <th>2</th>\n <td>0.202517</td>\n <td>0.552692</td>\n <td>0.447963</td>\n <td>0.547776</td>\n <td>0.843938</td>\n <td>0.246931</td>\n <td>0.447963</td>\n <td>0.447963</td>\n <td>7.5</td>\n <td>3.6</td>\n <td>3.9</td>\n <td>0.149507</td>\n </tr>\n <tr>\n <th>3</th>\n <td>0.202517</td>\n <td>0.447308</td>\n <td>0.447963</td>\n <td>0.547776</td>\n <td>0.843938</td>\n <td>0.246931</td>\n <td>0.246931</td>\n <td>0.246931</td>\n <td>8.1</td>\n <td>5.9</td>\n <td>2.2</td>\n <td>0.149507</td>\n </tr>\n <tr>\n <th>4</th>\n <td>0.202517</td>\n <td>0.552692</td>\n <td>0.447963</td>\n <td>0.452224</td>\n <td>0.843938</td>\n <td>0.246931</td>\n <td>0.246931</td>\n <td>0.246931</td>\n <td>8.8</td>\n <td>5.9</td>\n <td>2.8</td>\n <td>0.149507</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n 
<td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>5619</th>\n <td>0.202517</td>\n <td>0.552692</td>\n <td>0.447963</td>\n <td>0.547776</td>\n <td>0.843938</td>\n <td>0.447963</td>\n <td>0.447963</td>\n <td>0.246931</td>\n <td>7.5</td>\n <td>3.6</td>\n <td>3.9</td>\n <td>0.149507</td>\n </tr>\n <tr>\n <th>5620</th>\n <td>0.202517</td>\n <td>0.552692</td>\n <td>0.447963</td>\n <td>0.452224</td>\n <td>0.156062</td>\n <td>0.305106</td>\n <td>0.305106</td>\n <td>0.447963</td>\n <td>6.1</td>\n <td>2.7</td>\n <td>2.8</td>\n <td>0.374878</td>\n </tr>\n <tr>\n <th>5621</th>\n <td>0.202517</td>\n <td>0.552692</td>\n <td>0.447963</td>\n <td>0.452224</td>\n <td>0.843938</td>\n <td>0.447963</td>\n <td>0.246931</td>\n <td>0.447963</td>\n <td>6.5</td>\n <td>3.6</td>\n <td>2.8</td>\n <td>0.374878</td>\n </tr>\n <tr>\n <th>5622</th>\n <td>0.202517</td>\n <td>0.552692</td>\n <td>0.447963</td>\n <td>0.452224</td>\n <td>0.843938</td>\n <td>0.447963</td>\n <td>0.246931</td>\n <td>0.447963</td>\n <td>6.5</td>\n <td>3.6</td>\n <td>2.8</td>\n <td>0.374878</td>\n </tr>\n <tr>\n <th>5623</th>\n <td>0.202517</td>\n <td>0.552692</td>\n <td>0.305106</td>\n <td>0.452224</td>\n <td>0.156062</td>\n <td>0.305106</td>\n <td>0.305106</td>\n <td>0.447963</td>\n <td>5.4</td>\n <td>2.7</td>\n <td>2.3</td>\n <td>0.374878</td>\n </tr>\n </tbody>\n</table>\n<p>5624 rows × 12 columns</p>\n</div>"
- },
- "execution_count": 38,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Encode the severity rating with the normalised weights held in cvss['severity']\n",
- "# (lower-case keys there, upper-case labels in the data) instead of hard-coded\n",
- "# copies of those numbers, so the mapping cannot drift from the weight table.\n",
- "category_replacement = {name.upper(): weight for name, weight in cvss['severity'].items()}\n",
- "# The cast fails loudly here if a severity label has no weight.\n",
- "train_data['severity'] = train_data['severity'].replace(category_replacement).astype(float)\n",
- "train_data"
- ],
- "metadata": {
- "collapsed": false
- }
- },
- {
- "cell_type": "code",
- "execution_count": 39,
- "outputs": [],
- "source": [
- "# Remember the column labels of the assembled feature table.\n",
- "new_col = train_data.columns"
- ],
- "metadata": {
- "collapsed": false
- }
- },
- {
- "cell_type": "code",
- "execution_count": 40,
- "outputs": [
- {
- "data": {
- "text/plain": "(5624, 3)"
- },
- "execution_count": 40,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Project the feature table onto its first three principal components.\n",
- "pca = PCA(n_components=3).fit(train_data)\n",
- "new_data = pca.transform(train_data)\n",
- "new_data.shape"
- ],
- "metadata": {
- "collapsed": false
- }
- },
- {
- "cell_type": "code",
- "execution_count": 41,
- "outputs": [
- {
- "data": {
- "text/plain": " 0 1 2\n0 0.013773 1.328624 0.647031\n1 1.539486 0.577848 -0.185040\n2 0.013773 1.328624 0.647031\n3 0.763329 -1.328923 0.107140\n4 1.074067 -0.631252 -0.089019\n5 1.073885 -0.633773 -0.097836\n6 0.616217 -1.754719 0.053234\n7 0.013862 1.327285 0.635674\n8 -0.889005 0.561839 -1.746661\n9 1.073885 -0.633773 -0.097836",
- "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>0</th>\n <th>1</th>\n <th>2</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0.013773</td>\n <td>1.328624</td>\n <td>0.647031</td>\n </tr>\n <tr>\n <th>1</th>\n <td>1.539486</td>\n <td>0.577848</td>\n <td>-0.185040</td>\n </tr>\n <tr>\n <th>2</th>\n <td>0.013773</td>\n <td>1.328624</td>\n <td>0.647031</td>\n </tr>\n <tr>\n <th>3</th>\n <td>0.763329</td>\n <td>-1.328923</td>\n <td>0.107140</td>\n </tr>\n <tr>\n <th>4</th>\n <td>1.074067</td>\n <td>-0.631252</td>\n <td>-0.089019</td>\n </tr>\n <tr>\n <th>5</th>\n <td>1.073885</td>\n <td>-0.633773</td>\n <td>-0.097836</td>\n </tr>\n <tr>\n <th>6</th>\n <td>0.616217</td>\n <td>-1.754719</td>\n <td>0.053234</td>\n </tr>\n <tr>\n <th>7</th>\n <td>0.013862</td>\n <td>1.327285</td>\n <td>0.635674</td>\n </tr>\n <tr>\n <th>8</th>\n <td>-0.889005</td>\n <td>0.561839</td>\n <td>-1.746661</td>\n </tr>\n <tr>\n <th>9</th>\n <td>1.073885</td>\n <td>-0.633773</td>\n <td>-0.097836</td>\n </tr>\n </tbody>\n</table>\n</div>"
- },
- "execution_count": 41,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Standardise each principal component to zero mean and unit variance.\n",
- "# The scaler is fitted once and then reused for the transform; StandardScaler\n",
- "# takes no target, so no `y` argument is passed.\n",
- "scaler = preprocessing.StandardScaler().fit(new_data)\n",
- "data_s = pd.DataFrame(scaler.transform(new_data))\n",
- "data_s.head(10)"
- ],
- "metadata": {
- "collapsed": false
- }
- },
- {
- "cell_type": "code",
- "execution_count": 42,
- "outputs": [
- {
- "data": {
- "text/plain": "<Figure size 640x480 with 1 Axes>",
- "image/png": ""
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "# Scatter plot of the first two standardised components, saved as SVG.\n",
- "plt.rcParams['font.sans-serif'] = ['SimHei']  # font used to display Chinese labels\n",
- "plt.rcParams['axes.unicode_minus'] = False  # display minus signs correctly\n",
- "fig, ax = plt.subplots()\n",
- "ax.scatter(data_s[0], data_s[1], c='r', label='散点')\n",
- "fig.savefig('cluster1.svg', dpi=300, format='svg')\n",
- "plt.show()"
- ],
- "metadata": {
- "collapsed": false
- }
- },
- {
- "cell_type": "code",
- "execution_count": 43,
- "outputs": [],
- "source": [
- "from sklearn.preprocessing import MinMaxScaler\n",
- "from sklearn.metrics import silhouette_score\n",
- "from sklearn.cluster import KMeans"
- ],
- "metadata": {
- "collapsed": false
- }
- },
- {
- "cell_type": "code",
- "execution_count": 44,
- "outputs": [
- {
- "data": {
- "text/plain": "Text(0, 0.5, '$J(C_K)$')"
- },
- "execution_count": 44,
- "metadata": {},
- "output_type": "execute_result"
- },
- {
- "data": {
- "text/plain": "<Figure size 640x480 with 1 Axes>",
- "image/png": ""
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "# Elbow method for choosing k: fit K-Means for k = 2..9 and plot the square\n",
- "# root of the within-cluster sum of squares (inertia) against k.\n",
- "inertia = []\n",
- "for k in range(2, 10):\n",
- "    kmeans = KMeans(n_clusters=k, random_state=1).fit(data_s)\n",
- "    inertia.append(np.sqrt(kmeans.inertia_))\n",
- "plt.plot(range(2, 10), inertia, marker='s')\n",
- "plt.xlabel('$k$')  # number of clusters\n",
- "plt.ylabel('$J(C_K)$')  # square root of the inertia\n",
- "# Render explicitly so the cell does not end on the Text(...) repr of ylabel.\n",
- "plt.show()"
- ],
- "metadata": {
- "collapsed": false
- }
- },
- {
- "cell_type": "code",
- "execution_count": 45,
- "outputs": [
- {
- "data": {
- "text/plain": "<Figure size 640x480 with 1 Axes>",
- "image/png": ""
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "# Silhouette coefficient of K-Means for k = 2..8.\n",
- "Scores = []\n",
- "plt.rcParams['font.sans-serif'] = ['SimHei']  # display Chinese labels\n",
- "plt.rcParams['axes.unicode_minus'] = False  # display minus signs correctly\n",
- "for k in range(2, 9):\n",
- "    # Fixed seed (the elbow cell uses the same one) so the curve is reproducible.\n",
- "    estimator = KMeans(n_clusters=k, random_state=1)\n",
- "    estimator.fit(data_s)\n",
- "    Scores.append(silhouette_score(data_s, estimator.labels_, metric='euclidean'))\n",
- "X = range(2, 9)\n",
- "plt.xlabel('K')\n",
- "plt.ylabel('轮廓系数')\n",
- "plt.plot(X, Scores, 'o-')\n",
- "plt.show()"
- ],
- "metadata": {
- "collapsed": false
- }
- },
- {
- "cell_type": "code",
- "execution_count": 46,
- "outputs": [
- {
- "data": {
- "text/plain": "<Figure size 640x480 with 1 Axes>",
- "image/png": ""
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "# Calinski-Harabasz index of K-Means for k = 2..8.\n",
- "from sklearn.metrics import calinski_harabasz_score\n",
- "haraba = []\n",
- "plt.rcParams['font.sans-serif'] = ['SimHei']  # display Chinese labels\n",
- "plt.rcParams['axes.unicode_minus'] = False  # display minus signs correctly\n",
- "for k in range(2, 9):\n",
- "    # Fixed seed (the elbow cell uses the same one) so the curve is reproducible.\n",
- "    estimator = KMeans(n_clusters=k, random_state=1)\n",
- "    estimator.fit(data_s)\n",
- "    haraba.append(calinski_harabasz_score(data_s, estimator.labels_))\n",
- "X = range(2, 9)\n",
- "plt.xlabel('K')\n",
- "plt.ylabel('calinski_harabaz指数')\n",
- "plt.plot(X, haraba, 'o-')\n",
- "plt.show()"
- ],
- "metadata": {
- "collapsed": false
- }
- },
- {
- "cell_type": "code",
- "execution_count": 47,
- "outputs": [],
- "source": [
- "# Fit the final K-Means model with four clusters and a fixed seed.\n",
- "kmeans_final = KMeans(n_clusters=4, random_state=5)\n",
- "cluster = kmeans_final.fit(data_s)"
- ],
- "metadata": {
- "collapsed": false
- }
- },
- {
- "cell_type": "code",
- "execution_count": 48,
- "outputs": [
- {
- "data": {
- "text/plain": "array([[ 0.36355751, -1.81817227, 0.18306699],\n [-0.95490222, 0.17679215, -1.69155755],\n [ 1.31656935, 0.10349958, -0.16019117],\n [-0.51047021, 0.36281987, 0.90438576]])"
- },
- "execution_count": 48,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Cluster centres of the fitted model, one row per cluster.\n",
- "centers = cluster.cluster_centers_\n",
- "centers"
- ],
- "metadata": {
- "collapsed": false
- }
- },
- {
- "cell_type": "code",
- "execution_count": 49,
- "outputs": [
- {
- "data": {
- "text/plain": "4749.265326507992"
- },
- "execution_count": 49,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Total within-cluster sum of squared distances of the fitted model.\n",
- "inertia = cluster.inertia_\n",
- "inertia"
- ],
- "metadata": {
- "collapsed": false
- }
- },
- {
- "cell_type": "code",
- "execution_count": 50,
- "outputs": [
- {
- "data": {
- "text/plain": "0.5513427055662409"
- },
- "execution_count": 50,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Mean silhouette coefficient of the fitted clustering.\n",
- "c_preds1 = cluster.labels_\n",
- "silhouette_score(data_s, c_preds1, metric='euclidean')"
- ],
- "metadata": {
- "collapsed": false
- }
- },
- {
- "cell_type": "code",
- "execution_count": 51,
- "outputs": [
- {
- "data": {
- "text/plain": "<Figure size 640x480 with 1 Axes>",
- "image/png": ""
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "import matplotlib.pyplot as plt\n",
- "\n",
- "fig, ax = plt.subplots()\n",
- "# Samples in the plane of the first two components, coloured by cluster label.\n",
- "ax.scatter(data_s.values[:, 0], data_s.values[:, 1], c=c_preds1)\n",
- "# Mark the cluster centres with large red crosses drawn on top of the samples.\n",
- "ax.scatter(\n",
- "    centers[:, 0], centers[:, 1],\n",
- "    marker='x', s=169, linewidths=3, color='r', zorder=10,\n",
- ")\n",
- "fig.savefig('cluster.svg', dpi=300, format='svg')"
- ],
- "metadata": {
- "collapsed": false
- }
- },
- {
- "cell_type": "code",
- "execution_count": 52,
- "outputs": [],
- "source": [
- "# Hierarchical (agglomerative) clustering with Ward linkage into three clusters.\n",
- "from sklearn.cluster import AgglomerativeClustering\n",
- "from sklearn.metrics import confusion_matrix\n",
- "\n",
- "clustering = AgglomerativeClustering(n_clusters=3, linkage='ward')\n",
- "res = clustering.fit(data_s)"
- ],
- "metadata": {
- "collapsed": false
- }
- },
- {
- "cell_type": "code",
- "execution_count": 53,
- "outputs": [
- {
- "data": {
- "text/plain": "<Figure size 640x480 with 1 Axes>",
- "image/png": ""
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "# Silhouette coefficient of agglomerative clustering for k = 2..8.\n",
- "plt.rcParams['font.sans-serif'] = ['SimHei']  # display Chinese labels\n",
- "plt.rcParams['axes.unicode_minus'] = False  # display minus signs correctly\n",
- "X = range(2, 9)\n",
- "Scores = []\n",
- "for k in X:\n",
- "    estimator = AgglomerativeClustering(n_clusters=k).fit(data_s)\n",
- "    Scores.append(silhouette_score(data_s, estimator.labels_, metric='euclidean'))\n",
- "plt.plot(X, Scores, 'o-')\n",
- "plt.xlabel('k')\n",
- "plt.ylabel('轮廓系数')\n",
- "plt.show()"
- ],
- "metadata": {
- "collapsed": false
- }
- },
- {
- "cell_type": "code",
- "execution_count": 54,
- "outputs": [
- {
- "data": {
- "text/plain": "<Figure size 1000x1000 with 2 Axes>",
|