|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533 |
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 8,
- "id": "64ff4cb2-6a11-4558-9b58-02d23d391b34",
- "metadata": {},
- "outputs": [],
- "source": [
- "import pandas as pd\n",
- "import json\n",
- "from sklearn import tree\n",
- "from sklearn.model_selection import train_test_split as tsplit \n",
- "from sklearn.metrics import classification_report\n",
- "from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n",
- "from sklearn.preprocessing import OneHotEncoder"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "id": "4afcae4b-305f-4ce6-af54-08edba088e0b",
- "metadata": {},
- "outputs": [],
- "source": [
- "def transform_value(val):\n",
- "    \"\"\"Return the text between the first and second ':' of ``val``.\n",
- "\n",
- "    For a ``KEY:VALUE`` token such as ``'AV:N'`` this is the value ``'N'``.\n",
- "    Raises ``IndexError`` when ``val`` contains no ':'.\n",
- "    \"\"\"\n",
- "    fields = val.split(':')\n",
- "    return fields[1]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "id": "1b1287ad-40c8-4059-ad05-097bad2feac7",
- "metadata": {},
- "outputs": [],
- "source": [
- "def extract_data(s):\n",
- "    \"\"\"Load a dataset and split each vector string into one column per metric.\n",
- "\n",
- "    Parameters\n",
- "    ----------\n",
- "    s : path or other input accepted by ``pd.read_json``; the loaded frame must\n",
- "        provide the columns ``vectorString`` and ``severity``.\n",
- "\n",
- "    Returns\n",
- "    -------\n",
- "    pandas.DataFrame with the columns AV, AC, PR, UI, S, C, I, A (the value\n",
- "    part of each ``KEY:VALUE`` token) followed by ``severity``.\n",
- "\n",
- "    Each vector string is split on '/'. The first segment is discarded\n",
- "    (presumably the version prefix - confirm against the data files) and the\n",
- "    remaining segments are mapped positionally onto the eight metric columns.\n",
- "    \"\"\"\n",
- "    columns = ['AV', 'AC', 'PR', 'UI', 'S', 'C', 'I', 'A']\n",
- "    data_temp = pd.read_json(s)\n",
- "\n",
- "    # Iterate over the values themselves rather than over positions 0..n-1, so\n",
- "    # the loop does not rely on the loaded frame having a default integer index.\n",
- "    # Stripping the key while the rows are built also avoids\n",
- "    # DataFrame.applymap, which is deprecated in pandas 2.1+.\n",
- "    rows = [\n",
- "        [transform_value(token) for token in vector.split('/')[1:]]\n",
- "        for vector in data_temp['vectorString']\n",
- "    ]\n",
- "    data = pd.DataFrame(rows, columns=columns)\n",
- "\n",
- "    # Copy the labels by position; assigning the Series directly would align\n",
- "    # on index labels against the new frame's fresh RangeIndex.\n",
- "    data['severity'] = data_temp['severity'].to_numpy()\n",
- "    return data"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "id": "6962b88e-2523-4bde-8fa6-df96bfbc5221",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " AV AC PR UI S C I A severity\n",
- "0 N L N R C L L N MEDIUM\n",
- "1 N L N N U N N H HIGH\n",
- "2 N L N N U N N H HIGH\n",
- "3 N L N R C L L N MEDIUM\n",
- "4 N L N R C L L N MEDIUM\n",
- ".. .. .. .. .. .. .. .. .. ...\n",
- "705 N L N N U H H H CRITICAL\n",
- "706 L L L N U H N N MEDIUM\n",
- "707 N L N N U H H H CRITICAL\n",
- "708 N L N N U N L L MEDIUM\n",
- "709 N L N N U H N N HIGH\n",
- "\n",
- "[710 rows x 9 columns]\n"
- ]
- }
- ],
- "source": [
- "# Load the three dataset splits; each frame holds the eight metric columns\n",
- "# plus 'severity'.\n",
- "data_train = extract_data('SIR_train_set.json')\n",
- "data_test = extract_data('SIR_test_set.json')\n",
- "data_validation = extract_data('SIR_validation_set.json')\n",
- "# A bare `data_train` expression in the middle of a cell displays nothing, so\n",
- "# it was dropped; only the test split is printed here.\n",
- "print(data_test)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "id": "49ccfdf6-99f0-4c5e-9772-03e500e6b6d6",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "<div>\n",
- "<style scoped>\n",
- " .dataframe tbody tr th:only-of-type {\n",
- " vertical-align: middle;\n",
- " }\n",
- "\n",
- " .dataframe tbody tr th {\n",
- " vertical-align: top;\n",
- " }\n",
- "\n",
- " .dataframe thead th {\n",
- " text-align: right;\n",
- " }\n",
- "</style>\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>AV</th>\n",
- " <th>AC</th>\n",
- " <th>PR</th>\n",
- " <th>UI</th>\n",
- " <th>S</th>\n",
- " <th>C</th>\n",
- " <th>I</th>\n",
- " <th>A</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td>N</td>\n",
- " <td>L</td>\n",
- " <td>N</td>\n",
- " <td>N</td>\n",
- " <td>U</td>\n",
- " <td>H</td>\n",
- " <td>N</td>\n",
- " <td>N</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td>N</td>\n",
- " <td>L</td>\n",
- " <td>N</td>\n",
- " <td>N</td>\n",
- " <td>U</td>\n",
- " <td>H</td>\n",
- " <td>H</td>\n",
- " <td>H</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <td>N</td>\n",
- " <td>L</td>\n",
- " <td>N</td>\n",
- " <td>N</td>\n",
- " <td>U</td>\n",
- " <td>H</td>\n",
- " <td>N</td>\n",
- " <td>N</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>3</th>\n",
- " <td>N</td>\n",
- " <td>H</td>\n",
- " <td>N</td>\n",
- " <td>N</td>\n",
- " <td>U</td>\n",
- " <td>H</td>\n",
- " <td>H</td>\n",
- " <td>H</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <td>N</td>\n",
- " <td>L</td>\n",
- " <td>N</td>\n",
- " <td>R</td>\n",
- " <td>U</td>\n",
- " <td>H</td>\n",
- " <td>H</td>\n",
- " <td>H</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>...</th>\n",
- " <td>...</td>\n",
- " <td>...</td>\n",
- " <td>...</td>\n",
- " <td>...</td>\n",
- " <td>...</td>\n",
- " <td>...</td>\n",
- " <td>...</td>\n",
- " <td>...</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>5619</th>\n",
- " <td>N</td>\n",
- " <td>L</td>\n",
- " <td>N</td>\n",
- " <td>N</td>\n",
- " <td>U</td>\n",
- " <td>N</td>\n",
- " <td>N</td>\n",
- " <td>H</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>5620</th>\n",
- " <td>N</td>\n",
- " <td>L</td>\n",
- " <td>N</td>\n",
- " <td>R</td>\n",
- " <td>C</td>\n",
- " <td>L</td>\n",
- " <td>L</td>\n",
- " <td>N</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>5621</th>\n",
- " <td>N</td>\n",
- " <td>L</td>\n",
- " <td>N</td>\n",
- " <td>R</td>\n",
- " <td>U</td>\n",
- " <td>N</td>\n",
- " <td>H</td>\n",
- " <td>N</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>5622</th>\n",
- " <td>N</td>\n",
- " <td>L</td>\n",
- " <td>N</td>\n",
- " <td>R</td>\n",
- " <td>U</td>\n",
- " <td>N</td>\n",
- " <td>H</td>\n",
- " <td>N</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>5623</th>\n",
- " <td>N</td>\n",
- " <td>L</td>\n",
- " <td>L</td>\n",
- " <td>R</td>\n",
- " <td>C</td>\n",
- " <td>L</td>\n",
- " <td>L</td>\n",
- " <td>N</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "<p>5624 rows × 8 columns</p>\n",
- "</div>"
- ],
- "text/plain": [
- " AV AC PR UI S C I A\n",
- "0 N L N N U H N N\n",
- "1 N L N N U H H H\n",
- "2 N L N N U H N N\n",
- "3 N H N N U H H H\n",
- "4 N L N R U H H H\n",
- "... .. .. .. .. .. .. .. ..\n",
- "5619 N L N N U N N H\n",
- "5620 N L N R C L L N\n",
- "5621 N L N R U N H N\n",
- "5622 N L N R U N H N\n",
- "5623 N L L R C L L N\n",
- "\n",
- "[5624 rows x 8 columns]"
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Keep only the eight metric columns of the training split (this leaves out\n",
- "# 'severity'); the trailing expression lets the notebook render the frame.\n",
- "lw = data_train.loc[:, ['AV', 'AC', 'PR', 'UI', 'S', 'C', 'I', 'A']]\n",
- "lw"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "id": "eef35137-c9f8-49cb-8232-506d564f1fb4",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " AV_A AV_L AV_N AV_P AC_H AC_L PR_H PR_L PR_N UI_N ... S_U \\\n",
- "0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0 ... 1.0 \n",
- "1 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0 ... 1.0 \n",
- "2 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0 ... 1.0 \n",
- "3 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0 1.0 ... 1.0 \n",
- "4 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 ... 1.0 \n",
- "... ... ... ... ... ... ... ... ... ... ... ... ... \n",
- "5619 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0 ... 1.0 \n",
- "5620 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 ... 0.0 \n",
- "5621 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 ... 1.0 \n",
- "5622 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 ... 1.0 \n",
- "5623 0.0 0.0 1.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 ... 0.0 \n",
- "\n",
- " C_H C_L C_N I_H I_L I_N A_H A_L A_N \n",
- "0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 \n",
- "1 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 \n",
- "2 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 \n",
- "3 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 \n",
- "4 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 \n",
- "... ... ... ... ... ... ... ... ... ... \n",
- "5619 0.0 0.0 1.0 0.0 0.0 1.0 1.0 0.0 0.0 \n",
- "5620 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 \n",
- "5621 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 1.0 \n",
- "5622 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 1.0 \n",
- "5623 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 \n",
- "\n",
- "[5624 rows x 22 columns]\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "C:\\Users\\lw\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:972: FutureWarning: `sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.\n",
- " warnings.warn(\n"
- ]
- }
- ],
- "source": [
- "def encode(data):\n",
- "    \"\"\"One-hot encode the eight metric columns of ``data``.\n",
- "\n",
- "    A new ``OneHotEncoder`` is fitted on every call, so the output only has\n",
- "    indicator columns (named ``<metric>_<value>``) for values that actually\n",
- "    occur in ``data``. Frames encoded by separate calls can therefore differ\n",
- "    in their column layout.\n",
- "\n",
- "    Returns a new DataFrame of 0.0/1.0 indicators with a default integer index.\n",
- "    \"\"\"\n",
- "    feature_columns = ['AV', 'AC', 'PR', 'UI', 'S', 'C', 'I', 'A']\n",
- "\n",
- "    # Keep the encoder's default sparse output and densify explicitly. The\n",
- "    # `sparse=False` keyword emits a FutureWarning on scikit-learn 1.2+ and is\n",
- "    # removed in 1.4, while its replacement `sparse_output` does not exist in\n",
- "    # older releases; `.toarray()` works with both.\n",
- "    encoder = OneHotEncoder()\n",
- "    encoded_features = encoder.fit_transform(data[feature_columns]).toarray()\n",
- "\n",
- "    return pd.DataFrame(\n",
- "        encoded_features,\n",
- "        columns=encoder.get_feature_names_out(feature_columns),\n",
- "    )\n",
- "\n",
- "\n",
- "print(encode(lw))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "id": "25bbd901-d4aa-44cb-8f1f-2720c553bfad",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " AV_A AV_L AV_N AV_P AC_H AC_L PR_H PR_L PR_N UI_N ... S_U \\\n",
- "0 0 0.0 1.0 0 0.0 1.0 0.0 0.0 1.0 0.0 ... 0.0 \n",
- "1 0 0.0 1.0 0 0.0 1.0 0.0 0.0 1.0 1.0 ... 1.0 \n",
- "2 0 0.0 1.0 0 0.0 1.0 0.0 0.0 1.0 1.0 ... 1.0 \n",
- "3 0 0.0 1.0 0 0.0 1.0 0.0 0.0 1.0 0.0 ... 0.0 \n",
- "4 0 0.0 1.0 0 0.0 1.0 0.0 0.0 1.0 0.0 ... 0.0 \n",
- ".. ... ... ... ... ... ... ... ... ... ... ... ... \n",
- "705 0 0.0 1.0 0 0.0 1.0 0.0 0.0 1.0 1.0 ... 1.0 \n",
- "706 0 1.0 0.0 0 0.0 1.0 0.0 1.0 0.0 1.0 ... 1.0 \n",
- "707 0 0.0 1.0 0 0.0 1.0 0.0 0.0 1.0 1.0 ... 1.0 \n",
- "708 0 0.0 1.0 0 0.0 1.0 0.0 0.0 1.0 1.0 ... 1.0 \n",
- "709 0 0.0 1.0 0 0.0 1.0 0.0 0.0 1.0 1.0 ... 1.0 \n",
- "\n",
- " C_H C_L C_N I_H I_L I_N A_H A_L A_N \n",
- "0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 \n",
- "1 0.0 0.0 1.0 0.0 0.0 1.0 1.0 0.0 0.0 \n",
- "2 0.0 0.0 1.0 0.0 0.0 1.0 1.0 0.0 0.0 \n",
- "3 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 \n",
- "4 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 \n",
- ".. ... ... ... ... ... ... ... ... ... \n",
- "705 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 \n",
- "706 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 \n",
- "707 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 \n",
- "708 0.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0 \n",
- "709 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 \n",
- "\n",
- "[710 rows x 22 columns]\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "C:\\Users\\lw\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:972: FutureWarning: `sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.\n",
- " warnings.warn(\n"
- ]
- }
- ],
- "source": [
- "feature_columns = ['AV', 'AC', 'PR', 'UI', 'S', 'C', 'I', 'A']\n",
- "\n",
- "# encode() fits a new encoder per call, so the test frame only gets indicator\n",
- "# columns for values that occur in the test split. Instead of inserting\n",
- "# hand-picked columns at fixed positions, align the frame to the training\n",
- "# layout: indicators missing from the test split are added as 0, columns the\n",
- "# training split never produced are dropped, and the order follows training.\n",
- "train_layout = encode(data_train[feature_columns]).columns\n",
- "x_test = encode(data_test[feature_columns]).reindex(columns=train_layout, fill_value=0)\n",
- "print(x_test)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "id": "12c94e10-99e6-48ed-b659-6d99bd41d049",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "分类报告:\n",
- " precision recall f1-score support\n",
- "\n",
- " CRITICAL 0.99 0.97 0.98 155\n",
- " HIGH 0.98 1.00 0.99 241\n",
- " LOW 0.00 0.00 0.00 0\n",
- " MEDIUM 1.00 0.99 1.00 314\n",
- "\n",
- " accuracy 0.99 710\n",
- " macro avg 0.74 0.74 0.74 710\n",
- "weighted avg 0.99 0.99 0.99 710\n",
- "\n",
- "测试集分类的准确率:0.9901\n",
- "\n",
- "分类报告:\n",
- " precision recall f1-score support\n",
- "\n",
- " CRITICAL 1.00 1.00 1.00 137\n",
- " HIGH 1.00 1.00 1.00 228\n",
- " LOW 0.67 0.80 0.73 5\n",
- " MEDIUM 0.99 0.99 0.99 333\n",
- "\n",
- " accuracy 0.99 703\n",
- " macro avg 0.92 0.95 0.93 703\n",
- "weighted avg 0.99 0.99 0.99 703\n",
- "\n",
- "验证集分类的准确率:0.9943\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "C:\\Users\\lw\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:972: FutureWarning: `sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.\n",
- " warnings.warn(\n",
- "C:\\Users\\lw\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:972: FutureWarning: `sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.\n",
- " warnings.warn(\n",
- "C:\\Users\\lw\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:972: FutureWarning: `sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.\n",
- " warnings.warn(\n",
- "C:\\Users\\lw\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1469: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
- " _warn_prf(average, modifier, msg_start, len(result))\n",
- "C:\\Users\\lw\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1469: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
- " _warn_prf(average, modifier, msg_start, len(result))\n",
- "C:\\Users\\lw\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1469: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
- " _warn_prf(average, modifier, msg_start, len(result))\n"
- ]
- }
- ],
- "source": [
- "feature_columns = ['AV', 'AC', 'PR', 'UI', 'S', 'C', 'I', 'A']\n",
- "\n",
- "x_train = encode(data_train[feature_columns])\n",
- "y_train = data_train['severity']\n",
- "\n",
- "# encode() fits a new encoder per call, so each split only gets indicator\n",
- "# columns for the values it contains. Align the held-out splits to the\n",
- "# training layout (missing indicators become 0, unseen ones are dropped)\n",
- "# rather than inserting hand-picked columns at fixed positions.\n",
- "x_test = encode(data_test[feature_columns]).reindex(columns=x_train.columns, fill_value=0)\n",
- "y_test = data_test['severity']\n",
- "x_validation = encode(data_validation[feature_columns]).reindex(columns=x_train.columns, fill_value=0)\n",
- "y_validation = data_validation['severity']\n",
- "\n",
- "# Create and train the decision-tree classifier\n",
- "m = tree.DecisionTreeClassifier()\n",
- "m.fit(x_train, y_train)\n",
- "\n",
- "# Predict on the test split\n",
- "y_test_pred = m.predict(x_test)\n",
- "\n",
- "# Print the classification report for the test split. zero_division=0 keeps\n",
- "# the reported 0.0 scores for classes without samples but silences the\n",
- "# UndefinedMetricWarning.\n",
- "print('分类报告:\\n', classification_report(y_test, y_test_pred, zero_division=0))\n",
- "\n",
- "# Print the accuracy on the test split\n",
- "test_accuracy = m.score(x_test, y_test)\n",
- "print('测试集分类的准确率:%0.4f' % test_accuracy)\n",
- "\n",
- "print()\n",
- "\n",
- "# Predict on the validation split\n",
- "y_validation_pred = m.predict(x_validation)\n",
- "\n",
- "# Print the classification report for the validation split\n",
- "print('分类报告:\\n', classification_report(y_validation, y_validation_pred, zero_division=0))\n",
- "\n",
- "# Print the accuracy on the validation split\n",
- "validation_accuracy = m.score(x_validation, y_validation)\n",
- "print('验证集分类的准确率:%0.4f' % validation_accuracy)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "98e4b40f-8269-4c7b-94a3-567c5f48184d",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "9d307a7e-c229-4eb1-8376-0366cbcc961b",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "29780f8c-bbc1-4bb3-8c4d-bb1fec7ab7e3",
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.11.4"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
- }
|