{ "cells": [ { "cell_type": "code", "execution_count": 8, "id": "64ff4cb2-6a11-4558-9b58-02d23d391b34", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import json\n", "from sklearn import tree\n", "from sklearn.model_selection import train_test_split as tsplit \n", "from sklearn.metrics import classification_report\n", "from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n", "from sklearn.preprocessing import OneHotEncoder" ] }, { "cell_type": "code", "execution_count": 9, "id": "4afcae4b-305f-4ce6-af54-08edba088e0b", "metadata": {}, "outputs": [], "source": [ "def transform_value(val):\n", " return val.split(':')[1]" ] }, { "cell_type": "code", "execution_count": 10, "id": "1b1287ad-40c8-4059-ad05-097bad2feac7", "metadata": {}, "outputs": [], "source": [ "def extract_data(s):\n", " data_temp = pd.read_json(s)\n", " columns = ['AV', 'AC', 'PR', 'UI', 'S', 'C', 'I', 'A']\n", " vectorString = data_temp['vectorString']\n", " temp = []\n", " for i in range(vectorString.size):\n", " part = vectorString[i].split('/')\n", " list_items = part[1::]\n", " temp.append(list_items)\n", " data = pd.DataFrame(temp, columns=columns)\n", " data = data.applymap(transform_value)\n", " data['severity'] = data_temp['severity']\n", " return data" ] }, { "cell_type": "code", "execution_count": 11, "id": "6962b88e-2523-4bde-8fa6-df96bfbc5221", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " AV AC PR UI S C I A severity\n", "0 N L N R C L L N MEDIUM\n", "1 N L N N U N N H HIGH\n", "2 N L N N U N N H HIGH\n", "3 N L N R C L L N MEDIUM\n", "4 N L N R C L L N MEDIUM\n", ".. .. .. .. .. .. .. .. .. ...\n", "705 N L N N U H H H CRITICAL\n", "706 L L L N U H N N MEDIUM\n", "707 N L N N U H H H CRITICAL\n", "708 N L N N U N L L MEDIUM\n", "709 N L N N U H N N HIGH\n", "\n", "[710 rows x 9 columns]\n" ] } ], "source": [ "data_train = extract_data('SIR_train_set.json')\n", "data_test = extract_data('SIR_test_set.json')\n", "data_validation = extract_data('SIR_validation_set.json')\n", "data_train\n", "print(data_test)" ] }, { "cell_type": "code", "execution_count": 12, "id": "49ccfdf6-99f0-4c5e-9772-03e500e6b6d6", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AVACPRUISCIA
0NLNNUHNN
1NLNNUHHH
2NLNNUHNN
3NHNNUHHH
4NLNRUHHH
...........................
5619NLNNUNNH
5620NLNRCLLN
5621NLNRUNHN
5622NLNRUNHN
5623NLLRCLLN
\n", "

5624 rows × 8 columns

\n", "
" ], "text/plain": [ " AV AC PR UI S C I A\n", "0 N L N N U H N N\n", "1 N L N N U H H H\n", "2 N L N N U H N N\n", "3 N H N N U H H H\n", "4 N L N R U H H H\n", "... .. .. .. .. .. .. .. ..\n", "5619 N L N N U N N H\n", "5620 N L N R C L L N\n", "5621 N L N R U N H N\n", "5622 N L N R U N H N\n", "5623 N L L R C L L N\n", "\n", "[5624 rows x 8 columns]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lw = data_train[['AV', 'AC', 'PR', 'UI', 'S', 'C', 'I', 'A']]\n", "lw" ] }, { "cell_type": "code", "execution_count": 13, "id": "eef35137-c9f8-49cb-8232-506d564f1fb4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " AV_A AV_L AV_N AV_P AC_H AC_L PR_H PR_L PR_N UI_N ... S_U \\\n", "0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0 ... 1.0 \n", "1 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0 ... 1.0 \n", "2 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0 ... 1.0 \n", "3 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0 1.0 ... 1.0 \n", "4 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 ... 1.0 \n", "... ... ... ... ... ... ... ... ... ... ... ... ... \n", "5619 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0 ... 1.0 \n", "5620 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 ... 0.0 \n", "5621 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 ... 1.0 \n", "5622 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 ... 1.0 \n", "5623 0.0 0.0 1.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 ... 0.0 \n", "\n", " C_H C_L C_N I_H I_L I_N A_H A_L A_N \n", "0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 \n", "1 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 \n", "2 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 \n", "3 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 \n", "4 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 \n", "... ... ... ... ... ... ... ... ... ... \n", "5619 0.0 0.0 1.0 0.0 0.0 1.0 1.0 0.0 0.0 \n", "5620 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 \n", "5621 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 1.0 \n", "5622 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 1.0 \n", "5623 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 \n", "\n", "[5624 rows x 22 columns]\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\lw\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:972: FutureWarning: `sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.\n", " warnings.warn(\n" ] } ], "source": [ "def encode(data):\n", " # 初始化 OneHotEncoder\n", " encoder = OneHotEncoder(sparse=False)\n", "\n", " # 转换字符数据为数值\n", " encoded_features = encoder.fit_transform(data[['AV', 'AC', 'PR', 'UI', 'S', 'C', 'I', 'A']])\n", " encoded_data = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(['AV', 'AC', 'PR', 'UI', 'S', 'C', 'I', 'A']))\n", " return encoded_data\n", "print(encode(lw))\n" ] }, { "cell_type": "code", "execution_count": 16, "id": "25bbd901-d4aa-44cb-8f1f-2720c553bfad", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " AV_A AV_L AV_N AV_P AC_H AC_L PR_H PR_L PR_N UI_N ... S_U \\\n", "0 0 0.0 1.0 0 0.0 1.0 0.0 0.0 1.0 0.0 ... 0.0 \n", "1 0 0.0 1.0 0 0.0 1.0 0.0 0.0 1.0 1.0 ... 1.0 \n", "2 0 0.0 1.0 0 0.0 1.0 0.0 0.0 1.0 1.0 ... 1.0 \n", "3 0 0.0 1.0 0 0.0 1.0 0.0 0.0 1.0 0.0 ... 0.0 \n", "4 0 0.0 1.0 0 0.0 1.0 0.0 0.0 1.0 0.0 ... 0.0 \n", ".. ... ... ... ... ... ... ... ... ... ... ... ... \n", "705 0 0.0 1.0 0 0.0 1.0 0.0 0.0 1.0 1.0 ... 1.0 \n", "706 0 1.0 0.0 0 0.0 1.0 0.0 1.0 0.0 1.0 ... 1.0 \n", "707 0 0.0 1.0 0 0.0 1.0 0.0 0.0 1.0 1.0 ... 1.0 \n", "708 0 0.0 1.0 0 0.0 1.0 0.0 0.0 1.0 1.0 ... 1.0 \n", "709 0 0.0 1.0 0 0.0 1.0 0.0 0.0 1.0 1.0 ... 1.0 \n", "\n", " C_H C_L C_N I_H I_L I_N A_H A_L A_N \n", "0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 \n", "1 0.0 0.0 1.0 0.0 0.0 1.0 1.0 0.0 0.0 \n", "2 0.0 0.0 1.0 0.0 0.0 1.0 1.0 0.0 0.0 \n", "3 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 \n", "4 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 \n", ".. ... ... ... ... ... ... ... ... ... \n", "705 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 \n", "706 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 \n", "707 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 \n", "708 0.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0 \n", "709 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 \n", "\n", "[710 rows x 22 columns]\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\lw\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:972: FutureWarning: `sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.\n", " warnings.warn(\n" ] } ], "source": [ "x_test = encode(data_test[['AV', 'AC', 'PR', 'UI', 'S', 'C', 'I', 'A']])\n", "x_test.insert(0, 'AV_A', 0)\n", "x_test.insert(3, 'AV_P', 0)\n", "print(x_test)" ] }, { "cell_type": "code", "execution_count": 15, "id": "12c94e10-99e6-48ed-b659-6d99bd41d049", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "分类报告:\n", " precision recall f1-score support\n", "\n", " CRITICAL 0.99 0.97 0.98 155\n", " HIGH 0.98 1.00 0.99 241\n", " LOW 0.00 0.00 0.00 0\n", " MEDIUM 1.00 0.99 1.00 314\n", "\n", " accuracy 0.99 710\n", " macro avg 0.74 0.74 0.74 710\n", "weighted avg 0.99 0.99 0.99 710\n", "\n", "测试集分类的准确率:0.9901\n", "\n", "分类报告:\n", " precision recall f1-score support\n", "\n", " CRITICAL 1.00 1.00 1.00 137\n", " HIGH 1.00 1.00 1.00 228\n", " LOW 0.67 0.80 0.73 5\n", " MEDIUM 0.99 0.99 0.99 333\n", "\n", " accuracy 0.99 703\n", " macro avg 0.92 0.95 0.93 703\n", "weighted avg 0.99 0.99 0.99 703\n", "\n", "验证集分类的准确率:0.9943\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\lw\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:972: FutureWarning: `sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.\n", " warnings.warn(\n", "C:\\Users\\lw\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:972: FutureWarning: `sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.\n", " warnings.warn(\n", "C:\\Users\\lw\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:972: FutureWarning: `sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.\n", " warnings.warn(\n", "C:\\Users\\lw\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1469: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n", "C:\\Users\\lw\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1469: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n", "C:\\Users\\lw\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\sklearn\\metrics\\_classification.py:1469: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", " _warn_prf(average, modifier, msg_start, len(result))\n" ] } ], "source": [ "x_train = encode(data_train[['AV', 'AC', 'PR', 'UI', 'S', 'C', 'I', 'A']])\n", "y_train = data_train['severity']\n", "x_test = encode(data_test[['AV', 'AC', 'PR', 'UI', 'S', 'C', 'I', 'A']])\n", "x_test.insert(0, 'AV_A', 0)\n", "x_test.insert(3, 'AV_P', 0)\n", "y_test = data_test['severity']\n", "x_validation = encode(data_validation[['AV', 'AC', 'PR', 'UI', 'S', 'C', 'I', 'A']])\n", "y_validation = data_validation['severity']\n", "# 创建并训练决策树分类器\n", "m = tree.DecisionTreeClassifier()\n", "m.fit(x_train, y_train)\n", "\n", "# 使用模型进行预测\n", "y_test_pred = m.predict(x_test)\n", "\n", "# 打印测试集分类报告\n", "print('分类报告:\\n', classification_report(y_test, y_test_pred))\n", "\n", "# 打印准确率\n", "test_accuracy = m.score(x_test, y_test)\n", "print('测试集分类的准确率:%0.4f' % test_accuracy)\n", "\n", "print()\n", "\n", "# 使用模型进行预测\n", "y_validation_pred = m.predict(x_validation)\n", "\n", "# 打印测试集分类报告\n", "print('分类报告:\\n', classification_report(y_validation, y_validation_pred))\n", "\n", "# 打印准确率\n", "validation_accuracy = m.score(x_validation, y_validation)\n", "print('验证集分类的准确率:%0.4f' % validation_accuracy)" ] }, { "cell_type": "code", "execution_count": null, "id": "98e4b40f-8269-4c7b-94a3-567c5f48184d", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "9d307a7e-c229-4eb1-8376-0366cbcc961b", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "29780f8c-bbc1-4bb3-8c4d-bb1fec7ab7e3", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.4" } }, "nbformat": 4, "nbformat_minor": 5 }