{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "64ff4cb2-6a11-4558-9b58-02d23d391b34",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import json\n",
    "from sklearn import tree\n",
    "from sklearn.model_selection import train_test_split as tsplit \n",
    "from sklearn.metrics import classification_report\n",
    "from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n",
    "from sklearn.preprocessing import OneHotEncoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "4afcae4b-305f-4ce6-af54-08edba088e0b",
   "metadata": {},
   "outputs": [],
   "source": [
    "def transform_value(val):\n",
    "    \"\"\"Return the value part of a '<metric>:<value>' token.\n",
    "\n",
    "    Example: transform_value('AV:N') returns 'N'.\n",
    "\n",
    "    Raises IndexError when val contains no ':' separator.\n",
    "    \"\"\"\n",
    "    return val.split(':')[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "1b1287ad-40c8-4059-ad05-097bad2feac7",
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_data(s):\n",
    "    \"\"\"Load a JSON dataset and expand its vector strings into metric columns.\n",
    "\n",
    "    Args:\n",
    "        s: Input handed to pd.read_json (the notebook passes file paths).\n",
    "            The loaded data must have 'vectorString' and 'severity' columns.\n",
    "\n",
    "    Returns:\n",
    "        DataFrame with the columns AV, AC, PR, UI, S, C, I, A (one metric\n",
    "        value each) followed by 'severity' copied from the input.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if a vector string lacks one of the eight metrics.\n",
    "    \"\"\"\n",
    "    columns = ['AV', 'AC', 'PR', 'UI', 'S', 'C', 'I', 'A']\n",
    "    raw = pd.read_json(s)\n",
    "\n",
    "    records = []\n",
    "    for position, vector in enumerate(raw['vectorString']):\n",
    "        # The first '/'-separated segment is skipped: it is not a metric\n",
    "        # token (presumably the 'CVSS:x.y' prefix -- confirm against the data).\n",
    "        tokens = vector.split('/')[1:]\n",
    "        # Key the values by metric name instead of trusting token order, so a\n",
    "        # reordered vector or one carrying extra metrics is not mislabelled.\n",
    "        metrics = {token.split(':')[0]: transform_value(token) for token in tokens}\n",
    "        missing = [name for name in columns if name not in metrics]\n",
    "        if missing:\n",
    "            raise ValueError(\n",
    "                f'row {position}: vectorString {vector!r} lacks metric(s) {missing}'\n",
    "            )\n",
    "        records.append([metrics[name] for name in columns])\n",
    "\n",
    "    data = pd.DataFrame(records, columns=columns)\n",
    "    # data has a fresh 0..n-1 index; drop the input's index so severity is\n",
    "    # attached row by row rather than aligned on index labels.\n",
    "    data['severity'] = raw['severity'].reset_index(drop=True)\n",
    "    return data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "6962b88e-2523-4bde-8fa6-df96bfbc5221",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " AV AC PR UI S C I A severity\n",
      "0 N L N R C L L N MEDIUM\n",
      "1 N L N N U N N H HIGH\n",
      "2 N L N N U N N H HIGH\n",
      "3 N L N R C L L N MEDIUM\n",
      "4 N L N R C L L N MEDIUM\n",
      ".. .. .. .. .. .. .. .. .. 
...\n",
      "705 N L N N U H H H CRITICAL\n",
      "706 L L L N U H N N MEDIUM\n",
      "707 N L N N U H H H CRITICAL\n",
      "708 N L N N U N L L MEDIUM\n",
      "709 N L N N U H N N HIGH\n",
      "\n",
      "[710 rows x 9 columns]\n"
     ]
    }
   ],
   "source": [
    "# Parse the train / test / validation JSON files into metric-column frames.\n",
    "data_train = extract_data('SIR_train_set.json')\n",
    "data_test = extract_data('SIR_test_set.json')\n",
    "data_validation = extract_data('SIR_validation_set.json')\n",
    "# Plain-text preview of the parsed test split.\n",
    "print(data_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "49ccfdf6-99f0-4c5e-9772-03e500e6b6d6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
| \n", " | AV | \n", "AC | \n", "PR | \n", "UI | \n", "S | \n", "C | \n", "I | \n", "A | \n", "
|---|---|---|---|---|---|---|---|---|
| 0 | \n", "N | \n", "L | \n", "N | \n", "N | \n", "U | \n", "H | \n", "N | \n", "N | \n", "
| 1 | \n", "N | \n", "L | \n", "N | \n", "N | \n", "U | \n", "H | \n", "H | \n", "H | \n", "
| 2 | \n", "N | \n", "L | \n", "N | \n", "N | \n", "U | \n", "H | \n", "N | \n", "N | \n", "
| 3 | \n", "N | \n", "H | \n", "N | \n", "N | \n", "U | \n", "H | \n", "H | \n", "H | \n", "
| 4 | \n", "N | \n", "L | \n", "N | \n", "R | \n", "U | \n", "H | \n", "H | \n", "H | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 5619 | \n", "N | \n", "L | \n", "N | \n", "N | \n", "U | \n", "N | \n", "N | \n", "H | \n", "
| 5620 | \n", "N | \n", "L | \n", "N | \n", "R | \n", "C | \n", "L | \n", "L | \n", "N | \n", "
| 5621 | \n", "N | \n", "L | \n", "N | \n", "R | \n", "U | \n", "N | \n", "H | \n", "N | \n", "
| 5622 | \n", "N | \n", "L | \n", "N | \n", "R | \n", "U | \n", "N | \n", "H | \n", "N | \n", "
| 5623 | \n", "N | \n", "L | \n", "L | \n", "R | \n", "C | \n", "L | \n", "L | \n", "N | \n", "
5624 rows × 8 columns
\n", "