"""Convert the SIR training-set JSON into a flat CSV with labelled metrics.

The pipeline runs in four stages, each reading the previous stage's file:

1. ``jsonToCsv``    - JSON records -> CSV, un-escaping ``\\uXXXX`` sequences
                      in the ``description`` field.
2. ``dataProcess``  - keep only the ``description`` and ``vectorString``
                      columns.
3. ``dataProcess2`` - split ``vectorString`` on ``/`` into extra columns.
4. ``dataProcess3`` - replace abbreviated metric tokens (``AV:N`` ...) with
                      readable labels (``NETWORK`` ...).
"""

import csv
import json
import os
import re

import pandas as pd

# Default file locations (relative to the current working directory).
RAW_JSON_PATH = '../data/SIR_train_set.json'
RAW_CSV_PATH = '../data_process_cache/SIR_train_set.csv'
VECTOR_CSV_PATH = '../data_process_cache/train_vecStr.csv'
EXPANDED_CSV_PATH = '../data_process_cache/output_train.csv'
FINAL_CSV_PATH = '../dataset/SIR_train_set.csv'

# Columns retained by ``dataProcess``.
COLUMNS_TO_KEEP = ['description', 'vectorString']

# Whole-cell replacements applied by ``dataProcess3``.
# NOTE(review): the tokens look like CVSS v3 metric abbreviations - confirm.
METRIC_LABELS = {
    'AV:L': 'LOCAL',
    'AV:N': 'NETWORK',
    'AV:A': 'ADJACENT',
    'AV:P': 'PHYSICAL',
    'AC:L': 'LOW',
    'AC:H': 'HIGH',
    'PR:N': 'NONE',
    'PR:L': 'LOW',
    'PR:H': 'HIGH',
    'UI:N': 'NONE',
    'UI:R': 'REQUIRED',
    'S:U': 'UNCHANGED',
    'S:C': 'CHANGED',
    'C:N': 'NONE',
    'C:L': 'LOW',
    'C:H': 'HIGH',
    'I:N': 'NONE',
    'I:L': 'LOW',
    'I:H': 'HIGH',
    'A:N': 'NONE',
    'A:L': 'LOW',
    'A:H': 'HIGH',
}

# First alternative: a high surrogate immediately followed by a low surrogate
# (one astral code point). Second alternative: any single \uXXXX escape.
_UNICODE_ESCAPE_RE = re.compile(
    r'\\u([dD][89abAB][0-9a-fA-F]{2})\\u([dD][c-fC-F][0-9a-fA-F]{2})'
    r'|\\u([0-9a-fA-F]{4})'
)


def _decode_escape(match):
    """Return the character(s) for one match of ``_UNICODE_ESCAPE_RE``."""
    high, low, single = match.groups()
    if single is None:
        # Combine the surrogate pair into the code point it encodes.
        code_point = (
            0x10000
            + ((int(high, 16) - 0xD800) << 10)
            + (int(low, 16) - 0xDC00)
        )
        return chr(code_point)
    code_point = int(single, 16)
    if 0xD800 <= code_point <= 0xDFFF:
        # A lone surrogate cannot be encoded as UTF-8, so converting it would
        # make the later file write fail; keep the escape text as it is.
        return match.group(0)
    return chr(code_point)


def _ensure_parent_dir(path):
    """Create the directory that will contain *path* if it does not exist."""
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)


def process_unicode_escape(text):
    """Replace literal ``\\uXXXX`` escape sequences in *text* with characters.

    Surrogate pairs written as two consecutive escapes are merged into a
    single character. An unpaired surrogate escape is left unchanged because
    the resulting character could not be written to a UTF-8 file.

    Args:
        text: String that may contain backslash-u escapes.

    Returns:
        The string with the escapes decoded.
    """
    return _UNICODE_ESCAPE_RE.sub(_decode_escape, text)


def main():
    """Run the four pipeline stages in order using the default paths."""
    jsonToCsv()
    dataProcess()
    dataProcess2()
    dataProcess3()


def jsonToCsv(json_path=RAW_JSON_PATH, csv_path=RAW_CSV_PATH):
    """Convert a JSON list of records into a CSV file.

    Nothing but an empty file is written when the JSON document is not a
    non-empty list.

    Args:
        json_path: JSON file to read.
        csv_path: CSV file to write; its directory is created if missing.
    """
    with open(json_path, 'r', encoding='UTF-8') as json_file:
        data = json.load(json_file)

    records = data if isinstance(data, list) else []

    # Decode escapes in every description; skip non-string values (e.g. null)
    # that the regex substitution cannot process.
    for item in records:
        description = item.get('description')
        if isinstance(description, str):
            item['description'] = process_unicode_escape(description)

    # Union of all keys in first-seen order, so a record carrying a key the
    # first record lacks does not make DictWriter raise ValueError.
    fieldnames = list(dict.fromkeys(key for item in records for key in item))

    _ensure_parent_dir(csv_path)
    with open(csv_path, 'w', newline='', encoding='UTF-8') as csv_file:
        if records:
            writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(records)


def dataProcess(src_path=RAW_CSV_PATH, dst_path=VECTOR_CSV_PATH):
    """Copy only the ``description`` and ``vectorString`` columns.

    The header row is always written, even when there are no data rows, so
    the output stays a readable CSV with the expected columns.

    Args:
        src_path: CSV file to read.
        dst_path: CSV file to write; its directory is created if missing.

    Raises:
        KeyError: If the input has a header that lacks a required column.
    """
    # newline='' lets the csv module handle line endings inside quoted fields.
    with open(src_path, 'r', newline='', encoding='UTF-8') as infile:
        reader = csv.DictReader(infile)
        header = reader.fieldnames or []
        if header:
            missing = [col for col in COLUMNS_TO_KEEP if col not in header]
            if missing:
                raise KeyError(
                    f'{src_path} is missing required column(s): {missing}'
                )
        filtered_rows = [
            {col: row[col] for col in COLUMNS_TO_KEEP} for row in reader
        ]

    _ensure_parent_dir(dst_path)
    with open(dst_path, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.DictWriter(outfile, fieldnames=COLUMNS_TO_KEEP)
        writer.writeheader()
        writer.writerows(filtered_rows)


def dataProcess2(src_path=VECTOR_CSV_PATH, dst_path=EXPANDED_CSV_PATH):
    """Split ``vectorString`` on ``/`` and append the parts as new columns.

    Args:
        src_path: CSV file to read; must contain a ``vectorString`` column.
        dst_path: CSV file to write; its directory is created if missing.
    """
    df = pd.read_csv(src_path, encoding='UTF-8')
    parts = df['vectorString'].str.split('/', expand=True)
    df = pd.concat([df, parts], axis=1)
    _ensure_parent_dir(dst_path)
    df.to_csv(dst_path, index=False, encoding='UTF-8')


def dataProcess3(src_path=EXPANDED_CSV_PATH, dst_path=FINAL_CSV_PATH):
    """Replace abbreviated metric tokens with their readable labels.

    The replacement is applied to every cell whose whole value equals one of
    the keys of ``METRIC_LABELS``.

    Args:
        src_path: CSV file to read.
        dst_path: CSV file to write; its directory is created if missing.
    """
    df = pd.read_csv(src_path, encoding='UTF-8')
    df = df.replace(METRIC_LABELS)
    _ensure_parent_dir(dst_path)
    df.to_csv(dst_path, index=False, encoding='UTF-8')


if __name__ == '__main__':
    main()