|
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667 |
- import json
- import csv
- import pandas as pd
- import re
-
def process_unicode_escape(text: str) -> str:
    r"""Replace literal ``\uXXXX`` escape sequences in *text* with characters.

    A high-surrogate escape immediately followed by a low-surrogate escape
    (for example ``\ud83d\ude00``) is combined into the single code point the
    pair encodes. Decoding the two halves separately would leave two lone
    surrogates in the string, which cannot be encoded as UTF-8 when the
    result is written to a file. Every other ``\uXXXX`` sequence is replaced
    by the character with that code point, and text without escapes is
    returned unchanged.

    Args:
        text: String that may contain backslash-u escapes with four hex
            digits.

    Returns:
        The string with the escapes decoded.
    """

    def _decode(match):
        high, low, single = match.groups()
        if single is not None:
            return chr(int(single, 16))
        # UTF-16 surrogate pair -> supplementary-plane code point.
        high_bits = int(high, 16) - 0xD800
        low_bits = int(low, 16) - 0xDC00
        return chr(0x10000 + (high_bits << 10) + low_bits)

    # The pair alternative is listed first so it wins over the single-escape
    # alternative whenever both could match at the same position.
    pattern = (
        r'\\u([dD][89abAB][\da-fA-F]{2})\\u([dD][c-fC-F][\da-fA-F]{2})'
        r'|\\u([\da-fA-F]{4})'
    )
    return re.sub(pattern, _decode, text)
-
def main():
    """Run the four preprocessing stages in their fixed order.

    Each stage reads the file written by the previous one, so the order
    below must be preserved.
    """
    pipeline = (jsonToCsv, dataProcess, dataProcess2, dataProcess3)
    for stage in pipeline:
        stage()
-
def jsonToCsv():
    """Convert ``../data/SIR_train_set.json`` into a CSV file.

    The JSON document is loaded; when it is a list, the ``description`` of
    each record is passed through ``process_unicode_escape`` and the records
    are written to ``../data_process_cache/SIR_train_set.csv``.

    The CSV header is the union of the keys of all records, in first-seen
    order, rather than the keys of the first record only. With the
    first-record header, ``csv.DictWriter`` raises ``ValueError`` as soon as
    a later record carries a key the first one lacks. Records missing a
    column get an empty cell (the ``DictWriter`` default).

    When the JSON document is not a list, or is an empty list, the output
    file is still created but left empty.
    """
    with open('../data/SIR_train_set.json', 'r', encoding='UTF-8') as json_file:
        data = json.load(json_file)

    # Only a list of records is processed; anything else yields an empty file.
    records = data if isinstance(data, list) else []

    for item in records:
        # Decode only real strings: a null or numeric description would make
        # the regex-based helper raise TypeError.
        if isinstance(item.get('description'), str):
            item['description'] = process_unicode_escape(item['description'])

    # dict.fromkeys keeps first-seen order while dropping duplicate keys.
    fieldnames = list(dict.fromkeys(key for item in records for key in item))

    # Write the CSV file.
    with open('../data_process_cache/SIR_train_set.csv', 'w', newline='', encoding='UTF-8') as csv_file:
        if records:
            writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(records)
-
def dataProcess():
    """Keep only the ``description`` and ``vectorString`` columns.

    Reads ``../data_process_cache/SIR_train_set.csv`` and writes the two
    selected columns to ``../data_process_cache/train_vecStr.csv``. When the
    input has no data rows, the output file is created but nothing (not even
    a header) is written to it.
    """
    wanted = ['description', 'vectorString']

    selected = []
    with open('../data_process_cache/SIR_train_set.csv', 'r', encoding='UTF-8') as source:
        for record in csv.DictReader(source):
            selected.append({name: record[name] for name in wanted})

    with open('../data_process_cache/train_vecStr.csv', 'w', newline='', encoding='utf-8') as target:
        if not selected:
            return
        out = csv.DictWriter(target, fieldnames=wanted)
        out.writeheader()  # header row first, then the data rows
        out.writerows(selected)
-
def dataProcess2():
    """Split ``vectorString`` on ``/`` and append the pieces as new columns.

    Reads ``../data_process_cache/train_vecStr.csv``, splits every
    ``vectorString`` value at each ``/`` into separate columns, places those
    columns to the right of the original ones, and writes the result to
    ``../data_process_cache/output_train.csv`` without the index.
    """
    source_path = '../data_process_cache/train_vecStr.csv'
    target_path = '../data_process_cache/output_train.csv'

    frame = pd.read_csv(source_path, encoding='UTF-8')
    vector_parts = frame['vectorString'].str.split('/', expand=True)
    combined = pd.concat([frame, vector_parts], axis=1)
    combined.to_csv(target_path, index=False, encoding='UTF-8')
-
def dataProcess3():
    """Replace abbreviated metric cells with their spelled-out labels.

    Reads ``../data_process_cache/output_train.csv``, replaces every cell
    whose value equals one of the keys below with the mapped label, and
    writes the result to ``../dataset/SIR_train_set.csv`` without the index.
    """
    labels = {
        'AV:L': 'LOCAL',
        'AV:N': 'NETWORK',
        'AV:A': 'ADJACENT',
        'AV:P': 'PHYSICAL',
        'AC:L': 'LOW',
        'AC:H': 'HIGH',
        'PR:N': 'NONE',
        'PR:L': 'LOW',
        'PR:H': 'HIGH',
        'UI:N': 'NONE',
        'UI:R': 'REQUIRED',
        'S:U': 'UNCHANGED',
        'S:C': 'CHANGED',
        'C:N': 'NONE',
        'C:L': 'LOW',
        'C:H': 'HIGH',
        'I:N': 'NONE',
        'I:L': 'LOW',
        'I:H': 'HIGH',
        'A:N': 'NONE',
        'A:L': 'LOW',
        'A:H': 'HIGH',
    }

    table = pd.read_csv('../data_process_cache/output_train.csv', encoding='UTF-8')
    table = table.replace(labels)
    table.to_csv('../dataset/SIR_train_set.csv', index=False, encoding='UTF-8')
-
# Run the pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
|