You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; can include dashes ('-'); and can be up to 35 characters long.

json2csv.py 2.6 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667
  1. import json
  2. import csv
  3. import pandas as pd
  4. import re
  5. def process_unicode_escape(text):
  6. return re.sub(r'\\u([\da-fA-F]{4})', lambda x: chr(int(x.group(1), 16)), text)
  7. def main():
  8. jsonToCsv()
  9. dataProcess()
  10. dataProcess2()
  11. dataProcess3()
  12. def jsonToCsv():
  13. with open('../data/SIR_train_set.json', 'r', encoding='UTF-8') as json_file:
  14. data = json.load(json_file)
  15. # 如果 data 是列表,遍历每一项
  16. if isinstance(data, list):
  17. for item in data:
  18. if 'description' in item:
  19. item['description'] = process_unicode_escape(item['description'])
  20. # 写入 CSV 文件
  21. with open('../data_process_cache/SIR_train_set.csv', 'w', newline='', encoding='UTF-8') as csv_file:
  22. if isinstance(data, list) and len(data) > 0:
  23. fieldnames = data[0].keys()
  24. writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
  25. writer.writeheader()
  26. writer.writerows(data)
  27. def dataProcess():
  28. columns_to_keep = ['description', 'vectorString']
  29. with open('../data_process_cache/SIR_train_set.csv', 'r', encoding='UTF-8') as infile:
  30. reader = csv.DictReader(infile)
  31. filtered_rows = [{col: row[col] for col in columns_to_keep} for row in reader]
  32. with open('../data_process_cache/train_vecStr.csv', 'w', newline='', encoding='utf-8') as outfile:
  33. if filtered_rows:
  34. writer = csv.DictWriter(outfile, fieldnames=columns_to_keep)
  35. writer.writeheader() # Ensure headers are written
  36. writer.writerows(filtered_rows)
  37. def dataProcess2():
  38. df = pd.read_csv('../data_process_cache/train_vecStr.csv', encoding='UTF-8')
  39. df_expanded = df['vectorString'].str.split('/', expand=True)
  40. df = pd.concat([df, df_expanded], axis=1)
  41. df.to_csv('../data_process_cache/output_train.csv', index=False, encoding='UTF-8')
  42. def dataProcess3():
  43. df = pd.read_csv('../data_process_cache/output_train.csv', encoding='UTF-8')
  44. df.replace({
  45. 'AV:L': 'LOCAL', 'AV:N': 'NETWORK', 'AV:A': 'ADJACENT', 'AV:P': 'PHYSICAL',
  46. 'AC:L': 'LOW', 'AC:H': 'HIGH',
  47. 'PR:N': 'NONE', 'PR:L': 'LOW', 'PR:H': 'HIGH',
  48. 'UI:N': 'NONE', 'UI:R': 'REQUIRED',
  49. 'S:U': 'UNCHANGED', 'S:C': 'CHANGED',
  50. 'C:N': 'NONE', 'C:L': 'LOW', 'C:H': 'HIGH',
  51. 'I:N': 'NONE', 'I:L': 'LOW', 'I:H': 'HIGH',
  52. 'A:N': 'NONE', 'A:L': 'LOW', 'A:H': 'HIGH'
  53. }, inplace=True)
  54. df.to_csv('../dataset/SIR_train_set.csv', index=False, encoding='UTF-8')
  55. if __name__ == '__main__':
  56. main()

在信息安全领域,漏洞评估和管理是关键任务之一。本作品探讨了如何利用预训练文本大模型来评估和研判漏洞的严重等级,具体基于通用漏洞评分系统。传统漏洞评分方法依赖于手动分析和专家评审。而基于自然语言处理文本大模型通过其深度学习能力,可以自动化地处理和分析大量的安全相关文本数据,从而提高漏洞评估的效率和准确性。结合词干提取、词性还原能够更好地发挥自然语言处理文本大模型的预测能力与准确度。