You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

lemmatization.py 1.8 kB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556
  1. from nltk.tokenize import word_tokenize
  2. from nltk.stem import WordNetLemmatizer
  3. from nltk.corpus import wordnet
  4. from nltk import pos_tag
  def get_wordnet_pos(word):
      """Map a word's POS tag to the WordNet POS constant lemmatize() accepts.

      The word is tagged on its own with ``pos_tag`` (a one-element token
      list, so no sentence context is available), and only the upper-cased
      first letter of the resulting tag is used for the lookup.

      Args:
          word: A single token.

      Returns:
          ``wordnet.ADJ``, ``wordnet.NOUN``, ``wordnet.VERB`` or
          ``wordnet.ADV`` for first letters J, N, V and R respectively;
          ``wordnet.NOUN`` for any other tag.
      """
      # pos_tag(...)[0][1] is the tag of the only token; keep its first letter.
      tag = pos_tag([word])[0][1][0].upper()
      tag_dict = {
          "J": wordnet.ADJ,
          "N": wordnet.NOUN,
          "V": wordnet.VERB,
          "R": wordnet.ADV,
      }
      return tag_dict.get(tag, wordnet.NOUN)
  def lemmatize_sentence(sentence):
      """Tokenize *sentence* and lemmatize each token.

      Tokens come from ``word_tokenize`` and are each passed through
      ``lemmatize_noun``; the results are joined with single spaces.

      Args:
          sentence: The text to lemmatize.

      Returns:
          The lemmatized tokens joined by ``' '`` into one string.
      """
      word_list = word_tokenize(sentence)
      # Noun lemmatization is the active mode. For POS-aware lemmatization of
      # every token, apply lemmatize_word to each token instead.
      return ' '.join(lemmatize_noun(w) for w in word_list)
  def lemmatize(train_texts, test_texts=None):
      """Lemmatize every text in the train set and, if given, the test set.

      Args:
          train_texts: Iterable of sentences to lemmatize.
          test_texts: Optional iterable of sentences; ``None`` skips it.

      Returns:
          A tuple ``(lemmatized_train, lemmatized_test)`` of lists, each
          holding the ``lemmatize_sentence`` output for the corresponding
          input text. The second list is empty when ``test_texts`` is
          ``None``.
      """
      lemmatized_texts_train = [lemmatize_sentence(t) for t in train_texts]
      if test_texts is None:
          lemmatized_texts_test = []
      else:
          lemmatized_texts_test = [lemmatize_sentence(t) for t in test_texts]
      return lemmatized_texts_train, lemmatized_texts_test
  def lemmatize_word(word):
      """Lemmatize *word* using the WordNet POS chosen by get_wordnet_pos.

      When the POS is the adverb code, the function additionally looks up
      the synset ``<word>.r.1``, takes its first lemma and returns the name
      of that lemma's first pertainym. If that lookup fails for any reason,
      the plain ``WordNetLemmatizer`` result is returned instead.

      Args:
          word: A single token.

      Returns:
          The pertainym name for adverbs when the lookup succeeds, otherwise
          the output of ``WordNetLemmatizer().lemmatize(word, pos)``.
      """
      lemmatizer = WordNetLemmatizer()
      # Named wn_pos so the imported pos_tag() function is not shadowed.
      wn_pos = get_wordnet_pos(word)
      word_lemmatized = lemmatizer.lemmatize(word, wn_pos)
      if wn_pos not in ("r", "R"):
          return word_lemmatized
      try:
          lemmas = wordnet.synset(word + '.r.1').lemmas()
          pertainyms = lemmas[0].pertainyms()
          return pertainyms[0].name()
      except Exception:
          # Deliberate best effort: any lookup failure (e.g. a missing synset
          # or an empty pertainym list) falls back to the plain lemma.
          return word_lemmatized
  def lemmatize_noun(word):
      """Lemmatize *word* with WordNetLemmatizer, passing no POS argument.

      Args:
          word: A single token.

      Returns:
          The result of ``WordNetLemmatizer().lemmatize(word)``.
      """
      return WordNetLemmatizer().lemmatize(word)

在信息安全领域，漏洞评估和管理是关键任务之一。本作品探讨了如何利用预训练文本大模型来评估和研判漏洞的严重等级，具体基于通用漏洞评分系统（CVSS）。传统漏洞评分方法依赖于手动分析和专家评审，而基于自然语言处理的文本大模型凭借其深度学习能力，可以自动化地处理和分析大量的安全相关文本数据，从而提高漏洞评估的效率和准确性。结合词干提取、词形还原等文本预处理方法，能够更好地发挥自然语言处理文本大模型的预测能力，提升其准确度。