|
- from nltk.tokenize import word_tokenize
- from nltk.stem import WordNetLemmatizer
- from nltk.corpus import wordnet
- from nltk import pos_tag
-
def get_wordnet_pos(word):
    """Return the WordNet POS constant to hand to ``lemmatize()`` for ``word``.

    The word is tagged in isolation with ``pos_tag``; only the first
    character of the resulting tag (upper-cased) is used for the lookup.
    Tags that do not start with J, N, V or R fall back to ``wordnet.NOUN``.
    """
    tagged = pos_tag([word])
    # tagged[0] is the entry for our single word; index 1 is its tag string.
    tag_initial = tagged[0][1][0].upper()

    wordnet_pos_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    return wordnet_pos_by_initial.get(tag_initial, wordnet.NOUN)
-
def lemmatize_sentence(sentence):
    """Tokenize ``sentence`` and return its lemmatized tokens joined by spaces.

    Each token produced by ``word_tokenize`` is passed through
    ``lemmatize_noun``; the results are concatenated with a single space.
    """
    tokens = word_tokenize(sentence)
    # Noun-only lemmatization is used here; swap in lemmatize_word to
    # lemmatize according to each token's own POS tag instead.
    lemmas = map(lemmatize_noun, tokens)
    return ' '.join(lemmas)
-
def lemmatize(train_texts, test_texts=None):
    """Lemmatize every sentence in the training and (optionally) test texts.

    Args:
        train_texts: Iterable of sentence strings; each one is passed to
            ``lemmatize_sentence``.
        test_texts: Optional iterable of sentence strings, processed the
            same way. When ``None``, the second returned list is empty.

    Returns:
        A ``(lemmatized_train, lemmatized_test)`` tuple of lists, each in
        the same order as its input.
    """
    lemmatized_texts_train = [lemmatize_sentence(text) for text in train_texts]

    # With no test set we still return a list, so callers can always
    # unpack two lists from the result.
    if test_texts is None:
        lemmatized_texts_test = []
    else:
        lemmatized_texts_test = [lemmatize_sentence(text) for text in test_texts]

    return lemmatized_texts_train, lemmatized_texts_test
-
def lemmatize_word(word):
    """Lemmatize ``word`` using the POS returned by ``get_wordnet_pos``.

    For every POS except adverb the plain ``WordNetLemmatizer`` result is
    returned. For adverbs, the function tries to return the name of the
    first pertainym of the first lemma of the synset ``<word>.r.1``; if
    that lookup fails for any reason, the plain lemmatized form is
    returned instead.
    """
    lemmatizer = WordNetLemmatizer()
    wn_pos = get_wordnet_pos(word)
    base_form = lemmatizer.lemmatize(word, wn_pos)

    # Only adverbs get the pertainym treatment; everything else is done.
    if wn_pos not in ("r", "R"):
        return base_form

    try:
        first_lemma = wordnet.synset(word + '.r.1').lemmas()[0]
        return first_lemma.pertainyms()[0].name()
    except Exception:
        # Best effort: missing synset, no lemmas or no pertainyms all
        # fall back to the ordinary lemmatized form.
        return base_form
-
def lemmatize_noun(word):
    """Lemmatize ``word`` with ``WordNetLemmatizer`` without passing a POS.

    Because no POS argument is supplied, the lemmatizer's own default
    applies (noun, per the NLTK documentation).
    """
    return WordNetLemmatizer().lemmatize(word)
|