from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

def stemmatize_sentence(sentence):
    # Tokenize the sentence and reduce each token to its Porter stem
    stemmer = PorterStemmer()
    word_list = word_tokenize(sentence)
    stemmatized_output = ' '.join([stemmer.stem(w) for w in word_list])
    return stemmatized_output

def stemmatize(train_texts, test_texts=None):
    # Stem every sentence in the training texts and, if provided, the test texts
    stemmatized_texts_train = []
    stemmatized_texts_test = []
    for text in train_texts:
        stemmatized_texts_train.append(stemmatize_sentence(text))
    if test_texts is not None:
        for text in test_texts:
            stemmatized_texts_test.append(stemmatize_sentence(text))

    return stemmatized_texts_train, stemmatized_texts_test
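A minimal usage sketch, assuming NLTK is installed and the punkt tokenizer data (required by word_tokenize) has been downloaded; the sample sentences below are purely illustrative:

import nltk
nltk.download('punkt')  # tokenizer data needed by word_tokenize

train = ["The cats are running quickly.", "He studies natural language processing."]
test = ["Stemming reduces words to their stems."]

stemmed_train, stemmed_test = stemmatize(train, test)
print(stemmed_train)  # stemmed training sentences, e.g. 'the cat are run quickli .'
print(stemmed_test)   # stemmed test sentences

Note that stemmatize always returns two lists; when test_texts is omitted, the second list is simply empty.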