
preprocess.py 1.5 kB

'''
Tokenize the Yelp dataset's review documents using Stanford CoreNLP.
'''
import json
import os
import pickle

import nltk
from nltk.tokenize import stanford

input_filename = 'review.json'

# Config for Stanford CoreNLP.
os.environ['JAVAHOME'] = 'D:\\java\\bin\\java.exe'
path_to_jar = 'E:\\College\\fudanNLP\\stanford-corenlp-full-2018-02-27\\stanford-corenlp-3.9.1.jar'
tokenizer = stanford.CoreNLPTokenizer()
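
# NOTE (assumption, not in the original script): nltk's CoreNLPTokenizer does
# not load path_to_jar in-process; it sends requests to a running CoreNLP
# server (http://localhost:9000 by default). Start one from the unpacked
# stanford-corenlp-full-2018-02-27 directory before running this script, e.g.:
#
#   java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000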

in_dirname = 'review'
out_dirname = 'reviews'

f = open(input_filename, encoding='utf-8')

# Split the line-delimited JSON reviews into pickled chunks of 5000
# (stars, text) samples each, so later stages never need the whole
# multi-gigabyte file in memory at once.
samples = []
j = 0
for i, line in enumerate(f):  # iterate lazily; readlines() would load the entire file
    review = json.loads(line)
    samples.append((review['stars'], review['text']))
    if (i + 1) % 5000 == 0:
        print(i)
        pickle.dump(samples, open(in_dirname + '/samples%d.pkl' % j, 'wb'))
        j += 1
        samples = []
# Flush the final, possibly partial chunk.
pickle.dump(samples, open(in_dirname + '/samples%d.pkl' % j, 'wb'))
f.close()

# Quick sanity check on one dumped chunk:
# samples = pickle.load(open(out_dirname + '/samples0.pkl', 'rb'))
# print(samples[0])

# For every chunk: split each review into sentences with nltk's sentence
# tokenizer, tokenize each sentence via the CoreNLP server, and dump the
# result to the output directory under the same filename.
for fn in os.listdir(in_dirname):
    print(fn)
    processed = []
    for stars, text in pickle.load(open(os.path.join(in_dirname, fn), 'rb')):
        tokens = []
        sents = nltk.tokenize.sent_tokenize(text)
        for s in sents:
            # list() materializes the tokens in case tokenize() returns a
            # generator, which pickle could not serialize below.
            tokens.append(list(tokenizer.tokenize(s)))
        processed.append((stars, tokens))
        # print(tokens)
        if len(processed) % 100 == 0:
            print(len(processed))
    pickle.dump(processed, open(os.path.join(out_dirname, fn), 'wb'))
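
For reference, a minimal sketch (not part of preprocess.py) of how a
downstream script might load one processed chunk; the filename
reviews/samples0.pkl assumes the naming scheme above:

import pickle

# Each output pickle holds a list of (stars, sentences) pairs, where
# sentences is a list of per-sentence token lists.
with open('reviews/samples0.pkl', 'rb') as fh:
    processed = pickle.load(fh)

stars, sentences = processed[0]
print(stars, sentences[0][:10])  # the rating and the first ten tokens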