Browse Source

update file structures

tags/v0.1.0
FengZiYjun 6 years ago
parent
commit
3e1d995b3c
11 changed files with 173092 additions and 10 deletions
  1. +9
    -10
      reproduction/CNN-sentence_classification/dataset.py
  2. +0
    -0
      reproduction/HAN-document_classification/README.md
  3. +0
    -0
      reproduction/HAN-document_classification/data/test_samples.pkl
  4. +0
    -0
      reproduction/HAN-document_classification/data/train_samples.pkl
  5. +0
    -0
      reproduction/HAN-document_classification/data/yelp.word2vec
  6. +0
    -0
      reproduction/HAN-document_classification/evaluate.py
  7. +0
    -0
      reproduction/HAN-document_classification/model.py
  8. +0
    -0
      reproduction/HAN-document_classification/preprocess.py
  9. +0
    -0
      reproduction/HAN-document_classification/train.py
  10. +18860
    -0
      tests/data_for_tests/cws_test
  11. +154223
    -0
      tests/data_for_tests/cws_train

+ 9
- 10
reproduction/CNN-sentence_classification/dataset.py View File

@@ -71,20 +71,19 @@ class MRDataset(Dataset):

def word_embeddings(self, path="./GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin"):
# establish from google
model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)
model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)

print('Please wait ... (it could take a while to load the file : {})'.format(path))
word_dict = self.word2id_dict
embedding_weights = np.random.uniform(-0.25, 0.25, (len(self.word2id_dict), 300))
print('Please wait ... (it could take a while to load the file : {})'.format(path))
word_dict = self.word2id_dict
embedding_weights = np.random.uniform(-0.25, 0.25, (len(self.word2id_dict), 300))

for word in word_dict:
word_id = word_dict[word]
if word in model.wv.vocab:
embedding_weights[word_id, :] = model[word]
return embedding_weights
for word in word_dict:
word_id = word_dict[word]
if word in model.wv.vocab:
embedding_weights[word_id, :] = model[word]
return embedding_weights

def __len__(self):

return len(self.MRDataset_frame)

def __getitem__(self,idx):


HAN-document_classification/README.md → reproduction/HAN-document_classification/README.md View File


HAN-document_classification/data/test_samples.pkl → reproduction/HAN-document_classification/data/test_samples.pkl View File


HAN-document_classification/data/train_samples.pkl → reproduction/HAN-document_classification/data/train_samples.pkl View File


HAN-document_classification/data/yelp.word2vec → reproduction/HAN-document_classification/data/yelp.word2vec View File


HAN-document_classification/evaluate.py → reproduction/HAN-document_classification/evaluate.py View File


HAN-document_classification/model.py → reproduction/HAN-document_classification/model.py View File


HAN-document_classification/preprocess.py → reproduction/HAN-document_classification/preprocess.py View File


HAN-document_classification/train.py → reproduction/HAN-document_classification/train.py View File


+ 18860
- 0
tests/data_for_tests/cws_test
File diff suppressed because it is too large
View File


+ 154223
- 0
tests/data_for_tests/cws_train
File diff suppressed because it is too large
View File


Loading…
Cancel
Save