| @@ -35,37 +35,28 @@ def count_words(input_file): | |||||
| def lrg_info(word_freq, total_word, min_freq, min_mtro): | def lrg_info(word_freq, total_word, min_freq, min_mtro): | ||||
| l_dict = {} | l_dict = {} | ||||
| r_dict = {} | r_dict = {} | ||||
| k = 0 | |||||
| for word, freq in word_freq.items(): | for word, freq in word_freq.items(): | ||||
| k += 1 | |||||
| if len(word) < 3: | if len(word) < 3: | ||||
| continue | continue | ||||
| left_word = word[:-1] | left_word = word[:-1] | ||||
| ml = word_freq[left_word] | |||||
| if ml > min_freq: | |||||
| mul_info1 = ml * total_word / (word_freq[left_word[1:]] * word_freq[left_word[0]]) | |||||
| mul_info2 = ml * total_word / (word_freq[left_word[-1]] * word_freq[left_word[:-1]]) | |||||
| mul_info = min(mul_info1, mul_info2) | |||||
| if mul_info > min_mtro: | |||||
| if left_word in l_dict: | |||||
| l_dict[left_word].append(freq) | |||||
| else: | |||||
| l_dict[left_word] = [ml, freq] | |||||
| right_word = word[1:] | right_word = word[1:] | ||||
| mr = word_freq[right_word] | |||||
| if mr > min_freq: | |||||
| mul_info1 = mr * total_word / (word_freq[right_word[1:]] * word_freq[right_word[0]]) | |||||
| mul_info2 = mr * total_word / (word_freq[right_word[-1]] * word_freq[right_word[:-1]]) | |||||
| mul_info = min(mul_info1, mul_info2) | |||||
| if mul_info > min_mtro: | |||||
| if right_word in r_dict: | |||||
| r_dict[right_word].append(freq) | |||||
| else: | |||||
| r_dict[right_word] = [mr, freq] | |||||
| def __update_dict(side_dict, side_word): | |||||
| side_word_freq = word_freq[side_word] | |||||
| if side_word_freq > min_freq: | |||||
| mul_info1 = side_word_freq * total_word / (word_freq[side_word[1:]] * word_freq[side_word[0]]) | |||||
| mul_info2 = side_word_freq * total_word / (word_freq[side_word[-1]] * word_freq[side_word[:-1]]) | |||||
| mul_info = min(mul_info1, mul_info2) | |||||
| if mul_info > min_mtro: | |||||
| if side_word in side_dict: | |||||
| side_dict[side_word].append(freq) | |||||
| else: | |||||
| side_dict[side_word] = [side_word_freq, freq] | |||||
| __update_dict(l_dict, left_word) | |||||
| __update_dict(r_dict, right_word) | |||||
| return l_dict, r_dict | return l_dict, r_dict | ||||
| @@ -75,15 +66,11 @@ def cal_entro(r_dict): | |||||
| m_list = r_dict[word] | m_list = r_dict[word] | ||||
| r_list = m_list[1:] | r_list = m_list[1:] | ||||
| fm = m_list[0] | |||||
| entro_r = 0 | entro_r = 0 | ||||
| krm = fm - sum(r_list) | |||||
| if krm > 0: | |||||
| entro_r -= 1 / fm * log(1 / fm, 2) * krm | |||||
| sum_r_list = sum(r_list) | |||||
| for rm in r_list: | for rm in r_list: | ||||
| entro_r -= rm / fm * log(rm / fm, 2) | |||||
| entro_r -= rm / sum_r_list * log(rm / sum_r_list, 2) | |||||
| entro_r_dict[word] = entro_r | entro_r_dict[word] = entro_r | ||||
| return entro_r_dict | return entro_r_dict | ||||
| @@ -104,21 +91,17 @@ def entro_lr_fusion(entro_r_dict, entro_l_dict): | |||||
| def entro_filter(entro_in_rl_dict, entro_in_l_dict, entro_in_r_dict, word_freq, min_entro): | def entro_filter(entro_in_rl_dict, entro_in_l_dict, entro_in_r_dict, word_freq, min_entro): | ||||
| entro_dict = {} | entro_dict = {} | ||||
| l, r, rl = 0, 0, 0 | |||||
| for word in entro_in_rl_dict: | for word in entro_in_rl_dict: | ||||
| if entro_in_rl_dict[word][0] > min_entro and entro_in_rl_dict[word][1] > min_entro: | if entro_in_rl_dict[word][0] > min_entro and entro_in_rl_dict[word][1] > min_entro: | ||||
| entro_dict[word] = word_freq[word] | entro_dict[word] = word_freq[word] | ||||
| rl += 1 | |||||
| for word in entro_in_l_dict: | for word in entro_in_l_dict: | ||||
| if entro_in_l_dict[word] > min_entro: | if entro_in_l_dict[word] > min_entro: | ||||
| entro_dict[word] = word_freq[word] | entro_dict[word] = word_freq[word] | ||||
| l += 1 | |||||
| for word in entro_in_r_dict: | for word in entro_in_r_dict: | ||||
| if entro_in_r_dict[word] > min_entro: | if entro_in_r_dict[word] > min_entro: | ||||
| entro_dict[word] = word_freq[word] | entro_dict[word] = word_freq[word] | ||||
| r += 1 | |||||
| return entro_dict | return entro_dict | ||||