diff --git a/jiagu/findword.py b/jiagu/findword.py index ecf8674..68a987f 100644 --- a/jiagu/findword.py +++ b/jiagu/findword.py @@ -35,37 +35,28 @@ def count_words(input_file): def lrg_info(word_freq, total_word, min_freq, min_mtro): l_dict = {} r_dict = {} - k = 0 for word, freq in word_freq.items(): - k += 1 if len(word) < 3: continue left_word = word[:-1] - ml = word_freq[left_word] - if ml > min_freq: - mul_info1 = ml * total_word / (word_freq[left_word[1:]] * word_freq[left_word[0]]) - mul_info2 = ml * total_word / (word_freq[left_word[-1]] * word_freq[left_word[:-1]]) - mul_info = min(mul_info1, mul_info2) - - if mul_info > min_mtro: - if left_word in l_dict: - l_dict[left_word].append(freq) - else: - l_dict[left_word] = [ml, freq] - right_word = word[1:] - mr = word_freq[right_word] - if mr > min_freq: - mul_info1 = mr * total_word / (word_freq[right_word[1:]] * word_freq[right_word[0]]) - mul_info2 = mr * total_word / (word_freq[right_word[-1]] * word_freq[right_word[:-1]]) - mul_info = min(mul_info1, mul_info2) - - if mul_info > min_mtro: - if right_word in r_dict: - r_dict[right_word].append(freq) - else: - r_dict[right_word] = [mr, freq] + + def __update_dict(side_dict, side_word): + side_word_freq = word_freq[side_word] + if side_word_freq > min_freq: + mul_info1 = side_word_freq * total_word / (word_freq[side_word[1:]] * word_freq[side_word[0]]) + mul_info2 = side_word_freq * total_word / (word_freq[side_word[-1]] * word_freq[side_word[:-1]]) + mul_info = min(mul_info1, mul_info2) + if mul_info > min_mtro: + if side_word in side_dict: + side_dict[side_word].append(freq) + else: + side_dict[side_word] = [side_word_freq, freq] + + __update_dict(l_dict, left_word) + __update_dict(r_dict, right_word) + return l_dict, r_dict @@ -75,15 +66,11 @@ def cal_entro(r_dict): m_list = r_dict[word] r_list = m_list[1:] - fm = m_list[0] entro_r = 0 - krm = fm - sum(r_list) - if krm > 0: - entro_r -= 1 / fm * log(1 / fm, 2) * krm - + sum_r_list = sum(r_list) for rm in r_list: - entro_r -= rm / fm * log(rm / fm, 2) + entro_r -= rm / sum_r_list * log(rm / sum_r_list, 2) entro_r_dict[word] = entro_r return entro_r_dict @@ -104,21 +91,17 @@ def entro_lr_fusion(entro_r_dict, entro_l_dict): def entro_filter(entro_in_rl_dict, entro_in_l_dict, entro_in_r_dict, word_freq, min_entro): entro_dict = {} - l, r, rl = 0, 0, 0 for word in entro_in_rl_dict: if entro_in_rl_dict[word][0] > min_entro and entro_in_rl_dict[word][1] > min_entro: entro_dict[word] = word_freq[word] - rl += 1 for word in entro_in_l_dict: if entro_in_l_dict[word] > min_entro: entro_dict[word] = word_freq[word] - l += 1 for word in entro_in_r_dict: if entro_in_r_dict[word] > min_entro: entro_dict[word] = word_freq[word] - r += 1 return entro_dict