Browse Source

optimize findwords

master
dirtdust GitHub 6 years ago
parent
commit
889d35d415
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 18 additions and 35 deletions
  1. +18
    -35
      jiagu/findword.py

+ 18
- 35
jiagu/findword.py View File

@@ -35,37 +35,28 @@ def count_words(input_file):
def lrg_info(word_freq, total_word, min_freq, min_mtro):
    """Collect, per candidate string, the frequencies of its one-char extensions.

    Every key ``word`` of ``word_freq`` with at least three characters
    contributes its frequency to two candidates: its prefix ``word[:-1]``
    (recorded in the first returned dict) and its suffix ``word[1:]``
    (recorded in the second returned dict).

    A candidate ``s`` is only recorded when both hold:

    * ``word_freq[s] > min_freq``
    * ``min(score_head, score_tail) > min_mtro`` where
      ``score_head = word_freq[s] * total_word / (word_freq[s[1:]] * word_freq[s[0]])``
      and
      ``score_tail = word_freq[s] * total_word / (word_freq[s[-1]] * word_freq[s[:-1]])``.
      (NOTE(review): this looks like a mutual-information-style cohesion
      ratio; confirm against the caller's intent.)

    Args:
        word_freq: Mapping from string to its frequency. Sub-strings of each
            candidate are looked up with ``word_freq[...]``, so with a plain
            ``dict`` a missing sub-string raises ``KeyError``.
        total_word: Numerator scaling factor used in both scores.
        min_freq: Exclusive lower bound on a candidate's own frequency.
        min_mtro: Exclusive lower bound on the smaller of the two scores.

    Returns:
        A ``(l_dict, r_dict)`` tuple. Each dict maps a candidate to a list
        whose first element is the candidate's own frequency, followed by the
        frequency of every longer word that produced it, in iteration order
        of ``word_freq``.
    """
    l_dict = {}
    r_dict = {}

    def _update_dict(side_dict, side_word, freq):
        # Defined once, outside the loop, so it is not rebuilt per word.
        side_word_freq = word_freq[side_word]
        if side_word_freq <= min_freq:
            return

        # Score the candidate against both ways of splitting off one char:
        # first char + rest, and everything-but-last + last char.
        mul_info1 = side_word_freq * total_word / (
            word_freq[side_word[1:]] * word_freq[side_word[0]]
        )
        mul_info2 = side_word_freq * total_word / (
            word_freq[side_word[-1]] * word_freq[side_word[:-1]]
        )
        if min(mul_info1, mul_info2) <= min_mtro:
            return

        # First sighting seeds the list with the candidate's own frequency;
        # every sighting then appends the extending word's frequency.
        side_dict.setdefault(side_word, [side_word_freq]).append(freq)

    for word, freq in word_freq.items():
        if len(word) < 3:
            continue
        _update_dict(l_dict, word[:-1], freq)
        _update_dict(r_dict, word[1:], freq)

    return l_dict, r_dict


@@ -75,15 +66,11 @@ def cal_entro(r_dict):
m_list = r_dict[word]

r_list = m_list[1:]
fm = m_list[0]

entro_r = 0
krm = fm - sum(r_list)
if krm > 0:
entro_r -= 1 / fm * log(1 / fm, 2) * krm

sum_r_list = sum(r_list)
for rm in r_list:
entro_r -= rm / fm * log(rm / fm, 2)
entro_r -= rm / sum_r_list * log(rm / sum_r_list, 2)
entro_r_dict[word] = entro_r

return entro_r_dict
@@ -104,21 +91,17 @@ def entro_lr_fusion(entro_r_dict, entro_l_dict):

def entro_filter(entro_in_rl_dict, entro_in_l_dict, entro_in_r_dict, word_freq, min_entro):
    """Keep the words whose entropy values all exceed ``min_entro``.

    Args:
        entro_in_rl_dict: Mapping from word to an indexable pair of entropy
            values; the word is kept only when both element ``0`` and
            element ``1`` are greater than ``min_entro``.
        entro_in_l_dict: Mapping from word to a single entropy value.
        entro_in_r_dict: Mapping from word to a single entropy value.
        word_freq: Mapping from word to frequency; indexed with
            ``word_freq[word]`` for every word that passes the filter.
        min_entro: Exclusive lower bound applied to every entropy value.

    Returns:
        A dict mapping each surviving word to ``word_freq[word]``. Words are
        inserted in the order: ``entro_in_rl_dict``, then ``entro_in_l_dict``,
        then ``entro_in_r_dict``.
    """
    entro_dict = {}

    # Two-sided entries must clear the threshold on both values.
    for word, entro_pair in entro_in_rl_dict.items():
        if entro_pair[0] > min_entro and entro_pair[1] > min_entro:
            entro_dict[word] = word_freq[word]

    # One-sided entries are filtered on their single value.
    for one_sided in (entro_in_l_dict, entro_in_r_dict):
        for word, entro in one_sided.items():
            if entro > min_entro:
                entro_dict[word] = word_freq[word]

    return entro_dict



Loading…
Cancel
Save