from math import log2 def file_deal(dir_name): with open(dir_name,'r',encoding="utf8") as F: File_line=F.readlines() # readlines don't forget "s" for line in File_line: if len(line)<5: continue yield line[9:-11] def compute_entropy_less_memory(): ''' 减少了18%的内存使用:48%->30%; 结束后11%;8G 语料文本大小:1.6 G ''' print("start...") dir_name="/path(...)/corpus.txt" str_list=[] word_freq_table=dict() Global_entropy,all_num=0,0 print("table build start ...;") for line in file_deal(dir_name): for word in line: if word is '': continue if word in word_freq_table: word_freq_table[word]+=1 else: word_freq_table[word]=1 word_num=len(word_freq_table) print("词频表的长度为",word_num) print("计算分母...") for i in word_freq_table.values(): all_num+=i print("计算熵...") entropy=lambda freq :-(freq/all_num)*log2(freq/all_num) for freq_num in word_freq_table.values(): Global_entropy+=entropy(freq_num) print("the entropy is : ",Global_entropy) compute_entropy_less_memory()
def compute_entropy(dir_name="/path/corpus.txt"):
    """Compute the per-character Shannon entropy of a corpus, in memory.

    Baseline (memory-hungry) counterpart of the streaming version: it
    reads the whole file with readlines() and keeps every stripped line
    in a list before counting — intentionally kept that way so the two
    variants' memory footprints can be compared.

    Parameters:
        dir_name: path to the corpus file (defaults to the original
            hard-coded path, so existing callers are unaffected).

    Returns:
        The entropy in bits (previously only printed).

    Note: prints a 10-char sample of the second stripped line, so the
    corpus is assumed to contain at least two usable lines.
    """
    print("start...")
    str_list = []
    word_freq_table = {}
    global_entropy = 0.0
    with open(dir_name, 'r', encoding="utf8") as fh:
        # readlines(), deliberately: this is the "all in memory" baseline.
        for line in fh.readlines():
            if len(line) < 5:
                continue
            # Strip the corpus-specific framing (first 9 / last 11 chars).
            str_list.append(line[9:-11])
    print("file read end;", str_list[1][:10])
    for line in str_list:
        for word in line:
            # Bug fix: the original used `word is ''` (identity, not
            # equality). A single character is never '' anyway.
            if word == '':
                continue
            word_freq_table[word] = word_freq_table.get(word, 0) + 1
    print("table build end;")
    print("词频表的长度为", len(word_freq_table))
    print("计算分母...")
    all_num = sum(word_freq_table.values())
    print("计算熵...")
    for freq_num in word_freq_table.values():
        p = freq_num / all_num
        global_entropy -= p * log2(p)
    print("the entropy is : ", global_entropy)
    return global_entropy


if __name__ == "__main__":
    # Guarded so importing this module does not read the corpus path.
    compute_entropy()
词频表的长度为 9124
the entropy is : 9.561760817055223
词频表的长度为 105
the entropy is : 4.796600336973779
词频表的长度为 2620
the entropy is : 5.05736257154819
每个bit都有0|1两种可能。要覆盖八种情况就需要3bit（2^3 = 8）
再用每种情况的概率乘以其空间需求，即为实际（期望）空间需求
最大熵：使熵值最大的概率分布最真实地反映了事件的分布情况