# Page preview (garbled excerpt of the listing below): malist= banlist= #################################### #贝叶斯的实现函数 def bayesworddictspamdicthamdict#此处的worddict应为词频前15个单词的dict#首先对垃圾邮件进行处理dict0=dict1=for key in worddict if key in spamdict #pri
# Complete spam-mail classification routines.
import math
import os

import jieba

import count
import func
import get_mail
import ham_dict
import mail
import spam_dict
# Filtered tokens of the most recently analysed mail. Rebound by count_words()
# and count_words1(); read by ban().
malist = []
# Banned words that spam_filter() passes to ban(); empty by default.
banlist = []
def bayes(worddict, spamdict, hamdict):
    """Classify a mail by comparing naive-Bayes scores for spam and ham.

    Args:
        worddict: The mail's most frequent words (expected to hold at least
            15 entries); only its keys are used.
        spamdict: Mapping of word -> count for the spam corpus.
        hamdict: Mapping of word -> count for the ham corpus.

    Returns:
        The string '这封邮件过短,是可疑邮件' when ``worddict`` has fewer than 15
        entries, ``False`` when the mail is judged to be spam, and ``True``
        otherwise. The verdict is also printed.
    """
    # A mail with too few words cannot be scored; answer before doing any
    # arithmetic so a short mail never trips over the computations below.
    if len(worddict) < 15:
        say = '这封邮件过短,是可疑邮件'
        print(say)
        return say

    # Total number of words in each corpus dictionary.
    spam_num = spam_dict.spamdict_count(spamdict)
    ham_num = ham_dict.hamdict_count(hamdict)

    # Prior probability of spam, chosen from the size of the ham corpus.
    # The original thresholds left prob unassigned for a total of exactly
    # 10000 and for 500000..630000; those gaps now use the mid-range value.
    ham_total = int(ham_num)
    if ham_total < 10000:
        prob = 0.42
    elif ham_total > 630000:
        prob = 0.476
    else:
        prob = 0.479

    # Accumulate log-likelihoods. A word missing from a corpus gets a
    # pseudo-count of 1 so that log() stays defined. The scores are compared
    # in log space, which gives the same ordering as comparing the products
    # but cannot underflow to 0.
    log_spam = math.log(prob)
    log_ham = math.log(1 - prob)
    for word in worddict:
        log_spam += math.log(spamdict.get(word, 1) / spam_num)
        log_ham += math.log(hamdict.get(word, 1) / ham_num)

    if log_spam > log_ham:
        print('是垃圾邮件!')
        print(' ')
        return False
    print('非垃圾邮件')
    print(' ')
    return True
def get_probdict(worddict, spamdict, hamdict):
    """Compute, for every word of a mail, the probability that it indicates spam.

    For each word the per-corpus likelihoods ``pw_s`` (spam) and ``pw_n``
    (ham) are formed as count / corpus total, and the result is
    ``pw_s / (pw_s + pw_n)``.

    Args:
        worddict: Words of the mail; only its keys are used.
        spamdict: Mapping of word -> count for the spam corpus.
        hamdict: Mapping of word -> count for the ham corpus.

    Returns:
        dict mapping each word of ``worddict`` to its spam probability.
    """
    # Fallback values kept from the original implementation: the likelihood
    # used for a word missing from one corpus, and the spam probability of a
    # word missing from both. Presumably derived from the training corpora --
    # TODO confirm.
    unseen_in_ham = 2.0409797330712506e-05
    unseen_in_spam = 4.517012197187658e-05
    unseen_in_both = 0.44419625422613

    # The corpus totals do not depend on the word, so compute them once
    # instead of once per word.
    spam_total = spam_dict.spamdict_count(spamdict)
    ham_total = ham_dict.hamdict_count(hamdict)

    wordprobdict = {}
    for word in worddict:
        in_spam = word in spamdict
        in_ham = word in hamdict
        if not in_spam and not in_ham:
            wordprobdict[word] = unseen_in_both
            continue
        pw_s = spamdict[word] / spam_total if in_spam else unseen_in_spam
        pw_n = hamdict[word] / ham_total if in_ham else unseen_in_ham
        wordprobdict[word] = pw_s / (pw_s + pw_n)
    return wordprobdict
def bayes2(wordprobdict, spamdict, hamdict):
    """Combine per-word spam probabilities into one probability for the mail.

    Mathematically this is ``prod(p) / (prod(p) + prod(1 - p))``. It is
    evaluated as a logistic function of the summed log-odds, so a long list
    of words cannot underflow both products to 0 and raise ZeroDivisionError.

    Args:
        wordprobdict: Mapping of word -> probability that the word indicates
            spam; only the values are used.
        spamdict: Unused; kept so existing callers keep working.
        hamdict: Unused; kept so existing callers keep working.

    Returns:
        float in [0, 1]. An empty ``wordprobdict`` yields 0.5. When the
        evidence is contradictory (some word has probability <= 0 and another
        >= 1) the result is the uninformative 0.5.
    """
    certainly_ham = False
    certainly_spam = False
    log_odds = 0.0
    for prob in wordprobdict.values():
        if prob <= 0:
            # A zero factor wipes out the spam product.
            certainly_ham = True
        elif prob >= 1:
            # A zero factor wipes out the ham product.
            certainly_spam = True
        else:
            log_odds += math.log(prob) - math.log(1 - prob)

    if certainly_ham and certainly_spam:
        return 0.5
    if certainly_ham:
        return 0.0
    if certainly_spam:
        return 1.0

    # Numerically stable logistic: exp() is only ever given a non-positive
    # argument, so it cannot overflow.
    if log_odds >= 0:
        return 1 / (1 + math.exp(-log_odds))
    odds = math.exp(log_odds)
    return odds / (1 + odds)
def count_words(num3):
    """Count word frequencies of one test mail and return its top 15 words.

    The mail's path is obtained from ``count.get_path(str(num3))``. On success
    the module globals ``this_path`` (path of the mail) and ``malist`` (its
    filtered tokens) are updated as a side effect.

    Args:
        num3: Identifier of the mail; converted to ``str`` before the lookup.

    Returns:
        ``False`` when the resolved path is not a regular file. Otherwise a
        dict of at most 15 ``word -> count`` entries, ordered by descending
        count.
    """
    global this_path, malist

    stop_list = func.get_stop_list()
    new_dir = count.get_path(str(num3))
    if not os.path.isfile(new_dir):
        return False

    # NOTE(review): the file is opened with the platform's default encoding,
    # as in the original code -- confirm this matches the mail corpus.
    with open(new_dir, "r") as f:
        strg = mail.mail_api(f)
    this_path = new_dir

    # Tokenise once; the same tokens feed both the filtering and the counting
    # pass (the original re-read and re-tokenised the file for the second).
    tokens = list(jieba.cut(strg))
    mail_list = [
        tok for tok in tokens
        if tok not in stop_list and tok.strip() != '' and func.filter_words(tok)
    ]
    malist = mail_list

    # Initialise the frequency dictionary, then count every occurrence.
    mail_dict0 = func.get_diction(mail_list)
    for tok in tokens:
        if tok in mail_dict0:
            mail_dict0[tok] = int(mail_dict0[tok]) + 1

    # Stable sort by descending count; the slice keeps exactly 15 entries
    # (the original loop tested its counter after inserting and so let a 16th
    # entry through).
    ranked = sorted(mail_dict0.items(), key=lambda x: x[1], reverse=True)
    return dict(ranked[:15])
def count_words1():
    """Count word frequencies of the fetched mail and return its top 15 words.

    The mail text comes from ``get_mail.get_content()``. The module global
    ``malist`` is set to the mail's filtered tokens as a side effect.

    Returns:
        dict of at most 15 ``word -> count`` entries, ordered by descending
        count.
    """
    global malist

    stop_list = func.get_stop_list()

    # Fetch and tokenise the content once; the same tokens feed both the
    # filtering and the counting pass (the original fetched it twice).
    tokens = list(jieba.cut(get_mail.get_content()))
    mail_list = [
        tok for tok in tokens
        if tok not in stop_list and tok.strip() != '' and func.filter_words(tok)
    ]
    malist = mail_list

    # Initialise the frequency dictionary, then count every occurrence.
    mail_dict0 = func.get_diction(mail_list)
    for tok in tokens:
        if tok in mail_dict0:
            mail_dict0[tok] = int(mail_dict0[tok]) + 1

    # Stable sort by descending count; the slice keeps exactly 15 entries
    # (the original loop tested its counter after inserting and so let a 16th
    # entry through).
    ranked = sorted(mail_dict0.items(), key=lambda x: x[1], reverse=True)
    return dict(ranked[:15])
def this_mail():
    """Return the path of the mail most recently loaded by count_words().

    Returns:
        The value of the module global ``this_path``, or ``None`` when no mail
        has been loaded yet (the global is only created by a successful
        ``count_words`` call, so a plain read would raise NameError).
    """
    return globals().get("this_path")
def ban(banlist):
    """Tell whether the current mail contains at least one banned word.

    Args:
        banlist: Iterable of banned words.

    Returns:
        ``True`` when the module-level ``malist`` (tokens of the last analysed
        mail) shares at least one element with ``banlist``, else ``False``.
    """
    shared_words = set(malist) & set(banlist)
    return bool(shared_words)
def spam_filter(mail_num):
    """Run the complete spam classification for one mail.

    Args:
        mail_num: Identifier of the mail, forwarded to ``count_words``.

    Returns:
        One of four verdict strings: '邮件无法识别' when the mail cannot be
        loaded, '该邮件包含违禁内容,属于垃圾邮件!' when it contains a banned
        word, '该邮件为垃圾邮件!' when its spam probability exceeds 0.9, and
        '该邮件为正常邮件!' otherwise. Every verdict except the first is also
        printed.
    """
    top_mail_dict = count_words(mail_num)
    # count_words signals an unreadable mail with the False singleton.
    if top_mail_dict is False:
        return '邮件无法识别'

    # A banned word decides the verdict on its own, so check it before
    # spending time on the probability computation.
    if ban(banlist):
        verdict = '该邮件包含违禁内容,属于垃圾邮件!'
        print(verdict)
        return verdict

    wordprobdict = get_probdict(top_mail_dict, spam_dict.spam_dict, ham_dict.ham_dict)
    p = bayes2(wordprobdict, spam_dict.spam_dict, ham_dict.ham_dict)
    if p > 0.9:
        verdict = '该邮件为垃圾邮件!'
    else:
        verdict = '该邮件为正常邮件!'
    print(verdict)
    return verdict
# 原文地址: https://www.cveoy.top/t/topic/foRB 著作权归作者所有。请勿转载和采集!