# Complete spam-mail classification functions.
import math
import os

import jieba

import count
import func
import get_mail
import ham_dict
import mail
import spam_dict

# Filtered tokens of the most recently tokenised mail; overwritten by
# count_words / count_words1 and read by ban().
malist = []
# Banned words that spam_filter passes to ban().
banlist = []

def bayes(worddict, spamdict, hamdict):
    """Classify a mail as spam or ham by comparing two naive-Bayes scores.

    Args:
        worddict: Words of the mail to score; only its keys are used.  It is
            expected to hold the 15 most frequent words of the mail.
        spamdict: Mapping of word -> count in the spam corpus.
        hamdict: Mapping of word -> count in the ham corpus.

    Returns:
        The string '这封邮件过短,是可疑邮件' when ``worddict`` has fewer than
        15 entries, ``False`` when the mail is judged to be spam, and ``True``
        otherwise.
    """
    # Reject short mails first: the verdict does not depend on the scores,
    # so there is no reason to compute (or fail while computing) them.
    if len(worddict) < 15:
        print('这封邮件过短,是可疑邮件')
        say = '这封邮件过短,是可疑邮件'
        return say

    # Words missing from a corpus are given a count of 1 so that the
    # logarithm below stays defined.
    spam_counts = {key: spamdict.get(key, 1) for key in worddict}
    ham_counts = {key: hamdict.get(key, 1) for key in worddict}

    # Total number of words in each corpus dictionary.
    spam_num = spam_dict.spamdict_count(spamdict)
    ham_num = ham_dict.hamdict_count(hamdict)

    # Pick the spam prior from the size of the ham corpus.  The chain is
    # exhaustive: totals the original thresholds left uncovered (exactly
    # 10000, or 500000..630000) fall into the last branch instead of leaving
    # the prior undefined.
    ham_total = int(ham_num)
    if ham_total < 10000:
        prob = 0.42
    elif ham_total > 630000:
        prob = 0.476
    else:
        prob = 0.479

    # Compare the scores in log space; exponentiating the sums first can
    # underflow to 0.0 on both sides and hide the real ordering.
    log_spam = math.log(prob) + sum(
        math.log(hits / spam_num) for hits in spam_counts.values()
    )
    log_ham = math.log(1 - prob) + sum(
        math.log(hits / ham_num) for hits in ham_counts.values()
    )

    if log_spam > log_ham:
        print('是垃圾邮件!')
        print(' ')
        return False
    print('非垃圾邮件')
    print(' ')
    return True

def get_probdict(worddict, spamdict, hamdict):
    """Compute, for every word of a mail, a per-word spam probability.

    For each word the relative frequency in the spam corpus (``pw_s``) and in
    the ham corpus (``pw_n``) is looked up and combined as
    ``pw_s / (pw_s + pw_n)``.  A word missing from one corpus uses a fixed
    fallback frequency for that corpus; a word missing from both gets a fixed
    probability.

    Args:
        worddict: Words of the mail; only its keys are used.
        spamdict: Mapping of word -> count in the spam corpus.
        hamdict: Mapping of word -> count in the ham corpus.

    Returns:
        A dict mapping each word of ``worddict`` to its spam probability.
    """
    # Fallback values used when a word is absent from a corpus.
    unseen_in_ham = 2.0409797330712506e-05
    unseen_in_spam = 4.517012197187658e-05
    unseen_in_both = 0.44419625422613

    # Corpus totals are computed at most once, and only if some word needs
    # them, instead of once per word.
    spam_total = None
    ham_total = None

    wordprobdict = {}
    for word in worddict:
        in_spam = word in spamdict
        in_ham = word in hamdict

        if not in_spam and not in_ham:
            wordprobdict[word] = unseen_in_both
            continue

        if in_spam:
            if spam_total is None:
                spam_total = spam_dict.spamdict_count(spamdict)
            pw_s = spamdict[word] / spam_total
        else:
            pw_s = unseen_in_spam

        if in_ham:
            if ham_total is None:
                ham_total = ham_dict.hamdict_count(hamdict)
            pw_n = hamdict[word] / ham_total
        else:
            pw_n = unseen_in_ham

        wordprobdict[word] = pw_s / (pw_s + pw_n)

    return wordprobdict

def bayes2(wordprobdict, spamdict, hamdict):
    """Combine per-word spam probabilities into one probability for the mail.

    The result is ``P / (P + Q)`` where ``P`` is the product of all per-word
    probabilities and ``Q`` the product of their complements.

    Args:
        wordprobdict: Mapping of word -> spam probability of that word.
        spamdict: Unused; kept so existing callers keep working.
        hamdict: Unused; kept so existing callers keep working.

    Returns:
        The combined spam probability.  An empty ``wordprobdict`` gives 0.5.
        When both products are zero (for example one word with probability 1
        and another with probability 0) the evidence cancels out and 0.5 is
        returned instead of dividing by zero.
    """
    ps_w = 1
    ps_n = 1
    for prob in wordprobdict.values():
        ps_w = ps_w * prob
        ps_n = ps_n * (1 - prob)

    total = ps_w + ps_n
    if total == 0:
        # Same neutral value an empty input produces.
        return 0.5
    return ps_w / total

def count_words(num3):
    """Tokenise one test mail and return its 15 most frequent words.

    The mail file is located with ``count.get_path(str(num3))``, decoded with
    ``mail.mail_api`` and tokenised with jieba.  Tokens that are stop words,
    blank, or rejected by ``func.filter_words`` are dropped.

    Side effects:
        Stores the file path in the module-level ``this_path`` and the
        filtered token list in the module-level ``malist``.

    Args:
        num3: Identifier of the mail; converted to ``str`` before the lookup.

    Returns:
        ``False`` when the resolved path is not a file; otherwise a dict of at
        most 15 ``word -> count`` entries in descending count order.
    """
    global this_path, malist

    mail_num = str(num3)
    stop_list = func.get_stop_list()
    new_dir = count.get_path(mail_num)

    # "Not a file" is the only unreadable case.  The failure return used to
    # hang off the per-token filter, which aborted at the first stop word.
    if not os.path.isfile(new_dir):
        return False

    with open(new_dir, "r") as f:
        strg = mail.mail_api(f)
    this_path = new_dir

    # Tokenise once; the same tokens feed both the filter and the counting.
    tokens = list(jieba.cut(strg))
    mail_list = [
        tok for tok in tokens
        if (tok not in stop_list) and tok.strip() != '' and func.filter_words(tok)
    ]
    malist = mail_list

    # func.get_diction builds the initial dictionary for the kept words; each
    # occurrence in the mail then adds one to that word's entry.
    mail_dict0 = func.get_diction(mail_list)
    for tok in tokens:
        if tok in mail_dict0:
            mail_dict0[tok] = int(mail_dict0[tok]) + 1

    # sorted() is stable, so ties keep their dictionary order.  Slice exactly
    # 15 entries (the previous loop let a 16th one through).
    ranked = sorted(mail_dict0.items(), key=lambda x: x[1], reverse=True)
    return dict(ranked[:15])

def count_words1():
    """Tokenise the mail from ``get_mail.get_content`` and return its top words.

    The content is tokenised with jieba.  Tokens that are stop words, blank,
    or rejected by ``func.filter_words`` are dropped.

    Side effects:
        Stores the filtered token list in the module-level ``malist``.

    Returns:
        A dict of at most 15 ``word -> count`` entries in descending count
        order.
    """
    global malist

    stop_list = func.get_stop_list()
    # Fetch and tokenise the content once so the filter and the counting are
    # guaranteed to see the same text.
    strg = get_mail.get_content()
    tokens = list(jieba.cut(strg))

    mail_list = [
        tok for tok in tokens
        if (tok not in stop_list) and tok.strip() != '' and func.filter_words(tok)
    ]
    malist = mail_list

    # func.get_diction builds the initial dictionary for the kept words; each
    # occurrence in the mail then adds one to that word's entry.
    mail_dict0 = func.get_diction(mail_list)
    for tok in tokens:
        if tok in mail_dict0:
            mail_dict0[tok] = int(mail_dict0[tok]) + 1

    # sorted() is stable, so ties keep their dictionary order.  Slice exactly
    # 15 entries (the previous loop let a 16th one through).
    ranked = sorted(mail_dict0.items(), key=lambda x: x[1], reverse=True)
    return dict(ranked[:15])

def this_mail():
    """Return the path of the mail file most recently opened by count_words.

    The value is read from the module-level ``this_path`` variable.
    NOTE(review): nothing visible here initialises ``this_path`` at import
    time, so calling this before count_words has opened a file presumably
    raises NameError -- confirm against callers.
    """
    return this_path

def ban(banlist):
    """Report whether the last tokenised mail contains a banned word.

    Args:
        banlist: Iterable of banned words.

    Returns:
        ``True`` when at least one entry of ``banlist`` also occurs in the
        module-level ``malist`` token list, otherwise ``False``.
    """
    return not set(malist).isdisjoint(banlist)

def spam_filter(mail_num):
    """Run the complete spam classification for one mail.

    Args:
        mail_num: Identifier of the mail, passed on to count_words.

    Returns:
        '邮件无法识别' when count_words yields no word dictionary; otherwise
        one of three verdict strings (banned content, spam, or normal mail).
        The verdict is also printed.
    """
    top_mail_dict = count_words(mail_num)
    # An empty dict is still a valid result and must be classified; only an
    # explicit failure marker (False, or a bare None) is rejected here.
    if top_mail_dict is False or top_mail_dict is None:
        return '邮件无法识别'

    wordprobdict = get_probdict(top_mail_dict, spam_dict.spam_dict, ham_dict.ham_dict)
    p = bayes2(wordprobdict, spam_dict.spam_dict, ham_dict.ham_dict)

    # A banned word overrides the probability; otherwise the mail is spam only
    # above the 0.9 threshold.
    if ban(banlist):
        verdict = '该邮件包含违禁内容,属于垃圾邮件!'
    elif p > 0.9:
        verdict = '该邮件为垃圾邮件!'
    else:
        verdict = '该邮件为正常邮件!'
    print(verdict)
    return verdict

# Source: https://www.cveoy.top/t/topic/foRB -- copyright belongs to the author; do not repost or scrape.

# Free AI: click here, no registration or login required.