# 垃圾邮件分类器：基于贝叶斯算法和词频统计

# 完整的垃圾邮件分类函数

import os
import jieba
import func
import spam_dict
import ham_dict
import math
import count
import mail
import get_mail

# Filtered tokens of the most recently processed mail: assigned by
# count_words() / count_words1() and read by ban().
malist=[] 
# Banned-word list that spam_filter() passes to ban(); empty by default.
banlist=[] 

def bayes(worddict,spamdict,hamdict):
    """Classify a mail as spam or ham by comparing naive-Bayes scores.

    Args:
        worddict: the most frequent words of the mail; only its keys are
            used, and at least 15 entries are required.
        spamdict: mapping word -> count in the spam corpus.
        hamdict: mapping word -> count in the ham corpus.

    Returns:
        False when the mail is judged to be spam, True when it is judged to
        be ham, or the message string '这封邮件过短，是可疑邮件' when
        ``worddict`` has fewer than 15 entries.  The verdict is also printed.
    """
    # Reject short mails first, before any corpus statistics are computed.
    if len(worddict)<15:
        say='这封邮件过短，是可疑邮件'
        print(say)
        return say

    # Total word counts of the two corpora.
    spam_num=spam_dict.spamdict_count(spamdict)
    ham_num=ham_dict.hamdict_count(hamdict)

    # Prior probability of spam, selected by the size of the ham corpus.
    # The previous thresholds left gaps (exactly 10000 and 500000..630000)
    # in which no prior was assigned and the function failed with
    # UnboundLocalError; the middle bracket now covers those sizes.
    # NOTE(review): confirm 0.479 is the intended prior for the former gaps.
    ham_size=int(ham_num)
    if ham_size<10000:
        prob=0.42
    elif ham_size>630000:
        prob=0.476
    else:
        prob=0.479

    # Accumulate log-likelihoods; a word absent from a corpus counts as if it
    # had been seen once.  Scores stay in log space so that long products of
    # small probabilities cannot underflow before they are compared.
    log_spam=math.log(prob)
    log_ham=math.log(1-prob)
    for key in worddict:
        spam_count=spamdict[key] if key in spamdict else 1
        ham_count=hamdict[key] if key in hamdict else 1
        log_spam+=math.log(spam_count/spam_num)
        log_ham+=math.log(ham_count/ham_num)

    if log_spam>log_ham:
        print('是垃圾邮件！')
        print(' ')
        return False

    print('非垃圾邮件')
    print(' ')
    return True

def get_probdict(worddict,spamdict,hamdict):
    """Compute, for each word of the mail, the probability that it indicates spam.

    Args:
        worddict: the words of the mail to score; only its keys are used.
        spamdict: mapping word -> count in the spam corpus.
        hamdict: mapping word -> count in the ham corpus.

    Returns:
        A dict mapping every word of ``worddict`` to
        ``pw_s / (pw_s + pw_n)``, where ``pw_s`` / ``pw_n`` are the word's
        relative frequencies in the spam / ham corpus.  Fixed fallback values
        are used for words missing from one or both corpora.
    """
    # Fallbacks for words that a corpus has never seen.
    # NOTE(review): presumably derived from the training corpora — confirm.
    unseen_in_ham=2.0409797330712506e-05
    unseen_in_spam=4.517012197187658e-05
    unseen_in_both=0.44419625422613

    # The corpus totals do not change while iterating, so each one is
    # computed at most once, and only if some word actually needs it.
    spam_total=None
    ham_total=None

    wordprobdict={}
    for word in worddict:
        in_spam=word in spamdict
        in_ham=word in hamdict

        if not in_spam and not in_ham:
            wordprobdict[word]=unseen_in_both
            continue

        if in_spam:
            if spam_total is None:
                spam_total=spam_dict.spamdict_count(spamdict)
            pw_s=spamdict[word]/spam_total
        else:
            pw_s=unseen_in_spam

        if in_ham:
            if ham_total is None:
                ham_total=ham_dict.hamdict_count(hamdict)
            pw_n=hamdict[word]/ham_total
        else:
            pw_n=unseen_in_ham

        wordprobdict[word]=pw_s/(pw_s+pw_n)

    return wordprobdict

def bayes2(wordprobdict,spamdict,hamdict):
    """Combine the per-word spam probabilities into one probability for the mail.

    Multiplies all per-word probabilities (spam side) and all of their
    complements (ham side) and returns the spam side's share of the two
    products.  ``spamdict`` and ``hamdict`` are accepted but not used.
    """
    spam_product=1
    ham_product=1

    for word_prob in wordprobdict.values():
        spam_product*=word_prob
        ham_product*=1-word_prob

    return spam_product/(spam_product+ham_product)

def count_words(num3,top_n=16):
    """Tokenize one stored test mail and return its most frequent words.

    The mail file is located with ``count.get_path``, read through
    ``mail.mail_api`` and tokenized with jieba.  Tokens that are stop words,
    blank, or rejected by ``func.filter_words`` are dropped; the remaining
    tokens are stored in the module-level ``malist`` and the file path in
    ``this_path``.

    Args:
        num3: identifier of the mail; converted to ``str`` for the lookup.
        top_n: non-negative number of most frequent words to return.  The
            default of 16 is what the previous hard-coded loop produced (it
            stopped once its counter exceeded 15).
            NOTE(review): bayes() requires at least 15 words — confirm
            whether 15 or 16 is the intended size.

    Returns:
        A dict word -> count holding up to ``top_n`` words in descending
        order of count, or False when the path is not an existing file.
    """
    global this_path,malist

    new_dir=count.get_path(str(num3))
    # The "unrecognizable mail" signal belongs to a missing file.  It used to
    # be attached to the token filter instead, so the function gave up at the
    # first stop word or blank token of practically every mail.
    if not os.path.isfile(new_dir):
        return False

    stop_list=func.get_stop_list()
    with open(new_dir,'r') as f:
        strg=mail.mail_api(f)
    this_path=new_dir

    # Tokenize once; the same tokens feed both the filter and the counting.
    tokens=list(jieba.cut(strg))
    mail_list=[
        tok for tok in tokens
        if (tok not in stop_list) and tok.strip()!='' and func.filter_words(tok)
    ]
    malist=mail_list

    # func.get_diction builds the initial dictionary for the kept words;
    # every occurrence among the tokens then increments its entry.
    mail_dict0=func.get_diction(mail_list)
    for tok in tokens:
        if tok in mail_dict0:
            mail_dict0[tok]=int(mail_dict0[tok])+1

    ranked=sorted(mail_dict0.items(),key=lambda x:x[1],reverse=True)
    return dict(ranked[:top_n])

def count_words1(top_n=16):
    """Tokenize the mail supplied by ``get_mail.get_content`` and return its most frequent words.

    Tokens that are stop words, blank, or rejected by ``func.filter_words``
    are dropped; the remaining tokens are stored in the module-level
    ``malist``.

    Args:
        top_n: non-negative number of most frequent words to return.  The
            default of 16 is what the previous hard-coded loop produced (it
            stopped once its counter exceeded 15).
            NOTE(review): bayes() requires at least 15 words — confirm
            whether 15 or 16 is the intended size.

    Returns:
        A dict word -> count holding up to ``top_n`` words in descending
        order of count.
    """
    global malist

    stop_list=func.get_stop_list()
    # Fetch and tokenize the content a single time so that the vocabulary and
    # the frequency count are guaranteed to come from the same text.
    strg=get_mail.get_content()
    tokens=list(jieba.cut(strg))

    mail_list=[
        tok for tok in tokens
        if (tok not in stop_list) and tok.strip()!='' and func.filter_words(tok)
    ]
    malist=mail_list

    # func.get_diction builds the initial dictionary for the kept words;
    # every occurrence among the tokens then increments its entry.
    mail_dict0=func.get_diction(mail_list)
    for tok in tokens:
        if tok in mail_dict0:
            mail_dict0[tok]=int(mail_dict0[tok])+1

    ranked=sorted(mail_dict0.items(),key=lambda x:x[1],reverse=True)
    return dict(ranked[:top_n])

def this_mail():
    """Return the path of the mail file most recently read by count_words().

    Returns:
        The value of the module-level ``this_path``, or None when it has not
        been set yet (previously that case raised NameError, because the
        global only comes into existence once a mail file has been read).
    """
    return globals().get('this_path')

def ban(banlist):
    """Report whether the last tokenized mail contains a banned word.

    Compares the module-level ``malist`` with ``banlist`` and returns True
    when they share at least one element, otherwise False.
    """
    shared_words=set(malist) & set(banlist)
    return bool(shared_words)

def spam_filter(mail_num):
    """Run the complete spam classification for the stored mail ``mail_num``.

    The mail is tokenized with count_words().  A mail that shares a word with
    the module-level ``banlist`` is reported as spam immediately; otherwise
    the per-word probabilities from get_probdict() are combined by bayes2()
    and the mail is reported as spam when the result exceeds 0.9.

    Returns:
        One of the verdict strings '邮件无法识别',
        '该邮件包含违禁内容，属于垃圾邮件！', '该邮件为垃圾邮件！' or
        '该邮件为正常邮件！'.  Every verdict except the first is also printed.
    """
    top_mail_dict=count_words(mail_num)
    # count_words() reports an unusable mail with False.  None is handled the
    # same way so that a result without a word dict can never reach the
    # probability code (where it would fail with AttributeError).
    if top_mail_dict is None or top_mail_dict is False:
        return '邮件无法识别'

    # The ban list overrides the statistical verdict, so it is checked first
    # and the probability is only computed when it is actually needed.
    if ban(banlist):
        verdict='该邮件包含违禁内容，属于垃圾邮件！'
    else:
        wordprobdict=get_probdict(top_mail_dict,spam_dict.spam_dict,ham_dict.ham_dict)
        p=bayes2(wordprobdict,spam_dict.spam_dict,ham_dict.ham_dict)
        if p>0.9:
            verdict='该邮件为垃圾邮件！'
        else:
            verdict='该邮件为正常邮件！'

    print(verdict)
    return verdict