import functools
import math

import nltk
import pymongo
from bson import json_util as jsonb
from bson.objectid import ObjectId
from gensim import corpora, models
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Client for the MongoDB server on localhost:27017; `db` is its `dump` database.
client = pymongo.MongoClient(host='localhost', port=27017)
db = client.dump

# TF-IDF

class TfIdf(object):
    """Score the words of one tokenized document with TF-IDF and print the top ones.

    Args:
        idf_dic: Trained mapping from word to IDF value.
        default_idf: IDF value used for words that are missing from ``idf_dic``.
        doc_list: The pre-processed document to extract keywords from, as a
            sequence of tokens.
        keyword_num: Number of top-scoring keywords to print.
    """

    def __init__(self, idf_dic, default_idf, doc_list, keyword_num):
        self.doc_list = doc_list
        self.idf_dic, self.default_idf = idf_dic, default_idf
        # Term frequencies are computed once, at construction time.
        self.tf_dic = self.get_tf_dic()
        self.keyword_num = keyword_num

    def get_tf_dic(self):
        """Return ``{word: occurrences / total tokens}`` for ``self.doc_list``.

        An empty document yields an empty dict (no division takes place).
        """
        counts = {}
        for word in self.doc_list:
            counts[word] = counts.get(word, 0.0) + 1.0

        tt_count = len(self.doc_list)
        return {word: count / tt_count for word, count in counts.items()}

    def get_tfidf(self):
        """Compute ``tf * idf`` per word and print the ``keyword_num`` best entries.

        Each keyword is printed as ``word/ score`` on its own line, followed by
        one blank line. Ordering is delegated to the module-level ``cmp``
        comparator, applied in descending order.
        """
        tfidf_dic = {}
        for word in self.doc_list:
            idf = self.idf_dic.get(word, self.default_idf)
            tf = self.tf_dic.get(word, 0)
            tfidf_dic[word] = tf * idf

        # Rank by TF-IDF and keep only the first keyword_num entries.
        ranked = sorted(tfidf_dic.items(), key=functools.cmp_to_key(cmp), reverse=True)
        for word, score in ranked[:self.keyword_num]:
            print(word + '/ ', end='')
            print(score)
        print()

# IDF statistics (training) routine

def train_idf(doc_list):
    """Train IDF values from a corpus of tokenized documents.

    Args:
        doc_list: Sequence of documents, each an iterable of hashable tokens.

    Returns:
        A tuple ``(idf_dic, default_idf)``. ``idf_dic`` maps each word to
        ``log(N / (1 + df))``, where ``N`` is the number of documents and ``df``
        the number of documents containing the word. ``default_idf`` is
        ``log(N / 1)``, i.e. the unsmoothed IDF of a word seen in exactly one
        document. For an empty corpus the result is ``({}, 0.0)``.
    """
    # Total number of documents.
    tt_count = len(doc_list)
    if tt_count == 0:
        # log(0) is undefined; with nothing to train on every word scores zero.
        return {}, 0.0

    # Number of documents each word appears in (set() counts a word once per doc).
    doc_freq = {}
    for doc in doc_list:
        for word in set(doc):
            doc_freq[word] = doc_freq.get(word, 0.0) + 1.0

    # Convert to IDF; the +1 in the denominator is the smoothing term.
    idf_dic = {word: math.log(tt_count / (1.0 + df)) for word, df in doc_freq.items()}

    # Words missing from the dictionary are treated as occurring in one document.
    default_idf = math.log(tt_count / 1.0)
    return idf_dic, default_idf

# Comparison function used to order the top-K keywords by score

def cmp(e1, e2):
    """Three-way comparison of two ``(word, score)`` pairs for ``functools.cmp_to_key``.

    Pairs are ordered by score first. When the scores are equal, the two words
    are concatenated in both orders (``e1[0] + e2[0]`` against
    ``e2[0] + e1[0]``) and those concatenations are compared.

    Args:
        e1: First ``(word, score)`` pair.
        e2: Second ``(word, score)`` pair.

    Returns:
        ``1`` if ``e1`` ranks above ``e2``, ``-1`` if below, ``0`` if equal.
    """
    diff = e1[1] - e2[1]
    if diff > 0:
        return 1
    if diff < 0:
        return -1

    # Equal scores: fall back to comparing the concatenated words.
    forward = e1[0] + e2[0]
    backward = e2[0] + e1[0]
    if forward > backward:
        return 1
    if forward == backward:
        return 0
    return -1

# Analyze the reviews of each hotel

def analyze_hotel_comments(hotel_name):
    """Print TF-IDF keywords for the stored reviews of one hotel.

    Looks up the document whose ``hotel_name`` field equals ``hotel_name`` in
    ``db.hotels``, tokenizes its ``comments`` field, removes punctuation, a few
    hand-picked words, English stop words and tokens of four characters or
    fewer, keeps only nouns, stems them, and passes the stems to
    ``tfidf_extract``.

    Args:
        hotel_name: Value to match against the ``hotel_name`` field.

    Raises:
        ValueError: If no document matches ``hotel_name``.
        KeyError: If the matching document has no ``comments`` field.
    """
    # Fetch the review document for the requested hotel.
    hotel = db.hotels.find_one({'hotel_name': hotel_name})
    if hotel is None:
        raise ValueError('no hotel document found for hotel_name=%r' % (hotel_name,))
    # NOTE(review): word_tokenize needs a single string here; confirm the
    # `comments` field is not stored as a list of reviews.
    comments = hotel['comments']

    # Tokenize, then drop punctuation, ignored words, stop words and short tokens.
    tokens = word_tokenize(comments)
    ignored = {
        ',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#',
        '$', '%', '}', '{',
        '``', "''",  # quote tokens produced by the tokenizer for double quotes
        'time', 'great',
    }
    stops = set(stopwords.words('english'))
    candidates = [
        word for word in tokens
        if word not in ignored and word not in stops and len(word) > 4
    ]

    # Keep nouns only (singular/plural, common/proper).
    noun_tags = {'NN', 'NNS', 'NNP', 'NNPS'}
    nouns = [word for word, tag in nltk.pos_tag(candidates) if tag in noun_tags]

    # Stem with a single shared stemmer instance.
    stemmer = PorterStemmer()
    doc_list = [stemmer.stem(noun) for noun in nouns]

    # Extract and print the keywords.
    print('关键词提取结果：')
    tfidf_extract(doc_list)

def tfidf_extract(doc_list, pos=False, keyword_num=25):
    """Print the top TF-IDF keywords of ``doc_list``.

    IDF values are trained on the corpus returned by ``load_data(pos)``; the
    term frequencies come from ``doc_list`` itself.

    Args:
        doc_list: Pre-processed tokens of the document to extract keywords from.
        pos: Forwarded unchanged to ``load_data``.
        keyword_num: Number of keywords to print.
    """
    # The corpus gets its own name so the caller's document is not overwritten
    # before it is scored.
    # NOTE(review): load_data is not defined in the visible part of this file;
    # it must be provided elsewhere for this call to succeed.
    corpus = load_data(pos)

    idf_dic, default_idf = train_idf(corpus)
    tfidf_model = TfIdf(idf_dic, default_idf, doc_list, keyword_num)
    tfidf_model.get_tfidf()

if __name__ == '__main__':
    # Names of the hotels whose reviews should be analyzed.
    hotel_names = ['hotel1', 'hotel2', 'hotel3']
    for hotel_name in hotel_names:
        analyze_hotel_comments(hotel_name)

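# Hotel review keyword extraction: applying the TF-IDF algorithm with a MongoDB connection
#
# Outline:
#   - TF-IDF
#
#   - IDF statistics (training) routine
#
#   - Comparison function used to order the top-K keywords by score
#
#   - Analyze the reviews of each hotel
#
# Code fragment that appears to be carried over from the outline (not executed here):
#   doc_list = load_data(pos)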