# Hotel review keyword extraction: applying the TF-IDF algorithm with a MongoDB database connection
import functools
import math

import nltk
import pymongo
from bson import json_util as jsonb
from bson.objectid import ObjectId
from gensim import corpora, models
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# Module-level MongoDB handle shared by the functions below.
# The two assignments were fused onto one line, which is not valid Python.
client = pymongo.MongoClient(host='localhost', port=27017)
# `dump` is the database name; `analyze_hotel_comments` reads its `hotels` collection.
db = client.dump
# TF-IDF
class TfIdf(object):
    """Rank the words of one tokenised document by TF-IDF score.

    Args:
        idf_dic: Pre-trained mapping of word -> IDF value.
        default_idf: IDF value used for words missing from ``idf_dic``.
        doc_list: The pre-processed document to extract keywords from,
            as a sequence of word tokens.
        keyword_num: Number of top-scoring keywords to report.
    """

    def __init__(self, idf_dic, default_idf, doc_list, keyword_num):
        self.doc_list = doc_list
        self.idf_dic, self.default_idf = idf_dic, default_idf
        # Term frequencies are computed once, at construction time.
        self.tf_dic = self.get_tf_dic()
        self.keyword_num = keyword_num

    def get_tf_dic(self):
        """Return a dict mapping each word of ``doc_list`` to its term frequency.

        The term frequency is the word's occurrence count divided by the
        total number of tokens.  An empty document yields an empty dict.
        """
        counts = {}
        for word in self.doc_list:
            counts[word] = counts.get(word, 0) + 1
        total = len(self.doc_list)
        # No division happens when `counts` is empty, so total == 0 is safe.
        return {word: count / total for word, count in counts.items()}

    def get_tfidf(self):
        """Print the ``keyword_num`` highest-scoring words with their scores.

        Each word's score is ``tf * idf``; the IDF falls back to
        ``default_idf`` for words absent from ``idf_dic``.  Entries are
        ordered with the module-level ``cmp`` comparator, highest first.
        Output is one ``word/ score`` line per keyword followed by a blank
        line.  Returns ``None``.
        """
        tfidf_dic = {
            word: tf * self.idf_dic.get(word, self.default_idf)
            for word, tf in self.tf_dic.items()
        }
        # Sort by TF-IDF and keep the top `keyword_num` words as keywords.
        ranked = sorted(
            tfidf_dic.items(), key=functools.cmp_to_key(cmp), reverse=True
        )
        for word, score in ranked[:self.keyword_num]:
            print(word + '/ ', end='')
            print(score)
        print()
# IDF training: compute inverse document frequencies from a corpus
def train_idf(doc_list):
    """Compute smoothed IDF values from a corpus of tokenised documents.

    Args:
        doc_list: The corpus, as a sequence of documents; each document is
            an iterable of word tokens.

    Returns:
        A tuple ``(idf_dic, default_idf)``.  ``idf_dic`` maps each word to
        ``log(N / (1 + df))`` where ``N`` is the number of documents and
        ``df`` the number of documents containing the word.  ``default_idf``
        is ``log(N)``, the value used for words not in the dictionary.

    Raises:
        ValueError: If ``doc_list`` is empty, because ``log(0)`` is undefined.
    """
    # Total number of documents.
    tt_count = len(doc_list)
    if tt_count == 0:
        raise ValueError('train_idf() requires at least one document')

    # Number of documents each word appears in; set() counts a word once
    # per document regardless of how often it repeats inside it.
    doc_freq = {}
    for doc in doc_list:
        for word in set(doc):
            doc_freq[word] = doc_freq.get(word, 0.0) + 1.0

    # Convert to IDF; the +1 in the denominator is a smoothing term.
    idf_dic = {
        word: math.log(tt_count / (1.0 + freq))
        for word, freq in doc_freq.items()
    }

    # Default IDF for words missing from the dictionary.
    default_idf = math.log(tt_count / 1.0)
    return idf_dic, default_idf
# Comparison function used to sort the top-K keywords by value
def cmp(e1, e2):
    """Three-way comparator for ``(word, score)`` pairs.

    Intended for ``functools.cmp_to_key``.  Pairs are ordered by score
    first; equal scores are tie-broken by comparing the two concatenations
    ``e1[0] + e2[0]`` and ``e2[0] + e1[0]``.

    Args:
        e1: First ``(word, score)`` pair.
        e2: Second ``(word, score)`` pair.

    Returns:
        ``1`` if ``e1`` sorts after ``e2``, ``-1`` if before, ``0`` if equal.
    """
    # Plain comparisons replace the per-call numpy import used only for sign().
    diff = e1[1] - e2[1]
    if diff > 0:
        return 1
    if diff < 0:
        return -1

    # Equal scores: fall back to the concatenation-order tie-break.
    a = e1[0] + e2[0]
    b = e2[0] + e1[0]
    if a > b:
        return 1
    if a == b:
        return 0
    return -1
# Analyse the reviews of each hotel
def analyze_hotel_comments(hotel_name):
    """Extract and print keywords from one hotel's stored reviews.

    Looks up the hotel document in ``db.hotels`` by ``hotel_name``,
    tokenises its ``comments`` field, drops ignored tokens, English stop
    words and words of four characters or fewer, keeps nouns only, stems
    them, and hands the result to ``tfidf_extract``.

    Args:
        hotel_name: Value matched against the ``hotel_name`` field.

    Raises:
        LookupError: If no document matches ``hotel_name``.  A document
            without a ``comments`` key raises ``KeyError``, which is a
            ``LookupError`` subclass.
    """
    # find_one returns None when nothing matches; fail with a clear message
    # instead of an opaque "'NoneType' object is not subscriptable".
    hotel = db.hotels.find_one({'hotel_name': hotel_name})
    if hotel is None:
        raise LookupError('no hotel document found for hotel_name=%r' % (hotel_name,))
    # NOTE(review): assumes `comments` is a single string, as word_tokenize
    # expects -- confirm against the collection schema.
    comments = hotel['comments']

    # Tokenise, then drop punctuation, ignored words and stop words.
    tokens = word_tokenize(comments)
    # "''" pairs with '``' as the tokenizer's quote tokens; the original
    # entry ' '' ' concatenated to a two-space string instead.
    ignored = {',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*',
               '@', '#', '$', '%', '}', '{', '``', "''", 'time', 'great'}
    stops = set(stopwords.words('english'))
    candidates = [
        word for word in tokens
        if word not in ignored and word not in stops and len(word) > 4
    ]

    # Keep nouns only (singular/plural, common/proper), then stem them.
    noun_tags = {'NN', 'NNS', 'NNP', 'NNPS'}
    stemmer = PorterStemmer()  # one instance, not one per word
    doc_list = [
        stemmer.stem(word)
        for word, tag in nltk.pos_tag(candidates)
        if tag in noun_tags
    ]

    # Extract keywords.
    print('关键词提取结果:')
    tfidf_extract(doc_list)
def tfidf_extract(doc_list, pos=False, keyword_num=25):
    """Print the top TF-IDF keywords of ``doc_list``.

    IDF values are trained on the corpus returned by ``load_data(pos)``;
    the keywords are then extracted from the caller's ``doc_list``.

    Args:
        doc_list: Tokenised document to extract keywords from.
        pos: Passed through to ``load_data``.
        keyword_num: Number of keywords to print.
    """
    # The corpus gets its own name: rebinding `doc_list` here used to
    # discard the caller's tokens.
    # NOTE(review): `load_data` is not defined in the visible part of this
    # file -- confirm it is provided elsewhere and returns a list of
    # tokenised documents.
    corpus = load_data(pos)
    idf_dic, default_idf = train_idf(corpus)
    tfidf_model = TfIdf(idf_dic, default_idf, doc_list, keyword_num)
    tfidf_model.get_tfidf()
# Script entry point: `name`/'main' had lost their double underscores, and
# the body was swallowed by a trailing comment on the same line.
if __name__ == '__main__':
    # Names of the hotels to analyse.
    hotel_names = ['hotel1', 'hotel2', 'hotel3']
    for hotel_name in hotel_names:
        analyze_hotel_comments(hotel_name)
# Original source: http://www.cveoy.top/t/topic/ojIm  Copyright belongs to the author. Do not reproduce or scrape.